feat(ingestion): add content chunking and LLM JSON output parser
This commit is contained in:
46
ingestion/internal/pipeline/parse_test.go
Normal file
46
ingestion/internal/pipeline/parse_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
// ingestion/internal/pipeline/parse_test.go
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestParsePages_ValidJSON(t *testing.T) {
|
||||
input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"# Bar"}]`
|
||||
pages, warnings := ParsePages(input)
|
||||
require.Len(t, pages, 2)
|
||||
assert.Empty(t, warnings)
|
||||
assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
|
||||
assert.Equal(t, "wiki/concepts/bar.md", pages[1].Path)
|
||||
}
|
||||
|
||||
func TestParsePages_StripsFences(t *testing.T) {
|
||||
input := "```json\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"# Foo\"}]\n```"
|
||||
pages, warnings := ParsePages(input)
|
||||
assert.Len(t, pages, 1)
|
||||
assert.Empty(t, warnings)
|
||||
}
|
||||
|
||||
func TestParsePages_TruncationRecovery(t *testing.T) {
|
||||
input := `[{"path":"wiki/sources/foo.md","content":"# Foo"},{"path":"wiki/concepts/bar.md","content":"trunc`
|
||||
pages, warnings := ParsePages(input)
|
||||
require.Len(t, pages, 1)
|
||||
assert.Equal(t, "wiki/sources/foo.md", pages[0].Path)
|
||||
assert.NotEmpty(t, warnings)
|
||||
}
|
||||
|
||||
func TestParsePages_EmptyInput(t *testing.T) {
|
||||
pages, warnings := ParsePages("")
|
||||
assert.Empty(t, pages)
|
||||
assert.NotEmpty(t, warnings)
|
||||
}
|
||||
|
||||
func TestParsePages_PlainFence(t *testing.T) {
|
||||
input := "```\n[{\"path\":\"wiki/sources/foo.md\",\"content\":\"ok\"}]\n```"
|
||||
pages, warnings := ParsePages(input)
|
||||
assert.Len(t, pages, 1)
|
||||
assert.Empty(t, warnings)
|
||||
}
|
||||
Reference in New Issue
Block a user