feat(ingestion): add content chunking and LLM JSON output parser

This commit is contained in:
Mathias Bergqvist
2026-04-22 22:37:14 +02:00
parent d405346f07
commit 9b11719481
4 changed files with 176 additions and 0 deletions

View File

@@ -0,0 +1,39 @@
// ingestion/internal/pipeline/chunk.go
package pipeline
import (
	"strings"
	"unicode/utf8"
)
// Chunk splits content into pieces of at most maxSize bytes.
//
// Content is trimmed of surrounding whitespace and split at paragraph
// boundaries ("\n\n"). Each paragraph is trimmed, empty paragraphs are dropped,
// and consecutive paragraphs are packed greedily into one chunk (re-joined with
// "\n\n") for as long as the chunk stays within maxSize.
//
// A single paragraph longer than maxSize is split further: at the last ASCII
// whitespace inside the size window when there is one, otherwise at the last
// UTF-8 rune boundary, so multi-byte characters are never torn apart. The only
// case in which a chunk can exceed maxSize is when maxSize is smaller than a
// single encoded rune; that rune is then emitted on its own.
//
// If maxSize <= 0, or the trimmed content already fits, the trimmed content is
// returned as a single chunk (this includes a single empty string for empty
// input).
func Chunk(content string, maxSize int) []string {
	content = strings.TrimSpace(content)
	if maxSize <= 0 || len(content) <= maxSize {
		return []string{content}
	}

	var chunks []string
	var cur strings.Builder

	// flush emits the chunk being accumulated, if any, and starts a new one.
	flush := func() {
		if cur.Len() > 0 {
			chunks = append(chunks, cur.String())
			cur.Reset()
		}
	}

	for _, para := range strings.Split(content, "\n\n") {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		if len(para) > maxSize {
			// The paragraph cannot fit in any chunk as-is. Close the current
			// chunk and break the paragraph up; the final piece stays open so
			// that following paragraphs may still be packed after it.
			flush()
			pieces := splitOversized(para, maxSize)
			last := len(pieces) - 1
			chunks = append(chunks, pieces[:last]...)
			cur.WriteString(pieces[last])
			continue
		}

		if cur.Len() == 0 {
			cur.WriteString(para)
			continue
		}

		if cur.Len()+len("\n\n")+len(para) > maxSize {
			flush()
			cur.WriteString(para)
			continue
		}

		cur.WriteString("\n\n")
		cur.WriteString(para)
	}
	flush()

	return chunks
}

// splitOversized breaks s, which must be whitespace-trimmed and longer than
// maxSize (> 0) bytes, into non-empty trimmed pieces of at most maxSize bytes
// each. It always returns at least one piece.
func splitOversized(s string, maxSize int) []string {
	var pieces []string
	for len(s) > maxSize {
		cut := maxSize
		// Never cut in the middle of a multi-byte rune: back up to its start.
		for cut > 0 && !utf8.RuneStart(s[cut]) {
			cut--
		}
		if cut == 0 {
			// maxSize is smaller than the leading rune. Emit that rune whole
			// so the loop is guaranteed to make progress.
			_, cut = utf8.DecodeRuneInString(s)
		}
		// Prefer a word boundary when the window contains one. s starts with a
		// non-space byte, so a match at index 0 is impossible and the piece is
		// never empty.
		if ws := strings.LastIndexAny(s[:cut], " \t\r\n"); ws > 0 {
			cut = ws
		}
		pieces = append(pieces, strings.TrimSpace(s[:cut]))
		s = strings.TrimSpace(s[cut:])
	}
	if s != "" {
		pieces = append(pieces, s)
	}
	return pieces
}