// ingestion/internal/pipeline/chunk.go
|
|
package pipeline
|
|
|
|
import (
	"strings"
	"unicode/utf8"
)
|
|
|
|
// Chunk splits content into pieces of at most maxSize bytes, preferring to
// split at paragraph boundaries (blank lines, i.e. "\n\n").
//
// content is trimmed of surrounding whitespace first, and so is every
// paragraph; empty paragraphs are dropped. Consecutive paragraphs are packed
// greedily into one chunk, rejoined with "\n\n", for as long as the chunk
// stays within maxSize.
//
// A single paragraph longer than maxSize is hard-split into pieces of at most
// maxSize bytes. Each cut is moved back to the nearest UTF-8 rune start so a
// valid multi-byte sequence is never torn apart; only when maxSize is smaller
// than one encoded rune does a piece exceed maxSize (it then holds exactly
// that rune). The final, short piece of a split paragraph may be packed
// together with the paragraphs that follow it.
//
// If maxSize <= 0, or the trimmed content already fits in maxSize, the
// trimmed content is returned as a single chunk (for empty input that is a
// one-element slice holding "").
func Chunk(content string, maxSize int) []string {
	content = strings.TrimSpace(content)
	if maxSize <= 0 || len(content) <= maxSize {
		return []string{content}
	}

	const sep = "\n\n"

	var chunks []string
	var cur strings.Builder

	// flush emits the chunk under construction, if any, and starts a new one.
	flush := func() {
		if cur.Len() > 0 {
			chunks = append(chunks, cur.String())
			cur.Reset()
		}
	}

	for _, para := range strings.Split(content, sep) {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		// A paragraph that cannot fit in any chunk is cut into maxSize-byte
		// pieces so the size limit holds even without paragraph breaks.
		if len(para) > maxSize {
			flush()
			for len(para) > maxSize {
				// Back up to a rune start so we never cut inside a
				// multi-byte UTF-8 sequence.
				cut := maxSize
				for cut > 0 && !utf8.RuneStart(para[cut]) {
					cut--
				}
				if cut == 0 {
					// maxSize is smaller than the leading rune: emit that
					// rune whole to guarantee forward progress.
					_, cut = utf8.DecodeRuneInString(para)
				}
				chunks = append(chunks, para[:cut])
				para = para[cut:]
			}
			if para == "" {
				continue
			}
			// The short remainder falls through and starts the next chunk.
		}

		// Close the current chunk when appending this paragraph (plus the
		// separator) would push it past the limit.
		if cur.Len() > 0 && cur.Len()+len(sep)+len(para) > maxSize {
			flush()
		}
		if cur.Len() > 0 {
			cur.WriteString(sep)
		}
		cur.WriteString(para)
	}
	flush()

	return chunks
}
|