Files
hyperguild/ingestion/internal/pipeline/chunk.go

40 lines
929 B
Go

// ingestion/internal/pipeline/chunk.go
package pipeline
import "strings"
// Chunk splits content into pieces of at most maxSize bytes.
//
// Content is first trimmed of surrounding whitespace. If maxSize <= 0, or the
// trimmed content already fits in maxSize bytes, it is returned as a single
// chunk (which is the empty string for empty/whitespace-only input).
//
// Otherwise content is split at paragraph boundaries ("\n\n"). Each paragraph
// is trimmed, empty paragraphs are dropped, and consecutive paragraphs are
// packed greedily into a chunk, joined by "\n\n", for as long as the chunk
// stays within maxSize bytes.
//
// A single paragraph that is itself longer than maxSize is hard-split into
// consecutive pieces of at most maxSize bytes; those pieces are emitted as
// their own chunks and are never merged with neighbouring paragraphs. Cuts are
// made on UTF-8 character boundaries, so a multi-byte character is never torn
// apart. The only case in which a chunk can exceed maxSize is when maxSize is
// smaller than the encoding of a single character: that character is emitted
// whole.
func Chunk(content string, maxSize int) []string {
	content = strings.TrimSpace(content)
	if maxSize <= 0 || len(content) <= maxSize {
		return []string{content}
	}

	const sep = "\n\n"

	var (
		chunks []string
		cur    strings.Builder
	)
	// flush emits the chunk being assembled, if any, and starts a new one.
	flush := func() {
		if cur.Len() == 0 {
			return
		}
		chunks = append(chunks, cur.String())
		cur.Reset()
	}

	for _, para := range strings.Split(content, sep) {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		// A paragraph that cannot fit in any chunk is split on its own so the
		// size bound still holds for every emitted piece.
		if len(para) > maxSize {
			flush()
			chunks = append(chunks, splitOversized(para, maxSize)...)
			continue
		}

		// Start a new chunk when appending (separator + paragraph) would
		// overflow the current one.
		if cur.Len() > 0 && cur.Len()+len(sep)+len(para) > maxSize {
			flush()
		}
		if cur.Len() > 0 {
			cur.WriteString(sep)
		}
		cur.WriteString(para)
	}
	flush()
	return chunks
}

// splitOversized cuts s into consecutive pieces of at most maxSize bytes,
// cutting only where runeSafeCut allows. Concatenating the returned pieces
// yields s again. maxSize must be > 0.
func splitOversized(s string, maxSize int) []string {
	pieces := make([]string, 0, len(s)/maxSize+1)
	for len(s) > maxSize {
		cut := runeSafeCut(s, maxSize)
		pieces = append(pieces, s[:cut])
		s = s[cut:]
	}
	if s != "" {
		pieces = append(pieces, s)
	}
	return pieces
}

// runeSafeCut returns the length of the prefix of s to cut off so that the
// prefix is at most maxSize bytes and does not end inside a UTF-8 sequence.
// It requires 0 < maxSize < len(s) and always returns a value >= 1, so callers
// are guaranteed to make progress.
func runeSafeCut(s string, maxSize int) int {
	const (
		contMask = 0xC0 // top two bits select the byte class
		contBits = 0x80 // 10xxxxxx marks a UTF-8 continuation byte
		maxRune  = 4    // longest UTF-8 encoding of one character, in bytes
	)
	isCont := func(b byte) bool { return b&contMask == contBits }

	// Step back off continuation bytes to the start of the character that
	// straddles the limit. A valid character has at most maxRune-1
	// continuation bytes, so there is no point in looking further back.
	cut := maxSize
	for cut > 0 && maxSize-cut < maxRune-1 && isCont(s[cut]) {
		cut--
	}
	if cut > 0 {
		if isCont(s[cut]) {
			// More continuation bytes than any valid character has: the
			// input is not valid UTF-8 here, so fall back to a plain byte cut.
			return maxSize
		}
		return cut
	}

	// cut == 0: the first character alone is wider than maxSize. Emit it whole
	// rather than tearing it; this is the one case that exceeds maxSize.
	cut = 1
	for cut < len(s) && cut < maxRune && isCont(s[cut]) {
		cut++
	}
	return cut
}