feat(ingestion): add content chunking and LLM JSON output parser
This commit is contained in:
55
ingestion/internal/pipeline/parse.go
Normal file
55
ingestion/internal/pipeline/parse.go
Normal file
@@ -0,0 +1,55 @@
|
||||
// ingestion/internal/pipeline/parse.go
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||
)
|
||||
|
||||
// ParsePages parses LLM output as a JSON array of {path, content} objects.
|
||||
// If the array is truncated mid-object (token limit), it salvages all complete objects.
|
||||
func ParsePages(output string) ([]wiki.Page, []string) {
|
||||
output = strings.TrimSpace(output)
|
||||
if output == "" {
|
||||
return nil, []string{"LLM returned empty output"}
|
||||
}
|
||||
|
||||
output = stripFences(output)
|
||||
|
||||
var pages []wiki.Page
|
||||
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
||||
return pages, nil
|
||||
}
|
||||
|
||||
// Truncation recovery: find last `}` that closes a complete object.
|
||||
idx := strings.LastIndex(output, "}")
|
||||
if idx < 0 {
|
||||
return nil, []string{"LLM output contained no complete JSON objects"}
|
||||
}
|
||||
|
||||
start := strings.Index(output, "[")
|
||||
if start < 0 {
|
||||
return nil, []string{"LLM output contained no JSON array opening bracket"}
|
||||
}
|
||||
|
||||
candidate := output[start:idx+1] + "]"
|
||||
if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
|
||||
return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
|
||||
}
|
||||
|
||||
return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
|
||||
}
|
||||
|
||||
// stripFences removes a Markdown code fence wrapped around s.
//
// A fence is recognised only when s begins with "```json" or a bare "```"
// immediately followed by a line break (LF or CRLF). In that case the opening
// line is dropped, surrounding whitespace is trimmed, one trailing "```" is
// removed if present, and the result is trimmed again. Any other input is
// returned unchanged.
func stripFences(s string) string {
	const fence = "```"

	openers := [...]string{
		fence + "json\n",
		fence + "json\r\n",
		fence + "\n",
		fence + "\r\n",
	}

	for _, opener := range openers {
		if !strings.HasPrefix(s, opener) {
			continue
		}
		inner := strings.TrimSpace(s[len(opener):])
		// The closing fence may be absent (e.g. output cut short); TrimSuffix
		// is then a no-op.
		inner = strings.TrimSuffix(inner, fence)
		return strings.TrimSpace(inner)
	}

	return s
}
|
||||
Reference in New Issue
Block a user