Strips slug authority from the LLM. The new RawPage type carries only
{title, type, subtype, domain, content} — no paths or frontmatter.
Pipeline will derive slugs deterministically (Task 4).
pipeline.go gets a temporary bridge stub (TODO task4) to keep the
package compiling between tasks.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
64 lines
2.0 KiB
Go
64 lines
2.0 KiB
Go
// ingestion/internal/pipeline/parse.go
|
|
package pipeline
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// RawPage is a single page in the shape the LLM is asked to emit: a few
// descriptive fields plus the Markdown body. It intentionally carries no path
// and no frontmatter; the pipeline derives slugs, paths, and frontmatter from
// these fields.
type RawPage struct {
	// Title is decoded from the "title" key.
	Title string `json:"title"`

	// Type is one of "source", "concept", or "entity".
	Type string `json:"type"`

	// Subtype refines Type.
	//   entity: person | company | tool | model | framework | technology
	//   source: article | pdf | book | video | note | project
	Subtype string `json:"subtype"`

	// Domain is decoded from the "domain" key.
	Domain string `json:"domain"`

	// Content is the Markdown body only, without frontmatter.
	Content string `json:"content"`
}
|
|
|
|
// ParseRawPages parses LLM output as a JSON array of RawPage objects.
|
|
// If the array is truncated mid-object (token limit), it salvages all complete objects.
|
|
func ParseRawPages(output string) ([]RawPage, []string) {
|
|
output = strings.TrimSpace(output)
|
|
if output == "" {
|
|
return nil, []string{"LLM returned empty output"}
|
|
}
|
|
|
|
output = stripFences(output)
|
|
|
|
var pages []RawPage
|
|
if err := json.Unmarshal([]byte(output), &pages); err == nil {
|
|
return pages, nil
|
|
}
|
|
|
|
// Truncation recovery: find last `}` that closes a complete object.
|
|
idx := strings.LastIndex(output, "}")
|
|
if idx < 0 {
|
|
return nil, []string{"LLM output contained no complete JSON objects"}
|
|
}
|
|
|
|
start := strings.Index(output, "[")
|
|
if start < 0 {
|
|
return nil, []string{"LLM output contained no JSON array opening bracket"}
|
|
}
|
|
|
|
candidate := output[start:idx+1] + "]"
|
|
if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
|
|
return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
|
|
}
|
|
|
|
return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
|
|
}
|
|
|
|
// stripFences removes a Markdown code fence wrapped around s.
//
// If s starts with "```" and the remainder of that first line is empty or a
// plain info string (for example "json", "JSON", or " json " with stray
// spaces or a trailing "\r"), the whole opening line is dropped, one trailing
// "```" is removed if present, and surrounding whitespace is trimmed.
//
// In every other case s is returned unchanged: no leading fence, no line
// break after the fence, or a first line that contains anything other than
// an info string (so payload sharing a line with the fence is never
// discarded).
//
// s is matched from its very first byte, so leading whitespace prevents a
// match; callers are expected to trim beforehand.
func stripFences(s string) string {
	const fence = "```"
	// Characters accepted in the opening fence's info string.
	const infoChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-_.#"

	if !strings.HasPrefix(s, fence) {
		return s
	}
	nl := strings.IndexByte(s, '\n')
	if nl < 0 {
		return s
	}

	// Trim leaves a non-empty remainder exactly when info contains a
	// character outside infoChars.
	info := strings.TrimSpace(s[len(fence):nl])
	if strings.Trim(info, infoChars) != "" {
		return s
	}

	body := strings.TrimSpace(s[nl+1:])
	body = strings.TrimSuffix(body, fence)
	return strings.TrimSpace(body)
}
|