// ingestion/internal/pipeline/parse.go package pipeline import ( "encoding/json" "fmt" "strings" ) // RawPage is the LLM's output format — minimal structured data with no path or frontmatter. // The pipeline derives slugs, paths, and frontmatter from these fields. type RawPage struct { Title string `json:"title"` Type string `json:"type"` // "source" | "concept" | "entity" Subtype string `json:"subtype"` // entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project Domain string `json:"domain"` Content string `json:"content"` // Markdown body only — no frontmatter } // ParseRawPages parses LLM output as a JSON array of RawPage objects. // If the output contains invalid JSON escape sequences (e.g. \. from Markdown), // it attempts repair before falling back to truncation recovery. func ParseRawPages(output string) ([]RawPage, []string) { output = strings.TrimSpace(output) if output == "" { return nil, []string{"LLM returned empty output"} } output = stripFences(output) // Fast path: valid JSON. var pages []RawPage if err := json.Unmarshal([]byte(output), &pages); err == nil { return pages, nil } // Repair pass: fix invalid escape sequences (e.g. \. \d from Markdown content). repaired := repairJSON(output) if err := json.Unmarshal([]byte(repaired), &pages); err == nil { return pages, []string{"repaired invalid JSON escape sequences in LLM output"} } // Truncation recovery: find last `}` that closes a complete object. idx := strings.LastIndex(repaired, "}") if idx < 0 { return nil, []string{"LLM output contained no complete JSON objects"} } start := strings.Index(repaired, "[") if start < 0 { return nil, []string{"LLM output contained no JSON array opening bracket"} } candidate := repaired[start:idx+1] + "]" if err := json.Unmarshal([]byte(candidate), &pages); err != nil { return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)} } return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))} } // repairJSON replaces invalid JSON escape sequences (e.g. \. \d \p) with // a properly escaped backslash followed by the same character. // It iterates byte-by-byte to correctly skip already-valid escape sequences // (including \\) without requiring lookbehind support. func repairJSON(s string) string { var b strings.Builder b.Grow(len(s)) i := 0 for i < len(s) { if s[i] != '\\' { b.WriteByte(s[i]) i++ continue } // We have a backslash. Peek at the next character. if i+1 >= len(s) { // Trailing backslash — emit as-is. b.WriteByte(s[i]) i++ continue } next := s[i+1] switch next { case '"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u': // Valid JSON escape sequence — emit both characters as-is. b.WriteByte(s[i]) b.WriteByte(next) i += 2 default: // Invalid escape — double the backslash. b.WriteByte('\\') b.WriteByte('\\') b.WriteByte(next) i += 2 } } return b.String() } func stripFences(s string) string { for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} { if strings.HasPrefix(s, prefix) { s = strings.TrimPrefix(s, prefix) s = strings.TrimSuffix(strings.TrimSpace(s), "```") return strings.TrimSpace(s) } } return s }