2 Commits

Author SHA1 Message Date
Mathias Bergqvist
0a70d9e972 feat(pipeline): add POST /ingest-raw for direct batch ingestion without LLM
All checks were successful
CI / Lint / Test / Vet (push) Successful in 9s
CI / Mirror to GitHub (push) Has been skipped
Allows callers to provide pre-structured RawPage data directly, bypassing the
LLM extraction step. The pipeline still handles slug computation, frontmatter,
link canonicalization, source back-references, and dedup — only the extraction
is skipped. Useful when a more capable model or manual curation produces the
structured data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 11:15:59 +02:00
Mathias Bergqvist
3e9a648115 fix(pipeline): repair invalid JSON escape sequences from LLM output before parsing
All checks were successful
CI / Lint / Test / Vet (push) Successful in 11s
CI / Mirror to GitHub (push) Has been skipped
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-23 22:04:27 +02:00
8 changed files with 281 additions and 11 deletions

View File

@@ -68,6 +68,7 @@ func main() {
mux.HandleFunc("POST /write", h.Write) mux.HandleFunc("POST /write", h.Write)
mux.HandleFunc("POST /ingest", h.Ingest) mux.HandleFunc("POST /ingest", h.Ingest)
mux.HandleFunc("POST /ingest-path", h.IngestPath) mux.HandleFunc("POST /ingest-path", h.IngestPath)
mux.HandleFunc("POST /ingest-raw", h.IngestRaw)
mux.HandleFunc("POST /backfill-refs", h.BackfillRefs) mux.HandleFunc("POST /backfill-refs", h.BackfillRefs)
addr := ":" + port addr := ":" + port

View File

@@ -272,6 +272,48 @@ func (h *Handler) IngestPath(w http.ResponseWriter, r *http.Request) {
writeJSON(w, ingestResponse{Pages: allPages, Warnings: allWarnings}) writeJSON(w, ingestResponse{Pages: allPages, Warnings: allWarnings})
} }
type ingestRawRequest struct {
Source string `json:"source"`
Pages []pipeline.RawPage `json:"pages"`
DryRun bool `json:"dry_run"`
}
// IngestRaw handles POST /ingest-raw — run the pipeline on pre-parsed RawPages,
// skipping the LLM extraction step. Use when the caller has already produced
// structured page data (e.g. from a more capable model or manual curation).
func (h *Handler) IngestRaw(w http.ResponseWriter, r *http.Request) {
var req ingestRawRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
writeError(w, http.StatusBadRequest, "invalid JSON")
return
}
if strings.TrimSpace(req.Source) == "" {
writeError(w, http.StatusBadRequest, "source is required")
return
}
if len(req.Pages) == 0 {
writeError(w, http.StatusBadRequest, "pages is required and must be non-empty")
return
}
result, err := pipeline.RunRaw(h.brainDir, req.Source, req.Pages, req.DryRun)
if err != nil {
h.logger.Error("ingest-raw failed", "source", req.Source, "err", err)
writeError(w, http.StatusInternalServerError, "ingest error")
return
}
pages := result.Pages
if pages == nil {
pages = []string{}
}
warnings := result.Warnings
if warnings == nil {
warnings = []string{}
}
writeJSON(w, ingestResponse{Pages: pages, Warnings: warnings})
}
// BackfillRefs handles POST /backfill-refs — injects source back-references // BackfillRefs handles POST /backfill-refs — injects source back-references
// into all concept and entity pages based on existing wiki/sources/ pages. // into all concept and entity pages based on existing wiki/sources/ pages.
func (h *Handler) BackfillRefs(w http.ResponseWriter, r *http.Request) { func (h *Handler) BackfillRefs(w http.ResponseWriter, r *http.Request) {

View File

@@ -226,6 +226,85 @@ func TestIngestPath_File(t *testing.T) {
assert.NotEmpty(t, pagesSlice) assert.NotEmpty(t, pagesSlice)
} }
// ---------------------------------------------------------------------------
// POST /ingest-raw
// ---------------------------------------------------------------------------
func TestIngestRaw_Validation(t *testing.T) {
cases := []struct {
name string
body map[string]any
}{
{"missing source", map[string]any{"pages": []any{map[string]any{"title": "X", "type": "concept", "content": "x"}}}},
{"missing pages", map[string]any{"source": "test-source"}},
{"empty pages", map[string]any{"source": "test-source", "pages": []any{}}},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
_, h := setup(t)
body, _ := json.Marshal(tc.body)
req := httptest.NewRequest(http.MethodPost, "/ingest-raw", bytes.NewReader(body))
rec := httptest.NewRecorder()
h.IngestRaw(rec, req)
assert.Equal(t, http.StatusBadRequest, rec.Code)
})
}
}
func TestIngestRaw_Success(t *testing.T) {
dir, h := setup(t)
body, _ := json.Marshal(map[string]any{
"source": "test-article",
"pages": []any{
map[string]any{"title": "Test Article", "type": "source", "subtype": "article", "domain": "Testing", "content": "## Summary\n\nThis is a test article about [[Test Concept]].\n"},
map[string]any{"title": "Test Concept", "type": "concept", "domain": "Testing", "content": "A concept for testing.\n"},
},
})
req := httptest.NewRequest(http.MethodPost, "/ingest-raw", bytes.NewReader(body))
rec := httptest.NewRecorder()
h.IngestRaw(rec, req)
require.Equal(t, http.StatusOK, rec.Code)
var resp map[string]any
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &resp))
pages := resp["pages"].([]any)
assert.Len(t, pages, 2)
// Verify files were written
sourcePath := filepath.Join(dir, "wiki", "sources", "test-article.md")
assert.FileExists(t, sourcePath)
conceptPath := filepath.Join(dir, "wiki", "concepts", "test-concept.md")
assert.FileExists(t, conceptPath)
}
func TestIngestRaw_DryRun(t *testing.T) {
dir, h := setup(t)
body, _ := json.Marshal(map[string]any{
"source": "dry-run-test",
"pages": []any{
map[string]any{"title": "Dry Run Source", "type": "source", "subtype": "article", "content": "Content."},
},
"dry_run": true,
})
req := httptest.NewRequest(http.MethodPost, "/ingest-raw", bytes.NewReader(body))
rec := httptest.NewRecorder()
h.IngestRaw(rec, req)
require.Equal(t, http.StatusOK, rec.Code)
var resp map[string]any
require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &resp))
pages := resp["pages"].([]any)
assert.NotEmpty(t, pages)
// Verify no files were written
sourcePath := filepath.Join(dir, "wiki", "sources", "dry-run-test.md")
assert.NoFileExists(t, sourcePath)
}
func TestIngestPath_Directory(t *testing.T) { func TestIngestPath_Directory(t *testing.T) {
_, h := setup(t) _, h := setup(t)

View File

@@ -18,7 +18,8 @@ type RawPage struct {
} }
// ParseRawPages parses LLM output as a JSON array of RawPage objects. // ParseRawPages parses LLM output as a JSON array of RawPage objects.
// If the array is truncated mid-object (token limit), it salvages all complete objects. // If the output contains invalid JSON escape sequences (e.g. \. from Markdown),
// it attempts repair before falling back to truncation recovery.
func ParseRawPages(output string) ([]RawPage, []string) { func ParseRawPages(output string) ([]RawPage, []string) {
output = strings.TrimSpace(output) output = strings.TrimSpace(output)
if output == "" { if output == "" {
@@ -27,23 +28,30 @@ func ParseRawPages(output string) ([]RawPage, []string) {
output = stripFences(output) output = stripFences(output)
// Fast path: valid JSON.
var pages []RawPage var pages []RawPage
if err := json.Unmarshal([]byte(output), &pages); err == nil { if err := json.Unmarshal([]byte(output), &pages); err == nil {
return pages, nil return pages, nil
} }
// Repair pass: fix invalid escape sequences (e.g. \. \d from Markdown content).
repaired := repairJSON(output)
if err := json.Unmarshal([]byte(repaired), &pages); err == nil {
return pages, []string{"repaired invalid JSON escape sequences in LLM output"}
}
// Truncation recovery: find last `}` that closes a complete object. // Truncation recovery: find last `}` that closes a complete object.
idx := strings.LastIndex(output, "}") idx := strings.LastIndex(repaired, "}")
if idx < 0 { if idx < 0 {
return nil, []string{"LLM output contained no complete JSON objects"} return nil, []string{"LLM output contained no complete JSON objects"}
} }
start := strings.Index(output, "[") start := strings.Index(repaired, "[")
if start < 0 { if start < 0 {
return nil, []string{"LLM output contained no JSON array opening bracket"} return nil, []string{"LLM output contained no JSON array opening bracket"}
} }
candidate := output[start:idx+1] + "]" candidate := repaired[start:idx+1] + "]"
if err := json.Unmarshal([]byte(candidate), &pages); err != nil { if err := json.Unmarshal([]byte(candidate), &pages); err != nil {
return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)} return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)}
} }
@@ -51,6 +59,45 @@ func ParseRawPages(output string) ([]RawPage, []string) {
return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))} return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))}
} }
// repairJSON replaces invalid JSON escape sequences (e.g. \. \d \p) with
// a properly escaped backslash followed by the same character.
// It iterates byte-by-byte to correctly skip already-valid escape sequences
// (including \\) without requiring lookbehind support.
func repairJSON(s string) string {
var b strings.Builder
b.Grow(len(s))
i := 0
for i < len(s) {
if s[i] != '\\' {
b.WriteByte(s[i])
i++
continue
}
// We have a backslash. Peek at the next character.
if i+1 >= len(s) {
// Trailing backslash — emit as-is.
b.WriteByte(s[i])
i++
continue
}
next := s[i+1]
switch next {
case '"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u':
// Valid JSON escape sequence — emit both characters as-is.
b.WriteByte(s[i])
b.WriteByte(next)
i += 2
default:
// Invalid escape — double the backslash.
b.WriteByte('\\')
b.WriteByte('\\')
b.WriteByte(next)
i += 2
}
}
return b.String()
}
func stripFences(s string) string { func stripFences(s string) string {
for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} { for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} {
if strings.HasPrefix(s, prefix) { if strings.HasPrefix(s, prefix) {

View File

@@ -59,3 +59,29 @@ func TestParseRawPages_MissingTitle(t *testing.T) {
assert.Empty(t, warnings) assert.Empty(t, warnings)
assert.Empty(t, pages[0].Title) assert.Empty(t, pages[0].Title)
} }
func TestParseRawPages_InvalidEscapeRepaired(t *testing.T) {
// LLM copied markdown escaped list numbers (\.) into JSON — invalid escape
raw := "[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"Step 4\\. Do it.\"}]"
pages, warnings := ParseRawPages(raw)
require.Len(t, pages, 1)
assert.Equal(t, "Foo", pages[0].Title)
assert.Contains(t, pages[0].Content, `4\.`)
assert.NotEmpty(t, warnings) // repair warning
}
func TestRepairJSON_FixesInvalidEscapes(t *testing.T) {
cases := []struct {
in string
want string
}{
{`{"a":"foo\.bar"}`, `{"a":"foo\\.bar"}`},
{`{"a":"\\n is fine"}`, `{"a":"\\n is fine"}`}, // valid \n untouched
{`{"a":"\d+ items"}`, `{"a":"\\d+ items"}`},
{`{"a":"already \\ escaped"}`, `{"a":"already \\ escaped"}`}, // valid \\ untouched
}
for _, tc := range cases {
got := repairJSON(tc.in)
assert.Equal(t, tc.want, got, "input: %s", tc.in)
}
}

View File

@@ -59,11 +59,31 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
allWarnings = append(allWarnings, warnings...) allWarnings = append(allWarnings, warnings...)
} }
pages, buildWarnings := BuildPages(allRaw, sourceSlug, date) return buildAndWrite(allRaw, sourceSlug, date, brainDir, source, inventory, allWarnings, dryRun)
allWarnings = append(allWarnings, buildWarnings...) }
// RunRaw runs the pipeline on pre-parsed RawPages, skipping the LLM extraction
// step. Use this when the caller has already produced the structured RawPage data
// (e.g. from a more capable model or manual curation).
func RunRaw(brainDir, source string, rawPages []RawPage, dryRun bool) (Result, error) {
inventory, err := wiki.LoadInventory(brainDir)
if err != nil {
return Result{}, fmt.Errorf("load inventory: %w", err)
}
sourceSlug := wiki.Slug(source)
date := time.Now().UTC().Format("2006-01-02")
return buildAndWrite(rawPages, sourceSlug, date, brainDir, source, inventory, nil, dryRun)
}
// buildAndWrite runs BuildPages through write for both Run and RunRaw.
func buildAndWrite(rawPages []RawPage, sourceSlug, date, brainDir, source string, inventory map[wiki.PageType][]wiki.Entry, warnings []string, dryRun bool) (Result, error) {
pages, buildWarnings := BuildPages(rawPages, sourceSlug, date)
warnings = append(warnings, buildWarnings...)
resolved := Resolve(pages, inventory) resolved := Resolve(pages, inventory)
canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory) canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory)
allWarnings = append(allWarnings, linkWarnings...) warnings = append(warnings, linkWarnings...)
withRefs := injectSourceRefs(canonicalized, inventory, brainDir) withRefs := injectSourceRefs(canonicalized, inventory, brainDir)
merged := mergeAll(withRefs) merged := mergeAll(withRefs)
@@ -83,14 +103,14 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
if !dryRun { if !dryRun {
if err := wiki.RebuildIndex(brainDir, date); err != nil { if err := wiki.RebuildIndex(brainDir, date); err != nil {
allWarnings = append(allWarnings, fmt.Sprintf("rebuild index: %v", err)) warnings = append(warnings, fmt.Sprintf("rebuild index: %v", err))
} }
if err := wiki.AppendLog(brainDir, source, written, allWarnings, date); err != nil { if err := wiki.AppendLog(brainDir, source, written, warnings, date); err != nil {
allWarnings = append(allWarnings, fmt.Sprintf("append log: %v", err)) warnings = append(warnings, fmt.Sprintf("append log: %v", err))
} }
} }
return Result{Pages: written, Warnings: allWarnings}, nil return Result{Pages: written, Warnings: warnings}, nil
} }
// mergeAll deduplicates pages by path, merging content from later occurrences. // mergeAll deduplicates pages by path, merging content from later occurrences.

View File

@@ -17,6 +17,8 @@ func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (
return s.query(ctx, args) return s.query(ctx, args)
case "brain_write": case "brain_write":
return s.write(ctx, args) return s.write(ctx, args)
case "brain_ingest_raw":
return s.ingestRaw(ctx, args)
case "brain_ingest": case "brain_ingest":
return s.ingest(ctx, args) return s.ingest(ctx, args)
case "brain_search": case "brain_search":
@@ -98,6 +100,33 @@ func (s *Skill) ingest(ctx context.Context, args json.RawMessage) (json.RawMessa
return nil, fmt.Errorf("either content+source or path is required") return nil, fmt.Errorf("either content+source or path is required")
} }
type ingestRawArgs struct {
Source string `json:"source"`
Pages []any `json:"pages"`
DryRun bool `json:"dry_run,omitempty"`
}
func (s *Skill) ingestRaw(ctx context.Context, args json.RawMessage) (json.RawMessage, error) {
var a ingestRawArgs
if err := json.Unmarshal(args, &a); err != nil {
return nil, fmt.Errorf("parse args: %w", err)
}
if s.cfg.IngestSvcURL == "" {
return nil, fmt.Errorf("brain_ingest_raw: INGEST_SVC_URL not configured")
}
if a.Source == "" {
return nil, fmt.Errorf("source is required")
}
if len(a.Pages) == 0 {
return nil, fmt.Errorf("pages is required and must be non-empty")
}
return s.postTo(ctx, s.cfg.IngestSvcURL+"/ingest-raw", map[string]any{
"source": a.Source,
"pages": a.Pages,
"dry_run": a.DryRun,
})
}
type searchArgs struct { type searchArgs struct {
Query string `json:"query"` Query string `json:"query"`
Collection string `json:"collection,omitempty"` Collection string `json:"collection,omitempty"`

View File

@@ -55,6 +55,32 @@ func (s *Skill) Tools() []registry.ToolDef {
}, },
} }
if s.cfg.IngestSvcURL != "" { if s.cfg.IngestSvcURL != "" {
tools = append(tools, registry.ToolDef{
Name: "brain_ingest_raw",
Description: "Ingest pre-structured pages into the brain wiki, bypassing the LLM extraction step. " +
"Use when you (the calling agent) have already extracted entities, concepts, and content from a source. " +
"Provide source (human-readable name) and pages (array of {title, type, subtype, domain, content} objects). " +
"The pipeline computes slugs, paths, frontmatter, wikilink canonicalization, and source back-references. " +
"Returns the list of wiki pages written.",
InputSchema: schema([]string{"source", "pages"}, map[string]any{
"source": map[string]any{"type": "string", "description": "human-readable name for the source, e.g. 'shape-up-book'"},
"pages": map[string]any{
"type": "array",
"items": map[string]any{
"type": "object",
"required": []string{"title", "type", "content"},
"properties": map[string]any{
"title": map[string]any{"type": "string", "description": "page title, e.g. 'Hash Encoding'"},
"type": map[string]any{"type": "string", "enum": []string{"source", "concept", "entity"}, "description": "page type"},
"subtype": map[string]any{"type": "string", "description": "entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project"},
"domain": map[string]any{"type": "string", "description": "knowledge domain, e.g. 'Machine Learning'"},
"content": map[string]any{"type": "string", "description": "markdown body — no frontmatter, use [[Display Name]] for wikilinks"},
},
},
},
"dry_run": map[string]any{"type": "boolean"},
}),
})
tools = append(tools, registry.ToolDef{ tools = append(tools, registry.ToolDef{
Name: "brain_ingest", Name: "brain_ingest",
Description: "Ingest content into the brain wiki (brain/wiki/). Calls an LLM to produce structured wiki pages. " + Description: "Ingest content into the brain wiki (brain/wiki/). Calls an LLM to produce structured wiki pages. " +