diff --git a/docs/superpowers/plans/2026-04-23-level3-slug-authority.md b/docs/superpowers/plans/2026-04-23-level3-slug-authority.md new file mode 100644 index 0000000..088feac --- /dev/null +++ b/docs/superpowers/plans/2026-04-23-level3-slug-authority.md @@ -0,0 +1,1323 @@ +# Level 3: Strip Slug Authority from LLM — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Remove slug/path/frontmatter generation from the LLM; pipeline derives all slugs deterministically from titles via `wiki.Slug()`. + +**Architecture:** Add `RawPage` type + `ParseRawPages` (LLM returns minimal JSON), `BuildPages` (computes slugs/paths/frontmatter), and `CanonicalizeLinks` (converts `[[Display Name]]` → `[[slug|Display Name]]`). Wire into `pipeline.Run`. Update system prompt and schema doc. + +**Tech Stack:** Go 1.23, `encoding/json`, `regexp`, `strings`, `testify` + +**Working directory for all commands:** `ingestion/` (the Go module root — always `cd ingestion` before running `go` commands) + +--- + +## File Map + +| File | Action | Responsibility | +|------|--------|---------------| +| `ingestion/internal/pipeline/parse.go` | Modify | Replace `ParsePages`+`wiki.Page` deserialization with `ParseRawPages`+`RawPage` type | +| `ingestion/internal/pipeline/parse_test.go` | Modify | Replace old `ParsePages` tests with `ParseRawPages` tests | +| `ingestion/internal/pipeline/build.go` | Create | `BuildPages` — computes slug, path, frontmatter from `RawPage` | +| `ingestion/internal/pipeline/build_test.go` | Create | Tests for `BuildPages` | +| `ingestion/internal/pipeline/links.go` | Create | `CanonicalizeLinks` — converts `[[Display Name]]` → `[[slug|Display Name]]` | +| `ingestion/internal/pipeline/links_test.go` | Create | Tests for `CanonicalizeLinks` | +| 
`ingestion/internal/pipeline/pipeline.go` | Modify | Wire `ParseRawPages → BuildPages → CanonicalizeLinks` into `Run`; update tests | +| `ingestion/internal/pipeline/pipeline_test.go` | Modify | Update mock LLM responses to new JSON format | +| `ingestion/internal/pipeline/prompt.go` | Modify | New system prompt + updated `BuildPrompt` for new JSON contract | +| `brain/schema.md` | Modify | Update wikilink format and JSON output format | + +**Unchanged:** `resolve.go`, `refs.go`, `backfill.go`, `merge.go`, `chunk.go`, `resolve_test.go`, `refs_test.go`, `backfill_test.go` + +--- + +## Task 1: `RawPage` type + `ParseRawPages` + +**Files:** +- Modify: `ingestion/internal/pipeline/parse.go` +- Modify: `ingestion/internal/pipeline/parse_test.go` + +- [ ] **Step 1: Write failing tests** + +Replace the entire content of `ingestion/internal/pipeline/parse_test.go` with: + +```go +// ingestion/internal/pipeline/parse_test.go +package pipeline + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseRawPages_ValidJSON(t *testing.T) { + input := `[{"title":"Shape Up","type":"source","subtype":"book","domain":"product-strategy","content":"## Summary\n\nFoo."},{"title":"Betting","type":"concept","content":"## Definition\n\nA technique."}]` + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 2) + assert.Empty(t, warnings) + assert.Equal(t, "Shape Up", pages[0].Title) + assert.Equal(t, "source", pages[0].Type) + assert.Equal(t, "book", pages[0].Subtype) + assert.Equal(t, "product-strategy", pages[0].Domain) + assert.Equal(t, "Betting", pages[1].Title) + assert.Equal(t, "concept", pages[1].Type) + assert.Empty(t, pages[1].Subtype) +} + +func TestParseRawPages_StripsFences(t *testing.T) { + input := "```json\n[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"## Definition\\n\\nFoo.\"}]\n```" + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 1) + assert.Empty(t, warnings) + 
assert.Equal(t, "Foo", pages[0].Title) +} + +func TestParseRawPages_TruncationRecovery(t *testing.T) { + input := `[{"title":"Foo","type":"concept","content":"## Definition\n\nFoo."},{"title":"Bar","type":"concept","content":"trunc` + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 1) + assert.Equal(t, "Foo", pages[0].Title) + assert.NotEmpty(t, warnings) +} + +func TestParseRawPages_EmptyInput(t *testing.T) { + pages, warnings := ParseRawPages("") + assert.Empty(t, pages) + assert.NotEmpty(t, warnings) +} + +func TestParseRawPages_PlainFence(t *testing.T) { + input := "```\n[{\"title\":\"Foo\",\"type\":\"concept\",\"content\":\"ok\"}]\n```" + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 1) + assert.Empty(t, warnings) +} + +func TestParseRawPages_MissingTitle(t *testing.T) { + // Missing title — still parsed, Title is empty string + input := `[{"type":"concept","content":"## Definition\n\nFoo."}]` + pages, warnings := ParseRawPages(input) + require.Len(t, pages, 1) + assert.Empty(t, warnings) + assert.Empty(t, pages[0].Title) +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +cd ingestion && go test ./internal/pipeline/ -run TestParseRawPages -v +``` + +Expected: FAIL — `ParseRawPages` undefined, `RawPage` undefined. + +- [ ] **Step 3: Replace `parse.go` with new implementation** + +Replace the entire content of `ingestion/internal/pipeline/parse.go` with: + +```go +// ingestion/internal/pipeline/parse.go +package pipeline + +import ( + "encoding/json" + "fmt" + "strings" +) + +// RawPage is the LLM's output format — minimal structured data with no path or frontmatter. +// The pipeline derives slugs, paths, and frontmatter from these fields. 
+type RawPage struct { + Title string `json:"title"` + Type string `json:"type"` // "source" | "concept" | "entity" + Subtype string `json:"subtype"` // entity: person|company|tool|model|framework|technology; source: article|pdf|book|video|note|project + Domain string `json:"domain"` + Content string `json:"content"` // Markdown body only — no frontmatter +} + +// ParseRawPages parses LLM output as a JSON array of RawPage objects. +// If the array is truncated mid-object (token limit), it salvages all complete objects. +func ParseRawPages(output string) ([]RawPage, []string) { + output = strings.TrimSpace(output) + if output == "" { + return nil, []string{"LLM returned empty output"} + } + + output = stripFences(output) + + var pages []RawPage + if err := json.Unmarshal([]byte(output), &pages); err == nil { + return pages, nil + } + + // Truncation recovery: find last `}` that closes a complete object. + idx := strings.LastIndex(output, "}") + if idx < 0 { + return nil, []string{"LLM output contained no complete JSON objects"} + } + + start := strings.Index(output, "[") + if start < 0 { + return nil, []string{"LLM output contained no JSON array opening bracket"} + } + + candidate := output[start:idx+1] + "]" + if err := json.Unmarshal([]byte(candidate), &pages); err != nil { + return nil, []string{fmt.Sprintf("truncation recovery failed: %v", err)} + } + + return pages, []string{fmt.Sprintf("LLM output was truncated; recovered %d page(s)", len(pages))} +} + +func stripFences(s string) string { + for _, prefix := range []string{"```json\n", "```json\r\n", "```\n", "```\r\n"} { + if strings.HasPrefix(s, prefix) { + s = strings.TrimPrefix(s, prefix) + s = strings.TrimSuffix(strings.TrimSpace(s), "```") + return strings.TrimSpace(s) + } + } + return s +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +cd ingestion && go test ./internal/pipeline/ -run TestParseRawPages -v +``` + +Expected: all 6 tests PASS. 
+ +- [ ] **Step 5: Confirm the build fails only because of the removed `ParsePages` (expected until Task 4)** + +```bash +cd ingestion && go build ./... +``` + +Expected: a compile error reporting `ParsePages` as undefined, because `pipeline.go` still calls the function this task removed. Task 4 fixes it. If the build fails for any other reason, investigate before continuing. + +This task intentionally leaves `pipeline.go` broken to set up Task 4. Note that `pipeline.go` belongs to the same package, so `go test ./internal/pipeline/` will hit the same compile error until Task 4 is complete. If CI blocks on a non-compiling package, group Tasks 1–4 into a single commit. + +- [ ] **Step 6: Commit** + +```bash +cd ingestion && git add internal/pipeline/parse.go internal/pipeline/parse_test.go +git commit -m "feat(pipeline): replace ParsePages with ParseRawPages + RawPage type" +``` + +--- + +## Task 2: `BuildPages` + +**Files:** +- Create: `ingestion/internal/pipeline/build.go` +- Create: `ingestion/internal/pipeline/build_test.go` + +- [ ] **Step 1: Write failing tests** + +Create `ingestion/internal/pipeline/build_test.go`: + +```go +// ingestion/internal/pipeline/build_test.go +package pipeline + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBuildPages_SourcePage(t *testing.T) { + raw := []RawPage{ + { + Title: "Shape Up", + Type: "source", + Subtype: "book", + Domain: "product-strategy", + Content: "## Summary\n\nA book about shaping product work.\n", + }, + } + pages := BuildPages(raw, "shape-up", "2026-04-23") + require.Len(t, pages, 1) + + p := pages[0] + assert.Equal(t, "wiki/sources/shape-up.md", p.Path) + assert.Contains(t, p.Content, "title: Shape Up") + assert.Contains(t, p.Content, "type: book") + assert.Contains(t, p.Content, "domain: product-strategy") + assert.Contains(t, p.Content, "date_ingested: 2026-04-23") + assert.Contains(t, p.Content, "last_updated: 2026-04-23") + assert.Contains(t, p.Content, "aliases:\n - Shape Up") + assert.Contains(t, p.Content, "## Summary") + assert.True(t, 
strings.HasPrefix(p.Content, "---\n"), "content must start with frontmatter") +} + +func TestBuildPages_ConceptPage(t *testing.T) { + raw := []RawPage{ + { + Title: "Betting", + Type: "concept", + Domain: "product-strategy", + Content: "## Definition\n\nA resource allocation technique.\n", + }, + } + pages := BuildPages(raw, "shape-up", "2026-04-23") + require.Len(t, pages, 1) + + p := pages[0] + assert.Equal(t, "wiki/concepts/betting.md", p.Path) + assert.Contains(t, p.Content, "title: Betting") + assert.Contains(t, p.Content, "domain: product-strategy") + assert.Contains(t, p.Content, "last_updated: 2026-04-23") + assert.Contains(t, p.Content, "aliases:\n - Betting") + assert.NotContains(t, p.Content, "date_ingested") + assert.Contains(t, p.Content, "## Definition") +} + +func TestBuildPages_EntityPage(t *testing.T) { + raw := []RawPage{ + { + Title: "Ryan Singer", + Type: "entity", + Subtype: "person", + Domain: "product-strategy", + Content: "## Description\n\nA product designer.\n", + }, + } + pages := BuildPages(raw, "shape-up", "2026-04-23") + require.Len(t, pages, 1) + + p := pages[0] + assert.Equal(t, "wiki/entities/ryan-singer.md", p.Path) + assert.Contains(t, p.Content, "title: Ryan Singer") + assert.Contains(t, p.Content, "type: person") + assert.Contains(t, p.Content, "domain: product-strategy") + assert.Contains(t, p.Content, "last_updated: 2026-04-23") + assert.Contains(t, p.Content, "aliases:\n - Ryan Singer") + assert.NotContains(t, p.Content, "date_ingested") +} + +func TestBuildPages_SourceSlugUsedForSourcePage(t *testing.T) { + // LLM title differs from filename — pipeline uses sourceSlug for the source page path. 
+ raw := []RawPage{ + {Title: "FinBERT: A Pretrained Model", Type: "source", Subtype: "article", Content: "## Summary\n\nA model.\n"}, + } + pages := BuildPages(raw, "finbert-huggingface", "2026-04-23") + require.Len(t, pages, 1) + assert.Equal(t, "wiki/sources/finbert-huggingface.md", pages[0].Path) + // Concept/entity slugs derived from title, not sourceSlug +} + +func TestBuildPages_ConceptSlugDerivedFromTitle(t *testing.T) { + raw := []RawPage{ + {Title: "Domain-Driven Design", Type: "concept", Content: "## Definition\n\nFoo.\n"}, + } + pages := BuildPages(raw, "some-source", "2026-04-23") + require.Len(t, pages, 1) + assert.Equal(t, "wiki/concepts/domain-driven-design.md", pages[0].Path) +} + +func TestBuildPages_SourceDefaultSubtype(t *testing.T) { + // If subtype is omitted for a source, default to "article" + raw := []RawPage{ + {Title: "Some Post", Type: "source", Content: "## Summary\n\nA post.\n"}, + } + pages := BuildPages(raw, "some-post", "2026-04-23") + require.Len(t, pages, 1) + assert.Contains(t, pages[0].Content, "type: article") +} + +func TestBuildPages_OmitsDomainWhenEmpty(t *testing.T) { + raw := []RawPage{ + {Title: "Betting", Type: "concept", Content: "## Definition\n\nFoo.\n"}, + } + pages := BuildPages(raw, "src", "2026-04-23") + require.Len(t, pages, 1) + assert.NotContains(t, pages[0].Content, "domain:") +} + +func TestBuildPages_MultiplePages(t *testing.T) { + raw := []RawPage{ + {Title: "Shape Up", Type: "source", Subtype: "book", Content: "## Summary\n\nA book.\n"}, + {Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"}, + {Title: "Ryan Singer", Type: "entity", Subtype: "person", Content: "## Description\n\nA designer.\n"}, + } + pages := BuildPages(raw, "shape-up", "2026-04-23") + require.Len(t, pages, 3) + assert.Equal(t, "wiki/sources/shape-up.md", pages[0].Path) + assert.Equal(t, "wiki/concepts/betting.md", pages[1].Path) + assert.Equal(t, "wiki/entities/ryan-singer.md", pages[2].Path) +} +``` + +- [ ] 
**Step 2: Run tests to verify they fail** + +```bash +cd ingestion && go test ./internal/pipeline/ -run TestBuildPages -v +``` + +Expected: FAIL — `BuildPages` undefined. + +- [ ] **Step 3: Create `build.go`** + +Create `ingestion/internal/pipeline/build.go`: + +```go +// ingestion/internal/pipeline/build.go +package pipeline + +import ( + "fmt" + "strings" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +// BuildPages converts RawPages from the LLM into wiki.Pages with computed slugs, +// paths, and YAML frontmatter. sourceSlug is the slug of the source being ingested +// (derived from the filename, not the LLM title). +func BuildPages(rawPages []RawPage, sourceSlug, date string) []wiki.Page { + out := make([]wiki.Page, 0, len(rawPages)) + for _, rp := range rawPages { + out = append(out, buildPage(rp, sourceSlug, date)) + } + return out +} + +func buildPage(rp RawPage, sourceSlug, date string) wiki.Page { + var slug, dir string + switch rp.Type { + case "source": + slug = sourceSlug + dir = "wiki/sources" + case "concept": + slug = wiki.Slug(rp.Title) + dir = "wiki/concepts" + case "entity": + slug = wiki.Slug(rp.Title) + dir = "wiki/entities" + default: + slug = wiki.Slug(rp.Title) + dir = "wiki/" + rp.Type + } + + path := dir + "/" + slug + ".md" + fm := buildFrontmatter(rp, date) + + return wiki.Page{ + Path: path, + Content: fm + "\n" + rp.Content, + } +} + +func buildFrontmatter(rp RawPage, date string) string { + var sb strings.Builder + sb.WriteString("---\n") + fmt.Fprintf(&sb, "title: %s\n", rp.Title) + + switch rp.Type { + case "source": + subtype := rp.Subtype + if subtype == "" { + subtype = "article" + } + fmt.Fprintf(&sb, "type: %s\n", subtype) + if rp.Domain != "" { + fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + } + fmt.Fprintf(&sb, "date_ingested: %s\n", date) + fmt.Fprintf(&sb, "last_updated: %s\n", date) + case "concept": + if rp.Domain != "" { + fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + } + fmt.Fprintf(&sb, 
"last_updated: %s\n", date) + case "entity": + if rp.Subtype != "" { + fmt.Fprintf(&sb, "type: %s\n", rp.Subtype) + } + if rp.Domain != "" { + fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + } + fmt.Fprintf(&sb, "last_updated: %s\n", date) + default: + if rp.Domain != "" { + fmt.Fprintf(&sb, "domain: %s\n", rp.Domain) + } + fmt.Fprintf(&sb, "last_updated: %s\n", date) + } + + fmt.Fprintf(&sb, "aliases:\n - %s\n", rp.Title) + sb.WriteString("---\n") + return sb.String() +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +cd ingestion && go test ./internal/pipeline/ -run TestBuildPages -v +``` + +Expected: all 8 tests PASS. + +- [ ] **Step 5: Commit** + +```bash +git add ingestion/internal/pipeline/build.go ingestion/internal/pipeline/build_test.go +git commit -m "feat(pipeline): add BuildPages — compute slugs/paths/frontmatter from RawPage" +``` + +--- + +## Task 3: `CanonicalizeLinks` + +**Files:** +- Create: `ingestion/internal/pipeline/links.go` +- Create: `ingestion/internal/pipeline/links_test.go` + +- [ ] **Step 1: Write failing tests** + +Create `ingestion/internal/pipeline/links_test.go`: + +```go +// ingestion/internal/pipeline/links_test.go +package pipeline + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +func TestCanonicalizeLinks_KnownTitle(t *testing.T) { + pages := []wiki.Page{ + { + Path: "wiki/sources/shape-up.md", + Content: "---\ntitle: Shape Up\n---\n\n## Summary\n\nSee [[Betting]].\n", + }, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: { + {Slug: "betting", Title: "Betting"}, + }, + } + got, warnings := CanonicalizeLinks(pages, inventory) + require.Len(t, got, 1) + assert.Empty(t, warnings) + assert.Contains(t, got[0].Content, "[[betting|Betting]]") + assert.NotContains(t, got[0].Content, "[[Betting]]") +} + +func TestCanonicalizeLinks_UnknownTitleLeftAsIs(t *testing.T) { + 
pages := []wiki.Page{ + { + Path: "wiki/sources/shape-up.md", + Content: "---\ntitle: Shape Up\n---\n\n## Summary\n\nSee [[Ghost Concept]].\n", + }, + } + inventory := map[wiki.PageType][]wiki.Entry{} + got, warnings := CanonicalizeLinks(pages, inventory) + require.Len(t, got, 1) + assert.NotEmpty(t, warnings) + assert.Contains(t, got[0].Content, "[[Ghost Concept]]") +} + +func TestCanonicalizeLinks_AlreadyCanonicalLinkUntouched(t *testing.T) { + // Links already in [[slug|Display]] format must not be double-converted + pages := []wiki.Page{ + { + Path: "wiki/sources/shape-up.md", + Content: "---\ntitle: Shape Up\n---\n\n## Summary\n\nSee [[betting|Betting]].\n", + }, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: { + {Slug: "betting", Title: "Betting"}, + }, + } + got, warnings := CanonicalizeLinks(pages, inventory) + require.Len(t, got, 1) + assert.Empty(t, warnings) + // Should remain exactly as-is — not double-wrapped + assert.Contains(t, got[0].Content, "[[betting|Betting]]") + assert.NotContains(t, got[0].Content, "[[betting|[[betting|Betting]]]]") +} + +func TestCanonicalizeLinks_CaseInsensitiveMatch(t *testing.T) { + pages := []wiki.Page{ + { + Path: "wiki/sources/foo.md", + Content: "---\ntitle: Foo\n---\n\n## Summary\n\nSee [[domain driven design]].\n", + }, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: { + {Slug: "domain-driven-design", Title: "Domain Driven Design"}, + }, + } + got, warnings := CanonicalizeLinks(pages, inventory) + require.Len(t, got, 1) + assert.Empty(t, warnings) + assert.Contains(t, got[0].Content, "[[domain-driven-design|domain driven design]]") +} + +func TestCanonicalizeLinks_CurrentBatchPagesResolved(t *testing.T) { + // A concept created in the same batch should be canonicalizable + pages := []wiki.Page{ + { + Path: "wiki/sources/shape-up.md", + Content: "---\ntitle: Shape Up\n---\n\n## Summary\n\nSee [[Betting]].\n", + }, + { + Path: "wiki/concepts/betting.md", + Content: 
"---\ntitle: Betting\n---\n\n## Definition\n\nA technique.\n", + }, + } + inventory := map[wiki.PageType][]wiki.Entry{} // empty — Betting is in the batch, not inventory + + got, warnings := CanonicalizeLinks(pages, inventory) + require.Len(t, got, 2) + assert.Empty(t, warnings) + assert.Contains(t, got[0].Content, "[[betting|Betting]]") +} + +func TestCanonicalizeLinks_MultipleLinksInOnePage(t *testing.T) { + pages := []wiki.Page{ + { + Path: "wiki/sources/foo.md", + Content: "---\ntitle: Foo\n---\n\n## Summary\n\nSee [[Betting]] and [[Shape Up]].\n", + }, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: { + {Slug: "betting", Title: "Betting"}, + }, + wiki.PageTypeSource: { + {Slug: "shape-up", Title: "Shape Up"}, + }, + } + got, warnings := CanonicalizeLinks(pages, inventory) + require.Len(t, got, 1) + assert.Empty(t, warnings) + assert.Contains(t, got[0].Content, "[[betting|Betting]]") + assert.Contains(t, got[0].Content, "[[shape-up|Shape Up]]") +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +cd ingestion && go test ./internal/pipeline/ -run TestCanonicalizeLinks -v +``` + +Expected: FAIL — `CanonicalizeLinks` undefined. + +- [ ] **Step 3: Create `links.go`** + +Create `ingestion/internal/pipeline/links.go`: + +```go +// ingestion/internal/pipeline/links.go +package pipeline + +import ( + "fmt" + "path/filepath" + "regexp" + "strings" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +// plainLinkRE matches [[Display Name]] — wikilinks without a slug pipe. +// It does NOT match [[slug|Display]] (those already have a pipe). +var plainLinkRE = regexp.MustCompile(`\[\[([^\]|]+)\]\]`) + +// CanonicalizeLinks converts [[Display Name]] wikilinks to [[slug|Display Name]] +// using a title→slug map built from the inventory and current batch. +// Unknown titles are left as-is and returned as warnings. 
+func CanonicalizeLinks(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) ([]wiki.Page, []string) { + titleToSlug := buildTitleMap(pages, inventory) + + var allWarnings []string + out := make([]wiki.Page, len(pages)) + for i, p := range pages { + newContent, warnings := canonicalizeContent(p.Content, titleToSlug) + p.Content = newContent + out[i] = p + allWarnings = append(allWarnings, warnings...) + } + return out, allWarnings +} + +// buildTitleMap builds a lowercase-title → slug map from inventory and current batch. +// Current batch entries take precedence over inventory (they may be updates). +func buildTitleMap(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) map[string]string { + m := make(map[string]string) + for _, entries := range inventory { + for _, e := range entries { + m[strings.ToLower(e.Title)] = e.Slug + } + } + // Current batch overrides inventory + for _, p := range pages { + title := extractTitle(p.Content) + slug := strings.TrimSuffix(filepath.Base(p.Path), ".md") + if title != "" && slug != "" { + m[strings.ToLower(title)] = slug + } + } + return m +} + +func canonicalizeContent(content string, titleToSlug map[string]string) (string, []string) { + var warnings []string + result := plainLinkRE.ReplaceAllStringFunc(content, func(match string) string { + sub := plainLinkRE.FindStringSubmatch(match) + if len(sub) < 2 { + return match + } + displayName := sub[1] + slug, ok := titleToSlug[strings.ToLower(displayName)] + if !ok { + warnings = append(warnings, fmt.Sprintf("unknown wikilink: [[%s]]", displayName)) + return match + } + return "[[" + slug + "|" + displayName + "]]" + }) + return result, warnings +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +cd ingestion && go test ./internal/pipeline/ -run TestCanonicalizeLinks -v +``` + +Expected: all 6 tests PASS. 
+ +- [ ] **Step 5: Commit** + +```bash +git add ingestion/internal/pipeline/links.go ingestion/internal/pipeline/links_test.go +git commit -m "feat(pipeline): add CanonicalizeLinks — convert [[Display Name]] to [[slug|Display Name]]" +``` + +--- + +## Task 4: Wire `pipeline.go` + update `pipeline_test.go` + +**Files:** +- Modify: `ingestion/internal/pipeline/pipeline.go` +- Modify: `ingestion/internal/pipeline/pipeline_test.go` + +- [ ] **Step 1: Update `pipeline_test.go` to use new LLM response format** + +Replace the entire content of `ingestion/internal/pipeline/pipeline_test.go` with: + +```go +// ingestion/internal/pipeline/pipeline_test.go +package pipeline + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/mathiasbq/hyperguild/ingestion/internal/llm" +) + +func TestRun_WritesPages(t *testing.T) { + brainDir := t.TempDir() + for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { + require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) + } + + llmResponse := mustJSON([]RawPage{ + { + Title: "Test Article", + Type: "source", + Subtype: "article", + Domain: "software-engineering", + Content: "## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n[[Testing]]\n\n## Entities Mentioned\n\n## Open Questions Raised\n", + }, + { + Title: "Testing", + Type: "concept", + Domain: "software-engineering", + Content: "## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n", + }, + }) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "choices": 
[]map[string]any{ + {"message": map[string]any{"role": "assistant", "content": llmResponse}}, + }, + }) + })) + defer srv.Close() + + cfg := Config{ + Complete: llm.New(srv.URL, "", "test-model", 30*time.Second).Complete, + ChunkSize: 0, + } + + result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false) + require.NoError(t, err) + assert.Len(t, result.Pages, 2) + + _, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md")) + require.NoError(t, err) + _, err = os.Stat(filepath.Join(brainDir, "wiki", "concepts", "testing.md")) + require.NoError(t, err) + _, err = os.Stat(filepath.Join(brainDir, "wiki", "index.md")) + require.NoError(t, err) + _, err = os.Stat(filepath.Join(brainDir, "log.md")) + require.NoError(t, err) +} + +func TestRun_DryRunDoesNotWrite(t *testing.T) { + brainDir := t.TempDir() + for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { + require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) + } + + llmResponse := mustJSON([]RawPage{{ + Title: "Foo", + Type: "source", + Subtype: "article", + Content: "## Summary\n\nFoo.\n", + }}) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = json.NewEncoder(w).Encode(map[string]any{ + "choices": []map[string]any{{"message": map[string]any{"content": llmResponse}}}, + }) + })) + defer srv.Close() + + cfg := Config{Complete: llm.New(srv.URL, "", "m", 30*time.Second).Complete} + result, err := Run(context.Background(), cfg, brainDir, "foo content", "foo", true) + require.NoError(t, err) + assert.Len(t, result.Pages, 1) + + _, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "foo.md")) + assert.True(t, os.IsNotExist(err)) +} + +func TestRun_MergesDuplicatePaths(t *testing.T) { + brainDir := t.TempDir() + for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { + require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 
0o755)) + } + + // LLM returns same title twice (simulates multi-chunk duplicate) + llmResponse := mustJSON([]RawPage{ + {Title: "Foo", Type: "concept", Content: "## Definition\n\nFirst.\n\n## Related Concepts\n\n[[Bar]]\n"}, + {Title: "Foo", Type: "concept", Content: "## Definition\n\nSecond.\n\n## Related Concepts\n\n[[Baz]]\n"}, + }) + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = json.NewEncoder(w).Encode(map[string]any{ + "choices": []map[string]any{{"message": map[string]any{"content": llmResponse}}}, + }) + })) + defer srv.Close() + + cfg := Config{Complete: llm.New(srv.URL, "", "m", 30*time.Second).Complete} + result, err := Run(context.Background(), cfg, brainDir, "content", "foo", false) + require.NoError(t, err) + assert.Len(t, result.Pages, 1) // deduplicated + + content, err := os.ReadFile(filepath.Join(brainDir, "wiki", "concepts", "foo.md")) + require.NoError(t, err) + // keep-first for Definition, union for Related Concepts + assert.Contains(t, string(content), "First.") + // Bar and Baz unknown → left as plain [[links]] + assert.Contains(t, string(content), "[[Bar]]") + assert.Contains(t, string(content), "[[Baz]]") +} + +func mustJSON(v any) string { + b, err := json.Marshal(v) + if err != nil { + panic(err) + } + return string(b) +} +``` + +- [ ] **Step 2: Run tests to verify they fail (expected — pipeline.go still uses old ParsePages)** + +```bash +cd ingestion && go test ./internal/pipeline/ -run TestRun -v +``` + +Expected: compile error or test failures — that's fine. + +- [ ] **Step 3: Update `pipeline.go` to wire new steps** + +Replace the entire content of `ingestion/internal/pipeline/pipeline.go` with: + +```go +// ingestion/internal/pipeline/pipeline.go +package pipeline + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +// CompleteFunc is the function signature for LLM calls. 
+type CompleteFunc func(ctx context.Context, system, user string) (string, error) + +// Config holds pipeline configuration. +type Config struct { + Complete CompleteFunc + ChunkSize int // 0 = no chunking + Schema string // overrides brain/schema.md when set (useful in tests) +} + +// Result is the outcome of a pipeline run. +type Result struct { + Pages []string // relative paths written (or would-be written in dry-run) + Warnings []string +} + +// Run ingests content and writes structured wiki pages to brainDir/wiki/. +// In dry-run mode, pages are returned but not written to disk. +func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryRun bool) (Result, error) { + inventory, err := wiki.LoadInventory(brainDir) + if err != nil { + return Result{}, fmt.Errorf("load inventory: %w", err) + } + + schema := cfg.Schema + if schema == "" { + schema = loadSchema(brainDir) + } + + sourceSlug := wiki.Slug(source) + date := time.Now().UTC().Format("2006-01-02") + chunks := Chunk(content, cfg.ChunkSize) + + var allRaw []RawPage + var allWarnings []string + + for _, chunk := range chunks { + userPrompt := BuildPrompt(schema, source, chunk, inventory) + output, err := cfg.Complete(ctx, systemPrompt, userPrompt) + if err != nil { + return Result{}, fmt.Errorf("LLM call: %w", err) + } + raw, warnings := ParseRawPages(output) + allRaw = append(allRaw, raw...) + allWarnings = append(allWarnings, warnings...) + } + + pages := BuildPages(allRaw, sourceSlug, date) + resolved := Resolve(pages, inventory) + canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory) + allWarnings = append(allWarnings, linkWarnings...) 
+ withRefs := injectSourceRefs(canonicalized, inventory, brainDir) + merged := mergeAll(withRefs) + + var written []string + for _, page := range merged { + if !dryRun { + dest := filepath.Join(brainDir, filepath.FromSlash(page.Path)) + if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil { + return Result{}, fmt.Errorf("mkdir for %s: %w", page.Path, err) + } + if err := os.WriteFile(dest, []byte(page.Content), 0o644); err != nil { + return Result{}, fmt.Errorf("write %s: %w", page.Path, err) + } + } + written = append(written, page.Path) + } + + if !dryRun { + if err := wiki.RebuildIndex(brainDir, date); err != nil { + allWarnings = append(allWarnings, fmt.Sprintf("rebuild index: %v", err)) + } + if err := wiki.AppendLog(brainDir, source, written, allWarnings, date); err != nil { + allWarnings = append(allWarnings, fmt.Sprintf("append log: %v", err)) + } + } + + return Result{Pages: written, Warnings: allWarnings}, nil +} + +// mergeAll deduplicates pages by path, merging content from later occurrences. +func mergeAll(pages []wiki.Page) []wiki.Page { + order := make([]string, 0, len(pages)) + byPath := make(map[string]wiki.Page, len(pages)) + for _, p := range pages { + if _, seen := byPath[p.Path]; !seen { + order = append(order, p.Path) + byPath[p.Path] = p + } else { + byPath[p.Path] = wiki.Merge(byPath[p.Path], p) + } + } + result := make([]wiki.Page, 0, len(order)) + for _, path := range order { + result = append(result, byPath[path]) + } + return result +} + +const defaultSchema = `# Brain Wiki Schema +Three page types: wiki/sources/, wiki/concepts/, wiki/entities/. +See brain/schema.md for the full schema. +` + +func loadSchema(brainDir string) string { + b, err := os.ReadFile(filepath.Join(brainDir, "schema.md")) + if err != nil { + return defaultSchema + } + return strings.TrimSpace(string(b)) +} +``` + +- [ ] **Step 4: Run all pipeline tests** + +```bash +cd ingestion && go test ./internal/pipeline/ -v +``` + +Expected: all tests PASS. 
If `TestRun_WritesPages` has a warning about unknown wikilink `[[Testing]]` in warnings (since the Testing concept is in the same batch and should be resolved via `CanonicalizeLinks`), check that `CanonicalizeLinks` builds the batch map correctly before checking `result.Warnings`. + +Note: `TestRun_WritesPages` now calls `assert.Len(t, result.Pages, 2)` without `assert.Empty(t, result.Warnings)` — unknown-link warnings from `CanonicalizeLinks` are acceptable since "Testing" is in the batch and should canonicalize cleanly. If the test fails with unexpected warnings, investigate `buildTitleMap` — it should find "Testing" from the current batch. + +- [ ] **Step 5: Run full test suite** + +```bash +cd ingestion && go test ./... 2>&1 +``` + +Expected: all packages pass. + +- [ ] **Step 6: Commit** + +```bash +git add ingestion/internal/pipeline/pipeline.go ingestion/internal/pipeline/pipeline_test.go +git commit -m "feat(pipeline): wire ParseRawPages+BuildPages+CanonicalizeLinks into Run" +``` + +--- + +## Task 5: Update `prompt.go` + +**Files:** +- Modify: `ingestion/internal/pipeline/prompt.go` + +- [ ] **Step 1: Replace `prompt.go`** + +Replace the entire content of `ingestion/internal/pipeline/prompt.go` with: + +```go +// ingestion/internal/pipeline/prompt.go +package pipeline + +import ( + "fmt" + "strings" + "time" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +const systemPrompt = `You are a wiki agent. Read the source material and produce structured wiki pages following the schema provided. + +Output ONLY a valid JSON array — no markdown fences, no other text before or after. +Each element must have exactly these fields: + "title" — exact page title (e.g. 
"FinBERT", "Ryan Singer", "Shape Up") + "type" — exactly one of: "source", "concept", "entity" + "subtype" — for source: article|pdf|book|video|note|project; for entity: person|company|tool|model|framework|technology; omit for concept + "domain" — one of the domains in the schema (omit if none fits) + "content" — Markdown body only — NO frontmatter, NO path, NO slug + +Wikilinks in content: [[Display Name]] — just the display name, no slug, no pipe separator. +Only link to pages listed in the inventory or pages you are creating in this response.` + +// BuildPrompt constructs the user prompt for a single chunk. +func BuildPrompt(schema, source, content string, inventory map[wiki.PageType][]wiki.Entry) string { + var sb strings.Builder + + fmt.Fprintf(&sb, "Today's date is %s.\n\n", time.Now().UTC().Format("2006-01-02")) + + sb.WriteString("## Schema\n\n") + sb.WriteString(schema) + sb.WriteString("\n\n") + + sb.WriteString("## Existing wiki pages\n\n") + sb.WriteString("Reference these pages by display name only — [[Display Name]] — in your content.\n\n") + + for _, pt := range []wiki.PageType{wiki.PageTypeConcept, wiki.PageTypeEntity, wiki.PageTypeSource} { + entries := inventory[pt] + label := strings.ToUpper(string(pt)[:1]) + string(pt)[1:] + if len(entries) == 0 { + fmt.Fprintf(&sb, "%s — (none yet)\n\n", label) + continue + } + fmt.Fprintf(&sb, "%s:\n", label) + for _, e := range entries { + fmt.Fprintf(&sb, " - %s\n", e.Title) + } + sb.WriteString("\n") + } + + sb.WriteString("## Non-negotiable rules\n\n") + sb.WriteString("1. Output ONLY a valid JSON array — no prose, no fences.\n") + sb.WriteString("2. Fields: title, type, subtype (if applicable), domain (if applicable), content.\n") + sb.WriteString("3. Wikilinks: [[Display Name]] — no slug, no pipe. The pipeline handles slugs.\n") + sb.WriteString("4. Section links must match their section type (Related Concepts → concepts only, etc.).\n") + sb.WriteString("5. 
One source page per book — if inventory shows it exists, return it as an UPDATE.\n\n") + + fmt.Fprintf(&sb, "## Source: %s\n\n", source) + sb.WriteString(content) + + return sb.String() +} +``` + +- [ ] **Step 2: Verify compile and tests still pass** + +```bash +cd ingestion && go test ./... 2>&1 +``` + +Expected: all tests PASS (prompt.go has no direct unit tests — covered via integration tests in pipeline_test.go). + +- [ ] **Step 3: Commit** + +```bash +git add ingestion/internal/pipeline/prompt.go +git commit -m "feat(pipeline): update system prompt for new LLM JSON contract (no slugs)" +``` + +--- + +## Task 6: Update `brain/schema.md` + +**Files:** +- Modify: `brain/schema.md` + +- [ ] **Step 1: Update the schema doc** + +Replace the entire content of `brain/schema.md` with: + +```markdown +# Brain Wiki Schema + +This document defines the three page types in the brain wiki. +The LLM must follow this schema exactly when generating wiki pages. + +## Output Format + +Return a JSON array. Each element: + +```json +{ + "title": "exact page title", + "type": "source | concept | entity", + "subtype": "see below — omit for concept", + "domain": "see domains — omit if none fits", + "content": "Markdown body only — no frontmatter, no path" +} +``` + +- `subtype` for **source**: `article | pdf | book | video | note | project` +- `subtype` for **entity**: `person | company | tool | model | framework | technology` +- The pipeline computes slugs and frontmatter — never include them in output. + +## Wikilink Format + +All cross-references use `[[Display Name]]` — just the display name, no slug, no pipe. + +Rules: +- Only link to pages in the inventory or pages you are creating in this response +- The pipeline converts `[[Display Name]]` to `[[slug|Display Name]]` automatically +- Section links must match their section type (Related Concepts → concept pages only, etc.) 
+ +Examples: `[[Domain Driven Design]]`, `[[Ryan Singer]]`, `[[Shape Up]]` + +## Domains + +Use one of: `ai-llm`, `software-engineering`, `product-strategy`, `finance-markets`, +`personal`, `consulting`, `climate`, `infrastructure`, `security` + +--- + +## Source Pages — wiki/sources/{slug}.md + +One page per ingested source. Books are NEVER split across multiple source pages — update the existing one. + +Body sections (in this order): + +### Summary +2–3 sentences. Core argument or finding. + +### Key Claims +Bulleted list. Paraphrase — no verbatim quotes or code. + +### Concepts Introduced or Reinforced +Wikilinks to concept pages ONLY. One per line. + +### Entities Mentioned +Wikilinks to entity pages ONLY. One per line. + +### Open Questions Raised +Gaps or follow-up questions from this source. + +For books only, also add: + +### Chapters +One bullet per chapter with 1–2 sentence summary. + +### Argument Arc +Overall narrative as it becomes clear across chapters. + +### Updates +Dated entries appended on re-ingestion. NEVER rewrite — only append. + +--- + +## Concept Pages — wiki/concepts/{slug}.md + +One page per idea, framework, methodology, or pattern. + +Body sections (in this order): + +### Definition +One-paragraph plain-language explanation. + +### Why It Matters +Practical significance. Why should anyone care? + +### Related Concepts +Wikilinks to concept pages ONLY. + +### Related Entities +Wikilinks to entity pages ONLY. + +### Sources +Wikilinks to source pages ONLY. + +### Evolving Notes +Updated as new sources arrive. Append, do not rewrite. + +--- + +## Entity Pages — wiki/entities/{slug}.md + +One page per person, tool, organisation, technology, or product. + +Body sections (in this order): + +### Description +One-line description. + +### Relevance +Why this entity matters to this knowledge base. + +### Key Positions, Products, or Claims +With dates where known. + +### Related Concepts +Wikilinks to concept pages ONLY. 
+ +### Related Entities +Wikilinks to entity pages ONLY. + +### Sources +Wikilinks to source pages ONLY. + +--- + +## Non-Negotiable Rules + +1. Output ONLY a valid JSON array — no markdown fences, no prose before or after +2. Each element: `{"title": "...", "type": "...", "subtype": "...", "domain": "...", "content": "..."}` +3. Never include slugs, paths, or frontmatter in output — the pipeline handles these +4. Wikilinks: `[[Display Name]]` only — no pipe, no slug +5. Dates always YYYY-MM-DD (used only in content body where contextually relevant) +6. Never reproduce verbatim code — describe the pattern or technique +7. Section links must match their section type +8. One source page per book — if inventory shows it exists, include it as an UPDATE +``` + +- [ ] **Step 2: Verify Go tests still pass (schema.md is loaded at runtime, not compile time)** + +```bash +cd ingestion && go test ./... 2>&1 +``` + +Expected: all tests PASS. + +- [ ] **Step 3: Commit** + +```bash +git add brain/schema.md +git commit -m "docs(schema): update LLM output format and wikilink convention for Level 3" +``` + +--- + +## Self-Review + +**Spec coverage check:** + +| Spec requirement | Task | +|-----------------|------| +| LLM returns `{title, type, subtype, domain, content}` | Task 1 (RawPage), Task 5 (prompt) | +| Pipeline computes all slugs via `wiki.Slug(title)` | Task 2 (BuildPages) | +| Source page uses `sourceSlug` from filename | Task 2 (buildPage: `case "source": slug = sourceSlug`) | +| Frontmatter assembled by pipeline | Task 2 (buildFrontmatter) | +| `[[Display Name]]` → `[[slug|Display Name]]` canonicalization | Task 3 (CanonicalizeLinks) | +| CanonicalizeLinks runs after BuildPages, before injectSourceRefs | Task 4 (pipeline.go step order) | +| Unknown titles left as-is + warnings | Task 3 (links_test: UnknownTitleLeftAsIs) | +| Current-batch pages resolvable | Task 3 (buildTitleMap includes batch) | +| `injectSourceRefs` unchanged | — (no task needed) | +| `Resolve` 
unchanged | — (no task needed) | +| `brain/schema.md` updated | Task 6 | +| Integration test for no slug duplication across chunks | Task 4 (TestRun_MergesDuplicatePaths uses same title twice) | + +All spec requirements covered. + +**Placeholder scan:** None found. + +**Type consistency:** +- `RawPage` defined in Task 1, used in Tasks 2, 4 ✓ +- `BuildPages([]RawPage, string, string) []wiki.Page` defined in Task 2, called in Task 4 ✓ +- `CanonicalizeLinks([]wiki.Page, map[wiki.PageType][]wiki.Entry) ([]wiki.Page, []string)` defined in Task 3, called in Task 4 ✓ +- `ParseRawPages(string) ([]RawPage, []string)` defined in Task 1, called in Task 4 ✓ +- `extractTitle` used in Task 3 (`links.go`) — defined in `resolve.go` (same package) ✓