diff --git a/ingestion/internal/api/handler_test.go b/ingestion/internal/api/handler_test.go index b06d1ee..7ac9e8a 100644 --- a/ingestion/internal/api/handler_test.go +++ b/ingestion/internal/api/handler_test.go @@ -20,9 +20,9 @@ import ( "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" ) -// stubComplete returns a fixed JSON page so tests never call a real LLM. +// stubComplete returns a fixed JSON array holding one RawPage so tests never call a real LLM. func stubComplete(_ context.Context, _, _ string) (string, error) { - return `[{"path":"wiki/sources/test-source.md","content":"# Test Source\n\nSome content here.\n"}]`, nil + return `[{"title":"Test Source","type":"source","subtype":"article","content":"## Summary\n\nSome content here.\n"}]`, nil } func stubPipelineCfg() pipeline.Config { diff --git a/ingestion/internal/pipeline/build.go b/ingestion/internal/pipeline/build.go index eb324af..85fb965 100644 --- a/ingestion/internal/pipeline/build.go +++ b/ingestion/internal/pipeline/build.go @@ -10,13 +10,27 @@ import ( // BuildPages converts RawPages from the LLM into wiki.Pages with computed slugs, // paths, and YAML frontmatter. sourceSlug is the slug of the source being ingested -// (derived from the filename, not the LLM title). -func BuildPages(rawPages []RawPage, sourceSlug, date string) []wiki.Page { +// (derived from the filename, not the LLM title). Pages whose computed slug is empty (for source pages the slug is sourceSlug, +// otherwise it is derived from the title) are skipped and reported as warnings instead. 
+func BuildPages(rawPages []RawPage, sourceSlug, date string) ([]wiki.Page, []string) { out := make([]wiki.Page, 0, len(rawPages)) + var warnings []string for _, rp := range rawPages { + slug := computeSlug(rp, sourceSlug) + if slug == "" { + warnings = append(warnings, fmt.Sprintf("skipped page with empty title (type: %s)", rp.Type)) + continue + } out = append(out, buildPage(rp, sourceSlug, date)) } - return out + return out, warnings +} + +func computeSlug(rp RawPage, sourceSlug string) string { + if rp.Type == "source" { + return sourceSlug + } + return wiki.Slug(rp.Title) } func buildPage(rp RawPage, sourceSlug, date string) wiki.Page { diff --git a/ingestion/internal/pipeline/build_test.go b/ingestion/internal/pipeline/build_test.go index 19c2f3d..482fd56 100644 --- a/ingestion/internal/pipeline/build_test.go +++ b/ingestion/internal/pipeline/build_test.go @@ -19,8 +19,9 @@ func TestBuildPages_SourcePage(t *testing.T) { Content: "## Summary\n\nA book about shaping product work.\n", }, } - pages := BuildPages(raw, "shape-up", "2026-04-23") + pages, warnings := BuildPages(raw, "shape-up", "2026-04-23") require.Len(t, pages, 1) + assert.Empty(t, warnings) p := pages[0] assert.Equal(t, "wiki/sources/shape-up.md", p.Path) @@ -43,8 +44,9 @@ func TestBuildPages_ConceptPage(t *testing.T) { Content: "## Definition\n\nA resource allocation technique.\n", }, } - pages := BuildPages(raw, "shape-up", "2026-04-23") + pages, warnings := BuildPages(raw, "shape-up", "2026-04-23") require.Len(t, pages, 1) + assert.Empty(t, warnings) p := pages[0] assert.Equal(t, "wiki/concepts/betting.md", p.Path) @@ -66,8 +68,9 @@ func TestBuildPages_EntityPage(t *testing.T) { Content: "## Description\n\nA product designer.\n", }, } - pages := BuildPages(raw, "shape-up", "2026-04-23") + pages, warnings := BuildPages(raw, "shape-up", "2026-04-23") require.Len(t, pages, 1) + assert.Empty(t, warnings) p := pages[0] assert.Equal(t, "wiki/entities/ryan-singer.md", p.Path) @@ -84,7 +87,7 @@ func 
TestBuildPages_SourceSlugUsedForSourcePage(t *testing.T) { raw := []RawPage{ {Title: "FinBERT: A Pretrained Model", Type: "source", Subtype: "article", Content: "## Summary\n\nA model.\n"}, } - pages := BuildPages(raw, "finbert-huggingface", "2026-04-23") + pages, _ := BuildPages(raw, "finbert-huggingface", "2026-04-23") require.Len(t, pages, 1) assert.Equal(t, "wiki/sources/finbert-huggingface.md", pages[0].Path) } @@ -93,7 +96,7 @@ func TestBuildPages_ConceptSlugDerivedFromTitle(t *testing.T) { raw := []RawPage{ {Title: "Domain-Driven Design", Type: "concept", Content: "## Definition\n\nFoo.\n"}, } - pages := BuildPages(raw, "some-source", "2026-04-23") + pages, _ := BuildPages(raw, "some-source", "2026-04-23") require.Len(t, pages, 1) assert.Equal(t, "wiki/concepts/domain-driven-design.md", pages[0].Path) } @@ -103,7 +106,7 @@ func TestBuildPages_SourceDefaultSubtype(t *testing.T) { raw := []RawPage{ {Title: "Some Post", Type: "source", Content: "## Summary\n\nA post.\n"}, } - pages := BuildPages(raw, "some-post", "2026-04-23") + pages, _ := BuildPages(raw, "some-post", "2026-04-23") require.Len(t, pages, 1) assert.Contains(t, pages[0].Content, "type: 'article'") } @@ -112,7 +115,7 @@ func TestBuildPages_OmitsDomainWhenEmpty(t *testing.T) { raw := []RawPage{ {Title: "Betting", Type: "concept", Content: "## Definition\n\nFoo.\n"}, } - pages := BuildPages(raw, "src", "2026-04-23") + pages, _ := BuildPages(raw, "src", "2026-04-23") require.Len(t, pages, 1) assert.NotContains(t, pages[0].Content, "domain:") } @@ -123,7 +126,7 @@ func TestBuildPages_MultiplePages(t *testing.T) { {Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"}, {Title: "Ryan Singer", Type: "entity", Subtype: "person", Content: "## Description\n\nA designer.\n"}, } - pages := BuildPages(raw, "shape-up", "2026-04-23") + pages, _ := BuildPages(raw, "shape-up", "2026-04-23") require.Len(t, pages, 3) assert.Equal(t, "wiki/sources/shape-up.md", pages[0].Path) assert.Equal(t, 
"wiki/concepts/betting.md", pages[1].Path) @@ -134,7 +137,7 @@ func TestBuildPages_TitleWithColon(t *testing.T) { raw := []RawPage{ {Title: "Shape Up: The Basecamp Method", Type: "source", Subtype: "book", Content: "## Summary\n\nA book.\n"}, } - pages := BuildPages(raw, "shape-up", "2026-04-23") + pages, _ := BuildPages(raw, "shape-up", "2026-04-23") require.Len(t, pages, 1) // Title with colon must be quoted in YAML assert.Contains(t, pages[0].Content, "title: 'Shape Up: The Basecamp Method'") @@ -145,8 +148,20 @@ func TestBuildPages_EntityNoSubtype(t *testing.T) { raw := []RawPage{ {Title: "Basecamp", Type: "entity", Content: "## Description\n\nA company.\n"}, } - pages := BuildPages(raw, "src", "2026-04-23") + pages, _ := BuildPages(raw, "src", "2026-04-23") require.Len(t, pages, 1) assert.NotContains(t, pages[0].Content, "type:") assert.Contains(t, pages[0].Content, "title: 'Basecamp'") } + +func TestBuildPages_EmptyTitleSkippedWithWarning(t *testing.T) { + raw := []RawPage{ + {Title: "", Type: "concept", Content: "## Definition\n\nFoo.\n"}, + {Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"}, + } + pages, warnings := BuildPages(raw, "src", "2026-04-23") + require.Len(t, pages, 1, "empty-title page should be skipped") + assert.Equal(t, "wiki/concepts/betting.md", pages[0].Path) + assert.Len(t, warnings, 1) + assert.Contains(t, warnings[0], "empty title") +} diff --git a/ingestion/internal/pipeline/pipeline.go b/ingestion/internal/pipeline/pipeline.go index f0f273c..19b650a 100644 --- a/ingestion/internal/pipeline/pipeline.go +++ b/ingestion/internal/pipeline/pipeline.go @@ -59,7 +59,8 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR allWarnings = append(allWarnings, warnings...) } - pages := BuildPages(allRaw, sourceSlug, date) + pages, buildWarnings := BuildPages(allRaw, sourceSlug, date) + allWarnings = append(allWarnings, buildWarnings...) 
resolved := Resolve(pages, inventory) canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory) allWarnings = append(allWarnings, linkWarnings...)