diff --git a/ingestion/internal/pipeline/build.go b/ingestion/internal/pipeline/build.go
new file mode 100644
index 0000000..aad4a2d
--- /dev/null
+++ b/ingestion/internal/pipeline/build.go
@@ -0,0 +1,131 @@
+// ingestion/internal/pipeline/build.go
+package pipeline
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+// BuildPages converts RawPages from the LLM into wiki.Pages with computed slugs,
+// paths, and YAML frontmatter. sourceSlug is the slug of the source being ingested
+// (derived from the filename, not the LLM title). date is written verbatim into
+// the date fields of every page's frontmatter.
+func BuildPages(rawPages []RawPage, sourceSlug, date string) []wiki.Page {
+	out := make([]wiki.Page, 0, len(rawPages))
+	for _, rp := range rawPages {
+		out = append(out, buildPage(rp, sourceSlug, date))
+	}
+	return out
+}
+
+// buildPage assembles one wiki.Page: it picks the target directory and slug
+// from rp.Type and prepends the generated frontmatter to rp.Content.
+func buildPage(rp RawPage, sourceSlug, date string) wiki.Page {
+	var slug, dir string
+	switch rp.Type {
+	case "source":
+		// The source page is named after the ingested file, not the LLM title.
+		slug = sourceSlug
+		dir = "wiki/sources"
+	case "concept":
+		slug = wiki.Slug(rp.Title)
+		dir = "wiki/concepts"
+	case "entity":
+		slug = wiki.Slug(rp.Title)
+		dir = "wiki/entities"
+	default:
+		// NOTE(review): rp.Type is used as a directory name unchecked; an empty
+		// or slash-containing type yields an odd path. Confirm upstream validation.
+		slug = wiki.Slug(rp.Title)
+		dir = "wiki/" + rp.Type
+	}
+
+	return wiki.Page{
+		Path:    dir + "/" + slug + ".md",
+		Content: buildFrontmatter(rp, date) + "\n" + rp.Content,
+	}
+}
+
+// buildFrontmatter renders the YAML frontmatter block for rp, including the
+// opening and closing "---" fences. Fields appear in this order: title, type
+// (always for source pages, for entity pages only when Subtype is set), domain
+// (when set), date_ingested (source pages only), last_updated, aliases. Title,
+// type and domain go through yamlScalar so a value containing ": " stays valid.
+func buildFrontmatter(rp RawPage, date string) string {
+	title := yamlScalar(rp.Title)
+
+	var sb strings.Builder
+	sb.WriteString("---\n")
+	fmt.Fprintf(&sb, "title: %s\n", title)
+
+	switch rp.Type {
+	case "source":
+		// Sources always carry a type; "article" is the default.
+		subtype := rp.Subtype
+		if subtype == "" {
+			subtype = "article"
+		}
+		fmt.Fprintf(&sb, "type: %s\n", yamlScalar(subtype))
+	case "entity":
+		if rp.Subtype != "" {
+			fmt.Fprintf(&sb, "type: %s\n", yamlScalar(rp.Subtype))
+		}
+	}
+
+	if rp.Domain != "" {
+		fmt.Fprintf(&sb, "domain: %s\n", yamlScalar(rp.Domain))
+	}
+	if rp.Type == "source" {
+		fmt.Fprintf(&sb, "date_ingested: %s\n", date)
+	}
+	fmt.Fprintf(&sb, "last_updated: %s\n", date)
+
+	fmt.Fprintf(&sb, "aliases:\n - %s\n", title)
+	sb.WriteString("---\n")
+	return sb.String()
+}
+
+// yamlScalar returns s in a form that a YAML parser reads back as the same
+// string. Values that are unambiguous as plain scalars are returned unchanged;
+// anything else is emitted as a double-quoted scalar.
+func yamlScalar(s string) string {
+	if yamlPlainSafe(s) {
+		return s
+	}
+	// strconv.Quote emits only escape sequences that YAML double-quoted scalars also define.
+	return strconv.Quote(s)
+}
+
+// yamlPlainSafe reports whether s can be written as an unquoted (plain) YAML
+// scalar without changing its meaning or breaking the document.
+func yamlPlainSafe(s string) bool {
+	if s == "" || strings.TrimSpace(s) != s {
+		return false
+	}
+	// A leading indicator character starts some other YAML construct.
+	if strings.ContainsRune("-?:,[]{}#&*!|>'\"%@`", rune(s[0])) {
+		return false
+	}
+	// ": " and a trailing ":" read as a mapping key; " #" starts a comment.
+	if strings.Contains(s, ": ") || strings.Contains(s, " #") || strings.HasSuffix(s, ":") {
+		return false
+	}
+	// Control characters (newlines included) are not allowed in a plain scalar.
+	for _, r := range s {
+		if r < ' ' || r == 0x7f {
+			return false
+		}
+	}
+	// Words and numbers that YAML would resolve to a non-string type.
+	switch strings.ToLower(s) {
+	case "true", "false", "null", "yes", "no", "on", "off", "~":
+		return false
+	}
+	if _, err := strconv.ParseFloat(s, 64); err == nil {
+		return false
+	}
+	return true
+}
diff --git a/ingestion/internal/pipeline/build_test.go b/ingestion/internal/pipeline/build_test.go
new file mode 100644
index 0000000..8f127cb
--- /dev/null
+++ b/ingestion/internal/pipeline/build_test.go
@@ -0,0 +1,142 @@
+// ingestion/internal/pipeline/build_test.go
+package pipeline
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestBuildPages_SourcePage(t *testing.T) {
+	raw := []RawPage{
+		{
+			Title:   "Shape Up",
+			Type:    "source",
+			Subtype: "book",
+			Domain:  "product-strategy",
+			Content: "## Summary\n\nA book about shaping product work.\n",
+		},
+	}
+	pages := BuildPages(raw, "shape-up", "2026-04-23")
+	require.Len(t, pages, 1)
+
+	p := pages[0]
+	assert.Equal(t, "wiki/sources/shape-up.md", p.Path)
+	assert.Contains(t, p.Content, "title: Shape Up")
+	assert.Contains(t, p.Content, "type: book")
+	assert.Contains(t, p.Content, "domain: product-strategy")
+	assert.Contains(t, p.Content, "date_ingested: 2026-04-23")
+	assert.Contains(t, p.Content, "last_updated: 2026-04-23")
+	assert.Contains(t, p.Content, "aliases:\n - Shape Up")
+	assert.Contains(t, p.Content, "## Summary")
+	assert.True(t, strings.HasPrefix(p.Content, "---\n"), "content must start with frontmatter")
+}
+
+func TestBuildPages_ConceptPage(t *testing.T) {
+	raw := []RawPage{
+		{
+			Title:   "Betting",
+			Type:    "concept",
+			Domain:  "product-strategy",
+			Content: "## Definition\n\nA resource allocation technique.\n",
+		},
+	}
+	pages := BuildPages(raw, "shape-up", "2026-04-23")
+	require.Len(t, pages, 1)
+
+	p := pages[0]
+	assert.Equal(t, "wiki/concepts/betting.md", p.Path)
+	assert.Contains(t, p.Content, "title: Betting")
+	assert.Contains(t, p.Content, "domain: product-strategy")
+	assert.Contains(t, p.Content, "last_updated: 2026-04-23")
+	assert.Contains(t, p.Content, "aliases:\n - Betting")
+	assert.NotContains(t, p.Content, "date_ingested")
+	assert.Contains(t, p.Content, "## Definition")
+}
+
+func TestBuildPages_EntityPage(t *testing.T) {
+	raw := []RawPage{
+		{
+			Title:   "Ryan Singer",
+			Type:    "entity",
+			Subtype: "person",
+			Domain:  "product-strategy",
+			Content: "## Description\n\nA product designer.\n",
+		},
+	}
+	pages := BuildPages(raw, "shape-up", "2026-04-23")
+	require.Len(t, pages, 1)
+
+	p := pages[0]
+	assert.Equal(t, "wiki/entities/ryan-singer.md", p.Path)
+	assert.Contains(t, p.Content, "title: Ryan Singer")
+	assert.Contains(t, p.Content, "type: person")
+	assert.Contains(t, p.Content, "domain: product-strategy")
+	assert.Contains(t, p.Content, "last_updated: 2026-04-23")
+	assert.Contains(t, p.Content, "aliases:\n - Ryan Singer")
+	assert.NotContains(t, p.Content, "date_ingested")
+}
+
+func TestBuildPages_SourceSlugUsedForSourcePage(t *testing.T) {
+	// LLM title differs from filename — pipeline uses sourceSlug for the source page path.
+	raw := []RawPage{
+		{Title: "FinBERT: A Pretrained Model", Type: "source", Subtype: "article", Content: "## Summary\n\nA model.\n"},
+	}
+	pages := BuildPages(raw, "finbert-huggingface", "2026-04-23")
+	require.Len(t, pages, 1)
+	assert.Equal(t, "wiki/sources/finbert-huggingface.md", pages[0].Path)
+}
+
+func TestBuildPages_ConceptSlugDerivedFromTitle(t *testing.T) {
+	raw := []RawPage{
+		{Title: "Domain-Driven Design", Type: "concept", Content: "## Definition\n\nFoo.\n"},
+	}
+	pages := BuildPages(raw, "some-source", "2026-04-23")
+	require.Len(t, pages, 1)
+	assert.Equal(t, "wiki/concepts/domain-driven-design.md", pages[0].Path)
+}
+
+func TestBuildPages_SourceDefaultSubtype(t *testing.T) {
+	// If subtype is omitted for a source, default to "article"
+	raw := []RawPage{
+		{Title: "Some Post", Type: "source", Content: "## Summary\n\nA post.\n"},
+	}
+	pages := BuildPages(raw, "some-post", "2026-04-23")
+	require.Len(t, pages, 1)
+	assert.Contains(t, pages[0].Content, "type: article")
+}
+
+func TestBuildPages_OmitsDomainWhenEmpty(t *testing.T) {
+	raw := []RawPage{
+		{Title: "Betting", Type: "concept", Content: "## Definition\n\nFoo.\n"},
+	}
+	pages := BuildPages(raw, "src", "2026-04-23")
+	require.Len(t, pages, 1)
+	assert.NotContains(t, pages[0].Content, "domain:")
+}
+
+func TestBuildPages_MultiplePages(t *testing.T) {
+	raw := []RawPage{
+		{Title: "Shape Up", Type: "source", Subtype: "book", Content: "## Summary\n\nA book.\n"},
+		{Title: "Betting", Type: "concept", Content: "## Definition\n\nA technique.\n"},
+		{Title: "Ryan Singer", Type: "entity", Subtype: "person", Content: "## Description\n\nA designer.\n"},
+	}
+	pages := BuildPages(raw, "shape-up", "2026-04-23")
+	require.Len(t, pages, 3)
+	assert.Equal(t, "wiki/sources/shape-up.md", pages[0].Path)
+	assert.Equal(t, "wiki/concepts/betting.md", pages[1].Path)
+	assert.Equal(t, "wiki/entities/ryan-singer.md", pages[2].Path)
+}
+
+func TestBuildPages_QuotesTitleThatIsNotPlainYAML(t *testing.T) {
+	// A ": " inside an unquoted scalar would make the frontmatter unparseable.
+	raw := []RawPage{
+		{Title: "FinBERT: A Pretrained Model", Type: "source", Content: "## Summary\n\nA model.\n"},
+	}
+	pages := BuildPages(raw, "finbert-huggingface", "2026-04-23")
+	require.Len(t, pages, 1)
+	assert.Contains(t, pages[0].Content, "title: \"FinBERT: A Pretrained Model\"\n")
+	assert.Contains(t, pages[0].Content, "aliases:\n - \"FinBERT: A Pretrained Model\"\n")
+}