diff --git a/ingestion/internal/pipeline/refs.go b/ingestion/internal/pipeline/refs.go new file mode 100644 index 0000000..ae633a2 --- /dev/null +++ b/ingestion/internal/pipeline/refs.go @@ -0,0 +1,115 @@ +// ingestion/internal/pipeline/refs.go +package pipeline + +import ( + "os" + "path/filepath" + "regexp" + "strings" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +var wikilinkRE = regexp.MustCompile(`\[\[([^|\]]+)\|`) + +// injectSourceRefs finds the source page in the proposed batch, extracts its +// wikilinks, and injects a back-reference into every linked concept or entity page. +// Pages that exist on disk but are not in the current batch are loaded and +// appended so they will be updated on write. +func injectSourceRefs(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry, brainDir string) []wiki.Page { + sourceSlug, sourceTitle, found := findSourcePage(pages) + if !found { + return pages + } + + var sourceContent string + for _, p := range pages { + if strings.HasPrefix(p.Path, "wiki/sources/") && + strings.TrimSuffix(filepath.Base(p.Path), ".md") == sourceSlug { + sourceContent = p.Content + break + } + } + + linkedSlugs := extractWikilinks(sourceContent) + sourceRef := "- [[" + sourceSlug + "|" + sourceTitle + "]]" + + bySlug := make(map[string]int, len(pages)) + for i, p := range pages { + if !strings.HasPrefix(p.Path, "wiki/sources/") { + bySlug[strings.TrimSuffix(filepath.Base(p.Path), ".md")] = i + } + } + + for slug := range linkedSlugs { + if slug == sourceSlug { + continue + } + if idx, ok := bySlug[slug]; ok { + pages[idx] = addSourceRef(pages[idx], sourceRef) + continue + } + pt, ok := findInInventory(slug, inventory) + if !ok { + continue + } + diskPath := filepath.Join(brainDir, "wiki", string(pt), slug+".md") + b, err := os.ReadFile(diskPath) + if err != nil { + continue + } + page := wiki.Page{ + Path: "wiki/" + string(pt) + "/" + slug + ".md", + Content: string(b), + } + pages = append(pages, addSourceRef(page, sourceRef)) + } + + return pages +} + +// addSourceRef injects sourceRef into the ## Sources bullet section of page +// using wiki.Merge, which deduplicates bullets automatically. +func addSourceRef(page wiki.Page, sourceRef string) wiki.Page { + patch := wiki.Page{ + Path: page.Path, + Content: "\n## Sources\n\n" + sourceRef + "\n", + } + return wiki.Merge(page, patch) +} + +// extractWikilinks returns the set of slugs referenced as [[slug|...]] in content. +func extractWikilinks(content string) map[string]bool { + slugs := make(map[string]bool) + for _, m := range wikilinkRE.FindAllStringSubmatch(content, -1) { + slugs[m[1]] = true + } + return slugs +} + +// findSourcePage returns the slug and title of the first wiki/sources/ page in pages. +func findSourcePage(pages []wiki.Page) (slug, title string, found bool) { + for _, p := range pages { + if strings.HasPrefix(p.Path, "wiki/sources/") { + slug = strings.TrimSuffix(filepath.Base(p.Path), ".md") + title = extractTitle(p.Content) + if title == "" { + title = slug + } + return slug, title, true + } + } + return "", "", false +} + +// findInInventory returns the PageType for a slug if it appears in the inventory. +func findInInventory(slug string, inventory map[wiki.PageType][]wiki.Entry) (wiki.PageType, bool) { + for pt, entries := range inventory { + for _, e := range entries { + if e.Slug == slug { + return pt, true + } + } + } + return "", false +} diff --git a/ingestion/internal/pipeline/refs_test.go b/ingestion/internal/pipeline/refs_test.go new file mode 100644 index 0000000..a694d2c --- /dev/null +++ b/ingestion/internal/pipeline/refs_test.go @@ -0,0 +1,172 @@ +// ingestion/internal/pipeline/refs_test.go +package pipeline + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +func makeInventory(concepts, entities []string) map[wiki.PageType][]wiki.Entry { + inv := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: {}, + wiki.PageTypeEntity: {}, + wiki.PageTypeSource: {}, + } + for _, slug := range concepts { + inv[wiki.PageTypeConcept] = append(inv[wiki.PageTypeConcept], wiki.Entry{Slug: slug, Title: slug}) + } + for _, slug := range entities { + inv[wiki.PageTypeEntity] = append(inv[wiki.PageTypeEntity], wiki.Entry{Slug: slug, Title: slug}) + } + return inv +} + +func TestInjectSourceRefs_NoSourcePage(t *testing.T) { + pages := []wiki.Page{ + {Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFoo.\n"}, + } + got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir()) + assert.Equal(t, pages, got) +} + +func TestInjectSourceRefs_InjectsIntoProposedConcept(t *testing.T) { + pages := []wiki.Page{ + { + Path: "wiki/sources/my-article.md", + Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSee [[domain-driven-design|Domain Driven Design]].\n", + }, + { + Path: "wiki/concepts/domain-driven-design.md", + Content: "---\ntitle: Domain Driven Design\n---\n\n## Definition\n\nA methodology.\n", + }, + } + + got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir()) + + require.Len(t, got, 2) + assert.Contains(t, got[1].Content, "## Sources") + assert.Contains(t, got[1].Content, "[[my-article|My Article]]") +} + +func TestInjectSourceRefs_LoadsConceptFromDisk(t *testing.T) { + brainDir := t.TempDir() + conceptDir := filepath.Join(brainDir, "wiki", "concepts") + require.NoError(t, os.MkdirAll(conceptDir, 0o755)) + require.NoError(t, os.WriteFile( + filepath.Join(conceptDir, "shape-up.md"), + []byte("---\ntitle: Shape Up\n---\n\n## Definition\n\nA methodology.\n"), + 0o644, + )) + + pages := []wiki.Page{ + { + Path: "wiki/sources/my-article.md", + Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSee [[shape-up|Shape Up]].\n", + }, + } + inv := makeInventory([]string{"shape-up"}, nil) + + got := injectSourceRefs(pages, inv, brainDir) + + require.Len(t, got, 2) + var conceptPage wiki.Page + for _, p := range got { + if p.Path == "wiki/concepts/shape-up.md" { + conceptPage = p + } + } + assert.Contains(t, conceptPage.Content, "## Sources") + assert.Contains(t, conceptPage.Content, "[[my-article|My Article]]") + assert.Contains(t, conceptPage.Content, "## Definition") +} + +func TestInjectSourceRefs_NoSelfReference(t *testing.T) { + pages := []wiki.Page{ + { + Path: "wiki/sources/my-article.md", + Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSelf-link [[my-article|My Article]].\n", + }, + } + + got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir()) + assert.Len(t, got, 1) +} + +func TestInjectSourceRefs_DeduplicatesOnReingestion(t *testing.T) { + pages := []wiki.Page{ + { + Path: "wiki/sources/my-article.md", + Content: "---\ntitle: My Article\n---\n\n## Summary\n\nSee [[ddd|DDD]].\n", + }, + { + Path: "wiki/concepts/ddd.md", + Content: "---\ntitle: DDD\n---\n\n## Definition\n\nA thing.\n\n## Sources\n\n- [[my-article|My Article]]\n", + }, + } + + got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir()) + + require.Len(t, got, 2) + count := 0 + for _, line := range splitLines(got[1].Content) { + if line == "- [[my-article|My Article]]" { + count++ + } + } + assert.Equal(t, 1, count, "source ref should appear exactly once") +} + +func TestInjectSourceRefs_InjectsIntoEntity(t *testing.T) { + pages := []wiki.Page{ + { + Path: "wiki/sources/book.md", + Content: "---\ntitle: Book\n---\n\n## Summary\n\nBy [[ryan-singer|Ryan Singer]].\n", + }, + { + Path: "wiki/entities/ryan-singer.md", + Content: "---\ntitle: Ryan Singer\n---\n\n## Description\n\nA designer.\n", + }, + } + + got := injectSourceRefs(pages, makeInventory(nil, nil), t.TempDir()) + + require.Len(t, got, 2) + var entity wiki.Page + for _, p := range got { + if p.Path == "wiki/entities/ryan-singer.md" { + entity = p + } + } + assert.Contains(t, entity.Content, "[[book|Book]]") +} + +func TestExtractWikilinks(t *testing.T) { + content := "See [[foo|Foo]] and [[bar|Bar]] and [[foo|Foo again]]." + got := extractWikilinks(content) + assert.True(t, got["foo"]) + assert.True(t, got["bar"]) + assert.Len(t, got, 2, "duplicate slugs should be deduplicated") +} + +func splitLines(s string) []string { + var out []string + start := 0 + for i := 0; i < len(s); i++ { + if s[i] == '\n' { + if line := s[start:i]; line != "" { + out = append(out, line) + } + start = i + 1 + } + } + if last := s[start:]; last != "" { + out = append(out, last) + } + return out +}