From 26855f69b0ea527914834ee9c563fc4cbb00da02 Mon Sep 17 00:00:00 2001
From: Mathias Bergqvist
Date: Thu, 23 Apr 2026 18:59:10 +0200
Subject: [PATCH] =?UTF-8?q?feat(pipeline):=20add=20CanonicalizeLinks=20?=
 =?UTF-8?q?=E2=80=94=20convert=20[[Display=20Name]]=20to=20[[slug|Display?=
 =?UTF-8?q?=20Name]]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ingestion/internal/pipeline/links.go      |  90 +++++++++++++++
 ingestion/internal/pipeline/links_test.go | 125 ++++++++++++++++++++++
 2 files changed, 215 insertions(+)
 create mode 100644 ingestion/internal/pipeline/links.go
 create mode 100644 ingestion/internal/pipeline/links_test.go

diff --git a/ingestion/internal/pipeline/links.go b/ingestion/internal/pipeline/links.go
new file mode 100644
index 0000000..92814e6
--- /dev/null
+++ b/ingestion/internal/pipeline/links.go
@@ -0,0 +1,90 @@
+// ingestion/internal/pipeline/links.go
+package pipeline
+
+import (
+	"fmt"
+	"path/filepath"
+	"regexp"
+	"strings"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+// plainLinkRE matches [[Display Name]] — wikilinks without a slug pipe.
+// It does NOT match [[slug|Display]] (those already have a pipe).
+var plainLinkRE = regexp.MustCompile(`\[\[([^\]|]+)\]\]`)
+
+// CanonicalizeLinks converts [[Display Name]] wikilinks to [[slug|Display Name]]
+// using a title→slug map built from the inventory and current batch.
+// Unknown titles are left as-is and returned as warnings.
+//
+// The input slice is not modified: each page is copied by value, its Content
+// is rewritten, and the copies are returned in the original order.
+func CanonicalizeLinks(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) ([]wiki.Page, []string) {
+	titleToSlug := buildTitleMap(pages, inventory)
+
+	var allWarnings []string
+	out := make([]wiki.Page, len(pages))
+	for i, p := range pages {
+		newContent, warnings := canonicalizeContent(p.Content, titleToSlug)
+		p.Content = newContent
+		out[i] = p
+		allWarnings = append(allWarnings, warnings...)
+	}
+	return out, allWarnings
+}
+
+// buildTitleMap builds a lowercase-title → slug map from inventory and current batch.
+// Current batch entries take precedence over inventory (they may be updates).
+//
+// Entries with an empty title or slug are skipped so a link is never rewritten
+// to an empty target. Map iteration order is unspecified in Go, so when several
+// inventory entries share a title the lexicographically smallest slug is kept;
+// this keeps the result identical from run to run.
+func buildTitleMap(pages []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) map[string]string {
+	m := make(map[string]string)
+	for _, entries := range inventory {
+		for _, e := range entries {
+			if e.Title == "" || e.Slug == "" {
+				continue
+			}
+			key := strings.ToLower(e.Title)
+			if prev, seen := m[key]; !seen || e.Slug < prev {
+				m[key] = e.Slug
+			}
+		}
+	}
+	// Current batch overrides inventory
+	for _, p := range pages {
+		if p.Path == "" {
+			// filepath.Base("") is ".", which is not a usable slug.
+			continue
+		}
+		title := extractTitle(p.Content)
+		slug := strings.TrimSuffix(filepath.Base(p.Path), ".md")
+		if title != "" && slug != "" {
+			m[strings.ToLower(title)] = slug
+		}
+	}
+	return m
+}
+
+// canonicalizeContent rewrites every plain [[Display Name]] link in content
+// whose lowercased name is a key of titleToSlug to [[slug|Display Name]].
+// Links with no matching key are left untouched and reported, one warning
+// per occurrence, in the returned slice.
+func canonicalizeContent(content string, titleToSlug map[string]string) (string, []string) {
+	var warnings []string
+	result := plainLinkRE.ReplaceAllStringFunc(content, func(match string) string {
+		// Every match is "[[" + name + "]]", so the name can be sliced out
+		// directly instead of running the regexp a second time.
+		displayName := match[2 : len(match)-2]
+		slug, ok := titleToSlug[strings.ToLower(displayName)]
+		if !ok {
+			warnings = append(warnings, fmt.Sprintf("unknown wikilink: [[%s]]", displayName))
+			return match
+		}
+		return "[[" + slug + "|" + displayName + "]]"
+	})
+	return result, warnings
+}
diff --git a/ingestion/internal/pipeline/links_test.go b/ingestion/internal/pipeline/links_test.go
new file mode 100644
index 0000000..340ab38
--- /dev/null
+++ b/ingestion/internal/pipeline/links_test.go
@@ -0,0 +1,125 @@
+// ingestion/internal/pipeline/links_test.go
+package pipeline
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
+)
+
+func TestCanonicalizeLinks_KnownTitle(t *testing.T) {
+	pages := []wiki.Page{
+		{
+			Path:    "wiki/sources/shape-up.md",
+			Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Betting]].\n",
+		},
+	}
+	inventory := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeConcept: {
+			{Slug: "betting", Title: "Betting"},
+		},
+	}
+	got, warnings := CanonicalizeLinks(pages, inventory)
+	require.Len(t, got, 1)
+	assert.Empty(t, warnings)
+	assert.Contains(t, got[0].Content, "[[betting|Betting]]")
+	assert.NotContains(t, got[0].Content, "[[Betting]]")
+}
+
+func TestCanonicalizeLinks_UnknownTitleLeftAsIs(t *testing.T) {
+	pages := []wiki.Page{
+		{
+			Path:    "wiki/sources/shape-up.md",
+			Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Ghost Concept]].\n",
+		},
+	}
+	inventory := map[wiki.PageType][]wiki.Entry{}
+	got, warnings := CanonicalizeLinks(pages, inventory)
+	require.Len(t, got, 1)
+	assert.NotEmpty(t, warnings)
+	assert.Contains(t, got[0].Content, "[[Ghost Concept]]")
+}
+
+func TestCanonicalizeLinks_AlreadyCanonicalLinkUntouched(t *testing.T) {
+	// Links already in [[slug|Display]] format must not be double-converted
+	pages := []wiki.Page{
+		{
+			Path:    "wiki/sources/shape-up.md",
+			Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[betting|Betting]].\n",
+		},
+	}
+	inventory := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeConcept: {
+			{Slug: "betting", Title: "Betting"},
+		},
+	}
+	got, warnings := CanonicalizeLinks(pages, inventory)
+	require.Len(t, got, 1)
+	assert.Empty(t, warnings)
+	// Should remain exactly as-is — not double-wrapped
+	assert.Contains(t, got[0].Content, "[[betting|Betting]]")
+	assert.NotContains(t, got[0].Content, "[[betting|[[betting|Betting]]]]")
+}
+
+func TestCanonicalizeLinks_CaseInsensitiveMatch(t *testing.T) {
+	pages := []wiki.Page{
+		{
+			Path:    "wiki/sources/foo.md",
+			Content: "---\ntitle: 'Foo'\n---\n\n## Summary\n\nSee [[domain driven design]].\n",
+		},
+	}
+	inventory := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeConcept: {
+			{Slug: "domain-driven-design", Title: "Domain Driven Design"},
+		},
+	}
+	got, warnings := CanonicalizeLinks(pages, inventory)
+	require.Len(t, got, 1)
+	assert.Empty(t, warnings)
+	assert.Contains(t, got[0].Content, "[[domain-driven-design|domain driven design]]")
+}
+
+func TestCanonicalizeLinks_CurrentBatchPagesResolved(t *testing.T) {
+	// A concept created in the same batch should be canonicalizable
+	pages := []wiki.Page{
+		{
+			Path:    "wiki/sources/shape-up.md",
+			Content: "---\ntitle: 'Shape Up'\n---\n\n## Summary\n\nSee [[Betting]].\n",
+		},
+		{
+			Path:    "wiki/concepts/betting.md",
+			Content: "---\ntitle: 'Betting'\n---\n\n## Definition\n\nA technique.\n",
+		},
+	}
+	inventory := map[wiki.PageType][]wiki.Entry{} // empty — Betting is in the batch, not inventory
+
+	got, warnings := CanonicalizeLinks(pages, inventory)
+	require.Len(t, got, 2)
+	assert.Empty(t, warnings)
+	assert.Contains(t, got[0].Content, "[[betting|Betting]]")
+}
+
+func TestCanonicalizeLinks_MultipleLinksInOnePage(t *testing.T) {
+	pages := []wiki.Page{
+		{
+			Path:    "wiki/sources/foo.md",
+			Content: "---\ntitle: 'Foo'\n---\n\n## Summary\n\nSee [[Betting]] and [[Shape Up]].\n",
+		},
+	}
+	inventory := map[wiki.PageType][]wiki.Entry{
+		wiki.PageTypeConcept: {
+			{Slug: "betting", Title: "Betting"},
+		},
+		wiki.PageTypeSource: {
+			{Slug: "shape-up", Title: "Shape Up"},
+		},
+	}
+	got, warnings := CanonicalizeLinks(pages, inventory)
+	require.Len(t, got, 1)
+	assert.Empty(t, warnings)
+	assert.Contains(t, got[0].Content, "[[betting|Betting]]")
+	assert.Contains(t, got[0].Content, "[[shape-up|Shape Up]]")
+}