From e9b5cc401cbd64c850bf53459434dc7f63220df8 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Thu, 23 Apr 2026 15:59:36 +0200 Subject: [PATCH] feat(pipeline): add fuzzy entity resolution to prevent slug proliferation --- ingestion/internal/pipeline/resolve.go | 88 ++++++++++++++++++++ ingestion/internal/pipeline/resolve_test.go | 90 +++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 ingestion/internal/pipeline/resolve.go create mode 100644 ingestion/internal/pipeline/resolve_test.go diff --git a/ingestion/internal/pipeline/resolve.go b/ingestion/internal/pipeline/resolve.go new file mode 100644 index 0000000..df08249 --- /dev/null +++ b/ingestion/internal/pipeline/resolve.go @@ -0,0 +1,88 @@ +// ingestion/internal/pipeline/resolve.go +package pipeline + +import ( + "path/filepath" + "strings" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +// Resolve remaps proposed pages to existing slugs when a fuzzy title match is found. +// It only matches within the same page type (entities→entities, concepts→concepts). +// Pages with no inventory match are returned unchanged. +func Resolve(proposed []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) []wiki.Page { + type key struct { + pt wiki.PageType + normalized string + } + lookup := make(map[key]string) // key → canonical slug + for pt, entries := range inventory { + for _, e := range entries { + k := key{pt: pt, normalized: normalizeTitle(e.Title)} + lookup[k] = e.Slug + for _, alias := range e.Aliases { + ak := key{pt: pt, normalized: normalizeTitle(alias)} + if _, exists := lookup[ak]; !exists { + lookup[ak] = e.Slug + } + } + } + } + + out := make([]wiki.Page, 0, len(proposed)) + for _, page := range proposed { + pt := pageTypeFromPath(page.Path) + title := extractTitle(page.Content) + k := key{pt: pt, normalized: normalizeTitle(title)} + if canonicalSlug, ok := lookup[k]; ok { + dir := filepath.Dir(page.Path) + page.Path = dir + "/" + canonicalSlug + ".md" + } + out = append(out, page) + } + return out +} + +// normalizeTitle lowercases, removes leading articles, collapses whitespace. +// "The Shape Up Method" → "shape up method" +func normalizeTitle(s string) string { + s = strings.ToLower(strings.TrimSpace(s)) + for _, article := range []string{"the ", "a ", "an "} { + s = strings.TrimPrefix(s, article) + } + s = strings.ReplaceAll(s, "-", " ") + return strings.Join(strings.Fields(s), " ") +} + +// pageTypeFromPath extracts the wiki.PageType from a path like "wiki/entities/foo.md". +func pageTypeFromPath(path string) wiki.PageType { + parts := strings.Split(filepath.ToSlash(path), "/") + if len(parts) >= 2 { + return wiki.PageType(parts[1]) + } + return "" +} + +// extractTitle reads the title field from YAML frontmatter in content. +// Falls back to empty string if not found. +func extractTitle(content string) string { + lines := strings.SplitN(content, "\n", 30) + inFM := false + for _, line := range lines { + if strings.TrimSpace(line) == "---" { + if !inFM { + inFM = true + continue + } + break + } + if inFM { + key, val, ok := strings.Cut(line, ":") + if ok && strings.TrimSpace(key) == "title" { + return strings.Trim(strings.TrimSpace(val), `"'`) + } + } + } + return "" +} diff --git a/ingestion/internal/pipeline/resolve_test.go b/ingestion/internal/pipeline/resolve_test.go new file mode 100644 index 0000000..19b66e5 --- /dev/null +++ b/ingestion/internal/pipeline/resolve_test.go @@ -0,0 +1,90 @@ +// ingestion/internal/pipeline/resolve_test.go +package pipeline + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/mathiasbq/hyperguild/ingestion/internal/wiki" +) + +func TestResolve_NoMatch(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/new-person.md", Content: "---\ntitle: New Person\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer"}}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/new-person.md", got[0].Path) +} + +func TestResolve_TitleMatchRedirectsSlug(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/ryan-singer-the-designer.md", Content: "---\ntitle: Ryan Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path) +} + +func TestResolve_AliasMatchRedirectsSlug(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/singer.md", Content: "---\ntitle: Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer", "R. Singer"}}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path) +} + +func TestResolve_NormalizationCaseAndArticles(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/concepts/the-shape-up-method.md", Content: "---\ntitle: The Shape Up Method\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeConcept: { + {Slug: "shape-up-method", Title: "Shape Up Method", Aliases: nil}, + }, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/concepts/shape-up-method.md", got[0].Path) +} + +func TestResolve_OnlyMatchesSamePageType(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/concepts/ryan-singer.md", Content: "---\ntitle: Ryan Singer\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{ + wiki.PageTypeEntity: { + {Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil}, + }, + wiki.PageTypeConcept: {}, + } + got := Resolve(proposed, inventory) + assert.Len(t, got, 1) + assert.Equal(t, "wiki/concepts/ryan-singer.md", got[0].Path) +} + +func TestResolve_EmptyInventory(t *testing.T) { + proposed := []wiki.Page{ + {Path: "wiki/entities/first.md", Content: "---\ntitle: First\n---\n"}, + } + inventory := map[wiki.PageType][]wiki.Entry{} + got := Resolve(proposed, inventory) + assert.Equal(t, proposed, got) +}