feat(pipeline): add fuzzy entity resolution to prevent slug proliferation
This commit is contained in:
88
ingestion/internal/pipeline/resolve.go
Normal file
88
ingestion/internal/pipeline/resolve.go
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
// ingestion/internal/pipeline/resolve.go
|
||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Resolve remaps proposed pages to existing slugs when a fuzzy title match is found.
|
||||||
|
// It only matches within the same page type (entities→entities, concepts→concepts).
|
||||||
|
// Pages with no inventory match are returned unchanged.
|
||||||
|
func Resolve(proposed []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) []wiki.Page {
|
||||||
|
type key struct {
|
||||||
|
pt wiki.PageType
|
||||||
|
normalized string
|
||||||
|
}
|
||||||
|
lookup := make(map[key]string) // key → canonical slug
|
||||||
|
for pt, entries := range inventory {
|
||||||
|
for _, e := range entries {
|
||||||
|
k := key{pt: pt, normalized: normalizeTitle(e.Title)}
|
||||||
|
lookup[k] = e.Slug
|
||||||
|
for _, alias := range e.Aliases {
|
||||||
|
ak := key{pt: pt, normalized: normalizeTitle(alias)}
|
||||||
|
if _, exists := lookup[ak]; !exists {
|
||||||
|
lookup[ak] = e.Slug
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]wiki.Page, 0, len(proposed))
|
||||||
|
for _, page := range proposed {
|
||||||
|
pt := pageTypeFromPath(page.Path)
|
||||||
|
title := extractTitle(page.Content)
|
||||||
|
k := key{pt: pt, normalized: normalizeTitle(title)}
|
||||||
|
if canonicalSlug, ok := lookup[k]; ok {
|
||||||
|
dir := filepath.Dir(page.Path)
|
||||||
|
page.Path = dir + "/" + canonicalSlug + ".md"
|
||||||
|
}
|
||||||
|
out = append(out, page)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeTitle produces the comparison form of a title: lowercased, with at
// most one leading English article ("the", "a", "an") removed, hyphens treated
// as word separators, and all runs of whitespace collapsed to a single space.
//
//	"The Shape Up Method" → "shape up method"
//	"Shape-Up"            → "shape up"
//	"The A Team"          → "a team"
//
// A title that consists only of an article (e.g. "The") is left as is. The
// result may be empty when s contains no letters other than whitespace and
// hyphens.
func normalizeTitle(s string) string {
	// Lowercase and collapse whitespace first so the article check below sees
	// exactly one plain space after the article, whatever the original spacing
	// (tabs, double spaces, leading blanks) was.
	s = strings.Join(strings.Fields(strings.ToLower(s)), " ")

	// Strip a single leading article only. Stripping repeatedly would turn
	// "The A Team" into "team" and make it collide with an unrelated page.
	for _, article := range []string{"the ", "a ", "an "} {
		if strings.HasPrefix(s, article) {
			s = s[len(article):]
			break
		}
	}

	// Hyphens become spaces after the article check so that a hyphenated first
	// word such as "A-Team" is not mistaken for an article. Re-collapse because
	// " - " turns into several spaces.
	s = strings.ReplaceAll(s, "-", " ")
	return strings.Join(strings.Fields(s), " ")
}
|
||||||
|
|
||||||
|
// pageTypeFromPath extracts the wiki.PageType from a path like "wiki/entities/foo.md".
|
||||||
|
func pageTypeFromPath(path string) wiki.PageType {
|
||||||
|
parts := strings.Split(filepath.ToSlash(path), "/")
|
||||||
|
if len(parts) >= 2 {
|
||||||
|
return wiki.PageType(parts[1])
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractTitle reads the "title" field from the YAML frontmatter of content.
//
// Frontmatter is the region between the first two lines consisting solely of
// "---" (surrounding whitespace ignored). Only the first 30 lines of content
// are examined. The value is returned with surrounding whitespace removed and
// with one pair of matching single or double quotes stripped; quotes that are
// part of the title text (e.g. The "Big Bang") are preserved. This is a
// line-based scan, not a YAML parser: escapes, inline comments and multi-line
// scalars are not interpreted.
//
// It returns the empty string when no title is found.
func extractTitle(content string) string {
	// Frontmatter sits at the top of the document, so bound the scan.
	const maxLines = 30

	inFM := false
	rest := content
	for i := 0; i < maxLines && rest != ""; i++ {
		// Peel off exactly one line per iteration so the final examined line is
		// a real line, never "everything that is left of the document".
		var line string
		line, rest, _ = strings.Cut(rest, "\n")

		if strings.TrimSpace(line) == "---" {
			if inFM {
				break // closing delimiter: frontmatter ended without a title
			}
			inFM = true
			continue
		}
		if !inFM {
			continue
		}

		key, val, ok := strings.Cut(line, ":")
		if !ok || strings.TrimSpace(key) != "title" {
			continue
		}

		val = strings.TrimSpace(val) // also drops a trailing "\r" from CRLF input
		// Unquote only a matching outer pair; trimming every quote character
		// would eat quotes that belong to the title itself.
		if n := len(val); n >= 2 && (val[0] == '"' || val[0] == '\'') && val[n-1] == val[0] {
			val = val[1 : n-1]
		}
		return val
	}
	return ""
}
|
||||||
90
ingestion/internal/pipeline/resolve_test.go
Normal file
90
ingestion/internal/pipeline/resolve_test.go
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
// ingestion/internal/pipeline/resolve_test.go
|
||||||
|
package pipeline
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
|
||||||
|
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestResolve_NoMatch(t *testing.T) {
|
||||||
|
proposed := []wiki.Page{
|
||||||
|
{Path: "wiki/entities/new-person.md", Content: "---\ntitle: New Person\n---\n"},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeEntity: {
|
||||||
|
{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer"}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got := Resolve(proposed, inventory)
|
||||||
|
assert.Len(t, got, 1)
|
||||||
|
assert.Equal(t, "wiki/entities/new-person.md", got[0].Path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolve_TitleMatchRedirectsSlug(t *testing.T) {
|
||||||
|
proposed := []wiki.Page{
|
||||||
|
{Path: "wiki/entities/ryan-singer-the-designer.md", Content: "---\ntitle: Ryan Singer\n---\n"},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeEntity: {
|
||||||
|
{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got := Resolve(proposed, inventory)
|
||||||
|
assert.Len(t, got, 1)
|
||||||
|
assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolve_AliasMatchRedirectsSlug(t *testing.T) {
|
||||||
|
proposed := []wiki.Page{
|
||||||
|
{Path: "wiki/entities/singer.md", Content: "---\ntitle: Singer\n---\n"},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeEntity: {
|
||||||
|
{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: []string{"Singer", "R. Singer"}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got := Resolve(proposed, inventory)
|
||||||
|
assert.Len(t, got, 1)
|
||||||
|
assert.Equal(t, "wiki/entities/ryan-singer.md", got[0].Path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolve_NormalizationCaseAndArticles(t *testing.T) {
|
||||||
|
proposed := []wiki.Page{
|
||||||
|
{Path: "wiki/concepts/the-shape-up-method.md", Content: "---\ntitle: The Shape Up Method\n---\n"},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeConcept: {
|
||||||
|
{Slug: "shape-up-method", Title: "Shape Up Method", Aliases: nil},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
got := Resolve(proposed, inventory)
|
||||||
|
assert.Len(t, got, 1)
|
||||||
|
assert.Equal(t, "wiki/concepts/shape-up-method.md", got[0].Path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolve_OnlyMatchesSamePageType(t *testing.T) {
|
||||||
|
proposed := []wiki.Page{
|
||||||
|
{Path: "wiki/concepts/ryan-singer.md", Content: "---\ntitle: Ryan Singer\n---\n"},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{
|
||||||
|
wiki.PageTypeEntity: {
|
||||||
|
{Slug: "ryan-singer", Title: "Ryan Singer", Aliases: nil},
|
||||||
|
},
|
||||||
|
wiki.PageTypeConcept: {},
|
||||||
|
}
|
||||||
|
got := Resolve(proposed, inventory)
|
||||||
|
assert.Len(t, got, 1)
|
||||||
|
assert.Equal(t, "wiki/concepts/ryan-singer.md", got[0].Path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResolve_EmptyInventory(t *testing.T) {
|
||||||
|
proposed := []wiki.Page{
|
||||||
|
{Path: "wiki/entities/first.md", Content: "---\ntitle: First\n---\n"},
|
||||||
|
}
|
||||||
|
inventory := map[wiki.PageType][]wiki.Entry{}
|
||||||
|
got := Resolve(proposed, inventory)
|
||||||
|
assert.Equal(t, proposed, got)
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user