feat(pipeline): add fuzzy entity resolution to prevent slug proliferation
This commit is contained in:
88
ingestion/internal/pipeline/resolve.go
Normal file
88
ingestion/internal/pipeline/resolve.go
Normal file
@@ -0,0 +1,88 @@
|
||||
// ingestion/internal/pipeline/resolve.go
|
||||
package pipeline
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
|
||||
)
|
||||
|
||||
// Resolve remaps proposed pages to existing slugs when a fuzzy title match is found.
|
||||
// It only matches within the same page type (entities→entities, concepts→concepts).
|
||||
// Pages with no inventory match are returned unchanged.
|
||||
func Resolve(proposed []wiki.Page, inventory map[wiki.PageType][]wiki.Entry) []wiki.Page {
|
||||
type key struct {
|
||||
pt wiki.PageType
|
||||
normalized string
|
||||
}
|
||||
lookup := make(map[key]string) // key → canonical slug
|
||||
for pt, entries := range inventory {
|
||||
for _, e := range entries {
|
||||
k := key{pt: pt, normalized: normalizeTitle(e.Title)}
|
||||
lookup[k] = e.Slug
|
||||
for _, alias := range e.Aliases {
|
||||
ak := key{pt: pt, normalized: normalizeTitle(alias)}
|
||||
if _, exists := lookup[ak]; !exists {
|
||||
lookup[ak] = e.Slug
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out := make([]wiki.Page, 0, len(proposed))
|
||||
for _, page := range proposed {
|
||||
pt := pageTypeFromPath(page.Path)
|
||||
title := extractTitle(page.Content)
|
||||
k := key{pt: pt, normalized: normalizeTitle(title)}
|
||||
if canonicalSlug, ok := lookup[k]; ok {
|
||||
dir := filepath.Dir(page.Path)
|
||||
page.Path = dir + "/" + canonicalSlug + ".md"
|
||||
}
|
||||
out = append(out, page)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// normalizeTitle produces a comparison key for a title: it lowercases the
// input, drops at most one leading article ("the", "a", "an"), turns hyphens
// into spaces, and collapses every run of whitespace to a single space.
//
//	"The Shape Up Method" → "shape up method"
//	"The A Team"          → "a team"
//	"Shape-Up"            → "shape up"
//
// Only a single article is removed so that distinct titles such as
// "The A Team" and "Team" do not normalize to the same key. A title that
// consists solely of an article (e.g. "The") is kept as-is.
func normalizeTitle(s string) string {
	words := strings.Fields(strings.ToLower(s))

	// The article check runs on whitespace-separated words before hyphens are
	// expanded, so a hyphenated form like "the-shape-up" keeps its first word.
	if len(words) > 1 {
		switch words[0] {
		case "the", "a", "an":
			words = words[1:]
		}
	}

	s = strings.ReplaceAll(strings.Join(words, " "), "-", " ")
	// Hyphen expansion can introduce empty or doubled gaps; collapse again.
	return strings.Join(strings.Fields(s), " ")
}
|
||||
|
||||
// pageTypeFromPath extracts the wiki.PageType from a path like "wiki/entities/foo.md".
|
||||
func pageTypeFromPath(path string) wiki.PageType {
|
||||
parts := strings.Split(filepath.ToSlash(path), "/")
|
||||
if len(parts) >= 2 {
|
||||
return wiki.PageType(parts[1])
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// extractTitle reads the title field from the YAML front matter in content.
//
// Only the first 30 lines are examined. The first line consisting solely of
// "---" opens the front matter and the next such line closes it. Inside the
// block, the first "title:" line wins; its value is trimmed of surrounding
// whitespace and of one matching pair of enclosing single or double quotes.
//
// It returns the empty string when no title is found within those lines.
func extractTitle(content string) string {
	const maxLines = 30

	inFM := false
	rest := content
	for i := 0; i < maxLines && rest != ""; i++ {
		// Peel off exactly one line per iteration so the unscanned remainder
		// of the document can never be mistaken for a single line.
		var line string
		line, rest, _ = strings.Cut(rest, "\n")

		if strings.TrimSpace(line) == "---" {
			if inFM {
				break // closing delimiter: front matter ended without a title
			}
			inFM = true
			continue
		}
		if !inFM {
			continue
		}

		key, val, ok := strings.Cut(line, ":")
		if !ok || strings.TrimSpace(key) != "title" {
			continue
		}

		val = strings.TrimSpace(val)
		// Strip only a balanced pair of quotes, so an apostrophe that belongs
		// to the title (e.g. "Writers'") is preserved.
		if n := len(val); n >= 2 && (val[0] == '"' || val[0] == '\'') && val[n-1] == val[0] {
			val = val[1 : n-1]
		}
		return val
	}
	return ""
}
|
||||
Reference in New Issue
Block a user