feat(ingestion): add wiki page merge logic
This commit is contained in:
120
ingestion/internal/wiki/merge.go
Normal file
120
ingestion/internal/wiki/merge.go
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
// ingestion/internal/wiki/merge.go
|
||||||
|
package wiki
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// bulletSections lists the section headings that Merge treats as bullet
// lists: when both pages contain one of these headings, their lines are
// combined with mergeBullets so that each distinct line appears once.
var bulletSections = map[string]bool{
	"Related Concepts":                  true,
	"Related Entities":                  true,
	"Sources":                           true,
	"Key Claims":                        true,
	"Entities Mentioned":                true,
	"Concepts Introduced or Reinforced": true,
	"Chapters":                          true,
}
|
||||||
|
|
||||||
|
// appendSections lists the section headings that Merge concatenates: when
// both pages contain one of these headings, the second page's content is
// appended after the first page's content.
var appendSections = map[string]bool{
	"Evolving Notes":        true,
	"Updates":               true,
	"Open Questions Raised": true,
	"Open Questions":        true,
}
|
||||||
|
|
||||||
|
// section is one level-2 ("## ") block of a wiki page.
type section struct {
	// heading is the text that follows the "## " marker.
	heading string
	// content is the text between this heading and the next one (or the end
	// of the page).
	content string
}
|
||||||
|
|
||||||
|
// Merge combines two Page values with the same path.
|
||||||
|
// Frontmatter is taken from a. Sections are merged by strategy:
|
||||||
|
// bullet sections union unique lines, append sections concatenate,
|
||||||
|
// all others keep a's version. Sections in b not present in a are appended.
|
||||||
|
func Merge(a, b Page) Page {
|
||||||
|
fmA, secsA := parseSections(a.Content)
|
||||||
|
_, secsB := parseSections(b.Content)
|
||||||
|
|
||||||
|
idx := make(map[string]int, len(secsA))
|
||||||
|
for i, s := range secsA {
|
||||||
|
idx[s.heading] = i
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, sB := range secsB {
|
||||||
|
i, exists := idx[sB.heading]
|
||||||
|
if !exists {
|
||||||
|
idx[sB.heading] = len(secsA)
|
||||||
|
secsA = append(secsA, sB)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sA := secsA[i]
|
||||||
|
switch {
|
||||||
|
case bulletSections[sB.heading]:
|
||||||
|
secsA[i].content = mergeBullets(sA.content, sB.content)
|
||||||
|
case appendSections[sB.heading]:
|
||||||
|
secsA[i].content = strings.TrimRight(sA.content, "\n") + "\n\n" + strings.TrimLeft(sB.content, "\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Page{Path: a.Path, Content: rebuildContent(fmA, secsA)}
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSections(markdown string) (frontmatter string, sections []section) {
|
||||||
|
lines := strings.Split(markdown, "\n")
|
||||||
|
i := 0
|
||||||
|
|
||||||
|
if i < len(lines) && strings.TrimSpace(lines[i]) == "---" {
|
||||||
|
i++
|
||||||
|
var fmLines []string
|
||||||
|
for i < len(lines) {
|
||||||
|
if strings.TrimSpace(lines[i]) == "---" {
|
||||||
|
i++
|
||||||
|
break
|
||||||
|
}
|
||||||
|
fmLines = append(fmLines, lines[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
frontmatter = fmt.Sprintf("---\n%s\n---\n", strings.Join(fmLines, "\n"))
|
||||||
|
}
|
||||||
|
|
||||||
|
var cur *section
|
||||||
|
for ; i < len(lines); i++ {
|
||||||
|
line := lines[i]
|
||||||
|
if strings.HasPrefix(line, "## ") {
|
||||||
|
if cur != nil {
|
||||||
|
sections = append(sections, *cur)
|
||||||
|
}
|
||||||
|
cur = §ion{heading: strings.TrimPrefix(line, "## ")}
|
||||||
|
} else if cur != nil {
|
||||||
|
cur.content += line + "\n"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cur != nil {
|
||||||
|
sections = append(sections, *cur)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func rebuildContent(frontmatter string, sections []section) string {
|
||||||
|
var sb strings.Builder
|
||||||
|
sb.WriteString(frontmatter)
|
||||||
|
for _, sec := range sections {
|
||||||
|
fmt.Fprintf(&sb, "\n## %s\n\n%s", sec.heading, sec.content)
|
||||||
|
}
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// mergeBullets returns the union of the non-blank lines of a and b.
//
// Lines are compared after trimming surrounding whitespace; the first
// occurrence of each distinct line is kept with its original indentation, in
// order of appearance (all of a, then b). Blank lines are dropped. The result
// ends with a single newline, or is empty when neither input has any text.
//
// a and b are split separately so that a missing trailing newline on a cannot
// fuse its last line with the first line of b.
func mergeBullets(a, b string) string {
	seen := make(map[string]bool)
	var lines []string
	for _, block := range [2]string{a, b} {
		for _, line := range strings.Split(block, "\n") {
			key := strings.TrimSpace(line)
			if key == "" || seen[key] {
				continue
			}
			seen[key] = true
			lines = append(lines, line)
		}
	}
	if len(lines) == 0 {
		return ""
	}
	return strings.Join(lines, "\n") + "\n"
}
|
||||||
55
ingestion/internal/wiki/merge_test.go
Normal file
55
ingestion/internal/wiki/merge_test.go
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
// ingestion/internal/wiki/merge_test.go
|
||||||
|
package wiki
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestMerge_BulletSectionsUnion(t *testing.T) {
|
||||||
|
a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Related Concepts\n\n- [[bar|Bar]]\n"}
|
||||||
|
b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Related Concepts\n\n- [[bar|Bar]]\n- [[baz|Baz]]\n"}
|
||||||
|
|
||||||
|
got := Merge(a, b)
|
||||||
|
assert.Contains(t, got.Content, "[[bar|Bar]]")
|
||||||
|
assert.Contains(t, got.Content, "[[baz|Baz]]")
|
||||||
|
assert.Equal(t, 1, strings.Count(got.Content, "[[bar|Bar]]"))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMerge_AppendSections(t *testing.T) {
|
||||||
|
a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Evolving Notes\n\nFirst note.\n"}
|
||||||
|
b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Evolving Notes\n\nSecond note.\n"}
|
||||||
|
|
||||||
|
got := Merge(a, b)
|
||||||
|
assert.Contains(t, got.Content, "First note.")
|
||||||
|
assert.Contains(t, got.Content, "Second note.")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMerge_KeepFirstForOtherSections(t *testing.T) {
|
||||||
|
a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst definition.\n"}
|
||||||
|
b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond definition.\n"}
|
||||||
|
|
||||||
|
got := Merge(a, b)
|
||||||
|
assert.Contains(t, got.Content, "First definition.")
|
||||||
|
assert.NotContains(t, got.Content, "Second definition.")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMerge_NewSectionFromB(t *testing.T) {
|
||||||
|
a := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nA thing.\n"}
|
||||||
|
b := Page{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Why It Matters\n\nBecause reasons.\n"}
|
||||||
|
|
||||||
|
got := Merge(a, b)
|
||||||
|
assert.Contains(t, got.Content, "A thing.")
|
||||||
|
assert.Contains(t, got.Content, "Because reasons.")
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMerge_KeepsFrontmatterFromA(t *testing.T) {
|
||||||
|
a := Page{Path: "p.md", Content: "---\ntitle: A\nlast_updated: 2026-01-01\n---\n\n## Definition\n\nA.\n"}
|
||||||
|
b := Page{Path: "p.md", Content: "---\ntitle: B\nlast_updated: 2026-06-01\n---\n\n## Definition\n\nB.\n"}
|
||||||
|
|
||||||
|
got := Merge(a, b)
|
||||||
|
assert.Contains(t, got.Content, "title: A")
|
||||||
|
assert.NotContains(t, got.Content, "title: B")
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user