feat(ingestion): add wiki page merge logic
This commit is contained in:
120
ingestion/internal/wiki/merge.go
Normal file
120
ingestion/internal/wiki/merge.go
Normal file
@@ -0,0 +1,120 @@
|
||||
// ingestion/internal/wiki/merge.go
|
||||
package wiki
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Heading sets that select the merge strategy Merge applies when the same
// section heading appears in both pages. Headings in neither set keep the
// first page's version.
var (
	// bulletSections lists headings whose bodies are merged as a union of
	// unique lines.
	bulletSections = map[string]bool{
		"Chapters":                         true,
		"Concepts Introduced or Reinforced": true,
		"Entities Mentioned":               true,
		"Key Claims":                       true,
		"Related Concepts":                 true,
		"Related Entities":                 true,
		"Sources":                          true,
	}

	// appendSections lists headings whose bodies are concatenated, the
	// second page's text following the first's.
	appendSections = map[string]bool{
		"Evolving Notes":        true,
		"Open Questions":        true,
		"Open Questions Raised": true,
		"Updates":               true,
	}
)
|
||||
|
||||
// section is one level-two block of a wiki page.
//
// heading is the text that follows the "## " marker on the heading line.
// content is every line up to the next "## " heading (or end of input), each
// terminated by "\n".
type section struct {
	heading, content string
}
|
||||
|
||||
// Merge combines two Page values with the same path.
|
||||
// Frontmatter is taken from a. Sections are merged by strategy:
|
||||
// bullet sections union unique lines, append sections concatenate,
|
||||
// all others keep a's version. Sections in b not present in a are appended.
|
||||
func Merge(a, b Page) Page {
|
||||
fmA, secsA := parseSections(a.Content)
|
||||
_, secsB := parseSections(b.Content)
|
||||
|
||||
idx := make(map[string]int, len(secsA))
|
||||
for i, s := range secsA {
|
||||
idx[s.heading] = i
|
||||
}
|
||||
|
||||
for _, sB := range secsB {
|
||||
i, exists := idx[sB.heading]
|
||||
if !exists {
|
||||
idx[sB.heading] = len(secsA)
|
||||
secsA = append(secsA, sB)
|
||||
continue
|
||||
}
|
||||
sA := secsA[i]
|
||||
switch {
|
||||
case bulletSections[sB.heading]:
|
||||
secsA[i].content = mergeBullets(sA.content, sB.content)
|
||||
case appendSections[sB.heading]:
|
||||
secsA[i].content = strings.TrimRight(sA.content, "\n") + "\n\n" + strings.TrimLeft(sB.content, "\n")
|
||||
}
|
||||
}
|
||||
|
||||
return Page{Path: a.Path, Content: rebuildContent(fmA, secsA)}
|
||||
}
|
||||
|
||||
func parseSections(markdown string) (frontmatter string, sections []section) {
|
||||
lines := strings.Split(markdown, "\n")
|
||||
i := 0
|
||||
|
||||
if i < len(lines) && strings.TrimSpace(lines[i]) == "---" {
|
||||
i++
|
||||
var fmLines []string
|
||||
for i < len(lines) {
|
||||
if strings.TrimSpace(lines[i]) == "---" {
|
||||
i++
|
||||
break
|
||||
}
|
||||
fmLines = append(fmLines, lines[i])
|
||||
i++
|
||||
}
|
||||
frontmatter = fmt.Sprintf("---\n%s\n---\n", strings.Join(fmLines, "\n"))
|
||||
}
|
||||
|
||||
var cur *section
|
||||
for ; i < len(lines); i++ {
|
||||
line := lines[i]
|
||||
if strings.HasPrefix(line, "## ") {
|
||||
if cur != nil {
|
||||
sections = append(sections, *cur)
|
||||
}
|
||||
cur = §ion{heading: strings.TrimPrefix(line, "## ")}
|
||||
} else if cur != nil {
|
||||
cur.content += line + "\n"
|
||||
}
|
||||
}
|
||||
if cur != nil {
|
||||
sections = append(sections, *cur)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func rebuildContent(frontmatter string, sections []section) string {
|
||||
var sb strings.Builder
|
||||
sb.WriteString(frontmatter)
|
||||
for _, sec := range sections {
|
||||
fmt.Fprintf(&sb, "\n## %s\n\n%s", sec.heading, sec.content)
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// mergeBullets returns the union of the non-blank lines of a and b.
//
// Lines are compared after trimming surrounding whitespace; the first
// occurrence of each distinct line is kept with its original indentation, and
// order is preserved (a's lines first, then b's new ones). Blank lines are
// dropped. The result is newline-joined and always ends with "\n", including
// when no lines remain.
func mergeBullets(a, b string) string {
	seen := make(map[string]bool)
	var lines []string
	// Split each input on its own: concatenating first would fuse a's last
	// line with b's first whenever a lacks a trailing newline.
	for _, block := range [2]string{a, b} {
		for _, line := range strings.Split(block, "\n") {
			key := strings.TrimSpace(line)
			if key == "" || seen[key] {
				continue
			}
			seen[key] = true
			lines = append(lines, line)
		}
	}
	return strings.Join(lines, "\n") + "\n"
}
|
||||
Reference in New Issue
Block a user