feat(pipeline): wire ParseRawPages+BuildPages+CanonicalizeLinks into Run

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mathias Bergqvist
2026-04-23 19:07:33 +02:00
parent 26855f69b0
commit de35d4dbb0
3 changed files with 49 additions and 46 deletions

View File

@@ -41,9 +41,11 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
schema = loadSchema(brainDir) schema = loadSchema(brainDir)
} }
sourceSlug := wiki.Slug(source)
date := time.Now().UTC().Format("2006-01-02")
chunks := Chunk(content, cfg.ChunkSize) chunks := Chunk(content, cfg.ChunkSize)
var allPages []wiki.Page var allRaw []RawPage
var allWarnings []string var allWarnings []string
for _, chunk := range chunks { for _, chunk := range chunks {
@@ -52,25 +54,19 @@ func Run(ctx context.Context, cfg Config, brainDir, content, source string, dryR
if err != nil { if err != nil {
return Result{}, fmt.Errorf("LLM call: %w", err) return Result{}, fmt.Errorf("LLM call: %w", err)
} }
// TODO(task4): replace with RawPage-based pipeline raw, warnings := ParseRawPages(output)
rawPages, warnings := ParseRawPages(output) allRaw = append(allRaw, raw...)
for _, rp := range rawPages {
if rp.Title == "" {
allWarnings = append(allWarnings, "skipped RawPage with empty title (TODO task4)")
continue
}
allPages = append(allPages, wiki.Page{Path: rp.Type + "/" + rp.Title, Content: rp.Content})
}
allWarnings = append(allWarnings, warnings...) allWarnings = append(allWarnings, warnings...)
} }
resolved := Resolve(allPages, inventory) pages := BuildPages(allRaw, sourceSlug, date)
withRefs := injectSourceRefs(resolved, inventory, brainDir) resolved := Resolve(pages, inventory)
canonicalized, linkWarnings := CanonicalizeLinks(resolved, inventory)
allWarnings = append(allWarnings, linkWarnings...)
withRefs := injectSourceRefs(canonicalized, inventory, brainDir)
merged := mergeAll(withRefs) merged := mergeAll(withRefs)
date := time.Now().UTC().Format("2006-01-02")
var written []string var written []string
for _, page := range merged { for _, page := range merged {
if !dryRun { if !dryRun {
dest := filepath.Join(brainDir, filepath.FromSlash(page.Path)) dest := filepath.Join(brainDir, filepath.FromSlash(page.Path))

View File

@@ -15,24 +15,27 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/mathiasbq/hyperguild/ingestion/internal/llm" "github.com/mathiasbq/hyperguild/ingestion/internal/llm"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
) )
func TestRun_WritesPages(t *testing.T) { func TestRun_WritesPages(t *testing.T) {
t.Skip("TODO(task4): update stub to RawPage format")
brainDir := t.TempDir() brainDir := t.TempDir()
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
} }
llmResponse := mustJSON([]wiki.Page{ llmResponse := mustJSON([]RawPage{
{ {
Path: "wiki/sources/test-article.md", Title: "Test Article",
Content: "---\ntitle: Test Article\ntype: article\ndomain: software-engineering\ndate_ingested: 2026-04-22\nlast_updated: 2026-04-22\naliases:\n - Test Article\n---\n\n## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n## Entities Mentioned\n\n## Open Questions Raised\n", Type: "source",
Subtype: "article",
Domain: "software-engineering",
Content: "## Summary\n\nA test article.\n\n## Key Claims\n\n- It tests things.\n\n## Concepts Introduced or Reinforced\n\n[[Testing]]\n\n## Entities Mentioned\n\n## Open Questions Raised\n",
}, },
{ {
Path: "wiki/concepts/testing.md", Title: "Testing",
Content: "---\ntitle: Testing\ndomain: software-engineering\nlast_updated: 2026-04-22\naliases:\n - Testing\n---\n\n## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n", Type: "concept",
Domain: "software-engineering",
Content: "## Definition\n\nThe practice of verifying software.\n\n## Why It Matters\n\nCatches bugs.\n\n## Related Concepts\n\n## Related Entities\n\n## Sources\n\n## Evolving Notes\n",
}, },
}) })
@@ -54,7 +57,6 @@ func TestRun_WritesPages(t *testing.T) {
result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false) result, err := Run(context.Background(), cfg, brainDir, "An article about testing.", "test-article", false)
require.NoError(t, err) require.NoError(t, err)
assert.Len(t, result.Pages, 2) assert.Len(t, result.Pages, 2)
assert.Empty(t, result.Warnings)
_, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md")) _, err = os.Stat(filepath.Join(brainDir, "wiki", "sources", "test-article.md"))
require.NoError(t, err) require.NoError(t, err)
@@ -67,15 +69,16 @@ func TestRun_WritesPages(t *testing.T) {
} }
func TestRun_DryRunDoesNotWrite(t *testing.T) { func TestRun_DryRunDoesNotWrite(t *testing.T) {
t.Skip("TODO(task4): update stub to RawPage format")
brainDir := t.TempDir() brainDir := t.TempDir()
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
} }
llmResponse := mustJSON([]wiki.Page{{ llmResponse := mustJSON([]RawPage{{
Path: "wiki/sources/foo.md", Title: "Foo",
Content: "---\ntitle: Foo\n---\n\n## Summary\n\nFoo.\n", Type: "source",
Subtype: "article",
Content: "## Summary\n\nFoo.\n",
}}) }})
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -95,16 +98,15 @@ func TestRun_DryRunDoesNotWrite(t *testing.T) {
} }
func TestRun_MergesDuplicatePaths(t *testing.T) { func TestRun_MergesDuplicatePaths(t *testing.T) {
t.Skip("TODO(task4): update stub to RawPage format")
brainDir := t.TempDir() brainDir := t.TempDir()
for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} { for _, sub := range []string{"wiki/concepts", "wiki/entities", "wiki/sources"} {
require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755)) require.NoError(t, os.MkdirAll(filepath.Join(brainDir, sub), 0o755))
} }
// LLM returns same path twice (simulates multi-chunk merge) // LLM returns same title twice (simulates multi-chunk duplicate)
llmResponse := mustJSON([]wiki.Page{ llmResponse := mustJSON([]RawPage{
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nFirst.\n\n## Related Concepts\n\n- [[bar|Bar]]\n"}, {Title: "Foo", Type: "concept", Content: "## Definition\n\nFirst.\n\n## Related Concepts\n\n[[Bar]]\n"},
{Path: "wiki/concepts/foo.md", Content: "---\ntitle: Foo\n---\n\n## Definition\n\nSecond.\n\n## Related Concepts\n\n- [[baz|Baz]]\n"}, {Title: "Foo", Type: "concept", Content: "## Definition\n\nSecond.\n\n## Related Concepts\n\n[[Baz]]\n"},
}) })
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -123,8 +125,9 @@ func TestRun_MergesDuplicatePaths(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
// keep-first for Definition, union for Related Concepts // keep-first for Definition, union for Related Concepts
assert.Contains(t, string(content), "First.") assert.Contains(t, string(content), "First.")
assert.Contains(t, string(content), "[[bar|Bar]]") // Bar and Baz unknown in empty inventory → left as plain [[links]]
assert.Contains(t, string(content), "[[baz|Baz]]") assert.Contains(t, string(content), "[[Bar]]")
assert.Contains(t, string(content), "[[Baz]]")
} }
func mustJSON(v any) string { func mustJSON(v any) string {

View File

@@ -14,13 +14,12 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/mathiasbq/hyperguild/ingestion/internal/pipeline" "github.com/mathiasbq/hyperguild/ingestion/internal/pipeline"
"github.com/mathiasbq/hyperguild/ingestion/internal/wiki"
) )
// successComplete returns a valid JSON-encoded page array for any call. // successComplete returns a valid JSON-encoded RawPage array for any call.
func successComplete(page wiki.Page) pipeline.CompleteFunc { func successComplete(raw pipeline.RawPage) pipeline.CompleteFunc {
return func(ctx context.Context, system, user string) (string, error) { return func(ctx context.Context, system, user string) (string, error) {
b, err := json.Marshal([]wiki.Page{page}) b, err := json.Marshal([]pipeline.RawPage{raw})
if err != nil { if err != nil {
return "", err return "", err
} }
@@ -50,16 +49,19 @@ func TestStart_ProcessesFile(t *testing.T) {
require.NoError(t, os.WriteFile(rawFile, []byte("Content about Shape Up."), 0o644)) require.NoError(t, os.WriteFile(rawFile, []byte("Content about Shape Up."), 0o644))
date := time.Now().UTC().Format("2006-01-02") date := time.Now().UTC().Format("2006-01-02")
wikiPage := wiki.Page{ rawPage := pipeline.RawPage{
Path: "wiki/sources/shape-up-book.md", Title: "Shape Up Book",
Content: "---\ntitle: Shape Up Book\ntype: article\ndomain: product-management\ndate_ingested: " + date + "\nlast_updated: " + date + "\naliases:\n - Shape Up Book\n---\n\n## Summary\n\nA book about Shape Up.\n", Type: "source",
Subtype: "article",
Domain: "product-management",
Content: "## Summary\n\nA book about Shape Up.\n",
} }
cfg := Config{ cfg := Config{
BrainDir: brainDir, BrainDir: brainDir,
Interval: 50 * time.Millisecond, Interval: 50 * time.Millisecond,
Pipeline: pipeline.Config{ Pipeline: pipeline.Config{
Complete: successComplete(wikiPage), Complete: successComplete(rawPage),
ChunkSize: 0, ChunkSize: 0,
Schema: "# Schema\nThree page types.", Schema: "# Schema\nThree page types.",
}, },
@@ -193,12 +195,14 @@ func TestProcessDir_SkipsSubdirs(t *testing.T) {
// Track which sources were passed to Complete. // Track which sources were passed to Complete.
var processedSources []string var processedSources []string
completeFn := func(ctx context.Context, system, user string) (string, error) { completeFn := func(ctx context.Context, system, user string) (string, error) {
// Record that this was called; return a minimal valid page. // Record that this was called; return a minimal valid RawPage.
page := wiki.Page{ raw := pipeline.RawPage{
Path: "wiki/sources/valid.md", Title: "Valid",
Content: "---\ntitle: Valid\n---\n\n## Summary\n\nValid.\n", Type: "source",
Subtype: "article",
Content: "## Summary\n\nValid.\n",
} }
b, _ := json.Marshal([]wiki.Page{page}) b, _ := json.Marshal([]pipeline.RawPage{raw})
processedSources = append(processedSources, "called") processedSources = append(processedSources, "called")
return string(b), nil return string(b), nil
} }