fix(ingestion): embed sync also scans brain/knowledge/ + logs per-item errors

The embed sync goroutine only walked brain/wiki/. brain/knowledge/ (112 curated entries; per CLAUDE.md, the most important brain content) had zero coverage in brain_embeddings — vector retrieval was blind to it. Hybrid BM25 + pgvector retrieval would never surface a curated knowledge entry via the vector arm. Extract the per-root walk into a loop over a small subdir list and add "knowledge" alongside "wiki". scanDirs is package-level so it stays a single source of truth for what gets embedded. Also log each failing item's path + error string from StartSync. Previously only the aggregate count was logged, so a persistent `errors=1` per cycle was opaque. With per-item warnings, the actual ollama "input length exceeds the context length" error surfaces immediately. Refs gitea/mathias/infra#37 (this commit covers the knowledge/ scan bug; the long-file chunking bug is a separate change).
2026-05-19 21:27:15 +02:00
parent 4af1036423
commit 078ec029da
2 changed files with 67 additions and 42 deletions
--- a/ingestion/internal/vectorstore/sync_test.go
+++ b/ingestion/internal/vectorstore/sync_test.go
@@ -117,6 +117,20 @@ func TestSync_SkipsIndexFiles(t *testing.T) {
 	assert.NotContains(t, store.upserts, "wiki/a/_index.md")
 }

+func TestSync_ScansKnowledgeDir(t *testing.T) { // Sync must pick up notes under knowledge/ as well as wiki/.
+	dir := t.TempDir()
+	writeNote(t, dir, "wiki/a/facts/x.md", "x") // one note under the wiki/ root
+	writeNote(t, dir, "knowledge/2026-05-19-koala-gpu-setup.md", "knowledge body") // one note under the knowledge/ root
+
+	store := &stubStore{known: map[string]struct{}{}} // starts with an empty known set
+	emb := stubEmbedder{vec: make([]float32, 768)} // NOTE(review): 768 presumably matches the embedding dimension — confirm
+	res, err := vectorstore.Sync(context.Background(), dir, store, emb)
+	require.NoError(t, err)
+	assert.Equal(t, 2, res.Added) // both notes written above: the wiki one and the knowledge one
+	assert.Contains(t, store.upserts, "wiki/a/facts/x.md")
+	assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md") // upsert key carries the knowledge/ prefix
+}
+
 func TestSync_NoOpWhenComponentsNil(t *testing.T) {
 	dir := t.TempDir()
 	writeNote(t, dir, "wiki/a/facts/x.md", "x")