Long markdown files (>~8KB) silently failed to embed because nomic-embed-text on iguana has a 2048-token context. embed sync logged errors=1 every cycle with no useful body until #37 added per-item logging — three files exceed the ceiling: finbert source (8 KB), koala-machine-state (7.1 KB), litellm-absorption (8.8 KB). Curated knowledge entries should never be vector-blind.

Approach: chunk-before-embed, no schema change.

vectorstore/chunk.go (new)
- ChunkMarkdown splits at H1/H2 boundaries; sections over maxBytes are further split at paragraph boundaries, packing greedily under budget.
- NumberChunks assigns "<parent>#NNNN" storage paths (1-based, zero-padded to 4 digits — handles files with up to ~10k sections in stable sort order).
- ParentPath strips the chunk suffix for retrieval-side dedup.

vectorstore/sync.go
- After ChunkMarkdown produces N pieces, each is embedded + upserted as a separate brain_embeddings row at "<parent>#NNNN". maxChunkBytes = 4000 (≈1000 nomic tokens, well under the 2048 ceiling with headroom for unicode/code blocks).
- "Already embedded?" check now reduces known paths to parent set via ParentPath, so the first chunk hit short-circuits the file.
- Delete walk also reduces via ParentPath; when a parent file disappears, every chunk row (and any pre-existing bare-path row, for backward compatibility with rows written before this change) gets dropped.

search/search.go
- hybridMerge collapses chunk-path vector hits to parent via ParentPath before scope check, RRF accumulation, and hydration. A file with three chunk hits returns one result row, not three.

Backward compatibility: pre-existing bare-path rows in brain_embeddings keep working — ParentPath returns them unchanged, knownParents handles them as if they were "wiki/foo.md#NNNN" hits, sync skips re-embed, and search dedup is a no-op for them. No migration required to ship.
Tests:
- chunk_test.go covers short / heading split / oversized section / content preservation / chunk numbering / parent-path stripping.
- sync_test.go adds long-file chunking, single-chunk-row short file, skip-if-any-chunk-known, delete-all-chunks-of-disappeared-file. Existing tests updated for #NNNN paths.
- search_test.go adds chunk-paths-dedupe-to-parent.

Closes gitea/mathias/infra#38.
220 lines
6.9 KiB
Go
220 lines
6.9 KiB
Go
package vectorstore_test
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// stubStore is an in-memory test double for the store argument of
// vectorstore.Sync. It records every write so tests can inspect them.
type stubStore struct {
	// known is the set of paths handed back by KnownPaths.
	known map[string]struct{}
	// upserts maps each successfully upserted path to the vector it received.
	upserts map[string][]float32
	// deletes lists the paths passed to Delete, in call order.
	deletes []string
	// failNext, when non-nil, is returned by the next Upsert call and then cleared.
	failNext error
}

// KnownPaths returns a fresh copy of s.known, so the caller may mutate the
// result without touching the stub's own set. The error is always nil.
func (s *stubStore) KnownPaths(context.Context) (map[string]struct{}, error) {
	snapshot := make(map[string]struct{}, len(s.known))
	for path := range s.known {
		snapshot[path] = struct{}{}
	}
	return snapshot, nil
}

// Upsert records v under path. If failNext is set, that error is consumed and
// returned instead, and nothing is recorded for this call.
func (s *stubStore) Upsert(_ context.Context, path string, v []float32) error {
	if injected := s.failNext; injected != nil {
		// One-shot failure: clear it so the following call succeeds.
		s.failNext = nil
		return injected
	}
	// Allocate lazily so a zero-value stubStore is usable.
	if s.upserts == nil {
		s.upserts = map[string][]float32{}
	}
	s.upserts[path] = v
	return nil
}

// Delete appends path to s.deletes and always reports success.
func (s *stubStore) Delete(_ context.Context, path string) error {
	s.deletes = append(s.deletes, path)
	return nil
}
|
|
|
|
// stubEmbedder is a canned embedder for tests: every call yields the same
// preconfigured vector and error.
type stubEmbedder struct {
	// vec is the vector returned by every Embed call.
	vec []float32
	// err is the error returned by every Embed call.
	err error
}

// Embed ignores its context and input text and returns the configured
// vec and err unchanged.
func (e stubEmbedder) Embed(context.Context, string) ([]float32, error) {
	return e.vec, e.err
}
|
|
|
|
func writeNote(t *testing.T, dir, rel, body string) {
|
|
t.Helper()
|
|
full := filepath.Join(dir, rel)
|
|
require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
|
|
require.NoError(t, os.WriteFile(full, []byte(body), 0o644))
|
|
}
|
|
|
|
func TestSync_AddsNewFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/jepa-fx/facts/x.md", "body of x")
|
|
writeNote(t, dir, "wiki/jepa-fx/facts/y.md", "body of y")
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 2, res.Added)
|
|
assert.Empty(t, res.Deleted)
|
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md#0001")
|
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md#0001")
|
|
}
|
|
|
|
func TestSync_SkipsAlreadyKnown(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
|
|
store := &stubStore{known: map[string]struct{}{"wiki/a/facts/x.md#0001": {}}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Empty(t, store.upserts)
|
|
}
|
|
|
|
func TestSync_DeletesDisappearedFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
|
// store has a path that doesn't exist on disk anymore
|
|
store := &stubStore{known: map[string]struct{}{"wiki/old/facts/ghost.md#0001": {}}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, &stubStoreWithDelete{stubStore: store}, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Deleted)
|
|
}
|
|
|
|
// stubStoreWithDelete is a thin wrapper to capture Delete calls;
|
|
// stubStore already implements Delete but we need the wrapper to mix
|
|
// store interfaces with sync-specific expectations.
|
|
type stubStoreWithDelete struct {
|
|
*stubStore
|
|
}
|
|
|
|
func TestSync_SkipsIndexFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/_index.md", "moc")
|
|
writeNote(t, dir, "wiki/a/facts/real.md", "body")
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Added)
|
|
assert.NotContains(t, store.upserts, "wiki/a/_index.md#0001")
|
|
}
|
|
|
|
func TestSync_ScansKnowledgeDir(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
writeNote(t, dir, "knowledge/2026-05-19-koala-gpu-setup.md", "knowledge body")
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 2, res.Added)
|
|
assert.Contains(t, store.upserts, "wiki/a/facts/x.md#0001")
|
|
assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md#0001")
|
|
}
|
|
|
|
func TestSync_ChunksLongFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
// Build a file that's well over the chunk byte budget. Multi-section
|
|
// markdown so the chunker has heading boundaries to cut on.
|
|
body := "# Doc\n\nintro line.\n\n"
|
|
for i := 0; i < 10; i++ {
|
|
body += "## Section " + string(rune('A'+i)) + "\n\n"
|
|
body += strings.Repeat("This section has a fair amount of content. ", 50) + "\n\n"
|
|
}
|
|
writeNote(t, dir, "knowledge/long.md", body)
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Greater(t, res.Added, 1, "long file should produce multiple chunk rows")
|
|
// Every upserted path for this file must be a chunk path.
|
|
chunkCount := 0
|
|
for p := range store.upserts {
|
|
if strings.HasPrefix(p, "knowledge/long.md#") {
|
|
chunkCount++
|
|
}
|
|
}
|
|
assert.Equal(t, res.Added, chunkCount, "all rows for long file should be chunk-suffixed")
|
|
// The bare parent path must NOT be upserted directly.
|
|
assert.NotContains(t, store.upserts, "knowledge/long.md")
|
|
}
|
|
|
|
func TestSync_ShortFileGetsSingleChunkRow(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/short.md", "tiny body\n")
|
|
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Added)
|
|
assert.Contains(t, store.upserts, "wiki/short.md#0001")
|
|
}
|
|
|
|
func TestSync_SkipsFileIfAnyChunkAlreadyKnown(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/foo.md", "body\n")
|
|
|
|
store := &stubStore{known: map[string]struct{}{
|
|
"wiki/foo.md#0001": {},
|
|
}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Empty(t, store.upserts)
|
|
}
|
|
|
|
func TestSync_DeletesAllChunksOfDisappearedFile(t *testing.T) {
|
|
dir := t.TempDir()
|
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
|
store := &stubStore{known: map[string]struct{}{
|
|
"wiki/ghost.md#0001": {},
|
|
"wiki/ghost.md#0002": {},
|
|
"wiki/ghost.md#0003": {},
|
|
}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 3, res.Deleted)
|
|
}
|
|
|
|
func TestSync_NoOpWhenComponentsNil(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
res, err := vectorstore.Sync(context.Background(), dir, nil, nil)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
}
|
|
|
|
func TestSync_CollectsEmbedderErrors(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
store := &stubStore{known: map[string]struct{}{}}
|
|
emb := stubEmbedder{err: errors.New("upstream down")}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Len(t, res.Errors, 1)
|
|
}
|