Removes the TODO in Sync that left files static after their first embed. Edits to brain/wiki/ and brain/knowledge/ now surface in subsequent syncs without manual /backfill-embeddings calls.

Approach
- Store interface: KnownPaths → KnownPathsWithTime, returning path → updated_at. Callers compare against file mtime to detect edits.
- PGStore: SELECT path, updated_at FROM brain_embeddings.
- Sync groups known chunks by parent path and tracks the EARLIEST updated_at per parent. A file is stale when its mtime is after that oldest chunk's timestamp — any chunk older than the file means at least one chunk hasn't been refreshed since the last edit.
- Stale-path rewrite: delete every old chunk for the parent (handles "file shrunk → fewer chunks → orphan rows at higher #NNNN" cleanly), then re-chunk + re-embed + re-upsert.

Tests
- New: TestSync_ReembedsFileWhenMtimeNewer — file mtime forced into the future vs. store updated_at; Sync deletes the old chunk and upserts a fresh one.
- New: TestSync_SkipsFileWhenMtimeOlder — file mtime backdated; Sync is a no-op (no upserts, no deletes).
- Updated: stubStore.known is now map[string]time.Time. A zero value resolves to a far-future sentinel so existing "skip if already known" tests keep passing without per-test setup.
- pg_test: renamed the KnownPaths integration test to KnownPathsWithTime; it asserts updated_at is non-zero and within 5s of the insert wall-clock time.

Backward compat
- brain_embeddings rows pre-dating this change carry valid updated_at values (the column was always populated via `DEFAULT now()` + ON CONFLICT `updated_at = now()`). No migration needed. The live pod will start re-embedding any file whose source has been edited since its chunks were originally written.

Closes gitea/mathias/hyperguild#23.
275 lines
8.9 KiB
Go
275 lines
8.9 KiB
Go
package vectorstore_test
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
)
|
|
|
|
// stubStore is an in-memory test double for the store passed to
// vectorstore.Sync. It serves a canned set of known chunk paths and
// records the Upsert and Delete calls it receives.
type stubStore struct {
	// known maps chunk path → updated_at. KnownPathsWithTime swaps a
	// zero time for farFuture, so tests that are not about
	// re-embed-on-mtime can leave the value empty and still hit the
	// Sync skip path. Tests that exercise the mtime comparison set an
	// explicit timestamp.
	known map[string]time.Time
	// upserts holds the vector most recently stored for each path.
	upserts map[string][]float32
	// deletes lists the paths handed to Delete, in call order.
	deletes []string
	// failNext, when non-nil, is returned by the next Upsert call and
	// then cleared.
	failNext error
}
|
|
|
|
// farFuture is the default updated_at substituted for stub entries that
// carry no explicit timestamp. It is fixed at package initialisation to
// one day ahead of the wall clock, which is meant to be newer than any
// file mtime, so tests that don't care about re-embed behavior take the
// skip path.
var farFuture = time.Now().Add(24 * time.Hour)
|
|
|
|
func (s *stubStore) KnownPathsWithTime(_ context.Context) (map[string]time.Time, error) {
|
|
out := make(map[string]time.Time, len(s.known))
|
|
for k, t := range s.known {
|
|
if t.IsZero() {
|
|
t = farFuture
|
|
}
|
|
out[k] = t
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (s *stubStore) Upsert(_ context.Context, path string, v []float32) error {
|
|
if s.failNext != nil {
|
|
err := s.failNext
|
|
s.failNext = nil
|
|
return err
|
|
}
|
|
if s.upserts == nil {
|
|
s.upserts = make(map[string][]float32)
|
|
}
|
|
s.upserts[path] = v
|
|
return nil
|
|
}
|
|
|
|
func (s *stubStore) Delete(_ context.Context, path string) error {
|
|
s.deletes = append(s.deletes, path)
|
|
return nil
|
|
}
|
|
|
|
// stubEmbedder is a canned embedder for tests: Embed returns vec and
// err unchanged for every input.
type stubEmbedder struct {
	// vec is the embedding returned by every Embed call.
	vec []float32
	// err is the error returned by every Embed call (nil for success).
	err error
}
|
|
|
|
func (e stubEmbedder) Embed(_ context.Context, _ string) ([]float32, error) {
|
|
return e.vec, e.err
|
|
}
|
|
|
|
func writeNote(t *testing.T, dir, rel, body string) {
|
|
t.Helper()
|
|
full := filepath.Join(dir, rel)
|
|
require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
|
|
require.NoError(t, os.WriteFile(full, []byte(body), 0o644))
|
|
}
|
|
|
|
func TestSync_AddsNewFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/jepa-fx/facts/x.md", "body of x")
|
|
writeNote(t, dir, "wiki/jepa-fx/facts/y.md", "body of y")
|
|
|
|
store := &stubStore{known: map[string]time.Time{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 2, res.Added)
|
|
assert.Empty(t, res.Deleted)
|
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/x.md#0001")
|
|
assert.Contains(t, store.upserts, "wiki/jepa-fx/facts/y.md#0001")
|
|
}
|
|
|
|
func TestSync_SkipsAlreadyKnown(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
|
|
store := &stubStore{known: map[string]time.Time{"wiki/a/facts/x.md#0001": {}}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Empty(t, store.upserts)
|
|
}
|
|
|
|
func TestSync_DeletesDisappearedFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
|
// store has a path that doesn't exist on disk anymore
|
|
store := &stubStore{known: map[string]time.Time{"wiki/old/facts/ghost.md#0001": {}}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, &stubStoreWithDelete{stubStore: store}, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Deleted)
|
|
}
|
|
|
|
// stubStoreWithDelete is a thin wrapper to capture Delete calls;
|
|
// stubStore already implements Delete but we need the wrapper to mix
|
|
// store interfaces with sync-specific expectations.
|
|
type stubStoreWithDelete struct {
|
|
*stubStore
|
|
}
|
|
|
|
func TestSync_SkipsIndexFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/_index.md", "moc")
|
|
writeNote(t, dir, "wiki/a/facts/real.md", "body")
|
|
|
|
store := &stubStore{known: map[string]time.Time{}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Added)
|
|
assert.NotContains(t, store.upserts, "wiki/a/_index.md#0001")
|
|
}
|
|
|
|
func TestSync_ScansKnowledgeDir(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
writeNote(t, dir, "knowledge/2026-05-19-koala-gpu-setup.md", "knowledge body")
|
|
|
|
store := &stubStore{known: map[string]time.Time{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 2, res.Added)
|
|
assert.Contains(t, store.upserts, "wiki/a/facts/x.md#0001")
|
|
assert.Contains(t, store.upserts, "knowledge/2026-05-19-koala-gpu-setup.md#0001")
|
|
}
|
|
|
|
func TestSync_ChunksLongFiles(t *testing.T) {
|
|
dir := t.TempDir()
|
|
// Build a file that's well over the chunk byte budget. Multi-section
|
|
// markdown so the chunker has heading boundaries to cut on.
|
|
body := "# Doc\n\nintro line.\n\n"
|
|
for i := 0; i < 10; i++ {
|
|
body += "## Section " + string(rune('A'+i)) + "\n\n"
|
|
body += strings.Repeat("This section has a fair amount of content. ", 50) + "\n\n"
|
|
}
|
|
writeNote(t, dir, "knowledge/long.md", body)
|
|
|
|
store := &stubStore{known: map[string]time.Time{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Greater(t, res.Added, 1, "long file should produce multiple chunk rows")
|
|
// Every upserted path for this file must be a chunk path.
|
|
chunkCount := 0
|
|
for p := range store.upserts {
|
|
if strings.HasPrefix(p, "knowledge/long.md#") {
|
|
chunkCount++
|
|
}
|
|
}
|
|
assert.Equal(t, res.Added, chunkCount, "all rows for long file should be chunk-suffixed")
|
|
// The bare parent path must NOT be upserted directly.
|
|
assert.NotContains(t, store.upserts, "knowledge/long.md")
|
|
}
|
|
|
|
func TestSync_ShortFileGetsSingleChunkRow(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/short.md", "tiny body\n")
|
|
|
|
store := &stubStore{known: map[string]time.Time{}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Added)
|
|
assert.Contains(t, store.upserts, "wiki/short.md#0001")
|
|
}
|
|
|
|
func TestSync_SkipsFileIfAnyChunkAlreadyKnown(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/foo.md", "body\n")
|
|
|
|
store := &stubStore{known: map[string]time.Time{
|
|
"wiki/foo.md#0001": {},
|
|
}}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Empty(t, store.upserts)
|
|
}
|
|
|
|
func TestSync_DeletesAllChunksOfDisappearedFile(t *testing.T) {
|
|
dir := t.TempDir()
|
|
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
|
store := &stubStore{known: map[string]time.Time{
|
|
"wiki/ghost.md#0001": {},
|
|
"wiki/ghost.md#0002": {},
|
|
"wiki/ghost.md#0003": {},
|
|
}}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, stubEmbedder{vec: make([]float32, 768)})
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 3, res.Deleted)
|
|
}
|
|
|
|
func TestSync_ReembedsFileWhenMtimeNewer(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/edited.md", "original body\n")
|
|
// Force the file's mtime ahead of any plausible store updated_at.
|
|
future := time.Now().Add(1 * time.Hour)
|
|
require.NoError(t, os.Chtimes(filepath.Join(dir, "wiki/edited.md"), future, future))
|
|
|
|
store := &stubStore{
|
|
known: map[string]time.Time{
|
|
// Existing chunk row pre-dates the file's mtime.
|
|
"wiki/edited.md#0001": time.Now().Add(-1 * time.Hour),
|
|
},
|
|
}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 1, res.Added, "file with newer mtime should be re-embedded")
|
|
assert.Contains(t, store.upserts, "wiki/edited.md#0001")
|
|
// Old chunks of the same parent must be deleted before re-embed so
|
|
// shrunk files don't leave orphan rows at higher #NNNN indexes.
|
|
assert.Contains(t, store.deletes, "wiki/edited.md#0001")
|
|
}
|
|
|
|
func TestSync_SkipsFileWhenMtimeOlder(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/stable.md", "body\n")
|
|
// Backdate mtime to before the store's recorded updated_at.
|
|
past := time.Now().Add(-2 * time.Hour)
|
|
require.NoError(t, os.Chtimes(filepath.Join(dir, "wiki/stable.md"), past, past))
|
|
|
|
store := &stubStore{
|
|
known: map[string]time.Time{
|
|
"wiki/stable.md#0001": time.Now(),
|
|
},
|
|
}
|
|
emb := stubEmbedder{vec: make([]float32, 768)}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Empty(t, store.upserts)
|
|
assert.Empty(t, store.deletes)
|
|
}
|
|
|
|
func TestSync_NoOpWhenComponentsNil(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
res, err := vectorstore.Sync(context.Background(), dir, nil, nil)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
}
|
|
|
|
func TestSync_CollectsEmbedderErrors(t *testing.T) {
|
|
dir := t.TempDir()
|
|
writeNote(t, dir, "wiki/a/facts/x.md", "x")
|
|
store := &stubStore{known: map[string]time.Time{}}
|
|
emb := stubEmbedder{err: errors.New("upstream down")}
|
|
res, err := vectorstore.Sync(context.Background(), dir, store, emb)
|
|
require.NoError(t, err)
|
|
assert.Equal(t, 0, res.Added)
|
|
assert.Len(t, res.Errors, 1)
|
|
}
|