// ingestion/internal/search/search_test.go package search_test import ( "context" "fmt" "os" "path/filepath" "strings" "testing" "github.com/mathiasbq/hyperguild/ingestion/internal/search" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) type stubEmbedder struct{ vec []float32 } func (s stubEmbedder) Embed(_ context.Context, _ string) ([]float32, error) { return s.vec, nil } type stubVector struct{ hits []search.VectorHit } func (s stubVector) Search(_ context.Context, _ []float32, _ int) ([]search.VectorHit, error) { return s.hits, nil } func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) { dir := t.TempDir() for _, p := range []struct{ rel, body string }{ // BM25-keyword note (matches "lejpa" once) {"wiki/jepa-fx/facts/foo.md", "---\ntitle: Foo\n---\nlejpa keyword\n"}, // Semantically related note that does NOT contain the keyword. {"wiki/jepa-fx/facts/semantic.md", "---\ntitle: Semantic\n---\nNo keyword in body.\n"}, } { full := filepath.Join(dir, p.rel) require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755)) require.NoError(t, os.WriteFile(full, []byte(p.body), 0o644)) } embedder := stubEmbedder{vec: []float32{0.1}} vector := stubVector{hits: []search.VectorHit{ {Path: "wiki/jepa-fx/facts/semantic.md", Distance: 0.05}, // best vector match {Path: "wiki/jepa-fx/facts/foo.md", Distance: 0.10}, }} got, err := search.Query(dir, search.QueryOptions{ Query: "lejpa", Limit: 5, Vector: vector, Embedder: embedder, }) require.NoError(t, err) require.Len(t, got, 2, "vector-only hit should be hydrated into results") paths := []string{got[0].Path, got[1].Path} assert.Contains(t, paths, "wiki/jepa-fx/facts/foo.md") assert.Contains(t, paths, "wiki/jepa-fx/facts/semantic.md") } func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) { dir := t.TempDir() full := filepath.Join(dir, "knowledge", "long.md") require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755)) // Body contains the BM25 keyword "alpaca" so hybridMerge actually runs // (it only kicks in when BM25 returns at least one candidate). require.NoError(t, os.WriteFile(full, []byte("---\ntitle: Long\n---\nalpaca content.\n"), 0o644)) embedder := stubEmbedder{vec: []float32{0.1}} // Vector store returns three chunk-path hits all pointing at the same // parent file. The merged result must surface ONE row per parent — not // three rows with chunk-suffixed paths. vector := stubVector{hits: []search.VectorHit{ {Path: "knowledge/long.md#0001", Distance: 0.05}, {Path: "knowledge/long.md#0002", Distance: 0.07}, {Path: "knowledge/long.md#0003", Distance: 0.09}, }} got, err := search.Query(dir, search.QueryOptions{ Query: "alpaca", Limit: 5, Vector: vector, Embedder: embedder, }) require.NoError(t, err) require.Len(t, got, 1, "three chunk hits for one parent must merge to one result") assert.Equal(t, "knowledge/long.md", got[0].Path) assert.Equal(t, "Long", got[0].Title) } func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) { dir := t.TempDir() require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755)) require.NoError(t, os.WriteFile(filepath.Join(dir, "wiki", "x.md"), []byte("keyword foo"), 0o644)) embedder := errorEmbedder{} vector := stubVector{} got, err := search.Query(dir, search.QueryOptions{ Query: "keyword", Limit: 5, Vector: vector, Embedder: embedder, }) require.NoError(t, err) require.Len(t, got, 1, "BM25 result should still come back when embedder fails") assert.Equal(t, "wiki/x.md", got[0].Path) } type errorEmbedder struct{} func (errorEmbedder) Embed(_ context.Context, _ string) ([]float32, error) { return nil, assert.AnError } func TestSearch_ReturnsMatchingPages(t *testing.T) { dir := t.TempDir() require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755)) require.NoError(t, os.WriteFile( filepath.Join(dir, "knowledge", "retry-logic.md"), []byte("---\ntitle: Retry Logic\ndomain: software\n---\n\nRetry logic handles transient failures by re-attempting operations.\n"), 0o644, )) require.NoError(t, os.WriteFile( filepath.Join(dir, "knowledge", "database.md"), []byte("---\ntitle: Database\ndomain: software\n---\n\nA database stores structured data.\n"), 0o644, )) results, err := search.Query(dir, search.QueryOptions{Query: "retry transient", Limit: 5}) require.NoError(t, err) require.Len(t, results, 1) assert.Equal(t, "knowledge/retry-logic.md", results[0].Path) assert.Equal(t, "Retry Logic", results[0].Title) assert.Greater(t, results[0].Score, 0) assert.Contains(t, results[0].Excerpt, "Retry") } func TestSearch_TierWeightingReordersResults(t *testing.T) { dir := t.TempDir() // A long note-tier dump mentions the keyword many times (high raw // BM25 score); a short knowledge entry mentions it three times. // Raw BM25 prefers the dump; tier weighting (knowledge ×1.5 vs // note ×1.0) flips the order if the score gap is within reach. // note raw = 5 × 2 terms = 10 hits, weight 1.0 → 10 // knowledge raw = 4 × 2 terms = 8 hits, weight 1.5 → 12 (overtakes) noteBody := "---\ntier: note\n---\n" + strings.Repeat("scram trap. ", 5) knowledgeBody := "---\ntier: knowledge\n---\n" + strings.Repeat("scram trap. ", 4) require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "sources"), 0o755)) require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755)) require.NoError(t, os.WriteFile(filepath.Join(dir, "wiki", "sources", "dump.md"), []byte(noteBody), 0o644)) require.NoError(t, os.WriteFile(filepath.Join(dir, "knowledge", "trap.md"), []byte(knowledgeBody), 0o644)) results, err := search.Query(dir, search.QueryOptions{Query: "scram trap", Limit: 5}) require.NoError(t, err) require.GreaterOrEqual(t, len(results), 2) assert.Equal(t, "knowledge/trap.md", results[0].Path, "knowledge tier weight should beat note tier") assert.Equal(t, "knowledge", results[0].Tier) assert.Equal(t, "note", results[1].Tier) } func TestSearch_WingHallScoping(t *testing.T) { dir := t.TempDir() for _, p := range []struct{ rel, body string }{ {"wiki/jepa-fx/decisions/val-vol.md", "---\nwing: jepa-fx\nhall: decisions\n---\nval-vol-r2 keyword.\n"}, {"wiki/jepa-fx/facts/architecture.md", "---\nwing: jepa-fx\nhall: facts\n---\nval-vol-r2 keyword in facts.\n"}, {"wiki/hyperguild/decisions/routing.md", "---\nwing: hyperguild\nhall: decisions\n---\nval-vol-r2 reference.\n"}, {"knowledge/loose.md", "---\n---\nval-vol-r2 in knowledge.\n"}, } { full := filepath.Join(dir, p.rel) require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755)) require.NoError(t, os.WriteFile(full, []byte(p.body), 0o644)) } // No filter: walk both knowledge/ and wiki/ — all 4 match. got, err := search.Query(dir, search.QueryOptions{Query: "val-vol-r2", Limit: 10}) require.NoError(t, err) assert.Len(t, got, 4) // Wing scope: 2 jepa-fx hits, no hyperguild, no knowledge. got, err = search.Query(dir, search.QueryOptions{Query: "val-vol-r2", Limit: 10, Wing: "jepa-fx"}) require.NoError(t, err) require.Len(t, got, 2) for _, r := range got { assert.Equal(t, "jepa-fx", r.Wing) } // Wing+Hall scope: 1 hit. got, err = search.Query(dir, search.QueryOptions{Query: "val-vol-r2", Limit: 10, Wing: "jepa-fx", Hall: "decisions"}) require.NoError(t, err) require.Len(t, got, 1) assert.Equal(t, "jepa-fx", got[0].Wing) assert.Equal(t, "decisions", got[0].Hall) assert.Equal(t, "wiki/jepa-fx/decisions/val-vol.md", got[0].Path) // Invalid hall rejected. _, err = search.Query(dir, search.QueryOptions{Query: "x", Wing: "jepa-fx", Hall: "garbage"}) require.Error(t, err) // Hall without wing rejected. _, err = search.Query(dir, search.QueryOptions{Query: "x", Hall: "facts"}) require.Error(t, err) } func TestSearch_RespectsLimit(t *testing.T) { dir := t.TempDir() require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755)) for i := 0; i < 5; i++ { require.NoError(t, os.WriteFile( filepath.Join(dir, "knowledge", fmt.Sprintf("page-%d.md", i)), []byte(fmt.Sprintf("---\ntitle: Page %d\n---\n\nThis page mentions retry.\n", i)), 0o644, )) } results, err := search.Query(dir, search.QueryOptions{Query: "retry", Limit: 3}) require.NoError(t, err) assert.LessOrEqual(t, len(results), 3) }