The eval set under brain/eval/qa-2026-05.md showed BM25 top-1 at 20%, with 5 of the missing slugs being short, focused knowledge entries that lost to long aggregate docs on raw term frequency. Tier weighting addresses that without touching the BM25 algorithm itself.

How:
- The Result struct gains a Tier field, populated during the file walk via extractTier (frontmatter wins, path prefix as fallback — mirrors the graph.inferTierFromPath logic so the two callers stay in lockstep).
- After the existing sort (and optional hybridMerge), do a final stable re-sort by float64(Score) * tierWeight(Tier): knowledge ×1.5, note ×1.0, inbox ×0.3, unknown ×1.0.
- hydrate() (vector-only hits) also fills Tier so re-ranking covers the hybrid path.

The test covers the load-bearing case: a long note-tier doc with raw=10 loses to a short knowledge-tier doc with raw=8 after weighting (8×1.5=12 vs 10×1.0=10).

The measurement gate is in infra#72: re-run brain/eval/score.py against the live brain after this image lands; close the issue when the top-1 hit rate lifts by ≥10 absolute points.
214 lines
8.1 KiB
Go
214 lines
8.1 KiB
Go
// ingestion/internal/search/search_test.go
|
||
package search_test
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"os"
|
||
"path/filepath"
|
||
"strings"
|
||
"testing"
|
||
|
||
"github.com/mathiasbq/hyperguild/ingestion/internal/search"
|
||
"github.com/stretchr/testify/assert"
|
||
"github.com/stretchr/testify/require"
|
||
)
|
||
|
||
// stubEmbedder is a test double handed to search.Query through
// QueryOptions.Embedder. It carries a single canned vector.
type stubEmbedder struct {
	vec []float32
}
|
||
|
||
// Embed ignores both the context and the input text and always returns the
// canned vector together with a nil error.
func (e stubEmbedder) Embed(_ context.Context, _ string) ([]float32, error) {
	return e.vec, nil
}
|
||
|
||
type stubVector struct{ hits []search.VectorHit }
|
||
|
||
func (s stubVector) Search(_ context.Context, _ []float32, _ int) ([]search.VectorHit, error) {
|
||
return s.hits, nil
|
||
}
|
||
|
||
func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) {
|
||
dir := t.TempDir()
|
||
for _, p := range []struct{ rel, body string }{
|
||
// BM25-keyword note (matches "lejpa" once)
|
||
{"wiki/jepa-fx/facts/foo.md", "---\ntitle: Foo\n---\nlejpa keyword\n"},
|
||
// Semantically related note that does NOT contain the keyword.
|
||
{"wiki/jepa-fx/facts/semantic.md", "---\ntitle: Semantic\n---\nNo keyword in body.\n"},
|
||
} {
|
||
full := filepath.Join(dir, p.rel)
|
||
require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
|
||
require.NoError(t, os.WriteFile(full, []byte(p.body), 0o644))
|
||
}
|
||
|
||
embedder := stubEmbedder{vec: []float32{0.1}}
|
||
vector := stubVector{hits: []search.VectorHit{
|
||
{Path: "wiki/jepa-fx/facts/semantic.md", Distance: 0.05}, // best vector match
|
||
{Path: "wiki/jepa-fx/facts/foo.md", Distance: 0.10},
|
||
}}
|
||
|
||
got, err := search.Query(dir, search.QueryOptions{
|
||
Query: "lejpa",
|
||
Limit: 5,
|
||
Vector: vector,
|
||
Embedder: embedder,
|
||
})
|
||
require.NoError(t, err)
|
||
require.Len(t, got, 2, "vector-only hit should be hydrated into results")
|
||
paths := []string{got[0].Path, got[1].Path}
|
||
assert.Contains(t, paths, "wiki/jepa-fx/facts/foo.md")
|
||
assert.Contains(t, paths, "wiki/jepa-fx/facts/semantic.md")
|
||
}
|
||
|
||
func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) {
|
||
dir := t.TempDir()
|
||
full := filepath.Join(dir, "knowledge", "long.md")
|
||
require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
|
||
// Body contains the BM25 keyword "alpaca" so hybridMerge actually runs
|
||
// (it only kicks in when BM25 returns at least one candidate).
|
||
require.NoError(t, os.WriteFile(full, []byte("---\ntitle: Long\n---\nalpaca content.\n"), 0o644))
|
||
|
||
embedder := stubEmbedder{vec: []float32{0.1}}
|
||
// Vector store returns three chunk-path hits all pointing at the same
|
||
// parent file. The merged result must surface ONE row per parent — not
|
||
// three rows with chunk-suffixed paths.
|
||
vector := stubVector{hits: []search.VectorHit{
|
||
{Path: "knowledge/long.md#0001", Distance: 0.05},
|
||
{Path: "knowledge/long.md#0002", Distance: 0.07},
|
||
{Path: "knowledge/long.md#0003", Distance: 0.09},
|
||
}}
|
||
|
||
got, err := search.Query(dir, search.QueryOptions{
|
||
Query: "alpaca",
|
||
Limit: 5,
|
||
Vector: vector,
|
||
Embedder: embedder,
|
||
})
|
||
require.NoError(t, err)
|
||
require.Len(t, got, 1, "three chunk hits for one parent must merge to one result")
|
||
assert.Equal(t, "knowledge/long.md", got[0].Path)
|
||
assert.Equal(t, "Long", got[0].Title)
|
||
}
|
||
|
||
func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
|
||
dir := t.TempDir()
|
||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki"), 0o755))
|
||
require.NoError(t, os.WriteFile(filepath.Join(dir, "wiki", "x.md"), []byte("keyword foo"), 0o644))
|
||
|
||
embedder := errorEmbedder{}
|
||
vector := stubVector{}
|
||
got, err := search.Query(dir, search.QueryOptions{
|
||
Query: "keyword", Limit: 5, Vector: vector, Embedder: embedder,
|
||
})
|
||
require.NoError(t, err)
|
||
require.Len(t, got, 1, "BM25 result should still come back when embedder fails")
|
||
assert.Equal(t, "wiki/x.md", got[0].Path)
|
||
}
|
||
|
||
type errorEmbedder struct{}
|
||
|
||
func (errorEmbedder) Embed(_ context.Context, _ string) ([]float32, error) {
|
||
return nil, assert.AnError
|
||
}
|
||
|
||
func TestSearch_ReturnsMatchingPages(t *testing.T) {
|
||
dir := t.TempDir()
|
||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755))
|
||
|
||
require.NoError(t, os.WriteFile(
|
||
filepath.Join(dir, "knowledge", "retry-logic.md"),
|
||
[]byte("---\ntitle: Retry Logic\ndomain: software\n---\n\nRetry logic handles transient failures by re-attempting operations.\n"),
|
||
0o644,
|
||
))
|
||
require.NoError(t, os.WriteFile(
|
||
filepath.Join(dir, "knowledge", "database.md"),
|
||
[]byte("---\ntitle: Database\ndomain: software\n---\n\nA database stores structured data.\n"),
|
||
0o644,
|
||
))
|
||
|
||
results, err := search.Query(dir, search.QueryOptions{Query: "retry transient", Limit: 5})
|
||
require.NoError(t, err)
|
||
require.Len(t, results, 1)
|
||
assert.Equal(t, "knowledge/retry-logic.md", results[0].Path)
|
||
assert.Equal(t, "Retry Logic", results[0].Title)
|
||
assert.Greater(t, results[0].Score, 0)
|
||
assert.Contains(t, results[0].Excerpt, "Retry")
|
||
}
|
||
|
||
func TestSearch_TierWeightingReordersResults(t *testing.T) {
|
||
dir := t.TempDir()
|
||
// A long note-tier dump mentions the keyword many times (high raw
|
||
// BM25 score); a short knowledge entry mentions it three times.
|
||
// Raw BM25 prefers the dump; tier weighting (knowledge ×1.5 vs
|
||
// note ×1.0) flips the order if the score gap is within reach.
|
||
// note raw = 5 × 2 terms = 10 hits, weight 1.0 → 10
|
||
// knowledge raw = 4 × 2 terms = 8 hits, weight 1.5 → 12 (overtakes)
|
||
noteBody := "---\ntier: note\n---\n" + strings.Repeat("scram trap. ", 5)
|
||
knowledgeBody := "---\ntier: knowledge\n---\n" + strings.Repeat("scram trap. ", 4)
|
||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "sources"), 0o755))
|
||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755))
|
||
require.NoError(t, os.WriteFile(filepath.Join(dir, "wiki", "sources", "dump.md"), []byte(noteBody), 0o644))
|
||
require.NoError(t, os.WriteFile(filepath.Join(dir, "knowledge", "trap.md"), []byte(knowledgeBody), 0o644))
|
||
|
||
results, err := search.Query(dir, search.QueryOptions{Query: "scram trap", Limit: 5})
|
||
require.NoError(t, err)
|
||
require.GreaterOrEqual(t, len(results), 2)
|
||
assert.Equal(t, "knowledge/trap.md", results[0].Path, "knowledge tier weight should beat note tier")
|
||
assert.Equal(t, "knowledge", results[0].Tier)
|
||
assert.Equal(t, "note", results[1].Tier)
|
||
}
|
||
|
||
func TestSearch_WingHallScoping(t *testing.T) {
|
||
dir := t.TempDir()
|
||
for _, p := range []struct{ rel, body string }{
|
||
{"wiki/jepa-fx/decisions/val-vol.md", "---\nwing: jepa-fx\nhall: decisions\n---\nval-vol-r2 keyword.\n"},
|
||
{"wiki/jepa-fx/facts/architecture.md", "---\nwing: jepa-fx\nhall: facts\n---\nval-vol-r2 keyword in facts.\n"},
|
||
{"wiki/hyperguild/decisions/routing.md", "---\nwing: hyperguild\nhall: decisions\n---\nval-vol-r2 reference.\n"},
|
||
{"knowledge/loose.md", "---\n---\nval-vol-r2 in knowledge.\n"},
|
||
} {
|
||
full := filepath.Join(dir, p.rel)
|
||
require.NoError(t, os.MkdirAll(filepath.Dir(full), 0o755))
|
||
require.NoError(t, os.WriteFile(full, []byte(p.body), 0o644))
|
||
}
|
||
|
||
// No filter: walk both knowledge/ and wiki/ — all 4 match.
|
||
got, err := search.Query(dir, search.QueryOptions{Query: "val-vol-r2", Limit: 10})
|
||
require.NoError(t, err)
|
||
assert.Len(t, got, 4)
|
||
|
||
// Wing scope: 2 jepa-fx hits, no hyperguild, no knowledge.
|
||
got, err = search.Query(dir, search.QueryOptions{Query: "val-vol-r2", Limit: 10, Wing: "jepa-fx"})
|
||
require.NoError(t, err)
|
||
require.Len(t, got, 2)
|
||
for _, r := range got {
|
||
assert.Equal(t, "jepa-fx", r.Wing)
|
||
}
|
||
|
||
// Wing+Hall scope: 1 hit.
|
||
got, err = search.Query(dir, search.QueryOptions{Query: "val-vol-r2", Limit: 10, Wing: "jepa-fx", Hall: "decisions"})
|
||
require.NoError(t, err)
|
||
require.Len(t, got, 1)
|
||
assert.Equal(t, "jepa-fx", got[0].Wing)
|
||
assert.Equal(t, "decisions", got[0].Hall)
|
||
assert.Equal(t, "wiki/jepa-fx/decisions/val-vol.md", got[0].Path)
|
||
|
||
// Invalid hall rejected.
|
||
_, err = search.Query(dir, search.QueryOptions{Query: "x", Wing: "jepa-fx", Hall: "garbage"})
|
||
require.Error(t, err)
|
||
|
||
// Hall without wing rejected.
|
||
_, err = search.Query(dir, search.QueryOptions{Query: "x", Hall: "facts"})
|
||
require.Error(t, err)
|
||
}
|
||
|
||
func TestSearch_RespectsLimit(t *testing.T) {
|
||
dir := t.TempDir()
|
||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755))
|
||
for i := 0; i < 5; i++ {
|
||
require.NoError(t, os.WriteFile(
|
||
filepath.Join(dir, "knowledge", fmt.Sprintf("page-%d.md", i)),
|
||
[]byte(fmt.Sprintf("---\ntitle: Page %d\n---\n\nThis page mentions retry.\n", i)),
|
||
0o644,
|
||
))
|
||
}
|
||
results, err := search.Query(dir, search.QueryOptions{Query: "retry", Limit: 3})
|
||
require.NoError(t, err)
|
||
assert.LessOrEqual(t, len(results), 3)
|
||
}
|