Files
hyperguild/ingestion/internal/search/search_test.go
Mathias 4f78fecd06
All checks were successful
CI / Lint / Test / Vet (push) Successful in 12s
CI / Mirror to GitHub (push) Successful in 3s
feat(search): M4 tier-weighted BM25 re-rank (infra#72)
The eval set under brain/eval/qa-2026-05.md showed BM25 top-1 at 20%
with 5 of the missing slugs being short focused knowledge entries
that lost to long aggregate docs on raw term-frequency. Tier weighting
addresses that without touching the BM25 algorithm itself.

How

- Result struct gains a Tier field, populated during the file walk
  via extractTier (frontmatter wins, path prefix as fallback —
  mirrors the graph.inferTierFromPath logic so the two callers stay
  in lockstep).
- After the existing sort (and optional hybridMerge), do a final
  stable re-sort by float64(Score) * tierWeight(Tier). Knowledge
  ×1.5, note ×1.0, inbox ×0.3, unknown ×1.0.
- hydrate() (vector-only hits) also fills Tier so re-ranking covers
  the hybrid path.

Test covers the load-bearing case: a long note-tier doc with raw=10
loses to a short knowledge-tier doc with raw=8 after weighting
(8×1.5=12 vs 10×1.0=10).

Measurement gate is in infra#72: re-run brain/eval/score.py against
the live brain after this image lands; close the issue when top-1
hit rate lifts by ≥10 absolute points.
2026-05-25 18:45:20 +02:00

214 lines
8.1 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ingestion/internal/search/search_test.go
package search_test
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"github.com/mathiasbq/hyperguild/ingestion/internal/search"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// stubEmbedder is a test double supplied as QueryOptions.Embedder. It
// ignores both arguments and always hands back the vector it was built with.
type stubEmbedder struct {
	vec []float32
}

// Embed returns the canned vector and a nil error.
func (e stubEmbedder) Embed(context.Context, string) ([]float32, error) {
	return e.vec, nil
}
// stubVector is a test double supplied as QueryOptions.Vector. Every Search
// call returns the same preconfigured hits, regardless of the query vector
// or the requested count.
type stubVector struct {
	hits []search.VectorHit
}

// Search returns the canned hits and a nil error.
func (v stubVector) Search(context.Context, []float32, int) ([]search.VectorHit, error) {
	return v.hits, nil
}
// TestSearch_HybridRRFPromotesVectorOnlyHit checks that a document reported
// only by the vector store (its body lacks the query keyword) still shows up
// in the hybrid result set next to the keyword match. Only membership is
// asserted here, not the relative order of the two results.
func TestSearch_HybridRRFPromotesVectorOnlyHit(t *testing.T) {
	const (
		keywordPath  = "wiki/jepa-fx/facts/foo.md"
		semanticPath = "wiki/jepa-fx/facts/semantic.md"
	)

	root := t.TempDir()
	fixtures := []struct{ rel, body string }{
		// Keyword note: contains "lejpa" once.
		{keywordPath, "---\ntitle: Foo\n---\nlejpa keyword\n"},
		// Semantically related note whose body does NOT contain the keyword.
		{semanticPath, "---\ntitle: Semantic\n---\nNo keyword in body.\n"},
	}
	for _, f := range fixtures {
		abs := filepath.Join(root, f.rel)
		require.NoError(t, os.MkdirAll(filepath.Dir(abs), 0o755))
		require.NoError(t, os.WriteFile(abs, []byte(f.body), 0o644))
	}

	// The vector store ranks the keyword-less note as the closest match.
	vectorHits := []search.VectorHit{
		{Path: semanticPath, Distance: 0.05},
		{Path: keywordPath, Distance: 0.10},
	}
	opts := search.QueryOptions{
		Query:    "lejpa",
		Limit:    5,
		Vector:   stubVector{hits: vectorHits},
		Embedder: stubEmbedder{vec: []float32{0.1}},
	}

	got, err := search.Query(root, opts)
	require.NoError(t, err)
	require.Len(t, got, 2, "vector-only hit should be hydrated into results")

	paths := make([]string, 0, len(got))
	for _, r := range got {
		paths = append(paths, r.Path)
	}
	assert.Contains(t, paths, keywordPath)
	assert.Contains(t, paths, semanticPath)
}
// TestSearch_HybridDedupesChunkPathsToParent checks that several vector hits
// carrying chunk-suffixed paths ("file.md#NNNN") for the same file are
// merged into a single result keyed by the parent file path.
func TestSearch_HybridDedupesChunkPathsToParent(t *testing.T) {
	const parent = "knowledge/long.md"

	root := t.TempDir()
	abs := filepath.Join(root, "knowledge", "long.md")
	require.NoError(t, os.MkdirAll(filepath.Dir(abs), 0o755))
	// The body carries the keyword "alpaca" so that the keyword search
	// yields a candidate; per the original author's note, hybridMerge only
	// runs when BM25 returns at least one.
	content := []byte("---\ntitle: Long\n---\nalpaca content.\n")
	require.NoError(t, os.WriteFile(abs, content, 0o644))

	// Three chunk hits, all belonging to the same parent file. The merged
	// output must contain ONE row for the parent, not three chunk rows.
	chunkHits := []search.VectorHit{
		{Path: parent + "#0001", Distance: 0.05},
		{Path: parent + "#0002", Distance: 0.07},
		{Path: parent + "#0003", Distance: 0.09},
	}
	opts := search.QueryOptions{
		Query:    "alpaca",
		Limit:    5,
		Vector:   stubVector{hits: chunkHits},
		Embedder: stubEmbedder{vec: []float32{0.1}},
	}

	got, err := search.Query(root, opts)
	require.NoError(t, err)
	require.Len(t, got, 1, "three chunk hits for one parent must merge to one result")

	only := got[0]
	assert.Equal(t, parent, only.Path)
	assert.Equal(t, "Long", only.Title)
}
// TestSearch_HybridFallsBackOnEmbedderError checks that a failing embedder
// does not fail the query: the keyword match must still be returned and
// Query must report no error.
func TestSearch_HybridFallsBackOnEmbedderError(t *testing.T) {
	root := t.TempDir()
	wikiDir := filepath.Join(root, "wiki")
	require.NoError(t, os.MkdirAll(wikiDir, 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(wikiDir, "x.md"), []byte("keyword foo"), 0o644))

	opts := search.QueryOptions{
		Query:    "keyword",
		Limit:    5,
		Vector:   stubVector{},
		Embedder: errorEmbedder{},
	}

	hits, err := search.Query(root, opts)
	require.NoError(t, err)
	require.Len(t, hits, 1, "BM25 result should still come back when embedder fails")
	assert.Equal(t, "wiki/x.md", hits[0].Path)
}
// errorEmbedder is a test double whose Embed call always fails, used to
// exercise the path where no query embedding is available.
type errorEmbedder struct{}

// Embed ignores its arguments and returns a nil vector with assert.AnError.
func (errorEmbedder) Embed(context.Context, string) ([]float32, error) {
	var none []float32
	return none, assert.AnError
}
// TestSearch_ReturnsMatchingPages checks the plain keyword path (no vector
// store, no embedder): only the page whose body matches the query terms is
// returned, with its relative path, frontmatter title, a positive score and
// an excerpt containing the matched text.
func TestSearch_ReturnsMatchingPages(t *testing.T) {
	root := t.TempDir()
	knowledgeDir := filepath.Join(root, "knowledge")
	require.NoError(t, os.MkdirAll(knowledgeDir, 0o755))

	pages := []struct{ name, content string }{
		{
			"retry-logic.md",
			"---\ntitle: Retry Logic\ndomain: software\n---\n\nRetry logic handles transient failures by re-attempting operations.\n",
		},
		{
			"database.md",
			"---\ntitle: Database\ndomain: software\n---\n\nA database stores structured data.\n",
		},
	}
	for _, pg := range pages {
		require.NoError(t, os.WriteFile(filepath.Join(knowledgeDir, pg.name), []byte(pg.content), 0o644))
	}

	opts := search.QueryOptions{Query: "retry transient", Limit: 5}
	results, err := search.Query(root, opts)
	require.NoError(t, err)
	require.Len(t, results, 1)

	top := results[0]
	assert.Equal(t, "knowledge/retry-logic.md", top.Path)
	assert.Equal(t, "Retry Logic", top.Title)
	assert.Greater(t, top.Score, 0)
	assert.Contains(t, top.Excerpt, "Retry")
}
// TestSearch_TierWeightingReordersResults checks the tier-weighted re-rank:
// a note-tier document with more keyword hits must end up below a
// knowledge-tier document with fewer hits once tier weights are applied.
func TestSearch_TierWeightingReordersResults(t *testing.T) {
	dir := t.TempDir()

	// A long note-tier dump repeats the two-word query five times (high raw
	// score); a short knowledge entry repeats it four times. Raw ranking
	// prefers the dump; tier weighting (knowledge ×1.5 vs note ×1.0) flips
	// the order because the score gap is within reach:
	//   note      raw = 5 × 2 terms = 10 hits, weight 1.0 → 10
	//   knowledge raw = 4 × 2 terms =  8 hits, weight 1.5 → 12 (overtakes)
	noteBody := "---\ntier: note\n---\n" + strings.Repeat("scram trap. ", 5)
	knowledgeBody := "---\ntier: knowledge\n---\n" + strings.Repeat("scram trap. ", 4)

	require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "sources"), 0o755))
	require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(dir, "wiki", "sources", "dump.md"), []byte(noteBody), 0o644))
	require.NoError(t, os.WriteFile(filepath.Join(dir, "knowledge", "trap.md"), []byte(knowledgeBody), 0o644))

	results, err := search.Query(dir, search.QueryOptions{Query: "scram trap", Limit: 5})
	require.NoError(t, err)
	// Exactly two fixtures exist and both contain the query terms, so pin
	// the count instead of accepting "two or more".
	require.Len(t, results, 2)

	assert.Equal(t, "knowledge/trap.md", results[0].Path, "knowledge tier weight should beat note tier")
	assert.Equal(t, "knowledge", results[0].Tier)
	// Identify the runner-up by path as well as tier so a mislabelled
	// result cannot satisfy the check.
	assert.Equal(t, "wiki/sources/dump.md", results[1].Path)
	assert.Equal(t, "note", results[1].Tier)
}
// TestSearch_WingHallScoping checks the Wing and Hall filters of
// QueryOptions: an unfiltered query sees every matching file, a Wing filter
// keeps only that wing, Wing+Hall narrows to a single hall, and invalid
// filter combinations are rejected with an error.
func TestSearch_WingHallScoping(t *testing.T) {
	const term = "val-vol-r2"

	root := t.TempDir()
	fixtures := []struct{ rel, body string }{
		{"wiki/jepa-fx/decisions/val-vol.md", "---\nwing: jepa-fx\nhall: decisions\n---\nval-vol-r2 keyword.\n"},
		{"wiki/jepa-fx/facts/architecture.md", "---\nwing: jepa-fx\nhall: facts\n---\nval-vol-r2 keyword in facts.\n"},
		{"wiki/hyperguild/decisions/routing.md", "---\nwing: hyperguild\nhall: decisions\n---\nval-vol-r2 reference.\n"},
		{"knowledge/loose.md", "---\n---\nval-vol-r2 in knowledge.\n"},
	}
	for _, f := range fixtures {
		abs := filepath.Join(root, f.rel)
		require.NoError(t, os.MkdirAll(filepath.Dir(abs), 0o755))
		require.NoError(t, os.WriteFile(abs, []byte(f.body), 0o644))
	}

	// Unfiltered: both knowledge/ and wiki/ are searched, so all 4 match.
	all, err := search.Query(root, search.QueryOptions{Query: term, Limit: 10})
	require.NoError(t, err)
	assert.Len(t, all, 4)

	// Wing only: the two jepa-fx files; nothing from hyperguild or knowledge.
	byWing, err := search.Query(root, search.QueryOptions{Query: term, Limit: 10, Wing: "jepa-fx"})
	require.NoError(t, err)
	require.Len(t, byWing, 2)
	for i := range byWing {
		assert.Equal(t, "jepa-fx", byWing[i].Wing)
	}

	// Wing + Hall: exactly one file.
	byHall, err := search.Query(root, search.QueryOptions{Query: term, Limit: 10, Wing: "jepa-fx", Hall: "decisions"})
	require.NoError(t, err)
	require.Len(t, byHall, 1)
	only := byHall[0]
	assert.Equal(t, "jepa-fx", only.Wing)
	assert.Equal(t, "decisions", only.Hall)
	assert.Equal(t, "wiki/jepa-fx/decisions/val-vol.md", only.Path)

	// Rejected combinations, in order: an invalid hall name, then a hall
	// given without a wing.
	rejected := []search.QueryOptions{
		{Query: "x", Wing: "jepa-fx", Hall: "garbage"},
		{Query: "x", Hall: "facts"},
	}
	for _, opts := range rejected {
		_, err = search.Query(root, opts)
		require.Error(t, err)
	}
}
// TestSearch_RespectsLimit checks that Query truncates its output to
// QueryOptions.Limit when more documents match than the limit allows.
func TestSearch_RespectsLimit(t *testing.T) {
	const (
		pageCount = 5
		limit     = 3
	)

	dir := t.TempDir()
	knowledgeDir := filepath.Join(dir, "knowledge")
	require.NoError(t, os.MkdirAll(knowledgeDir, 0o755))

	// Every page contains the query term, so pageCount documents match.
	for i := 0; i < pageCount; i++ {
		name := fmt.Sprintf("page-%d.md", i)
		body := fmt.Sprintf("---\ntitle: Page %d\n---\n\nThis page mentions retry.\n", i)
		require.NoError(t, os.WriteFile(filepath.Join(knowledgeDir, name), []byte(body), 0o644))
	}

	results, err := search.Query(dir, search.QueryOptions{Query: "retry", Limit: limit})
	require.NoError(t, err)
	// More pages match than the limit, so exactly `limit` results must come
	// back. An upper-bound-only check would also pass on an empty result
	// set and hide a search that returns nothing.
	assert.Len(t, results, limit)
}