feat(search): M4 tier-weighted BM25 re-rank (infra#72)
Why: The eval set under brain/eval/qa-2026-05.md showed BM25 top-1 at 20%, with 5 of the missing slugs being short, focused knowledge entries that lost to long aggregate docs on raw term frequency. Tier weighting addresses that without touching the BM25 algorithm itself.
How:
- The Result struct gains a Tier field, populated during the file walk via extractTier (frontmatter wins, path prefix is the fallback — this mirrors the graph.inferTierFromPath logic so the two callers stay in lockstep).
- After the existing sort (and optional hybridMerge), do a final stable re-sort by float64(Score) * tierWeight(Tier). Weights: knowledge ×1.5, note ×1.0, inbox ×0.3, unknown ×1.0.
- hydrate() (vector-only hits) also fills Tier so that re-ranking covers the hybrid path.
Test: covers the load-bearing case, in which a long note-tier doc with raw=10 loses to a short knowledge-tier doc with raw=8 after weighting (8×1.5=12 vs 10×1.0=10).
Measurement gate: tracked in infra#72 — re-run brain/eval/score.py against the live brain after this image lands, and close the issue when the top-1 hit rate lifts by ≥10 absolute points.
This commit is contained in:
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/mathiasbq/hyperguild/ingestion/internal/search"
|
||||
@@ -130,6 +131,29 @@ func TestSearch_ReturnsMatchingPages(t *testing.T) {
|
||||
assert.Contains(t, results[0].Excerpt, "Retry")
|
||||
}
|
||||
|
||||
// TestSearch_TierWeightingReordersResults verifies that search.Query ranks a
// knowledge-tier page above a note-tier page even though the note-tier page
// contains more occurrences of the query terms. It writes two markdown files
// with explicit `tier:` frontmatter into a temp directory, queries for
// "scram trap", and asserts both the resulting order and the Tier field of
// the first two results.
func TestSearch_TierWeightingReordersResults(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
// A long note-tier dump mentions the keyword many times (high raw
|
||||
// BM25 score); a short knowledge entry mentions it four times.
|
||||
// Raw BM25 prefers the dump; tier weighting (knowledge ×1.5 vs
|
||||
// note ×1.0) flips the order if the score gap is within reach.
|
||||
// note raw = 5 × 2 terms = 10 hits, weight 1.0 → 10
|
||||
|
||||
// knowledge raw = 4 × 2 terms = 8 hits, weight 1.5 → 12 (overtakes)
|
||||
// NOTE(review): the figures above count term occurrences; they presumably
|
||||
// match the scorer's raw Score for these inputs — confirm against search.Query.
|
||||
noteBody := "---\ntier: note\n---\n" + strings.Repeat("scram trap. ", 5)
|
||||
knowledgeBody := "---\ntier: knowledge\n---\n" + strings.Repeat("scram trap. ", 4)
|
||||
// Both fixture directories must exist before the files are written.
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "wiki", "sources"), 0o755))
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(dir, "knowledge"), 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "wiki", "sources", "dump.md"), []byte(noteBody), 0o644))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "knowledge", "trap.md"), []byte(knowledgeBody), 0o644))
|
||||
|
||||
results, err := search.Query(dir, search.QueryOptions{Query: "scram trap", Limit: 5})
|
||||
require.NoError(t, err)
|
||||
// Guard with require so the index accesses below cannot panic.
|
||||
require.GreaterOrEqual(t, len(results), 2)
|
||||
assert.Equal(t, "knowledge/trap.md", results[0].Path, "knowledge tier weight should beat note tier")
|
||||
assert.Equal(t, "knowledge", results[0].Tier)
|
||||
assert.Equal(t, "note", results[1].Tier)
|
||||
}
|
||||
|
||||
func TestSearch_WingHallScoping(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
for _, p := range []struct{ rel, body string }{
|
||||
|
||||
Reference in New Issue
Block a user