classifyByPath had a hole: paths like wiki/index.md or wiki/<slug>.md (direct children of wiki/, no subdirectory) hit the default branch and wrote Wing=parts[1] — which IS the filename, not a wing. Symptom in brain_entities: rows like (slug=index, wing=index.md) and (slug=autobe-..., wing=autobe-evaluation-pattern-....md).

Fix: when len(parts) < 3 (no subdirectory at all), fall through to Type=knowledge and let frontmatter set wing/hall if present.

Add brain/eval/ artifacts at the same time:
- qa-2026-05.md — 20 hand-authored Q→expected-slug pairs covering the homelab knowledge corpus across mcp, dex, gitops, postgres, go, models, methodology
- score.py — calls brain_query for each pair, scores top-1 + top-3, emits per-question detail. BRAIN_MCP_TOKEN via env.

Pre-fix baseline against the live brain: top-1 = 20% (4/20), top-3 = 65% (13/20). Six hard misses where the expected slug doesn't even land in the top-5.

Used to gate the phase 2 DIKW redesign (infra#62 follow-up): if the phase 1 fixes (this parser fix + authoring 20 backlinks on the top orphans) lift top-1 by <10 absolute points, structure is the bottleneck and the tier redesign is justified.
132 lines
3.8 KiB
Python
132 lines
3.8 KiB
Python
#!/usr/bin/env python3
"""Score brain_query against the qa-2026-05.md eval set.

Reads `q:` / `expected:` pairs, calls brain_query MCP for each, records
top-1 + top-3 hit rate. Run:

    BRAIN_MCP_TOKEN=$(grep '^export BRAIN_MCP_TOKEN=' ~/.llmkeys | cut -d= -f2-) \\
        python3 score.py qa-2026-05.md

Optionally pass --baseline <name> to label the run in the report header.
The report is only printed to stdout; redirect it to a file to keep it.
Pass --k <n> to change how many results are requested per question
(default 5).
"""
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
|
|
# URL that brain_query() POSTs its JSON-RPC request to.
ENDPOINT = "https://brain-mcp.d-ma.be/mcp"
|
|
|
|
|
|
def load_pairs(path):
    """Parse an eval-set file into ``(question, expected_slug)`` pairs.

    A line starting with ``q:`` opens a question; the next line starting with
    ``expected:`` closes it and yields a pair. Every other line is ignored, an
    ``expected:`` line with no pending question is skipped, and a later ``q:``
    replaces a question that never received its ``expected:`` line.

    Args:
        path: Path of the eval-set file. It is read as UTF-8.

    Returns:
        A list of ``(question, expected)`` string tuples in file order.
    """
    pairs = []
    q = None
    # Decode explicitly as UTF-8 so that non-ASCII questions parse the same
    # way regardless of the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip()
            if line.startswith("q:"):
                q = line[2:].strip()
            elif line.startswith("expected:") and q is not None:
                expected = line[len("expected:"):].strip()
                pairs.append((q, expected))
                # Require a fresh `q:` before the next `expected:` counts.
                q = None
    return pairs
|
|
|
|
|
|
def brain_query(token, query, k=5):
    """Call the ``brain_query`` tool at ENDPOINT and return its results.

    Sends a JSON-RPC 2.0 ``tools/call`` request as an HTTP POST with a bearer
    token. The response body may be plain JSON or an event stream: if any
    line starts with ``data:``, the first such line's payload is used as the
    JSON document; otherwise the whole body is.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        query: Query string passed as the tool's ``query`` argument.
        k: Value passed as the tool's ``k`` argument.

    Returns:
        The ``results`` value of the JSON held in the first content item's
        ``text`` field, or ``[]`` when that key is absent.

    Raises:
        RuntimeError: If the decoded response contains an ``error`` key.
    """
    rpc_call = {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {"name": "brain_query", "arguments": {"query": query, "k": k}},
    }
    encoded_call = json.dumps(rpc_call).encode()
    request_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        "Accept": "application/json, text/event-stream",
    }
    request = urllib.request.Request(
        ENDPOINT,
        data=encoded_call,
        headers=request_headers,
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read().decode()

    # Event-stream replies wrap the JSON in a `data:` line; unwrap the first.
    first_event = next(
        (ln[5:].strip() for ln in payload.splitlines() if ln.startswith("data:")),
        None,
    )
    if first_event is not None:
        payload = first_event

    envelope = json.loads(payload)
    if "error" in envelope:
        raise RuntimeError(envelope["error"])
    # The tool's answer is itself JSON, carried as text in the first content item.
    tool_text = envelope["result"]["content"][0]["text"]
    return json.loads(tool_text).get("results", [])
|
|
|
|
|
|
def slug_of(result):
    """Return the slug used to compare a query result with the expected one.

    The result's ``title`` is used when it is non-empty (it mirrors the slug
    in brain_entities for normal entries). Otherwise the slug is the basename
    of ``path`` with a trailing ``.md`` removed.
    """
    title = result.get("title", "")
    if not title:
        filename = os.path.basename(result.get("path", ""))
        return re.sub(r"\.md$", "", filename)
    return title
|
|
|
|
|
|
def main():
    """Run the eval set against brain_query and print a hit-rate report.

    Command line: a positional ``evalset`` path, ``--baseline`` (label printed
    in the report header, default "run") and ``--k`` (number of results
    requested per question, default 5). The bearer token is read from the
    ``BRAIN_MCP_TOKEN`` environment variable.

    Exits via ``sys.exit`` with a message when the token is unset or when the
    eval set yields no pairs. A question whose query raises is reported with
    an ``ERR ...`` rank and still counts in the denominator, so it scores as
    a miss.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("evalset")
    parser.add_argument("--baseline", default="run")
    parser.add_argument("--k", type=int, default=5)
    opts = parser.parse_args()

    token = os.environ.get("BRAIN_MCP_TOKEN")
    if not token:
        sys.exit("BRAIN_MCP_TOKEN not set")

    pairs = load_pairs(opts.evalset)
    if not pairs:
        sys.exit(f"no pairs in {opts.evalset}")

    print(f"# {opts.baseline} — {len(pairs)} questions, k={opts.k}")
    print()

    top1_hits = 0
    top3_hits = 0
    rows = []
    for question, wanted in pairs:
        try:
            hits = brain_query(token, question, k=opts.k)
        except Exception as exc:
            # One failed call must not abort the run; record it and move on.
            rows.append((question, wanted, [], f"ERR {exc}"))
            continue
        found = [slug_of(hit) for hit in hits]
        # 1-based position of the expected slug; 0 means it was not returned.
        position = found.index(wanted) + 1 if wanted in found else 0
        if position == 1:
            top1_hits += 1
        if 1 <= position <= 3:
            top3_hits += 1
        rows.append((question, wanted, found, position))

    # Rates are over all pairs, so errored questions count as misses.
    n_pairs = len(pairs)
    top1_pct = 100 * top1_hits / n_pairs
    top3_pct = 100 * top3_hits / n_pairs
    print(f"top-1 hit rate: {top1_hits}/{n_pairs} = {top1_pct:.0f}%")
    print(f"top-3 hit rate: {top3_hits}/{n_pairs} = {top3_pct:.0f}%")
    print()
    print("## per-question detail")
    print()

    rank_glyphs = {0: "✗", 1: "★", 2: "·", 3: "·"}
    for question, wanted, found, position in rows:
        if isinstance(position, str):
            # A string rank is the "ERR ..." text recorded for a failed call.
            glyph = "!"
        else:
            glyph = rank_glyphs.get(position, "?")
        print(f"{glyph} rank={position} expected={wanted}")
        print(f" q: {question}")
        for number, slug in enumerate(found[:opts.k], 1):
            suffix = " <-- expected" if slug == wanted else ""
            print(f" {number}. {slug}{suffix}")
        print()
|
|
|
|
|
|
# Run the scorer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|