// ingestion/internal/search/search.go
package search

import (
	"bufio"
	"context"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"sort"
	"strings"

	"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
)

// VectorSearcher returns the top-limit nearest paths by cosine
// distance. The vectorstore package implements this against pgvector.
//
// The returned slice is consumed in order: the position of a hit is used
// as its rank when it is fused with the keyword ranking.
type VectorSearcher interface {
	Search(ctx context.Context, query []float32, limit int) ([]VectorHit, error)
}

// VectorHit is a single path + distance pair from a vector search.
// Re-declared here (rather than imported) to keep search package
// free of vectorstore/embed deps and to make stubbing trivial in tests.
//
// Path is joined onto the brain directory when the note has to be read
// from disk, so it is expected to be slash-separated and relative to it.
type VectorHit struct {
	Path     string
	Distance float64
}

// Embedder turns a query string into a dense vector. The embed package
// implements this against Ollama's /api/embed.
type Embedder interface {
	Embed(ctx context.Context, text string) ([]float32, error)
}

// Result is a single search hit from the brain wiki.
type Result struct {
	// Path is the slash-separated path of the note relative to the brain
	// directory.
	Path string `json:"path"`
	// Title is the frontmatter "title" value, or the file name without its
	// ".md" suffix when the note declares none.
	Title string `json:"title"`
	// Excerpt is the leading portion of the note body.
	Excerpt string `json:"excerpt"`
	// Score is the total number of query-term occurrences for keyword-only
	// searches, or the fused rank score scaled by 1e6 for hybrid searches.
	// Only the relative order of scores within one response is meaningful.
	Score int `json:"score"`
	// Wing and Hall come from the note's frontmatter, falling back to the
	// wiki/<wing>/<hall>/ path segments.
	Wing string `json:"wing,omitempty"`
	Hall string `json:"hall,omitempty"`
}

// QueryOptions configures a search.
//
// When Wing is set, the walk is restricted to brain/wiki/<wing>/.
// When Hall is additionally set, the walk is restricted to
// brain/wiki/<wing>/<hall>/. Without either, the legacy walk over
// brain/knowledge/ and brain/wiki/ is used. Setting Hall without Wing is
// rejected with an error.
//
// When both Vector and Embedder are non-nil, results are computed
// hybridly: the keyword ranking (called "BM25" throughout this file,
// although the score is a plain term-occurrence count) and the vector
// candidate list are merged via Reciprocal Rank Fusion. The merge only
// runs when the keyword pass produced at least one candidate. With
// either nil the function falls back to the keyword ranking only,
// keeping behaviour unchanged for callers that have not opted in.
type QueryOptions struct {
	// Query is split on whitespace into terms that are matched
	// case-insensitively.
	Query string
	// Limit caps the number of results; values <= 0 mean 5.
	Limit int
	Wing  string
	Hall  string
	// Vector and Embedder enable hybrid retrieval when both are set.
	Vector   VectorSearcher
	Embedder Embedder
}

// Query searches the brain. Returns up to opts.Limit results sorted by
// score descending. Empty query returns nil. It is QueryContext with a
// background context.
func Query(brainDir string, opts QueryOptions) ([]Result, error) { return QueryContext(context.Background(), brainDir, opts) } // QueryContext is the cancellable variant of Query. Hybrid retrieval // requires a context because both the embedder and the vector store are // network calls. func QueryContext(ctx context.Context, brainDir string, opts QueryOptions) ([]Result, error) { if opts.Limit <= 0 { opts.Limit = 5 } terms := strings.Fields(strings.ToLower(opts.Query)) if len(terms) == 0 { return nil, nil } roots, err := resolveRoots(brainDir, opts.Wing, opts.Hall) if err != nil { return nil, err } var results []Result for _, dir := range roots { if _, statErr := os.Stat(dir); os.IsNotExist(statErr) { continue } err := filepath.WalkDir(dir, func(path string, d os.DirEntry, err error) error { if err != nil { slog.Warn("search: skipping path", "path", path, "err", err) return nil } if d.IsDir() || !strings.HasSuffix(path, ".md") { return nil } content, err := os.ReadFile(path) if err != nil { slog.Warn("search: skipping unreadable file", "path", path, "err", err) return nil } lower := strings.ToLower(string(content)) score := 0 for _, term := range terms { score += strings.Count(lower, term) } if score == 0 { return nil } rel, err := filepath.Rel(brainDir, path) if err != nil { return fmt.Errorf("rel path: %w", err) } rel = filepath.ToSlash(rel) wing, hall := extractWingHall(string(content), rel) results = append(results, Result{ Path: rel, Title: extractTitle(string(content), d.Name()), Excerpt: excerpt(string(content), 300), Score: score, Wing: wing, Hall: hall, }) return nil }) if err != nil { return nil, err } } sort.Slice(results, func(i, j int) bool { return results[i].Score > results[j].Score }) // Hybrid scoring kicks in only when both the embedder and the // vector store are wired and BM25 actually returned candidates. 
if opts.Vector != nil && opts.Embedder != nil && len(results) > 0 { merged, err := hybridMerge(ctx, brainDir, opts, results) if err != nil { slog.Warn("search: hybrid merge failed, falling back to BM25", "err", err) } else { results = merged } } if len(results) > opts.Limit { results = results[:opts.Limit] } return results, nil } // rrfK is the constant in the Reciprocal Rank Fusion formula. 60 is // standard (Cormack et al. 2009) and parameter-free in practice. const rrfK = 60.0 // hybridMerge embeds the query, runs a vector search, and merges its // candidates with the BM25 list via Reciprocal Rank Fusion. Results // that came only from the vector side are hydrated by reading the // note's frontmatter for title/wing/hall and excerpting the body. // // rrf(d) = sum_r 1 / (k + rank_r(d)) over rankers r ∈ {BM25, vector}. func hybridMerge(ctx context.Context, brainDir string, opts QueryOptions, bm25 []Result) ([]Result, error) { q, err := opts.Embedder.Embed(ctx, opts.Query) if err != nil { return nil, fmt.Errorf("embed query: %w", err) } vectorLimit := opts.Limit * 4 if vectorLimit < 20 { vectorLimit = 20 } hits, err := opts.Vector.Search(ctx, q, vectorLimit) if err != nil { return nil, fmt.Errorf("vector search: %w", err) } rrf := make(map[string]float64) byPath := make(map[string]Result) for rank, r := range bm25 { rrf[r.Path] += 1.0 / (rrfK + float64(rank+1)) byPath[r.Path] = r } for rank, h := range hits { if opts.Wing != "" && !pathInScope(h.Path, opts.Wing, opts.Hall) { continue } rrf[h.Path] += 1.0 / (rrfK + float64(rank+1)) if _, seen := byPath[h.Path]; !seen { r, err := hydrate(brainDir, h.Path) if err != nil { slog.Warn("search: hydrate failed for vector hit", "path", h.Path, "err", err) continue } byPath[h.Path] = r } } merged := make([]Result, 0, len(byPath)) for p, r := range byPath { r.Score = int(rrf[p] * 1e6) // scale to int for stable JSON; relative order is what matters merged = append(merged, r) } sort.Slice(merged, func(i, j int) bool { return 
merged[i].Score > merged[j].Score }) return merged, nil } // pathInScope reports whether a wiki path satisfies the wing/hall filter. func pathInScope(relPath, wing, hall string) bool { prefix := "wiki/" + brain.Sanitise(wing) + "/" if hall != "" { prefix += hall + "/" } return strings.HasPrefix(relPath, prefix) } // hydrate reads a single note from disk and returns a Result with title, // excerpt, wing, and hall populated. Used for paths that surface only // via vector search. func hydrate(brainDir, relPath string) (Result, error) { full := filepath.Join(brainDir, filepath.FromSlash(relPath)) content, err := os.ReadFile(full) if err != nil { return Result{}, err } wing, hall := extractWingHall(string(content), relPath) return Result{ Path: relPath, Title: extractTitle(string(content), filepath.Base(relPath)), Excerpt: excerpt(string(content), 300), Wing: wing, Hall: hall, }, nil } // resolveRoots returns the directories to walk for the given wing/hall // filters. Validates hall against the closed vocabulary when set. func resolveRoots(brainDir, wing, hall string) ([]string, error) { if hall != "" && !brain.IsValidHall(hall) { return nil, fmt.Errorf("invalid hall %q", hall) } if wing != "" { w := brain.Sanitise(wing) if w == "" { return nil, fmt.Errorf("invalid wing %q", wing) } if hall != "" { return []string{filepath.Join(brainDir, "wiki", w, hall)}, nil } return []string{filepath.Join(brainDir, "wiki", w)}, nil } if hall != "" { return nil, fmt.Errorf("hall filter requires wing") } return []string{ filepath.Join(brainDir, "knowledge"), filepath.Join(brainDir, "wiki"), }, nil } // extractWingHall reads wing/hall from frontmatter first, falling back to // path segments brain/wiki///. 
func extractWingHall(content, relPath string) (wing, hall string) { scanner := bufio.NewScanner(strings.NewReader(content)) inFrontmatter := false for scanner.Scan() { line := scanner.Text() if strings.TrimSpace(line) == "---" { if !inFrontmatter { inFrontmatter = true continue } break } if !inFrontmatter { continue } key, val, ok := strings.Cut(line, ":") if !ok { continue } v := strings.Trim(strings.TrimSpace(val), `"'`) switch strings.TrimSpace(key) { case "wing": wing = v case "hall": hall = v } } if wing != "" && hall != "" { return wing, hall } parts := strings.Split(relPath, "/") if len(parts) >= 4 && parts[0] == "wiki" { if wing == "" { wing = parts[1] } if hall == "" && brain.IsValidHall(parts[2]) { hall = parts[2] } } return wing, hall } func extractTitle(content, filename string) string { scanner := bufio.NewScanner(strings.NewReader(content)) inFrontmatter := false for scanner.Scan() { line := scanner.Text() if strings.TrimSpace(line) == "---" { if !inFrontmatter { inFrontmatter = true continue } break } if inFrontmatter { key, val, ok := strings.Cut(line, ":") if ok && strings.TrimSpace(key) == "title" { return strings.Trim(strings.TrimSpace(val), `"'`) } } } return strings.TrimSuffix(filename, ".md") } func excerpt(content string, maxLen int) string { parts := strings.SplitN(content, "---", 3) body := content if len(parts) == 3 { body = strings.TrimSpace(parts[2]) } if len(body) > maxLen { return body[:maxLen] + "…" } return body }