// hyperguild/ingestion/internal/search/search.go

// ingestion/internal/search/search.go
package search

import (
	"bufio"
	"context"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"sort"
	"strings"

	"github.com/mathiasbq/hyperguild/ingestion/internal/brain"
	"github.com/mathiasbq/hyperguild/ingestion/internal/vectorstore"
)

// VectorSearcher returns the top-limit nearest paths by cosine
// distance. The vectorstore package implements this against pgvector.
//
// hybridMerge derives each hit's rank from its position in the returned
// slice and never reads VectorHit.Distance, so implementations are
// expected to return hits ordered nearest-first.
type VectorSearcher interface {
	// Search returns up to limit hits for the given query embedding.
	Search(ctx context.Context, query []float32, limit int) ([]VectorHit, error)
}

// VectorHit is a single path + distance pair from a vector search.
// Re-declared here (rather than imported) to keep search package
// free of vectorstore/embed deps and to make stubbing trivial in tests.
//
// Fields:
//   - Path: the key stored in the vector store. hybridMerge treats it as
//     a chunk path (e.g. "wiki/foo.md#0001") and collapses it to the
//     parent note via vectorstore.ParentPath.
//   - Distance: the distance reported by the store. It is carried along
//     for callers but is not consulted by the ranking in this file.
type VectorHit struct {
	Path     string
	Distance float64
}

// Embedder turns a query string into a dense vector. The embed package
// implements this against Ollama's /api/embed.
//
// hybridMerge calls Embed once per query with the raw QueryOptions.Query
// string (not the lowercased, whitespace-split terms used for the
// lexical pass) and passes the resulting vector to VectorSearcher.Search.
type Embedder interface {
	// Embed returns the embedding for text, or an error if it cannot be
	// computed.
	Embed(ctx context.Context, text string) ([]float32, error)
}

// Result is a single search hit from the brain wiki.
//
// Fields:
//   - Path: slash-separated path of the note relative to the brain
//     directory passed to Query/QueryContext.
//   - Title: the frontmatter "title" value, or the file name without its
//     ".md" suffix when no title is found.
//   - Excerpt: the leading portion of the note body (callers in this file
//     request about 300 bytes), with "…" appended when truncated.
//   - Score: in lexical-only mode, the total number of query-term
//     occurrences in the note; in hybrid mode, the Reciprocal Rank Fusion
//     score multiplied by 1e6 and truncated to an int. The two scales are
//     not comparable with each other; only the relative order within one
//     response is meaningful.
//   - Wing, Hall: taken from frontmatter when present, otherwise derived
//     from the path; omitted from JSON when empty.
type Result struct {
	Path    string `json:"path"`
	Title   string `json:"title"`
	Excerpt string `json:"excerpt"`
	Score   int    `json:"score"`
	Wing    string `json:"wing,omitempty"`
	Hall    string `json:"hall,omitempty"`
}

// QueryOptions configures a search.
//
// Query is lowercased and split on whitespace into terms; a query with
// no terms yields no results. Limit caps the number of results returned
// and defaults to 5 when it is zero or negative.
//
// When Wing is set, the walk is restricted to brain/wiki/<wing>/.
// When Hall is additionally set, the walk is restricted to
// brain/wiki/<wing>/<hall>/. Without either, the legacy walk over
// brain/knowledge/ and brain/wiki/ is used. Hall must pass
// brain.IsValidHall and may only be set together with Wing; otherwise
// the query fails with an error.
//
// When both Vector and Embedder are non-nil, results are computed
// hybridly: BM25 and vector candidate lists are merged via Reciprocal
// Rank Fusion. With either nil the function falls back to BM25 only,
// keeping behaviour unchanged for callers that have not opted in.
// The hybrid step is also skipped when the lexical pass finds no
// candidates at all.
type QueryOptions struct {
	Query    string
	Limit    int
	Wing     string
	Hall     string
	Vector   VectorSearcher
	Embedder Embedder
}

// Query runs a search against the brain rooted at brainDir using a
// background (never-cancelled) context. It yields at most opts.Limit
// results ordered from highest to lowest score; a query containing no
// terms yields nil. Use QueryContext when the call must be cancellable.
func Query(brainDir string, opts QueryOptions) ([]Result, error) {
	ctx := context.Background()
	return QueryContext(ctx, brainDir, opts)
}

// QueryContext is the cancellable variant of Query. Hybrid retrieval
// requires a context because both the embedder and the vector store are
// network calls.
//
// The lexical pass walks every ".md" file under the roots selected by
// opts.Wing/opts.Hall and scores a file by the total number of substring
// occurrences of each lowercased query term in the lowercased file
// content. Files that cannot be read are logged and skipped. Roots that
// do not exist are skipped silently.
//
// Results are ordered by score descending, with ties broken by path so
// the order is well defined. When opts.Vector and opts.Embedder are both
// set and the lexical pass produced at least one candidate, the list is
// re-ranked through hybridMerge; if that step fails the lexical ranking
// is returned instead and the failure is only logged.
//
// Cancellation: ctx is checked before each visited path during the walk,
// and a cancelled context aborts the search with ctx.Err(). A nil ctx is
// treated as context.Background().
//
// Returns (nil, nil) for a query with no terms, and an error for invalid
// wing/hall filters.
func QueryContext(ctx context.Context, brainDir string, opts QueryOptions) ([]Result, error) {
	// Earlier versions never touched ctx on the lexical path, so a nil
	// context used to work there; keep that from turning into a panic.
	if ctx == nil {
		ctx = context.Background()
	}
	if opts.Limit <= 0 {
		opts.Limit = 5
	}
	terms := strings.Fields(strings.ToLower(opts.Query))
	if len(terms) == 0 {
		return nil, nil
	}

	roots, err := resolveRoots(brainDir, opts.Wing, opts.Hall)
	if err != nil {
		return nil, err
	}

	var results []Result
	for _, dir := range roots {
		if _, statErr := os.Stat(dir); os.IsNotExist(statErr) {
			continue
		}
		walkErr := filepath.WalkDir(dir, func(path string, d os.DirEntry, err error) error {
			// Stop promptly once the caller has given up; a large brain
			// can otherwise keep reading files long after cancellation.
			if ctxErr := ctx.Err(); ctxErr != nil {
				return ctxErr
			}
			if err != nil {
				slog.Warn("search: skipping path", "path", path, "err", err)
				return nil
			}
			if d.IsDir() || !strings.HasSuffix(path, ".md") {
				return nil
			}
			content, err := os.ReadFile(path)
			if err != nil {
				slog.Warn("search: skipping unreadable file", "path", path, "err", err)
				return nil
			}
			// Convert once; every helper below wants a string.
			text := string(content)
			lower := strings.ToLower(text)
			score := 0
			for _, term := range terms {
				score += strings.Count(lower, term)
			}
			if score == 0 {
				return nil
			}
			rel, err := filepath.Rel(brainDir, path)
			if err != nil {
				return fmt.Errorf("rel path: %w", err)
			}
			rel = filepath.ToSlash(rel)
			wing, hall := extractWingHall(text, rel)
			results = append(results, Result{
				Path:    rel,
				Title:   extractTitle(text, d.Name()),
				Excerpt: excerpt(text, 300),
				Score:   score,
				Wing:    wing,
				Hall:    hall,
			})
			return nil
		})
		if walkErr != nil {
			return nil, walkErr
		}
	}

	// sort.Slice is not stable, so equal scores need an explicit
	// tie-break to give a well-defined order.
	sort.Slice(results, func(i, j int) bool {
		if results[i].Score != results[j].Score {
			return results[i].Score > results[j].Score
		}
		return results[i].Path < results[j].Path
	})

	// Hybrid scoring kicks in only when both the embedder and the
	// vector store are wired and BM25 actually returned candidates.
	if opts.Vector != nil && opts.Embedder != nil && len(results) > 0 {
		merged, err := hybridMerge(ctx, brainDir, opts, results)
		if err != nil {
			slog.Warn("search: hybrid merge failed, falling back to BM25", "err", err)
		} else {
			results = merged
		}
	}

	if len(results) > opts.Limit {
		results = results[:opts.Limit]
	}
	return results, nil
}

// rrfK is the constant in the Reciprocal Rank Fusion formula. 60 is
// standard (Cormack et al. 2009) and parameter-free in practice.
//
// It is added to each 1-based rank in hybridMerge, so a document's
// contribution from one ranker is 1 / (rrfK + rank). Larger values
// flatten the difference between adjacent ranks.
const rrfK = 60.0

// hybridMerge embeds the query, runs a vector search, and merges its
// candidates with the BM25 list via Reciprocal Rank Fusion. Results
// that came only from the vector side are hydrated by reading the
// note's frontmatter for title/wing/hall and excerpting the body.
//
// rrf(d) = sum_r 1 / (k + rank_r(d)) over rankers r ∈ {BM25, vector}.
//
// bm25 must already be ordered best-first; its slice index is used as
// the lexical rank. Vector hits are likewise ranked by slice position.
// Every chunk hit contributes to its parent note, so a note matched by
// several chunks accumulates one term per chunk. When opts.Wing is set,
// vector hits outside the wing/hall scope are dropped. Vector-only hits
// whose file cannot be read are logged and omitted.
//
// The vector search asks for four times opts.Limit candidates, with a
// floor of 20, to give the fusion something to work with.
//
// The returned slice is ordered by RRF score descending with ties broken
// by path, so the same inputs always produce the same order. Each
// Result.Score is the RRF score scaled by 1e6 and truncated to an int.
//
// Returns an error if embedding or the vector search fails; the caller
// is expected to fall back to the lexical ranking in that case.
func hybridMerge(ctx context.Context, brainDir string, opts QueryOptions, bm25 []Result) ([]Result, error) {
	q, err := opts.Embedder.Embed(ctx, opts.Query)
	if err != nil {
		return nil, fmt.Errorf("embed query: %w", err)
	}
	vectorLimit := opts.Limit * 4
	if vectorLimit < 20 {
		vectorLimit = 20
	}
	hits, err := opts.Vector.Search(ctx, q, vectorLimit)
	if err != nil {
		return nil, fmt.Errorf("vector search: %w", err)
	}

	rrf := make(map[string]float64, len(bm25)+len(hits))
	byPath := make(map[string]Result, len(bm25)+len(hits))
	for rank, r := range bm25 {
		rrf[r.Path] += 1.0 / (rrfK + float64(rank+1))
		byPath[r.Path] = r
	}
	for rank, h := range hits {
		// Vector store keys are chunk paths ("wiki/foo.md#0001"); collapse
		// back to the parent so multiple chunk hits from the same file
		// score against a single result row.
		parent := vectorstore.ParentPath(h.Path)
		if opts.Wing != "" && !pathInScope(parent, opts.Wing, opts.Hall) {
			continue
		}
		rrf[parent] += 1.0 / (rrfK + float64(rank+1))
		if _, seen := byPath[parent]; !seen {
			r, err := hydrate(brainDir, parent)
			if err != nil {
				slog.Warn("search: hydrate failed for vector hit", "path", parent, "err", err)
				continue
			}
			byPath[parent] = r
		}
	}

	merged := make([]Result, 0, len(byPath))
	for p, r := range byPath {
		r.Score = int(rrf[p] * 1e6) // scale to int for stable JSON; relative order is what matters
		merged = append(merged, r)
	}
	// merged was filled in random map order and sort.Slice is not stable,
	// so ties must be broken explicitly or equal-scored notes (e.g. BM25
	// rank 1 vs. vector rank 1) would swap places between calls. Compare
	// the untruncated float score first so precision lost in the int
	// conversion does not create artificial ties.
	sort.Slice(merged, func(i, j int) bool {
		si, sj := rrf[merged[i].Path], rrf[merged[j].Path]
		if si != sj {
			return si > sj
		}
		return merged[i].Path < merged[j].Path
	})
	return merged, nil
}

// pathInScope tells whether relPath lies under the directory selected by
// the wing/hall filter: "wiki/<sanitised wing>/", extended with "<hall>/"
// when hall is non-empty. The check is a plain string-prefix comparison
// on the slash-separated path.
func pathInScope(relPath, wing, hall string) bool {
	segments := []string{"wiki", brain.Sanitise(wing)}
	if hall != "" {
		segments = append(segments, hall)
	}
	// The trailing slash keeps "wiki/foo/" from matching "wiki/foobar/...".
	return strings.HasPrefix(relPath, strings.Join(segments, "/")+"/")
}

// hydrate loads the note at relPath (slash-separated, relative to
// brainDir) and builds a Result carrying its path, title, excerpt, wing
// and hall. Score is left at zero for the caller to fill in. It serves
// notes that were found by the vector search but not by the lexical walk.
//
// The read error is returned unwrapped when the file cannot be read.
func hydrate(brainDir, relPath string) (Result, error) {
	raw, err := os.ReadFile(filepath.Join(brainDir, filepath.FromSlash(relPath)))
	if err != nil {
		return Result{}, err
	}
	text := string(raw)

	var res Result
	res.Path = relPath
	res.Title = extractTitle(text, filepath.Base(relPath))
	res.Excerpt = excerpt(text, 300)
	res.Wing, res.Hall = extractWingHall(text, relPath)
	return res, nil
}

// resolveRoots maps the wing/hall filters to the list of directories the
// lexical walk should visit.
//
//   - no wing, no hall: brainDir/knowledge and brainDir/wiki
//   - wing only:        brainDir/wiki/<sanitised wing>
//   - wing and hall:    brainDir/wiki/<sanitised wing>/<hall>
//
// An error is returned when hall is not in the closed vocabulary
// (brain.IsValidHall), when wing sanitises to the empty string, or when
// hall is given without a wing. The hall vocabulary check runs first.
func resolveRoots(brainDir, wing, hall string) ([]string, error) {
	hallSet := hall != ""
	if hallSet && !brain.IsValidHall(hall) {
		return nil, fmt.Errorf("invalid hall %q", hall)
	}

	if wing == "" {
		if hallSet {
			return nil, fmt.Errorf("hall filter requires wing")
		}
		legacy := []string{
			filepath.Join(brainDir, "knowledge"),
			filepath.Join(brainDir, "wiki"),
		}
		return legacy, nil
	}

	clean := brain.Sanitise(wing)
	if clean == "" {
		return nil, fmt.Errorf("invalid wing %q", wing)
	}
	segments := []string{brainDir, "wiki", clean}
	if hallSet {
		segments = append(segments, hall)
	}
	return []string{filepath.Join(segments...)}, nil
}

// extractWingHall determines a note's wing and hall. Frontmatter wins:
// between the first and second lines that consist solely of "---", the
// values of "wing:" and "hall:" keys are taken (surrounding whitespace
// and quote characters stripped). Anything still empty afterwards is
// filled from the path when it has the shape wiki/<wing>/<hall>/... with
// at least four segments; the path hall is only accepted if it is in the
// closed hall vocabulary.
func extractWingHall(content, relPath string) (wing, hall string) {
	sc := bufio.NewScanner(strings.NewReader(content))
	// fences counts "---" delimiter lines seen so far: 0 = before the
	// frontmatter, 1 = inside it, 2 = past it (stop scanning).
	fences := 0
	for fences < 2 && sc.Scan() {
		text := sc.Text()
		if strings.TrimSpace(text) == "---" {
			fences++
			continue
		}
		if fences == 0 {
			continue
		}
		name, raw, found := strings.Cut(text, ":")
		if !found {
			continue
		}
		value := strings.Trim(strings.TrimSpace(raw), `"'`)
		if field := strings.TrimSpace(name); field == "wing" {
			wing = value
		} else if field == "hall" {
			hall = value
		}
	}

	if wing == "" || hall == "" {
		segs := strings.Split(relPath, "/")
		if len(segs) >= 4 && segs[0] == "wiki" {
			if wing == "" {
				wing = segs[1]
			}
			if hall == "" && brain.IsValidHall(segs[2]) {
				hall = segs[2]
			}
		}
	}
	return wing, hall
}

// extractTitle returns the note's title. It looks for a "title:" key
// between the first and second lines that consist solely of "---" and
// returns its value with surrounding whitespace and quote characters
// stripped. When no such key is found, the title falls back to filename
// with a trailing ".md" removed.
func extractTitle(content, filename string) string {
	fallback := strings.TrimSuffix(filename, ".md")
	opened := false
	for sc := bufio.NewScanner(strings.NewReader(content)); sc.Scan(); {
		text := sc.Text()
		isFence := strings.TrimSpace(text) == "---"
		switch {
		case isFence && opened:
			// Closing delimiter: the frontmatter had no title.
			return fallback
		case isFence:
			opened = true
		case opened:
			if k, v, found := strings.Cut(text, ":"); found && strings.TrimSpace(k) == "title" {
				return strings.Trim(strings.TrimSpace(v), `"'`)
			}
		}
	}
	return fallback
}

// excerpt returns the beginning of a note's body, limited to at most
// maxLen bytes, with "…" appended when anything was cut off.
//
// If content contains at least two "---" markers, everything up to and
// including the second marker is treated as frontmatter and dropped, and
// the remainder is whitespace-trimmed; otherwise content is used as-is.
// The split is on the raw "---" substring, not on whole delimiter lines.
//
// Truncation never splits a multi-byte UTF-8 sequence: when the byte at
// the cut position is a continuation byte, the cut moves back to the
// start of that rune, so the result may be up to three bytes shorter
// than maxLen. A negative maxLen is treated as zero.
func excerpt(content string, maxLen int) string {
	body := content
	if parts := strings.SplitN(content, "---", 3); len(parts) == 3 {
		body = strings.TrimSpace(parts[2])
	}
	if maxLen < 0 {
		maxLen = 0
	}
	if len(body) <= maxLen {
		return body
	}

	// body[cut] is the first byte that would be dropped. If it is a UTF-8
	// continuation byte (bit pattern 10xxxxxx) the cut sits inside a rune,
	// so step back to that rune's lead byte. A rune has at most three
	// continuation bytes, which bounds the back-off even on malformed input.
	cut := maxLen
	for back := 0; back < 3 && cut > 0 && body[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return body[:cut] + "…"
}