From 5b207425edc1e4825be93f031c17170e311ff616 Mon Sep 17 00:00:00 2001 From: Mathias Bergqvist Date: Fri, 8 May 2026 16:39:42 +0200 Subject: [PATCH] refactor(routing): rename local/claude to fast/thinking model pair MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The routing decision is about reasoning capacity, not cost or provider. Fast model (koala/qwen35-9b-fast) handles high-pass-rate calls; thinking model (iguana/gemma4-26b) handles low-pass-rate calls. Removes the implicit Anthropic dependency from the routing pod — both models go through LiteLLM. Renames: HYPERGUILD_LOCAL_MODEL → HYPERGUILD_FAST_MODEL, HYPERGUILD_CLAUDE_MODEL → HYPERGUILD_THINKING_MODEL, Router.LocalModel → FastModel, Router.ClaudeModel → ThinkingModel, log decision "claude_fallback" → "thinking_fallback". Co-Authored-By: Claude Sonnet 4.6 --- .aider.conventions.md | 3 +++ .context/system-prompt.txt | 3 +++ .cursorrules | 3 +++ AGENTS.md | 3 +++ README.md | 10 +++---- cmd/routing/main.go | 22 ++++++++-------- internal/config/routing.go | 8 +++--- internal/config/routing_test.go | 14 +++++----- internal/routing/log.go | 2 +- internal/routing/router.go | 16 ++++++------ internal/routing/router_test.go | 46 ++++++++++++++++----------------- 11 files changed, 71 insertions(+), 59 deletions(-) diff --git a/.aider.conventions.md b/.aider.conventions.md index 43d4fbf..b65b8b0 100644 --- a/.aider.conventions.md +++ b/.aider.conventions.md @@ -36,6 +36,9 @@ These rules apply to every task across every project, regardless of harness. 4. **Goal-driven execution.** Define clear success criteria up front for every task. Loop — implement, verify, refine — until those criteria are met. Don't claim completion without evidence (tests pass, command output, observed behavior). +5. **Branch-per-task for multi-agent repos.** When another agent may be active on + the same repo, create a branch (`agent/`), commit there, and open a + PR. 
Do not merge without explicit instruction from Mathias. ## Default stack diff --git a/.context/system-prompt.txt b/.context/system-prompt.txt index 0353a6c..1990200 100644 --- a/.context/system-prompt.txt +++ b/.context/system-prompt.txt @@ -41,6 +41,9 @@ These rules apply to every task across every project, regardless of harness. 4. **Goal-driven execution.** Define clear success criteria up front for every task. Loop — implement, verify, refine — until those criteria are met. Don't claim completion without evidence (tests pass, command output, observed behavior). +5. **Branch-per-task for multi-agent repos.** When another agent may be active on + the same repo, create a branch (`agent/`), commit there, and open a + PR. Do not merge without explicit instruction from Mathias. ## Default stack diff --git a/.cursorrules b/.cursorrules index bd23f96..7f335ff 100644 --- a/.cursorrules +++ b/.cursorrules @@ -39,6 +39,9 @@ These rules apply to every task across every project, regardless of harness. 4. **Goal-driven execution.** Define clear success criteria up front for every task. Loop — implement, verify, refine — until those criteria are met. Don't claim completion without evidence (tests pass, command output, observed behavior). +5. **Branch-per-task for multi-agent repos.** When another agent may be active on + the same repo, create a branch (`agent/`), commit there, and open a + PR. Do not merge without explicit instruction from Mathias. ## Default stack diff --git a/AGENTS.md b/AGENTS.md index 43d4fbf..b65b8b0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -36,6 +36,9 @@ These rules apply to every task across every project, regardless of harness. 4. **Goal-driven execution.** Define clear success criteria up front for every task. Loop — implement, verify, refine — until those criteria are met. Don't claim completion without evidence (tests pass, command output, observed behavior). +5. 
**Branch-per-task for multi-agent repos.** When another agent may be active on + the same repo, create a branch (`agent/`), commit there, and open a + PR. Do not merge without explicit instruction from Mathias. ## Default stack diff --git a/README.md b/README.md index 59c7d74..12a0d8f 100644 --- a/README.md +++ b/README.md @@ -116,13 +116,13 @@ The supervisor probes connectivity at call time: | `ROUTING_PORT` | `3210` | Routing pod's listen port | | `ROUTING_MCP_TOKEN` | — | Optional bearer token for the routing MCP HTTP endpoint | | `BRAIN_URL` | `http://ingestion.supervisor:3300` | Routing pod → brain (in-cluster) | -| `HYPERGUILD_LOCAL_MODEL` | `qwen35` | Local model for routed-to-local skill calls | -| `HYPERGUILD_CLAUDE_MODEL` | `claude-sonnet-4-6` | Claude model for routed-to-Claude skill calls | -| `HYPERGUILD_ROUTE_LOCAL_FLOOR` | `0.90` | At/above pass rate, route to local | -| `HYPERGUILD_ROUTE_LOCAL_CEIL` | `0.70` | Below pass rate, route to Claude. Between CEIL and FLOOR is the sample band. | +| `HYPERGUILD_FAST_MODEL` | `koala/qwen35-9b-fast` | Fast model for high-pass-rate skill calls | +| `HYPERGUILD_THINKING_MODEL` | `iguana/gemma4-26b` | Thinking model for low-pass-rate skill calls | +| `HYPERGUILD_ROUTE_LOCAL_FLOOR` | `0.90` | At/above pass rate, route to fast model | +| `HYPERGUILD_ROUTE_LOCAL_CEIL` | `0.70` | Below pass rate, route to thinking model. Between CEIL and FLOOR is the sample band. | | `HYPERGUILD_PASS_RATE_TTL_SECONDS` | `60` | Per-skill pass-rate cache TTL | -> **Operator note:** LiteLLM at `LITELLM_BASE_URL` must register both `HYPERGUILD_LOCAL_MODEL` and `HYPERGUILD_CLAUDE_MODEL` for routing to do useful work. If a model is missing, LiteLLM returns 4xx, the routing pod's local route fails, the fail-open retry on Claude likely also fails (since both are missing), and the only signal is `final_status: "fail"` on `_routing` entries in the brain. 
+> **Operator note:** LiteLLM at `LITELLM_BASE_URL` must register both `HYPERGUILD_FAST_MODEL` and `HYPERGUILD_THINKING_MODEL` for routing to do useful work. If either model is missing, LiteLLM returns 4xx for calls to it; when both are missing, the routing pod's fast route fails, the fail-open retry on the thinking model also fails, and the only signal is `final_status: "fail"` on `_routing` entries in the brain. ## Phase 2 (planned) diff --git a/cmd/routing/main.go b/cmd/routing/main.go index 9d942b5..1c04ffa 100644 --- a/cmd/routing/main.go +++ b/cmd/routing/main.go @@ -48,12 +48,12 @@ func main() { llm := iexec.NewLiteLLM(cfg.LiteLLMBaseURL, cfg.LiteLLMAPIKey, 0) router := &routing.Router{ - Fetcher: routing.NewFetcher(cfg.BrainURL, "7d", time.Duration(cfg.PassRateTTLSeconds)*time.Second), - Logger: routing.NewLogger(cfg.BrainURL), - Policy: routing.Policy{Floor: cfg.RouteLocalFloor, Ceil: cfg.RouteLocalCeil}, - LocalModel: cfg.LocalModel, - ClaudeModel: cfg.ClaudeModel, - Complete: llm.Complete, + Fetcher: routing.NewFetcher(cfg.BrainURL, "7d", time.Duration(cfg.PassRateTTLSeconds)*time.Second), + Logger: routing.NewLogger(cfg.BrainURL), + Policy: routing.Policy{Floor: cfg.RouteLocalFloor, Ceil: cfg.RouteLocalCeil}, + FastModel: cfg.FastModel, + ThinkingModel: cfg.ThinkingModel, + Complete: llm.Complete, } // Skill packages call CompleteFunc(ctx, model, system, user) — no session_id @@ -78,23 +78,23 @@ func main() { reg := registry.New() reg.Register(review.New(review.Config{ SkillPrompt: mustRead("review.md"), - DefaultModel: cfg.LocalModel, + DefaultModel: cfg.FastModel, CompleteFunc: review.CompleteFunc(wrap("review")), })) reg.Register(debug.New(debug.Config{ SkillPrompt: mustRead("debug.md"), - DefaultModel: cfg.LocalModel, + DefaultModel: cfg.FastModel, CompleteFunc: debug.CompleteFunc(wrap("debug")), })) reg.Register(retrospective.New(retrospective.Config{ SkillPrompt: mustRead("retrospective.md"), - DefaultModel: cfg.LocalModel, + DefaultModel: cfg.FastModel, 
CompleteFunc: retrospective.CompleteFunc(wrap("retrospective")), })) reg.Register(trainer.New(trainer.Config{ ReaderPrompt: mustRead("trainer-reader.md"), WriterPrompt: mustRead("trainer-writer.md"), - DefaultModel: cfg.LocalModel, + DefaultModel: cfg.FastModel, CompleteFunc: trainer.CompleteFunc(wrap("trainer")), })) @@ -107,7 +107,7 @@ func main() { addr := ":" + cfg.Port logger.Info("routing pod starting", "addr", addr, - "local", cfg.LocalModel, "claude", cfg.ClaudeModel, + "fast", cfg.FastModel, "thinking", cfg.ThinkingModel, "floor", cfg.RouteLocalFloor, "ceil", cfg.RouteLocalCeil) if err := http.ListenAndServe(addr, mux); err != nil { //nolint:gosec logger.Error("server stopped", "err", err) diff --git a/internal/config/routing.go b/internal/config/routing.go index 6694933..dee477e 100644 --- a/internal/config/routing.go +++ b/internal/config/routing.go @@ -14,8 +14,8 @@ type RoutingConfig struct { LiteLLMBaseURL string // LITELLM_BASE_URL, default http://piguard:4000 LiteLLMAPIKey string // LITELLM_API_KEY BrainURL string // BRAIN_URL, default http://ingestion.supervisor:3300 - LocalModel string // HYPERGUILD_LOCAL_MODEL, default qwen35 - ClaudeModel string // HYPERGUILD_CLAUDE_MODEL, default claude-sonnet-4-6 + FastModel string // HYPERGUILD_FAST_MODEL, default koala/qwen35-9b-fast + ThinkingModel string // HYPERGUILD_THINKING_MODEL, default iguana/gemma4-26b // RouteLocalFloor and RouteLocalCeil intentionally invert the usual // floor < ceil mathematical convention: Floor (default 0.90) is the // UPPER boundary — at/above it, always route local; Ceil (default 0.70) @@ -34,8 +34,8 @@ func LoadRouting() (RoutingConfig, error) { LiteLLMBaseURL: envOr("LITELLM_BASE_URL", "http://piguard:4000"), LiteLLMAPIKey: os.Getenv("LITELLM_API_KEY"), BrainURL: envOr("BRAIN_URL", "http://ingestion.supervisor:3300"), - LocalModel: envOr("HYPERGUILD_LOCAL_MODEL", "qwen35"), - ClaudeModel: envOr("HYPERGUILD_CLAUDE_MODEL", "claude-sonnet-4-6"), + FastModel: 
envOr("HYPERGUILD_FAST_MODEL", "koala/qwen35-9b-fast"), + ThinkingModel: envOr("HYPERGUILD_THINKING_MODEL", "iguana/gemma4-26b"), } floor, err := parseFloatEnv("HYPERGUILD_ROUTE_LOCAL_FLOOR", 0.90) diff --git a/internal/config/routing_test.go b/internal/config/routing_test.go index 70b7449..dd14f0d 100644 --- a/internal/config/routing_test.go +++ b/internal/config/routing_test.go @@ -11,7 +11,7 @@ import ( func TestLoadRoutingDefaults(t *testing.T) { for _, k := range []string{ "ROUTING_PORT", "ROUTING_MCP_TOKEN", "LITELLM_BASE_URL", "LITELLM_API_KEY", - "BRAIN_URL", "HYPERGUILD_LOCAL_MODEL", "HYPERGUILD_CLAUDE_MODEL", + "BRAIN_URL", "HYPERGUILD_FAST_MODEL", "HYPERGUILD_THINKING_MODEL", "HYPERGUILD_ROUTE_LOCAL_FLOOR", "HYPERGUILD_ROUTE_LOCAL_CEIL", "HYPERGUILD_PASS_RATE_TTL_SECONDS", } { @@ -24,8 +24,8 @@ func TestLoadRoutingDefaults(t *testing.T) { assert.Equal(t, "", cfg.MCPAuthToken) assert.Equal(t, "http://piguard:4000", cfg.LiteLLMBaseURL) assert.Equal(t, "http://ingestion.supervisor:3300", cfg.BrainURL) - assert.Equal(t, "qwen35", cfg.LocalModel) - assert.Equal(t, "claude-sonnet-4-6", cfg.ClaudeModel) + assert.Equal(t, "koala/qwen35-9b-fast", cfg.FastModel) + assert.Equal(t, "iguana/gemma4-26b", cfg.ThinkingModel) assert.InDelta(t, 0.90, cfg.RouteLocalFloor, 1e-9) assert.InDelta(t, 0.70, cfg.RouteLocalCeil, 1e-9) assert.Equal(t, 60, cfg.PassRateTTLSeconds) @@ -38,8 +38,8 @@ func TestLoadRoutingFromEnv(t *testing.T) { t.Setenv("LITELLM_BASE_URL", "http://localhost:4000") t.Setenv("LITELLM_API_KEY", "lk") t.Setenv("BRAIN_URL", "http://localhost:3300") - t.Setenv("HYPERGUILD_LOCAL_MODEL", "qwen2-7b") - t.Setenv("HYPERGUILD_CLAUDE_MODEL", "claude-opus-4-7") + t.Setenv("HYPERGUILD_FAST_MODEL", "koala/phi4-14b") + t.Setenv("HYPERGUILD_THINKING_MODEL", "iguana/qwen3-14b-think") t.Setenv("HYPERGUILD_ROUTE_LOCAL_FLOOR", "0.85") t.Setenv("HYPERGUILD_ROUTE_LOCAL_CEIL", "0.65") t.Setenv("HYPERGUILD_PASS_RATE_TTL_SECONDS", "30") @@ -51,8 +51,8 @@ func 
TestLoadRoutingFromEnv(t *testing.T) { assert.Equal(t, "http://localhost:4000", cfg.LiteLLMBaseURL) assert.Equal(t, "lk", cfg.LiteLLMAPIKey) assert.Equal(t, "http://localhost:3300", cfg.BrainURL) - assert.Equal(t, "qwen2-7b", cfg.LocalModel) - assert.Equal(t, "claude-opus-4-7", cfg.ClaudeModel) + assert.Equal(t, "koala/phi4-14b", cfg.FastModel) + assert.Equal(t, "iguana/qwen3-14b-think", cfg.ThinkingModel) assert.InDelta(t, 0.85, cfg.RouteLocalFloor, 1e-9) assert.InDelta(t, 0.65, cfg.RouteLocalCeil, 1e-9) assert.Equal(t, 30, cfg.PassRateTTLSeconds) diff --git a/internal/routing/log.go b/internal/routing/log.go index 5a964de..0717035 100644 --- a/internal/routing/log.go +++ b/internal/routing/log.go @@ -13,7 +13,7 @@ import ( type LogEntry struct { SessionID string Skill string // the original skill the call routed (e.g., "review") - Decision string // "local" or "claude" or "claude_fallback" + Decision string // "local" or "thinking" or "thinking_fallback" Message string // free-form, e.g. 
"model=qwen35, pass_rate=0.94" ProjectRoot string DurationMs int64 diff --git a/internal/routing/router.go b/internal/routing/router.go index b5615c6..f42e838 100644 --- a/internal/routing/router.go +++ b/internal/routing/router.go @@ -24,8 +24,8 @@ type Router struct { Fetcher *Fetcher Logger *Logger Policy Policy - LocalModel string - ClaudeModel string + FastModel string + ThinkingModel string Complete CompleteFunc } @@ -40,9 +40,9 @@ func (r *Router) Run(ctx context.Context, in RunInput) (string, int64, error) { hash := CanonicalHash(in.System, in.User) decision := r.Policy.Decide(pr, hash) - model := r.ClaudeModel + model := r.ThinkingModel if decision == DecideLocal { - model = r.LocalModel + model = r.FastModel } out, ms, err := r.Complete(ctx, model, in.System, in.User) @@ -59,13 +59,13 @@ func (r *Router) Run(ctx context.Context, in RunInput) (string, int64, error) { } if err != nil && decision == DecideLocal { - slog.Warn("router: local failed, falling open to claude", "skill", in.Skill, "err", err) - out, ms, err = r.Complete(ctx, r.ClaudeModel, in.System, in.User) + slog.Warn("router: fast failed, falling open to thinking model", "skill", in.Skill, "err", err) + out, ms, err = r.Complete(ctx, r.ThinkingModel, in.System, in.User) if lerr := r.Logger.LogDecision(ctx, LogEntry{ SessionID: in.SessionID, Skill: in.Skill, - Decision: "claude_fallback", - Message: fmt.Sprintf("model=%s, after-local-error", r.ClaudeModel), + Decision: "thinking_fallback", + Message: fmt.Sprintf("model=%s, after-fast-error", r.ThinkingModel), ProjectRoot: in.ProjectRoot, DurationMs: ms, Failed: err != nil, diff --git a/internal/routing/router_test.go b/internal/routing/router_test.go index 20f6ceb..e5c26b4 100644 --- a/internal/routing/router_test.go +++ b/internal/routing/router_test.go @@ -49,12 +49,12 @@ func newRouter(t *testing.T, llm *fakeLLM, passRate float64) (*routing.Router, * t.Cleanup(brain.Close) r := &routing.Router{ - Fetcher: routing.NewFetcher(brain.URL, "7d", 
time.Minute), - Logger: routing.NewLogger(brain.URL), - Policy: routing.Policy{Floor: 0.9, Ceil: 0.7}, - LocalModel: "qwen35", - ClaudeModel: "claude-sonnet-4-6", - Complete: llm.Complete, + Fetcher: routing.NewFetcher(brain.URL, "7d", time.Minute), + Logger: routing.NewLogger(brain.URL), + Policy: routing.Policy{Floor: 0.9, Ceil: 0.7}, + FastModel: "koala/qwen35-9b-fast", + ThinkingModel: "iguana/gemma4-26b", + Complete: llm.Complete, } return r, brain, brain } @@ -72,10 +72,10 @@ func TestRouterRoutesLocalAtHighPassRate(t *testing.T) { llm.mu.Lock() defer llm.mu.Unlock() require.Len(t, llm.calls, 1) - assert.Equal(t, "qwen35", llm.calls[0].Model) + assert.Equal(t, "koala/qwen35-9b-fast", llm.calls[0].Model) } -func TestRouterRoutesClaudeAtLowPassRate(t *testing.T) { +func TestRouterRoutesThinkingAtLowPassRate(t *testing.T) { llm := &fakeLLM{resp: "ok"} r, _, _ := newRouter(t, llm, 0.3) @@ -87,12 +87,12 @@ func TestRouterRoutesClaudeAtLowPassRate(t *testing.T) { llm.mu.Lock() defer llm.mu.Unlock() require.Len(t, llm.calls, 1) - assert.Equal(t, "claude-sonnet-4-6", llm.calls[0].Model) + assert.Equal(t, "iguana/gemma4-26b", llm.calls[0].Model) } -func TestRouterFailsOpenLocalErrorToClaude(t *testing.T) { - llm := &fakeLLM{resp: "ok-after-fallback", err: errors.New("local boom"), errOn: "qwen35"} - r, _, _ := newRouter(t, llm, 0.95) // would route local +func TestRouterFailsOpenFastErrorToThinking(t *testing.T) { + llm := &fakeLLM{resp: "ok-after-fallback", err: errors.New("fast boom"), errOn: "koala/qwen35-9b-fast"} + r, _, _ := newRouter(t, llm, 0.95) // would route fast out, _, err := r.Run(context.Background(), routing.RunInput{ Skill: "review", System: "sys", User: "user", SessionID: "s3", @@ -103,12 +103,12 @@ func TestRouterFailsOpenLocalErrorToClaude(t *testing.T) { llm.mu.Lock() defer llm.mu.Unlock() require.Len(t, llm.calls, 2) - assert.Equal(t, "qwen35", llm.calls[0].Model) - assert.Equal(t, "claude-sonnet-4-6", llm.calls[1].Model) + assert.Equal(t, 
"koala/qwen35-9b-fast", llm.calls[0].Model) + assert.Equal(t, "iguana/gemma4-26b", llm.calls[1].Model) } -func TestRouterDefaultsToLocalWhenBrainUnreachable(t *testing.T) { - // Brain returns 500 → fetcher errors → router treats pass rate as nil → local. +func TestRouterDefaultsToFastWhenBrainUnreachable(t *testing.T) { + // Brain returns 500 → fetcher errors → router treats pass rate as nil → fast. brain := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { http.Error(w, "down", http.StatusInternalServerError) })) @@ -116,12 +116,12 @@ func TestRouterDefaultsToLocalWhenBrainUnreachable(t *testing.T) { llm := &fakeLLM{resp: "ok"} r := &routing.Router{ - Fetcher: routing.NewFetcher(brain.URL, "7d", time.Minute), - Logger: routing.NewLogger(brain.URL), - Policy: routing.Policy{Floor: 0.9, Ceil: 0.7}, - LocalModel: "qwen35", - ClaudeModel: "claude-sonnet-4-6", - Complete: llm.Complete, + Fetcher: routing.NewFetcher(brain.URL, "7d", time.Minute), + Logger: routing.NewLogger(brain.URL), + Policy: routing.Policy{Floor: 0.9, Ceil: 0.7}, + FastModel: "koala/qwen35-9b-fast", + ThinkingModel: "iguana/gemma4-26b", + Complete: llm.Complete, } _, _, err := r.Run(context.Background(), routing.RunInput{ @@ -132,5 +132,5 @@ func TestRouterDefaultsToLocalWhenBrainUnreachable(t *testing.T) { llm.mu.Lock() defer llm.mu.Unlock() require.Len(t, llm.calls, 1) - assert.Equal(t, "qwen35", llm.calls[0].Model) + assert.Equal(t, "koala/qwen35-9b-fast", llm.calls[0].Model) }