10 Commits

Author SHA1 Message Date
Mathias Bergqvist
6328766c7f fix(main): re-evaluate chain per call to respect caller model override
All checks were successful
CI / Lint / Test / Vet (push) Successful in 1m8s
CI / Mirror to GitHub (push) Has been skipped
buildOrch now returns a closure instead of *Orchestrator. Each invocation
calls models.ChainFor(skill, req.Model) so a non-empty caller override
collapses to a single-entry chain (no escalation). The attempts slice is
also allocated fresh per call, preventing unbounded growth across requests.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 11:34:09 +02:00
Mathias Bergqvist
f1deedd39d feat(main): wire per-skill Orchestrators replacing single executor.Run
Each skill now gets its own Orchestrator built from its ChainFor entry,
with LiteLLM for local tiers and Claude for cloud tiers. Removes the
defunct models.Resolve calls and single shared executor.Run pattern.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 11:28:10 +02:00
Mathias Bergqvist
5cb272a869 feat(exec): add Orchestrator chain walker with verification and warm-state logging 2026-04-20 11:06:13 +02:00
Mathias Bergqvist
e96b39a812 feat(exec): add Claude verifier for local model output quality gate 2026-04-20 11:02:59 +02:00
Mathias Bergqvist
5db5b33cd7 feat(exec): add LiteLLM HTTP executor for local model dispatch 2026-04-20 10:46:08 +02:00
Mathias Bergqvist
a32457b5bc feat(exec): pass --model flag to claude subprocess for cloud-tier dispatch 2026-04-20 08:55:03 +02:00
Mathias Bergqvist
e0be5f0f98 feat(config): replace single-model config with chain-based routing
Implements escalation chains per skill with three-layer priority:
1. Caller override (model param) — no escalation
2. Per-skill chain from models.yaml
3. default_chain fallback

New APIs:
- Verifier() — fixed verifier for output validation
- LlamaSwapURL() — base URL for warm-state probing
- ChainFor(skill, override) — ordered model list for escalation

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 08:48:33 +02:00
Mathias Bergqvist
6d410b810b feat(session): extend Attempt with tier, timing, and verdict fields 2026-04-20 08:35:27 +02:00
Mathias Bergqvist
76f195de2a docs: model orchestration design spec for Phase 3
All checks were successful
CI / Lint / Test / Vet (push) Successful in 1m8s
CI / Mirror to GitHub (push) Successful in 4s
2026-04-20 07:45:32 +02:00
Mathias Bergqvist
f901d4e67d fix(ci): use --follow-tags instead of --tags to avoid re-pushing existing tags
All checks were successful
CI / Lint / Test / Vet (push) Successful in 1m9s
CI / Mirror to GitHub (push) Successful in 3s
2026-04-19 19:16:22 +02:00
16 changed files with 1293 additions and 61 deletions

View File

@@ -53,6 +53,6 @@ jobs:
chmod 600 ~/.ssh/id_rsa_gh_mirror chmod 600 ~/.ssh/id_rsa_gh_mirror
ssh-keyscan github.com >> ~/.ssh/known_hosts 2>/dev/null ssh-keyscan github.com >> ~/.ssh/known_hosts 2>/dev/null
GIT_SSH_COMMAND="ssh -i ~/.ssh/id_rsa_gh_mirror -o IdentitiesOnly=yes" \ GIT_SSH_COMMAND="ssh -i ~/.ssh/id_rsa_gh_mirror -o IdentitiesOnly=yes" \
git push git@github.com:mathiasb/hyperguild.git HEAD:main --tags git push git@github.com:mathiasb/hyperguild.git HEAD:main --follow-tags
rm ~/.ssh/id_rsa_gh_mirror rm ~/.ssh/id_rsa_gh_mirror
echo "✓ Mirrored to GitHub" echo "✓ Mirrored to GitHub"

View File

@@ -84,11 +84,26 @@ func main() {
os.Exit(1) os.Exit(1)
} }
executor := iexec.New(iexec.Config{ claudeExec := iexec.New(iexec.Config{
SystemPrompt: string(systemPrompt), SystemPrompt: string(systemPrompt),
LiteLLMBaseURL: cfg.LiteLLMBaseURL, LiteLLMBaseURL: cfg.LiteLLMBaseURL,
LiteLLMAPIKey: cfg.LiteLLMAPIKey, LiteLLMAPIKey: cfg.LiteLLMAPIKey,
}) })
litellmExec := iexec.NewLiteLLM(cfg.LiteLLMBaseURL, cfg.LiteLLMAPIKey, 0)
verifier := iexec.NewVerifier("", models.Verifier(), 0)
buildOrch := func(skill string) func(ctx context.Context, req iexec.Request) (iexec.Result, error) {
return func(ctx context.Context, req iexec.Request) (iexec.Result, error) {
rawChain := models.ChainFor(skill, req.Model)
chain := make([]iexec.ChainEntry, len(rawChain))
for i, m := range rawChain {
chain[i] = iexec.EntryFor(m)
}
attempts := make([]iexec.AttemptRecord, 0, len(chain))
orch := iexec.NewOrchestrator(chain, litellmExec.Run, claudeExec.Run, verifier, models.LlamaSwapURL(), &attempts)
return orch.Run(ctx, req)
}
}
tierFn := func(ctx context.Context) tier.Info { tierFn := func(ctx context.Context) tier.Info {
return tier.Detect(ctx, "https://api.anthropic.com", cfg.LiteLLMBaseURL) return tier.Detect(ctx, "https://api.anthropic.com", cfg.LiteLLMBaseURL)
@@ -98,8 +113,8 @@ func main() {
reg.Register(tdd.New(tdd.Config{ reg.Register(tdd.New(tdd.Config{
SystemPrompt: string(systemPrompt), SystemPrompt: string(systemPrompt),
SkillPrompt: string(tddPrompt), SkillPrompt: string(tddPrompt),
DefaultModel: models.Resolve("tdd", ""), DefaultModel: models.ChainFor("tdd", "")[0],
ExecutorFn: executor.Run, ExecutorFn: buildOrch("tdd"),
SessionsDir: cfg.SessionsDir, SessionsDir: cfg.SessionsDir,
})) }))
reg.Register(brain.New(brain.Config{ reg.Register(brain.New(brain.Config{
@@ -113,33 +128,33 @@ func main() {
})) }))
reg.Register(retrospective.New(retrospective.Config{ reg.Register(retrospective.New(retrospective.Config{
SkillPrompt: string(retroPrompt), SkillPrompt: string(retroPrompt),
DefaultModel: models.Resolve("retrospective", ""), DefaultModel: models.ChainFor("retrospective", "")[0],
SessionsDir: cfg.SessionsDir, SessionsDir: cfg.SessionsDir,
ExecutorFn: executor.Run, ExecutorFn: buildOrch("retrospective"),
})) }))
reg.Register(review.New(review.Config{ reg.Register(review.New(review.Config{
SkillPrompt: string(reviewPrompt), SkillPrompt: string(reviewPrompt),
DefaultModel: models.Resolve("review", ""), DefaultModel: models.ChainFor("review", "")[0],
ExecutorFn: executor.Run, ExecutorFn: buildOrch("review"),
SessionsDir: cfg.SessionsDir, SessionsDir: cfg.SessionsDir,
})) }))
reg.Register(skilldebug.New(skilldebug.Config{ reg.Register(skilldebug.New(skilldebug.Config{
SkillPrompt: string(debugPrompt), SkillPrompt: string(debugPrompt),
DefaultModel: models.Resolve("debug", ""), DefaultModel: models.ChainFor("debug", "")[0],
ExecutorFn: executor.Run, ExecutorFn: buildOrch("debug"),
SessionsDir: cfg.SessionsDir, SessionsDir: cfg.SessionsDir,
})) }))
reg.Register(spec.New(spec.Config{ reg.Register(spec.New(spec.Config{
SkillPrompt: string(specPrompt), SkillPrompt: string(specPrompt),
DefaultModel: models.Resolve("spec", ""), DefaultModel: models.ChainFor("spec", "")[0],
ExecutorFn: executor.Run, ExecutorFn: buildOrch("spec"),
SessionsDir: cfg.SessionsDir, SessionsDir: cfg.SessionsDir,
})) }))
reg.Register(trainer.New(trainer.Config{ reg.Register(trainer.New(trainer.Config{
ReaderPrompt: string(trainerReaderPrompt), ReaderPrompt: string(trainerReaderPrompt),
WriterPrompt: string(trainerWriterPrompt), WriterPrompt: string(trainerWriterPrompt),
DefaultModel: models.Resolve("trainer", ""), DefaultModel: models.ChainFor("trainer", "")[0],
ExecutorFn: executor.Run, ExecutorFn: buildOrch("trainer"),
SessionsDir: cfg.SessionsDir, SessionsDir: cfg.SessionsDir,
BrainDir: cfg.BrainDir, BrainDir: cfg.BrainDir,
})) }))

View File

@@ -1,13 +1,41 @@
# Model routing table — three-layer priority: # Model routing chains — three-layer priority:
# 1. model param in MCP tool call (caller override) # 1. model param in MCP tool call (caller override — collapses to single entry, no escalation)
# 2. per-skill entry here # 2. per-skill chain here
# 3. default (fallback) # 3. default_chain fallback
default: ollama/qwen3-coder-30b-tuned
verifier: claude-sonnet-4-6 # fixed verifier for all local tiers
llama_swap_url: http://koala:8080 # for warm-state probing
default_chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-6
skills: skills:
tdd: ollama/qwen3-coder-30b-tuned tdd:
review: ollama/devstral-tuned chain:
debug: ollama/deepseek-r1-tuned - ollama/qwen3-coder-30b-tuned
retrospective: ollama/qwen3-coder-30b-tuned - claude-sonnet-4-6
spec: ollama/qwen3-coder-30b-tuned review:
trainer: ollama/qwen3-coder-30b-tuned chain:
- ollama/devstral-tuned
- ollama/gemma4
- claude-sonnet-4-6
debug:
chain:
- ollama/deepseek-r1-tuned
- claude-sonnet-4-6
spec:
chain:
- ollama/phi4
- ollama/gemma4
- claude-sonnet-4-6
- claude-opus-4-6
retrospective:
chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-6
trainer:
chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-6

View File

@@ -0,0 +1,322 @@
# Model Orchestration Design
**Date:** 2026-04-20
**Status:** Approved for implementation
## Problem statement
The hyperguild supervisor currently spawns a `claude --print` subprocess for every skill call. The model routing config (`models.yaml`) exists but is dead weight — the model name is injected as text into the task prompt and ignored. Every skill call costs Claude tokens regardless of task complexity or data sensitivity.
## Goal
Route skill work to the most appropriate model — weighing cost, latency, and quality — with Claude acting as the real supervisor: verifying outputs and deciding when to escalate. Local models on owned hardware handle the common case; Claude escalates through a chain to frontier models only when local quality is insufficient.
## Success criteria
- [ ] Each skill dispatches generation to its configured local model via LiteLLM by default
- [ ] Claude verifies every local output and either accepts or escalates
- [ ] Escalation walks a per-skill chain (local small → local large → Sonnet → Opus) with one attempt per tier
- [ ] Every attempt (model, tier, duration, warm state, verdict) is logged in the session JSONL
- [ ] Cloud tiers (Sonnet/Opus) self-certify — no separate verifier call
- [ ] Zero changes to skill handlers — they call `ExecutorFn` exactly as today
- [ ] `LiteLTMBaseURL` already in config; no new env vars required beyond `LLAMA_SWAP_URL`
## Constraints
- One attempt per tier before escalating (no retry within a tier)
- Anthropic T&C: Claude is called normally via Anthropic API; local models are called directly via LiteLLM HTTP — no API redirection
- `models.yaml` remains the single routing config file
## Out of scope
- Auto-rerouting based on real-time warm state (logged, not acted on — Phase 4)
- Multi-tenant / public service exposure
- RAG/CAG model boosting
- Managed Agent cloud delegation (chain stub only in Phase 3)
---
## Architecture
```
MCP tool call (Claude Code)
Skill handler — calls ExecutorFn (unchanged)
Orchestrator.Run (implements ExecutorFn)
├─ Resolve chain from models.yaml
├─ For each model in chain:
│ ├─ [ollama/*] → LiteLLM executor → generate
│ │ ↓
│ │ Claude verifier (task + output + discipline)
│ │ ├─ accept → return Result (log attempt)
│ │ └─ escalate → next tier (log attempt)
│ │
│ └─ [claude-*] → Claude executor (current) → generate + self-certify
│ └─ return Result (log attempt)
└─ All tiers exhausted → return best attempt with escalation note
```
Claude is always the verifier for local tiers. At cloud tiers, Claude generates and self-certifies — the verifier call is skipped.
---
## Components
### 1. `internal/exec/litellm.go` — LiteLLM executor
Calls `POST /v1/chat/completions` on the configured LiteLLM server. Implements the same `ExecutorFn` signature as the existing claude executor.
```go
type LiteLLMExecutor struct {
BaseURL string
APIKey string
HTTPClient *http.Client
Timeout time.Duration
}
func NewLiteLLM(baseURL, apiKey string, timeout time.Duration) *LiteLLMExecutor
func (e *LiteLLMExecutor) Run(ctx context.Context, req Request) (Result, error)
```
Request mapping:
- `req.SkillPrompt` → system message
- `req.TaskPrompt` → user message
- `req.Model``model` field in the chat completions request
Response handling: local models are prompted (via the discipline file output contract) to return a JSON object matching the `Result` schema. The executor attempts `json.Unmarshal` into `Result` directly — no envelope unwrapping needed (unlike the `--output-format json` claude envelope). If unmarshalling fails, the executor returns an error that the orchestrator treats as an automatic escalation trigger.
### 2. `internal/exec/verifier.go` — Claude verifier
A focused Claude call that judges local model output. Uses the existing `Executor` (claude subprocess) internally.
```go
type Verdict struct {
Accept bool `json:"accept"`
Feedback string `json:"feedback"` // reason if not accepting; empty if accept
}
type Verifier struct {
executor *Executor // the existing claude executor
}
func NewVerifier(executor *Executor) *Verifier
func (v *Verifier) Verify(ctx context.Context, skillPrompt, taskPrompt string, output Result) (Verdict, error)
```
The verifier prompt gives Claude:
1. The skill discipline file (so it knows the iron laws and output contract)
2. The original task prompt (informed verification — Claude sees what was asked)
3. The generated output
4. A short instruction: "Does this output satisfy the discipline's iron laws and output contract? Reply with JSON: `{\"accept\": true|false, \"feedback\": \"...\"}`"
The verifier uses a lightweight JSON schema for its own output (a `Verdict` schema), keeping the call fast.
### 3. `internal/exec/orchestrator.go` — chain walker
Implements `ExecutorFn`. Walks the escalation chain, delegating generation and verification per tier.
```go
type Chain []ChainEntry
type ChainEntry struct {
Model string // e.g. "ollama/phi4", "claude-sonnet-4-5"
Tier string // "local" | "subagent" | "managed"
IsCloud bool // true for claude-* models; skips verifier
}
type Orchestrator struct {
chain Chain
litellm *LiteLLMExecutor
claude *Executor
verifier *Verifier
llamaSwapURL string // for warm-state probe
}
func NewOrchestrator(chain Chain, litellm *LiteLLMExecutor, claude *Executor, verifier *Verifier, llamaSwapURL string) *Orchestrator
func (o *Orchestrator) Run(ctx context.Context, req Request) (Result, error)
```
Algorithm:
```
for each entry in chain:
warm = probe llama-swap (if local tier)
start = now()
if entry.IsCloud:
result, err = claude.Run(ctx, req with entry.Model)
log attempt(model, tier, duration, warm, verified=true)
if err == nil: return result
else:
result, err = litellm.Run(ctx, req with entry.Model)
duration = now() - start
if err != nil:
log attempt(model, tier, duration, warm, verified=false)
continue // automatic escalation on parse/network error
verdict = verifier.Verify(ctx, req.SkillPrompt, req.TaskPrompt, result)
log attempt(model, tier, duration, warm, verified=verdict.Accept)
if verdict.Accept: return result
// inject verifier feedback into next tier's task prompt
req.TaskPrompt = req.TaskPrompt + "\n\nPrior attempt feedback: " + verdict.Feedback
return error("all tiers exhausted")
```
### 4. `internal/config/models.go` — chain parser
Replaces the current single-model resolution with chain parsing.
Updated `models.yaml` format:
```yaml
verifier: claude-sonnet-4-6 # fixed verifier for all local tiers
llama_swap_url: http://koala:8080 # for warm-state probing
default_chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-5
skills:
tdd:
chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-5
review:
chain:
- ollama/devstral-tuned
- ollama/gemma4
- claude-sonnet-4-5
debug:
chain:
- ollama/deepseek-r1-tuned
- claude-sonnet-4-5
spec:
chain:
- ollama/phi4
- ollama/gemma4
- claude-sonnet-4-5
- claude-opus-4-6
retrospective:
chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-5
trainer:
chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-5
```
The parser exposes:
```go
func (m *Models) ChainFor(skill string) Chain
func (m *Models) Verifier() string
func (m *Models) LlamaSwapURL() string
```
Caller override (`model` param in MCP tool call) pins the chain to a single entry — one model, no escalation. This preserves the existing override behaviour for power users.
### 5. `internal/session/session.go` — updated `Attempt` struct
```go
type Attempt struct {
Attempt int `json:"attempt"`
Model string `json:"model"`
Tier string `json:"tier"` // local | subagent | managed
DurationMs int64 `json:"duration_ms"`
WarmStart bool `json:"warm_start"` // model was already loaded in llama-swap
Verified bool `json:"verified"`
Verdict string `json:"verdict,omitempty"` // accept | escalate | error
Feedback string `json:"feedback,omitempty"` // verifier feedback on escalation
OutputSummary string `json:"output_summary,omitempty"`
RunnerOutput string `json:"runner_output,omitempty"`
}
```
### 6. `cmd/supervisor/main.go` — one wiring change
```go
// Before:
reg.Register(review.New(review.Config{ExecutorFn: executor.Run, ...}))
// After:
chain := models.ChainFor("review")
orch := exec.NewOrchestrator(chain, litellmExec, claudeExec, verifier, models.LlamaSwapURL())
reg.Register(review.New(review.Config{ExecutorFn: orch.Run, ...}))
```
One orchestrator per skill, sharing the same `litellmExec`, `claudeExec`, and `verifier` instances.
---
## Data flow example: `review` skill call
1. Claude Code calls `review` tool with `files: ["internal/foo.go"]`
2. Skill handler builds task prompt, calls `orch.Run`
3. Orchestrator resolves chain: `[devstral, gemma4, sonnet]`
4. Probes llama-swap: devstral is warm
5. LiteLLM calls devstral → returns JSON result
6. Verifier asks Claude: "does this review satisfy the iron laws?"
7. Claude: `{"accept": false, "feedback": "missing line references for all findings"}`
8. Orchestrator logs attempt #1 (devstral, local, 4200ms, warm, escalate)
9. Injects feedback into task prompt, calls gemma4
10. Verifier: `{"accept": true}`
11. Orchestrator logs attempt #2 (gemma4, local, 6100ms, cold, accept)
12. Returns result to skill handler → MCP response
Session JSONL records both attempts. You can see: devstral was warm but produced weak output; gemma4 was cold but passed.
---
## Observability
Session JSONL is the primary store. Each `Entry.Attempts` slice records the full escalation trail. To analyse across sessions:
```bash
# Which models are escalating most?
jq -r '.attempts[] | select(.verdict == "escalate") | .model' brain/sessions/*.jsonl | sort | uniq -c
# Average latency per model
jq -r '.attempts[] | [.model, .duration_ms] | @tsv' brain/sessions/*.jsonl | awk '{sum[$1]+=$2; n[$1]++} END {for (m in sum) print m, sum[m]/n[m]}'
# Cold start frequency
jq -r '.attempts[] | select(.warm_start == false) | .model' brain/sessions/*.jsonl | sort | uniq -c
```
No new metrics infrastructure needed for Phase 3. Phase 4 can build a dashboard on top of this data.
---
## Error handling
| Scenario | Behaviour |
|----------|-----------|
| LiteLLM unreachable | Log attempt as error, escalate immediately |
| Local model returns unparseable JSON | Log attempt as error, escalate |
| Verifier call fails | Log, treat as escalate (safe default) |
| All tiers exhausted | Return error to skill handler; skill returns MCP error to caller |
| Caller passes `model` override | Single-entry chain, no escalation, no verifier call |
---
## Testing approach
- `TestLiteLLMExecutor`: mock HTTP server returning valid/invalid JSON; verify parse logic and error escalation
- `TestVerifier`: fake claude executor returning accept/escalate verdicts; verify prompt construction
- `TestOrchestrator`: table-driven — chains of 1/2/3 tiers, various accept/escalate/error combinations; verify attempt log contents and final result
- `TestModelsChainFor`: YAML parsing for all skill overrides and default_chain fallback
- Integration smoke test: start real LiteLLM (or mock), call `review` tool via MCP, verify attempt log written
---
## Risks
| Risk | Mitigation |
|------|------------|
| Local models ignore output contract → bad JSON | Discipline files already specify JSON output contract; parse failure auto-escalates |
| Verifier Claude call adds latency to every local attempt | Verifier prompt is small and fast; acceptable tradeoff for quality gate |
| llama-swap warm probe adds overhead | Probe is a single lightweight HTTP GET; timeout at 200ms, treat failure as `warm_start: false` |
| Chain exhaustion leaves caller with no result | Return structured error via MCP; caller can retry with explicit `model` override |

View File

@@ -7,9 +7,15 @@ import (
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
) )
type skillChain struct {
Chain []string `yaml:"chain"`
}
type modelsFile struct { type modelsFile struct {
Default string `yaml:"default"` Verifier string `yaml:"verifier"`
Skills map[string]string `yaml:"skills"` LlamaSwapURL string `yaml:"llama_swap_url"`
DefaultChain []string `yaml:"default_chain"`
Skills map[string]skillChain `yaml:"skills"`
} }
type Models struct { type Models struct {
@@ -28,16 +34,23 @@ func LoadModels(path string) (Models, error) {
return Models{data: f}, nil return Models{data: f}, nil
} }
// Resolve returns the model for a skill, respecting three-layer priority: // Verifier returns the model name to use for all local-tier output verification.
// 1. override (from MCP call) — highest func (m Models) Verifier() string { return m.data.Verifier }
// 2. per-skill default from models.yaml
// 3. global default // LlamaSwapURL returns the llama-swap base URL for warm-state probing.
func (m Models) Resolve(skill, override string) string { func (m Models) LlamaSwapURL() string { return m.data.LlamaSwapURL }
// ChainFor returns the ordered list of model names for a skill.
// If override is non-empty, returns a single-entry chain (no escalation).
// Falls back to default_chain when the skill has no explicit entry.
func (m Models) ChainFor(skill, override string) []string {
if override != "" { if override != "" {
return override return []string{override}
} }
if model, ok := m.data.Skills[skill]; ok { if sc, ok := m.data.Skills[skill]; ok && len(sc.Chain) > 0 {
return model return sc.Chain
} }
return m.data.Default out := make([]string, len(m.data.DefaultChain))
copy(out, m.data.DefaultChain)
return out
} }

View File

@@ -10,35 +10,71 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
func TestModelsResolve(t *testing.T) { const testYAML = `
yaml := ` verifier: claude-sonnet-4-6
default: ollama/default-model llama_swap_url: http://koala:8080
default_chain:
- ollama/qwen3-coder-30b-tuned
- claude-sonnet-4-6
skills: skills:
tdd: ollama/qwen3-coder-30b-tuned review:
review: ollama/devstral-tuned chain:
- ollama/devstral-tuned
- ollama/gemma4
- claude-sonnet-4-6
spec:
chain:
- ollama/phi4
- claude-opus-4-6
` `
func writeModels(t *testing.T, content string) string {
t.Helper()
f := filepath.Join(t.TempDir(), "models.yaml") f := filepath.Join(t.TempDir(), "models.yaml")
require.NoError(t, os.WriteFile(f, []byte(yaml), 0644)) require.NoError(t, os.WriteFile(f, []byte(content), 0644))
return f
m, err := config.LoadModels(f)
require.NoError(t, err)
assert.Equal(t, "ollama/qwen3-coder-30b-tuned", m.Resolve("tdd", ""))
assert.Equal(t, "ollama/devstral-tuned", m.Resolve("review", ""))
assert.Equal(t, "ollama/default-model", m.Resolve("unknown", ""))
} }
func TestModelsOverride(t *testing.T) { func TestModelsVerifier(t *testing.T) {
yaml := ` m, err := config.LoadModels(writeModels(t, testYAML))
default: ollama/default-model require.NoError(t, err)
skills: assert.Equal(t, "claude-sonnet-4-6", m.Verifier())
tdd: ollama/qwen3-coder-30b-tuned }
`
f := filepath.Join(t.TempDir(), "models.yaml")
require.NoError(t, os.WriteFile(f, []byte(yaml), 0644))
m, err := config.LoadModels(f) func TestModelsLlamaSwapURL(t *testing.T) {
m, err := config.LoadModels(writeModels(t, testYAML))
require.NoError(t, err)
assert.Equal(t, "http://koala:8080", m.LlamaSwapURL())
}
func TestModelsChainForSkillOverride(t *testing.T) {
m, err := config.LoadModels(writeModels(t, testYAML))
require.NoError(t, err) require.NoError(t, err)
assert.Equal(t, "anthropic/claude-sonnet-4-6", m.Resolve("tdd", "anthropic/claude-sonnet-4-6")) chain := m.ChainFor("review", "")
require.Len(t, chain, 3)
assert.Equal(t, "ollama/devstral-tuned", chain[0])
assert.Equal(t, "ollama/gemma4", chain[1])
assert.Equal(t, "claude-sonnet-4-6", chain[2])
}
func TestModelsChainForDefaultFallback(t *testing.T) {
m, err := config.LoadModels(writeModels(t, testYAML))
require.NoError(t, err)
chain := m.ChainFor("trainer", "") // not in skills map
require.Len(t, chain, 2)
assert.Equal(t, "ollama/qwen3-coder-30b-tuned", chain[0])
assert.Equal(t, "claude-sonnet-4-6", chain[1])
}
func TestModelsChainForCallerOverride(t *testing.T) {
m, err := config.LoadModels(writeModels(t, testYAML))
require.NoError(t, err)
chain := m.ChainFor("review", "claude-opus-4-6")
require.Len(t, chain, 1)
assert.Equal(t, "claude-opus-4-6", chain[0])
} }

View File

@@ -72,8 +72,11 @@ func (e *Executor) Run(ctx context.Context, req Request) (Result, error) {
"--tools", tools, "--tools", tools,
"--json-schema", Schema, "--json-schema", Schema,
"--output-format", "json", "--output-format", "json",
prompt,
} }
if strings.HasPrefix(req.Model, "claude-") {
args = append(args, "--model", req.Model)
}
args = append(args, prompt)
cmd := exec.CommandContext(ctx, e.cfg.ClaudeBinary, args...) cmd := exec.CommandContext(ctx, e.cfg.ClaudeBinary, args...)
cmd.Env = append(os.Environ(), "LITELLM_API_KEY="+e.cfg.LiteLLMAPIKey) cmd.Env = append(os.Environ(), "LITELLM_API_KEY="+e.cfg.LiteLLMAPIKey)

View File

@@ -75,3 +75,58 @@ func TestExecutorTimesOut(t *testing.T) {
_, err := ex.Run(context.Background(), iexec.Request{TaskPrompt: "slow"}) _, err := ex.Run(context.Background(), iexec.Request{TaskPrompt: "slow"})
assert.ErrorContains(t, err, "timeout") assert.ErrorContains(t, err, "timeout")
} }
func TestExecutorPassesModelFlagForCloudModel(t *testing.T) {
// The script captures its args to a temp file so we can assert --model was passed.
argsFile := filepath.Join(t.TempDir(), "args.txt")
envelope := `{"type":"result","subtype":"success","is_error":false,"structured_output":{"status":"pass","phase":"review","skill":"review","file_path":"","runner_output":"","verified":true,"model_used":"claude-sonnet-4-6","message":"ok"}}`
dir := t.TempDir()
script := filepath.Join(dir, "claude")
content := "#!/bin/sh\necho \"$@\" > " + argsFile + "\necho '" + envelope + "'\n"
require.NoError(t, os.WriteFile(script, []byte(content), 0755))
ex := iexec.New(iexec.Config{
ClaudeBinary: script,
SystemPrompt: "sys",
Timeout: 5 * time.Second,
})
_, err := ex.Run(context.Background(), iexec.Request{
SkillPrompt: "review rules",
TaskPrompt: "do review",
Model: "claude-sonnet-4-6",
})
require.NoError(t, err)
argsData, err := os.ReadFile(argsFile)
require.NoError(t, err)
assert.Contains(t, string(argsData), "--model claude-sonnet-4-6")
}
func TestExecutorSkipsModelFlagForLocalModel(t *testing.T) {
argsFile := filepath.Join(t.TempDir(), "args.txt")
envelope := `{"type":"result","subtype":"success","is_error":false,"structured_output":{"status":"pass","phase":"review","skill":"review","file_path":"","runner_output":"","verified":true,"model_used":"ollama/devstral","message":"ok"}}`
dir := t.TempDir()
script := filepath.Join(dir, "claude")
content := "#!/bin/sh\necho \"$@\" > " + argsFile + "\necho '" + envelope + "'\n"
require.NoError(t, os.WriteFile(script, []byte(content), 0755))
ex := iexec.New(iexec.Config{
ClaudeBinary: script,
SystemPrompt: "sys",
Timeout: 5 * time.Second,
})
_, err := ex.Run(context.Background(), iexec.Request{
SkillPrompt: "review rules",
TaskPrompt: "do review",
Model: "ollama/devstral",
})
require.NoError(t, err)
argsData, err := os.ReadFile(argsFile)
require.NoError(t, err)
assert.NotContains(t, string(argsData), "--model")
}

103
internal/exec/litellm.go Normal file
View File

@@ -0,0 +1,103 @@
package exec
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
)
// LiteLLMExecutor calls a LiteLLM-compatible /v1/chat/completions endpoint.
// Local models are expected to return a JSON object matching the Result schema
// as their response content — no envelope.
type LiteLLMExecutor struct {
baseURL string
apiKey string
httpClient *http.Client
}
// NewLiteLLM creates a LiteLLMExecutor.
// timeout applies to the full HTTP round-trip per call.
func NewLiteLLM(baseURL, apiKey string, timeout time.Duration) *LiteLLMExecutor {
return &LiteLLMExecutor{
baseURL: baseURL,
apiKey: apiKey,
httpClient: &http.Client{Timeout: timeout},
}
}
type litellmMessage struct {
Role string `json:"role"`
Content string `json:"content"`
}
type litellmRequest struct {
Model string `json:"model"`
Messages []litellmMessage `json:"messages"`
}
type litellmChoice struct {
Message litellmMessage `json:"message"`
}
type litellmResponse struct {
Choices []litellmChoice `json:"choices"`
}
// Run dispatches req to the LiteLLM server and parses the Result from the
// assistant message content. Returns an error on network failure, non-200
// status, or unparseable/invalid JSON — all of which the Orchestrator treats
// as automatic escalation triggers.
func (e *LiteLLMExecutor) Run(ctx context.Context, req Request) (Result, error) {
body := litellmRequest{
Model: req.Model,
Messages: []litellmMessage{
{Role: "system", Content: req.SkillPrompt},
{Role: "user", Content: req.TaskPrompt},
},
}
bodyBytes, err := json.Marshal(body)
if err != nil {
return Result{}, fmt.Errorf("litellm: marshal request: %w", err)
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, e.baseURL+"/v1/chat/completions", bytes.NewReader(bodyBytes))
if err != nil {
return Result{}, fmt.Errorf("litellm: create request: %w", err)
}
httpReq.Header.Set("Content-Type", "application/json")
if e.apiKey != "" {
httpReq.Header.Set("Authorization", "Bearer "+e.apiKey)
}
resp, err := e.httpClient.Do(httpReq)
if err != nil {
return Result{}, fmt.Errorf("litellm: request failed: %w", err)
}
defer resp.Body.Close() //nolint:errcheck
if resp.StatusCode != http.StatusOK {
return Result{}, fmt.Errorf("litellm: server returned status %d", resp.StatusCode)
}
var chatResp litellmResponse
if err := json.NewDecoder(resp.Body).Decode(&chatResp); err != nil {
return Result{}, fmt.Errorf("litellm: decode response: %w", err)
}
if len(chatResp.Choices) == 0 {
return Result{}, fmt.Errorf("litellm: no choices in response")
}
content := chatResp.Choices[0].Message.Content
var result Result
if err := json.Unmarshal([]byte(content), &result); err != nil {
return Result{}, fmt.Errorf("litellm: parse result JSON: %w — content: %s", err, content)
}
if err := result.Validate(); err != nil {
return Result{}, fmt.Errorf("litellm: invalid result: %w", err)
}
return result, nil
}

View File

@@ -0,0 +1,112 @@
package exec_test
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
iexec "github.com/mathiasbq/supervisor/internal/exec"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func validLiteLLMResult() iexec.Result {
return iexec.Result{
Status: "pass",
Phase: "review",
Skill: "review",
ModelUsed: "ollama/devstral",
Message: "looks good",
}
}
func chatResponseFor(t *testing.T, result iexec.Result) []byte {
t.Helper()
content, err := json.Marshal(result)
require.NoError(t, err)
resp := map[string]any{
"choices": []map[string]any{
{"message": map[string]any{"role": "assistant", "content": string(content)}},
},
}
data, err := json.Marshal(resp)
require.NoError(t, err)
return data
}
func TestLiteLLMParsesValidResult(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.Equal(t, "/v1/chat/completions", r.URL.Path)
assert.Equal(t, "application/json", r.Header.Get("Content-Type"))
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, _ = w.Write(chatResponseFor(t, validLiteLLMResult()))
}))
defer srv.Close()
ex := iexec.NewLiteLLM(srv.URL, "", 5*time.Second)
result, err := ex.Run(context.Background(), iexec.Request{
SkillPrompt: "review rules",
TaskPrompt: "review the code",
Model: "ollama/devstral",
})
require.NoError(t, err)
assert.Equal(t, "pass", result.Status)
assert.Equal(t, "review", result.Skill)
}
func TestLiteLLMSendsAuthHeader(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
assert.Equal(t, "Bearer secret", r.Header.Get("Authorization"))
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
_, _ = w.Write(chatResponseFor(t, validLiteLLMResult()))
}))
defer srv.Close()
ex := iexec.NewLiteLLM(srv.URL, "secret", 5*time.Second)
_, err := ex.Run(context.Background(), iexec.Request{Model: "x", TaskPrompt: "t", SkillPrompt: "s"})
require.NoError(t, err)
}
func TestLiteLLMErrorOnNonOKStatus(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusServiceUnavailable)
}))
defer srv.Close()
ex := iexec.NewLiteLLM(srv.URL, "", 5*time.Second)
_, err := ex.Run(context.Background(), iexec.Request{Model: "x", TaskPrompt: "t"})
assert.ErrorContains(t, err, "503")
}
func TestLiteLLMErrorOnUnparsableJSON(t *testing.T) {
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
resp := map[string]any{
"choices": []map[string]any{
{"message": map[string]any{"role": "assistant", "content": "not json at all"}},
},
}
data, _ := json.Marshal(resp)
_, _ = w.Write(data)
}))
defer srv.Close()
ex := iexec.NewLiteLLM(srv.URL, "", 5*time.Second)
_, err := ex.Run(context.Background(), iexec.Request{Model: "x", TaskPrompt: "t"})
assert.Error(t, err)
}
func TestLiteLLMRespectsContextCancellation(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
cancel() // Cancel immediately
ex := iexec.NewLiteLLM("http://invalid.example.com", "", 1*time.Second)
_, err := ex.Run(ctx, iexec.Request{Model: "x", TaskPrompt: "t"})
assert.Error(t, err)
}

View File

@@ -0,0 +1,197 @@
package exec
import (
"context"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// ChainEntry is one tier in an escalation chain.
type ChainEntry struct {
Model string // e.g. "ollama/phi4", "claude-sonnet-4-6"
Tier string // "local" | "subagent" | "managed"
IsCloud bool // true for claude-* models; skips verifier call
}
// EntryFor builds a ChainEntry from a model name string.
func EntryFor(model string) ChainEntry {
cloud := strings.HasPrefix(model, "claude-")
tier := "local"
if cloud {
tier = "subagent"
}
return ChainEntry{Model: model, Tier: tier, IsCloud: cloud}
}
// AttemptRecord captures the outcome of one tier attempt for session logging.
type AttemptRecord struct {
Model string
Tier string
DurationMs int64
WarmStart bool
Verdict string // "accept" | "escalate" | "error"
Feedback string
}
// VerifierFn is the interface the orchestrator uses to verify local output.
type VerifierFn interface {
Verify(ctx context.Context, skillPrompt, taskPrompt string, output Result) (Verdict, error)
}
// ExecutorRunFn is the signature of Executor.Run and LiteLLMExecutor.Run.
type ExecutorRunFn func(ctx context.Context, req Request) (Result, error)
// Orchestrator walks an escalation chain, delegating generation and verification.
// It implements the ExecutorFn shape expected by skill handlers.
type Orchestrator struct {
chain []ChainEntry
localRun ExecutorRunFn // for local (non-cloud) tiers; may be nil
cloudRun ExecutorRunFn // for cloud tiers; may be nil
verifier VerifierFn
llamaSwapURL string
attempts *[]AttemptRecord
}
// NewOrchestrator creates an Orchestrator.
// attempts is a pointer to a slice that will be appended to on each tier attempt.
// Pass nil for localRun or cloudRun if no tiers of that type exist in the chain.
func NewOrchestrator(
chain []ChainEntry,
localRun ExecutorRunFn,
cloudRun ExecutorRunFn,
verifier VerifierFn,
llamaSwapURL string,
attempts *[]AttemptRecord,
) *Orchestrator {
return &Orchestrator{
chain: chain,
localRun: localRun,
cloudRun: cloudRun,
verifier: verifier,
llamaSwapURL: llamaSwapURL,
attempts: attempts,
}
}
// Run walks the escalation chain and returns the first accepted result.
// Satisfies the ExecutorFn signature: func(context.Context, Request) (Result, error).
func (o *Orchestrator) Run(ctx context.Context, req Request) (Result, error) {
taskPrompt := req.TaskPrompt
for _, entry := range o.chain {
warm := o.probeWarm(entry.Model)
start := time.Now()
tierReq := req
tierReq.Model = entry.Model
tierReq.TaskPrompt = taskPrompt
if entry.IsCloud {
result, genErr := o.cloudRun(ctx, tierReq)
dur := time.Since(start).Milliseconds()
verdict := "accept"
if genErr != nil {
verdict = "error"
}
o.appendAttempt(AttemptRecord{
Model: entry.Model,
Tier: entry.Tier,
DurationMs: dur,
WarmStart: warm,
Verdict: verdict,
})
if genErr == nil {
return result, nil
}
continue
}
// Local tier.
result, genErr := o.localRun(ctx, tierReq)
dur := time.Since(start).Milliseconds()
if genErr != nil {
o.appendAttempt(AttemptRecord{
Model: entry.Model,
Tier: entry.Tier,
DurationMs: dur,
WarmStart: warm,
Verdict: "error",
Feedback: genErr.Error(),
})
continue
}
verdict, verErr := o.verifier.Verify(ctx, req.SkillPrompt, taskPrompt, result)
if verErr != nil {
// Treat verifier failure as escalate (safe default).
o.appendAttempt(AttemptRecord{
Model: entry.Model,
Tier: entry.Tier,
DurationMs: dur,
WarmStart: warm,
Verdict: "escalate",
Feedback: "verifier error: " + verErr.Error(),
})
continue
}
if verdict.Accept {
o.appendAttempt(AttemptRecord{
Model: entry.Model,
Tier: entry.Tier,
DurationMs: dur,
WarmStart: warm,
Verdict: "accept",
})
return result, nil
}
o.appendAttempt(AttemptRecord{
Model: entry.Model,
Tier: entry.Tier,
DurationMs: dur,
WarmStart: warm,
Verdict: "escalate",
Feedback: verdict.Feedback,
})
// Inject verifier feedback into the next tier's task prompt.
taskPrompt = taskPrompt + "\n\nPrior attempt feedback: " + verdict.Feedback
}
return Result{}, fmt.Errorf("all tiers exhausted after %d attempt(s)", len(o.chain))
}
func (o *Orchestrator) appendAttempt(rec AttemptRecord) {
if o.attempts != nil {
*o.attempts = append(*o.attempts, rec)
}
}
// probeWarm checks whether the model is currently loaded in llama-swap.
// Returns false on any error or if llamaSwapURL is empty.
func (o *Orchestrator) probeWarm(model string) bool {
if o.llamaSwapURL == "" {
return false
}
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
req, err := http.NewRequestWithContext(ctx, http.MethodGet, o.llamaSwapURL+"/v1/models", nil)
if err != nil {
return false
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return false
}
defer resp.Body.Close() //nolint:errcheck
body, err := io.ReadAll(resp.Body)
if err != nil {
return false
}
return strings.Contains(string(body), model)
}

View File

@@ -0,0 +1,151 @@
package exec_test
import (
"context"
"errors"
"testing"
iexec "github.com/mathiasbq/supervisor/internal/exec"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// stubRunFn returns preset results sequentially.
type stubRunFn struct {
calls []stubCall
callIdx int
}
type stubCall struct {
result iexec.Result
err error
}
func (s *stubRunFn) Run(_ context.Context, _ iexec.Request) (iexec.Result, error) {
if s.callIdx >= len(s.calls) {
return iexec.Result{}, errors.New("unexpected call")
}
c := s.calls[s.callIdx]
s.callIdx++
return c.result, c.err
}
// stubVerifier returns preset verdicts sequentially.
type stubVerifier struct {
verdicts []iexec.Verdict
idx int
}
func (s *stubVerifier) Verify(_ context.Context, _, _ string, _ iexec.Result) (iexec.Verdict, error) {
if s.idx >= len(s.verdicts) {
return iexec.Verdict{}, errors.New("unexpected verify call")
}
v := s.verdicts[s.idx]
s.idx++
return v, nil
}
func okResult(skill string) iexec.Result {
return iexec.Result{Status: "pass", Phase: "review", Skill: skill, Message: "ok", ModelUsed: "m"}
}
func TestOrchestratorSingleLocalAccept(t *testing.T) {
local := &stubRunFn{calls: []stubCall{{result: okResult("review")}}}
verifier := &stubVerifier{verdicts: []iexec.Verdict{{Accept: true}}}
var attempts []iexec.AttemptRecord
orch := iexec.NewOrchestrator(
[]iexec.ChainEntry{{Model: "ollama/devstral", Tier: "local", IsCloud: false}},
local.Run, nil, verifier, "", &attempts,
)
result, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
require.NoError(t, err)
assert.Equal(t, "pass", result.Status)
require.Len(t, attempts, 1)
assert.Equal(t, "local", attempts[0].Tier)
assert.Equal(t, "accept", attempts[0].Verdict)
}
func TestOrchestratorEscalatesOnVerifierReject(t *testing.T) {
local := &stubRunFn{calls: []stubCall{
{result: iexec.Result{Status: "fail", Phase: "review", Skill: "review", Message: "weak"}},
{result: okResult("review")},
}}
verifier := &stubVerifier{verdicts: []iexec.Verdict{
{Accept: false, Feedback: "missing line refs"},
{Accept: true},
}}
var attempts []iexec.AttemptRecord
orch := iexec.NewOrchestrator(
[]iexec.ChainEntry{
{Model: "ollama/devstral", Tier: "local", IsCloud: false},
{Model: "ollama/gemma4", Tier: "local", IsCloud: false},
},
local.Run, nil, verifier, "", &attempts,
)
result, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
require.NoError(t, err)
assert.Equal(t, "pass", result.Status)
require.Len(t, attempts, 2)
assert.Equal(t, "escalate", attempts[0].Verdict)
assert.Equal(t, "missing line refs", attempts[0].Feedback)
assert.Equal(t, "accept", attempts[1].Verdict)
}
func TestOrchestratorEscalatesOnLocalError(t *testing.T) {
local := &stubRunFn{calls: []stubCall{
{err: errors.New("network failure")},
{result: okResult("review")},
}}
verifier := &stubVerifier{verdicts: []iexec.Verdict{{Accept: true}}}
var attempts []iexec.AttemptRecord
orch := iexec.NewOrchestrator(
[]iexec.ChainEntry{
{Model: "ollama/devstral", Tier: "local", IsCloud: false},
{Model: "ollama/gemma4", Tier: "local", IsCloud: false},
},
local.Run, nil, verifier, "", &attempts,
)
_, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
require.NoError(t, err)
require.Len(t, attempts, 2)
assert.Equal(t, "error", attempts[0].Verdict)
assert.Equal(t, "accept", attempts[1].Verdict)
}
func TestOrchestratorCloudTierSelfCertifies(t *testing.T) {
cloud := &stubRunFn{calls: []stubCall{{result: okResult("review")}}}
verifier := &stubVerifier{} // no verdicts — must not be called
var attempts []iexec.AttemptRecord
orch := iexec.NewOrchestrator(
[]iexec.ChainEntry{{Model: "claude-sonnet-4-6", Tier: "subagent", IsCloud: true}},
nil, cloud.Run, verifier, "", &attempts,
)
result, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
require.NoError(t, err)
assert.Equal(t, "pass", result.Status)
require.Len(t, attempts, 1)
assert.Equal(t, "subagent", attempts[0].Tier)
assert.Equal(t, "accept", attempts[0].Verdict)
assert.Equal(t, 0, verifier.idx) // verifier never called
}
func TestOrchestratorAllTiersExhausted(t *testing.T) {
local := &stubRunFn{calls: []stubCall{{err: errors.New("unavailable")}}}
var attempts []iexec.AttemptRecord
orch := iexec.NewOrchestrator(
[]iexec.ChainEntry{{Model: "ollama/devstral", Tier: "local", IsCloud: false}},
local.Run, nil, &stubVerifier{}, "", &attempts,
)
_, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
assert.ErrorContains(t, err, "all tiers exhausted")
}

99
internal/exec/verifier.go Normal file
View File

@@ -0,0 +1,99 @@
package exec
import (
"bytes"
"context"
"encoding/json"
"fmt"
"os"
"os/exec"
"time"
)
// Verdict is the output of a Claude verification call.
type Verdict struct {
Accept bool `json:"accept"`
Feedback string `json:"feedback"` // empty when Accept is true
}
// Verifier runs a focused Claude call to judge local model output.
type Verifier struct {
claudeBinary string
model string
timeout time.Duration
}
// NewVerifier creates a Verifier that calls claude with the given binary path and model.
// Empty claudeBinary defaults to "claude". Zero timeout defaults to 30s.
func NewVerifier(claudeBinary, model string, timeout time.Duration) *Verifier {
if claudeBinary == "" {
claudeBinary = "claude"
}
if timeout == 0 {
timeout = 30 * time.Second
}
return &Verifier{
claudeBinary: claudeBinary,
model: model,
timeout: timeout,
}
}
// Verify asks Claude whether output satisfies the skill discipline's iron laws.
// Returns Verdict{Accept: true} to accept or Verdict{Accept: false, Feedback: "..."}
// to escalate. Returns an error on subprocess failure or unparseable response.
func (v *Verifier) Verify(ctx context.Context, skillPrompt, taskPrompt string, output Result) (Verdict, error) {
ctx, cancel := context.WithTimeout(ctx, v.timeout)
defer cancel()
outputJSON, err := json.Marshal(output)
if err != nil {
return Verdict{}, fmt.Errorf("verifier: marshal output: %w", err)
}
prompt := fmt.Sprintf(`You are a quality verifier for an AI supervisor system.
Given the skill discipline, the original task, and the generated output, decide whether the output satisfies the discipline's iron laws and output contract.
Reply with JSON only — no other text:
{"accept": true, "feedback": ""}
or
{"accept": false, "feedback": "<one sentence reason>"}
## Skill discipline
%s
## Original task
%s
## Generated output
%s`, skillPrompt, taskPrompt, string(outputJSON))
args := []string{
"--print",
"--permission-mode", "bypassPermissions",
}
if v.model != "" {
args = append(args, "--model", v.model)
}
args = append(args, prompt)
cmd := exec.CommandContext(ctx, v.claudeBinary, args...)
cmd.Env = os.Environ()
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
if ctx.Err() != nil {
return Verdict{}, fmt.Errorf("verifier: timeout after %s", v.timeout)
}
return Verdict{}, fmt.Errorf("verifier: claude exited with error: %w — stderr: %s", err, stderr.String())
}
var verdict Verdict
if err := json.Unmarshal(bytes.TrimSpace(stdout.Bytes()), &verdict); err != nil {
return Verdict{}, fmt.Errorf("verifier: parse verdict JSON: %w — raw: %s", err, stdout.String())
}
return verdict, nil
}

View File

@@ -0,0 +1,74 @@
package exec_test
import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
"time"
iexec "github.com/mathiasbq/supervisor/internal/exec"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func fakeVerifierClaude(t *testing.T, verdict iexec.Verdict) string {
t.Helper()
data, err := json.Marshal(verdict)
require.NoError(t, err)
dir := t.TempDir()
script := filepath.Join(dir, "claude")
content := fmt.Sprintf("#!/bin/sh\necho '%s'\n", string(data))
require.NoError(t, os.WriteFile(script, []byte(content), 0755))
return script
}
func TestVerifierAccepts(t *testing.T) {
claude := fakeVerifierClaude(t, iexec.Verdict{Accept: true, Feedback: ""})
v := iexec.NewVerifier(claude, "claude-sonnet-4-6", 5*time.Second)
verdict, err := v.Verify(context.Background(), "skill rules", "do the task", iexec.Result{
Status: "pass", Phase: "review", Skill: "review", Message: "ok",
})
require.NoError(t, err)
assert.True(t, verdict.Accept)
assert.Empty(t, verdict.Feedback)
}
func TestVerifierEscalates(t *testing.T) {
claude := fakeVerifierClaude(t, iexec.Verdict{Accept: false, Feedback: "missing line references"})
v := iexec.NewVerifier(claude, "claude-sonnet-4-6", 5*time.Second)
verdict, err := v.Verify(context.Background(), "skill rules", "do the task", iexec.Result{
Status: "pass", Phase: "review", Skill: "review", Message: "incomplete",
})
require.NoError(t, err)
assert.False(t, verdict.Accept)
assert.Equal(t, "missing line references", verdict.Feedback)
}
func TestVerifierErrorOnUnparsableOutput(t *testing.T) {
dir := t.TempDir()
script := filepath.Join(dir, "claude")
require.NoError(t, os.WriteFile(script, []byte("#!/bin/sh\necho 'not json'\n"), 0755))
v := iexec.NewVerifier(script, "claude-sonnet-4-6", 5*time.Second)
_, err := v.Verify(context.Background(), "rules", "task", iexec.Result{
Status: "pass", Phase: "review", Skill: "review", Message: "ok",
})
assert.Error(t, err)
}
func TestVerifierErrorOnNonZeroExit(t *testing.T) {
dir := t.TempDir()
script := filepath.Join(dir, "claude")
require.NoError(t, os.WriteFile(script, []byte("#!/bin/sh\nexit 1\n"), 0755))
v := iexec.NewVerifier(script, "claude-sonnet-4-6", 5*time.Second)
_, err := v.Verify(context.Background(), "rules", "task", iexec.Result{
Status: "pass", Phase: "review", Skill: "review", Message: "ok",
})
assert.Error(t, err)
}

View File

@@ -32,9 +32,14 @@ type Entry struct {
type Attempt struct { type Attempt struct {
Attempt int `json:"attempt"` Attempt int `json:"attempt"`
Model string `json:"model"` Model string `json:"model"`
Tier string `json:"tier"` // local | subagent | managed
DurationMs int64 `json:"duration_ms"`
WarmStart bool `json:"warm_start"` // model already loaded in llama-swap
Verified bool `json:"verified"`
Verdict string `json:"verdict,omitempty"` // accept | escalate | error
Feedback string `json:"feedback,omitempty"` // verifier feedback on escalation
OutputSummary string `json:"output_summary,omitempty"` OutputSummary string `json:"output_summary,omitempty"`
RunnerOutput string `json:"runner_output,omitempty"` RunnerOutput string `json:"runner_output,omitempty"`
Verified bool `json:"verified"`
} }
// Append writes entry as a single JSON line to sessionsDir/{sessionID}.jsonl. // Append writes entry as a single JSON line to sessionsDir/{sessionID}.jsonl.

View File

@@ -61,3 +61,22 @@ func TestRead_EmptyWhenNoFile(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
assert.Empty(t, entries) assert.Empty(t, entries)
} }
func TestAttemptRoundTrip(t *testing.T) {
a := session.Attempt{
Attempt: 1,
Model: "ollama/devstral",
Tier: "local",
DurationMs: 4200,
WarmStart: true,
Verified: false,
Verdict: "escalate",
Feedback: "missing line references",
}
data, err := json.Marshal(a)
require.NoError(t, err)
var got session.Attempt
require.NoError(t, json.Unmarshal(data, &got))
assert.Equal(t, a, got)
}