Compare commits
19 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6328766c7f | ||
|
|
f1deedd39d | ||
|
|
5cb272a869 | ||
|
|
e96b39a812 | ||
|
|
5db5b33cd7 | ||
|
|
a32457b5bc | ||
|
|
e0be5f0f98 | ||
|
|
6d410b810b | ||
|
|
76f195de2a | ||
|
|
f901d4e67d | ||
|
|
509c04b6e4 | ||
|
|
738275252c | ||
|
|
38fcac4cba | ||
|
|
7697e901d2 | ||
|
|
8cff57009a | ||
|
|
8fb44affef | ||
|
|
582ca5019b | ||
|
|
858a9ba1a1 | ||
|
|
cbef2da8de |
@@ -53,6 +53,6 @@ jobs:
|
||||
chmod 600 ~/.ssh/id_rsa_gh_mirror
|
||||
ssh-keyscan github.com >> ~/.ssh/known_hosts 2>/dev/null
|
||||
GIT_SSH_COMMAND="ssh -i ~/.ssh/id_rsa_gh_mirror -o IdentitiesOnly=yes" \
|
||||
git push git@github.com:mathiasb/hyperguild.git HEAD:main --tags
|
||||
git push git@github.com:mathiasb/hyperguild.git HEAD:main --follow-tags
|
||||
rm ~/.ssh/id_rsa_gh_mirror
|
||||
echo "✓ Mirrored to GitHub"
|
||||
|
||||
@@ -13,6 +13,10 @@ import (
|
||||
"github.com/mathiasbq/supervisor/internal/skills/brain"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/org"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/retrospective"
|
||||
skilldebug "github.com/mathiasbq/supervisor/internal/skills/debug"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/review"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/spec"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/trainer"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/sessionlog"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/tdd"
|
||||
"github.com/mathiasbq/supervisor/internal/tier"
|
||||
@@ -51,11 +55,55 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
executor := iexec.New(iexec.Config{
|
||||
reviewPrompt, err := os.ReadFile(cfg.ConfigDir + "/review.md")
|
||||
if err != nil {
|
||||
logger.Error("read review.md", "path", cfg.ConfigDir+"/review.md", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
debugPrompt, err := os.ReadFile(cfg.ConfigDir + "/debug.md")
|
||||
if err != nil {
|
||||
logger.Error("read debug.md", "path", cfg.ConfigDir+"/debug.md", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
specPrompt, err := os.ReadFile(cfg.ConfigDir + "/spec.md")
|
||||
if err != nil {
|
||||
logger.Error("read spec.md", "path", cfg.ConfigDir+"/spec.md", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
trainerReaderPrompt, err := os.ReadFile(cfg.ConfigDir + "/trainer-reader.md")
|
||||
if err != nil {
|
||||
logger.Error("read trainer-reader.md", "path", cfg.ConfigDir+"/trainer-reader.md", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
trainerWriterPrompt, err := os.ReadFile(cfg.ConfigDir + "/trainer-writer.md")
|
||||
if err != nil {
|
||||
logger.Error("read trainer-writer.md", "path", cfg.ConfigDir+"/trainer-writer.md", "err", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
claudeExec := iexec.New(iexec.Config{
|
||||
SystemPrompt: string(systemPrompt),
|
||||
LiteLLMBaseURL: cfg.LiteLLMBaseURL,
|
||||
LiteLLMAPIKey: cfg.LiteLLMAPIKey,
|
||||
})
|
||||
litellmExec := iexec.NewLiteLLM(cfg.LiteLLMBaseURL, cfg.LiteLLMAPIKey, 0)
|
||||
verifier := iexec.NewVerifier("", models.Verifier(), 0)
|
||||
|
||||
buildOrch := func(skill string) func(ctx context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
return func(ctx context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
rawChain := models.ChainFor(skill, req.Model)
|
||||
chain := make([]iexec.ChainEntry, len(rawChain))
|
||||
for i, m := range rawChain {
|
||||
chain[i] = iexec.EntryFor(m)
|
||||
}
|
||||
attempts := make([]iexec.AttemptRecord, 0, len(chain))
|
||||
orch := iexec.NewOrchestrator(chain, litellmExec.Run, claudeExec.Run, verifier, models.LlamaSwapURL(), &attempts)
|
||||
return orch.Run(ctx, req)
|
||||
}
|
||||
}
|
||||
|
||||
tierFn := func(ctx context.Context) tier.Info {
|
||||
return tier.Detect(ctx, "https://api.anthropic.com", cfg.LiteLLMBaseURL)
|
||||
@@ -65,8 +113,9 @@ func main() {
|
||||
reg.Register(tdd.New(tdd.Config{
|
||||
SystemPrompt: string(systemPrompt),
|
||||
SkillPrompt: string(tddPrompt),
|
||||
DefaultModel: models.Resolve("tdd", ""),
|
||||
ExecutorFn: executor.Run,
|
||||
DefaultModel: models.ChainFor("tdd", "")[0],
|
||||
ExecutorFn: buildOrch("tdd"),
|
||||
SessionsDir: cfg.SessionsDir,
|
||||
}))
|
||||
reg.Register(brain.New(brain.Config{
|
||||
IngestBaseURL: cfg.IngestBaseURL,
|
||||
@@ -79,9 +128,35 @@ func main() {
|
||||
}))
|
||||
reg.Register(retrospective.New(retrospective.Config{
|
||||
SkillPrompt: string(retroPrompt),
|
||||
DefaultModel: models.Resolve("retrospective", ""),
|
||||
DefaultModel: models.ChainFor("retrospective", "")[0],
|
||||
SessionsDir: cfg.SessionsDir,
|
||||
ExecutorFn: executor.Run,
|
||||
ExecutorFn: buildOrch("retrospective"),
|
||||
}))
|
||||
reg.Register(review.New(review.Config{
|
||||
SkillPrompt: string(reviewPrompt),
|
||||
DefaultModel: models.ChainFor("review", "")[0],
|
||||
ExecutorFn: buildOrch("review"),
|
||||
SessionsDir: cfg.SessionsDir,
|
||||
}))
|
||||
reg.Register(skilldebug.New(skilldebug.Config{
|
||||
SkillPrompt: string(debugPrompt),
|
||||
DefaultModel: models.ChainFor("debug", "")[0],
|
||||
ExecutorFn: buildOrch("debug"),
|
||||
SessionsDir: cfg.SessionsDir,
|
||||
}))
|
||||
reg.Register(spec.New(spec.Config{
|
||||
SkillPrompt: string(specPrompt),
|
||||
DefaultModel: models.ChainFor("spec", "")[0],
|
||||
ExecutorFn: buildOrch("spec"),
|
||||
SessionsDir: cfg.SessionsDir,
|
||||
}))
|
||||
reg.Register(trainer.New(trainer.Config{
|
||||
ReaderPrompt: string(trainerReaderPrompt),
|
||||
WriterPrompt: string(trainerWriterPrompt),
|
||||
DefaultModel: models.ChainFor("trainer", "")[0],
|
||||
ExecutorFn: buildOrch("trainer"),
|
||||
SessionsDir: cfg.SessionsDir,
|
||||
BrainDir: cfg.BrainDir,
|
||||
}))
|
||||
|
||||
srv := mcp.NewServer(reg)
|
||||
|
||||
@@ -1,11 +1,41 @@
|
||||
# Model routing table — three-layer priority:
|
||||
# 1. model param in MCP tool call (caller override)
|
||||
# 2. per-skill entry here
|
||||
# 3. default (fallback)
|
||||
default: ollama/qwen3-coder-30b-tuned
|
||||
# Model routing chains — three-layer priority:
|
||||
# 1. model param in MCP tool call (caller override — collapses to single entry, no escalation)
|
||||
# 2. per-skill chain here
|
||||
# 3. default_chain fallback
|
||||
|
||||
verifier: claude-sonnet-4-6 # fixed verifier for all local tiers
|
||||
|
||||
llama_swap_url: http://koala:8080 # for warm-state probing
|
||||
|
||||
default_chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-6
|
||||
|
||||
skills:
|
||||
tdd: ollama/qwen3-coder-30b-tuned
|
||||
review: ollama/devstral-tuned
|
||||
debug: ollama/deepseek-r1-tuned
|
||||
retrospective: ollama/qwen3-coder-30b-tuned
|
||||
tdd:
|
||||
chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-6
|
||||
review:
|
||||
chain:
|
||||
- ollama/devstral-tuned
|
||||
- ollama/gemma4
|
||||
- claude-sonnet-4-6
|
||||
debug:
|
||||
chain:
|
||||
- ollama/deepseek-r1-tuned
|
||||
- claude-sonnet-4-6
|
||||
spec:
|
||||
chain:
|
||||
- ollama/phi4
|
||||
- ollama/gemma4
|
||||
- claude-sonnet-4-6
|
||||
- claude-opus-4-6
|
||||
retrospective:
|
||||
chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-6
|
||||
trainer:
|
||||
chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-6
|
||||
|
||||
31
config/supervisor/debug.md
Normal file
31
config/supervisor/debug.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Debug Discipline
|
||||
|
||||
You are a systematic debugger. Form hypotheses before suggesting fixes.
|
||||
|
||||
## Iron laws
|
||||
1. Never suggest "try X and see what happens" — every hypothesis must have a specific expected outcome if correct
|
||||
2. Generate exactly 3-5 hypotheses, ordered by likelihood (most likely first)
|
||||
3. Never fix the bug — diagnose only; the caller decides what to do with the hypotheses
|
||||
|
||||
## Output contract
|
||||
Return JSON result with:
|
||||
- `status`: "pass" (hypotheses generated) or "error" (error too ambiguous to analyse)
|
||||
- `phase`: "debug"
|
||||
- `skill`: "debug"
|
||||
- `file_path`: the most relevant file to the error (read it)
|
||||
- `runner_output`: your hypotheses, formatted as:
|
||||
```
|
||||
HYPOTHESIS 1 (likelihood: high): <mechanism>
|
||||
VERIFY: <exact command or file to check> → expected if correct: <specific output>
|
||||
|
||||
HYPOTHESIS 2 (likelihood: medium): <mechanism>
|
||||
VERIFY: <exact command or file to check> → expected if correct: <specific output>
|
||||
```
|
||||
- `verified`: false — verification is the caller's job
|
||||
- `message`: "N hypotheses for: <one-line error summary>"
|
||||
|
||||
## Rules
|
||||
1. Read the error and any context files provided before forming hypotheses
|
||||
2. Identify the failure mode first — what actually went wrong, not just what the error says
|
||||
3. For each hypothesis: name the mechanism, explain why it would produce this exact error, give a concrete verification command with expected output
|
||||
4. If the error is clearly a typo or trivial mistake, still form 3 hypotheses — surface the most likely cause as #1
|
||||
30
config/supervisor/review.md
Normal file
30
config/supervisor/review.md
Normal file
@@ -0,0 +1,30 @@
|
||||
# Code Review Discipline
|
||||
|
||||
You are a disciplined code reviewer. Read files carefully before commenting.
|
||||
|
||||
## Iron laws
|
||||
1. Never approve security vulnerabilities: command injection, SQL injection, credential exposure, path traversal, unchecked input at system boundaries
|
||||
2. Never approve silently swallowed errors — `err != nil` without wrapping or handling is always wrong
|
||||
3. Never approve missing validation at system boundaries (user input, external APIs, file reads)
|
||||
|
||||
## Output contract
|
||||
Return JSON result with:
|
||||
- `status`: "pass" if no blocking issues; "fail" if any iron law is violated
|
||||
- `phase`: "review"
|
||||
- `skill`: "review"
|
||||
- `file_path`: first file reviewed
|
||||
- `runner_output`: full review formatted as:
|
||||
```
|
||||
CRITICAL: <issue> at <file>:<line>
|
||||
WARNING: <issue> at <file>:<line>
|
||||
SUGGESTION: <issue> at <file>:<line>
|
||||
```
|
||||
- `verified`: true if you read all specified files; false if any were missing or unreadable
|
||||
- `message`: "N critical, M warnings, K suggestions" or "clean: <which iron law checks passed and why>"
|
||||
|
||||
## Rules
|
||||
1. Read every file listed before writing feedback
|
||||
2. Check iron laws first — any violation is CRITICAL and sets status to "fail"
|
||||
3. Then check: correctness, test coverage for new code, Go style conventions
|
||||
4. Never rubber-stamp — if nothing is wrong, explain specifically which iron law checks you ran and why they passed
|
||||
5. Line references are required for every finding — "roughly around the middle" is not acceptable
|
||||
46
config/supervisor/spec.md
Normal file
46
config/supervisor/spec.md
Normal file
@@ -0,0 +1,46 @@
|
||||
# Spec Writing Discipline
|
||||
|
||||
You write structured implementation specs. Nothing is left ambiguous.
|
||||
|
||||
## Iron laws
|
||||
1. Success criteria must be measurable — "the system is fast" is banned; "p99 < 200ms under 100 RPS" is valid
|
||||
2. Always include an explicit "Out of scope" section — if you don't draw the boundary, the developer will guess wrong
|
||||
3. Every technical decision in the approach must have a rationale
|
||||
|
||||
## Output contract
|
||||
Return JSON result with:
|
||||
- `status`: "pass" (spec written) or "error" (requirements too ambiguous to spec without more input)
|
||||
- `phase`: "spec"
|
||||
- `skill`: "spec"
|
||||
- `file_path`: the output_path where the spec was written (absolute path)
|
||||
- `runner_output`: ""
|
||||
- `verified`: true if the file was written successfully
|
||||
- `message`: "spec written: <one-line summary of what was specced>"
|
||||
|
||||
## Spec structure
|
||||
Write the spec as markdown to the output_path:
|
||||
|
||||
```markdown
|
||||
# [Feature] Spec
|
||||
|
||||
## Problem statement
|
||||
[What problem does this solve? For whom? Why now?]
|
||||
|
||||
## Success criteria
|
||||
- [ ] [Criterion 1 — measurable and verifiable]
|
||||
- [ ] [Criterion 2 — measurable and verifiable]
|
||||
|
||||
## Constraints
|
||||
[Non-negotiable requirements the solution must satisfy]
|
||||
|
||||
## Out of scope
|
||||
[What we are explicitly NOT doing in this iteration]
|
||||
|
||||
## Technical approach
|
||||
[Architecture decisions, key components, rationale for each choice]
|
||||
|
||||
## Risks
|
||||
[What could go wrong, and how we'd mitigate it]
|
||||
```
|
||||
|
||||
If the requirements are too vague to produce measurable success criteria, return status "error" with a message listing the specific questions that need answers.
|
||||
31
config/supervisor/trainer-reader.md
Normal file
31
config/supervisor/trainer-reader.md
Normal file
@@ -0,0 +1,31 @@
|
||||
# Trainer Reader Discipline
|
||||
|
||||
You scan session logs and identify candidate learning moments worth converting to training data.
|
||||
|
||||
## What to look for
|
||||
- **SFT candidates**: the worker did exactly the right thing — a clean pattern worth reinforcing
|
||||
- **DPO candidates**: the worker first produced a wrong or suboptimal response, then corrected — you have both rejected and chosen
|
||||
|
||||
## Scoring (1–5)
|
||||
- 5: novel pattern, clearly correct, generalises across projects
|
||||
- 4: good pattern, correct, somewhat project-specific but still useful
|
||||
- 3: correct but obvious — include only if especially clean
|
||||
- 2 or below: skip — too ambiguous or too context-specific
|
||||
|
||||
## Output contract
|
||||
Return JSON result with:
|
||||
- `status`: "pass" or "error"
|
||||
- `phase`: "trainer"
|
||||
- `skill`: "trainer"
|
||||
- `file_path`: ""
|
||||
- `runner_output`: JSON array of candidates (valid JSON, not markdown):
|
||||
[{"type":"sft","moment":"<what happened>","prompt":"<what was asked>","completion":"<what was done right>","score":4},
|
||||
{"type":"dpo","moment":"<what happened>","prompt":"<what was asked>","chosen":"<correct>","rejected":"<incorrect>","score":3}]
|
||||
- `verified`: true
|
||||
- `message`: "N sft candidates, M dpo candidates found"
|
||||
|
||||
## Rules
|
||||
1. Read all session entries in the task prompt
|
||||
2. Score each entry — only include entries scoring >= 3
|
||||
3. Prompt/completion fields must be phrased to generalise: no project-specific paths or names
|
||||
4. If no candidates score >= 3, return an empty array `[]` — never force low-quality candidates
|
||||
35
config/supervisor/trainer-writer.md
Normal file
35
config/supervisor/trainer-writer.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Trainer Writer Discipline
|
||||
|
||||
You receive candidate learning moments from the reader and write clean SFT/DPO training pairs.
|
||||
|
||||
## Quality gate (apply before writing)
|
||||
- SFT: prompt must be phrased so it could come from any project, not just this one
|
||||
- DPO: chosen and rejected must be clearly distinguishable — skip if a reader can't tell which is better
|
||||
- Never include project-specific paths, variable names, or identifiers in any pair
|
||||
|
||||
## Output contract
|
||||
Return JSON result with:
|
||||
- `status`: "pass" (pairs written or skipped due to quality) or "error" (candidates JSON was malformed)
|
||||
- `phase`: "trainer"
|
||||
- `skill`: "trainer"
|
||||
- `file_path`: path of the last file written (empty if nothing passed quality gate)
|
||||
- `runner_output`: "N SFT pairs written to brain/training-data/sft/, M DPO pairs to brain/training-data/dpo/" or "0 pairs passed quality gate"
|
||||
- `verified`: true if files were written; false if nothing passed
|
||||
- `message`: "N sft + M dpo pairs for session <id>" or "no pairs passed quality gate"
|
||||
|
||||
## File format
|
||||
JSONL — one JSON object per line.
|
||||
|
||||
SFT: `{"prompt": "...", "completion": "..."}`
|
||||
DPO: `{"prompt": "...", "chosen": "...", "rejected": "..."}`
|
||||
|
||||
Write SFT to: `<brain_dir>/training-data/sft/<session_id>.jsonl`
|
||||
Write DPO to: `<brain_dir>/training-data/dpo/<session_id>.jsonl`
|
||||
|
||||
Append to existing files if they exist (don't overwrite).
|
||||
|
||||
## Rules
|
||||
1. Parse the `reader_candidates` JSON from the task prompt
|
||||
2. For each candidate: apply quality gate
|
||||
3. Write passing SFT candidates to sft JSONL, DPO candidates to dpo JSONL
|
||||
4. If nothing passes, return status "pass" with verified: false and message "no pairs passed quality gate"
|
||||
322
docs/superpowers/specs/2026-04-20-model-orchestration-design.md
Normal file
322
docs/superpowers/specs/2026-04-20-model-orchestration-design.md
Normal file
@@ -0,0 +1,322 @@
|
||||
# Model Orchestration Design
|
||||
|
||||
**Date:** 2026-04-20
|
||||
**Status:** Approved for implementation
|
||||
|
||||
## Problem statement
|
||||
|
||||
The hyperguild supervisor currently spawns a `claude --print` subprocess for every skill call. The model routing config (`models.yaml`) exists but is dead weight — the model name is injected as text into the task prompt and ignored. Every skill call costs Claude tokens regardless of task complexity or data sensitivity.
|
||||
|
||||
## Goal
|
||||
|
||||
Route skill work to the most appropriate model — weighing cost, latency, and quality — with Claude acting as the real supervisor: verifying outputs and deciding when to escalate. Local models on owned hardware handle the common case; Claude escalates through a chain to frontier models only when local quality is insufficient.
|
||||
|
||||
## Success criteria
|
||||
|
||||
- [ ] Each skill dispatches generation to its configured local model via LiteLLM by default
|
||||
- [ ] Claude verifies every local output and either accepts or escalates
|
||||
- [ ] Escalation walks a per-skill chain (local small → local large → Sonnet → Opus) with one attempt per tier
|
||||
- [ ] Every attempt (model, tier, duration, warm state, verdict) is logged in the session JSONL
|
||||
- [ ] Cloud tiers (Sonnet/Opus) self-certify — no separate verifier call
|
||||
- [ ] Zero changes to skill handlers — they call `ExecutorFn` exactly as today
|
||||
- [ ] `LiteLTMBaseURL` already in config; no new env vars required beyond `LLAMA_SWAP_URL`
|
||||
|
||||
## Constraints
|
||||
|
||||
- One attempt per tier before escalating (no retry within a tier)
|
||||
- Anthropic T&C: Claude is called normally via Anthropic API; local models are called directly via LiteLLM HTTP — no API redirection
|
||||
- `models.yaml` remains the single routing config file
|
||||
|
||||
## Out of scope
|
||||
|
||||
- Auto-rerouting based on real-time warm state (logged, not acted on — Phase 4)
|
||||
- Multi-tenant / public service exposure
|
||||
- RAG/CAG model boosting
|
||||
- Managed Agent cloud delegation (chain stub only in Phase 3)
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
MCP tool call (Claude Code)
|
||||
↓
|
||||
Skill handler — calls ExecutorFn (unchanged)
|
||||
↓
|
||||
Orchestrator.Run (implements ExecutorFn)
|
||||
├─ Resolve chain from models.yaml
|
||||
├─ For each model in chain:
|
||||
│ ├─ [ollama/*] → LiteLLM executor → generate
|
||||
│ │ ↓
|
||||
│ │ Claude verifier (task + output + discipline)
|
||||
│ │ ├─ accept → return Result (log attempt)
|
||||
│ │ └─ escalate → next tier (log attempt)
|
||||
│ │
|
||||
│ └─ [claude-*] → Claude executor (current) → generate + self-certify
|
||||
│ └─ return Result (log attempt)
|
||||
│
|
||||
└─ All tiers exhausted → return best attempt with escalation note
|
||||
```
|
||||
|
||||
Claude is always the verifier for local tiers. At cloud tiers, Claude generates and self-certifies — the verifier call is skipped.
|
||||
|
||||
---
|
||||
|
||||
## Components
|
||||
|
||||
### 1. `internal/exec/litellm.go` — LiteLLM executor
|
||||
|
||||
Calls `POST /v1/chat/completions` on the configured LiteLLM server. Implements the same `ExecutorFn` signature as the existing claude executor.
|
||||
|
||||
```go
|
||||
type LiteLLMExecutor struct {
|
||||
BaseURL string
|
||||
APIKey string
|
||||
HTTPClient *http.Client
|
||||
Timeout time.Duration
|
||||
}
|
||||
|
||||
func NewLiteLLM(baseURL, apiKey string, timeout time.Duration) *LiteLLMExecutor
|
||||
|
||||
func (e *LiteLLMExecutor) Run(ctx context.Context, req Request) (Result, error)
|
||||
```
|
||||
|
||||
Request mapping:
|
||||
- `req.SkillPrompt` → system message
|
||||
- `req.TaskPrompt` → user message
|
||||
- `req.Model` → `model` field in the chat completions request
|
||||
|
||||
Response handling: local models are prompted (via the discipline file output contract) to return a JSON object matching the `Result` schema. The executor attempts `json.Unmarshal` into `Result` directly — no envelope unwrapping needed (unlike the `--output-format json` claude envelope). If unmarshalling fails, the executor returns an error that the orchestrator treats as an automatic escalation trigger.
|
||||
|
||||
### 2. `internal/exec/verifier.go` — Claude verifier
|
||||
|
||||
A focused Claude call that judges local model output. Uses the existing `Executor` (claude subprocess) internally.
|
||||
|
||||
```go
|
||||
type Verdict struct {
|
||||
Accept bool `json:"accept"`
|
||||
Feedback string `json:"feedback"` // reason if not accepting; empty if accept
|
||||
}
|
||||
|
||||
type Verifier struct {
|
||||
executor *Executor // the existing claude executor
|
||||
}
|
||||
|
||||
func NewVerifier(executor *Executor) *Verifier
|
||||
|
||||
func (v *Verifier) Verify(ctx context.Context, skillPrompt, taskPrompt string, output Result) (Verdict, error)
|
||||
```
|
||||
|
||||
The verifier prompt gives Claude:
|
||||
1. The skill discipline file (so it knows the iron laws and output contract)
|
||||
2. The original task prompt (informed verification — Claude sees what was asked)
|
||||
3. The generated output
|
||||
4. A short instruction: "Does this output satisfy the discipline's iron laws and output contract? Reply with JSON: `{\"accept\": true|false, \"feedback\": \"...\"}`"
|
||||
|
||||
The verifier uses a lightweight JSON schema for its own output (a `Verdict` schema), keeping the call fast.
|
||||
|
||||
### 3. `internal/exec/orchestrator.go` — chain walker
|
||||
|
||||
Implements `ExecutorFn`. Walks the escalation chain, delegating generation and verification per tier.
|
||||
|
||||
```go
|
||||
type Chain []ChainEntry
|
||||
|
||||
type ChainEntry struct {
|
||||
Model string // e.g. "ollama/phi4", "claude-sonnet-4-5"
|
||||
Tier string // "local" | "subagent" | "managed"
|
||||
IsCloud bool // true for claude-* models; skips verifier
|
||||
}
|
||||
|
||||
type Orchestrator struct {
|
||||
chain Chain
|
||||
litellm *LiteLLMExecutor
|
||||
claude *Executor
|
||||
verifier *Verifier
|
||||
llamaSwapURL string // for warm-state probe
|
||||
}
|
||||
|
||||
func NewOrchestrator(chain Chain, litellm *LiteLLMExecutor, claude *Executor, verifier *Verifier, llamaSwapURL string) *Orchestrator
|
||||
|
||||
func (o *Orchestrator) Run(ctx context.Context, req Request) (Result, error)
|
||||
```
|
||||
|
||||
Algorithm:
|
||||
```
|
||||
for each entry in chain:
|
||||
warm = probe llama-swap (if local tier)
|
||||
start = now()
|
||||
if entry.IsCloud:
|
||||
result, err = claude.Run(ctx, req with entry.Model)
|
||||
log attempt(model, tier, duration, warm, verified=true)
|
||||
if err == nil: return result
|
||||
else:
|
||||
result, err = litellm.Run(ctx, req with entry.Model)
|
||||
duration = now() - start
|
||||
if err != nil:
|
||||
log attempt(model, tier, duration, warm, verified=false)
|
||||
continue // automatic escalation on parse/network error
|
||||
verdict = verifier.Verify(ctx, req.SkillPrompt, req.TaskPrompt, result)
|
||||
log attempt(model, tier, duration, warm, verified=verdict.Accept)
|
||||
if verdict.Accept: return result
|
||||
// inject verifier feedback into next tier's task prompt
|
||||
req.TaskPrompt = req.TaskPrompt + "\n\nPrior attempt feedback: " + verdict.Feedback
|
||||
|
||||
return error("all tiers exhausted")
|
||||
```
|
||||
|
||||
### 4. `internal/config/models.go` — chain parser
|
||||
|
||||
Replaces the current single-model resolution with chain parsing.
|
||||
|
||||
Updated `models.yaml` format:
|
||||
|
||||
```yaml
|
||||
verifier: claude-sonnet-4-6 # fixed verifier for all local tiers
|
||||
|
||||
llama_swap_url: http://koala:8080 # for warm-state probing
|
||||
|
||||
default_chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-5
|
||||
|
||||
skills:
|
||||
tdd:
|
||||
chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-5
|
||||
review:
|
||||
chain:
|
||||
- ollama/devstral-tuned
|
||||
- ollama/gemma4
|
||||
- claude-sonnet-4-5
|
||||
debug:
|
||||
chain:
|
||||
- ollama/deepseek-r1-tuned
|
||||
- claude-sonnet-4-5
|
||||
spec:
|
||||
chain:
|
||||
- ollama/phi4
|
||||
- ollama/gemma4
|
||||
- claude-sonnet-4-5
|
||||
- claude-opus-4-6
|
||||
retrospective:
|
||||
chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-5
|
||||
trainer:
|
||||
chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-5
|
||||
```
|
||||
|
||||
The parser exposes:
|
||||
```go
|
||||
func (m *Models) ChainFor(skill string) Chain
|
||||
func (m *Models) Verifier() string
|
||||
func (m *Models) LlamaSwapURL() string
|
||||
```
|
||||
|
||||
Caller override (`model` param in MCP tool call) pins the chain to a single entry — one model, no escalation. This preserves the existing override behaviour for power users.
|
||||
|
||||
### 5. `internal/session/session.go` — updated `Attempt` struct
|
||||
|
||||
```go
|
||||
type Attempt struct {
|
||||
Attempt int `json:"attempt"`
|
||||
Model string `json:"model"`
|
||||
Tier string `json:"tier"` // local | subagent | managed
|
||||
DurationMs int64 `json:"duration_ms"`
|
||||
WarmStart bool `json:"warm_start"` // model was already loaded in llama-swap
|
||||
Verified bool `json:"verified"`
|
||||
Verdict string `json:"verdict,omitempty"` // accept | escalate | error
|
||||
Feedback string `json:"feedback,omitempty"` // verifier feedback on escalation
|
||||
OutputSummary string `json:"output_summary,omitempty"`
|
||||
RunnerOutput string `json:"runner_output,omitempty"`
|
||||
}
|
||||
```
|
||||
|
||||
### 6. `cmd/supervisor/main.go` — one wiring change
|
||||
|
||||
```go
|
||||
// Before:
|
||||
reg.Register(review.New(review.Config{ExecutorFn: executor.Run, ...}))
|
||||
|
||||
// After:
|
||||
chain := models.ChainFor("review")
|
||||
orch := exec.NewOrchestrator(chain, litellmExec, claudeExec, verifier, models.LlamaSwapURL())
|
||||
reg.Register(review.New(review.Config{ExecutorFn: orch.Run, ...}))
|
||||
```
|
||||
|
||||
One orchestrator per skill, sharing the same `litellmExec`, `claudeExec`, and `verifier` instances.
|
||||
|
||||
---
|
||||
|
||||
## Data flow example: `review` skill call
|
||||
|
||||
1. Claude Code calls `review` tool with `files: ["internal/foo.go"]`
|
||||
2. Skill handler builds task prompt, calls `orch.Run`
|
||||
3. Orchestrator resolves chain: `[devstral, gemma4, sonnet]`
|
||||
4. Probes llama-swap: devstral is warm
|
||||
5. LiteLLM calls devstral → returns JSON result
|
||||
6. Verifier asks Claude: "does this review satisfy the iron laws?"
|
||||
7. Claude: `{"accept": false, "feedback": "missing line references for all findings"}`
|
||||
8. Orchestrator logs attempt #1 (devstral, local, 4200ms, warm, escalate)
|
||||
9. Injects feedback into task prompt, calls gemma4
|
||||
10. Verifier: `{"accept": true}`
|
||||
11. Orchestrator logs attempt #2 (gemma4, local, 6100ms, cold, accept)
|
||||
12. Returns result to skill handler → MCP response
|
||||
|
||||
Session JSONL records both attempts. You can see: devstral was warm but produced weak output; gemma4 was cold but passed.
|
||||
|
||||
---
|
||||
|
||||
## Observability
|
||||
|
||||
Session JSONL is the primary store. Each `Entry.Attempts` slice records the full escalation trail. To analyse across sessions:
|
||||
|
||||
```bash
|
||||
# Which models are escalating most?
|
||||
jq -r '.attempts[] | select(.verdict == "escalate") | .model' brain/sessions/*.jsonl | sort | uniq -c
|
||||
|
||||
# Average latency per model
|
||||
jq -r '.attempts[] | [.model, .duration_ms] | @tsv' brain/sessions/*.jsonl | awk '{sum[$1]+=$2; n[$1]++} END {for (m in sum) print m, sum[m]/n[m]}'
|
||||
|
||||
# Cold start frequency
|
||||
jq -r '.attempts[] | select(.warm_start == false) | .model' brain/sessions/*.jsonl | sort | uniq -c
|
||||
```
|
||||
|
||||
No new metrics infrastructure needed for Phase 3. Phase 4 can build a dashboard on top of this data.
|
||||
|
||||
---
|
||||
|
||||
## Error handling
|
||||
|
||||
| Scenario | Behaviour |
|
||||
|----------|-----------|
|
||||
| LiteLLM unreachable | Log attempt as error, escalate immediately |
|
||||
| Local model returns unparseable JSON | Log attempt as error, escalate |
|
||||
| Verifier call fails | Log, treat as escalate (safe default) |
|
||||
| All tiers exhausted | Return error to skill handler; skill returns MCP error to caller |
|
||||
| Caller passes `model` override | Single-entry chain, no escalation, no verifier call |
|
||||
|
||||
---
|
||||
|
||||
## Testing approach
|
||||
|
||||
- `TestLiteLLMExecutor`: mock HTTP server returning valid/invalid JSON; verify parse logic and error escalation
|
||||
- `TestVerifier`: fake claude executor returning accept/escalate verdicts; verify prompt construction
|
||||
- `TestOrchestrator`: table-driven — chains of 1/2/3 tiers, various accept/escalate/error combinations; verify attempt log contents and final result
|
||||
- `TestModelsChainFor`: YAML parsing for all skill overrides and default_chain fallback
|
||||
- Integration smoke test: start real LiteLLM (or mock), call `review` tool via MCP, verify attempt log written
|
||||
|
||||
---
|
||||
|
||||
## Risks
|
||||
|
||||
| Risk | Mitigation |
|
||||
|------|------------|
|
||||
| Local models ignore output contract → bad JSON | Discipline files already specify JSON output contract; parse failure auto-escalates |
|
||||
| Verifier Claude call adds latency to every local attempt | Verifier prompt is small and fast; acceptable tradeoff for quality gate |
|
||||
| llama-swap warm probe adds overhead | Probe is a single lightweight HTTP GET; timeout at 200ms, treat failure as `warm_start: false` |
|
||||
| Chain exhaustion leaves caller with no result | Return structured error via MCP; caller can retry with explicit `model` override |
|
||||
@@ -7,9 +7,15 @@ import (
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
type skillChain struct {
|
||||
Chain []string `yaml:"chain"`
|
||||
}
|
||||
|
||||
type modelsFile struct {
|
||||
Default string `yaml:"default"`
|
||||
Skills map[string]string `yaml:"skills"`
|
||||
Verifier string `yaml:"verifier"`
|
||||
LlamaSwapURL string `yaml:"llama_swap_url"`
|
||||
DefaultChain []string `yaml:"default_chain"`
|
||||
Skills map[string]skillChain `yaml:"skills"`
|
||||
}
|
||||
|
||||
type Models struct {
|
||||
@@ -28,16 +34,23 @@ func LoadModels(path string) (Models, error) {
|
||||
return Models{data: f}, nil
|
||||
}
|
||||
|
||||
// Resolve returns the model for a skill, respecting three-layer priority:
|
||||
// 1. override (from MCP call) — highest
|
||||
// 2. per-skill default from models.yaml
|
||||
// 3. global default
|
||||
func (m Models) Resolve(skill, override string) string {
|
||||
// Verifier returns the model name to use for all local-tier output verification.
|
||||
func (m Models) Verifier() string { return m.data.Verifier }
|
||||
|
||||
// LlamaSwapURL returns the llama-swap base URL for warm-state probing.
|
||||
func (m Models) LlamaSwapURL() string { return m.data.LlamaSwapURL }
|
||||
|
||||
// ChainFor returns the ordered list of model names for a skill.
|
||||
// If override is non-empty, returns a single-entry chain (no escalation).
|
||||
// Falls back to default_chain when the skill has no explicit entry.
|
||||
func (m Models) ChainFor(skill, override string) []string {
|
||||
if override != "" {
|
||||
return override
|
||||
return []string{override}
|
||||
}
|
||||
if model, ok := m.data.Skills[skill]; ok {
|
||||
return model
|
||||
if sc, ok := m.data.Skills[skill]; ok && len(sc.Chain) > 0 {
|
||||
return sc.Chain
|
||||
}
|
||||
return m.data.Default
|
||||
out := make([]string, len(m.data.DefaultChain))
|
||||
copy(out, m.data.DefaultChain)
|
||||
return out
|
||||
}
|
||||
|
||||
@@ -10,35 +10,71 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestModelsResolve(t *testing.T) {
|
||||
yaml := `
|
||||
default: ollama/default-model
|
||||
const testYAML = `
|
||||
verifier: claude-sonnet-4-6
|
||||
llama_swap_url: http://koala:8080
|
||||
|
||||
default_chain:
|
||||
- ollama/qwen3-coder-30b-tuned
|
||||
- claude-sonnet-4-6
|
||||
|
||||
skills:
|
||||
tdd: ollama/qwen3-coder-30b-tuned
|
||||
review: ollama/devstral-tuned
|
||||
review:
|
||||
chain:
|
||||
- ollama/devstral-tuned
|
||||
- ollama/gemma4
|
||||
- claude-sonnet-4-6
|
||||
spec:
|
||||
chain:
|
||||
- ollama/phi4
|
||||
- claude-opus-4-6
|
||||
`
|
||||
|
||||
func writeModels(t *testing.T, content string) string {
|
||||
t.Helper()
|
||||
f := filepath.Join(t.TempDir(), "models.yaml")
|
||||
require.NoError(t, os.WriteFile(f, []byte(yaml), 0644))
|
||||
|
||||
m, err := config.LoadModels(f)
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "ollama/qwen3-coder-30b-tuned", m.Resolve("tdd", ""))
|
||||
assert.Equal(t, "ollama/devstral-tuned", m.Resolve("review", ""))
|
||||
assert.Equal(t, "ollama/default-model", m.Resolve("unknown", ""))
|
||||
require.NoError(t, os.WriteFile(f, []byte(content), 0644))
|
||||
return f
|
||||
}
|
||||
|
||||
func TestModelsOverride(t *testing.T) {
|
||||
yaml := `
|
||||
default: ollama/default-model
|
||||
skills:
|
||||
tdd: ollama/qwen3-coder-30b-tuned
|
||||
`
|
||||
f := filepath.Join(t.TempDir(), "models.yaml")
|
||||
require.NoError(t, os.WriteFile(f, []byte(yaml), 0644))
|
||||
func TestModelsVerifier(t *testing.T) {
|
||||
m, err := config.LoadModels(writeModels(t, testYAML))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "claude-sonnet-4-6", m.Verifier())
|
||||
}
|
||||
|
||||
m, err := config.LoadModels(f)
|
||||
func TestModelsLlamaSwapURL(t *testing.T) {
|
||||
m, err := config.LoadModels(writeModels(t, testYAML))
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "http://koala:8080", m.LlamaSwapURL())
|
||||
}
|
||||
|
||||
func TestModelsChainForSkillOverride(t *testing.T) {
|
||||
m, err := config.LoadModels(writeModels(t, testYAML))
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, "anthropic/claude-sonnet-4-6", m.Resolve("tdd", "anthropic/claude-sonnet-4-6"))
|
||||
chain := m.ChainFor("review", "")
|
||||
require.Len(t, chain, 3)
|
||||
assert.Equal(t, "ollama/devstral-tuned", chain[0])
|
||||
assert.Equal(t, "ollama/gemma4", chain[1])
|
||||
assert.Equal(t, "claude-sonnet-4-6", chain[2])
|
||||
}
|
||||
|
||||
func TestModelsChainForDefaultFallback(t *testing.T) {
|
||||
m, err := config.LoadModels(writeModels(t, testYAML))
|
||||
require.NoError(t, err)
|
||||
|
||||
chain := m.ChainFor("trainer", "") // not in skills map
|
||||
require.Len(t, chain, 2)
|
||||
assert.Equal(t, "ollama/qwen3-coder-30b-tuned", chain[0])
|
||||
assert.Equal(t, "claude-sonnet-4-6", chain[1])
|
||||
}
|
||||
|
||||
func TestModelsChainForCallerOverride(t *testing.T) {
|
||||
m, err := config.LoadModels(writeModels(t, testYAML))
|
||||
require.NoError(t, err)
|
||||
|
||||
chain := m.ChainFor("review", "claude-opus-4-6")
|
||||
require.Len(t, chain, 1)
|
||||
assert.Equal(t, "claude-opus-4-6", chain[0])
|
||||
}
|
||||
|
||||
@@ -72,8 +72,11 @@ func (e *Executor) Run(ctx context.Context, req Request) (Result, error) {
|
||||
"--tools", tools,
|
||||
"--json-schema", Schema,
|
||||
"--output-format", "json",
|
||||
prompt,
|
||||
}
|
||||
if strings.HasPrefix(req.Model, "claude-") {
|
||||
args = append(args, "--model", req.Model)
|
||||
}
|
||||
args = append(args, prompt)
|
||||
|
||||
cmd := exec.CommandContext(ctx, e.cfg.ClaudeBinary, args...)
|
||||
cmd.Env = append(os.Environ(), "LITELLM_API_KEY="+e.cfg.LiteLLMAPIKey)
|
||||
|
||||
@@ -75,3 +75,58 @@ func TestExecutorTimesOut(t *testing.T) {
|
||||
_, err := ex.Run(context.Background(), iexec.Request{TaskPrompt: "slow"})
|
||||
assert.ErrorContains(t, err, "timeout")
|
||||
}
|
||||
|
||||
func TestExecutorPassesModelFlagForCloudModel(t *testing.T) {
|
||||
// The script captures its args to a temp file so we can assert --model was passed.
|
||||
argsFile := filepath.Join(t.TempDir(), "args.txt")
|
||||
envelope := `{"type":"result","subtype":"success","is_error":false,"structured_output":{"status":"pass","phase":"review","skill":"review","file_path":"","runner_output":"","verified":true,"model_used":"claude-sonnet-4-6","message":"ok"}}`
|
||||
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "claude")
|
||||
content := "#!/bin/sh\necho \"$@\" > " + argsFile + "\necho '" + envelope + "'\n"
|
||||
require.NoError(t, os.WriteFile(script, []byte(content), 0755))
|
||||
|
||||
ex := iexec.New(iexec.Config{
|
||||
ClaudeBinary: script,
|
||||
SystemPrompt: "sys",
|
||||
Timeout: 5 * time.Second,
|
||||
})
|
||||
|
||||
_, err := ex.Run(context.Background(), iexec.Request{
|
||||
SkillPrompt: "review rules",
|
||||
TaskPrompt: "do review",
|
||||
Model: "claude-sonnet-4-6",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
argsData, err := os.ReadFile(argsFile)
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, string(argsData), "--model claude-sonnet-4-6")
|
||||
}
|
||||
|
||||
func TestExecutorSkipsModelFlagForLocalModel(t *testing.T) {
|
||||
argsFile := filepath.Join(t.TempDir(), "args.txt")
|
||||
envelope := `{"type":"result","subtype":"success","is_error":false,"structured_output":{"status":"pass","phase":"review","skill":"review","file_path":"","runner_output":"","verified":true,"model_used":"ollama/devstral","message":"ok"}}`
|
||||
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "claude")
|
||||
content := "#!/bin/sh\necho \"$@\" > " + argsFile + "\necho '" + envelope + "'\n"
|
||||
require.NoError(t, os.WriteFile(script, []byte(content), 0755))
|
||||
|
||||
ex := iexec.New(iexec.Config{
|
||||
ClaudeBinary: script,
|
||||
SystemPrompt: "sys",
|
||||
Timeout: 5 * time.Second,
|
||||
})
|
||||
|
||||
_, err := ex.Run(context.Background(), iexec.Request{
|
||||
SkillPrompt: "review rules",
|
||||
TaskPrompt: "do review",
|
||||
Model: "ollama/devstral",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
argsData, err := os.ReadFile(argsFile)
|
||||
require.NoError(t, err)
|
||||
assert.NotContains(t, string(argsData), "--model")
|
||||
}
|
||||
|
||||
103
internal/exec/litellm.go
Normal file
103
internal/exec/litellm.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package exec
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LiteLLMExecutor calls a LiteLLM-compatible /v1/chat/completions endpoint.
|
||||
// Local models are expected to return a JSON object matching the Result schema
|
||||
// as their response content — no envelope.
|
||||
type LiteLLMExecutor struct {
|
||||
baseURL string
|
||||
apiKey string
|
||||
httpClient *http.Client
|
||||
}
|
||||
|
||||
// NewLiteLLM creates a LiteLLMExecutor.
|
||||
// timeout applies to the full HTTP round-trip per call.
|
||||
func NewLiteLLM(baseURL, apiKey string, timeout time.Duration) *LiteLLMExecutor {
|
||||
return &LiteLLMExecutor{
|
||||
baseURL: baseURL,
|
||||
apiKey: apiKey,
|
||||
httpClient: &http.Client{Timeout: timeout},
|
||||
}
|
||||
}
|
||||
|
||||
type litellmMessage struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
type litellmRequest struct {
|
||||
Model string `json:"model"`
|
||||
Messages []litellmMessage `json:"messages"`
|
||||
}
|
||||
|
||||
type litellmChoice struct {
|
||||
Message litellmMessage `json:"message"`
|
||||
}
|
||||
|
||||
type litellmResponse struct {
|
||||
Choices []litellmChoice `json:"choices"`
|
||||
}
|
||||
|
||||
// Run dispatches req to the LiteLLM server and parses the Result from the
|
||||
// assistant message content. Returns an error on network failure, non-200
|
||||
// status, or unparseable/invalid JSON — all of which the Orchestrator treats
|
||||
// as automatic escalation triggers.
|
||||
func (e *LiteLLMExecutor) Run(ctx context.Context, req Request) (Result, error) {
|
||||
body := litellmRequest{
|
||||
Model: req.Model,
|
||||
Messages: []litellmMessage{
|
||||
{Role: "system", Content: req.SkillPrompt},
|
||||
{Role: "user", Content: req.TaskPrompt},
|
||||
},
|
||||
}
|
||||
|
||||
bodyBytes, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("litellm: marshal request: %w", err)
|
||||
}
|
||||
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, e.baseURL+"/v1/chat/completions", bytes.NewReader(bodyBytes))
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("litellm: create request: %w", err)
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
if e.apiKey != "" {
|
||||
httpReq.Header.Set("Authorization", "Bearer "+e.apiKey)
|
||||
}
|
||||
|
||||
resp, err := e.httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return Result{}, fmt.Errorf("litellm: request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close() //nolint:errcheck
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return Result{}, fmt.Errorf("litellm: server returned status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var chatResp litellmResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&chatResp); err != nil {
|
||||
return Result{}, fmt.Errorf("litellm: decode response: %w", err)
|
||||
}
|
||||
if len(chatResp.Choices) == 0 {
|
||||
return Result{}, fmt.Errorf("litellm: no choices in response")
|
||||
}
|
||||
|
||||
content := chatResp.Choices[0].Message.Content
|
||||
var result Result
|
||||
if err := json.Unmarshal([]byte(content), &result); err != nil {
|
||||
return Result{}, fmt.Errorf("litellm: parse result JSON: %w — content: %s", err, content)
|
||||
}
|
||||
if err := result.Validate(); err != nil {
|
||||
return Result{}, fmt.Errorf("litellm: invalid result: %w", err)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
112
internal/exec/litellm_test.go
Normal file
112
internal/exec/litellm_test.go
Normal file
@@ -0,0 +1,112 @@
|
||||
package exec_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func validLiteLLMResult() iexec.Result {
|
||||
return iexec.Result{
|
||||
Status: "pass",
|
||||
Phase: "review",
|
||||
Skill: "review",
|
||||
ModelUsed: "ollama/devstral",
|
||||
Message: "looks good",
|
||||
}
|
||||
}
|
||||
|
||||
func chatResponseFor(t *testing.T, result iexec.Result) []byte {
|
||||
t.Helper()
|
||||
content, err := json.Marshal(result)
|
||||
require.NoError(t, err)
|
||||
resp := map[string]any{
|
||||
"choices": []map[string]any{
|
||||
{"message": map[string]any{"role": "assistant", "content": string(content)}},
|
||||
},
|
||||
}
|
||||
data, err := json.Marshal(resp)
|
||||
require.NoError(t, err)
|
||||
return data
|
||||
}
|
||||
|
||||
func TestLiteLLMParsesValidResult(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.Equal(t, "/v1/chat/completions", r.URL.Path)
|
||||
assert.Equal(t, "application/json", r.Header.Get("Content-Type"))
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write(chatResponseFor(t, validLiteLLMResult()))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
ex := iexec.NewLiteLLM(srv.URL, "", 5*time.Second)
|
||||
result, err := ex.Run(context.Background(), iexec.Request{
|
||||
SkillPrompt: "review rules",
|
||||
TaskPrompt: "review the code",
|
||||
Model: "ollama/devstral",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "pass", result.Status)
|
||||
assert.Equal(t, "review", result.Skill)
|
||||
}
|
||||
|
||||
func TestLiteLLMSendsAuthHeader(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.Equal(t, "Bearer secret", r.Header.Get("Authorization"))
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write(chatResponseFor(t, validLiteLLMResult()))
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
ex := iexec.NewLiteLLM(srv.URL, "secret", 5*time.Second)
|
||||
_, err := ex.Run(context.Background(), iexec.Request{Model: "x", TaskPrompt: "t", SkillPrompt: "s"})
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
func TestLiteLLMErrorOnNonOKStatus(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
ex := iexec.NewLiteLLM(srv.URL, "", 5*time.Second)
|
||||
_, err := ex.Run(context.Background(), iexec.Request{Model: "x", TaskPrompt: "t"})
|
||||
assert.ErrorContains(t, err, "503")
|
||||
}
|
||||
|
||||
func TestLiteLLMErrorOnUnparsableJSON(t *testing.T) {
|
||||
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
resp := map[string]any{
|
||||
"choices": []map[string]any{
|
||||
{"message": map[string]any{"role": "assistant", "content": "not json at all"}},
|
||||
},
|
||||
}
|
||||
data, _ := json.Marshal(resp)
|
||||
_, _ = w.Write(data)
|
||||
}))
|
||||
defer srv.Close()
|
||||
|
||||
ex := iexec.NewLiteLLM(srv.URL, "", 5*time.Second)
|
||||
_, err := ex.Run(context.Background(), iexec.Request{Model: "x", TaskPrompt: "t"})
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestLiteLLMRespectsContextCancellation(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel() // Cancel immediately
|
||||
|
||||
ex := iexec.NewLiteLLM("http://invalid.example.com", "", 1*time.Second)
|
||||
_, err := ex.Run(ctx, iexec.Request{Model: "x", TaskPrompt: "t"})
|
||||
assert.Error(t, err)
|
||||
}
|
||||
197
internal/exec/orchestrator.go
Normal file
197
internal/exec/orchestrator.go
Normal file
@@ -0,0 +1,197 @@
|
||||
package exec
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ChainEntry is one tier in an escalation chain.
|
||||
type ChainEntry struct {
|
||||
Model string // e.g. "ollama/phi4", "claude-sonnet-4-6"
|
||||
Tier string // "local" | "subagent" | "managed"
|
||||
IsCloud bool // true for claude-* models; skips verifier call
|
||||
}
|
||||
|
||||
// EntryFor builds a ChainEntry from a model name string.
|
||||
func EntryFor(model string) ChainEntry {
|
||||
cloud := strings.HasPrefix(model, "claude-")
|
||||
tier := "local"
|
||||
if cloud {
|
||||
tier = "subagent"
|
||||
}
|
||||
return ChainEntry{Model: model, Tier: tier, IsCloud: cloud}
|
||||
}
|
||||
|
||||
// AttemptRecord captures the outcome of one tier attempt for session logging.
|
||||
type AttemptRecord struct {
|
||||
Model string
|
||||
Tier string
|
||||
DurationMs int64
|
||||
WarmStart bool
|
||||
Verdict string // "accept" | "escalate" | "error"
|
||||
Feedback string
|
||||
}
|
||||
|
||||
// VerifierFn is the interface the orchestrator uses to verify local output.
|
||||
type VerifierFn interface {
|
||||
Verify(ctx context.Context, skillPrompt, taskPrompt string, output Result) (Verdict, error)
|
||||
}
|
||||
|
||||
// ExecutorRunFn is the signature of Executor.Run and LiteLLMExecutor.Run.
|
||||
type ExecutorRunFn func(ctx context.Context, req Request) (Result, error)
|
||||
|
||||
// Orchestrator walks an escalation chain, delegating generation and verification.
|
||||
// It implements the ExecutorFn shape expected by skill handlers.
|
||||
type Orchestrator struct {
|
||||
chain []ChainEntry
|
||||
localRun ExecutorRunFn // for local (non-cloud) tiers; may be nil
|
||||
cloudRun ExecutorRunFn // for cloud tiers; may be nil
|
||||
verifier VerifierFn
|
||||
llamaSwapURL string
|
||||
attempts *[]AttemptRecord
|
||||
}
|
||||
|
||||
// NewOrchestrator creates an Orchestrator.
|
||||
// attempts is a pointer to a slice that will be appended to on each tier attempt.
|
||||
// Pass nil for localRun or cloudRun if no tiers of that type exist in the chain.
|
||||
func NewOrchestrator(
|
||||
chain []ChainEntry,
|
||||
localRun ExecutorRunFn,
|
||||
cloudRun ExecutorRunFn,
|
||||
verifier VerifierFn,
|
||||
llamaSwapURL string,
|
||||
attempts *[]AttemptRecord,
|
||||
) *Orchestrator {
|
||||
return &Orchestrator{
|
||||
chain: chain,
|
||||
localRun: localRun,
|
||||
cloudRun: cloudRun,
|
||||
verifier: verifier,
|
||||
llamaSwapURL: llamaSwapURL,
|
||||
attempts: attempts,
|
||||
}
|
||||
}
|
||||
|
||||
// Run walks the escalation chain and returns the first accepted result.
|
||||
// Satisfies the ExecutorFn signature: func(context.Context, Request) (Result, error).
|
||||
func (o *Orchestrator) Run(ctx context.Context, req Request) (Result, error) {
|
||||
taskPrompt := req.TaskPrompt
|
||||
|
||||
for _, entry := range o.chain {
|
||||
warm := o.probeWarm(entry.Model)
|
||||
start := time.Now()
|
||||
|
||||
tierReq := req
|
||||
tierReq.Model = entry.Model
|
||||
tierReq.TaskPrompt = taskPrompt
|
||||
|
||||
if entry.IsCloud {
|
||||
result, genErr := o.cloudRun(ctx, tierReq)
|
||||
dur := time.Since(start).Milliseconds()
|
||||
verdict := "accept"
|
||||
if genErr != nil {
|
||||
verdict = "error"
|
||||
}
|
||||
o.appendAttempt(AttemptRecord{
|
||||
Model: entry.Model,
|
||||
Tier: entry.Tier,
|
||||
DurationMs: dur,
|
||||
WarmStart: warm,
|
||||
Verdict: verdict,
|
||||
})
|
||||
if genErr == nil {
|
||||
return result, nil
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Local tier.
|
||||
result, genErr := o.localRun(ctx, tierReq)
|
||||
dur := time.Since(start).Milliseconds()
|
||||
|
||||
if genErr != nil {
|
||||
o.appendAttempt(AttemptRecord{
|
||||
Model: entry.Model,
|
||||
Tier: entry.Tier,
|
||||
DurationMs: dur,
|
||||
WarmStart: warm,
|
||||
Verdict: "error",
|
||||
Feedback: genErr.Error(),
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
verdict, verErr := o.verifier.Verify(ctx, req.SkillPrompt, taskPrompt, result)
|
||||
if verErr != nil {
|
||||
// Treat verifier failure as escalate (safe default).
|
||||
o.appendAttempt(AttemptRecord{
|
||||
Model: entry.Model,
|
||||
Tier: entry.Tier,
|
||||
DurationMs: dur,
|
||||
WarmStart: warm,
|
||||
Verdict: "escalate",
|
||||
Feedback: "verifier error: " + verErr.Error(),
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
if verdict.Accept {
|
||||
o.appendAttempt(AttemptRecord{
|
||||
Model: entry.Model,
|
||||
Tier: entry.Tier,
|
||||
DurationMs: dur,
|
||||
WarmStart: warm,
|
||||
Verdict: "accept",
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
o.appendAttempt(AttemptRecord{
|
||||
Model: entry.Model,
|
||||
Tier: entry.Tier,
|
||||
DurationMs: dur,
|
||||
WarmStart: warm,
|
||||
Verdict: "escalate",
|
||||
Feedback: verdict.Feedback,
|
||||
})
|
||||
// Inject verifier feedback into the next tier's task prompt.
|
||||
taskPrompt = taskPrompt + "\n\nPrior attempt feedback: " + verdict.Feedback
|
||||
}
|
||||
|
||||
return Result{}, fmt.Errorf("all tiers exhausted after %d attempt(s)", len(o.chain))
|
||||
}
|
||||
|
||||
func (o *Orchestrator) appendAttempt(rec AttemptRecord) {
|
||||
if o.attempts != nil {
|
||||
*o.attempts = append(*o.attempts, rec)
|
||||
}
|
||||
}
|
||||
|
||||
// probeWarm checks whether the model is currently loaded in llama-swap.
|
||||
// Returns false on any error or if llamaSwapURL is empty.
|
||||
func (o *Orchestrator) probeWarm(model string) bool {
|
||||
if o.llamaSwapURL == "" {
|
||||
return false
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, o.llamaSwapURL+"/v1/models", nil)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer resp.Body.Close() //nolint:errcheck
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.Contains(string(body), model)
|
||||
}
|
||||
151
internal/exec/orchestrator_test.go
Normal file
151
internal/exec/orchestrator_test.go
Normal file
@@ -0,0 +1,151 @@
|
||||
package exec_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// stubRunFn returns preset results sequentially.
|
||||
type stubRunFn struct {
|
||||
calls []stubCall
|
||||
callIdx int
|
||||
}
|
||||
|
||||
type stubCall struct {
|
||||
result iexec.Result
|
||||
err error
|
||||
}
|
||||
|
||||
func (s *stubRunFn) Run(_ context.Context, _ iexec.Request) (iexec.Result, error) {
|
||||
if s.callIdx >= len(s.calls) {
|
||||
return iexec.Result{}, errors.New("unexpected call")
|
||||
}
|
||||
c := s.calls[s.callIdx]
|
||||
s.callIdx++
|
||||
return c.result, c.err
|
||||
}
|
||||
|
||||
// stubVerifier returns preset verdicts sequentially.
|
||||
type stubVerifier struct {
|
||||
verdicts []iexec.Verdict
|
||||
idx int
|
||||
}
|
||||
|
||||
func (s *stubVerifier) Verify(_ context.Context, _, _ string, _ iexec.Result) (iexec.Verdict, error) {
|
||||
if s.idx >= len(s.verdicts) {
|
||||
return iexec.Verdict{}, errors.New("unexpected verify call")
|
||||
}
|
||||
v := s.verdicts[s.idx]
|
||||
s.idx++
|
||||
return v, nil
|
||||
}
|
||||
|
||||
func okResult(skill string) iexec.Result {
|
||||
return iexec.Result{Status: "pass", Phase: "review", Skill: skill, Message: "ok", ModelUsed: "m"}
|
||||
}
|
||||
|
||||
func TestOrchestratorSingleLocalAccept(t *testing.T) {
|
||||
local := &stubRunFn{calls: []stubCall{{result: okResult("review")}}}
|
||||
verifier := &stubVerifier{verdicts: []iexec.Verdict{{Accept: true}}}
|
||||
|
||||
var attempts []iexec.AttemptRecord
|
||||
orch := iexec.NewOrchestrator(
|
||||
[]iexec.ChainEntry{{Model: "ollama/devstral", Tier: "local", IsCloud: false}},
|
||||
local.Run, nil, verifier, "", &attempts,
|
||||
)
|
||||
|
||||
result, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "pass", result.Status)
|
||||
require.Len(t, attempts, 1)
|
||||
assert.Equal(t, "local", attempts[0].Tier)
|
||||
assert.Equal(t, "accept", attempts[0].Verdict)
|
||||
}
|
||||
|
||||
func TestOrchestratorEscalatesOnVerifierReject(t *testing.T) {
|
||||
local := &stubRunFn{calls: []stubCall{
|
||||
{result: iexec.Result{Status: "fail", Phase: "review", Skill: "review", Message: "weak"}},
|
||||
{result: okResult("review")},
|
||||
}}
|
||||
verifier := &stubVerifier{verdicts: []iexec.Verdict{
|
||||
{Accept: false, Feedback: "missing line refs"},
|
||||
{Accept: true},
|
||||
}}
|
||||
|
||||
var attempts []iexec.AttemptRecord
|
||||
orch := iexec.NewOrchestrator(
|
||||
[]iexec.ChainEntry{
|
||||
{Model: "ollama/devstral", Tier: "local", IsCloud: false},
|
||||
{Model: "ollama/gemma4", Tier: "local", IsCloud: false},
|
||||
},
|
||||
local.Run, nil, verifier, "", &attempts,
|
||||
)
|
||||
|
||||
result, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "pass", result.Status)
|
||||
require.Len(t, attempts, 2)
|
||||
assert.Equal(t, "escalate", attempts[0].Verdict)
|
||||
assert.Equal(t, "missing line refs", attempts[0].Feedback)
|
||||
assert.Equal(t, "accept", attempts[1].Verdict)
|
||||
}
|
||||
|
||||
func TestOrchestratorEscalatesOnLocalError(t *testing.T) {
|
||||
local := &stubRunFn{calls: []stubCall{
|
||||
{err: errors.New("network failure")},
|
||||
{result: okResult("review")},
|
||||
}}
|
||||
verifier := &stubVerifier{verdicts: []iexec.Verdict{{Accept: true}}}
|
||||
|
||||
var attempts []iexec.AttemptRecord
|
||||
orch := iexec.NewOrchestrator(
|
||||
[]iexec.ChainEntry{
|
||||
{Model: "ollama/devstral", Tier: "local", IsCloud: false},
|
||||
{Model: "ollama/gemma4", Tier: "local", IsCloud: false},
|
||||
},
|
||||
local.Run, nil, verifier, "", &attempts,
|
||||
)
|
||||
|
||||
_, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, attempts, 2)
|
||||
assert.Equal(t, "error", attempts[0].Verdict)
|
||||
assert.Equal(t, "accept", attempts[1].Verdict)
|
||||
}
|
||||
|
||||
func TestOrchestratorCloudTierSelfCertifies(t *testing.T) {
|
||||
cloud := &stubRunFn{calls: []stubCall{{result: okResult("review")}}}
|
||||
verifier := &stubVerifier{} // no verdicts — must not be called
|
||||
|
||||
var attempts []iexec.AttemptRecord
|
||||
orch := iexec.NewOrchestrator(
|
||||
[]iexec.ChainEntry{{Model: "claude-sonnet-4-6", Tier: "subagent", IsCloud: true}},
|
||||
nil, cloud.Run, verifier, "", &attempts,
|
||||
)
|
||||
|
||||
result, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, "pass", result.Status)
|
||||
require.Len(t, attempts, 1)
|
||||
assert.Equal(t, "subagent", attempts[0].Tier)
|
||||
assert.Equal(t, "accept", attempts[0].Verdict)
|
||||
assert.Equal(t, 0, verifier.idx) // verifier never called
|
||||
}
|
||||
|
||||
func TestOrchestratorAllTiersExhausted(t *testing.T) {
|
||||
local := &stubRunFn{calls: []stubCall{{err: errors.New("unavailable")}}}
|
||||
|
||||
var attempts []iexec.AttemptRecord
|
||||
orch := iexec.NewOrchestrator(
|
||||
[]iexec.ChainEntry{{Model: "ollama/devstral", Tier: "local", IsCloud: false}},
|
||||
local.Run, nil, &stubVerifier{}, "", &attempts,
|
||||
)
|
||||
|
||||
_, err := orch.Run(context.Background(), iexec.Request{TaskPrompt: "review"})
|
||||
assert.ErrorContains(t, err, "all tiers exhausted")
|
||||
}
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
// validates its own output before returning.
|
||||
type Result struct {
|
||||
Status string `json:"status"` // pass | fail | error
|
||||
Phase string `json:"phase"` // red | green | refactor
|
||||
Phase string `json:"phase"` // red | green | refactor | retrospective | review | debug | spec | trainer
|
||||
Skill string `json:"skill"` // tdd | review | ...
|
||||
FilePath string `json:"file_path"` // absolute path to generated file
|
||||
RunnerOutput string `json:"runner_output"` // raw stdout+stderr from test runner
|
||||
@@ -25,6 +25,10 @@ var validPhases = map[string]bool{
|
||||
"green": true,
|
||||
"refactor": true,
|
||||
"retrospective": true,
|
||||
"review": true,
|
||||
"debug": true,
|
||||
"spec": true,
|
||||
"trainer": true,
|
||||
}
|
||||
|
||||
func (r Result) Validate() error {
|
||||
@@ -33,7 +37,7 @@ func (r Result) Validate() error {
|
||||
errs = append(errs, "status must be pass|fail|error, got: "+r.Status)
|
||||
}
|
||||
if !validPhases[r.Phase] {
|
||||
errs = append(errs, "phase must be red|green|refactor, got: "+r.Phase)
|
||||
errs = append(errs, "phase must be one of red|green|refactor|retrospective|review|debug|spec|trainer, got: "+r.Phase)
|
||||
}
|
||||
if r.Skill == "" {
|
||||
errs = append(errs, "skill is required")
|
||||
@@ -50,7 +54,7 @@ const Schema = `{
|
||||
"required": ["status","phase","skill","file_path","runner_output","verified","model_used","message"],
|
||||
"properties": {
|
||||
"status": {"type": "string", "enum": ["pass","fail","error"]},
|
||||
"phase": {"type": "string", "enum": ["red","green","refactor"]},
|
||||
"phase": {"type": "string"},
|
||||
"skill": {"type": "string"},
|
||||
"file_path": {"type": "string"},
|
||||
"runner_output": {"type": "string"},
|
||||
|
||||
@@ -69,3 +69,11 @@ func TestResultValidation(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateAcceptsAllPhases(t *testing.T) {
|
||||
phases := []string{"red", "green", "refactor", "retrospective", "review", "debug", "spec", "trainer"}
|
||||
for _, phase := range phases {
|
||||
r := exec.Result{Status: "pass", Phase: phase, Skill: "test", ModelUsed: "self", Message: "ok"}
|
||||
assert.NoError(t, r.Validate(), "phase %q should be valid", phase)
|
||||
}
|
||||
}
|
||||
|
||||
99
internal/exec/verifier.go
Normal file
99
internal/exec/verifier.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package exec
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Verdict is the output of a Claude verification call.
|
||||
type Verdict struct {
|
||||
Accept bool `json:"accept"`
|
||||
Feedback string `json:"feedback"` // empty when Accept is true
|
||||
}
|
||||
|
||||
// Verifier runs a focused Claude call to judge local model output.
|
||||
type Verifier struct {
|
||||
claudeBinary string
|
||||
model string
|
||||
timeout time.Duration
|
||||
}
|
||||
|
||||
// NewVerifier creates a Verifier that calls claude with the given binary path and model.
|
||||
// Empty claudeBinary defaults to "claude". Zero timeout defaults to 30s.
|
||||
func NewVerifier(claudeBinary, model string, timeout time.Duration) *Verifier {
|
||||
if claudeBinary == "" {
|
||||
claudeBinary = "claude"
|
||||
}
|
||||
if timeout == 0 {
|
||||
timeout = 30 * time.Second
|
||||
}
|
||||
return &Verifier{
|
||||
claudeBinary: claudeBinary,
|
||||
model: model,
|
||||
timeout: timeout,
|
||||
}
|
||||
}
|
||||
|
||||
// Verify asks Claude whether output satisfies the skill discipline's iron laws.
|
||||
// Returns Verdict{Accept: true} to accept or Verdict{Accept: false, Feedback: "..."}
|
||||
// to escalate. Returns an error on subprocess failure or unparseable response.
|
||||
func (v *Verifier) Verify(ctx context.Context, skillPrompt, taskPrompt string, output Result) (Verdict, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, v.timeout)
|
||||
defer cancel()
|
||||
|
||||
outputJSON, err := json.Marshal(output)
|
||||
if err != nil {
|
||||
return Verdict{}, fmt.Errorf("verifier: marshal output: %w", err)
|
||||
}
|
||||
|
||||
prompt := fmt.Sprintf(`You are a quality verifier for an AI supervisor system.
|
||||
|
||||
Given the skill discipline, the original task, and the generated output, decide whether the output satisfies the discipline's iron laws and output contract.
|
||||
|
||||
Reply with JSON only — no other text:
|
||||
{"accept": true, "feedback": ""}
|
||||
or
|
||||
{"accept": false, "feedback": "<one sentence reason>"}
|
||||
|
||||
## Skill discipline
|
||||
%s
|
||||
|
||||
## Original task
|
||||
%s
|
||||
|
||||
## Generated output
|
||||
%s`, skillPrompt, taskPrompt, string(outputJSON))
|
||||
|
||||
args := []string{
|
||||
"--print",
|
||||
"--permission-mode", "bypassPermissions",
|
||||
}
|
||||
if v.model != "" {
|
||||
args = append(args, "--model", v.model)
|
||||
}
|
||||
args = append(args, prompt)
|
||||
|
||||
cmd := exec.CommandContext(ctx, v.claudeBinary, args...)
|
||||
cmd.Env = os.Environ()
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
if ctx.Err() != nil {
|
||||
return Verdict{}, fmt.Errorf("verifier: timeout after %s", v.timeout)
|
||||
}
|
||||
return Verdict{}, fmt.Errorf("verifier: claude exited with error: %w — stderr: %s", err, stderr.String())
|
||||
}
|
||||
|
||||
var verdict Verdict
|
||||
if err := json.Unmarshal(bytes.TrimSpace(stdout.Bytes()), &verdict); err != nil {
|
||||
return Verdict{}, fmt.Errorf("verifier: parse verdict JSON: %w — raw: %s", err, stdout.String())
|
||||
}
|
||||
return verdict, nil
|
||||
}
|
||||
74
internal/exec/verifier_test.go
Normal file
74
internal/exec/verifier_test.go
Normal file
@@ -0,0 +1,74 @@
|
||||
package exec_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func fakeVerifierClaude(t *testing.T, verdict iexec.Verdict) string {
|
||||
t.Helper()
|
||||
data, err := json.Marshal(verdict)
|
||||
require.NoError(t, err)
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "claude")
|
||||
content := fmt.Sprintf("#!/bin/sh\necho '%s'\n", string(data))
|
||||
require.NoError(t, os.WriteFile(script, []byte(content), 0755))
|
||||
return script
|
||||
}
|
||||
|
||||
func TestVerifierAccepts(t *testing.T) {
|
||||
claude := fakeVerifierClaude(t, iexec.Verdict{Accept: true, Feedback: ""})
|
||||
v := iexec.NewVerifier(claude, "claude-sonnet-4-6", 5*time.Second)
|
||||
|
||||
verdict, err := v.Verify(context.Background(), "skill rules", "do the task", iexec.Result{
|
||||
Status: "pass", Phase: "review", Skill: "review", Message: "ok",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
assert.True(t, verdict.Accept)
|
||||
assert.Empty(t, verdict.Feedback)
|
||||
}
|
||||
|
||||
func TestVerifierEscalates(t *testing.T) {
|
||||
claude := fakeVerifierClaude(t, iexec.Verdict{Accept: false, Feedback: "missing line references"})
|
||||
v := iexec.NewVerifier(claude, "claude-sonnet-4-6", 5*time.Second)
|
||||
|
||||
verdict, err := v.Verify(context.Background(), "skill rules", "do the task", iexec.Result{
|
||||
Status: "pass", Phase: "review", Skill: "review", Message: "incomplete",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
assert.False(t, verdict.Accept)
|
||||
assert.Equal(t, "missing line references", verdict.Feedback)
|
||||
}
|
||||
|
||||
func TestVerifierErrorOnUnparsableOutput(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "claude")
|
||||
require.NoError(t, os.WriteFile(script, []byte("#!/bin/sh\necho 'not json'\n"), 0755))
|
||||
|
||||
v := iexec.NewVerifier(script, "claude-sonnet-4-6", 5*time.Second)
|
||||
_, err := v.Verify(context.Background(), "rules", "task", iexec.Result{
|
||||
Status: "pass", Phase: "review", Skill: "review", Message: "ok",
|
||||
})
|
||||
assert.Error(t, err)
|
||||
}
|
||||
|
||||
func TestVerifierErrorOnNonZeroExit(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "claude")
|
||||
require.NoError(t, os.WriteFile(script, []byte("#!/bin/sh\nexit 1\n"), 0755))
|
||||
|
||||
v := iexec.NewVerifier(script, "claude-sonnet-4-6", 5*time.Second)
|
||||
_, err := v.Verify(context.Background(), "rules", "task", iexec.Result{
|
||||
Status: "pass", Phase: "review", Skill: "review", Message: "ok",
|
||||
})
|
||||
assert.Error(t, err)
|
||||
}
|
||||
38
internal/session/history.go
Normal file
38
internal/session/history.go
Normal file
@@ -0,0 +1,38 @@
|
||||
// internal/session/history.go
|
||||
package session
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// FormatHistory formats prior session entries as a structured block for
|
||||
// injection into a worker task prompt. Entries matching excludePhase are
|
||||
// omitted (pass the current phase to avoid circular injection).
|
||||
func FormatHistory(entries []Entry, excludePhase string) string {
|
||||
var filtered []Entry
|
||||
for _, e := range entries {
|
||||
if e.Phase != excludePhase {
|
||||
filtered = append(filtered, e)
|
||||
}
|
||||
}
|
||||
if len(filtered) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString("## Session history\n\n")
|
||||
for _, e := range filtered {
|
||||
fmt.Fprintf(&b, "### Phase: %s\n", e.Phase) //nolint:errcheck // strings.Builder never errors
|
||||
fmt.Fprintf(&b, "- Skill: %s\n", e.Skill) //nolint:errcheck
|
||||
fmt.Fprintf(&b, "- Status: %s\n", e.FinalStatus) //nolint:errcheck
|
||||
if e.FilePath != "" {
|
||||
fmt.Fprintf(&b, "- File: %s\n", e.FilePath) //nolint:errcheck
|
||||
}
|
||||
if e.Message != "" {
|
||||
fmt.Fprintf(&b, "- Summary: %s\n", e.Message) //nolint:errcheck
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
41
internal/session/history_test.go
Normal file
41
internal/session/history_test.go
Normal file
@@ -0,0 +1,41 @@
|
||||
// internal/session/history_test.go
|
||||
package session_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestFormatHistoryEmpty(t *testing.T) {
|
||||
result := session.FormatHistory(nil, "")
|
||||
assert.Equal(t, "", result)
|
||||
}
|
||||
|
||||
func TestFormatHistoryFormatsEntries(t *testing.T) {
|
||||
entries := []session.Entry{
|
||||
{
|
||||
Skill: "tdd", Phase: "red", FinalStatus: "pass",
|
||||
FilePath: "internal/foo/foo_test.go",
|
||||
Message: "wrote failing test for Foo",
|
||||
Timestamp: time.Now(),
|
||||
},
|
||||
}
|
||||
result := session.FormatHistory(entries, "")
|
||||
assert.Contains(t, result, "## Session history")
|
||||
assert.Contains(t, result, "Phase: red")
|
||||
assert.Contains(t, result, "wrote failing test for Foo")
|
||||
assert.Contains(t, result, "internal/foo/foo_test.go")
|
||||
}
|
||||
|
||||
func TestFormatHistoryExcludesCurrentPhase(t *testing.T) {
|
||||
entries := []session.Entry{
|
||||
{Skill: "tdd", Phase: "red", Message: "red done", FinalStatus: "pass"},
|
||||
{Skill: "tdd", Phase: "green", Message: "green done", FinalStatus: "pass"},
|
||||
}
|
||||
result := session.FormatHistory(entries, "green")
|
||||
assert.Contains(t, result, "red done")
|
||||
assert.NotContains(t, result, "green done")
|
||||
}
|
||||
@@ -32,9 +32,14 @@ type Entry struct {
|
||||
type Attempt struct {
|
||||
Attempt int `json:"attempt"`
|
||||
Model string `json:"model"`
|
||||
Tier string `json:"tier"` // local | subagent | managed
|
||||
DurationMs int64 `json:"duration_ms"`
|
||||
WarmStart bool `json:"warm_start"` // model already loaded in llama-swap
|
||||
Verified bool `json:"verified"`
|
||||
Verdict string `json:"verdict,omitempty"` // accept | escalate | error
|
||||
Feedback string `json:"feedback,omitempty"` // verifier feedback on escalation
|
||||
OutputSummary string `json:"output_summary,omitempty"`
|
||||
RunnerOutput string `json:"runner_output,omitempty"`
|
||||
Verified bool `json:"verified"`
|
||||
}
|
||||
|
||||
// Append writes entry as a single JSON line to sessionsDir/{sessionID}.jsonl.
|
||||
|
||||
@@ -61,3 +61,22 @@ func TestRead_EmptyWhenNoFile(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
assert.Empty(t, entries)
|
||||
}
|
||||
|
||||
func TestAttemptRoundTrip(t *testing.T) {
|
||||
a := session.Attempt{
|
||||
Attempt: 1,
|
||||
Model: "ollama/devstral",
|
||||
Tier: "local",
|
||||
DurationMs: 4200,
|
||||
WarmStart: true,
|
||||
Verified: false,
|
||||
Verdict: "escalate",
|
||||
Feedback: "missing line references",
|
||||
}
|
||||
data, err := json.Marshal(a)
|
||||
require.NoError(t, err)
|
||||
|
||||
var got session.Attempt
|
||||
require.NoError(t, json.Unmarshal(data, &got))
|
||||
assert.Equal(t, a, got)
|
||||
}
|
||||
|
||||
80
internal/skills/debug/handlers.go
Normal file
80
internal/skills/debug/handlers.go
Normal file
@@ -0,0 +1,80 @@
|
||||
// internal/skills/debug/handlers.go
|
||||
package debug
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
)
|
||||
|
||||
type debugArgs struct {
|
||||
ProjectRoot string `json:"project_root"`
|
||||
Error string `json:"error"`
|
||||
Context string `json:"context"`
|
||||
Model string `json:"model"`
|
||||
SessionID string `json:"session_id"`
|
||||
}
|
||||
|
||||
// Handle dispatches the MCP tool call to the appropriate handler.
|
||||
func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (json.RawMessage, error) {
|
||||
if tool != "debug" {
|
||||
return nil, fmt.Errorf("unknown tool: %s", tool)
|
||||
}
|
||||
var a debugArgs
|
||||
if err := json.Unmarshal(args, &a); err != nil {
|
||||
return nil, fmt.Errorf("parse args: %w", err)
|
||||
}
|
||||
if a.ProjectRoot == "" {
|
||||
return nil, fmt.Errorf("project_root is required")
|
||||
}
|
||||
if a.Error == "" {
|
||||
return nil, fmt.Errorf("error is required")
|
||||
}
|
||||
|
||||
model := a.Model
|
||||
if model == "" {
|
||||
model = s.cfg.DefaultModel
|
||||
}
|
||||
|
||||
task := fmt.Sprintf(
|
||||
"phase: debug\nproject_root: %s\nerror: %s\ncontext: %s\nmodel: %s",
|
||||
a.ProjectRoot, a.Error, a.Context, model,
|
||||
)
|
||||
task = s.prependHistory(a.SessionID, "debug", task)
|
||||
|
||||
if s.cfg.ExecutorFn == nil {
|
||||
return nil, fmt.Errorf("no executor configured")
|
||||
}
|
||||
result, err := s.cfg.ExecutorFn(ctx, iexec.Request{
|
||||
SkillPrompt: s.cfg.SkillPrompt,
|
||||
TaskPrompt: task,
|
||||
Model: model,
|
||||
Tools: "Read,Bash",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal result: %w", err)
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (s *Skill) prependHistory(sessionID, currentPhase, task string) string {
|
||||
if sessionID == "" || s.cfg.SessionsDir == "" {
|
||||
return task
|
||||
}
|
||||
entries, err := session.Read(s.cfg.SessionsDir, sessionID)
|
||||
if err != nil || len(entries) == 0 {
|
||||
return task
|
||||
}
|
||||
history := session.FormatHistory(entries, currentPhase)
|
||||
if history == "" {
|
||||
return task
|
||||
}
|
||||
return history + "\n---\n\n" + task
|
||||
}
|
||||
61
internal/skills/debug/handlers_test.go
Normal file
61
internal/skills/debug/handlers_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
// internal/skills/debug/handlers_test.go
|
||||
package debug_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/debug"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestDebugToolRegistered(t *testing.T) {
|
||||
sk := debug.New(debug.Config{SkillPrompt: "debug rules"})
|
||||
names := make([]string, 0)
|
||||
for _, tool := range sk.Tools() {
|
||||
names = append(names, tool.Name)
|
||||
}
|
||||
assert.Contains(t, names, "debug")
|
||||
}
|
||||
|
||||
func TestDebugRequiresProjectRoot(t *testing.T) {
|
||||
sk := debug.New(debug.Config{SkillPrompt: "d"})
|
||||
_, err := sk.Handle(context.Background(), "debug", json.RawMessage(`{"error":"panic: nil pointer"}`))
|
||||
assert.ErrorContains(t, err, "project_root")
|
||||
}
|
||||
|
||||
func TestDebugRequiresError(t *testing.T) {
|
||||
sk := debug.New(debug.Config{SkillPrompt: "d"})
|
||||
_, err := sk.Handle(context.Background(), "debug", json.RawMessage(`{"project_root":"/tmp"}`))
|
||||
assert.ErrorContains(t, err, "error")
|
||||
}
|
||||
|
||||
func TestDebugCallsExecutor(t *testing.T) {
|
||||
called := false
|
||||
var capturedTask string
|
||||
fakeFn := func(_ context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
called = true
|
||||
capturedTask = req.TaskPrompt
|
||||
return iexec.Result{
|
||||
Status: "pass", Phase: "debug", Skill: "debug",
|
||||
RunnerOutput: "HYPOTHESIS 1 (likelihood: high): nil map access\nVERIFY: go test ./... → expected: panic line reference",
|
||||
Verified: false, ModelUsed: "self", Message: "3 hypotheses for: panic nil pointer at foo.go:42",
|
||||
}, nil
|
||||
}
|
||||
|
||||
sk := debug.New(debug.Config{SkillPrompt: "debug rules", ExecutorFn: fakeFn, SessionsDir: t.TempDir()})
|
||||
out, err := sk.Handle(context.Background(), "debug", json.RawMessage(
|
||||
`{"project_root":"/tmp/proj","error":"panic: nil pointer dereference at foo.go:42","context":"occurs on startup"}`,
|
||||
))
|
||||
require.NoError(t, err)
|
||||
assert.True(t, called)
|
||||
assert.Contains(t, capturedTask, "panic: nil pointer dereference")
|
||||
assert.Contains(t, capturedTask, "occurs on startup")
|
||||
|
||||
var result iexec.Result
|
||||
require.NoError(t, json.Unmarshal(out, &result))
|
||||
assert.Equal(t, "debug", result.Phase)
|
||||
}
|
||||
55
internal/skills/debug/skill.go
Normal file
55
internal/skills/debug/skill.go
Normal file
@@ -0,0 +1,55 @@
|
||||
// internal/skills/debug/skill.go
|
||||
package debug
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/registry"
|
||||
)
|
||||
|
||||
// ExecutorFn is the function signature for running a worker subprocess.
|
||||
type ExecutorFn func(ctx context.Context, req iexec.Request) (iexec.Result, error)
|
||||
|
||||
// Config holds dependencies for the debug skill.
|
||||
type Config struct {
|
||||
SkillPrompt string
|
||||
DefaultModel string
|
||||
ExecutorFn ExecutorFn
|
||||
SessionsDir string
|
||||
}
|
||||
|
||||
// Skill implements the debug MCP tool.
|
||||
type Skill struct{ cfg Config }
|
||||
|
||||
// New creates a new debug Skill.
|
||||
func New(cfg Config) *Skill { return &Skill{cfg: cfg} }
|
||||
|
||||
// Name returns the skill identifier.
|
||||
func (s *Skill) Name() string { return "debug" }
|
||||
|
||||
// Tools returns the MCP tool definitions for this skill.
|
||||
func (s *Skill) Tools() []registry.ToolDef {
|
||||
schema := func(required []string, props map[string]any) json.RawMessage {
|
||||
b, _ := json.Marshal(map[string]any{"type": "object", "required": required, "properties": props})
|
||||
return b
|
||||
}
|
||||
str := map[string]any{"type": "string"}
|
||||
return []registry.ToolDef{
|
||||
{
|
||||
Name: "debug",
|
||||
Description: "Analyse an error and return 3-5 hypotheses ordered by likelihood, each with a concrete verification step.",
|
||||
InputSchema: schema(
|
||||
[]string{"project_root", "error"},
|
||||
map[string]any{
|
||||
"project_root": str,
|
||||
"error": str,
|
||||
"context": str,
|
||||
"model": str,
|
||||
"session_id": str,
|
||||
},
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
81
internal/skills/review/handlers.go
Normal file
81
internal/skills/review/handlers.go
Normal file
@@ -0,0 +1,81 @@
|
||||
// internal/skills/review/handlers.go
|
||||
package review
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
)
|
||||
|
||||
type reviewArgs struct {
|
||||
ProjectRoot string `json:"project_root"`
|
||||
Files []string `json:"files"`
|
||||
Context string `json:"context"`
|
||||
Model string `json:"model"`
|
||||
SessionID string `json:"session_id"`
|
||||
}
|
||||
|
||||
// Handle dispatches the MCP tool call to the appropriate handler.
|
||||
func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (json.RawMessage, error) {
|
||||
if tool != "review" {
|
||||
return nil, fmt.Errorf("unknown tool: %s", tool)
|
||||
}
|
||||
var a reviewArgs
|
||||
if err := json.Unmarshal(args, &a); err != nil {
|
||||
return nil, fmt.Errorf("parse args: %w", err)
|
||||
}
|
||||
if a.ProjectRoot == "" {
|
||||
return nil, fmt.Errorf("project_root is required")
|
||||
}
|
||||
if len(a.Files) == 0 {
|
||||
return nil, fmt.Errorf("files is required")
|
||||
}
|
||||
|
||||
model := a.Model
|
||||
if model == "" {
|
||||
model = s.cfg.DefaultModel
|
||||
}
|
||||
|
||||
task := fmt.Sprintf(
|
||||
"phase: review\nproject_root: %s\nfiles: %s\ncontext: %s\nmodel: %s",
|
||||
a.ProjectRoot, strings.Join(a.Files, ", "), a.Context, model,
|
||||
)
|
||||
task = s.prependHistory(a.SessionID, "review", task)
|
||||
|
||||
if s.cfg.ExecutorFn == nil {
|
||||
return nil, fmt.Errorf("no executor configured")
|
||||
}
|
||||
result, err := s.cfg.ExecutorFn(ctx, iexec.Request{
|
||||
SkillPrompt: s.cfg.SkillPrompt,
|
||||
TaskPrompt: task,
|
||||
Model: model,
|
||||
Tools: "Read,Bash",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal result: %w", err)
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (s *Skill) prependHistory(sessionID, currentPhase, task string) string {
|
||||
if sessionID == "" || s.cfg.SessionsDir == "" {
|
||||
return task
|
||||
}
|
||||
entries, err := session.Read(s.cfg.SessionsDir, sessionID)
|
||||
if err != nil || len(entries) == 0 {
|
||||
return task
|
||||
}
|
||||
history := session.FormatHistory(entries, currentPhase)
|
||||
if history == "" {
|
||||
return task
|
||||
}
|
||||
return history + "\n---\n\n" + task
|
||||
}
|
||||
61
internal/skills/review/handlers_test.go
Normal file
61
internal/skills/review/handlers_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
// internal/skills/review/handlers_test.go
|
||||
package review_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/review"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestReviewToolRegistered(t *testing.T) {
|
||||
sk := review.New(review.Config{SkillPrompt: "review rules"})
|
||||
names := make([]string, 0)
|
||||
for _, tool := range sk.Tools() {
|
||||
names = append(names, tool.Name)
|
||||
}
|
||||
assert.Contains(t, names, "review")
|
||||
}
|
||||
|
||||
func TestReviewRequiresProjectRoot(t *testing.T) {
|
||||
sk := review.New(review.Config{SkillPrompt: "r"})
|
||||
_, err := sk.Handle(context.Background(), "review", json.RawMessage(`{"files":["main.go"]}`))
|
||||
assert.ErrorContains(t, err, "project_root")
|
||||
}
|
||||
|
||||
func TestReviewRequiresFiles(t *testing.T) {
|
||||
sk := review.New(review.Config{SkillPrompt: "r"})
|
||||
_, err := sk.Handle(context.Background(), "review", json.RawMessage(`{"project_root":"/tmp"}`))
|
||||
assert.ErrorContains(t, err, "files")
|
||||
}
|
||||
|
||||
func TestReviewCallsExecutor(t *testing.T) {
|
||||
called := false
|
||||
var capturedTask string
|
||||
fakeFn := func(_ context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
called = true
|
||||
capturedTask = req.TaskPrompt
|
||||
return iexec.Result{
|
||||
Status: "pass", Phase: "review", Skill: "review",
|
||||
Verified: true, ModelUsed: "self", Message: "2 warnings found",
|
||||
}, nil
|
||||
}
|
||||
|
||||
sk := review.New(review.Config{SkillPrompt: "review rules", ExecutorFn: fakeFn, SessionsDir: t.TempDir()})
|
||||
out, err := sk.Handle(context.Background(), "review", json.RawMessage(
|
||||
`{"project_root":"/tmp/proj","files":["internal/foo/foo.go"],"context":"PR: add Foo helper"}`,
|
||||
))
|
||||
require.NoError(t, err)
|
||||
assert.True(t, called)
|
||||
assert.Contains(t, capturedTask, "internal/foo/foo.go")
|
||||
assert.Contains(t, capturedTask, "PR: add Foo helper")
|
||||
|
||||
var result iexec.Result
|
||||
require.NoError(t, json.Unmarshal(out, &result))
|
||||
assert.Equal(t, "pass", result.Status)
|
||||
assert.Equal(t, "review", result.Phase)
|
||||
}
|
||||
55
internal/skills/review/skill.go
Normal file
55
internal/skills/review/skill.go
Normal file
@@ -0,0 +1,55 @@
|
||||
// internal/skills/review/skill.go
|
||||
package review
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/registry"
|
||||
)
|
||||
|
||||
// ExecutorFn is the function signature for running a worker subprocess.
|
||||
type ExecutorFn func(ctx context.Context, req iexec.Request) (iexec.Result, error)
|
||||
|
||||
// Config holds dependencies for the review skill.
|
||||
type Config struct {
|
||||
SkillPrompt string
|
||||
DefaultModel string
|
||||
ExecutorFn ExecutorFn
|
||||
SessionsDir string
|
||||
}
|
||||
|
||||
// Skill implements the review MCP tool.
|
||||
type Skill struct{ cfg Config }
|
||||
|
||||
// New creates a new review Skill.
|
||||
func New(cfg Config) *Skill { return &Skill{cfg: cfg} }
|
||||
|
||||
// Name returns the skill identifier.
|
||||
func (s *Skill) Name() string { return "review" }
|
||||
|
||||
// Tools returns the MCP tool definitions for this skill.
|
||||
func (s *Skill) Tools() []registry.ToolDef {
|
||||
schema := func(required []string, props map[string]any) json.RawMessage {
|
||||
b, _ := json.Marshal(map[string]any{"type": "object", "required": required, "properties": props})
|
||||
return b
|
||||
}
|
||||
str := map[string]any{"type": "string"}
|
||||
return []registry.ToolDef{
|
||||
{
|
||||
Name: "review",
|
||||
Description: "Perform a structured code review of the specified files. Returns findings with severity levels.",
|
||||
InputSchema: schema(
|
||||
[]string{"project_root", "files"},
|
||||
map[string]any{
|
||||
"project_root": str,
|
||||
"files": map[string]any{"type": "array", "items": map[string]any{"type": "string"}},
|
||||
"context": str,
|
||||
"model": str,
|
||||
"session_id": str,
|
||||
},
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
85
internal/skills/spec/handlers.go
Normal file
85
internal/skills/spec/handlers.go
Normal file
@@ -0,0 +1,85 @@
|
||||
// internal/skills/spec/handlers.go
|
||||
package spec
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
)
|
||||
|
||||
type specArgs struct {
|
||||
ProjectRoot string `json:"project_root"`
|
||||
Requirements string `json:"requirements"`
|
||||
OutputPath string `json:"output_path"`
|
||||
Context string `json:"context"`
|
||||
Model string `json:"model"`
|
||||
SessionID string `json:"session_id"`
|
||||
}
|
||||
|
||||
// Handle dispatches the MCP tool call to the appropriate handler.
|
||||
func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (json.RawMessage, error) {
|
||||
if tool != "spec" {
|
||||
return nil, fmt.Errorf("unknown tool: %s", tool)
|
||||
}
|
||||
var a specArgs
|
||||
if err := json.Unmarshal(args, &a); err != nil {
|
||||
return nil, fmt.Errorf("parse args: %w", err)
|
||||
}
|
||||
if a.ProjectRoot == "" {
|
||||
return nil, fmt.Errorf("project_root is required")
|
||||
}
|
||||
if a.Requirements == "" {
|
||||
return nil, fmt.Errorf("requirements is required")
|
||||
}
|
||||
outputPath := a.OutputPath
|
||||
if outputPath == "" {
|
||||
outputPath = "docs/spec.md"
|
||||
}
|
||||
|
||||
model := a.Model
|
||||
if model == "" {
|
||||
model = s.cfg.DefaultModel
|
||||
}
|
||||
|
||||
task := fmt.Sprintf(
|
||||
"phase: spec\nproject_root: %s\nrequirements: %s\noutput_path: %s\ncontext: %s\nmodel: %s",
|
||||
a.ProjectRoot, a.Requirements, outputPath, a.Context, model,
|
||||
)
|
||||
task = s.prependHistory(a.SessionID, "spec", task)
|
||||
|
||||
if s.cfg.ExecutorFn == nil {
|
||||
return nil, fmt.Errorf("no executor configured")
|
||||
}
|
||||
result, err := s.cfg.ExecutorFn(ctx, iexec.Request{
|
||||
SkillPrompt: s.cfg.SkillPrompt,
|
||||
TaskPrompt: task,
|
||||
Model: model,
|
||||
Tools: "Read,Write",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
b, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal result: %w", err)
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (s *Skill) prependHistory(sessionID, currentPhase, task string) string {
|
||||
if sessionID == "" || s.cfg.SessionsDir == "" {
|
||||
return task
|
||||
}
|
||||
entries, err := session.Read(s.cfg.SessionsDir, sessionID)
|
||||
if err != nil || len(entries) == 0 {
|
||||
return task
|
||||
}
|
||||
history := session.FormatHistory(entries, currentPhase)
|
||||
if history == "" {
|
||||
return task
|
||||
}
|
||||
return history + "\n---\n\n" + task
|
||||
}
|
||||
61
internal/skills/spec/handlers_test.go
Normal file
61
internal/skills/spec/handlers_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
// internal/skills/spec/handlers_test.go
|
||||
package spec_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/spec"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestSpecToolRegistered(t *testing.T) {
|
||||
sk := spec.New(spec.Config{SkillPrompt: "spec rules"})
|
||||
names := make([]string, 0)
|
||||
for _, tool := range sk.Tools() {
|
||||
names = append(names, tool.Name)
|
||||
}
|
||||
assert.Contains(t, names, "spec")
|
||||
}
|
||||
|
||||
func TestSpecRequiresProjectRoot(t *testing.T) {
|
||||
sk := spec.New(spec.Config{SkillPrompt: "s"})
|
||||
_, err := sk.Handle(context.Background(), "spec", json.RawMessage(`{"requirements":"add login"}`))
|
||||
assert.ErrorContains(t, err, "project_root")
|
||||
}
|
||||
|
||||
func TestSpecRequiresRequirements(t *testing.T) {
|
||||
sk := spec.New(spec.Config{SkillPrompt: "s"})
|
||||
_, err := sk.Handle(context.Background(), "spec", json.RawMessage(`{"project_root":"/tmp"}`))
|
||||
assert.ErrorContains(t, err, "requirements")
|
||||
}
|
||||
|
||||
func TestSpecCallsExecutor(t *testing.T) {
|
||||
called := false
|
||||
var capturedTask string
|
||||
fakeFn := func(_ context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
called = true
|
||||
capturedTask = req.TaskPrompt
|
||||
return iexec.Result{
|
||||
Status: "pass", Phase: "spec", Skill: "spec",
|
||||
FilePath: "/tmp/proj/docs/login-spec.md",
|
||||
Verified: true, ModelUsed: "self", Message: "spec written: login feature",
|
||||
}, nil
|
||||
}
|
||||
|
||||
sk := spec.New(spec.Config{SkillPrompt: "spec rules", ExecutorFn: fakeFn, SessionsDir: t.TempDir()})
|
||||
out, err := sk.Handle(context.Background(), "spec", json.RawMessage(
|
||||
`{"project_root":"/tmp/proj","requirements":"add OAuth2 login","output_path":"docs/login-spec.md"}`,
|
||||
))
|
||||
require.NoError(t, err)
|
||||
assert.True(t, called)
|
||||
assert.Contains(t, capturedTask, "OAuth2 login")
|
||||
assert.Contains(t, capturedTask, "docs/login-spec.md")
|
||||
|
||||
var result iexec.Result
|
||||
require.NoError(t, json.Unmarshal(out, &result))
|
||||
assert.Equal(t, "spec", result.Phase)
|
||||
}
|
||||
56
internal/skills/spec/skill.go
Normal file
56
internal/skills/spec/skill.go
Normal file
@@ -0,0 +1,56 @@
|
||||
// internal/skills/spec/skill.go
|
||||
package spec
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/registry"
|
||||
)
|
||||
|
||||
// ExecutorFn is the function signature for running a worker subprocess.
|
||||
type ExecutorFn func(ctx context.Context, req iexec.Request) (iexec.Result, error)
|
||||
|
||||
// Config holds dependencies for the spec skill.
|
||||
type Config struct {
|
||||
SkillPrompt string
|
||||
DefaultModel string
|
||||
ExecutorFn ExecutorFn
|
||||
SessionsDir string
|
||||
}
|
||||
|
||||
// Skill implements the spec MCP tool.
|
||||
type Skill struct{ cfg Config }
|
||||
|
||||
// New creates a new spec Skill.
|
||||
func New(cfg Config) *Skill { return &Skill{cfg: cfg} }
|
||||
|
||||
// Name returns the skill identifier.
|
||||
func (s *Skill) Name() string { return "spec" }
|
||||
|
||||
// Tools returns the MCP tool definitions for this skill.
|
||||
func (s *Skill) Tools() []registry.ToolDef {
|
||||
schema := func(required []string, props map[string]any) json.RawMessage {
|
||||
b, _ := json.Marshal(map[string]any{"type": "object", "required": required, "properties": props})
|
||||
return b
|
||||
}
|
||||
str := map[string]any{"type": "string"}
|
||||
return []registry.ToolDef{
|
||||
{
|
||||
Name: "spec",
|
||||
Description: "Generate a structured implementation spec from requirements. Writes the spec to output_path in the project.",
|
||||
InputSchema: schema(
|
||||
[]string{"project_root", "requirements"},
|
||||
map[string]any{
|
||||
"project_root": str,
|
||||
"requirements": str,
|
||||
"output_path": str,
|
||||
"context": str,
|
||||
"model": str,
|
||||
"session_id": str,
|
||||
},
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
)
|
||||
|
||||
func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (json.RawMessage, error) {
|
||||
@@ -51,6 +52,7 @@ type greenArgs struct {
|
||||
TestPath string `json:"test_path"`
|
||||
Model string `json:"model"`
|
||||
TestCmd string `json:"test_cmd"`
|
||||
SessionID string `json:"session_id"`
|
||||
}
|
||||
|
||||
func (s *Skill) handleGreen(ctx context.Context, raw json.RawMessage) (json.RawMessage, error) {
|
||||
@@ -68,6 +70,7 @@ func (s *Skill) handleGreen(ctx context.Context, raw json.RawMessage) (json.RawM
|
||||
"phase: green\nproject_root: %s\ntest_path: %s\nmodel: %s\ntest_cmd: %s",
|
||||
args.ProjectRoot, args.TestPath, s.resolveModel(args.Model), args.TestCmd,
|
||||
)
|
||||
task = s.prependHistory(args.SessionID, "green", task)
|
||||
return s.execute(ctx, task)
|
||||
}
|
||||
|
||||
@@ -77,6 +80,7 @@ type refactorArgs struct {
|
||||
ImplPath string `json:"impl_path"`
|
||||
Model string `json:"model"`
|
||||
TestCmd string `json:"test_cmd"`
|
||||
SessionID string `json:"session_id"`
|
||||
}
|
||||
|
||||
func (s *Skill) handleRefactor(ctx context.Context, raw json.RawMessage) (json.RawMessage, error) {
|
||||
@@ -97,9 +101,25 @@ func (s *Skill) handleRefactor(ctx context.Context, raw json.RawMessage) (json.R
|
||||
"phase: refactor\nproject_root: %s\ntest_path: %s\nimpl_path: %s\nmodel: %s\ntest_cmd: %s",
|
||||
args.ProjectRoot, args.TestPath, args.ImplPath, s.resolveModel(args.Model), args.TestCmd,
|
||||
)
|
||||
task = s.prependHistory(args.SessionID, "refactor", task)
|
||||
return s.execute(ctx, task)
|
||||
}
|
||||
|
||||
func (s *Skill) prependHistory(sessionID, currentPhase, task string) string {
|
||||
if sessionID == "" || s.cfg.SessionsDir == "" {
|
||||
return task
|
||||
}
|
||||
entries, err := session.Read(s.cfg.SessionsDir, sessionID)
|
||||
if err != nil || len(entries) == 0 {
|
||||
return task
|
||||
}
|
||||
history := session.FormatHistory(entries, currentPhase)
|
||||
if history == "" {
|
||||
return task
|
||||
}
|
||||
return history + "\n---\n\n" + task
|
||||
}
|
||||
|
||||
func (s *Skill) resolveModel(override string) string {
|
||||
if override != "" {
|
||||
return override
|
||||
|
||||
@@ -5,6 +5,8 @@ import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/tdd"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
@@ -41,5 +43,43 @@ func TestTDDRedRequiresSpec(t *testing.T) {
|
||||
assert.ErrorContains(t, err, "spec")
|
||||
}
|
||||
|
||||
func TestTDDGreenInjectsSessionHistory(t *testing.T) {
|
||||
sessDir := t.TempDir()
|
||||
require.NoError(t, session.Append(sessDir, "sess-1", session.Entry{
|
||||
SessionID: "sess-1", Skill: "tdd", Phase: "red", FinalStatus: "pass",
|
||||
FilePath: "internal/foo/foo_test.go",
|
||||
Message: "wrote failing test for Foo",
|
||||
}))
|
||||
|
||||
var capturedPrompt string
|
||||
fakeFn := func(_ context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
capturedPrompt = req.TaskPrompt
|
||||
return iexec.Result{Status: "pass", Phase: "green", Skill: "tdd", Verified: true, ModelUsed: "self", Message: "ok"}, nil
|
||||
}
|
||||
|
||||
sk := tdd.New(tdd.Config{SkillPrompt: "tdd", ExecutorFn: fakeFn, SessionsDir: sessDir})
|
||||
_, err := sk.Handle(context.Background(), "tdd_green", json.RawMessage(
|
||||
`{"project_root":"/tmp","test_path":"internal/foo/foo_test.go","test_cmd":"go test ./...","session_id":"sess-1"}`,
|
||||
))
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, capturedPrompt, "## Session history")
|
||||
assert.Contains(t, capturedPrompt, "wrote failing test for Foo")
|
||||
}
|
||||
|
||||
func TestTDDGreenNoHistoryWhenSessionIDEmpty(t *testing.T) {
|
||||
var capturedPrompt string
|
||||
fakeFn := func(_ context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
capturedPrompt = req.TaskPrompt
|
||||
return iexec.Result{Status: "pass", Phase: "green", Skill: "tdd", Verified: true, ModelUsed: "self", Message: "ok"}, nil
|
||||
}
|
||||
|
||||
sk := tdd.New(tdd.Config{SkillPrompt: "tdd", ExecutorFn: fakeFn, SessionsDir: t.TempDir()})
|
||||
_, err := sk.Handle(context.Background(), "tdd_green", json.RawMessage(
|
||||
`{"project_root":"/tmp","test_path":"internal/foo/foo_test.go"}`,
|
||||
))
|
||||
require.NoError(t, err)
|
||||
assert.NotContains(t, capturedPrompt, "## Session history")
|
||||
}
|
||||
|
||||
// Ensure require is used (avoids import error).
|
||||
var _ = require.New
|
||||
|
||||
@@ -16,6 +16,7 @@ type Config struct {
|
||||
SkillPrompt string
|
||||
ExecutorFn ExecutorFn // nil = no executor (tests that don't reach execute())
|
||||
DefaultModel string
|
||||
SessionsDir string // optional: path to brain/sessions/ for history injection
|
||||
}
|
||||
|
||||
type Skill struct {
|
||||
@@ -63,6 +64,7 @@ func (s *Skill) Tools() []registry.ToolDef {
|
||||
"test_path": strProp,
|
||||
"model": strProp,
|
||||
"test_cmd": strProp,
|
||||
"session_id": strProp,
|
||||
},
|
||||
),
|
||||
},
|
||||
@@ -77,6 +79,7 @@ func (s *Skill) Tools() []registry.ToolDef {
|
||||
"impl_path": strProp,
|
||||
"model": strProp,
|
||||
"test_cmd": strProp,
|
||||
"session_id": strProp,
|
||||
},
|
||||
),
|
||||
},
|
||||
|
||||
80
internal/skills/trainer/handlers.go
Normal file
80
internal/skills/trainer/handlers.go
Normal file
@@ -0,0 +1,80 @@
|
||||
// internal/skills/trainer/handlers.go
|
||||
package trainer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
)
|
||||
|
||||
type trainArgs struct {
|
||||
SessionID string `json:"session_id"`
|
||||
Model string `json:"model"`
|
||||
}
|
||||
|
||||
// Handle dispatches the MCP tool call to the trainer handler.
|
||||
func (s *Skill) Handle(ctx context.Context, tool string, args json.RawMessage) (json.RawMessage, error) {
|
||||
if tool != "trainer" {
|
||||
return nil, fmt.Errorf("unknown tool: %s", tool)
|
||||
}
|
||||
var a trainArgs
|
||||
if err := json.Unmarshal(args, &a); err != nil {
|
||||
return nil, fmt.Errorf("parse args: %w", err)
|
||||
}
|
||||
if a.SessionID == "" {
|
||||
return nil, fmt.Errorf("session_id is required")
|
||||
}
|
||||
if s.cfg.ExecutorFn == nil {
|
||||
return nil, fmt.Errorf("no executor configured")
|
||||
}
|
||||
|
||||
model := a.Model
|
||||
if model == "" {
|
||||
model = s.cfg.DefaultModel
|
||||
}
|
||||
|
||||
entries, err := session.Read(s.cfg.SessionsDir, a.SessionID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read session log: %w", err)
|
||||
}
|
||||
|
||||
// ── Step 1: Reader agent ─────────────────────────────────────────────────
|
||||
history := session.FormatHistory(entries, "")
|
||||
readerTask := fmt.Sprintf(
|
||||
"role: reader\nsession_id: %s\nbrain_dir: %s\n\n%s",
|
||||
a.SessionID, s.cfg.BrainDir, history,
|
||||
)
|
||||
readerResult, err := s.cfg.ExecutorFn(ctx, iexec.Request{
|
||||
SkillPrompt: s.cfg.ReaderPrompt,
|
||||
TaskPrompt: readerTask,
|
||||
Model: model,
|
||||
Tools: "Read",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reader agent: %w", err)
|
||||
}
|
||||
|
||||
// ── Step 2: Writer agent (receives reader candidates) ────────────────────
|
||||
writerTask := fmt.Sprintf(
|
||||
"role: writer\nsession_id: %s\nbrain_dir: %s\n\nreader_summary: %s\nreader_candidates:\n%s",
|
||||
a.SessionID, s.cfg.BrainDir, readerResult.Message, readerResult.RunnerOutput,
|
||||
)
|
||||
writerResult, err := s.cfg.ExecutorFn(ctx, iexec.Request{
|
||||
SkillPrompt: s.cfg.WriterPrompt,
|
||||
TaskPrompt: writerTask,
|
||||
Model: model,
|
||||
Tools: "Read,Write",
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("writer agent: %w", err)
|
||||
}
|
||||
|
||||
b, err := json.Marshal(writerResult)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal result: %w", err)
|
||||
}
|
||||
return b, nil
|
||||
}
|
||||
82
internal/skills/trainer/handlers_test.go
Normal file
82
internal/skills/trainer/handlers_test.go
Normal file
@@ -0,0 +1,82 @@
|
||||
// internal/skills/trainer/handlers_test.go
|
||||
package trainer_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/session"
|
||||
"github.com/mathiasbq/supervisor/internal/skills/trainer"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestTrainerToolRegistered(t *testing.T) {
|
||||
sk := trainer.New(trainer.Config{ReaderPrompt: "r", WriterPrompt: "w"})
|
||||
names := make([]string, 0)
|
||||
for _, tool := range sk.Tools() {
|
||||
names = append(names, tool.Name)
|
||||
}
|
||||
assert.Contains(t, names, "trainer")
|
||||
}
|
||||
|
||||
func TestTrainerRequiresSessionID(t *testing.T) {
|
||||
sk := trainer.New(trainer.Config{ReaderPrompt: "r", WriterPrompt: "w"})
|
||||
_, err := sk.Handle(context.Background(), "trainer", json.RawMessage(`{}`))
|
||||
assert.ErrorContains(t, err, "session_id")
|
||||
}
|
||||
|
||||
func TestTrainerCallsReaderThenWriter(t *testing.T) {
|
||||
sessDir := t.TempDir()
|
||||
require.NoError(t, session.Append(sessDir, "sess-1", session.Entry{
|
||||
SessionID: "sess-1", Skill: "tdd", Phase: "red", FinalStatus: "pass",
|
||||
Message: "wrote failing test", FilePath: "internal/foo/foo_test.go",
|
||||
}))
|
||||
|
||||
callCount := 0
|
||||
var readerTask, writerTask string
|
||||
|
||||
fakeFn := func(_ context.Context, req iexec.Request) (iexec.Result, error) {
|
||||
callCount++
|
||||
if callCount == 1 {
|
||||
// reader call
|
||||
readerTask = req.TaskPrompt
|
||||
return iexec.Result{
|
||||
Status: "pass", Phase: "trainer", Skill: "trainer",
|
||||
RunnerOutput: `[{"type":"sft","moment":"first-pass clean TDD","score":4}]`,
|
||||
Verified: true, ModelUsed: "self", Message: "1 sft candidate found",
|
||||
}, nil
|
||||
}
|
||||
// writer call
|
||||
writerTask = req.TaskPrompt
|
||||
return iexec.Result{
|
||||
Status: "pass", Phase: "trainer", Skill: "trainer",
|
||||
FilePath: sessDir + "/training-data/sft/sess-1.jsonl",
|
||||
Verified: true, ModelUsed: "self", Message: "1 sft pair written",
|
||||
}, nil
|
||||
}
|
||||
|
||||
sk := trainer.New(trainer.Config{
|
||||
ReaderPrompt: "reader rules",
|
||||
WriterPrompt: "writer rules",
|
||||
ExecutorFn: fakeFn,
|
||||
SessionsDir: sessDir,
|
||||
BrainDir: t.TempDir(),
|
||||
})
|
||||
out, err := sk.Handle(context.Background(), "trainer", json.RawMessage(`{"session_id":"sess-1"}`))
|
||||
require.NoError(t, err)
|
||||
|
||||
assert.Equal(t, 2, callCount, "executor must be called exactly twice: reader then writer")
|
||||
assert.Contains(t, readerTask, "role: reader")
|
||||
assert.Contains(t, readerTask, "sess-1")
|
||||
assert.Contains(t, readerTask, "wrote failing test") // session history in reader prompt
|
||||
assert.Contains(t, writerTask, "role: writer")
|
||||
assert.Contains(t, writerTask, "sft candidate") // reader output passed to writer
|
||||
|
||||
var result iexec.Result
|
||||
require.NoError(t, json.Unmarshal(out, &result))
|
||||
assert.Equal(t, "trainer", result.Phase)
|
||||
assert.Equal(t, "pass", result.Status)
|
||||
}
|
||||
53
internal/skills/trainer/skill.go
Normal file
53
internal/skills/trainer/skill.go
Normal file
@@ -0,0 +1,53 @@
|
||||
// internal/skills/trainer/skill.go
|
||||
package trainer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
|
||||
iexec "github.com/mathiasbq/supervisor/internal/exec"
|
||||
"github.com/mathiasbq/supervisor/internal/registry"
|
||||
)
|
||||
|
||||
// ExecutorFn is the function signature for running a worker subprocess.
|
||||
type ExecutorFn func(ctx context.Context, req iexec.Request) (iexec.Result, error)
|
||||
|
||||
// Config holds dependencies for the trainer skill.
|
||||
type Config struct {
|
||||
ReaderPrompt string
|
||||
WriterPrompt string
|
||||
DefaultModel string
|
||||
ExecutorFn ExecutorFn
|
||||
SessionsDir string
|
||||
BrainDir string // root of brain/ directory; writer writes to BrainDir/training-data/
|
||||
}
|
||||
|
||||
// Skill implements the trainer MCP tool.
|
||||
type Skill struct{ cfg Config }
|
||||
|
||||
// New creates a new trainer Skill.
|
||||
func New(cfg Config) *Skill { return &Skill{cfg: cfg} }
|
||||
|
||||
// Name returns the skill identifier.
|
||||
func (s *Skill) Name() string { return "trainer" }
|
||||
|
||||
// Tools returns the MCP tool definitions for this skill.
|
||||
func (s *Skill) Tools() []registry.ToolDef {
|
||||
schema := func(required []string, props map[string]any) json.RawMessage {
|
||||
b, _ := json.Marshal(map[string]any{"type": "object", "required": required, "properties": props})
|
||||
return b
|
||||
}
|
||||
return []registry.ToolDef{
|
||||
{
|
||||
Name: "trainer",
|
||||
Description: "Extract SFT and DPO training pairs from a session log. Runs a reader→writer chain: reader identifies learning moments, writer formats and writes pairs to brain/training-data/.",
|
||||
InputSchema: schema(
|
||||
[]string{"session_id"},
|
||||
map[string]any{
|
||||
"session_id": map[string]any{"type": "string"},
|
||||
"model": map[string]any{"type": "string"},
|
||||
},
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user