package claudewatcher import ( "fmt" "regexp" "sync" ) // Scrubber drops any turn whose content matches a known-bad pattern. // Fail-closed by design: we'd rather lose signal than ingest credentials // into a public-readable brain. The caller logs the drop reason. // // Rules cover the credential shapes most common to leak through Claude // Code sessions: bearer tokens, postgres URIs with embedded auth, OAuth // secret values, SOPS-encrypted secret blobs (we don't want the // ciphertext either — it's a marker that the original message contained // secret state), PEM-encoded private keys, and the explicit env-var // naming conventions used in the homelab. // // Pattern philosophy: match by shape, not by content. A 40-char hex // string in isolation is fine; the same string after `Authorization: // Bearer ` is not. Tuned to catch known leak vectors from prior // secret-hygiene incidents (POSTGRES_PASSWORD via kubectl exec env, // INFRA_MCP_TOKEN via sops -d output) without dropping every Edit on a // config file. // Rule is a single named regex with a redact hint shown in the warn log. type Rule struct { Name string RE *regexp.Regexp } // DefaultRules is the regex set applied by Scrub. Mutable for tests but // callers should treat it as read-only at runtime. var DefaultRules = []Rule{ // authorization-header is checked before the bare bearer rule so // contextual hits ("Authorization: Bearer X") report the more // specific match name in logs. {Name: "authorization-header", RE: regexp.MustCompile(`(?i)Authorization\s*:\s*[A-Za-z]+\s+\S{8,}`)}, {Name: "bearer-token", RE: regexp.MustCompile(`(?i)Bearer\s+[A-Za-z0-9._\-]{16,}`)}, {Name: "postgres-uri-with-password", RE: regexp.MustCompile(`postgres(?:ql)?://[^:\s/]+:[^@\s/]+@`)}, {Name: "private-key", RE: regexp.MustCompile(`-----BEGIN[^-]*PRIVATE KEY-----`)}, {Name: "ssh-key", RE: regexp.MustCompile(`ssh-(?:rsa|ed25519|ecdsa)\s+[A-Za-z0-9+/=]{40,}`)}, {Name: "github-pat", RE: regexp.MustCompile(`\b(?:ghp|gho|ghu|ghr|gha)_[A-Za-z0-9]{30,}\b`)}, {Name: "openai-sk", RE: regexp.MustCompile(`\bsk-(?:proj-)?[A-Za-z0-9]{32,}\b`)}, {Name: "anthropic-sk", RE: regexp.MustCompile(`\bsk-ant-[A-Za-z0-9_\-]{32,}\b`)}, {Name: "aws-access-key", RE: regexp.MustCompile(`\bAKIA[0-9A-Z]{16}\b`)}, {Name: "homelab-env-token", RE: regexp.MustCompile(`(?i)(?:_TOKEN|_PASSWORD|_API_KEY|_SECRET)\s*[:=]\s*['"]?[A-Za-z0-9._/+\-]{12,}`)}, {Name: "sops-encrypted-marker", RE: regexp.MustCompile(`ENC\[AES256_GCM,data:[A-Za-z0-9+/=]{8,}`)}, } // extraRules is appended to DefaultRules at process startup via // RegisterRule. The mutex guards concurrent RegisterRule calls (rare) // against concurrent Scrub reads (hot path). Scrub takes a read lock // only when extraRules is non-empty, so steady-state cost is zero // when no client-name guard is configured. var ( extraRulesMu sync.RWMutex extraRules []Rule ) // RegisterRule appends a runtime-configured regex to the scrubber's // rule set. Used by main to inject client-name guards from // CLAUDE_INGEST_CLIENT_BLOCK env var (or equivalent SOPS-encrypted // secret) without baking client identities into source code. // // pattern is compiled as-is — callers wrap with `\b...\b` and case // flags as needed. Duplicate names are accepted (rules are positional); // the second registration just fires after the first. func RegisterRule(name, pattern string) error { re, err := regexp.Compile(pattern) if err != nil { return fmt.Errorf("compile rule %q: %w", name, err) } extraRulesMu.Lock() extraRules = append(extraRules, Rule{Name: name, RE: re}) extraRulesMu.Unlock() return nil } // ResetExtraRules clears every RegisterRule-added rule. Test-only. func ResetExtraRules() { extraRulesMu.Lock() extraRules = nil extraRulesMu.Unlock() } // Scrub reports the first matching rule, or empty when content is clean. // Empty string is treated as clean. Caller decides what to do on a hit; // the convention in claudewatcher is to drop the turn entirely and emit // a slog.Warn naming the rule. // // Rule order: DefaultRules first (credential shapes), then runtime // RegisterRule additions (client-name guards). Credential leaks // outrank client-name hits in the log because they're strictly more // dangerous. func Scrub(content string) string { if content == "" { return "" } for _, r := range DefaultRules { if r.RE.MatchString(content) { return r.Name } } extraRulesMu.RLock() defer extraRulesMu.RUnlock() for _, r := range extraRules { if r.RE.MatchString(content) { return r.Name } } return "" }