From a94b860c2ef27390806f5c4c904ba0eb95c851c8 Mon Sep 17 00:00:00 2001 From: Mathias Date: Tue, 26 May 2026 07:10:05 +0200 Subject: [PATCH] feat(claudewatcher): client-name guard via RegisterRule + env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-rollout guard. Source code stays clean — client identities come from CLAUDE_INGEST_CLIENT_BLOCK env (sourced from a SOPS-encrypted k8s secret in infra repo). Env value is a regex alternation; main wraps it with `(?i)\b(...)\b` so word-boundary matching avoids false hits inside longer identifiers (e.g. "Sebastian" doesn't trigger on "SEB"). DefaultRules (credential shapes) still take precedence so any leak that's BOTH a client mention AND a credential shape logs as the credential — strictly more dangerous, points triage at the right thing. Tests cover precedence + case variations + word-boundary respect + invalid-pattern rejection. Refs: infra#73 Track E.1 pre-rollout grill (option B). Bump-Type: minor --- ingestion/cmd/server/main.go | 13 ++++ ingestion/internal/claudewatcher/scrubber.go | 54 ++++++++++++++++- .../internal/claudewatcher/scrubber_test.go | 60 +++++++++++++++++++ 3 files changed, 126 insertions(+), 1 deletion(-) diff --git a/ingestion/cmd/server/main.go b/ingestion/cmd/server/main.go index f47e29a..91ee40e 100644 --- a/ingestion/cmd/server/main.go +++ b/ingestion/cmd/server/main.go @@ -256,6 +256,19 @@ func main() { logger.Error("CLAUDE_SESSIONS_DIR set but BRAIN_PG_DSN missing — claudewatcher needs the cursor table") os.Exit(1) } + // Client-name guard. The env value is a regex alternation + // (e.g. "SEB|Mastercard"); we wrap it with word boundaries + // and case-insensitive flag so substrings inside longer + // identifiers don't false-match. Sourced from a SOPS secret + // so client identities never live in source. 
+ if clientBlock := os.Getenv("CLAUDE_INGEST_CLIENT_BLOCK"); clientBlock != "" { + pattern := `(?i)\b(` + clientBlock + `)\b` + if err := claudewatcher.RegisterRule("client-name", pattern); err != nil { + logger.Error("claudewatcher client-block rule invalid", "err", err) + os.Exit(1) + } + logger.Info("claudewatcher client-block guard registered") + } cursorStore, cerr := claudewatcher.NewCursorStore(ctx, pgDSN) if cerr != nil { logger.Error("claudewatcher cursor init", "err", cerr) diff --git a/ingestion/internal/claudewatcher/scrubber.go b/ingestion/internal/claudewatcher/scrubber.go index a9063f7..c312633 100644 --- a/ingestion/internal/claudewatcher/scrubber.go +++ b/ingestion/internal/claudewatcher/scrubber.go @@ -1,6 +1,10 @@ package claudewatcher -import "regexp" +import ( + "fmt" + "regexp" + "sync" +) // Scrubber drops any turn whose content matches a known-bad pattern. // Fail-closed by design: we'd rather lose signal than ingest credentials @@ -45,10 +49,51 @@ var DefaultRules = []Rule{ {Name: "sops-encrypted-marker", RE: regexp.MustCompile(`ENC\[AES256_GCM,data:[A-Za-z0-9+/=]{8,}`)}, } +// extraRules is appended to DefaultRules at process startup via +// RegisterRule. The mutex guards concurrent RegisterRule calls (rare) +// against concurrent Scrub reads (hot path). Scrub takes the read lock +// whenever no DefaultRules entry matches, even if extraRules is empty, +// so clean content always pays for one RLock/RUnlock pair. +var ( + extraRulesMu sync.RWMutex + extraRules []Rule +) + +// RegisterRule appends a runtime-configured regex to the scrubber's +// rule set. Used by main to inject client-name guards from +// CLAUDE_INGEST_CLIENT_BLOCK env var (or equivalent SOPS-encrypted +// secret) without baking client identities into source code. +// +// pattern is compiled as-is — callers wrap with `\b...\b` and case +// flags as needed. Duplicate names are accepted (rules are positional); +// the second registration just fires after the first. 
+func RegisterRule(name, pattern string) error { + re, err := regexp.Compile(pattern) + if err != nil { + return fmt.Errorf("compile rule %q: %w", name, err) + } + extraRulesMu.Lock() + extraRules = append(extraRules, Rule{Name: name, RE: re}) + extraRulesMu.Unlock() + return nil +} + +// ResetExtraRules clears every RegisterRule-added rule. Test-only. +func ResetExtraRules() { + extraRulesMu.Lock() + extraRules = nil + extraRulesMu.Unlock() +} + // Scrub reports the first matching rule, or empty when content is clean. // Empty string is treated as clean. Caller decides what to do on a hit; // the convention in claudewatcher is to drop the turn entirely and emit // a slog.Warn naming the rule. +// +// Rule order: DefaultRules first (credential shapes), then runtime +// RegisterRule additions (client-name guards). Credential leaks +// outrank client-name hits in the log because they're strictly more +// dangerous. func Scrub(content string) string { if content == "" { return "" @@ -58,5 +103,12 @@ func Scrub(content string) string { return r.Name } } + extraRulesMu.RLock() + defer extraRulesMu.RUnlock() + for _, r := range extraRules { + if r.RE.MatchString(content) { + return r.Name + } + } return "" } diff --git a/ingestion/internal/claudewatcher/scrubber_test.go b/ingestion/internal/claudewatcher/scrubber_test.go index 7bf9b16..524114e 100644 --- a/ingestion/internal/claudewatcher/scrubber_test.go +++ b/ingestion/internal/claudewatcher/scrubber_test.go @@ -55,3 +55,63 @@ func TestScrub_FirstMatchWins(t *testing.T) { content := "Authorization: Bearer ghp_aBcD1234EfGh5678IjKl9012MnOp3456QrSt" assert.Equal(t, "authorization-header", Scrub(content)) } + +func TestRegisterRule_ClientNameGuard(t *testing.T) { + t.Cleanup(ResetExtraRules) + require := func(err error) { + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + } + require(RegisterRule("client-name", `(?i)\b(SEB|Mastercard)\b`)) + + // Hits — case variations + word-boundary respect. 
+ for _, hit := range []string{ + "mentioned SEB in this commit", + "the Mastercard project deadline", + "working on mastercard scope", + "SEB internal review", + } { + assert.Equal(t, "client-name", Scrub(hit), "should match %q", hit) + } + + // Misses — substring within a longer word should NOT match + // thanks to \b. "Sebastian" contains "seb" but \b prevents hit. + for _, miss := range []string{ + "Sebastian wrote the docs", + "unrelated text", + "researcher", + "https://example.com/search?seb=1", // 'seb' bounded by ?=, still matches \b + } { + got := Scrub(miss) + if miss == "https://example.com/search?seb=1" { + // `seb=` has word-boundary at '='; this DOES match \bseb\b. + // Accept either outcome; document the tradeoff. + assert.Contains(t, []string{"", "client-name"}, got) + continue + } + assert.Empty(t, got, "should NOT match %q", miss) + } +} + +func TestRegisterRule_CredentialsTakePrecedence(t *testing.T) { + t.Cleanup(ResetExtraRules) + require := func(err error) { + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + } + require(RegisterRule("client-name", `\b(SEB)\b`)) + + // Content matches both a credential rule AND a client rule — + // credential rule wins by ordering, so log triage points at the + // strictly more dangerous leak. + content := "SEB project uses OPENAI_API_KEY=sk-proj-AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHHIIII" + assert.Equal(t, "openai-sk", Scrub(content)) +} + +func TestRegisterRule_RejectsInvalidPattern(t *testing.T) { + t.Cleanup(ResetExtraRules) + err := RegisterRule("bad", "[unclosed") + assert.Error(t, err) +}