hookdeck · leggetter · Jun 19, 2026 · Jun 19, 2026
diff --git a/docs/agent-evaluation/.env.example b/docs/agent-evaluation/.env.example
@@ -36,4 +36,4 @@ EVAL_TEST_DESTINATION_URL=
 # Scoring is ON by default after each scenario (heuristic + LLM). Opt out:
 # EVAL_NO_SCORE_HEURISTIC=1
 # EVAL_NO_SCORE_LLM=1
-# EVAL_SCORE_MODEL=claude-sonnet-4-20250514
+# EVAL_SCORE_MODEL=claude-sonnet-4-6
diff --git a/docs/agent-evaluation/src/llm-judge.ts b/docs/agent-evaluation/src/llm-judge.ts
@@ -8,7 +8,7 @@ import { basename, dirname, join } from "node:path";
 import { extractTranscriptScoringText } from "./score-transcript.js";
 
 const ANTHROPIC_MESSAGES_URL = "https://api.anthropic.com/v1/messages";
-const DEFAULT_SCORE_MODEL = "claude-sonnet-4-20250514";
+const DEFAULT_SCORE_MODEL = "claude-sonnet-4-6";
 const MAX_TRANSCRIPT_CHARS = 180_000;
 
 export interface LlmCriterionJudgment {

diff --git a/docs/agent-evaluation/src/score-eval.ts b/docs/agent-evaluation/src/score-eval.ts
@@ -60,7 +60,7 @@ Score an eval transcript.
   npm run score -- --llm [--write]      # Anthropic judge (needs ANTHROPIC_API_KEY)
   npm run score -- --llm --no-heuristic # LLM only (no regex heuristic)
 
-Heuristic: src/score-transcript.ts. LLM: reads scenarios/*.md Success criteria + assistant text; model from EVAL_SCORE_MODEL (default claude-sonnet-4-20250514).
+Heuristic: src/score-transcript.ts. LLM: reads scenarios/*.md Success criteria + assistant text; model from EVAL_SCORE_MODEL (default claude-sonnet-4-6).
 
 Options:
   --run <path>      transcript.json, a run directory, or legacy flat *-scenario-NN.json