Scalable Eval Approaches
scalable-eval-approaches · approach · Path: /knowledge-base/responses/scalable-eval-approaches/
Entity ID (EID): E440
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "scalable-eval-approaches",
"numericId": null,
"path": "/knowledge-base/responses/scalable-eval-approaches/",
"filePath": "knowledge-base/responses/scalable-eval-approaches.mdx",
"title": "Scalable Eval Approaches",
"quality": 65,
"readerImportance": 40,
"researchImportance": 31.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Survey of practical approaches for scaling AI evaluation. LLM-as-judge has reached ~40% production adoption with 80%+ human agreement, but Dorner et al. (ICLR 2025 oral) proved a theoretical ceiling: at best 2x sample efficiency at the frontier (tau_max <= 2), meaning judges cannot meaningfully replace human evaluation of models stronger than themselves. Anthropic's Bloom framework (Dec 2025) generates automated behavioral evals achieving 0.86 Spearman correlation across 16 models. METR's Time Horizon 1.1 (Jan 2026) shows Opus 4.5 at 4h49m (highest), GPT-5 at 3h34m, with capability doubling every ~131 days across 14 models. Chain-of-thought monitoring (OpenAI, Dec 2025) achieves near-perfect recall for detecting reward hacking but is fragile—penalizing 'bad thoughts' produces obfuscated reward hacking. UK AISI sandbagging auditing games found black-box detection methods had 'very little success'; white-box methods were more promising but 'fragile.' Debate-based evaluation (ICML 2024 Best Paper) achieves 76-88% accuracy, moving from theoretical to practical. Petri 2.0 (Jan 2026) achieves 47.3% reduction in eval awareness via realism classifier. Despite these advances, the third-party audit ecosystem (METR + Apollo Research) remains severely capacity-constrained relative to frontier lab development.",
"description": "Practical approaches for scaling AI evaluation to keep pace with capability growth, including LLM-as-judge (40% production adoption but theoretically capped at 2x sample efficiency per ICLR 2025), automated behavioral evals (Anthropic Bloom, Spearman 0.86), AI-assisted red teaming (Petri 2.0 with 47.3% eval-awareness reduction), CoT monitoring (near-perfect recall but vulnerable to obfuscated reward hacking), METR Time Horizon (Opus 4.5 at 4h49m, doubling every ~131 days), sandbagging detection (UK AISI auditing games: black-box methods 'very little success'), and debate-based evaluation (ICML 2024 Best Paper: 76-88% accuracy). Third-party audit ecosystem remains severely capacity-constrained.",
"ratings": {
"focus": 7,
"novelty": 7,
"rigor": 7,
"completeness": 7,
"objectivity": 7,
"concreteness": 8,
"actionability": 6
},
"category": "responses",
"subcategory": "alignment-evaluation",
"clusters": [
"ai-safety",
"governance"
],
"metrics": {
"wordCount": 3459,
"tableCount": 12,
"diagramCount": 1,
"internalLinks": 21,
"externalLinks": 24,
"footnoteCount": 0,
"bulletRatio": 0.19,
"sectionCount": 64,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 3459,
"unconvertedLinks": [
{
"text": "Anthropic: Bloom",
"url": "https://alignment.anthropic.com/2025/bloom-auto-evals/",
"resourceId": "7fa7d4cb797a5edd",
"resourceTitle": "Bloom: Automated Behavioral Evaluations"
},
{
"text": "UK AISI: Inspect Framework",
"url": "https://inspect.aisi.org.uk/",
"resourceId": "fc3078f3c2ba5ebb",
"resourceTitle": "UK AI Safety Institute's Inspect framework"
},
{
"text": "METR: Measuring Long Tasks (March 2025)",
"url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
"resourceId": "271fc5f73a8304b2",
"resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
},
{
"text": "Anthropic-OpenAI Safety Evaluation Pilot",
"url": "https://alignment.anthropic.com/2025/openai-findings/",
"resourceId": "2fdf91febf06daaf",
"resourceTitle": "Anthropic-OpenAI joint evaluation"
},
{
"text": "OpenAI: Chain of Thought Monitorability (arXiv 2507.11473)",
"url": "https://arxiv.org/abs/2507.11473",
"resourceId": "e2a66d86361bb628",
"resourceTitle": "Recent multi-lab research"
},
{
"text": "OpenAI: Detecting Misbehavior in Frontier Reasoning Models",
"url": "https://openai.com/index/chain-of-thought-monitoring/",
"resourceId": "d4700c15258393ad",
"resourceTitle": "OpenAI CoT Monitoring"
},
{
"text": "METR: GPT-5 Evaluation Report",
"url": "https://evaluations.metr.org/gpt-5-report/",
"resourceId": "7457262d461e2206",
"resourceTitle": "evaluations.metr.org"
}
],
"unconvertedLinkCount": 7,
"convertedLinkCount": 0,
"backlinkCount": 3,
"hallucinationRisk": {
"level": "low",
"score": 30,
"factors": [
"no-citations",
"high-rigor",
"conceptual-content"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 20,
"similarPages": [
{
"id": "eval-saturation",
"title": "Eval Saturation & The Evals Gap",
"path": "/knowledge-base/responses/eval-saturation/",
"similarity": 20
},
{
"id": "agentic-ai",
"title": "Agentic AI",
"path": "/knowledge-base/capabilities/agentic-ai/",
"similarity": 18
},
{
"id": "reasoning",
"title": "Reasoning and Planning",
"path": "/knowledge-base/capabilities/reasoning/",
"similarity": 18
},
{
"id": "interpretability",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/interpretability/",
"similarity": 18
},
{
"id": "scalable-oversight",
"title": "Scalable Oversight",
"path": "/knowledge-base/responses/scalable-oversight/",
"similarity": 18
}
]
},
"coverage": {
"passing": 6,
"total": 13,
"targets": {
"tables": 14,
"diagrams": 1,
"internalLinks": 28,
"externalLinks": 17,
"footnotes": 10,
"references": 10
},
"actuals": {
"tables": 12,
"diagrams": 1,
"internalLinks": 21,
"externalLinks": 24,
"footnotes": 0,
"references": 7,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "amber",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:7 R:7 A:6 C:7"
},
"readerRank": 378,
"researchRank": 406,
"recommendedScore": 171.86
}
External Links
No external links
Backlinks (3)
| id | title | type | relationship |
|---|---|---|---|
| alignment-evaluation-overview | Evaluation & Detection (Overview) | concept | — |
| eval-saturation | Eval Saturation & The Evals Gap | approach | — |
| evaluation-awareness | Evaluation Awareness | approach | — |