Eval Saturation & The Evals Gap
eval-saturation · approach · Path: /knowledge-base/responses/eval-saturation/
E437 — Entity ID (EID)
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "eval-saturation",
"numericId": null,
"path": "/knowledge-base/responses/eval-saturation/",
"filePath": "knowledge-base/responses/eval-saturation.mdx",
"title": "Eval Saturation & The Evals Gap",
"quality": 65,
"readerImportance": 22.5,
"researchImportance": 79,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Analysis of accelerating AI evaluation saturation, showing benchmarks intended to last years are being saturated in months (MMLU ~4 years, MMLU-Pro ~18 months, HLE ~12 months). A 2022 Nature Communications study of 3,765 benchmarks found a 'large fraction quickly trends towards near-saturation.' Safety-critical evaluations face the same dynamic: Anthropic reports Opus 4.6 saturated most automated AI R&D, CBRN, and cyber evaluations; OpenAI cannot rule out High cyber capability for GPT-5.3-Codex. Epoch AI finds publicly described biorisk benchmarks 'essentially entirely saturated.' Apollo Research identifies an 'evals gap' where evaluation quality/quantity required for safety claims outpaces available evals, while evaluation awareness (58% in Claude Sonnet 4.5) and unfaithful chain-of-thought reasoning (25% faithfulness in Claude 3.7 Sonnet) create compounding challenges. The International AI Safety Report 2026 formally identifies the 'evaluation gap' as a central finding. Counter-arguments include LLM-as-judge scaling, adversarial benchmark resurrection, and adaptive evaluation approaches, but time-to-saturation is shrinking and domain-specific safety evals are inherently harder to create than academic benchmarks.",
"description": "Benchmark saturation is accelerating—MMLU lasted 4 years, MMLU-Pro 18 months, HLE roughly 12 months—while safety-critical evaluations for CBRN, cyber, and AI R&D capabilities are losing signal at frontier labs. The core question is whether evaluation development can keep pace with capability growth, or whether the evaluation-based governance frameworks underpinning responsible scaling policies are structurally undermined.",
"ratings": {
"novelty": 7,
"rigor": 7,
"completeness": 7,
"concreteness": 8,
"focus": 8
},
"category": "responses",
"subcategory": "alignment-evaluation",
"clusters": [
"ai-safety",
"governance"
],
"metrics": {
"wordCount": 4568,
"tableCount": 16,
"diagramCount": 1,
"internalLinks": 19,
"externalLinks": 40,
"footnoteCount": 0,
"bulletRatio": 0.19,
"sectionCount": 39,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 4568,
"unconvertedLinks": [
{
"text": "Frontier AI Trends Report",
"url": "https://www.aisi.gov.uk/frontier-ai-trends-report",
"resourceId": "7042c7f8de04ccb1",
"resourceTitle": "AISI Frontier AI Trends"
},
{
"text": "METR: GPT-5 Evaluation Report",
"url": "https://evaluations.metr.org/gpt-5-report/",
"resourceId": "7457262d461e2206",
"resourceTitle": "evaluations.metr.org"
},
{
"text": "Anthropic: Claude Opus 4.6 Announcement",
"url": "https://www.anthropic.com/news/claude-opus-4-6",
"resourceId": "6c20810f6d90adf9"
},
{
"text": "Anthropic: Reflections on RSP",
"url": "https://www.anthropic.com/news/reflections-on-our-responsible-scaling-policy",
"resourceId": "a8bbfa34e7210ac2",
"resourceTitle": "Anthropic acknowledged"
},
{
"text": "Anthropic: RSP v2.2",
"url": "https://www.anthropic.com/responsible-scaling-policy",
"resourceId": "afe1e125f3ba3f14"
},
{
"text": "OpenAI: Preparedness Framework v2",
"url": "https://cdn.openai.com/pdf/18a02b5d-6b67-4cec-ab64-68cdfbddebcd/preparedness-framework-v2.pdf",
"resourceId": "ec5d8e7d6a1b2c7c",
"resourceTitle": "OpenAI: Preparedness Framework Version 2"
},
{
"text": "OpenAI: Detecting Misbehavior in Frontier Reasoning Models",
"url": "https://openai.com/index/chain-of-thought-monitoring/",
"resourceId": "d4700c15258393ad",
"resourceTitle": "OpenAI CoT Monitoring"
},
{
"text": "SaferAI: Anthropic RSP Critique",
"url": "https://www.safer-ai.org/anthropics-responsible-scaling-policy-update-makes-a-step-backwards",
"resourceId": "a5e4c7b49f5d3e1b",
"resourceTitle": "SaferAI has argued"
},
{
"text": "METR: Common Elements of Frontier AI Safety Policies",
"url": "https://metr.org/common-elements",
"resourceId": "30b9f5e826260d9d",
"resourceTitle": "METR: Common Elements of Frontier AI Safety Policies"
},
{
"text": "ARC Prize: 2025 Results",
"url": "https://arcprize.org/blog/arc-prize-2025-results-analysis",
"resourceId": "f369a16dd38155b8",
"resourceTitle": "ARC Prize 2024-2025 results"
},
{
"text": "WebArena",
"url": "https://webarena.dev/",
"resourceId": "c2614357fa198ba4",
"resourceTitle": "WebArena"
},
{
"text": "OSWorld",
"url": "https://os-world.github.io/",
"resourceId": "c819ef71cbf34802",
"resourceTitle": "OSWorld"
}
],
"unconvertedLinkCount": 12,
"convertedLinkCount": 0,
"backlinkCount": 3,
"hallucinationRisk": {
"level": "low",
"score": 30,
"factors": [
"no-citations",
"high-rigor",
"conceptual-content"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 20,
"similarPages": [
{
"id": "scalable-eval-approaches",
"title": "Scalable Eval Approaches",
"path": "/knowledge-base/responses/scalable-eval-approaches/",
"similarity": 20
},
{
"id": "reasoning",
"title": "Reasoning and Planning",
"path": "/knowledge-base/capabilities/reasoning/",
"similarity": 18
},
{
"id": "metr",
"title": "METR",
"path": "/knowledge-base/organizations/metr/",
"similarity": 18
},
{
"id": "self-improvement",
"title": "Self-Improvement and Recursive Enhancement",
"path": "/knowledge-base/capabilities/self-improvement/",
"similarity": 17
},
{
"id": "situational-awareness",
"title": "Situational Awareness",
"path": "/knowledge-base/capabilities/situational-awareness/",
"similarity": 17
}
]
},
"coverage": {
"passing": 5,
"total": 13,
"targets": {
"tables": 18,
"diagrams": 2,
"internalLinks": 37,
"externalLinks": 23,
"footnotes": 14,
"references": 14
},
"actuals": {
"tables": 16,
"diagrams": 1,
"internalLinks": 19,
"externalLinks": 40,
"footnotes": 0,
"references": 12,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "amber",
"diagrams": "amber",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:7 R:7 C:7"
},
"readerRank": 506,
"researchRank": 98,
"recommendedScore": 163.11
}
External Links
No external links
Backlinks (3)
| id | title | type | relationship |
|---|---|---|---|
| evaluation-awareness | Evaluation Awareness | approach | — |
| scalable-eval-approaches | Scalable Eval Approaches | approach | — |
| alignment-evaluation-overview | Evaluation & Detection (Overview) | concept | — |