Scalable Oversight
scalable-oversight (safety-agenda)
Path: /knowledge-base/responses/scalable-oversight/
Entity ID (EID): E271
Page Record — database.json, merged from MDX frontmatter + Entity YAML + computed metrics at build time
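As a rough illustration of that merge (not the site's actual code), the build step might look like the following TypeScript sketch. It assumes the common `gray-matter` and `yaml` parser packages; `buildPageRecord` and `computeMetrics` are hypothetical names.

```typescript
// Hypothetical sketch of the build-time merge described above; the real
// pipeline is not shown on this page, so every name here is illustrative.
import { readFileSync } from "node:fs";
import matter from "gray-matter"; // frontmatter parser (assumed dependency)
import { parse as parseYaml } from "yaml"; // YAML parser (assumed dependency)

type PageRecord = Record<string, unknown>;

// Placeholder for the computed-metrics step (word counts, link counts, ...).
function computeMetrics(mdxSource: string): Record<string, unknown> {
  return { wordCount: mdxSource.trim().split(/\s+/).length };
}

function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  const mdxSource = readFileSync(mdxPath, "utf8");

  // 1. MDX frontmatter: title, quality, summaries, ratings, ...
  const frontmatter = matter(mdxSource).data;

  // 2. Entity YAML: entityType, external links, ...
  const entity = parseYaml(readFileSync(entityYamlPath, "utf8")) as Record<string, unknown>;

  // 3. Computed metrics, attached under their own key.
  const metrics = computeMetrics(mdxSource);

  // Spread order mirrors the description above: later sources win on key collisions.
  return { ...frontmatter, ...entity, metrics };
}
```

The resulting record for this page: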
{
  "id": "scalable-oversight",
  "numericId": null,
  "path": "/knowledge-base/responses/scalable-oversight/",
  "filePath": "knowledge-base/responses/scalable-oversight.mdx",
  "title": "Scalable Oversight",
  "quality": 68,
  "readerImportance": 51.5,
  "researchImportance": 30,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Process supervision achieves 78.2% accuracy on MATH benchmarks (vs 72.4% outcome-based) and is deployed in OpenAI's o1 models, while debate shows 60-80% accuracy on factual questions with +4% improvement from self-play training. However, effectiveness against sophisticated deception remains unproven, with debate accuracy dropping to 50-65% on complex reasoning tasks.",
  "description": "Methods for supervising AI systems on tasks too complex for direct human evaluation, including debate, recursive reward modeling, and process supervision. Process supervision achieves 78.2% accuracy on MATH benchmarks (vs 72.4% outcome-based), while debate shows 60-80% accuracy on factual questions with +4% improvement from self-play training. Critical for maintaining oversight as AI capabilities exceed human expertise.",
  "ratings": {
    "novelty": 4.2,
    "rigor": 6.8,
    "actionability": 5.9,
    "completeness": 7.1
  },
  "category": "responses",
  "subcategory": "alignment-theoretical",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 5698,
    "tableCount": 2,
    "diagramCount": 1,
    "internalLinks": 42,
    "externalLinks": 0,
    "footnoteCount": 0,
    "bulletRatio": 0.1,
    "sectionCount": 38,
    "hasOverview": true,
    "structuralScore": 12
  },
  "suggestedQuality": 80,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 5698,
  "unconvertedLinks": [],
  "unconvertedLinkCount": 0,
  "convertedLinkCount": 31,
  "backlinkCount": 58,
  "hallucinationRisk": {
    "level": "medium",
    "score": 50,
    "factors": [
      "no-citations",
      "few-external-sources",
      "conceptual-content"
    ]
  },
  "entityType": "safety-agenda",
  "redundancy": {
    "maxSimilarity": 23,
    "similarPages": [
      {
        "id": "agentic-ai",
        "title": "Agentic AI",
        "path": "/knowledge-base/capabilities/agentic-ai/",
        "similarity": 23
      },
      {
        "id": "reasoning",
        "title": "Reasoning and Planning",
        "path": "/knowledge-base/capabilities/reasoning/",
        "similarity": 23
      },
      {
        "id": "why-alignment-hard",
        "title": "Why Alignment Might Be Hard",
        "path": "/knowledge-base/debates/why-alignment-hard/",
        "similarity": 23
      },
      {
        "id": "reward-hacking-taxonomy",
        "title": "Reward Hacking Taxonomy and Severity Model",
        "path": "/knowledge-base/models/reward-hacking-taxonomy/",
        "similarity": 23
      },
      {
        "id": "interpretability",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/interpretability/",
        "similarity": 22
      }
    ]
  },
  "coverage": {
    "passing": 5,
    "total": 13,
    "targets": {
      "tables": 23,
      "diagrams": 2,
      "internalLinks": 46,
      "externalLinks": 28,
      "footnotes": 17,
      "references": 17
    },
    "actuals": {
      "tables": 2,
      "diagrams": 1,
      "internalLinks": 42,
      "externalLinks": 0,
      "footnotes": 0,
      "references": 20,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "amber",
      "internalLinks": "amber",
      "externalLinks": "red",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.2 R:6.8 A:5.9 C:7.1"
  },
  "readerRank": 294,
  "researchRank": 417,
  "recommendedScore": 183.61
}
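The red/amber/green values under `coverage.items` appear to track actuals against targets. A minimal TypeScript sketch of one plausible rule, checked against the numbers in the record above (`coverageStatus` is a hypothetical name, not the site's actual code):

```typescript
// One plausible derivation of the statuses in "coverage.items": no progress
// toward a target reads as red, partial progress as amber, and meeting or
// exceeding the target as green. Names are illustrative.
type Status = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual > 0) return "amber";
  return "red";
}

// Values copied from the record's coverage.targets and coverage.actuals.
const targets = { tables: 23, diagrams: 2, internalLinks: 46, externalLinks: 28, footnotes: 17, references: 17 };
const actuals = { tables: 2, diagrams: 1, internalLinks: 42, externalLinks: 0, footnotes: 0, references: 20 };

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  console.log(key, coverageStatus(actuals[key], targets[key]));
}
// Prints: tables amber, diagrams amber, internalLinks amber,
// externalLinks red, footnotes red, references green,
// which matches the "items" map in the record above.
```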
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/scalable-oversight",
  "stampy": "https://aisafety.info/questions/8IHH/What-is-scalable-oversight",
  "alignmentForum": "https://www.alignmentforum.org/tag/scalable-oversight"
}

Backlinks (58)
| id | title | type | relationship |
|---|---|---|---|
| rlhf | RLHF | capability | — |
| capability-alignment-race | Capability-Alignment Race Model | analysis | — |
| technical-pathways | AI Safety Technical Pathway Decomposition | analysis | — |
| reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | mitigation |
| anthropic | Anthropic | organization | research |
| deepmind | Google DeepMind | organization | research |
| openai | OpenAI | organization | research |
| arc | ARC | organization | — |
| jan-leike | Jan Leike | person | — |
| paul-christiano | Paul Christiano | person | — |
| anthropic-core-views | Anthropic Core Views | safety-agenda | — |
| process-supervision | Process Supervision | approach | — |
| eliciting-latent-knowledge | Eliciting Latent Knowledge (ELK) | approach | — |
| debate | AI Safety via Debate | approach | — |
| multi-agent | Multi-Agent Safety | approach | — |
| deceptive-alignment | Deceptive Alignment | risk | — |
| reward-hacking | Reward Hacking | risk | — |
| sycophancy | Sycophancy | risk | — |
| agentic-ai | Agentic AI | capability | — |
| language-models | Large Language Models | capability | — |
| scientific-research | Scientific Research Capabilities | capability | — |
| solutions | AI Safety Solution Cruxes | crux | — |
| why-alignment-easy | Why Alignment Might Be Easy | argument | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| deep-learning-era | Deep Learning Revolution (2012-2020) | historical | — |
| miri-era | The MIRI Era (2000-2015) | historical | — |
| ai-timelines | AI Timelines | concept | — |
| alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| pre-tai-capital-deployment | Pre-TAI Capital Deployment: $100B-$300B+ Spending Analysis | analysis | — |
| safety-spending-at-scale | Safety Spending at Scale | analysis | — |
| short-timeline-policy-implications | Short Timeline Policy Implications | analysis | — |
| elicit | Elicit (AI Research Tool) | organization | — |
| lionheart-ventures | Lionheart Ventures | organization | — |
| mats | MATS ML Alignment Theory Scholars program | organization | — |
| metr | METR | organization | — |
| pause-ai | Pause AI | organization | — |
| chris-olah | Chris Olah | person | — |
| connor-leahy | Connor Leahy | person | — |
| dario-amodei | Dario Amodei | person | — |
| neel-nanda | Neel Nanda | person | — |
| agent-foundations | Agent Foundations | approach | — |
| ai-control | AI Control | safety-agenda | — |
| alignment-theoretical-overview | Theoretical Foundations (Overview) | concept | — |
| alignment | AI Alignment | approach | — |
| interpretability | Mechanistic Interpretability | safety-agenda | — |
| intervention-portfolio | AI Safety Intervention Portfolio | approach | — |
| preference-optimization | Preference Optimization Methods | approach | — |
| research-agendas | AI Alignment Research Agenda Comparison | crux | — |
| scalable-eval-approaches | Scalable Eval Approaches | approach | — |
| scheming-detection | Scheming & Deception Detection | approach | — |
| sleeper-agent-detection | Sleeper Agent Detection | approach | — |
| training-programs | AI Safety Training Programs | approach | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
| automation-bias | Automation Bias (AI Systems) | risk | — |
| existential-risk | Existential Risk from AI | concept | — |
| expertise-atrophy | AI-Induced Expertise Atrophy | risk | — |