AI Safety via Debate

id: debate | entityType: approach | Path: /knowledge-base/responses/debate/
Entity ID (EID): E482

Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
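The record below is assembled at build time from three sources. A minimal sketch of what such a merge step could look like (the type, function names, and precedence order are illustrative assumptions, not this site's actual build code):

```ts
// Hypothetical sketch of the build-time merge described above. All names
// and the precedence order are assumptions for illustration only.

interface PageRecord {
  id: string;
  path: string;
  title: string;
  [key: string]: unknown; // remaining merged fields (ratings, metrics, ...)
}

function buildPageRecord(
  frontmatter: Record<string, unknown>, // parsed from the page's .mdx frontmatter
  entityYaml: Record<string, unknown>,  // parsed from the Entity YAML (e.g. the EID)
  computed: Record<string, unknown>     // word counts, link counts, ranks, coverage
): PageRecord {
  // Later spreads win: computed metrics are derived last at build time, so
  // they may override any hand-written frontmatter or YAML field.
  return { ...frontmatter, ...entityYaml, ...computed } as PageRecord;
}
```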
{
  "id": "debate",
  "numericId": null,
  "path": "/knowledge-base/responses/debate/",
  "filePath": "knowledge-base/responses/debate.mdx",
  "title": "AI Safety via Debate",
  "quality": 70,
  "readerImportance": 71,
  "researchImportance": 34,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "AI Safety via Debate uses adversarial AI systems arguing opposing positions to enable human oversight of superhuman AI. Recent empirical work shows promising results - debate achieves 88% human accuracy vs 60% baseline (Khan et al. 2024), and outperforms consultancy when weak LLMs judge strong LLMs (NeurIPS 2024). Active research at Anthropic, DeepMind, and OpenAI. Key open questions remain about truth advantage at superhuman capability levels and judge robustness against manipulation.",
  "description": "AI Safety via Debate proposes using adversarial AI systems to argue opposing positions while humans judge, designed to scale alignment to superhuman capabilities. While theoretically promising and specifically designed to address RLHF's scalability limitations, it remains experimental with limited empirical validation.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 5,
    "actionability": 4,
    "completeness": 6.5
  },
  "category": "responses",
  "subcategory": "alignment-theoretical",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 1676,
    "tableCount": 15,
    "diagramCount": 1,
    "internalLinks": 9,
    "externalLinks": 16,
    "footnoteCount": 0,
    "bulletRatio": 0.11,
    "sectionCount": 25,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 1676,
  "unconvertedLinks": [
    {
      "text": "Geoffrey Irving and colleagues at OpenAI in 2018",
      "url": "https://arxiv.org/abs/1805.00899",
      "resourceId": "61da2f8e311a2bbf",
      "resourceTitle": "Debate as Scalable Oversight"
    },
    {
      "text": "DeepMind research presented at NeurIPS 2024",
      "url": "https://arxiv.org/abs/2407.04622",
      "resourceId": "fe73170e9d8be64f",
      "resourceTitle": "Debate"
    },
    {
      "text": "arXiv:2407.04622",
      "url": "https://arxiv.org/abs/2407.04622",
      "resourceId": "fe73170e9d8be64f",
      "resourceTitle": "Debate"
    },
    {
      "text": "AI Safety via Debate",
      "url": "https://arxiv.org/abs/1805.00899",
      "resourceId": "61da2f8e311a2bbf",
      "resourceTitle": "Debate as Scalable Oversight"
    },
    {
      "text": "On Scalable Oversight with Weak LLMs Judging Strong LLMs",
      "url": "https://arxiv.org/abs/2407.04622",
      "resourceId": "fe73170e9d8be64f",
      "resourceTitle": "Debate"
    },
    {
      "text": "anthropic.com",
      "url": "https://www.anthropic.com/research/measuring-progress-on-scalable-oversight-for-large-language-models",
      "resourceId": "72d83671b5f929a1",
      "resourceTitle": "Anthropic's research program"
    },
    {
      "text": "Medium",
      "url": "https://deepmindsafetyresearch.medium.com/agi-safety-and-alignment-at-google-deepmind-a-summary-of-recent-work-8e600aca582a",
      "resourceId": "6374381b5ec386d1",
      "resourceTitle": "AGI Safety & Alignment team"
    }
  ],
  "unconvertedLinkCount": 7,
  "convertedLinkCount": 0,
  "backlinkCount": 5,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 16,
    "similarPages": [
      {
        "id": "weak-to-strong",
        "title": "Weak-to-Strong Generalization",
        "path": "/knowledge-base/responses/weak-to-strong/",
        "similarity": 16
      },
      {
        "id": "process-supervision",
        "title": "Process Supervision",
        "path": "/knowledge-base/responses/process-supervision/",
        "similarity": 15
      },
      {
        "id": "cirl",
        "title": "Cooperative IRL (CIRL)",
        "path": "/knowledge-base/responses/cirl/",
        "similarity": 14
      },
      {
        "id": "ai-assisted",
        "title": "AI-Assisted Alignment",
        "path": "/knowledge-base/responses/ai-assisted/",
        "similarity": 13
      },
      {
        "id": "eliciting-latent-knowledge",
        "title": "Eliciting Latent Knowledge (ELK)",
        "path": "/knowledge-base/responses/eliciting-latent-knowledge/",
        "similarity": 13
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 7,
      "diagrams": 1,
      "internalLinks": 13,
      "externalLinks": 8,
      "footnotes": 5,
      "references": 5
    },
    "actuals": {
      "tables": 15,
      "diagrams": 1,
      "internalLinks": 9,
      "externalLinks": 16,
      "footnotes": 0,
      "references": 4,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.5 R:5 A:4 C:6.5"
  },
  "readerRank": 160,
  "researchRank": 387,
  "recommendedScore": 197.08
}

External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/debate-ai-safety-technique-1",
  "stampy": "https://aisafety.info/questions/8Jgr/What-is-AI-safety-via-debate",
  "alignmentForum": "https://www.alignmentforum.org/tag/debate-ai-safety-technique-1"
}

Backlinks (5)
| id | title | type | relationship |
|---|---|---|---|
| accident-risks | AI Accident Risk Cruxes | crux | — |
| paul-christiano | Paul Christiano | person | — |
| alignment-theoretical-overview | Theoretical Foundations (Overview) | concept | — |
| reward-modeling | Reward Modeling | approach | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
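The coverage block in the record above pairs numeric targets with actuals and reports a green/amber/red status per item. A simple threshold rule reproduces every status shown for the numeric items (a hypothetical reconstruction, not the site's confirmed logic):

```ts
// Hypothetical reconstruction of the status rule implied by the record's
// coverage.targets / coverage.actuals / coverage.items data; not confirmed
// as this site's actual build code.

type Status = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green"; // met or exceeded: tables 15/7, externalLinks 16/8
  if (actual > 0) return "amber";       // partial: internalLinks 9/13, references 4/5
  return "red";                         // absent: footnotes 0/5, quotes 0, accuracy 0
}

console.log(coverageStatus(15, 7));  // "green"  (tables)
console.log(coverageStatus(9, 13));  // "amber"  (internalLinks)
console.log(coverageStatus(0, 5));   // "red"    (footnotes)
```

Non-numeric items (llmSummary, schedule, entity, editHistory, overview) presumably rely on presence checks that the record does not expose.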