AI Safety via Debate

id: debate | entityType: approach | Path: /knowledge-base/responses/debate/
Entity ID (EID): E482

Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
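The record below is assembled at build time from three sources. A minimal sketch of what such a merge step could look like (the type, function names, and precedence order are illustrative assumptions, not this site's actual build code):

```ts
// Hypothetical sketch of the build-time merge described above. All names
// and the precedence order are assumptions for illustration only.

interface PageRecord {
  id: string;
  path: string;
  title: string;
  [key: string]: unknown; // remaining merged fields (ratings, metrics, ...)
}

function buildPageRecord(
  frontmatter: Record<string, unknown>, // parsed from the page's .mdx frontmatter
  entityYaml: Record<string, unknown>,  // parsed from the Entity YAML (e.g. the EID)
  computed: Record<string, unknown>     // word counts, link counts, ranks, coverage
): PageRecord {
  // Later spreads win: computed metrics are derived last at build time, so
  // they may override any hand-written frontmatter or YAML field.
  return { ...frontmatter, ...entityYaml, ...computed } as PageRecord;
}
```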
{
  "id": "debate",
  "numericId": null,
  "path": "/knowledge-base/responses/debate/",
  "filePath": "knowledge-base/responses/debate.mdx",
  "title": "AI Safety via Debate",
  "quality": 70,
  "readerImportance": 71,
  "researchImportance": 34,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "AI Safety via Debate uses adversarial AI systems arguing opposing positions to enable human oversight of superhuman AI. Recent empirical work shows promising results - debate achieves 88% human accuracy vs 60% baseline (Khan et al. 2024), and outperforms consultancy when weak LLMs judge strong LLMs (NeurIPS 2024). Active research at Anthropic, DeepMind, and OpenAI. Key open questions remain about truth advantage at superhuman capability levels and judge robustness against manipulation.",
  "description": "AI Safety via Debate proposes using adversarial AI systems to argue opposing positions while humans judge, designed to scale alignment to superhuman capabilities. While theoretically promising and specifically designed to address RLHF's scalability limitations, it remains experimental with limited empirical validation.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 5,
    "actionability": 4,
    "completeness": 6.5
  },
  "category": "responses",
  "subcategory": "alignment-theoretical",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 1676,
    "tableCount": 15,
    "diagramCount": 1,
    "internalLinks": 9,
    "externalLinks": 16,
    "footnoteCount": 0,
    "bulletRatio": 0.11,
    "sectionCount": 25,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 1676,
  "unconvertedLinks": [
    {
      "text": "Geoffrey Irving and colleagues at OpenAI in 2018",
      "url": "https://arxiv.org/abs/1805.00899",
      "resourceId": "61da2f8e311a2bbf",
      "resourceTitle": "Debate as Scalable Oversight"
    },
    {
      "text": "DeepMind research presented at NeurIPS 2024",
      "url": "https://arxiv.org/abs/2407.04622",
      "resourceId": "fe73170e9d8be64f",
      "resourceTitle": "Debate"
    },
    {
      "text": "arXiv:2407.04622",
      "url": "https://arxiv.org/abs/2407.04622",
      "resourceId": "fe73170e9d8be64f",
      "resourceTitle": "Debate"
    },
    {
      "text": "AI Safety via Debate",
      "url": "https://arxiv.org/abs/1805.00899",
      "resourceId": "61da2f8e311a2bbf",
      "resourceTitle": "Debate as Scalable Oversight"
    },
    {
      "text": "On Scalable Oversight with Weak LLMs Judging Strong LLMs",
      "url": "https://arxiv.org/abs/2407.04622",
      "resourceId": "fe73170e9d8be64f",
      "resourceTitle": "Debate"
    },
    {
      "text": "anthropic.com",
      "url": "https://www.anthropic.com/research/measuring-progress-on-scalable-oversight-for-large-language-models",
      "resourceId": "72d83671b5f929a1",
      "resourceTitle": "Anthropic's research program"
    },
    {
      "text": "Medium",
      "url": "https://deepmindsafetyresearch.medium.com/agi-safety-and-alignment-at-google-deepmind-a-summary-of-recent-work-8e600aca582a",
      "resourceId": "6374381b5ec386d1",
      "resourceTitle": "AGI Safety & Alignment team"
    }
  ],
  "unconvertedLinkCount": 7,
  "convertedLinkCount": 0,
  "backlinkCount": 5,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 16,
    "similarPages": [
      {
        "id": "weak-to-strong",
        "title": "Weak-to-Strong Generalization",
        "path": "/knowledge-base/responses/weak-to-strong/",
        "similarity": 16
      },
      {
        "id": "process-supervision",
        "title": "Process Supervision",
        "path": "/knowledge-base/responses/process-supervision/",
        "similarity": 15
      },
      {
        "id": "cirl",
        "title": "Cooperative IRL (CIRL)",
        "path": "/knowledge-base/responses/cirl/",
        "similarity": 14
      },
      {
        "id": "ai-assisted",
        "title": "AI-Assisted Alignment",
        "path": "/knowledge-base/responses/ai-assisted/",
        "similarity": 13
      },
      {
        "id": "eliciting-latent-knowledge",
        "title": "Eliciting Latent Knowledge (ELK)",
        "path": "/knowledge-base/responses/eliciting-latent-knowledge/",
        "similarity": 13
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 7,
      "diagrams": 1,
      "internalLinks": 13,
      "externalLinks": 8,
      "footnotes": 5,
      "references": 5
    },
    "actuals": {
      "tables": 15,
      "diagrams": 1,
      "internalLinks": 9,
      "externalLinks": 16,
      "footnotes": 0,
      "references": 4,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.5 R:5 A:4 C:6.5"
  },
  "readerRank": 160,
  "researchRank": 387,
  "recommendedScore": 197.08
}

External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/debate-ai-safety-technique-1",
  "stampy": "https://aisafety.info/questions/8Jgr/What-is-AI-safety-via-debate",
  "alignmentForum": "https://www.alignmentforum.org/tag/debate-ai-safety-technique-1"
}

Backlinks (5)
| id | title | type | relationship |
|---|---|---|---|
| accident-risks | AI Accident Risk Cruxes | crux | — |
| paul-christiano | Paul Christiano | person | — |
| alignment-theoretical-overview | Theoretical Foundations (Overview) | concept | — |
| reward-modeling | Reward Modeling | approach | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
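The coverage block in the record above pairs numeric targets with actuals and reports a green/amber/red status per item. A simple threshold rule reproduces every status shown for the numeric items (a hypothetical reconstruction, not the site's confirmed logic):

```ts
// Hypothetical reconstruction of the status rule implied by the record's
// coverage.targets / coverage.actuals / coverage.items data; not confirmed
// as this site's actual build code.

type Status = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green"; // met or exceeded: tables 15/7, externalLinks 16/8
  if (actual > 0) return "amber";       // partial: internalLinks 9/13, references 4/5
  return "red";                         // absent: footnotes 0/5, quotes 0, accuracy 0
}

console.log(coverageStatus(15, 7));  // "green"  (tables)
console.log(coverageStatus(9, 13));  // "amber"  (internalLinks)
console.log(coverageStatus(0, 5));   // "red"    (footnotes)
```

Non-numeric items (llmSummary, schedule, entity, editHistory, overview) presumably rely on presence checks that the record does not expose.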