Longterm Wiki

Why Alignment Might Be Hard

why-alignment-hard · argument · Path: /knowledge-base/debates/why-alignment-hard/
Entity ID (EID): E373
8 backlinks · Quality: 69 · Updated: 2026-03-13
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time. (Illustrative sketches of this merge and of the coverage-status mapping follow the record below.)
{
  "id": "why-alignment-hard",
  "numericId": null,
  "path": "/knowledge-base/debates/why-alignment-hard/",
  "filePath": "knowledge-base/debates/why-alignment-hard.mdx",
  "title": "Why Alignment Might Be Hard",
  "quality": 69,
  "readerImportance": 72.5,
  "researchImportance": 94,
  "tacticalValue": 52,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "A comprehensive taxonomy of alignment difficulty arguments spanning specification problems, inner alignment failures, verification limits, and adversarial dynamics, with expert p(doom) estimates ranging from 5-15% (ML researchers) to ~95% (Yudkowsky/MIRI) and empirical evidence including Sleeper Agents, misalignment generalization, and reward hacking cases. The page synthesizes existing literature well but offers little original analysis, and policy implications remain descriptive rather than actionable.",
  "description": "AI alignment faces fundamental challenges: specification problems (value complexity, Goodhart's Law), inner alignment failures (mesa-optimization, deceptive alignment), and verification difficulties. Expert estimates of alignment failure probability range from 10-20% (Paul Christiano) to 95%+ (Eliezer Yudkowsky), with empirical research demonstrating persistent deceptive behaviors in current models.",
  "ratings": {
    "focus": 8.5,
    "novelty": 4.5,
    "rigor": 6.5,
    "completeness": 8,
    "concreteness": 6.5,
    "actionability": 4.5,
    "objectivity": 7
  },
  "category": "debates",
  "subcategory": "formal-arguments",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 9280,
    "tableCount": 4,
    "diagramCount": 1,
    "internalLinks": 86,
    "externalLinks": 13,
    "footnoteCount": 3,
    "bulletRatio": 0.43,
    "sectionCount": 61,
    "hasOverview": false,
    "structuralScore": 13
  },
  "suggestedQuality": 87,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 9280,
  "unconvertedLinks": [
    {
      "text": "Faulty Reward Functions in the Wild",
      "url": "https://openai.com/index/faulty-reward-functions/",
      "resourceId": "b5d44bf4a1e9b96a",
      "resourceTitle": "CoastRunners boat"
    },
    {
      "text": "Deep Reinforcement Learning from Human Preferences",
      "url": "https://arxiv.org/abs/1706.03741",
      "resourceId": "14df73723b4d14d7",
      "resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
    },
    {
      "text": "From Hard Refusals to Safe-Completions",
      "url": "https://arxiv.org/abs/2508.09224",
      "resourceId": "abf5f3cc624bed21",
      "resourceTitle": "From Hard Refusals to Safe-Completions: Toward Output-Centric Safety Training"
    },
    {
      "text": "Collective Constitutional AI",
      "url": "https://arxiv.org/html/2406.07814v1",
      "resourceId": "d82c1768cf5e080b",
      "resourceTitle": "Collective Constitutional AI: Aligning a Language Model with Public Input"
    },
    {
      "text": "Collective Alignment: Public Input on Our Model Spec",
      "url": "https://openai.com/index/collective-alignment-aug-2025-updates/",
      "resourceId": "75b66340eb2fadc2",
      "resourceTitle": "Collective Alignment: Public Input on Our Model Spec"
    },
    {
      "text": "Detecting and Mitigating Reward Hacking in Reinforcement Learning Systems",
      "url": "https://arxiv.org/html/2507.05619v1",
      "resourceId": "8e361fe6e472c82d",
      "resourceTitle": "Detecting and Mitigating Reward Hacking in Reinforcement Learning Systems: A Comprehensive Empirical Study"
    },
    {
      "text": "Toward Understanding and Preventing Misalignment Generalization",
      "url": "https://openai.com/index/emergent-misalignment/",
      "resourceId": "ccac2622760fd6c8",
      "resourceTitle": "Toward Understanding and Preventing Misalignment Generalization"
    },
    {
      "text": "Goal Misgeneralization in Deep Reinforcement Learning",
      "url": "https://proceedings.mlr.press/v162/langosco22a/langosco22a.pdf",
      "resourceId": "5227fd17cb52cb88",
      "resourceTitle": "Goal Misgeneralization in Deep Reinforcement Learning"
    },
    {
      "text": "Debating with More Persuasive LLMs Leads to More Truthful Answers",
      "url": "https://gist.github.com/bigsnarfdude/a95dbb3f8b560edd352665071ddf7312",
      "resourceId": "6e157f79186d4c37",
      "resourceTitle": "Debating with More Persuasive LLMs Leads to More Truthful Answers"
    },
    {
      "text": "On Scalable Oversight with Weak LLMs Judging Strong LLMs",
      "url": "https://proceedings.neurips.cc/paper_files/paper/2024/hash/899511e37a8e01e1bd6f6f1d377cc250-Abstract-Conference.html",
      "resourceId": "94d975b239bb0cad",
      "resourceTitle": "On Scalable Oversight with Weak LLMs Judging Strong LLMs"
    },
    {
      "text": "Scaling Laws for Scalable Oversight",
      "url": "https://arxiv.org/html/2504.18530v2",
      "resourceId": "08c92819cc0fc2dd",
      "resourceTitle": "Scaling Laws For Scalable Oversight"
    }
  ],
  "unconvertedLinkCount": 11,
  "convertedLinkCount": 28,
  "backlinkCount": 8,
  "citationHealth": {
    "total": 14,
    "withQuotes": 9,
    "verified": 8,
    "accuracyChecked": 8,
    "accurate": 6,
    "inaccurate": 1,
    "avgScore": 0.9021164021558232
  },
  "hallucinationRisk": {
    "level": "high",
    "score": 70,
    "factors": [
      "low-citation-density",
      "conceptual-content",
      "severe-truncation"
    ],
    "integrityIssues": [
      "severe-truncation"
    ]
  },
  "entityType": "argument",
  "redundancy": {
    "maxSimilarity": 23,
    "similarPages": [
      {
        "id": "reward-hacking-taxonomy",
        "title": "Reward Hacking Taxonomy and Severity Model",
        "path": "/knowledge-base/models/reward-hacking-taxonomy/",
        "similarity": 23
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 23
      },
      {
        "id": "agentic-ai",
        "title": "Agentic AI",
        "path": "/knowledge-base/capabilities/agentic-ai/",
        "similarity": 22
      },
      {
        "id": "language-models",
        "title": "Large Language Models",
        "path": "/knowledge-base/capabilities/language-models/",
        "similarity": 22
      },
      {
        "id": "case-for-xrisk",
        "title": "The Case FOR AI Existential Risk",
        "path": "/knowledge-base/debates/case-for-xrisk/",
        "similarity": 22
      }
    ]
  },
  "changeHistory": [
    {
      "date": "2026-03-13",
      "branch": "auto-update/2026-03-13",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (545.8s). Quality score: 71. Issues resolved: Frontmatter 'lastEdited' date '2026-03-13' is a future date,; Mermaid chart contains EntityLink JSX components inside a te; Page is truncated: the KeyQuestions component is cut off mid.",
      "duration": "545.8s",
      "cost": "$5-8"
    },
    {
      "date": "2026-03-10",
      "branch": "auto-update/2026-03-10",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (532.7s). Quality score: 74. Issues resolved: Truncated content: The page ends mid-sentence at '...does no; Bare URL in Expert Estimates table: '[OpenAI: Our approach t; Bare URL in 'Views Among Researchers' section: OpenAI perspe.",
      "duration": "532.7s",
      "cost": "$5-8"
    },
    {
      "date": "2026-03-09",
      "branch": "auto-update/2026-03-09",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (559.2s). Quality score: 71. Issues resolved: Content is truncated — the KeyQuestions component and closin; Frontmatter field 'lastEdited' uses a future date ('2026-03-; Mermaid chart contains JSX EntityLink components inside a te.",
      "duration": "559.2s",
      "cost": "$5-8"
    },
    {
      "date": "2026-03-07",
      "branch": "auto-update/2026-03-07",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (572.6s). Quality score: 74. Issues resolved: Footnote references (e.g., [^rc-0423], [^rc-d92f], [^rc-adec; The Key Sources section at the end is incomplete — it refere; The Mermaid diagram uses an EntityLink component inside the .",
      "duration": "572.6s",
      "cost": "$5-8"
    },
    {
      "date": "2026-02-21",
      "branch": "",
      "title": "Test session",
      "summary": "Testing session API",
      "model": "claude-opus-4-6"
    },
    {
      "date": "2026-02-18",
      "branch": "claude/fix-issue-240-N5irU",
      "title": "Surface tacticalValue in /wiki table and score 53 pages",
      "summary": "Added `tacticalValue` to `ExploreItem` interface, `getExploreItems()` mappings, the `/wiki` explore table (new sortable \"Tact.\" column), and the card view sort dropdown. Scored 49 new pages with tactical values (4 were already scored), bringing total to 53.",
      "model": "sonnet-4",
      "duration": "~30min"
    }
  ],
  "coverage": {
    "passing": 5,
    "total": 13,
    "targets": {
      "tables": 37,
      "diagrams": 4,
      "internalLinks": 74,
      "externalLinks": 46,
      "footnotes": 28,
      "references": 28
    },
    "actuals": {
      "tables": 4,
      "diagrams": 1,
      "internalLinks": 86,
      "externalLinks": 13,
      "footnotes": 3,
      "references": 27,
      "quotesWithQuotes": 9,
      "quotesTotal": 14,
      "accuracyChecked": 8,
      "accuracyTotal": 14
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "green",
      "overview": "red",
      "tables": "amber",
      "diagrams": "amber",
      "internalLinks": "green",
      "externalLinks": "amber",
      "footnotes": "amber",
      "references": "amber",
      "quotes": "amber",
      "accuracy": "amber"
    },
    "editHistoryCount": 6,
    "ratingsString": "N:4.5 R:6.5 A:4.5 C:8"
  },
  "readerRank": 140,
  "researchRank": 9,
  "recommendedScore": 196.11
}
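
The note above says this record is merged from MDX frontmatter, an Entity YAML file, and computed metrics at build time. Below is a minimal TypeScript sketch of that merge, assuming simple spread-based precedence; all function and parameter names here are illustrative assumptions, since the wiki's real build code is not shown on this page.

```typescript
// Illustrative sketch only: names and precedence are assumptions,
// not the wiki's actual build API.
interface PageRecord {
  id: string;
  path: string;
  title: string;
  quality: number;
  metrics: Record<string, number | boolean>;
  [key: string]: unknown;
}

function buildPageRecord(
  entityYaml: Record<string, unknown>,      // per-entity metadata (entityType, EID, ...)
  frontmatter: Record<string, unknown>,     // parsed from knowledge-base/.../why-alignment-hard.mdx
  computedMetrics: Record<string, number | boolean>, // wordCount, internalLinks, bulletRatio, ...
): PageRecord {
  // Later spreads win on key collisions, so hand-written frontmatter
  // overrides entity defaults; computed metrics are attached whole.
  return {
    ...entityYaml,
    ...frontmatter,
    metrics: computedMetrics,
  } as PageRecord;
}

// Looking a record up, assuming database.json is an array of such records:
// import db from "./database.json";
// const page = (db as PageRecord[]).find((p) => p.id === "why-alignment-hard");
```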
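The coverage block maps each target/actual pair to a green/amber/red status. The thresholds are not stated on this page; the rule below is inferred from this one record (internalLinks 86 ≥ 74 is green, tables 4 of 37 is amber, the absent overview is red) and should be read as a guess rather than the site's actual logic.

```typescript
type Status = "green" | "amber" | "red";

// Inferred rule, consistent with this record but unverified:
// meet the target -> green; partial progress -> amber; nothing at all -> red.
function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual > 0) return "amber";
  return "red";
}

console.log(coverageStatus(86, 74)); // "green" (internalLinks)
console.log(coverageStatus(4, 37));  // "amber" (tables)
console.log(coverageStatus(0, 1));   // "red"   (overview absent: hasOverview is false)
```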
External Links

No external links

Backlinks (8)
| id | title | type | relationship |
|---|---|---|---|
| case-for-xrisk | The Case FOR AI Existential Risk | argument | |
| __index__/knowledge-base/debates | Key Debates | concept | |
| why-alignment-easy | Why Alignment Might Be Easy | argument | |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | analysis | |
| conjecture | Conjecture | organization | |
| holden-karnofsky | Holden Karnofsky | person | |
| alignment | AI Alignment | approach | |
| red-teaming | Red Teaming | approach | |