Longterm Wiki

Why Alignment Might Be Hard

why-alignment-hard · argument · Path: /knowledge-base/debates/why-alignment-hard/
Entity ID (EID): E373
8 backlinks · Quality: 69 · Updated: 2026-03-13
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time. (Illustrative sketches of this merge and of the coverage-status mapping follow the record below.)
{
  "id": "why-alignment-hard",
  "numericId": null,
  "path": "/knowledge-base/debates/why-alignment-hard/",
  "filePath": "knowledge-base/debates/why-alignment-hard.mdx",
  "title": "Why Alignment Might Be Hard",
  "quality": 69,
  "readerImportance": 72.5,
  "researchImportance": 94,
  "tacticalValue": 52,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "A comprehensive taxonomy of alignment difficulty arguments spanning specification problems, inner alignment failures, verification limits, and adversarial dynamics, with expert p(doom) estimates ranging from 5-15% (ML researchers) to ~95% (Yudkowsky/MIRI) and empirical evidence including Sleeper Agents, misalignment generalization, and reward hacking cases. The page synthesizes existing literature well but offers little original analysis, and policy implications remain descriptive rather than actionable.",
  "description": "AI alignment faces fundamental challenges: specification problems (value complexity, Goodhart's Law), inner alignment failures (mesa-optimization, deceptive alignment), and verification difficulties. Expert estimates of alignment failure probability range from 10-20% (Paul Christiano) to 95%+ (Eliezer Yudkowsky), with empirical research demonstrating persistent deceptive behaviors in current models.",
  "ratings": {
    "focus": 8.5,
    "novelty": 4.5,
    "rigor": 6.5,
    "completeness": 8,
    "concreteness": 6.5,
    "actionability": 4.5,
    "objectivity": 7
  },
  "category": "debates",
  "subcategory": "formal-arguments",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 9280,
    "tableCount": 4,
    "diagramCount": 1,
    "internalLinks": 86,
    "externalLinks": 13,
    "footnoteCount": 3,
    "bulletRatio": 0.43,
    "sectionCount": 61,
    "hasOverview": false,
    "structuralScore": 13
  },
  "suggestedQuality": 87,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 9280,
  "unconvertedLinks": [
    {
      "text": "Faulty Reward Functions in the Wild",
      "url": "https://openai.com/index/faulty-reward-functions/",
      "resourceId": "b5d44bf4a1e9b96a",
      "resourceTitle": "CoastRunners boat"
    },
    {
      "text": "Deep Reinforcement Learning from Human Preferences",
      "url": "https://arxiv.org/abs/1706.03741",
      "resourceId": "14df73723b4d14d7",
      "resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
    },
    {
      "text": "From Hard Refusals to Safe-Completions",
      "url": "https://arxiv.org/abs/2508.09224",
      "resourceId": "abf5f3cc624bed21",
      "resourceTitle": "From Hard Refusals to Safe-Completions: Toward Output-Centric Safety Training"
    },
    {
      "text": "Collective Constitutional AI",
      "url": "https://arxiv.org/html/2406.07814v1",
      "resourceId": "d82c1768cf5e080b",
      "resourceTitle": "Collective Constitutional AI: Aligning a Language Model with Public Input"
    },
    {
      "text": "Collective Alignment: Public Input on Our Model Spec",
      "url": "https://openai.com/index/collective-alignment-aug-2025-updates/",
      "resourceId": "75b66340eb2fadc2",
      "resourceTitle": "Collective Alignment: Public Input on Our Model Spec"
    },
    {
      "text": "Detecting and Mitigating Reward Hacking in Reinforcement Learning Systems",
      "url": "https://arxiv.org/html/2507.05619v1",
      "resourceId": "8e361fe6e472c82d",
      "resourceTitle": "Detecting and Mitigating Reward Hacking in Reinforcement Learning Systems: A Comprehensive Empirical Study"
    },
    {
      "text": "Toward Understanding and Preventing Misalignment Generalization",
      "url": "https://openai.com/index/emergent-misalignment/",
      "resourceId": "ccac2622760fd6c8",
      "resourceTitle": "Toward Understanding and Preventing Misalignment Generalization"
    },
    {
      "text": "Goal Misgeneralization in Deep Reinforcement Learning",
      "url": "https://proceedings.mlr.press/v162/langosco22a/langosco22a.pdf",
      "resourceId": "5227fd17cb52cb88",
      "resourceTitle": "Goal Misgeneralization in Deep Reinforcement Learning"
    },
    {
      "text": "Debating with More Persuasive LLMs Leads to More Truthful Answers",
      "url": "https://gist.github.com/bigsnarfdude/a95dbb3f8b560edd352665071ddf7312",
      "resourceId": "6e157f79186d4c37",
      "resourceTitle": "Debating with More Persuasive LLMs Leads to More Truthful Answers"
    },
    {
      "text": "On Scalable Oversight with Weak LLMs Judging Strong LLMs",
      "url": "https://proceedings.neurips.cc/paper_files/paper/2024/hash/899511e37a8e01e1bd6f6f1d377cc250-Abstract-Conference.html",
      "resourceId": "94d975b239bb0cad",
      "resourceTitle": "On Scalable Oversight with Weak LLMs Judging Strong LLMs"
    },
    {
      "text": "Scaling Laws for Scalable Oversight",
      "url": "https://arxiv.org/html/2504.18530v2",
      "resourceId": "08c92819cc0fc2dd",
      "resourceTitle": "Scaling Laws For Scalable Oversight"
    }
  ],
  "unconvertedLinkCount": 11,
  "convertedLinkCount": 28,
  "backlinkCount": 8,
  "citationHealth": {
    "total": 14,
    "withQuotes": 9,
    "verified": 8,
    "accuracyChecked": 8,
    "accurate": 6,
    "inaccurate": 1,
    "avgScore": 0.9021164021558232
  },
  "hallucinationRisk": {
    "level": "high",
    "score": 70,
    "factors": [
      "low-citation-density",
      "conceptual-content",
      "severe-truncation"
    ],
    "integrityIssues": [
      "severe-truncation"
    ]
  },
  "entityType": "argument",
  "redundancy": {
    "maxSimilarity": 23,
    "similarPages": [
      {
        "id": "reward-hacking-taxonomy",
        "title": "Reward Hacking Taxonomy and Severity Model",
        "path": "/knowledge-base/models/reward-hacking-taxonomy/",
        "similarity": 23
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 23
      },
      {
        "id": "agentic-ai",
        "title": "Agentic AI",
        "path": "/knowledge-base/capabilities/agentic-ai/",
        "similarity": 22
      },
      {
        "id": "language-models",
        "title": "Large Language Models",
        "path": "/knowledge-base/capabilities/language-models/",
        "similarity": 22
      },
      {
        "id": "case-for-xrisk",
        "title": "The Case FOR AI Existential Risk",
        "path": "/knowledge-base/debates/case-for-xrisk/",
        "similarity": 22
      }
    ]
  },
  "changeHistory": [
    {
      "date": "2026-03-13",
      "branch": "auto-update/2026-03-13",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (545.8s). Quality score: 71. Issues resolved: Frontmatter 'lastEdited' date '2026-03-13' is a future date,; Mermaid chart contains EntityLink JSX components inside a te; Page is truncated: the KeyQuestions component is cut off mid.",
      "duration": "545.8s",
      "cost": "$5-8"
    },
    {
      "date": "2026-03-10",
      "branch": "auto-update/2026-03-10",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (532.7s). Quality score: 74. Issues resolved: Truncated content: The page ends mid-sentence at '...does no; Bare URL in Expert Estimates table: '[OpenAI: Our approach t; Bare URL in 'Views Among Researchers' section: OpenAI perspe.",
      "duration": "532.7s",
      "cost": "$5-8"
    },
    {
      "date": "2026-03-09",
      "branch": "auto-update/2026-03-09",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (559.2s). Quality score: 71. Issues resolved: Content is truncated — the KeyQuestions component and closin; Frontmatter field 'lastEdited' uses a future date ('2026-03-; Mermaid chart contains JSX EntityLink components inside a te.",
      "duration": "559.2s",
      "cost": "$5-8"
    },
    {
      "date": "2026-03-07",
      "branch": "auto-update/2026-03-07",
      "title": "Auto-improve (standard): Why Alignment Might Be Hard",
      "summary": "Improved \"Why Alignment Might Be Hard\" via standard pipeline (572.6s). Quality score: 74. Issues resolved: Footnote references (e.g., [^rc-0423], [^rc-d92f], [^rc-adec; The Key Sources section at the end is incomplete — it refere; The Mermaid diagram uses an EntityLink component inside the .",
      "duration": "572.6s",
      "cost": "$5-8"
    },
    {
      "date": "2026-02-21",
      "branch": "",
      "title": "Test session",
      "summary": "Testing session API",
      "model": "claude-opus-4-6"
    },
    {
      "date": "2026-02-18",
      "branch": "claude/fix-issue-240-N5irU",
      "title": "Surface tacticalValue in /wiki table and score 53 pages",
      "summary": "Added `tacticalValue` to `ExploreItem` interface, `getExploreItems()` mappings, the `/wiki` explore table (new sortable \"Tact.\" column), and the card view sort dropdown. Scored 49 new pages with tactical values (4 were already scored), bringing total to 53.",
      "model": "sonnet-4",
      "duration": "~30min"
    }
  ],
  "coverage": {
    "passing": 5,
    "total": 13,
    "targets": {
      "tables": 37,
      "diagrams": 4,
      "internalLinks": 74,
      "externalLinks": 46,
      "footnotes": 28,
      "references": 28
    },
    "actuals": {
      "tables": 4,
      "diagrams": 1,
      "internalLinks": 86,
      "externalLinks": 13,
      "footnotes": 3,
      "references": 27,
      "quotesWithQuotes": 9,
      "quotesTotal": 14,
      "accuracyChecked": 8,
      "accuracyTotal": 14
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "green",
      "overview": "red",
      "tables": "amber",
      "diagrams": "amber",
      "internalLinks": "green",
      "externalLinks": "amber",
      "footnotes": "amber",
      "references": "amber",
      "quotes": "amber",
      "accuracy": "amber"
    },
    "editHistoryCount": 6,
    "ratingsString": "N:4.5 R:6.5 A:4.5 C:8"
  },
  "readerRank": 140,
  "researchRank": 9,
  "recommendedScore": 196.11
}
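
The note above says this record is merged from MDX frontmatter, an Entity YAML file, and computed metrics at build time. Below is a minimal TypeScript sketch of that merge, assuming simple spread-based precedence; all function and parameter names here are illustrative assumptions, since the wiki's real build code is not shown on this page.

```typescript
// Illustrative sketch only: names and precedence are assumptions,
// not the wiki's actual build API.
interface PageRecord {
  id: string;
  path: string;
  title: string;
  quality: number;
  metrics: Record<string, number | boolean>;
  [key: string]: unknown;
}

function buildPageRecord(
  entityYaml: Record<string, unknown>,      // per-entity metadata (entityType, EID, ...)
  frontmatter: Record<string, unknown>,     // parsed from knowledge-base/.../why-alignment-hard.mdx
  computedMetrics: Record<string, number | boolean>, // wordCount, internalLinks, bulletRatio, ...
): PageRecord {
  // Later spreads win on key collisions, so hand-written frontmatter
  // overrides entity defaults; computed metrics are attached whole.
  return {
    ...entityYaml,
    ...frontmatter,
    metrics: computedMetrics,
  } as PageRecord;
}

// Looking a record up, assuming database.json is an array of such records:
// import db from "./database.json";
// const page = (db as PageRecord[]).find((p) => p.id === "why-alignment-hard");
```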
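The coverage block maps each target/actual pair to a green/amber/red status. The thresholds are not stated on this page; the rule below is inferred from this one record (internalLinks 86 ≥ 74 is green, tables 4 of 37 is amber, the absent overview is red) and should be read as a guess rather than the site's actual logic.

```typescript
type Status = "green" | "amber" | "red";

// Inferred rule, consistent with this record but unverified:
// meet the target -> green; partial progress -> amber; nothing at all -> red.
function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual > 0) return "amber";
  return "red";
}

console.log(coverageStatus(86, 74)); // "green" (internalLinks)
console.log(coverageStatus(4, 37));  // "amber" (tables)
console.log(coverageStatus(0, 1));   // "red"   (overview absent: hasOverview is false)
```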
External Links

No external links

Backlinks (8)
| id | title | type | relationship |
|---|---|---|---|
| case-for-xrisk | The Case FOR AI Existential Risk | argument | |
| __index__/knowledge-base/debates | Key Debates | concept | |
| why-alignment-easy | Why Alignment Might Be Easy | argument | |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | analysis | |
| conjecture | Conjecture | organization | |
| holden-karnofsky | Holden Karnofsky | person | |
| alignment | AI Alignment | approach | |
| red-teaming | Red Teaming | approach | |