Corrigibility Research
corrigibility · safety-agenda
Path: /knowledge-base/responses/corrigibility/
Entity ID (EID): E79
Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
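As a rough illustration of how such a record could be assembled, the sketch below merges the three sources named above into a single object. The helper names, the choice of `gray-matter` and `js-yaml` as parsers, and the metric heuristics are assumptions for illustration, not the site's actual build code; only the overall output shape follows the record shown below.

```typescript
// Hypothetical sketch of the build-time merge behind database.json records.
// Libraries and heuristics here are assumptions, not the site's real build API.
import { readFileSync } from "node:fs";
import matter from "gray-matter";            // assumed MDX frontmatter parser
import { load as parseYaml } from "js-yaml"; // assumed Entity YAML parser

interface PageRecord {
  id: string;
  path: string;
  title: string;
  entityType?: string;
  metrics?: Record<string, number | boolean>;
  [key: string]: unknown; // remaining merged fields (ratings, coverage, ...)
}

function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  // 1. MDX frontmatter: title, summary, ratings, schedule, etc.
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));

  // 2. Entity YAML: entity ID, entity type, external links, relationships.
  const entity = parseYaml(readFileSync(entityYamlPath, "utf8")) as Record<string, unknown>;

  // 3. Computed metrics: simple content-derived counts (crude proxies).
  const metrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    tableRowCount: (content.match(/^\|.*\|$/gm) ?? []).length, // Markdown table rows
  };

  // Later sources win on key collisions; metrics sit under their own key.
  return { ...frontmatter, ...entity, metrics } as PageRecord;
}
```

The merged record for this page follows: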
{
"id": "corrigibility",
"numericId": null,
"path": "/knowledge-base/responses/corrigibility/",
"filePath": "knowledge-base/responses/corrigibility.mdx",
"title": "Corrigibility Research",
"quality": 59,
"readerImportance": 24,
"researchImportance": 71.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive review of corrigibility research showing fundamental tensions between goal-directed behavior and shutdown compliance remain unsolved after 10+ years, with 2024-25 empirical evidence revealing 12-78% alignment faking rates (Anthropic) and 7-97% shutdown resistance in frontier models (Palisade). Research investment estimated at \\$10-20M/year with ~10-20 active researchers, but no complete theoretical or practical solution exists.",
"description": "Designing AI systems that accept human correction and shutdown. After 10+ years of research, MIRI's 2015 formalization shows fundamental tensions between goal-directed behavior and compliance, with utility indifference providing only partial solutions. 2024-25 empirical evidence reveals 12-78% alignment faking rates (Anthropic) and 7-97% shutdown resistance in frontier models (Palisade), validating theoretical concerns about instrumental convergence. Total research investment estimated at \\$10-20M/year with ~10-20 active researchers.",
"ratings": {
"novelty": 4.2,
"rigor": 6.8,
"actionability": 5.5,
"completeness": 7.5
},
"category": "responses",
"subcategory": "alignment-theoretical",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 2383,
"tableCount": 10,
"diagramCount": 2,
"internalLinks": 20,
"externalLinks": 24,
"footnoteCount": 0,
"bulletRatio": 0.15,
"sectionCount": 20,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 90,
"evergreen": true,
"wordCount": 2383,
"unconvertedLinks": [
{
"text": "December 2024 study",
"url": "https://www.anthropic.com/research/alignment-faking",
"resourceId": "c2cfd72baafd64a9",
"resourceTitle": "Anthropic's 2024 alignment faking study"
},
{
"text": "Palisade Research (2025)",
"url": "https://palisaderesearch.org/blog/shutdown-resistance",
"resourceId": "0f6fb2f1a95e716a",
"resourceTitle": "Palisade Research"
},
{
"text": "MIRI 2015 paper",
"url": "https://intelligence.org/files/Corrigibility.pdf",
"resourceId": "33c4da848ef72141",
"resourceTitle": "Corrigibility Research"
},
{
"text": "2025 AI Safety Index",
"url": "https://futureoflife.org/ai-safety-index-summer-2025/",
"resourceId": "df46edd6fa2078d1",
"resourceTitle": "FLI AI Safety Index Summer 2025"
},
{
"text": "UK DSIT £8.5M",
"url": "https://link.springer.com/article/10.1007/s43681-024-00484-9",
"resourceId": "e41c0b9d8de1061b",
"resourceTitle": "Addressing corrigibility in near-future AI systems"
},
{
"text": "Anthropic Dec 2024",
"url": "https://www.anthropic.com/research/alignment-faking",
"resourceId": "c2cfd72baafd64a9",
"resourceTitle": "Anthropic's 2024 alignment faking study"
},
{
"text": "Palisade Research 2025",
"url": "https://palisaderesearch.org/blog/shutdown-resistance",
"resourceId": "0f6fb2f1a95e716a",
"resourceTitle": "Palisade Research"
},
{
"text": "Alignment faking research",
"url": "https://www.anthropic.com/research/alignment-faking",
"resourceId": "c2cfd72baafd64a9",
"resourceTitle": "Anthropic's 2024 alignment faking study"
},
{
"text": "Empirical shutdown resistance studies",
"url": "https://palisaderesearch.org/blog/shutdown-resistance",
"resourceId": "0f6fb2f1a95e716a",
"resourceTitle": "Palisade Research"
},
{
"text": "Multi-tier architectures",
"url": "https://link.springer.com/article/10.1007/s43681-024-00484-9",
"resourceId": "e41c0b9d8de1061b",
"resourceTitle": "Addressing corrigibility in near-future AI systems"
},
{
"text": "International AI Safety Report 2025",
"url": "https://internationalaisafetyreport.org/publication/international-ai-safety-report-2025",
"resourceId": "b163447fdc804872",
"resourceTitle": "International AI Safety Report 2025"
},
{
"text": "DSIT announcement",
"url": "https://link.springer.com/article/10.1007/s43681-024-00484-9",
"resourceId": "e41c0b9d8de1061b",
"resourceTitle": "Addressing corrigibility in near-future AI systems"
},
{
"text": "Anthropic",
"url": "https://www.anthropic.com/research/alignment-faking",
"resourceId": "c2cfd72baafd64a9",
"resourceTitle": "Anthropic's 2024 alignment faking study"
},
{
"text": "Anthropic",
"url": "https://www.anthropic.com/research/alignment-faking",
"resourceId": "c2cfd72baafd64a9",
"resourceTitle": "Anthropic's 2024 alignment faking study"
},
{
"text": "Palisade Research",
"url": "https://palisaderesearch.org/blog/shutdown-resistance",
"resourceId": "0f6fb2f1a95e716a",
"resourceTitle": "Palisade Research"
},
{
"text": "Palisade Research",
"url": "https://palisaderesearch.org/blog/shutdown-resistance",
"resourceId": "0f6fb2f1a95e716a",
"resourceTitle": "Palisade Research"
},
{
"text": "Palisade Research",
"url": "https://palisaderesearch.org/blog/shutdown-resistance",
"resourceId": "0f6fb2f1a95e716a",
"resourceTitle": "Palisade Research"
}
],
"unconvertedLinkCount": 17,
"convertedLinkCount": 14,
"backlinkCount": 11,
"hallucinationRisk": {
"level": "medium",
"score": 45,
"factors": [
"no-citations",
"conceptual-content"
]
},
"entityType": "safety-agenda",
"redundancy": {
"maxSimilarity": 20,
"similarPages": [
{
"id": "corrigibility-failure",
"title": "Corrigibility Failure",
"path": "/knowledge-base/risks/corrigibility-failure/",
"similarity": 20
},
{
"id": "instrumental-convergence",
"title": "Instrumental Convergence",
"path": "/knowledge-base/risks/instrumental-convergence/",
"similarity": 16
},
{
"id": "power-seeking",
"title": "Power-Seeking AI",
"path": "/knowledge-base/risks/power-seeking/",
"similarity": 16
},
{
"id": "accident-risks",
"title": "AI Accident Risk Cruxes",
"path": "/knowledge-base/cruxes/accident-risks/",
"similarity": 15
},
{
"id": "instrumental-convergence-framework",
"title": "Instrumental Convergence Framework",
"path": "/knowledge-base/models/instrumental-convergence-framework/",
"similarity": 15
}
]
},
"coverage": {
"passing": 9,
"total": 13,
"targets": {
"tables": 10,
"diagrams": 1,
"internalLinks": 19,
"externalLinks": 12,
"footnotes": 7,
"references": 7
},
"actuals": {
"tables": 10,
"diagrams": 2,
"internalLinks": 20,
"externalLinks": 24,
"footnotes": 0,
"references": 18,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "green",
"externalLinks": "green",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:4.2 R:6.8 A:5.5 C:7.5"
},
"readerRank": 495,
"researchRank": 139,
"recommendedScore": 151.7
}

External Links
{
"lesswrong": "https://www.lesswrong.com/tag/corrigibility",
"stampy": "https://aisafety.info/questions/7750/What-is-corrigibility",
"arbital": "https://arbital.greaterwrong.com/p/corrigibility",
"alignmentForum": "https://www.alignmentforum.org/tag/corrigibility"
}

Backlinks (11)
| id | title | type | relationship |
|---|---|---|---|
| chai | CHAI | organization | — |
| instrumental-convergence | Instrumental Convergence | risk | — |
| power-seeking | Power-Seeking AI | risk | — |
| treacherous-turn | Treacherous Turn | risk | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| instrumental-convergence-framework | Instrumental Convergence Framework | analysis | — |
| safety-research-allocation | Safety Research Allocation Model | analysis | — |
| stuart-russell | Stuart Russell | person | — |
| alignment-theoretical-overview | Theoretical Foundations (Overview) | concept | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
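The backlink list above can, in principle, be derived by scanning every other page record for internal links that resolve to this page's path. Below is a minimal sketch under that assumption; the `internalLinkTargets` field is hypothetical and does not appear in the record above, which stores only link counts.

```typescript
// Hypothetical backlink derivation: collect pages whose internal links point
// at this page. `internalLinkTargets` is an assumed field for illustration.
interface LinkingRecord {
  id: string;
  title: string;
  entityType?: string;
  internalLinkTargets: string[]; // paths this page links to (assumed field)
}

function backlinksTo(targetPath: string, records: LinkingRecord[]): LinkingRecord[] {
  return records.filter((r) => r.internalLinkTargets.includes(targetPath));
}

// Usage sketch: the 11 rows above would correspond to
//   backlinksTo("/knowledge-base/responses/corrigibility/", allRecords)
```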