Longterm Wiki

AI Control

ID: ai-control · Entity type: safety-agenda · Path: /knowledge-base/responses/ai-control/
Entity ID (EID): E6
48 backlinks · Quality: 75 · Updated: 2026-03-13
Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "ai-control",
  "numericId": null,
  "path": "/knowledge-base/responses/ai-control/",
  "filePath": "knowledge-base/responses/ai-control.mdx",
  "title": "AI Control",
  "quality": 75,
  "readerImportance": 69,
  "researchImportance": 41,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "AI Control is a defensive safety approach that maintains control over potentially misaligned AI through monitoring, containment, and redundancy, offering 40-60% catastrophic risk reduction if alignment fails with 70-85% tractability for near-human AI. Current research shows 80-95% detection rates against GPT-4-level adversarial behavior with 5-30% computational overhead, though effectiveness likely drops to 10-30% for superintelligent systems.",
  "description": "A defensive safety approach maintaining control over potentially misaligned AI systems through monitoring, containment, and redundancy, offering 40-60% catastrophic risk reduction if alignment fails while remaining 70-85% tractable for near-human AI capabilities.",
  "ratings": {
    "novelty": 5,
    "rigor": 7,
    "actionability": 7,
    "completeness": 8
  },
  "category": "responses",
  "subcategory": "alignment-deployment",
  "clusters": [
    "ai-safety",
    "governance"
  ],
  "metrics": {
    "wordCount": 3085,
    "tableCount": 17,
    "diagramCount": 1,
    "internalLinks": 36,
    "externalLinks": 13,
    "footnoteCount": 0,
    "bulletRatio": 0.11,
    "sectionCount": 36,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 3085,
  "unconvertedLinks": [
    {
      "text": "empirical results",
      "url": "https://arxiv.org/abs/2312.06942",
      "resourceId": "187aaa26886ce183",
      "resourceTitle": "AI Control Framework"
    },
    {
      "text": "UK AISI",
      "url": "https://alignmentproject.aisi.gov.uk/",
      "resourceId": "2c54187a89647ed5",
      "resourceTitle": "The Alignment Project"
    },
    {
      "text": "\"AI Control: Improving Safety Despite Intentional Subversion\"",
      "url": "https://arxiv.org/abs/2312.06942",
      "resourceId": "187aaa26886ce183",
      "resourceTitle": "AI Control Framework"
    },
    {
      "text": "foundational ICML 2024 paper",
      "url": "https://arxiv.org/abs/2312.06942",
      "resourceId": "187aaa26886ce183",
      "resourceTitle": "AI Control Framework"
    },
    {
      "text": "UK AI Security Institute's Alignment Project",
      "url": "https://alignmentproject.aisi.gov.uk/",
      "resourceId": "2c54187a89647ed5",
      "resourceTitle": "The Alignment Project"
    },
    {
      "text": "alignment faking",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "Alignment faking paper",
      "url": "https://arxiv.org/abs/2412.14093",
      "resourceId": "19a35a5cec9d9b80",
      "resourceTitle": "Anthropic Alignment Faking (2024)"
    },
    {
      "text": "Greenblatt et al. (2024)",
      "url": "https://arxiv.org/abs/2312.06942",
      "resourceId": "187aaa26886ce183",
      "resourceTitle": "AI Control Framework"
    },
    {
      "text": "Greenblatt et al. (2024)",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "Shlegeris & Greenblatt (2024)",
      "url": "https://blog.redwoodresearch.org/p/the-case-for-ensuring-that-powerful",
      "resourceId": "32c44bb7ba8a1bbe",
      "resourceTitle": "\"The case for ensuring that powerful AIs are controlled\" (May 2024)"
    },
    {
      "text": "UK AISI (2025)",
      "url": "https://alignmentproject.aisi.gov.uk/",
      "resourceId": "2c54187a89647ed5",
      "resourceTitle": "The Alignment Project"
    }
  ],
  "unconvertedLinkCount": 11,
  "convertedLinkCount": 12,
  "backlinkCount": 48,
  "hallucinationRisk": {
    "level": "low",
    "score": 30,
    "factors": [
      "no-citations",
      "high-rigor",
      "conceptual-content"
    ]
  },
  "entityType": "safety-agenda",
  "redundancy": {
    "maxSimilarity": 20,
    "similarPages": [
      {
        "id": "intervention-effectiveness-matrix",
        "title": "Intervention Effectiveness Matrix",
        "path": "/knowledge-base/models/intervention-effectiveness-matrix/",
        "similarity": 20
      },
      {
        "id": "interpretability",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/interpretability/",
        "similarity": 20
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 20
      },
      {
        "id": "self-improvement",
        "title": "Self-Improvement and Recursive Enhancement",
        "path": "/knowledge-base/capabilities/self-improvement/",
        "similarity": 19
      },
      {
        "id": "research-agendas",
        "title": "AI Alignment Research Agenda Comparison",
        "path": "/knowledge-base/responses/research-agendas/",
        "similarity": 19
      }
    ]
  },
  "coverage": {
    "passing": 8,
    "total": 13,
    "targets": {
      "tables": 12,
      "diagrams": 1,
      "internalLinks": 25,
      "externalLinks": 15,
      "footnotes": 9,
      "references": 9
    },
    "actuals": {
      "tables": 17,
      "diagrams": 1,
      "internalLinks": 36,
      "externalLinks": 13,
      "footnotes": 0,
      "references": 15,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "green",
      "externalLinks": "amber",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:5 R:7 A:7 C:8"
  },
  "readerRank": 172,
  "researchRank": 338,
  "recommendedScore": 206.34
}
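The note above says this record is assembled at build time by merging MDX frontmatter, Entity YAML, and computed metrics into database.json, and the coverage block colors each item by comparing actuals against targets. A minimal TypeScript sketch of both steps follows; the function names, type shapes, and the 0.8 amber cutoff are assumptions chosen to be consistent with the record shown above, not the wiki's actual build code.

// Hypothetical reconstruction of the build-time merge described above.
// Field names follow the record shown; everything else (function names,
// thresholds, shapes) is assumed.

type Frontmatter = {
  title: string;
  description: string;
  quality: number;
  lastUpdated: string;
};

type EntityYaml = {
  id: string;
  entityType: string;
  clusters: string[];
};

type ComputedMetrics = {
  wordCount: number;
  tableCount: number;
  internalLinks: number;
  externalLinks: number;
  footnoteCount: number;
};

type CoverageStatus = "green" | "amber" | "red";

// Later spreads win on key conflicts, mirroring how computed metrics
// (e.g. wordCount) sit alongside frontmatter fields in the final record.
function buildPageRecord(
  frontmatter: Frontmatter,
  entity: EntityYaml,
  metrics: ComputedMetrics,
) {
  return { ...frontmatter, ...entity, metrics, wordCount: metrics.wordCount };
}

// One plausible status rule consistent with the coverage block above:
// meeting the target is green (tables: 17/12), a near miss is amber
// (externalLinks: 13/15), a large shortfall is red (footnotes: 0/9).
// The 0.8 cutoff is a guess that happens to fit those data points.
function coverageStatus(actual: number, target: number): CoverageStatus {
  if (actual >= target) return "green";
  if (actual >= 0.8 * target) return "amber";
  return "red";
}

console.log(coverageStatus(17, 12)); // "green"
console.log(coverageStatus(13, 15)); // "amber" (13 >= 0.8 * 15)
console.log(coverageStatus(0, 9));   // "red"

On this rule, the record's passing count (8 of 13) would come from counting green items across the coverage checklist; whether the real build uses a ratio cutoff or per-item rules is not recoverable from the record alone.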
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/ai-control",
  "wikipedia": "https://en.wikipedia.org/wiki/AI_capability_control",
  "alignmentForum": "https://www.alignmentforum.org/tag/ai-control",
  "grokipedia": "https://grokipedia.com/page/AI_capability_control"
}
Backlinks (48)
id | title | type | relationship
agentic-ai | Agentic AI | capability | -
long-horizon | Long-Horizon Autonomous Tasks | capability | -
short-timeline-policy-implications | Short AI Timeline Policy Implications | analysis | -
corrigibility-failure-pathways | Corrigibility Failure Pathways | analysis | mitigation
redwood-research | Redwood Research | organization | -
corrigibility | Corrigibility | safety-agenda | -
sleeper-agent-detection | Sleeper Agent Detection | approach | -
representation-engineering | Representation Engineering | approach | -
provably-safe | Provably Safe AI (davidad agenda) | approach | -
corrigibility-failure | Corrigibility Failure | risk | -
deceptive-alignment | Deceptive Alignment | risk | -
scientific-research | Scientific Research Capabilities | capability | -
why-alignment-easy | Why Alignment Might Be Easy | argument | -
__index__/knowledge-base | Knowledge Base | concept | -
carlsmith-six-premises | Carlsmith's Six-Premise Argument | analysis | -
defense-in-depth-model | Defense in Depth Model | analysis | -
flash-dynamics-threshold | Flash Dynamics Threshold Model | analysis | -
goal-misgeneralization-probability | Goal Misgeneralization Probability Model | analysis | -
instrumental-convergence-framework | Instrumental Convergence Framework | analysis | -
intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | -
mesa-optimization-analysis | Mesa-Optimization Risk Analysis | analysis | -
risk-activation-timeline | Risk Activation Timeline Model | analysis | -
risk-interaction-matrix | Risk Interaction Matrix Model | analysis | -
scheming-likelihood-model | Scheming Likelihood Assessment | analysis | -
ai-impacts | AI Impacts | organization | -
conjecture | Conjecture | organization | -
controlai | ControlAI | organization | -
mats | MATS ML Alignment Theory Scholars program | organization | -
safety-orgs-overview | AI Safety Organizations (Overview) | concept | -
ajeya-cotra | Ajeya Cotra | person | -
agent-foundations | Agent Foundations | approach | -
alignment-deployment-overview | Deployment & Control (Overview) | concept | -
evals | Evals & Red-teaming | safety-agenda | -
__index__/knowledge-base/responses | Safety Responses | concept | -
interpretability | Mechanistic Interpretability | safety-agenda | -
intervention-portfolio | AI Safety Intervention Portfolio | approach | -
research-agendas | AI Alignment Research Agenda Comparison | crux | -
scheming-detection | Scheming & Deception Detection | approach | -
sparse-autoencoders | Sparse Autoencoders (SAEs) | approach | -
technical-research | Technical AI Safety Research | crux | -
training-programs | AI Safety Training Programs | approach | -
instrumental-convergence | Instrumental Convergence | risk | -
proliferation | Proliferation | risk | -
reward-hacking | Reward Hacking | risk | -
rogue-ai-scenarios | Rogue AI Scenarios | risk | -
scheming | Scheming | risk | -
sharp-left-turn | Sharp Left Turn | risk | -
longtermwiki-value-proposition | LongtermWiki Value Proposition | concept | -