AI Control
ai-control · safety-agenda · Path: /knowledge-base/responses/ai-control/
Entity ID (EID): E6
Page Record (database.json): merged from MDX frontmatter + Entity YAML + computed metrics at build time
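As a rough sketch of how a record like the one below might be assembled at build time, assuming hypothetical file locations and an ordinary frontmatter/YAML toolchain (gray-matter and js-yaml here stand in for whatever the build actually uses; function and field names are illustrative, not the site's real API):

```ts
import { readFileSync } from "node:fs";
import matter from "gray-matter"; // assumed frontmatter parser; any YAML-aware parser works
import { load } from "js-yaml";

// Illustrative merge: editorial fields come from the MDX frontmatter, identity and
// typing from the Entity YAML, and metrics are computed from the rendered page.
function buildPageRecord(
  mdxPath: string,
  entityYamlPath: string,
  computedMetrics: Record<string, unknown>
): Record<string, unknown> {
  const frontmatter = matter(readFileSync(mdxPath, "utf8")).data;
  const entity = load(readFileSync(entityYamlPath, "utf8")) as Record<string, unknown>;
  // Frontmatter wins over entity fields on key collisions; computed metrics sit
  // under their own key, mirroring the "metrics" object in the record below.
  return { ...entity, ...frontmatter, metrics: computedMetrics };
}

// e.g. buildPageRecord("knowledge-base/responses/ai-control.mdx",
//                      "entities/ai-control.yaml",
//                      { wordCount: 3085, tableCount: 17 });
```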
{
"id": "ai-control",
"numericId": null,
"path": "/knowledge-base/responses/ai-control/",
"filePath": "knowledge-base/responses/ai-control.mdx",
"title": "AI Control",
"quality": 75,
"readerImportance": 69,
"researchImportance": 41,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "AI Control is a defensive safety approach that maintains control over potentially misaligned AI through monitoring, containment, and redundancy, offering 40-60% catastrophic risk reduction if alignment fails with 70-85% tractability for near-human AI. Current research shows 80-95% detection rates against GPT-4-level adversarial behavior with 5-30% computational overhead, though effectiveness likely drops to 10-30% for superintelligent systems.",
"description": "A defensive safety approach maintaining control over potentially misaligned AI systems through monitoring, containment, and redundancy, offering 40-60% catastrophic risk reduction if alignment fails while remaining 70-85% tractable for near-human AI capabilities.",
"ratings": {
"novelty": 5,
"rigor": 7,
"actionability": 7,
"completeness": 8
},
"category": "responses",
"subcategory": "alignment-deployment",
"clusters": [
"ai-safety",
"governance"
],
"metrics": {
"wordCount": 3085,
"tableCount": 17,
"diagramCount": 1,
"internalLinks": 36,
"externalLinks": 13,
"footnoteCount": 0,
"bulletRatio": 0.11,
"sectionCount": 36,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 3085,
"unconvertedLinks": [
{
"text": "empirical results",
"url": "https://arxiv.org/abs/2312.06942",
"resourceId": "187aaa26886ce183",
"resourceTitle": "AI Control Framework"
},
{
"text": "UK AISI",
"url": "https://alignmentproject.aisi.gov.uk/",
"resourceId": "2c54187a89647ed5",
"resourceTitle": "The Alignment Project"
},
{
"text": "\"AI Control: Improving Safety Despite Intentional Subversion\"",
"url": "https://arxiv.org/abs/2312.06942",
"resourceId": "187aaa26886ce183",
"resourceTitle": "AI Control Framework"
},
{
"text": "foundational ICML 2024 paper",
"url": "https://arxiv.org/abs/2312.06942",
"resourceId": "187aaa26886ce183",
"resourceTitle": "AI Control Framework"
},
{
"text": "UK AI Security Institute's Alignment Project",
"url": "https://alignmentproject.aisi.gov.uk/",
"resourceId": "2c54187a89647ed5",
"resourceTitle": "The Alignment Project"
},
{
"text": "alignment faking",
"url": "https://www.anthropic.com/research/alignment-faking",
"resourceId": "c2cfd72baafd64a9",
"resourceTitle": "Anthropic's 2024 alignment faking study"
},
{
"text": "Alignment faking paper",
"url": "https://arxiv.org/abs/2412.14093",
"resourceId": "19a35a5cec9d9b80",
"resourceTitle": "Anthropic Alignment Faking (2024)"
},
{
"text": "Greenblatt et al. (2024)",
"url": "https://arxiv.org/abs/2312.06942",
"resourceId": "187aaa26886ce183",
"resourceTitle": "AI Control Framework"
},
{
"text": "Greenblatt et al. (2024)",
"url": "https://www.anthropic.com/research/alignment-faking",
"resourceId": "c2cfd72baafd64a9",
"resourceTitle": "Anthropic's 2024 alignment faking study"
},
{
"text": "Shlegeris & Greenblatt (2024)",
"url": "https://blog.redwoodresearch.org/p/the-case-for-ensuring-that-powerful",
"resourceId": "32c44bb7ba8a1bbe",
"resourceTitle": "\"The case for ensuring that powerful AIs are controlled\" (May 2024)"
},
{
"text": "UK AISI (2025)",
"url": "https://alignmentproject.aisi.gov.uk/",
"resourceId": "2c54187a89647ed5",
"resourceTitle": "The Alignment Project"
}
],
"unconvertedLinkCount": 11,
"convertedLinkCount": 12,
"backlinkCount": 48,
"hallucinationRisk": {
"level": "low",
"score": 30,
"factors": [
"no-citations",
"high-rigor",
"conceptual-content"
]
},
"entityType": "safety-agenda",
"redundancy": {
"maxSimilarity": 20,
"similarPages": [
{
"id": "intervention-effectiveness-matrix",
"title": "Intervention Effectiveness Matrix",
"path": "/knowledge-base/models/intervention-effectiveness-matrix/",
"similarity": 20
},
{
"id": "interpretability",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/interpretability/",
"similarity": 20
},
{
"id": "scalable-oversight",
"title": "Scalable Oversight",
"path": "/knowledge-base/responses/scalable-oversight/",
"similarity": 20
},
{
"id": "self-improvement",
"title": "Self-Improvement and Recursive Enhancement",
"path": "/knowledge-base/capabilities/self-improvement/",
"similarity": 19
},
{
"id": "research-agendas",
"title": "AI Alignment Research Agenda Comparison",
"path": "/knowledge-base/responses/research-agendas/",
"similarity": 19
}
]
},
"coverage": {
"passing": 8,
"total": 13,
"targets": {
"tables": 12,
"diagrams": 1,
"internalLinks": 25,
"externalLinks": 15,
"footnotes": 9,
"references": 9
},
"actuals": {
"tables": 17,
"diagrams": 1,
"internalLinks": 36,
"externalLinks": 13,
"footnotes": 0,
"references": 15,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "green",
"externalLinks": "amber",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:5 R:7 A:7 C:8"
},
"readerRank": 172,
"researchRank": 338,
"recommendedScore": 206.34
}
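One plausible reading of how the green/amber/red coverage statuses above could be derived from the targets and actuals; the thresholds are illustrative assumptions rather than documented behaviour, though they happen to reproduce the values in this record:

```ts
type Status = "green" | "amber" | "red";

// Assumed rule: green when the actual count meets the target, amber when it is
// within roughly 80% of it, red otherwise. A sketch for reading the record above.
function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual >= 0.8 * target) return "amber";
  return "red";
}

const targets = { tables: 12, diagrams: 1, internalLinks: 25, externalLinks: 15, footnotes: 9, references: 9 };
const actuals = { tables: 17, diagrams: 1, internalLinks: 36, externalLinks: 13, footnotes: 0, references: 15 };

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  // Prints e.g. tables: green, externalLinks: amber, footnotes: red, references: green
  console.log(key, coverageStatus(actuals[key], targets[key]));
}
```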
External Links
{
"lesswrong": "https://www.lesswrong.com/tag/ai-control",
"wikipedia": "https://en.wikipedia.org/wiki/AI_capability_control",
"alignmentForum": "https://www.alignmentforum.org/tag/ai-control",
"grokipedia": "https://grokipedia.com/page/AI_capability_control"
}

Backlinks (48)
| id | title | type | relationship |
|---|---|---|---|
| agentic-ai | Agentic AI | capability | — |
| long-horizon | Long-Horizon Autonomous Tasks | capability | — |
| short-timeline-policy-implications | Short AI Timeline Policy Implications | analysis | — |
| corrigibility-failure-pathways | Corrigibility Failure Pathways | analysis | mitigation |
| redwood-research | Redwood Research | organization | — |
| corrigibility | Corrigibility | safety-agenda | — |
| sleeper-agent-detection | Sleeper Agent Detection | approach | — |
| representation-engineering | Representation Engineering | approach | — |
| provably-safe | Provably Safe AI (davidad agenda) | approach | — |
| corrigibility-failure | Corrigibility Failure | risk | — |
| deceptive-alignment | Deceptive Alignment | risk | — |
| scientific-research | Scientific Research Capabilities | capability | — |
| why-alignment-easy | Why Alignment Might Be Easy | argument | — |
| __index__/knowledge-base | Knowledge Base | concept | — |
| carlsmith-six-premises | Carlsmith's Six-Premise Argument | analysis | — |
| defense-in-depth-model | Defense in Depth Model | analysis | — |
| flash-dynamics-threshold | Flash Dynamics Threshold Model | analysis | — |
| goal-misgeneralization-probability | Goal Misgeneralization Probability Model | analysis | — |
| instrumental-convergence-framework | Instrumental Convergence Framework | analysis | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| mesa-optimization-analysis | Mesa-Optimization Risk Analysis | analysis | — |
| risk-activation-timeline | Risk Activation Timeline Model | analysis | — |
| risk-interaction-matrix | Risk Interaction Matrix Model | analysis | — |
| scheming-likelihood-model | Scheming Likelihood Assessment | analysis | — |
| ai-impacts | AI Impacts | organization | — |
| conjecture | Conjecture | organization | — |
| controlai | ControlAI | organization | — |
| mats | MATS ML Alignment Theory Scholars program | organization | — |
| safety-orgs-overview | AI Safety Organizations (Overview) | concept | — |
| ajeya-cotra | Ajeya Cotra | person | — |
| agent-foundations | Agent Foundations | approach | — |
| alignment-deployment-overview | Deployment & Control (Overview) | concept | — |
| evals | Evals & Red-teaming | safety-agenda | — |
| __index__/knowledge-base/responses | Safety Responses | concept | — |
| interpretability | Mechanistic Interpretability | safety-agenda | — |
| intervention-portfolio | AI Safety Intervention Portfolio | approach | — |
| research-agendas | AI Alignment Research Agenda Comparison | crux | — |
| scheming-detection | Scheming & Deception Detection | approach | — |
| sparse-autoencoders | Sparse Autoencoders (SAEs) | approach | — |
| technical-research | Technical AI Safety Research | crux | — |
| training-programs | AI Safety Training Programs | approach | — |
| instrumental-convergence | Instrumental Convergence | risk | — |
| proliferation | Proliferation | risk | — |
| reward-hacking | Reward Hacking | risk | — |
| rogue-ai-scenarios | Rogue AI Scenarios | risk | — |
| scheming | Scheming | risk | — |
| sharp-left-turn | Sharp Left Turn | risk | — |
| longtermwiki-value-proposition | LongtermWiki Value Proposition | concept | — |