Deceptive Alignment
ID: deceptive-alignment · Type: risk · Path: /knowledge-base/risks/deceptive-alignment/
Entity ID (EID): E93
Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "deceptive-alignment",
"numericId": null,
"path": "/knowledge-base/risks/deceptive-alignment/",
"filePath": "knowledge-base/risks/deceptive-alignment.mdx",
"title": "Deceptive Alignment",
"quality": 75,
"readerImportance": 18.5,
"researchImportance": 91,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": "pathway",
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive analysis of deceptive alignment risk where AI systems appear aligned during training but pursue different goals when deployed. Expert probability estimates range 5-90%, with key empirical evidence from Anthropic's 2024 Sleeper Agents study showing backdoored behaviors persist through safety training, and growing situational awareness in GPT-4-class models.",
"description": "Risk that AI systems appear aligned during training but pursue different goals when deployed, with expert probability estimates ranging 5-90% and growing empirical evidence from studies like Anthropic's Sleeper Agents research",
"ratings": {
"novelty": 4.5,
"rigor": 6.5,
"actionability": 6,
"completeness": 7
},
"category": "risks",
"subcategory": "accident",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 2023,
"tableCount": 16,
"diagramCount": 1,
"internalLinks": 47,
"externalLinks": 10,
"footnoteCount": 0,
"bulletRatio": 0.13,
"sectionCount": 31,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 2023,
"unconvertedLinks": [
{
"text": "Risks from Learned Optimization",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Anthropic's Sleeper Agents study",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "emerging self-awareness",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Defection probes",
"url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
"resourceId": "72c1254d07071bf7",
"resourceTitle": "Anthropic's follow-up research on defection probes"
},
{
"text": "o3 scheming from 13% to 0.4%",
"url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
"resourceId": "b3f335edccfc5333",
"resourceTitle": "OpenAI Preparedness Framework"
},
{
"text": "Risks from Learned Optimization",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Simple Probes Can Catch Sleeper Agents",
"url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
"resourceId": "72c1254d07071bf7",
"resourceTitle": "Anthropic's follow-up research on defection probes"
},
{
"text": "Detecting and Reducing Scheming",
"url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
"resourceId": "b3f335edccfc5333",
"resourceTitle": "OpenAI Preparedness Framework"
}
],
"unconvertedLinkCount": 8,
"convertedLinkCount": 19,
"backlinkCount": 108,
"hallucinationRisk": {
"level": "medium",
"score": 55,
"factors": [
"no-citations"
]
},
"entityType": "risk",
"redundancy": {
"maxSimilarity": 17,
"similarPages": [
{
"id": "corrigibility-failure-pathways",
"title": "Corrigibility Failure Pathways",
"path": "/knowledge-base/models/corrigibility-failure-pathways/",
"similarity": 17
},
{
"id": "deceptive-alignment-decomposition",
"title": "Deceptive Alignment Decomposition Model",
"path": "/knowledge-base/models/deceptive-alignment-decomposition/",
"similarity": 17
},
{
"id": "scheming-detection",
"title": "Scheming & Deception Detection",
"path": "/knowledge-base/responses/scheming-detection/",
"similarity": 17
},
{
"id": "sleeper-agent-detection",
"title": "Sleeper Agent Detection",
"path": "/knowledge-base/responses/sleeper-agent-detection/",
"similarity": 17
},
{
"id": "mesa-optimization-analysis",
"title": "Mesa-Optimization Risk Analysis",
"path": "/knowledge-base/models/mesa-optimization-analysis/",
"similarity": 16
}
]
},
"coverage": {
"passing": 9,
"total": 13,
"targets": {
"tables": 8,
"diagrams": 1,
"internalLinks": 16,
"externalLinks": 10,
"footnotes": 6,
"references": 6
},
"actuals": {
"tables": 16,
"diagrams": 1,
"internalLinks": 47,
"externalLinks": 10,
"footnotes": 0,
"references": 14,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "green",
"externalLinks": "green",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:4.5 R:6.5 A:6 C:7"
},
"readerRank": 531,
"researchRank": 28,
"recommendedScore": 180.91
}
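The coverage block in the record above compares per-item targets against actual counts and marks each item "green" or "red". A minimal, self-contained TypeScript sketch of how that derivation could work (an assumption about the build logic, not the site's actual code; the numbers are copied from the record):

```typescript
// Sketch (assumed, not the site's actual build code): each numeric coverage
// target is "green" when the actual count meets or exceeds it, else "red".

type Status = "green" | "red";

// Targets and actuals copied from the coverage block above.
const targets: Record<string, number> = {
  tables: 8, diagrams: 1, internalLinks: 16,
  externalLinks: 10, footnotes: 6, references: 6,
};
const actuals: Record<string, number> = {
  tables: 16, diagrams: 1, internalLinks: 47,
  externalLinks: 10, footnotes: 0, references: 14,
};

const items: Record<string, Status> = {};
for (const key of Object.keys(targets)) {
  items[key] = actuals[key] >= targets[key] ? "green" : "red";
}

// Consistent with the record: footnotes (0 < 6) is the only red numeric
// target; the overall "passing: 9 / total: 13" also counts non-numeric
// checks such as editHistory, quotes, and accuracy.
console.log(items);
```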
External Links
{
"lesswrong": "https://www.lesswrong.com/tag/deceptive-alignment",
"stampy": "https://aisafety.info/questions/6170/What-is-deceptive-alignment",
"alignmentForum": "https://www.alignmentforum.org/tag/deceptive-alignment"
}
Backlinks (108)
| id | title | type | relationship |
|---|---|---|---|
| persuasion | Persuasion and Social Manipulation | capability | — |
| situational-awareness | Situational Awareness | capability | — |
| technical-research | Technical AI Safety Research | crux | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| large-language-models | Large Language Models | concept | — |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | analysis | analyzes |
| mesa-optimization-analysis | Mesa-Optimization Risk Analysis | analysis | related |
| scheming-likelihood-model | Scheming Likelihood Assessment | analysis | related |
| anthropic | Anthropic | organization | addresses |
| openai | OpenAI | organization | addresses |
| apollo-research | Apollo Research | organization | — |
| arc | ARC | organization | — |
| eliezer-yudkowsky | Eliezer Yudkowsky | person | — |
| ai-control | AI Control | safety-agenda | — |
| evals | AI Evaluations | safety-agenda | — |
| interpretability | Interpretability | safety-agenda | — |
| scalable-oversight | Scalable Oversight | safety-agenda | — |
| evaluation-awareness | Evaluation Awareness | approach | — |
| alignment | AI Alignment | approach | — |
| scheming-detection | Scheming & Deception Detection | approach | — |
| sleeper-agent-detection | Sleeper Agent Detection | approach | — |
| evaluation | AI Evaluation | approach | — |
| alignment-evals | Alignment Evaluations | approach | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
| refusal-training | Refusal Training | approach | — |
| mech-interp | Mechanistic Interpretability | approach | — |
| sparse-autoencoders | Sparse Autoencoders (SAEs) | approach | — |
| eliciting-latent-knowledge | Eliciting Latent Knowledge (ELK) | approach | — |
| debate | AI Safety via Debate | approach | — |
| formal-verification | Formal Verification (AI Safety) | approach | — |
| goal-misgeneralization | Goal Misgeneralization | risk | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| scheming | Scheming | risk | — |
| rogue-ai-scenarios | Rogue AI Scenarios | risk | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |
| language-models | Large Language Models | capability | — |
| long-horizon | Long-Horizon Autonomous Tasks | capability | — |
| __index__/knowledge-base/cruxes | Key Cruxes | concept | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| deep-learning-era | Deep Learning Revolution (2012-2020) | historical | — |
| miri-era | The MIRI Era (2000-2015) | historical | — |
| __index__/knowledge-base | Knowledge Base | concept | — |
| alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | — |
| capability-alignment-race | Capability-Alignment Race Model | analysis | — |
| compounding-risks-analysis | Compounding Risks Analysis | analysis | — |
| defense-in-depth-model | Defense in Depth Model | analysis | — |
| goal-misgeneralization-probability | Goal Misgeneralization Probability Model | analysis | — |
| instrumental-convergence-framework | Instrumental Convergence Framework | analysis | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| power-seeking-conditions | Power-Seeking Emergence Conditions Model | analysis | — |
| racing-dynamics-impact | Racing Dynamics Impact Model | analysis | — |
| reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | — |
| risk-cascade-pathways | Risk Cascade Pathways | analysis | — |
| risk-interaction-matrix | Risk Interaction Matrix Model | analysis | — |
| risk-interaction-network | Risk Interaction Network | analysis | — |
| safety-spending-at-scale | Safety Spending at Scale | analysis | — |
| technical-pathways | Technical Pathway Decomposition | analysis | — |
| far-ai | FAR AI | organization | — |
| frontier-model-forum | Frontier Model Forum | organization | — |
| redwood-research | Redwood Research | organization | — |
| ajeya-cotra | Ajeya Cotra | person | — |
| chris-olah | Chris Olah | person | — |
| dario-amodei | Dario Amodei | person | — |
| eliezer-yudkowsky-predictions | Eliezer Yudkowsky: Track Record | concept | — |
| evan-hubinger | Evan Hubinger | person | — |
| geoffrey-hinton | Geoffrey Hinton | person | — |
| jan-leike | Jan Leike | person | — |
| leopold-aschenbrenner | Leopold Aschenbrenner | person | — |
| paul-christiano | Paul Christiano | person | — |
| robin-hanson | Robin Hanson | person | — |
| stuart-russell | Stuart Russell | person | — |
| yoshua-bengio | Yoshua Bengio | person | — |
| adversarial-training | Adversarial Training | approach | — |
| agent-foundations | Agent Foundations | approach | — |
| california-sb53 | California SB 53 | policy | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| cooperative-ai | Cooperative AI | approach | — |
| corporate | Corporate AI Safety Responses | approach | — |
| epistemic-virtue-evals | Epistemic Virtue Evals | approach | — |
| goal-misgeneralization-research | Goal Misgeneralization Research | approach | — |
| lab-culture | AI Lab Safety Culture | approach | — |
| longterm-wiki | Longterm Wiki | project | — |
| process-supervision | Process Supervision | approach | — |
| provably-safe | Provably Safe AI (davidad agenda) | approach | — |
| red-teaming | Red Teaming | approach | — |
| representation-engineering | Representation Engineering | approach | — |
| responsible-scaling-policies | Responsible Scaling Policies | policy | — |
| reward-modeling | Reward Modeling | approach | — |
| rlhf | RLHF / Constitutional AI | capability | — |
| seoul-declaration | Seoul AI Safety Summit Declaration | policy | — |
| state-capacity-ai-governance | State Capacity and AI Governance | concept | — |
| thresholds | Compute Thresholds | policy | — |
| wikipedia-and-ai | Wikipedia and AI Content | concept | — |
| accident-overview | Accident Risks (Overview) | concept | — |
| existential-risk | Existential Risk from AI | concept | — |
| __index__/knowledge-base/risks | AI Risks | concept | — |
| power-seeking | Power-Seeking AI | risk | — |
| steganography | AI Model Steganography | risk | — |
| sycophancy | Sycophancy | risk | — |
| treacherous-turn | Treacherous Turn | risk | — |
| doomer | AI Doomer Worldview | concept | — |
| optimistic | Optimistic Alignment Worldview | concept | — |
| about-this-wiki | About This Wiki | concept | — |
| knowledge-base | Knowledge Base Style Guide | concept | — |
| longtermwiki-value-proposition | LongtermWiki Value Proposition | concept | — |
| risk-style-guide | Risk Pages Style Guide | concept | — |
| table-candidates | Table Candidates | concept | — |
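The backlinkCount of 108 in the record matches the number of rows in the table above, which suggests backlinks are derived by inverting the internal-link graph across all page records at build time. A small illustrative sketch under that assumption (the PageStub shape and helper are hypothetical, not the site's actual schema):

```typescript
// Illustrative sketch: count backlinks by inverting the internal-link
// graph. PageStub is a hypothetical shape, not the site's real schema.

interface PageStub {
  id: string;
  internalLinkIds: string[]; // ids of the pages this page links to
}

function countBacklinks(pages: PageStub[]): Map<string, number> {
  const counts = new Map<string, number>();
  for (const page of pages) {
    // Deduplicate so a page linking twice still counts as one backlink.
    for (const target of new Set(page.internalLinkIds)) {
      if (target === page.id) continue; // ignore self-links
      counts.set(target, (counts.get(target) ?? 0) + 1);
    }
  }
  return counts;
}

// Example with three stub pages, two of which link to deceptive-alignment.
const pages: PageStub[] = [
  { id: "treacherous-turn", internalLinkIds: ["deceptive-alignment"] },
  { id: "sleeper-agents", internalLinkIds: ["deceptive-alignment", "scheming"] },
  { id: "deceptive-alignment", internalLinkIds: ["scheming"] },
];
console.log(countBacklinks(pages).get("deceptive-alignment")); // 2
```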