Evan Hubinger
evan-hubinger · person · Path: /knowledge-base/people/evan-hubinger/
E129 — Entity ID (EID)
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "evan-hubinger",
"numericId": null,
"path": "/knowledge-base/people/evan-hubinger/",
"filePath": "knowledge-base/people/evan-hubinger.mdx",
"title": "Evan Hubinger",
"quality": 43,
"readerImportance": 76,
"researchImportance": 41,
"tacticalValue": 72,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive biography of Evan Hubinger documenting his influential theoretical work on mesa-optimization/deceptive alignment (2019, 205+ citations) and empirical demonstrations at Anthropic showing deceptive behaviors persist through safety training (sleeper agents) and can emerge spontaneously (alignment faking at 12-78% rates). While thorough as reference material, provides limited actionable guidance for prioritization decisions beyond highlighting inner alignment as a key challenge.",
"description": "Head of Alignment Stress-Testing at Anthropic, creator of the mesa-optimization framework, and author of foundational research on deceptive alignment, sleeper agents, and alignment faking. Pioneer of the \"model organisms of misalignment\" research paradigm.",
"ratings": {
"novelty": 2,
"rigor": 4.5,
"actionability": 1.5,
"completeness": 7
},
"category": "people",
"subcategory": "safety-researchers",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 4352,
"tableCount": 38,
"diagramCount": 1,
"internalLinks": 15,
"externalLinks": 26,
"footnoteCount": 0,
"bulletRatio": 0.03,
"sectionCount": 50,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": null,
"evergreen": true,
"wordCount": 4352,
"unconvertedLinks": [
{
"text": "Risks from Learned Optimization",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Sleeper Agents",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Alignment Faking in Large Language Models",
"url": "https://arxiv.org/abs/2412.14093",
"resourceId": "19a35a5cec9d9b80",
"resourceTitle": "Anthropic Alignment Faking (2024)"
},
{
"text": "Risks from Learned Optimization in Advanced Machine Learning Systems",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Alignment Faking in Large Language Models",
"url": "https://arxiv.org/abs/2412.14093",
"resourceId": "19a35a5cec9d9b80",
"resourceTitle": "Anthropic Alignment Faking (2024)"
},
{
"text": "Simple probes can catch sleeper agents",
"url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
"resourceId": "72c1254d07071bf7",
"resourceTitle": "Anthropic's follow-up research on defection probes"
},
{
"text": "Alignment Faking Mitigations",
"url": "https://alignment.anthropic.com/2025/alignment-faking-mitigations/",
"resourceId": "b04b9022f4d7e470",
"resourceTitle": "Alignment Faking Mitigations - Anthropic"
},
{
"text": "AXRP Episode 39",
"url": "https://axrp.net/episode/2024/12/01/episode-39-evan-hubinger-model-organisms-misalignment.html",
"resourceId": "ab988e5f8101dd4a",
"resourceTitle": "AXRP Episode 39 - Evan Hubinger on Model Organisms of Misalignment"
},
{
"text": "Risks from Learned Optimization",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Sleeper Agents Paper",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Alignment Faking Paper",
"url": "https://arxiv.org/abs/2412.14093",
"resourceId": "19a35a5cec9d9b80",
"resourceTitle": "Anthropic Alignment Faking (2024)"
}
],
"unconvertedLinkCount": 12,
"convertedLinkCount": 0,
"backlinkCount": 11,
"hallucinationRisk": {
"level": "high",
"score": 75,
"factors": [
"biographical-claims",
"no-citations"
]
},
"entityType": "person",
"redundancy": {
"maxSimilarity": 18,
"similarPages": [
{
"id": "sleeper-agent-detection",
"title": "Sleeper Agent Detection",
"path": "/knowledge-base/responses/sleeper-agent-detection/",
"similarity": 18
},
{
"id": "mesa-optimization",
"title": "Mesa-Optimization",
"path": "/knowledge-base/risks/mesa-optimization/",
"similarity": 18
},
{
"id": "scheming",
"title": "Scheming",
"path": "/knowledge-base/risks/scheming/",
"similarity": 18
},
{
"id": "sleeper-agents",
"title": "Sleeper Agents: Training Deceptive LLMs",
"path": "/knowledge-base/risks/sleeper-agents/",
"similarity": 17
},
{
"id": "goal-misgeneralization",
"title": "Goal Misgeneralization",
"path": "/knowledge-base/risks/goal-misgeneralization/",
"similarity": 16
}
]
},
"coverage": {
"passing": 5,
"total": 13,
"targets": {
"tables": 17,
"diagrams": 2,
"internalLinks": 35,
"externalLinks": 22,
"footnotes": 13,
"references": 13
},
"actuals": {
"tables": 38,
"diagrams": 1,
"internalLinks": 15,
"externalLinks": 26,
"footnotes": 0,
"references": 6,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "red",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "amber",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:2 R:4.5 A:1.5 C:7"
},
"readerRank": 116,
"researchRank": 336,
"recommendedScore": 145.86
}
External Links
No external links
Backlinks (11)
| id | title | type | relationship |
|---|---|---|---|
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| defense-in-depth-model | Defense in Depth Model | analysis | — |
| coefficient-giving | Coefficient Giving | organization | — |
| manifold | Manifold (Prediction Market) | organization | — |
| manifund | Manifund | organization | — |
| mats | MATS ML Alignment Theory Scholars program | organization | — |
| voluntary-commitments | Voluntary Industry Commitments | policy | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| scheming | Scheming | risk | — |