Evan Hubinger
evan-hubinger · person · Path: /knowledge-base/people/evan-hubinger/
E129 — Entity ID (EID)
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "evan-hubinger",
"numericId": null,
"path": "/knowledge-base/people/evan-hubinger/",
"filePath": "knowledge-base/people/evan-hubinger.mdx",
"title": "Evan Hubinger",
"quality": 43,
"readerImportance": 76,
"researchImportance": 41,
"tacticalValue": 72,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive biography of Evan Hubinger documenting his influential theoretical work on mesa-optimization/deceptive alignment (2019, 205+ citations) and empirical demonstrations at Anthropic showing deceptive behaviors persist through safety training (sleeper agents) and can emerge spontaneously (alignment faking at 12-78% rates). While thorough as reference material, provides limited actionable guidance for prioritization decisions beyond highlighting inner alignment as a key challenge.",
"description": "Head of Alignment Stress-Testing at Anthropic, creator of the mesa-optimization framework, and author of foundational research on deceptive alignment, sleeper agents, and alignment faking. Pioneer of the \"model organisms of misalignment\" research paradigm.",
"ratings": {
"novelty": 2,
"rigor": 4.5,
"actionability": 1.5,
"completeness": 7
},
"category": "people",
"subcategory": "safety-researchers",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 4352,
"tableCount": 38,
"diagramCount": 1,
"internalLinks": 15,
"externalLinks": 26,
"footnoteCount": 0,
"bulletRatio": 0.03,
"sectionCount": 50,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": null,
"evergreen": true,
"wordCount": 4352,
"unconvertedLinks": [
{
"text": "Risks from Learned Optimization",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Sleeper Agents",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Alignment Faking in Large Language Models",
"url": "https://arxiv.org/abs/2412.14093",
"resourceId": "19a35a5cec9d9b80",
"resourceTitle": "Anthropic Alignment Faking (2024)"
},
{
"text": "Risks from Learned Optimization in Advanced Machine Learning Systems",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Alignment Faking in Large Language Models",
"url": "https://arxiv.org/abs/2412.14093",
"resourceId": "19a35a5cec9d9b80",
"resourceTitle": "Anthropic Alignment Faking (2024)"
},
{
"text": "Simple probes can catch sleeper agents",
"url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
"resourceId": "72c1254d07071bf7",
"resourceTitle": "Anthropic's follow-up research on defection probes"
},
{
"text": "Alignment Faking Mitigations",
"url": "https://alignment.anthropic.com/2025/alignment-faking-mitigations/",
"resourceId": "b04b9022f4d7e470",
"resourceTitle": "Alignment Faking Mitigations - Anthropic"
},
{
"text": "AXRP Episode 39",
"url": "https://axrp.net/episode/2024/12/01/episode-39-evan-hubinger-model-organisms-misalignment.html",
"resourceId": "ab988e5f8101dd4a",
"resourceTitle": "AXRP Episode 39 - Evan Hubinger on Model Organisms of Misalignment"
},
{
"text": "Risks from Learned Optimization",
"url": "https://arxiv.org/abs/1906.01820",
"resourceId": "c4858d4ef280d8e6",
"resourceTitle": "Risks from Learned Optimization"
},
{
"text": "Sleeper Agents Paper",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Alignment Faking Paper",
"url": "https://arxiv.org/abs/2412.14093",
"resourceId": "19a35a5cec9d9b80",
"resourceTitle": "Anthropic Alignment Faking (2024)"
}
],
"unconvertedLinkCount": 12,
"convertedLinkCount": 0,
"backlinkCount": 11,
"hallucinationRisk": {
"level": "high",
"score": 75,
"factors": [
"biographical-claims",
"no-citations"
]
},
"entityType": "person",
"redundancy": {
"maxSimilarity": 18,
"similarPages": [
{
"id": "sleeper-agent-detection",
"title": "Sleeper Agent Detection",
"path": "/knowledge-base/responses/sleeper-agent-detection/",
"similarity": 18
},
{
"id": "mesa-optimization",
"title": "Mesa-Optimization",
"path": "/knowledge-base/risks/mesa-optimization/",
"similarity": 18
},
{
"id": "scheming",
"title": "Scheming",
"path": "/knowledge-base/risks/scheming/",
"similarity": 18
},
{
"id": "sleeper-agents",
"title": "Sleeper Agents: Training Deceptive LLMs",
"path": "/knowledge-base/risks/sleeper-agents/",
"similarity": 17
},
{
"id": "goal-misgeneralization",
"title": "Goal Misgeneralization",
"path": "/knowledge-base/risks/goal-misgeneralization/",
"similarity": 16
}
]
},
"coverage": {
"passing": 5,
"total": 13,
"targets": {
"tables": 17,
"diagrams": 2,
"internalLinks": 35,
"externalLinks": 22,
"footnotes": 13,
"references": 13
},
"actuals": {
"tables": 38,
"diagrams": 1,
"internalLinks": 15,
"externalLinks": 26,
"footnotes": 0,
"references": 6,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "red",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "amber",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:2 R:4.5 A:1.5 C:7"
},
"readerRank": 116,
"researchRank": 336,
"recommendedScore": 145.86
}
External Links
No external links
Backlinks (11)
| id | title | type | relationship |
|---|---|---|---|
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| defense-in-depth-model | Defense in Depth Model | analysis | — |
| coefficient-giving | Coefficient Giving | organization | — |
| manifold | Manifold (Prediction Market) | organization | — |
| manifund | Manifund | organization | — |
| mats | MATS ML Alignment Theory Scholars program | organization | — |
| voluntary-commitments | Voluntary Industry Commitments | policy | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| scheming | Scheming | risk | — |