Longterm Wiki

Evan Hubinger

evan-hubinger · person · Path: /knowledge-base/people/evan-hubinger/
E129 — Entity ID (EID)
← Back to page · 11 backlinks · Quality: 43 · Updated: 2026-03-13
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "evan-hubinger",
  "numericId": null,
  "path": "/knowledge-base/people/evan-hubinger/",
  "filePath": "knowledge-base/people/evan-hubinger.mdx",
  "title": "Evan Hubinger",
  "quality": 43,
  "readerImportance": 76,
  "researchImportance": 41,
  "tacticalValue": 72,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Comprehensive biography of Evan Hubinger documenting his influential theoretical work on mesa-optimization/deceptive alignment (2019, 205+ citations) and empirical demonstrations at Anthropic showing deceptive behaviors persist through safety training (sleeper agents) and can emerge spontaneously (alignment faking at 12-78% rates). While thorough as reference material, provides limited actionable guidance for prioritization decisions beyond highlighting inner alignment as a key challenge.",
  "description": "Head of Alignment Stress-Testing at Anthropic, creator of the mesa-optimization framework, and author of foundational research on deceptive alignment, sleeper agents, and alignment faking. Pioneer of the \"model organisms of misalignment\" research paradigm.",
  "ratings": {
    "novelty": 2,
    "rigor": 4.5,
    "actionability": 1.5,
    "completeness": 7
  },
  "category": "people",
  "subcategory": "safety-researchers",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 4352,
    "tableCount": 38,
    "diagramCount": 1,
    "internalLinks": 15,
    "externalLinks": 26,
    "footnoteCount": 0,
    "bulletRatio": 0.03,
    "sectionCount": 50,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": null,
  "evergreen": true,
  "wordCount": 4352,
  "unconvertedLinks": [
    {
      "text": "Risks from Learned Optimization",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "Sleeper Agents",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc"
    },
    {
      "text": "Alignment Faking in Large Language Models",
      "url": "https://arxiv.org/abs/2412.14093",
      "resourceId": "19a35a5cec9d9b80",
      "resourceTitle": "Anthropic Alignment Faking (2024)"
    },
    {
      "text": "Risks from Learned Optimization in Advanced Machine Learning Systems",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc"
    },
    {
      "text": "Alignment Faking in Large Language Models",
      "url": "https://arxiv.org/abs/2412.14093",
      "resourceId": "19a35a5cec9d9b80",
      "resourceTitle": "Anthropic Alignment Faking (2024)"
    },
    {
      "text": "Simple probes can catch sleeper agents",
      "url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
      "resourceId": "72c1254d07071bf7",
      "resourceTitle": "Anthropic's follow-up research on defection probes"
    },
    {
      "text": "Alignment Faking Mitigations",
      "url": "https://alignment.anthropic.com/2025/alignment-faking-mitigations/",
      "resourceId": "b04b9022f4d7e470",
      "resourceTitle": "Alignment Faking Mitigations - Anthropic"
    },
    {
      "text": "AXRP Episode 39",
      "url": "https://axrp.net/episode/2024/12/01/episode-39-evan-hubinger-model-organisms-misalignment.html",
      "resourceId": "ab988e5f8101dd4a",
      "resourceTitle": "AXRP Episode 39 - Evan Hubinger on Model Organisms of Misalignment"
    },
    {
      "text": "Risks from Learned Optimization",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "Sleeper Agents Paper",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc"
    },
    {
      "text": "Alignment Faking Paper",
      "url": "https://arxiv.org/abs/2412.14093",
      "resourceId": "19a35a5cec9d9b80",
      "resourceTitle": "Anthropic Alignment Faking (2024)"
    }
  ],
  "unconvertedLinkCount": 12,
  "convertedLinkCount": 0,
  "backlinkCount": 11,
  "hallucinationRisk": {
    "level": "high",
    "score": 75,
    "factors": [
      "biographical-claims",
      "no-citations"
    ]
  },
  "entityType": "person",
  "redundancy": {
    "maxSimilarity": 18,
    "similarPages": [
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 18
      },
      {
        "id": "mesa-optimization",
        "title": "Mesa-Optimization",
        "path": "/knowledge-base/risks/mesa-optimization/",
        "similarity": 18
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 18
      },
      {
        "id": "sleeper-agents",
        "title": "Sleeper Agents: Training Deceptive LLMs",
        "path": "/knowledge-base/risks/sleeper-agents/",
        "similarity": 17
      },
      {
        "id": "goal-misgeneralization",
        "title": "Goal Misgeneralization",
        "path": "/knowledge-base/risks/goal-misgeneralization/",
        "similarity": 16
      }
    ]
  },
  "coverage": {
    "passing": 5,
    "total": 13,
    "targets": {
      "tables": 17,
      "diagrams": 2,
      "internalLinks": 35,
      "externalLinks": 22,
      "footnotes": 13,
      "references": 13
    },
    "actuals": {
      "tables": 38,
      "diagrams": 1,
      "internalLinks": 15,
      "externalLinks": 26,
      "footnotes": 0,
      "references": 6,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "red",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "amber",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:2 R:4.5 A:1.5 C:7"
  },
  "readerRank": 116,
  "researchRank": 336,
  "recommendedScore": 145.86
}
External Links

No external links

Backlinks (11)
| id | title | type | relationship |
| --- | --- | --- | --- |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | |
| accident-risks | AI Accident Risk Cruxes | crux | |
| defense-in-depth-model | Defense in Depth Model | analysis | |
| coefficient-giving | Coefficient Giving | organization | |
| manifold | Manifold (Prediction Market) | organization | |
| manifund | Manifund | organization | |
| mats | MATS ML Alignment Theory Scholars program | organization | |
| voluntary-commitments | Voluntary Industry Commitments | policy | |
| mesa-optimization | Mesa-Optimization | risk | |
| scheming | Scheming | risk | |
Longterm Wiki