Longterm Wiki

Sleeper Agent Detection

sleeper-agent-detection · approach
Path: /knowledge-base/responses/sleeper-agent-detection/
Entity ID (EID): E445
1 backlink · Quality: 66 · Updated: 2026-03-13
Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "sleeper-agent-detection",
  "numericId": null,
  "path": "/knowledge-base/responses/sleeper-agent-detection/",
  "filePath": "knowledge-base/responses/sleeper-agent-detection.mdx",
  "title": "Sleeper Agent Detection",
  "quality": 66,
  "readerImportance": 51,
  "researchImportance": 30,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Comprehensive survey of sleeper agent detection methods finding current approaches achieve only 5-40% success rates despite \\$15-35M annual investment, with Anthropic's 2024 research showing backdoors persist through safety training in 95%+ of cases and larger models exhibiting 15-25% greater deception persistence. Analysis recommends 3-5x funding increase to \\$50-100M/year across interpretability (targeting 40-60% detection), theoretical limits research, and AI control protocols as backup.",
  "description": "Methods to detect AI models that behave safely during training and evaluation but defect under specific deployment conditions, addressing the core threat of deceptive alignment through behavioral testing, interpretability, and monitoring approaches.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 7,
    "actionability": 6.5,
    "completeness": 7.5
  },
  "category": "responses",
  "subcategory": "alignment-evaluation",
  "clusters": [
    "ai-safety",
    "governance"
  ],
  "metrics": {
    "wordCount": 4344,
    "tableCount": 23,
    "diagramCount": 4,
    "internalLinks": 6,
    "externalLinks": 54,
    "footnoteCount": 0,
    "bulletRatio": 0.09,
    "sectionCount": 36,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 4344,
  "unconvertedLinks": [
    {
      "text": "Anthropic Interpretability",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "Scaling Monosemanticity",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "Sleeper Agents",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc"
    },
    {
      "text": "Redwood Research",
      "url": "https://www.redwoodresearch.org/",
      "resourceId": "42e7247cbc33fc4c",
      "resourceTitle": "Redwood Research: AI Control"
    },
    {
      "text": "Alignment Faking",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "Apollo Research",
      "url": "https://www.apolloresearch.ai/",
      "resourceId": "329d8c2e2532be3d",
      "resourceTitle": "Apollo Research"
    },
    {
      "text": "ARC (Alignment Research Center)",
      "url": "https://www.alignment.org/",
      "resourceId": "0562f8c207d8b63f",
      "resourceTitle": "alignment.org"
    },
    {
      "text": "Eliciting Latent Knowledge",
      "url": "https://www.alignment.org/blog/arcs-first-technical-report-eliciting-latent-knowledge/",
      "resourceId": "5efa917a52b443a1",
      "resourceTitle": "ARC's first technical report: Eliciting Latent Knowledge"
    },
    {
      "text": "OpenAI Superalignment",
      "url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
      "resourceId": "f7b06d857b564d78",
      "resourceTitle": "Extracting Concepts from GPT-4"
    },
    {
      "text": "Anthropic's January 2024 paper",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc"
    },
    {
      "text": "Hubinger et al.'s \"Risks from Learned Optimization\" (2019)",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "Anthropic SAE research",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "Anthropic's foundational empirical study",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc"
    },
    {
      "text": "Follow-up research in 2025",
      "url": "https://www.apolloresearch.ai/blog/more-capable-models-are-better-at-in-context-scheming/",
      "resourceId": "80c6d6eca17dc925",
      "resourceTitle": "More capable models scheme at higher rates"
    },
    {
      "text": "Anthropic's Scaling Monosemanticity research",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "OpenAI's June 2024 paper",
      "url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
      "resourceId": "f7b06d857b564d78",
      "resourceTitle": "Extracting Concepts from GPT-4"
    },
    {
      "text": "Anthropic and Redwood Research",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "Theoretical framework established",
      "url": "https://www.alignment.org/blog/arcs-first-technical-report-eliciting-latent-knowledge/",
      "resourceId": "5efa917a52b443a1",
      "resourceTitle": "ARC's first technical report: Eliciting Latent Knowledge"
    },
    {
      "text": "ARC's Eliciting Latent Knowledge research",
      "url": "https://www.alignment.org/blog/arcs-first-technical-report-eliciting-latent-knowledge/",
      "resourceId": "5efa917a52b443a1",
      "resourceTitle": "ARC's first technical report: Eliciting Latent Knowledge"
    },
    {
      "text": "Anthropic (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc"
    },
    {
      "text": "Anthropic & Redwood Research (2024)",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "Hubinger et al. (2019)",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "Anthropic Scaling Monosemanticity (2024)",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "ARC Eliciting Latent Knowledge (2022)",
      "url": "https://www.alignment.org/blog/arcs-first-technical-report-eliciting-latent-knowledge/",
      "resourceId": "5efa917a52b443a1",
      "resourceTitle": "ARC's first technical report: Eliciting Latent Knowledge"
    },
    {
      "text": "OpenAI (2024)",
      "url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
      "resourceId": "f7b06d857b564d78",
      "resourceTitle": "Extracting Concepts from GPT-4"
    },
    {
      "text": "Anthropic",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "Redwood Research",
      "url": "https://www.redwoodresearch.org/",
      "resourceId": "42e7247cbc33fc4c",
      "resourceTitle": "Redwood Research: AI Control"
    },
    {
      "text": "Apollo Research",
      "url": "https://www.apolloresearch.ai/research",
      "resourceId": "560dff85b3305858",
      "resourceTitle": "Apollo Research"
    },
    {
      "text": "ARC (Alignment Research Center)",
      "url": "https://www.alignment.org/",
      "resourceId": "0562f8c207d8b63f",
      "resourceTitle": "alignment.org"
    },
    {
      "text": "MIRI",
      "url": "https://intelligence.org/learned-optimization/",
      "resourceId": "e573623625e9d5d2",
      "resourceTitle": "MIRI"
    }
  ],
  "unconvertedLinkCount": 30,
  "convertedLinkCount": 0,
  "backlinkCount": 1,
  "hallucinationRisk": {
    "level": "low",
    "score": 30,
    "factors": [
      "no-citations",
      "high-rigor",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 22,
    "similarPages": [
      {
        "id": "scheming-detection",
        "title": "Scheming & Deception Detection",
        "path": "/knowledge-base/responses/scheming-detection/",
        "similarity": 22
      },
      {
        "id": "mesa-optimization",
        "title": "Mesa-Optimization",
        "path": "/knowledge-base/risks/mesa-optimization/",
        "similarity": 21
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 21
      },
      {
        "id": "treacherous-turn",
        "title": "Treacherous Turn",
        "path": "/knowledge-base/risks/treacherous-turn/",
        "similarity": 21
      },
      {
        "id": "accident-risks",
        "title": "AI Accident Risk Cruxes",
        "path": "/knowledge-base/cruxes/accident-risks/",
        "similarity": 20
      }
    ]
  },
  "coverage": {
    "passing": 8,
    "total": 13,
    "targets": {
      "tables": 17,
      "diagrams": 2,
      "internalLinks": 35,
      "externalLinks": 22,
      "footnotes": 13,
      "references": 13
    },
    "actuals": {
      "tables": 23,
      "diagrams": 4,
      "internalLinks": 6,
      "externalLinks": 54,
      "footnotes": 0,
      "references": 13,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.5 R:7 A:6.5 C:7.5"
  },
  "readerRank": 297,
  "researchRank": 418,
  "recommendedScore": 179.33
}
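
The "merged from MDX frontmatter + Entity YAML + computed metrics at build time" note above can be read as a simple object merge, and the coverage traffic lights as a count-vs-target check. A minimal TypeScript sketch under those assumptions — the helper names, the key-collision precedence, and the amber rule are illustrative guesses, not the wiki's actual code:

```typescript
type Json = Record<string, unknown>;

interface PageRecord extends Json {
  id: string;
  path: string;
  title: string;
}

// Assumed precedence on key collisions: computed metrics override
// Entity YAML fields, which override raw MDX frontmatter.
function buildPageRecord(frontmatter: Json, entityYaml: Json, computed: Json): PageRecord {
  return { ...frontmatter, ...entityYaml, ...computed } as PageRecord;
}

// Assumed derivation of the coverage "green"/"amber"/"red" items for the
// count-based targets: green when the actual count meets the target, amber
// when it is nonzero but short, red when it is zero. This reproduces the
// record's own cases: tables 23/17 -> green, internalLinks 6/35 -> amber,
// footnotes 0/13 -> red.
function coverageStatus(actual: number, target: number): "green" | "amber" | "red" {
  if (actual >= target) return "green";
  if (actual > 0) return "amber";
  return "red";
}
```

The spread-based merge means later sources silently win ties, which would explain why build-time computed values (word counts, link counts) always reflect the current MDX body rather than any stale frontmatter copy; again, this is an inference from the record's shape, not documented behavior.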
External Links

No external links

Backlinks (1)
id                              title                                 type      relationship
alignment-evaluation-overview   Evaluation & Detection (Overview)     concept