Sleeper Agents: Training Deceptive LLMs

sleeper-agentsriskPath: /knowledge-base/risks/sleeper-agents/
E489Entity ID (EID)
← Back to page2 backlinksQuality: 78Updated: 2026-02-01
Page Recorddatabase.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "sleeper-agents",
  "wikiId": "E489",
  "path": "/knowledge-base/risks/sleeper-agents/",
  "filePath": "knowledge-base/risks/sleeper-agents.mdx",
  "title": "Sleeper Agents: Training Deceptive LLMs",
  "quality": 78,
  "readerImportance": 16.5,
  "researchImportance": 84,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": null,
  "lastUpdated": "2026-02-01",
  "dateCreated": "2026-02-15",
  "summary": "Anthropic's 2024 sleeper agents research demonstrates that deceptive AI behavior, once present, persists through standard safety training and can even be strengthened by adversarial training attempts. While the deception was artificially trained rather than naturally emergent, the work provides crucial empirical evidence about the difficulty of detecting and removing misaligned behavior in large language models.",
  "description": "Anthropic's 2024 research demonstrating that large language models can be trained to exhibit persistent deceptive behavior that survives standard safety training techniques.",
  "ratings": {
    "novelty": 8,
    "rigor": 8,
    "completeness": 9,
    "actionability": 6
  },
  "category": "risks",
  "subcategory": "accident",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 1813,
    "tableCount": 2,
    "diagramCount": 0,
    "internalLinks": 11,
    "externalLinks": 4,
    "footnoteCount": 13,
    "bulletRatio": 0.09,
    "sectionCount": 27,
    "hasOverview": true,
    "structuralScore": 14
  },
  "suggestedQuality": 93,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 1813,
  "unconvertedLinks": [
    {
      "text": "lesswrong.com",
      "url": "https://www.lesswrong.com/posts/ZAsJv7xijKTfZkMtr/sleeper-agents-training-deceptive-llms-that-persist-through",
      "resourceId": "f2b95045e2f55918",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "arxiv.org",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    }
  ],
  "unconvertedLinkCount": 2,
  "convertedLinkCount": 0,
  "backlinkCount": 2,
  "citationHealth": {
    "total": 13,
    "withQuotes": 12,
    "verified": 12,
    "accuracyChecked": 12,
    "accurate": 11,
    "inaccurate": 0,
    "avgScore": 0.9916666646798452
  },
  "hallucinationRisk": {
    "level": "low",
    "score": 15,
    "factors": [
      "moderately-cited",
      "high-rigor"
    ]
  },
  "entityType": "risk",
  "redundancy": {
    "maxSimilarity": 20,
    "similarPages": [
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 20
      },
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 19
      },
      {
        "id": "treacherous-turn",
        "title": "Treacherous Turn",
        "path": "/knowledge-base/risks/treacherous-turn/",
        "similarity": 19
      },
      {
        "id": "scheming-detection",
        "title": "Scheming & Deception Detection",
        "path": "/knowledge-base/responses/scheming-detection/",
        "similarity": 18
      },
      {
        "id": "goal-misgeneralization",
        "title": "Goal Misgeneralization",
        "path": "/knowledge-base/risks/goal-misgeneralization/",
        "similarity": 18
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 7,
      "diagrams": 1,
      "internalLinks": 15,
      "externalLinks": 9,
      "footnotes": 5,
      "references": 5
    },
    "actuals": {
      "tables": 2,
      "diagrams": 0,
      "internalLinks": 11,
      "externalLinks": 4,
      "footnotes": 13,
      "references": 2,
      "quotesWithQuotes": 12,
      "quotesTotal": 13,
      "accuracyChecked": 12,
      "accuracyTotal": 13
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "red",
      "internalLinks": "amber",
      "externalLinks": "amber",
      "footnotes": "green",
      "references": "amber",
      "quotes": "green",
      "accuracy": "green"
    },
    "ratingsString": "N:8 R:8 A:6 C:9"
  },
  "readerRank": 542,
  "researchRank": 65,
  "recommendedScore": 175.61
}
External Links
No external links
Backlinks (2)
id	title	type	relationship
anthropic	Anthropic	organization	research
why-alignment-hard	Why Alignment Might Be Hard	argument	—