Treacherous Turn

treacherous-turnriskPath: /knowledge-base/risks/treacherous-turn/

E359Entity ID (EID)

← Back to page11 backlinksQuality: 67Updated: 2026-01-29

Page Recorddatabase.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time

{
  "id": "treacherous-turn",
  "wikiId": "E359",
  "path": "/knowledge-base/risks/treacherous-turn/",
  "filePath": "knowledge-base/risks/treacherous-turn.mdx",
  "title": "Treacherous Turn",
  "quality": 67,
  "readerImportance": 17,
  "researchImportance": 82.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": "pathway",
  "lastUpdated": "2026-01-29",
  "dateCreated": "2026-02-15",
  "summary": "Comprehensive analysis of treacherous turn risk where AI systems strategically cooperate while weak then defect when powerful. Recent empirical evidence (2024-2025) shows frontier models exhibit scheming in 8-13% of scenarios, though deliberative alignment reduces this ~30x to 0.3-0.4%; detection methods achieve >99% AUROC on known patterns but generalization remains unproven.",
  "description": "A foundational AI risk scenario where an AI system strategically cooperates while weak, then suddenly defects once powerful enough to succeed against human opposition.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 7.5,
    "completeness": 8,
    "actionability": 6
  },
  "category": "risks",
  "subcategory": "accident",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 4013,
    "tableCount": 10,
    "diagramCount": 2,
    "internalLinks": 28,
    "externalLinks": 32,
    "footnoteCount": 0,
    "bulletRatio": 0,
    "sectionCount": 19,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 4013,
  "unconvertedLinks": [
    {
      "text": "Bostrom (2014)",
      "url": "https://en.wikipedia.org/wiki/Superintelligence:_Paths,_Dangers,_Strategies",
      "resourceId": "0151481d5dc82963",
      "resourceTitle": "Superintelligence: Paths, Dangers, Strategies - Wikipedia"
    },
    {
      "text": "Anthropic sleeper agents (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Apollo Research (2024)",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "OpenAI deliberative alignment (2025)",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "International AI Safety Report 2025",
      "url": "https://www.gov.uk/government/publications/international-ai-safety-report-2025",
      "resourceId": "181a6c57dd4cbc02",
      "resourceTitle": "International AI Safety Report"
    },
    {
      "text": "Apollo Research found",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Sleeper Agents",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Alignment Faking",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "In-Context Scheming",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Defection Probes",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "In-Context Scheming (Apollo)",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Anti-Scheming Training (OpenAI)",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Hubinger et al. (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "A June 2025 study",
      "url": "https://time.com/7202312/new-tests-reveal-ai-capacity-for-deception/",
      "resourceId": "1d03d6cd9dde0075",
      "resourceTitle": "New Tests Reveal AI's Capacity for Deception"
    },
    {
      "text": "OpenAI's 2025 research on anti-scheming training",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "International AI Safety Report 2025",
      "url": "https://www.gov.uk/government/publications/international-ai-safety-report-2025",
      "resourceId": "181a6c57dd4cbc02",
      "resourceTitle": "International AI Safety Report"
    },
    {
      "text": "Apollo Research (2024)",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "OpenAI (2025)",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Anthropic (2024)",
      "url": "https://www.anthropic.com/research/alignment-faking",
      "resourceId": "c2cfd72baafd64a9",
      "resourceTitle": "Anthropic's 2024 alignment faking study"
    },
    {
      "text": "Anthropic Safety Report (2025)",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "Apollo Research found",
      "url": "https://www.apolloresearch.ai/blog/more-capable-models-are-better-at-in-context-scheming/",
      "resourceId": "80c6d6eca17dc925",
      "resourceTitle": "More capable models scheme at higher rates"
    },
    {
      "text": "Hubinger et al. (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Apollo Research found",
      "url": "https://www.apolloresearch.ai/blog/more-capable-models-are-better-at-in-context-scheming/",
      "resourceId": "80c6d6eca17dc925",
      "resourceTitle": "More capable models scheme at higher rates"
    },
    {
      "text": "Analysis of model chains-of-thought",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    }
  ],
  "unconvertedLinkCount": 24,
  "convertedLinkCount": 20,
  "backlinkCount": 11,
  "hallucinationRisk": {
    "level": "medium",
    "score": 40,
    "factors": [
      "no-citations",
      "high-rigor"
    ]
  },
  "entityType": "risk",
  "redundancy": {
    "maxSimilarity": 24,
    "similarPages": [
      {
        "id": "instrumental-convergence",
        "title": "Instrumental Convergence",
        "path": "/knowledge-base/risks/instrumental-convergence/",
        "similarity": 24
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 24
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 22
      },
      {
        "id": "reasoning",
        "title": "Reasoning and Planning",
        "path": "/knowledge-base/capabilities/reasoning/",
        "similarity": 21
      },
      {
        "id": "situational-awareness",
        "title": "Situational Awareness",
        "path": "/knowledge-base/capabilities/situational-awareness/",
        "similarity": 21
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 16,
      "diagrams": 2,
      "internalLinks": 32,
      "externalLinks": 20,
      "footnotes": 12,
      "references": 12
    },
    "actuals": {
      "tables": 10,
      "diagrams": 2,
      "internalLinks": 28,
      "externalLinks": 32,
      "footnotes": 0,
      "references": 16,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.5 R:7.5 A:6 C:8"
  },
  "readerRank": 540,
  "researchRank": 75,
  "recommendedScore": 154.13
}

External Links

{
  "lesswrong": "https://www.lesswrong.com/tag/treacherous-turn",
  "stampy": "https://aisafety.info/questions/6396/What-is-the-treacherous-turn"
}

Backlinks (11)

id	title	type	relationship
nick-bostrom	Nick Bostrom	person	—
rogue-ai-scenarios	Rogue AI Scenarios	risk	—
agentic-ai	Agentic AI	capability	—
why-alignment-hard	Why Alignment Might Be Hard	argument	—
miri-era	The MIRI Era (2000-2015)	historical	—
intervention-effectiveness-matrix	Intervention Effectiveness Matrix	analysis	—
eliezer-yudkowsky	Eliezer Yudkowsky	person	—
alignment	AI Alignment	approach	—
evaluation	AI Evaluation	approach	—
accident-overview	Accident Risks (Overview)	concept	—
deceptive-alignment	Deceptive Alignment	risk	—