Scheming & Deception Detection

scheming-detectionapproachPath: /knowledge-base/responses/scheming-detection/
E441Entity ID (EID)
← Back to page2 backlinksQuality: 91Updated: 2026-01-30
Page Recorddatabase.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "scheming-detection",
  "wikiId": "E441",
  "path": "/knowledge-base/responses/scheming-detection/",
  "filePath": "knowledge-base/responses/scheming-detection.mdx",
  "title": "Scheming & Deception Detection",
  "quality": 91,
  "readerImportance": 57.5,
  "researchImportance": 31.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "causalLevel": null,
  "lastUpdated": "2026-01-30",
  "dateCreated": "2026-02-15",
  "summary": "Reviews empirical evidence that frontier models (o1, Claude 3.5, Gemini 1.5) exhibit in-context scheming capabilities at rates of 0.3-13%, including disabling oversight and self-exfiltration attempts. Presents detection approaches (behavioral tests, chain-of-thought monitoring, internal probes) and mitigation strategies, finding deliberative alignment reduces scheming 97% but doesn't eliminate capability.",
  "description": "Research and evaluation methods for identifying when AI models engage in strategic deception—pretending to be aligned while secretly pursuing other goals—including behavioral tests, internal monitoring, and emerging detection techniques.",
  "ratings": {
    "novelty": 5,
    "rigor": 7,
    "completeness": 7.5,
    "actionability": 6.5
  },
  "category": "responses",
  "subcategory": "alignment-evaluation",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 3309,
    "tableCount": 19,
    "diagramCount": 3,
    "internalLinks": 12,
    "externalLinks": 35,
    "footnoteCount": 0,
    "bulletRatio": 0.12,
    "sectionCount": 33,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 3309,
  "unconvertedLinks": [
    {
      "text": "Apollo Research published findings",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Apollo Research notes",
      "url": "https://www.apolloresearch.ai/blog/more-capable-models-are-better-at-in-context-scheming/",
      "resourceId": "80c6d6eca17dc925",
      "resourceTitle": "More capable models scheme at higher rates"
    },
    {
      "text": "OpenAI, 2025",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Apollo Research",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Anthropic, 2024",
      "url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
      "resourceId": "72c1254d07071bf7",
      "resourceTitle": "Anthropic's follow-up research on defection probes"
    },
    {
      "text": "Anthropic's \"Sleeper Agents\" research",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "December 2024",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Anthropic, 2024",
      "url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
      "resourceId": "72c1254d07071bf7",
      "resourceTitle": "Anthropic's follow-up research on defection probes"
    },
    {
      "text": "March 2025",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "OpenAI reported",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "AI control research agenda",
      "url": "https://arxiv.org/abs/2312.06942",
      "resourceId": "187aaa26886ce183",
      "resourceTitle": "AI Control Framework"
    },
    {
      "text": "Apollo Research",
      "url": "https://www.apolloresearch.ai/research/",
      "resourceId": "560dff85b3305858",
      "resourceTitle": "Apollo Research — Research Overview"
    },
    {
      "text": "Anthropic",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "OpenAI",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "METR",
      "url": "https://metr.org/",
      "resourceId": "45370a5153534152",
      "resourceTitle": "METR: Model Evaluation and Threat Research"
    },
    {
      "text": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Simple probes can catch sleeper agents",
      "url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
      "resourceId": "72c1254d07071bf7",
      "resourceTitle": "Anthropic's follow-up research on defection probes"
    },
    {
      "text": "Detecting and Reducing Scheming in AI Models",
      "url": "https://openai.com/index/detecting-and-reducing-scheming-in-ai-models/",
      "resourceId": "b3f335edccfc5333",
      "resourceTitle": "OpenAI Preparedness Framework"
    },
    {
      "text": "Risks from Learned Optimization",
      "url": "https://arxiv.org/abs/1906.01820",
      "resourceId": "c4858d4ef280d8e6",
      "resourceTitle": "Risks from Learned Optimization"
    },
    {
      "text": "The Alignment Problem from a Deep Learning Perspective",
      "url": "https://arxiv.org/abs/2209.00626",
      "resourceId": "9124298fbb913c3d",
      "resourceTitle": "Gaming RLHF evaluation"
    },
    {
      "text": "Apollo Research Scheming Evaluations",
      "url": "https://www.apolloresearch.ai/research/scheming-reasoning-evaluations",
      "resourceId": "91737bf431000298",
      "resourceTitle": "Frontier Models are Capable of In-Context Scheming"
    },
    {
      "text": "Deliberative Alignment",
      "url": "https://openai.com/index/deliberative-alignment/",
      "resourceId": "ee7628aa3f6282e5",
      "resourceTitle": "Deliberative alignment: reasoning enables safer language models"
    },
    {
      "text": "AI Control: Improving Safety Despite Intentional Subversion",
      "url": "https://arxiv.org/abs/2312.06942",
      "resourceId": "187aaa26886ce183",
      "resourceTitle": "AI Control Framework"
    },
    {
      "text": "More Capable Models Are Better At In-Context Scheming",
      "url": "https://www.apolloresearch.ai/blog/more-capable-models-are-better-at-in-context-scheming/",
      "resourceId": "80c6d6eca17dc925",
      "resourceTitle": "More capable models scheme at higher rates"
    }
  ],
  "unconvertedLinkCount": 24,
  "convertedLinkCount": 0,
  "backlinkCount": 2,
  "hallucinationRisk": {
    "level": "low",
    "score": 25,
    "factors": [
      "no-citations",
      "high-rigor",
      "conceptual-content",
      "high-quality"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 22,
    "similarPages": [
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 22
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 22
      },
      {
        "id": "mesa-optimization",
        "title": "Mesa-Optimization",
        "path": "/knowledge-base/risks/mesa-optimization/",
        "similarity": 20
      },
      {
        "id": "situational-awareness",
        "title": "Situational Awareness",
        "path": "/knowledge-base/capabilities/situational-awareness/",
        "similarity": 19
      },
      {
        "id": "alignment-evals",
        "title": "Alignment Evaluations",
        "path": "/knowledge-base/responses/alignment-evals/",
        "similarity": 19
      }
    ]
  },
  "coverage": {
    "passing": 8,
    "total": 13,
    "targets": {
      "tables": 13,
      "diagrams": 1,
      "internalLinks": 26,
      "externalLinks": 17,
      "footnotes": 10,
      "references": 10
    },
    "actuals": {
      "tables": 19,
      "diagrams": 3,
      "internalLinks": 12,
      "externalLinks": 35,
      "footnotes": 0,
      "references": 12,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:5 R:7 A:6.5 C:7.5"
  },
  "readerRank": 249,
  "researchRank": 403,
  "recommendedScore": 222.46
}
External Links
No external links
Backlinks (2)
id	title	type	relationship
alignment-evaluation-overview	Evaluation & Detection (Overview)	concept	—
scalable-eval-approaches	Scalable Eval Approaches	approach	—