Alignment Robustness Trajectory

alignment-robustness-trajectoryanalysisPath: /knowledge-base/models/alignment-robustness-trajectory/
E21Entity ID (EID)
← Back to page0 backlinksQuality: 64Updated: 2026-02-23
Page Recorddatabase.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "alignment-robustness-trajectory",
  "wikiId": "E21",
  "path": "/knowledge-base/models/alignment-robustness-trajectory/",
  "filePath": "knowledge-base/models/alignment-robustness-trajectory.mdx",
  "title": "Alignment Robustness Trajectory",
  "quality": 64,
  "readerImportance": 86.5,
  "researchImportance": 63.5,
  "tacticalValue": 62,
  "contentFormat": "article",
  "causalLevel": null,
  "lastUpdated": "2026-02-23",
  "dateCreated": "2026-02-15",
  "summary": "This model estimates alignment robustness degrades from 50-65% at GPT-4 level to 15-30% at 100x capability, with a critical 'alignment valley' at 10-30x where systems are dangerous but can't help solve alignment. Empirical evidence from jailbreak research (96-100% success rates with adaptive attacks), sleeper agent studies, and OOD robustness benchmarks grounds these estimates. Prioritizes scalable oversight, interpretability, and deception detection research deployable within 2-5 years before entering the critical zone.",
  "description": "This model analyzes how alignment robustness changes with capability scaling. It estimates current techniques maintain 50-65% robustness at GPT-4 level but projects degradation to 15-30% at 100x capability, with critical thresholds around 10x-30x current capability.",
  "ratings": {
    "focus": 8.5,
    "novelty": 6,
    "rigor": 6.5,
    "concreteness": 7.5,
    "actionability": 7
  },
  "category": "models",
  "subcategory": "safety-models",
  "clusters": [
    "ai-safety",
    "governance"
  ],
  "metrics": {
    "wordCount": 3197,
    "tableCount": 16,
    "diagramCount": 4,
    "internalLinks": 21,
    "externalLinks": 11,
    "footnoteCount": 17,
    "bulletRatio": 0.09,
    "sectionCount": 36,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 3197,
  "unconvertedLinks": [
    {
      "text": "Hubinger, Evan et al. \"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training\" (2024)",
      "url": "https://arxiv.org/abs/2401.05566",
      "resourceId": "e5c0904211c7d0cc",
      "resourceTitle": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training"
    },
    {
      "text": "Anthropic. \"Simple probes can catch sleeper agents\" (2024)",
      "url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
      "resourceId": "72c1254d07071bf7",
      "resourceTitle": "Anthropic's follow-up research on defection probes"
    },
    {
      "text": "Anthropic. \"Natural emergent misalignment from reward hacking\" (2025)",
      "url": "https://assets.anthropic.com/m/74342f2c96095771/original/Natural-emergent-misalignment-from-reward-hacking-paper.pdf",
      "resourceId": "b31b409bce6c24cb",
      "resourceTitle": "[PDF] Natural-emergent-misalignment-from-reward-hacking-paper.pdf"
    },
    {
      "text": "Andriushchenko et al. \"Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks\" (ICLR 2025)",
      "url": "https://arxiv.org/abs/2404.02151",
      "resourceId": "95354fcd3a9c2578",
      "resourceTitle": "Many-Shot Jailbreaking"
    },
    {
      "text": "Engels et al. \"Scaling Laws For Scalable Oversight\" (2025)",
      "url": "https://arxiv.org/abs/2504.18530",
      "resourceId": "48511d731320244b",
      "resourceTitle": "Scaling Laws For Scalable Oversight"
    },
    {
      "text": "Anthropic. \"Evaluating honesty and lie detection techniques\" (2025)",
      "url": "https://alignment.anthropic.com/2025/honesty-elicitation/",
      "resourceId": "d875cbfb1b50d2a2",
      "resourceTitle": "Evaluating Honesty and Lie Detection Techniques on a Diverse Suite of Dishonest Models"
    },
    {
      "text": "Weng, Lilian. \"Reward Hacking in Reinforcement Learning\" (2024)",
      "url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
      "resourceId": "570615e019d1cc74",
      "resourceTitle": "Reward Hacking in Reinforcement Learning"
    },
    {
      "text": "Anthropic Responsible Scaling Policy",
      "url": "https://www.anthropic.com/news/announcing-our-updated-responsible-scaling-policy",
      "resourceId": "d0ba81cc7a8fdb2b",
      "resourceTitle": "Anthropic: Announcing our updated Responsible Scaling Policy"
    },
    {
      "text": "Future of Life Institute AI Safety Index (2025)",
      "url": "https://futureoflife.org/ai-safety-index-summer-2025/",
      "resourceId": "df46edd6fa2078d1",
      "resourceTitle": "FLI AI Safety Index Summer 2025"
    },
    {
      "text": "Ngo, Richard et al. \"The Alignment Problem from a Deep Learning Perspective\" (2022)",
      "url": "https://arxiv.org/abs/2209.00626",
      "resourceId": "9124298fbb913c3d",
      "resourceTitle": "Gaming RLHF evaluation"
    }
  ],
  "unconvertedLinkCount": 10,
  "convertedLinkCount": 0,
  "backlinkCount": 0,
  "hallucinationRisk": {
    "level": "low",
    "score": 30,
    "factors": [
      "moderately-cited"
    ]
  },
  "entityType": "analysis",
  "redundancy": {
    "maxSimilarity": 17,
    "similarPages": [
      {
        "id": "accident-risks",
        "title": "AI Accident Risk Cruxes",
        "path": "/knowledge-base/cruxes/accident-risks/",
        "similarity": 17
      },
      {
        "id": "intervention-effectiveness-matrix",
        "title": "Intervention Effectiveness Matrix",
        "path": "/knowledge-base/models/intervention-effectiveness-matrix/",
        "similarity": 17
      },
      {
        "id": "technical-pathways",
        "title": "Technical Pathway Decomposition",
        "path": "/knowledge-base/models/technical-pathways/",
        "similarity": 17
      },
      {
        "id": "ai-control",
        "title": "AI Control",
        "path": "/knowledge-base/responses/ai-control/",
        "similarity": 17
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 17
      }
    ]
  },
  "changeHistory": [
    {
      "date": "2026-02-23",
      "branch": "claude/test-research-orchestrator-DUFts",
      "title": "Test Research Orchestrator (engine v2) on 3 alignment pages",
      "summary": "(fill in)"
    },
    {
      "date": "2026-02-23",
      "branch": "claude/test-research-orchestrator-DUFts",
      "title": "Orchestrator v2 (standard): Alignment Robustness Trajectory",
      "summary": "Improved \"Alignment Robustness Trajectory\" via orchestrator v2 (standard, 23 tool calls, 0 refinement cycles). Quality gate: passed. Cost: ~$6.45.",
      "duration": "528.0s",
      "cost": "~$6.45"
    }
  ],
  "coverage": {
    "passing": 9,
    "total": 13,
    "targets": {
      "tables": 13,
      "diagrams": 1,
      "internalLinks": 26,
      "externalLinks": 16,
      "footnotes": 10,
      "references": 10
    },
    "actuals": {
      "tables": 16,
      "diagrams": 4,
      "internalLinks": 21,
      "externalLinks": 11,
      "footnotes": 17,
      "references": 23,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "summary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "green",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "amber",
      "footnotes": "green",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "editHistoryCount": 2,
    "ratingsString": "N:6 R:6.5 A:7"
  },
  "readerRank": 39,
  "researchRank": 194,
  "recommendedScore": 184.68
}
External Links
No external links
Backlinks (0)
No backlinks