Alignment Robustness Trajectory
ID: alignment-robustness-trajectory · Type: analysis
Path: /knowledge-base/models/alignment-robustness-trajectory/
Entity ID (EID): E21
Page Record (database.json): merged from MDX frontmatter + Entity YAML + computed metrics at build time (see the merge sketch after the record)
{
"id": "alignment-robustness-trajectory",
"numericId": null,
"path": "/knowledge-base/models/alignment-robustness-trajectory/",
"filePath": "knowledge-base/models/alignment-robustness-trajectory.mdx",
"title": "Alignment Robustness Trajectory",
"quality": 64,
"readerImportance": 86.5,
"researchImportance": 63.5,
"tacticalValue": 62,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "This model estimates alignment robustness degrades from 50-65% at GPT-4 level to 15-30% at 100x capability, with a critical 'alignment valley' at 10-30x where systems are dangerous but can't help solve alignment. Empirical evidence from jailbreak research (96-100% success rates with adaptive attacks), sleeper agent studies, and OOD robustness benchmarks grounds these estimates. Prioritizes scalable oversight, interpretability, and deception detection research deployable within 2-5 years before entering the critical zone.",
"description": "This model analyzes how alignment robustness changes with capability scaling. It estimates current techniques maintain 50-65% robustness at GPT-4 level but projects degradation to 15-30% at 100x capability, with critical thresholds around 10x-30x current capability.",
"ratings": {
"focus": 8.5,
"novelty": 6,
"rigor": 6.5,
"concreteness": 7.5,
"actionability": 7
},
"category": "models",
"subcategory": "safety-models",
"clusters": [
"ai-safety",
"governance"
],
"metrics": {
"wordCount": 3197,
"tableCount": 16,
"diagramCount": 4,
"internalLinks": 21,
"externalLinks": 11,
"footnoteCount": 0,
"bulletRatio": 0.09,
"sectionCount": 36,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 90,
"evergreen": true,
"wordCount": 3197,
"unconvertedLinks": [
{
"text": "Hubinger, Evan et al. \"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training\" (2024)",
"url": "https://arxiv.org/abs/2401.05566",
"resourceId": "e5c0904211c7d0cc"
},
{
"text": "Anthropic. \"Simple probes can catch sleeper agents\" (2024)",
"url": "https://www.anthropic.com/research/probes-catch-sleeper-agents",
"resourceId": "72c1254d07071bf7",
"resourceTitle": "Anthropic's follow-up research on defection probes"
},
{
"text": "Andriushchenko et al. \"Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks\" (ICLR 2025)",
"url": "https://arxiv.org/abs/2404.02151",
"resourceId": "95354fcd3a9c2578",
"resourceTitle": "Many-Shot Jailbreaking"
},
{
"text": "Engels et al. \"Scaling Laws For Scalable Oversight\" (2025)",
"url": "https://arxiv.org/abs/2504.18530",
"resourceId": "48511d731320244b",
"resourceTitle": "Scaling Laws For Scalable Oversight"
},
{
"text": "Anthropic. \"Evaluating honesty and lie detection techniques\" (2025)",
"url": "https://alignment.anthropic.com/2025/honesty-elicitation/",
"resourceId": "d875cbfb1b50d2a2",
"resourceTitle": "Evaluating honesty and lie detection techniques on a diverse suite of dishonest models (https://alignment.anthropic.c..."
},
{
"text": "Weng, Lilian. \"Reward Hacking in Reinforcement Learning\" (2024)",
"url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
"resourceId": "570615e019d1cc74",
"resourceTitle": "Reward Hacking in Reinforcement Learning"
},
{
"text": "Anthropic Responsible Scaling Policy",
"url": "https://www.anthropic.com/news/announcing-our-updated-responsible-scaling-policy",
"resourceId": "d0ba81cc7a8fdb2b",
"resourceTitle": "Anthropic: Announcing our updated Responsible Scaling Policy"
},
{
"text": "Future of Life Institute AI Safety Index (2025)",
"url": "https://futureoflife.org/ai-safety-index-summer-2025/",
"resourceId": "df46edd6fa2078d1",
"resourceTitle": "FLI AI Safety Index Summer 2025"
},
{
"text": "Ngo, Richard et al. \"The Alignment Problem from a Deep Learning Perspective\" (2022)",
"url": "https://arxiv.org/abs/2209.00626",
"resourceId": "9124298fbb913c3d",
"resourceTitle": "Gaming RLHF evaluation"
}
],
"unconvertedLinkCount": 9,
"convertedLinkCount": 0,
"backlinkCount": 0,
"hallucinationRisk": {
"level": "medium",
"score": 55,
"factors": [
"no-citations"
]
},
"entityType": "analysis",
"redundancy": {
"maxSimilarity": 17,
"similarPages": [
{
"id": "accident-risks",
"title": "AI Accident Risk Cruxes",
"path": "/knowledge-base/cruxes/accident-risks/",
"similarity": 17
},
{
"id": "intervention-effectiveness-matrix",
"title": "Intervention Effectiveness Matrix",
"path": "/knowledge-base/models/intervention-effectiveness-matrix/",
"similarity": 17
},
{
"id": "technical-pathways",
"title": "Technical Pathway Decomposition",
"path": "/knowledge-base/models/technical-pathways/",
"similarity": 17
},
{
"id": "ai-control",
"title": "AI Control",
"path": "/knowledge-base/responses/ai-control/",
"similarity": 17
},
{
"id": "scheming",
"title": "Scheming",
"path": "/knowledge-base/risks/scheming/",
"similarity": 17
}
]
},
"changeHistory": [
{
"date": "2026-02-23",
"branch": "claude/test-research-orchestrator-DUFts",
"title": "Test Research Orchestrator (engine v2) on 3 alignment pages",
"summary": "(fill in)"
},
{
"date": "2026-02-23",
"branch": "claude/test-research-orchestrator-DUFts",
"title": "Orchestrator v2 (standard): Alignment Robustness Trajectory",
"summary": "Improved \"Alignment Robustness Trajectory\" via orchestrator v2 (standard, 23 tool calls, 0 refinement cycles). Quality gate: passed. Cost: ~$6.45.",
"duration": "528.0s",
"cost": "~$6.45"
}
],
"coverage": {
"passing": 8,
"total": 13,
"targets": {
"tables": 13,
"diagrams": 1,
"internalLinks": 26,
"externalLinks": 16,
"footnotes": 10,
"references": 10
},
"actuals": {
"tables": 16,
"diagrams": 4,
"internalLinks": 21,
"externalLinks": 11,
"footnotes": 0,
"references": 22,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "green",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "amber",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"editHistoryCount": 2,
"ratingsString": "N:6 R:6.5 A:7"
},
"readerRank": 40,
"researchRank": 197,
"recommendedScore": 193.11
}
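As a rough illustration of the build step described in the header, here is a minimal TypeScript sketch of the merge, assuming a simple "later source wins" precedence. The function name and the precedence order are assumptions for illustration, not documented behavior of this pipeline.

```ts
// Minimal sketch of the build-time merge: Entity YAML + MDX frontmatter
// + computed metrics -> one page record. The "later source wins"
// precedence is an assumption, not confirmed by the record above.
type Json = Record<string, unknown>;

function buildPageRecord(entityYaml: Json, frontmatter: Json, metrics: Json): Json {
  return {
    ...entityYaml,  // Entity YAML: id, entityType, clusters, ...
    ...frontmatter, // MDX frontmatter: title, llmSummary, ratings, ...
    metrics,        // computed at build time: wordCount, link counts, ...
  };
}
```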
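The green/amber/red statuses under coverage.items can be reproduced from the coverage targets and actuals with simple thresholds. The cutoffs below are assumptions chosen to match the values shown in this record (e.g. internalLinks 21/26 is amber, footnotes 0/10 is red), not the site's actual rules.

```ts
// Hypothetical reconstruction of the coverage statuses above.
type Status = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";       // target met or exceeded
  if (actual >= target * 0.5) return "amber"; // at least half of target
  return "red";                               // well short of target
}

// Targets and actuals copied from the record above.
const targets = { tables: 13, diagrams: 1, internalLinks: 26, externalLinks: 16, footnotes: 10, references: 10 };
const actuals = { tables: 16, diagrams: 4, internalLinks: 21, externalLinks: 11, footnotes: 0, references: 22 };

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  console.log(key, coverageStatus(actuals[key], targets[key]));
  // tables green, diagrams green, internalLinks amber,
  // externalLinks amber, footnotes red, references green
}
```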