AI Alignment
ID: alignment · Type: approach · Path: /knowledge-base/responses/alignment/
Entity ID (EID): E439
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
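For orientation, a minimal sketch of how such a record might be assembled at build time (library choices, helper names, and the metric heuristics below are assumptions for illustration, not the site's actual build code):

```ts
// Hypothetical sketch of the build-time merge that produces a database.json
// page record. Helper names and field handling are assumptions, not the
// site's actual code.
import { readFileSync } from "node:fs";
import matter from "gray-matter";   // frontmatter parser (assumed dependency)
import { load } from "js-yaml";     // entity YAML parser (assumed dependency)

interface PageRecord {
  id: string;
  title: string;
  entityType?: string;
  metrics?: Record<string, number>;
  [key: string]: unknown;           // remaining merged fields
}

function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  // 1. MDX frontmatter: title, quality, importance scores, summaries, ...
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));

  // 2. Entity YAML: entityType, external links, entity IDs, ...
  const entity = load(readFileSync(entityYamlPath, "utf8")) as Record<string, unknown>;

  // 3. Metrics computed from the MDX body at build time (crude heuristics).
  const metrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    internalLinks: (content.match(/\]\(\/knowledge-base\//g) ?? []).length,
    tableCount: (content.match(/^\|[\s:|-]+\|$/gm) ?? []).length, // one separator row per table
  };

  // Later sources win on key collisions, mirroring the stated merge order.
  return { ...frontmatter, ...entity, metrics, wordCount: metrics.wordCount } as PageRecord;
}
```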
{
"id": "alignment",
"numericId": null,
"path": "/knowledge-base/responses/alignment/",
"filePath": "knowledge-base/responses/alignment.mdx",
"title": "AI Alignment",
"quality": 91,
"readerImportance": 95,
"researchImportance": 35,
"tacticalValue": 50,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive review of AI alignment approaches finding current methods (RLHF, Constitutional AI) show 75%+ effectiveness on measurable safety metrics for existing systems but face critical scalability challenges, with oversight success dropping to 52% at 400 Elo capability gaps and only 40-60% detection of sophisticated deception. Recent research demonstrates that safety classifiers embedded in aligned LLMs can be extracted using as little as 20% of model weights, achieving 70% attack success rates via surrogate models. Anthropic activated ASL-3 protections with Claude Opus 4 and established a National Security and Public Sector Advisory Council in August 2025. Expert consensus ranges from 10-60% probability of success for AGI alignment depending on approach and timelines.",
"description": "Technical approaches to ensuring AI systems pursue intended goals and remain aligned with human values throughout training and deployment. Current methods show promise but face fundamental scalability challenges.",
"ratings": {
"novelty": 5,
"rigor": 7,
"actionability": 6,
"completeness": 7.5
},
"category": "responses",
"subcategory": "alignment",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 5657,
"tableCount": 14,
"diagramCount": 2,
"internalLinks": 103,
"externalLinks": 31,
"footnoteCount": 13,
"bulletRatio": 0.1,
"sectionCount": 36,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 5657,
"unconvertedLinks": [
{
"text": "AI Impacts 2024 survey",
"url": "https://aiimpacts.org/2022-expert-survey-on-progress-in-ai/",
"resourceId": "38eba87d0a888e2e",
"resourceTitle": "AI experts show significant disagreement"
},
{
"text": "FLI AI Safety Index Winter 2025",
"url": "https://futureoflife.org/ai-safety-index-winter-2025/",
"resourceId": "97185b28d68545b4",
"resourceTitle": "AI Safety Index Winter 2025"
},
{
"text": "Noirot Ferrand et al. (2025)",
"url": "https://arxiv.org/abs/2501.16534",
"resourceId": "b97d5c85dbbacca3",
"resourceTitle": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\""
},
{
"text": "Zou et al. (2023)",
"url": "https://arxiv.org/abs/2307.15043",
"resourceId": "302c069146f3f6f2",
"resourceTitle": "jailbreaks"
},
{
"text": "Future of Life Institute's AI Safety Index",
"url": "https://futureoflife.org/ai-safety-index-winter-2025/",
"resourceId": "97185b28d68545b4",
"resourceTitle": "AI Safety Index Winter 2025"
},
{
"text": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\"",
"url": "https://arxiv.org/abs/2501.16534",
"resourceId": "b97d5c85dbbacca3",
"resourceTitle": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\""
},
{
"text": "activated ASL-3 Deployment and Security Standards",
"url": "https://www.anthropic.com/news/activating-asl3-protections",
"resourceId": "7512ddb574f82249"
},
{
"text": "CVPR 2024",
"url": "https://openaccess.thecvf.com/content/CVPR2024/papers/Yu_RLHF-V_Towards_Trustworthy_MLLMs_via_Behavior_Alignment_from_Fine-grained_Correctional_CVPR_2024_paper.pdf",
"resourceId": "108f52553230c4d5",
"resourceTitle": "CVPR 2024"
},
{
"text": "Zou et al. (2023)",
"url": "https://arxiv.org/abs/2307.15043",
"resourceId": "302c069146f3f6f2",
"resourceTitle": "jailbreaks"
},
{
"text": "AI Impacts 2024 survey",
"url": "https://aiimpacts.org/2022-expert-survey-on-progress-in-ai/",
"resourceId": "38eba87d0a888e2e",
"resourceTitle": "AI experts show significant disagreement"
},
{
"text": "Metaculus",
"url": "https://www.metaculus.com/",
"resourceId": "d99a6d0fb1edc2db",
"resourceTitle": "Metaculus"
},
{
"text": "Metaculus",
"url": "https://www.metaculus.com/",
"resourceId": "d99a6d0fb1edc2db",
"resourceTitle": "Metaculus"
},
{
"text": "Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs",
"url": "https://arxiv.org/abs/2501.16534",
"resourceId": "b97d5c85dbbacca3",
"resourceTitle": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\""
},
{
"text": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
"url": "https://arxiv.org/abs/2307.15043",
"resourceId": "302c069146f3f6f2",
"resourceTitle": "jailbreaks"
},
{
"text": "Safety Misalignment Against Large Language Models",
"url": "https://www.ndss-symposium.org/wp-content/uploads/2025-1089-paper.pdf",
"resourceId": "3a7a904debb5b65f",
"resourceTitle": "Safety Misalignment Against Large Language Models"
}
],
"unconvertedLinkCount": 15,
"convertedLinkCount": 49,
"backlinkCount": 70,
"citationHealth": {
"total": 12,
"withQuotes": 9,
"verified": 7,
"accuracyChecked": 7,
"accurate": 6,
"inaccurate": 1,
"avgScore": 0.7998268281420072
},
"hallucinationRisk": {
"level": "medium",
"score": 40,
"factors": [
"high-rigor",
"conceptual-content",
"high-quality",
"severe-truncation"
],
"integrityIssues": [
"severe-truncation"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 20,
"similarPages": [
{
"id": "why-alignment-hard",
"title": "Why Alignment Might Be Hard",
"path": "/knowledge-base/debates/why-alignment-hard/",
"similarity": 20
},
{
"id": "agentic-ai",
"title": "Agentic AI",
"path": "/knowledge-base/capabilities/agentic-ai/",
"similarity": 19
},
{
"id": "accident-risks",
"title": "AI Accident Risk Cruxes",
"path": "/knowledge-base/cruxes/accident-risks/",
"similarity": 19
},
{
"id": "solutions",
"title": "AI Safety Solution Cruxes",
"path": "/knowledge-base/cruxes/solutions/",
"similarity": 19
},
{
"id": "interpretability",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/interpretability/",
"similarity": 19
}
]
},
"changeHistory": [
{
"date": "2026-02-18",
"branch": "claude/fix-issue-240-N5irU",
"title": "Surface tacticalValue in /wiki table and score 53 pages",
"summary": "Added `tacticalValue` to `ExploreItem` interface, `getExploreItems()` mappings, the `/wiki` explore table (new sortable \"Tact.\" column), and the card view sort dropdown. Scored 49 new pages with tactical values (4 were already scored), bringing total to 53.",
"model": "sonnet-4",
"duration": "~30min"
}
],
"coverage": {
"passing": 10,
"total": 13,
"targets": {
"tables": 23,
"diagrams": 2,
"internalLinks": 45,
"externalLinks": 28,
"footnotes": 17,
"references": 17
},
"actuals": {
"tables": 14,
"diagrams": 2,
"internalLinks": 103,
"externalLinks": 31,
"footnotes": 13,
"references": 41,
"quotesWithQuotes": 9,
"quotesTotal": 12,
"accuracyChecked": 7,
"accuracyTotal": 12
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "green",
"overview": "green",
"tables": "amber",
"diagrams": "green",
"internalLinks": "green",
"externalLinks": "green",
"footnotes": "amber",
"references": "green",
"quotes": "green",
"accuracy": "amber"
},
"editHistoryCount": 1,
"ratingsString": "N:5 R:7 A:6 C:7.5"
},
"readerRank": 1,
"researchRank": 378,
"recommendedScore": 251.4
}
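The coverage.items statuses in the record above (green/amber) compare actuals against targets. One plausible rule, shown purely as an assumption since the real thresholds are not documented here:

```ts
// Hypothetical derivation of a coverage status from a target/actual pair.
// The 0.6 threshold is an assumption chosen to match the record above,
// not a documented rule.
type CoverageStatus = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): CoverageStatus {
  if (target <= 0 || actual >= target) return "green"; // target met (or none set)
  if (actual >= 0.6 * target) return "amber";          // close to target
  return "red";                                        // well short of target
}

// Figures from this record: tables 14/23 -> amber (14 >= 13.8),
// footnotes 13/17 -> amber (13 >= 10.2), diagrams 2/2 -> green.
console.log(coverageStatus(14, 23)); // "amber"
```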
External Links

{
"wikipedia": "https://en.wikipedia.org/wiki/AI_alignment",
"lesswrong": "https://www.lesswrong.com/tag/ai",
"eaForum": "https://forum.effectivealtruism.org/topics/ai-alignment",
"stampy": "https://aisafety.info/questions/9Tii/What-is-AI-alignment",
"arbital": "https://arbital.greaterwrong.com/p/ai_alignment",
"wikidata": "https://www.wikidata.org/wiki/Q24882728",
"eightyK": "https://80000hours.org/problem-profiles/artificial-intelligence/",
"grokipedia": "https://grokipedia.com/page/AI_alignment"
}

Backlinks (70)
| id | title | type | relationship |
|---|---|---|---|
| ai-welfare | AI Welfare and Digital Minds | concept | — |
| palisade-research | Palisade Research | organization | — |
| marc-andreessen | Marc Andreessen (AI Investor) | person | — |
| constitutional-ai | Constitutional AI | approach | — |
| agentic-ai | Agentic AI | capability | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| miri-era | The MIRI Era (2000-2015) | historical | — |
| openclaw-matplotlib-incident-2026 | OpenClaw Matplotlib Incident (2026) | concept | — |
| alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | — |
| disinformation-detection-race | Disinformation Detection Arms Race Model | analysis | — |
| intervention-timing-windows | Intervention Timing Windows | analysis | — |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | — |
| anthropic-investors | Anthropic (Funder) | analysis | — |
| arb-research | Arb Research | organization | — |
| bridgewater-aia-labs | Bridgewater AIA Labs | organization | — |
| cais | CAIS (Center for AI Safety) | organization | — |
| center-for-applied-rationality | Center for Applied Rationality | organization | — |
| conjecture | Conjecture | organization | — |
| ea-global | EA Global | organization | — |
| elicit | Elicit (AI Research Tool) | organization | — |
| fhi | Future of Humanity Institute (FHI) | organization | — |
| fli | Future of Life Institute (FLI) | organization | — |
| frontier-model-forum | Frontier Model Forum | organization | — |
| goodfire | Goodfire | organization | — |
| lesswrong | LessWrong | organization | — |
| lighthaven | Lighthaven (Event Venue) | organization | — |
| lightning-rod-labs | Lightning Rod Labs | organization | — |
| lionheart-ventures | Lionheart Ventures | organization | — |
| manifest | Manifest (Forecasting Conference) | organization | — |
| mats | MATS ML Alignment Theory Scholars program | organization | — |
| miri | MIRI (Machine Intelligence Research Institute) | organization | — |
| nist-ai | NIST and AI Safety | organization | — |
| openai | OpenAI | organization | — |
| pause-ai | Pause AI | organization | — |
| safety-orgs-overview | AI Safety Organizations (Overview) | concept | — |
| secure-ai-project | Secure AI Project | organization | — |
| seldon-lab | Seldon Lab | organization | — |
| swift-centre | Swift Centre | organization | — |
| the-sequences | The Sequences by Eliezer Yudkowsky | organization | — |
| vara | Value Aligned Research Advisors | organization | — |
| xai | xAI | organization | — |
| connor-leahy | Connor Leahy | person | — |
| eliezer-yudkowsky | Eliezer Yudkowsky | person | — |
| elon-musk | Elon Musk (AI Industry) | person | — |
| evan-hubinger | Evan Hubinger | person | — |
| holden-karnofsky | Holden Karnofsky | person | — |
| leopold-aschenbrenner | Leopold Aschenbrenner | person | — |
| paul-christiano | Paul Christiano | person | — |
| vidur-kapur | Vidur Kapur | person | — |
| vipul-naik | Vipul Naik | person | — |
| ai-watch | AI Watch | project | — |
| canada-aida | Canada AIDA | policy | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| coe-ai-convention | Council of Europe Framework Convention on Artificial Intelligence | policy | — |
| compute-governance | Compute Governance: AI Chips Export Controls Policy | policy | — |
| corporate | Corporate AI Safety Responses | approach | — |
| eu-ai-act | EU AI Act | policy | — |
| evaluation | AI Evaluation | approach | — |
| new-york-raise-act | New York RAISE Act | policy | — |
| prediction-markets | Prediction Markets (AI Forecasting) | approach | — |
| rlhf | RLHF / Constitutional AI | capability | — |
| scalable-oversight | Scalable Oversight | safety-agenda | — |
| training-programs | AI Safety Training Programs | approach | — |
| wikipedia-and-ai | Wikipedia and AI Content | concept | — |
| distributional-shift | AI Distributional Shift | risk | — |
| lock-in | AI Value Lock-in | risk | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| sharp-left-turn | Sharp Left Turn | risk | — |
| doomer | AI Doomer Worldview | concept | — |