Longterm Wiki

AI Alignment

alignment · approach
Path: /knowledge-base/responses/alignment/
Entity ID (EID): E439
70 backlinks · Quality: 91 · Updated: 2026-03-13
Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "alignment",
  "numericId": null,
  "path": "/knowledge-base/responses/alignment/",
  "filePath": "knowledge-base/responses/alignment.mdx",
  "title": "AI Alignment",
  "quality": 91,
  "readerImportance": 95,
  "researchImportance": 35,
  "tacticalValue": 50,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Comprehensive review of AI alignment approaches finding current methods (RLHF, Constitutional AI) show 75%+ effectiveness on measurable safety metrics for existing systems but face critical scalability challenges, with oversight success dropping to 52% at 400 Elo capability gaps and only 40-60% detection of sophisticated deception. Recent research demonstrates that safety classifiers embedded in aligned LLMs can be extracted using as little as 20% of model weights, achieving 70% attack success rates via surrogate models. Anthropic activated ASL-3 protections with Claude Opus 4 and established a National Security and Public Sector Advisory Council in August 2025. Expert consensus ranges from 10-60% probability of success for AGI alignment depending on approach and timelines.",
  "description": "Technical approaches to ensuring AI systems pursue intended goals and remain aligned with human values throughout training and deployment. Current methods show promise but face fundamental scalability challenges.",
  "ratings": {
    "novelty": 5,
    "rigor": 7,
    "actionability": 6,
    "completeness": 7.5
  },
  "category": "responses",
  "subcategory": "alignment",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 5657,
    "tableCount": 14,
    "diagramCount": 2,
    "internalLinks": 103,
    "externalLinks": 31,
    "footnoteCount": 13,
    "bulletRatio": 0.1,
    "sectionCount": 36,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 5657,
  "unconvertedLinks": [
    {
      "text": "AI Impacts 2024 survey",
      "url": "https://aiimpacts.org/2022-expert-survey-on-progress-in-ai/",
      "resourceId": "38eba87d0a888e2e",
      "resourceTitle": "AI experts show significant disagreement"
    },
    {
      "text": "FLI AI Safety Index Winter 2025",
      "url": "https://futureoflife.org/ai-safety-index-winter-2025/",
      "resourceId": "97185b28d68545b4",
      "resourceTitle": "AI Safety Index Winter 2025"
    },
    {
      "text": "Noirot Ferrand et al. (2025)",
      "url": "https://arxiv.org/abs/2501.16534",
      "resourceId": "b97d5c85dbbacca3",
      "resourceTitle": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\""
    },
    {
      "text": "Zou et al. (2023)",
      "url": "https://arxiv.org/abs/2307.15043",
      "resourceId": "302c069146f3f6f2",
      "resourceTitle": "jailbreaks"
    },
    {
      "text": "Future of Life Institute's AI Safety Index",
      "url": "https://futureoflife.org/ai-safety-index-winter-2025/",
      "resourceId": "97185b28d68545b4",
      "resourceTitle": "AI Safety Index Winter 2025"
    },
    {
      "text": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\"",
      "url": "https://arxiv.org/abs/2501.16534",
      "resourceId": "b97d5c85dbbacca3",
      "resourceTitle": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\""
    },
    {
      "text": "activated ASL-3 Deployment and Security Standards",
      "url": "https://www.anthropic.com/news/activating-asl3-protections",
      "resourceId": "7512ddb574f82249"
    },
    {
      "text": "CVPR 2024",
      "url": "https://openaccess.thecvf.com/content/CVPR2024/papers/Yu_RLHF-V_Towards_Trustworthy_MLLMs_via_Behavior_Alignment_from_Fine-grained_Correctional_CVPR_2024_paper.pdf",
      "resourceId": "108f52553230c4d5",
      "resourceTitle": "CVPR 2024"
    },
    {
      "text": "Zou et al. (2023)",
      "url": "https://arxiv.org/abs/2307.15043",
      "resourceId": "302c069146f3f6f2",
      "resourceTitle": "jailbreaks"
    },
    {
      "text": "AI Impacts 2024 survey",
      "url": "https://aiimpacts.org/2022-expert-survey-on-progress-in-ai/",
      "resourceId": "38eba87d0a888e2e",
      "resourceTitle": "AI experts show significant disagreement"
    },
    {
      "text": "Metaculus",
      "url": "https://www.metaculus.com/",
      "resourceId": "d99a6d0fb1edc2db",
      "resourceTitle": "Metaculus"
    },
    {
      "text": "Metaculus",
      "url": "https://www.metaculus.com/",
      "resourceId": "d99a6d0fb1edc2db",
      "resourceTitle": "Metaculus"
    },
    {
      "text": "Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs",
      "url": "https://arxiv.org/abs/2501.16534",
      "resourceId": "b97d5c85dbbacca3",
      "resourceTitle": "\"Targeting Alignment: Extracting Safety Classifiers of Aligned LLMs\""
    },
    {
      "text": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
      "url": "https://arxiv.org/abs/2307.15043",
      "resourceId": "302c069146f3f6f2",
      "resourceTitle": "jailbreaks"
    },
    {
      "text": "Safety Misalignment Against Large Language Models",
      "url": "https://www.ndss-symposium.org/wp-content/uploads/2025-1089-paper.pdf",
      "resourceId": "3a7a904debb5b65f",
      "resourceTitle": "Safety Misalignment Against Large Language Models"
    }
  ],
  "unconvertedLinkCount": 15,
  "convertedLinkCount": 49,
  "backlinkCount": 70,
  "citationHealth": {
    "total": 12,
    "withQuotes": 9,
    "verified": 7,
    "accuracyChecked": 7,
    "accurate": 6,
    "inaccurate": 1,
    "avgScore": 0.7998268281420072
  },
  "hallucinationRisk": {
    "level": "medium",
    "score": 40,
    "factors": [
      "high-rigor",
      "conceptual-content",
      "high-quality",
      "severe-truncation"
    ],
    "integrityIssues": [
      "severe-truncation"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 20,
    "similarPages": [
      {
        "id": "why-alignment-hard",
        "title": "Why Alignment Might Be Hard",
        "path": "/knowledge-base/debates/why-alignment-hard/",
        "similarity": 20
      },
      {
        "id": "agentic-ai",
        "title": "Agentic AI",
        "path": "/knowledge-base/capabilities/agentic-ai/",
        "similarity": 19
      },
      {
        "id": "accident-risks",
        "title": "AI Accident Risk Cruxes",
        "path": "/knowledge-base/cruxes/accident-risks/",
        "similarity": 19
      },
      {
        "id": "solutions",
        "title": "AI Safety Solution Cruxes",
        "path": "/knowledge-base/cruxes/solutions/",
        "similarity": 19
      },
      {
        "id": "interpretability",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/interpretability/",
        "similarity": 19
      }
    ]
  },
  "changeHistory": [
    {
      "date": "2026-02-18",
      "branch": "claude/fix-issue-240-N5irU",
      "title": "Surface tacticalValue in /wiki table and score 53 pages",
      "summary": "Added `tacticalValue` to `ExploreItem` interface, `getExploreItems()` mappings, the `/wiki` explore table (new sortable \"Tact.\" column), and the card view sort dropdown. Scored 49 new pages with tactical values (4 were already scored), bringing total to 53.",
      "model": "sonnet-4",
      "duration": "~30min"
    }
  ],
  "coverage": {
    "passing": 10,
    "total": 13,
    "targets": {
      "tables": 23,
      "diagrams": 2,
      "internalLinks": 45,
      "externalLinks": 28,
      "footnotes": 17,
      "references": 17
    },
    "actuals": {
      "tables": 14,
      "diagrams": 2,
      "internalLinks": 103,
      "externalLinks": 31,
      "footnotes": 13,
      "references": 41,
      "quotesWithQuotes": 9,
      "quotesTotal": 12,
      "accuracyChecked": 7,
      "accuracyTotal": 12
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "green",
      "overview": "green",
      "tables": "amber",
      "diagrams": "green",
      "internalLinks": "green",
      "externalLinks": "green",
      "footnotes": "amber",
      "references": "green",
      "quotes": "green",
      "accuracy": "amber"
    },
    "editHistoryCount": 1,
    "ratingsString": "N:5 R:7 A:6 C:7.5"
  },
  "readerRank": 1,
  "researchRank": 378,
  "recommendedScore": 251.4
}
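The caption above describes this record as a build-time merge of MDX frontmatter, per-entity YAML, and computed metrics. The wiki's actual build code is not included in this record, so the following TypeScript is only a minimal sketch of what such a merge step could look like; every identifier in it (buildPageRecord, PageRecord, the metric heuristics) is assumed for illustration rather than taken from the codebase.

// Hypothetical sketch of the build step that assembles a database.json entry.
// None of these identifiers are confirmed parts of the Longterm Wiki codebase.
import { readFileSync } from "node:fs";
import matter from "gray-matter"; // frontmatter parser (assumed dependency)
import { parse as parseYaml } from "yaml"; // YAML parser (assumed dependency)

interface PageMetrics {
  wordCount: number;
  internalLinks: number;
  externalLinks: number;
}

interface PageRecord {
  id: string;
  path: string;
  title: string;
  quality: number | null;
  lastUpdated: string | null;
  metrics: PageMetrics;
}

function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  // MDX frontmatter supplies editorial fields (title, quality, dates).
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));
  // Entity YAML supplies identity fields (id, path, entity type, ...).
  const entity = parseYaml(readFileSync(entityYamlPath, "utf8"));

  // Metrics are computed from the MDX body with rough text heuristics.
  const metrics: PageMetrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    internalLinks: (content.match(/\]\(\/knowledge-base\//g) ?? []).length,
    externalLinks: (content.match(/\]\(https?:\/\//g) ?? []).length,
  };

  // Merge order is an assumption: entity YAML wins for identity fields,
  // frontmatter for editorial fields, and computed metrics are appended.
  return {
    id: entity.id ?? frontmatter.id,
    path: entity.path ?? frontmatter.path,
    title: frontmatter.title,
    quality: frontmatter.quality ?? null,
    lastUpdated: frontmatter.lastUpdated ?? null,
    metrics,
  };
}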
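The record's single change-history entry describes adding tacticalValue to an ExploreItem interface and surfacing it as a sortable "Tact." column on the /wiki explore table. The real interface is not reproduced in this record; the sketch below is only an assumed illustration of the shape of that change, with every field other than tacticalValue invented for context.

// Hypothetical shape of an explore-table item after the change described in
// changeHistory; only tacticalValue is named in the record.
interface ExploreItem {
  id: string;
  title: string;
  path: string;
  quality: number | null;
  readerImportance: number | null;
  tacticalValue: number | null; // surfaced as the sortable "Tact." column
}

// Example sort for a "Tact." column: unscored pages sort last.
function sortByTacticalValue(items: ExploreItem[]): ExploreItem[] {
  return [...items].sort(
    (a, b) => (b.tacticalValue ?? -1) - (a.tacticalValue ?? -1),
  );
}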
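The coverage block pairs per-item targets with measured actuals and a green/amber status. The record does not state the thresholding rule; the statuses shown are consistent with a simple "actual meets target" check, sketched below as an inference only (the real scheme may have more levels, such as a red status).

// Inferred, unconfirmed reconstruction of the coverage status check.
type CoverageStatus = "green" | "amber";

function coverageStatus(actual: number, target: number): CoverageStatus {
  return actual >= target ? "green" : "amber";
}

// Consistent with this record: tables 14 vs 23 -> "amber", internalLinks 103 vs 45 -> "green",
// footnotes 13 vs 17 -> "amber", externalLinks 31 vs 28 -> "green".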
External Links
{
  "wikipedia": "https://en.wikipedia.org/wiki/AI_alignment",
  "lesswrong": "https://www.lesswrong.com/tag/ai",
  "eaForum": "https://forum.effectivealtruism.org/topics/ai-alignment",
  "stampy": "https://aisafety.info/questions/9Tii/What-is-AI-alignment",
  "arbital": "https://arbital.greaterwrong.com/p/ai_alignment",
  "wikidata": "https://www.wikidata.org/wiki/Q24882728",
  "eightyK": "https://80000hours.org/problem-profiles/artificial-intelligence/",
  "grokipedia": "https://grokipedia.com/page/AI_alignment"
}
Backlinks (70)
id | title | type
ai-welfare | AI Welfare and Digital Minds | concept
palisade-research | Palisade Research | organization
marc-andreessen | Marc Andreessen (AI Investor) | person
constitutional-ai | Constitutional AI | approach
agentic-ai | Agentic AI | capability
accident-risks | AI Accident Risk Cruxes | crux
miri-era | The MIRI Era (2000-2015) | historical
openclaw-matplotlib-incident-2026 | OpenClaw Matplotlib Incident (2026) | concept
alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis
disinformation-detection-race | Disinformation Detection Arms Race Model | analysis
intervention-timing-windows | Intervention Timing Windows | analysis
model-organisms-of-misalignment | Model Organisms of Misalignment | analysis
reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis
anthropic-investors | Anthropic (Funder) | analysis
arb-research | Arb Research | organization
bridgewater-aia-labs | Bridgewater AIA Labs | organization
cais | CAIS (Center for AI Safety) | organization
center-for-applied-rationality | Center for Applied Rationality | organization
conjecture | Conjecture | organization
ea-global | EA Global | organization
elicit | Elicit (AI Research Tool) | organization
fhi | Future of Humanity Institute (FHI) | organization
fli | Future of Life Institute (FLI) | organization
frontier-model-forum | Frontier Model Forum | organization
goodfire | Goodfire | organization
lesswrong | LessWrong | organization
lighthaven | Lighthaven (Event Venue) | organization
lightning-rod-labs | Lightning Rod Labs | organization
lionheart-ventures | Lionheart Ventures | organization
manifest | Manifest (Forecasting Conference) | organization
mats | MATS ML Alignment Theory Scholars program | organization
miri | MIRI (Machine Intelligence Research Institute) | organization
nist-ai | NIST and AI Safety | organization
openai | OpenAI | organization
pause-ai | Pause AI | organization
safety-orgs-overview | AI Safety Organizations (Overview) | concept
secure-ai-project | Secure AI Project | organization
seldon-lab | Seldon Lab | organization
swift-centre | Swift Centre | organization
the-sequences | The Sequences by Eliezer Yudkowsky | organization
vara | Value Aligned Research Advisors | organization
xai | xAI | organization
connor-leahy | Connor Leahy | person
eliezer-yudkowsky | Eliezer Yudkowsky | person
elon-musk | Elon Musk (AI Industry) | person
evan-hubinger | Evan Hubinger | person
holden-karnofsky | Holden Karnofsky | person
leopold-aschenbrenner | Leopold Aschenbrenner | person
paul-christiano | Paul Christiano | person
vidur-kapur | Vidur Kapur | person
vipul-naik | Vipul Naik | person
ai-watch | AI Watch | project
canada-aida | Canada AIDA | policy
cirl | Cooperative IRL (CIRL) | approach
coe-ai-convention | Council of Europe Framework Convention on Artificial Intelligence | policy
compute-governance | Compute Governance: AI Chips Export Controls Policy | policy
corporate | Corporate AI Safety Responses | approach
eu-ai-act | EU AI Act | policy
evaluation | AI Evaluation | approach
new-york-raise-act | New York RAISE Act | policy
prediction-markets | Prediction Markets (AI Forecasting) | approach
rlhf | RLHF / Constitutional AI | capability
scalable-oversight | Scalable Oversight | safety-agenda
training-programs | AI Safety Training Programs | approach
wikipedia-and-ai | Wikipedia and AI Content | concept
distributional-shift | AI Distributional Shift | risk
lock-in | AI Value Lock-in | risk
mesa-optimization | Mesa-Optimization | risk
sharp-left-turn | Sharp Left Turn | risk
doomer | AI Doomer Worldview | concept