RLHF / Constitutional AI
ID: rlhf
Entity type: capability
Path: /knowledge-base/responses/rlhf/
Entity ID (EID): E259

Page Record — database.json, merged from MDX frontmatter + Entity YAML + computed metrics at build time
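Before the record itself, a minimal sketch of what such a build-time merge could look like. The interfaces and the buildPageRecord helper are illustrative assumptions, not the site's actual build code; the field names are taken from the record below.

```ts
// Illustrative sketch only — hypothetical helper, not the site's actual build code.
// Field names mirror the page record below; the merge precedence is an assumption.

interface Frontmatter {
  id: string;
  title: string;
  description: string;
  lastUpdated: string;
}

interface EntityYaml {
  eid: string;        // e.g. "E259"
  entityType: string; // e.g. "capability"
  path: string;
}

interface PageMetrics {
  wordCount: number;
  tableCount: number;
  diagramCount: number;
  internalLinks: number;
  externalLinks: number;
  footnoteCount: number;
}

interface PageRecord extends Frontmatter {
  entityType: string;
  path: string;
  metrics: PageMetrics;
}

// Merge the three sources into one record. Spread order means later
// fields win on name collisions (frontmatter, then entity, then metrics).
function buildPageRecord(
  fm: Frontmatter,
  entity: EntityYaml,
  metrics: PageMetrics,
): PageRecord {
  return {
    ...fm,
    entityType: entity.entityType,
    path: entity.path,
    metrics,
  };
}
```

The merged record as stored follows.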
{
"id": "rlhf",
"numericId": null,
"path": "/knowledge-base/responses/rlhf/",
"filePath": "knowledge-base/responses/rlhf.mdx",
"title": "RLHF / Constitutional AI",
"quality": 63,
"readerImportance": 22.5,
"researchImportance": 28,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "RLHF/Constitutional AI achieves 82-85% preference improvements and 40.8% adversarial attack reduction for current systems, but faces fundamental scalability limits: weak-to-strong supervision shows 10-20% performance gaps, sycophancy worsens with scale, and the approach cannot detect deceptive alignment. DPO variants reduce compute costs by 40-60% while matching performance, enabling widespread deployment across all frontier models (ChatGPT's 200M+ users).",
"description": "RLHF and Constitutional AI are the dominant techniques for aligning language models with human preferences. InstructGPT (1.3B) is preferred over GPT-3 (175B) 85% of the time, and Constitutional AI reduces adversarial attack success by 40.8%. However, fundamental limitations—reward hacking, sycophancy, and the scalable oversight problem—prevent these techniques from reliably scaling to superhuman systems.",
"ratings": {
"novelty": 4.2,
"rigor": 6.8,
"actionability": 6.5,
"completeness": 7.5
},
"category": "responses",
"subcategory": "alignment-training",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 2963,
"tableCount": 16,
"diagramCount": 1,
"internalLinks": 40,
"externalLinks": 29,
"footnoteCount": 0,
"bulletRatio": 0.18,
"sectionCount": 40,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 2963,
"unconvertedLinks": [
{
"text": "85±3% of time",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "40.8%",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "10-20% performance gap",
"url": "https://arxiv.org/abs/2312.09390",
"resourceId": "0ba98ae3a8a72270",
"resourceTitle": "arXiv"
},
{
"text": "82% less likely",
"url": "https://cdn.openai.com/papers/gpt-4.pdf",
"resourceId": "227c865a2154436e",
"resourceTitle": "GPT-4 technical report"
},
{
"text": "≈75%",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "OpenAI 2023",
"url": "https://cdn.openai.com/papers/gpt-4.pdf",
"resourceId": "227c865a2154436e",
"resourceTitle": "GPT-4 technical report"
},
{
"text": "OpenAI 2024",
"url": "https://openai.com/",
"resourceId": "04d39e8bd5d50dd5",
"resourceTitle": "OpenAI"
},
{
"text": "Anthropic",
"url": "https://www.anthropic.com/",
"resourceId": "afe2508ac4caf5ee",
"resourceTitle": "Anthropic"
},
{
"text": "Meta 2024",
"url": "https://ai.meta.com/llama/",
"resourceId": "69c685f410104791",
"resourceTitle": "Meta Llama 2 open-source"
},
{
"text": "OpenAI",
"url": "https://openai.com/gpt-4",
"resourceId": "39f08ad975b7f4db",
"resourceTitle": "GPT-4"
},
{
"text": "Mistral AI",
"url": "https://mistral.ai/",
"resourceId": "aa1786bb9025867e",
"resourceTitle": "Mistral"
},
{
"text": "Perez et al. 2023",
"url": "https://arxiv.org/abs/2212.09251",
"resourceId": "cd36bb65654c0147",
"resourceTitle": "Perez et al. (2022): \"Sycophancy in LLMs\""
},
{
"text": "Wei et al. 2024",
"url": "https://arxiv.org/abs/2310.13548",
"resourceId": "7951bdb54fd936a6",
"resourceTitle": "Anthropic: \"Discovering Sycophancy in Language Models\""
},
{
"text": "NeurIPS 2024",
"url": "https://arxiv.org/abs/2402.09345",
"resourceId": "14a9103bf7c2a1ef",
"resourceTitle": "InfoRM: Mitigating Reward Hacking in RLHF"
},
{
"text": "arXiv 2025",
"url": "https://arxiv.org/abs/2502.18770",
"resourceId": "d4e5b9bc7e21476c",
"resourceTitle": "Reward Shaping to Mitigate Reward Hacking in RLHF"
}
],
"unconvertedLinkCount": 15,
"convertedLinkCount": 31,
"backlinkCount": 66,
"hallucinationRisk": {
"level": "medium",
"score": 55,
"factors": [
"no-citations"
]
},
"entityType": "capability",
"redundancy": {
"maxSimilarity": 19,
"similarPages": [
{
"id": "preference-optimization",
"title": "Preference Optimization Methods",
"path": "/knowledge-base/responses/preference-optimization/",
"similarity": 19
},
{
"id": "reward-modeling",
"title": "Reward Modeling",
"path": "/knowledge-base/responses/reward-modeling/",
"similarity": 17
},
{
"id": "scalable-oversight",
"title": "Scalable Oversight",
"path": "/knowledge-base/responses/scalable-oversight/",
"similarity": 17
},
{
"id": "refusal-training",
"title": "Refusal Training",
"path": "/knowledge-base/responses/refusal-training/",
"similarity": 16
},
{
"id": "goal-misgeneralization",
"title": "Goal Misgeneralization",
"path": "/knowledge-base/risks/goal-misgeneralization/",
"similarity": 16
}
]
},
"coverage": {
"passing": 9,
"total": 13,
"targets": {
"tables": 12,
"diagrams": 1,
"internalLinks": 24,
"externalLinks": 15,
"footnotes": 9,
"references": 9
},
"actuals": {
"tables": 16,
"diagrams": 1,
"internalLinks": 40,
"externalLinks": 29,
"footnotes": 0,
"references": 29,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "green",
"externalLinks": "green",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:4.2 R:6.8 A:6.5 C:7.5"
},
"readerRank": 508,
"researchRank": 432,
"recommendedScore": 159.08
}

External Links
{
"lesswrong": "https://www.lesswrong.com/tag/rlhf",
"wikipedia": "https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback",
"stampy": "https://aisafety.info/questions/8RIL/What-is-RLHF",
"wikidata": "https://www.wikidata.org/wiki/Q115570683",
"grokipedia": "https://grokipedia.com/page/Reinforcement_learning_from_human_feedback"
}

Backlinks (66)
| id | title | type | relationship |
|---|---|---|---|
| dense-transformers | Dense Transformers | concept | — |
| reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | vulnerable-technique |
| value-learning | AI Value Learning | safety-agenda | — |
| constitutional-ai | Constitutional AI | approach | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
| preference-optimization | Preference Optimization Methods | approach | — |
| process-supervision | Process Supervision | approach | — |
| refusal-training | Refusal Training | approach | — |
| debate | AI Safety via Debate | approach | — |
| reward-hacking | Reward Hacking | risk | — |
| language-models | Large Language Models | capability | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| solutions | AI Safety Solution Cruxes | crux | — |
| why-alignment-easy | Why Alignment Might Be Easy | argument | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| deep-learning-era | Deep Learning Revolution (2012-2020) | historical | — |
| miri-era | The MIRI Era (2000-2015) | historical | — |
| __index__/knowledge-base | Knowledge Base | concept | — |
| alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | — |
| capability-alignment-race | Capability-Alignment Race Model | analysis | — |
| defense-in-depth-model | Defense in Depth Model | analysis | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| risk-activation-timeline | Risk Activation Timeline Model | analysis | — |
| safety-research-allocation | Safety Research Allocation Model | analysis | — |
| safety-spending-at-scale | Safety Spending at Scale | analysis | — |
| anthropic | Anthropic | organization | — |
| arc | ARC (Alignment Research Center) | organization | — |
| chai | CHAI (Center for Human-Compatible AI) | organization | — |
| conjecture | Conjecture | organization | — |
| deepmind | Google DeepMind | organization | — |
| elicit | Elicit (AI Research Tool) | organization | — |
| goodfire | Goodfire | organization | — |
| microsoft | Microsoft AI | organization | — |
| openai | OpenAI | organization | — |
| pause-ai | Pause AI | organization | — |
| chris-olah | Chris Olah | person | — |
| connor-leahy | Connor Leahy | person | — |
| dario-amodei | Dario Amodei | person | — |
| eliezer-yudkowsky-predictions | Eliezer Yudkowsky: Track Record | concept | — |
| eliezer-yudkowsky | Eliezer Yudkowsky | person | — |
| ilya-sutskever | Ilya Sutskever | person | — |
| jan-leike | Jan Leike | person | — |
| paul-christiano | Paul Christiano | person | — |
| alignment-training-overview | Training Methods (Overview) | concept | — |
| alignment | AI Alignment | approach | — |
| circuit-breakers | Circuit Breakers / Inference Interventions | approach | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| __index__/knowledge-base/responses | Safety Responses | concept | — |
| intervention-portfolio | AI Safety Intervention Portfolio | approach | — |
| model-spec | AI Model Specifications | policy | — |
| reward-modeling | Reward Modeling | approach | — |
| scalable-oversight | Scalable Oversight | safety-agenda | — |
| cyber-psychosis | AI-Induced Cyber Psychosis | risk | — |
| deceptive-alignment | Deceptive Alignment | risk | — |
| epistemic-sycophancy | Epistemic Sycophancy | risk | — |
| existential-risk | Existential Risk from AI | concept | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| power-seeking | Power-Seeking AI | risk | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |
| sycophancy | Sycophancy | risk | — |
| treacherous-turn | Treacherous Turn | risk | — |
| doomer | AI Doomer Worldview | concept | — |
| optimistic | Optimistic Alignment Worldview | concept | — |
| __index__/insight-hunting | Insight Hunting | concept | — |
| table-candidates | Table Candidates | concept | — |
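Two derived fields in the record above can be read off the data. The coverage.items statuses look like a simple actual-meets-target comparison, and ratingsString looks like an abbreviation of the ratings object. The rules below are assumptions inferred from the values shown, not confirmed scoring logic:

```ts
// Assumption: a coverage item is "green" when its actual count meets the target.
// This matches the record above (tables 16/12 green, footnotes 0/9 red).
type Status = "green" | "red";

const targets = { tables: 12, diagrams: 1, internalLinks: 24, externalLinks: 15, footnotes: 9, references: 9 };
const actuals = { tables: 16, diagrams: 1, internalLinks: 40, externalLinks: 29, footnotes: 0, references: 29 };

function coverageStatus(actual: number, target: number): Status {
  return actual >= target ? "green" : "red";
}

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  console.log(key, coverageStatus(actuals[key], targets[key]));
}
// tables green, diagrams green, internalLinks green,
// externalLinks green, footnotes red, references green

// ratingsString appears to abbreviate ratings (N=novelty, R=rigor,
// A=actionability, C=completeness) — inferred from the data, not documented:
function ratingsString(r: { novelty: number; rigor: number; actionability: number; completeness: number }): string {
  return `N:${r.novelty} R:${r.rigor} A:${r.actionability} C:${r.completeness}`;
}
console.log(ratingsString({ novelty: 4.2, rigor: 6.8, actionability: 6.5, completeness: 7.5 }));
// => "N:4.2 R:6.8 A:6.5 C:7.5"
```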