RLHF / Constitutional AI
ID: rlhf
Entity type: capability
Path: /knowledge-base/responses/rlhf/
Entity ID (EID): E259

Page Record — database.json, merged from MDX frontmatter + Entity YAML + computed metrics at build time
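Before the record itself, a minimal sketch of what such a build-time merge could look like. The interfaces and the buildPageRecord helper are illustrative assumptions, not the site's actual build code; the field names are taken from the record below.

```ts
// Illustrative sketch only — hypothetical helper, not the site's actual build code.
// Field names mirror the page record below; the merge precedence is an assumption.

interface Frontmatter {
  id: string;
  title: string;
  description: string;
  lastUpdated: string;
}

interface EntityYaml {
  eid: string;        // e.g. "E259"
  entityType: string; // e.g. "capability"
  path: string;
}

interface PageMetrics {
  wordCount: number;
  tableCount: number;
  diagramCount: number;
  internalLinks: number;
  externalLinks: number;
  footnoteCount: number;
}

interface PageRecord extends Frontmatter {
  entityType: string;
  path: string;
  metrics: PageMetrics;
}

// Merge the three sources into one record. Spread order means later
// fields win on name collisions (frontmatter, then entity, then metrics).
function buildPageRecord(
  fm: Frontmatter,
  entity: EntityYaml,
  metrics: PageMetrics,
): PageRecord {
  return {
    ...fm,
    entityType: entity.entityType,
    path: entity.path,
    metrics,
  };
}
```

The merged record as stored follows.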
{
"id": "rlhf",
"numericId": null,
"path": "/knowledge-base/responses/rlhf/",
"filePath": "knowledge-base/responses/rlhf.mdx",
"title": "RLHF / Constitutional AI",
"quality": 63,
"readerImportance": 22.5,
"researchImportance": 28,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "RLHF/Constitutional AI achieves 82-85% preference improvements and 40.8% adversarial attack reduction for current systems, but faces fundamental scalability limits: weak-to-strong supervision shows 10-20% performance gaps, sycophancy worsens with scale, and the approach cannot detect deceptive alignment. DPO variants reduce compute costs by 40-60% while matching performance, enabling widespread deployment across all frontier models (ChatGPT's 200M+ users).",
"description": "RLHF and Constitutional AI are the dominant techniques for aligning language models with human preferences. InstructGPT (1.3B) is preferred over GPT-3 (175B) 85% of the time, and Constitutional AI reduces adversarial attack success by 40.8%. However, fundamental limitations—reward hacking, sycophancy, and the scalable oversight problem—prevent these techniques from reliably scaling to superhuman systems.",
"ratings": {
"novelty": 4.2,
"rigor": 6.8,
"actionability": 6.5,
"completeness": 7.5
},
"category": "responses",
"subcategory": "alignment-training",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 2963,
"tableCount": 16,
"diagramCount": 1,
"internalLinks": 40,
"externalLinks": 29,
"footnoteCount": 0,
"bulletRatio": 0.18,
"sectionCount": 40,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 2963,
"unconvertedLinks": [
{
"text": "85±3% of time",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "40.8%",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "10-20% performance gap",
"url": "https://arxiv.org/abs/2312.09390",
"resourceId": "0ba98ae3a8a72270",
"resourceTitle": "arXiv"
},
{
"text": "82% less likely",
"url": "https://cdn.openai.com/papers/gpt-4.pdf",
"resourceId": "227c865a2154436e",
"resourceTitle": "GPT-4 technical report"
},
{
"text": "≈75%",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "OpenAI 2023",
"url": "https://cdn.openai.com/papers/gpt-4.pdf",
"resourceId": "227c865a2154436e",
"resourceTitle": "GPT-4 technical report"
},
{
"text": "OpenAI 2024",
"url": "https://openai.com/",
"resourceId": "04d39e8bd5d50dd5",
"resourceTitle": "OpenAI"
},
{
"text": "Anthropic",
"url": "https://www.anthropic.com/",
"resourceId": "afe2508ac4caf5ee",
"resourceTitle": "Anthropic"
},
{
"text": "Meta 2024",
"url": "https://ai.meta.com/llama/",
"resourceId": "69c685f410104791",
"resourceTitle": "Meta Llama 2 open-source"
},
{
"text": "OpenAI",
"url": "https://openai.com/gpt-4",
"resourceId": "39f08ad975b7f4db",
"resourceTitle": "GPT-4"
},
{
"text": "Mistral AI",
"url": "https://mistral.ai/",
"resourceId": "aa1786bb9025867e",
"resourceTitle": "Mistral"
},
{
"text": "Perez et al. 2023",
"url": "https://arxiv.org/abs/2212.09251",
"resourceId": "cd36bb65654c0147",
"resourceTitle": "Perez et al. (2022): \"Sycophancy in LLMs\""
},
{
"text": "Wei et al. 2024",
"url": "https://arxiv.org/abs/2310.13548",
"resourceId": "7951bdb54fd936a6",
"resourceTitle": "Anthropic: \"Discovering Sycophancy in Language Models\""
},
{
"text": "NeurIPS 2024",
"url": "https://arxiv.org/abs/2402.09345",
"resourceId": "14a9103bf7c2a1ef",
"resourceTitle": "InfoRM: Mitigating Reward Hacking in RLHF"
},
{
"text": "arXiv 2025",
"url": "https://arxiv.org/abs/2502.18770",
"resourceId": "d4e5b9bc7e21476c",
"resourceTitle": "Reward Shaping to Mitigate Reward Hacking in RLHF"
}
],
"unconvertedLinkCount": 15,
"convertedLinkCount": 31,
"backlinkCount": 66,
"hallucinationRisk": {
"level": "medium",
"score": 55,
"factors": [
"no-citations"
]
},
"entityType": "capability",
"redundancy": {
"maxSimilarity": 19,
"similarPages": [
{
"id": "preference-optimization",
"title": "Preference Optimization Methods",
"path": "/knowledge-base/responses/preference-optimization/",
"similarity": 19
},
{
"id": "reward-modeling",
"title": "Reward Modeling",
"path": "/knowledge-base/responses/reward-modeling/",
"similarity": 17
},
{
"id": "scalable-oversight",
"title": "Scalable Oversight",
"path": "/knowledge-base/responses/scalable-oversight/",
"similarity": 17
},
{
"id": "refusal-training",
"title": "Refusal Training",
"path": "/knowledge-base/responses/refusal-training/",
"similarity": 16
},
{
"id": "goal-misgeneralization",
"title": "Goal Misgeneralization",
"path": "/knowledge-base/risks/goal-misgeneralization/",
"similarity": 16
}
]
},
"coverage": {
"passing": 9,
"total": 13,
"targets": {
"tables": 12,
"diagrams": 1,
"internalLinks": 24,
"externalLinks": 15,
"footnotes": 9,
"references": 9
},
"actuals": {
"tables": 16,
"diagrams": 1,
"internalLinks": 40,
"externalLinks": 29,
"footnotes": 0,
"references": 29,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "green",
"externalLinks": "green",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:4.2 R:6.8 A:6.5 C:7.5"
},
"readerRank": 508,
"researchRank": 432,
"recommendedScore": 159.08
}

External Links
{
"lesswrong": "https://www.lesswrong.com/tag/rlhf",
"wikipedia": "https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback",
"stampy": "https://aisafety.info/questions/8RIL/What-is-RLHF",
"wikidata": "https://www.wikidata.org/wiki/Q115570683",
"grokipedia": "https://grokipedia.com/page/Reinforcement_learning_from_human_feedback"
}

Backlinks (66)
| id | title | type | relationship |
|---|---|---|---|
| dense-transformers | Dense Transformers | concept | — |
| reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | vulnerable-technique |
| value-learning | AI Value Learning | safety-agenda | — |
| constitutional-ai | Constitutional AI | approach | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
| preference-optimization | Preference Optimization Methods | approach | — |
| process-supervision | Process Supervision | approach | — |
| refusal-training | Refusal Training | approach | — |
| debate | AI Safety via Debate | approach | — |
| reward-hacking | Reward Hacking | risk | — |
| language-models | Large Language Models | capability | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| solutions | AI Safety Solution Cruxes | crux | — |
| why-alignment-easy | Why Alignment Might Be Easy | argument | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| deep-learning-era | Deep Learning Revolution (2012-2020) | historical | — |
| miri-era | The MIRI Era (2000-2015) | historical | — |
| __index__/knowledge-base | Knowledge Base | concept | — |
| alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | — |
| capability-alignment-race | Capability-Alignment Race Model | analysis | — |
| defense-in-depth-model | Defense in Depth Model | analysis | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | — |
| risk-activation-timeline | Risk Activation Timeline Model | analysis | — |
| safety-research-allocation | Safety Research Allocation Model | analysis | — |
| safety-spending-at-scale | Safety Spending at Scale | analysis | — |
| anthropic | Anthropic | organization | — |
| arc | ARC (Alignment Research Center) | organization | — |
| chai | CHAI (Center for Human-Compatible AI) | organization | — |
| conjecture | Conjecture | organization | — |
| deepmind | Google DeepMind | organization | — |
| elicit | Elicit (AI Research Tool) | organization | — |
| goodfire | Goodfire | organization | — |
| microsoft | Microsoft AI | organization | — |
| openai | OpenAI | organization | — |
| pause-ai | Pause AI | organization | — |
| chris-olah | Chris Olah | person | — |
| connor-leahy | Connor Leahy | person | — |
| dario-amodei | Dario Amodei | person | — |
| eliezer-yudkowsky-predictions | Eliezer Yudkowsky: Track Record | concept | — |
| eliezer-yudkowsky | Eliezer Yudkowsky | person | — |
| ilya-sutskever | Ilya Sutskever | person | — |
| jan-leike | Jan Leike | person | — |
| paul-christiano | Paul Christiano | person | — |
| alignment-training-overview | Training Methods (Overview) | concept | — |
| alignment | AI Alignment | approach | — |
| circuit-breakers | Circuit Breakers / Inference Interventions | approach | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| __index__/knowledge-base/responses | Safety Responses | concept | — |
| intervention-portfolio | AI Safety Intervention Portfolio | approach | — |
| model-spec | AI Model Specifications | policy | — |
| reward-modeling | Reward Modeling | approach | — |
| scalable-oversight | Scalable Oversight | safety-agenda | — |
| cyber-psychosis | AI-Induced Cyber Psychosis | risk | — |
| deceptive-alignment | Deceptive Alignment | risk | — |
| epistemic-sycophancy | Epistemic Sycophancy | risk | — |
| existential-risk | Existential Risk from AI | concept | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| power-seeking | Power-Seeking AI | risk | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |
| sycophancy | Sycophancy | risk | — |
| treacherous-turn | Treacherous Turn | risk | — |
| doomer | AI Doomer Worldview | concept | — |
| optimistic | Optimistic Alignment Worldview | concept | — |
| __index__/insight-hunting | Insight Hunting | concept | — |
| table-candidates | Table Candidates | concept | — |
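Two derived fields in the record above can be read off the data. The coverage.items statuses look like a simple actual-meets-target comparison, and ratingsString looks like an abbreviation of the ratings object. The rules below are assumptions inferred from the values shown, not confirmed scoring logic:

```ts
// Assumption: a coverage item is "green" when its actual count meets the target.
// This matches the record above (tables 16/12 green, footnotes 0/9 red).
type Status = "green" | "red";

const targets = { tables: 12, diagrams: 1, internalLinks: 24, externalLinks: 15, footnotes: 9, references: 9 };
const actuals = { tables: 16, diagrams: 1, internalLinks: 40, externalLinks: 29, footnotes: 0, references: 29 };

function coverageStatus(actual: number, target: number): Status {
  return actual >= target ? "green" : "red";
}

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  console.log(key, coverageStatus(actuals[key], targets[key]));
}
// tables green, diagrams green, internalLinks green,
// externalLinks green, footnotes red, references green

// ratingsString appears to abbreviate ratings (N=novelty, R=rigor,
// A=actionability, C=completeness) — inferred from the data, not documented:
function ratingsString(r: { novelty: number; rigor: number; actionability: number; completeness: number }): string {
  return `N:${r.novelty} R:${r.rigor} A:${r.actionability} C:${r.completeness}`;
}
console.log(ratingsString({ novelty: 4.2, rigor: 6.8, actionability: 6.5, completeness: 7.5 }));
// => "N:4.2 R:6.8 A:6.5 C:7.5"
```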