Longterm Wiki

RLHF / Constitutional AI

ID: rlhf · Type: capability · Path: /knowledge-base/responses/rlhf/
Entity ID (EID): E259
66 backlinks · Quality: 63 · Updated: 2026-03-13
Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "rlhf",
  "numericId": null,
  "path": "/knowledge-base/responses/rlhf/",
  "filePath": "knowledge-base/responses/rlhf.mdx",
  "title": "RLHF / Constitutional AI",
  "quality": 63,
  "readerImportance": 22.5,
  "researchImportance": 28,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "RLHF/Constitutional AI achieves 82-85% preference improvements and 40.8% adversarial attack reduction for current systems, but faces fundamental scalability limits: weak-to-strong supervision shows 10-20% performance gaps, sycophancy worsens with scale, and the approach cannot detect deceptive alignment. DPO variants reduce compute costs by 40-60% while matching performance, enabling widespread deployment across all frontier models (ChatGPT's 200M+ users).",
  "description": "RLHF and Constitutional AI are the dominant techniques for aligning language models with human preferences. InstructGPT (1.3B) is preferred over GPT-3 (175B) 85% of the time, and Constitutional AI reduces adversarial attack success by 40.8%. However, fundamental limitations—reward hacking, sycophancy, and the scalable oversight problem—prevent these techniques from reliably scaling to superhuman systems.",
  "ratings": {
    "novelty": 4.2,
    "rigor": 6.8,
    "actionability": 6.5,
    "completeness": 7.5
  },
  "category": "responses",
  "subcategory": "alignment-training",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2963,
    "tableCount": 16,
    "diagramCount": 1,
    "internalLinks": 40,
    "externalLinks": 29,
    "footnoteCount": 0,
    "bulletRatio": 0.18,
    "sectionCount": 40,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 2963,
  "unconvertedLinks": [
    {
      "text": "85±3% of time",
      "url": "https://arxiv.org/abs/2203.02155",
      "resourceId": "1098fc60be7ca2b0",
      "resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
    },
    {
      "text": "40.8%",
      "url": "https://arxiv.org/abs/2212.08073",
      "resourceId": "683aef834ac1612a"
    },
    {
      "text": "10-20% performance gap",
      "url": "https://arxiv.org/abs/2312.09390",
      "resourceId": "0ba98ae3a8a72270",
      "resourceTitle": "arXiv"
    },
    {
      "text": "82% less likely",
      "url": "https://cdn.openai.com/papers/gpt-4.pdf",
      "resourceId": "227c865a2154436e",
      "resourceTitle": "GPT-4 technical report"
    },
    {
      "text": "≈75%",
      "url": "https://arxiv.org/abs/2203.02155",
      "resourceId": "1098fc60be7ca2b0",
      "resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
    },
    {
      "text": "OpenAI 2023",
      "url": "https://cdn.openai.com/papers/gpt-4.pdf",
      "resourceId": "227c865a2154436e",
      "resourceTitle": "GPT-4 technical report"
    },
    {
      "text": "OpenAI 2024",
      "url": "https://openai.com/",
      "resourceId": "04d39e8bd5d50dd5",
      "resourceTitle": "OpenAI"
    },
    {
      "text": "Anthropic",
      "url": "https://www.anthropic.com/",
      "resourceId": "afe2508ac4caf5ee",
      "resourceTitle": "Anthropic"
    },
    {
      "text": "Meta 2024",
      "url": "https://ai.meta.com/llama/",
      "resourceId": "69c685f410104791",
      "resourceTitle": "Meta Llama 2 open-source"
    },
    {
      "text": "OpenAI",
      "url": "https://openai.com/gpt-4",
      "resourceId": "39f08ad975b7f4db",
      "resourceTitle": "GPT-4"
    },
    {
      "text": "Mistral AI",
      "url": "https://mistral.ai/",
      "resourceId": "aa1786bb9025867e",
      "resourceTitle": "Mistral"
    },
    {
      "text": "Perez et al. 2023",
      "url": "https://arxiv.org/abs/2212.09251",
      "resourceId": "cd36bb65654c0147",
      "resourceTitle": "Perez et al. (2022): \"Sycophancy in LLMs\""
    },
    {
      "text": "Wei et al. 2024",
      "url": "https://arxiv.org/abs/2310.13548",
      "resourceId": "7951bdb54fd936a6",
      "resourceTitle": "Anthropic: \"Discovering Sycophancy in Language Models\""
    },
    {
      "text": "NeurIPS 2024",
      "url": "https://arxiv.org/abs/2402.09345",
      "resourceId": "14a9103bf7c2a1ef",
      "resourceTitle": "InfoRM: Mitigating Reward Hacking in RLHF"
    },
    {
      "text": "arXiv 2025",
      "url": "https://arxiv.org/abs/2502.18770",
      "resourceId": "d4e5b9bc7e21476c",
      "resourceTitle": "Reward Shaping to Mitigate Reward Hacking in RLHF"
    }
  ],
  "unconvertedLinkCount": 15,
  "convertedLinkCount": 31,
  "backlinkCount": 66,
  "hallucinationRisk": {
    "level": "medium",
    "score": 55,
    "factors": [
      "no-citations"
    ]
  },
  "entityType": "capability",
  "redundancy": {
    "maxSimilarity": 19,
    "similarPages": [
      {
        "id": "preference-optimization",
        "title": "Preference Optimization Methods",
        "path": "/knowledge-base/responses/preference-optimization/",
        "similarity": 19
      },
      {
        "id": "reward-modeling",
        "title": "Reward Modeling",
        "path": "/knowledge-base/responses/reward-modeling/",
        "similarity": 17
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 17
      },
      {
        "id": "refusal-training",
        "title": "Refusal Training",
        "path": "/knowledge-base/responses/refusal-training/",
        "similarity": 16
      },
      {
        "id": "goal-misgeneralization",
        "title": "Goal Misgeneralization",
        "path": "/knowledge-base/risks/goal-misgeneralization/",
        "similarity": 16
      }
    ]
  },
  "coverage": {
    "passing": 9,
    "total": 13,
    "targets": {
      "tables": 12,
      "diagrams": 1,
      "internalLinks": 24,
      "externalLinks": 15,
      "footnotes": 9,
      "references": 9
    },
    "actuals": {
      "tables": 16,
      "diagrams": 1,
      "internalLinks": 40,
      "externalLinks": 29,
      "footnotes": 0,
      "references": 29,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "green",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.2 R:6.8 A:6.5 C:7.5"
  },
  "readerRank": 508,
  "researchRank": 432,
  "recommendedScore": 159.08
}
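The caption above describes this record as a build-time merge of MDX frontmatter, Entity YAML, and computed metrics. The TypeScript sketch below illustrates one way such a merge could work; the file layout, the helper names (buildPageRecord, computeMetrics, coverageStatus), the metric heuristics, and the assumption that later sources override earlier ones are illustrative guesses, not the wiki's actual build code.

// Hypothetical build step assembling one database.json page record.
// Field names mirror the record above; everything else is assumed.
import { readFileSync } from "node:fs";
import matter from "gray-matter";            // frontmatter parser
import { parse as parseYaml } from "yaml";

interface Metrics {
  wordCount: number;
  tableCount: number;
  internalLinks: number;
  externalLinks: number;
  footnoteCount: number;
}

// Crude proxies for the computed metrics shown in "metrics" above.
function computeMetrics(body: string): Metrics {
  return {
    wordCount: body.split(/\s+/).filter(Boolean).length,
    tableCount: (body.match(/^\|.+\|\s*$/gm) ?? []).length,          // pipe rows as a rough table signal
    internalLinks: (body.match(/\]\(\/knowledge-base\//g) ?? []).length,
    externalLinks: (body.match(/\]\(https?:\/\//g) ?? []).length,
    footnoteCount: (body.match(/\[\^[^\]]+\]:/g) ?? []).length,
  };
}

// Assumed coverage rule: an item is "green" when its actual count meets the target.
function coverageStatus(actual: number, target: number): "green" | "red" {
  return actual >= target ? "green" : "red";
}

function buildPageRecord(mdxPath: string, entityYamlPath: string) {
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));
  const entity = parseYaml(readFileSync(entityYamlPath, "utf8"));
  const metrics = computeMetrics(content);

  return {
    ...frontmatter,     // authored fields: title, llmSummary, ratings, ...
    ...entity,          // identity fields: id, entityType, category, clusters, ...
    filePath: mdxPath,
    metrics,
    wordCount: metrics.wordCount,
    // Example of the coverage rule applied to one target from the record above
    // (tables: actual 16 vs target 12 -> "green").
    tablesCoverage: coverageStatus(metrics.tableCount, 12),
  };
}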
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/rlhf",
  "wikipedia": "https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback",
  "stampy": "https://aisafety.info/questions/8RIL/What-is-RLHF",
  "wikidata": "https://www.wikidata.org/wiki/Q115570683",
  "grokipedia": "https://grokipedia.com/page/Reinforcement_learning_from_human_feedback"
}
Backlinks (66)
id | title | type | relationship
dense-transformers | Dense Transformers | concept |
reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | vulnerable-technique
value-learning | AI Value Learning | safety-agenda |
constitutional-ai | Constitutional AI | approach |
weak-to-strong | Weak-to-Strong Generalization | approach |
preference-optimization | Preference Optimization Methods | approach |
process-supervision | Process Supervision | approach |
refusal-training | Refusal Training | approach |
debate | AI Safety via Debate | approach |
reward-hacking | Reward Hacking | risk |
language-models | Large Language Models | capability |
accident-risks | AI Accident Risk Cruxes | crux |
solutions | AI Safety Solution Cruxes | crux |
why-alignment-easy | Why Alignment Might Be Easy | argument |
why-alignment-hard | Why Alignment Might Be Hard | argument |
deep-learning-era | Deep Learning Revolution (2012-2020) | historical |
miri-era | The MIRI Era (2000-2015) | historical |
__index__/knowledge-base | Knowledge Base | concept |
alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis |
capability-alignment-race | Capability-Alignment Race Model | analysis |
defense-in-depth-model | Defense in Depth Model | analysis |
intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis |
model-organisms-of-misalignment | Model Organisms of Misalignment | analysis |
risk-activation-timeline | Risk Activation Timeline Model | analysis |
safety-research-allocation | Safety Research Allocation Model | analysis |
safety-spending-at-scale | Safety Spending at Scale | analysis |
anthropic | Anthropic | organization |
arc | ARC (Alignment Research Center) | organization |
chai | CHAI (Center for Human-Compatible AI) | organization |
conjecture | Conjecture | organization |
deepmind | Google DeepMind | organization |
elicit | Elicit (AI Research Tool) | organization |
goodfire | Goodfire | organization |
microsoft | Microsoft AI | organization |
openai | OpenAI | organization |
pause-ai | Pause AI | organization |
chris-olah | Chris Olah | person |
connor-leahy | Connor Leahy | person |
dario-amodei | Dario Amodei | person |
eliezer-yudkowsky-predictions | Eliezer Yudkowsky: Track Record | concept |
eliezer-yudkowsky | Eliezer Yudkowsky | person |
ilya-sutskever | Ilya Sutskever | person |
jan-leike | Jan Leike | person |
paul-christiano | Paul Christiano | person |
alignment-training-overview | Training Methods (Overview) | concept |
alignment | AI Alignment | approach |
circuit-breakers | Circuit Breakers / Inference Interventions | approach |
cirl | Cooperative IRL (CIRL) | approach |
__index__/knowledge-base/responses | Safety Responses | concept |
intervention-portfolio | AI Safety Intervention Portfolio | approach |
model-spec | AI Model Specifications | policy |
reward-modeling | Reward Modeling | approach |
scalable-oversight | Scalable Oversight | safety-agenda |
cyber-psychosis | AI-Induced Cyber Psychosis | risk |
deceptive-alignment | Deceptive Alignment | risk |
epistemic-sycophancy | Epistemic Sycophancy | risk |
existential-risk | Existential Risk from AI | concept |
mesa-optimization | Mesa-Optimization | risk |
power-seeking | Power-Seeking AI | risk |
sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk |
sycophancy | Sycophancy | risk |
treacherous-turn | Treacherous Turn | risk |
doomer | AI Doomer Worldview | concept |
optimistic | Optimistic Alignment Worldview | concept |
__index__/insight-hunting | Insight Hunting | concept |
table-candidates | Table Candidates | concept |
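The backlink rows above can in principle be recomputed from the page records themselves. A minimal sketch follows, assuming each record exposes its converted internal links with an optional relationship label; that schema and the backlinksTo helper are hypothetical, not the wiki's actual query code.

// Hypothetical backlink derivation over database.json records.
// Assumes each record lists outgoing internal links with an optional
// relationship label; the real schema may differ.
interface LinkedPage {
  id: string;
  title: string;
  entityType: string;
  links: { path: string; relationship?: string }[];
}

function backlinksTo(targetPath: string, pages: LinkedPage[]) {
  return pages
    .filter((page) => page.links.some((link) => link.path === targetPath))
    .map((page) => ({
      id: page.id,
      title: page.title,
      type: page.entityType,
      relationship:
        page.links.find((link) => link.path === targetPath)?.relationship ?? "",
    }));
}

// e.g. backlinksTo("/knowledge-base/responses/rlhf/", allPages) should reproduce the 66 rows above.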