Reward Modeling
ID: reward-modeling · Type: approach
Path: /knowledge-base/responses/reward-modeling/
Entity ID (EID): E600
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time
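A minimal sketch of how such a record could be assembled at build time. The merge order, helper names, and key-collision behavior are assumptions for illustration, not the site's actual build code:

```python
# Hypothetical sketch of assembling a page record like the one below.
# Field sources and merge order are assumptions, not the real pipeline.
import json

def build_page_record(frontmatter: dict, entity: dict, metrics: dict) -> dict:
    """Merge MDX frontmatter, Entity YAML, and computed metrics.
    Later sources win on key collisions (an assumption)."""
    record = {}
    record.update(frontmatter)   # e.g. title, description, ratings
    record.update(entity)        # e.g. entityType, category, clusters
    record["metrics"] = metrics  # computed at build time, kept nested
    return record

if __name__ == "__main__":
    fm = {"id": "reward-modeling", "title": "Reward Modeling"}
    ent = {"entityType": "approach", "category": "responses"}
    m = {"wordCount": 1860, "tableCount": 19}
    print(json.dumps(build_page_record(fm, ent, m), indent=2))
```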
{
"id": "reward-modeling",
"numericId": null,
"path": "/knowledge-base/responses/reward-modeling/",
"filePath": "knowledge-base/responses/reward-modeling.mdx",
"title": "Reward Modeling",
"quality": 55,
"readerImportance": 20,
"researchImportance": 28.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Reward modeling, the core component of RLHF receiving \\$100M+/year investment, trains neural networks on human preference comparisons to enable scalable reinforcement learning. The technique is universally adopted but inherits fundamental limitations including reward hacking (which worsens with capability), vulnerability to deception, and Goodhart's law—making it capability-dominant rather than safety-enhancing.",
"description": "Reward modeling trains separate neural networks to predict human preferences, serving as the core component of RLHF pipelines. While essential for modern AI assistants and receiving over \\$500M/year in investment, it inherits all fundamental limitations of RLHF including reward hacking and lack of deception robustness.",
"ratings": {
"novelty": 3.5,
"rigor": 4,
"actionability": 3,
"completeness": 5.5
},
"category": "responses",
"subcategory": "alignment-training",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 1860,
"tableCount": 19,
"diagramCount": 1,
"internalLinks": 14,
"externalLinks": 12,
"footnoteCount": 0,
"bulletRatio": 0.07,
"sectionCount": 30,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 1860,
"unconvertedLinks": [
{
"text": "Deep RL from Human Preferences",
"url": "https://arxiv.org/abs/1706.03741",
"resourceId": "14df73723b4d14d7",
"resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
},
{
"text": "InstructGPT",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "Constitutional AI",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "Reward Hacking in RL",
"url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
"resourceId": "570615e019d1cc74",
"resourceTitle": "Reward Hacking in Reinforcement Learning"
},
{
"text": "Deep RL from Human Preferences",
"url": "https://arxiv.org/abs/1706.03741",
"resourceId": "14df73723b4d14d7",
"resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
},
{
"text": "Training Language Models to Follow Instructions",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "Constitutional AI",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "Reward Hacking in RL",
"url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
"resourceId": "570615e019d1cc74",
"resourceTitle": "Reward Hacking in Reinforcement Learning"
},
{
"text": "RLHF Book",
"url": "https://rlhfbook.com/",
"resourceId": "ebcbaba2d260e656",
"resourceTitle": "online iterative RLHF"
}
],
"unconvertedLinkCount": 9,
"convertedLinkCount": 0,
"backlinkCount": 11,
"hallucinationRisk": {
"level": "medium",
"score": 45,
"factors": [
"no-citations",
"conceptual-content"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 17,
"similarPages": [
{
"id": "rlhf",
"title": "RLHF / Constitutional AI",
"path": "/knowledge-base/responses/rlhf/",
"similarity": 17
},
{
"id": "adversarial-training",
"title": "Adversarial Training",
"path": "/knowledge-base/responses/adversarial-training/",
"similarity": 16
},
{
"id": "process-supervision",
"title": "Process Supervision",
"path": "/knowledge-base/responses/process-supervision/",
"similarity": 16
},
{
"id": "preference-optimization",
"title": "Preference Optimization Methods",
"path": "/knowledge-base/responses/preference-optimization/",
"similarity": 14
},
{
"id": "refusal-training",
"title": "Refusal Training",
"path": "/knowledge-base/responses/refusal-training/",
"similarity": 14
}
]
},
"coverage": {
"passing": 7,
"total": 13,
"targets": {
"tables": 7,
"diagrams": 1,
"internalLinks": 15,
"externalLinks": 9,
"footnotes": 6,
"references": 6
},
"actuals": {
"tables": 19,
"diagrams": 1,
"internalLinks": 14,
"externalLinks": 12,
"footnotes": 0,
"references": 5,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:3.5 R:4 A:3 C:5.5"
},
"readerRank": 523,
"researchRank": 429,
"recommendedScore": 141.63
}
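The llmSummary field above describes reward modeling as training a network on pairwise human preference comparisons. A minimal sketch of the standard Bradley-Terry pairwise objective that this describes; the model shape and names are illustrative, not taken from this page:

```python
# Minimal sketch of the pairwise preference loss used to train reward
# models (Bradley-Terry objective). The linear head stands in for a
# transformer-based scorer; dimensions are illustrative assumptions.
import torch
import torch.nn as nn

class RewardModel(nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.score = nn.Linear(dim, 1)  # stand-in for a real model head

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.score(x).squeeze(-1)  # scalar reward per example

def preference_loss(rm: RewardModel, chosen: torch.Tensor,
                    rejected: torch.Tensor) -> torch.Tensor:
    # P(chosen > rejected) = sigmoid(r_chosen - r_rejected);
    # minimize the negative log-likelihood of the human comparisons.
    return -torch.nn.functional.logsigmoid(rm(chosen) - rm(rejected)).mean()

rm = RewardModel()
chosen, rejected = torch.randn(8, 16), torch.randn(8, 16)
loss = preference_loss(rm, chosen, rejected)
loss.backward()
```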
External Links

No external links
Backlinks (11)
| id | title | type | relationship |
|---|---|---|---|
| solutions | AI Safety Solution Cruxes | crux | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| deepmind | Google DeepMind | organization | — |
| dario-amodei | Dario Amodei | person | — |
| jan-leike | Jan Leike | person | — |
| paul-christiano | Paul Christiano | person | — |
| alignment-training-overview | Training Methods (Overview) | concept | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| scalable-oversight | Scalable Oversight | safety-agenda | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
| mesa-optimization | Mesa-Optimization | risk | — |
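The coverage block above pairs per-item targets with actuals and assigns each a green/amber/red status. One plausible reading of that mapping, with the amber threshold as an assumption (the real scoring rule is not shown on this page):

```python
# Assumed rule: actual >= target -> green; within ~75% of target -> amber;
# otherwise red. Thresholds are guesses that happen to reproduce the
# statuses shown in the coverage block above.
def coverage_status(actual: int, target: int, amber_frac: float = 0.75) -> str:
    if actual >= target:
        return "green"
    if actual >= amber_frac * target:
        return "amber"
    return "red"

targets = {"tables": 7, "internalLinks": 15, "footnotes": 6}
actuals = {"tables": 19, "internalLinks": 14, "footnotes": 0}
print({k: coverage_status(actuals[k], targets[k]) for k in targets})
# -> {'tables': 'green', 'internalLinks': 'amber', 'footnotes': 'red'}
```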