Reward Modeling
ID: reward-modeling · Type: approach
Path: /knowledge-base/responses/reward-modeling/
Entity ID (EID): E600
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time
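A minimal sketch of how such a record could be assembled at build time. The merge order, helper names, and key-collision behavior are assumptions for illustration, not the site's actual build code:

```python
# Hypothetical sketch of assembling a page record like the one below.
# Field sources and merge order are assumptions, not the real pipeline.
import json

def build_page_record(frontmatter: dict, entity: dict, metrics: dict) -> dict:
    """Merge MDX frontmatter, Entity YAML, and computed metrics.
    Later sources win on key collisions (an assumption)."""
    record = {}
    record.update(frontmatter)   # e.g. title, description, ratings
    record.update(entity)        # e.g. entityType, category, clusters
    record["metrics"] = metrics  # computed at build time, kept nested
    return record

if __name__ == "__main__":
    fm = {"id": "reward-modeling", "title": "Reward Modeling"}
    ent = {"entityType": "approach", "category": "responses"}
    m = {"wordCount": 1860, "tableCount": 19}
    print(json.dumps(build_page_record(fm, ent, m), indent=2))
```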
{
"id": "reward-modeling",
"numericId": null,
"path": "/knowledge-base/responses/reward-modeling/",
"filePath": "knowledge-base/responses/reward-modeling.mdx",
"title": "Reward Modeling",
"quality": 55,
"readerImportance": 20,
"researchImportance": 28.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Reward modeling, the core component of RLHF receiving \\$100M+/year investment, trains neural networks on human preference comparisons to enable scalable reinforcement learning. The technique is universally adopted but inherits fundamental limitations including reward hacking (which worsens with capability), vulnerability to deception, and Goodhart's law—making it capability-dominant rather than safety-enhancing.",
"description": "Reward modeling trains separate neural networks to predict human preferences, serving as the core component of RLHF pipelines. While essential for modern AI assistants and receiving over \\$500M/year in investment, it inherits all fundamental limitations of RLHF including reward hacking and lack of deception robustness.",
"ratings": {
"novelty": 3.5,
"rigor": 4,
"actionability": 3,
"completeness": 5.5
},
"category": "responses",
"subcategory": "alignment-training",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 1860,
"tableCount": 19,
"diagramCount": 1,
"internalLinks": 14,
"externalLinks": 12,
"footnoteCount": 0,
"bulletRatio": 0.07,
"sectionCount": 30,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 1860,
"unconvertedLinks": [
{
"text": "Deep RL from Human Preferences",
"url": "https://arxiv.org/abs/1706.03741",
"resourceId": "14df73723b4d14d7",
"resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
},
{
"text": "InstructGPT",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "Constitutional AI",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "Reward Hacking in RL",
"url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
"resourceId": "570615e019d1cc74",
"resourceTitle": "Reward Hacking in Reinforcement Learning"
},
{
"text": "Deep RL from Human Preferences",
"url": "https://arxiv.org/abs/1706.03741",
"resourceId": "14df73723b4d14d7",
"resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
},
{
"text": "Training Language Models to Follow Instructions",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "Constitutional AI",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "Reward Hacking in RL",
"url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
"resourceId": "570615e019d1cc74",
"resourceTitle": "Reward Hacking in Reinforcement Learning"
},
{
"text": "RLHF Book",
"url": "https://rlhfbook.com/",
"resourceId": "ebcbaba2d260e656",
"resourceTitle": "online iterative RLHF"
}
],
"unconvertedLinkCount": 9,
"convertedLinkCount": 0,
"backlinkCount": 11,
"hallucinationRisk": {
"level": "medium",
"score": 45,
"factors": [
"no-citations",
"conceptual-content"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 17,
"similarPages": [
{
"id": "rlhf",
"title": "RLHF / Constitutional AI",
"path": "/knowledge-base/responses/rlhf/",
"similarity": 17
},
{
"id": "adversarial-training",
"title": "Adversarial Training",
"path": "/knowledge-base/responses/adversarial-training/",
"similarity": 16
},
{
"id": "process-supervision",
"title": "Process Supervision",
"path": "/knowledge-base/responses/process-supervision/",
"similarity": 16
},
{
"id": "preference-optimization",
"title": "Preference Optimization Methods",
"path": "/knowledge-base/responses/preference-optimization/",
"similarity": 14
},
{
"id": "refusal-training",
"title": "Refusal Training",
"path": "/knowledge-base/responses/refusal-training/",
"similarity": 14
}
]
},
"coverage": {
"passing": 7,
"total": 13,
"targets": {
"tables": 7,
"diagrams": 1,
"internalLinks": 15,
"externalLinks": 9,
"footnotes": 6,
"references": 6
},
"actuals": {
"tables": 19,
"diagrams": 1,
"internalLinks": 14,
"externalLinks": 12,
"footnotes": 0,
"references": 5,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:3.5 R:4 A:3 C:5.5"
},
"readerRank": 523,
"researchRank": 429,
"recommendedScore": 141.63
}
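The llmSummary field above describes reward modeling as training a network on pairwise human preference comparisons. A minimal sketch of the standard Bradley-Terry pairwise objective that this describes; the model shape and names are illustrative, not taken from this page:

```python
# Minimal sketch of the pairwise preference loss used to train reward
# models (Bradley-Terry objective). The linear head stands in for a
# transformer-based scorer; dimensions are illustrative assumptions.
import torch
import torch.nn as nn

class RewardModel(nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.score = nn.Linear(dim, 1)  # stand-in for a real model head

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.score(x).squeeze(-1)  # scalar reward per example

def preference_loss(rm: RewardModel, chosen: torch.Tensor,
                    rejected: torch.Tensor) -> torch.Tensor:
    # P(chosen > rejected) = sigmoid(r_chosen - r_rejected);
    # minimize the negative log-likelihood of the human comparisons.
    return -torch.nn.functional.logsigmoid(rm(chosen) - rm(rejected)).mean()

rm = RewardModel()
chosen, rejected = torch.randn(8, 16), torch.randn(8, 16)
loss = preference_loss(rm, chosen, rejected)
loss.backward()
```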
External Links

No external links
Backlinks (11)
| id | title | type | relationship |
|---|---|---|---|
| solutions | AI Safety Solution Cruxes | crux | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| deepmind | Google DeepMind | organization | — |
| dario-amodei | Dario Amodei | person | — |
| jan-leike | Jan Leike | person | — |
| paul-christiano | Paul Christiano | person | — |
| alignment-training-overview | Training Methods (Overview) | concept | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| scalable-oversight | Scalable Oversight | safety-agenda | — |
| weak-to-strong | Weak-to-Strong Generalization | approach | — |
| mesa-optimization | Mesa-Optimization | risk | — |
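The coverage block above pairs per-item targets with actuals and assigns each a green/amber/red status. One plausible reading of that mapping, with the amber threshold as an assumption (the real scoring rule is not shown on this page):

```python
# Assumed rule: actual >= target -> green; within ~75% of target -> amber;
# otherwise red. Thresholds are guesses that happen to reproduce the
# statuses shown in the coverage block above.
def coverage_status(actual: int, target: int, amber_frac: float = 0.75) -> str:
    if actual >= target:
        return "green"
    if actual >= amber_frac * target:
        return "amber"
    return "red"

targets = {"tables": 7, "internalLinks": 15, "footnotes": 6}
actuals = {"tables": 19, "internalLinks": 14, "footnotes": 0}
print({k: coverage_status(actuals[k], targets[k]) for k in targets})
# -> {'tables': 'green', 'internalLinks': 'amber', 'footnotes': 'red'}
```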