Longterm Wiki

Reward Modeling

reward-modeling · approach · Path: /knowledge-base/responses/reward-modeling/
E600 — Entity ID (EID)
← Back to page · 11 backlinks · Quality: 55 · Updated: 2026-03-13
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "reward-modeling",
  "numericId": null,
  "path": "/knowledge-base/responses/reward-modeling/",
  "filePath": "knowledge-base/responses/reward-modeling.mdx",
  "title": "Reward Modeling",
  "quality": 55,
  "readerImportance": 20,
  "researchImportance": 28.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Reward modeling, the core component of RLHF receiving $100M+/year investment, trains neural networks on human preference comparisons to enable scalable reinforcement learning. The technique is universally adopted but inherits fundamental limitations including reward hacking (which worsens with capability), vulnerability to deception, and Goodhart's law—making it capability-dominant rather than safety-enhancing.",
  "description": "Reward modeling trains separate neural networks to predict human preferences, serving as the core component of RLHF pipelines. While essential for modern AI assistants and receiving over $500M/year in investment, it inherits all fundamental limitations of RLHF including reward hacking and lack of deception robustness.",
  "ratings": {
    "novelty": 3.5,
    "rigor": 4,
    "actionability": 3,
    "completeness": 5.5
  },
  "category": "responses",
  "subcategory": "alignment-training",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 1860,
    "tableCount": 19,
    "diagramCount": 1,
    "internalLinks": 14,
    "externalLinks": 12,
    "footnoteCount": 0,
    "bulletRatio": 0.07,
    "sectionCount": 30,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 1860,
  "unconvertedLinks": [
    {
      "text": "Deep RL from Human Preferences",
      "url": "https://arxiv.org/abs/1706.03741",
      "resourceId": "14df73723b4d14d7",
      "resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
    },
    {
      "text": "InstructGPT",
      "url": "https://arxiv.org/abs/2203.02155",
      "resourceId": "1098fc60be7ca2b0",
      "resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
    },
    {
      "text": "Constitutional AI",
      "url": "https://arxiv.org/abs/2212.08073",
      "resourceId": "683aef834ac1612a"
    },
    {
      "text": "Reward Hacking in RL",
      "url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
      "resourceId": "570615e019d1cc74",
      "resourceTitle": "Reward Hacking in Reinforcement Learning"
    },
    {
      "text": "Deep RL from Human Preferences",
      "url": "https://arxiv.org/abs/1706.03741",
      "resourceId": "14df73723b4d14d7",
      "resourceTitle": "[1706.03741] Deep Reinforcement Learning from Human Preferences"
    },
    {
      "text": "Training Language Models to Follow Instructions",
      "url": "https://arxiv.org/abs/2203.02155",
      "resourceId": "1098fc60be7ca2b0",
      "resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
    },
    {
      "text": "Constitutional AI",
      "url": "https://arxiv.org/abs/2212.08073",
      "resourceId": "683aef834ac1612a"
    },
    {
      "text": "Reward Hacking in RL",
      "url": "https://lilianweng.github.io/posts/2024-11-28-reward-hacking/",
      "resourceId": "570615e019d1cc74",
      "resourceTitle": "Reward Hacking in Reinforcement Learning"
    },
    {
      "text": "RLHF Book",
      "url": "https://rlhfbook.com/",
      "resourceId": "ebcbaba2d260e656",
      "resourceTitle": "online iterative RLHF"
    }
  ],
  "unconvertedLinkCount": 9,
  "convertedLinkCount": 0,
  "backlinkCount": 11,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 17,
    "similarPages": [
      {
        "id": "rlhf",
        "title": "RLHF / Constitutional AI",
        "path": "/knowledge-base/responses/rlhf/",
        "similarity": 17
      },
      {
        "id": "adversarial-training",
        "title": "Adversarial Training",
        "path": "/knowledge-base/responses/adversarial-training/",
        "similarity": 16
      },
      {
        "id": "process-supervision",
        "title": "Process Supervision",
        "path": "/knowledge-base/responses/process-supervision/",
        "similarity": 16
      },
      {
        "id": "preference-optimization",
        "title": "Preference Optimization Methods",
        "path": "/knowledge-base/responses/preference-optimization/",
        "similarity": 14
      },
      {
        "id": "refusal-training",
        "title": "Refusal Training",
        "path": "/knowledge-base/responses/refusal-training/",
        "similarity": 14
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 7,
      "diagrams": 1,
      "internalLinks": 15,
      "externalLinks": 9,
      "footnotes": 6,
      "references": 6
    },
    "actuals": {
      "tables": 19,
      "diagrams": 1,
      "internalLinks": 14,
      "externalLinks": 12,
      "footnotes": 0,
      "references": 5,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:3.5 R:4 A:3 C:5.5"
  },
  "readerRank": 523,
  "researchRank": 429,
  "recommendedScore": 141.63
}
External Links

No external links

Backlinks (11)
id · title · type/relationship
solutions · AI Safety Solution Cruxes · crux
why-alignment-hard · Why Alignment Might Be Hard · argument
deepmind · Google DeepMind · organization
dario-amodei · Dario Amodei · person
jan-leike · Jan Leike · person
paul-christiano · Paul Christiano · person
alignment-training-overview · Training Methods (Overview) · concept
cirl · Cooperative IRL (CIRL) · approach
scalable-oversight · Scalable Oversight · safety-agenda
weak-to-strong · Weak-to-Strong Generalization · approach
mesa-optimization · Mesa-Optimization · risk
Longterm Wiki