Reward Hacking Taxonomy and Severity Model
ID: reward-hacking-taxonomy · Type: analysis · Path: /knowledge-base/models/reward-hacking-taxonomy/
Entity ID (EID): E254
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time
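As a rough illustration of that merge, a minimal sketch in TypeScript. The parser choices (gray-matter, js-yaml) and every name below are assumptions for illustration, not the site's actual build code:

import { readFileSync } from "fs";
import matter from "gray-matter"; // assumed frontmatter parser
import { load } from "js-yaml";   // assumed YAML parser

function buildPageRecord(mdxPath: string, entityYamlPath: string): Record<string, unknown> {
  // 1. Authored fields (title, description, ratings, ...) come from MDX frontmatter.
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));
  // 2. Registry fields (entity ID, entityType, ...) come from the entity YAML.
  const entity = load(readFileSync(entityYamlPath, "utf8")) as Record<string, unknown>;
  // 3. Metrics are computed from the page body at build time.
  const metrics = { wordCount: content.split(/\s+/).filter(Boolean).length };
  // Later spreads win on key collisions: authored frontmatter overrides registry defaults.
  return { ...entity, ...frontmatter, metrics };
}

The record below is the merged output for this page.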
{
  "id": "reward-hacking-taxonomy",
  "numericId": null,
  "path": "/knowledge-base/models/reward-hacking-taxonomy/",
  "filePath": "knowledge-base/models/reward-hacking-taxonomy.mdx",
  "title": "Reward Hacking Taxonomy and Severity Model",
  "quality": 71,
  "readerImportance": 44.5,
  "researchImportance": 88,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Taxonomizes 12 reward hacking modes with likelihood (20-90%) and severity scores, finding proxy exploitation affects 80-95% of current systems (low severity) while deceptive hacking (5-40% likelihood in advanced systems) and meta-hacking pose catastrophic risks. Analysis shows severe reward hacking probability increases from 5-15% (current) to 30-60% (advanced systems), with no single mitigation effective across all modes—requiring defense-in-depth combining specification improvement, diverse oversight, interpretability, and AI control.",
  "description": "This model classifies 12 reward hacking failure modes by mechanism, likelihood (20-90%), and severity. It finds that proxy exploitation affects 80-95% of current systems (low severity), while deceptive hacking and meta-hacking (5-40% likelihood) pose catastrophic risks requiring fundamentally different mitigations.",
  "ratings": {
    "focus": 8.5,
    "novelty": 5.2,
    "rigor": 7.8,
    "completeness": 8,
    "concreteness": 7.5,
    "actionability": 6.5
  },
  "category": "models",
  "subcategory": "risk-models",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 6609,
    "tableCount": 9,
    "diagramCount": 3,
    "internalLinks": 29,
    "externalLinks": 0,
    "footnoteCount": 0,
    "bulletRatio": 0.06,
    "sectionCount": 47,
    "hasOverview": true,
    "structuralScore": 12
  },
  "suggestedQuality": 80,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 6609,
  "unconvertedLinks": [],
  "unconvertedLinkCount": 0,
  "convertedLinkCount": 22,
  "backlinkCount": 0,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "few-external-sources",
      "high-rigor"
    ]
  },
  "entityType": "analysis",
  "redundancy": {
    "maxSimilarity": 23,
    "similarPages": [
      {
        "id": "why-alignment-hard",
        "title": "Why Alignment Might Be Hard",
        "path": "/knowledge-base/debates/why-alignment-hard/",
        "similarity": 23
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 23
      },
      {
        "id": "reward-hacking",
        "title": "Reward Hacking",
        "path": "/knowledge-base/risks/reward-hacking/",
        "similarity": 23
      },
      {
        "id": "agentic-ai",
        "title": "Agentic AI",
        "path": "/knowledge-base/capabilities/agentic-ai/",
        "similarity": 21
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 21
      }
    ]
  },
  "coverage": {
    "passing": 5,
    "total": 13,
    "targets": {
      "tables": 26,
      "diagrams": 3,
      "internalLinks": 53,
      "externalLinks": 33,
      "footnotes": 20,
      "references": 20
    },
    "actuals": {
      "tables": 9,
      "diagrams": 3,
      "internalLinks": 29,
      "externalLinks": 0,
      "footnotes": 0,
      "references": 10,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "red",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:5.2 R:7.8 A:6.5 C:8"
  },
  "readerRank": 341,
  "researchRank": 36,
  "recommendedScore": 186.11
}

External Links
No external links
Backlinks (0)
No backlinks
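The "metrics" block in the record above ("wordCount", "bulletRatio", "sectionCount", ...) is computed from the page body at build time. A minimal sketch of plausible heuristics, assuming a markdown/MDX body; the exact definitions (what counts as a table, a bullet, a section) are guesses from the field names, not the site's actual code:

function computeMetrics(body: string) {
  const lines = body.split("\n");
  // Count markdown table separator rows (e.g. "| --- | --- |"), one per table.
  const tableCount = lines.filter((l) => /^\s*\|[-\s:|]+\|\s*$/.test(l)).length;
  // Lines beginning with -, *, or + are treated as bullets.
  const bulletLines = lines.filter((l) => /^\s*[-*+]\s/.test(l)).length;
  return {
    wordCount: body.split(/\s+/).filter(Boolean).length,
    tableCount,
    sectionCount: lines.filter((l) => /^#{1,6}\s/.test(l)).length,
    internalLinks: (body.match(/\]\(\/knowledge-base\//g) ?? []).length,
    externalLinks: (body.match(/\]\(https?:\/\//g) ?? []).length,
    // 0.06 in the record above suggests bullets as a share of all lines.
    bulletRatio: lines.length ? Number((bulletLines / lines.length).toFixed(2)) : 0,
    hasOverview: /^#{1,6}\s+Overview\b/m.test(body),
  };
}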
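Likewise, the "coverage.items" statuses look like a simple ratio of actuals to targets. A sketch of a status function whose cutoffs are reverse-engineered from this single record, so the exact thresholds are a guess:

type CoverageStatus = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): CoverageStatus {
  // Items with no target (quotes, accuracy) appear to show red when nothing is present.
  if (target <= 0) return actual > 0 ? "green" : "red";
  const ratio = actual / target;
  if (ratio >= 1) return "green";    // diagrams: 3 of 3
  if (ratio >= 0.25) return "amber"; // tables: 9 of 26; internalLinks: 29 of 53; references: 10 of 20
  return "red";                      // externalLinks: 0 of 33; footnotes: 0 of 20
}

Applied to the targets and actuals above, this reproduces all six target-backed item statuses in the record.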