Longterm Wiki

Reward Hacking Taxonomy and Severity Model

reward-hacking-taxonomy · analysis · Path: /knowledge-base/models/reward-hacking-taxonomy/
E254 · Entity ID (EID)
← Back to page · 0 backlinks · Quality: 71 · Updated: 2026-03-13
Page Record · database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "reward-hacking-taxonomy",
  "numericId": null,
  "path": "/knowledge-base/models/reward-hacking-taxonomy/",
  "filePath": "knowledge-base/models/reward-hacking-taxonomy.mdx",
  "title": "Reward Hacking Taxonomy and Severity Model",
  "quality": 71,
  "readerImportance": 44.5,
  "researchImportance": 88,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Taxonomizes 12 reward hacking modes with likelihood (20-90%) and severity scores, finding proxy exploitation affects 80-95% of current systems (low severity) while deceptive hacking (5-40% likelihood in advanced systems) and meta-hacking pose catastrophic risks. Analysis shows severe reward hacking probability increases from 5-15% (current) to 30-60% (advanced systems), with no single mitigation effective across all modes—requiring defense-in-depth combining specification improvement, diverse oversight, interpretability, and AI control.",
  "description": "This model classifies 12 reward hacking failure modes by mechanism, likelihood (20-90%), and severity. It finds that proxy exploitation affects 80-95% of current systems (low severity), while deceptive hacking and meta-hacking (5-40% likelihood) pose catastrophic risks requiring fundamentally different mitigations.",
  "ratings": {
    "focus": 8.5,
    "novelty": 5.2,
    "rigor": 7.8,
    "completeness": 8,
    "concreteness": 7.5,
    "actionability": 6.5
  },
  "category": "models",
  "subcategory": "risk-models",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 6609,
    "tableCount": 9,
    "diagramCount": 3,
    "internalLinks": 29,
    "externalLinks": 0,
    "footnoteCount": 0,
    "bulletRatio": 0.06,
    "sectionCount": 47,
    "hasOverview": true,
    "structuralScore": 12
  },
  "suggestedQuality": 80,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 6609,
  "unconvertedLinks": [],
  "unconvertedLinkCount": 0,
  "convertedLinkCount": 22,
  "backlinkCount": 0,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "few-external-sources",
      "high-rigor"
    ]
  },
  "entityType": "analysis",
  "redundancy": {
    "maxSimilarity": 23,
    "similarPages": [
      {
        "id": "why-alignment-hard",
        "title": "Why Alignment Might Be Hard",
        "path": "/knowledge-base/debates/why-alignment-hard/",
        "similarity": 23
      },
      {
        "id": "scalable-oversight",
        "title": "Scalable Oversight",
        "path": "/knowledge-base/responses/scalable-oversight/",
        "similarity": 23
      },
      {
        "id": "reward-hacking",
        "title": "Reward Hacking",
        "path": "/knowledge-base/risks/reward-hacking/",
        "similarity": 23
      },
      {
        "id": "agentic-ai",
        "title": "Agentic AI",
        "path": "/knowledge-base/capabilities/agentic-ai/",
        "similarity": 21
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 21
      }
    ]
  },
  "coverage": {
    "passing": 5,
    "total": 13,
    "targets": {
      "tables": 26,
      "diagrams": 3,
      "internalLinks": 53,
      "externalLinks": 33,
      "footnotes": 20,
      "references": 20
    },
    "actuals": {
      "tables": 9,
      "diagrams": 3,
      "internalLinks": 29,
      "externalLinks": 0,
      "footnotes": 0,
      "references": 10,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "red",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:5.2 R:7.8 A:6.5 C:8"
  },
  "readerRank": 341,
  "researchRank": 36,
  "recommendedScore": 186.11
}
External Links

No external links

Backlinks (0)

No backlinks

Longterm Wiki