Longterm Wiki

Scalable Oversight

scalable-oversight · safety-agenda
Path: /knowledge-base/responses/scalable-oversight/
Entity ID (EID): E271
58 backlinks · Quality: 68 · Updated: 2026-03-13

Page Record
database.json — merged from MDX frontmatter, Entity YAML, and computed metrics at build time
{
  "id": "scalable-oversight",
  "numericId": null,
  "path": "/knowledge-base/responses/scalable-oversight/",
  "filePath": "knowledge-base/responses/scalable-oversight.mdx",
  "title": "Scalable Oversight",
  "quality": 68,
  "readerImportance": 51.5,
  "researchImportance": 30,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Process supervision achieves 78.2% accuracy on MATH benchmarks (vs 72.4% outcome-based) and is deployed in OpenAI's o1 models, while debate shows 60-80% accuracy on factual questions with +4% improvement from self-play training. However, effectiveness against sophisticated deception remains unproven, with debate accuracy dropping to 50-65% on complex reasoning tasks.",
  "description": "Methods for supervising AI systems on tasks too complex for direct human evaluation, including debate, recursive reward modeling, and process supervision. Process supervision achieves 78.2% accuracy on MATH benchmarks (vs 72.4% outcome-based), while debate shows 60-80% accuracy on factual questions with +4% improvement from self-play training. Critical for maintaining oversight as AI capabilities exceed human expertise.",
  "ratings": {
    "novelty": 4.2,
    "rigor": 6.8,
    "actionability": 5.9,
    "completeness": 7.1
  },
  "category": "responses",
  "subcategory": "alignment-theoretical",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 5698,
    "tableCount": 2,
    "diagramCount": 1,
    "internalLinks": 42,
    "externalLinks": 0,
    "footnoteCount": 0,
    "bulletRatio": 0.1,
    "sectionCount": 38,
    "hasOverview": true,
    "structuralScore": 12
  },
  "suggestedQuality": 80,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 5698,
  "unconvertedLinks": [],
  "unconvertedLinkCount": 0,
  "convertedLinkCount": 31,
  "backlinkCount": 58,
  "hallucinationRisk": {
    "level": "medium",
    "score": 50,
    "factors": [
      "no-citations",
      "few-external-sources",
      "conceptual-content"
    ]
  },
  "entityType": "safety-agenda",
  "redundancy": {
    "maxSimilarity": 23,
    "similarPages": [
      {
        "id": "agentic-ai",
        "title": "Agentic AI",
        "path": "/knowledge-base/capabilities/agentic-ai/",
        "similarity": 23
      },
      {
        "id": "reasoning",
        "title": "Reasoning and Planning",
        "path": "/knowledge-base/capabilities/reasoning/",
        "similarity": 23
      },
      {
        "id": "why-alignment-hard",
        "title": "Why Alignment Might Be Hard",
        "path": "/knowledge-base/debates/why-alignment-hard/",
        "similarity": 23
      },
      {
        "id": "reward-hacking-taxonomy",
        "title": "Reward Hacking Taxonomy and Severity Model",
        "path": "/knowledge-base/models/reward-hacking-taxonomy/",
        "similarity": 23
      },
      {
        "id": "interpretability",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/interpretability/",
        "similarity": 22
      }
    ]
  },
  "coverage": {
    "passing": 5,
    "total": 13,
    "targets": {
      "tables": 23,
      "diagrams": 2,
      "internalLinks": 46,
      "externalLinks": 28,
      "footnotes": 17,
      "references": 17
    },
    "actuals": {
      "tables": 2,
      "diagrams": 1,
      "internalLinks": 42,
      "externalLinks": 0,
      "footnotes": 0,
      "references": 20,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "amber",
      "internalLinks": "amber",
      "externalLinks": "red",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.2 R:6.8 A:5.9 C:7.1"
  },
  "readerRank": 294,
  "researchRank": 417,
  "recommendedScore": 183.61
}
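The record above is assembled at build time from three sources, as noted under "Page Record". The sketch below shows one plausible way such a merge could work; it is illustrative only, and the package names, function names, and field heuristics are assumptions rather than the wiki's actual build code.

// Hypothetical sketch of the build-time merge described above:
// MDX frontmatter + Entity YAML + computed metrics -> one page record.
// All names here are illustrative, not the wiki's real implementation.
import matter from "gray-matter";          // frontmatter parser (assumed dependency)
import { parse as parseYaml } from "yaml"; // YAML parser (assumed dependency)
import { readFileSync } from "node:fs";

interface PageRecord {
  id: string;
  path: string;
  title: string;
  quality?: number;
  entityType?: string;
  metrics?: Record<string, number>;
  [key: string]: unknown;
}

function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  // 1. MDX frontmatter supplies editorial fields (title, quality, summary, ...).
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));
  // 2. Entity YAML supplies identity fields (entityType, external links, ...).
  const entity = parseYaml(readFileSync(entityYamlPath, "utf8"));
  // 3. Metrics are computed from the MDX body itself (simplified heuristics).
  const metrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    internalLinks: (content.match(/\]\(\/knowledge-base\//g) ?? []).length,
    externalLinks: (content.match(/\]\(https?:\/\//g) ?? []).length,
  };
  // Later sources win on key collisions; computed metrics sit under their own key.
  return { ...entity, ...frontmatter, metrics } as PageRecord;
}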
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/scalable-oversight",
  "stampy": "https://aisafety.info/questions/8IHH/What-is-scalable-oversight",
  "alignmentForum": "https://www.alignmentforum.org/tag/scalable-oversight"
}
Backlinks (58)
id | title | type | relationship
rlhf | RLHF | capability | -
capability-alignment-race | Capability-Alignment Race Model | analysis | -
technical-pathways | AI Safety Technical Pathway Decomposition | analysis | -
reward-hacking-taxonomy | Reward Hacking Taxonomy and Severity Model | analysis | mitigation
anthropic | Anthropic | organization | research
deepmind | Google DeepMind | organization | research
openai | OpenAI | organization | research
arc | ARC | organization | -
jan-leike | Jan Leike | person | -
paul-christiano | Paul Christiano | person | -
anthropic-core-views | Anthropic Core Views | safety-agenda | -
process-supervision | Process Supervision | approach | -
eliciting-latent-knowledge | Eliciting Latent Knowledge (ELK) | approach | -
debate | AI Safety via Debate | approach | -
multi-agent | Multi-Agent Safety | approach | -
deceptive-alignment | Deceptive Alignment | risk | -
reward-hacking | Reward Hacking | risk | -
sycophancy | Sycophancy | risk | -
agentic-ai | Agentic AI | capability | -
language-models | Large Language Models | capability | -
scientific-research | Scientific Research Capabilities | capability | -
solutions | AI Safety Solution Cruxes | crux | -
why-alignment-easy | Why Alignment Might Be Easy | argument | -
why-alignment-hard | Why Alignment Might Be Hard | argument | -
deep-learning-era | Deep Learning Revolution (2012-2020) | historical | -
miri-era | The MIRI Era (2000-2015) | historical | -
ai-timelines | AI Timelines | concept | -
alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | -
intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | -
model-organisms-of-misalignment | Model Organisms of Misalignment | analysis | -
pre-tai-capital-deployment | Pre-TAI Capital Deployment: $100B-$300B+ Spending Analysis | analysis | -
safety-spending-at-scale | Safety Spending at Scale | analysis | -
short-timeline-policy-implications | Short Timeline Policy Implications | analysis | -
elicit | Elicit (AI Research Tool) | organization | -
lionheart-ventures | Lionheart Ventures | organization | -
mats | MATS ML Alignment Theory Scholars program | organization | -
metr | METR | organization | -
pause-ai | Pause AI | organization | -
chris-olah | Chris Olah | person | -
connor-leahy | Connor Leahy | person | -
dario-amodei | Dario Amodei | person | -
neel-nanda | Neel Nanda | person | -
agent-foundations | Agent Foundations | approach | -
ai-control | AI Control | safety-agenda | -
alignment-theoretical-overview | Theoretical Foundations (Overview) | concept | -
alignment | AI Alignment | approach | -
interpretability | Mechanistic Interpretability | safety-agenda | -
intervention-portfolio | AI Safety Intervention Portfolio | approach | -
preference-optimization | Preference Optimization Methods | approach | -
research-agendas | AI Alignment Research Agenda Comparison | crux | -
scalable-eval-approaches | Scalable Eval Approaches | approach | -
scheming-detection | Scheming & Deception Detection | approach | -
sleeper-agent-detection | Sleeper Agent Detection | approach | -
training-programs | AI Safety Training Programs | approach | -
weak-to-strong | Weak-to-Strong Generalization | approach | -
automation-bias | Automation Bias (AI Systems) | risk | -
existential-risk | Existential Risk from AI | concept | -
expertise-atrophy | AI-Induced Expertise Atrophy | risk | -
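The backlinkCount field in the page record should equal the number of rows in this table (58). A small hypothetical check of that invariant, with an aggregation by backlink type; the Backlink shape below is assumed from the table columns, not taken from the wiki's schema:

// Hypothetical sanity check: backlinkCount in the page record should match
// the number of backlink rows; grouping by type summarizes the table.
interface Backlink {
  id: string;
  title: string;
  type: string;          // e.g. "approach", "organization", "person"
  relationship?: string; // e.g. "research", "mitigation" (often absent)
}

function summarizeBacklinks(backlinks: Backlink[], recordCount: number) {
  const byType = new Map<string, number>();
  for (const b of backlinks) {
    byType.set(b.type, (byType.get(b.type) ?? 0) + 1);
  }
  return {
    matchesRecord: backlinks.length === recordCount, // expect true (58 === 58)
    byType: Object.fromEntries(byType),
  };
}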