Longterm Wiki

Weak-to-Strong Generalization

weak-to-strong · approach · Path: /knowledge-base/responses/weak-to-strong/
Entity ID (EID): E452
9 backlinks · Quality: 91 · Updated: 2026-03-13
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time; see the build-merge sketch after the record.
{
  "id": "weak-to-strong",
  "numericId": null,
  "path": "/knowledge-base/responses/weak-to-strong/",
  "filePath": "knowledge-base/responses/weak-to-strong.mdx",
  "title": "Weak-to-Strong Generalization",
  "quality": 91,
  "readerImportance": 19.5,
  "researchImportance": 27.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Weak-to-strong generalization tests whether weak supervisors can elicit good behavior from stronger AI systems. OpenAI's ICML 2024 experiments show 80% Performance Gap Recovery on NLP tasks with confidence loss (vs 30-50% naive), but reward modeling achieves only 20-40% PGR. OpenAI's Superalignment team (~30 researchers) funded \\$10M+ in grants. Critical limitation: no experiments yet test deceptive models.",
  "description": "Weak-to-strong generalization investigates whether weak supervisors can reliably elicit good behavior from stronger AI systems. OpenAI's ICML 2024 research shows GPT-2-level models can recover 80% of GPT-4's performance gap with auxiliary confidence loss, but reward modeling achieves only 20-40% PGR—suggesting RLHF may scale poorly. Deception scenarios remain untested.",
  "ratings": {
    "novelty": 5.5,
    "rigor": 6.5,
    "actionability": 6,
    "completeness": 7
  },
  "category": "responses",
  "subcategory": "alignment-training",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2914,
    "tableCount": 23,
    "diagramCount": 1,
    "internalLinks": 12,
    "externalLinks": 58,
    "footnoteCount": 0,
    "bulletRatio": 0.09,
    "sectionCount": 37,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 2914,
  "unconvertedLinks": [
    {
      "text": "\"Weak-to-Strong Generalization: Eliciting Strong Capabilities With Weak Supervision\"",
      "url": "https://arxiv.org/abs/2312.09390",
      "resourceId": "0ba98ae3a8a72270",
      "resourceTitle": "arXiv"
    },
    {
      "text": "OpenAI Superalignment team",
      "url": "https://openai.com/index/weak-to-strong-generalization/",
      "resourceId": "e64c8268e5f58e63",
      "resourceTitle": "Weak-to-strong generalization"
    },
    {
      "text": "\\$10M grants program",
      "url": "https://openai.com/index/superalignment-fast-grants/",
      "resourceId": "82eb0a4b47c95d2a",
      "resourceTitle": "OpenAI Superalignment Fast Grants"
    },
    {
      "text": "Anthropic's 2025 research recommendations",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "OpenAI blog",
      "url": "https://openai.com/index/weak-to-strong-generalization/",
      "resourceId": "e64c8268e5f58e63",
      "resourceTitle": "Weak-to-strong generalization"
    },
    {
      "text": "Anthropic 2025 directions",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "Fast Grants",
      "url": "https://openai.com/index/superalignment-fast-grants/",
      "resourceId": "82eb0a4b47c95d2a",
      "resourceTitle": "OpenAI Superalignment Fast Grants"
    },
    {
      "text": "OpenAI Superalignment team",
      "url": "https://openai.com/index/weak-to-strong-generalization/",
      "resourceId": "e64c8268e5f58e63",
      "resourceTitle": "Weak-to-strong generalization"
    },
    {
      "text": "Fast Grants",
      "url": "https://openai.com/index/superalignment-fast-grants/",
      "resourceId": "82eb0a4b47c95d2a",
      "resourceTitle": "OpenAI Superalignment Fast Grants"
    },
    {
      "text": "Anthropic 2025",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "Open-source code released",
      "url": "https://openai.com/index/weak-to-strong-generalization/",
      "resourceId": "e64c8268e5f58e63",
      "resourceTitle": "Weak-to-strong generalization"
    },
    {
      "text": "original paper",
      "url": "https://arxiv.org/abs/2312.09390",
      "resourceId": "0ba98ae3a8a72270",
      "resourceTitle": "arXiv"
    },
    {
      "text": "Superalignment team",
      "url": "https://openai.com/index/weak-to-strong-generalization/",
      "resourceId": "e64c8268e5f58e63",
      "resourceTitle": "Weak-to-strong generalization"
    },
    {
      "text": "Fast Grants",
      "url": "https://openai.com/index/superalignment-fast-grants/",
      "resourceId": "82eb0a4b47c95d2a",
      "resourceTitle": "OpenAI Superalignment Fast Grants"
    },
    {
      "text": "Fast Grants",
      "url": "https://openai.com/index/superalignment-fast-grants/",
      "resourceId": "82eb0a4b47c95d2a",
      "resourceTitle": "OpenAI Superalignment Fast Grants"
    },
    {
      "text": "OpenAI Superalignment team",
      "url": "https://openai.com/index/weak-to-strong-generalization/",
      "resourceId": "e64c8268e5f58e63",
      "resourceTitle": "Weak-to-strong generalization"
    },
    {
      "text": "\\$10M Superalignment Fast Grants program",
      "url": "https://openai.com/index/superalignment-fast-grants/",
      "resourceId": "82eb0a4b47c95d2a",
      "resourceTitle": "OpenAI Superalignment Fast Grants"
    },
    {
      "text": "Anthropic's 2025 research recommendations",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "OpenAI Superalignment",
      "url": "https://openai.com/index/weak-to-strong-generalization/",
      "resourceId": "e64c8268e5f58e63",
      "resourceTitle": "Weak-to-strong generalization"
    },
    {
      "text": "Superalignment Fast Grants",
      "url": "https://openai.com/index/superalignment-fast-grants/",
      "resourceId": "82eb0a4b47c95d2a",
      "resourceTitle": "OpenAI Superalignment Fast Grants"
    },
    {
      "text": "Recommended Research Directions (2025)",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "Scalable Oversight and W2SG",
      "url": "https://www.alignmentforum.org/posts/hw2tGSsvLLyjFoLFS/scalable-oversight-and-weak-to-strong-generalization",
      "resourceId": "f386d42a2b5ff4f7",
      "resourceTitle": "Scalable Oversight and Weak-to-Strong Generalization"
    }
  ],
  "unconvertedLinkCount": 22,
  "convertedLinkCount": 0,
  "backlinkCount": 9,
  "hallucinationRisk": {
    "level": "medium",
    "score": 40,
    "factors": [
      "no-citations",
      "conceptual-content",
      "high-quality"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 16,
    "similarPages": [
      {
        "id": "ai-assisted",
        "title": "AI-Assisted Alignment",
        "path": "/knowledge-base/responses/ai-assisted/",
        "similarity": 16
      },
      {
        "id": "debate",
        "title": "AI Safety via Debate",
        "path": "/knowledge-base/responses/debate/",
        "similarity": 16
      },
      {
        "id": "probing",
        "title": "Probing / Linear Probes",
        "path": "/knowledge-base/responses/probing/",
        "similarity": 14
      },
      {
        "id": "rlhf",
        "title": "RLHF / Constitutional AI",
        "path": "/knowledge-base/responses/rlhf/",
        "similarity": 14
      },
      {
        "id": "technical-research",
        "title": "Technical AI Safety Research",
        "path": "/knowledge-base/responses/technical-research/",
        "similarity": 14
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 12,
      "diagrams": 1,
      "internalLinks": 23,
      "externalLinks": 15,
      "footnotes": 9,
      "references": 9
    },
    "actuals": {
      "tables": 23,
      "diagrams": 1,
      "internalLinks": 12,
      "externalLinks": 58,
      "footnotes": 0,
      "references": 5,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:5.5 R:6.5 A:6 C:7"
  },
  "readerRank": 525,
  "researchRank": 437,
  "recommendedScore": 213.57
}
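
The record caption above notes that this JSON is assembled at build time from MDX frontmatter, entity YAML, and computed metrics. A minimal TypeScript sketch of that merge, assuming a Node build step with gray-matter and js-yaml; the file layout, metric definitions, and merge order here are illustrative guesses, not the wiki's actual code:

```ts
// Sketch of the build-time merge described in the record caption.
// All paths and field choices are assumptions for illustration.
import { promises as fs } from "fs";
import matter from "gray-matter";
import yaml from "js-yaml";

async function buildPageRecord(mdxPath: string, entityYamlPath: string) {
  // 1. MDX frontmatter supplies editorial fields (title, quality, ratings...).
  const raw = await fs.readFile(mdxPath, "utf8");
  const { data: frontmatter, content } = matter(raw);

  // 2. Entity YAML supplies identity fields (entityType, EID, clusters...).
  const entity = yaml.load(
    await fs.readFile(entityYamlPath, "utf8"),
  ) as Record<string, unknown>;

  // 3. Metrics are computed from the MDX body at build time.
  const metrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    internalLinks: (content.match(/\]\(\/knowledge-base\//g) ?? []).length,
    externalLinks: (content.match(/\]\(https?:\/\//g) ?? []).length,
  };

  // Merge order is an assumption: entity fields first, frontmatter
  // overrides, computed metrics attached last.
  return { ...entity, ...frontmatter, metrics };
}
```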
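The llmSummary and description lean on Performance Gap Recovery (PGR). As defined in the cited paper (arXiv:2312.09390), PGR is the fraction of the gap between the weak supervisor and a ground-truth-finetuned strong model that weak supervision recovers. A quick sketch; the example numbers are illustrative, not from the paper:

```ts
// Performance Gap Recovery (PGR), per Burns et al. 2023 (arXiv:2312.09390).
// PGR = 0 → weak supervision adds nothing over the weak supervisor;
// PGR = 1 → the weakly supervised strong model matches the strong ceiling.
function performanceGapRecovery(
  weak: number,          // weak supervisor's own task performance
  weakToStrong: number,  // strong model finetuned on weak labels
  strongCeiling: number, // strong model finetuned on ground truth
): number {
  return (weakToStrong - weak) / (strongCeiling - weak);
}

// Illustrative numbers only: weak 60%, ceiling 90%, weak-to-strong 84%
// → PGR = 0.8, matching the ~80% figure cited in this record's summary.
console.log(performanceGapRecovery(0.60, 0.84, 0.90)); // 0.8
```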
External Links

No external links

Backlinks (9)
| id | title | type | relationship |
| --- | --- | --- | --- |
| ai-assisted | AI-Assisted Alignment | approach | |
| solutions | AI Safety Solution Cruxes | crux | |
| alignment-robustness-trajectory | Alignment Robustness Trajectory | analysis | |
| anthropic | Anthropic | organization | |
| openai | OpenAI | organization | |
| jan-leike | Jan Leike | person | |
| leopold-aschenbrenner | Leopold Aschenbrenner | person | |
| alignment-training-overview | Training Methods (Overview) | concept | |
| alignment | AI Alignment | approach | |