Longterm Wiki

Capability Unlearning / Removal

capability-unlearning · approach · Path: /knowledge-base/responses/capability-unlearning/
E453 — Entity ID (EID)
← Back to page · 1 backlink · Quality: 65 · Updated: 2026-03-13
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "capability-unlearning",
  "numericId": null,
  "path": "/knowledge-base/responses/capability-unlearning/",
  "filePath": "knowledge-base/responses/capability-unlearning.mdx",
  "title": "Capability Unlearning / Removal",
  "quality": 65,
  "readerImportance": 66,
  "researchImportance": 71.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Capability unlearning removes dangerous capabilities (e.g., bioweapon synthesis) from AI models through gradient-based methods, representation engineering, and fine-tuning, achieving 60-80% reduction on WMDP benchmarks with combined approaches. However, verification is impossible, capabilities are recoverable through fine-tuning, and knowledge entanglement limits what can be safely removed, making this a defense-in-depth layer rather than complete solution.",
  "description": "Methods to remove specific dangerous capabilities from trained AI models, directly addressing misuse risks by eliminating harmful knowledge, though current techniques face challenges around verification, capability recovery, and general performance degradation.",
  "ratings": {
    "novelty": 4.5,
    "rigor": 5,
    "actionability": 6,
    "completeness": 6.5
  },
  "category": "responses",
  "subcategory": "alignment-training",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 1652,
    "tableCount": 20,
    "diagramCount": 1,
    "internalLinks": 3,
    "externalLinks": 20,
    "footnoteCount": 0,
    "bulletRatio": 0.04,
    "sectionCount": 27,
    "hasOverview": true,
    "structuralScore": 14
  },
  "suggestedQuality": 93,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 1652,
  "unconvertedLinks": [
    {
      "text": "publicly available",
      "url": "https://www.wmdp.ai/",
      "resourceId": "cfa49cff8bb3ac32",
      "resourceTitle": "Weapons of Mass Destruction Proxy Benchmark (WMDP)"
    },
    {
      "text": "Center for AI Safety",
      "url": "https://safe.ai",
      "resourceId": "a306e0b63bdedbd5",
      "resourceTitle": "CAIS Surveys"
    }
  ],
  "unconvertedLinkCount": 2,
  "convertedLinkCount": 0,
  "backlinkCount": 1,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 11,
    "similarPages": [
      {
        "id": "circuit-breakers",
        "title": "Circuit Breakers / Inference Interventions",
        "path": "/knowledge-base/responses/circuit-breakers/",
        "similarity": 11
      },
      {
        "id": "debate",
        "title": "AI Safety via Debate",
        "path": "/knowledge-base/responses/debate/",
        "similarity": 11
      },
      {
        "id": "eliciting-latent-knowledge",
        "title": "Eliciting Latent Knowledge (ELK)",
        "path": "/knowledge-base/responses/eliciting-latent-knowledge/",
        "similarity": 11
      },
      {
        "id": "goal-misgeneralization-research",
        "title": "Goal Misgeneralization Research",
        "path": "/knowledge-base/responses/goal-misgeneralization-research/",
        "similarity": 11
      },
      {
        "id": "provably-safe",
        "title": "Provably Safe AI (davidad agenda)",
        "path": "/knowledge-base/responses/provably-safe/",
        "similarity": 11
      }
    ]
  },
  "changeHistory": [
    {
      "date": "2026-02-18",
      "branch": "claude/review-pr-216-P4Fcu",
      "title": "Fix audit report findings from PR #216",
      "summary": "Reviewed PR #216 (comprehensive wiki audit report) and implemented fixes for the major issues it identified: fixed 181 path-style EntityLink IDs across 33 files, converted 164 broken EntityLinks (referencing non-existent entities) to plain text across 38 files, fixed a temporal inconsistency in anthropic.mdx, and added missing description fields to 53 ai-transition-model pages."
    }
  ],
  "coverage": {
    "passing": 8,
    "total": 13,
    "targets": {
      "tables": 7,
      "diagrams": 1,
      "internalLinks": 13,
      "externalLinks": 8,
      "footnotes": 5,
      "references": 5
    },
    "actuals": {
      "tables": 20,
      "diagrams": 1,
      "internalLinks": 3,
      "externalLinks": 20,
      "footnotes": 0,
      "references": 2,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "green",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "editHistoryCount": 1,
    "ratingsString": "N:4.5 R:5 A:6 C:6.5"
  },
  "readerRank": 188,
  "researchRank": 138,
  "recommendedScore": 184.55
}
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/machine-unlearning"
}
Backlinks (1)
id · title · type · relationship
alignment-training-overview · Training Methods (Overview) · concept
Longterm Wiki