Longterm Wiki

Refusal Training

refusal-training · approach · Path: /knowledge-base/responses/refusal-training/
E456 · Entity ID (EID)
← Back to page · 2 backlinks · Quality: 63 · Updated: 2026-03-13
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "refusal-training",
  "numericId": null,
  "path": "/knowledge-base/responses/refusal-training/",
  "filePath": "knowledge-base/responses/refusal-training.mdx",
  "title": "Refusal Training",
  "quality": 63,
  "readerImportance": 21,
  "researchImportance": 28.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Refusal training achieves 99%+ refusal rates on explicit harmful requests but faces 1.5-6.5% jailbreak success rates (UK AISI 2025) and 12-43% over-refusal on legitimate queries. While necessary for deployment hygiene, it addresses behavior rather than goals, providing no defense against deceptive alignment or scheming.",
  "description": "Refusal training teaches AI models to decline harmful requests rather than comply. While universally deployed and achieving 99%+ refusal rates on explicit violations, jailbreak techniques bypass defenses with 1.5-6.5% success rates (UK AISI 2025), and over-refusal blocks 12-43% of legitimate queries. The technique represents necessary deployment hygiene but should not be confused with genuine safety.",
  "ratings": {
    "novelty": 4.2,
    "rigor": 7.1,
    "actionability": 6.8,
    "completeness": 7.3
  },
  "category": "responses",
  "subcategory": "alignment-training",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2825,
    "tableCount": 20,
    "diagramCount": 1,
    "internalLinks": 8,
    "externalLinks": 30,
    "footnoteCount": 0,
    "bulletRatio": 0.12,
    "sectionCount": 34,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 2825,
  "unconvertedLinks": [
    {
      "text": "Constitutional AI",
      "url": "https://arxiv.org/abs/2212.08073",
      "resourceId": "683aef834ac1612a"
    },
    {
      "text": "Constitutional Classifiers",
      "url": "https://www.anthropic.com/research/constitutional-classifiers",
      "resourceId": "ce3ac91af6150e19",
      "resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
    },
    {
      "text": "Constitutional Classifiers",
      "url": "https://www.anthropic.com/research/constitutional-classifiers",
      "resourceId": "ce3ac91af6150e19",
      "resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
    },
    {
      "text": "Constitutional AI",
      "url": "https://arxiv.org/abs/2212.08073",
      "resourceId": "683aef834ac1612a"
    },
    {
      "text": "JailbreakBench",
      "url": "https://jailbreakbench.github.io/",
      "resourceId": "f302ae7c0bac3d3f",
      "resourceTitle": "JailbreakBench: LLM robustness benchmark"
    },
    {
      "text": "Constitutional Classifiers",
      "url": "https://www.anthropic.com/research/constitutional-classifiers",
      "resourceId": "ce3ac91af6150e19",
      "resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
    },
    {
      "text": "Constitutional AI: Harmlessness from AI Feedback",
      "url": "https://arxiv.org/abs/2212.08073",
      "resourceId": "683aef834ac1612a"
    },
    {
      "text": "Constitutional Classifiers",
      "url": "https://www.anthropic.com/research/constitutional-classifiers",
      "resourceId": "ce3ac91af6150e19",
      "resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
    },
    {
      "text": "JailbreakBench",
      "url": "https://jailbreakbench.github.io/",
      "resourceId": "f302ae7c0bac3d3f",
      "resourceTitle": "JailbreakBench: LLM robustness benchmark"
    },
    {
      "text": "AISI Frontier AI Trends Report",
      "url": "https://www.aisi.gov.uk/frontier-ai-trends-report",
      "resourceId": "7042c7f8de04ccb1",
      "resourceTitle": "AISI Frontier AI Trends"
    }
  ],
  "unconvertedLinkCount": 10,
  "convertedLinkCount": 0,
  "backlinkCount": 2,
  "hallucinationRisk": {
    "level": "low",
    "score": 30,
    "factors": [
      "no-citations",
      "high-rigor",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 19,
    "similarPages": [
      {
        "id": "output-filtering",
        "title": "AI Output Filtering",
        "path": "/knowledge-base/responses/output-filtering/",
        "similarity": 19
      },
      {
        "id": "circuit-breakers",
        "title": "Circuit Breakers / Inference Interventions",
        "path": "/knowledge-base/responses/circuit-breakers/",
        "similarity": 18
      },
      {
        "id": "rlhf",
        "title": "RLHF / Constitutional AI",
        "path": "/knowledge-base/responses/rlhf/",
        "similarity": 16
      },
      {
        "id": "ai-assisted",
        "title": "AI-Assisted Alignment",
        "path": "/knowledge-base/responses/ai-assisted/",
        "similarity": 15
      },
      {
        "id": "scheming-detection",
        "title": "Scheming & Deception Detection",
        "path": "/knowledge-base/responses/scheming-detection/",
        "similarity": 15
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 11,
      "diagrams": 1,
      "internalLinks": 23,
      "externalLinks": 14,
      "footnotes": 8,
      "references": 8
    },
    "actuals": {
      "tables": 20,
      "diagrams": 1,
      "internalLinks": 8,
      "externalLinks": 30,
      "footnotes": 0,
      "references": 4,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.2 R:7.1 A:6.8 C:7.3"
  },
  "readerRank": 517,
  "researchRank": 427,
  "recommendedScore": 158.28
}
External Links

No external links

Backlinks (2)
id | title | type | relationship
circuit-breakers | Circuit Breakers / Inference Interventions | approach |
alignment-training-overview | Training Methods (Overview) | concept |
Longterm Wiki