Refusal Training
ID: refusal-training
Type: approach
Path: /knowledge-base/responses/refusal-training/
Entity ID (EID): E456
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time
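The merge described above can be pictured as a shallow combine of the three sources, with computed metrics layered on last. A minimal TypeScript sketch, assuming hypothetical loader types and only a subset of fields (the site's real build code is not shown on this page):

```typescript
// Sketch of the build-time merge described above. The type names,
// field subsets, and path rules are assumptions for illustration only;
// field names mirror the record below.

type Frontmatter = { id: string; title: string; description: string; category: string };
type EntityYaml = { entityType: string; quality: number; evergreen: boolean };
type ComputedMetrics = { wordCount: number; tableCount: number; internalLinks: number };

function buildPageRecord(fm: Frontmatter, entity: EntityYaml, metrics: ComputedMetrics) {
  // Later spreads win on key collisions, mirroring
  // "MDX frontmatter + Entity YAML + computed metrics".
  return {
    ...fm,
    ...entity,
    metrics,
    path: `/knowledge-base/${fm.category}/${fm.id}/`,
    filePath: `knowledge-base/${fm.category}/${fm.id}.mdx`,
  };
}

// Example using values from this page's record:
const record = buildPageRecord(
  { id: "refusal-training", title: "Refusal Training", description: "…", category: "responses" },
  { entityType: "approach", quality: 63, evergreen: true },
  { wordCount: 2825, tableCount: 20, internalLinks: 8 }
);
console.log(record.path); // "/knowledge-base/responses/refusal-training/"
```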
{
"id": "refusal-training",
"numericId": null,
"path": "/knowledge-base/responses/refusal-training/",
"filePath": "knowledge-base/responses/refusal-training.mdx",
"title": "Refusal Training",
"quality": 63,
"readerImportance": 21,
"researchImportance": 28.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Refusal training achieves 99%+ refusal rates on explicit harmful requests but faces 1.5-6.5% jailbreak success rates (UK AISI 2025) and 12-43% over-refusal on legitimate queries. While necessary for deployment hygiene, it addresses behavior rather than goals, providing no defense against deceptive alignment or scheming.",
"description": "Refusal training teaches AI models to decline harmful requests rather than comply. While universally deployed and achieving 99%+ refusal rates on explicit violations, jailbreak techniques bypass defenses with 1.5-6.5% success rates (UK AISI 2025), and over-refusal blocks 12-43% of legitimate queries. The technique represents necessary deployment hygiene but should not be confused with genuine safety.",
"ratings": {
"novelty": 4.2,
"rigor": 7.1,
"actionability": 6.8,
"completeness": 7.3
},
"category": "responses",
"subcategory": "alignment-training",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 2825,
"tableCount": 20,
"diagramCount": 1,
"internalLinks": 8,
"externalLinks": 30,
"footnoteCount": 0,
"bulletRatio": 0.12,
"sectionCount": 34,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 2825,
"unconvertedLinks": [
{
"text": "Constitutional AI",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "Constitutional Classifiers",
"url": "https://www.anthropic.com/research/constitutional-classifiers",
"resourceId": "ce3ac91af6150e19",
"resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
},
{
"text": "Constitutional Classifiers",
"url": "https://www.anthropic.com/research/constitutional-classifiers",
"resourceId": "ce3ac91af6150e19",
"resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
},
{
"text": "Constitutional AI",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "JailbreakBench",
"url": "https://jailbreakbench.github.io/",
"resourceId": "f302ae7c0bac3d3f",
"resourceTitle": "JailbreakBench: LLM robustness benchmark"
},
{
"text": "Constitutional Classifiers",
"url": "https://www.anthropic.com/research/constitutional-classifiers",
"resourceId": "ce3ac91af6150e19",
"resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
},
{
"text": "Constitutional AI: Harmlessness from AI Feedback",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "Constitutional Classifiers",
"url": "https://www.anthropic.com/research/constitutional-classifiers",
"resourceId": "ce3ac91af6150e19",
"resourceTitle": "Constitutional Classifiers: Defending against universal jailbreaks (https://anthropic.com/research/constitutional-cla..."
},
{
"text": "JailbreakBench",
"url": "https://jailbreakbench.github.io/",
"resourceId": "f302ae7c0bac3d3f",
"resourceTitle": "JailbreakBench: LLM robustness benchmark"
},
{
"text": "AISI Frontier AI Trends Report",
"url": "https://www.aisi.gov.uk/frontier-ai-trends-report",
"resourceId": "7042c7f8de04ccb1",
"resourceTitle": "AISI Frontier AI Trends"
}
],
"unconvertedLinkCount": 10,
"convertedLinkCount": 0,
"backlinkCount": 2,
"hallucinationRisk": {
"level": "low",
"score": 30,
"factors": [
"no-citations",
"high-rigor",
"conceptual-content"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 19,
"similarPages": [
{
"id": "output-filtering",
"title": "AI Output Filtering",
"path": "/knowledge-base/responses/output-filtering/",
"similarity": 19
},
{
"id": "circuit-breakers",
"title": "Circuit Breakers / Inference Interventions",
"path": "/knowledge-base/responses/circuit-breakers/",
"similarity": 18
},
{
"id": "rlhf",
"title": "RLHF / Constitutional AI",
"path": "/knowledge-base/responses/rlhf/",
"similarity": 16
},
{
"id": "ai-assisted",
"title": "AI-Assisted Alignment",
"path": "/knowledge-base/responses/ai-assisted/",
"similarity": 15
},
{
"id": "scheming-detection",
"title": "Scheming & Deception Detection",
"path": "/knowledge-base/responses/scheming-detection/",
"similarity": 15
}
]
},
"coverage": {
"passing": 7,
"total": 13,
"targets": {
"tables": 11,
"diagrams": 1,
"internalLinks": 23,
"externalLinks": 14,
"footnotes": 8,
"references": 8
},
"actuals": {
"tables": 20,
"diagrams": 1,
"internalLinks": 8,
"externalLinks": 30,
"footnotes": 0,
"references": 4,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:4.2 R:7.1 A:6.8 C:7.3"
},
"readerRank": 517,
"researchRank": 427,
"recommendedScore": 158.28
}
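The green/amber/red statuses in the coverage block above look derivable from the targets and actuals it contains. A hypothetical reconstruction in TypeScript; the one-third amber cutoff is a guess that happens to reproduce the six numeric statuses on this page, and the site's real rule may differ:

```typescript
// Assumed thresholds: green at or above target, amber at or above a
// third of target, red below that. These cutoffs are guesses, not the
// site's documented rule.

type Status = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual >= target / 3) return "amber";
  return "red";
}

// Targets and actuals copied from the coverage block above.
const targets = { tables: 11, diagrams: 1, internalLinks: 23, externalLinks: 14, footnotes: 8, references: 8 };
const actuals = { tables: 20, diagrams: 1, internalLinks: 8, externalLinks: 30, footnotes: 0, references: 4 };

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  console.log(key, coverageStatus(actuals[key], targets[key]));
}
// tables green, diagrams green, internalLinks amber,
// externalLinks green, footnotes red, references amber
```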
External Links
No external links
Backlinks (2)
| id | title | type | relationship |
|---|---|---|---|
| circuit-breakers | Circuit Breakers / Inference Interventions | approach | — |
| alignment-training-overview | Training Methods (Overview) | concept | — |