Adversarial Training
adversarial-training (approach) — Path: /knowledge-base/responses/adversarial-training/
Entity ID (EID): E583
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "adversarial-training",
"numericId": null,
"path": "/knowledge-base/responses/adversarial-training/",
"filePath": "knowledge-base/responses/adversarial-training.mdx",
"title": "Adversarial Training",
"quality": 58,
"readerImportance": 25.5,
"researchImportance": 39.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Adversarial training, universally adopted at frontier labs with $10-150M/year investment, improves robustness to known attacks but creates an arms race dynamic and provides no protection against model deception or novel attack categories. While necessary for operational security, it only defends external attacks rather than addressing fundamental alignment challenges.",
"description": "Adversarial training improves AI robustness by training models on examples designed to cause failures, including jailbreaks and prompt injections. While universally adopted and effective against known attacks, it creates an arms race dynamic and provides no protection against model deception or novel attacks.",
"ratings": {
"novelty": 4,
"rigor": 5,
"actionability": 5,
"completeness": 6
},
"category": "responses",
"subcategory": "alignment-training",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 1815,
"tableCount": 22,
"diagramCount": 1,
"internalLinks": 4,
"externalLinks": 13,
"footnoteCount": 0,
"bulletRatio": 0.02,
"sectionCount": 31,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 45,
"evergreen": true,
"wordCount": 1815,
"unconvertedLinks": [
{
"text": "GCG attack",
"url": "https://arxiv.org/abs/2307.15043",
"resourceId": "302c069146f3f6f2",
"resourceTitle": "jailbreaks"
},
{
"text": "Zou et al. (2023)",
"url": "https://arxiv.org/abs/2307.15043",
"resourceId": "302c069146f3f6f2",
"resourceTitle": "jailbreaks"
},
{
"text": "Anthropic (2025)",
"url": "https://arxiv.org/pdf/2501.18837",
"resourceId": "2d454deae01c7a1e",
"resourceTitle": "Constitutional Classifiers arXiv paper (https://arxiv.org/pdf/2501.18837)"
}
],
"unconvertedLinkCount": 3,
"convertedLinkCount": 0,
"backlinkCount": 11,
"hallucinationRisk": {
"level": "medium",
"score": 45,
"factors": [
"no-citations",
"conceptual-content"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 16,
"similarPages": [
{
"id": "reward-modeling",
"title": "Reward Modeling",
"path": "/knowledge-base/responses/reward-modeling/",
"similarity": 16
},
{
"id": "refusal-training",
"title": "Refusal Training",
"path": "/knowledge-base/responses/refusal-training/",
"similarity": 13
},
{
"id": "cirl",
"title": "Cooperative IRL (CIRL)",
"path": "/knowledge-base/responses/cirl/",
"similarity": 12
},
{
"id": "cooperative-ai",
"title": "Cooperative AI",
"path": "/knowledge-base/responses/cooperative-ai/",
"similarity": 12
},
{
"id": "process-supervision",
"title": "Process Supervision",
"path": "/knowledge-base/responses/process-supervision/",
"similarity": 12
}
]
},
"coverage": {
"passing": 7,
"total": 13,
"targets": {
"tables": 7,
"diagrams": 1,
"internalLinks": 15,
"externalLinks": 9,
"footnotes": 5,
"references": 5
},
"actuals": {
"tables": 22,
"diagrams": 1,
"internalLinks": 4,
"externalLinks": 13,
"footnotes": 0,
"references": 2,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:4 R:5 A:5 C:6"
},
"readerRank": 487,
"researchRank": 347,
"recommendedScore": 150.36
}

External Links
{
"lesswrong": "https://www.lesswrong.com/tag/adversarial-training"
}

Backlinks (11)
| id | title | type | relationship |
|---|---|---|---|
| circuit-breakers | Circuit Breakers / Inference Interventions | approach | — |
| accident-risks | AI Accident Risk Cruxes | crux | — |
| why-alignment-hard | Why Alignment Might Be Hard | argument | — |
| intervention-effectiveness-matrix | Intervention Effectiveness Matrix | analysis | — |
| far-ai | FAR AI | organization | — |
| redwood-research | Redwood Research | organization | — |
| safety-orgs-overview | AI Safety Organizations (Overview) | concept | — |
| paul-christiano | Paul Christiano | person | — |
| alignment-training-overview | Training Methods (Overview) | concept | — |
| deepfakes | Deepfakes | risk | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |