Longterm Wiki

Adversarial Training

adversarial-training · approach · Path: /knowledge-base/responses/adversarial-training/
E583 — Entity ID (EID)
← Back to page · 11 backlinks · Quality: 58 · Updated: 2026-03-13
Page Record — database.json, merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "adversarial-training",
  "numericId": null,
  "path": "/knowledge-base/responses/adversarial-training/",
  "filePath": "knowledge-base/responses/adversarial-training.mdx",
  "title": "Adversarial Training",
  "quality": 58,
  "readerImportance": 25.5,
  "researchImportance": 39.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Adversarial training, universally adopted at frontier labs with $10-150M/year investment, improves robustness to known attacks but creates an arms race dynamic and provides no protection against model deception or novel attack categories. While necessary for operational security, it only defends external attacks rather than addressing fundamental alignment challenges.",
  "description": "Adversarial training improves AI robustness by training models on examples designed to cause failures, including jailbreaks and prompt injections. While universally adopted and effective against known attacks, it creates an arms race dynamic and provides no protection against model deception or novel attacks.",
  "ratings": {
    "novelty": 4,
    "rigor": 5,
    "actionability": 5,
    "completeness": 6
  },
  "category": "responses",
  "subcategory": "alignment-training",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 1815,
    "tableCount": 22,
    "diagramCount": 1,
    "internalLinks": 4,
    "externalLinks": 13,
    "footnoteCount": 0,
    "bulletRatio": 0.02,
    "sectionCount": 31,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 45,
  "evergreen": true,
  "wordCount": 1815,
  "unconvertedLinks": [
    {
      "text": "GCG attack",
      "url": "https://arxiv.org/abs/2307.15043",
      "resourceId": "302c069146f3f6f2",
      "resourceTitle": "jailbreaks"
    },
    {
      "text": "Zou et al. (2023)",
      "url": "https://arxiv.org/abs/2307.15043",
      "resourceId": "302c069146f3f6f2",
      "resourceTitle": "jailbreaks"
    },
    {
      "text": "Anthropic (2025)",
      "url": "https://arxiv.org/pdf/2501.18837",
      "resourceId": "2d454deae01c7a1e",
      "resourceTitle": "Constitutional Classifiers arXiv paper (https://arxiv.org/pdf/2501.18837)"
    }
  ],
  "unconvertedLinkCount": 3,
  "convertedLinkCount": 0,
  "backlinkCount": 11,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 16,
    "similarPages": [
      {
        "id": "reward-modeling",
        "title": "Reward Modeling",
        "path": "/knowledge-base/responses/reward-modeling/",
        "similarity": 16
      },
      {
        "id": "refusal-training",
        "title": "Refusal Training",
        "path": "/knowledge-base/responses/refusal-training/",
        "similarity": 13
      },
      {
        "id": "cirl",
        "title": "Cooperative IRL (CIRL)",
        "path": "/knowledge-base/responses/cirl/",
        "similarity": 12
      },
      {
        "id": "cooperative-ai",
        "title": "Cooperative AI",
        "path": "/knowledge-base/responses/cooperative-ai/",
        "similarity": 12
      },
      {
        "id": "process-supervision",
        "title": "Process Supervision",
        "path": "/knowledge-base/responses/process-supervision/",
        "similarity": 12
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 7,
      "diagrams": 1,
      "internalLinks": 15,
      "externalLinks": 9,
      "footnotes": 5,
      "references": 5
    },
    "actuals": {
      "tables": 22,
      "diagrams": 1,
      "internalLinks": 4,
      "externalLinks": 13,
      "footnotes": 0,
      "references": 2,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4 R:5 A:5 C:6"
  },
  "readerRank": 487,
  "researchRank": 347,
  "recommendedScore": 150.36
}
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/adversarial-training"
}
Backlinks (11)
id · title · type · relationship
circuit-breakers · Circuit Breakers / Inference Interventions · approach
accident-risks · AI Accident Risk Cruxes · crux
why-alignment-hard · Why Alignment Might Be Hard · argument
intervention-effectiveness-matrix · Intervention Effectiveness Matrix · analysis
far-ai · FAR AI · organization
redwood-research · Redwood Research · organization
safety-orgs-overview · AI Safety Organizations (Overview) · concept
paul-christiano · Paul Christiano · person
alignment-training-overview · Training Methods (Overview) · concept
deepfakes · Deepfakes · risk
sleeper-agents · Sleeper Agents: Training Deceptive LLMs · risk
Longterm Wiki