Longterm Wiki

Model Organisms of Misalignment

model-organisms-of-misalignment · analysis · Path: /knowledge-base/models/model-organisms-of-misalignment/
Entity ID (EID): E419
← Back to page · 1 backlink · Quality: 65 · Updated: 2026-03-13
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "model-organisms-of-misalignment",
  "numericId": null,
  "path": "/knowledge-base/models/model-organisms-of-misalignment/",
  "filePath": "knowledge-base/models/model-organisms-of-misalignment.mdx",
  "title": "Model Organisms of Misalignment",
  "quality": 65,
  "readerImportance": 72.5,
  "researchImportance": 87.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Model organisms of misalignment is a research agenda creating controlled AI systems exhibiting specific alignment failures as testbeds. Recent work achieves 99% coherence with 40% misalignment rates using models as small as 0.5B parameters, with a single rank-1 LoRA adapter inducing 9.5-21.5% misalignment in Qwen-14B while maintaining >99.5% coherence.",
  "description": "Research agenda creating controlled AI models that exhibit specific misalignment behaviors to study alignment failures and test interventions",
  "ratings": {
    "focus": 8.5,
    "novelty": 4,
    "rigor": 7,
    "completeness": 8,
    "concreteness": 7.5,
    "actionability": 5.5
  },
  "category": "models",
  "subcategory": "risk-models",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2165,
    "tableCount": 2,
    "diagramCount": 0,
    "internalLinks": 37,
    "externalLinks": 4,
    "footnoteCount": 0,
    "bulletRatio": 0.34,
    "sectionCount": 26,
    "hasOverview": true,
    "structuralScore": 12
  },
  "suggestedQuality": 80,
  "updateFrequency": 90,
  "evergreen": true,
  "wordCount": 2165,
  "unconvertedLinks": [
    {
      "text": "arxiv.org",
      "url": "https://arxiv.org/abs/2506.11613",
      "resourceId": "b0d4f2313577c2b4",
      "resourceTitle": "Model Organisms for Emergent Misalignment - arXiv"
    }
  ],
  "unconvertedLinkCount": 1,
  "convertedLinkCount": 0,
  "backlinkCount": 1,
  "citationHealth": {
    "total": 65,
    "withQuotes": 34,
    "verified": 33,
    "accuracyChecked": 33,
    "accurate": 25,
    "inaccurate": 0,
    "avgScore": 0.9375163386849796
  },
  "hallucinationRisk": {
    "level": "medium",
    "score": 40,
    "factors": [
      "no-citations",
      "high-rigor"
    ]
  },
  "entityType": "analysis",
  "redundancy": {
    "maxSimilarity": 19,
    "similarPages": [
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 19
      },
      {
        "id": "goal-misgeneralization",
        "title": "Goal Misgeneralization",
        "path": "/knowledge-base/risks/goal-misgeneralization/",
        "similarity": 19
      },
      {
        "id": "interpretability",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/interpretability/",
        "similarity": 18
      },
      {
        "id": "mesa-optimization",
        "title": "Mesa-Optimization",
        "path": "/knowledge-base/risks/mesa-optimization/",
        "similarity": 18
      },
      {
        "id": "scheming",
        "title": "Scheming",
        "path": "/knowledge-base/risks/scheming/",
        "similarity": 18
      }
    ]
  },
  "changeHistory": [
    {
      "date": "2026-03-08",
      "branch": "auto-update/2026-03-08",
      "title": "Auto-improve (standard): Model Organisms of Misalignment",
      "summary": "Improved \"Model Organisms of Misalignment\" via standard pipeline (1181.0s). Quality score: 78. Issues resolved: Dollar signs in funding section appear as raw $ in several p; The <F> component wraps some dollar amounts (e.g., <F e='ant; Footnote [^rc-feab] is cited twice in the same sentence in t.",
      "duration": "1181.0s",
      "cost": "$5-8"
    }
  ],
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 9,
      "diagrams": 1,
      "internalLinks": 17,
      "externalLinks": 11,
      "footnotes": 6,
      "references": 6
    },
    "actuals": {
      "tables": 2,
      "diagrams": 0,
      "internalLinks": 37,
      "externalLinks": 4,
      "footnotes": 0,
      "references": 25,
      "quotesWithQuotes": 34,
      "quotesTotal": 65,
      "accuracyChecked": 33,
      "accuracyTotal": 65
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "green",
      "overview": "green",
      "tables": "amber",
      "diagrams": "red",
      "internalLinks": "green",
      "externalLinks": "amber",
      "footnotes": "red",
      "references": "green",
      "quotes": "amber",
      "accuracy": "amber"
    },
    "editHistoryCount": 1,
    "ratingsString": "N:4 R:7 A:5.5 C:8"
  },
  "readerRank": 144,
  "researchRank": 37,
  "recommendedScore": 187.94
}
External Links

No external links

Backlinks (1)
| id | title | type | relationship |
| evan-hubinger | Evan Hubinger | person | |
Longterm Wiki