Model Organisms of Misalignment
model-organisms-of-misalignment · analysis · Path: /knowledge-base/models/model-organisms-of-misalignment/
Entity ID (EID): E419
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "model-organisms-of-misalignment",
"numericId": null,
"path": "/knowledge-base/models/model-organisms-of-misalignment/",
"filePath": "knowledge-base/models/model-organisms-of-misalignment.mdx",
"title": "Model Organisms of Misalignment",
"quality": 65,
"readerImportance": 72.5,
"researchImportance": 87.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Model organisms of misalignment is a research agenda creating controlled AI systems exhibiting specific alignment failures as testbeds. Recent work achieves 99% coherence with 40% misalignment rates using models as small as 0.5B parameters, with a single rank-1 LoRA adapter inducing 9.5-21.5% misalignment in Qwen-14B while maintaining >99.5% coherence.",
"description": "Research agenda creating controlled AI models that exhibit specific misalignment behaviors to study alignment failures and test interventions",
"ratings": {
"focus": 8.5,
"novelty": 4,
"rigor": 7,
"completeness": 8,
"concreteness": 7.5,
"actionability": 5.5
},
"category": "models",
"subcategory": "risk-models",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 2165,
"tableCount": 2,
"diagramCount": 0,
"internalLinks": 37,
"externalLinks": 4,
"footnoteCount": 0,
"bulletRatio": 0.34,
"sectionCount": 26,
"hasOverview": true,
"structuralScore": 12
},
"suggestedQuality": 80,
"updateFrequency": 90,
"evergreen": true,
"wordCount": 2165,
"unconvertedLinks": [
{
"text": "arxiv.org",
"url": "https://arxiv.org/abs/2506.11613",
"resourceId": "b0d4f2313577c2b4",
"resourceTitle": "Model Organisms for Emergent Misalignment - arXiv"
}
],
"unconvertedLinkCount": 1,
"convertedLinkCount": 0,
"backlinkCount": 1,
"citationHealth": {
"total": 65,
"withQuotes": 34,
"verified": 33,
"accuracyChecked": 33,
"accurate": 25,
"inaccurate": 0,
"avgScore": 0.9375163386849796
},
"hallucinationRisk": {
"level": "medium",
"score": 40,
"factors": [
"no-citations",
"high-rigor"
]
},
"entityType": "analysis",
"redundancy": {
"maxSimilarity": 19,
"similarPages": [
{
"id": "sleeper-agent-detection",
"title": "Sleeper Agent Detection",
"path": "/knowledge-base/responses/sleeper-agent-detection/",
"similarity": 19
},
{
"id": "goal-misgeneralization",
"title": "Goal Misgeneralization",
"path": "/knowledge-base/risks/goal-misgeneralization/",
"similarity": 19
},
{
"id": "interpretability",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/interpretability/",
"similarity": 18
},
{
"id": "mesa-optimization",
"title": "Mesa-Optimization",
"path": "/knowledge-base/risks/mesa-optimization/",
"similarity": 18
},
{
"id": "scheming",
"title": "Scheming",
"path": "/knowledge-base/risks/scheming/",
"similarity": 18
}
]
},
"changeHistory": [
{
"date": "2026-03-08",
"branch": "auto-update/2026-03-08",
"title": "Auto-improve (standard): Model Organisms of Misalignment",
"summary": "Improved \"Model Organisms of Misalignment\" via standard pipeline (1181.0s). Quality score: 78. Issues resolved: Dollar signs in funding section appear as raw $ in several p; The <F> component wraps some dollar amounts (e.g., <F e='ant; Footnote [^rc-feab] is cited twice in the same sentence in t.",
"duration": "1181.0s",
"cost": "$5-8"
}
],
"coverage": {
"passing": 7,
"total": 13,
"targets": {
"tables": 9,
"diagrams": 1,
"internalLinks": 17,
"externalLinks": 11,
"footnotes": 6,
"references": 6
},
"actuals": {
"tables": 2,
"diagrams": 0,
"internalLinks": 37,
"externalLinks": 4,
"footnotes": 0,
"references": 25,
"quotesWithQuotes": 34,
"quotesTotal": 65,
"accuracyChecked": 33,
"accuracyTotal": 65
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "green",
"overview": "green",
"tables": "amber",
"diagrams": "red",
"internalLinks": "green",
"externalLinks": "amber",
"footnotes": "red",
"references": "green",
"quotes": "amber",
"accuracy": "amber"
},
"editHistoryCount": 1,
"ratingsString": "N:4 R:7 A:5.5 C:8"
},
"readerRank": 144,
"researchRank": 37,
"recommendedScore": 187.94
}
External Links
No external links
Backlinks (1)
| id | title | type | relationship |
|---|---|---|---|
| evan-hubinger | Evan Hubinger | person | — |