Dense Transformers
ID: dense-transformers
Type: concept
Path: /knowledge-base/intelligence-paradigms/dense-transformers/
Entity ID (EID): E403

Page Record (database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time)
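The merge described above could be sketched roughly as follows. This is a minimal illustration under assumptions, not the site's actual build code: the gray-matter and js-yaml parsers, the computeMetrics helper, the buildPageRecord function, and the precedence order (frontmatter, then Entity YAML, then computed metrics) are all assumed for the example.

```typescript
// Hypothetical sketch of the build-time merge that produces a page record.
// Assumes an MDX file with YAML frontmatter, a separate Entity YAML file,
// and a metrics pass over the page body; helpers and precedence are
// illustrative, not the real build pipeline.
import { readFileSync } from "node:fs";
import matter from "gray-matter"; // parses MDX frontmatter
import { load } from "js-yaml";   // parses the Entity YAML file

interface PageRecord {
  id: string;
  path: string;
  filePath: string;
  title: string;
  metrics: Record<string, number | boolean>;
  [key: string]: unknown; // remaining merged fields
}

// Placeholder for the computed-metrics pass (word count, tables, links, ...).
function computeMetrics(body: string): Record<string, number | boolean> {
  return {
    wordCount: body.split(/\s+/).filter(Boolean).length,
    hasOverview: /^##\s+Overview/m.test(body),
  };
}

export function buildPageRecord(mdxPath: string, entityYamlPath: string): PageRecord {
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));
  const entity = load(readFileSync(entityYamlPath, "utf8")) as Record<string, unknown>;

  // Later sources win on key collisions: frontmatter < entity < computed fields.
  return {
    ...frontmatter,
    ...entity,
    filePath: mdxPath,
    metrics: computeMetrics(content),
  } as PageRecord;
}
```

The record below is the kind of merged output such a step would emit for this page.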
{
"id": "dense-transformers",
"numericId": null,
"path": "/knowledge-base/intelligence-paradigms/dense-transformers/",
"filePath": "knowledge-base/intelligence-paradigms/dense-transformers.mdx",
"title": "Dense Transformers",
"quality": 58,
"readerImportance": 79.5,
"researchImportance": 62.5,
"tacticalValue": 50,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive analysis of dense transformers (GPT-4, Claude 3, Llama 3) as the dominant AI architecture (95%+ of frontier models), with training costs reaching \\$100M-500M per run and 2.5x annual cost growth since 2016. Despite open weights for some models, mechanistic interpretability remains primitive—Anthropic's 2024 SAE research extracted millions of features from Claude 3 Sonnet but cannot predict emergent capabilities or detect deceptive reasoning, creating fundamental safety limitations for RLHF-based alignment approaches.",
"description": "Analysis of the standard transformer architecture that powers current frontier AI. Since Vaswani et al.'s 2017 paper (now 160,000+ citations), dense transformers power GPT-4, Claude 3, Llama 3, and Gemini. Despite open weights for some models, mechanistic interpretability remains primitive - Anthropic's 2024 SAE research found tens of millions of features in Claude 3 Sonnet but cannot yet predict emergent capabilities.",
"ratings": {
"novelty": 4.2,
"rigor": 6.8,
"actionability": 5.5,
"completeness": 7.3
},
"category": "intelligence-paradigms",
"subcategory": "architectures",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 3357,
"tableCount": 18,
"diagramCount": 1,
"internalLinks": 6,
"externalLinks": 50,
"footnoteCount": 0,
"bulletRatio": 0.11,
"sectionCount": 33,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 3357,
"unconvertedLinks": [
{
"text": "\"Attention Is All You Need\"",
"url": "https://arxiv.org/abs/1706.03762",
"resourceId": "a7468c6851652691",
"resourceTitle": "Attention Is All You Need"
},
{
"text": "2024 Scaling Monosemanticity research",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "Scaling Monosemanticity",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "Circuit Tracing",
"url": "https://transformer-circuits.pub/2025/july-update/index.html",
"resourceId": "0a2ab4f291c4a773",
"resourceTitle": "Circuits Updates - July 2025"
},
{
"text": "Circuit tracing",
"url": "https://transformer-circuits.pub/2025/july-update/index.html",
"resourceId": "0a2ab4f291c4a773",
"resourceTitle": "Circuits Updates - July 2025"
},
{
"text": "InstructGPT",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "Attention Is All You Need",
"url": "https://arxiv.org/abs/1706.03762",
"resourceId": "a7468c6851652691",
"resourceTitle": "Attention Is All You Need"
},
{
"text": "Scaling Laws for Neural Language Models",
"url": "https://arxiv.org/abs/2001.08361",
"resourceId": "85f66a6419d173a7",
"resourceTitle": "Kaplan et al. (2020)"
},
{
"text": "Training language models to follow instructions with human feedback",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "Constitutional AI",
"url": "https://arxiv.org/abs/2212.08073",
"resourceId": "683aef834ac1612a"
},
{
"text": "Scaling Monosemanticity",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "Sam Altman public statements",
"url": "https://fortune.com/2024/04/04/ai-training-costs-how-much-is-too-much-openai-gpt-anthropic-microsoft/",
"resourceId": "b2534f71895a316d",
"resourceTitle": "Fortune AI training costs"
},
{
"text": "Epoch AI analysis",
"url": "https://epoch.ai/data-insights/openai-compute-spend",
"resourceId": "e5457746f2524afb",
"resourceTitle": "Epoch AI OpenAI compute spend"
},
{
"text": "AI industry analysts",
"url": "https://www.jonvet.com/blog/llm-scaling-in-2025",
"resourceId": "7226d362130b23f8",
"resourceTitle": "performance gap between US and Chinese models"
},
{
"text": "Kaplan et al. (2020)",
"url": "https://arxiv.org/abs/2001.08361",
"resourceId": "85f66a6419d173a7",
"resourceTitle": "Kaplan et al. (2020)"
},
{
"text": "2024 sparse autoencoder work",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "\"Attention Is All You Need\"",
"url": "https://arxiv.org/abs/1706.03762",
"resourceId": "a7468c6851652691",
"resourceTitle": "Attention Is All You Need"
},
{
"text": "\"Scaling Laws for Neural Language Models\"",
"url": "https://arxiv.org/abs/2001.08361",
"resourceId": "85f66a6419d173a7",
"resourceTitle": "Kaplan et al. (2020)"
},
{
"text": "\"Training language models to follow instructions with human feedback\"",
"url": "https://arxiv.org/abs/2203.02155",
"resourceId": "1098fc60be7ca2b0",
"resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
},
{
"text": "\"GPT-4 Technical Report\"",
"url": "https://arxiv.org/abs/2303.08774",
"resourceId": "29a0882390ee7063",
"resourceTitle": "OpenAI's GPT-4"
},
{
"text": "\"Scaling Monosemanticity: Extracting Interpretable Features from Claude 3 Sonnet\"",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "\"Circuit Tracing Updates - July 2025\"",
"url": "https://transformer-circuits.pub/2025/july-update/index.html",
"resourceId": "0a2ab4f291c4a773",
"resourceTitle": "Circuits Updates - July 2025"
},
{
"text": "\"Most of OpenAI's 2024 compute went to experiments\"",
"url": "https://epoch.ai/data-insights/openai-compute-spend",
"resourceId": "e5457746f2524afb",
"resourceTitle": "Epoch AI OpenAI compute spend"
},
{
"text": "\"Why the cost of AI could soon become too much to bear\"",
"url": "https://fortune.com/2024/04/04/ai-training-costs-how-much-is-too-much-openai-gpt-anthropic-microsoft/",
"resourceId": "b2534f71895a316d",
"resourceTitle": "Fortune AI training costs"
}
],
"unconvertedLinkCount": 24,
"convertedLinkCount": 0,
"backlinkCount": 3,
"hallucinationRisk": {
"level": "medium",
"score": 45,
"factors": [
"no-citations",
"conceptual-content"
]
},
"entityType": "concept",
"redundancy": {
"maxSimilarity": 15,
"similarPages": [
{
"id": "large-language-models",
"title": "Large Language Models",
"path": "/knowledge-base/capabilities/large-language-models/",
"similarity": 15
},
{
"id": "neuromorphic",
"title": "Neuromorphic Hardware",
"path": "/knowledge-base/intelligence-paradigms/neuromorphic/",
"similarity": 15
},
{
"id": "scaling-laws",
"title": "AI Scaling Laws",
"path": "/knowledge-base/models/scaling-laws/",
"similarity": 15
},
{
"id": "sparse-autoencoders",
"title": "Sparse Autoencoders (SAEs)",
"path": "/knowledge-base/responses/sparse-autoencoders/",
"similarity": 15
},
{
"id": "ssm-mamba",
"title": "State-Space Models / Mamba",
"path": "/knowledge-base/intelligence-paradigms/ssm-mamba/",
"similarity": 14
}
]
},
"coverage": {
"passing": 8,
"total": 13,
"targets": {
"tables": 13,
"diagrams": 1,
"internalLinks": 27,
"externalLinks": 17,
"footnotes": 10,
"references": 10
},
"actuals": {
"tables": 18,
"diagrams": 1,
"internalLinks": 6,
"externalLinks": 50,
"footnotes": 0,
"references": 10,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:4.2 R:6.8 A:5.5 C:7.3"
},
"readerRank": 90,
"researchRank": 204,
"recommendedScore": 177.61
}

External Links
{
"lesswrong": "https://www.lesswrong.com/tag/transformers",
"wikipedia": "https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)"
}Backlinks (3)
| id | title | type | relationship |
|---|---|---|---|
| heavy-scaffolding | Heavy Scaffolding / Agentic Systems | concept | — |
| provable-safe | Provable / Guaranteed Safe AI | concept | — |
| __index__/knowledge-base/intelligence-paradigms | Intelligence Paradigms | concept | — |