Longterm Wiki

Dense Transformers

dense-transformers (concept)
Path: /knowledge-base/intelligence-paradigms/dense-transformers/
Entity ID (EID): E403
Backlinks: 3 · Quality: 58 · Updated: 2026-03-13
Page Record
database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "dense-transformers",
  "numericId": null,
  "path": "/knowledge-base/intelligence-paradigms/dense-transformers/",
  "filePath": "knowledge-base/intelligence-paradigms/dense-transformers.mdx",
  "title": "Dense Transformers",
  "quality": 58,
  "readerImportance": 79.5,
  "researchImportance": 62.5,
  "tacticalValue": 50,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Comprehensive analysis of dense transformers (GPT-4, Claude 3, Llama 3) as the dominant AI architecture (95%+ of frontier models), with training costs reaching \\$100M-500M per run and 2.5x annual cost growth since 2016. Despite open weights for some models, mechanistic interpretability remains primitive—Anthropic's 2024 SAE research extracted millions of features from Claude 3 Sonnet but cannot predict emergent capabilities or detect deceptive reasoning, creating fundamental safety limitations for RLHF-based alignment approaches.",
  "description": "Analysis of the standard transformer architecture that powers current frontier AI. Since Vaswani et al.'s 2017 paper (now 160,000+ citations), dense transformers power GPT-4, Claude 3, Llama 3, and Gemini. Despite open weights for some models, mechanistic interpretability remains primitive - Anthropic's 2024 SAE research found tens of millions of features in Claude 3 Sonnet but cannot yet predict emergent capabilities.",
  "ratings": {
    "novelty": 4.2,
    "rigor": 6.8,
    "actionability": 5.5,
    "completeness": 7.3
  },
  "category": "intelligence-paradigms",
  "subcategory": "architectures",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 3357,
    "tableCount": 18,
    "diagramCount": 1,
    "internalLinks": 6,
    "externalLinks": 50,
    "footnoteCount": 0,
    "bulletRatio": 0.11,
    "sectionCount": 33,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 3357,
  "unconvertedLinks": [
    {
      "text": "\"Attention Is All You Need\"",
      "url": "https://arxiv.org/abs/1706.03762",
      "resourceId": "a7468c6851652691",
      "resourceTitle": "Attention Is All You Need"
    },
    {
      "text": "2024 Scaling Monosemanticity research",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "Scaling Monosemanticity",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "Circuit Tracing",
      "url": "https://transformer-circuits.pub/2025/july-update/index.html",
      "resourceId": "0a2ab4f291c4a773",
      "resourceTitle": "Circuits Updates - July 2025"
    },
    {
      "text": "Circuit tracing",
      "url": "https://transformer-circuits.pub/2025/july-update/index.html",
      "resourceId": "0a2ab4f291c4a773",
      "resourceTitle": "Circuits Updates - July 2025"
    },
    {
      "text": "InstructGPT",
      "url": "https://arxiv.org/abs/2203.02155",
      "resourceId": "1098fc60be7ca2b0",
      "resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
    },
    {
      "text": "Attention Is All You Need",
      "url": "https://arxiv.org/abs/1706.03762",
      "resourceId": "a7468c6851652691",
      "resourceTitle": "Attention Is All You Need"
    },
    {
      "text": "Scaling Laws for Neural Language Models",
      "url": "https://arxiv.org/abs/2001.08361",
      "resourceId": "85f66a6419d173a7",
      "resourceTitle": "Kaplan et al. (2020)"
    },
    {
      "text": "Training language models to follow instructions with human feedback",
      "url": "https://arxiv.org/abs/2203.02155",
      "resourceId": "1098fc60be7ca2b0",
      "resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
    },
    {
      "text": "Constitutional AI",
      "url": "https://arxiv.org/abs/2212.08073",
      "resourceId": "683aef834ac1612a"
    },
    {
      "text": "Scaling Monosemanticity",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "Sam Altman public statements",
      "url": "https://fortune.com/2024/04/04/ai-training-costs-how-much-is-too-much-openai-gpt-anthropic-microsoft/",
      "resourceId": "b2534f71895a316d",
      "resourceTitle": "Fortune AI training costs"
    },
    {
      "text": "Epoch AI analysis",
      "url": "https://epoch.ai/data-insights/openai-compute-spend",
      "resourceId": "e5457746f2524afb",
      "resourceTitle": "Epoch AI OpenAI compute spend"
    },
    {
      "text": "AI industry analysts",
      "url": "https://www.jonvet.com/blog/llm-scaling-in-2025",
      "resourceId": "7226d362130b23f8",
      "resourceTitle": "performance gap between US and Chinese models"
    },
    {
      "text": "Kaplan et al. (2020)",
      "url": "https://arxiv.org/abs/2001.08361",
      "resourceId": "85f66a6419d173a7",
      "resourceTitle": "Kaplan et al. (2020)"
    },
    {
      "text": "2024 sparse autoencoder work",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "\"Attention Is All You Need\"",
      "url": "https://arxiv.org/abs/1706.03762",
      "resourceId": "a7468c6851652691",
      "resourceTitle": "Attention Is All You Need"
    },
    {
      "text": "\"Scaling Laws for Neural Language Models\"",
      "url": "https://arxiv.org/abs/2001.08361",
      "resourceId": "85f66a6419d173a7",
      "resourceTitle": "Kaplan et al. (2020)"
    },
    {
      "text": "\"Training language models to follow instructions with human feedback\"",
      "url": "https://arxiv.org/abs/2203.02155",
      "resourceId": "1098fc60be7ca2b0",
      "resourceTitle": "Training Language Models to Follow Instructions with Human Feedback"
    },
    {
      "text": "\"GPT-4 Technical Report\"",
      "url": "https://arxiv.org/abs/2303.08774",
      "resourceId": "29a0882390ee7063",
      "resourceTitle": "OpenAI's GPT-4"
    },
    {
      "text": "\"Scaling Monosemanticity: Extracting Interpretable Features from Claude 3 Sonnet\"",
      "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
      "resourceId": "e724db341d6e0065"
    },
    {
      "text": "\"Circuit Tracing Updates - July 2025\"",
      "url": "https://transformer-circuits.pub/2025/july-update/index.html",
      "resourceId": "0a2ab4f291c4a773",
      "resourceTitle": "Circuits Updates - July 2025"
    },
    {
      "text": "\"Most of OpenAI's 2024 compute went to experiments\"",
      "url": "https://epoch.ai/data-insights/openai-compute-spend",
      "resourceId": "e5457746f2524afb",
      "resourceTitle": "Epoch AI OpenAI compute spend"
    },
    {
      "text": "\"Why the cost of AI could soon become too much to bear\"",
      "url": "https://fortune.com/2024/04/04/ai-training-costs-how-much-is-too-much-openai-gpt-anthropic-microsoft/",
      "resourceId": "b2534f71895a316d",
      "resourceTitle": "Fortune AI training costs"
    }
  ],
  "unconvertedLinkCount": 24,
  "convertedLinkCount": 0,
  "backlinkCount": 3,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "concept",
  "redundancy": {
    "maxSimilarity": 15,
    "similarPages": [
      {
        "id": "large-language-models",
        "title": "Large Language Models",
        "path": "/knowledge-base/capabilities/large-language-models/",
        "similarity": 15
      },
      {
        "id": "neuromorphic",
        "title": "Neuromorphic Hardware",
        "path": "/knowledge-base/intelligence-paradigms/neuromorphic/",
        "similarity": 15
      },
      {
        "id": "scaling-laws",
        "title": "AI Scaling Laws",
        "path": "/knowledge-base/models/scaling-laws/",
        "similarity": 15
      },
      {
        "id": "sparse-autoencoders",
        "title": "Sparse Autoencoders (SAEs)",
        "path": "/knowledge-base/responses/sparse-autoencoders/",
        "similarity": 15
      },
      {
        "id": "ssm-mamba",
        "title": "State-Space Models / Mamba",
        "path": "/knowledge-base/intelligence-paradigms/ssm-mamba/",
        "similarity": 14
      }
    ]
  },
  "coverage": {
    "passing": 8,
    "total": 13,
    "targets": {
      "tables": 13,
      "diagrams": 1,
      "internalLinks": 27,
      "externalLinks": 17,
      "footnotes": 10,
      "references": 10
    },
    "actuals": {
      "tables": 18,
      "diagrams": 1,
      "internalLinks": 6,
      "externalLinks": 50,
      "footnotes": 0,
      "references": 10,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:4.2 R:6.8 A:5.5 C:7.3"
  },
  "readerRank": 90,
  "researchRank": 204,
  "recommendedScore": 177.61
}
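
The green/amber/red values in coverage.items appear to follow mechanically from comparing "actuals" against "targets". A minimal TypeScript sketch under that assumption — the 20% amber cutoff is inferred from this record (6 of 27 internal links lands amber, 0 of 10 footnotes lands red), not a documented rule:

type Status = "green" | "amber" | "red";

// Assumed rule: meeting the target is green, a shortfall that still clears
// ~20% of the target is amber, anything below that is red.
function coverageStatus(actual: number, target: number): Status {
  if (actual >= target) return "green";
  if (actual >= 0.2 * target) return "amber";
  return "red";
}

// A subset of the targets/actuals copied from the record above.
const targets = { tables: 13, internalLinks: 27, externalLinks: 17, footnotes: 10, references: 10 };
const actuals = { tables: 18, internalLinks: 6, externalLinks: 50, footnotes: 0, references: 10 };

for (const key of Object.keys(targets) as (keyof typeof targets)[]) {
  console.log(key, coverageStatus(actuals[key], targets[key]));
}
// Reproduces the "items" map above: tables green, internalLinks amber,
// externalLinks green, footnotes red, references green.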
External Links
{
  "lesswrong": "https://www.lesswrong.com/tag/transformers",
  "wikipedia": "https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)"
}
Backlinks (3)
id | title | type | relationship
heavy-scaffolding | Heavy Scaffolding / Agentic Systems | concept |
provable-safe | Provable / Guaranteed Safe AI | concept |
__index__/knowledge-base/intelligence-paradigms | Intelligence Paradigms | concept |
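
Backlink rows like these are presumably produced at build time by inverting every page's internal links. A minimal sketch under that assumption; the Page shape and field names are illustrative, not the wiki's actual schema.

interface Page {
  id: string;
  internalLinks: string[]; // ids of the pages this page links to
}

// Invert the link graph: each outgoing link records the source page
// as a backlink of the target page.
function computeBacklinks(pages: Page[]): Map<string, string[]> {
  const backlinks = new Map<string, string[]>();
  for (const page of pages) {
    for (const target of page.internalLinks) {
      const list = backlinks.get(target) ?? [];
      list.push(page.id);
      backlinks.set(target, list);
    }
  }
  return backlinks;
}

// For this record, computeBacklinks(allPages).get("dense-transformers")
// would return the three ids listed above, matching backlinkCount: 3.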