Sparse Autoencoders (SAEs)
sparse-autoencoders · approach · Path: /knowledge-base/responses/sparse-autoencoders/
E480 — Entity ID (EID)
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "sparse-autoencoders",
"numericId": null,
"path": "/knowledge-base/responses/sparse-autoencoders/",
"filePath": "knowledge-base/responses/sparse-autoencoders.mdx",
"title": "Sparse Autoencoders (SAEs)",
"quality": 91,
"readerImportance": 19.5,
"researchImportance": 29,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive review of sparse autoencoders (SAEs) for mechanistic interpretability, covering Anthropic's 34M features from Claude 3 Sonnet (90% interpretability), OpenAI's 16M latent GPT-4 SAEs, DeepMind's 1T+ parameter Gemma Scope releases, and Goodfire's \\$50M Series A and 671B DeepSeek R1 SAEs. Despite promising safety applications including deception detection features, DeepMind's March 2025 negative results showed SAEs underperforming simple probes on downstream tasks. Global investment estimated at \\$75-150M/year with 150-200 researchers.",
"description": "Sparse autoencoders extract interpretable features from neural network activations using sparsity constraints. Anthropic's 2024 research extracted 34 million features from Claude 3 Sonnet with 90% interpretability scores, while Goodfire raised \\$50M in 2025 and released first-ever SAEs for the 671B-parameter DeepSeek R1 reasoning model. Despite promising safety applications, DeepMind deprioritized SAE research in March 2025 after finding they underperform simple linear probes on downstream safety tasks.",
"ratings": {
"novelty": 5,
"rigor": 7.5,
"actionability": 6,
"completeness": 8.5
},
"category": "responses",
"subcategory": "alignment-interpretability",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 3238,
"tableCount": 20,
"diagramCount": 3,
"internalLinks": 15,
"externalLinks": 66,
"footnoteCount": 0,
"bulletRatio": 0.09,
"sectionCount": 34,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 3238,
"unconvertedLinks": [
{
"text": "Anthropic 2024",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "DeepMind deprioritized",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "arxiv.org",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "Anthropic",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "OpenAI",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "DeepMind",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "EleutherAI",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
},
{
"text": "Gated SAE",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "The landmark result",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "Gemma Scope 2",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "DeepMind deprioritization",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Protein language model SAEs",
"url": "https://www.pnas.org/doi/10.1073/pnas.2506316122",
"resourceId": "4d1186e8c443a9a9",
"resourceTitle": "Sparse autoencoders uncover biologically interpretable features in protein language model representations"
},
{
"text": "DeepMind's March 2025 announcement",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Anthropic 2024",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "OpenAI 2024",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "EleutherAI 2024",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
},
{
"text": "DeepMind 2024",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "Scaling Monosemanticity",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "GPT-4 SAEs",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "negative results",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Automated interpretation",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
},
{
"text": "Original SAE paper",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "Scaling Monosemanticity",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "Extracting Concepts from GPT-4",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "Gemma Scope 2",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "Sparse Autoencoders Find Highly Interpretable Features",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "Negative Results for SAEs on Downstream Tasks",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Open Source Automated Interpretability",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
}
],
"unconvertedLinkCount": 28,
"convertedLinkCount": 0,
"backlinkCount": 2,
"hallucinationRisk": {
"level": "low",
"score": 25,
"factors": [
"no-citations",
"high-rigor",
"conceptual-content",
"high-quality"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 22,
"similarPages": [
{
"id": "interpretability",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/interpretability/",
"similarity": 22
},
{
"id": "mech-interp",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/mech-interp/",
"similarity": 20
},
{
"id": "probing",
"title": "Probing / Linear Probes",
"path": "/knowledge-base/responses/probing/",
"similarity": 20
},
{
"id": "sleeper-agent-detection",
"title": "Sleeper Agent Detection",
"path": "/knowledge-base/responses/sleeper-agent-detection/",
"similarity": 20
},
{
"id": "representation-engineering",
"title": "Representation Engineering",
"path": "/knowledge-base/responses/representation-engineering/",
"similarity": 17
}
]
},
"coverage": {
"passing": 7,
"total": 13,
"targets": {
"tables": 13,
"diagrams": 1,
"internalLinks": 26,
"externalLinks": 16,
"footnotes": 10,
"references": 10
},
"actuals": {
"tables": 20,
"diagrams": 3,
"internalLinks": 15,
"externalLinks": 66,
"footnotes": 0,
"references": 7,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:5 R:7.5 A:6 C:8.5"
},
"readerRank": 524,
"researchRank": 426,
"recommendedScore": 213.61
}

External Links
{
"lesswrong": "https://www.lesswrong.com/tag/sparse-autoencoders-saes"
}

Backlinks (2)
| id | title | type | relationship |
|---|---|---|---|
| mech-interp | Mechanistic Interpretability | approach | — |
| alignment-interpretability-overview | Interpretability (Overview) | concept | — |