Sparse Autoencoders (SAEs)
sparse-autoencoders · approach · Path: /knowledge-base/responses/sparse-autoencoders/
E480 — Entity ID (EID)
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "sparse-autoencoders",
"numericId": null,
"path": "/knowledge-base/responses/sparse-autoencoders/",
"filePath": "knowledge-base/responses/sparse-autoencoders.mdx",
"title": "Sparse Autoencoders (SAEs)",
"quality": 91,
"readerImportance": 19.5,
"researchImportance": 29,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive review of sparse autoencoders (SAEs) for mechanistic interpretability, covering Anthropic's 34M features from Claude 3 Sonnet (90% interpretability), OpenAI's 16M latent GPT-4 SAEs, DeepMind's 1T+ parameter Gemma Scope releases, and Goodfire's \\$50M Series A and 671B DeepSeek R1 SAEs. Despite promising safety applications including deception detection features, DeepMind's March 2025 negative results showed SAEs underperforming simple probes on downstream tasks. Global investment estimated at \\$75-150M/year with 150-200 researchers.",
"description": "Sparse autoencoders extract interpretable features from neural network activations using sparsity constraints. Anthropic's 2024 research extracted 34 million features from Claude 3 Sonnet with 90% interpretability scores, while Goodfire raised \\$50M in 2025 and released first-ever SAEs for the 671B-parameter DeepSeek R1 reasoning model. Despite promising safety applications, DeepMind deprioritized SAE research in March 2025 after finding they underperform simple linear probes on downstream safety tasks.",
"ratings": {
"novelty": 5,
"rigor": 7.5,
"actionability": 6,
"completeness": 8.5
},
"category": "responses",
"subcategory": "alignment-interpretability",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 3238,
"tableCount": 20,
"diagramCount": 3,
"internalLinks": 15,
"externalLinks": 66,
"footnoteCount": 0,
"bulletRatio": 0.09,
"sectionCount": 34,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 3238,
"unconvertedLinks": [
{
"text": "Anthropic 2024",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "DeepMind deprioritized",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "arxiv.org",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "Anthropic",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "OpenAI",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "DeepMind",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "EleutherAI",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
},
{
"text": "Gated SAE",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "The landmark result",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "Gemma Scope 2",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "DeepMind deprioritization",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Protein language model SAEs",
"url": "https://www.pnas.org/doi/10.1073/pnas.2506316122",
"resourceId": "4d1186e8c443a9a9",
"resourceTitle": "Sparse autoencoders uncover biologically interpretable features in protein language model representations"
},
{
"text": "DeepMind's March 2025 announcement",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Anthropic 2024",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "OpenAI 2024",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "EleutherAI 2024",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
},
{
"text": "DeepMind 2024",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "Scaling Monosemanticity",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "GPT-4 SAEs",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "negative results",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Automated interpretation",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
},
{
"text": "Original SAE paper",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "Scaling Monosemanticity",
"url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
"resourceId": "e724db341d6e0065"
},
{
"text": "Extracting Concepts from GPT-4",
"url": "https://openai.com/index/extracting-concepts-from-gpt-4/",
"resourceId": "f7b06d857b564d78",
"resourceTitle": "Extracting Concepts from GPT-4"
},
{
"text": "Gemma Scope 2",
"url": "https://deepmind.google/blog/gemma-scope-2-helping-the-ai-safety-community-deepen-understanding-of-complex-language-model-behavior/",
"resourceId": "a1036bc63472c5fc",
"resourceTitle": "Gemma Scope 2"
},
{
"text": "Sparse Autoencoders Find Highly Interpretable Features",
"url": "https://arxiv.org/abs/2309.08600",
"resourceId": "8aae7b9df41d1455",
"resourceTitle": "Sparse Autoencoders Find Highly Interpretable Features in Language Models"
},
{
"text": "Negative Results for SAEs on Downstream Tasks",
"url": "https://deepmindsafetyresearch.medium.com/negative-results-for-sparse-autoencoders-on-downstream-tasks-and-deprioritising-sae-research-6cadcfc125b9",
"resourceId": "244c1b93ef0a083c",
"resourceTitle": "deprioritizing SAE research"
},
{
"text": "Open Source Automated Interpretability",
"url": "https://blog.eleuther.ai/autointerp/",
"resourceId": "daaf778f7ff52bc2",
"resourceTitle": "open-source automated interpretability"
}
],
"unconvertedLinkCount": 28,
"convertedLinkCount": 0,
"backlinkCount": 2,
"hallucinationRisk": {
"level": "low",
"score": 25,
"factors": [
"no-citations",
"high-rigor",
"conceptual-content",
"high-quality"
]
},
"entityType": "approach",
"redundancy": {
"maxSimilarity": 22,
"similarPages": [
{
"id": "interpretability",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/interpretability/",
"similarity": 22
},
{
"id": "mech-interp",
"title": "Mechanistic Interpretability",
"path": "/knowledge-base/responses/mech-interp/",
"similarity": 20
},
{
"id": "probing",
"title": "Probing / Linear Probes",
"path": "/knowledge-base/responses/probing/",
"similarity": 20
},
{
"id": "sleeper-agent-detection",
"title": "Sleeper Agent Detection",
"path": "/knowledge-base/responses/sleeper-agent-detection/",
"similarity": 20
},
{
"id": "representation-engineering",
"title": "Representation Engineering",
"path": "/knowledge-base/responses/representation-engineering/",
"similarity": 17
}
]
},
"coverage": {
"passing": 7,
"total": 13,
"targets": {
"tables": 13,
"diagrams": 1,
"internalLinks": 26,
"externalLinks": 16,
"footnotes": 10,
"references": 10
},
"actuals": {
"tables": 20,
"diagrams": 3,
"internalLinks": 15,
"externalLinks": 66,
"footnotes": 0,
"references": 7,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "amber",
"externalLinks": "green",
"footnotes": "red",
"references": "amber",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:5 R:7.5 A:6 C:8.5"
},
"readerRank": 524,
"researchRank": 426,
"recommendedScore": 213.61
}

External Links
{
"lesswrong": "https://www.lesswrong.com/tag/sparse-autoencoders-saes"
}

Backlinks (2)
| id | title | type | relationship |
|---|---|---|---|
| mech-interp | Mechanistic Interpretability | approach | — |
| alignment-interpretability-overview | Interpretability (Overview) | concept | — |