Longterm Wiki

Probing / Linear Probes

probing · approach · Path: /knowledge-base/responses/probing/
Entity ID (EID): E596
← Back to page · 2 backlinks · Quality: 55 · Updated: 2026-03-13
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "probing",
  "numericId": null,
  "path": "/knowledge-base/responses/probing/",
  "filePath": "knowledge-base/responses/probing.mdx",
  "title": "Probing / Linear Probes",
  "quality": 55,
  "readerImportance": 20.5,
  "researchImportance": 31.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-20",
  "llmSummary": "Linear probing achieves 71-83% accuracy detecting LLM truthfulness and is a foundational diagnostic tool for interpretability research. While computationally cheap and widely adopted, probes are vulnerable to adversarial hiding and only detect linearly separable features, limiting their standalone safety value to supporting other techniques.",
  "description": "Linear probes are simple classifiers trained on neural network activations to test what concepts models internally represent. Research shows probes achieve 71-83% accuracy detecting LLM truthfulness (Azaria & Mitchell 2023), making them a foundational diagnostic tool for AI safety and deception detection.",
  "ratings": {
    "novelty": 3.5,
    "rigor": 6,
    "actionability": 4.5,
    "completeness": 7
  },
  "category": "responses",
  "subcategory": "alignment-interpretability",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2665,
    "tableCount": 10,
    "diagramCount": 2,
    "internalLinks": 0,
    "externalLinks": 36,
    "footnoteCount": 0,
    "bulletRatio": 0.18,
    "sectionCount": 25,
    "hasOverview": true,
    "structuralScore": 13
  },
  "suggestedQuality": 87,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 2665,
  "unconvertedLinks": [
    {
      "text": "Zou et al. 2023",
      "url": "https://arxiv.org/abs/2310.01405",
      "resourceId": "5d708a72c3af8ad9",
      "resourceTitle": "Representation Engineering: A Top-Down Approach to AI Transparency"
    },
    {
      "text": "Anthropic's March 2025 research",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "Anthropic's interpretability team",
      "url": "https://transformer-circuits.pub/",
      "resourceId": "5083d746c2728ff2"
    },
    {
      "text": "arXiv",
      "url": "https://arxiv.org/abs/2310.01405",
      "resourceId": "5d708a72c3af8ad9",
      "resourceTitle": "Representation Engineering: A Top-Down Approach to AI Transparency"
    },
    {
      "text": "Research page",
      "url": "https://www.anthropic.com/research/team/interpretability",
      "resourceId": "dfc21a319f95a75d",
      "resourceTitle": "anthropic.com/research/team/interpretability"
    }
  ],
  "unconvertedLinkCount": 5,
  "convertedLinkCount": 0,
  "backlinkCount": 2,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 20,
    "similarPages": [
      {
        "id": "sparse-autoencoders",
        "title": "Sparse Autoencoders (SAEs)",
        "path": "/knowledge-base/responses/sparse-autoencoders/",
        "similarity": 20
      },
      {
        "id": "mech-interp",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/mech-interp/",
        "similarity": 19
      },
      {
        "id": "interpretability",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/interpretability/",
        "similarity": 18
      },
      {
        "id": "representation-engineering",
        "title": "Representation Engineering",
        "path": "/knowledge-base/responses/representation-engineering/",
        "similarity": 18
      },
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 18
      }
    ]
  },
  "coverage": {
    "passing": 6,
    "total": 13,
    "targets": {
      "tables": 11,
      "diagrams": 1,
      "internalLinks": 21,
      "externalLinks": 13,
      "footnotes": 8,
      "references": 8
    },
    "actuals": {
      "tables": 10,
      "diagrams": 2,
      "internalLinks": 0,
      "externalLinks": 36,
      "footnotes": 0,
      "references": 4,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "green",
      "internalLinks": "red",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:3.5 R:6 A:4.5 C:7"
  },
  "readerRank": 519,
  "researchRank": 405,
  "recommendedScore": 142.03
}
External Links

No external links

Backlinks (2)
id · title · type · relationship
alignment-interpretability-overview · Interpretability (Overview) · concept
sparse-autoencoders · Sparse Autoencoders (SAEs) · approach
Longterm Wiki