Longterm Wiki

Probing / Linear Probes

probing · approach · Path: /knowledge-base/responses/probing/
Entity ID (EID): E596
← Back to page · 2 backlinks · Quality: 55 · Updated: 2026-03-13
Page Record — database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "probing",
  "numericId": null,
  "path": "/knowledge-base/responses/probing/",
  "filePath": "knowledge-base/responses/probing.mdx",
  "title": "Probing / Linear Probes",
  "quality": 55,
  "readerImportance": 20.5,
  "researchImportance": 31.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-20",
  "llmSummary": "Linear probing achieves 71-83% accuracy detecting LLM truthfulness and is a foundational diagnostic tool for interpretability research. While computationally cheap and widely adopted, probes are vulnerable to adversarial hiding and only detect linearly separable features, limiting their standalone safety value to supporting other techniques.",
  "description": "Linear probes are simple classifiers trained on neural network activations to test what concepts models internally represent. Research shows probes achieve 71-83% accuracy detecting LLM truthfulness (Azaria & Mitchell 2023), making them a foundational diagnostic tool for AI safety and deception detection.",
  "ratings": {
    "novelty": 3.5,
    "rigor": 6,
    "actionability": 4.5,
    "completeness": 7
  },
  "category": "responses",
  "subcategory": "alignment-interpretability",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2665,
    "tableCount": 10,
    "diagramCount": 2,
    "internalLinks": 0,
    "externalLinks": 36,
    "footnoteCount": 0,
    "bulletRatio": 0.18,
    "sectionCount": 25,
    "hasOverview": true,
    "structuralScore": 13
  },
  "suggestedQuality": 87,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 2665,
  "unconvertedLinks": [
    {
      "text": "Zou et al. 2023",
      "url": "https://arxiv.org/abs/2310.01405",
      "resourceId": "5d708a72c3af8ad9",
      "resourceTitle": "Representation Engineering: A Top-Down Approach to AI Transparency"
    },
    {
      "text": "Anthropic's March 2025 research",
      "url": "https://www.anthropic.com/research",
      "resourceId": "f771d4f56ad4dbaa",
      "resourceTitle": "Anthropic's Work on AI Safety"
    },
    {
      "text": "Anthropic's interpretability team",
      "url": "https://transformer-circuits.pub/",
      "resourceId": "5083d746c2728ff2"
    },
    {
      "text": "arXiv",
      "url": "https://arxiv.org/abs/2310.01405",
      "resourceId": "5d708a72c3af8ad9",
      "resourceTitle": "Representation Engineering: A Top-Down Approach to AI Transparency"
    },
    {
      "text": "Research page",
      "url": "https://www.anthropic.com/research/team/interpretability",
      "resourceId": "dfc21a319f95a75d",
      "resourceTitle": "anthropic.com/research/team/interpretability"
    }
  ],
  "unconvertedLinkCount": 5,
  "convertedLinkCount": 0,
  "backlinkCount": 2,
  "hallucinationRisk": {
    "level": "medium",
    "score": 45,
    "factors": [
      "no-citations",
      "conceptual-content"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 20,
    "similarPages": [
      {
        "id": "sparse-autoencoders",
        "title": "Sparse Autoencoders (SAEs)",
        "path": "/knowledge-base/responses/sparse-autoencoders/",
        "similarity": 20
      },
      {
        "id": "mech-interp",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/mech-interp/",
        "similarity": 19
      },
      {
        "id": "interpretability",
        "title": "Mechanistic Interpretability",
        "path": "/knowledge-base/responses/interpretability/",
        "similarity": 18
      },
      {
        "id": "representation-engineering",
        "title": "Representation Engineering",
        "path": "/knowledge-base/responses/representation-engineering/",
        "similarity": 18
      },
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 18
      }
    ]
  },
  "coverage": {
    "passing": 6,
    "total": 13,
    "targets": {
      "tables": 11,
      "diagrams": 1,
      "internalLinks": 21,
      "externalLinks": 13,
      "footnotes": 8,
      "references": 8
    },
    "actuals": {
      "tables": 10,
      "diagrams": 2,
      "internalLinks": 0,
      "externalLinks": 36,
      "footnotes": 0,
      "references": 4,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "amber",
      "diagrams": "green",
      "internalLinks": "red",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:3.5 R:6 A:4.5 C:7"
  },
  "readerRank": 519,
  "researchRank": 405,
  "recommendedScore": 142.03
}
External Links

No external links

Backlinks (2)
id · title · type · relationship
alignment-interpretability-overview · Interpretability (Overview) · concept
sparse-autoencoders · Sparse Autoencoders (SAEs) · approach
Longterm Wiki