Sandboxing / Containment
ID: sandboxing
Type: approach
Path: /knowledge-base/responses/sandboxing/
Entity ID (EID): E485

Page Record
database.json, merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "sandboxing",
  "numericId": null,
  "path": "/knowledge-base/responses/sandboxing/",
  "filePath": "knowledge-base/responses/sandboxing.mdx",
  "title": "Sandboxing / Containment",
  "quality": 91,
  "readerImportance": 57.5,
  "researchImportance": 28.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "Comprehensive analysis of AI sandboxing as defense-in-depth, synthesizing METR's 2025 evaluations (GPT-5 time horizon ~2h, capabilities doubling every 7 months), AI boxing experiments (60-70% escape rates), and 2024-2025 container vulnerabilities (CVE-2024-0132, NVIDIAScape, IDEsaster). Quantifies isolation technology tradeoffs (gVisor ~10% I/O, Firecracker ~85%) with market context (\\$7.6B AI agent market, 85% enterprise adoption, \\$3.8B 2024 investment). Includes Anthropic's Sabotage Risk Report findings and CVE-Bench benchmark (13% AI exploit success rate).",
  "description": "Sandboxing limits AI system access to resources, networks, and capabilities as a defense-in-depth measure. METR's August 2025 evaluation found GPT-5's time horizon at ~2 hours—insufficient for autonomous replication. AI boxing experiments show 60-70% social engineering escape rates. Critical CVEs (CVE-2024-0132, CVE-2025-23266) demonstrate container escapes, while the IDEsaster disclosure revealed 30+ vulnerabilities in AI coding tools. Firecracker microVMs provide 85% native performance with hardware isolation; gVisor offers ~10% I/O performance but better compatibility.",
  "ratings": {
    "novelty": 5,
    "rigor": 8,
    "actionability": 8,
    "completeness": 8
  },
  "category": "responses",
  "subcategory": "alignment-deployment",
  "clusters": [
    "ai-safety",
    "cyber"
  ],
  "metrics": {
    "wordCount": 4264,
    "tableCount": 30,
    "diagramCount": 5,
    "internalLinks": 4,
    "externalLinks": 121,
    "footnoteCount": 0,
    "bulletRatio": 0.09,
    "sectionCount": 51,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 4264,
  "unconvertedLinks": [
    {
      "text": "GPT-5 time horizon ≈2 hours",
      "url": "https://evaluations.metr.org/gpt-5-report/",
      "resourceId": "7457262d461e2206",
      "resourceTitle": "evaluations.metr.org"
    },
    {
      "text": "METR",
      "url": "https://metr.org/",
      "resourceId": "45370a5153534152",
      "resourceTitle": "metr.org"
    },
    {
      "text": "UK AISI",
      "url": "https://www.aisi.gov.uk/",
      "resourceId": "fdf68a8f30f57dee",
      "resourceTitle": "AI Safety Institute"
    },
    {
      "text": "METR's 2024 evaluations",
      "url": "https://metr.org/blog/2024-11-12-rogue-replication-threat-model/",
      "resourceId": "5b45342b68bf627e",
      "resourceTitle": "The Rogue Replication Threat Model"
    },
    {
      "text": "METR (Model Evaluation & Threat Research)",
      "url": "https://metr.org/",
      "resourceId": "45370a5153534152",
      "resourceTitle": "metr.org"
    },
    {
      "text": "METR Rogue Replication Threat Model (November 2024)",
      "url": "https://metr.org/blog/2024-11-12-rogue-replication-threat-model/",
      "resourceId": "5b45342b68bf627e",
      "resourceTitle": "The Rogue Replication Threat Model"
    },
    {
      "text": "GPT-5",
      "url": "https://evaluations.metr.org/gpt-5-report/",
      "resourceId": "7457262d461e2206",
      "resourceTitle": "evaluations.metr.org"
    },
    {
      "text": "Claude 3.5 Sonnet",
      "url": "https://metr.org/blog/2025-01-31-update-sonnet-o1-evals/",
      "resourceId": "89b92e6423256fc4",
      "resourceTitle": "METR's research"
    },
    {
      "text": "o1",
      "url": "https://metr.org/blog/2025-01-31-update-sonnet-o1-evals/",
      "resourceId": "89b92e6423256fc4",
      "resourceTitle": "METR's research"
    },
    {
      "text": "METR GPT-5 Report",
      "url": "https://evaluations.metr.org/gpt-5-report/",
      "resourceId": "7457262d461e2206",
      "resourceTitle": "evaluations.metr.org"
    },
    {
      "text": "METR Claude/o1 Update",
      "url": "https://metr.org/blog/2025-01-31-update-sonnet-o1-evals/",
      "resourceId": "89b92e6423256fc4",
      "resourceTitle": "METR's research"
    },
    {
      "text": "OpenAI GPT-4 System Card",
      "url": "https://cdn.openai.com/papers/gpt-4-system-card.pdf",
      "resourceId": "ebab6e05661645c5",
      "resourceTitle": "OpenAI"
    },
    {
      "text": "Anthropic's 2025 research recommendations",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "Activation-Based Monitoring",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "Anthropic AI Control Research",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "monitor failures might be systematically concentrated in episodes where the actor behaves maliciously",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "AI Safety Level 3 (ASL-3) protections",
      "url": "https://www.anthropic.com/news/activating-asl3-protections",
      "resourceId": "7512ddb574f82249"
    },
    {
      "text": "METR GPT-5 Report (Aug 2025)",
      "url": "https://evaluations.metr.org/gpt-5-report/",
      "resourceId": "7457262d461e2206",
      "resourceTitle": "evaluations.metr.org"
    },
    {
      "text": "METR Claude/o1 Update (Jan 2025)",
      "url": "https://metr.org/blog/2025-01-31-update-sonnet-o1-evals/",
      "resourceId": "89b92e6423256fc4",
      "resourceTitle": "METR's research"
    },
    {
      "text": "METR Rogue Replication Threat Model",
      "url": "https://metr.org/blog/2024-11-12-rogue-replication-threat-model/",
      "resourceId": "5b45342b68bf627e",
      "resourceTitle": "The Rogue Replication Threat Model"
    },
    {
      "text": "Anthropic Research Directions 2025",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    },
    {
      "text": "Anthropic ASL-3 Announcement",
      "url": "https://www.anthropic.com/news/activating-asl3-protections",
      "resourceId": "7512ddb574f82249"
    },
    {
      "text": "Palo Alto Unit 42",
      "url": "https://unit42.paloaltonetworks.com/agentic-ai-threats/",
      "resourceId": "d6f4face14780e85",
      "resourceTitle": "EchoLeak exploit (CVE-2025-32711)"
    },
    {
      "text": "Grand View Research",
      "url": "https://www.grandviewresearch.com/industry-analysis/ai-agents-market-report",
      "resourceId": "9a0353b668d6ab37",
      "resourceTitle": "\"AI Agents Market Report.\" Grand View Research."
    },
    {
      "text": "METR",
      "url": "https://metr.org/",
      "resourceId": "45370a5153534152",
      "resourceTitle": "metr.org"
    },
    {
      "text": "UK AI Safety Institute",
      "url": "https://www.aisi.gov.uk/",
      "resourceId": "fdf68a8f30f57dee",
      "resourceTitle": "AI Safety Institute"
    },
    {
      "text": "Anthropic",
      "url": "https://www.anthropic.com/",
      "resourceId": "afe2508ac4caf5ee",
      "resourceTitle": "Anthropic"
    },
    {
      "text": "OpenAI",
      "url": "https://openai.com/safety/",
      "resourceId": "838d7a59a02e11a7",
      "resourceTitle": "OpenAI Safety Updates"
    },
    {
      "text": "Palo Alto Unit 42",
      "url": "https://unit42.paloaltonetworks.com/agentic-ai-threats/",
      "resourceId": "d6f4face14780e85",
      "resourceTitle": "EchoLeak exploit (CVE-2025-32711)"
    },
    {
      "text": "Anthropic",
      "url": "https://alignment.anthropic.com/2025/recommended-directions/",
      "resourceId": "7ae6b3be2d2043c1",
      "resourceTitle": "Anthropic: Recommended Directions for AI Safety Research"
    }
  ],
  "unconvertedLinkCount": 30,
  "convertedLinkCount": 0,
  "backlinkCount": 4,
  "hallucinationRisk": {
    "level": "low",
    "score": 25,
    "factors": [
      "no-citations",
      "high-rigor",
      "conceptual-content",
      "high-quality"
    ]
  },
  "entityType": "approach",
  "redundancy": {
    "maxSimilarity": 19,
    "similarPages": [
      {
        "id": "tool-restrictions",
        "title": "Tool-Use Restrictions",
        "path": "/knowledge-base/responses/tool-restrictions/",
        "similarity": 19
      },
      {
        "id": "self-improvement",
        "title": "Self-Improvement and Recursive Enhancement",
        "path": "/knowledge-base/capabilities/self-improvement/",
        "similarity": 16
      },
      {
        "id": "dangerous-cap-evals",
        "title": "Dangerous Capability Evaluations",
        "path": "/knowledge-base/responses/dangerous-cap-evals/",
        "similarity": 16
      },
      {
        "id": "metr",
        "title": "METR",
        "path": "/knowledge-base/organizations/metr/",
        "similarity": 15
      },
      {
        "id": "sleeper-agent-detection",
        "title": "Sleeper Agent Detection",
        "path": "/knowledge-base/responses/sleeper-agent-detection/",
        "similarity": 15
      }
    ]
  },
  "coverage": {
    "passing": 7,
    "total": 13,
    "targets": {
      "tables": 17,
      "diagrams": 2,
      "internalLinks": 34,
      "externalLinks": 21,
      "footnotes": 13,
      "references": 13
    },
    "actuals": {
      "tables": 30,
      "diagrams": 5,
      "internalLinks": 4,
      "externalLinks": 121,
      "footnotes": 0,
      "references": 12,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "amber",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "amber",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:5 R:8 A:8 C:8"
  },
  "readerRank": 249,
  "researchRank": 430,
  "recommendedScore": 232.61
}
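The record's description treats resource, network, and capability limits as defense-in-depth rather than a single barrier. As a concrete illustration (not drawn from the article itself), here is a minimal Python sketch of the innermost layer: running model-generated code in a child process with hard OS resource caps. The function name and limit values are illustrative choices, and this layer assumes an outer OS-level boundary such as a container, gVisor, or a microVM.

```python
import resource
import subprocess

def run_untrusted(code: str, timeout_s: int = 5) -> subprocess.CompletedProcess:
    """Run model-generated Python with hard resource caps (POSIX only)."""

    def _limits() -> None:
        # Applied in the child just before exec(): cap CPU seconds,
        # address space (512 MiB), and open file descriptors.
        resource.setrlimit(resource.RLIMIT_CPU, (timeout_s, timeout_s))
        resource.setrlimit(resource.RLIMIT_AS, (512 * 2**20, 512 * 2**20))
        resource.setrlimit(resource.RLIMIT_NOFILE, (32, 32))

    return subprocess.run(
        ["python3", "-I", "-c", code],  # -I: isolated mode, ignores env and site dirs
        preexec_fn=_limits,
        capture_output=True,
        text=True,
        timeout=timeout_s,  # wall-clock backstop on top of the CPU-seconds limit
    )
```

Caps like these stop runaway loops and memory exhaustion but do nothing about network egress or filesystem reads, which is exactly why the page frames sandboxing as layered containment rather than one mechanism.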
External Links

{
  "lesswrong": "https://www.lesswrong.com/tag/ai-boxing-containment",
  "wikipedia": "https://en.wikipedia.org/wiki/AI_capability_control",
  "grokipedia": "https://grokipedia.com/page/AI_capability_control"
}
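The description's performance comparison (Firecracker at ~85% of native with hardware isolation, gVisor at ~10% I/O with better compatibility) is easier to ground with a concrete configuration. Below is a hedged sketch, not taken from the article, that generates a minimal Firecracker microVM config. The top-level keys follow Firecracker's documented JSON config-file format; the kernel and rootfs paths are placeholders, and details should be checked against the Firecracker version in use.

```python
import json

# Illustrative only (placeholder paths): one microVM per AI agent session.
FIRECRACKER_CONFIG = {
    "boot-source": {
        "kernel_image_path": "vmlinux",           # uncompressed guest kernel (placeholder)
        "boot_args": "console=ttyS0 reboot=k panic=1",
    },
    "drives": [
        {
            "drive_id": "rootfs",
            "path_on_host": "agent-rootfs.ext4",  # placeholder root filesystem image
            "is_root_device": True,
            "is_read_only": False,
        }
    ],
    "machine-config": {
        "vcpu_count": 1,
        "mem_size_mib": 512,
    },
    # Deliberately no "network-interfaces" section: with no NIC defined,
    # the guest has no network path at all, denial by omission.
}

with open("vm_config.json", "w") as f:
    json.dump(FIRECRACKER_CONFIG, f, indent=2)

# Launched out-of-band, e.g.: firecracker --no-api --config-file vm_config.json
```

The KVM hardware-virtualization boundary is what buys the near-native compute performance cited in the description, at the cost of provisioning a kernel and rootfs image per sandbox.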
Backlinks (4)

| id | title | type | relationship |
|---|---|---|---|
| structured-access | Structured Access / API-Only | approach | — |
| tool-restrictions | Tool-Use Restrictions | approach | — |
| rogue-ai-scenarios | Rogue AI Scenarios | risk | — |
| alignment-deployment-overview | Deployment & Control (Overview) | concept | — |