Long-Horizon Autonomous Tasks
ID: long-horizon · Type: capability · Path: /knowledge-base/capabilities/long-horizon/
Entity ID (EID): E192
Page Record (database.json) — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
"id": "long-horizon",
"numericId": null,
"path": "/knowledge-base/capabilities/long-horizon/",
"filePath": "knowledge-base/capabilities/long-horizon.mdx",
"title": "Long-Horizon Autonomous Tasks",
"quality": 65,
"readerImportance": 55,
"researchImportance": 94.5,
"tacticalValue": null,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "METR research shows AI task completion horizons doubling every 7 months (accelerated to 4 months in 2024-2025), with current frontier models achieving ~1 hour autonomous operation at 50% success; Claude Opus 4.5 reaches 80.9% on SWE-bench Verified. Multi-day autonomy projected for 2026-2027 represents critical safety threshold where oversight breaks down (100-1000x decision volume increase) and power accumulation pathways emerge, while 80% of organizations already report risky agent behaviors.",
"description": "AI systems capable of autonomous operation over extended periods (hours to weeks), representing a critical transition from AI-as-tool to AI-as-agent with major safety implications including breakdown of oversight mechanisms and potential for power accumulation. METR research shows task horizons doubling every 7 months; Claude 3.7 achieves ~1 hour tasks while Claude Opus 4.5 reaches 80.9% on SWE-bench Verified.",
"ratings": {
"novelty": 5.5,
"rigor": 7,
"actionability": 6.5,
"completeness": 7.5
},
"category": "capabilities",
"subcategory": "agentic",
"clusters": [
"ai-safety"
],
"metrics": {
"wordCount": 2680,
"tableCount": 21,
"diagramCount": 1,
"internalLinks": 51,
"externalLinks": 37,
"footnoteCount": 0,
"bulletRatio": 0.15,
"sectionCount": 33,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 2680,
"unconvertedLinks": [
{
"text": "METR 2025",
"url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
"resourceId": "271fc5f73a8304b2",
"resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
},
{
"text": "OpenAI",
"url": "https://openai.com/index/introducing-swe-bench-verified/",
"resourceId": "e1f512a932def9e2",
"resourceTitle": "SWE-bench Verified - OpenAI"
},
{
"text": "McKinsey 2025",
"url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
"resourceId": "73b5426488075245",
"resourceTitle": "agentic AI market"
},
{
"text": "anthropic.com",
"url": "https://www.anthropic.com/news/claude-opus-4-5",
"resourceId": "57f01cae307e1cb1"
},
{
"text": "METR",
"url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
"resourceId": "271fc5f73a8304b2",
"resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
},
{
"text": "McKinsey's 2025 analysis",
"url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
"resourceId": "73b5426488075245",
"resourceTitle": "agentic AI market"
},
{
"text": "Anthropic",
"url": "https://www.anthropic.com/news/3-5-models-and-computer-use",
"resourceId": "9e4ef9c155b6d9f3"
},
{
"text": "Scale AI leaderboard",
"url": "https://scale.com/leaderboard/swe_bench_pro_public",
"resourceId": "9dbe484d48b6787a",
"resourceTitle": "SWE-bench Pro Leaderboard - Scale AI"
},
{
"text": "Scale AI",
"url": "https://scale.com/leaderboard/swe_bench_pro_public",
"resourceId": "9dbe484d48b6787a",
"resourceTitle": "SWE-bench Pro Leaderboard - Scale AI"
},
{
"text": "OpenAI",
"url": "https://openai.com/index/introducing-swe-bench-verified/",
"resourceId": "e1f512a932def9e2",
"resourceTitle": "SWE-bench Verified - OpenAI"
},
{
"text": "NIST AI RMF",
"url": "https://www.nist.gov/itl/ai-risk-management-framework",
"resourceId": "54dbc15413425997",
"resourceTitle": "NIST AI Risk Management Framework"
},
{
"text": "METR's March 2025 study",
"url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
"resourceId": "271fc5f73a8304b2",
"resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
},
{
"text": "METR",
"url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
"resourceId": "271fc5f73a8304b2",
"resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
},
{
"text": "McKinsey",
"url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
"resourceId": "73b5426488075245",
"resourceTitle": "agentic AI market"
},
{
"text": "METR (2025)",
"url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
"resourceId": "271fc5f73a8304b2",
"resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
},
{
"text": "Anthropic (2024)",
"url": "https://www.anthropic.com/news/3-5-models-and-computer-use",
"resourceId": "9e4ef9c155b6d9f3"
},
{
"text": "McKinsey (2025)",
"url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
"resourceId": "73b5426488075245",
"resourceTitle": "agentic AI market"
},
{
"text": "METR HCAST",
"url": "https://arxiv.org/html/2503.14499v1",
"resourceId": "324cd2230cbea396",
"resourceTitle": "Measuring AI Long Tasks - arXiv"
},
{
"text": "NIST",
"url": "https://www.nist.gov/itl/ai-risk-management-framework",
"resourceId": "54dbc15413425997",
"resourceTitle": "NIST AI Risk Management Framework"
},
{
"text": "SWE-bench Pro",
"url": "https://scale.com/leaderboard/swe_bench_pro_public",
"resourceId": "9dbe484d48b6787a",
"resourceTitle": "SWE-bench Pro Leaderboard - Scale AI"
}
],
"unconvertedLinkCount": 20,
"convertedLinkCount": 40,
"backlinkCount": 2,
"hallucinationRisk": {
"level": "medium",
"score": 40,
"factors": [
"no-citations",
"high-rigor"
]
},
"entityType": "capability",
"redundancy": {
"maxSimilarity": 17,
"similarPages": [
{
"id": "power-seeking-conditions",
"title": "Power-Seeking Emergence Conditions Model",
"path": "/knowledge-base/models/power-seeking-conditions/",
"similarity": 17
},
{
"id": "corrigibility-failure-pathways",
"title": "Corrigibility Failure Pathways",
"path": "/knowledge-base/models/corrigibility-failure-pathways/",
"similarity": 16
},
{
"id": "coding",
"title": "Autonomous Coding",
"path": "/knowledge-base/capabilities/coding/",
"similarity": 15
},
{
"id": "agi-development",
"title": "AGI Development",
"path": "/knowledge-base/forecasting/agi-development/",
"similarity": 15
},
{
"id": "large-language-models",
"title": "Large Language Models",
"path": "/knowledge-base/capabilities/large-language-models/",
"similarity": 14
}
]
},
"coverage": {
"passing": 9,
"total": 13,
"targets": {
"tables": 11,
"diagrams": 1,
"internalLinks": 21,
"externalLinks": 13,
"footnotes": 8,
"references": 8
},
"actuals": {
"tables": 21,
"diagrams": 1,
"internalLinks": 51,
"externalLinks": 37,
"footnotes": 0,
"references": 39,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "red",
"overview": "green",
"tables": "green",
"diagrams": "green",
"internalLinks": "green",
"externalLinks": "green",
"footnotes": "red",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"ratingsString": "N:5.5 R:7 A:6.5 C:7.5"
},
"readerRank": 264,
"researchRank": 4,
"recommendedScore": 179.28
}
External Links
No external links
Backlinks (2)
| id | title | type | relationship |
|---|---|---|---|
| __index__/knowledge-base/capabilities | AI Capabilities | concept | — |
| agi-timeline | AGI Timeline | concept | — |