Longterm Wiki

Long-Horizon Autonomous Tasks

long-horizon · capability · Path: /knowledge-base/capabilities/long-horizon/
E192 · Entity ID (EID)
← Back to page · 2 backlinks · Quality: 65 · Updated: 2026-03-13
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
{
  "id": "long-horizon",
  "numericId": null,
  "path": "/knowledge-base/capabilities/long-horizon/",
  "filePath": "knowledge-base/capabilities/long-horizon.mdx",
  "title": "Long-Horizon Autonomous Tasks",
  "quality": 65,
  "readerImportance": 55,
  "researchImportance": 94.5,
  "tacticalValue": null,
  "contentFormat": "article",
  "tractability": null,
  "neglectedness": null,
  "uncertainty": null,
  "causalLevel": null,
  "lastUpdated": "2026-03-13",
  "dateCreated": "2026-02-15",
  "llmSummary": "METR research shows AI task completion horizons doubling every 7 months (accelerated to 4 months in 2024-2025), with current frontier models achieving ~1 hour autonomous operation at 50% success; Claude Opus 4.5 reaches 80.9% on SWE-bench Verified. Multi-day autonomy projected for 2026-2027 represents critical safety threshold where oversight breaks down (100-1000x decision volume increase) and power accumulation pathways emerge, while 80% of organizations already report risky agent behaviors.",
  "description": "AI systems capable of autonomous operation over extended periods (hours to weeks), representing a critical transition from AI-as-tool to AI-as-agent with major safety implications including breakdown of oversight mechanisms and potential for power accumulation. METR research shows task horizons doubling every 7 months; Claude 3.7 achieves ~1 hour tasks while Claude Opus 4.5 reaches 80.9% on SWE-bench Verified.",
  "ratings": {
    "novelty": 5.5,
    "rigor": 7,
    "actionability": 6.5,
    "completeness": 7.5
  },
  "category": "capabilities",
  "subcategory": "agentic",
  "clusters": [
    "ai-safety"
  ],
  "metrics": {
    "wordCount": 2680,
    "tableCount": 21,
    "diagramCount": 1,
    "internalLinks": 51,
    "externalLinks": 37,
    "footnoteCount": 0,
    "bulletRatio": 0.15,
    "sectionCount": 33,
    "hasOverview": true,
    "structuralScore": 15
  },
  "suggestedQuality": 100,
  "updateFrequency": 21,
  "evergreen": true,
  "wordCount": 2680,
  "unconvertedLinks": [
    {
      "text": "METR 2025",
      "url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
      "resourceId": "271fc5f73a8304b2",
      "resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
    },
    {
      "text": "OpenAI",
      "url": "https://openai.com/index/introducing-swe-bench-verified/",
      "resourceId": "e1f512a932def9e2",
      "resourceTitle": "SWE-bench Verified - OpenAI"
    },
    {
      "text": "McKinsey 2025",
      "url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
      "resourceId": "73b5426488075245",
      "resourceTitle": "agentic AI market"
    },
    {
      "text": "anthropic.com",
      "url": "https://www.anthropic.com/news/claude-opus-4-5",
      "resourceId": "57f01cae307e1cb1"
    },
    {
      "text": "METR",
      "url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
      "resourceId": "271fc5f73a8304b2",
      "resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
    },
    {
      "text": "McKinsey's 2025 analysis",
      "url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
      "resourceId": "73b5426488075245",
      "resourceTitle": "agentic AI market"
    },
    {
      "text": "Anthropic",
      "url": "https://www.anthropic.com/news/3-5-models-and-computer-use",
      "resourceId": "9e4ef9c155b6d9f3"
    },
    {
      "text": "Scale AI leaderboard",
      "url": "https://scale.com/leaderboard/swe_bench_pro_public",
      "resourceId": "9dbe484d48b6787a",
      "resourceTitle": "SWE-bench Pro Leaderboard - Scale AI"
    },
    {
      "text": "Scale AI",
      "url": "https://scale.com/leaderboard/swe_bench_pro_public",
      "resourceId": "9dbe484d48b6787a",
      "resourceTitle": "SWE-bench Pro Leaderboard - Scale AI"
    },
    {
      "text": "OpenAI",
      "url": "https://openai.com/index/introducing-swe-bench-verified/",
      "resourceId": "e1f512a932def9e2",
      "resourceTitle": "SWE-bench Verified - OpenAI"
    },
    {
      "text": "NIST AI RMF",
      "url": "https://www.nist.gov/itl/ai-risk-management-framework",
      "resourceId": "54dbc15413425997",
      "resourceTitle": "NIST AI Risk Management Framework"
    },
    {
      "text": "METR's March 2025 study",
      "url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
      "resourceId": "271fc5f73a8304b2",
      "resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
    },
    {
      "text": "METR",
      "url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
      "resourceId": "271fc5f73a8304b2",
      "resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
    },
    {
      "text": "McKinsey",
      "url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
      "resourceId": "73b5426488075245",
      "resourceTitle": "agentic AI market"
    },
    {
      "text": "METR (2025)",
      "url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
      "resourceId": "271fc5f73a8304b2",
      "resourceTitle": "Measuring AI Ability to Complete Long Tasks - METR"
    },
    {
      "text": "Anthropic (2024)",
      "url": "https://www.anthropic.com/news/3-5-models-and-computer-use",
      "resourceId": "9e4ef9c155b6d9f3"
    },
    {
      "text": "McKinsey (2025)",
      "url": "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/deploying-agentic-ai-with-safety-and-security-a-playbook-for-technology-leaders",
      "resourceId": "73b5426488075245",
      "resourceTitle": "agentic AI market"
    },
    {
      "text": "METR HCAST",
      "url": "https://arxiv.org/html/2503.14499v1",
      "resourceId": "324cd2230cbea396",
      "resourceTitle": "Measuring AI Long Tasks - arXiv"
    },
    {
      "text": "NIST",
      "url": "https://www.nist.gov/itl/ai-risk-management-framework",
      "resourceId": "54dbc15413425997",
      "resourceTitle": "NIST AI Risk Management Framework"
    },
    {
      "text": "SWE-bench Pro",
      "url": "https://scale.com/leaderboard/swe_bench_pro_public",
      "resourceId": "9dbe484d48b6787a",
      "resourceTitle": "SWE-bench Pro Leaderboard - Scale AI"
    }
  ],
  "unconvertedLinkCount": 20,
  "convertedLinkCount": 40,
  "backlinkCount": 2,
  "hallucinationRisk": {
    "level": "medium",
    "score": 40,
    "factors": [
      "no-citations",
      "high-rigor"
    ]
  },
  "entityType": "capability",
  "redundancy": {
    "maxSimilarity": 17,
    "similarPages": [
      {
        "id": "power-seeking-conditions",
        "title": "Power-Seeking Emergence Conditions Model",
        "path": "/knowledge-base/models/power-seeking-conditions/",
        "similarity": 17
      },
      {
        "id": "corrigibility-failure-pathways",
        "title": "Corrigibility Failure Pathways",
        "path": "/knowledge-base/models/corrigibility-failure-pathways/",
        "similarity": 16
      },
      {
        "id": "coding",
        "title": "Autonomous Coding",
        "path": "/knowledge-base/capabilities/coding/",
        "similarity": 15
      },
      {
        "id": "agi-development",
        "title": "AGI Development",
        "path": "/knowledge-base/forecasting/agi-development/",
        "similarity": 15
      },
      {
        "id": "large-language-models",
        "title": "Large Language Models",
        "path": "/knowledge-base/capabilities/large-language-models/",
        "similarity": 14
      }
    ]
  },
  "coverage": {
    "passing": 9,
    "total": 13,
    "targets": {
      "tables": 11,
      "diagrams": 1,
      "internalLinks": 21,
      "externalLinks": 13,
      "footnotes": 8,
      "references": 8
    },
    "actuals": {
      "tables": 21,
      "diagrams": 1,
      "internalLinks": 51,
      "externalLinks": 37,
      "footnotes": 0,
      "references": 39,
      "quotesWithQuotes": 0,
      "quotesTotal": 0,
      "accuracyChecked": 0,
      "accuracyTotal": 0
    },
    "items": {
      "llmSummary": "green",
      "schedule": "green",
      "entity": "green",
      "editHistory": "red",
      "overview": "green",
      "tables": "green",
      "diagrams": "green",
      "internalLinks": "green",
      "externalLinks": "green",
      "footnotes": "red",
      "references": "green",
      "quotes": "red",
      "accuracy": "red"
    },
    "ratingsString": "N:5.5 R:7 A:6.5 C:7.5"
  },
  "readerRank": 264,
  "researchRank": 4,
  "recommendedScore": 179.28
}
External Links

No external links

Backlinks (2)
id | title | type | relationship
__index__/knowledge-base/capabilities | AI Capabilities | concept |
agi-timeline | AGI Timeline | concept |
Longterm Wiki