Large Language Models
language-models · capability · Path: /knowledge-base/capabilities/language-models/
Entity ID (EID): E186
Page Record: database.json — merged from MDX frontmatter + Entity YAML + computed metrics at build time
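A minimal sketch of what such a build-time merge might look like. Library choices (gray-matter, js-yaml) and field names are assumptions for illustration, not the site's actual implementation:

```typescript
// Hypothetical sketch only: the real pipeline is not documented here.
import { readFileSync } from "fs";
import matter from "gray-matter"; // MDX frontmatter parser (assumed choice)
import { load } from "js-yaml";   // Entity YAML parser (assumed choice)

type PageRecord = Record<string, unknown>;

function buildPageRecord(mdxPath: string, yamlPath: string): PageRecord {
  // 1. MDX frontmatter supplies authored metadata (title, llmSummary, ratings).
  const { data: frontmatter, content } = matter(readFileSync(mdxPath, "utf8"));

  // 2. Entity YAML supplies identity fields (EID, entityType, clusters).
  const entity = load(readFileSync(yamlPath, "utf8")) as PageRecord;

  // 3. Metrics are computed from the page body at build time.
  const metrics = {
    wordCount: content.split(/\s+/).filter(Boolean).length,
    internalLinks: (content.match(/<EntityLink\b/g) ?? []).length,
    externalLinks: (content.match(/\]\(https?:\/\//g) ?? []).length,
  };

  // Later sources win on key collisions: frontmatter < entity < metrics.
  return { ...frontmatter, ...entity, ...metrics };
}
```

The merged record for this page: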
{
"id": "language-models",
"numericId": null,
"path": "/knowledge-base/capabilities/language-models/",
"filePath": "knowledge-base/capabilities/language-models.mdx",
"title": "Large Language Models",
"quality": 60,
"readerImportance": 94,
"researchImportance": 76.5,
"tacticalValue": 74,
"contentFormat": "article",
"tractability": null,
"neglectedness": null,
"uncertainty": null,
"causalLevel": null,
"lastUpdated": "2026-03-13",
"dateCreated": "2026-02-15",
"llmSummary": "Comprehensive analysis of LLM capabilities showing rapid progress from GPT-2 (1.5B parameters, 2019) to GPT-5 and Gemini 2.5 (2025), with training costs growing 2.4x annually and projected to exceed \\$1B by 2027. Documents emergence of inference-time scaling paradigm, mechanistic interpretability advances including Gemma Scope 2, multilingual alignment research, factuality benchmarking via FACTS suite, and identifies key safety concerns including 8-45% hallucination rates, persuasion capabilities, and growing autonomous agent capabilities.",
"description": "Foundation models trained on text that demonstrate emergent capabilities and represent the primary driver of current AI capabilities and risks, with rapid progression from GPT-2 (1.5B parameters, 2019) to GPT-5 and Gemini 2.5 (2025) showing predictable scaling laws alongside unpredictable capability emergence",
"ratings": {
"novelty": 4.5,
"rigor": 6.5,
"actionability": 5,
"completeness": 7.5
},
"category": "capabilities",
"subcategory": "core",
"clusters": [
"ai-safety",
"governance"
],
"metrics": {
"wordCount": 7369,
"tableCount": 17,
"diagramCount": 1,
"internalLinks": 60,
"externalLinks": 67,
"footnoteCount": 1,
"bulletRatio": 0.17,
"sectionCount": 55,
"hasOverview": true,
"structuralScore": 15
},
"suggestedQuality": 100,
"updateFrequency": 21,
"evergreen": true,
"wordCount": 7369,
"unconvertedLinks": [
{
"text": "Summarizing Books with Human Feedback (Wu et al., 2021)",
"url": "https://arxiv.org/abs/2109.10862",
"resourceId": "54eec19853953598",
"resourceTitle": "[2109.10862] Recursively Summarizing Books with Human Feedback"
},
{
"text": "TruthfulQA: Measuring How Models Mimic Human Falsehoods (Lin et al., 2022)",
"url": "https://arxiv.org/abs/2109.07958",
"resourceId": "fe2a3307a3dae3e5",
"resourceTitle": "Kenton et al. (2021)"
},
{
"text": "WebGPT: Browser-Assisted Question-Answering with Human Feedback (Nakano et al., 2021)",
"url": "https://arxiv.org/abs/2112.09332",
"resourceId": "3225de3850d36a20",
"resourceTitle": "OpenAI WebGPT behavior"
},
{
"text": "Evaluating Large Language Models Trained on Code (Chen et al., 2021)",
"url": "https://arxiv.org/abs/2107.03374",
"resourceId": "176fdaf24fa29d4c",
"resourceTitle": "Evaluating Large Language Models Trained on Code"
},
{
"text": "Helpful to a Fault (2025)",
"url": "https://arxiv.org/abs/2502.09933",
"resourceId": "92dfabf49665e538",
"resourceTitle": "Helpful to a Fault (2025)."
},
{
"text": "FACTS Grounding",
"url": "https://arxiv.org/abs/2501.03200",
"resourceId": "1f7d271db2f8a756",
"resourceTitle": "Google DeepMind, \"FACTS Grounding: A new benchmark for evaluating the factuality of large language models\" (https://a..."
},
{
"text": "Gemini 2.0",
"url": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
"resourceId": "08aca1a4de71818f",
"resourceTitle": "Gemini 2.0 Flash"
},
{
"text": "GPT-5 System Card",
"url": "https://openai.com/index/gpt-5-system-card/",
"resourceId": "817c3cbf13144f20",
"resourceTitle": "OpenAI."
},
{
"text": "Hello GPT-4o (2024)",
"url": "https://openai.com/index/hello-gpt-4o/",
"resourceId": "ee605bab036068f0",
"resourceTitle": "GPT-4o"
},
{
"text": "GPT-5 System Card",
"url": "https://openai.com/index/gpt-5-system-card/",
"resourceId": "817c3cbf13144f20",
"resourceTitle": "OpenAI."
},
{
"text": "Llama 4 Herd",
"url": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
"resourceId": "05f285e9757b863c",
"resourceTitle": "LlamaFirewall"
},
{
"text": "TruthfulQA",
"url": "https://arxiv.org/abs/2109.07958",
"resourceId": "fe2a3307a3dae3e5",
"resourceTitle": "Kenton et al. (2021)"
},
{
"text": "FACTS Grounding",
"url": "https://arxiv.org/abs/2501.03200",
"resourceId": "1f7d271db2f8a756",
"resourceTitle": "Google DeepMind, \"FACTS Grounding: A new benchmark for evaluating the factuality of large language models\" (https://a..."
},
{
"text": "Vectara",
"url": "https://github.com/vectara/hallucination-leaderboard",
"resourceId": "b44f883dc65dd0e9",
"resourceTitle": "Vectara, \"Hallucination Leaderboard\" (https://github.com/vectara/hallucination-leaderboard)"
},
{
"text": "HalluLens (ACL 2025)",
"url": "https://aclanthology.org/2025.acl-long.1176/",
"resourceId": "351b4e7354e2dc5b",
"resourceTitle": "HalluLens, ACL 2025 (https://aclanthology.org/2025.acl-long.1176/)"
},
{
"text": "CheckIfExist (2025)",
"url": "https://arxiv.org/abs/2502.09802",
"resourceId": "3df9ab6bab412a71",
"resourceTitle": "CheckIfExist authors, \"CheckIfExist: Citation Hallucination Detection in RAG Systems\" (https://arxiv.org/abs/2502.09802)"
},
{
"text": "CheckIfExist",
"url": "https://arxiv.org/abs/2502.09802",
"resourceId": "3df9ab6bab412a71",
"resourceTitle": "CheckIfExist authors, \"CheckIfExist: Citation Hallucination Detection in RAG Systems\" (https://arxiv.org/abs/2502.09802)"
}
],
"unconvertedLinkCount": 17,
"convertedLinkCount": 7,
"backlinkCount": 52,
"hallucinationRisk": {
"level": "high",
"score": 80,
"factors": [
"low-citation-density",
"severe-truncation"
],
"integrityIssues": [
"severe-truncation"
]
},
"entityType": "capability",
"redundancy": {
"maxSimilarity": 22,
"similarPages": [
{
"id": "agentic-ai",
"title": "Agentic AI",
"path": "/knowledge-base/capabilities/agentic-ai/",
"similarity": 22
},
{
"id": "reasoning",
"title": "Reasoning and Planning",
"path": "/knowledge-base/capabilities/reasoning/",
"similarity": 22
},
{
"id": "solutions",
"title": "AI Safety Solution Cruxes",
"path": "/knowledge-base/cruxes/solutions/",
"similarity": 22
},
{
"id": "why-alignment-hard",
"title": "Why Alignment Might Be Hard",
"path": "/knowledge-base/debates/why-alignment-hard/",
"similarity": 22
},
{
"id": "deep-learning-era",
"title": "Deep Learning Revolution (2012-2020)",
"path": "/knowledge-base/history/deep-learning-era/",
"similarity": 21
}
]
},
"changeHistory": [
{
"date": "2026-03-13",
"branch": "auto-update/2026-03-13",
"title": "Auto-improve (standard): Large Language Models",
"summary": "Improved \"Large Language Models\" via standard pipeline (489.5s). Quality score: 71. Issues resolved: Page is truncated — the section 'Cybersecurity Capabilities ; Frontmatter 'lastEdited' is '2026-03-13', which is a future ; Footnote references such as [^rc-ed93], [^rc-e8f4], [^rc-c58.",
"duration": "489.5s",
"cost": "$5-8"
},
{
"date": "2026-03-11",
"branch": "auto-update/2026-03-11",
"title": "Auto-improve (standard): Large Language Models",
"summary": "Improved \"Large Language Models\" via standard pipeline (515.5s). Quality score: 72. Issues resolved: Frontmatter 'lastEdited' is set to '2026-03-11', which is a ; Capability Progression Timeline table contains a row for 'Cl; The 'FACTS Benchmark Suite' row in the Hallucination benchma.",
"duration": "515.5s",
"cost": "$5-8"
},
{
"date": "2026-03-10",
"branch": "auto-update/2026-03-10",
"title": "Auto-improve (standard): Large Language Models",
"summary": "Improved \"Large Language Models\" via standard pipeline (516.9s). Quality score: 71. Issues resolved: Frontmatter 'lastEdited' date is '2026-03-10', which is a fu; Capability Progression Timeline: '<EntityLink id=\"E1030\">Cla; Benchmark Performance Comparison table: TruthfulQA row has '.",
"duration": "516.9s",
"cost": "$5-8"
},
{
"date": "2026-03-09",
"branch": "auto-update/2026-03-09",
"title": "Auto-improve (standard): Large Language Models",
"summary": "Improved \"Large Language Models\" via standard pipeline (510.4s). Quality score: 71. Issues resolved: Page is truncated mid-sentence at the end: '...achieving com; Frontmatter field 'lastEdited' is dated '2026-03-09' which i; Capability Progression Timeline table contains a duplicate E.",
"duration": "510.4s",
"cost": "$5-8"
},
{
"date": "2026-03-07",
"branch": "auto-update/2026-03-07",
"title": "Auto-improve (standard): Large Language Models",
"summary": "Improved \"Large Language Models\" via standard pipeline (1306.4s). Quality score: 71. Issues resolved: Frontmatter: 'llmSummary' contains an escaped dollar sign (\\; Capability Progression Timeline table: 'Claude Opus 4.5' row; Benchmark Performance Comparison table: ARC-AGI-2 row shows .",
"duration": "1306.4s",
"cost": "$5-8"
},
{
"date": "2026-02-18",
"branch": "claude/fix-issue-240-N5irU",
"title": "Surface tacticalValue in /wiki table and score 53 pages",
"summary": "Added `tacticalValue` to `ExploreItem` interface, `getExploreItems()` mappings, the `/wiki` explore table (new sortable \"Tact.\" column), and the card view sort dropdown. Scored 49 new pages with tactical values (4 were already scored), bringing total to 53.",
"model": "sonnet-4",
"duration": "~30min"
}
],
"coverage": {
"passing": 8,
"total": 13,
"targets": {
"tables": 29,
"diagrams": 3,
"internalLinks": 59,
"externalLinks": 37,
"footnotes": 22,
"references": 22
},
"actuals": {
"tables": 17,
"diagrams": 1,
"internalLinks": 60,
"externalLinks": 67,
"footnotes": 1,
"references": 36,
"quotesWithQuotes": 0,
"quotesTotal": 0,
"accuracyChecked": 0,
"accuracyTotal": 0
},
"items": {
"llmSummary": "green",
"schedule": "green",
"entity": "green",
"editHistory": "green",
"overview": "green",
"tables": "amber",
"diagrams": "amber",
"internalLinks": "green",
"externalLinks": "green",
"footnotes": "amber",
"references": "green",
"quotes": "red",
"accuracy": "red"
},
"editHistoryCount": 6,
"ratingsString": "N:4.5 R:6.5 A:5 C:7.5"
},
"readerRank": 4,
"researchRank": 109,
"recommendedScore": 188.86
}
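The coverage block above pairs per-feature targets with actuals and rolls each item up to green, amber, or red; "passing" counts the greens (8 of 13 here). A minimal sketch of a rollup rule consistent with the numbers in this record (names are hypothetical):

```typescript
// Hypothetical reconstruction of the coverage rollup; names are assumed.
// This rule reproduces every numeric status in the record above
// (tables 17/29 → amber, internalLinks 60/59 → green, quotes 0 → red).
type Status = "green" | "amber" | "red";

function coverageStatus(actual: number, target: number): Status {
  if (actual >= target && actual > 0) return "green"; // target met
  if (actual > 0) return "amber";                     // partial coverage
  return "red";                                       // nothing yet
}

// "passing" is then the number of green items out of the total:
const statuses: Status[] = [
  "green", "green", "green", "green", "green", // llmSummary..overview
  "amber", "amber",                            // tables, diagrams
  "green", "green",                            // internal/external links
  "amber", "green",                            // footnotes, references
  "red", "red",                                // quotes, accuracy
];
const passing = statuses.filter((s) => s === "green").length; // 8 of 13
```

External Links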
{
"wikipedia": "https://en.wikipedia.org/wiki/Large_language_model",
"lesswrong": "https://www.lesswrong.com/tag/language-models-llms",
"eaForum": "https://forum.effectivealtruism.org/topics/large-language-models",
"grokipedia": "https://grokipedia.com/page/Large_language_model"
}
Backlinks (52)
| id | title | type | relationship |
|---|---|---|---|
| persuasion | Persuasion and Social Manipulation | capability | — |
| reasoning | Reasoning and Planning | capability | — |
| __index__/knowledge-base/capabilities | AI Capabilities | concept | — |
| large-language-models | Large Language Models | concept | — |
| scientific-research | Scientific Research Capabilities | capability | — |
| situational-awareness | Situational Awareness | capability | — |
| agi-timeline | AGI Timeline | concept | — |
| minimal-scaffolding | Minimal Scaffolding | capability | — |
| neuromorphic | Neuromorphic Hardware | capability | — |
| world-models | World Models + Planning | capability | — |
| bioweapons-ai-uplift | AI Uplift Assessment Model | analysis | — |
| cyberweapons-attack-automation | Autonomous Cyber Attack Timeline | analysis | — |
| deceptive-alignment-decomposition | Deceptive Alignment Decomposition Model | analysis | — |
| instrumental-convergence-framework | Instrumental Convergence Framework | analysis | — |
| power-seeking-conditions | Power-Seeking Emergence Conditions Model | analysis | — |
| anthropic | Anthropic | organization | — |
| bridgewater-aia-labs | Bridgewater AIA Labs | organization | — |
| conjecture | Conjecture | organization | — |
| elicit | Elicit (AI Research Tool) | organization | — |
| lightning-rod-labs | Lightning Rod Labs | organization | — |
| redwood-research | Redwood Research | organization | — |
| ssi | Safe Superintelligence Inc (SSI) | organization | — |
| evan-hubinger | Evan Hubinger | person | — |
| geoffrey-hinton | Geoffrey Hinton | person | — |
| paul-christiano | Paul Christiano | person | — |
| philip-tetlock | Philip Tetlock (Forecasting Pioneer) | person | — |
| robin-hanson | Robin Hanson | person | — |
| yann-lecun | Yann LeCun | person | — |
| ai-forecasting | AI-Augmented Forecasting | approach | — |
| alignment | AI Alignment | approach | — |
| bletchley-declaration | Bletchley Declaration | policy | — |
| cirl | Cooperative IRL (CIRL) | approach | — |
| coordination-tech | AI Governance Coordination Technologies | approach | — |
| deliberation | AI-Assisted Deliberation Platforms | approach | — |
| eu-ai-act | EU AI Act | policy | — |
| research-agendas | AI Alignment Research Agenda Comparison | crux | — |
| reward-modeling | Reward Modeling | approach | — |
| rlhf | RLHF / Constitutional AI | capability | — |
| roastmypost | RoastMyPost | project | — |
| squiggleai | SquiggleAI | project | — |
| standards-bodies | AI Standards Bodies | policy | — |
| texas-traiga | Texas TRAIGA Responsible AI Governance Act | policy | — |
| wikipedia-views | Wikipedia Views | project | — |
| ai-welfare | AI Welfare and Digital Minds | concept | — |
| bioweapons | Bioweapons | risk | — |
| emergent-capabilities | Emergent Capabilities | risk | — |
| fraud | AI-Powered Fraud | risk | — |
| historical-revisionism | Historical Revisionism | risk | — |
| mesa-optimization | Mesa-Optimization | risk | — |
| reward-hacking | Reward Hacking | risk | — |
| sleeper-agents | Sleeper Agents: Training Deceptive LLMs | risk | — |
| trust-cascade | AI Trust Cascade Failure | risk | — |
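Each row above is a page whose body references this entity. A minimal sketch of how this table (and the record's backlinkCount of 52) might be derived, assuming pages reference entities via <EntityLink id="..."> components as the change history suggests:

```typescript
// Hypothetical sketch; the real backlink computation is not documented here.
interface Page {
  id: string;
  title: string;
  entityType: string;
  body: string; // raw MDX source
}

function backlinksTo(pages: Page[], targetEid: string): Page[] {
  // Match e.g. <EntityLink id="E186"> anywhere in each page body.
  const pattern = new RegExp(`<EntityLink\\s+id="${targetEid}"`);
  return pages.filter((p) => pattern.test(p.body));
}

// For this page (EID E186), the table rows would then be:
// const rows = backlinksTo(allPages, "E186"); // rows.length === 52
```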