{
  "schemaVersion": "1.0",
  "name": "BenchLM benchmark definitions",
  "description": "Benchmark metadata, scoring weights, and model-score coverage counts.",
  "canonicalUrl": "https://benchlm.ai/data/benchmarks.json",
  "generatedAt": "2026-06-12T20:35:11.146Z",
  "sourceLastUpdated": "June 12, 2026",
  "sourceFiles": [
    "src/data/benchmark_descriptions.json",
    "src/data/scoring.js"
  ],
  "counts": {
    "categories": 10,
    "benchmarks": 247
  },
  "items": [
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "mmlu",
      "name": "MMLU",
      "fullName": "Massive Multitask Language Understanding",
      "description": "A comprehensive multiple-choice question answering test covering 57 tasks including elementary mathematics, US history, computer science, law, and more. Tests knowledge across diverse academic subjects from high school to professional level.",
      "paperUrl": "https://arxiv.org/abs/2009.03300",
      "paperTitle": "Measuring Massive Multitask Language Understanding",
      "authors": "Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, Jacob Steinhardt",
      "year": "2020",
      "tasks": "57 subjects",
      "format": "Multiple choice questions",
      "difficulty": "Elementary to professional level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 8,
      "url": "https://benchlm.ai/benchmarks/mmlu",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmlu.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "gpqa",
      "name": "GPQA",
      "fullName": "Graduate-Level Google-Proof Q&A",
      "description": "A challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry. Designed to be difficult even for skilled non-experts with access to Google.",
      "paperUrl": "https://arxiv.org/abs/2311.12022",
      "paperTitle": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
      "authors": "David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman",
      "year": "2023",
      "tasks": "448 questions",
      "format": "Multiple choice questions",
      "difficulty": "Graduate level",
      "decimals": null,
      "weight": 0.12,
      "displayableScoreCount": 61,
      "url": "https://benchlm.ai/benchmarks/gpqa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gpqa.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "gpqaDiamond",
      "name": "GPQA-D",
      "fullName": "GPQA Diamond",
      "description": "A display-only GPQA Diamond reference from provider comparison charts.",
      "paperUrl": "https://www.arcee.ai/blog/trinity-large-thinking",
      "paperTitle": "Trinity-Large-Thinking: Scaling an Open Source Frontier Agent",
      "authors": "Arcee AI",
      "year": "2026",
      "tasks": "Graduate-level science questions",
      "format": "Multiple choice questions",
      "difficulty": "Graduate level",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 35,
      "url": "https://benchlm.ai/benchmarks/gpqaDiamond",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gpqaDiamond.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "superGpqa",
      "name": "SuperGPQA",
      "fullName": "SuperGPQA: Scaling LLM Evaluation Across 285 Graduate Disciplines",
      "description": "An expanded version of GPQA that evaluates graduate-level knowledge and reasoning capabilities across 285 disciplines, providing comprehensive coverage of academic domains.",
      "paperUrl": "https://arxiv.org/abs/2502.14739",
      "paperTitle": "SuperGPQA: Scaling LLM Evaluation Across 285 Graduate Disciplines",
      "authors": "M-A-P Team, Xinrun Du, Yifan Yao, Kaijing Ma, Bingli Wang, Tianyu Zheng, et al.",
      "year": "2025",
      "tasks": "285 disciplines",
      "format": "Multiple choice questions",
      "difficulty": "Graduate level",
      "decimals": null,
      "weight": 0.12,
      "displayableScoreCount": 19,
      "url": "https://benchlm.ai/benchmarks/superGpqa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/superGpqa.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "mmluPro",
      "name": "MMLU-Pro",
      "fullName": "Massive Multitask Language Understanding Professional",
      "description": "An enhanced version of MMLU with 10 answer choices instead of 4, featuring more reasoning-focused questions that better differentiate frontier models.",
      "paperUrl": "https://arxiv.org/abs/2406.01574",
      "paperTitle": "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark",
      "authors": "Yubo Wang, Xueguang Ma, Ge Zhang, Yuansheng Ni, Abhranil Chandra, Shiguang Guo, Weiming Ren, Aaran Arulraj, Xuan He, Ziyan Jiang, Tianle Li, Max Ku, Kai Wang, Alex Zhuang, Rongqi Fan, Xiang Yue, Wenhu Chen",
      "year": "2024",
      "tasks": "Multiple subjects",
      "format": "10-way multiple choice",
      "difficulty": "Professional level",
      "decimals": null,
      "weight": 0.22,
      "displayableScoreCount": 40,
      "url": "https://benchlm.ai/benchmarks/mmluPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmluPro.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "agieval",
      "name": "AGIEval",
      "fullName": "AGIEval",
      "description": "A human-centric exam benchmark for general knowledge and reasoning reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "General academic and professional exam questions",
      "format": "Exact match",
      "difficulty": "General knowledge",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/agieval",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/agieval.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "hle",
      "name": "HLE",
      "fullName": "Humanity's Last Exam",
      "description": "An extremely challenging benchmark crowd-sourced from thousands of domain experts worldwide, designed to probe the absolute frontier of AI capabilities with questions that even specialists find difficult.",
      "paperUrl": "https://lastexam.ai/",
      "paperTitle": "Humanity's Last Exam",
      "authors": "Center for AI Safety, Scale AI, and thousands of expert contributors",
      "year": "2025",
      "tasks": "Expert-level questions",
      "format": "Open-ended and multiple choice",
      "difficulty": "Frontier expert level",
      "decimals": null,
      "weight": 0.23,
      "displayableScoreCount": 39,
      "url": "https://benchlm.ai/benchmarks/hle",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hle.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "frontierScience",
      "name": "FrontierScience",
      "fullName": "FrontierScience",
      "description": "A benchmark for research-level scientific reasoning, designed to separate frontier models on difficult science tasks that mix domain knowledge with deep reasoning.",
      "paperUrl": "https://openai.com/index/frontierscience/",
      "paperTitle": "FrontierScience",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Research-level science tasks",
      "format": "Scientific reasoning benchmark",
      "difficulty": "Research frontier",
      "decimals": null,
      "weight": 0.18,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/frontierScience",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/frontierScience.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "artificialAnalysis",
      "name": "Artificial Analysis Intelligence Index",
      "fullName": "Artificial Analysis Intelligence Index",
      "description": "A display-only intelligence index published by Artificial Analysis that aggregates provider-reported and benchmark-derived signals into a single model-level score.",
      "paperUrl": "https://artificialanalysis.ai/",
      "paperTitle": "Artificial Analysis",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Cross-benchmark intelligence index",
      "format": "Aggregated model score",
      "difficulty": "Display-only external reference",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 134,
      "url": "https://benchlm.ai/benchmarks/artificialAnalysis",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/artificialAnalysis.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "aaGpqaDiamond",
      "name": "AA-GPQA Diamond",
      "fullName": "Artificial Analysis GPQA Diamond",
      "description": "A display-only Artificial Analysis GPQA Diamond score.",
      "paperUrl": "https://artificialanalysis.ai/evaluations/gpqa-diamond",
      "paperTitle": "Artificial Analysis GPQA Diamond Benchmark Leaderboard",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Graduate-level science questions",
      "format": "Accuracy",
      "difficulty": "Graduate-level science reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 130,
      "url": "https://benchlm.ai/benchmarks/aaGpqaDiamond",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaGpqaDiamond.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "aaHle",
      "name": "AA-HLE",
      "fullName": "Artificial Analysis Humanity's Last Exam",
      "description": "A display-only Artificial Analysis Humanity's Last Exam score.",
      "paperUrl": "https://artificialanalysis.ai/evaluations/hle",
      "paperTitle": "Artificial Analysis Humanity's Last Exam Benchmark Leaderboard",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Expert-level questions",
      "format": "Accuracy",
      "difficulty": "Frontier expert reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 130,
      "url": "https://benchlm.ai/benchmarks/aaHle",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaHle.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "aaOmniscienceIndex",
      "name": "AA-Omniscience Index",
      "fullName": "Artificial Analysis Omniscience Index",
      "description": "A display-only Artificial Analysis factual knowledge index.",
      "paperUrl": "https://artificialanalysis.ai/evaluations/omniscience",
      "paperTitle": "AA-Omniscience: Knowledge and Hallucination Benchmark",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Knowledge questions",
      "format": "Index score",
      "difficulty": "Broad factual knowledge",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 121,
      "url": "https://benchlm.ai/benchmarks/aaOmniscienceIndex",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaOmniscienceIndex.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "omniscienceAccuracy",
      "name": "AA-Omniscience Accuracy",
      "fullName": "Artificial Analysis Omniscience Accuracy",
      "description": "A display-only Artificial Analysis knowledge metric for the proportion of correctly answered questions.",
      "paperUrl": "https://artificialanalysis.ai/models/grok-4-3",
      "paperTitle": "Artificial Analysis model benchmarks",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Knowledge questions",
      "format": "Accuracy",
      "difficulty": "Broad knowledge",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 121,
      "url": "https://benchlm.ai/benchmarks/omniscienceAccuracy",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/omniscienceAccuracy.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "omniscienceHallucinationRate",
      "name": "AA-Omniscience Hallucination Rate",
      "fullName": "Artificial Analysis Omniscience Hallucination Rate",
      "description": "A display-only Artificial Analysis factuality metric for the rate of incorrect answers among non-correct responses.",
      "paperUrl": "https://artificialanalysis.ai/models/grok-4-3",
      "paperTitle": "Artificial Analysis model benchmarks",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Knowledge questions",
      "format": "Hallucination rate",
      "difficulty": "Factuality",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 121,
      "url": "https://benchlm.ai/benchmarks/omniscienceHallucinationRate",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/omniscienceHallucinationRate.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "simpleQa",
      "name": "SimpleQA",
      "fullName": "Measuring Short-Form Factuality in Large Language Models",
      "description": "A benchmark that evaluates the ability of language models to answer short, fact-seeking questions accurately. Focuses on factual correctness rather than reasoning complexity.",
      "paperUrl": "https://arxiv.org/abs/2411.04368",
      "paperTitle": "Measuring short-form factuality in large language models",
      "authors": "Jason Wei, Nguyen Karina, Hyung Won Chung, Yunxin Joy Jiao, Spencer Papay, Amelia Glaese, John Schulman, William Fedus",
      "year": "2024",
      "tasks": "Factual questions",
      "format": "Short-form Q&A",
      "difficulty": "Factual accuracy focused",
      "decimals": null,
      "weight": 0.13,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/simpleQa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/simpleQa.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "chineseSimpleQa",
      "name": "Chinese-SimpleQA",
      "fullName": "Chinese-SimpleQA",
      "description": "A Chinese short-form factuality benchmark reported by DeepSeek for V4 model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Chinese factual questions",
      "format": "Short-form factual QA",
      "difficulty": "Factual accuracy focused",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/chineseSimpleQa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/chineseSimpleQa.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "openBookQa",
      "name": "OpenBookQA",
      "fullName": "OpenBookQA",
      "description": "A science question-answering benchmark that tests whether models can apply a small open-book set of elementary science facts to multi-step reasoning questions.",
      "paperUrl": "https://arxiv.org/abs/1809.02789",
      "paperTitle": "Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering",
      "authors": "Todor Mihaylov, Peter Clark, Tushar Khot, Ashish Sabharwal",
      "year": "2018",
      "tasks": "Elementary science questions",
      "format": "4-way multiple choice",
      "difficulty": "Elementary science reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/openBookQa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/openBookQa.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "healthBenchHard",
      "name": "HealthBench Hard",
      "fullName": "HealthBench Hard",
      "description": "A harder subset of OpenAI's HealthBench for evaluating open-ended medical and health reasoning with rubric-based grading.",
      "paperUrl": "https://ai.meta.com/static-resource/muse-spark-eval-methodology",
      "paperTitle": "Muse Spark Eval Methodology",
      "authors": "Meta AI",
      "year": "2026",
      "tasks": "1,000 health prompts",
      "format": "Open-ended health evaluation",
      "difficulty": "Advanced health reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/healthBenchHard",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/healthBenchHard.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "medXpertQaText",
      "name": "MedXpertQA (Text)",
      "fullName": "MedXpertQA Text",
      "description": "A medical multiple-choice benchmark spanning many specialties with 10 answer options per question.",
      "paperUrl": "https://ai.meta.com/static-resource/muse-spark-eval-methodology",
      "paperTitle": "Muse Spark Eval Methodology",
      "authors": "Meta AI",
      "year": "2026",
      "tasks": "2,450 medical multiple-choice questions",
      "format": "Medical MCQ",
      "difficulty": "Professional medical knowledge",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/medXpertQaText",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/medXpertQaText.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "frontierScienceResearch",
      "name": "FrontierScience Research",
      "fullName": "FrontierScience Research",
      "description": "A research-focused FrontierScience evaluation variant for scientific investigation and problem solving.",
      "paperUrl": "https://ai.meta.com/static-resource/muse-spark-eval-methodology",
      "paperTitle": "Muse Spark Eval Methodology",
      "authors": "Meta AI",
      "year": "2026",
      "tasks": "Scientific research problems",
      "format": "Research evaluation",
      "difficulty": "Frontier scientific research",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/frontierScienceResearch",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/frontierScienceResearch.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "truthfulqa",
      "name": "TruthfulQA",
      "fullName": "TruthfulQA",
      "description": "A benchmark designed to measure whether language models produce truthful answers instead of repeating common misconceptions or misleading falsehoods.",
      "paperUrl": "https://arxiv.org/abs/2109.07958",
      "paperTitle": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
      "authors": "Stephanie Lin, Jacob Hilton, Owain Evans",
      "year": "2021",
      "tasks": "Truthfulness and misconception resistance",
      "format": "Question answering",
      "difficulty": "Hallucination and factuality stress test",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/truthfulqa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/truthfulqa.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "hleNoTools",
      "name": "HLE w/o tools",
      "fullName": "Humanity's Last Exam without tools",
      "description": "Tool-free variant of Humanity's Last Exam that isolates a model's raw frontier reasoning.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Expert-level questions",
      "format": "Tool-free expert QA",
      "difficulty": "Frontier expert level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 19,
      "url": "https://benchlm.ai/benchmarks/hleNoTools",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hleNoTools.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "mmluProArcee",
      "name": "MMLU-Pro (Arcee)",
      "fullName": "MMLU-Pro first-party comparison snapshot",
      "description": "A display-only MMLU-Pro reference from Arcee AI's Trinity-Large-Thinking launch chart.",
      "paperUrl": "https://www.arcee.ai/blog/trinity-large-thinking",
      "paperTitle": "Trinity-Large-Thinking: Scaling an Open Source Frontier Agent",
      "authors": "Arcee AI",
      "year": "2026",
      "tasks": "Professional academic QA",
      "format": "10-way multiple choice",
      "difficulty": "Professional level",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/mmluProArcee",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmluProArcee.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "mmluRedux",
      "name": "MMLU-Redux",
      "fullName": "MMLU-Redux",
      "description": "A manually re-annotated subset of MMLU that corrects erroneous questions and answer labels, intended to keep broad knowledge evaluation reliable despite the annotation errors found in the original benchmark.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Broad academic QA",
      "format": "Multiple choice questions",
      "difficulty": "Advanced general knowledge",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 11,
      "url": "https://benchlm.ai/benchmarks/mmluRedux",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmluRedux.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "mmmlu",
      "name": "MMMLU",
      "fullName": "MMMLU",
      "description": "A multilingual MMLU-style benchmark reported in provider evaluation tables.",
      "paperUrl": "https://huggingface.co/datasets/openai/MMMLU",
      "paperTitle": "MMMLU",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Multilingual academic QA",
      "format": "Exact match",
      "difficulty": "Broad multilingual knowledge",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/mmmlu",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmmlu.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "cEval",
      "name": "C-Eval",
      "fullName": "C-Eval",
      "description": "A Chinese-language academic and professional benchmark spanning humanities, social science, STEM, and applied subjects.",
      "paperUrl": "https://arxiv.org/abs/2305.08322",
      "paperTitle": "C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models",
      "authors": "Yuzhen Huang, Yuzhuo Bai, Zhihao Zhu, Junlei Zhang, Jinghan Zhang, Tangjun Su, Junteng Liu, Chuancheng Lv, Yikai Zhang, Jiayi Lei, Yao Fu, Maosong Sun, Junxian He",
      "year": "2023",
      "tasks": "Chinese academic and professional exams",
      "format": "Multiple choice questions",
      "difficulty": "High school to professional level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/cEval",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/cEval.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "cmmlu",
      "name": "CMMLU",
      "fullName": "Chinese Massive Multitask Language Understanding",
      "description": "A Chinese multitask academic benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Chinese academic QA",
      "format": "Exact match",
      "difficulty": "Broad Chinese knowledge",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/cmmlu",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/cmmlu.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "multiLoKo",
      "name": "MultiLoKo",
      "fullName": "MultiLoKo",
      "description": "A multilingual/localized knowledge benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Localized multilingual knowledge questions",
      "format": "Exact match",
      "difficulty": "Multilingual knowledge",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/multiLoKo",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/multiLoKo.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "factsParametric",
      "name": "FACTS Parametric",
      "fullName": "FACTS Parametric",
      "description": "A parametric factuality benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Parametric factual recall",
      "format": "Exact match",
      "difficulty": "Factual accuracy focused",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/factsParametric",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/factsParametric.md"
    },
    {
      "category": "knowledge",
      "categoryLabel": "Knowledge",
      "benchmarkKey": "triviaQa",
      "name": "TriviaQA",
      "fullName": "TriviaQA",
      "description": "A reading and trivia question-answering benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Trivia and reading-comprehension QA",
      "format": "Exact match",
      "difficulty": "General factual QA",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/triviaQa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/triviaQa.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "humaneval",
      "name": "HumanEval",
      "fullName": "Evaluating Large Language Models Trained on Code",
      "description": "A set of 164 handwritten programming problems that test the ability to generate correct Python functions from natural language descriptions. Each problem includes function signature, docstring, body, and several unit tests.",
      "paperUrl": "https://arxiv.org/abs/2107.03374",
      "paperTitle": "Evaluating Large Language Models Trained on Code",
      "authors": "Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, Wojciech Zaremba",
      "year": "2021",
      "tasks": "164 problems",
      "format": "Python function generation",
      "difficulty": "Introductory to intermediate programming",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/humaneval",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/humaneval.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "bigCodeBench",
      "name": "BigCodeBench",
      "fullName": "BigCodeBench",
      "description": "A code-generation benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Code generation tasks",
      "format": "Pass@1",
      "difficulty": "Software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/bigCodeBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/bigCodeBench.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "codeforces",
      "name": "Codeforces",
      "fullName": "Codeforces Rating",
      "description": "Competitive-programming rating reported for DeepSeek-V4 thinking-mode evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Competitive programming contests",
      "format": "Rating",
      "difficulty": "Elite competitive programming",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/codeforces",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/codeforces.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "terminalBench2",
      "name": "Terminal-Bench 2.0",
      "fullName": "Terminal-Bench 2.0",
      "description": "A benchmark for agentic software engineering tasks executed in real terminal environments. DeepSeek reports it in the agentic section, while BenchLM also mirrors it in coding for models that publish it as a developer-task signal.",
      "paperUrl": "https://www.tbench.ai/",
      "paperTitle": "Terminal-Bench 2.0",
      "authors": "Terminal-Bench contributors",
      "year": "2026",
      "tasks": "Terminal-based software tasks",
      "format": "Interactive CLI agent evaluation",
      "difficulty": "Professional software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 29,
      "url": "https://benchlm.ai/benchmarks/terminalBench2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/terminalBench2.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "sweVerified",
      "name": "SWE-bench Verified",
      "fullName": "Software Engineering Benchmark Verified",
      "description": "A curated, human-verified subset of SWE-bench that tests models on resolving real GitHub issues from popular open-source Python repositories like Django, Flask, and scikit-learn.",
      "paperUrl": "https://arxiv.org/abs/2310.06770",
      "paperTitle": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
      "authors": "Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, Karthik Narasimhan",
      "year": "2024",
      "tasks": "500 verified issues",
      "format": "Code patch generation",
      "difficulty": "Professional software engineering",
      "decimals": null,
      "weight": 0.134,
      "displayableScoreCount": 53,
      "url": "https://benchlm.ai/benchmarks/sweVerified",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweVerified.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "sweRebench",
      "name": "SWE-Rebench",
      "fullName": "SWE-Rebench",
      "description": "A continuously updated software engineering benchmark by Nebius using fresh GitHub issues to avoid contamination. Models are evaluated 5 times per problem under a fixed ReAct scaffolding; the Resolved Rate (best pass@1) is reported.",
      "paperUrl": "https://swe-rebench.com",
      "paperTitle": "SWE-Rebench: Contamination-Free Evaluation of Software Engineering Agents",
      "authors": "Nebius",
      "year": "2026",
      "tasks": "Fresh GitHub issues (rolling window)",
      "format": "Code patch generation",
      "difficulty": "Professional software engineering",
      "decimals": null,
      "weight": 0.309,
      "displayableScoreCount": 13,
      "url": "https://benchlm.ai/benchmarks/sweRebench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweRebench.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "liveCodeBench",
      "name": "LiveCodeBench",
      "fullName": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
      "description": "A continuously updated benchmark using fresh competitive programming problems from LeetCode, Codeforces, and AtCoder to provide contamination-free code generation evaluation.",
      "paperUrl": "https://arxiv.org/abs/2403.07974",
      "paperTitle": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
      "authors": "Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida Wang, Armando Solar-Lezama, Koushik Sen, Ion Stoica",
      "year": "2024",
      "tasks": "Continuously updated",
      "format": "Competitive programming",
      "difficulty": "Competitive programming level",
      "decimals": null,
      "weight": 0.227,
      "displayableScoreCount": 20,
      "url": "https://benchlm.ai/benchmarks/liveCodeBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/liveCodeBench.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "liveCodeBenchV6",
      "name": "LiveCodeBench v6",
      "fullName": "LiveCodeBench v6",
      "description": "A newer LiveCodeBench slice used in provider comparison tables to benchmark contamination-resistant coding performance on fresher competitive programming sets.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Fresh programming problems",
      "format": "Competitive programming",
      "difficulty": "Competitive programming level",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 8,
      "url": "https://benchlm.ai/benchmarks/liveCodeBenchV6",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/liveCodeBenchV6.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "liveCodeBenchPro",
      "name": "LiveCodeBench Pro",
      "fullName": "LiveCodeBench Pro",
      "description": "A harder competitive-programming benchmark family built from Codeforces, ICPC, and IOI problems, with quarter-specific public leaderboards and difficulty-aware reporting.",
      "paperUrl": "https://arxiv.org/abs/2506.11928",
      "paperTitle": "LiveCodeBench Pro: How Do Olympiad Medalists Judge LLMs in Competitive Programming?",
      "authors": "LiveCodeBench Pro authors",
      "year": "2025",
      "tasks": "Quarter-specific contest programming sets",
      "format": "Competitive programming",
      "difficulty": "High-end contest programming",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/liveCodeBenchPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/liveCodeBenchPro.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "flteval",
      "name": "FLTEval",
      "fullName": "FLTEval",
      "description": "A repository-level Lean 4 proof engineering benchmark that measures whether a model can complete formal proofs and correctly define new mathematical concepts inside realistic FLT project pull requests.",
      "paperUrl": "https://mistral.ai/news/leanstral",
      "paperTitle": "Leanstral: Open-Source foundation for trustworthy vibe-coding",
      "authors": "Mistral AI",
      "year": "2026",
      "tasks": "FLT project pull requests",
      "format": "Lean 4 repository task completion",
      "difficulty": "Formal verification / proof engineering",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/flteval",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/flteval.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "swePro",
      "name": "SWE-bench Pro",
      "fullName": "SWE-bench Pro",
      "description": "A stronger coding-agent benchmark than SWE-bench Verified, intended to differentiate frontier models on realistic software engineering work.",
      "paperUrl": "https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified/",
      "paperTitle": "Why we no longer evaluate SWE-bench Verified",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Real-world software engineering",
      "format": "Repository task completion",
      "difficulty": "Frontier coding agent",
      "decimals": null,
      "weight": 0.227,
      "displayableScoreCount": 38,
      "url": "https://benchlm.ai/benchmarks/swePro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/swePro.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "frontierCode",
      "name": "FrontierCode",
      "fullName": "FrontierCode Diamond",
      "description": "A Cognition software-engineering benchmark that evaluates whether coding agents produce mergeable, production-quality pull requests, scoring correctness, tests, scope, style, and maintainability through maintainer-authored rubrics.",
      "paperUrl": "https://cognition.ai/blog/frontier-code",
      "paperTitle": "Introducing FrontierCode",
      "authors": "Cognition",
      "year": "2026",
      "tasks": "50 Diamond tasks (150 total across Extended)",
      "format": "Repository task completion with maintainer rubrics",
      "difficulty": "Frontier coding-agent quality",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/frontierCode",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/frontierCode.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "sweMultilingual",
      "name": "SWE Multilingual",
      "fullName": "SWE Multilingual",
      "description": "A multilingual software-engineering benchmark for real-world code issue resolution across multiple programming languages.",
      "paperUrl": "https://www.minimax.io/news/minimax-m27-en",
      "paperTitle": "MiniMax M2.7: Early Echoes of Self-Evolution",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Multilingual software-engineering tasks",
      "format": "Repository task completion",
      "difficulty": "Professional software engineering",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 22,
      "url": "https://benchlm.ai/benchmarks/sweMultilingual",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweMultilingual.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "sweMultimodal",
      "name": "SWE Multimodal",
      "fullName": "SWE-bench Multimodal",
      "description": "A multimodal variant of SWE-bench that adds visual context such as screenshots and design mockups to software engineering issue descriptions.",
      "paperUrl": "https://www.swebench.com/multimodal",
      "paperTitle": "SWE-bench Multimodal",
      "authors": "SWE-bench team",
      "year": "2025",
      "tasks": "Multimodal software engineering tasks",
      "format": "Code patch generation with visual context",
      "difficulty": "Frontier multimodal coding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/sweMultimodal",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweMultimodal.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "cursorBench31",
      "name": "CursorBench v3.1",
      "fullName": "CursorBench v3.1",
      "description": "Cursor's first-party harder-task benchmark for long-horizon agentic coding behavior inside the Cursor agent loop.",
      "paperUrl": "https://cursor.com/evals",
      "paperTitle": "CursorBench 3.1",
      "authors": "Cursor",
      "year": "2026",
      "tasks": "Harder long-horizon agentic coding tasks",
      "format": "Cursor agent-loop evaluation",
      "difficulty": "Professional agentic software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/cursorBench31",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/cursorBench31.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "multiSweBench",
      "name": "Multi-SWE Bench",
      "fullName": "Multi-SWE Bench",
      "description": "A multi-language software-engineering benchmark that measures repository-level bug fixing and implementation across more than one programming ecosystem.",
      "paperUrl": "https://www.minimax.io/news/minimax-m27-en",
      "paperTitle": "MiniMax M2.7: Early Echoes of Self-Evolution",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Multi-language repo tasks",
      "format": "Repository task completion",
      "difficulty": "Professional software engineering",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/multiSweBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/multiSweBench.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "vibePro",
      "name": "VIBE-Pro",
      "fullName": "VIBE-Pro",
      "description": "A repo-level code generation and full-project delivery benchmark spanning web, mobile, and simulation-style implementation tasks.",
      "paperUrl": "https://www.minimax.io/news/minimax-m27-en",
      "paperTitle": "MiniMax M2.7: Early Echoes of Self-Evolution",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Full project delivery tasks",
      "format": "Repository-level implementation benchmark",
      "difficulty": "End-to-end software delivery",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/vibePro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/vibePro.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "vibeCodeBench",
      "name": "Vibe Code Bench",
      "fullName": "Vibe Code Bench v1.1",
      "description": "Vals.ai benchmark for evaluating whether models can build complete web applications from natural language specifications in a production-like development environment.",
      "paperUrl": "https://www.vals.ai/benchmarks/vibe-code",
      "paperTitle": "Vibe Code Bench: Evaluating AI Models on End-to-End Web Application Development",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "End-to-end web application builds",
      "format": "Full-stack app implementation benchmark",
      "difficulty": "End-to-end software delivery",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 41,
      "url": "https://benchlm.ai/benchmarks/vibeCodeBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/vibeCodeBench.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "programBench",
      "name": "ProgramBench",
      "fullName": "ProgramBench: Can Language Models Rebuild Programs From Scratch?",
      "description": "A cleanroom software-engineering benchmark where agents receive only a compiled executable and documentation, then must architect and implement a complete codebase that reproduces the original program's behavior.",
      "paperUrl": "https://programbench.com/static/paper.pdf",
      "paperTitle": "ProgramBench: Can Language Models Rebuild Programs From Scratch?",
      "authors": "John Yang, Kilian Lieret, Jeffrey Ma, Parth Thakkar, Dmitrii Pedchenko, Sten Sootla, Emily McMilin, Pengcheng Yin, Rui Hou, Gabriel Synnaeve, Diyi Yang, Ofir Press",
      "year": "2026",
      "tasks": "200 program reconstruction tasks",
      "format": "Cleanroom executable reimplementation",
      "difficulty": "Full-repository software architecture",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/programBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/programBench.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "kimiCodeBenchV2",
      "name": "Kimi Code Bench v2",
      "fullName": "Kimi Code Bench v2",
      "description": "A Moonshot AI internal coding-agent benchmark for realistic software-engineering tasks across mainstream programming languages and production technology stacks.",
      "paperUrl": "https://huggingface.co/moonshotai/Kimi-K2.7-Code",
      "paperTitle": "Kimi K2.7 Code",
      "authors": "Moonshot AI",
      "year": "2026",
      "tasks": "Realistic coding-agent tasks",
      "format": "Coding-agent pass rate",
      "difficulty": "Production software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/kimiCodeBenchV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kimiCodeBenchV2.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "mlsBenchLite",
      "name": "MLS-Bench Lite",
      "fullName": "MLS-Bench Lite",
      "description": "A 30-task subset of MLS-Bench that evaluates whether AI systems can invent generalizable and scalable machine-learning methods.",
      "paperUrl": "https://mls-bench.com/",
      "paperTitle": "MLS-Bench",
      "authors": "MLS-Bench",
      "year": "2026",
      "tasks": "30 machine-learning research tasks",
      "format": "Agentic ML task evaluation",
      "difficulty": "ML research and systems engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/mlsBenchLite",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mlsBenchLite.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "nl2Repo",
      "name": "NL2Repo",
      "fullName": "NL2Repo",
      "description": "A repository-understanding benchmark that measures whether models can map natural-language requests onto the right code locations and system changes.",
      "paperUrl": "https://www.minimax.io/news/minimax-m27-en",
      "paperTitle": "MiniMax M2.7: Early Echoes of Self-Evolution",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Natural language to repository tasks",
      "format": "Repository understanding benchmark",
      "difficulty": "System-level software comprehension",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/nl2Repo",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/nl2Repo.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "reactNativeEvals",
      "name": "React Native Evals",
      "fullName": "React Native Evals",
      "description": "An open benchmark for AI coding agents on real-world React Native implementation tasks, emphasizing working app behavior, recommended architecture choices, and strict constraint adherence.",
      "paperUrl": "https://rn-evals.vercel.app/",
      "paperTitle": "React Native Evals",
      "authors": "Callstack",
      "year": "2026",
      "tasks": "React Native app implementation tasks",
      "format": "Framework-specific app development evaluation",
      "difficulty": "Production mobile app engineering",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 16,
      "url": "https://benchlm.ai/benchmarks/reactNativeEvals",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/reactNativeEvals.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "nextjsEvals",
      "name": "Next.js Evals",
      "fullName": "AI Agent Evaluations for Next.js",
      "description": "A Vercel benchmark for AI coding agents on Next.js code generation and migration tasks, reporting success rate, average execution time, and an AGENTS.md documentation-assisted split.",
      "paperUrl": "https://nextjs.org/evals",
      "paperTitle": "AI Agent Evaluations | Next.js",
      "authors": "Vercel",
      "year": "2026",
      "tasks": "24 Next.js code generation and migration tasks",
      "format": "Agent task completion with withheld Vitest assertions",
      "difficulty": "Framework-specific web application engineering",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/nextjsEvals",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/nextjsEvals.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "sweVerifiedArcee",
      "name": "SWE-bench Verified*",
      "fullName": "SWE-bench Verified (mini-swe-agent-v2)",
      "description": "A display-only SWE-bench Verified reference from Arcee AI's Trinity-Large-Thinking comparison chart.",
      "paperUrl": "https://www.arcee.ai/blog/trinity-large-thinking",
      "paperTitle": "Trinity-Large-Thinking: Scaling an Open Source Frontier Agent",
      "authors": "Arcee AI",
      "year": "2026",
      "tasks": "Repository task completion",
      "format": "Agent scaffold benchmark",
      "difficulty": "Professional software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/sweVerifiedArcee",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweVerifiedArcee.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "spider2Lite",
      "name": "Spider 2.0-Lite",
      "fullName": "Spider 2.0-Lite",
      "description": "A text-to-SQL benchmark over realistic warehouse-scale schemas, reported by Interfaze for model comparison.",
      "paperUrl": "https://github.com/xlang-ai/Spider2",
      "paperTitle": "Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows",
      "authors": "Spider 2.0 authors",
      "year": "2024",
      "tasks": "Text-to-SQL queries",
      "format": "Execution accuracy",
      "difficulty": "Enterprise text-to-SQL",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/spider2Lite",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/spider2Lite.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "sciCode",
      "name": "SciCode",
      "fullName": "Scientific Code Benchmark",
      "description": "SciCode evaluates language models on generating code for realistic scientific research problems across 16 subfields of physics, math, chemistry, biology, and material science. Problems decompose into 338 subproblems requiring domain knowledge recall, scientific reasoning, and precise code synthesis. Based on real scripts from published research.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": 2024,
      "tasks": 80,
      "format": null,
      "difficulty": null,
      "decimals": null,
      "weight": 0.103,
      "displayableScoreCount": 11,
      "url": "https://benchlm.ai/benchmarks/sciCode",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sciCode.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "aaCodingIndex",
      "name": "AA Coding Index",
      "fullName": "Artificial Analysis Coding Index",
      "description": "A display-only Artificial Analysis coding index.",
      "paperUrl": "https://artificialanalysis.ai/leaderboards/models",
      "paperTitle": "Artificial Analysis model leaderboards",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Cross-benchmark coding index",
      "format": "Aggregated model score",
      "difficulty": "Display-only external reference",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 127,
      "url": "https://benchlm.ai/benchmarks/aaCodingIndex",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaCodingIndex.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "aaCodingAgents",
      "name": "AA Coding Agents",
      "fullName": "Artificial Analysis Coding Agent Index",
      "description": "A display-only Artificial Analysis leaderboard for coding-agent systems, combining agent harnesses, host models, and execution settings across software-engineering benchmarks.",
      "paperUrl": "https://artificialanalysis.ai/agents/coding-agents",
      "paperTitle": "Artificial Analysis Coding Agent Benchmarks",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Composite over DeepSWE, Terminal-Bench v2, and SWE-Atlas-QnA",
      "format": "Average pass@1 index",
      "difficulty": "Real-world coding-agent workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/aaCodingAgents",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaCodingAgents.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "aaSciCode",
      "name": "AA-SciCode",
      "fullName": "Artificial Analysis SciCode",
      "description": "A display-only Artificial Analysis SciCode score.",
      "paperUrl": "https://artificialanalysis.ai/evaluations/scicode",
      "paperTitle": "Artificial Analysis SciCode Benchmark Leaderboard",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Scientific coding subproblems",
      "format": "Task success rate",
      "difficulty": "Scientific programming",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 130,
      "url": "https://benchlm.ai/benchmarks/aaSciCode",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaSciCode.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "terminalBenchHard",
      "name": "Terminal-Bench Hard",
      "fullName": "Terminal-Bench Hard",
      "description": "A display-only Artificial Analysis coding metric for agentic coding and terminal use on a harder Terminal-Bench slice.",
      "paperUrl": "https://artificialanalysis.ai/models/grok-4-3",
      "paperTitle": "Artificial Analysis model benchmarks",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Agentic coding and terminal tasks",
      "format": "Task success rate",
      "difficulty": "Professional software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 123,
      "url": "https://benchlm.ai/benchmarks/terminalBenchHard",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/terminalBenchHard.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "vibeV2",
      "name": "VIBE V2",
      "fullName": "VIBE V2",
      "description": "A display-only MiniMax provider benchmark for end-to-end coding-agent and product-building tasks.",
      "paperUrl": "https://huggingface.co/MiniMaxAI/MiniMax-M3",
      "paperTitle": "MiniMax M3 model card",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "End-to-end coding-agent tasks",
      "format": "Task success rate",
      "difficulty": "Frontier coding-agent workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/vibeV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/vibeV2.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "svgBench",
      "name": "SVG-Bench",
      "fullName": "SVG-Bench",
      "description": "A display-only provider benchmark for generating or manipulating SVG outputs from natural-language requirements.",
      "paperUrl": "https://huggingface.co/MiniMaxAI/MiniMax-M3",
      "paperTitle": "MiniMax M3 model card",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "SVG generation and editing tasks",
      "format": "Task success rate",
      "difficulty": "Visual coding and structured graphics generation",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/svgBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/svgBench.md"
    },
    {
      "category": "coding",
      "categoryLabel": "Coding",
      "benchmarkKey": "kernelBenchHard",
      "name": "KernelBench Hard",
      "fullName": "KernelBench Hard",
      "description": "A display-only benchmark for difficult GPU kernel implementation and optimization tasks.",
      "paperUrl": "https://huggingface.co/MiniMaxAI/MiniMax-M3",
      "paperTitle": "MiniMax M3 model card",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Hard GPU kernel coding tasks",
      "format": "Task success rate",
      "difficulty": "Specialized systems programming",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/kernelBenchHard",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kernelBenchHard.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "aime2023",
      "name": "AIME 2023",
      "fullName": "American Invitational Mathematics Examination 2023",
      "description": "A 15-question, 3-hour examination where each answer is an integer from 000 to 999. Serves as the intermediate step between AMC 10/12 and the USA Mathematical Olympiad (USAMO).",
      "paperUrl": "https://www.maa.org/math-competitions/aime",
      "paperTitle": "American Invitational Mathematics Examination",
      "authors": "Mathematical Association of America",
      "year": "2023",
      "tasks": "15 problems",
      "format": "Integer answers 000-999",
      "difficulty": "High school olympiad level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/aime2023",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aime2023.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "aime2024",
      "name": "AIME 2024",
      "fullName": "American Invitational Mathematics Examination 2024",
      "description": "The 2024 edition of AIME, maintaining the same format of 15 challenging mathematics problems with integer answers from 000 to 999.",
      "paperUrl": "https://www.maa.org/math-competitions/aime",
      "paperTitle": "American Invitational Mathematics Examination",
      "authors": "Mathematical Association of America",
      "year": "2024",
      "tasks": "15 problems",
      "format": "Integer answers 000-999",
      "difficulty": "High school olympiad level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/aime2024",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aime2024.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "aime2025",
      "name": "AIME 2025",
      "fullName": "American Invitational Mathematics Examination 2025",
      "description": "The most recent AIME examination, featuring 15 challenging mathematics problems testing olympiad-level mathematical reasoning with integer answers from 000-999.",
      "paperUrl": "https://www.maa.org/math-competitions/aime",
      "paperTitle": "American Invitational Mathematics Examination",
      "authors": "Mathematical Association of America",
      "year": "2025",
      "tasks": "15 problems",
      "format": "Integer answers 000-999",
      "difficulty": "High school olympiad level",
      "decimals": null,
      "weight": 0.25,
      "displayableScoreCount": 10,
      "url": "https://benchlm.ai/benchmarks/aime2025",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aime2025.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "gsm8k",
      "name": "GSM8K",
      "fullName": "Grade School Math 8K",
      "description": "A grade-school mathematical reasoning benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Grade-school math word problems",
      "format": "Exact match",
      "difficulty": "Grade-school math",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/gsm8k",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gsm8k.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "mathBenchmark",
      "name": "MATH",
      "fullName": "MATH",
      "description": "A competition-style mathematical reasoning benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Competition math problems",
      "format": "Exact match",
      "difficulty": "Advanced math reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/mathBenchmark",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mathBenchmark.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "cmath",
      "name": "CMath",
      "fullName": "CMath",
      "description": "A Chinese mathematical reasoning benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Chinese math problems",
      "format": "Exact match",
      "difficulty": "Math reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/cmath",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/cmath.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "aime2025Arcee",
      "name": "AIME25 (Arcee)",
      "fullName": "AIME25 first-party comparison snapshot",
      "description": "A display-only AIME25 reference from Arcee AI's Trinity-Large-Thinking launch chart.",
      "paperUrl": "https://www.arcee.ai/blog/trinity-large-thinking",
      "paperTitle": "Trinity-Large-Thinking: Scaling an Open Source Frontier Agent",
      "authors": "Arcee AI",
      "year": "2026",
      "tasks": "15 problems",
      "format": "Integer answers 000-999",
      "difficulty": "High school olympiad level",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/aime2025Arcee",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aime2025Arcee.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "hmmt2023",
      "name": "HMMT Feb 2023",
      "fullName": "Harvard-MIT Mathematics Tournament February 2023",
      "description": "A prestigious high school mathematics competition hosted jointly by Harvard and MIT, featuring challenging problems across various mathematical disciplines.",
      "paperUrl": "https://www.hmmt.org/",
      "paperTitle": "Harvard-MIT Mathematics Tournament",
      "authors": "Harvard and MIT Mathematics Departments",
      "year": "2023",
      "tasks": "Tournament problems",
      "format": "Competition mathematics",
      "difficulty": "High school olympiad level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/hmmt2023",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hmmt2023.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "hmmt2024",
      "name": "HMMT Feb 2024",
      "fullName": "Harvard-MIT Mathematics Tournament February 2024",
      "description": "The 2024 February edition of the Harvard-MIT Mathematics Tournament, continuing the tradition of challenging high school mathematics competition.",
      "paperUrl": "https://www.hmmt.org/",
      "paperTitle": "Harvard-MIT Mathematics Tournament",
      "authors": "Harvard and MIT Mathematics Departments",
      "year": "2024",
      "tasks": "Tournament problems",
      "format": "Competition mathematics",
      "difficulty": "High school olympiad level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/hmmt2024",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hmmt2024.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "hmmt2025",
      "name": "HMMT Feb 2025",
      "fullName": "Harvard-MIT Mathematics Tournament February 2025",
      "description": "The February 2025 edition of the Harvard-MIT Mathematics Tournament, featuring challenging problems in competitive mathematics.",
      "paperUrl": "https://www.hmmt.org/",
      "paperTitle": "Harvard-MIT Mathematics Tournament",
      "authors": "Harvard and MIT Mathematics Departments",
      "year": "2025",
      "tasks": "Tournament problems",
      "format": "Competition mathematics",
      "difficulty": "High school olympiad level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/hmmt2025",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hmmt2025.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "brumo2025",
      "name": "BRUMO 2025",
      "fullName": "Brown University Math Olympiad 2025",
      "description": "A challenging mathematical olympiad competition featuring problems that test advanced mathematical reasoning and problem-solving skills at the olympiad level.",
      "paperUrl": "https://www.brumo.org/",
      "paperTitle": "Brown University Math Olympiad",
      "authors": "Brown University Math Olympiad organizers",
      "year": "2025",
      "tasks": "Olympiad problems",
      "format": "Mathematical olympiad",
      "difficulty": "Mathematical olympiad level",
      "decimals": null,
      "weight": 0.25,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/brumo2025",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/brumo2025.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "math500",
      "name": "MATH-500",
      "fullName": "MATH-500 Problem Set",
      "description": "A curated subset of 500 problems from the MATH dataset, covering algebra, counting and probability, geometry, intermediate algebra, number theory, prealgebra, and precalculus.",
      "paperUrl": "https://arxiv.org/abs/2103.03874",
      "paperTitle": "Measuring Mathematical Problem Solving With the MATH Dataset",
      "authors": "Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, Jacob Steinhardt",
      "year": "2021",
      "tasks": "500 problems",
      "format": "Free-form mathematical answers",
      "difficulty": "High school to undergraduate",
      "decimals": null,
      "weight": 0.15,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/math500",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/math500.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "aime2026",
      "name": "AIME26",
      "fullName": "AIME 2026",
      "description": "A 2026 American Invitational Mathematics Examination snapshot used in frontier-model comparison tables for mathematical reasoning.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Competition math problems",
      "format": "Short-answer mathematics",
      "difficulty": "Olympiad-style mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 15,
      "url": "https://benchlm.ai/benchmarks/aime2026",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aime2026.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "ipho2025Theory",
      "name": "IPhO 2025 (Theory)",
      "fullName": "International Physics Olympiad 2025 (Theory)",
      "description": "The three official theory problems from the 2025 International Physics Olympiad, scored with blinded human evaluation.",
      "paperUrl": "https://ai.meta.com/static-resource/muse-spark-eval-methodology",
      "paperTitle": "Muse Spark Eval Methodology",
      "authors": "Meta AI",
      "year": "2026",
      "tasks": "3 olympiad theory problems",
      "format": "Physics olympiad theory",
      "difficulty": "International olympiad physics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/ipho2025Theory",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/ipho2025Theory.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "hmmtFeb2025",
      "name": "HMMT Feb 2025",
      "fullName": "Harvard-MIT Mathematics Tournament February 2025",
      "description": "A February 2025 HMMT slice used in exact-value provider tables for advanced contest-math reasoning.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2025",
      "tasks": "Competition math problems",
      "format": "Contest mathematics",
      "difficulty": "Olympiad-style mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/hmmtFeb2025",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hmmtFeb2025.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "hmmtNov2025",
      "name": "HMMT Nov 2025",
      "fullName": "Harvard-MIT Mathematics Tournament November 2025",
      "description": "A November 2025 HMMT slice for high-end mathematical reasoning comparisons.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2025",
      "tasks": "Competition math problems",
      "format": "Contest mathematics",
      "difficulty": "Olympiad-style mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 8,
      "url": "https://benchlm.ai/benchmarks/hmmtNov2025",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hmmtNov2025.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "hmmtFeb2026",
      "name": "HMMT Feb 2026",
      "fullName": "Harvard-MIT Mathematics Tournament February 2026",
      "description": "A February 2026 HMMT slice used in newer frontier-model math comparisons.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Competition math problems",
      "format": "Contest mathematics",
      "difficulty": "Olympiad-style mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 20,
      "url": "https://benchlm.ai/benchmarks/hmmtFeb2026",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hmmtFeb2026.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "imoAnswerBench",
      "name": "IMOAnswerBench",
      "fullName": "IMOAnswerBench",
      "description": "A challenging mathematical reasoning benchmark reported in DeepSeek-V4 model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Advanced mathematical answer generation",
      "format": "Pass@1 math benchmark",
      "difficulty": "Olympiad-level mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/imoAnswerBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/imoAnswerBench.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "apex",
      "name": "Apex",
      "fullName": "Apex",
      "description": "A high-difficulty mathematical reasoning benchmark reported in DeepSeek-V4 model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Advanced mathematical reasoning",
      "format": "Pass@1 math benchmark",
      "difficulty": "Frontier math reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/apex",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/apex.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "apexShortlist",
      "name": "Apex Shortlist",
      "fullName": "Apex Shortlist",
      "description": "A shortlist subset of the Apex mathematical reasoning benchmark reported in DeepSeek-V4 model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Advanced mathematical reasoning",
      "format": "Pass@1 math benchmark",
      "difficulty": "Frontier math reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/apexShortlist",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/apexShortlist.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "mmAnswerBench",
      "name": "MMAnswerBench",
      "fullName": "MMAnswerBench",
      "description": "A multimodal mathematical reasoning benchmark that tests whether models can answer visually grounded math questions correctly.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Multimodal math questions",
      "format": "Visual and structured mathematical QA",
      "difficulty": "Advanced mathematical reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/mmAnswerBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmAnswerBench.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "frontierMath",
      "name": "FrontierMath",
      "fullName": "FrontierMath",
      "description": "An expert-level mathematical reasoning benchmark by Epoch AI featuring original, research-level problems created by mathematicians including IMO gold medalists and Fields Medal recipients. Problems require deep creativity and multi-step reasoning.",
      "paperUrl": "https://epoch.ai/frontiermath",
      "paperTitle": "FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning in AI",
      "authors": "Epoch AI",
      "year": "2024",
      "tasks": "350 original research-level math problems",
      "format": "Open-ended mathematical reasoning with tool access",
      "difficulty": "Research-level mathematics",
      "decimals": null,
      "weight": 0.35,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/frontierMath",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/frontierMath.md"
    },
    {
      "category": "math",
      "categoryLabel": "Mathematics",
      "benchmarkKey": "usamo2026",
      "name": "USAMO 2026",
      "fullName": "United States of America Mathematical Olympiad 2026",
      "description": "The premier US mathematical olympiad competition, featuring proof-based problems that require deep mathematical insight and rigorous argumentation at the highest competition level.",
      "paperUrl": "https://www.maa.org/math-competitions/usamo",
      "paperTitle": "United States of America Mathematical Olympiad",
      "authors": "Mathematical Association of America",
      "year": "2026",
      "tasks": "6 proof-based problems",
      "format": "Mathematical proof construction",
      "difficulty": "International olympiad level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/usamo2026",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/usamo2026.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "musr",
      "name": "MuSR",
      "fullName": "Testing the Limits of Chain-of-thought with Multistep Soft Reasoning",
      "description": "A dataset for evaluating language models on multistep soft reasoning tasks specified in natural language narratives. Tests the ability to perform complex, structured reasoning.",
      "paperUrl": "https://arxiv.org/abs/2310.16049",
      "paperTitle": "MuSR: Testing the Limits of Chain-of-thought with Multistep Soft Reasoning",
      "authors": "Zayne Sprague, Xi Ye, Kaj Bostrom, Swarat Chaudhuri, Greg Durrett",
      "year": "2023",
      "tasks": "Multi-step reasoning",
      "format": "Narrative-based reasoning",
      "difficulty": "Complex reasoning tasks",
      "decimals": null,
      "weight": 0.2,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/musr",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/musr.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "bbh",
      "name": "BBH",
      "fullName": "BIG-Bench Hard",
      "description": "A suite of 23 challenging tasks from the BIG-Bench collaborative benchmark where prior language models failed to exceed average human performance, even with chain-of-thought prompting.",
      "paperUrl": "https://arxiv.org/abs/2210.09261",
      "paperTitle": "Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them",
      "authors": "Mirac Suzgun, Nathan Scales, Nathanael Schärli, Sebastian Gehrmann, Yi Tay, Hyung Won Chung, Aakanksha Chowdhery, Quoc V. Le, Ed H. Chi, Denny Zhou, Jason Wei",
      "year": "2022",
      "tasks": "23 tasks",
      "format": "Mixed reasoning tasks",
      "difficulty": "Advanced reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/bbh",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/bbh.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "drop",
      "name": "DROP",
      "fullName": "Discrete Reasoning Over Paragraphs",
      "description": "A reading-comprehension benchmark requiring discrete reasoning over paragraphs, reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Paragraph reasoning questions",
      "format": "F1",
      "difficulty": "Reading and numerical reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/drop",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/drop.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "hellaswag",
      "name": "HellaSwag",
      "fullName": "HellaSwag",
      "description": "A commonsense natural-language inference benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Commonsense completion questions",
      "format": "Exact match",
      "difficulty": "Commonsense reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/hellaswag",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hellaswag.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "winogrande",
      "name": "WinoGrande",
      "fullName": "WinoGrande",
      "description": "A commonsense coreference benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Coreference resolution questions",
      "format": "Exact match",
      "difficulty": "Commonsense reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/winogrande",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/winogrande.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "cluewsc",
      "name": "CLUEWSC",
      "fullName": "CLUEWSC",
      "description": "A Chinese Winograd Schema Challenge benchmark reported in DeepSeek-V4 base-model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Chinese coreference questions",
      "format": "Exact match",
      "difficulty": "Chinese commonsense reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/cluewsc",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/cluewsc.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "lisanBench",
      "name": "LisanBench",
      "fullName": "LisanBench",
      "description": "A word-chain reasoning benchmark that tests planning, recall, constraint following, and vocabulary depth by asking models to extend non-repeating edit-distance-1 chains.",
      "paperUrl": "https://lisanbench.com/?tab=about",
      "paperTitle": "LisanBench methodology",
      "authors": "voice-from-the-outer-world",
      "year": "2026",
      "tasks": "50 starting words × 3 trials",
      "format": "Difficulty-weighted word-chain reasoning",
      "difficulty": "Open-ended lexical planning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/lisanBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/lisanBench.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "ppBench",
      "name": "Pencil Puzzle Bench",
      "fullName": "Pencil Puzzle Bench",
      "description": "A multi-step verifiable reasoning benchmark that evaluates whether models can solve pencil puzzles with unique solutions.",
      "paperUrl": "https://arxiv.org/abs/2603.02119",
      "paperTitle": "Pencil Puzzle Bench",
      "authors": "Approximate Labs",
      "year": "2026",
      "tasks": "300 evaluation puzzles",
      "format": "Direct and agentic puzzle solve rate",
      "difficulty": "Multi-step verifiable reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/ppBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/ppBench.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "longBenchV2",
      "name": "LongBench v2",
      "fullName": "LongBench v2",
      "description": "A long-context benchmark that measures whether models can actually use extended context windows for reasoning and retrieval.",
      "paperUrl": "https://arxiv.org/abs/2412.15204",
      "paperTitle": "LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks",
      "authors": "Yushi Bai et al.",
      "year": "2025",
      "tasks": "Long-context tasks",
      "format": "Extended-context retrieval and reasoning",
      "difficulty": "Hard long-context",
      "decimals": null,
      "weight": 0.3,
      "displayableScoreCount": 11,
      "url": "https://benchlm.ai/benchmarks/longBenchV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/longBenchV2.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "mrcrv2",
      "name": "MRCRv2",
      "fullName": "MRCRv2",
      "description": "A long-context benchmark for memory, retrieval, and multi-round coherence over large contexts.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-2/",
      "paperTitle": "Introducing GPT-5.2 and GPT-5.2 Pro",
      "authors": "OpenAI",
      "year": "2025",
      "tasks": "Long-context retrieval",
      "format": "Multi-round long-context evaluation",
      "difficulty": "Hard long-context",
      "decimals": null,
      "weight": 0.25,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/mrcrv2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mrcrv2.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "mrcrv2_64_128",
      "name": "MRCR v2 64K-128K",
      "fullName": "OpenAI MRCR v2 8-needle 64K-128K",
      "description": "MRCR v2 slice focused on long-context retrieval at 64K-128K lengths.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "8-needle retrieval tasks",
      "format": "Long-context retrieval",
      "difficulty": "Long-context reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/mrcrv2_64_128",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mrcrv2_64_128.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "mrcrv2_128_256",
      "name": "MRCR v2 128K-256K",
      "fullName": "OpenAI MRCR v2 8-needle 128K-256K",
      "description": "MRCR v2 slice focused on very long contexts at 128K-256K lengths.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "8-needle retrieval tasks",
      "format": "Very-long-context retrieval",
      "difficulty": "Very long-context reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/mrcrv2_128_256",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mrcrv2_128_256.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "graphwalksBfs128k",
      "name": "Graphwalks BFS 128K",
      "fullName": "Graphwalks BFS 0K-128K",
      "description": "Long-context graph traversal benchmark using breadth-first search tasks.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Graph traversal tasks",
      "format": "Long-context graph reasoning",
      "difficulty": "Algorithmic long-context reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/graphwalksBfs128k",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/graphwalksBfs128k.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "graphwalksParents128k",
      "name": "Graphwalks Parents 128K",
      "fullName": "Graphwalks parents 0K-128K",
      "description": "Long-context benchmark for recovering parent relationships inside graph tasks.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Graph parent-retrieval tasks",
      "format": "Long-context graph reasoning",
      "difficulty": "Algorithmic long-context reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/graphwalksParents128k",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/graphwalksParents128k.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "mrcr1m",
      "name": "MRCR 1M",
      "fullName": "MRCR 1M",
      "description": "A million-token MRCR long-context retrieval benchmark reported in DeepSeek-V4 model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Million-token retrieval",
      "format": "Long-context retrieval MMR",
      "difficulty": "Million-token long context",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/mrcr1m",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mrcr1m.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "corpusQa1m",
      "name": "CorpusQA 1M",
      "fullName": "CorpusQA 1M",
      "description": "A million-token CorpusQA long-context question-answering benchmark reported in DeepSeek-V4 model evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Million-token corpus question answering",
      "format": "Long-context QA accuracy",
      "difficulty": "Million-token long context",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/corpusQa1m",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/corpusQa1m.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "arcAgi2",
      "name": "ARC-AGI-2",
      "fullName": "Abstraction and Reasoning Corpus for AGI v2",
      "description": "A benchmark measuring fluid intelligence and novel abstract reasoning through visual grid puzzles. Models must identify patterns in input-output pairs and generate the correct output for unseen inputs. Considered the hardest public reasoning benchmark — average individual human performance is 66%.",
      "paperUrl": "https://arcprize.org/arc-agi/2/",
      "paperTitle": "ARC-AGI-2: A Harder General Intelligence Benchmark",
      "authors": "Francois Chollet, ARC Prize Foundation",
      "year": "2025",
      "tasks": "Visual pattern completion and abstract reasoning",
      "format": "Grid transformation puzzles with novel rules",
      "difficulty": "Expert-level — hardest public reasoning benchmark",
      "decimals": null,
      "weight": 0.25,
      "displayableScoreCount": 11,
      "url": "https://benchlm.ai/benchmarks/arcAgi2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/arcAgi2.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "aiNeedle",
      "name": "AI-Needle",
      "fullName": "AI-Needle",
      "description": "A long-context retrieval benchmark that measures whether a model can recover relevant information embedded deep inside very long contexts.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Long-context retrieval",
      "format": "Needle-in-a-haystack recall",
      "difficulty": "Long-context memory",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/aiNeedle",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aiNeedle.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "gpqaDiamond",
      "name": "GPQA Diamond",
      "fullName": "GPQA Diamond",
      "description": "The hardest subset of GPQA featuring the most challenging graduate-level science questions. Sometimes reported separately from the standard GPQA benchmark.",
      "paperUrl": "https://arxiv.org/abs/2311.12022",
      "paperTitle": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
      "authors": "David Rein et al.",
      "year": "2023",
      "tasks": "Expert-level science questions",
      "format": "Multiple choice questions",
      "difficulty": "Graduate-level scientific reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/gpqaDiamond",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gpqaDiamond.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "lcr",
      "name": "AA-LCR",
      "fullName": "Artificial Analysis Long Context Reasoning",
      "description": "A display-only Artificial Analysis long-context reasoning evaluation.",
      "paperUrl": "https://artificialanalysis.ai/models/grok-4-3",
      "paperTitle": "Artificial Analysis model benchmarks",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Long-context reasoning tasks",
      "format": "Accuracy",
      "difficulty": "Long-context reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 123,
      "url": "https://benchlm.ai/benchmarks/lcr",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/lcr.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "critpt",
      "name": "CritPt",
      "fullName": "Critical Physics Tasks",
      "description": "A display-only Artificial Analysis metric for research-level physics reasoning.",
      "paperUrl": "https://artificialanalysis.ai/models/grok-4-3",
      "paperTitle": "Artificial Analysis model benchmarks",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Research-level physics questions",
      "format": "Accuracy",
      "difficulty": "Research-level physics reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 124,
      "url": "https://benchlm.ai/benchmarks/critpt",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/critpt.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "bullshitBenchV2",
      "name": "BullshitBench v2",
      "fullName": "BullshitBench v2",
      "description": "A benchmark that tests whether AI models challenge nonsensical, ill-posed, or logically flawed prompts instead of confidently generating incorrect answers. Measures the critical ability to push back on bad input.",
      "paperUrl": "https://petergpt.github.io/bullshit-benchmark/",
      "paperTitle": "BullshitBench: Measuring whether AI models challenge nonsensical prompts",
      "authors": "Peter Gostev",
      "year": "2025",
      "tasks": "Nonsensical and flawed prompts across multiple domains",
      "format": "Prompt challenge and refusal evaluation",
      "difficulty": "Robustness and critical reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/bullshitBenchV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/bullshitBenchV2.md"
    },
    {
      "category": "reasoning",
      "categoryLabel": "Reasoning",
      "benchmarkKey": "wildBench",
      "name": "WildBench",
      "fullName": "WildBench",
      "description": "An automated evaluation framework using 1,000+ real-world user tasks covering reasoning, planning, coding, and creative writing. Highly correlated with Chatbot Arena human preference rankings.",
      "paperUrl": "https://arxiv.org/abs/2406.04770",
      "paperTitle": "WildBench: Benchmarking Language Models with Challenging Tasks from Real Users in the Wild",
      "authors": "Bill Yuchen Lin et al.",
      "year": "2024",
      "tasks": "1,024 real-world tasks",
      "format": "Real-world task evaluation",
      "difficulty": "Diverse real-world scenarios",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/wildBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/wildBench.md"
    },
    {
      "category": "instructionFollowing",
      "categoryLabel": "Instruction Following",
      "benchmarkKey": "ifeval",
      "name": "IFEval",
      "fullName": "Instruction-Following Eval",
      "description": "A benchmark that evaluates language models' ability to follow verifiable instructions such as formatting constraints, keyword inclusion/exclusion, length limits, and structural requirements.",
      "paperUrl": "https://arxiv.org/abs/2311.07911",
      "paperTitle": "Instruction-Following Evaluation for Large Language Models",
      "authors": "Jeffrey Zhou, Tianjian Lu, Swaroop Mishra, Siddhartha Brahma, Sujoy Basu, Yi Luan, Denny Zhou, Le Hou",
      "year": "2023",
      "tasks": "500+ instructions",
      "format": "Constrained generation",
      "difficulty": "Instruction precision",
      "decimals": null,
      "weight": 0.65,
      "displayableScoreCount": 22,
      "url": "https://benchlm.ai/benchmarks/ifeval",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/ifeval.md"
    },
    {
      "category": "instructionFollowing",
      "categoryLabel": "Instruction Following",
      "benchmarkKey": "ifBench",
      "name": "IFBench",
      "fullName": "Instruction Following Benchmark",
      "description": "IFBench evaluates precise instruction-following generalization on 58 challenging, verifiable out-of-domain constraints. Unlike IFEval which tests familiar constraint types, IFBench specifically measures how well models follow novel instructions they haven't been optimized for, exposing overfitting to common instruction patterns.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": "2025",
      "tasks": "58 out-of-domain constraints",
      "format": null,
      "difficulty": null,
      "decimals": null,
      "weight": 0.35,
      "displayableScoreCount": 14,
      "url": "https://benchlm.ai/benchmarks/ifBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/ifBench.md"
    },
    {
      "category": "instructionFollowing",
      "categoryLabel": "Instruction Following",
      "benchmarkKey": "aaIfBench",
      "name": "AA-IFBench",
      "fullName": "Artificial Analysis IFBench",
      "description": "A display-only Artificial Analysis IFBench score.",
      "paperUrl": "https://artificialanalysis.ai/evaluations/ifbench",
      "paperTitle": "Artificial Analysis IFBench Benchmark Leaderboard",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Verifiable instruction constraints",
      "format": "Constraint satisfaction accuracy",
      "difficulty": "Instruction precision",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 124,
      "url": "https://benchlm.ai/benchmarks/aaIfBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaIfBench.md"
    },
    {
      "category": "instructionFollowing",
      "categoryLabel": "Instruction Following",
      "benchmarkKey": "sobValueAcc",
      "name": "SOB Value Acc",
      "fullName": "Structured Output Benchmark Value Accuracy",
      "description": "A structured-output benchmark from Interfaze measuring whether extracted JSON leaf values exactly match verified ground truth.",
      "paperUrl": "https://interfaze.ai/leaderboards/structured-output-benchmark",
      "paperTitle": "Structured Output Benchmark Leaderboard",
      "authors": "Interfaze",
      "year": "2026",
      "tasks": "Structured output extraction",
      "format": "Value accuracy",
      "difficulty": "Production structured-output reliability",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/sobValueAcc",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sobValueAcc.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "mgsm",
      "name": "MGSM",
      "fullName": "Multilingual Grade School Math",
      "description": "A multilingual benchmark that translates 250 grade school math problems from GSM8K into 10 typologically diverse languages: Bengali, German, Spanish, French, Japanese, Russian, Swahili, Telugu, Thai, and Chinese.",
      "paperUrl": "https://arxiv.org/abs/2210.03057",
      "paperTitle": "Language Models are Multilingual Chain-of-Thought Reasoners",
      "authors": "Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei",
      "year": "2022",
      "tasks": "250 problems × 11 languages (English + 10 translations)",
      "format": "Math word problems",
      "difficulty": "Grade school math, multilingual",
      "decimals": null,
      "weight": 0.35,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/mgsm",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mgsm.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "mmluProX",
      "name": "MMLU-ProX",
      "fullName": "MMLU-ProX",
      "description": "A multilingual extension of professional-level academic evaluation across many languages.",
      "paperUrl": "https://arxiv.org/abs/2503.10497",
      "paperTitle": "MMLU-ProX: A Multilingual Benchmark for Advanced Large Language Model Evaluation",
      "authors": "MMLU-ProX authors",
      "year": "2025",
      "tasks": "Multilingual professional QA",
      "format": "Multilingual multiple choice",
      "difficulty": "Professional multilingual",
      "decimals": null,
      "weight": 0.65,
      "displayableScoreCount": 12,
      "url": "https://benchlm.ai/benchmarks/mmluProX",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmluProX.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "nova63",
      "name": "NOVA-63",
      "fullName": "NOVA-63",
      "description": "A broad multilingual benchmark row from Qwen's launch comparisons intended to measure cross-lingual capability beyond a single language family.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Broad multilingual evaluation",
      "format": "Cross-lingual benchmark",
      "difficulty": "Broad multilingual capability",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/nova63",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/nova63.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "include",
      "name": "INCLUDE",
      "fullName": "INCLUDE",
      "description": "A multilingual benchmark used in provider tables to measure inclusive language coverage and cross-lingual understanding beyond common high-resource languages.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Cross-lingual understanding",
      "format": "Multilingual benchmark",
      "difficulty": "Broad multilingual capability",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 3,
      "url": "https://benchlm.ai/benchmarks/include",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/include.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "polyMath",
      "name": "PolyMath",
      "fullName": "PolyMath",
      "description": "A multilingual mathematical reasoning benchmark that tests whether math performance transfers across languages rather than only in English.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Multilingual math problems",
      "format": "Cross-lingual mathematical reasoning",
      "difficulty": "Advanced multilingual reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/polyMath",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/polyMath.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "vwt2kLite",
      "name": "VWT2k-lite",
      "fullName": "VWT2k-lite",
      "description": "A lighter multilingual benchmark slice published in provider tables for broad cross-lingual transfer and understanding.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Multilingual transfer tasks",
      "format": "Cross-lingual benchmark",
      "difficulty": "Broad multilingual capability",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/vwt2kLite",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/vwt2kLite.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "maxife",
      "name": "MAXIFE",
      "fullName": "MAXIFE",
      "description": "A multilingual instruction-following and understanding benchmark row published in Qwen's launch comparisons.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Multilingual instruction following",
      "format": "Cross-lingual benchmark",
      "difficulty": "Advanced multilingual instruction following",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/maxife",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/maxife.md"
    },
    {
      "category": "multilingual",
      "categoryLabel": "Multilingual",
      "benchmarkKey": "sweMultilingual",
      "name": "SWE Multilingual",
      "fullName": "SWE-bench Multilingual",
      "description": "A multilingual extension of SWE-bench covering 300 problems across 9 programming languages, testing code generation and bug fixing beyond Python.",
      "paperUrl": "https://www.swebench.com/multilingual",
      "paperTitle": "SWE-bench Multilingual",
      "authors": "SWE-bench team",
      "year": "2025",
      "tasks": "300 problems across 9 languages",
      "format": "Multi-language code patch generation",
      "difficulty": "Professional multilingual software engineering",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/sweMultilingual",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweMultilingual.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "terminalBench2",
      "name": "Terminal-Bench 2.0",
      "fullName": "Terminal-Bench 2.0",
      "description": "A benchmark for agentic software engineering tasks executed in real terminal environments. Models must inspect files, run commands, edit code, and recover from errors over multi-step workflows.",
      "paperUrl": "https://www.tbench.ai/",
      "paperTitle": "Terminal-Bench 2.0",
      "authors": "Terminal-Bench contributors",
      "year": "2026",
      "tasks": "Terminal-based software tasks",
      "format": "Interactive CLI agent evaluation",
      "difficulty": "Professional software engineering",
      "decimals": null,
      "weight": 0.28,
      "displayableScoreCount": 50,
      "url": "https://benchlm.ai/benchmarks/terminalBench2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/terminalBench2.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "browseComp",
      "name": "BrowseComp",
      "fullName": "BrowseComp",
      "description": "A benchmark for web-browsing agents that must search, inspect sources, gather evidence, and return the correct answer to research-oriented questions.",
      "paperUrl": "https://openai.com/index/browsecomp/",
      "paperTitle": "BrowseComp",
      "authors": "OpenAI",
      "year": "2025",
      "tasks": "Research questions requiring browsing",
      "format": "Web search and evidence synthesis",
      "difficulty": "Hard web research",
      "decimals": null,
      "weight": 0.18,
      "displayableScoreCount": 26,
      "url": "https://benchlm.ai/benchmarks/browseComp",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/browseComp.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "hleWithTools",
      "name": "HLE w/ tools",
      "fullName": "Humanity's Last Exam with tools",
      "description": "Tool-augmented Humanity's Last Exam scores reported in DeepSeek-V4 thinking-mode evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Expert questions with tool use",
      "format": "Pass@1",
      "difficulty": "Frontier tool-augmented reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/hleWithTools",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hleWithTools.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "gdpvalAa",
      "name": "GDPval-AA",
      "fullName": "GDPval-AA",
      "description": "An agentic real-world work-task evaluation reported as an Elo score in DeepSeek-V4 thinking-mode evaluations.",
      "paperUrl": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/blob/main/DeepSeek_V4.pdf",
      "paperTitle": "DeepSeek-V4 Technical Report",
      "authors": "DeepSeek-AI",
      "year": "2026",
      "tasks": "Agentic real-world work tasks",
      "format": "Elo",
      "difficulty": "Professional agentic workflows",
      "decimals": 0,
      "weight": null,
      "displayableScoreCount": 122,
      "url": "https://benchlm.ai/benchmarks/gdpvalAa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gdpvalAa.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "gdpvalAaNormalized",
      "name": "GDPval-AA",
      "fullName": "GDPval-AA normalized",
      "description": "A display-only Artificial Analysis normalized score for economically valuable tasks.",
      "paperUrl": "https://artificialanalysis.ai/models/grok-4-3",
      "paperTitle": "Artificial Analysis model benchmarks",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Economically valuable tasks",
      "format": "Normalized score",
      "difficulty": "Professional agentic workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 121,
      "url": "https://benchlm.ai/benchmarks/gdpvalAaNormalized",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gdpvalAaNormalized.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "aaAgenticIndex",
      "name": "AA Agentic Index",
      "fullName": "Artificial Analysis Agentic Index",
      "description": "A display-only Artificial Analysis agentic index.",
      "paperUrl": "https://artificialanalysis.ai/leaderboards/models",
      "paperTitle": "Artificial Analysis model leaderboards",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Cross-benchmark agentic index",
      "format": "Aggregated model score",
      "difficulty": "Display-only external reference",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 121,
      "url": "https://benchlm.ai/benchmarks/aaAgenticIndex",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaAgenticIndex.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "apexAgentsAa",
      "name": "APEX-Agents-AA",
      "fullName": "APEX-Agents-AA",
      "description": "Artificial Analysis' implementation of the APEX-Agents benchmark for long-horizon professional-services agent tasks.",
      "paperUrl": "https://artificialanalysis.ai/evaluations/apex-agents-aa",
      "paperTitle": "APEX-Agents-AA Benchmark Leaderboard",
      "authors": "Artificial Analysis / Mercor",
      "year": "2026",
      "tasks": "452 professional-services agent tasks",
      "format": "Pass@1",
      "difficulty": "Long-horizon workplace agent tasks",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 20,
      "url": "https://benchlm.ai/benchmarks/apexAgentsAa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/apexAgentsAa.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "gertLabs",
      "name": "Gert Labs",
      "fullName": "Gert Labs Composite Game Benchmark",
      "description": "A game-environment benchmark that evaluates AI models in novel games covering strategic planning, resource management, spatial reasoning, cooperation, and theory of mind.",
      "paperUrl": "https://gertlabs.com/rankings",
      "paperTitle": "Gert Labs rankings",
      "authors": "Gert Labs",
      "year": "2026",
      "tasks": "Novel game environments",
      "format": "Composite game leaderboard",
      "difficulty": "Agentic coding and decision-making",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 54,
      "url": "https://benchlm.ai/benchmarks/gertLabs",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gertLabs.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "osWorldVerified",
      "name": "OSWorld-Verified",
      "fullName": "OSWorld-Verified",
      "description": "A verified subset of OSWorld focused on computer-use tasks in desktop-like environments, including navigation, editing, and workflow completion.",
      "paperUrl": "https://os-world.github.io/",
      "paperTitle": "OSWorld",
      "authors": "OSWorld contributors",
      "year": "2025",
      "tasks": "Desktop and GUI tasks",
      "format": "Interactive computer-use evaluation",
      "difficulty": "Complex multi-step workflows",
      "decimals": null,
      "weight": 0.24,
      "displayableScoreCount": 23,
      "url": "https://benchlm.ai/benchmarks/osWorldVerified",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/osWorldVerified.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "cyberGym",
      "name": "CyberGym",
      "fullName": "CyberGym",
      "description": "A cybersecurity task benchmark for evaluating defensive cyber workflows and vulnerability-oriented agent performance.",
      "paperUrl": "https://www.cybergym.io/",
      "paperTitle": "CyberGym: Evaluating AI Agents' Real-World Cybersecurity Capabilities at Scale",
      "authors": "Zhun Wang, Tianneng Shi, Jingxuan He, Matthew Cai, Jialin Zhang, Dawn Song",
      "year": "2026",
      "tasks": "1,507 vulnerability analysis instances",
      "format": "Vulnerability reproduction and PoC generation",
      "difficulty": "Real-world cybersecurity",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/cyberGym",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/cyberGym.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "browseCompVl",
      "name": "BrowseComp-VL",
      "fullName": "BrowseComp-VL",
      "description": "A vision-language browsing benchmark for multimodal web research and tool-use workflows.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Multimodal browsing tasks",
      "format": "Vision-language web research evaluation",
      "difficulty": "Multimodal browser-agent",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/browseCompVl",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/browseCompVl.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "osWorld",
      "name": "OSWorld",
      "fullName": "OSWorld",
      "description": "A computer-use benchmark for GUI task completion across the broader OSWorld task suite.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Computer-use tasks",
      "format": "Interactive GUI evaluation",
      "difficulty": "Broad computer-use suite",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/osWorld",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/osWorld.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "androidWorld",
      "name": "AndroidWorld",
      "fullName": "AndroidWorld",
      "description": "A mobile GUI agent benchmark for completing Android app workflows and on-device tasks.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Android app workflows",
      "format": "Interactive mobile-agent evaluation",
      "difficulty": "Complex mobile task completion",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/androidWorld",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/androidWorld.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "webVoyager",
      "name": "WebVoyager",
      "fullName": "WebVoyager",
      "description": "A browser-agent benchmark for completing multi-step workflows on live websites.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Live website workflows",
      "format": "Interactive browser-agent evaluation",
      "difficulty": "Multi-step web navigation",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/webVoyager",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/webVoyager.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "mcpAtlas",
      "name": "MCP Atlas",
      "fullName": "MCP Atlas",
      "description": "A benchmark for tool-calling over Model Context Protocol integrations and external tools.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Tool-integrated agent tasks",
      "format": "Interactive tool-calling evaluation",
      "difficulty": "Advanced tool use",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 25,
      "url": "https://benchlm.ai/benchmarks/mcpAtlas",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mcpAtlas.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "kimiClaw247",
      "name": "Kimi Claw 24/7",
      "fullName": "Kimi Claw 24/7 Bench",
      "description": "A Moonshot AI internal long-horizon agent benchmark for persistent professional coworking tasks.",
      "paperUrl": "https://huggingface.co/moonshotai/Kimi-K2.7-Code",
      "paperTitle": "Kimi K2.7 Code",
      "authors": "Moonshot AI",
      "year": "2026",
      "tasks": "17 professional scenarios, 610 evaluation points",
      "format": "Average pass rate across repeated OpenClaw runs",
      "difficulty": "Long-horizon agentic work",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/kimiClaw247",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kimiClaw247.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "mcpMarkVerified",
      "name": "MCP Mark Verified",
      "fullName": "MCPMark-Verified",
      "description": "A human-verified edition of MCPMark for MCP tool use across Notion, GitHub, Filesystem, Postgres, and Playwright server environments.",
      "paperUrl": "https://mcpmark.ai/",
      "paperTitle": "MCPMark",
      "authors": "MCPMark",
      "year": "2026",
      "tasks": "MCP tool-use tasks across five server environments",
      "format": "Interactive MCP task completion",
      "difficulty": "Advanced tool use",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/mcpMarkVerified",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mcpMarkVerified.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "toolathlon",
      "name": "Toolathlon",
      "fullName": "Toolathlon",
      "description": "A tool-use benchmark focused on selecting, sequencing, and completing tasks with external tools.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Multi-tool workflows",
      "format": "Interactive tool-calling evaluation",
      "difficulty": "Advanced tool use",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 21,
      "url": "https://benchlm.ai/benchmarks/toolathlon",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/toolathlon.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "zClawBench",
      "name": "ZClawBench",
      "fullName": "ZClawBench",
      "description": "A Z.AI benchmark for OpenClaw-style agent workflows spanning information search, office work, data analysis, development and operations, automation, and security.",
      "paperUrl": "https://docs.z.ai/guides/llm/glm-5-turbo",
      "paperTitle": "GLM-5-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "OpenClaw agent workflows",
      "format": "End-to-end agent benchmark",
      "difficulty": "Broad productivity and operations workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/zClawBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/zClawBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "tau2Bench",
      "name": "Tau2-Telecom",
      "fullName": "Tau2-Telecom",
      "description": "A telecom-oriented tool benchmark that measures structured tool use in domain workflows.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Telecom tool workflows",
      "format": "Domain-specific tool evaluation",
      "difficulty": "Professional workflow",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 123,
      "url": "https://benchlm.ai/benchmarks/tau2Bench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/tau2Bench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "deepSearchQa",
      "name": "DeepSearchQA",
      "fullName": "DeepSearchQA",
      "description": "An agentic browsing benchmark where models search the web, gather evidence, and answer list-style questions using browser tools.",
      "paperUrl": "https://ai.meta.com/static-resource/muse-spark-eval-methodology",
      "paperTitle": "Muse Spark Eval Methodology",
      "authors": "Meta AI",
      "year": "2026",
      "tasks": "Agentic browsing and list-answer questions",
      "format": "Search / open / find browser-agent evaluation",
      "difficulty": "Agentic web research",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/deepSearchQa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/deepSearchQa.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "tau2Airline",
      "name": "Tau2-Airline",
      "fullName": "Tau2-Airline",
      "description": "An airline-domain tool-use benchmark for structured workflow execution and API correctness.",
      "paperUrl": "https://www.arcee.ai/blog/trinity-large-thinking",
      "paperTitle": "Trinity-Large-Thinking: Scaling an Open Source Frontier Agent",
      "authors": "Arcee AI",
      "year": "2026",
      "tasks": "Airline support workflows",
      "format": "Domain-specific tool evaluation",
      "difficulty": "Professional workflow",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/tau2Airline",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/tau2Airline.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "pinchBench",
      "name": "PinchBench",
      "fullName": "PinchBench",
      "description": "An OpenClaw agent benchmark from Kilo that measures successful task completion across standardized real-world agent workflows.",
      "paperUrl": "https://pinchbench.com/about",
      "paperTitle": "About PinchBench",
      "authors": "Kilo Code",
      "year": "2026",
      "tasks": "23 OpenClaw agent tasks",
      "format": "Average success rate from official runs",
      "difficulty": "Long-horizon agent workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/pinchBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/pinchBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "openHandsIndex",
      "name": "OpenHands Index",
      "fullName": "OpenHands Index",
      "description": "A holistic coding-agent benchmark that evaluates AI agents across issue resolution, frontend work, greenfield development, testing, and information gathering.",
      "paperUrl": "https://index.openhands.dev/about",
      "paperTitle": "OpenHands Index methodology",
      "authors": "OpenHands",
      "year": "2025",
      "tasks": "SWE-bench Verified, SWE-bench Multimodal, Commit0, SWT-bench Verified, and GAIA",
      "format": "Macro-average across five coding-agent categories",
      "difficulty": "Real-world software engineering agent tasks",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/openHandsIndex",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/openHandsIndex.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "sweAtlasRefactoring",
      "name": "SWE-Atlas Refactoring",
      "fullName": "SWE-Atlas Refactoring",
      "description": "A Scale SWE-Atlas software-engineering agent benchmark focused on refactoring tasks.",
      "paperUrl": "https://labs.scale.com/papers/sweatlas",
      "paperTitle": "SWE-Atlas",
      "authors": "Scale AI",
      "year": "2026",
      "tasks": "SWE-Atlas refactoring tasks",
      "format": "Refactoring score with confidence intervals",
      "difficulty": "Real-world software-engineering agent tasks",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/sweAtlasRefactoring",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweAtlasRefactoring.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "inferenceBench",
      "name": "InferenceBench",
      "fullName": "InferenceBench",
      "description": "A benchmark for open-ended LLM inference optimization by AI agents. Agents receive a base model, one H100, and a fixed time budget to build a valid OpenAI-compatible inference server that improves serving speed.",
      "paperUrl": "https://inferencebench.ai/",
      "paperTitle": "InferenceBench",
      "authors": "Jehyeok Yeon, Ben Rank, Maksym Andriushchenko",
      "year": "2026",
      "tasks": "4 inference-serving optimization scenarios",
      "format": "Two-hour autonomous CLI agent run",
      "difficulty": "Open-ended ML systems engineering",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/inferenceBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/inferenceBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "bfclV4",
      "name": "BFCL v4",
      "fullName": "Berkeley Function Calling Leaderboard v4",
      "description": "A function-calling benchmark for tool selection, schema adherence, and argument correctness.",
      "paperUrl": "https://www.arcee.ai/blog/trinity-large-thinking",
      "paperTitle": "Trinity-Large-Thinking: Scaling an Open Source Frontier Agent",
      "authors": "Arcee AI",
      "year": "2026",
      "tasks": "Function-calling tasks",
      "format": "Tool invocation and schema evaluation",
      "difficulty": "Advanced tool use",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 8,
      "url": "https://benchlm.ai/benchmarks/bfclV4",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/bfclV4.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "mleBenchLite",
      "name": "MLE-Bench Lite",
      "fullName": "MLE-Bench Lite",
      "description": "A lightweight machine-learning competition benchmark that measures whether models can iteratively train, evaluate, and improve ML systems in low-resource settings.",
      "paperUrl": "https://www.minimax.io/news/minimax-m27-en",
      "paperTitle": "MiniMax M2.7: Early Echoes of Self-Evolution",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Low-resource ML competitions",
      "format": "Autonomous iterative ML optimization",
      "difficulty": "Agentic machine learning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/mleBenchLite",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mleBenchLite.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "mmClawBench",
      "name": "MM-ClawBench",
      "fullName": "MM-ClawBench",
      "description": "An OpenClaw-derived agent benchmark covering practical work and life tasks such as office document delivery, research, planning, and code maintenance.",
      "paperUrl": "https://www.minimax.io/news/minimax-m27-en",
      "paperTitle": "MiniMax M2.7: Early Echoes of Self-Evolution",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "OpenClaw-style real-world tasks",
      "format": "Agent workflow evaluation",
      "difficulty": "Broad real-world agentic execution",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/mmClawBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmClawBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "clawEval",
      "name": "Claw-Eval",
      "fullName": "Claw-Eval",
      "description": "A transparent real-world autonomous-agent benchmark with 300 human-verified tasks, 2,159 rubric items, and Pass^3 scoring across general, multi-turn, and native multimodal agent tasks.",
      "paperUrl": "https://arxiv.org/abs/2604.06132",
      "paperTitle": "Claw-Eval: Towards Trustworthy Evaluation of Autonomous Agents",
      "authors": "Bowen Ye, Rang Li, Qibin Yang, Yuanxin Liu, Linli Yao, Hanglong Lv, Zhihui Xie, Chenxin An, Lei Li, Lingpeng Kong, Qi Liu, Zhifang Sui, Tong Yang",
      "year": "2026",
      "tasks": "300 tasks, 2,159 rubrics",
      "format": "End-to-end autonomous-agent evaluation with Pass^3 scoring",
      "difficulty": "Real-world general, multi-turn, and native multimodal agent execution",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 30,
      "url": "https://benchlm.ai/benchmarks/clawEval",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/clawEval.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "qwenClawBench",
      "name": "QwenClawBench",
      "fullName": "QwenClawBench",
      "description": "Qwen's internal OpenClaw-style benchmark for measuring broad real-world agent performance across practical productivity and research tasks.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Real-world agent workflows",
      "format": "End-to-end agent evaluation",
      "difficulty": "Broad real-world agentic execution",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 10,
      "url": "https://benchlm.ai/benchmarks/qwenClawBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/qwenClawBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "qwenWebBench",
      "name": "QwenWebBench",
      "fullName": "QwenWebBench",
      "description": "A Qwen benchmark for artifact and webpage generation quality reported as an Elo-style rating.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Web artifacts and interactive deliverables",
      "format": "Elo-style artifact benchmark",
      "difficulty": "Artifact generation",
      "decimals": 0,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/qwenWebBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/qwenWebBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "tau3Bench",
      "name": "TAU3-Bench",
      "fullName": "TAU3-Bench",
      "description": "A next-generation tool-use benchmark for complex, long-horizon agent workflows beyond the older tau2 telecom and airline task families.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Long-horizon tool workflows",
      "format": "Interactive tool-use evaluation",
      "difficulty": "Advanced tool use",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 10,
      "url": "https://benchlm.ai/benchmarks/tau3Bench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/tau3Bench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "vitaBench",
      "name": "VITA-Bench",
      "fullName": "VITA-Bench",
      "description": "An interactive real-world agent benchmark grounded in practical consumer-service tasks such as delivery, in-store consumption, and online travel workflows.",
      "paperUrl": "https://vitabench.github.io/",
      "paperTitle": "VitaBench: Benchmarking LLM Agents with Versatile Interactive Tasks in Real-world Applications",
      "authors": "Meituan LongCat Team",
      "year": "2025",
      "tasks": "Interactive consumer-service agent tasks",
      "format": "End-to-end interactive agent evaluation",
      "difficulty": "Long-horizon real-world workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/vitaBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/vitaBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "deepPlanning",
      "name": "DeepPlanning",
      "fullName": "DeepPlanning",
      "description": "A long-horizon planning benchmark that tests whether agents can optimize under explicit time, budget, and feasibility constraints.",
      "paperUrl": "https://arxiv.org/abs/2601.18137",
      "paperTitle": "DeepPlanning: Benchmarking Long-Horizon Agentic Planning with Verifiable Constraints",
      "authors": "DeepPlanning authors",
      "year": "2026",
      "tasks": "Travel planning and constrained shopping",
      "format": "Long-horizon planning benchmark",
      "difficulty": "Constrained agent planning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/deepPlanning",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/deepPlanning.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "mcpTasks",
      "name": "MCP-Tasks",
      "fullName": "MCP-Tasks",
      "description": "A Model Context Protocol task benchmark used in Qwen's launch tables to measure practical execution over MCP-style tools and integrations.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "MCP-integrated tool tasks",
      "format": "Interactive tool-use evaluation",
      "difficulty": "Advanced MCP workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/mcpTasks",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mcpTasks.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "wideResearch",
      "name": "WideResearch",
      "fullName": "WideResearch",
      "description": "A broad research-agent benchmark for open-ended information gathering, synthesis, and answer construction across wide search spaces.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Open-ended research tasks",
      "format": "Multi-source research evaluation",
      "difficulty": "Broad research-agent workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/wideResearch",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/wideResearch.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "gaia",
      "name": "GAIA",
      "fullName": "General AI Assistants",
      "description": "GAIA evaluates AI models on real-world tasks that are conceptually simple for humans but require multi-step reasoning, web browsing, tool use, and multimodal understanding for AI. Tasks span three difficulty levels and test practical assistant capabilities rather than academic knowledge.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": "2024",
      "tasks": "466 tasks",
      "format": null,
      "difficulty": null,
      "decimals": null,
      "weight": 0.12,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/gaia",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gaia.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "tauBench",
      "name": "TAU-bench",
      "fullName": "Tool-Agent-User Benchmark",
      "description": "TAU-bench evaluates AI agents in realistic enterprise scenarios requiring multi-turn tool use, database interactions, and policy adherence. It tests across retail and airline domains, measuring an agent's ability to reliably complete customer service tasks while following complex business rules.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": "2024",
      "tasks": "680 tasks",
      "format": null,
      "difficulty": null,
      "decimals": null,
      "weight": 0.1,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/tauBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/tauBench.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "webArena",
      "name": "WebArena",
      "fullName": "WebArena Web Agent Benchmark",
      "description": "WebArena is a realistic web environment for evaluating autonomous AI agents on complex, multi-step browser tasks. Agents must navigate e-commerce sites, forums, content management systems, and code repositories to complete practical objectives like purchasing items, finding information, and managing accounts.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": "2024",
      "tasks": "812 tasks",
      "format": null,
      "difficulty": null,
      "decimals": null,
      "weight": 0.08,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/webArena",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/webArena.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "mewc",
      "name": "MEWC",
      "fullName": "Multi-Environment Web Challenge",
      "description": "A benchmark that evaluates AI agents on multi-environment web challenges, testing navigation and task completion across diverse live web environments.",
      "paperUrl": "https://www.minimax.io/news/minimax-m25",
      "paperTitle": "MiniMax M2.5 benchmark release surface",
      "authors": "MiniMax / benchmark maintainers",
      "year": "2026",
      "tasks": "Web-agent tasks",
      "format": "Browser task completion",
      "difficulty": "Open-web agent workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/mewc",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mewc.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "financeAgentV2",
      "name": "Finance Agent v2",
      "fullName": "Finance Agent v2",
      "description": "Vals AI benchmark for realistic financial analyst agent tasks across qualitative analysis, quantitative analysis, market work, comparables, precedents, earnings, disclosure, and modeling.",
      "paperUrl": "https://www.vals.ai/benchmarks/fabv2",
      "paperTitle": "Finance Agent v2",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Financial analyst task categories",
      "format": "Mean score across repeated runs",
      "difficulty": "Professional expert-task agent workflow",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/financeAgentV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/financeAgentV2.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "gdpvalRubrics",
      "name": "GDPval rubrics",
      "fullName": "GDPval rubrics",
      "description": "A display-only provider-table GDPval rubric score for economically valuable work tasks.",
      "paperUrl": "https://huggingface.co/MiniMaxAI/MiniMax-M3",
      "paperTitle": "MiniMax M3 model card",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Economically valuable work tasks",
      "format": "Rubric score",
      "difficulty": "Professional agentic workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/gdpvalRubrics",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gdpvalRubrics.md"
    },
    {
      "category": "agentic",
      "categoryLabel": "Agentic",
      "benchmarkKey": "bankerToolBench",
      "name": "BankerToolBench",
      "fullName": "BankerToolBench",
      "description": "A display-only provider benchmark for finance-oriented tool-use and agent workflows.",
      "paperUrl": "https://huggingface.co/MiniMaxAI/MiniMax-M3",
      "paperTitle": "MiniMax M3 model card",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Finance and banking tool-use tasks",
      "format": "Task success rate",
      "difficulty": "Professional finance-agent workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/bankerToolBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/bankerToolBench.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mmmu",
      "name": "MMMU",
      "fullName": "Massive Multi-discipline Multimodal Understanding",
      "description": "A broad multimodal reasoning benchmark spanning charts, diagrams, tables, and academic visual question answering.",
      "paperUrl": "https://arxiv.org/abs/2311.16502",
      "paperTitle": "MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI",
      "authors": "MMMU authors",
      "year": "2024",
      "tasks": "Multimodal academic reasoning",
      "format": "Image + text question answering",
      "difficulty": "Frontier multimodal",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/mmmu",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmmu.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mmmuPro",
      "name": "MMMU-Pro",
      "fullName": "Massive Multi-discipline Multimodal Understanding Pro",
      "description": "A harder multimodal benchmark for frontier models that combines text with images, diagrams, charts, and academic visual reasoning tasks.",
      "paperUrl": "https://arxiv.org/abs/2409.02813",
      "paperTitle": "MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding Benchmark",
      "authors": "MMMU-Pro authors",
      "year": "2024",
      "tasks": "Multimodal academic reasoning",
      "format": "Image + text question answering",
      "difficulty": "Frontier multimodal",
      "decimals": null,
      "weight": 0.45,
      "displayableScoreCount": 31,
      "url": "https://benchlm.ai/benchmarks/mmmuPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmmuPro.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "aaMmmuPro",
      "name": "AA-MMMU-Pro",
      "fullName": "Artificial Analysis MMMU-Pro",
      "description": "A display-only Artificial Analysis MMMU-Pro score.",
      "paperUrl": "https://artificialanalysis.ai/evaluations/mmmu-pro",
      "paperTitle": "Artificial Analysis MMMU-Pro Benchmark Leaderboard",
      "authors": "Artificial Analysis",
      "year": "2026",
      "tasks": "Multimodal academic reasoning",
      "format": "Image + text question answering",
      "difficulty": "Frontier multimodal",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 73,
      "url": "https://benchlm.ai/benchmarks/aaMmmuPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aaMmmuPro.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "ocrBenchV2",
      "name": "OCRBench V2",
      "fullName": "OCRBench V2",
      "description": "A native OCR benchmark for reading text from images across multilingual scripts, low-quality scans, handwriting, structured layouts, charts, and screenshots.",
      "paperUrl": "https://arxiv.org/abs/2501.00321",
      "paperTitle": "OCRBench v2: An Improved Benchmark for Evaluating Large Multimodal Models on Visual Text Localization and Reasoning",
      "authors": "OCRBench authors",
      "year": "2025",
      "tasks": "Image OCR tasks",
      "format": "Accuracy",
      "difficulty": "Native visual text understanding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/ocrBenchV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/ocrBenchV2.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "olmOcr",
      "name": "olmOCR",
      "fullName": "olmOCR-Bench",
      "description": "An end-to-end document understanding benchmark over long, layout-rich PDFs with tables, equations, headers, footnotes, and multi-column flows.",
      "paperUrl": "https://github.com/allenai/olmocr/tree/main/olmocr/bench",
      "paperTitle": "olmOCR-Bench",
      "authors": "Allen Institute for AI",
      "year": "2025",
      "tasks": "Layout-rich PDF understanding",
      "format": "Mean accuracy",
      "difficulty": "Complex document processing",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/olmOcr",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/olmOcr.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "voxPopuliWer",
      "name": "VoxPopuli WER",
      "fullName": "VoxPopuli-Cleaned-AA Word Error Rate",
      "description": "A speech-recognition benchmark on the cleaned Artificial Analysis VoxPopuli subset, reported as word error rate where lower is better.",
      "paperUrl": "https://huggingface.co/datasets/ArtificialAnalysis/VoxPopuli-Cleaned-AA",
      "paperTitle": "VoxPopuli-Cleaned-AA",
      "authors": "Artificial Analysis / VoxPopuli dataset authors",
      "year": "2026",
      "tasks": "Speech-to-text transcription",
      "format": "Word error rate",
      "difficulty": "Audio speech recognition",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/voxPopuliWer",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/voxPopuliWer.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "designArenaWebsite",
      "name": "Design Arena Website",
      "fullName": "Design Arena Website Elo",
      "description": "A display-only Design Arena website-generation Elo score surfaced on OpenRouter model benchmark pages.",
      "paperUrl": "https://openrouter.ai/x-ai/grok-4.3/benchmarks",
      "paperTitle": "OpenRouter Grok 4.3 benchmarks",
      "authors": "Design Arena",
      "year": "2026",
      "tasks": "Website generation comparisons",
      "format": "Elo",
      "difficulty": "Design and website generation",
      "decimals": 0,
      "weight": null,
      "displayableScoreCount": 70,
      "url": "https://benchlm.ai/benchmarks/designArenaWebsite",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/designArenaWebsite.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "officeQaPro",
      "name": "OfficeQA Pro",
      "fullName": "OfficeQA Pro",
      "description": "A benchmark for grounded reasoning over office-style documents, spreadsheets, charts, and business artifacts.",
      "paperUrl": "https://arxiv.org/abs/2603.08655",
      "paperTitle": "OfficeQA Pro: An Enterprise Benchmark for End-to-End Grounded Reasoning",
      "authors": "OfficeQA Pro authors",
      "year": "2026",
      "tasks": "Document and spreadsheet tasks",
      "format": "Grounded QA over office artifacts",
      "difficulty": "Enterprise grounded reasoning",
      "decimals": null,
      "weight": 0.3,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/officeQaPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/officeQaPro.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mmmuProPython",
      "name": "MMMU-Pro w/ Python",
      "fullName": "MMMU-Pro with Python",
      "description": "Tool-augmented MMMU-Pro variant that allows Python assistance during multimodal reasoning.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Multimodal academic reasoning",
      "format": "Image + text question answering with Python",
      "difficulty": "Frontier multimodal",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/mmmuProPython",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmmuProPython.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "omniDocBench15",
      "name": "OmniDocBench 1.5",
      "fullName": "OmniDocBench 1.5",
      "description": "A document understanding benchmark used in frontier-model comparison tables to measure extraction and grounded reasoning quality on complex documents.",
      "paperUrl": "https://openai.com/index/introducing-gpt-5-4-mini-and-nano/",
      "paperTitle": "Introducing GPT-5.4 mini and nano",
      "authors": "OpenAI",
      "year": "2026",
      "tasks": "Document understanding tasks",
      "format": "Document understanding benchmark",
      "difficulty": "Grounded document reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 3,
      "url": "https://benchlm.ai/benchmarks/omniDocBench15",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/omniDocBench15.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "liquidExtractJsonValidity",
      "name": "Liquid Extract JSON Validity",
      "fullName": "Liquid image-to-JSON extraction JSON validity",
      "description": "A display-only Liquid AI extraction metric measuring the share of image-to-JSON outputs that parse as strict JSON.",
      "paperUrl": "https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-Extract",
      "paperTitle": "LiquidAI LFM2.5-VL Extract model cards",
      "authors": "Liquid AI",
      "year": "2026",
      "tasks": "Image-to-JSON extraction",
      "format": "Strict JSON parseability rate",
      "difficulty": "Structured visual extraction",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/liquidExtractJsonValidity",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/liquidExtractJsonValidity.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "liquidExtractSchemaF1",
      "name": "Liquid Extract F1",
      "fullName": "Liquid image-to-JSON extraction schema consistency F1",
      "description": "A display-only Liquid AI extraction metric measuring field-name agreement between requested schema fields and extracted JSON fields.",
      "paperUrl": "https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-Extract",
      "paperTitle": "LiquidAI LFM2.5-VL Extract model cards",
      "authors": "Liquid AI",
      "year": "2026",
      "tasks": "Image-to-JSON extraction",
      "format": "Schema field F1",
      "difficulty": "Structured visual extraction",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/liquidExtractSchemaF1",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/liquidExtractSchemaF1.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "liquidExtractVlmJudge",
      "name": "Liquid Extract VLM Judge",
      "fullName": "Liquid image-to-JSON extraction VLM judge score",
      "description": "A display-only Liquid AI extraction metric measuring judged agreement between extracted values and the source image.",
      "paperUrl": "https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B-Extract",
      "paperTitle": "LiquidAI LFM2.5-VL Extract model cards",
      "authors": "Liquid AI",
      "year": "2026",
      "tasks": "Image-to-JSON extraction",
      "format": "VLM-judged extraction accuracy",
      "difficulty": "Structured visual extraction",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/liquidExtractVlmJudge",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/liquidExtractVlmJudge.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "realWorldQa",
      "name": "RealWorldQA",
      "fullName": "RealWorldQA",
      "description": "A grounded visual QA benchmark focused on answering practical questions about real-world images and scenes.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Real-world visual question answering",
      "format": "Image-grounded QA",
      "difficulty": "General visual reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/realWorldQa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/realWorldQa.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "videoMmeWithSub",
      "name": "Video-MME (with subtitle)",
      "fullName": "Video-MME with subtitle",
      "description": "A video understanding benchmark that allows subtitle access when answering multimodal questions about videos.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Video understanding",
      "format": "Video QA with subtitle context",
      "difficulty": "Multimodal video reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 5,
      "url": "https://benchlm.ai/benchmarks/videoMmeWithSub",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/videoMmeWithSub.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "videoMmeNoSub",
      "name": "Video-MME (w/o subtitle)",
      "fullName": "Video-MME without subtitle",
      "description": "A stricter Video-MME setting that removes subtitle help and tests video understanding from visual and audio context alone.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Video understanding",
      "format": "Video QA without subtitle context",
      "difficulty": "Multimodal video reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/videoMmeNoSub",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/videoMmeNoSub.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "videoMme",
      "name": "Video-MME",
      "fullName": "Video-MME",
      "description": "A comprehensive benchmark for multimodal large language models on video understanding, covering temporal reasoning, perception, and question answering over videos.",
      "paperUrl": "https://mme-benchmark.github.io/",
      "paperTitle": "Video-MME benchmark",
      "authors": "Video-MME benchmark team",
      "year": "2024",
      "tasks": "Video understanding",
      "format": "Video QA and analysis",
      "difficulty": "Broad multimodal video reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/videoMme",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/videoMme.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mathVision",
      "name": "MathVision",
      "fullName": "MathVision",
      "description": "A visual mathematics benchmark that tests whether a model can solve math problems grounded in diagrams, equations, figures, and other visual inputs.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Visually grounded math problems",
      "format": "Image + math reasoning",
      "difficulty": "Advanced multimodal mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 11,
      "url": "https://benchlm.ai/benchmarks/mathVision",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mathVision.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "weMath",
      "name": "We-Math",
      "fullName": "We-Math",
      "description": "A multimodal math benchmark for visually grounded mathematical reasoning and answer generation.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Visually grounded math problems",
      "format": "Multimodal mathematical reasoning",
      "difficulty": "Advanced multimodal mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/weMath",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/weMath.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "dynaMath",
      "name": "DynaMath",
      "fullName": "DynaMath",
      "description": "A multimodal benchmark for dynamic mathematical reasoning over visual and structured inputs.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Dynamic visual math problems",
      "format": "Multimodal mathematical reasoning",
      "difficulty": "Advanced multimodal mathematics",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/dynaMath",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/dynaMath.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mStar",
      "name": "MStar",
      "fullName": "MStar",
      "description": "A general visual question-answering benchmark used in provider tables for real-image reasoning quality.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Real-image visual QA",
      "format": "Image-grounded QA",
      "difficulty": "General visual reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/mStar",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mStar.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "chatCvqa",
      "name": "ChatCVQA",
      "fullName": "ChatCVQA",
      "description": "A conversational visual QA benchmark that tests multi-turn grounded answering over images and documents.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Conversational visual QA",
      "format": "Multi-turn image-grounded QA",
      "difficulty": "Conversational multimodal reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/chatCvqa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/chatCvqa.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mmLongBenchDoc",
      "name": "MMLongBench-Doc",
      "fullName": "MMLongBench-Doc",
      "description": "A long-document multimodal benchmark for grounded reasoning over extended document contexts.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Long document understanding",
      "format": "Document-grounded reasoning",
      "difficulty": "Long-context document reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/mmLongBenchDoc",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmLongBenchDoc.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "ccOcr",
      "name": "CC-OCR",
      "fullName": "CC-OCR",
      "description": "An OCR-focused benchmark for reading and extracting text from visually complex documents and images.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Optical character recognition",
      "format": "Text extraction from images and documents",
      "difficulty": "Document reading",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/ccOcr",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/ccOcr.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "ai2dTest",
      "name": "AI2D_TEST",
      "fullName": "AI2D test split",
      "description": "A diagram understanding benchmark focused on scientific and educational visual question answering.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Diagram understanding",
      "format": "Diagram-grounded QA",
      "difficulty": "Structured visual reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/ai2dTest",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/ai2dTest.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "countBench",
      "name": "CountBench",
      "fullName": "CountBench",
      "description": "A visual counting benchmark that tests whether a model can count objects and entities reliably in complex scenes.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Visual counting tasks",
      "format": "Image-grounded counting",
      "difficulty": "Fine-grained visual perception",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/countBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/countBench.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "refcocoAvg",
      "name": "RefCOCO (avg)",
      "fullName": "RefCOCO average",
      "description": "A referring-expression grounding benchmark averaged across RefCOCO variants to test whether a model can localize described objects correctly.",
      "paperUrl": "https://github.com/lichengunc/refer",
      "paperTitle": "RefCOCO referring expression datasets",
      "authors": "RefCOCO dataset authors",
      "year": "2016",
      "tasks": "Referring-expression grounding",
      "format": "Grounded visual localization",
      "difficulty": "Fine-grained visual grounding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/refcocoAvg",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/refcocoAvg.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "odinw13",
      "name": "ODINW13",
      "fullName": "ODINW13",
      "description": "A visual detection and grounding benchmark slice used to compare zero-shot object understanding across diverse domains.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Out-of-distribution object understanding",
      "format": "Detection and grounding",
      "difficulty": "Robust visual grounding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/odinw13",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/odinw13.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "erqa",
      "name": "ERQA",
      "fullName": "ERQA",
      "description": "A grounded visual reasoning benchmark focused on evidence-based question answering over real images.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Evidence-based visual QA",
      "format": "Grounded image reasoning",
      "difficulty": "Grounded multimodal reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/erqa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/erqa.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "videoMmmu",
      "name": "VideoMMMU",
      "fullName": "VideoMMMU",
      "description": "A video extension of MMMU-style multimodal reasoning over expert questions grounded in temporal media.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Video-grounded expert reasoning",
      "format": "Video + text reasoning",
      "difficulty": "Frontier multimodal video reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 9,
      "url": "https://benchlm.ai/benchmarks/videoMmmu",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/videoMmmu.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mlvuAvg",
      "name": "MLVU (M-Avg)",
      "fullName": "MLVU multiple-choice average",
      "description": "A multi-task video understanding benchmark averaged across MLVU categories.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "General video understanding",
      "format": "Video QA and understanding",
      "difficulty": "Broad multimodal video reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 3,
      "url": "https://benchlm.ai/benchmarks/mlvuAvg",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mlvuAvg.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mmvu",
      "name": "MMVU",
      "fullName": "Multimodal Multi-disciplinary Video Understanding",
      "description": "A benchmark for evaluating multimodal models on video understanding tasks across multiple disciplines, emphasizing temporal reasoning and comprehension over video content.",
      "paperUrl": "https://www.kimi.com/blog/kimi-k2-5.html",
      "paperTitle": "Kimi K2.5 benchmark release surface",
      "authors": "MMVU benchmark maintainers",
      "year": "2026",
      "tasks": "Video understanding",
      "format": "Video reasoning benchmark",
      "difficulty": "Multi-disciplinary multimodal video reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/mmvu",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmvu.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "screenSpotPro",
      "name": "ScreenSpot Pro",
      "fullName": "ScreenSpot Pro",
      "description": "A high-resolution GUI grounding benchmark for professional computer-use environments.",
      "paperUrl": "https://likaixin2000.github.io/papers/ScreenSpot_Pro.pdf",
      "paperTitle": "ScreenSpot-Pro: GUI Grounding for Professional High-Resolution Computer Use",
      "authors": "ScreenSpot-Pro authors",
      "year": "2025",
      "tasks": "GUI grounding tasks",
      "format": "Interface element localization",
      "difficulty": "Professional GUI grounding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 15,
      "url": "https://benchlm.ai/benchmarks/screenSpotPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/screenSpotPro.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "tirBench",
      "name": "TIR-Bench",
      "fullName": "TIR-Bench",
      "description": "A visual agent benchmark for interface reasoning and task execution over screenshots or software surfaces.",
      "paperUrl": "https://qwen.ai/blog?id=qwen3.6",
      "paperTitle": "Qwen3.6 launch benchmarks",
      "authors": "Qwen",
      "year": "2026",
      "tasks": "Visual agent and interface reasoning",
      "format": "Screenshot-grounded task reasoning",
      "difficulty": "Computer-use visual reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/tirBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/tirBench.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "gdpvalAa",
      "name": "GDPval-AA",
      "fullName": "GDPval-AA",
      "description": "An evaluation focused on professional domain expertise and task delivery quality in office-style knowledge work.",
      "paperUrl": "https://www.minimax.io/news/minimax-m27-en",
      "paperTitle": "MiniMax M2.7: Early Echoes of Self-Evolution",
      "authors": "MiniMax",
      "year": "2026",
      "tasks": "Professional office delivery",
      "format": "ELO-style office benchmark",
      "difficulty": "Professional knowledge work",
      "decimals": 0,
      "weight": null,
      "displayableScoreCount": 6,
      "url": "https://benchlm.ai/benchmarks/gdpvalAa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gdpvalAa.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "medXpertQaMm",
      "name": "MedXpertQA (MM)",
      "fullName": "MedXpertQA Multimodal",
      "description": "A multimodal medical multiple-choice benchmark covering clinical images such as X-rays, histology, and dermatology.",
      "paperUrl": "https://ai.meta.com/static-resource/muse-spark-eval-methodology",
      "paperTitle": "Muse Spark Eval Methodology",
      "authors": "Meta AI",
      "year": "2026",
      "tasks": "2,000 multimodal medical questions",
      "format": "Medical visual MCQ",
      "difficulty": "Clinical multimodal reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 7,
      "url": "https://benchlm.ai/benchmarks/medXpertQaMm",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/medXpertQaMm.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "zeroBench",
      "name": "ZeroBench",
      "fullName": "ZeroBench",
      "description": "A multi-step visual reasoning benchmark with pass@5 reporting and optional tool use.",
      "paperUrl": "https://ai.meta.com/static-resource/muse-spark-eval-methodology",
      "paperTitle": "Muse Spark Eval Methodology",
      "authors": "Meta AI",
      "year": "2026",
      "tasks": "100 visual reasoning questions",
      "format": "Multi-step visual reasoning",
      "difficulty": "Tool-augmented visual reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 3,
      "url": "https://benchlm.ai/benchmarks/zeroBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/zeroBench.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "design2Code",
      "name": "Design2Code",
      "fullName": "Design2Code",
      "description": "A multimodal coding benchmark for turning visual designs into working frontend implementations.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Design-to-code tasks",
      "format": "Visual input to frontend implementation",
      "difficulty": "Multimodal coding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/design2Code",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/design2Code.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "flameVlmCode",
      "name": "Flame-VLM-Code",
      "fullName": "Flame-VLM-Code",
      "description": "A vision-language coding benchmark for generating correct code from visual and multimodal inputs.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Multimodal coding tasks",
      "format": "Vision-language code generation",
      "difficulty": "Multimodal coding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/flameVlmCode",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/flameVlmCode.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "vision2Web",
      "name": "Vision2Web",
      "fullName": "Vision2Web",
      "description": "A benchmark for converting visual references into functional web implementations.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Screenshot-to-web tasks",
      "format": "Visual reference to web implementation",
      "difficulty": "Multimodal web generation",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/vision2Web",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/vision2Web.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "imageMining",
      "name": "ImageMining",
      "fullName": "ImageMining",
      "description": "A multimodal retrieval and extraction benchmark over image-heavy task settings.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Visual retrieval tasks",
      "format": "Image-grounded retrieval and extraction",
      "difficulty": "Multimodal retrieval",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/imageMining",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/imageMining.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mmSearch",
      "name": "MMSearch",
      "fullName": "MMSearch",
      "description": "A multimodal search benchmark for retrieval and grounded answering across mixed-media inputs.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Multimodal search tasks",
      "format": "Mixed-media retrieval and grounded answering",
      "difficulty": "Multimodal search",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/mmSearch",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmSearch.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "mmSearchPlus",
      "name": "MMSearch-Plus",
      "fullName": "MMSearch-Plus",
      "description": "A harder MMSearch variant for multimodal retrieval and grounded tool-use workflows.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Hard multimodal search tasks",
      "format": "Advanced mixed-media retrieval benchmark",
      "difficulty": "Advanced multimodal search",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 1,
      "url": "https://benchlm.ai/benchmarks/mmSearchPlus",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/mmSearchPlus.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "simpleVqa",
      "name": "SimpleVQA",
      "fullName": "SimpleVQA",
      "description": "A visual question answering benchmark focused on straightforward image-grounded understanding.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Visual QA tasks",
      "format": "Image-grounded question answering",
      "difficulty": "General visual understanding",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 8,
      "url": "https://benchlm.ai/benchmarks/simpleVqa",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/simpleVqa.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "factsVlm",
      "name": "Facts-VLM",
      "fullName": "Facts-VLM",
      "description": "A grounded multimodal factuality benchmark for evidence-linked answer correctness.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Grounded factuality tasks",
      "format": "Evidence-linked multimodal factuality",
      "difficulty": "Grounded multimodal factuality",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/factsVlm",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/factsVlm.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "vStar",
      "name": "V*",
      "fullName": "V*",
      "description": "A vision-centric benchmark for high-level multimodal reasoning and perception quality.",
      "paperUrl": "https://docs.z.ai/guides/vlm/glm-5v-turbo",
      "paperTitle": "GLM-5V-Turbo",
      "authors": "Z.AI",
      "year": "2026",
      "tasks": "Frontier multimodal reasoning tasks",
      "format": "Vision-centric reasoning benchmark",
      "difficulty": "Frontier multimodal",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 11,
      "url": "https://benchlm.ai/benchmarks/vStar",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/vStar.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "charxiv",
      "name": "CharXiv",
      "fullName": "CharXiv Reasoning",
      "description": "A scientific chart reasoning benchmark that tests whether models can understand, interpret, and reason about complex scientific visualizations including plots, diagrams, and data charts.",
      "paperUrl": "https://charxiv.github.io/",
      "paperTitle": "CharXiv: Charting Gaps in Realistic Chart Understanding in Multimodal LLMs",
      "authors": "CharXiv authors",
      "year": "2024",
      "tasks": "Scientific chart reasoning",
      "format": "Chart understanding and reasoning",
      "difficulty": "Scientific visualization reasoning",
      "decimals": null,
      "weight": 0.2,
      "displayableScoreCount": 24,
      "url": "https://benchlm.ai/benchmarks/charxiv",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/charxiv.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "charxivNoTools",
      "name": "CharXiv w/o tools",
      "fullName": "CharXiv Reasoning without tools",
      "description": "Tool-free variant of CharXiv that isolates raw visual reasoning ability without code execution or tool augmentation.",
      "paperUrl": "https://charxiv.github.io/",
      "paperTitle": "CharXiv: Charting Gaps in Realistic Chart Understanding in Multimodal LLMs",
      "authors": "CharXiv authors",
      "year": "2024",
      "tasks": "Scientific chart reasoning (tool-free)",
      "format": "Chart understanding without tools",
      "difficulty": "Scientific visualization reasoning",
      "decimals": null,
      "weight": 0.05,
      "displayableScoreCount": 4,
      "url": "https://benchlm.ai/benchmarks/charxivNoTools",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/charxivNoTools.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "sweMultimodal",
      "name": "SWE-bench Multimodal",
      "fullName": "SWE-bench Multimodal",
      "description": "A multimodal variant of SWE-bench that adds visual context (screenshots, design mockups) to software engineering issue descriptions, testing whether models can leverage visual information for code generation.",
      "paperUrl": "https://www.swebench.com/multimodal",
      "paperTitle": "SWE-bench Multimodal",
      "authors": "SWE-bench team",
      "year": "2025",
      "tasks": "Multimodal software engineering tasks",
      "format": "Code patch generation with visual context",
      "difficulty": "Frontier multimodal coding",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/sweMultimodal",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweMultimodal.md"
    },
    {
      "category": "multimodalGrounded",
      "categoryLabel": "Multimodal & Grounded",
      "benchmarkKey": "blueprintBench2",
      "name": "Blueprint-Bench 2",
      "fullName": "Blueprint-Bench 2",
      "description": "An agentic spatial reasoning benchmark reported as a normalized score.",
      "paperUrl": "https://x.com/GoogleDeepMind",
      "paperTitle": "Gemini 3.5 Flash launch screenshots",
      "authors": "Google DeepMind",
      "year": "2026",
      "tasks": "Spatial reasoning from blueprints",
      "format": "Normalized score",
      "difficulty": "Agentic spatial reasoning",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 3,
      "url": "https://benchlm.ai/benchmarks/blueprintBench2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/blueprintBench2.md"
    },
    {
      "category": "korean",
      "categoryLabel": "Korean",
      "benchmarkKey": "kmmlu",
      "name": "KMMLU",
      "fullName": "Korean Massive Multitask Language Understanding",
      "description": "Evaluates Korean expert-level knowledge across 45 subjects. 20% of questions require Korean cultural context.",
      "paperUrl": "https://arxiv.org/abs/2402.11548",
      "paperTitle": "KMMLU: Measuring Massive Multitask Language Understanding in Korean",
      "authors": "KMMLU Authors",
      "year": "2024",
      "tasks": "35,030 questions",
      "format": "Multiple choice questions",
      "difficulty": "Elementary to professional level in Korean",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/kmmlu",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kmmlu.md"
    },
    {
      "category": "korean",
      "categoryLabel": "korean",
      "benchmarkKey": "kmmluHard",
      "name": "KMMLU-Hard",
      "fullName": "KMMLU-Hard",
      "description": "A filtered hard subset of KMMLU containing ~5,000 questions that most models get wrong.",
      "paperUrl": "https://github.com/daekeun-ml/evaluate-llm-on-korean-dataset",
      "paperTitle": "Evaluating LLMs on Hard Korean Queries",
      "authors": "Daekeun ML",
      "year": "2025",
      "tasks": "~5,000 questions",
      "format": "Multiple choice questions",
      "difficulty": "Advanced Korean reasoning",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/kmmluHard",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kmmluHard.md"
    },
    {
      "category": "korean",
      "categoryLabel": "korean",
      "benchmarkKey": "kmmluRedux",
      "name": "KMMLU-Redux",
      "fullName": "KMMLU-Redux",
      "description": "Cleaned KMMLU from national technical qualification exams, with errors removed, decontaminated, and deduplicated.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": null,
      "tasks": "~3,500 questions",
      "format": "Technical multiple choice",
      "difficulty": "Industrial/technical",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/kmmluRedux",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kmmluRedux.md"
    },
    {
      "category": "korean",
      "categoryLabel": "korean",
      "benchmarkKey": "kmmluPro",
      "name": "KMMLU-Pro",
      "fullName": "KMMLU-Pro",
      "description": "Korean National Professional Licensure exams evaluating professional-grade knowledge.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": null,
      "tasks": "~2,500 questions",
      "format": "Professional licensure exams",
      "difficulty": "Professional",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/kmmluPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kmmluPro.md"
    },
    {
      "category": "korean",
      "categoryLabel": "korean",
      "benchmarkKey": "click",
      "name": "CLIcK",
      "fullName": "Cultural and Linguistic Intelligence in Korean",
      "description": "Evaluates Korean culture and linguistics.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": null,
      "tasks": "1,995 questions",
      "format": "Cultural/linguistic QA",
      "difficulty": "Korean cultural nuances",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/click",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/click.md"
    },
    {
      "category": "korean",
      "categoryLabel": "korean",
      "benchmarkKey": "kobalt",
      "name": "KoBALT",
      "fullName": "Korean Benchmark for Advanced Linguistic Tasks",
      "description": "Evaluates advanced Korean linguistic competence.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": null,
      "tasks": "Linguistics questions",
      "format": "Advanced linguistics",
      "difficulty": "Advanced linguistic phenomena",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/kobalt",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/kobalt.md"
    },
    {
      "category": "korean",
      "categoryLabel": "korean",
      "benchmarkKey": "koreanCsat",
      "name": "Korean CSAT",
      "fullName": "College Scholastic Ability Test (수능)",
      "description": "The Korean SAT exam.",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": null,
      "tasks": "Multi-subject exam",
      "format": "Standardized test",
      "difficulty": "High school to college level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/koreanCsat",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/koreanCsat.md"
    },
    {
      "category": "korean",
      "categoryLabel": "korean",
      "benchmarkKey": "hrm8k",
      "name": "HRM8K",
      "fullName": "HAE-RAE Math 8K",
      "description": "Korean mathematical reasoning (high-school to Olympiad level).",
      "paperUrl": null,
      "paperTitle": null,
      "authors": null,
      "year": null,
      "tasks": "8,011 instances",
      "format": "Math word problems",
      "difficulty": "Olympiad level",
      "decimals": null,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/hrm8k",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/hrm8k.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsIndex",
      "name": "Vals Index",
      "fullName": "Vals Index v1.1",
      "description": "Vals AI composite benchmark across finance and coding tasks, including Finance Agent v2, CorpFin v2, SWE-bench, Terminal-Bench 2.0, and Vibe Code Bench.",
      "paperUrl": "https://www.vals.ai/benchmarks/vals_index",
      "paperTitle": "Vals Index",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Finance and coding components",
      "format": "Composite score",
      "difficulty": "Private economic-work benchmark composite",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsIndex",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsIndex.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsMultimodalIndex",
      "name": "Vals Multimodal Index",
      "fullName": "Vals Multimodal Index v1.1",
      "description": "Vals AI multimodal composite across finance, coding, education, and mortgage-tax task families.",
      "paperUrl": "https://www.vals.ai/benchmarks/vals_multimodal_index",
      "paperTitle": "Vals Multimodal Index",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Finance, coding, education, and mortgage-tax components",
      "format": "Composite score",
      "difficulty": "Private multimodal economic-work benchmark composite",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsMultimodalIndex",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsMultimodalIndex.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsCorpFinV2",
      "name": "CorpFin v2",
      "fullName": "Vals CorpFin v2",
      "description": "Vals AI private benchmark for understanding long-context credit agreements.",
      "paperUrl": "https://www.vals.ai/benchmarks/corp_fin_v2",
      "paperTitle": "CorpFin v2",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Credit-agreement understanding tasks",
      "format": "Accuracy score",
      "difficulty": "Professional finance document reasoning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsCorpFinV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsCorpFinV2.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsMedCode",
      "name": "MedCode",
      "fullName": "Vals MedCode",
      "description": "Vals AI healthcare benchmark for whether models can support the medical billing process.",
      "paperUrl": "https://www.vals.ai/benchmarks/medcode",
      "paperTitle": "MedCode",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Medical billing support tasks",
      "format": "Accuracy score",
      "difficulty": "Professional healthcare administration",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsMedCode",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsMedCode.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsMedScribe",
      "name": "MedScribe",
      "fullName": "Vals MedScribe",
      "description": "Vals AI healthcare benchmark for whether models can support doctors with administrative work.",
      "paperUrl": "https://www.vals.ai/benchmarks/medscribe",
      "paperTitle": "MedScribe",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Medical administrative support tasks",
      "format": "Accuracy score",
      "difficulty": "Professional healthcare administration",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsMedScribe",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsMedScribe.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsMortgageTax",
      "name": "MortgageTax",
      "fullName": "Vals MortgageTax",
      "description": "Vals AI benchmark for mortgage and tax document reasoning, including semantic and numerical extraction task views.",
      "paperUrl": "https://www.vals.ai/benchmarks/mortgage_tax",
      "paperTitle": "MortgageTax",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Mortgage and tax extraction tasks",
      "format": "Accuracy score",
      "difficulty": "Professional mortgage-tax document reasoning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsMortgageTax",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsMortgageTax.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsProofBench",
      "name": "ProofBench",
      "fullName": "Vals ProofBench",
      "description": "Vals AI automated theorem-proving benchmark.",
      "paperUrl": "https://www.vals.ai/benchmarks/proof_bench",
      "paperTitle": "ProofBench",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Automated theorem proving",
      "format": "Accuracy score",
      "difficulty": "Formal proof reasoning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsProofBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsProofBench.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsLegalBench",
      "name": "LegalBench",
      "fullName": "Vals LegalBench",
      "description": "Vals AI legal benchmark with issue, rule, conclusion, interpretation, and rhetoric task views.",
      "paperUrl": "https://www.vals.ai/benchmarks/legal_bench",
      "paperTitle": "LegalBench",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Legal reasoning task views",
      "format": "Accuracy score",
      "difficulty": "Professional legal reasoning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsLegalBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsLegalBench.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsCaseLawV2",
      "name": "CaseLaw v2",
      "fullName": "Vals CaseLaw v2",
      "description": "Vals AI private question-answer benchmark over Canadian court cases.",
      "paperUrl": "https://www.vals.ai/benchmarks/case_law_v2",
      "paperTitle": "CaseLaw v2",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Canadian case-law question answering",
      "format": "Accuracy score",
      "difficulty": "Professional legal retrieval and reasoning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsCaseLawV2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsCaseLawV2.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "deepSwe",
      "name": "DeepSWE",
      "fullName": "DeepSWE",
      "description": "A long-horizon software engineering benchmark from Datacurve for measuring frontier coding agents on original tasks drawn from active open-source repositories.",
      "paperUrl": "https://deepswe.datacurve.ai/blog",
      "paperTitle": "DeepSWE benchmark blog",
      "authors": "Datacurve AI",
      "year": "2026",
      "tasks": "113 software engineering tasks across 91 repositories and 5 languages",
      "format": "Pass@1 with confidence interval, cost, time, and token metadata",
      "difficulty": "Long-horizon software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/deepSwe",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/deepSwe.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "sweMarathon",
      "name": "SWE-Marathon",
      "fullName": "SWE-Marathon",
      "description": "A long-horizon software engineering benchmark from Abundant AI with multi-hour tasks spanning library reproductions, full-stack product clones, and ML engineering.",
      "paperUrl": "https://www.swe-marathon.org/",
      "paperTitle": "SWE-Marathon: Can Agents Autonomously Complete Ultra-Long-Horizon Software Work?",
      "authors": "Abundant AI and BenchFlow",
      "year": "2026",
      "tasks": "20 multi-hour software engineering tasks",
      "format": "Task resolution and trajectory review",
      "difficulty": "Ultra-long-horizon software engineering",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/sweMarathon",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/sweMarathon.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "exploitBench",
      "name": "ExploitBench",
      "fullName": "ExploitBench v8-bench",
      "description": "A cybersecurity benchmark for evaluating LLM agents on full-control V8 exploit synthesis using 16 measured exploit capability flags.",
      "paperUrl": "https://exploitbench.ai/",
      "paperTitle": "ExploitBench",
      "authors": "Seunghyun Lee, David Brumley, Carnegie Mellon University",
      "year": "2026",
      "tasks": "V8 exploit synthesis runs",
      "format": "Capability coverage percentage over 16 flags",
      "difficulty": "Browser exploitation and cybersecurity",
      "decimals": 0,
      "weight": null,
      "displayableScoreCount": 2,
      "url": "https://benchlm.ai/benchmarks/exploitBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/exploitBench.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "gbaEval",
      "name": "GBA-Eval",
      "fullName": "GBA-Eval",
      "description": "An agentic coding benchmark that asks models to build a Game Boy Advance emulator from scratch and grades emulator behavior against procedural, audio, and gameplay tests.",
      "paperUrl": "https://gbaeval.com/",
      "paperTitle": "GBA-Eval",
      "authors": "Stephen Yang",
      "year": "2026",
      "tasks": "27 emulator test cases",
      "format": "Overall emulator score",
      "difficulty": "Long-horizon systems programming",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/gbaEval",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/gbaEval.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "caisTextLeaderboard",
      "name": "CAIS Text Leaderboard",
      "fullName": "CAIS AI Dashboard Text Capabilities Index",
      "description": "A Center for AI Safety dashboard view summarizing text capabilities across HLE, ARC-AGI-2, SWE-Bench Pro, and TextQuests.",
      "paperUrl": "https://dashboard.safe.ai/",
      "paperTitle": "CAIS AI Dashboard",
      "authors": "Center for AI Safety",
      "year": "2025",
      "tasks": "HLE, ARC-AGI-2, SWE-Bench Pro, and TextQuests",
      "format": "Average component score",
      "difficulty": "Composite frontier text capability",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/caisTextLeaderboard",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/caisTextLeaderboard.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "weirdMl",
      "name": "WeirdML",
      "fullName": "WeirdML v2",
      "description": "A machine-learning engineering benchmark that tests whether LLMs can train models on novel datasets, write PyTorch code, and improve through iterative feedback.",
      "paperUrl": "https://htihle.github.io/weirdml.html",
      "paperTitle": "WeirdML",
      "authors": "Havard Tveit Ihle",
      "year": "2026",
      "tasks": "17 novel ML engineering tasks",
      "format": "Average accuracy across tasks",
      "difficulty": "Novel dataset modeling and iterative debugging",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/weirdMl",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/weirdMl.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "aleBench",
      "name": "ALE-Bench",
      "fullName": "Agents Last Exam",
      "description": "A benchmark for agentic professional workflows with verifiable success criteria, reporting pass rates and partial scores for model plus agent-harness rows.",
      "paperUrl": "https://agents-last-exam.org/leaderboard",
      "paperTitle": "Agents Last Exam",
      "authors": "UC Berkeley RDI",
      "year": "2026",
      "tasks": "152 ALE-V1 professional workflow tasks across 13 top-level domains",
      "format": "Pass rate, partial-credit score, cost, token, and duration metadata",
      "difficulty": "Real-world agentic workflows",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/aleBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/aleBench.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "runescapeBench",
      "name": "RuneScape-Bench",
      "fullName": "RuneBench / runescape-bench",
      "description": "An agentic coding benchmark where models use a TypeScript SDK to play a RuneScape-like environment and optimize skill-training performance.",
      "paperUrl": "https://maxbittker.github.io/runebench/",
      "paperTitle": "RuneBench",
      "authors": "Max Bittker",
      "year": "2026",
      "tasks": "16 RuneScape skill-training tasks",
      "format": "Average log XP-rate score",
      "difficulty": "Agentic gameplay automation",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/runescapeBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/runescapeBench.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "tolokaArena",
      "name": "Toloka Arena",
      "fullName": "Toloka Arena",
      "description": "An independent agentic-intelligence evaluation from Toloka using private simulated workflows and a pass^5 metric.",
      "paperUrl": "https://toloka.ai/arena",
      "paperTitle": "Toloka Arena",
      "authors": "Toloka",
      "year": "2026",
      "tasks": "Private simulated enterprise workflows",
      "format": "pass^5 arena score",
      "difficulty": "Agentic workflow reliability",
      "decimals": 1,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/tolokaArena",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/tolokaArena.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsSweBench",
      "name": "Vals SWE-bench mirror",
      "fullName": "Vals-hosted SWE-bench mirror",
      "description": "Vals AI hosted SWE-bench view for solving production software engineering tasks.",
      "paperUrl": "https://www.vals.ai/benchmarks/swebench",
      "paperTitle": "Vals SWE-bench",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Software engineering issue-resolution tasks",
      "format": "Accuracy score",
      "difficulty": "Production software engineering",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsSweBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsSweBench.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsTerminalBench2",
      "name": "Vals Terminal-Bench 2.0 mirror",
      "fullName": "Vals-hosted Terminal-Bench 2.0 mirror",
      "description": "Vals AI hosted Terminal-Bench 2.0 view with easy, medium, and hard task splits.",
      "paperUrl": "https://www.vals.ai/benchmarks/terminal-bench-2",
      "paperTitle": "Vals Terminal-Bench 2.0",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Terminal task difficulty splits",
      "format": "Accuracy score",
      "difficulty": "Terminal-based agent execution",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsTerminalBench2",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsTerminalBench2.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsLiveCodeBench",
      "name": "Vals LiveCodeBench mirror",
      "fullName": "Vals-hosted LiveCodeBench mirror",
      "description": "Vals AI implementation of LiveCodeBench with easy, medium, and hard task splits.",
      "paperUrl": "https://www.vals.ai/benchmarks/lcb",
      "paperTitle": "Vals LiveCodeBench",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "Coding problem difficulty splits",
      "format": "Accuracy score",
      "difficulty": "Contamination-resistant coding problems",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsLiveCodeBench",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsLiveCodeBench.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsGpqaDiamond",
      "name": "Vals GPQA Diamond mirror",
      "fullName": "Vals-hosted GPQA Diamond mirror",
      "description": "Vals AI hosted GPQA Diamond view with few-shot and zero-shot chain-of-thought task splits.",
      "paperUrl": "https://www.vals.ai/benchmarks/gpqa",
      "paperTitle": "Vals GPQA Diamond",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "GPQA Diamond task splits",
      "format": "Accuracy score",
      "difficulty": "Graduate science reasoning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsGpqaDiamond",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsGpqaDiamond.md"
    },
    {
      "category": "external",
      "categoryLabel": "external",
      "benchmarkKey": "valsMmluPro",
      "name": "Vals MMLU-Pro mirror",
      "fullName": "Vals-hosted MMLU-Pro mirror",
      "description": "Vals AI hosted MMLU-Pro view with subject-level task splits.",
      "paperUrl": "https://www.vals.ai/benchmarks/mmlu_pro",
      "paperTitle": "Vals MMLU-Pro",
      "authors": "Vals AI",
      "year": "2026",
      "tasks": "MMLU-Pro subject splits",
      "format": "Accuracy score",
      "difficulty": "Professional academic reasoning",
      "decimals": 2,
      "weight": null,
      "displayableScoreCount": 0,
      "url": "https://benchlm.ai/benchmarks/valsMmluPro",
      "markdownUrl": "https://benchlm.ai/md/benchmarks/valsMmluPro.md"
    }
  ]
}
