langchain-ai · Lance Martin (rlancemartin) · Aug 8, 2025 · Aug 7, 2025 · Aug 8, 2025
diff --git a/README.md b/README.md
@@ -8,6 +8,8 @@ Deep research has broken out as one of the most popular agent applications. This
 
 ### 🔥 Recent Updates
 
+**August 7, 2025**: Added support for GPT-5 models and updated the Deep Research Bench evaluation to use GPT-5 as the research model.
+
 **August 2, 2025**: Achieved #6 ranking on the [Deep Research Bench Leaderboard](https://huggingface.co/spaces/Ayanami0730/DeepResearch-Leaderboard) with an overall score of 0.4344. 
 
 **July 30, 2025**: Read about the evolution from our original implementations to the current version in our [blog post](https://rlancemartin.github.io/2025/07/30/bitter_lesson/).
@@ -103,9 +105,10 @@ This creates `tests/expt_results/deep_research_bench_model-name.jsonl` with the
 
 | Name | Commit | Summarization | Research | Compression | Total Cost | Total Tokens | RACE Score | Experiment |
 |------|--------|---------------|----------|-------------|------------|--------------|------------|------------|
+| GPT-5 | [168](https://github.com/langchain-ai/open_deep_research/pull/168/commits) | openai:gpt-4.1-mini | openai:gpt-5 | openai:gpt-4.1 | TBD | 204,640,896 | 0.4932 | [Link](https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/6e4766ca-613c-4bda-8bde-f64f0422bbf3/compare?selectedSessions=4d5941c8-69ce-4f3d-8b3e-e3c99dfbd4cc&baseline=undefined) |
 | Defaults | [6532a41](https://github.com/langchain-ai/open_deep_research/commit/6532a4176a93cc9bb2102b3d825dcefa560c85d9) | openai:gpt-4.1-mini | openai:gpt-4.1 | openai:gpt-4.1 | $45.98 | 58,015,332 | 0.4309 | [Link](https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/6e4766ca-6[…]ons=cf4355d7-6347-47e2-a774-484f290e79bc&baseline=undefined) |
 | Claude Sonnet 4 | [f877ea9](https://github.com/langchain-ai/open_deep_research/pull/163/commits/f877ea93641680879c420ea991e998b47aab9bcc) | openai:gpt-4.1-mini | anthropic:claude-sonnet-4-20250514 | openai:gpt-4.1 | $187.09 | 138,917,050 | 0.4401 | [Link](https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/6e4766ca-6[…]ons=04f6002d-6080-4759-bcf5-9a52e57449ea&baseline=undefined) |
-| Deep Research Bench Submission | [c0a160b](https://github.com/langchain-ai/open_deep_research/commit/c0a160b57a9b5ecd4b8217c3811a14d8eff97f72) | openai:gpt-4.1-nano | openai:gpt-4.1 | openai:gpt-4.1 | $87.83 | 207,005,549 | 0.4344 | [Link](https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/6e4766ca-6[…]ons=e6647f74-ad2f-4cb9-887e-acb38b5f73c0&baseline=undefined) | 
+| Deep Research Bench Submission | [c0a160b](https://github.com/langchain-ai/open_deep_research/commit/c0a160b57a9b5ecd4b8217c3811a14d8eff97f72) | openai:gpt-4.1-nano | openai:gpt-4.1 | openai:gpt-4.1 | $87.83 | 207,005,549 | 0.4344 | [Link](https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/6e4766ca-6[…]ons=e6647f74-ad2f-4cb9-887e-acb38b5f73c0&baseline=undefined) |
 
 ### 🚀 Deployments and Usage
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -11,13 +11,13 @@ requires-python = ">=3.10"
 dependencies = [
     "langgraph>=0.5.4",
     "langchain-community>=0.3.9",
-    "langchain-openai>=0.3.7",
+    "langchain-openai>=0.3.28",
     "langchain-anthropic>=0.3.15",
     "langchain-mcp-adapters>=0.1.6",
     "langchain-deepseek>=0.1.2",
     "langchain-tavily",
     "langchain-groq>=0.2.4",
-    "openai>=1.61.0",
+    "openai>=1.99.2",
     "tavily-python>=0.5.0",
     "arxiv>=2.1.3",
     "pymupdf>=1.25.3",

diff --git a/tests/expt_results/deep_research_bench_gpt-5.jsonl b/tests/expt_results/deep_research_bench_gpt-5.jsonl
diff --git a/tests/run_evaluate.py b/tests/run_evaluate.py
@@ -22,7 +22,7 @@
 max_react_tool_calls = 10
 summarization_model = "openai:gpt-4.1-mini"
 summarization_model_max_tokens = 8192
-research_model = "openai:gpt-4.1" # "anthropic:claude-sonnet-4-20250514"
+research_model = "openai:gpt-5" # "anthropic:claude-sonnet-4-20250514"
 research_model_max_tokens = 10000
 compression_model = "openai:gpt-4.1"
 compression_model_max_tokens = 10000
@@ -65,7 +65,7 @@ async def main():
         target,
         data=dataset_name,
         evaluators=evaluators,
-        experiment_prefix=f"ODR GPT-4.1, Tavily Search, Fix Max Supervisor Iterations",
+        experiment_prefix=f"ODR GPT-5, Tavily Search",
         max_concurrency=10,
         metadata={
             "max_structured_output_retries": max_structured_output_retries,

diff --git a/uv.lock b/uv.lock