diff --git a/doc/progress.rst b/doc/progress.rst
index a9f1e2f2a..6a5b38096 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -19,6 +19,7 @@ Changelog
 * MAINT #897: Dropping support for Python 3.5.
 * ADD #894: Support caching of datasets using feather format as an option.
 * ADD #945: PEP 561 compliance for distributing Type information
+* MAINT #371: ``list_evaluations`` default ``size`` changed from ``None`` to ``10_000``.

 0.10.2
 ~~~~~~
diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py
index 687d973c2..8b225125b 100644
--- a/examples/40_paper/2018_ida_strang_example.py
+++ b/examples/40_paper/2018_ida_strang_example.py
@@ -47,7 +47,7 @@
 # Downloads all evaluation records related to this study
 evaluations = openml.evaluations.list_evaluations(
-    measure, flows=flow_ids, study=study_id, output_format="dataframe"
+    measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe"
 )
 # gives us a table with columns data_id, flow1_value, flow2_value
 evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna()
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index 4c17f8ce7..b3fdd0aa0 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -16,7 +16,7 @@
 def list_evaluations(
     function: str,
     offset: Optional[int] = None,
-    size: Optional[int] = None,
+    size: Optional[int] = 10000,
     tasks: Optional[List[Union[str, int]]] = None,
     setups: Optional[List[Union[str, int]]] = None,
     flows: Optional[List[Union[str, int]]] = None,
@@ -38,8 +38,9 @@ def list_evaluations(
     function : str
         the evaluation function. e.g., predictive_accuracy
     offset : int, optional
         the number of runs to skip, starting from the first
-    size : int, optional
-        the maximum number of runs to show
+    size : int, default 10000
+        The maximum number of runs to show.
+        If set to ``None``, it returns all the results.
     tasks : list[int,str], optional
         the list of task IDs
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index 0127309a7..e4de9b03c 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -41,7 +41,9 @@ def test_evaluation_list_filter_task(self):
         task_id = 7312
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy", tasks=[task_id])
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=110, tasks=[task_id]
+        )
         self.assertGreater(len(evaluations), 100)
         for run_id in evaluations.keys():
@@ -56,7 +58,7 @@ def test_evaluation_list_filter_uploader_ID_16(self):
         uploader_id = 16
         evaluations = openml.evaluations.list_evaluations(
-            "predictive_accuracy", uploaders=[uploader_id], output_format="dataframe"
+            "predictive_accuracy", size=60, uploaders=[uploader_id], output_format="dataframe"
         )
         self.assertEqual(evaluations["uploader"].unique(), [uploader_id])
@@ -66,7 +68,9 @@ def test_evaluation_list_filter_uploader_ID_10(self):
         openml.config.server = self.production_server
         setup_id = 10
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy", setups=[setup_id])
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=60, setups=[setup_id]
+        )
         self.assertGreater(len(evaluations), 50)
         for run_id in evaluations.keys():
@@ -81,7 +85,9 @@ def test_evaluation_list_filter_flow(self):
         flow_id = 100
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy", flows=[flow_id])
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=10, flows=[flow_id]
+        )
         self.assertGreater(len(evaluations), 2)
         for run_id in evaluations.keys():
@@ -96,7 +102,9 @@ def test_evaluation_list_filter_run(self):
         run_id = 12
-        evaluations = openml.evaluations.list_evaluations("predictive_accuracy", runs=[run_id])
+        evaluations = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=2, runs=[run_id]
+        )
         self.assertEqual(len(evaluations), 1)
         for run_id in evaluations.keys():
@@ -164,7 +172,7 @@ def test_evaluation_list_sort(self):
         task_id = 6
         # Get all evaluations of the task
         unsorted_eval = openml.evaluations.list_evaluations(
-            "predictive_accuracy", offset=0, tasks=[task_id]
+            "predictive_accuracy", size=None, offset=0, tasks=[task_id]
         )
         # Get top 10 evaluations of the same task
         sorted_eval = openml.evaluations.list_evaluations(
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
index b3adfc9d6..993771c90 100644
--- a/tests/test_study/test_study_functions.py
+++ b/tests/test_study/test_study_functions.py
@@ -152,7 +152,9 @@ def test_publish_study(self):
         self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
         # test whether the list evaluation function also handles study data fine
-        run_ids = openml.evaluations.list_evaluations("predictive_accuracy", study=study.id)
+        run_ids = openml.evaluations.list_evaluations(
+            "predictive_accuracy", size=None, study=study.id
+        )
         self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
         # attach more runs
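
A minimal usage sketch of the changed default, assuming ``openml`` with the patch above applied; the task ID (borrowed from the tests) and variable names are illustrative only.

import openml

# With the new default (size=10000), at most 10,000 evaluation records are returned.
capped = openml.evaluations.list_evaluations(
    "predictive_accuracy", tasks=[6], output_format="dataframe"
)

# Passing size=None keeps the old behaviour and downloads every matching record.
full = openml.evaluations.list_evaluations(
    "predictive_accuracy", size=None, tasks=[6], output_format="dataframe"
)

print(len(capped), len(full))  # len(capped) never exceeds 10,000

Callers that previously relied on the implicit "fetch everything" behaviour, as in the example script and tests patched above, now need the explicit ``size=None``.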