diff --git a/doc/progress.rst b/doc/progress.rst
index 4b227cd2f..355e4f058 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -8,6 +8,7 @@ Changelog

 0.10.0
 ~~~~~~
+* FIX #838: Fix list_evaluations_setups to work when the number of evaluations is not a multiple of 100.
 * ADD #737: Add list_evaluations_setups to return hyperparameters along with list of evaluations.
 * FIX #261: Test server is cleared of all files uploaded during unit testing.
 * FIX #447: All files created by unit tests no longer persist in local.
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
index 044f49370..8de69ebc1 100644
--- a/openml/evaluations/functions.py
+++ b/openml/evaluations/functions.py
@@ -324,14 +324,17 @@ def list_evaluations_setups(
     evals = list_evaluations(function=function, offset=offset, size=size, run=run, task=task,
                              setup=setup, flow=flow, uploader=uploader, tag=tag,
                              per_fold=per_fold, sort_order=sort_order, output_format='dataframe')
-    # List setups
-    # Split setups in evals into chunks of N setups as list_setups does not support large size
+    # list_setups by setup id does not support large sizes (exceeds the URL length limit), hence
+    # we split the unique setup ids returned by list_evaluations into chunks of at most N ids
     df = pd.DataFrame()
     if len(evals) != 0:
-        N = 100
-        setup_chunks = np.split(evals['setup_id'].unique(),
-                                ((len(evals['setup_id'].unique()) - 1) // N) + 1)
+        N = 100  # maximum chunk size
+        length = len(evals['setup_id'].unique())  # number of unique setup ids to split
+        # np.array_split, unlike np.split, does not require the sections to divide the array
+        # evenly; splitting into ceil(length / N) sections yields chunks of size at most N
+        setup_chunks = np.array_split(ary=evals['setup_id'].unique(),
+                                      indices_or_sections=((length - 1) // N) + 1)
         setups = pd.DataFrame()
         for setup in setup_chunks:
             result = pd.DataFrame(openml.setups.list_setups(setup=setup,
                                                             output_format='dataframe'))
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index 7dac00891..21e61c471 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -6,18 +6,20 @@ class TestEvaluationFunctions(TestBase):
     _multiprocess_can_split_ = True

-    def _check_list_evaluation_setups(self, size, **kwargs):
+    def _check_list_evaluation_setups(self, **kwargs):
         evals_setups = openml.evaluations.list_evaluations_setups("predictive_accuracy",
-                                                                  **kwargs, size=size,
+                                                                  **kwargs,
                                                                   sort_order='desc',
                                                                   output_format='dataframe')
         evals = openml.evaluations.list_evaluations("predictive_accuracy",
-                                                    **kwargs, size=size,
+                                                    **kwargs,
                                                     sort_order='desc',
                                                     output_format='dataframe')

         # Check if list is non-empty
         self.assertGreater(len(evals_setups), 0)
+        # Check that both listings return the same number of rows
+        self.assertEqual(len(evals_setups), len(evals))
         # Check if output from sort is sorted in the right order
         self.assertSequenceEqual(sorted(evals_setups['value'].tolist(), reverse=True),
                                  evals_setups['value'].tolist())
@@ -176,7 +178,7 @@ def test_list_evaluations_setups_filter_flow(self):
         openml.config.server = self.production_server
         flow_id = [405]
         size = 100
-        evals = self._check_list_evaluation_setups(size, flow=flow_id)
+        evals = self._check_list_evaluation_setups(flow=flow_id, size=size)
         # check if parameters in separate columns works
         evals_cols = openml.evaluations.list_evaluations_setups("predictive_accuracy",
                                                                 flow=flow_id, size=size,
@@ -191,5 +193,5 @@ def test_list_evaluations_setups_filter_flow(self):
     def test_list_evaluations_setups_filter_task(self):
         openml.config.server = self.production_server
         task_id = [6]
-        size = 100
-        self._check_list_evaluation_setups(size, task=task_id)
+        size = 121
+        self._check_list_evaluation_setups(task=task_id, size=size)
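
Note: the core of the fix is swapping np.split for np.array_split. The snippet below is a minimal standalone sketch, not part of the patch, illustrating the behaviour the new comments describe: np.split raises a ValueError when the requested number of sections does not divide the array evenly (e.g. 121 setup ids with a chunk size of 100, the case exercised by the new size=121 test), while np.array_split tolerates the uneven division and keeps every chunk at or below the chunk size.

    import numpy as np

    setup_ids = np.arange(121)  # e.g. 121 unique setup ids, not a multiple of 100
    N = 100                     # maximum number of ids per list_setups request
    n_sections = ((len(setup_ids) - 1) // N) + 1  # ceil(121 / 100) == 2

    # np.split requires an equal division into 2 sections and fails here
    try:
        np.split(setup_ids, n_sections)
    except ValueError as e:
        print("np.split failed:", e)

    # np.array_split tolerates the uneven division: chunks of 61 and 60 ids, both <= N
    chunks = np.array_split(setup_ids, n_sections)
    print([len(c) for c in chunks])  # [61, 60]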