diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh index 0a1f94df6..504d15bbd 100644 --- a/ci_scripts/test.sh +++ b/ci_scripts/test.sh @@ -19,7 +19,7 @@ run_tests() { cd $TEST_DIR if [[ "$COVERAGE" == "true" ]]; then - PYTEST_ARGS='--cov=openml' + PYTEST_ARGS='--cov=openml --long' else PYTEST_ARGS='' fi diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 84943b244..28bde17f6 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -347,7 +347,7 @@ def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]: dict A dictionary with items {did: bool} """ - dataset_list = list_datasets(status="all") + dataset_list = list_datasets(status="all", data_id=dataset_ids) active = {} for did in dataset_ids: diff --git a/tests/conftest.py b/tests/conftest.py index 461a513fd..6a66d4ed9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,6 +25,7 @@ import os import logging from typing import List +import pytest import openml from openml.testing import TestBase @@ -182,3 +183,17 @@ def pytest_sessionfinish() -> None: logger.info("Local files deleted") logger.info("{} is killed".format(worker)) + + +def pytest_addoption(parser): + parser.addoption( + "--long", + action="store_true", + default=False, + help="Run the long version of tests which support both short and long scenarios.", + ) + + +@pytest.fixture(scope="class") +def long_version(request): + request.cls.long_version = request.config.getoption("--long") diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index e4de9b03c..70f36ce19 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -1,10 +1,12 @@ # License: BSD 3-Clause +import pytest import openml import openml.evaluations from openml.testing import TestBase +@pytest.mark.usefixtures("long_version") class TestEvaluationFunctions(TestBase): _multiprocess_can_split_ = True @@ -27,6 +29,10 @@ def _check_list_evaluation_setups(self, **kwargs): # Check if output and order of list_evaluations is preserved self.assertSequenceEqual(evals_setups["run_id"].tolist(), evals["run_id"].tolist()) + + if not self.long_version: + evals_setups = evals_setups.head(1) + # Check if the hyper-parameter column is as accurate and flow_id for index, row in evals_setups.iterrows(): params = openml.runs.get_run(row["run_id"]).parameter_settings diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 12af05ffe..69771ee01 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -2,18 +2,22 @@ from collections import OrderedDict import copy +import functools import unittest +from unittest.mock import patch from distutils.version import LooseVersion import sklearn from sklearn import ensemble import pandas as pd +import pytest import openml from openml.testing import TestBase import openml.extensions.sklearn +@pytest.mark.usefixtures("long_version") class TestFlowFunctions(TestBase): _multiprocess_can_split_ = True @@ -334,20 +338,27 @@ def test_get_flow_reinstantiate_model_wrong_version(self): assert "0.19.1" not in flow.dependencies def test_get_flow_id(self): - clf = sklearn.tree.DecisionTreeClassifier() - flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() - - self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id) - flow_ids = openml.flows.get_flow_id(model=clf, 
exact_version=False) - self.assertIn(flow.flow_id, flow_ids) - self.assertGreater(len(flow_ids), 2) - - # Check that the output of get_flow_id is identical if only the name is given, no matter - # whether exact_version is set to True or False. - flow_ids_exact_version_True = openml.flows.get_flow_id(name=flow.name, exact_version=True) - flow_ids_exact_version_False = openml.flows.get_flow_id( - name=flow.name, exact_version=False, - ) - self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False) - self.assertIn(flow.flow_id, flow_ids_exact_version_True) - self.assertGreater(len(flow_ids_exact_version_True), 2) + if self.long_version: + list_all = openml.utils._list_all + else: + list_all = functools.lru_cache()(openml.utils._list_all) + with patch("openml.utils._list_all", list_all): + clf = sklearn.tree.DecisionTreeClassifier() + flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() + + self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id) + flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) + self.assertIn(flow.flow_id, flow_ids) + self.assertGreater(len(flow_ids), 2) + + # Check that the output of get_flow_id is identical if only the name is given, no matter + # whether exact_version is set to True or False. + flow_ids_exact_version_True = openml.flows.get_flow_id( + name=flow.name, exact_version=True + ) + flow_ids_exact_version_False = openml.flows.get_flow_id( + name=flow.name, exact_version=False, + ) + self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False) + self.assertIn(flow.flow_id, flow_ids_exact_version_True) + self.assertGreater(len(flow_ids_exact_version_True), 2) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 89f01c72e..c4628c452 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -10,7 +10,6 @@ import unittest.mock import numpy as np -import pytest import openml import openml.exceptions @@ -335,7 +334,7 @@ def _check_sample_evaluations( for sample in range(num_sample_entrees): evaluation = sample_evaluations[measure][rep][fold][sample] self.assertIsInstance(evaluation, float) - if not os.environ.get("CI_WINDOWS"): + if not (os.environ.get("CI_WINDOWS") or os.name == "nt"): # Either Appveyor is much faster than Travis # and/or measurements are not as accurate. 
# Either way, windows seems to get an eval-time @@ -682,6 +681,8 @@ def test_run_and_upload_randomsearch(self): flow_expected_rsv="12172", ) self.assertEqual(len(run.trace.trace_iterations), 5) + trace = openml.runs.get_run_trace(run.run_id) + self.assertEqual(len(trace.trace_iterations), 5) def test_run_and_upload_maskedarrays(self): # This testcase is important for 2 reasons: @@ -828,31 +829,12 @@ def _test_local_evaluations(self, run): self.assertGreaterEqual(alt_scores[idx], 0) self.assertLessEqual(alt_scores[idx], 1) - @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", - reason="SimpleImputer doesn't handle mixed type DataFrame as input", - ) def test_local_run_swapped_parameter_order_model(self): + clf = DecisionTreeClassifier() + australian_task = 595 + task = openml.tasks.get_task(australian_task) - # construct sci-kit learn classifier - clf = Pipeline( - steps=[ - ( - "imputer", - make_pipeline( - SimpleImputer(strategy="most_frequent"), - OneHotEncoder(handle_unknown="ignore"), - ), - ), - # random forest doesn't take categoricals - ("estimator", RandomForestClassifier()), - ] - ) - - # download task - task = openml.tasks.get_task(7) - - # invoke OpenML run + # task and clf are purposely in the old order run = openml.runs.run_model_on_task( task, clf, avoid_duplicate_runs=False, upload_flow=False, ) @@ -950,55 +932,6 @@ def test_initialize_model_from_run(self): self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"most_frequent"') self.assertEqual(flowS.components["VarianceThreshold"].parameters["threshold"], "0.05") - @pytest.mark.flaky() - def test_get_run_trace(self): - # get_run_trace is already tested implicitly in test_run_and_publish - # this test is a bit additional. - num_iterations = 10 - num_folds = 1 - task_id = 119 - - task = openml.tasks.get_task(task_id) - - # IMPORTANT! Do not sentinel this flow. is faster if we don't wait - # on openml server - clf = RandomizedSearchCV( - RandomForestClassifier(random_state=42, n_estimators=5), - { - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], - }, - num_iterations, - random_state=42, - cv=3, - ) - - # [SPEED] make unit test faster by exploiting run information - # from the past - try: - # in case the run did not exists yet - run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=True,) - - self.assertEqual( - len(run.trace.trace_iterations), num_iterations * num_folds, - ) - run = run.publish() - TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) - self._wait_for_processed_run(run.run_id, 400) - run_id = run.run_id - except openml.exceptions.OpenMLRunsExistError as e: - # The only error we expect, should fail otherwise. - run_ids = [int(run_id) for run_id in e.run_ids] - self.assertGreater(len(run_ids), 0) - run_id = random.choice(list(run_ids)) - - # now the actual unit test ... 
- run_trace = openml.runs.get_run_trace(run_id) - self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds) - @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 5f9b65495..1e7642b35 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -110,7 +110,7 @@ def test_list_tasks_paginate(self): self._check_task(tasks[tid]) def test_list_tasks_per_type_paginate(self): - size = 10 + size = 40 max = 100 task_types = [ TaskType.SUPERVISED_CLASSIFICATION, diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 9729100bb..b5ef7b2bf 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -11,7 +11,6 @@ class OpenMLTaskTest(TestBase): _multiprocess_can_split_ = True - _batch_size = 25 def mocked_perform_api_call(call, request_method): # TODO: JvR: Why is this not a staticmethod? @@ -33,7 +32,7 @@ def test_list_all_few_results_available(self, _perform_api_call): def test_list_all_for_datasets(self): required_size = 127 # default test server reset value - datasets = openml.datasets.list_datasets(batch_size=self._batch_size, size=required_size) + datasets = openml.datasets.list_datasets(batch_size=100, size=required_size) self.assertEqual(len(datasets), required_size) for did in datasets: @@ -53,13 +52,13 @@ def test_list_datasets_with_high_size_parameter(self): def test_list_all_for_tasks(self): required_size = 1068 # default test server reset value - tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size) + tasks = openml.tasks.list_tasks(batch_size=1000, size=required_size) self.assertEqual(len(tasks), required_size) def test_list_all_for_flows(self): required_size = 15 # default test server reset value - flows = openml.flows.list_flows(batch_size=self._batch_size, size=required_size) + flows = openml.flows.list_flows(batch_size=25, size=required_size) self.assertEqual(len(flows), required_size) @@ -73,7 +72,7 @@ def test_list_all_for_setups(self): def test_list_all_for_runs(self): required_size = 21 - runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size) + runs = openml.runs.list_runs(batch_size=25, size=required_size) # might not be on test server after reset, please rerun test at least once if fails self.assertEqual(len(runs), required_size)
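
Editorial note (illustrative, not part of the diff): the new `--long` pytest option and the class-scoped `long_version` fixture added in tests/conftest.py let a test class run a reduced workload by default and the full workload on CI, where ci_scripts/test.sh now passes `--long` alongside `--cov=openml`. A minimal sketch of how a test class opts in follows; the class and test names are hypothetical.

    # Sketch only: TestSomethingExpensive / test_many_cases are made-up names.
    import unittest

    import pytest


    @pytest.mark.usefixtures("long_version")
    class TestSomethingExpensive(unittest.TestCase):
        # The class-scoped ``long_version`` fixture stores the value of the
        # ``--long`` command line flag on the class before any test runs.
        long_version: bool

        def test_many_cases(self):
            cases = list(range(100))
            if not self.long_version:
                # Default (short) mode: exercise a single case to keep local runs fast.
                cases = cases[:1]
            for case in cases:
                self.assertIsInstance(case, int)

Running plain `pytest` exercises the short path; `pytest --long` (as the coverage job now does via PYTEST_ARGS='--cov=openml --long') runs every case.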
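
A second illustrative note: in test_get_flow_id the short mode wraps openml.utils._list_all in functools.lru_cache and patches the memoized copy in for the duration of the test, so repeated identical listing calls hit the server only once. The sketch below shows the same pattern against a local stand-in; `_list_all` here is a fake, not the real OpenML call.

    import functools
    from unittest.mock import patch


    def _list_all(api_call: str) -> dict:
        """Stand-in for a slow, network-bound listing call."""
        print(f"hitting the server for {api_call!r}")
        return {"result": api_call}


    def query_twice(long_version: bool) -> None:
        # Long mode keeps the real function; short mode patches in a memoized copy,
        # so identical queries inside the block are answered from memory.
        list_all = _list_all if long_version else functools.lru_cache()(_list_all)
        with patch(f"{__name__}._list_all", list_all):
            _list_all("flow/list")  # always reaches the (fake) server
            _list_all("flow/list")  # cache hit when long_version is False


    if __name__ == "__main__":
        query_twice(long_version=False)  # prints the server message once
        query_twice(long_version=True)   # prints it twice

In long mode the original function is patched back in unchanged, so nothing is cached and every call exercises the real code path, which is the same trade-off the diff makes.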