diff --git a/.all-contributorsrc b/.all-contributorsrc
new file mode 100644
index 000000000..3e16fe084
--- /dev/null
+++ b/.all-contributorsrc
@@ -0,0 +1,36 @@
+{
+ "files": [
+ "README.md"
+ ],
+ "imageSize": 100,
+ "commit": false,
+ "contributors": [
+ {
+ "login": "a-moadel",
+ "name": "a-moadel",
+ "avatar_url": "https://avatars0.githubusercontent.com/u/46557866?v=4",
+ "profile": "https://github.com/a-moadel",
+ "contributions": [
+ "doc",
+ "example"
+ ]
+ },
+ {
+ "login": "Neeratyoy",
+ "name": "Neeratyoy Mallik",
+ "avatar_url": "https://avatars2.githubusercontent.com/u/3191233?v=4",
+ "profile": "https://github.com/Neeratyoy",
+ "contributions": [
+ "code",
+ "doc",
+ "example"
+ ]
+ }
+ ],
+ "contributorsPerLine": 7,
+ "projectName": "openml-python",
+ "projectOwner": "openml",
+ "repoType": "github",
+ "repoHost": "https://github.com",
+ "skipCi": true
+}
diff --git a/.github/workflows/dist.yaml b/.github/workflows/dist.yaml
new file mode 100644
index 000000000..51ffe03d5
--- /dev/null
+++ b/.github/workflows/dist.yaml
@@ -0,0 +1,30 @@
+name: dist-check
+
+on: [push, pull_request]
+
+jobs:
+ dist:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Setup Python
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.8
+ - name: Build dist
+ run: |
+ python setup.py sdist
+ - name: Twine check
+ run: |
+ pip install twine
+ last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1)
+ twine check $last_dist
+ - name: Install dist
+ run: |
+ last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1)
+ pip install $last_dist
+ - name: PEP 561 Compliance
+ run: |
+ pip install mypy
+ cd .. # required to use the installed version of openml
+ if ! python -m mypy -c "import openml"; then exit 1; fi
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
new file mode 100644
index 000000000..2219c7fac
--- /dev/null
+++ b/.github/workflows/docs.yaml
@@ -0,0 +1,43 @@
+name: Docs
+on: [pull_request, push]
+
+jobs:
+ build-and-deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Setup Python
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.8
+ - name: Install dependencies
+ run: |
+ pip install -e .[docs,examples,examples_unix]
+ - name: Make docs
+ run: |
+ cd doc
+ make html
+ - name: Pull latest gh-pages
+ if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push'
+ run: |
+ cd ..
+ git clone https://github.com/openml/openml-python.git --branch gh-pages --single-branch gh-pages
+ - name: Copy new doc into gh-pages
+ if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push'
+ run: |
+ branch_name=${GITHUB_REF##*/}
+ cd ../gh-pages
+ rm -rf $branch_name
+ cp -r ../openml-python/doc/build/html $branch_name
+ - name: Push to gh-pages
+ if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push'
+ run: |
+        last_commit=$(git log -1 --pretty=format:"%an: %s")
+ cd ../gh-pages
+ branch_name=${GITHUB_REF##*/}
+ git add $branch_name/
+ git config --global user.name 'Github Actions'
+ git config --global user.email 'not@mail.com'
+ git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
+ git commit -am "$last_commit"
+ git push
diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
new file mode 100644
index 000000000..6132b2de2
--- /dev/null
+++ b/.github/workflows/pre-commit.yaml
@@ -0,0 +1,20 @@
+name: pre-commit
+
+on: [push]
+
+jobs:
+ run-all-files:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Setup Python 3.7
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - name: Install pre-commit
+ run: |
+ pip install pre-commit
+ pre-commit install
+ - name: Run pre-commit
+ run: |
+ pre-commit run --all-files
diff --git a/.github/workflows/ubuntu-test.yml b/.github/workflows/ubuntu-test.yml
new file mode 100644
index 000000000..41cc155ac
--- /dev/null
+++ b/.github/workflows/ubuntu-test.yml
@@ -0,0 +1,74 @@
+name: Tests
+
+on: [push, pull_request]
+
+jobs:
+ ubuntu:
+
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8]
+ scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
+ exclude: # no scikit-learn 0.21.2 release for Python 3.8
+ - python-version: 3.8
+ scikit-learn: 0.21.2
+ include:
+ - python-version: 3.6
+ scikit-learn: 0.18.2
+ scipy: 1.2.0
+ - python-version: 3.6
+ scikit-learn: 0.19.2
+ - python-version: 3.6
+ scikit-learn: 0.20.2
+ - python-version: 3.8
+ scikit-learn: 0.23.1
+ code-cov: true
+ fail-fast: false
+ max-parallel: 4
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 2
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install test dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -e .[test]
+ - name: Install scikit-learn ${{ matrix.scikit-learn }}
+ run: |
+ pip install scikit-learn==${{ matrix.scikit-learn }}
+ - name: Install scipy ${{ matrix.scipy }}
+ if: ${{ matrix.scipy }}
+ run: |
+ pip install scipy==${{ matrix.scipy }}
+ - name: Store repository status
+ id: status-before
+ run: |
+ echo "::set-output name=BEFORE::$(git status --porcelain -b)"
+ - name: Run tests
+ run: |
+ if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi
+ pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv $codecov --reruns 5 --reruns-delay 1
+ - name: Check for files left behind by test
+ if: ${{ always() }}
+ run: |
+ before="${{ steps.status-before.outputs.BEFORE }}"
+ after="$(git status --porcelain -b)"
+ if [[ "$before" != "$after" ]]; then
+ echo "git status from before: $before"
+ echo "git status from after: $after"
+ echo "Not all generated files have been deleted!"
+ exit 1
+ fi
+ - name: Upload coverage
+ if: matrix.code-cov && always()
+ uses: codecov/codecov-action@v1
+ with:
+ files: coverage.xml
+ fail_ci_if_error: true
+ verbose: true
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 9fd33403c..000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-language: python
-
-sudo: false
-
-cache:
- apt: true
- # We use three different cache directory
- # to work around a Travis bug with multi-platform cache
- directories:
- - $HOME/.cache/pip
- - $HOME/download
-env:
- global:
- # Directory where tests are run from
- - TEST_DIR=/tmp/test_dir/
- - MODULE=openml
- matrix:
- - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" COVERAGE="true" DOCPUSH="true" SKIP_TESTS="true"
- - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" RUN_FLAKE8="true" SKIP_TESTS="true"
- - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
- - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.2"
- # Checks for older scikit-learn versions (which also don't nicely work with
- # Python3.7)
- - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
- - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0
-
-# Travis issue
-# https://github.com/travis-ci/travis-ci/issues/8920
-before_install:
- - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)"
-
-install: source ci_scripts/install.sh
-script: bash ci_scripts/test.sh
-after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result"
-
-# travis will check the deploy on condition, before actually running before_deploy
-# before_deploy: source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result"
-
-# For more info regarding the deploy process and the github token look at:
-# https://docs.travis-ci.com/user/deployment/pages/
-
-deploy:
- provider: pages
- skip_cleanup: true
- github_token: $GITHUB_TOKEN
- keep-history: true
- committer-from-gh: true
- on:
- all_branches: true
- condition: $doc_result = "success"
- local_dir: doc/$TRAVIS_BRANCH
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6b7cffad3..6fe4fd605 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -260,14 +260,9 @@ The resulting HTML files will be placed in ``build/html/`` and are viewable in
a web browser. See the ``README`` file in the ``doc/`` directory for more
information.
-For building the documentation, you will need
-[sphinx](http://sphinx.pocoo.org/),
-[sphinx-bootstrap-theme](https://ryan-roemer.github.io/sphinx-bootstrap-theme/),
-[sphinx-gallery](https://sphinx-gallery.github.io/)
-and
-[numpydoc](https://numpydoc.readthedocs.io/en/latest/).
+For building the documentation, you will need to install a few additional dependencies:
```bash
-$ pip install sphinx sphinx-bootstrap-theme sphinx-gallery numpydoc
+$ pip install -e .[docs]
```
When dependencies are installed, run
```bash
diff --git a/README.md b/README.md
index 732085697..55bab368d 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
# OpenML-Python
+
+[](#contributors-)
+
A python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning.
It can be used to download or upload OpenML data such as datasets and machine learning experiment results.
@@ -40,3 +43,23 @@ Bibtex entry:
year = {2019},
}
```
+
+## Contributors ✨
+
+Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
+
+
+
+
+
+
+
+
+
+
+This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
\ No newline at end of file
diff --git a/appveyor.yml b/appveyor.yml
index 151a5e3f7..e3fa74aaf 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -45,4 +45,4 @@ build: false
test_script:
- "cd C:\\projects\\openml-python"
- - "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread -sv"
+ - "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread --dist load -sv"
diff --git a/ci_scripts/create_doc.sh b/ci_scripts/create_doc.sh
deleted file mode 100644
index 83afaa26b..000000000
--- a/ci_scripts/create_doc.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-# License: BSD 3-Clause
-
-set -euo pipefail
-
-# Check if DOCPUSH is set
-if ! [[ -z ${DOCPUSH+x} ]]; then
-
- if [[ "$DOCPUSH" == "true" ]]; then
-
- # install documentation building dependencies
- pip install matplotlib seaborn sphinx pillow sphinx-gallery sphinx_bootstrap_theme cython numpydoc nbformat nbconvert
-
- # $1 is the branch name
- # $2 is the global variable where we set the script status
-
- if ! { [ $1 = "master" ] || [ $1 = "develop" ]; }; then
- { echo "Not one of the allowed branches"; exit 0; }
- fi
-
- # delete any previous documentation folder
- if [ -d doc/$1 ]; then
- rm -rf doc/$1
- fi
-
- # create the documentation
- cd doc && make html 2>&1
-
- # create directory with branch name
- # the documentation for dev/stable from git will be stored here
- mkdir $1
-
- # get previous documentation from github
- git clone https://github.com/openml/openml-python.git --branch gh-pages --single-branch
-
- # copy previous documentation
- cp -r openml-python/. $1
- rm -rf openml-python
-
- # if the documentation for the branch exists, remove it
- if [ -d $1/$1 ]; then
- rm -rf $1/$1
- fi
-
- # copy the updated documentation for this branch
- mkdir $1/$1
- cp -r build/html/. $1/$1
-
- # takes a variable name as an argument and assigns the script outcome to a
- # variable with the given name. If it got this far, the script was successful
- function set_return() {
- # $1 is the variable where we save the script outcome
- local __result=$1
- local status='success'
- eval $__result="'$status'"
- }
-
- set_return "$2"
- fi
-fi
-# Workaround for travis failure
-set +u
diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh
deleted file mode 100755
index 67530af53..000000000
--- a/ci_scripts/install.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-# License: BSD 3-Clause
-
-set -e
-
-# Deactivate the travis-provided virtual environment and setup a
-# conda-based environment instead
-deactivate
-
-# Use the miniconda installer for faster download / install of conda
-# itself
-pushd .
-cd
-mkdir -p download
-cd download
-echo "Cached in $HOME/download :"
-ls -l
-echo
-if [[ ! -f miniconda.sh ]]
- then
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
- -O miniconda.sh
- fi
-chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda
-cd ..
-export PATH=/home/travis/miniconda/bin:$PATH
-conda update --yes conda
-popd
-
-# Configure the conda environment and put it in the path using the
-# provided versions
-conda create -n testenv --yes python=$PYTHON_VERSION pip
-source activate testenv
-
-if [[ -v SCIPY_VERSION ]]; then
- conda install --yes scipy=$SCIPY_VERSION
-fi
-python --version
-
-if [[ "$TEST_DIST" == "true" ]]; then
- pip install twine nbconvert jupyter_client matplotlib pyarrow pytest pytest-xdist pytest-timeout \
- nbformat oslo.concurrency flaky mypy
- python setup.py sdist
- # Find file which was modified last as done in https://stackoverflow.com/a/4561987
- dist=`find dist -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" "`
- echo "Installing $dist"
- pip install "$dist"
- twine check "$dist"
-else
- pip install -e '.[test]'
-fi
-
-python -c "import numpy; print('numpy %s' % numpy.__version__)"
-python -c "import scipy; print('scipy %s' % scipy.__version__)"
-
-
-if [[ "$DOCPUSH" == "true" ]]; then
- conda install --yes gxx_linux-64 gcc_linux-64 swig
- pip install -e '.[examples,examples_unix]'
-fi
-if [[ "$COVERAGE" == "true" ]]; then
- pip install codecov pytest-cov
-fi
-if [[ "$RUN_FLAKE8" == "true" ]]; then
- pip install pre-commit
- pre-commit install
-fi
-
-# PEP 561 compliance check
-# Assumes mypy relies solely on the PEP 561 standard
-if ! python -m mypy -c "import openml"; then
- echo "Failed: PEP 561 compliance"
- exit 1
-else
- echo "Success: PEP 561 compliant"
-fi
-
-# Install scikit-learn last to make sure the openml package installation works
-# from a clean environment without scikit-learn.
-pip install scikit-learn==$SKLEARN_VERSION
-
-conda list
diff --git a/ci_scripts/success.sh b/ci_scripts/success.sh
deleted file mode 100644
index dad97d54e..000000000
--- a/ci_scripts/success.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-# License: BSD 3-Clause
-
-set -e
-
-if [[ "$COVERAGE" == "true" ]]; then
- # Need to run coveralls from a git checkout, so we copy .coverage
- # from TEST_DIR where pytest has been run
- cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR
- cd $TRAVIS_BUILD_DIR
- # Ignore coveralls failures as the coveralls server is not
- # very reliable but we don't want travis to report a failure
- # in the github UI just because the coverage report failed to
- # be published.
- codecov || echo "Codecov upload failed"
-fi
diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh
deleted file mode 100644
index 0a1f94df6..000000000
--- a/ci_scripts/test.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-# License: BSD 3-Clause
-
-set -e
-
-# check status and branch before running the unit tests
-before="`git status --porcelain -b`"
-before="$before"
-# storing current working directory
-curr_dir=`pwd`
-
-run_tests() {
- # Get into a temp directory to run test from the installed scikit learn and
- # check if we do not leave artifacts
- mkdir -p $TEST_DIR
-
- cwd=`pwd`
- test_dir=$cwd/tests
-
- cd $TEST_DIR
-
- if [[ "$COVERAGE" == "true" ]]; then
- PYTEST_ARGS='--cov=openml'
- else
- PYTEST_ARGS=''
- fi
-
- pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv $PYTEST_ARGS $test_dir
-}
-
-if [[ "$RUN_FLAKE8" == "true" ]]; then
- pre-commit run --all-files
-fi
-
-if [[ "$SKIP_TESTS" != "true" ]]; then
- run_tests
-fi
-
-# changing directory to stored working directory
-cd $curr_dir
-# check status and branch after running the unit tests
-# compares with $before to check for remaining files
-after="`git status --porcelain -b`"
-if [[ "$before" != "$after" ]]; then
- echo 'git status from before: '$before
- echo 'git status from after: '$after
- echo "All generated files have not been deleted!"
- exit 1
-fi
diff --git a/doc/conf.py b/doc/conf.py
index 9c4606143..e5de2d551 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -64,10 +64,8 @@
master_doc = "index"
# General information about the project.
-project = u"OpenML"
-copyright = u"2014-{}, the OpenML-Python team.".format(
- time.strftime("%Y,%m,%d,%H,%M,%S").split(",")[0]
-)
+project = "OpenML"
+copyright = f"2014-{time.localtime().tm_year}, the OpenML-Python team"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -263,7 +261,7 @@
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
- ("index", "OpenML.tex", u"OpenML Documentation", u"Matthias Feurer", "manual"),
+ ("index", "OpenML.tex", "OpenML Documentation", "Matthias Feurer", "manual"),
]
# The name of an image file (relative to this directory) to place at the top of
@@ -291,7 +289,7 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
-man_pages = [("index", "openml", u"OpenML Documentation", [u"Matthias Feurer"], 1)]
+man_pages = [("index", "openml", "OpenML Documentation", ["Matthias Feurer"], 1)]
# If true, show URL addresses after external links.
# man_show_urls = False
@@ -306,8 +304,8 @@
(
"index",
"OpenML",
- u"OpenML Documentation",
- u"Matthias Feurer",
+ "OpenML Documentation",
+ "Matthias Feurer",
"OpenML",
"One line description of project.",
"Miscellaneous",
diff --git a/doc/index.rst b/doc/index.rst
index 789979023..e38e4d877 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -32,7 +32,7 @@ Example
)
# Download the OpenML task for the german credit card dataset with 10-fold
# cross-validation.
- task = openml.tasks.get_task(31)
+ task = openml.tasks.get_task(32)
# Run the scikit-learn model on the task.
run = openml.runs.run_model_on_task(clf, task)
# Publish the experiment on OpenML (optional, requires an API key.
diff --git a/doc/progress.rst b/doc/progress.rst
index 1956fcb42..1ca1e1d0e 100644
--- a/doc/progress.rst
+++ b/doc/progress.rst
@@ -6,6 +6,35 @@
Changelog
=========
+0.11.1
+~~~~~~
+* ADD #964: Validate ``ignore_attribute``, ``default_target_attribute``, ``row_id_attribute`` are set to attributes that exist on the dataset when calling ``create_dataset``.
+* ADD #979: Dataset features and qualities are now also cached in pickle format.
+* ADD #982: Add helper functions for column transformers (see the usage sketch below this changelog).
+* ADD #989: ``run_model_on_task`` will now warn the user if the model passed has already been fitted.
+* ADD #1009: Add the option to not download the dataset qualities. The cached version is used even if the download attribute is False.
+* ADD #1016: Add scikit-learn 0.24 support.
+* ADD #1020: Add option to parallelize evaluation of tasks with joblib.
+* ADD #1022: Allow minimum version of dependencies to be listed for a flow, use more accurate minimum versions for scikit-learn dependencies.
+* ADD #1023: Add admin-only calls for adding topics to datasets.
+* ADD #1029: Add support for fetching dataset from a minio server in parquet format.
+* ADD #1031: Generally improve runtime measurements, add them for some previously unsupported flows (e.g. BaseSearchCV derived flows).
+* DOC #973: Change the task used in the welcome page example so it no longer fails when using a numerical dataset.
+* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets instead of all datasets. Modified the corresponding unit test.
+* MAINT #891: Changed the way that numerical features are stored. Numerical features that range from 0 to 255 are now stored as uint8, which reduces the required storage space as well as storage and loading times.
+* MAINT #975, #988: Add CI through Github Actions.
+* MAINT #977: Allow ``short`` and ``long`` scenarios for unit tests. Reduce the workload for some unit tests.
+* MAINT #985, #1000: Improve unit test stability and output readability, and add load balancing.
+* MAINT #1018: Refactor data loading and storage. Data is now compressed on the first call to ``get_data``.
+* MAINT #1024: Remove flaky decorator for study unit test.
+* FIX #883 #884 #906 #972: Various improvements to the caching system.
+* FIX #980: Speed up ``check_datasets_active``.
+* FIX #984: Add a retry mechanism when the server encounters a database issue.
+* FIX #1004: Fixed an issue that prevented installation on some systems (e.g. Ubuntu).
+* FIX #1013: Fixes a bug where ``OpenMLRun.setup_string`` was not uploaded to the server, and prepares for ``run_details`` being sent from the server.
+* FIX #1021: Fixes an issue that could occur when running unit tests and openml-python was not in PATH.
+* FIX #1037: Fixes a bug where a dataset could not be loaded if a categorical feature listed a nan-like value as a possible category.
+
0.11.0
~~~~~~
* ADD #753: Allows uploading custom flows to OpenML via OpenML-Python.
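
The column-transformer helpers added for #982 are exercised in the updated ``flows_and_runs_tutorial.py`` and ``run_setup_tutorial.py`` diffs further down. A minimal sketch of the intended usage, run against the test server as in those examples:

```python
# Minimal sketch of the new column-transformer helpers (changelog entry #982),
# mirroring the updated flows_and_runs_tutorial below: `cat` and `cont` select the
# categorical / continuous columns of the pandas DataFrame for the ColumnTransformer.
import openml
from openml.extensions.sklearn import cat, cont
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

openml.config.start_using_configuration_for_example()  # run against the test server

preprocessor = ColumnTransformer(
    [
        ("categorical", OneHotEncoder(sparse=False, handle_unknown="ignore"), cat),
        ("continuous", SimpleImputer(strategy="median"), cont),
    ]
)
pipe = Pipeline(
    [("Preprocessing", preprocessor), ("Classifier", RandomForestClassifier(n_estimators=10))]
)

# Task 96 (credit-a) mixes numerical and categorical features and has missing values.
task = openml.tasks.get_task(96)
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)

openml.config.stop_using_configuration_for_example()
```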
diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_.py
similarity index 98%
rename from examples/30_extended/custom_flow_tutorial.py
rename to examples/30_extended/custom_flow_.py
index 3b918e108..02aef9c5c 100644
--- a/examples/30_extended/custom_flow_tutorial.py
+++ b/examples/30_extended/custom_flow_.py
@@ -82,10 +82,10 @@
# This allows people to specify auto-sklearn hyperparameters used in this flow.
# In general, using a subflow is not required.
#
-# Note: flow 15275 is not actually the right flow on the test server,
+# Note: flow 9313 is not actually the right flow on the test server,
# but that does not matter for this demonstration.
-autosklearn_flow = openml.flows.get_flow(15275) # auto-sklearn 0.5.1
+autosklearn_flow = openml.flows.get_flow(9313) # auto-sklearn 0.5.1
subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow),)
####################################################################################################
@@ -120,7 +120,7 @@
OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]),
]
-task_id = 1408 # Iris Task
+task_id = 1965 # Iris Task
task = openml.tasks.get_task(task_id)
dataset_id = task.get_dataset().dataset_id
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
index 594a58930..7a51cce70 100644
--- a/examples/30_extended/datasets_tutorial.py
+++ b/examples/30_extended/datasets_tutorial.py
@@ -112,7 +112,7 @@
############################################################################
# Edit a created dataset
-# =================================================
+# ======================
# This example uses the test server, to avoid editing a dataset on the main server.
openml.config.start_using_configuration_for_example()
############################################################################
@@ -143,18 +143,23 @@
# tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
# configure the API key:
# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
-data_id = edit_dataset(564, default_target_attribute="y")
-print(f"Edited dataset ID: {data_id}")
-
+# This example here only shows a failure when trying to work on a dataset not owned by you:
+try:
+ data_id = edit_dataset(1, default_target_attribute="shape")
+except openml.exceptions.OpenMLServerException as e:
+ print(e)
############################################################################
# Fork dataset
+# ============
# Used to create a copy of the dataset with you as the owner.
# Use this API only if you are unable to edit the critical fields (default_target_attribute,
# ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
# After the dataset is forked, you can edit the new version of the dataset using edit_dataset.
-data_id = fork_dataset(564)
+data_id = fork_dataset(1)
+print(data_id)
+data_id = edit_dataset(data_id, default_target_attribute="shape")
print(f"Forked dataset ID: {data_id}")
openml.config.stop_using_configuration_for_example()
diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
index 76eb2f219..9f8c89375 100644
--- a/examples/30_extended/flows_and_runs_tutorial.py
+++ b/examples/30_extended/flows_and_runs_tutorial.py
@@ -53,7 +53,7 @@
task = openml.tasks.get_task(403)
# Build any classifier or pipeline
-clf = tree.ExtraTreeClassifier()
+clf = tree.DecisionTreeClassifier()
# Run the flow
run = openml.runs.run_model_on_task(clf, task)
@@ -82,13 +82,14 @@
# ############################
#
# When you need to handle 'dirty' data, build pipelines to model them automatically.
-task = openml.tasks.get_task(1)
-features = task.get_dataset().features
-nominal_feature_indices = [
- i
- for i in range(len(features))
- if features[i].name != task.target_name and features[i].data_type == "nominal"
-]
+# To demonstrate this, we use the dataset `credit-a `_ via
+# `task `_, as it contains both numerical and categorical
+# variables and missing values in both.
+task = openml.tasks.get_task(96)
+
+# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
+from openml.extensions.sklearn import cat, cont
+
pipe = pipeline.Pipeline(
steps=[
(
@@ -96,20 +97,15 @@
compose.ColumnTransformer(
[
(
- "Nominal",
- pipeline.Pipeline(
- [
- ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
- (
- "Encoder",
- preprocessing.OneHotEncoder(
- sparse=False, handle_unknown="ignore",
- ),
- ),
- ]
- ),
- nominal_feature_indices,
+ "categorical",
+ preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
+ cat, # returns the categorical feature indices
),
+ (
+ "continuous",
+ impute.SimpleImputer(strategy="median"),
+ cont,
+ ), # returns the numeric feature indices
]
),
),
@@ -121,6 +117,50 @@
myrun = run.publish()
print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
+
+# The above pipeline works with the helper functions that internally deal with pandas DataFrames.
+# In case pandas is not available, or NumPy-based data processing is required, the equivalent
+# pipeline working directly on NumPy arrays is shown below.
+
+# Extracting the indices of the categorical columns
+features = task.get_dataset().features
+categorical_feature_indices = []
+numeric_feature_indices = []
+for i in range(len(features)):
+ if features[i].name == task.target_name:
+ continue
+ if features[i].data_type == "nominal":
+ categorical_feature_indices.append(i)
+ else:
+ numeric_feature_indices.append(i)
+
+pipe = pipeline.Pipeline(
+ steps=[
+ (
+ "Preprocessing",
+ compose.ColumnTransformer(
+ [
+ (
+ "categorical",
+ preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
+ categorical_feature_indices,
+ ),
+ (
+ "continuous",
+ impute.SimpleImputer(strategy="median"),
+ numeric_feature_indices,
+ ),
+ ]
+ ),
+ ),
+ ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)),
+ ]
+)
+
+run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array")
+myrun = run.publish()
+print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id))
+
###############################################################################
# Running flows on tasks offline for later upload
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -132,7 +172,9 @@
task = openml.tasks.get_task(6)
# The following lines can then be executed offline:
-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
+run = openml.runs.run_model_on_task(
+ pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
+)
# The run may be stored offline, and the flow will be stored along with it:
run.to_filesystem(directory="myrun")
diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py
index a46bf9699..8579d1d38 100644
--- a/examples/30_extended/run_setup_tutorial.py
+++ b/examples/30_extended/run_setup_tutorial.py
@@ -34,14 +34,14 @@
import numpy as np
import openml
-import sklearn.ensemble
-import sklearn.impute
-import sklearn.preprocessing
+from openml.extensions.sklearn import cat, cont
+
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
-from sklearn.experimental import enable_hist_gradient_boosting
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.decomposition import TruncatedSVD
openml.config.start_using_configuration_for_example()
@@ -58,37 +58,20 @@
# many potential hyperparameters. Of course, the model can be as complex and as
# easy as you want it to be
-from sklearn.ensemble import HistGradientBoostingClassifier
-from sklearn.decomposition import TruncatedSVD
-
-
-# Helper functions to return required columns for ColumnTransformer
-def cont(X):
- return X.dtypes != "category"
-
-
-def cat(X):
- return X.dtypes == "category"
-
-cat_imp = make_pipeline(
- SimpleImputer(strategy="most_frequent"),
- OneHotEncoder(handle_unknown="ignore", sparse=False),
- TruncatedSVD(),
-)
-ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
-model_original = sklearn.pipeline.Pipeline(
- steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
-)
+cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
+cont_imp = SimpleImputer(strategy="median")
+ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
+model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])
# Let's change some hyperparameters. Of course, in any good application we
# would tune them using, e.g., Random Search or Bayesian Optimization, but for
# the purpose of this tutorial we set them to some specific values that might
# or might not be optimal
hyperparameters_original = {
- "estimator__loss": "auto",
- "estimator__learning_rate": 0.15,
- "estimator__max_iter": 50,
+ "estimator__criterion": "gini",
+ "estimator__n_estimators": 50,
+ "estimator__max_depth": 10,
"estimator__min_samples_leaf": 1,
}
model_original.set_params(**hyperparameters_original)
diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py
index c02a5c038..3c93a7e81 100644
--- a/examples/30_extended/study_tutorial.py
+++ b/examples/30_extended/study_tutorial.py
@@ -15,13 +15,7 @@
import uuid
-import numpy as np
-import sklearn.tree
-from sklearn.pipeline import make_pipeline, Pipeline
-from sklearn.compose import ColumnTransformer
-from sklearn.impute import SimpleImputer
-from sklearn.decomposition import TruncatedSVD
-from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
+from sklearn.ensemble import RandomForestClassifier
import openml
@@ -71,45 +65,25 @@
)
print(evaluations.head())
-###########################################################from openml.testing import cat, cont#################
+############################################################################
# Uploading studies
# =================
#
# Creating a study is as simple as creating any kind of other OpenML entity.
# In this examples we'll create a few runs for the OpenML-100 benchmark
# suite which is available on the OpenML test server.
-
openml.config.start_using_configuration_for_example()
-# Model that can handle missing values
-from sklearn.experimental import enable_hist_gradient_boosting
-from sklearn.ensemble import HistGradientBoostingClassifier
-
-
-# Helper functions to return required columns for ColumnTransformer
-def cont(X):
- return X.dtypes != "category"
-
-
-def cat(X):
- return X.dtypes == "category"
+# Model to be used
+clf = RandomForestClassifier()
+# We'll create a study with one run on 3 datasets present in the suite
+tasks = [115, 259, 307]
-cat_imp = make_pipeline(
- SimpleImputer(strategy="most_frequent"),
- OneHotEncoder(handle_unknown="ignore", sparse=False),
- TruncatedSVD(),
-)
-ct = ColumnTransformer(
- [("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)]
-)
-clf = sklearn.pipeline.Pipeline(
- steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
-)
-
+# To verify that the selected tasks are part of the suite
suite = openml.study.get_suite(1)
-# We'll create a study with one run on three random datasets each
-tasks = np.random.choice(suite.tasks, size=3, replace=False)
+print(all([t_id in suite.tasks for t_id in tasks]))
+
run_ids = []
for task_id in tasks:
task = openml.tasks.get_task(task_id)
diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py
index c879e9fea..533f645b2 100644
--- a/examples/30_extended/task_manual_iteration_tutorial.py
+++ b/examples/30_extended/task_manual_iteration_tutorial.py
@@ -61,11 +61,11 @@
####################################################################################################
# And then split the data based on this:
-X, y, _, _ = task.get_dataset().get_data(task.target_name)
-X_train = X.loc[train_indices]
-y_train = y[train_indices]
-X_test = X.loc[test_indices]
-y_test = y[test_indices]
+X, y = task.get_X_and_y(dataset_format="dataframe")
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]
print(
"X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format(
@@ -78,6 +78,7 @@
task_id = 3
task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -93,10 +94,10 @@
train_indices, test_indices = task.get_train_test_split_indices(
repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
)
- X_train = X.loc[train_indices]
- y_train = y[train_indices]
- X_test = X.loc[test_indices]
- y_test = y[test_indices]
+ X_train = X.iloc[train_indices]
+ y_train = y.iloc[train_indices]
+ X_test = X.iloc[test_indices]
+ y_test = y.iloc[test_indices]
print(
"Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -116,6 +117,7 @@
task_id = 1767
task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -131,10 +133,10 @@
train_indices, test_indices = task.get_train_test_split_indices(
repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
)
- X_train = X.loc[train_indices]
- y_train = y[train_indices]
- X_test = X.loc[test_indices]
- y_test = y[test_indices]
+ X_train = X.iloc[train_indices]
+ y_train = y.iloc[train_indices]
+ X_test = X.iloc[test_indices]
+ y_test = y.iloc[test_indices]
print(
"Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
@@ -154,6 +156,7 @@
task_id = 1702
task = openml.tasks.get_task(task_id)
+X, y = task.get_X_and_y(dataset_format="dataframe")
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -169,10 +172,10 @@
train_indices, test_indices = task.get_train_test_split_indices(
repeat=repeat_idx, fold=fold_idx, sample=sample_idx,
)
- X_train = X.loc[train_indices]
- y_train = y[train_indices]
- X_test = X.loc[test_indices]
- y_test = y[test_indices]
+ X_train = X.iloc[train_indices]
+ y_train = y.iloc[train_indices]
+ X_test = X.iloc[test_indices]
+ y_test = y.iloc[test_indices]
print(
"Repeat #{}, fold #{}, samples {}: X_train.shape: {}, "
diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py
index 60d212116..5ae339ae2 100644
--- a/examples/40_paper/2018_neurips_perrone_example.py
+++ b/examples/40_paper/2018_neurips_perrone_example.py
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
cat_cols = list_categorical_attributes(flow_type=flow_type)
num_cols = list(set(X.columns) - set(cat_cols))
-# Missing value imputers
-cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
+# Missing value imputer for numeric columns
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
-# Creating the one-hot encoder
+# Creating the one-hot encoder for numerical representation of categorical columns
enc = OneHotEncoder(handle_unknown="ignore")
-# Pipeline to handle categorical column transformations
-cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
-
# Combining column transformers
-ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
+ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
# Creating the full pipeline with the surrogate model
clf = RandomForestRegressor(n_estimators=50)
diff --git a/openml/__version__.py b/openml/__version__.py
index 07c9a950d..ff4effa59 100644
--- a/openml/__version__.py
+++ b/openml/__version__.py
@@ -3,4 +3,4 @@
# License: BSD 3-Clause
# The following line *must* be the last in the module, exactly as formatted:
-__version__ = "0.11.0"
+__version__ = "0.12.0"
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
index 57599b912..aee67d8c6 100644
--- a/openml/_api_calls.py
+++ b/openml/_api_calls.py
@@ -3,9 +3,14 @@
import time
import hashlib
import logging
+import pathlib
import requests
+import urllib.parse
+import xml
import xmltodict
-from typing import Dict, Optional
+from typing import Dict, Optional, Union
+
+import minio
from . import config
from .exceptions import (
@@ -55,7 +60,7 @@ def _perform_api_call(call, request_method, data=None, file_elements=None):
if file_elements is not None:
if request_method != "post":
raise ValueError("request method must be post when file elements are present")
- response = __read_url_files(url, data=data, file_elements=file_elements)
+ response = _read_url_files(url, data=data, file_elements=file_elements)
else:
response = __read_url(url, request_method, data)
@@ -67,6 +72,45 @@ def _perform_api_call(call, request_method, data=None, file_elements=None):
return response.text
+def _download_minio_file(
+ source: str, destination: Union[str, pathlib.Path], exists_ok: bool = True,
+) -> None:
+ """ Download file ``source`` from a MinIO Bucket and store it at ``destination``.
+
+ Parameters
+ ----------
+    source : str
+        URL to a file in a MinIO bucket.
+    destination : Union[str, pathlib.Path]
+        Path to store the file at; if a directory is provided, the original file name is used.
+    exists_ok : bool, optional (default=True)
+        If False, raise ``FileExistsError`` if a file already exists at ``destination``.
+
+ """
+ destination = pathlib.Path(destination)
+ parsed_url = urllib.parse.urlparse(source)
+
+ # expect path format: /BUCKET/path/to/file.ext
+ bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1)
+ if destination.is_dir():
+ destination = pathlib.Path(destination, object_name)
+ if destination.is_file() and not exists_ok:
+ raise FileExistsError(f"File already exists in {destination}.")
+
+ client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
+
+ try:
+ client.fget_object(
+ bucket_name=bucket, object_name=object_name, file_path=str(destination),
+ )
+ except minio.error.S3Error as e:
+ if e.message.startswith("Object does not exist"):
+ raise FileNotFoundError(f"Object at '{source}' does not exist.") from e
+ # e.g. permission error, or a bucket does not exist (which is also interpreted as a
+ # permission error on minio level).
+ raise FileNotFoundError("Bucket does not exist or is private.") from e
+
+
def _download_text_file(
source: str,
output_path: Optional[str] = None,
@@ -105,21 +149,9 @@ def _download_text_file(
logging.info("Starting [%s] request for the URL %s", "get", source)
start = time.time()
- response = __read_url(source, request_method="get")
- __check_response(response, source, None)
+ response = __read_url(source, request_method="get", md5_checksum=md5_checksum)
downloaded_file = response.text
- if md5_checksum is not None:
- md5 = hashlib.md5()
- md5.update(downloaded_file.encode("utf-8"))
- md5_checksum_download = md5.hexdigest()
- if md5_checksum != md5_checksum_download:
- raise OpenMLHashException(
- "Checksum {} of downloaded file is unequal to the expected checksum {}.".format(
- md5_checksum_download, md5_checksum
- )
- )
-
if output_path is None:
logging.info(
"%.7fs taken for [%s] request for the URL %s", time.time() - start, "get", source,
@@ -138,15 +170,6 @@ def _download_text_file(
return None
-def __check_response(response, url, file_elements):
- if response.status_code != 200:
- raise __parse_server_exception(response, url, file_elements=file_elements)
- elif (
- "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip"
- ):
- logging.warning("Received uncompressed content from OpenML for {}.".format(url))
-
-
def _file_id_to_url(file_id, filename=None):
"""
Presents the URL how to download a given file id
@@ -159,7 +182,7 @@ def _file_id_to_url(file_id, filename=None):
return url
-def __read_url_files(url, data=None, file_elements=None):
+def _read_url_files(url, data=None, file_elements=None):
"""do a post request to url with data
and sending file_elements as files"""
@@ -169,26 +192,37 @@ def __read_url_files(url, data=None, file_elements=None):
file_elements = {}
# Using requests.post sets header 'Accept-encoding' automatically to
# 'gzip,deflate'
- response = __send_request(request_method="post", url=url, data=data, files=file_elements,)
+ response = _send_request(request_method="post", url=url, data=data, files=file_elements,)
return response
-def __read_url(url, request_method, data=None):
+def __read_url(url, request_method, data=None, md5_checksum=None):
data = {} if data is None else data
- if config.apikey is not None:
+ if config.apikey:
data["api_key"] = config.apikey
+ return _send_request(
+ request_method=request_method, url=url, data=data, md5_checksum=md5_checksum
+ )
+
+
+def __is_checksum_equal(downloaded_file, md5_checksum=None):
+ if md5_checksum is None:
+ return True
+ md5 = hashlib.md5()
+ md5.update(downloaded_file.encode("utf-8"))
+ md5_checksum_download = md5.hexdigest()
+ if md5_checksum == md5_checksum_download:
+ return True
+ return False
- return __send_request(request_method=request_method, url=url, data=data)
+def _send_request(request_method, url, data, files=None, md5_checksum=None):
+ n_retries = max(1, min(config.connection_n_retries, config.max_retries))
-def __send_request(
- request_method, url, data, files=None,
-):
- n_retries = config.connection_n_retries
response = None
with requests.Session() as session:
# Start at one to have a non-zero multiplier for the sleep
- for i in range(1, n_retries + 1):
+ for retry_counter in range(1, n_retries + 1):
try:
if request_method == "get":
response = session.get(url, params=data)
@@ -198,17 +232,51 @@ def __send_request(
response = session.post(url, data=data, files=files)
else:
raise NotImplementedError()
+ __check_response(response=response, url=url, file_elements=files)
+ if request_method == "get" and not __is_checksum_equal(response.text, md5_checksum):
+ raise OpenMLHashException(
+ "Checksum of downloaded file is unequal to the expected checksum {} "
+ "when downloading {}.".format(md5_checksum, url)
+ )
break
- except (requests.exceptions.ConnectionError, requests.exceptions.SSLError,) as e:
- if i == n_retries:
- raise e
+ except (
+ requests.exceptions.ConnectionError,
+ requests.exceptions.SSLError,
+ OpenMLServerException,
+ xml.parsers.expat.ExpatError,
+ OpenMLHashException,
+ ) as e:
+ if isinstance(e, OpenMLServerException):
+ if e.code not in [107, 500]:
+ # 107: database connection error
+ # 500: internal server error
+ raise
+ elif isinstance(e, xml.parsers.expat.ExpatError):
+ if request_method != "get" or retry_counter >= n_retries:
+ raise OpenMLServerError(
+ "Unexpected server error when calling {}. Please contact the "
+ "developers!\nStatus code: {}\n{}".format(
+ url, response.status_code, response.text,
+ )
+ )
+ if retry_counter >= n_retries:
+ raise
else:
- time.sleep(0.1 * i)
+ time.sleep(retry_counter)
if response is None:
raise ValueError("This should never happen!")
return response
+def __check_response(response, url, file_elements):
+ if response.status_code != 200:
+ raise __parse_server_exception(response, url, file_elements=file_elements)
+ elif (
+ "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip"
+ ):
+ logging.warning("Received uncompressed content from OpenML for {}.".format(url))
+
+
def __parse_server_exception(
response: requests.Response, url: str, file_elements: Dict,
) -> OpenMLServerError:
@@ -217,6 +285,8 @@ def __parse_server_exception(
raise OpenMLServerError("URI too long! ({})".format(url))
try:
server_exception = xmltodict.parse(response.text)
+ except xml.parsers.expat.ExpatError:
+ raise
except Exception:
# OpenML has a sophisticated error system
# where information about failures is provided. try to parse this
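
For reference, a quick sketch of how the new MinIO helper above is meant to be called. The bucket URL is a hypothetical placeholder; in practice the function is invoked through ``_get_dataset_parquet`` with the dataset's ``minio_url``:

```python
# Sketch only: exercises the signature of the private helper added above.
# The bucket URL below is a made-up placeholder of the expected form
# http://<endpoint>/<bucket>/<path/to/object>.
import pathlib

from openml._api_calls import _download_minio_file

source = "http://minio.example.org/datasets/0000/0042/dataset.pq"  # hypothetical URL

# If `destination` is a directory, the object name is appended to it; with
# exists_ok=False a pre-existing file raises FileExistsError, and a missing
# object or inaccessible bucket raises FileNotFoundError.
_download_minio_file(source, destination=pathlib.Path("/tmp"), exists_ok=True)
```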
diff --git a/openml/config.py b/openml/config.py
index 296b71663..9e2e697d5 100644
--- a/openml/config.py
+++ b/openml/config.py
@@ -7,6 +7,8 @@
import logging
import logging.handlers
import os
+from pathlib import Path
+import platform
from typing import Tuple, cast
from io import StringIO
@@ -19,7 +21,7 @@
file_handler = None
-def _create_log_handlers():
+def _create_log_handlers(create_file_handler=True):
""" Creates but does not attach the log handlers. """
global console_handler, file_handler
if console_handler is not None or file_handler is not None:
@@ -32,12 +34,13 @@ def _create_log_handlers():
console_handler = logging.StreamHandler()
console_handler.setFormatter(output_formatter)
- one_mb = 2 ** 20
- log_path = os.path.join(cache_directory, "openml_python.log")
- file_handler = logging.handlers.RotatingFileHandler(
- log_path, maxBytes=one_mb, backupCount=1, delay=True
- )
- file_handler.setFormatter(output_formatter)
+ if create_file_handler:
+ one_mb = 2 ** 20
+ log_path = os.path.join(cache_directory, "openml_python.log")
+ file_handler = logging.handlers.RotatingFileHandler(
+ log_path, maxBytes=one_mb, backupCount=1, delay=True
+ )
+ file_handler.setFormatter(output_formatter)
def _convert_log_levels(log_level: int) -> Tuple[int, int]:
@@ -83,14 +86,18 @@ def set_file_log_level(file_output_level: int):
# Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards)
_defaults = {
- "apikey": None,
+ "apikey": "",
"server": "https://www.openml.org/api/v1/xml",
- "cachedir": os.path.expanduser(os.path.join("~", ".openml", "cache")),
+ "cachedir": (
+ os.environ.get("XDG_CACHE_HOME", os.path.join("~", ".cache", "openml",))
+ if platform.system() == "Linux"
+ else os.path.join("~", ".openml")
+ ),
"avoid_duplicate_runs": "True",
- "connection_n_retries": 2,
+ "connection_n_retries": "10",
+ "max_retries": "20",
}
-config_file = os.path.expanduser(os.path.join("~", ".openml", "config"))
# Default values are actually added here in the _setup() function which is
# called at the end of this module
@@ -115,7 +122,8 @@ def get_server_base_url() -> str:
avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False
# Number of retries if the connection breaks
-connection_n_retries = _defaults["connection_n_retries"]
+connection_n_retries = int(_defaults["connection_n_retries"])
+max_retries = int(_defaults["max_retries"])
class ConfigurationForExamples:
@@ -169,7 +177,7 @@ def stop_using_configuration_for_example(cls):
cls._start_last_called = False
-def _setup():
+def _setup(config=None):
"""Setup openml package. Called on first import.
Reads the config file and sets up apikey, server, cache appropriately.
@@ -183,62 +191,102 @@ def _setup():
global cache_directory
global avoid_duplicate_runs
global connection_n_retries
+ global max_retries
- # read config file, create cache directory
- try:
- os.mkdir(os.path.expanduser(os.path.join("~", ".openml")))
- except FileExistsError:
- # For other errors, we want to propagate the error as openml does not work without cache
- pass
+ if platform.system() == "Linux":
+ config_dir = Path(os.environ.get("XDG_CONFIG_HOME", Path("~") / ".config" / "openml"))
+ else:
+ config_dir = Path("~") / ".openml"
+ # Still use os.path.expanduser to trigger the mock in the unit test
+ config_dir = Path(os.path.expanduser(config_dir))
+ config_file = config_dir / "config"
+
+ # read config file, create directory for config file
+ if not os.path.exists(config_dir):
+ try:
+ os.mkdir(config_dir)
+ cache_exists = True
+ except PermissionError:
+ cache_exists = False
+ else:
+ cache_exists = True
- config = _parse_config()
- apikey = config.get("FAKE_SECTION", "apikey")
- server = config.get("FAKE_SECTION", "server")
+ if config is None:
+ config = _parse_config(config_file)
- short_cache_dir = config.get("FAKE_SECTION", "cachedir")
- cache_directory = os.path.expanduser(short_cache_dir)
+ def _get(config, key):
+ return config.get("FAKE_SECTION", key)
+
+ avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs")
+ else:
+
+ def _get(config, key):
+ return config.get(key)
+
+ avoid_duplicate_runs = config.get("avoid_duplicate_runs")
+ apikey = _get(config, "apikey")
+ server = _get(config, "server")
+ short_cache_dir = _get(config, "cachedir")
+ connection_n_retries = int(_get(config, "connection_n_retries"))
+ max_retries = int(_get(config, "max_retries"))
+
+ cache_directory = os.path.expanduser(short_cache_dir)
# create the cache subdirectory
- try:
- os.mkdir(cache_directory)
- except FileExistsError:
- # For other errors, we want to propagate the error as openml does not work without cache
- pass
-
- avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs")
- connection_n_retries = config.get("FAKE_SECTION", "connection_n_retries")
- if connection_n_retries > 20:
+ if not os.path.exists(cache_directory):
+ try:
+ os.mkdir(cache_directory)
+ except PermissionError:
+ openml_logger.warning(
+ "No permission to create openml cache directory at %s! This can result in "
+ "OpenML-Python not working properly." % cache_directory
+ )
+
+ if cache_exists:
+ _create_log_handlers()
+ else:
+ _create_log_handlers(create_file_handler=False)
+ openml_logger.warning(
+ "No permission to create OpenML directory at %s! This can result in OpenML-Python "
+ "not working properly." % config_dir
+ )
+
+ if connection_n_retries > max_retries:
raise ValueError(
- "A higher number of retries than 20 is not allowed to keep the "
- "server load reasonable"
+ "A higher number of retries than {} is not allowed to keep the "
+ "server load reasonable".format(max_retries)
)
-def _parse_config():
+def _parse_config(config_file: str):
""" Parse the config file, set up defaults. """
config = configparser.RawConfigParser(defaults=_defaults)
- if not os.path.exists(config_file):
- # Create an empty config file if there was none so far
- fh = open(config_file, "w")
- fh.close()
- logger.info(
- "Could not find a configuration file at %s. Going to "
- "create an empty file there." % config_file
- )
-
+ # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file.
+ # Cheat the ConfigParser module by adding a fake section header
+ config_file_ = StringIO()
+ config_file_.write("[FAKE_SECTION]\n")
try:
- # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file.
- # Cheat the ConfigParser module by adding a fake section header
- config_file_ = StringIO()
- config_file_.write("[FAKE_SECTION]\n")
with open(config_file) as fh:
for line in fh:
config_file_.write(line)
- config_file_.seek(0)
- config.read_file(config_file_)
+ except FileNotFoundError:
+ logger.info("No config file found at %s, using default configuration.", config_file)
except OSError as e:
- logger.info("Error opening file %s: %s", config_file, e.message)
+ logger.info("Error opening file %s: %s", config_file, e.args[0])
+ config_file_.seek(0)
+ config.read_file(config_file_)
+ return config
+
+
+def get_config_as_dict():
+ config = dict()
+ config["apikey"] = apikey
+ config["server"] = server
+ config["cachedir"] = cache_directory
+ config["avoid_duplicate_runs"] = avoid_duplicate_runs
+ config["connection_n_retries"] = connection_n_retries
+ config["max_retries"] = max_retries
return config
@@ -253,11 +301,7 @@ def get_cache_directory():
"""
url_suffix = urlparse(server).netloc
reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1])
- if not cache_directory:
- _cachedir = _defaults(cache_directory)
- else:
- _cachedir = cache_directory
- _cachedir = os.path.join(_cachedir, reversed_url_suffix)
+ _cachedir = os.path.join(cache_directory, reversed_url_suffix)
return _cachedir
@@ -285,12 +329,13 @@ def set_cache_directory(cachedir):
)
stop_using_configuration_for_example = ConfigurationForExamples.stop_using_configuration_for_example
+
__all__ = [
"get_cache_directory",
"set_cache_directory",
"start_using_configuration_for_example",
"stop_using_configuration_for_example",
+ "get_config_as_dict",
]
_setup()
-_create_log_handlers()
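
The retry behaviour is now driven by two settings, ``connection_n_retries`` (default 10) and ``max_retries`` (default 20), and the new ``get_config_as_dict`` exposes the effective configuration. A small sketch of inspecting and adjusting them at runtime, using the module-level attributes defined above:

```python
import openml.config

# New helper added in this diff: dump the effective configuration.
print(openml.config.get_config_as_dict())
# e.g. {'apikey': '', 'server': 'https://www.openml.org/api/v1/xml', 'cachedir': '...',
#       'avoid_duplicate_runs': True, 'connection_n_retries': 10, 'max_retries': 20}

# The module-level globals can be adjusted directly; _send_request() reads
# config.connection_n_retries (capped at config.max_retries) on every request.
openml.config.connection_n_retries = 3
```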
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
index eb727b000..a1e2556be 100644
--- a/openml/datasets/data_feature.py
+++ b/openml/datasets/data_feature.py
@@ -1,5 +1,7 @@
# License: BSD 3-Clause
+from typing import List
+
class OpenMLDataFeature(object):
"""
@@ -20,7 +22,14 @@ class OpenMLDataFeature(object):
LEGAL_DATA_TYPES = ["nominal", "numeric", "string", "date"]
- def __init__(self, index, name, data_type, nominal_values, number_missing_values):
+ def __init__(
+ self,
+ index: int,
+ name: str,
+ data_type: str,
+ nominal_values: List[str],
+ number_missing_values: int,
+ ):
if type(index) != int:
raise ValueError("Index is of wrong datatype")
if data_type not in self.LEGAL_DATA_TYPES:
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 8c366dfb8..0c065b855 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -3,7 +3,6 @@
from collections import OrderedDict
import re
import gzip
-import io
import logging
import os
import pickle
@@ -13,7 +12,7 @@
import numpy as np
import pandas as pd
import scipy.sparse
-from warnings import warn
+import xmltodict
from openml.base import OpenMLBase
from .data_feature import OpenMLDataFeature
@@ -34,7 +33,7 @@ class OpenMLDataset(OpenMLBase):
Name of the dataset.
description : str
Description of the dataset.
- format : str
+ data_format : str
Format of the dataset which can be either 'arff' or 'sparse_arff'.
cache_format : str
Format for caching the dataset which can be either 'feather' or 'pickle'.
@@ -97,13 +96,16 @@ class OpenMLDataset(OpenMLBase):
which maps a quality name to a quality value.
dataset: string, optional
Serialized arff dataset string.
+ minio_url: string, optional
+ URL to the MinIO bucket with dataset files
+ parquet_file: string, optional
+ Path to the local parquet file.
"""
def __init__(
self,
name,
description,
- format=None,
data_format="arff",
cache_format="pickle",
dataset_id=None,
@@ -127,9 +129,11 @@ def __init__(
update_comment=None,
md5_checksum=None,
data_file=None,
- features=None,
- qualities=None,
+ features_file: Optional[str] = None,
+ qualities_file: Optional[str] = None,
dataset=None,
+ minio_url: Optional[str] = None,
+ parquet_file: Optional[str] = None,
):
def find_invalid_characters(string, pattern):
invalid_chars = set()
@@ -178,16 +182,8 @@ def find_invalid_characters(string, pattern):
)
self.cache_format = cache_format
- if format is None:
- self.format = data_format
- else:
- warn(
- "The format parameter in the init will be deprecated "
- "in the future."
- "Please use data_format instead",
- DeprecationWarning,
- )
- self.format = format
+ # Has to be called format, otherwise there will be an XML upload error
+ self.format = data_format
self.creator = creator
self.contributor = contributor
self.collection_date = collection_date
@@ -198,7 +194,7 @@ def find_invalid_characters(string, pattern):
self.default_target_attribute = default_target_attribute
self.row_id_attribute = row_id_attribute
if isinstance(ignore_attribute, str):
- self.ignore_attribute = [ignore_attribute]
+ self.ignore_attribute = [ignore_attribute] # type: Optional[List[str]]
elif isinstance(ignore_attribute, list) or ignore_attribute is None:
self.ignore_attribute = ignore_attribute
else:
@@ -212,39 +208,31 @@ def find_invalid_characters(string, pattern):
self.update_comment = update_comment
self.md5_checksum = md5_checksum
self.data_file = data_file
- self.features = None
- self.qualities = None
+ self.parquet_file = parquet_file
self._dataset = dataset
+ self._minio_url = minio_url
- if features is not None:
- self.features = {}
- for idx, xmlfeature in enumerate(features["oml:feature"]):
- nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
- feature = OpenMLDataFeature(
- int(xmlfeature["oml:index"]),
- xmlfeature["oml:name"],
- xmlfeature["oml:data_type"],
- xmlfeature.get("oml:nominal_value"),
- int(nr_missing),
- )
- if idx != feature.index:
- raise ValueError("Data features not provided " "in right order")
- self.features[feature.index] = feature
+ if features_file is not None:
+ self.features = _read_features(
+ features_file
+ ) # type: Optional[Dict[int, OpenMLDataFeature]]
+ else:
+ self.features = None
- self.qualities = _check_qualities(qualities)
+ if qualities_file:
+ self.qualities = _read_qualities(qualities_file) # type: Optional[Dict[str, float]]
+ else:
+ self.qualities = None
if data_file is not None:
- (
- self.data_pickle_file,
- self.data_feather_file,
- self.feather_attribute_file,
- ) = self._create_pickle_in_cache(data_file)
+ rval = self._compressed_cache_file_paths(data_file)
+ self.data_pickle_file = rval[0] if os.path.exists(rval[0]) else None
+ self.data_feather_file = rval[1] if os.path.exists(rval[1]) else None
+ self.feather_attribute_file = rval[2] if os.path.exists(rval[2]) else None
else:
- self.data_pickle_file, self.data_feather_file, self.feather_attribute_file = (
- None,
- None,
- None,
- )
+ self.data_pickle_file = None
+ self.data_feather_file = None
+ self.feather_attribute_file = None
@property
def id(self) -> Optional[int]:
@@ -311,9 +299,11 @@ def __eq__(self, other):
def _download_data(self) -> None:
""" Download ARFF data file to standard cache directory. Set `self.data_file`. """
# import required here to avoid circular import.
- from .functions import _get_dataset_arff
+ from .functions import _get_dataset_arff, _get_dataset_parquet
self.data_file = _get_dataset_arff(self)
+ if self._minio_url is not None:
+ self.parquet_file = _get_dataset_parquet(self)
def _get_arff(self, format: str) -> Dict:
"""Read ARFF file and return decoded arff.
@@ -367,7 +357,7 @@ def decode_arff(fh):
with gzip.open(filename) as fh:
return decode_arff(fh)
else:
- with io.open(filename, encoding="utf8") as fh:
+ with open(filename, encoding="utf8") as fh:
return decode_arff(fh)
def _parse_data_from_arff(
@@ -407,7 +397,7 @@ def _parse_data_from_arff(
categories_names = {}
categorical = []
for i, (name, type_) in enumerate(data["attributes"]):
- # if the feature is nominal and the a sparse matrix is
+ # if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if isinstance(type_, list) and self.format.lower() == "sparse_arff":
try:
@@ -415,12 +405,10 @@ def _parse_data_from_arff(
# can be encoded into integers
pd.factorize(type_)[0]
except ValueError:
- raise ValueError(
- "Categorical data needs to be numeric when " "using sparse ARFF."
- )
+ raise ValueError("Categorical data needs to be numeric when using sparse ARFF.")
# string can only be supported with pandas DataFrame
elif type_ == "STRING" and self.format.lower() == "sparse_arff":
- raise ValueError("Dataset containing strings is not supported " "with sparse ARFF.")
+ raise ValueError("Dataset containing strings is not supported with sparse ARFF.")
# infer the dtype from the ARFF header
if isinstance(type_, list):
@@ -456,6 +444,17 @@ def _parse_data_from_arff(
col.append(
self._unpack_categories(X[column_name], categories_names[column_name])
)
+ elif attribute_dtype[column_name] in ("floating", "integer"):
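+ # Try to down-cast integer-valued columns in the range [0, 255] to uint8 to save memory.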
+ X_col = X[column_name]
+ if X_col.min() >= 0 and X_col.max() <= 255:
+ try:
+ X_col_uint = X_col.astype("uint8")
+ if (X_col == X_col_uint).all():
+ col.append(X_col_uint)
+ continue
+ except ValueError:
+ pass
+ col.append(X[column_name])
else:
col.append(X[column_name])
X = pd.concat(col, axis=1)
@@ -464,152 +463,117 @@ def _parse_data_from_arff(
return X, categorical, attribute_names
- def _create_pickle_in_cache(self, data_file: str) -> Tuple[str, str, str]:
- """ Parse the arff and pickle the result. Update any old pickle objects. """
- data_pickle_file = data_file.replace(".arff", ".pkl.py3")
- data_feather_file = data_file.replace(".arff", ".feather")
- feather_attribute_file = data_file.replace(".arff", ".feather.attributes.pkl.py3")
- if os.path.exists(data_pickle_file) and self.cache_format == "pickle":
- # Load the data to check if the pickle file is outdated (i.e. contains numpy array)
- with open(data_pickle_file, "rb") as fh:
- try:
- data, categorical, attribute_names = pickle.load(fh)
- except EOFError:
- # The file is likely corrupt, see #780.
- # We deal with this when loading the data in `_load_data`.
- return data_pickle_file, data_feather_file, feather_attribute_file
- except ModuleNotFoundError:
- # There was some issue loading the file, see #918
- # We deal with this when loading the data in `_load_data`.
- return data_pickle_file, data_feather_file, feather_attribute_file
- except ValueError as e:
- if "unsupported pickle protocol" in e.args[0]:
- # There was some issue loading the file, see #898
- # We deal with this when loading the data in `_load_data`.
- return data_pickle_file, data_feather_file, feather_attribute_file
- else:
- raise
-
- # Between v0.8 and v0.9 the format of pickled data changed from
- # np.ndarray to pd.DataFrame. This breaks some backwards compatibility,
- # e.g. for `run_model_on_task`. If a local file still exists with
- # np.ndarray data, we reprocess the data file to store a pickled
- # pd.DataFrame blob. See also #646.
- if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data):
- logger.debug("Data pickle file already exists and is up to date.")
- return data_pickle_file, data_feather_file, feather_attribute_file
- elif os.path.exists(data_feather_file) and self.cache_format == "feather":
- # Load the data to check if the pickle file is outdated (i.e. contains numpy array)
- try:
- data = pd.read_feather(data_feather_file)
- except EOFError:
- # The file is likely corrupt, see #780.
- # We deal with this when loading the data in `_load_data`.
- return data_pickle_file, data_feather_file, feather_attribute_file
- except ModuleNotFoundError:
- # There was some issue loading the file, see #918
- # We deal with this when loading the data in `_load_data`.
- return data_pickle_file, data_feather_file, feather_attribute_file
- except ValueError as e:
- if "unsupported pickle protocol" in e.args[0]:
- # There was some issue loading the file, see #898
- # We deal with this when loading the data in `_load_data`.
- return data_pickle_file, data_feather_file, feather_attribute_file
- else:
- raise
+ def _compressed_cache_file_paths(self, data_file: str) -> Tuple[str, str, str]:
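+ """ Return derived cache file paths, e.g. "dataset.arff" or "dataset.pq" maps to
+ ("dataset.pkl.py3", "dataset.feather", "dataset.feather.attributes.pkl.py3"). """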
+ ext = f".{data_file.split('.')[-1]}"
+ data_pickle_file = data_file.replace(ext, ".pkl.py3")
+ data_feather_file = data_file.replace(ext, ".feather")
+ feather_attribute_file = data_file.replace(ext, ".feather.attributes.pkl.py3")
+ return data_pickle_file, data_feather_file, feather_attribute_file
+
+ def _cache_compressed_file_from_file(
+ self, data_file: str
+ ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]:
+ """ Store data from the local file in compressed format.
- logger.debug("Data feather file already exists and is up to date.")
- return data_pickle_file, data_feather_file, feather_attribute_file
+ If a local parquet file is present it will be used instead of the arff file.
+ Sets cache_format to 'pickle' if data is sparse.
+ """
+ (
+ data_pickle_file,
+ data_feather_file,
+ feather_attribute_file,
+ ) = self._compressed_cache_file_paths(data_file)
+
+ if data_file.endswith(".arff"):
+ data, categorical, attribute_names = self._parse_data_from_arff(data_file)
+ elif data_file.endswith(".pq"):
+ try:
+ data = pd.read_parquet(data_file)
+ except Exception as e:
+ raise Exception(f"File: {data_file}") from e
- # At this point either the pickle file does not exist, or it had outdated formatting.
- # We parse the data from arff again and populate the cache with a recent pickle file.
- X, categorical, attribute_names = self._parse_data_from_arff(data_file)
+ categorical = [data[c].dtype.name == "category" for c in data.columns]
+ attribute_names = list(data.columns)
+ else:
+ raise ValueError(f"Unknown file type for file '{data_file}'.")
# Feather format does not work for sparse datasets, so we use pickle for sparse datasets
+ if scipy.sparse.issparse(data):
+ self.cache_format = "pickle"
- if self.cache_format == "feather" and not scipy.sparse.issparse(X):
- logger.info("feather write {}".format(self.name))
- X.to_feather(data_feather_file)
+ logger.info(f"{self.cache_format} write {self.name}")
+ if self.cache_format == "feather":
+ data.to_feather(data_feather_file)
with open(feather_attribute_file, "wb") as fh:
pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
+ self.data_feather_file = data_feather_file
+ self.feather_attribute_file = feather_attribute_file
else:
- logger.info("pickle write {}".format(self.name))
- self.cache_format = "pickle"
with open(data_pickle_file, "wb") as fh:
- pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
- logger.debug(
- "Saved dataset {did}: {name} to file {path}".format(
- did=int(self.dataset_id or -1), name=self.name, path=data_pickle_file
- )
- )
- return data_pickle_file, data_feather_file, feather_attribute_file
+ pickle.dump((data, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL)
+ self.data_pickle_file = data_pickle_file
+
+ data_file = data_pickle_file if self.cache_format == "pickle" else data_feather_file
+ logger.debug(f"Saved dataset {int(self.dataset_id or -1)}: {self.name} to file {data_file}")
+
+ return data, categorical, attribute_names
def _load_data(self):
- """ Load data from pickle or arff. Download data first if not present on disk. """
- if (self.cache_format == "pickle" and self.data_pickle_file is None) or (
- self.cache_format == "feather" and self.data_feather_file is None
- ):
+ """ Load data from compressed format or arff. Download data if not present on disk. """
+ need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None
+ need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None
+
+ if need_to_create_pickle or need_to_create_feather:
if self.data_file is None:
self._download_data()
- (
- self.data_pickle_file,
- self.data_feather_file,
- self.feather_attribute_file,
- ) = self._create_pickle_in_cache(self.data_file)
+ file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
+ return self._cache_compressed_file_from_file(file_to_load)
+
+ # helper variable to help identify where errors occur
+ fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file
+ logger.info(f"{self.cache_format} load data {self.name}")
try:
if self.cache_format == "feather":
- logger.info("feather load data {}".format(self.name))
data = pd.read_feather(self.data_feather_file)
-
+ fpath = self.feather_attribute_file
with open(self.feather_attribute_file, "rb") as fh:
categorical, attribute_names = pickle.load(fh)
else:
- logger.info("pickle load data {}".format(self.name))
with open(self.data_pickle_file, "rb") as fh:
data, categorical, attribute_names = pickle.load(fh)
- except EOFError:
- logger.warning(
- "Detected a corrupt cache file loading dataset %d: '%s'. "
- "We will continue loading data from the arff-file, "
- "but this will be much slower for big datasets. "
- "Please manually delete the cache file if you want OpenML-Python "
- "to attempt to reconstruct it."
- "" % (self.dataset_id, self.data_pickle_file)
- )
- data, categorical, attribute_names = self._parse_data_from_arff(self.data_file)
except FileNotFoundError:
- raise ValueError(
- "Cannot find a pickle file for dataset {} at "
- "location {} ".format(self.name, self.data_pickle_file)
- )
- except ModuleNotFoundError as e:
+ raise ValueError(f"Cannot find file for dataset {self.name} at location '{fpath}'.")
+ except (EOFError, ModuleNotFoundError, ValueError) as e:
+ error_message = e.message if hasattr(e, "message") else e.args[0]
+ hint = ""
+
+ if isinstance(e, EOFError):
+ readable_error = "Detected a corrupt cache file"
+ elif isinstance(e, ModuleNotFoundError):
+ readable_error = "Detected likely dependency issues"
+ hint = "This is most likely due to https://github.com/openml/openml-python/issues/918. " # noqa: 501
+ elif isinstance(e, ValueError) and "unsupported pickle protocol" in e.args[0]:
+ readable_error = "Encountered unsupported pickle protocol"
+ else:
+ raise # an unknown ValueError was raised; re-raise so it surfaces and can be reported as a bug
+
logger.warning(
- "Encountered error message when loading cached dataset %d: '%s'. "
- "Error message was: %s. "
- "This is most likely due to https://github.com/openml/openml-python/issues/918. "
+ f"{readable_error} when loading dataset {self.id} from '{fpath}'. "
+ f"{hint}"
+ f"Error message was: {error_message}. "
"We will continue loading data from the arff-file, "
"but this will be much slower for big datasets. "
"Please manually delete the cache file if you want OpenML-Python "
"to attempt to reconstruct it."
- "" % (self.dataset_id, self.data_pickle_file, e.args[0]),
)
data, categorical, attribute_names = self._parse_data_from_arff(self.data_file)
- except ValueError as e:
- if "unsupported pickle protocol" in e.args[0]:
- logger.warning(
- "Encountered unsupported pickle protocol when loading cached dataset %d: '%s'. "
- "Error message was: %s. "
- "We will continue loading data from the arff-file, "
- "but this will be much slower for big datasets. "
- "Please manually delete the cache file if you want OpenML-Python "
- "to attempt to reconstruct it."
- "" % (self.dataset_id, self.data_pickle_file, e.args[0]),
- )
- data, categorical, attribute_names = self._parse_data_from_arff(self.data_file)
- else:
- raise
+ data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data)
+ if self.cache_format == "pickle" and not data_up_to_date:
+ logger.info("Updating outdated pickle file.")
+ file_to_load = self.data_file if self.parquet_file is None else self.parquet_file
+ return self._cache_compressed_file_from_file(file_to_load)
return data, categorical, attribute_names
@staticmethod
@@ -675,6 +639,11 @@ def _encode_if_category(column):
@staticmethod
def _unpack_categories(series, categories):
+ # nan-likes can not be explicitly specified as a category
+ def valid_category(cat):
+ return isinstance(cat, str) or (cat is not None and not np.isnan(cat))
+
+ filtered_categories = [c for c in categories if valid_category(c)]
col = []
for x in series:
try:
@@ -683,7 +652,7 @@ def _unpack_categories(series, categories):
col.append(np.nan)
# We require two lines to create a series of categories as detailed here:
# https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation # noqa E501
- raw_cat = pd.Categorical(col, ordered=True, categories=categories)
+ raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories)
return pd.Series(raw_cat, index=series.index, name=series.name)
def get_data(
@@ -742,7 +711,7 @@ def get_data(
to_exclude.extend(self.ignore_attribute)
if len(to_exclude) > 0:
- logger.info("Going to remove the following attributes:" " %s" % to_exclude)
+ logger.info("Going to remove the following attributes: %s" % to_exclude)
keep = np.array(
[True if column not in to_exclude else False for column in attribute_names]
)
@@ -809,6 +778,10 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[
-------
list
"""
+ if self.features is None:
+ raise ValueError(
+ "retrieve_class_labels can only be called if feature information is available."
+ )
for feature in self.features.values():
if (feature.name == target_name) and (feature.data_type == "nominal"):
return feature.nominal_values
@@ -937,18 +910,73 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
return data_container
-def _check_qualities(qualities):
- if qualities is not None:
- qualities_ = {}
- for xmlquality in qualities:
- name = xmlquality["oml:name"]
- if xmlquality.get("oml:value", None) is None:
- value = float("NaN")
- elif xmlquality["oml:value"] == "null":
- value = float("NaN")
- else:
- value = float(xmlquality["oml:value"])
- qualities_[name] = value
- return qualities_
- else:
- return None
+def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
+ features_pickle_file = _get_features_pickle_file(features_file)
+ try:
+ with open(features_pickle_file, "rb") as fh_binary:
+ features = pickle.load(fh_binary)
+ except: # noqa E722
+ with open(features_file, encoding="utf8") as fh:
+ features_xml_string = fh.read()
+ xml_dict = xmltodict.parse(
+ features_xml_string, force_list=("oml:feature", "oml:nominal_value")
+ )
+ features_xml = xml_dict["oml:data_features"]
+
+ features = {}
+ for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
+ nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
+ feature = OpenMLDataFeature(
+ int(xmlfeature["oml:index"]),
+ xmlfeature["oml:name"],
+ xmlfeature["oml:data_type"],
+ xmlfeature.get("oml:nominal_value"),
+ int(nr_missing),
+ )
+ if idx != feature.index:
+ raise ValueError("Data features not provided in right order")
+ features[feature.index] = feature
+
+ with open(features_pickle_file, "wb") as fh_binary:
+ pickle.dump(features, fh_binary)
+ return features
+
+
+def _get_features_pickle_file(features_file: str) -> str:
+ """This function only exists so it can be mocked during unit testing"""
+ return features_file + ".pkl"
+
+
+def _read_qualities(qualities_file: str) -> Dict[str, float]:
+ qualities_pickle_file = _get_qualities_pickle_file(qualities_file)
+ try:
+ with open(qualities_pickle_file, "rb") as fh_binary:
+ qualities = pickle.load(fh_binary)
+ except: # noqa E722
+ with open(qualities_file, encoding="utf8") as fh:
+ qualities_xml = fh.read()
+ xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
+ qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
+ qualities = _check_qualities(qualities)
+ with open(qualities_pickle_file, "wb") as fh_binary:
+ pickle.dump(qualities, fh_binary)
+ return qualities
+
+
+def _get_qualities_pickle_file(qualities_file: str) -> str:
+ """This function only exists so it can be mocked during unit testing"""
+ return qualities_file + ".pkl"
+
+
+def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]:
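+ """Convert the raw qualities parsed from XML into a mapping from quality name to float.
+
+ For example (values are illustrative):
+ [{"oml:name": "NumberOfInstances", "oml:value": "150"}] maps to {"NumberOfInstances": 150.0};
+ missing or "null" values become NaN.
+ """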
+ qualities_ = {}
+ for xmlquality in qualities:
+ name = xmlquality["oml:name"]
+ if xmlquality.get("oml:value", None) is None:
+ value = float("NaN")
+ elif xmlquality["oml:value"] == "null":
+ value = float("NaN")
+ else:
+ value = float(xmlquality["oml:value"])
+ qualities_[name] = value
+ return qualities_
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 84943b244..746285650 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -3,8 +3,7 @@
import io
import logging
import os
-import re
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, cast
import numpy as np
import arff
@@ -18,13 +17,11 @@
import openml._api_calls
from .dataset import OpenMLDataset
from ..exceptions import (
- OpenMLCacheException,
OpenMLHashException,
OpenMLServerException,
OpenMLPrivateDatasetError,
)
from ..utils import (
- _create_cache_directory,
_remove_cache_dir_for_id,
_create_cache_directory_for_id,
)
@@ -37,118 +34,6 @@
# Local getters/accessors to the cache directory
-def _list_cached_datasets():
- """ Return list with ids of all cached datasets.
-
- Returns
- -------
- list
- List with IDs of all cached datasets.
- """
- datasets = []
-
- dataset_cache_dir = _create_cache_directory(DATASETS_CACHE_DIR_NAME)
- directory_content = os.listdir(dataset_cache_dir)
- directory_content.sort()
-
- # Find all dataset ids for which we have downloaded the dataset
- # description
- for directory_name in directory_content:
- # First check if the directory name could be an OpenML dataset id
- if not re.match(r"[0-9]*", directory_name):
- continue
-
- dataset_id = int(directory_name)
-
- directory_name = os.path.join(dataset_cache_dir, directory_name)
- dataset_directory_content = os.listdir(directory_name)
-
- if (
- "dataset.arff" in dataset_directory_content
- and "description.xml" in dataset_directory_content
- ):
- if dataset_id not in datasets:
- datasets.append(dataset_id)
-
- datasets.sort()
- return datasets
-
-
-def _get_cached_datasets():
- """Searches for all OpenML datasets in the OpenML cache dir.
-
- Return a dictionary which maps dataset ids to dataset objects"""
- dataset_list = _list_cached_datasets()
- datasets = OrderedDict()
-
- for dataset_id in dataset_list:
- datasets[dataset_id] = _get_cached_dataset(dataset_id)
-
- return datasets
-
-
-def _get_cached_dataset(dataset_id: int) -> OpenMLDataset:
- """Get cached dataset for ID.
-
- Returns
- -------
- OpenMLDataset
- """
- description = _get_cached_dataset_description(dataset_id)
- arff_file = _get_cached_dataset_arff(dataset_id)
- features = _get_cached_dataset_features(dataset_id)
- qualities = _get_cached_dataset_qualities(dataset_id)
- dataset = _create_dataset_from_description(description, features, qualities, arff_file)
-
- return dataset
-
-
-def _get_cached_dataset_description(dataset_id):
- did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,)
- description_file = os.path.join(did_cache_dir, "description.xml")
- try:
- with io.open(description_file, encoding="utf8") as fh:
- dataset_xml = fh.read()
- return xmltodict.parse(dataset_xml)["oml:data_set_description"]
- except (IOError, OSError):
- raise OpenMLCacheException(
- "Dataset description for dataset id %d not " "cached" % dataset_id
- )
-
-
-def _get_cached_dataset_features(dataset_id):
- did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,)
- features_file = os.path.join(did_cache_dir, "features.xml")
- try:
- return _load_features_from_file(features_file)
- except (IOError, OSError):
- raise OpenMLCacheException("Dataset features for dataset id %d not " "cached" % dataset_id)
-
-
-def _get_cached_dataset_qualities(dataset_id):
- did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,)
- qualities_file = os.path.join(did_cache_dir, "qualities.xml")
- try:
- with io.open(qualities_file, encoding="utf8") as fh:
- qualities_xml = fh.read()
- qualities_dict = xmltodict.parse(qualities_xml)
- return qualities_dict["oml:data_qualities"]["oml:quality"]
- except (IOError, OSError):
- raise OpenMLCacheException("Dataset qualities for dataset id %d not " "cached" % dataset_id)
-
-
-def _get_cached_dataset_arff(dataset_id):
- did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,)
- output_file = os.path.join(did_cache_dir, "dataset.arff")
-
- try:
- with io.open(output_file, encoding="utf8"):
- pass
- return output_file
- except (OSError, IOError):
- raise OpenMLCacheException("ARFF file for dataset id %d not " "cached" % dataset_id)
-
-
def _get_cache_directory(dataset: OpenMLDataset) -> str:
""" Return the cache directory of the OpenMLDataset """
return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id)
@@ -183,7 +68,7 @@ def list_datasets(
status: Optional[str] = None,
tag: Optional[str] = None,
output_format: str = "dict",
- **kwargs
+ **kwargs,
) -> Union[Dict, pd.DataFrame]:
"""
@@ -251,7 +136,7 @@ def list_datasets(
size=size,
status=status,
tag=tag,
- **kwargs
+ **kwargs,
)
@@ -326,34 +211,59 @@ def __list_datasets(api_call, output_format="dict"):
return datasets
-def _load_features_from_file(features_file: str) -> Dict:
- with io.open(features_file, encoding="utf8") as fh:
- features_xml = fh.read()
- xml_dict = xmltodict.parse(features_xml, force_list=("oml:feature", "oml:nominal_value"))
- return xml_dict["oml:data_features"]
+def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]:
+ expanded_parameter = []
+ if isinstance(parameter, str):
+ expanded_parameter = [x.strip() for x in parameter.split(",")]
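+ # e.g. "att1, att2,att3" -> ["att1", "att2", "att3"]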
+ elif isinstance(parameter, list):
+ expanded_parameter = parameter
+ return expanded_parameter
+
+
+def _validated_data_attributes(
+ attributes: List[str], data_attributes: List[str], parameter_name: str
+) -> None:
+ for attribute_ in attributes:
+ is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes])
+ if not is_attribute_a_data_attribute:
+ raise ValueError(
+ "all attribute of '{}' should be one of the data attribute. "
+ " Got '{}' while candidates are {}.".format(
+ parameter_name, attribute_, [attr[0] for attr in data_attributes]
+ )
+ )
-def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]:
+def check_datasets_active(
+ dataset_ids: List[int], raise_error_if_not_exist: bool = True,
+) -> Dict[int, bool]:
"""
Check if the dataset ids provided are active.
+ By default, raises an error if a dataset_id in the given list
+ of dataset_ids does not exist on the server.
+
Parameters
----------
dataset_ids : List[int]
A list of integers representing dataset ids.
+ raise_error_if_not_exist : bool (default=True)
+ If `True`, raise an error if one or more of the given
+ dataset ids do not exist on the server.
Returns
-------
dict
A dictionary with items {did: bool}
"""
- dataset_list = list_datasets(status="all")
+ dataset_list = list_datasets(status="all", data_id=dataset_ids)
active = {}
for did in dataset_ids:
dataset = dataset_list.get(did, None)
if dataset is None:
- raise ValueError("Could not find dataset {} in OpenML dataset list.".format(did))
+ if raise_error_if_not_exist:
+ raise ValueError(f"Could not find dataset {did} in OpenML dataset list.")
else:
active[did] = dataset["status"] == "active"
@@ -380,6 +290,8 @@ def _name_to_id(
error_if_multiple : bool (default=False)
If `False`, if multiple datasets match, return the least recent active dataset.
If `True`, if multiple datasets match, raise an error.
+ download_qualities : bool, optional (default=True)
+ If `True`, also download the qualities.xml file. If `False`, skip downloading qualities.xml.
Returns
-------
@@ -400,7 +312,7 @@ def _name_to_id(
def get_datasets(
- dataset_ids: List[Union[str, int]], download_data: bool = True,
+ dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True
) -> List[OpenMLDataset]:
"""Download datasets.
@@ -416,6 +328,8 @@ def get_datasets(
make the operation noticeably slower. Metadata is also still retrieved.
If False, create the OpenMLDataset and only populate it with the metadata.
The data may later be retrieved through the `OpenMLDataset.get_data` method.
+ download_qualities : bool, optional (default=True)
+ If `True`, also download the qualities.xml file. If `False`, skip downloading qualities.xml.
Returns
-------
@@ -424,7 +338,9 @@ def get_datasets(
"""
datasets = []
for dataset_id in dataset_ids:
- datasets.append(get_dataset(dataset_id, download_data))
+ datasets.append(
+ get_dataset(dataset_id, download_data, download_qualities=download_qualities)
+ )
return datasets
@@ -435,6 +351,7 @@ def get_dataset(
version: int = None,
error_if_multiple: bool = False,
cache_format: str = "pickle",
+ download_qualities: bool = True,
) -> OpenMLDataset:
""" Download the OpenML dataset representation, optionally also download actual data file.
@@ -489,21 +406,28 @@ def get_dataset(
did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,)
+ remove_dataset_cache = True
try:
- remove_dataset_cache = True
description = _get_dataset_description(did_cache_dir, dataset_id)
- features = _get_dataset_features(did_cache_dir, dataset_id)
+ features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
try:
- qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
+ if download_qualities:
+ qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
+ else:
+ qualities_file = ""
except OpenMLServerException as e:
if e.code == 362 and str(e) == "No qualities found - None":
logger.warning("No qualities found for dataset {}".format(dataset_id))
- qualities = None
+ qualities_file = None
else:
raise
arff_file = _get_dataset_arff(description) if download_data else None
+ if "oml:minio_url" in description and download_data:
+ parquet_file = _get_dataset_parquet(description)
+ else:
+ parquet_file = None
remove_dataset_cache = False
except OpenMLServerException as e:
# if there was an exception,
@@ -517,7 +441,7 @@ def get_dataset(
_remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)
dataset = _create_dataset_from_description(
- description, features, qualities, arff_file, cache_format
+ description, features_file, qualities_file, arff_file, parquet_file, cache_format
)
return dataset
@@ -636,6 +560,7 @@ def create_dataset(
ignore_attribute : str | list
Attributes that should be excluded in modelling,
such as identifiers and indexes.
+ Can have multiple values, comma separated.
citation : str
Reference(s) that should be cited when building on this data.
version_label : str, optional
@@ -687,6 +612,11 @@ def create_dataset(
attributes_[attr_idx] = (attr_name, attributes[attr_name])
else:
attributes_ = attributes
+ ignore_attributes = _expand_parameter(ignore_attribute)
+ _validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute")
+
+ default_target_attributes = _expand_parameter(default_target_attribute)
+ _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute")
if row_id_attribute is not None:
is_row_id_an_attribute = any([attr[0] == row_id_attribute for attr in attributes_])
@@ -943,6 +873,47 @@ def fork_dataset(data_id: int) -> int:
return int(data_id)
+def _topic_add_dataset(data_id: int, topic: str):
+ """
+ Adds a topic for a dataset.
+ This API is not available to all OpenML users; it is accessible only to admins.
+ Parameters
+ ----------
+ data_id : int
+ id of the dataset for which the topic needs to be added
+ topic : str
+ Topic to be added for the dataset
+ """
+ if not isinstance(data_id, int):
+ raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
+ form_data = {"data_id": data_id, "topic": topic}
+ result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data)
+ result = xmltodict.parse(result_xml)
+ data_id = result["oml:data_topic"]["oml:id"]
+ return int(data_id)
+
+
+def _topic_delete_dataset(data_id: int, topic: str):
+ """
+ Removes a topic from a dataset.
+ This API is not available to all OpenML users; it is accessible only to admins.
+ Parameters
+ ----------
+ data_id : int
+ id of the dataset from which the topic needs to be removed
+ topic : str
+ Topic to be deleted
+
+ """
+ if not isinstance(data_id, int):
+ raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
+ form_data = {"data_id": data_id, "topic": topic}
+ result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data)
+ result = xmltodict.parse(result_xml)
+ data_id = result["oml:data_topic"]["oml:id"]
+ return int(data_id)
+
+
def _get_dataset_description(did_cache_dir, dataset_id):
"""Get the dataset description as xml dictionary.
@@ -969,8 +940,9 @@ def _get_dataset_description(did_cache_dir, dataset_id):
description_file = os.path.join(did_cache_dir, "description.xml")
try:
- return _get_cached_dataset_description(dataset_id)
- except OpenMLCacheException:
+ with io.open(description_file, encoding="utf8") as fh:
+ dataset_xml = fh.read()
+ except Exception:
url_extension = "data/{}".format(dataset_id)
dataset_xml = openml._api_calls._perform_api_call(url_extension, "get")
with io.open(description_file, "w", encoding="utf8") as fh:
@@ -981,6 +953,55 @@ def _get_dataset_description(did_cache_dir, dataset_id):
return description
+def _get_dataset_parquet(
+ description: Union[Dict, OpenMLDataset], cache_directory: str = None
+) -> Optional[str]:
+ """ Return the path to the local parquet file of the dataset. If is not cached, it is downloaded.
+
+ Checks if the file is in the cache, if yes, return the path to the file.
+ If not, downloads the file and caches it, then returns the file path.
+ The cache directory is generated based on dataset information, but can also be specified.
+
+ This function is NOT thread/multiprocessing safe.
+ Unlike the ARFF equivalent, checksums are not available/used (for now).
+
+ Parameters
+ ----------
+ description : dictionary or OpenMLDataset
+ Either a dataset description as dict or OpenMLDataset.
+
+ cache_directory: str, optional (default=None)
+ Folder to store the parquet file in.
+ If None, use the default cache directory for the dataset.
+
+ Returns
+ -------
+ output_filename : string, optional
+ Location of the Parquet file if successfully downloaded, None otherwise.
+ """
+ if isinstance(description, dict):
+ url = description.get("oml:minio_url")
+ did = description.get("oml:id")
+ elif isinstance(description, OpenMLDataset):
+ url = description._minio_url
+ did = description.dataset_id
+ else:
+ raise TypeError("`description` should be either OpenMLDataset or Dict.")
+
+ if cache_directory is None:
+ cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did)
+ output_file_path = os.path.join(cache_directory, "dataset.pq")
+
+ if not os.path.isfile(output_file_path):
+ try:
+ openml._api_calls._download_minio_file(
+ source=cast(str, url), destination=output_file_path
+ )
+ except FileNotFoundError:
+ return None
+ return output_file_path
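+# Minimal usage sketch (illustrative; assumes the dataset description carries an
+# "oml:minio_url", otherwise no parquet file is available):
+#
+#   parquet_path = _get_dataset_parquet(description)
+#   if parquet_path is not None:
+#       df = pd.read_parquet(parquet_path)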
+
+
def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: str = None) -> str:
""" Return the path to the local arff file of the dataset. If is not cached, it is downloaded.
@@ -1031,8 +1052,8 @@ def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory:
return output_file_path
-def _get_dataset_features(did_cache_dir, dataset_id):
- """API call to get dataset features (cached)
+def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str:
+ """API call to load dataset features. Loads from cache or downloads them.
Features are feature descriptions for each column.
(name, index, categorical, ...)
@@ -1049,8 +1070,8 @@ def _get_dataset_features(did_cache_dir, dataset_id):
Returns
-------
- features : dict
- Dictionary containing dataset feature descriptions, parsed from XML.
+ str
+ Path of the cached dataset feature file
"""
features_file = os.path.join(did_cache_dir, "features.xml")
@@ -1061,11 +1082,11 @@ def _get_dataset_features(did_cache_dir, dataset_id):
with io.open(features_file, "w", encoding="utf8") as fh:
fh.write(features_xml)
- return _load_features_from_file(features_file)
+ return features_file
-def _get_dataset_qualities(did_cache_dir, dataset_id):
- """API call to get dataset qualities (cached)
+def _get_dataset_qualities_file(did_cache_dir, dataset_id):
+ """API call to load dataset qualities. Loads from cache or downloads them.
Features are metafeatures (number of features, number of classes, ...)
@@ -1079,10 +1100,12 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
dataset_id : int
Dataset ID
+ download_qualities : bool
+ Whether to download or to use the cached version, if available.
Returns
-------
- qualities : dict
- Dictionary containing dataset qualities, parsed from XML.
+ str
+ Path of the cached qualities file
"""
# Dataset qualities are subject to change and must be fetched every time
qualities_file = os.path.join(did_cache_dir, "qualities.xml")
@@ -1092,21 +1115,17 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
except (OSError, IOError):
url_extension = "data/qualities/{}".format(dataset_id)
qualities_xml = openml._api_calls._perform_api_call(url_extension, "get")
-
with io.open(qualities_file, "w", encoding="utf8") as fh:
fh.write(qualities_xml)
-
- xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
- qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
-
- return qualities
+ return qualities_file
def _create_dataset_from_description(
description: Dict[str, str],
- features: Dict,
- qualities: List,
+ features_file: str,
+ qualities_file: str,
arff_file: str = None,
+ parquet_file: str = None,
cache_format: str = "pickle",
) -> OpenMLDataset:
"""Create a dataset object from a description dict.
@@ -1115,12 +1134,14 @@ def _create_dataset_from_description(
----------
description : dict
Description of a dataset in xml dict.
- features : dict
- Description of a dataset features.
+ features_file : str
+ Path to the dataset features XML file.
- qualities : list
- Description of a dataset qualities.
+ qualities_file : str
+ Path to the dataset qualities XML file.
arff_file : string, optional
Path of dataset ARFF file.
+ parquet_file : string, optional
+ Path of dataset Parquet file.
cache_format: string, optional
Caching option for datasets (feather/pickle)
@@ -1155,8 +1176,10 @@ def _create_dataset_from_description(
md5_checksum=description.get("oml:md5_checksum"),
data_file=arff_file,
cache_format=cache_format,
- features=features,
- qualities=qualities,
+ features_file=features_file,
+ qualities_file=qualities_file,
+ minio_url=description.get("oml:minio_url"),
+ parquet_file=parquet_file,
)
diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py
index 2d06b69e0..4529ad163 100644
--- a/openml/extensions/extension_interface.py
+++ b/openml/extensions/extension_interface.py
@@ -229,6 +229,19 @@ def obtain_parameter_values(
- ``oml:component`` : int: flow id to which the parameter belongs
"""
+ @abstractmethod
+ def check_if_model_fitted(self, model: Any) -> bool:
+ """Returns True/False denoting if the model has already been fitted/trained.
+
+ Parameters
+ ----------
+ model : Any
+
+ Returns
+ -------
+ bool
+ """
+
################################################################################################
# Abstract methods for hyperparameter optimization
diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py
index 2003934db..135e5ccf6 100644
--- a/openml/extensions/sklearn/__init__.py
+++ b/openml/extensions/sklearn/__init__.py
@@ -7,3 +7,31 @@
__all__ = ["SklearnExtension"]
register_extension(SklearnExtension)
+
+
+def cont(X):
+ """Returns True for all non-categorical columns, False for the rest.
+
+ This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling
+ of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
+ required to process each type of columns separately.
+ This function allows transformations meant for continuous/numeric columns to access the
+ continuous/numeric columns given the dataset as DataFrame.
+ """
+ if not hasattr(X, "dtypes"):
+ raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
+ return X.dtypes != "category"
+
+
+def cat(X):
+ """Returns True for all categorical columns, False for the rest.
+
+ This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling
+ of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is
+ required to process each type of columns separately.
+ This function allows transformations meant for categorical columns to access the
+ categorical columns given the dataset as DataFrame.
+ """
+ if not hasattr(X, "dtypes"):
+ raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!")
+ return X.dtypes == "category"
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index edb14487b..3441b4a4e 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -52,7 +52,10 @@
SIMPLE_NUMPY_TYPES = [
- nptype for type_cat, nptypes in np.sctypes.items() for nptype in nptypes if type_cat != "others"
+ nptype
+ for type_cat, nptypes in np.sctypes.items()
+ for nptype in nptypes # type: ignore
+ if type_cat != "others"
]
SIMPLE_TYPES = tuple([bool, int, float, str] + SIMPLE_NUMPY_TYPES)
@@ -211,6 +214,61 @@ def remove_all_in_parentheses(string: str) -> str:
return short_name.format(pipeline)
+ @classmethod
+ def _min_dependency_str(cls, sklearn_version: str) -> str:
+ """ Returns a string containing the minimum dependencies for the sklearn version passed.
+
+ Parameters
+ ----------
+ sklearn_version : str
+ A version string of the form xx.xx.xx
+
+ Returns
+ -------
+ str
+ """
+ openml_major_version = int(LooseVersion(openml.__version__).version[1])
+ # This explicit check is necessary to support existing entities on the OpenML servers
+ # that used the fixed dependency string (in the else block)
+ if openml_major_version > 11:
+ # OpenML v0.11 onwards supports sklearn>=0.24
+ # assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with
+ # variables declared for extracting minimum dependency for that version
+ if LooseVersion(sklearn_version) >= "0.24":
+ from sklearn import _min_dependencies as _mindep
+
+ dependency_list = {
+ "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION),
+ "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION),
+ "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION),
+ "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION),
+ }
+ elif LooseVersion(sklearn_version) >= "0.23":
+ dependency_list = {
+ "numpy": "1.13.3",
+ "scipy": "0.19.1",
+ "joblib": "0.11",
+ "threadpoolctl": "2.0.0",
+ }
+ if LooseVersion(sklearn_version).version[2] == 0:
+ dependency_list.pop("threadpoolctl")
+ elif LooseVersion(sklearn_version) >= "0.21":
+ dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"}
+ elif LooseVersion(sklearn_version) >= "0.19":
+ dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"}
+ else:
+ dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
+ else:
+ # this is INCORRECT for sklearn versions >= 0.19 and < 0.24
+ # given that OpenML has existing flows uploaded with such dependency information,
+ # we change no behaviour for older sklearn version, however from 0.24 onwards
+ # the dependency list will be accurately updated for any flow uploaded to OpenML
+ dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
+
+ sklearn_dep = "sklearn=={}".format(sklearn_version)
+ dep_str = "\n".join(["{}>={}".format(k, v) for k, v in dependency_list.items()])
+ return "\n".join([sklearn_dep, dep_str])
+
################################################################################################
# Methods for flow serialization and de-serialization
@@ -491,7 +549,7 @@ def get_version_information(self) -> List[str]:
major, minor, micro, _, _ = sys.version_info
python_version = "Python_{}.".format(".".join([str(major), str(minor), str(micro)]))
sklearn_version = "Sklearn_{}.".format(sklearn.__version__)
- numpy_version = "NumPy_{}.".format(numpy.__version__)
+ numpy_version = "NumPy_{}.".format(numpy.__version__) # type: ignore
scipy_version = "SciPy_{}.".format(scipy.__version__)
return [python_version, sklearn_version, numpy_version, scipy_version]
@@ -508,8 +566,7 @@ def create_setup_string(self, model: Any) -> str:
str
"""
run_environment = " ".join(self.get_version_information())
- # fixme str(model) might contain (...)
- return run_environment + " " + str(model)
+ return run_environment
def _is_cross_validator(self, o: Any) -> bool:
return isinstance(o, sklearn.model_selection.BaseCrossValidator)
@@ -769,20 +826,13 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
tags=tags,
extension=self,
language="English",
- # TODO fill in dependencies!
dependencies=dependencies,
)
return flow
def _get_dependencies(self) -> str:
- dependencies = "\n".join(
- [
- self._format_external_version("sklearn", sklearn.__version__,),
- "numpy>=1.6.1",
- "scipy>=0.9",
- ]
- )
+ dependencies = self._min_dependency_str(sklearn.__version__)
return dependencies
def _get_tags(self) -> List[str]:
@@ -1189,11 +1239,11 @@ def _check_dependencies(self, dependencies: str, strict_version: bool = True) ->
def _serialize_type(self, o: Any) -> "OrderedDict[str, str]":
mapping = {
float: "float",
- np.float: "np.float",
+ np.float: "np.float", # type: ignore
np.float32: "np.float32",
np.float64: "np.float64",
int: "int",
- np.int: "np.int",
+ np.int: "np.int", # type: ignore
np.int32: "np.int32",
np.int64: "np.int64",
}
@@ -1205,11 +1255,11 @@ def _serialize_type(self, o: Any) -> "OrderedDict[str, str]":
def _deserialize_type(self, o: str) -> Any:
mapping = {
"float": float,
- "np.float": np.float,
+ "np.float": np.float, # type: ignore
"np.float32": np.float32,
"np.float64": np.float64,
"int": int,
- "np.int": np.int,
+ "np.int": np.int, # type: ignore
"np.int32": np.int32,
"np.int64": np.int64,
}
@@ -1537,6 +1587,37 @@ def _seed_current_object(current_value):
model.set_params(**random_states)
return model
+ def check_if_model_fitted(self, model: Any) -> bool:
+ """Returns True/False denoting if the model has already been fitted/trained
+
+ Parameters
+ ----------
+ model : Any
+
+ Returns
+ -------
+ bool
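+
+ Examples
+ --------
+ A minimal sketch, assuming scikit-learn's ``DecisionTreeClassifier``:
+
+ >>> from sklearn.tree import DecisionTreeClassifier
+ >>> SklearnExtension().check_if_model_fitted(DecisionTreeClassifier())
+ False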
+ """
+ try:
+ # check if model is fitted
+ from sklearn.exceptions import NotFittedError
+
+ # Creating random dummy data of arbitrary size
+ dummy_data = np.random.uniform(size=(10, 3))
+ # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more
+ # robust check that works across sklearn versions and models. Internally, 'predict'
+ # should call 'check_is_fitted' for every concerned attribute, thus offering a more
+ # assured check than explicit calls to 'check_is_fitted'
+ model.predict(dummy_data)
+ # Will reach here if the model was fit on a dataset with 3 features
+ return True
+ except NotFittedError: # needs to be the first exception to be caught
+ # Model is not fitted, as is required
+ return False
+ except ValueError:
+ # Will reach here if the model was fit on a dataset with more or less than 3 features
+ return True
+
def _run_model_on_fold(
self,
model: Any,
@@ -1546,7 +1627,9 @@ def _run_model_on_fold(
fold_no: int,
y_train: Optional[np.ndarray] = None,
X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
- ) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]:
+ ) -> Tuple[
+ np.ndarray, Optional[pd.DataFrame], "OrderedDict[str, float]", Optional[OpenMLRunTrace]
+ ]:
"""Run a model on a repeat,fold,subsample triplet of the task and return prediction
information.
@@ -1581,19 +1664,21 @@ def _run_model_on_fold(
-------
pred_y : np.ndarray
Predictions on the training/test set, depending on the task type.
- For supervised tasks, predicitons are on the test set.
- For unsupervised tasks, predicitons are on the training set.
- proba_y : pd.DataFrame
+ For supervised tasks, predictions are on the test set.
+ For unsupervised tasks, predictions are on the training set.
+ proba_y : pd.DataFrame, optional
Predicted probabilities for the test set.
None, if task is not Classification or Learning Curve prediction.
user_defined_measures : OrderedDict[str, float]
User defined measures that were generated on this fold
- trace : Optional[OpenMLRunTrace]]
+ trace : OpenMLRunTrace, optional
arff trace object from a fitted model and the trace content obtained by
repeatedly calling ``run_model_on_task``
"""
- def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame:
+ def _prediction_to_probabilities(
+ y: Union[np.ndarray, List], model_classes: List[Any], class_labels: Optional[List[str]]
+ ) -> pd.DataFrame:
"""Transforms predicted probabilities to match with OpenML class indices.
Parameters
@@ -1603,28 +1688,26 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
training data).
model_classes : list
List of classes known_predicted by the model, ordered by their index.
+ class_labels : list
+ List of classes as stored in the task object fetched from server.
Returns
-------
pd.DataFrame
"""
+ if class_labels is None:
+ raise ValueError("The task has no class labels")
- if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
- if task.class_labels is not None:
- if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
- # mapping (decoding) the predictions to the categories
- # creating a separate copy to not change the expected pred_y type
- y = [task.class_labels[pred] for pred in y]
- else:
- raise ValueError("The task has no class labels")
- else:
- return None
+ if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str):
+ # mapping (decoding) the predictions to the categories
+ # creating a separate copy to not change the expected pred_y type
+ y = [class_labels[pred] for pred in y] # list or numpy array of predictions
- # y: list or numpy array of predictions
# model_classes: sklearn classifier mapping from original array id to
# prediction index id
if not isinstance(model_classes, list):
raise ValueError("please convert model classes to list prior to calling this fn")
+
# DataFrame allows more accurate mapping of classes as column names
result = pd.DataFrame(
0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32
@@ -1639,10 +1722,6 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
if X_test is None:
raise TypeError("argument X_test must not be of type None")
- # TODO: if possible, give a warning if model is already fitted (acceptable
- # in case of custom experimentation,
- # but not desirable if we want to upload to OpenML).
-
model_copy = sklearn.base.clone(model, safe=True)
# sanity check: prohibit users from optimizing n_jobs
self._prevent_optimize_n_jobs(model_copy)
@@ -1667,6 +1746,8 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime
modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000
+ if hasattr(model_copy, "refit_time_"):
+ modelfit_dur_walltime += model_copy.refit_time_
if can_measure_wallclocktime:
user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime
@@ -1732,10 +1813,7 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
proba_y = model_copy.predict_proba(X_test)
proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy
except AttributeError: # predict_proba is not available when probability=False
- if task.class_labels is not None:
- proba_y = _prediction_to_probabilities(pred_y, model_classes)
- else:
- raise ValueError("The task has no class labels")
+ proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels)
if task.class_labels is not None:
if proba_y.shape[1] != len(task.class_labels):
@@ -1753,12 +1831,13 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.
proba_y.shape[1], len(task.class_labels),
)
warnings.warn(message)
- openml.config.logger.warn(message)
+ openml.config.logger.warning(message)
for i, col in enumerate(task.class_labels):
# adding missing columns with 0 probability
if col not in model_classes:
proba_y[col] = 0
+ # We re-order the columns to move possibly added missing columns into place.
proba_y = proba_y[task.class_labels]
else:
raise ValueError("The task has no class labels")
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
index 5aaf70a9d..2acbcb0d1 100644
--- a/openml/flows/flow.py
+++ b/openml/flows/flow.py
@@ -229,7 +229,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
if not self.description:
logger = logging.getLogger(__name__)
- logger.warn("Flow % has empty description", self.name)
+ logger.warning("Flow % has empty description", self.name)
flow_parameters = []
for key in self.parameters:
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index a08c84df8..92044a1b4 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -10,7 +10,9 @@
import sklearn.metrics
import xmltodict
+import numpy as np
import pandas as pd
+from joblib.parallel import Parallel, delayed
import openml
import openml.utils
@@ -53,6 +55,7 @@ def run_model_on_task(
upload_flow: bool = False,
return_flow: bool = False,
dataset_format: str = "dataframe",
+ n_jobs: Optional[int] = None,
) -> Union[OpenMLRun, Tuple[OpenMLRun, OpenMLFlow]]:
"""Run the model on the dataset defined by the task.
@@ -83,6 +86,10 @@ def run_model_on_task(
dataset_format : str (default='dataframe')
If 'array', the dataset is passed to the model as a numpy array.
If 'dataframe', the dataset is passed to the model as a pandas dataframe.
+ n_jobs : int (default=None)
+ The number of processes/threads to distribute the evaluation asynchronously.
+ If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
+ If `-1`, then the job uses as many cores as are available.
Returns
-------
@@ -130,6 +137,7 @@ def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTas
add_local_measures=add_local_measures,
upload_flow=upload_flow,
dataset_format=dataset_format,
+ n_jobs=n_jobs,
)
if return_flow:
return run, flow
@@ -145,6 +153,7 @@ def run_flow_on_task(
add_local_measures: bool = True,
upload_flow: bool = False,
dataset_format: str = "dataframe",
+ n_jobs: Optional[int] = None,
) -> OpenMLRun:
"""Run the model provided by the flow on the dataset defined by task.
@@ -180,6 +189,10 @@ def run_flow_on_task(
dataset_format : str (default='dataframe')
If 'array', the dataset is passed to the model as a numpy array.
If 'dataframe', the dataset is passed to the model as a pandas dataframe.
+ n_jobs : int (default=None)
+ The number of processes/threads to distribute the evaluation asynchronously.
+ If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially.
+ If `-1`, then the job uses as many cores as are available.
Returns
-------
@@ -250,14 +263,20 @@ def run_flow_on_task(
run_environment = flow.extension.get_version_information()
tags = ["openml-python", run_environment[1]]
+ if flow.extension.check_if_model_fitted(flow.model):
+ warnings.warn(
+ "The model is already fitted!"
+ " This might cause inconsistency in comparison of results."
+ )
+
# execute the run
res = _run_task_get_arffcontent(
- flow=flow,
model=flow.model,
task=task,
extension=flow.extension,
add_local_measures=add_local_measures,
dataset_format=dataset_format,
+ n_jobs=n_jobs,
)
data_content, trace, fold_evaluations, sample_evaluations = res
@@ -412,12 +431,12 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]:
def _run_task_get_arffcontent(
- flow: OpenMLFlow,
model: Any,
task: OpenMLTask,
extension: "Extension",
add_local_measures: bool,
dataset_format: str,
+ n_jobs: int = None,
) -> Tuple[
List[List],
Optional[OpenMLRunTrace],
@@ -440,55 +459,36 @@ def _run_task_get_arffcontent(
# methods, less maintenance, less confusion)
num_reps, num_folds, num_samples = task.get_split_dimensions()
+ jobs = []
for n_fit, (rep_no, fold_no, sample_no) in enumerate(
itertools.product(range(num_reps), range(num_folds), range(num_samples),), start=1
):
-
- train_indices, test_indices = task.get_train_test_split_indices(
- repeat=rep_no, fold=fold_no, sample=sample_no
- )
- if isinstance(task, OpenMLSupervisedTask):
- x, y = task.get_X_and_y(dataset_format=dataset_format)
- if dataset_format == "dataframe":
- train_x = x.iloc[train_indices]
- train_y = y.iloc[train_indices]
- test_x = x.iloc[test_indices]
- test_y = y.iloc[test_indices]
- else:
- train_x = x[train_indices]
- train_y = y[train_indices]
- test_x = x[test_indices]
- test_y = y[test_indices]
- elif isinstance(task, OpenMLClusteringTask):
- x = task.get_X(dataset_format=dataset_format)
- if dataset_format == "dataframe":
- train_x = x.iloc[train_indices]
- else:
- train_x = x[train_indices]
- train_y = None
- test_x = None
- test_y = None
- else:
- raise NotImplementedError(task.task_type)
-
- config.logger.info(
- "Going to execute flow '%s' on task %d for repeat %d fold %d sample %d.",
- flow.name,
- task.task_id,
- rep_no,
- fold_no,
- sample_no,
- )
-
- pred_y, proba_y, user_defined_measures_fold, trace = extension._run_model_on_fold(
+ jobs.append((n_fit, rep_no, fold_no, sample_no))
+
+ # A forked/spawned child process may not inherit the OpenML configuration state of the
+ # parent, so the current configuration is copied here and passed to the child processes.
+ _config = config.get_config_as_dict()
+ # Execute the runs in parallel.
+ # Assuming as many workers (n_jobs) as there are tasks, the total compute time for this
+ # statement is roughly that of the slowest single run.
+ job_rvals = Parallel(verbose=0, n_jobs=n_jobs)(
+ delayed(_run_task_get_arffcontent_parallel_helper)(
+ extension=extension,
+ fold_no=fold_no,
model=model,
- task=task,
- X_train=train_x,
- y_train=train_y,
rep_no=rep_no,
- fold_no=fold_no,
- X_test=test_x,
+ sample_no=sample_no,
+ task=task,
+ dataset_format=dataset_format,
+ configuration=_config,
)
+ for n_fit, rep_no, fold_no, sample_no in jobs
+ ) # job_rvals contains the outputs of all runs, in one-to-one correspondence with `jobs`
+
+ for n_fit, rep_no, fold_no, sample_no in jobs:
+ pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold = job_rvals[
+ n_fit - 1
+ ]
if trace is not None:
traces.append(trace)
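For reference, a minimal sketch of the joblib fan-out/collect pattern used above, with a toy worker standing in for `_run_task_get_arffcontent_parallel_helper`:

from joblib import Parallel, delayed

def evaluate_split(rep_no, fold_no, sample_no):
    # the real helper trains the model on one split and returns its predictions
    return rep_no, fold_no, sample_no

jobs = [(rep, fold, 0) for rep in range(2) for fold in range(10)]
# n_jobs=None behaves sequentially; n_jobs=-1 uses all available cores.
results = Parallel(n_jobs=-1, verbose=0)(
    delayed(evaluate_split)(rep, fold, sample) for rep, fold, sample in jobs
)
assert len(results) == len(jobs)  # outputs arrive in submission order, one per job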
@@ -502,7 +502,9 @@ def _calculate_local_measure(sklearn_fn, openml_name):
for i, tst_idx in enumerate(test_indices):
if task.class_labels is not None:
prediction = (
- task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i]
+ task.class_labels[pred_y[i]]
+ if isinstance(pred_y[i], (int, np.integer))
+ else pred_y[i]
)
if isinstance(test_y, pd.Series):
test_prediction = (
@@ -513,7 +515,7 @@ def _calculate_local_measure(sklearn_fn, openml_name):
else:
test_prediction = (
task.class_labels[test_y[i]]
- if isinstance(test_y[i], int)
+ if isinstance(test_y[i], (int, np.integer))
else test_y[i]
)
pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i]
@@ -606,6 +608,75 @@ def _calculate_local_measure(sklearn_fn, openml_name):
)
+def _run_task_get_arffcontent_parallel_helper(
+ extension: "Extension",
+ fold_no: int,
+ model: Any,
+ rep_no: int,
+ sample_no: int,
+ task: OpenMLTask,
+ dataset_format: str,
+ configuration: Optional[Dict] = None,
+) -> Tuple[
+ np.ndarray,
+ Optional[pd.DataFrame],
+ np.ndarray,
+ Optional[pd.DataFrame],
+ Optional[OpenMLRunTrace],
+ "OrderedDict[str, float]",
+]:
+ # Set up the OpenML configuration in the child process to match the parent's;
+ # if configuration is None, the defaults are loaded.
+ config._setup(configuration)
+
+ train_indices, test_indices = task.get_train_test_split_indices(
+ repeat=rep_no, fold=fold_no, sample=sample_no
+ )
+
+ if isinstance(task, OpenMLSupervisedTask):
+ x, y = task.get_X_and_y(dataset_format=dataset_format)
+ if dataset_format == "dataframe":
+ train_x = x.iloc[train_indices]
+ train_y = y.iloc[train_indices]
+ test_x = x.iloc[test_indices]
+ test_y = y.iloc[test_indices]
+ else:
+ train_x = x[train_indices]
+ train_y = y[train_indices]
+ test_x = x[test_indices]
+ test_y = y[test_indices]
+ elif isinstance(task, OpenMLClusteringTask):
+ x = task.get_X(dataset_format=dataset_format)
+ if dataset_format == "dataframe":
+ train_x = x.iloc[train_indices]
+ else:
+ train_x = x[train_indices]
+ train_y = None
+ test_x = None
+ test_y = None
+ else:
+ raise NotImplementedError(task.task_type)
+ config.logger.info(
+ "Going to run model {} on dataset {} for repeat {} fold {} sample {}".format(
+ str(model),
+ openml.datasets.get_dataset(task.dataset_id).name,
+ rep_no,
+ fold_no,
+ sample_no,
+ )
+ )
+ pred_y, proba_y, user_defined_measures_fold, trace = extension._run_model_on_fold(
+ model=model,
+ task=task,
+ X_train=train_x,
+ y_train=train_y,
+ rep_no=rep_no,
+ fold_no=fold_no,
+ X_test=test_x,
+ )
+ return pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold
+
+
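The helper re-applies the parent's configuration because a forked or spawned worker may start from a clean module state. A hedged sketch of that hand-off, using only the two config calls referenced in this patch:

import openml.config as config

parent_state = config.get_config_as_dict()  # e.g. server, apikey, cache directory (assumed keys)

def worker(configuration):
    # With configuration=None, _setup() falls back to the defaults / config file.
    config._setup(configuration)
    return config.server  # the child now sees the parent's server setting

worker(parent_state)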
def get_runs(run_ids):
"""Gets all runs in run_ids list.
@@ -734,6 +805,9 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
flow_name = obtain_field(run, "oml:flow_name", from_server)
setup_id = obtain_field(run, "oml:setup_id", from_server, cast=int)
setup_string = obtain_field(run, "oml:setup_string", from_server)
+ # run_details is currently not sent by the server, so we need to retrieve it safely.
+ # Whenever that is resolved, we can enforce its presence (OpenML#1087).
+ run_details = obtain_field(run, "oml:run_details", from_server=False)
if "oml:input_data" in run:
dataset_id = int(run["oml:input_data"]["oml:dataset"]["oml:did"])
@@ -756,6 +830,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
if "oml:output_data" not in run:
if from_server:
raise ValueError("Run does not contain output_data " "(OpenML server error?)")
+ predictions_url = None
else:
output_data = run["oml:output_data"]
predictions_url = None
@@ -840,6 +915,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
sample_evaluations=sample_evaluations,
tags=tags,
predictions_url=predictions_url,
+ run_details=run_details,
)
diff --git a/openml/runs/run.py b/openml/runs/run.py
index 0311272b2..4c1c9907d 100644
--- a/openml/runs/run.py
+++ b/openml/runs/run.py
@@ -57,7 +57,9 @@ class OpenMLRun(OpenMLBase):
run_id: int
description_text: str, optional
Description text to add to the predictions file.
- If left None,
+ If left None, it is set to the time the arff file is generated.
+ run_details: str, optional (default=None)
+ Description of the run stored in the run meta-data.
"""
def __init__(
@@ -86,6 +88,7 @@ def __init__(
flow=None,
run_id=None,
description_text=None,
+ run_details=None,
):
self.uploader = uploader
self.uploader_name = uploader_name
@@ -112,6 +115,7 @@ def __init__(
self.tags = tags
self.predictions_url = predictions_url
self.description_text = description_text
+ self.run_details = run_details
@property
def id(self) -> Optional[int]:
@@ -543,11 +547,15 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
description["oml:run"]["@xmlns:oml"] = "http://openml.org/openml"
description["oml:run"]["oml:task_id"] = self.task_id
description["oml:run"]["oml:flow_id"] = self.flow_id
+ if self.setup_string is not None:
+ description["oml:run"]["oml:setup_string"] = self.setup_string
if self.error_message is not None:
description["oml:run"]["oml:error_message"] = self.error_message
+ if self.run_details is not None:
+ description["oml:run"]["oml:run_details"] = self.run_details
description["oml:run"]["oml:parameter_setting"] = self.parameter_settings
if self.tags is not None:
- description["oml:run"]["oml:tag"] = self.tags # Tags describing the run
+ description["oml:run"]["oml:tag"] = self.tags
if (self.fold_evaluations is not None and len(self.fold_evaluations) > 0) or (
self.sample_evaluations is not None and len(self.sample_evaluations) > 0
):
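A hedged sketch of the new `run_details` field; the ids are placeholders and the XML element name is taken from `_to_dict` above:

from openml.runs import OpenMLRun

run = OpenMLRun(
    task_id=1, flow_id=100, dataset_id=1,  # placeholder ids
    run_details="run produced by CI, commit <sha>",
)
# run._to_dict() would then emit, next to the other optional elements:
#   <oml:run_details>run produced by CI, commit <sha></oml:run_details>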
diff --git a/openml/study/functions.py b/openml/study/functions.py
index 632581022..ee877ddf2 100644
--- a/openml/study/functions.py
+++ b/openml/study/functions.py
@@ -58,7 +58,7 @@ def get_study(
"of things have changed since then. Please use `get_suite('OpenML100')` instead."
)
warnings.warn(message, DeprecationWarning)
- openml.config.logger.warn(message)
+ openml.config.logger.warning(message)
study = _get_study(study_id, entity_type="task")
return cast(OpenMLBenchmarkSuite, study) # type: ignore
else:
diff --git a/openml/testing.py b/openml/testing.py
index da07b0ed7..f8e22bb4c 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -6,18 +6,13 @@
import shutil
import sys
import time
-from typing import Dict
+from typing import Dict, Union, cast
import unittest
-import warnings
-
-# Currently, importing oslo raises a lot of warning that it will stop working
-# under python3.8; remove this once they disappear
-with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- from oslo_concurrency import lockutils
+import pandas as pd
import openml
from openml.tasks import TaskType
+from openml.exceptions import OpenMLServerException
import logging
@@ -98,13 +93,6 @@ def setUp(self, n_levels: int = 1):
openml.config.avoid_duplicate_runs = False
openml.config.cache_directory = self.workdir
- # If we're on travis, we save the api key in the config file to allow
- # the notebook tests to read them.
- if os.environ.get("TRAVIS") or os.environ.get("APPVEYOR"):
- with lockutils.external_lock("config", lock_path=self.workdir):
- with open(openml.config.config_file, "w") as fh:
- fh.write("apikey = %s" % openml.config.apikey)
-
# Increase the number of retries to avoid spurious server failures
self.connection_n_retries = openml.config.connection_n_retries
openml.config.connection_n_retries = 10
@@ -252,6 +240,55 @@ def _check_fold_timing_evaluations(
self.assertLessEqual(evaluation, max_val)
+def check_task_existence(
+ task_type: TaskType, dataset_id: int, target_name: str, **kwargs
+) -> Union[int, None]:
+ """Checks if any task with exists on test server that matches the meta data.
+
+ Parameter
+ ---------
+ task_type : openml.tasks.TaskType
+ dataset_id : int
+ target_name : str
+
+ Return
+ ------
+ int, None
+ """
+ return_val = None
+ tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe")
+ if len(tasks) == 0:
+ return None
+ tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id]
+ if len(tasks) == 0:
+ return None
+ tasks = tasks.loc[tasks["target_feature"] == target_name]
+ if len(tasks) == 0:
+ return None
+ task_match = []
+ for task_id in tasks["tid"].to_list():
+ task_match.append(task_id)
+ try:
+ task = openml.tasks.get_task(task_id)
+ except OpenMLServerException:
+ # can fail if the task_id was deleted by another unit test running in parallel
+ task_match.pop(-1)
+ return_val = None
+ continue
+ for k, v in kwargs.items():
+ if getattr(task, k) != v:
+ # if even one meta-data key mismatches, task_id is not a match
+ task_match.pop(-1)
+ break
+ # if task_id is still in task_match, it passed all meta-data key-value checks
+ if len(task_match) == 1:
+ return_val = task_id
+ break
+ if len(task_match) == 0:
+ return_val = None
+ return return_val
+
+
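A hedged usage sketch of `check_task_existence` as defined above; any extra keyword arguments are compared attribute-by-attribute against each candidate task:

from openml.tasks import TaskType
from openml.testing import check_task_existence

task_id = check_task_existence(
    TaskType.SUPERVISED_CLASSIFICATION,
    dataset_id=1,
    target_name="class",
    estimation_procedure_id=1,  # assumed to be an attribute of the fetched task object
)
if task_id is None:
    print("no matching task on the test server; one would have to be created")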
try:
from sklearn.impute import SimpleImputer
except ImportError:
@@ -267,12 +304,4 @@ class CustomImputer(SimpleImputer):
pass
-def cont(X):
- return X.dtypes != "category"
-
-
-def cat(X):
- return X.dtypes == "category"
-
-
-__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont"]
+__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "check_task_existence"]
diff --git a/openml/utils.py b/openml/utils.py
index a402564f9..a482bf0bc 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -9,6 +9,7 @@
from functools import wraps
import collections
+import openml
import openml._api_calls
import openml.exceptions
from . import config
@@ -243,7 +244,7 @@ def _list_all(listing_call, output_format="dict", *args, **filters):
limit=batch_size,
offset=current_offset,
output_format=output_format,
- **active_filters
+ **active_filters,
)
except openml.exceptions.OpenMLServerNoResult:
# we want to return an empty dict in this case
@@ -276,9 +277,11 @@ def _create_cache_directory(key):
cache = config.get_cache_directory()
cache_dir = os.path.join(cache, key)
try:
- os.makedirs(cache_dir)
- except OSError:
- pass
+ os.makedirs(cache_dir, exist_ok=True)
+ except Exception as e:
+ raise openml.exceptions.OpenMLCacheException(
+ f"Cannot create cache directory {cache_dir}."
+ ) from e
return cache_dir
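The cache helper now relies on `os.makedirs(..., exist_ok=True)` and converts any remaining failure into an OpenML-specific exception. A generic, hedged version of that pattern:

import os

class CacheError(Exception):
    """Stand-in for openml.exceptions.OpenMLCacheException."""

def create_cache_dir(path: str) -> str:
    try:
        os.makedirs(path, exist_ok=True)  # no error if the directory already exists
    except Exception as e:
        raise CacheError(f"Cannot create cache directory {path}.") from e
    return path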
@@ -304,9 +307,9 @@ def _create_cache_directory_for_id(key, id_):
Path of the created dataset cache directory.
"""
cache_dir = os.path.join(_create_cache_directory(key), str(id_))
- if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
+ if os.path.isdir(cache_dir):
pass
- elif os.path.exists(cache_dir) and not os.path.isdir(cache_dir):
+ elif os.path.exists(cache_dir):
raise ValueError("%s cache dir exists but is not a directory!" % key)
else:
os.makedirs(cache_dir)
diff --git a/setup.py b/setup.py
index 9e9a093e4..dc1a58863 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
)
)
-with open(os.path.join("README.md")) as fid:
+with open(os.path.join("README.md"), encoding="utf-8") as fid:
README = fid.read()
setuptools.setup(
@@ -53,6 +53,8 @@
"pandas>=1.0.0",
"scipy>=0.13.3",
"numpy>=1.6.2",
+ "minio",
+ "pyarrow",
],
extras_require={
"test": [
@@ -65,9 +67,9 @@
"nbformat",
"oslo.concurrency",
"flaky",
- "pyarrow",
"pre-commit",
"pytest-cov",
+ "pytest-rerunfailures",
"mypy",
],
"examples": [
@@ -81,7 +83,8 @@
"ipykernel",
"seaborn",
],
- "examples_unix": ["fanova",],
+ "examples_unix": ["fanova"],
+ "docs": ["sphinx", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc"],
},
test_suite="pytest",
classifiers=[
diff --git a/tests/conftest.py b/tests/conftest.py
index 461a513fd..c1f728a72 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -25,6 +25,7 @@
import os
import logging
from typing import List
+import pytest
import openml
from openml.testing import TestBase
@@ -34,16 +35,6 @@
logger.setLevel(logging.DEBUG)
file_list = []
-directory = None
-
-# finding the root directory of conftest.py and going up to OpenML main directory
-# exploiting the fact that conftest.py always resides in the root directory for tests
-static_dir = os.path.dirname(os.path.abspath(__file__))
-logger.info("static directory: {}".format(static_dir))
-while True:
- if "openml" in os.listdir(static_dir):
- break
- static_dir = os.path.join(static_dir, "..")
def worker_id() -> str:
@@ -65,12 +56,11 @@ def read_file_list() -> List[str]:
:return: List[str]
"""
- directory = os.path.join(static_dir, "tests/files/")
- if worker_id() == "master":
- logger.info("Collecting file lists from: {}".format(directory))
- files = os.walk(directory)
+ this_dir = os.path.dirname(os.path.abspath(__file__))
+ directory = os.path.join(this_dir, "..")
+ logger.info("Collecting file lists from: {}".format(directory))
file_list = []
- for root, _, filenames in files:
+ for root, _, filenames in os.walk(directory):
for filename in filenames:
file_list.append(os.path.join(root, filename))
return file_list
@@ -125,7 +115,7 @@ def delete_remote_files(tracker) -> None:
openml.utils._delete_entity(entity_type, entity)
logger.info("Deleted ({}, {})".format(entity_type, entity))
except Exception as e:
- logger.warn("Cannot delete ({},{}): {}".format(entity_type, entity, e))
+ logger.warning("Cannot delete ({},{}): {}".format(entity_type, entity, e))
def pytest_sessionstart() -> None:
@@ -182,3 +172,17 @@ def pytest_sessionfinish() -> None:
logger.info("Local files deleted")
logger.info("{} is killed".format(worker))
+
+
+def pytest_addoption(parser):
+ parser.addoption(
+ "--long",
+ action="store_true",
+ default=False,
+ help="Run the long version of tests which support both short and long scenarios.",
+ )
+
+
+@pytest.fixture(scope="class")
+def long_version(request):
+ request.cls.long_version = request.config.getoption("--long")
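A minimal sketch of how the `--long` option and the class-scoped fixture above are consumed by a test class (mirroring the evaluation and flow tests later in this diff):

import pytest
from openml.testing import TestBase

@pytest.mark.usefixtures("long_version")
class TestSomething(TestBase):
    def test_scales_with_flag(self):
        # self.long_version is injected by the fixture via request.cls
        n_checked = 100 if self.long_version else 1
        self.assertGreater(n_checked, 0)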
diff --git a/tests/files/org/openml/test/datasets/30/dataset.pq b/tests/files/org/openml/test/datasets/30/dataset.pq
new file mode 100644
index 000000000..b35597281
Binary files /dev/null and b/tests/files/org/openml/test/datasets/30/dataset.pq differ
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index 73dbfa133..416fce534 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -1,7 +1,8 @@
# License: BSD 3-Clause
+import os
from time import time
-from warnings import filterwarnings, catch_warnings
+import unittest.mock
import numpy as np
import pandas as pd
@@ -49,6 +50,17 @@ def test_init_string_validation(self):
name="somename", description="a description", citation="Something by Müller"
)
+ def test__unpack_categories_with_nan_likes(self):
+ # unpack_categories decodes numeric categorical values according to the header
+ # Containing a None/NaN-like category in the header shouldn't lead to failure.
+ categories = ["a", "b", None, float("nan"), np.nan]
+ series = pd.Series([0, 1, None, float("nan"), np.nan, 1, 0])
+ clean_series = OpenMLDataset._unpack_categories(series, categories)
+
+ expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"]
+ self.assertListEqual(list(clean_series.values), expected_values)
+ self.assertListEqual(list(clean_series.cat.categories.values), list("ab"))
+
def test_get_data_array(self):
# Basic usage
rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format="array")
@@ -72,13 +84,13 @@ def test_get_data_pandas(self):
self.assertEqual(data.shape[1], len(self.titanic.features))
self.assertEqual(data.shape[0], 1309)
col_dtype = {
- "pclass": "float64",
+ "pclass": "uint8",
"survived": "category",
"name": "object",
"sex": "category",
"age": "float64",
- "sibsp": "float64",
- "parch": "float64",
+ "sibsp": "uint8",
+ "parch": "uint8",
"ticket": "object",
"fare": "float64",
"cabin": "object",
@@ -118,21 +130,29 @@ def test_get_data_no_str_data_for_nparrays(self):
with pytest.raises(PyOpenMLError, match=err_msg):
self.titanic.get_data(dataset_format="array")
+ def _check_expected_type(self, dtype, is_cat, col):
+ if is_cat:
+ expected_type = "category"
+ elif not col.isna().any() and (col.astype("uint8") == col).all():
+ expected_type = "uint8"
+ else:
+ expected_type = "float64"
+
+ self.assertEqual(dtype.name, expected_type)
+
def test_get_data_with_rowid(self):
self.dataset.row_id_attribute = "condition"
rval, _, categorical, _ = self.dataset.get_data(include_row_id=True)
self.assertIsInstance(rval, pd.DataFrame)
- for (dtype, is_cat) in zip(rval.dtypes, categorical):
- expected_type = "category" if is_cat else "float64"
- self.assertEqual(dtype.name, expected_type)
+ for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
+ self._check_expected_type(dtype, is_cat, rval[col])
self.assertEqual(rval.shape, (898, 39))
self.assertEqual(len(categorical), 39)
rval, _, categorical, _ = self.dataset.get_data()
self.assertIsInstance(rval, pd.DataFrame)
- for (dtype, is_cat) in zip(rval.dtypes, categorical):
- expected_type = "category" if is_cat else "float64"
- self.assertEqual(dtype.name, expected_type)
+ for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
+ self._check_expected_type(dtype, is_cat, rval[col])
self.assertEqual(rval.shape, (898, 38))
self.assertEqual(len(categorical), 38)
@@ -149,9 +169,8 @@ def test_get_data_with_target_array(self):
def test_get_data_with_target_pandas(self):
X, y, categorical, attribute_names = self.dataset.get_data(target="class")
self.assertIsInstance(X, pd.DataFrame)
- for (dtype, is_cat) in zip(X.dtypes, categorical):
- expected_type = "category" if is_cat else "float64"
- self.assertEqual(dtype.name, expected_type)
+ for (dtype, is_cat, col) in zip(X.dtypes, categorical, X):
+ self._check_expected_type(dtype, is_cat, X[col])
self.assertIsInstance(y, pd.Series)
self.assertEqual(y.dtype.name, "category")
@@ -174,27 +193,17 @@ def test_get_data_rowid_and_ignore_and_target(self):
def test_get_data_with_ignore_attributes(self):
self.dataset.ignore_attribute = ["condition"]
rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True)
- for (dtype, is_cat) in zip(rval.dtypes, categorical):
- expected_type = "category" if is_cat else "float64"
- self.assertEqual(dtype.name, expected_type)
+ for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
+ self._check_expected_type(dtype, is_cat, rval[col])
self.assertEqual(rval.shape, (898, 39))
self.assertEqual(len(categorical), 39)
rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False)
- for (dtype, is_cat) in zip(rval.dtypes, categorical):
- expected_type = "category" if is_cat else "float64"
- self.assertEqual(dtype.name, expected_type)
+ for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval):
+ self._check_expected_type(dtype, is_cat, rval[col])
self.assertEqual(rval.shape, (898, 38))
self.assertEqual(len(categorical), 38)
- def test_dataset_format_constructor(self):
-
- with catch_warnings():
- filterwarnings("error")
- self.assertRaises(
- DeprecationWarning, openml.OpenMLDataset, "Test", "Test", format="arff"
- )
-
def test_get_data_with_nonexisting_class(self):
# This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However,
# label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to
@@ -350,7 +359,48 @@ def test_get_sparse_categorical_data_id_395(self):
self.assertEqual(len(feature.nominal_values), 25)
-class OpenMLDatasetQualityTest(TestBase):
+class OpenMLDatasetFunctionTest(TestBase):
+ @unittest.mock.patch("openml.datasets.dataset.pickle")
+ @unittest.mock.patch("openml.datasets.dataset._get_features_pickle_file")
+ def test__read_features(self, filename_mock, pickle_mock):
+ """Test we read the features from the xml if no cache pickle is available.
+
+ This test also does some simple checks to verify that the features are read correctly"""
+ filename_mock.return_value = os.path.join(self.workdir, "features.xml.pkl")
+ pickle_mock.load.side_effect = FileNotFoundError
+ features = openml.datasets.dataset._read_features(
+ os.path.join(
+ self.static_cache_dir, "org", "openml", "test", "datasets", "2", "features.xml"
+ )
+ )
+ self.assertIsInstance(features, dict)
+ self.assertEqual(len(features), 39)
+ self.assertIsInstance(features[0], OpenMLDataFeature)
+ self.assertEqual(features[0].name, "family")
+ self.assertEqual(len(features[0].nominal_values), 9)
+ # pickle.load is never called because the features pickle file didn't exist
+ self.assertEqual(pickle_mock.load.call_count, 0)
+ self.assertEqual(pickle_mock.dump.call_count, 1)
+
+ @unittest.mock.patch("openml.datasets.dataset.pickle")
+ @unittest.mock.patch("openml.datasets.dataset._get_qualities_pickle_file")
+ def test__read_qualities(self, filename_mock, pickle_mock):
+ """Test we read the qualities from the xml if no cache pickle is available.
+
+ This test also does some minor checks to ensure that the qualities are read correctly."""
+ filename_mock.return_value = os.path.join(self.workdir, "qualities.xml.pkl")
+ pickle_mock.load.side_effect = FileNotFoundError
+ qualities = openml.datasets.dataset._read_qualities(
+ os.path.join(
+ self.static_cache_dir, "org", "openml", "test", "datasets", "2", "qualities.xml"
+ )
+ )
+ self.assertIsInstance(qualities, dict)
+ self.assertEqual(len(qualities), 106)
+ # pickle.load is never called because the qualities pickle file didn't exist
+ self.assertEqual(pickle_mock.load.call_count, 0)
+ self.assertEqual(pickle_mock.dump.call_count, 1)
+
def test__check_qualities(self):
qualities = [{"oml:name": "a", "oml:value": "0.5"}]
qualities = openml.datasets.dataset._check_qualities(qualities)
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index c6e6f78f8..ec9dd6c53 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -1,9 +1,11 @@
# License: BSD 3-Clause
import os
+import pathlib
import random
from itertools import product
from unittest import mock
+import shutil
import arff
import time
@@ -16,8 +18,8 @@
import openml
from openml import OpenMLDataset
+from openml._api_calls import _download_minio_file
from openml.exceptions import (
- OpenMLCacheException,
OpenMLHashException,
OpenMLPrivateDatasetError,
OpenMLServerException,
@@ -27,19 +29,19 @@
from openml.datasets.functions import (
create_dataset,
attributes_arff_from_df,
- _get_cached_dataset,
- _get_cached_dataset_features,
- _get_cached_dataset_qualities,
- _get_cached_datasets,
_get_dataset_arff,
_get_dataset_description,
- _get_dataset_features,
- _get_dataset_qualities,
+ _get_dataset_features_file,
+ _get_dataset_qualities_file,
_get_online_dataset_arff,
_get_online_dataset_format,
DATASETS_CACHE_DIR_NAME,
+ _get_dataset_parquet,
+ _topic_add_dataset,
+ _topic_delete_dataset,
)
from openml.datasets import fork_dataset, edit_dataset
+from openml.tasks import TaskType, create_task
class TestOpenMLDataset(TestBase):
@@ -85,60 +87,6 @@ def _get_empty_param_for_dataset(self):
"data": None,
}
- def test__list_cached_datasets(self):
- openml.config.cache_directory = self.static_cache_dir
- cached_datasets = openml.datasets.functions._list_cached_datasets()
- self.assertIsInstance(cached_datasets, list)
- self.assertEqual(len(cached_datasets), 2)
- self.assertIsInstance(cached_datasets[0], int)
-
- @mock.patch("openml.datasets.functions._list_cached_datasets")
- def test__get_cached_datasets(self, _list_cached_datasets_mock):
- openml.config.cache_directory = self.static_cache_dir
- _list_cached_datasets_mock.return_value = [-1, 2]
- datasets = _get_cached_datasets()
- self.assertIsInstance(datasets, dict)
- self.assertEqual(len(datasets), 2)
- self.assertIsInstance(list(datasets.values())[0], OpenMLDataset)
-
- def test__get_cached_dataset(self,):
- openml.config.cache_directory = self.static_cache_dir
- dataset = _get_cached_dataset(2)
- features = _get_cached_dataset_features(2)
- qualities = _get_cached_dataset_qualities(2)
- self.assertIsInstance(dataset, OpenMLDataset)
- self.assertTrue(len(dataset.features) > 0)
- self.assertTrue(len(dataset.features) == len(features["oml:feature"]))
- self.assertTrue(len(dataset.qualities) == len(qualities))
-
- def test_get_cached_dataset_description(self):
- openml.config.cache_directory = self.static_cache_dir
- description = openml.datasets.functions._get_cached_dataset_description(2)
- self.assertIsInstance(description, dict)
-
- def test_get_cached_dataset_description_not_cached(self):
- openml.config.cache_directory = self.static_cache_dir
- self.assertRaisesRegex(
- OpenMLCacheException,
- "Dataset description for dataset id 3 not cached",
- openml.datasets.functions._get_cached_dataset_description,
- dataset_id=3,
- )
-
- def test_get_cached_dataset_arff(self):
- openml.config.cache_directory = self.static_cache_dir
- description = openml.datasets.functions._get_cached_dataset_arff(dataset_id=2)
- self.assertIsInstance(description, str)
-
- def test_get_cached_dataset_arff_not_cached(self):
- openml.config.cache_directory = self.static_cache_dir
- self.assertRaisesRegex(
- OpenMLCacheException,
- "ARFF file for dataset id 3 not cached",
- openml.datasets.functions._get_cached_dataset_arff,
- dataset_id=3,
- )
-
def _check_dataset(self, dataset):
self.assertEqual(type(dataset), dict)
self.assertGreaterEqual(len(dataset), 2)
@@ -227,9 +175,10 @@ def test_list_datasets_empty(self):
def test_check_datasets_active(self):
# Have to test on live because there is no deactivated dataset on the test server.
openml.config.server = self.production_server
- active = openml.datasets.check_datasets_active([2, 17])
+ active = openml.datasets.check_datasets_active([2, 17, 79], raise_error_if_not_exist=False,)
self.assertTrue(active[2])
self.assertFalse(active[17])
+ self.assertIsNone(active.get(79))
self.assertRaisesRegex(
ValueError,
"Could not find dataset 79 in OpenML dataset list.",
@@ -369,6 +318,13 @@ def test_get_dataset_by_name(self):
openml.config.server = self.production_server
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
+ def test_get_dataset_uint8_dtype(self):
+ dataset = openml.datasets.get_dataset(1)
+ self.assertEqual(type(dataset), OpenMLDataset)
+ self.assertEqual(dataset.name, "anneal")
+ df, _, _, _ = dataset.get_data()
+ self.assertEqual(df["carbon"].dtype, "uint8")
+
def test_get_dataset(self):
# This is the only non-lazy load to ensure default behaviour works.
dataset = openml.datasets.get_dataset(1)
@@ -451,11 +407,99 @@ def test__get_dataset_description(self):
def test__getarff_path_dataset_arff(self):
openml.config.cache_directory = self.static_cache_dir
- description = openml.datasets.functions._get_cached_dataset_description(2)
+ description = _get_dataset_description(self.workdir, 2)
arff_path = _get_dataset_arff(description, cache_directory=self.workdir)
self.assertIsInstance(arff_path, str)
self.assertTrue(os.path.exists(arff_path))
+ def test__download_minio_file_object_does_not_exist(self):
+ self.assertRaisesRegex(
+ FileNotFoundError,
+ r"Object at .* does not exist",
+ _download_minio_file,
+ source="http://openml1.win.tue.nl/dataset20/i_do_not_exist.pq",
+ destination=self.workdir,
+ exists_ok=True,
+ )
+
+ def test__download_minio_file_to_directory(self):
+ _download_minio_file(
+ source="http://openml1.win.tue.nl/dataset20/dataset_20.pq",
+ destination=self.workdir,
+ exists_ok=True,
+ )
+ self.assertTrue(
+ os.path.isfile(os.path.join(self.workdir, "dataset_20.pq")),
+ "_download_minio_file can save to a folder by copying the object name",
+ )
+
+ def test__download_minio_file_to_path(self):
+ file_destination = os.path.join(self.workdir, "custom.pq")
+ _download_minio_file(
+ source="http://openml1.win.tue.nl/dataset20/dataset_20.pq",
+ destination=file_destination,
+ exists_ok=True,
+ )
+ self.assertTrue(
+ os.path.isfile(file_destination),
+ "_download_minio_file can save to a folder by copying the object name",
+ )
+
+ def test__download_minio_file_raises_FileExists_if_destination_in_use(self):
+ file_destination = pathlib.Path(self.workdir, "custom.pq")
+ file_destination.touch()
+
+ self.assertRaises(
+ FileExistsError,
+ _download_minio_file,
+ source="http://openml1.win.tue.nl/dataset20/dataset_20.pq",
+ destination=str(file_destination),
+ exists_ok=False,
+ )
+
+ def test__download_minio_file_works_with_bucket_subdirectory(self):
+ file_destination = pathlib.Path(self.workdir, "custom.csv")
+ _download_minio_file(
+ source="http://openml1.win.tue.nl/test/subdirectory/test.csv",
+ destination=file_destination,
+ exists_ok=True,
+ )
+ self.assertTrue(
+ os.path.isfile(file_destination),
+ "_download_minio_file can download from subdirectories",
+ )
+
+ def test__get_dataset_parquet_not_cached(self):
+ description = {
+ "oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq",
+ "oml:id": "20",
+ }
+ path = _get_dataset_parquet(description, cache_directory=self.workdir)
+ self.assertIsInstance(path, str, "_get_dataset_parquet returns a path")
+ self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file")
+
+ @mock.patch("openml._api_calls._download_minio_file")
+ def test__get_dataset_parquet_is_cached(self, patch):
+ openml.config.cache_directory = self.static_cache_dir
+ patch.side_effect = RuntimeError(
+ "_download_minio_file should not be called when loading from cache"
+ )
+ description = {
+ "oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq",
+ "oml:id": "30",
+ }
+ path = _get_dataset_parquet(description, cache_directory=None)
+ self.assertIsInstance(path, str, "_get_dataset_parquet returns a path")
+ self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file")
+
+ def test__get_dataset_parquet_file_does_not_exist(self):
+ description = {
+ "oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq",
+ "oml:id": "20",
+ }
+ path = _get_dataset_parquet(description, cache_directory=self.workdir)
+ self.assertIsNone(path, "_get_dataset_parquet returns None if no file is found")
+
def test__getarff_md5_issue(self):
description = {
"oml:id": 5,
@@ -464,23 +508,27 @@ def test__getarff_md5_issue(self):
}
self.assertRaisesRegex(
OpenMLHashException,
- "Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded file "
- "is unequal to the expected checksum abc. "
- "Raised when downloading dataset 5.",
+ "Checksum of downloaded file is unequal to the expected checksum abc when downloading "
+ "https://www.openml.org/data/download/61. Raised when downloading dataset 5.",
_get_dataset_arff,
description,
)
def test__get_dataset_features(self):
- features = _get_dataset_features(self.workdir, 2)
- self.assertIsInstance(features, dict)
+ features_file = _get_dataset_features_file(self.workdir, 2)
+ self.assertIsInstance(features_file, str)
features_xml_path = os.path.join(self.workdir, "features.xml")
self.assertTrue(os.path.exists(features_xml_path))
def test__get_dataset_qualities(self):
- # Only a smoke check
- qualities = _get_dataset_qualities(self.workdir, 2)
- self.assertIsInstance(qualities, list)
+ qualities = _get_dataset_qualities_file(self.workdir, 2)
+ self.assertIsInstance(qualities, str)
+ qualities_xml_path = os.path.join(self.workdir, "qualities.xml")
+ self.assertTrue(os.path.exists(qualities_xml_path))
+
+ def test__get_dataset_skip_download(self):
+ qualities = openml.datasets.get_dataset(2, download_qualities=False).qualities
+ self.assertIsNone(qualities)
def test_deletion_of_cache_dir(self):
# Simple removal
@@ -547,6 +595,7 @@ def test_upload_dataset_with_url(self):
)
self.assertIsInstance(dataset.dataset_id, int)
+ @pytest.mark.flaky()
def test_data_status(self):
dataset = OpenMLDataset(
"%s-UploadTestWithURL" % self._get_sentinel(),
@@ -864,6 +913,24 @@ def test_get_online_dataset_arff(self):
"ARFF files are not equal",
)
+ def test_topic_api_error(self):
+ # Check server exception when a non-admin accesses the API
+ self.assertRaisesRegex(
+ OpenMLServerException,
+ "Topic can only be added/removed by admin.",
+ _topic_add_dataset,
+ data_id=31,
+ topic="business",
+ )
+ # Check server exception when a non-admin accesses the API
+ self.assertRaisesRegex(
+ OpenMLServerException,
+ "Topic can only be added/removed by admin.",
+ _topic_delete_dataset,
+ data_id=31,
+ topic="business",
+ )
+
def test_get_online_dataset_format(self):
# Phoneme dataset
@@ -897,7 +964,6 @@ def test_create_dataset_pandas(self):
collection_date = "01-01-2018"
language = "English"
licence = "MIT"
- default_target_attribute = "play"
citation = "None"
original_data_url = "http://openml.github.io/openml-python"
paper_url = "http://openml.github.io/openml-python"
@@ -909,7 +975,7 @@ def test_create_dataset_pandas(self):
collection_date=collection_date,
language=language,
licence=licence,
- default_target_attribute=default_target_attribute,
+ default_target_attribute="play",
row_id_attribute=None,
ignore_attribute=None,
citation=citation,
@@ -944,7 +1010,7 @@ def test_create_dataset_pandas(self):
collection_date=collection_date,
language=language,
licence=licence,
- default_target_attribute=default_target_attribute,
+ default_target_attribute="y",
row_id_attribute=None,
ignore_attribute=None,
citation=citation,
@@ -980,7 +1046,7 @@ def test_create_dataset_pandas(self):
collection_date=collection_date,
language=language,
licence=licence,
- default_target_attribute=default_target_attribute,
+ default_target_attribute="rnd_str",
row_id_attribute=None,
ignore_attribute=None,
citation=citation,
@@ -1147,27 +1213,31 @@ def test_publish_fetch_ignore_attribute(self):
# test if publish was successful
self.assertIsInstance(dataset.id, int)
+ downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id)
+ self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute)
+
+ def _wait_for_dataset_being_processed(self, dataset_id):
downloaded_dataset = None
# fetching from server
# loop till timeout or fetch not successful
- max_waiting_time_seconds = 400
+ max_waiting_time_seconds = 600
# time.time() works in seconds
start_time = time.time()
while time.time() - start_time < max_waiting_time_seconds:
try:
- downloaded_dataset = openml.datasets.get_dataset(dataset.id)
+ downloaded_dataset = openml.datasets.get_dataset(dataset_id)
break
except Exception as e:
# returned code 273: Dataset not processed yet
# returned code 362: No qualities found
TestBase.logger.error(
- "Failed to fetch dataset:{} with '{}'.".format(dataset.id, str(e))
+ "Failed to fetch dataset:{} with '{}'.".format(dataset_id, str(e))
)
time.sleep(10)
continue
if downloaded_dataset is None:
- raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset.id))
- self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute)
+ raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset_id))
+ return downloaded_dataset
def test_create_dataset_row_id_attribute_error(self):
# meta-information
@@ -1303,6 +1373,8 @@ def test_list_qualities(self):
def test_get_dataset_cache_format_pickle(self):
dataset = openml.datasets.get_dataset(1)
+ dataset.get_data()
+
self.assertEqual(type(dataset), OpenMLDataset)
self.assertEqual(dataset.name, "anneal")
self.assertGreater(len(dataset.features), 1)
@@ -1317,6 +1389,7 @@ def test_get_dataset_cache_format_pickle(self):
def test_get_dataset_cache_format_feather(self):
dataset = openml.datasets.get_dataset(128, cache_format="feather")
+ dataset.get_data()
# Check if dataset is written to cache directory using feather
cache_dir = openml.config.get_cache_directory()
@@ -1340,7 +1413,7 @@ def test_get_dataset_cache_format_feather(self):
self.assertEqual(len(categorical), X.shape[1])
self.assertEqual(len(attribute_names), X.shape[1])
- def test_data_edit(self):
+ def test_data_edit_non_critical_field(self):
# Case 1
# All users can edit non-critical fields of datasets
desc = (
@@ -1361,14 +1434,31 @@ def test_data_edit(self):
edited_dataset = openml.datasets.get_dataset(did)
self.assertEqual(edited_dataset.description, desc)
+ def test_data_edit_critical_field(self):
# Case 2
# only owners (or admin) can edit all critical fields of datasets
- # this is a dataset created by CI, so it is editable by this test
- did = 315
- result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2")
+ # for this, we first fork a dataset so that the test user owns it and can edit it
+ did = fork_dataset(1)
+ self._wait_for_dataset_being_processed(did)
+ result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
self.assertEqual(did, result)
- edited_dataset = openml.datasets.get_dataset(did)
- self.assertEqual(edited_dataset.ignore_attribute, ["col_2"])
+
+ n_tries = 10
+ # we need to wait for the edit to be reflected on the server
+ for i in range(n_tries):
+ edited_dataset = openml.datasets.get_dataset(did)
+ try:
+ self.assertEqual(edited_dataset.default_target_attribute, "shape", edited_dataset)
+ self.assertEqual(edited_dataset.ignore_attribute, ["oil"], edited_dataset)
+ break
+ except AssertionError as e:
+ if i == n_tries - 1:
+ raise e
+ time.sleep(10)
+ # Delete the cache dir to get the newer version of the dataset
+ shutil.rmtree(
+ os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did))
+ )
def test_data_edit_errors(self):
# Check server exception when no field to edit is provided
@@ -1379,7 +1469,7 @@ def test_data_edit_errors(self):
"original_data_url, default_target_attribute, row_id_attribute, "
"ignore_attribute or paper_url to edit.",
edit_dataset,
- data_id=564,
+ data_id=64, # blood-transfusion-service-center
)
# Check server exception when unknown dataset is provided
self.assertRaisesRegex(
@@ -1389,15 +1479,32 @@ def test_data_edit_errors(self):
data_id=999999,
description="xor operation dataset",
)
+
+ # Need to own a dataset to be able to edit meta-data
+ # A forked version of an existing dataset is created so that the unit-test user
+ # owns it and can edit its meta-data
+ did = fork_dataset(1)
+ self._wait_for_dataset_being_processed(did)
+ TestBase._mark_entity_for_removal("data", did)
+ # Need to upload a task attached to this data to test edit failure
+ task = create_task(
+ task_type=TaskType.SUPERVISED_CLASSIFICATION,
+ dataset_id=did,
+ target_name="class",
+ estimation_procedure_id=1,
+ )
+ task = task.publish()
+ TestBase._mark_entity_for_removal("task", task.task_id)
# Check server exception when owner/admin edits critical fields of dataset with tasks
self.assertRaisesRegex(
OpenMLServerException,
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
"can only be edited for datasets without any tasks.",
edit_dataset,
- data_id=223,
+ data_id=did,
default_target_attribute="y",
)
+
# Check server exception when a non-owner or non-admin tries to edit critical fields
self.assertRaisesRegex(
OpenMLServerException,
@@ -1416,3 +1523,124 @@ def test_data_fork(self):
self.assertRaisesRegex(
OpenMLServerException, "Unknown dataset", fork_dataset, data_id=999999,
)
+
+ def test_get_dataset_parquet(self):
+ dataset = openml.datasets.get_dataset(20)
+ self.assertIsNotNone(dataset._minio_url)
+ self.assertIsNotNone(dataset.parquet_file)
+ self.assertTrue(os.path.isfile(dataset.parquet_file))
+
+
+@pytest.mark.parametrize(
+ "default_target_attribute,row_id_attribute,ignore_attribute",
+ [
+ ("wrong", None, None),
+ (None, "wrong", None),
+ (None, None, "wrong"),
+ ("wrong,sunny", None, None),
+ (None, None, "wrong,sunny"),
+ (["wrong", "sunny"], None, None),
+ (None, None, ["wrong", "sunny"]),
+ ],
+)
+def test_invalid_attribute_validations(
+ default_target_attribute, row_id_attribute, ignore_attribute
+):
+ data = [
+ ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+ ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+ ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+ ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+ ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+ ]
+ column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"]
+ df = pd.DataFrame(data, columns=column_names)
+ # enforce the type of each column
+ df["outlook"] = df["outlook"].astype("category")
+ df["windy"] = df["windy"].astype("bool")
+ df["play"] = df["play"].astype("category")
+ # meta-information
+ name = "pandas_testing_dataset"
+ description = "Synthetic dataset created from a Pandas DataFrame"
+ creator = "OpenML tester"
+ collection_date = "01-01-2018"
+ language = "English"
+ licence = "MIT"
+ citation = "None"
+ original_data_url = "http://openml.github.io/openml-python"
+ paper_url = "http://openml.github.io/openml-python"
+ with pytest.raises(ValueError, match="should be one of the data attribute"):
+ _ = openml.datasets.functions.create_dataset(
+ name=name,
+ description=description,
+ creator=creator,
+ contributor=None,
+ collection_date=collection_date,
+ language=language,
+ licence=licence,
+ default_target_attribute=default_target_attribute,
+ row_id_attribute=row_id_attribute,
+ ignore_attribute=ignore_attribute,
+ citation=citation,
+ attributes="auto",
+ data=df,
+ version_label="test",
+ original_data_url=original_data_url,
+ paper_url=paper_url,
+ )
+
+
+@pytest.mark.parametrize(
+ "default_target_attribute,row_id_attribute,ignore_attribute",
+ [
+ ("outlook", None, None),
+ (None, "outlook", None),
+ (None, None, "outlook"),
+ ("outlook,windy", None, None),
+ (None, None, "outlook,windy"),
+ (["outlook", "windy"], None, None),
+ (None, None, ["outlook", "windy"]),
+ ],
+)
+def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
+ data = [
+ ["a", "sunny", 85.0, 85.0, "FALSE", "no"],
+ ["b", "sunny", 80.0, 90.0, "TRUE", "no"],
+ ["c", "overcast", 83.0, 86.0, "FALSE", "yes"],
+ ["d", "rainy", 70.0, 96.0, "FALSE", "yes"],
+ ["e", "rainy", 68.0, 80.0, "FALSE", "yes"],
+ ]
+ column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"]
+ df = pd.DataFrame(data, columns=column_names)
+ # enforce the type of each column
+ df["outlook"] = df["outlook"].astype("category")
+ df["windy"] = df["windy"].astype("bool")
+ df["play"] = df["play"].astype("category")
+ # meta-information
+ name = "pandas_testing_dataset"
+ description = "Synthetic dataset created from a Pandas DataFrame"
+ creator = "OpenML tester"
+ collection_date = "01-01-2018"
+ language = "English"
+ licence = "MIT"
+ citation = "None"
+ original_data_url = "http://openml.github.io/openml-python"
+ paper_url = "http://openml.github.io/openml-python"
+ _ = openml.datasets.functions.create_dataset(
+ name=name,
+ description=description,
+ creator=creator,
+ contributor=None,
+ collection_date=collection_date,
+ language=language,
+ licence=licence,
+ default_target_attribute=default_target_attribute,
+ row_id_attribute=row_id_attribute,
+ ignore_attribute=ignore_attribute,
+ citation=citation,
+ attributes="auto",
+ data=df,
+ version_label="test",
+ original_data_url=original_data_url,
+ paper_url=paper_url,
+ )
diff --git a/tests/test_evaluations/__init__.py b/tests/test_evaluations/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py
index e4de9b03c..70f36ce19 100644
--- a/tests/test_evaluations/test_evaluation_functions.py
+++ b/tests/test_evaluations/test_evaluation_functions.py
@@ -1,10 +1,12 @@
# License: BSD 3-Clause
+import pytest
import openml
import openml.evaluations
from openml.testing import TestBase
+@pytest.mark.usefixtures("long_version")
class TestEvaluationFunctions(TestBase):
_multiprocess_can_split_ = True
@@ -27,6 +29,10 @@ def _check_list_evaluation_setups(self, **kwargs):
# Check if output and order of list_evaluations is preserved
self.assertSequenceEqual(evals_setups["run_id"].tolist(), evals["run_id"].tolist())
+
+ if not self.long_version:
+ evals_setups = evals_setups.head(1)
+
# Check if the hyper-parameter column is as accurate and flow_id
for index, row in evals_setups.iterrows():
params = openml.runs.get_run(row["run_id"]).parameter_settings
diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
index d34dc2ad3..c1f88bcda 100644
--- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
+++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
@@ -40,7 +40,8 @@
from openml.flows import OpenMLFlow
from openml.flows.functions import assert_flows_equal
from openml.runs.trace import OpenMLRunTrace
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
this_directory = os.path.dirname(os.path.abspath(__file__))
@@ -145,7 +146,7 @@ def test_serialize_model(self):
fixture_short_name = "sklearn.DecisionTreeClassifier"
# str obtained from self.extension._get_sklearn_description(model)
fixture_description = "A decision tree classifier."
- version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+ version_fixture = self.extension._min_dependency_str(sklearn.__version__)
presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"'
# min_impurity_decrease has been introduced in 0.20
@@ -188,6 +189,8 @@ def test_serialize_model(self):
if LooseVersion(sklearn.__version__) >= "0.22":
fixture_parameters.update({"ccp_alpha": "0.0"})
fixture_parameters.move_to_end("ccp_alpha", last=False)
+ if LooseVersion(sklearn.__version__) >= "0.24":
+ del fixture_parameters["presort"]
structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []}
@@ -224,7 +227,7 @@ def test_serialize_model_clustering(self):
fixture_description = "K-Means clustering{}".format(
"" if LooseVersion(sklearn.__version__) < "0.22" else "."
)
- version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+ version_fixture = self.extension._min_dependency_str(sklearn.__version__)
n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
@@ -1251,7 +1254,7 @@ def test_paralizable_check(self):
# using this param distribution should raise an exception
illegal_param_dist = {"base__n_jobs": [-1, 0, 1]}
# using this param distribution should not raise an exception
- legal_param_dist = {"base__max_depth": [2, 3, 4]}
+ legal_param_dist = {"n_estimators": [2, 3, 4]}
legal_models = [
sklearn.ensemble.RandomForestClassifier(),
@@ -1279,12 +1282,19 @@ def test_paralizable_check(self):
can_measure_cputime_answers = [True, False, False, True, False, False, True, False, False]
can_measure_walltime_answers = [True, True, False, True, True, False, True, True, False]
+ if LooseVersion(sklearn.__version__) < "0.20":
+ has_refit_time = [False, False, False, False, False, False, False, False, False]
+ else:
+ has_refit_time = [False, False, False, False, False, False, True, True, False]
- for model, allowed_cputime, allowed_walltime in zip(
- legal_models, can_measure_cputime_answers, can_measure_walltime_answers
+ X, y = sklearn.datasets.load_iris(return_X_y=True)
+ for model, allowed_cputime, allowed_walltime, refit_time in zip(
+ legal_models, can_measure_cputime_answers, can_measure_walltime_answers, has_refit_time
):
self.assertEqual(self.extension._can_measure_cputime(model), allowed_cputime)
self.assertEqual(self.extension._can_measure_wallclocktime(model), allowed_walltime)
+ model.fit(X, y)
+ self.assertEqual(refit_time, hasattr(model, "refit_time_"))
for model in illegal_models:
with self.assertRaises(PyOpenMLError):
@@ -1316,12 +1326,18 @@ def test__get_fn_arguments_with_defaults(self):
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
(sklearn.pipeline.Pipeline.__init__, 2),
]
- else:
+ elif sklearn_version < "0.24":
fns = [
(sklearn.ensemble.RandomForestRegressor.__init__, 18),
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
(sklearn.pipeline.Pipeline.__init__, 2),
]
+ else:
+ fns = [
+ (sklearn.ensemble.RandomForestRegressor.__init__, 18),
+ (sklearn.tree.DecisionTreeClassifier.__init__, 13),
+ (sklearn.pipeline.Pipeline.__init__, 2),
+ ]
for fn, num_params_with_defaults in fns:
defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
@@ -1464,7 +1480,7 @@ def test_openml_param_name_to_sklearn(self):
)
model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("boosting", boosting)])
flow = self.extension.model_to_flow(model)
- task = openml.tasks.get_task(115)
+ task = openml.tasks.get_task(115) # diabetes; crossvalidation
run = openml.runs.run_flow_on_task(flow, task)
run = run.publish()
TestBase._mark_entity_for_removal("run", run.run_id)
@@ -1522,7 +1538,7 @@ def test_obtain_parameter_values(self):
"bootstrap": [True, False],
"criterion": ["gini", "entropy"],
},
- cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1),
+ cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
n_iter=5,
)
flow = self.extension.model_to_flow(model)
@@ -1560,7 +1576,7 @@ def setUp(self):
# Test methods for performing runs with this extension module
def test_run_model_on_task(self):
- task = openml.tasks.get_task(1)
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
# using most_frequent imputer since dataset has mixed types and to keep things simple
pipe = sklearn.pipeline.Pipeline(
[
@@ -1625,7 +1641,7 @@ def test_seed_model_raises(self):
self.extension.seed_model(model=clf, seed=42)
def test_run_model_on_fold_classification_1_array(self):
- task = openml.tasks.get_task(1)
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
X, y = task.get_X_and_y()
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
@@ -1688,7 +1704,7 @@ def test_run_model_on_fold_classification_1_array(self):
def test_run_model_on_fold_classification_1_dataframe(self):
from sklearn.compose import ColumnTransformer
- task = openml.tasks.get_task(1)
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
# diff test_run_model_on_fold_classification_1_array()
X, y = task.get_X_and_y(dataset_format="dataframe")
@@ -1752,7 +1768,7 @@ def test_run_model_on_fold_classification_1_dataframe(self):
)
def test_run_model_on_fold_classification_2(self):
- task = openml.tasks.get_task(7)
+ task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation
X, y = task.get_X_and_y()
train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
@@ -1814,7 +1830,11 @@ def predict_proba(*args, **kwargs):
raise AttributeError("predict_proba is not available when " "probability=False")
# task 1 (test server) is important: it is a task with an unused class
- tasks = [1, 3, 115]
+ tasks = [
+ 1, # anneal; crossvalidation
+ 3, # anneal; crossvalidation
+ 115, # diabetes; crossvalidation
+ ]
flow = unittest.mock.Mock()
flow.name = "dummy"
@@ -1968,7 +1988,7 @@ def test__extract_trace_data(self):
"max_iter": [10, 20, 40, 80],
}
num_iters = 10
- task = openml.tasks.get_task(20)
+ task = openml.tasks.get_task(20) # balance-scale; crossvalidation
clf = sklearn.model_selection.RandomizedSearchCV(
sklearn.neural_network.MLPClassifier(), param_grid, num_iters,
)
@@ -2079,8 +2099,8 @@ def test_run_on_model_with_empty_steps(self):
from sklearn.compose import ColumnTransformer
# testing 'drop', 'passthrough', None as non-actionable sklearn estimators
- dataset = openml.datasets.get_dataset(128)
- task = openml.tasks.get_task(59)
+ dataset = openml.datasets.get_dataset(128) # iris
+ task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation
X, y, categorical_ind, feature_names = dataset.get_data(
target=dataset.default_target_attribute, dataset_format="array"
@@ -2183,16 +2203,6 @@ def test_failed_serialization_of_custom_class(self):
# for lower versions
from sklearn.preprocessing import Imputer as SimpleImputer
- class CustomImputer(SimpleImputer):
- pass
-
- def cont(X):
- return X.dtypes != "category"
-
- def cat(X):
- return X.dtypes == "category"
-
- import sklearn.metrics
import sklearn.tree
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
@@ -2207,7 +2217,7 @@ def cat(X):
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]
) # build a sklearn classifier
- task = openml.tasks.get_task(253) # data with mixed types from test server
+ task = openml.tasks.get_task(253) # profb; crossvalidation
try:
_ = openml.runs.run_model_on_task(clf, task)
except AttributeError as e:
@@ -2215,3 +2225,38 @@ def cat(X):
raise AttributeError(e)
else:
raise Exception(e)
+
+ @unittest.skipIf(
+ LooseVersion(sklearn.__version__) < "0.20",
+ reason="columntransformer introduction in 0.20.0",
+ )
+ def test_setupid_with_column_transformer(self):
+ """Test to check if inclusion of ColumnTransformer in a pipleline is treated as a new
+ flow each time.
+ """
+ import sklearn.compose
+ from sklearn.svm import SVC
+
+ def column_transformer_pipe(task_id):
+ task = openml.tasks.get_task(task_id)
+ # make columntransformer
+ preprocessor = sklearn.compose.ColumnTransformer(
+ transformers=[
+ ("num", StandardScaler(), cont),
+ ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
+ ]
+ )
+ # make pipeline
+ clf = SVC(gamma="scale", random_state=1)
+ pipe = make_pipeline(preprocessor, clf)
+ # run task
+ run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)
+ run.publish()
+ new_run = openml.runs.get_run(run.run_id)
+ return new_run
+
+ run1 = column_transformer_pipe(11) # only categorical
+ TestBase._mark_entity_for_removal("run", run1.run_id)
+ run2 = column_transformer_pipe(23) # only numeric
+ TestBase._mark_entity_for_removal("run", run2.run_id)
+ self.assertEqual(run1.setup_id, run2.setup_id)
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 12af05ffe..a65dcbf70 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -2,18 +2,22 @@
from collections import OrderedDict
import copy
+import functools
import unittest
+from unittest.mock import patch
from distutils.version import LooseVersion
import sklearn
from sklearn import ensemble
import pandas as pd
+import pytest
import openml
from openml.testing import TestBase
import openml.extensions.sklearn
+@pytest.mark.usefixtures("long_version")
class TestFlowFunctions(TestBase):
_multiprocess_can_split_ = True
@@ -321,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
# Note that CI does not test against 0.19.1.
openml.config.server = self.production_server
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
- flow = 8175
- expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied."
+ if sklearn_major > 23:
+ flow = 18587 # 18687, 18725 --- flows building random forest on >= 0.23
+ flow_sklearn_version = "0.23.1"
+ else:
+ flow = 8175
+ flow_sklearn_version = "0.19.1"
+ expected = (
+ "Trying to deserialize a model with dependency "
+ "sklearn=={} not satisfied.".format(flow_sklearn_version)
+ )
self.assertRaisesRegex(
ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True
)
@@ -331,23 +343,34 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
flow = openml.flows.get_flow(flow_id=flow, reinstantiate=True, strict_version=False)
# ensure that a new flow was created
assert flow.flow_id is None
- assert "0.19.1" not in flow.dependencies
+ assert "sklearn==0.19.1" not in flow.dependencies
+ assert "sklearn>=0.19.1" not in flow.dependencies
def test_get_flow_id(self):
- clf = sklearn.tree.DecisionTreeClassifier()
- flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
-
- self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id)
- flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
- self.assertIn(flow.flow_id, flow_ids)
- self.assertGreater(len(flow_ids), 2)
-
- # Check that the output of get_flow_id is identical if only the name is given, no matter
- # whether exact_version is set to True or False.
- flow_ids_exact_version_True = openml.flows.get_flow_id(name=flow.name, exact_version=True)
- flow_ids_exact_version_False = openml.flows.get_flow_id(
- name=flow.name, exact_version=False,
- )
- self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False)
- self.assertIn(flow.flow_id, flow_ids_exact_version_True)
- self.assertGreater(len(flow_ids_exact_version_True), 2)
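+ # Outside the long test runs, wrap the listing call in an lru_cache so that the
+ # repeated get_flow_id lookups below reuse a single server response.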
+ if self.long_version:
+ list_all = openml.utils._list_all
+ else:
+ list_all = functools.lru_cache()(openml.utils._list_all)
+ with patch("openml.utils._list_all", list_all):
+ clf = sklearn.tree.DecisionTreeClassifier()
+ flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish()
+ TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name))
+ TestBase.logger.info(
+ "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id)
+ )
+
+ self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id)
+ flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
+ self.assertIn(flow.flow_id, flow_ids)
+ self.assertGreater(len(flow_ids), 0)
+
+ # Check that the output of get_flow_id is identical if only the name is given, no matter
+ # whether exact_version is set to True or False.
+ flow_ids_exact_version_True = openml.flows.get_flow_id(
+ name=flow.name, exact_version=True
+ )
+ flow_ids_exact_version_False = openml.flows.get_flow_id(
+ name=flow.name, exact_version=False,
+ )
+ self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False)
+ self.assertIn(flow.flow_id, flow_ids_exact_version_True)
diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py
index 8b470a45b..459a0cdf5 100644
--- a/tests/test_openml/test_api_calls.py
+++ b/tests/test_openml/test_api_calls.py
@@ -1,3 +1,5 @@
+import unittest.mock
+
import openml
import openml.testing
@@ -8,3 +10,23 @@ def test_too_long_uri(self):
openml.exceptions.OpenMLServerError, "URI too long!",
):
openml.datasets.list_datasets(data_id=list(range(10000)))
+
+ @unittest.mock.patch("time.sleep")
+ @unittest.mock.patch("requests.Session")
+ def test_retry_on_database_error(self, Session_class_mock, _):
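+ # Error code 107 (database connection error) is retried by the client; the test
+ # configuration allows 10 connection retries (see test_config), which matches the
+ # asserted call count. time.sleep is mocked so the retries do not slow the test down.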
+ response_mock = unittest.mock.Mock()
+ response_mock.text = (
+ "\n"
+ "107"
+ "Database connection error. "
+ "Usually due to high server load. "
+ "Please wait for N seconds and try again.\n"
+ ""
+ )
+ Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock
+ with self.assertRaisesRegex(
+ openml.exceptions.OpenMLServerException, "/abc returned code 107"
+ ):
+ openml._api_calls._send_request("get", "/abc", {})
+
+ self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 10)
diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py
index 88136dbd9..5b15f781e 100644
--- a/tests/test_openml/test_config.py
+++ b/tests/test_openml/test_config.py
@@ -1,15 +1,59 @@
# License: BSD 3-Clause
+import tempfile
import os
+import unittest.mock
import openml.config
import openml.testing
class TestConfig(openml.testing.TestBase):
- def test_config_loading(self):
- self.assertTrue(os.path.exists(openml.config.config_file))
- self.assertTrue(os.path.isdir(os.path.expanduser("~/.openml")))
+ @unittest.mock.patch("os.path.expanduser")
+ @unittest.mock.patch("openml.config.openml_logger.warning")
+ @unittest.mock.patch("openml.config._create_log_handlers")
+ @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+ def test_non_writable_home(self, log_handler_mock, warnings_mock, expanduser_mock):
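+ # expanduser is mocked to resolve into a read-only temporary directory, so _setup()
+ # cannot create the config or cache directories: it should warn once per directory
+ # and set up log handlers without a file handler.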
+ with tempfile.TemporaryDirectory(dir=self.workdir) as td:
+ expanduser_mock.side_effect = (
+ os.path.join(td, "openmldir"),
+ os.path.join(td, "cachedir"),
+ )
+ os.chmod(td, 0o444)
+ openml.config._setup()
+
+ self.assertEqual(warnings_mock.call_count, 2)
+ self.assertEqual(log_handler_mock.call_count, 1)
+ self.assertFalse(log_handler_mock.call_args_list[0][1]["create_file_handler"])
+
+ def test_get_config_as_dict(self):
+ """ Checks if the current configuration is returned accurately as a dict. """
+ config = openml.config.get_config_as_dict()
+ _config = dict()
+ _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de"
+ _config["server"] = "https://test.openml.org/api/v1/xml"
+ _config["cachedir"] = self.workdir
+ _config["avoid_duplicate_runs"] = False
+ _config["connection_n_retries"] = 10
+ _config["max_retries"] = 20
+ self.assertIsInstance(config, dict)
+ self.assertEqual(len(config), 6)
+ self.assertDictEqual(config, _config)
+
+ def test_setup_with_config(self):
+ """ Checks if the OpenML configuration can be updated using _setup(). """
+ _config = dict()
+ _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de"
+ _config["server"] = "https://www.openml.org/api/v1/xml"
+ _config["cachedir"] = self.workdir
+ _config["avoid_duplicate_runs"] = True
+ _config["connection_n_retries"] = 100
+ _config["max_retries"] = 1000
+ orig_config = openml.config.get_config_as_dict()
+ openml.config._setup(_config)
+ updated_config = openml.config.get_config_as_dict()
+ openml.config._setup(orig_config) # important to not affect other unit tests
+ self.assertDictEqual(_config, updated_config)
class TestConfigurationForExamples(openml.testing.TestBase):
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 864863f4a..dd0da5c00 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -5,11 +5,13 @@
import os
from time import time
+import xmltodict
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
+from openml import OpenMLRun
from openml.testing import TestBase, SimpleImputer
import openml
import openml.extensions.sklearn
@@ -102,7 +104,7 @@ def test_to_from_filesystem_vanilla(self):
("classifier", DecisionTreeClassifier(max_depth=1)),
]
)
- task = openml.tasks.get_task(119)
+ task = openml.tasks.get_task(119) # diabetes; crossvalidation
run = openml.runs.run_model_on_task(
model=model,
task=task,
@@ -142,7 +144,7 @@ def test_to_from_filesystem_search(self):
},
)
- task = openml.tasks.get_task(119)
+ task = openml.tasks.get_task(119) # diabetes; crossvalidation
run = openml.runs.run_model_on_task(
model=model, task=task, add_local_measures=False, avoid_duplicate_runs=False,
)
@@ -163,7 +165,7 @@ def test_to_from_filesystem_no_model(self):
model = Pipeline(
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())]
)
- task = openml.tasks.get_task(119)
+ task = openml.tasks.get_task(119) # diabetes; crossvalidation
run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False)
cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128)))
@@ -184,7 +186,7 @@ def test_publish_with_local_loaded_flow(self):
model = Pipeline(
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())]
)
- task = openml.tasks.get_task(119)
+ task = openml.tasks.get_task(119) # diabetes; crossvalidation
# Make sure the flow does not exist on the server yet.
flow = extension.model_to_flow(model)
@@ -215,3 +217,19 @@ def test_publish_with_local_loaded_flow(self):
# make sure the flow is published as part of publishing the run.
self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version))
openml.runs.get_run(loaded_run.run_id)
+
+ def test_run_setup_string_included_in_xml(self):
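+ # The setup string must survive a round trip: it has to appear in the XML produced by
+ # _to_xml() and be restored when a run is re-created from that XML.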
+ SETUP_STRING = "setup-string"
+ run = OpenMLRun(
+ task_id=0,
+ flow_id=None, # if not none, flow parameters are required.
+ dataset_id=0,
+ setup_string=SETUP_STRING,
+ )
+ xml = run._to_xml()
+ run_dict = xmltodict.parse(xml)["oml:run"]
+ assert "oml:setup_string" in run_dict
+ assert run_dict["oml:setup_string"] == SETUP_STRING
+
+ recreated_run = openml.runs.functions._create_run_from_xml(xml, from_server=False)
+ assert recreated_run.setup_string == SETUP_STRING
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 89f01c72e..4534f26a4 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -1,5 +1,4 @@
# License: BSD 3-Clause
-from typing import Tuple, List, Union
import arff
from distutils.version import LooseVersion
@@ -7,10 +6,12 @@
import random
import time
import sys
+import ast
import unittest.mock
import numpy as np
-import pytest
+import joblib
+from joblib import parallel_backend
import openml
import openml.exceptions
@@ -21,10 +22,13 @@
import pandas as pd
import openml.extensions.sklearn
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.extensions.sklearn import cat, cont
from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction
from openml.runs.trace import OpenMLRunTrace
from openml.tasks import TaskType
+from openml.testing import check_task_existence
+from openml.exceptions import OpenMLServerException
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection._search import BaseSearchCV
@@ -42,19 +46,45 @@
class TestRun(TestBase):
_multiprocess_can_split_ = True
- # diabetis dataset, 768 observations, 0 missing vals, 33% holdout set
- # (253 test obs), no nominal attributes, all numeric attributes
- TEST_SERVER_TASK_SIMPLE: Tuple[Union[int, List], ...] = (119, 0, 253, [], [*range(8)])
- TEST_SERVER_TASK_REGRESSION: Tuple[Union[int, List], ...] = (738, 0, 718, [], [*range(8)])
- # credit-a dataset, 690 observations, 67 missing vals, 33% holdout set
- # (227 test obs)
- TEST_SERVER_TASK_MISSING_VALS = (
- 96,
- 67,
- 227,
- [0, 3, 4, 5, 6, 8, 9, 11, 12],
- [1, 2, 7, 10, 13, 14],
- )
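+ # Task fixtures on the test server. "task_meta_data" mirrors the arguments of
+ # openml.tasks.create_task and is used to re-create the task if the test server
+ # has been reset and the hard-coded task_id no longer exists.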
+ TEST_SERVER_TASK_MISSING_VALS = {
+ "task_id": 96,
+ "n_missing_vals": 67,
+ "n_test_obs": 227,
+ "nominal_indices": [0, 3, 4, 5, 6, 8, 9, 11, 12],
+ "numeric_indices": [1, 2, 7, 10, 13, 14],
+ "task_meta_data": {
+ "task_type": TaskType.SUPERVISED_CLASSIFICATION,
+ "dataset_id": 16, # credit-a
+ "estimation_procedure_id": 1,
+ "target_name": "class",
+ },
+ }
+ TEST_SERVER_TASK_SIMPLE = {
+ "task_id": 119,
+ "n_missing_vals": 0,
+ "n_test_obs": 253,
+ "nominal_indices": [],
+ "numeric_indices": [*range(8)],
+ "task_meta_data": {
+ "task_type": TaskType.SUPERVISED_CLASSIFICATION,
+ "dataset_id": 20, # diabetes
+ "estimation_procedure_id": 1,
+ "target_name": "class",
+ },
+ }
+ TEST_SERVER_TASK_REGRESSION = {
+ "task_id": 1605,
+ "n_missing_vals": 0,
+ "n_test_obs": 2178,
+ "nominal_indices": [],
+ "numeric_indices": [*range(8)],
+ "task_meta_data": {
+ "task_type": TaskType.SUPERVISED_REGRESSION,
+ "dataset_id": 123, # quake
+ "estimation_procedure_id": 7,
+ "target_name": "richter",
+ },
+ }
# Suppress warnings to facilitate testing
hide_warnings = True
@@ -335,7 +365,7 @@ def _check_sample_evaluations(
for sample in range(num_sample_entrees):
evaluation = sample_evaluations[measure][rep][fold][sample]
self.assertIsInstance(evaluation, float)
- if not os.environ.get("CI_WINDOWS"):
+ if not (os.environ.get("CI_WINDOWS") or os.name == "nt"):
# Either Appveyor is much faster than Travis
# and/or measurements are not as accurate.
# Either way, windows seems to get an eval-time
@@ -344,7 +374,7 @@ def _check_sample_evaluations(
self.assertLess(evaluation, max_time_allowed)
def test_run_regression_on_classif_task(self):
- task_id = 115
+ task_id = 115 # diabetes; crossvalidation
clf = LinearRegression()
task = openml.tasks.get_task(task_id)
@@ -358,7 +388,7 @@ def test_run_regression_on_classif_task(self):
)
def test_check_erronous_sklearn_flow_fails(self):
- task_id = 115
+ task_id = 115 # diabetes; crossvalidation
task = openml.tasks.get_task(task_id)
# Invalid parameter values
@@ -443,7 +473,7 @@ def determine_grid_size(param_grid):
# suboptimal (slow), and not guaranteed to work if evaluation
# engine is behind.
# TODO: mock this? We have the arff already on the server
- self._wait_for_processed_run(run.run_id, 400)
+ self._wait_for_processed_run(run.run_id, 600)
try:
model_prime = openml.runs.initialize_model_from_trace(
run_id=run.run_id, repeat=0, fold=0,
@@ -499,7 +529,7 @@ def _run_and_upload_classification(
def _run_and_upload_regression(
self, clf, task_id, n_missing_vals, n_test_obs, flow_expected_rsv, sentinel=None
):
- num_folds = 1 # because of holdout
+ num_folds = 10 # because of cross-validation
num_iterations = 5 # for base search algorithms
metric = sklearn.metrics.mean_absolute_error # metric class
metric_name = "mean_absolute_error" # openml metric name
@@ -520,17 +550,39 @@ def _run_and_upload_regression(
)
def test_run_and_upload_logistic_regression(self):
- lr = LogisticRegression(solver="lbfgs")
- task_id = self.TEST_SERVER_TASK_SIMPLE[0]
- n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
- n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
+ lr = LogisticRegression(solver="lbfgs", max_iter=1000)
+ task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+ n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+ n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")
def test_run_and_upload_linear_regression(self):
lr = LinearRegression()
- task_id = self.TEST_SERVER_TASK_REGRESSION[0]
- n_missing_vals = self.TEST_SERVER_TASK_REGRESSION[1]
- n_test_obs = self.TEST_SERVER_TASK_REGRESSION[2]
+ task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"]
+
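+ # The regression task may be wiped when the test server is reset; look it up by its
+ # meta data and re-create it on the fly if it no longer exists.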
+ task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"]
+ _task_id = check_task_existence(**task_meta_data)
+ if _task_id is not None:
+ task_id = _task_id
+ else:
+ new_task = openml.tasks.create_task(**task_meta_data)
+ # publishes the new task
+ try:
+ new_task = new_task.publish()
+ task_id = new_task.task_id
+ except OpenMLServerException as e:
+ if e.code == 614: # Task already exists
+ # the exception message contains the task_id that was matched in the format
+ # 'Task already exists. - matched id(s): [xxxx]'
+ task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ else:
+ raise Exception(repr(e))
+ # mark to remove the uploaded task
+ TestBase._mark_entity_for_removal("task", task_id)
+ TestBase.logger.info("collected from test_run_functions: {}".format(task_id))
+
+ n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"]
+ n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"]
self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")
def test_run_and_upload_pipeline_dummy_pipeline(self):
@@ -541,9 +593,9 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
("dummy", DummyClassifier(strategy="prior")),
]
)
- task_id = self.TEST_SERVER_TASK_SIMPLE[0]
- n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
- n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
+ task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+ n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+ n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")
@unittest.skipIf(
@@ -584,20 +636,26 @@ def get_ct_cf(nominal_indices, numeric_indices):
sentinel = self._get_sentinel()
self._run_and_upload_classification(
- get_ct_cf(self.TEST_SERVER_TASK_SIMPLE[3], self.TEST_SERVER_TASK_SIMPLE[4]),
- self.TEST_SERVER_TASK_SIMPLE[0],
- self.TEST_SERVER_TASK_SIMPLE[1],
- self.TEST_SERVER_TASK_SIMPLE[2],
+ get_ct_cf(
+ self.TEST_SERVER_TASK_SIMPLE["nominal_indices"],
+ self.TEST_SERVER_TASK_SIMPLE["numeric_indices"],
+ ),
+ self.TEST_SERVER_TASK_SIMPLE["task_id"],
+ self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"],
+ self.TEST_SERVER_TASK_SIMPLE["n_test_obs"],
"62501",
sentinel=sentinel,
)
# Due to #602, it is important to test this model on two tasks
# with different column specifications
self._run_and_upload_classification(
- get_ct_cf(self.TEST_SERVER_TASK_MISSING_VALS[3], self.TEST_SERVER_TASK_MISSING_VALS[4]),
- self.TEST_SERVER_TASK_MISSING_VALS[0],
- self.TEST_SERVER_TASK_MISSING_VALS[1],
- self.TEST_SERVER_TASK_MISSING_VALS[2],
+ get_ct_cf(
+ self.TEST_SERVER_TASK_MISSING_VALS["nominal_indices"],
+ self.TEST_SERVER_TASK_MISSING_VALS["numeric_indices"],
+ ),
+ self.TEST_SERVER_TASK_MISSING_VALS["task_id"],
+ self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"],
+ self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"],
"62501",
sentinel=sentinel,
)
@@ -606,7 +664,8 @@ def get_ct_cf(nominal_indices, numeric_indices):
LooseVersion(sklearn.__version__) < "0.20",
reason="columntransformer introduction in 0.20.0",
)
- def test_run_and_upload_knn_pipeline(self):
+ @unittest.mock.patch("warnings.warn")
+ def test_run_and_upload_knn_pipeline(self, warnings_mock):
cat_imp = make_pipeline(
SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
@@ -632,19 +691,34 @@ def test_run_and_upload_knn_pipeline(self):
]
)
- task_id = self.TEST_SERVER_TASK_MISSING_VALS[0]
- n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS[1]
- n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS[2]
+ task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"]
+ n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"]
+ n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"]
self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501")
+ # The warning raised is:
+ # "The total space of parameters 8 is smaller than n_iter=10.
+ # Running 8 iterations. For exhaustive searches, use GridSearchCV."
+ # It is raised three times: once for the run that is uploaded, and twice more when the
+ # model is re-run to verify that the predictions are reproducible.
+ warning_msg = (
+ "The total space of parameters 8 is smaller than n_iter=10. "
+ "Running 8 iterations. For exhaustive searches, use GridSearchCV."
+ )
+ call_count = 0
+ for _warnings in warnings_mock.call_args_list:
+ if _warnings[0][0] == warning_msg:
+ call_count += 1
+ self.assertEqual(call_count, 3)
def test_run_and_upload_gridsearch(self):
gridsearch = GridSearchCV(
BaggingClassifier(base_estimator=SVC()),
{"base_estimator__C": [0.01, 0.1, 10], "base_estimator__gamma": [0.01, 0.1, 10]},
+ cv=3,
)
- task_id = self.TEST_SERVER_TASK_SIMPLE[0]
- n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
- n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
+ task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+ n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+ n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
run = self._run_and_upload_classification(
clf=gridsearch,
task_id=task_id,
@@ -671,9 +745,9 @@ def test_run_and_upload_randomsearch(self):
# The random states for the RandomizedSearchCV is set after the
# random state of the RandomForestClassifier is set, therefore,
# it has a different value than the other examples before
- task_id = self.TEST_SERVER_TASK_SIMPLE[0]
- n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
- n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
+ task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+ n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+ n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
run = self._run_and_upload_classification(
clf=randomsearch,
task_id=task_id,
@@ -682,6 +756,8 @@ def test_run_and_upload_randomsearch(self):
flow_expected_rsv="12172",
)
self.assertEqual(len(run.trace.trace_iterations), 5)
+ trace = openml.runs.get_run_trace(run.run_id)
+ self.assertEqual(len(trace.trace_iterations), 5)
def test_run_and_upload_maskedarrays(self):
# This testcase is important for 2 reasons:
@@ -696,9 +772,9 @@ def test_run_and_upload_maskedarrays(self):
# The random states for the GridSearchCV is set after the
# random state of the RandomForestClassifier is set, therefore,
# it has a different value than the other examples before
- task_id = self.TEST_SERVER_TASK_SIMPLE[0]
- n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1]
- n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2]
+ task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
+ n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
+ n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
self._run_and_upload_classification(
gridsearch, task_id, n_missing_vals, n_test_obs, "12172"
)
@@ -782,7 +858,7 @@ def test_initialize_cv_from_run(self):
]
)
- task = openml.tasks.get_task(11)
+ task = openml.tasks.get_task(11) # kr-vs-kp; holdout
run = openml.runs.run_model_on_task(
model=randomsearch, task=task, avoid_duplicate_runs=False, seed=1,
)
@@ -828,31 +904,12 @@ def _test_local_evaluations(self, run):
self.assertGreaterEqual(alt_scores[idx], 0)
self.assertLessEqual(alt_scores[idx], 1)
- @unittest.skipIf(
- LooseVersion(sklearn.__version__) < "0.20",
- reason="SimpleImputer doesn't handle mixed type DataFrame as input",
- )
def test_local_run_swapped_parameter_order_model(self):
+ clf = DecisionTreeClassifier()
+ australian_task = 595 # Australian; crossvalidation
+ task = openml.tasks.get_task(australian_task)
- # construct sci-kit learn classifier
- clf = Pipeline(
- steps=[
- (
- "imputer",
- make_pipeline(
- SimpleImputer(strategy="most_frequent"),
- OneHotEncoder(handle_unknown="ignore"),
- ),
- ),
- # random forest doesn't take categoricals
- ("estimator", RandomForestClassifier()),
- ]
- )
-
- # download task
- task = openml.tasks.get_task(7)
-
- # invoke OpenML run
+ # task and clf are purposely in the old order
run = openml.runs.run_model_on_task(
task, clf, avoid_duplicate_runs=False, upload_flow=False,
)
@@ -876,7 +933,7 @@ def test_local_run_swapped_parameter_order_flow(self):
flow = self.extension.model_to_flow(clf)
# download task
- task = openml.tasks.get_task(7)
+ task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation
# invoke OpenML run
run = openml.runs.run_flow_on_task(
@@ -901,7 +958,7 @@ def test_local_run_metric_score(self):
)
# download task
- task = openml.tasks.get_task(7)
+ task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation
# invoke OpenML run
run = openml.runs.run_model_on_task(
@@ -931,7 +988,33 @@ def test_initialize_model_from_run(self):
("Estimator", GaussianNB()),
]
)
- task = openml.tasks.get_task(1198)
+ task_meta_data = {
+ "task_type": TaskType.SUPERVISED_CLASSIFICATION,
+ "dataset_id": 128, # iris
+ "estimation_procedure_id": 1,
+ "target_name": "class",
+ }
+ _task_id = check_task_existence(**task_meta_data)
+ if _task_id is not None:
+ task_id = _task_id
+ else:
+ new_task = openml.tasks.create_task(**task_meta_data)
+ # publishes the new task
+ try:
+ new_task = new_task.publish()
+ task_id = new_task.task_id
+ except OpenMLServerException as e:
+ if e.code == 614: # Task already exists
+ # the exception message contains the task_id that was matched in the format
+ # 'Task already exists. - matched id(s): [xxxx]'
+ task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ else:
+ raise Exception(repr(e))
+ # mark to remove the uploaded task
+ TestBase._mark_entity_for_removal("task", task_id)
+ TestBase.logger.info("collected from test_run_functions: {}".format(task_id))
+
+ task = openml.tasks.get_task(task_id)
run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=False,)
run_ = run.publish()
TestBase._mark_entity_for_removal("run", run_.run_id)
@@ -950,55 +1033,6 @@ def test_initialize_model_from_run(self):
self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"most_frequent"')
self.assertEqual(flowS.components["VarianceThreshold"].parameters["threshold"], "0.05")
- @pytest.mark.flaky()
- def test_get_run_trace(self):
- # get_run_trace is already tested implicitly in test_run_and_publish
- # this test is a bit additional.
- num_iterations = 10
- num_folds = 1
- task_id = 119
-
- task = openml.tasks.get_task(task_id)
-
- # IMPORTANT! Do not sentinel this flow. is faster if we don't wait
- # on openml server
- clf = RandomizedSearchCV(
- RandomForestClassifier(random_state=42, n_estimators=5),
- {
- "max_depth": [3, None],
- "max_features": [1, 2, 3, 4],
- "bootstrap": [True, False],
- "criterion": ["gini", "entropy"],
- },
- num_iterations,
- random_state=42,
- cv=3,
- )
-
- # [SPEED] make unit test faster by exploiting run information
- # from the past
- try:
- # in case the run did not exists yet
- run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=True,)
-
- self.assertEqual(
- len(run.trace.trace_iterations), num_iterations * num_folds,
- )
- run = run.publish()
- TestBase._mark_entity_for_removal("run", run.run_id)
- TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id))
- self._wait_for_processed_run(run.run_id, 400)
- run_id = run.run_id
- except openml.exceptions.OpenMLRunsExistError as e:
- # The only error we expect, should fail otherwise.
- run_ids = [int(run_id) for run_id in e.run_ids]
- self.assertGreater(len(run_ids), 0)
- run_id = random.choice(list(run_ids))
-
- # now the actual unit test ...
- run_trace = openml.runs.get_run_trace(run_id)
- self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds)
-
@unittest.skipIf(
LooseVersion(sklearn.__version__) < "0.20",
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1025,7 +1059,7 @@ def test__run_exists(self):
),
]
- task = openml.tasks.get_task(115)
+ task = openml.tasks.get_task(115) # diabetes; crossvalidation
for clf in clfs:
try:
@@ -1055,8 +1089,8 @@ def test__run_exists(self):
def test_run_with_illegal_flow_id(self):
# check the case where the user adds an illegal flow id to a
- # non-existing flow
- task = openml.tasks.get_task(115)
+ # non-existing flow
+ task = openml.tasks.get_task(115) # diabetes; crossvalidation
clf = DecisionTreeClassifier()
flow = self.extension.model_to_flow(clf)
flow, _ = self._add_sentinel_to_flow_name(flow, None)
@@ -1072,7 +1106,7 @@ def test_run_with_illegal_flow_id(self):
def test_run_with_illegal_flow_id_after_load(self):
# Same as `test_run_with_illegal_flow_id`, but test this error is also
# caught if the run is stored to and loaded from disk first.
- task = openml.tasks.get_task(115)
+ task = openml.tasks.get_task(115) # diabetes; crossvalidation
clf = DecisionTreeClassifier()
flow = self.extension.model_to_flow(clf)
flow, _ = self._add_sentinel_to_flow_name(flow, None)
@@ -1096,7 +1130,7 @@ def test_run_with_illegal_flow_id_after_load(self):
def test_run_with_illegal_flow_id_1(self):
# Check the case where the user adds an illegal flow id to an existing
# flow. Comes to a different value error than the previous test
- task = openml.tasks.get_task(115)
+ task = openml.tasks.get_task(115) # diabetes; crossvalidation
clf = DecisionTreeClassifier()
flow_orig = self.extension.model_to_flow(clf)
try:
@@ -1118,7 +1152,7 @@ def test_run_with_illegal_flow_id_1(self):
def test_run_with_illegal_flow_id_1_after_load(self):
# Same as `test_run_with_illegal_flow_id_1`, but test this error is
# also caught if the run is stored to and loaded from disk first.
- task = openml.tasks.get_task(115)
+ task = openml.tasks.get_task(115) # diabetes; crossvalidation
clf = DecisionTreeClassifier()
flow_orig = self.extension.model_to_flow(clf)
try:
@@ -1149,18 +1183,15 @@ def test_run_with_illegal_flow_id_1_after_load(self):
reason="OneHotEncoder cannot handle mixed type DataFrame as input",
)
def test__run_task_get_arffcontent(self):
- task = openml.tasks.get_task(7)
+ task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation
num_instances = 3196
num_folds = 10
num_repeats = 1
- flow = unittest.mock.Mock()
- flow.name = "dummy"
clf = make_pipeline(
OneHotEncoder(handle_unknown="ignore"), SGDClassifier(loss="log", random_state=1)
)
res = openml.runs.functions._run_task_get_arffcontent(
- flow=flow,
extension=self.extension,
model=clf,
task=task,
@@ -1371,9 +1402,7 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
# Check that _run_task_get_arffcontent works when one of the class
# labels only declared in the arff file, but is not present in the
# actual data
- flow = unittest.mock.Mock()
- flow.name = "dummy"
- task = openml.tasks.get_task(2)
+ task = openml.tasks.get_task(2) # anneal; crossvalidation
from sklearn.compose import ColumnTransformer
@@ -1387,7 +1416,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
) # build a sklearn classifier
data_content, _, _, _ = _run_task_get_arffcontent(
- flow=flow,
model=model,
task=task,
extension=self.extension,
@@ -1409,9 +1437,7 @@ def test_run_on_dataset_with_missing_labels_array(self):
# Check that _run_task_get_arffcontent works when one of the class
# labels only declared in the arff file, but is not present in the
# actual data
- flow = unittest.mock.Mock()
- flow.name = "dummy"
- task = openml.tasks.get_task(2)
+ task = openml.tasks.get_task(2) # anneal; crossvalidation
# task_id=2 on test server has 38 columns with 6 numeric columns
cont_idx = [3, 4, 8, 32, 33, 34]
cat_idx = list(set(np.arange(38)) - set(cont_idx))
@@ -1432,7 +1458,6 @@ def test_run_on_dataset_with_missing_labels_array(self):
) # build a sklearn classifier
data_content, _, _, _ = _run_task_get_arffcontent(
- flow=flow,
model=model,
task=task,
extension=self.extension,
@@ -1463,7 +1488,7 @@ def test_run_flow_on_task_downloaded_flow(self):
TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id))
downloaded_flow = openml.flows.get_flow(flow.flow_id)
- task = openml.tasks.get_task(119) # diabetes
+ task = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE["task_id"])
run = openml.runs.run_flow_on_task(
flow=downloaded_flow, task=task, avoid_duplicate_runs=False, upload_flow=False,
)
@@ -1483,20 +1508,26 @@ def test_format_prediction_non_supervised(self):
format_prediction(clustering, *ignored_input)
def test_format_prediction_classification_no_probabilities(self):
- classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False)
+ classification = openml.tasks.get_task(
+ self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False
+ )
ignored_input = [0] * 5
with self.assertRaisesRegex(ValueError, "`proba` is required for classification task"):
format_prediction(classification, *ignored_input, proba=None)
def test_format_prediction_classification_incomplete_probabilities(self):
- classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False)
+ classification = openml.tasks.get_task(
+ self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False
+ )
ignored_input = [0] * 5
incomplete_probabilities = {c: 0.2 for c in classification.class_labels[1:]}
with self.assertRaisesRegex(ValueError, "Each class should have a predicted probability"):
format_prediction(classification, *ignored_input, proba=incomplete_probabilities)
def test_format_prediction_task_without_classlabels_set(self):
- classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False)
+ classification = openml.tasks.get_task(
+ self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False
+ )
classification.class_labels = None
ignored_input = [0] * 5
with self.assertRaisesRegex(
@@ -1505,14 +1536,146 @@ def test_format_prediction_task_without_classlabels_set(self):
format_prediction(classification, *ignored_input, proba={})
def test_format_prediction_task_learning_curve_sample_not_set(self):
- learning_curve = openml.tasks.get_task(801, download_data=False)
+ learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation
probabilities = {c: 0.2 for c in learning_curve.class_labels}
ignored_input = [0] * 5
with self.assertRaisesRegex(ValueError, "`sample` can not be none for LearningCurveTask"):
format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities)
def test_format_prediction_task_regression(self):
- regression = openml.tasks.get_task(self.TEST_SERVER_TASK_REGRESSION[0], download_data=False)
+ task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"]
+ _task_id = check_task_existence(**task_meta_data)
+ if _task_id is not None:
+ task_id = _task_id
+ else:
+ new_task = openml.tasks.create_task(**task_meta_data)
+ # publishes the new task
+ try:
+ new_task = new_task.publish()
+ task_id = new_task.task_id
+ except OpenMLServerException as e:
+ if e.code == 614: # Task already exists
+ # the exception message contains the task_id that was matched in the format
+ # 'Task already exists. - matched id(s): [xxxx]'
+ task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ else:
+ raise Exception(repr(e))
+ # mark to remove the uploaded task
+ TestBase._mark_entity_for_removal("task", task_id)
+ TestBase.logger.info("collected from test_run_functions: {}".format(task_id))
+
+ regression = openml.tasks.get_task(task_id, download_data=False)
ignored_input = [0] * 5
res = format_prediction(regression, *ignored_input)
self.assertListEqual(res, [0] * 5)
+
+ @unittest.skipIf(
+ LooseVersion(sklearn.__version__) < "0.21",
+ reason="couldn't perform local tests successfully w/o bloating RAM",
+ )
+ @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs")
+ def test__run_task_get_arffcontent_2(self, parallel_mock):
+ """ Tests if a run executed in parallel is collated correctly. """
+ task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
+ x, y = task.get_X_and_y(dataset_format="dataframe")
+ num_instances = x.shape[0]
+ line_length = 6 + len(task.class_labels)
+ clf = SGDClassifier(loss="log", random_state=1)
+ n_jobs = 2
+ backend = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing"
+ with parallel_backend(backend, n_jobs=n_jobs):
+ res = openml.runs.functions._run_task_get_arffcontent(
+ extension=self.extension,
+ model=clf,
+ task=task,
+ add_local_measures=True,
+ dataset_format="array", # "dataframe" would require handling of categoricals
+ n_jobs=n_jobs,
+ )
+ # Only the parent process sees the mock: _prevent_optimize_n_jobs() is patched here, but
+ # freshly spawned joblib workers import the real function. If the folds are distributed
+ # successfully, the mock's call_count therefore stays 0 while the actual results below
+ # are still produced. _prevent_optimize_n_jobs() is executed inside _run_model_on_fold(),
+ # and mocking it does not change the rest of the pipeline, so it is a safe indicator of
+ # whether _run_model_on_fold() ran in the parent process or in a worker.
+ self.assertEqual(parallel_mock.call_count, 0)
+ self.assertIsInstance(res[0], list)
+ self.assertEqual(len(res[0]), num_instances)
+ self.assertEqual(len(res[0][0]), line_length)
+ self.assertEqual(len(res[2]), 7)
+ self.assertEqual(len(res[3]), 7)
+ expected_scores = [
+ 0.965625,
+ 0.94375,
+ 0.946875,
+ 0.953125,
+ 0.96875,
+ 0.965625,
+ 0.9435736677115988,
+ 0.9467084639498433,
+ 0.9749216300940439,
+ 0.9655172413793104,
+ ]
+ scores = [v for k, v in res[2]["predictive_accuracy"][0].items()]
+ np.testing.assert_array_almost_equal(
+ scores, expected_scores, decimal=2 if os.name == "nt" else 7
+ )
+
+ @unittest.skipIf(
+ LooseVersion(sklearn.__version__) < "0.21",
+ reason="couldn't perform local tests successfully w/o bloating RAM",
+ )
+ @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs")
+ def test_joblib_backends(self, parallel_mock):
+ """ Tests evaluation of a run using various joblib backends and n_jobs. """
+ task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp
+ x, y = task.get_X_and_y(dataset_format="dataframe")
+ num_instances = x.shape[0]
+ line_length = 6 + len(task.class_labels)
+
+ backend_choice = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing"
+ for n_jobs, backend, len_time_stats, call_count in [
+ (1, backend_choice, 7, 10),
+ (2, backend_choice, 4, 10),
+ (-1, backend_choice, 1, 10),
+ (1, "threading", 7, 20),
+ (-1, "threading", 1, 30),
+ (1, "sequential", 7, 40),
+ ]:
+ clf = sklearn.model_selection.RandomizedSearchCV(
+ estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5),
+ param_distributions={
+ "max_depth": [3, None],
+ "max_features": [1, 2, 3, 4],
+ "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "bootstrap": [True, False],
+ "criterion": ["gini", "entropy"],
+ },
+ random_state=1,
+ cv=sklearn.model_selection.StratifiedKFold(
+ n_splits=2, shuffle=True, random_state=1
+ ),
+ n_iter=5,
+ n_jobs=n_jobs,
+ )
+ with parallel_backend(backend, n_jobs=n_jobs):
+ res = openml.runs.functions._run_task_get_arffcontent(
+ extension=self.extension,
+ model=clf,
+ task=task,
+ add_local_measures=True,
+ dataset_format="array", # "dataframe" would require handling of categoricals
+ n_jobs=n_jobs,
+ )
+ self.assertEqual(type(res[0]), list)
+ self.assertEqual(len(res[0]), num_instances)
+ self.assertEqual(len(res[0][0]), line_length)
+ # usercpu_time_millis_* not recorded when n_jobs > 1
+ # *_time_millis_* not recorded when n_jobs = -1
+ self.assertEqual(len(res[2]), len_time_stats)
+ self.assertEqual(len(res[3]), len_time_stats)
+ self.assertEqual(len(res[2]["predictive_accuracy"][0]), 10)
+ self.assertEqual(len(res[3]["predictive_accuracy"][0]), 10)
+ self.assertEqual(parallel_mock.call_count, call_count)
diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py
index e89318728..538b08821 100644
--- a/tests/test_setups/test_setup_functions.py
+++ b/tests/test_setups/test_setup_functions.py
@@ -67,7 +67,7 @@ def _existing_setup_exists(self, classif):
self.assertFalse(setup_id)
# now run the flow on an easy task:
- task = openml.tasks.get_task(115) # diabetes
+ task = openml.tasks.get_task(115) # diabetes; crossvalidation
run = openml.runs.run_flow_on_task(flow, task)
# spoof flow id, otherwise the sentinel is ignored
run.flow_id = flow.flow_id
diff --git a/tests/test_study/__init__.py b/tests/test_study/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py
index fdb2747ec..682359a61 100644
--- a/tests/test_study/test_study_examples.py
+++ b/tests/test_study/test_study_examples.py
@@ -1,6 +1,7 @@
# License: BSD 3-Clause
-from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont
+from openml.testing import TestBase
+from openml.extensions.sklearn import cat, cont
import sklearn
import unittest
@@ -12,8 +13,8 @@ class TestStudyFunctions(TestBase):
"""Test the example code of Bischl et al. (2018)"""
@unittest.skipIf(
- LooseVersion(sklearn.__version__) < "0.20",
- reason="columntransformer introduction in 0.20.0",
+ LooseVersion(sklearn.__version__) < "0.24",
+ reason="columntransformer introduction in 0.24.0",
)
def test_Figure1a(self):
"""Test listing in Figure 1a on a single task and the old OpenML100 study.
@@ -38,15 +39,14 @@ def test_Figure1a(self):
import openml
import sklearn.metrics
import sklearn.tree
+ from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite
- cat_imp = make_pipeline(
- SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
- )
- cont_imp = make_pipeline(CustomImputer(), StandardScaler())
+ cat_imp = OneHotEncoder(handle_unknown="ignore")
+ cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
clf = Pipeline(
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
index 993771c90..e028ba2bd 100644
--- a/tests/test_study/test_study_functions.py
+++ b/tests/test_study/test_study_functions.py
@@ -4,6 +4,7 @@
import openml.study
from openml.testing import TestBase
import pandas as pd
+import pytest
class TestStudyFunctions(TestBase):
@@ -113,6 +114,7 @@ def test_publish_benchmark_suite(self):
self.assertEqual(study_downloaded.status, "deactivated")
# can't delete study, now it's not longer in preparation
+ @pytest.mark.flaky()
def test_publish_study(self):
# get some random runs to attach
run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10)
@@ -133,8 +135,8 @@ def test_publish_study(self):
run_ids=list(run_list.keys()),
)
study.publish()
- # not tracking upload for delete since _delete_entity called end of function
- # asserting return status from openml.study.delete_study()
+ TestBase._mark_entity_for_removal("study", study.id)
+ TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id))
self.assertGreater(study.id, 0)
study_downloaded = openml.study.get_study(study.id)
self.assertEqual(study_downloaded.alias, fixt_alias)
@@ -213,9 +215,8 @@ def test_study_attach_illegal(self):
def test_study_list(self):
study_list = openml.study.list_studies(status="in_preparation")
# might fail if server is recently resetted
- self.assertGreater(len(study_list), 2)
+ self.assertGreaterEqual(len(study_list), 2)
def test_study_list_output_format(self):
study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe")
self.assertIsInstance(study_list, pd.DataFrame)
- self.assertGreater(len(study_list), 2)
diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py
index 4f03f8bff..c4f74c5ce 100644
--- a/tests/test_tasks/test_classification_task.py
+++ b/tests/test_tasks/test_classification_task.py
@@ -13,7 +13,7 @@ class OpenMLClassificationTaskTest(OpenMLSupervisedTaskTest):
def setUp(self, n_levels: int = 1):
super(OpenMLClassificationTaskTest, self).setUp()
- self.task_id = 119
+ self.task_id = 119 # diabetes
self.task_type = TaskType.SUPERVISED_CLASSIFICATION
self.estimation_procedure = 1
diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py
index 9f0157187..b1422d308 100644
--- a/tests/test_tasks/test_learning_curve_task.py
+++ b/tests/test_tasks/test_learning_curve_task.py
@@ -13,7 +13,7 @@ class OpenMLLearningCurveTaskTest(OpenMLSupervisedTaskTest):
def setUp(self, n_levels: int = 1):
super(OpenMLLearningCurveTaskTest, self).setUp()
- self.task_id = 801
+ self.task_id = 801 # diabetes
self.task_type = TaskType.LEARNING_CURVE
self.estimation_procedure = 13
diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py
index e751e63b5..c38d8fa91 100644
--- a/tests/test_tasks/test_regression_task.py
+++ b/tests/test_tasks/test_regression_task.py
@@ -1,8 +1,13 @@
# License: BSD 3-Clause
+import ast
import numpy as np
+import openml
from openml.tasks import TaskType
+from openml.testing import TestBase
+from openml.testing import check_task_existence
+from openml.exceptions import OpenMLServerException
from .test_supervised_task import OpenMLSupervisedTaskTest
@@ -11,9 +16,34 @@ class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest):
__test__ = True
def setUp(self, n_levels: int = 1):
-
super(OpenMLRegressionTaskTest, self).setUp()
- self.task_id = 625
+
+ task_meta_data = {
+ "task_type": TaskType.SUPERVISED_REGRESSION,
+ "dataset_id": 105, # wisconsin
+ "estimation_procedure_id": 7,
+ "target_name": "time",
+ }
+ _task_id = check_task_existence(**task_meta_data)
+ if _task_id is not None:
+ task_id = _task_id
+ else:
+ new_task = openml.tasks.create_task(**task_meta_data)
+ # publishes the new task
+ try:
+ new_task = new_task.publish()
+ task_id = new_task.task_id
+ # mark to remove the uploaded task
+ TestBase._mark_entity_for_removal("task", task_id)
+ TestBase.logger.info("collected from test_run_functions: {}".format(task_id))
+ except OpenMLServerException as e:
+ if e.code == 614: # Task already exists
+ # the exception message contains the task_id that was matched in the format
+ # 'Task already exists. - matched id(s): [xxxx]'
+ task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0]
+ else:
+ raise Exception(repr(e))
+ self.task_id = task_id
self.task_type = TaskType.SUPERVISED_REGRESSION
self.estimation_procedure = 7
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index 5f9b65495..418b21b65 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -66,7 +66,7 @@ def _check_task(self, task):
self.assertIn(task["status"], ["in_preparation", "active", "deactivated"])
def test_list_tasks_by_type(self):
- num_curves_tasks = 200 # number is flexible, check server if fails
+ num_curves_tasks = 198 # number is flexible, check server if fails
ttid = TaskType.LEARNING_CURVE
tasks = openml.tasks.list_tasks(task_type=ttid)
self.assertGreaterEqual(len(tasks), num_curves_tasks)
@@ -110,7 +110,7 @@ def test_list_tasks_paginate(self):
self._check_task(tasks[tid])
def test_list_tasks_per_type_paginate(self):
- size = 10
+ size = 40
max = 100
task_types = [
TaskType.SUPERVISED_CLASSIFICATION,
@@ -139,7 +139,7 @@ def test__get_task_live(self):
openml.tasks.get_task(34536)
def test_get_task(self):
- task = openml.tasks.get_task(1)
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
self.assertIsInstance(task, OpenMLTask)
self.assertTrue(
os.path.exists(
@@ -158,7 +158,7 @@ def test_get_task(self):
)
def test_get_task_lazy(self):
- task = openml.tasks.get_task(2, download_data=False)
+ task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation
self.assertIsInstance(task, OpenMLTask)
self.assertTrue(
os.path.exists(
@@ -198,7 +198,7 @@ def assert_and_raise(*args, **kwargs):
get_dataset.side_effect = assert_and_raise
try:
- openml.tasks.get_task(1)
+ openml.tasks.get_task(1) # anneal; crossvalidation
except WeirdException:
pass
# Now the file should no longer exist
@@ -219,7 +219,7 @@ def test_get_task_different_types(self):
openml.tasks.functions.get_task(126033)
def test_download_split(self):
- task = openml.tasks.get_task(1)
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
split = task.download_split()
self.assertEqual(type(split), OpenMLSplit)
self.assertTrue(
diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py
index 137e29fe4..9878feb96 100644
--- a/tests/test_tasks/test_task_methods.py
+++ b/tests/test_tasks/test_task_methods.py
@@ -15,7 +15,7 @@ def tearDown(self):
super(OpenMLTaskMethodsTest, self).tearDown()
def test_tagging(self):
- task = openml.tasks.get_task(1)
+ task = openml.tasks.get_task(1) # anneal; crossvalidation
tag = "testing_tag_{}_{}".format(self.id(), time())
task_list = openml.tasks.list_tasks(tag=tag)
self.assertEqual(len(task_list), 0)
@@ -40,9 +40,9 @@ def test_get_train_and_test_split_indices(self):
self.assertEqual(681, train_indices[-1])
self.assertEqual(583, test_indices[0])
self.assertEqual(24, test_indices[-1])
- self.assertRaisesRegexp(
+ self.assertRaisesRegex(
ValueError, "Fold 10 not known", task.get_train_test_split_indices, 10, 0
)
- self.assertRaisesRegexp(
+ self.assertRaisesRegex(
ValueError, "Repeat 10 not known", task.get_train_test_split_indices, 0, 10
)
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 9729100bb..4fa08e1ab 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -1,17 +1,15 @@
-from openml.testing import TestBase
+import os
+import tempfile
+import unittest.mock
+
import numpy as np
-import openml
-import sys
-if sys.version_info[0] >= 3:
- from unittest import mock
-else:
- import mock
+import openml
+from openml.testing import TestBase
class OpenMLTaskTest(TestBase):
_multiprocess_can_split_ = True
- _batch_size = 25
def mocked_perform_api_call(call, request_method):
# TODO: JvR: Why is this not a staticmethod?
@@ -21,7 +19,7 @@ def mocked_perform_api_call(call, request_method):
def test_list_all(self):
openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)
- @mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call)
+ @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call)
def test_list_all_few_results_available(self, _perform_api_call):
# we want to make sure that the number of api calls is only 1.
# Although we have multiple versions of the iris dataset, there is only
@@ -33,7 +31,7 @@ def test_list_all_few_results_available(self, _perform_api_call):
def test_list_all_for_datasets(self):
required_size = 127 # default test server reset value
- datasets = openml.datasets.list_datasets(batch_size=self._batch_size, size=required_size)
+ datasets = openml.datasets.list_datasets(batch_size=100, size=required_size)
self.assertEqual(len(datasets), required_size)
for did in datasets:
@@ -53,13 +51,13 @@ def test_list_datasets_with_high_size_parameter(self):
def test_list_all_for_tasks(self):
required_size = 1068 # default test server reset value
- tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size)
+ tasks = openml.tasks.list_tasks(batch_size=1000, size=required_size)
self.assertEqual(len(tasks), required_size)
def test_list_all_for_flows(self):
required_size = 15 # default test server reset value
- flows = openml.flows.list_flows(batch_size=self._batch_size, size=required_size)
+ flows = openml.flows.list_flows(batch_size=25, size=required_size)
self.assertEqual(len(flows), required_size)
@@ -73,7 +71,7 @@ def test_list_all_for_setups(self):
def test_list_all_for_runs(self):
required_size = 21
- runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size)
+ runs = openml.runs.list_runs(batch_size=25, size=required_size)
# might not be on test server after reset, please rerun test at least once if fails
self.assertEqual(len(runs), required_size)
@@ -87,3 +85,19 @@ def test_list_all_for_evaluations(self):
# might not be on test server after reset, please rerun test at least once if fails
self.assertEqual(len(evaluations), required_size)
+
+ @unittest.mock.patch("openml.config.get_cache_directory")
+ @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+ def test__create_cache_directory(self, config_mock):
+ with tempfile.TemporaryDirectory(dir=self.workdir) as td:
+ config_mock.return_value = td
+ openml.utils._create_cache_directory("abc")
+ self.assertTrue(os.path.exists(os.path.join(td, "abc")))
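+ # A cache root that cannot be written to must surface as an OpenMLCacheException
+ # rather than an unhandled OSError.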
+ subdir = os.path.join(td, "def")
+ os.mkdir(subdir)
+ os.chmod(subdir, 0o444)
+ config_mock.return_value = subdir
+ with self.assertRaisesRegex(
+ openml.exceptions.OpenMLCacheException, r"Cannot create cache directory",
+ ):
+ openml.utils._create_cache_directory("ghi")