diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 000000000..3e16fe084 --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,36 @@ +{ + "files": [ + "README.md" + ], + "imageSize": 100, + "commit": false, + "contributors": [ + { + "login": "a-moadel", + "name": "a-moadel", + "avatar_url": "https://avatars0.githubusercontent.com/u/46557866?v=4", + "profile": "https://github.com/a-moadel", + "contributions": [ + "doc", + "example" + ] + }, + { + "login": "Neeratyoy", + "name": "Neeratyoy Mallik", + "avatar_url": "https://avatars2.githubusercontent.com/u/3191233?v=4", + "profile": "https://github.com/Neeratyoy", + "contributions": [ + "code", + "doc", + "example" + ] + } + ], + "contributorsPerLine": 7, + "projectName": "openml-python", + "projectOwner": "openml", + "repoType": "github", + "repoHost": "https://github.com", + "skipCi": true +} diff --git a/.github/workflows/dist.yaml b/.github/workflows/dist.yaml new file mode 100644 index 000000000..51ffe03d5 --- /dev/null +++ b/.github/workflows/dist.yaml @@ -0,0 +1,30 @@ +name: dist-check + +on: [push, pull_request] + +jobs: + dist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Build dist + run: | + python setup.py sdist + - name: Twine check + run: | + pip install twine + last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1) + twine check $last_dist + - name: Install dist + run: | + last_dist=$(ls -t dist/openml-*.tar.gz | head -n 1) + pip install $last_dist + - name: PEP 561 Compliance + run: | + pip install mypy + cd .. # required to use the installed version of openml + if ! python -m mypy -c "import openml"; then exit 1; fi diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml new file mode 100644 index 000000000..2219c7fac --- /dev/null +++ b/.github/workflows/docs.yaml @@ -0,0 +1,43 @@ +name: Docs +on: [pull_request, push] + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: Install dependencies + run: | + pip install -e .[docs,examples,examples_unix] + - name: Make docs + run: | + cd doc + make html + - name: Pull latest gh-pages + if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' + run: | + cd .. 
+ git clone https://github.com/openml/openml-python.git --branch gh-pages --single-branch gh-pages + - name: Copy new doc into gh-pages + if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' + run: | + branch_name=${GITHUB_REF##*/} + cd ../gh-pages + rm -rf $branch_name + cp -r ../openml-python/doc/build/html $branch_name + - name: Push to gh-pages + if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' + run: | + last_commit=$(git log --pretty=format:"%an: %s") + cd ../gh-pages + branch_name=${GITHUB_REF##*/} + git add $branch_name/ + git config --global user.name 'Github Actions' + git config --global user.email 'not@mail.com' + git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }} + git commit -am "$last_commit" + git push diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 000000000..6132b2de2 --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,20 @@ +name: pre-commit + +on: [push] + +jobs: + run-all-files: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Setup Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install pre-commit + run: | + pip install pre-commit + pre-commit install + - name: Run pre-commit + run: | + pre-commit run --all-files diff --git a/.github/workflows/ubuntu-test.yml b/.github/workflows/ubuntu-test.yml new file mode 100644 index 000000000..41cc155ac --- /dev/null +++ b/.github/workflows/ubuntu-test.yml @@ -0,0 +1,74 @@ +name: Tests + +on: [push, pull_request] + +jobs: + ubuntu: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24] + exclude: # no scikit-learn 0.21.2 release for Python 3.8 + - python-version: 3.8 + scikit-learn: 0.21.2 + include: + - python-version: 3.6 + scikit-learn: 0.18.2 + scipy: 1.2.0 + - python-version: 3.6 + scikit-learn: 0.19.2 + - python-version: 3.6 + scikit-learn: 0.20.2 + - python-version: 3.8 + scikit-learn: 0.23.1 + code-cov: true + fail-fast: false + max-parallel: 4 + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install test dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[test] + - name: Install scikit-learn ${{ matrix.scikit-learn }} + run: | + pip install scikit-learn==${{ matrix.scikit-learn }} + - name: Install scipy ${{ matrix.scipy }} + if: ${{ matrix.scipy }} + run: | + pip install scipy==${{ matrix.scipy }} + - name: Store repository status + id: status-before + run: | + echo "::set-output name=BEFORE::$(git status --porcelain -b)" + - name: Run tests + run: | + if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi + pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread --dist load -sv $codecov --reruns 5 --reruns-delay 1 + - name: Check for files left behind by test + if: ${{ always() }} + run: | + before="${{ steps.status-before.outputs.BEFORE }}" + after="$(git status --porcelain -b)" + if [[ "$before" != "$after" ]]; then + echo "git status from before: $before" + echo "git status from after: $after" + echo "Not all generated files have been deleted!" 
+ exit 1 + fi + - name: Upload coverage + if: matrix.code-cov && always() + uses: codecov/codecov-action@v1 + with: + files: coverage.xml + fail_ci_if_error: true + verbose: true \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9fd33403c..000000000 --- a/.travis.yml +++ /dev/null @@ -1,58 +0,0 @@ -language: python - -sudo: false - -cache: - apt: true - # We use three different cache directory - # to work around a Travis bug with multi-platform cache - directories: - - $HOME/.cache/pip - - $HOME/download -env: - global: - # Directory where tests are run from - - TEST_DIR=/tmp/test_dir/ - - MODULE=openml - matrix: - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" COVERAGE="true" DOCPUSH="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.2" - # Checks for older scikit-learn versions (which also don't nicely work with - # Python3.7) - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0 - -# Travis issue -# https://github.com/travis-ci/travis-ci/issues/8920 -before_install: - - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)" - -install: source ci_scripts/install.sh -script: bash ci_scripts/test.sh -after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result" - -# travis will check the deploy on condition, before actually running before_deploy -# before_deploy: source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result" - -# For more info regarding the deploy process and the github token look at: -# https://docs.travis-ci.com/user/deployment/pages/ - -deploy: - provider: pages - skip_cleanup: true - github_token: $GITHUB_TOKEN - keep-history: true - committer-from-gh: true - on: - all_branches: true - condition: $doc_result = "success" - local_dir: doc/$TRAVIS_BRANCH diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6b7cffad3..6fe4fd605 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -260,14 +260,9 @@ The resulting HTML files will be placed in ``build/html/`` and are viewable in a web browser. See the ``README`` file in the ``doc/`` directory for more information. -For building the documentation, you will need -[sphinx](http://sphinx.pocoo.org/), -[sphinx-bootstrap-theme](https://ryan-roemer.github.io/sphinx-bootstrap-theme/), -[sphinx-gallery](https://sphinx-gallery.github.io/) -and -[numpydoc](https://numpydoc.readthedocs.io/en/latest/). 
+For building the documentation, you will need to install a few additional dependencies: ```bash -$ pip install sphinx sphinx-bootstrap-theme sphinx-gallery numpydoc +$ pip install -e .[docs] ``` When dependencies are installed, run ```bash diff --git a/README.md b/README.md index 732085697..55bab368d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ # OpenML-Python + +[![All Contributors](https://img.shields.io/badge/all_contributors-2-orange.svg?style=flat-square)](#contributors-) + A python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning. It can be used to download or upload OpenML data such as datasets and machine learning experiment results. @@ -40,3 +43,23 @@ Bibtex entry: year = {2019}, } ``` + +## Contributors ✨ + +Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): + + + + + + + + + +

+a-moadel: 📖 💡
+Neeratyoy Mallik: 💻 📖 💡
+ + + + + +This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! \ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml index 151a5e3f7..e3fa74aaf 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -45,4 +45,4 @@ build: false test_script: - "cd C:\\projects\\openml-python" - - "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread -sv" + - "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread --dist load -sv" diff --git a/ci_scripts/create_doc.sh b/ci_scripts/create_doc.sh deleted file mode 100644 index 83afaa26b..000000000 --- a/ci_scripts/create_doc.sh +++ /dev/null @@ -1,61 +0,0 @@ -# License: BSD 3-Clause - -set -euo pipefail - -# Check if DOCPUSH is set -if ! [[ -z ${DOCPUSH+x} ]]; then - - if [[ "$DOCPUSH" == "true" ]]; then - - # install documentation building dependencies - pip install matplotlib seaborn sphinx pillow sphinx-gallery sphinx_bootstrap_theme cython numpydoc nbformat nbconvert - - # $1 is the branch name - # $2 is the global variable where we set the script status - - if ! { [ $1 = "master" ] || [ $1 = "develop" ]; }; then - { echo "Not one of the allowed branches"; exit 0; } - fi - - # delete any previous documentation folder - if [ -d doc/$1 ]; then - rm -rf doc/$1 - fi - - # create the documentation - cd doc && make html 2>&1 - - # create directory with branch name - # the documentation for dev/stable from git will be stored here - mkdir $1 - - # get previous documentation from github - git clone https://github.com/openml/openml-python.git --branch gh-pages --single-branch - - # copy previous documentation - cp -r openml-python/. $1 - rm -rf openml-python - - # if the documentation for the branch exists, remove it - if [ -d $1/$1 ]; then - rm -rf $1/$1 - fi - - # copy the updated documentation for this branch - mkdir $1/$1 - cp -r build/html/. $1/$1 - - # takes a variable name as an argument and assigns the script outcome to a - # variable with the given name. If it got this far, the script was successful - function set_return() { - # $1 is the variable where we save the script outcome - local __result=$1 - local status='success' - eval $__result="'$status'" - } - - set_return "$2" - fi -fi -# Workaround for travis failure -set +u diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh deleted file mode 100755 index 67530af53..000000000 --- a/ci_scripts/install.sh +++ /dev/null @@ -1,81 +0,0 @@ -# License: BSD 3-Clause - -set -e - -# Deactivate the travis-provided virtual environment and setup a -# conda-based environment instead -deactivate - -# Use the miniconda installer for faster download / install of conda -# itself -pushd . -cd -mkdir -p download -cd download -echo "Cached in $HOME/download :" -ls -l -echo -if [[ ! -f miniconda.sh ]] - then - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - -O miniconda.sh - fi -chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda -cd .. 
-export PATH=/home/travis/miniconda/bin:$PATH -conda update --yes conda -popd - -# Configure the conda environment and put it in the path using the -# provided versions -conda create -n testenv --yes python=$PYTHON_VERSION pip -source activate testenv - -if [[ -v SCIPY_VERSION ]]; then - conda install --yes scipy=$SCIPY_VERSION -fi -python --version - -if [[ "$TEST_DIST" == "true" ]]; then - pip install twine nbconvert jupyter_client matplotlib pyarrow pytest pytest-xdist pytest-timeout \ - nbformat oslo.concurrency flaky mypy - python setup.py sdist - # Find file which was modified last as done in https://stackoverflow.com/a/4561987 - dist=`find dist -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" "` - echo "Installing $dist" - pip install "$dist" - twine check "$dist" -else - pip install -e '.[test]' -fi - -python -c "import numpy; print('numpy %s' % numpy.__version__)" -python -c "import scipy; print('scipy %s' % scipy.__version__)" - - -if [[ "$DOCPUSH" == "true" ]]; then - conda install --yes gxx_linux-64 gcc_linux-64 swig - pip install -e '.[examples,examples_unix]' -fi -if [[ "$COVERAGE" == "true" ]]; then - pip install codecov pytest-cov -fi -if [[ "$RUN_FLAKE8" == "true" ]]; then - pip install pre-commit - pre-commit install -fi - -# PEP 561 compliance check -# Assumes mypy relies solely on the PEP 561 standard -if ! python -m mypy -c "import openml"; then - echo "Failed: PEP 561 compliance" - exit 1 -else - echo "Success: PEP 561 compliant" -fi - -# Install scikit-learn last to make sure the openml package installation works -# from a clean environment without scikit-learn. -pip install scikit-learn==$SKLEARN_VERSION - -conda list diff --git a/ci_scripts/success.sh b/ci_scripts/success.sh deleted file mode 100644 index dad97d54e..000000000 --- a/ci_scripts/success.sh +++ /dev/null @@ -1,15 +0,0 @@ -# License: BSD 3-Clause - -set -e - -if [[ "$COVERAGE" == "true" ]]; then - # Need to run coveralls from a git checkout, so we copy .coverage - # from TEST_DIR where pytest has been run - cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR - cd $TRAVIS_BUILD_DIR - # Ignore coveralls failures as the coveralls server is not - # very reliable but we don't want travis to report a failure - # in the github UI just because the coverage report failed to - # be published. 
- codecov || echo "Codecov upload failed" -fi diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh deleted file mode 100644 index 0a1f94df6..000000000 --- a/ci_scripts/test.sh +++ /dev/null @@ -1,48 +0,0 @@ -# License: BSD 3-Clause - -set -e - -# check status and branch before running the unit tests -before="`git status --porcelain -b`" -before="$before" -# storing current working directory -curr_dir=`pwd` - -run_tests() { - # Get into a temp directory to run test from the installed scikit learn and - # check if we do not leave artifacts - mkdir -p $TEST_DIR - - cwd=`pwd` - test_dir=$cwd/tests - - cd $TEST_DIR - - if [[ "$COVERAGE" == "true" ]]; then - PYTEST_ARGS='--cov=openml' - else - PYTEST_ARGS='' - fi - - pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv $PYTEST_ARGS $test_dir -} - -if [[ "$RUN_FLAKE8" == "true" ]]; then - pre-commit run --all-files -fi - -if [[ "$SKIP_TESTS" != "true" ]]; then - run_tests -fi - -# changing directory to stored working directory -cd $curr_dir -# check status and branch after running the unit tests -# compares with $before to check for remaining files -after="`git status --porcelain -b`" -if [[ "$before" != "$after" ]]; then - echo 'git status from before: '$before - echo 'git status from after: '$after - echo "All generated files have not been deleted!" - exit 1 -fi diff --git a/doc/conf.py b/doc/conf.py index 9c4606143..e5de2d551 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -64,10 +64,8 @@ master_doc = "index" # General information about the project. -project = u"OpenML" -copyright = u"2014-{}, the OpenML-Python team.".format( - time.strftime("%Y,%m,%d,%H,%M,%S").split(",")[0] -) +project = "OpenML" +copyright = f"2014-{time.localtime().tm_year}, the OpenML-Python team" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -263,7 +261,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ("index", "OpenML.tex", u"OpenML Documentation", u"Matthias Feurer", "manual"), + ("index", "OpenML.tex", "OpenML Documentation", "Matthias Feurer", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -291,7 +289,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [("index", "openml", u"OpenML Documentation", [u"Matthias Feurer"], 1)] +man_pages = [("index", "openml", "OpenML Documentation", ["Matthias Feurer"], 1)] # If true, show URL addresses after external links. # man_show_urls = False @@ -306,8 +304,8 @@ ( "index", "OpenML", - u"OpenML Documentation", - u"Matthias Feurer", + "OpenML Documentation", + "Matthias Feurer", "OpenML", "One line description of project.", "Miscellaneous", diff --git a/doc/index.rst b/doc/index.rst index 789979023..e38e4d877 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -32,7 +32,7 @@ Example ) # Download the OpenML task for the german credit card dataset with 10-fold # cross-validation. - task = openml.tasks.get_task(31) + task = openml.tasks.get_task(32) # Run the scikit-learn model on the task. run = openml.runs.run_model_on_task(clf, task) # Publish the experiment on OpenML (optional, requires an API key. 
diff --git a/doc/progress.rst b/doc/progress.rst index 1956fcb42..1ca1e1d0e 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,6 +6,35 @@ Changelog ========= +0.11.1 +~~~~~~ +* ADD #964: Validate ``ignore_attribute``, ``default_target_attribute``, ``row_id_attribute`` are set to attributes that exist on the dataset when calling ``create_dataset``. +* ADD #979: Dataset features and qualities are now also cached in pickle format. +* ADD #982: Add helper functions for column transformers. +* ADD #989: ``run_model_on_task`` will now warn the user that the model passed has already been fitted. +* ADD #1009: Add the option to not download the dataset qualities. The cached version is used even if the download attribute is False. +* ADD #1016: Add scikit-learn 0.24 support. +* ADD #1020: Add option to parallelize evaluation of tasks with joblib. +* ADD #1022: Allow minimum version of dependencies to be listed for a flow, use more accurate minimum versions for scikit-learn dependencies. +* ADD #1023: Add admin-only calls for adding topics to datasets. +* ADD #1029: Add support for fetching a dataset from a MinIO server in parquet format. +* ADD #1031: Generally improve runtime measurements, add them for some previously unsupported flows (e.g. BaseSearchCV derived flows). +* DOC #973: Change the task used in the welcome page example so it no longer fails when using a numerical dataset. +* MAINT #671: Improved the performance of ``check_datasets_active`` by only querying the given list of datasets in contrast to querying all datasets. Modified the corresponding unit test. +* MAINT #891: Changed the way that numerical features are stored. Numerical features that range from 0 to 255 are now stored as uint8, which reduces the storage space required as well as storing and loading times. +* MAINT #975, #988: Add CI through GitHub Actions. +* MAINT #977: Allow ``short`` and ``long`` scenarios for unit tests. Reduce the workload for some unit tests. +* MAINT #985, #1000: Improve unit test stability and output readability, and add load balancing. +* MAINT #1018: Refactor data loading and storage. Data is now compressed on the first call to ``get_data``. +* MAINT #1024: Remove flaky decorator for study unit test. +* FIX #883 #884 #906 #972: Various improvements to the caching system. +* FIX #980: Speed up ``check_datasets_active``. +* FIX #984: Add a retry mechanism when the server encounters a database issue. +* FIX #1004: Fixed an issue that prevented installation on some systems (e.g. Ubuntu). +* FIX #1013: Fixes a bug where ``OpenMLRun.setup_string`` was not uploaded to the server, prepares for ``run_details`` being sent from the server. +* FIX #1021: Fixes an issue that could occur when running unit tests and openml-python was not in PATH. +* FIX #1037: Fixes a bug where a dataset could not be loaded if a nan-like value was listed as a possible category of a categorical feature. + 0.11.0 ~~~~~~ * ADD #753: Allows uploading custom flows to OpenML via OpenML-Python. diff --git a/examples/30_extended/custom_flow_tutorial.py b/examples/30_extended/custom_flow_.py similarity index 98% rename from examples/30_extended/custom_flow_tutorial.py rename to examples/30_extended/custom_flow_.py index 3b918e108..02aef9c5c 100644 --- a/examples/30_extended/custom_flow_tutorial.py +++ b/examples/30_extended/custom_flow_.py @@ -82,10 +82,10 @@ # This allows people to specify auto-sklearn hyperparameters used in this flow. # In general, using a subflow is not required.
# -# Note: flow 15275 is not actually the right flow on the test server, +# Note: flow 9313 is not actually the right flow on the test server, # but that does not matter for this demonstration. -autosklearn_flow = openml.flows.get_flow(15275) # auto-sklearn 0.5.1 +autosklearn_flow = openml.flows.get_flow(9313) # auto-sklearn 0.5.1 subflow = dict(components=OrderedDict(automl_tool=autosklearn_flow),) #################################################################################################### @@ -120,7 +120,7 @@ OrderedDict([("oml:name", "time"), ("oml:value", 120), ("oml:component", flow_id)]), ] -task_id = 1408 # Iris Task +task_id = 1965 # Iris Task task = openml.tasks.get_task(task_id) dataset_id = task.get_dataset().dataset_id diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py index 594a58930..7a51cce70 100644 --- a/examples/30_extended/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -112,7 +112,7 @@ ############################################################################ # Edit a created dataset -# ================================================= +# ====================== # This example uses the test server, to avoid editing a dataset on the main server. openml.config.start_using_configuration_for_example() ############################################################################ @@ -143,18 +143,23 @@ # tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you, # configure the API key: # openml.config.apikey = 'FILL_IN_OPENML_API_KEY' -data_id = edit_dataset(564, default_target_attribute="y") -print(f"Edited dataset ID: {data_id}") - +# This example here only shows a failure when trying to work on a dataset not owned by you: +try: + data_id = edit_dataset(1, default_target_attribute="shape") +except openml.exceptions.OpenMLServerException as e: + print(e) ############################################################################ # Fork dataset +# ============ # Used to create a copy of the dataset with you as the owner. # Use this API only if you are unable to edit the critical fields (default_target_attribute, # ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API. # After the dataset is forked, you can edit the new version of the dataset using edit_dataset. -data_id = fork_dataset(564) +data_id = fork_dataset(1) +print(data_id) +data_id = edit_dataset(data_id, default_target_attribute="shape") print(f"Forked dataset ID: {data_id}") openml.config.stop_using_configuration_for_example() diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index 76eb2f219..9f8c89375 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -53,7 +53,7 @@ task = openml.tasks.get_task(403) # Build any classifier or pipeline -clf = tree.ExtraTreeClassifier() +clf = tree.DecisionTreeClassifier() # Run the flow run = openml.runs.run_model_on_task(clf, task) @@ -82,13 +82,14 @@ # ############################ # # When you need to handle 'dirty' data, build pipelines to model then automatically. 
-task = openml.tasks.get_task(1) -features = task.get_dataset().features -nominal_feature_indices = [ - i - for i in range(len(features)) - if features[i].name != task.target_name and features[i].data_type == "nominal" -] +# To demonstrate this using the dataset `credit-a `_ via +# `task `_ as it contains both numerical and categorical +# variables and missing values in both. +task = openml.tasks.get_task(96) + +# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines +from openml.extensions.sklearn import cat, cont + pipe = pipeline.Pipeline( steps=[ ( @@ -96,20 +97,15 @@ compose.ColumnTransformer( [ ( - "Nominal", - pipeline.Pipeline( - [ - ("Imputer", impute.SimpleImputer(strategy="most_frequent")), - ( - "Encoder", - preprocessing.OneHotEncoder( - sparse=False, handle_unknown="ignore", - ), - ), - ] - ), - nominal_feature_indices, + "categorical", + preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"), + cat, # returns the categorical feature indices ), + ( + "continuous", + impute.SimpleImputer(strategy="median"), + cont, + ), # returns the numeric feature indices ] ), ), @@ -121,6 +117,50 @@ myrun = run.publish() print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) + +# The above pipeline works with the helper functions that internally deal with pandas DataFrame. +# In the case, pandas is not available, or a NumPy based data processing is the requirement, the +# above pipeline is presented below to work with NumPy. + +# Extracting the indices of the categorical columns +features = task.get_dataset().features +categorical_feature_indices = [] +numeric_feature_indices = [] +for i in range(len(features)): + if features[i].name == task.target_name: + continue + if features[i].data_type == "nominal": + categorical_feature_indices.append(i) + else: + numeric_feature_indices.append(i) + +pipe = pipeline.Pipeline( + steps=[ + ( + "Preprocessing", + compose.ColumnTransformer( + [ + ( + "categorical", + preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"), + categorical_feature_indices, + ), + ( + "continuous", + impute.SimpleImputer(strategy="median"), + numeric_feature_indices, + ), + ] + ), + ), + ("Classifier", ensemble.RandomForestClassifier(n_estimators=10)), + ] +) + +run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, dataset_format="array") +myrun = run.publish() +print("Uploaded to http://test.openml.org/r/" + str(myrun.run_id)) + ############################################################################### # Running flows on tasks offline for later upload # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -132,7 +172,9 @@ task = openml.tasks.get_task(6) # The following lines can then be executed offline: -run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False) +run = openml.runs.run_model_on_task( + pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array", +) # The run may be stored offline, and the flow will be stored along with it: run.to_filesystem(directory="myrun") diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py index a46bf9699..8579d1d38 100644 --- a/examples/30_extended/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -34,14 +34,14 @@ import numpy as np import openml -import sklearn.ensemble -import sklearn.impute -import sklearn.preprocessing +from openml.extensions.sklearn import cat, cont + from sklearn.pipeline import 
make_pipeline, Pipeline from sklearn.compose import ColumnTransformer from sklearn.impute import SimpleImputer from sklearn.preprocessing import OneHotEncoder, FunctionTransformer -from sklearn.experimental import enable_hist_gradient_boosting +from sklearn.ensemble import RandomForestClassifier +from sklearn.decomposition import TruncatedSVD openml.config.start_using_configuration_for_example() @@ -58,37 +58,20 @@ # many potential hyperparameters. Of course, the model can be as complex and as # easy as you want it to be -from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.decomposition import TruncatedSVD - - -# Helper functions to return required columns for ColumnTransformer -def cont(X): - return X.dtypes != "category" - - -def cat(X): - return X.dtypes == "category" - -cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), - OneHotEncoder(handle_unknown="ignore", sparse=False), - TruncatedSVD(), -) -ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)]) -model_original = sklearn.pipeline.Pipeline( - steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),] -) +cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),) +cont_imp = SimpleImputer(strategy="median") +ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) +model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),]) # Let's change some hyperparameters. Of course, in any good application we # would tune them using, e.g., Random Search or Bayesian Optimization, but for # the purpose of this tutorial we set them to some specific values that might # or might not be optimal hyperparameters_original = { - "estimator__loss": "auto", - "estimator__learning_rate": 0.15, - "estimator__max_iter": 50, + "estimator__criterion": "gini", + "estimator__n_estimators": 50, + "estimator__max_depth": 10, "estimator__min_samples_leaf": 1, } model_original.set_params(**hyperparameters_original) diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py index c02a5c038..3c93a7e81 100644 --- a/examples/30_extended/study_tutorial.py +++ b/examples/30_extended/study_tutorial.py @@ -15,13 +15,7 @@ import uuid -import numpy as np -import sklearn.tree -from sklearn.pipeline import make_pipeline, Pipeline -from sklearn.compose import ColumnTransformer -from sklearn.impute import SimpleImputer -from sklearn.decomposition import TruncatedSVD -from sklearn.preprocessing import OneHotEncoder, FunctionTransformer +from sklearn.ensemble import RandomForestClassifier import openml @@ -71,45 +65,25 @@ ) print(evaluations.head()) -###########################################################from openml.testing import cat, cont################# +############################################################################ # Uploading studies # ================= # # Creating a study is as simple as creating any kind of other OpenML entity. # In this examples we'll create a few runs for the OpenML-100 benchmark # suite which is available on the OpenML test server. 
- openml.config.start_using_configuration_for_example() -# Model that can handle missing values -from sklearn.experimental import enable_hist_gradient_boosting -from sklearn.ensemble import HistGradientBoostingClassifier - - -# Helper functions to return required columns for ColumnTransformer -def cont(X): - return X.dtypes != "category" - - -def cat(X): - return X.dtypes == "category" +# Model to be used +clf = RandomForestClassifier() +# We'll create a study with one run on 3 datasets present in the suite +tasks = [115, 259, 307] -cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), - OneHotEncoder(handle_unknown="ignore", sparse=False), - TruncatedSVD(), -) -ct = ColumnTransformer( - [("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)] -) -clf = sklearn.pipeline.Pipeline( - steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),] -) - +# To verify suite = openml.study.get_suite(1) -# We'll create a study with one run on three random datasets each -tasks = np.random.choice(suite.tasks, size=3, replace=False) +print(all([t_id in suite.tasks for t_id in tasks])) + run_ids = [] for task_id in tasks: task = openml.tasks.get_task(task_id) diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py index c879e9fea..533f645b2 100644 --- a/examples/30_extended/task_manual_iteration_tutorial.py +++ b/examples/30_extended/task_manual_iteration_tutorial.py @@ -61,11 +61,11 @@ #################################################################################################### # And then split the data based on this: -X, y, _, _ = task.get_dataset().get_data(task.target_name) -X_train = X.loc[train_indices] -y_train = y[train_indices] -X_test = X.loc[test_indices] -y_test = y[test_indices] +X, y = task.get_X_and_y(dataset_format="dataframe") +X_train = X.iloc[train_indices] +y_train = y.iloc[train_indices] +X_test = X.iloc[test_indices] +y_test = y.iloc[test_indices] print( "X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}".format( @@ -78,6 +78,7 @@ task_id = 3 task = openml.tasks.get_task(task_id) +X, y = task.get_X_and_y(dataset_format="dataframe") n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( @@ -93,10 +94,10 @@ train_indices, test_indices = task.get_train_test_split_indices( repeat=repeat_idx, fold=fold_idx, sample=sample_idx, ) - X_train = X.loc[train_indices] - y_train = y[train_indices] - X_test = X.loc[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] print( "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, " @@ -116,6 +117,7 @@ task_id = 1767 task = openml.tasks.get_task(task_id) +X, y = task.get_X_and_y(dataset_format="dataframe") n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( @@ -131,10 +133,10 @@ train_indices, test_indices = task.get_train_test_split_indices( repeat=repeat_idx, fold=fold_idx, sample=sample_idx, ) - X_train = X.loc[train_indices] - y_train = y[train_indices] - X_test = X.loc[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] print( "Repeat #{}, fold #{}, samples 
{}: X_train.shape: {}, " @@ -154,6 +156,7 @@ task_id = 1702 task = openml.tasks.get_task(task_id) +X, y = task.get_X_and_y(dataset_format="dataframe") n_repeats, n_folds, n_samples = task.get_split_dimensions() print( "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format( @@ -169,10 +172,10 @@ train_indices, test_indices = task.get_train_test_split_indices( repeat=repeat_idx, fold=fold_idx, sample=sample_idx, ) - X_train = X.loc[train_indices] - y_train = y[train_indices] - X_test = X.loc[test_indices] - y_test = y[test_indices] + X_train = X.iloc[train_indices] + y_train = y.iloc[train_indices] + X_test = X.iloc[test_indices] + y_test = y.iloc[test_indices] print( "Repeat #{}, fold #{}, samples {}: X_train.shape: {}, " diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 60d212116..5ae339ae2 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"): cat_cols = list_categorical_attributes(flow_type=flow_type) num_cols = list(set(X.columns) - set(cat_cols)) -# Missing value imputers -cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None") +# Missing value imputers for numeric columns num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1) -# Creating the one-hot encoder +# Creating the one-hot encoder for numerical representation of categorical columns enc = OneHotEncoder(handle_unknown="ignore") -# Pipeline to handle categorical column transformations -cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)]) - # Combining column transformers -ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)]) +ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)]) # Creating the full pipeline with the surrogate model clf = RandomForestRegressor(n_estimators=50) diff --git a/openml/__version__.py b/openml/__version__.py index 07c9a950d..ff4effa59 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -3,4 +3,4 @@ # License: BSD 3-Clause # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.11.0" +__version__ = "0.12.0" diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 57599b912..aee67d8c6 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -3,9 +3,14 @@ import time import hashlib import logging +import pathlib import requests +import urllib.parse +import xml import xmltodict -from typing import Dict, Optional +from typing import Dict, Optional, Union + +import minio from . import config from .exceptions import ( @@ -55,7 +60,7 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): if file_elements is not None: if request_method != "post": raise ValueError("request method must be post when file elements are present") - response = __read_url_files(url, data=data, file_elements=file_elements) + response = _read_url_files(url, data=data, file_elements=file_elements) else: response = __read_url(url, request_method, data) @@ -67,6 +72,45 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): return response.text +def _download_minio_file( + source: str, destination: Union[str, pathlib.Path], exists_ok: bool = True, +) -> None: + """ Download file ``source`` from a MinIO Bucket and store it at ``destination``. 
+ + Parameters + ---------- + source : Union[str, pathlib.Path] + URL to a file in a MinIO bucket. + destination : str + Path to store the file to, if a directory is provided the original filename is used. + exists_ok : bool, optional (default=True) + If False, raise FileExists if a file already exists in ``destination``. + + """ + destination = pathlib.Path(destination) + parsed_url = urllib.parse.urlparse(source) + + # expect path format: /BUCKET/path/to/file.ext + bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) + if destination.is_dir(): + destination = pathlib.Path(destination, object_name) + if destination.is_file() and not exists_ok: + raise FileExistsError(f"File already exists in {destination}.") + + client = minio.Minio(endpoint=parsed_url.netloc, secure=False) + + try: + client.fget_object( + bucket_name=bucket, object_name=object_name, file_path=str(destination), + ) + except minio.error.S3Error as e: + if e.message.startswith("Object does not exist"): + raise FileNotFoundError(f"Object at '{source}' does not exist.") from e + # e.g. permission error, or a bucket does not exist (which is also interpreted as a + # permission error on minio level). + raise FileNotFoundError("Bucket does not exist or is private.") from e + + def _download_text_file( source: str, output_path: Optional[str] = None, @@ -105,21 +149,9 @@ def _download_text_file( logging.info("Starting [%s] request for the URL %s", "get", source) start = time.time() - response = __read_url(source, request_method="get") - __check_response(response, source, None) + response = __read_url(source, request_method="get", md5_checksum=md5_checksum) downloaded_file = response.text - if md5_checksum is not None: - md5 = hashlib.md5() - md5.update(downloaded_file.encode("utf-8")) - md5_checksum_download = md5.hexdigest() - if md5_checksum != md5_checksum_download: - raise OpenMLHashException( - "Checksum {} of downloaded file is unequal to the expected checksum {}.".format( - md5_checksum_download, md5_checksum - ) - ) - if output_path is None: logging.info( "%.7fs taken for [%s] request for the URL %s", time.time() - start, "get", source, @@ -138,15 +170,6 @@ def _download_text_file( return None -def __check_response(response, url, file_elements): - if response.status_code != 200: - raise __parse_server_exception(response, url, file_elements=file_elements) - elif ( - "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip" - ): - logging.warning("Received uncompressed content from OpenML for {}.".format(url)) - - def _file_id_to_url(file_id, filename=None): """ Presents the URL how to download a given file id @@ -159,7 +182,7 @@ def _file_id_to_url(file_id, filename=None): return url -def __read_url_files(url, data=None, file_elements=None): +def _read_url_files(url, data=None, file_elements=None): """do a post request to url with data and sending file_elements as files""" @@ -169,26 +192,37 @@ def __read_url_files(url, data=None, file_elements=None): file_elements = {} # Using requests.post sets header 'Accept-encoding' automatically to # 'gzip,deflate' - response = __send_request(request_method="post", url=url, data=data, files=file_elements,) + response = _send_request(request_method="post", url=url, data=data, files=file_elements,) return response -def __read_url(url, request_method, data=None): +def __read_url(url, request_method, data=None, md5_checksum=None): data = {} if data is None else data - if config.apikey is not None: + if config.apikey: data["api_key"] = 
config.apikey + return _send_request( + request_method=request_method, url=url, data=data, md5_checksum=md5_checksum + ) + + +def __is_checksum_equal(downloaded_file, md5_checksum=None): + if md5_checksum is None: + return True + md5 = hashlib.md5() + md5.update(downloaded_file.encode("utf-8")) + md5_checksum_download = md5.hexdigest() + if md5_checksum == md5_checksum_download: + return True + return False - return __send_request(request_method=request_method, url=url, data=data) +def _send_request(request_method, url, data, files=None, md5_checksum=None): + n_retries = max(1, min(config.connection_n_retries, config.max_retries)) -def __send_request( - request_method, url, data, files=None, -): - n_retries = config.connection_n_retries response = None with requests.Session() as session: # Start at one to have a non-zero multiplier for the sleep - for i in range(1, n_retries + 1): + for retry_counter in range(1, n_retries + 1): try: if request_method == "get": response = session.get(url, params=data) @@ -198,17 +232,51 @@ def __send_request( response = session.post(url, data=data, files=files) else: raise NotImplementedError() + __check_response(response=response, url=url, file_elements=files) + if request_method == "get" and not __is_checksum_equal(response.text, md5_checksum): + raise OpenMLHashException( + "Checksum of downloaded file is unequal to the expected checksum {} " + "when downloading {}.".format(md5_checksum, url) + ) break - except (requests.exceptions.ConnectionError, requests.exceptions.SSLError,) as e: - if i == n_retries: - raise e + except ( + requests.exceptions.ConnectionError, + requests.exceptions.SSLError, + OpenMLServerException, + xml.parsers.expat.ExpatError, + OpenMLHashException, + ) as e: + if isinstance(e, OpenMLServerException): + if e.code not in [107, 500]: + # 107: database connection error + # 500: internal server error + raise + elif isinstance(e, xml.parsers.expat.ExpatError): + if request_method != "get" or retry_counter >= n_retries: + raise OpenMLServerError( + "Unexpected server error when calling {}. Please contact the " + "developers!\nStatus code: {}\n{}".format( + url, response.status_code, response.text, + ) + ) + if retry_counter >= n_retries: + raise else: - time.sleep(0.1 * i) + time.sleep(retry_counter) if response is None: raise ValueError("This should never happen!") return response +def __check_response(response, url, file_elements): + if response.status_code != 200: + raise __parse_server_exception(response, url, file_elements=file_elements) + elif ( + "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip" + ): + logging.warning("Received uncompressed content from OpenML for {}.".format(url)) + + def __parse_server_exception( response: requests.Response, url: str, file_elements: Dict, ) -> OpenMLServerError: @@ -217,6 +285,8 @@ def __parse_server_exception( raise OpenMLServerError("URI too long! ({})".format(url)) try: server_exception = xmltodict.parse(response.text) + except xml.parsers.expat.ExpatError: + raise except Exception: # OpenML has a sophisticated error system # where information about failures is provided. 
try to parse this diff --git a/openml/config.py b/openml/config.py index 296b71663..9e2e697d5 100644 --- a/openml/config.py +++ b/openml/config.py @@ -7,6 +7,8 @@ import logging import logging.handlers import os +from pathlib import Path +import platform from typing import Tuple, cast from io import StringIO @@ -19,7 +21,7 @@ file_handler = None -def _create_log_handlers(): +def _create_log_handlers(create_file_handler=True): """ Creates but does not attach the log handlers. """ global console_handler, file_handler if console_handler is not None or file_handler is not None: @@ -32,12 +34,13 @@ def _create_log_handlers(): console_handler = logging.StreamHandler() console_handler.setFormatter(output_formatter) - one_mb = 2 ** 20 - log_path = os.path.join(cache_directory, "openml_python.log") - file_handler = logging.handlers.RotatingFileHandler( - log_path, maxBytes=one_mb, backupCount=1, delay=True - ) - file_handler.setFormatter(output_formatter) + if create_file_handler: + one_mb = 2 ** 20 + log_path = os.path.join(cache_directory, "openml_python.log") + file_handler = logging.handlers.RotatingFileHandler( + log_path, maxBytes=one_mb, backupCount=1, delay=True + ) + file_handler.setFormatter(output_formatter) def _convert_log_levels(log_level: int) -> Tuple[int, int]: @@ -83,14 +86,18 @@ def set_file_log_level(file_output_level: int): # Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards) _defaults = { - "apikey": None, + "apikey": "", "server": "https://www.openml.org/api/v1/xml", - "cachedir": os.path.expanduser(os.path.join("~", ".openml", "cache")), + "cachedir": ( + os.environ.get("XDG_CACHE_HOME", os.path.join("~", ".cache", "openml",)) + if platform.system() == "Linux" + else os.path.join("~", ".openml") + ), "avoid_duplicate_runs": "True", - "connection_n_retries": 2, + "connection_n_retries": "10", + "max_retries": "20", } -config_file = os.path.expanduser(os.path.join("~", ".openml", "config")) # Default values are actually added here in the _setup() function which is # called at the end of this module @@ -115,7 +122,8 @@ def get_server_base_url() -> str: avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False # Number of retries if the connection breaks -connection_n_retries = _defaults["connection_n_retries"] +connection_n_retries = int(_defaults["connection_n_retries"]) +max_retries = int(_defaults["max_retries"]) class ConfigurationForExamples: @@ -169,7 +177,7 @@ def stop_using_configuration_for_example(cls): cls._start_last_called = False -def _setup(): +def _setup(config=None): """Setup openml package. Called on first import. Reads the config file and sets up apikey, server, cache appropriately. 
@@ -183,62 +191,102 @@ def _setup(): global cache_directory global avoid_duplicate_runs global connection_n_retries + global max_retries - # read config file, create cache directory - try: - os.mkdir(os.path.expanduser(os.path.join("~", ".openml"))) - except FileExistsError: - # For other errors, we want to propagate the error as openml does not work without cache - pass + if platform.system() == "Linux": + config_dir = Path(os.environ.get("XDG_CONFIG_HOME", Path("~") / ".config" / "openml")) + else: + config_dir = Path("~") / ".openml" + # Still use os.path.expanduser to trigger the mock in the unit test + config_dir = Path(os.path.expanduser(config_dir)) + config_file = config_dir / "config" + + # read config file, create directory for config file + if not os.path.exists(config_dir): + try: + os.mkdir(config_dir) + cache_exists = True + except PermissionError: + cache_exists = False + else: + cache_exists = True - config = _parse_config() - apikey = config.get("FAKE_SECTION", "apikey") - server = config.get("FAKE_SECTION", "server") + if config is None: + config = _parse_config(config_file) - short_cache_dir = config.get("FAKE_SECTION", "cachedir") - cache_directory = os.path.expanduser(short_cache_dir) + def _get(config, key): + return config.get("FAKE_SECTION", key) + + avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs") + else: + + def _get(config, key): + return config.get(key) + + avoid_duplicate_runs = config.get("avoid_duplicate_runs") + apikey = _get(config, "apikey") + server = _get(config, "server") + short_cache_dir = _get(config, "cachedir") + connection_n_retries = int(_get(config, "connection_n_retries")) + max_retries = int(_get(config, "max_retries")) + + cache_directory = os.path.expanduser(short_cache_dir) # create the cache subdirectory - try: - os.mkdir(cache_directory) - except FileExistsError: - # For other errors, we want to propagate the error as openml does not work without cache - pass - - avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs") - connection_n_retries = config.get("FAKE_SECTION", "connection_n_retries") - if connection_n_retries > 20: + if not os.path.exists(cache_directory): + try: + os.mkdir(cache_directory) + except PermissionError: + openml_logger.warning( + "No permission to create openml cache directory at %s! This can result in " + "OpenML-Python not working properly." % cache_directory + ) + + if cache_exists: + _create_log_handlers() + else: + _create_log_handlers(create_file_handler=False) + openml_logger.warning( + "No permission to create OpenML directory at %s! This can result in OpenML-Python " + "not working properly." % config_dir + ) + + if connection_n_retries > max_retries: raise ValueError( - "A higher number of retries than 20 is not allowed to keep the " - "server load reasonable" + "A higher number of retries than {} is not allowed to keep the " + "server load reasonable".format(max_retries) ) -def _parse_config(): +def _parse_config(config_file: str): """ Parse the config file, set up defaults. """ config = configparser.RawConfigParser(defaults=_defaults) - if not os.path.exists(config_file): - # Create an empty config file if there was none so far - fh = open(config_file, "w") - fh.close() - logger.info( - "Could not find a configuration file at %s. Going to " - "create an empty file there." % config_file - ) - + # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file. 
+ # Cheat the ConfigParser module by adding a fake section header + config_file_ = StringIO() + config_file_.write("[FAKE_SECTION]\n") try: - # The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file. - # Cheat the ConfigParser module by adding a fake section header - config_file_ = StringIO() - config_file_.write("[FAKE_SECTION]\n") with open(config_file) as fh: for line in fh: config_file_.write(line) - config_file_.seek(0) - config.read_file(config_file_) + except FileNotFoundError: + logger.info("No config file found at %s, using default configuration.", config_file) except OSError as e: - logger.info("Error opening file %s: %s", config_file, e.message) + logger.info("Error opening file %s: %s", config_file, e.args[0]) + config_file_.seek(0) + config.read_file(config_file_) + return config + + +def get_config_as_dict(): + config = dict() + config["apikey"] = apikey + config["server"] = server + config["cachedir"] = cache_directory + config["avoid_duplicate_runs"] = avoid_duplicate_runs + config["connection_n_retries"] = connection_n_retries + config["max_retries"] = max_retries return config @@ -253,11 +301,7 @@ def get_cache_directory(): """ url_suffix = urlparse(server).netloc reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1]) - if not cache_directory: - _cachedir = _defaults(cache_directory) - else: - _cachedir = cache_directory - _cachedir = os.path.join(_cachedir, reversed_url_suffix) + _cachedir = os.path.join(cache_directory, reversed_url_suffix) return _cachedir @@ -285,12 +329,13 @@ def set_cache_directory(cachedir): ) stop_using_configuration_for_example = ConfigurationForExamples.stop_using_configuration_for_example + __all__ = [ "get_cache_directory", "set_cache_directory", "start_using_configuration_for_example", "stop_using_configuration_for_example", + "get_config_as_dict", ] _setup() -_create_log_handlers() diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index eb727b000..a1e2556be 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -1,5 +1,7 @@ # License: BSD 3-Clause +from typing import List + class OpenMLDataFeature(object): """ @@ -20,7 +22,14 @@ class OpenMLDataFeature(object): LEGAL_DATA_TYPES = ["nominal", "numeric", "string", "date"] - def __init__(self, index, name, data_type, nominal_values, number_missing_values): + def __init__( + self, + index: int, + name: str, + data_type: str, + nominal_values: List[str], + number_missing_values: int, + ): if type(index) != int: raise ValueError("Index is of wrong datatype") if data_type not in self.LEGAL_DATA_TYPES: diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 8c366dfb8..0c065b855 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -3,7 +3,6 @@ from collections import OrderedDict import re import gzip -import io import logging import os import pickle @@ -13,7 +12,7 @@ import numpy as np import pandas as pd import scipy.sparse -from warnings import warn +import xmltodict from openml.base import OpenMLBase from .data_feature import OpenMLDataFeature @@ -34,7 +33,7 @@ class OpenMLDataset(OpenMLBase): Name of the dataset. description : str Description of the dataset. - format : str + data_format : str Format of the dataset which can be either 'arff' or 'sparse_arff'. cache_format : str Format for caching the dataset which can be either 'feather' or 'pickle'. @@ -97,13 +96,16 @@ class OpenMLDataset(OpenMLBase): which maps a quality name to a quality value. 
dataset: string, optional Serialized arff dataset string. + minio_url: string, optional + URL to the MinIO bucket with dataset files + parquet_file: string, optional + Path to the local parquet file. """ def __init__( self, name, description, - format=None, data_format="arff", cache_format="pickle", dataset_id=None, @@ -127,9 +129,11 @@ def __init__( update_comment=None, md5_checksum=None, data_file=None, - features=None, - qualities=None, + features_file: Optional[str] = None, + qualities_file: Optional[str] = None, dataset=None, + minio_url: Optional[str] = None, + parquet_file: Optional[str] = None, ): def find_invalid_characters(string, pattern): invalid_chars = set() @@ -178,16 +182,8 @@ def find_invalid_characters(string, pattern): ) self.cache_format = cache_format - if format is None: - self.format = data_format - else: - warn( - "The format parameter in the init will be deprecated " - "in the future." - "Please use data_format instead", - DeprecationWarning, - ) - self.format = format + # Has to be called format, otherwise there will be an XML upload error + self.format = data_format self.creator = creator self.contributor = contributor self.collection_date = collection_date @@ -198,7 +194,7 @@ def find_invalid_characters(string, pattern): self.default_target_attribute = default_target_attribute self.row_id_attribute = row_id_attribute if isinstance(ignore_attribute, str): - self.ignore_attribute = [ignore_attribute] + self.ignore_attribute = [ignore_attribute] # type: Optional[List[str]] elif isinstance(ignore_attribute, list) or ignore_attribute is None: self.ignore_attribute = ignore_attribute else: @@ -212,39 +208,31 @@ def find_invalid_characters(string, pattern): self.update_comment = update_comment self.md5_checksum = md5_checksum self.data_file = data_file - self.features = None - self.qualities = None + self.parquet_file = parquet_file self._dataset = dataset + self._minio_url = minio_url - if features is not None: - self.features = {} - for idx, xmlfeature in enumerate(features["oml:feature"]): - nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) - feature = OpenMLDataFeature( - int(xmlfeature["oml:index"]), - xmlfeature["oml:name"], - xmlfeature["oml:data_type"], - xmlfeature.get("oml:nominal_value"), - int(nr_missing), - ) - if idx != feature.index: - raise ValueError("Data features not provided " "in right order") - self.features[feature.index] = feature + if features_file is not None: + self.features = _read_features( + features_file + ) # type: Optional[Dict[int, OpenMLDataFeature]] + else: + self.features = None - self.qualities = _check_qualities(qualities) + if qualities_file: + self.qualities = _read_qualities(qualities_file) # type: Optional[Dict[str, float]] + else: + self.qualities = None if data_file is not None: - ( - self.data_pickle_file, - self.data_feather_file, - self.feather_attribute_file, - ) = self._create_pickle_in_cache(data_file) + rval = self._compressed_cache_file_paths(data_file) + self.data_pickle_file = rval[0] if os.path.exists(rval[0]) else None + self.data_feather_file = rval[1] if os.path.exists(rval[1]) else None + self.feather_attribute_file = rval[2] if os.path.exists(rval[2]) else None else: - self.data_pickle_file, self.data_feather_file, self.feather_attribute_file = ( - None, - None, - None, - ) + self.data_pickle_file = None + self.data_feather_file = None + self.feather_attribute_file = None @property def id(self) -> Optional[int]: @@ -311,9 +299,11 @@ def __eq__(self, other): def _download_data(self) -> None: """ 
Download ARFF data file to standard cache directory. Set `self.data_file`. """ # import required here to avoid circular import. - from .functions import _get_dataset_arff + from .functions import _get_dataset_arff, _get_dataset_parquet self.data_file = _get_dataset_arff(self) + if self._minio_url is not None: + self.parquet_file = _get_dataset_parquet(self) def _get_arff(self, format: str) -> Dict: """Read ARFF file and return decoded arff. @@ -367,7 +357,7 @@ def decode_arff(fh): with gzip.open(filename) as fh: return decode_arff(fh) else: - with io.open(filename, encoding="utf8") as fh: + with open(filename, encoding="utf8") as fh: return decode_arff(fh) def _parse_data_from_arff( @@ -407,7 +397,7 @@ def _parse_data_from_arff( categories_names = {} categorical = [] for i, (name, type_) in enumerate(data["attributes"]): - # if the feature is nominal and the a sparse matrix is + # if the feature is nominal and a sparse matrix is # requested, the categories need to be numeric if isinstance(type_, list) and self.format.lower() == "sparse_arff": try: @@ -415,12 +405,10 @@ def _parse_data_from_arff( # can be encoded into integers pd.factorize(type_)[0] except ValueError: - raise ValueError( - "Categorical data needs to be numeric when " "using sparse ARFF." - ) + raise ValueError("Categorical data needs to be numeric when using sparse ARFF.") # string can only be supported with pandas DataFrame elif type_ == "STRING" and self.format.lower() == "sparse_arff": - raise ValueError("Dataset containing strings is not supported " "with sparse ARFF.") + raise ValueError("Dataset containing strings is not supported with sparse ARFF.") # infer the dtype from the ARFF header if isinstance(type_, list): @@ -456,6 +444,17 @@ def _parse_data_from_arff( col.append( self._unpack_categories(X[column_name], categories_names[column_name]) ) + elif attribute_dtype[column_name] in ("floating", "integer"): + X_col = X[column_name] + if X_col.min() >= 0 and X_col.max() <= 255: + try: + X_col_uint = X_col.astype("uint8") + if (X_col == X_col_uint).all(): + col.append(X_col_uint) + continue + except ValueError: + pass + col.append(X[column_name]) else: col.append(X[column_name]) X = pd.concat(col, axis=1) @@ -464,152 +463,117 @@ def _parse_data_from_arff( return X, categorical, attribute_names - def _create_pickle_in_cache(self, data_file: str) -> Tuple[str, str, str]: - """ Parse the arff and pickle the result. Update any old pickle objects. """ - data_pickle_file = data_file.replace(".arff", ".pkl.py3") - data_feather_file = data_file.replace(".arff", ".feather") - feather_attribute_file = data_file.replace(".arff", ".feather.attributes.pkl.py3") - if os.path.exists(data_pickle_file) and self.cache_format == "pickle": - # Load the data to check if the pickle file is outdated (i.e. contains numpy array) - with open(data_pickle_file, "rb") as fh: - try: - data, categorical, attribute_names = pickle.load(fh) - except EOFError: - # The file is likely corrupt, see #780. - # We deal with this when loading the data in `_load_data`. - return data_pickle_file, data_feather_file, feather_attribute_file - except ModuleNotFoundError: - # There was some issue loading the file, see #918 - # We deal with this when loading the data in `_load_data`. - return data_pickle_file, data_feather_file, feather_attribute_file - except ValueError as e: - if "unsupported pickle protocol" in e.args[0]: - # There was some issue loading the file, see #898 - # We deal with this when loading the data in `_load_data`. 
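The integer-downcasting branch added to `_parse_data_from_arff` above reduces memory use for integer columns whose values fit into a byte. A standalone sketch of the same lossless check (the function name is illustrative, not part of the library):

import pandas as pd

def maybe_downcast_to_uint8(col: pd.Series) -> pd.Series:
    # Only columns whose values all fall in [0, 255] are candidates.
    if col.min() >= 0 and col.max() <= 255:
        try:
            col_uint = col.astype("uint8")
            # Keep the downcast only when it is lossless.
            if (col == col_uint).all():
                return col_uint
        except ValueError:
            pass
    return col

maybe_downcast_to_uint8(pd.Series([0, 17, 255])).dtype  # uint8
maybe_downcast_to_uint8(pd.Series([0.5, 17.0])).dtype   # float64, left unchanged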
- return data_pickle_file, data_feather_file, feather_attribute_file - else: - raise - - # Between v0.8 and v0.9 the format of pickled data changed from - # np.ndarray to pd.DataFrame. This breaks some backwards compatibility, - # e.g. for `run_model_on_task`. If a local file still exists with - # np.ndarray data, we reprocess the data file to store a pickled - # pd.DataFrame blob. See also #646. - if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data): - logger.debug("Data pickle file already exists and is up to date.") - return data_pickle_file, data_feather_file, feather_attribute_file - elif os.path.exists(data_feather_file) and self.cache_format == "feather": - # Load the data to check if the pickle file is outdated (i.e. contains numpy array) - try: - data = pd.read_feather(data_feather_file) - except EOFError: - # The file is likely corrupt, see #780. - # We deal with this when loading the data in `_load_data`. - return data_pickle_file, data_feather_file, feather_attribute_file - except ModuleNotFoundError: - # There was some issue loading the file, see #918 - # We deal with this when loading the data in `_load_data`. - return data_pickle_file, data_feather_file, feather_attribute_file - except ValueError as e: - if "unsupported pickle protocol" in e.args[0]: - # There was some issue loading the file, see #898 - # We deal with this when loading the data in `_load_data`. - return data_pickle_file, data_feather_file, feather_attribute_file - else: - raise + def _compressed_cache_file_paths(self, data_file: str) -> Tuple[str, str, str]: + ext = f".{data_file.split('.')[-1]}" + data_pickle_file = data_file.replace(ext, ".pkl.py3") + data_feather_file = data_file.replace(ext, ".feather") + feather_attribute_file = data_file.replace(ext, ".feather.attributes.pkl.py3") + return data_pickle_file, data_feather_file, feather_attribute_file + + def _cache_compressed_file_from_file( + self, data_file: str + ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: + """ Store data from the local file in compressed format. - logger.debug("Data feather file already exists and is up to date.") - return data_pickle_file, data_feather_file, feather_attribute_file + If a local parquet file is present it will be used instead of the arff file. + Sets cache_format to 'pickle' if data is sparse. + """ + ( + data_pickle_file, + data_feather_file, + feather_attribute_file, + ) = self._compressed_cache_file_paths(data_file) + + if data_file.endswith(".arff"): + data, categorical, attribute_names = self._parse_data_from_arff(data_file) + elif data_file.endswith(".pq"): + try: + data = pd.read_parquet(data_file) + except Exception as e: + raise Exception(f"File: {data_file}") from e - # At this point either the pickle file does not exist, or it had outdated formatting. - # We parse the data from arff again and populate the cache with a recent pickle file. 
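For reference, the new `_compressed_cache_file_paths` helper above only swaps the extension of the downloaded data file; for a hypothetical cache path the derived names look as follows (illustration only, not library code):

data_file = "/cache/datasets/61/dataset.arff"            # could equally be dataset.pq
ext = "." + data_file.split(".")[-1]
data_file.replace(ext, ".pkl.py3")                       # .../dataset.pkl.py3
data_file.replace(ext, ".feather")                       # .../dataset.feather
data_file.replace(ext, ".feather.attributes.pkl.py3")    # .../dataset.feather.attributes.pkl.py3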
- X, categorical, attribute_names = self._parse_data_from_arff(data_file) + categorical = [data[c].dtype.name == "category" for c in data.columns] + attribute_names = list(data.columns) + else: + raise ValueError(f"Unknown file type for file '{data_file}'.") # Feather format does not work for sparse datasets, so we use pickle for sparse datasets + if scipy.sparse.issparse(data): + self.cache_format = "pickle" - if self.cache_format == "feather" and not scipy.sparse.issparse(X): - logger.info("feather write {}".format(self.name)) - X.to_feather(data_feather_file) + logger.info(f"{self.cache_format} write {self.name}") + if self.cache_format == "feather": + data.to_feather(data_feather_file) with open(feather_attribute_file, "wb") as fh: pickle.dump((categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) + self.data_feather_file = data_feather_file + self.feather_attribute_file = feather_attribute_file else: - logger.info("pickle write {}".format(self.name)) - self.cache_format = "pickle" with open(data_pickle_file, "wb") as fh: - pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) - logger.debug( - "Saved dataset {did}: {name} to file {path}".format( - did=int(self.dataset_id or -1), name=self.name, path=data_pickle_file - ) - ) - return data_pickle_file, data_feather_file, feather_attribute_file + pickle.dump((data, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) + self.data_pickle_file = data_pickle_file + + data_file = data_pickle_file if self.cache_format == "pickle" else data_feather_file + logger.debug(f"Saved dataset {int(self.dataset_id or -1)}: {self.name} to file {data_file}") + + return data, categorical, attribute_names def _load_data(self): - """ Load data from pickle or arff. Download data first if not present on disk. """ - if (self.cache_format == "pickle" and self.data_pickle_file is None) or ( - self.cache_format == "feather" and self.data_feather_file is None - ): + """ Load data from compressed format or arff. Download data if not present on disk. """ + need_to_create_pickle = self.cache_format == "pickle" and self.data_pickle_file is None + need_to_create_feather = self.cache_format == "feather" and self.data_feather_file is None + + if need_to_create_pickle or need_to_create_feather: if self.data_file is None: self._download_data() - ( - self.data_pickle_file, - self.data_feather_file, - self.feather_attribute_file, - ) = self._create_pickle_in_cache(self.data_file) + file_to_load = self.data_file if self.parquet_file is None else self.parquet_file + return self._cache_compressed_file_from_file(file_to_load) + + # helper variable to help identify where errors occur + fpath = self.data_feather_file if self.cache_format == "feather" else self.data_pickle_file + logger.info(f"{self.cache_format} load data {self.name}") try: if self.cache_format == "feather": - logger.info("feather load data {}".format(self.name)) data = pd.read_feather(self.data_feather_file) - + fpath = self.feather_attribute_file with open(self.feather_attribute_file, "rb") as fh: categorical, attribute_names = pickle.load(fh) else: - logger.info("pickle load data {}".format(self.name)) with open(self.data_pickle_file, "rb") as fh: data, categorical, attribute_names = pickle.load(fh) - except EOFError: - logger.warning( - "Detected a corrupt cache file loading dataset %d: '%s'. " - "We will continue loading data from the arff-file, " - "but this will be much slower for big datasets. 
" - "Please manually delete the cache file if you want OpenML-Python " - "to attempt to reconstruct it." - "" % (self.dataset_id, self.data_pickle_file) - ) - data, categorical, attribute_names = self._parse_data_from_arff(self.data_file) except FileNotFoundError: - raise ValueError( - "Cannot find a pickle file for dataset {} at " - "location {} ".format(self.name, self.data_pickle_file) - ) - except ModuleNotFoundError as e: + raise ValueError(f"Cannot find file for dataset {self.name} at location '{fpath}'.") + except (EOFError, ModuleNotFoundError, ValueError) as e: + error_message = e.message if hasattr(e, "message") else e.args[0] + hint = "" + + if isinstance(e, EOFError): + readable_error = "Detected a corrupt cache file" + elif isinstance(e, ModuleNotFoundError): + readable_error = "Detected likely dependency issues" + hint = "This is most likely due to https://github.com/openml/openml-python/issues/918. " # noqa: 501 + elif isinstance(e, ValueError) and "unsupported pickle protocol" in e.args[0]: + readable_error = "Encountered unsupported pickle protocol" + else: + raise # an unknown ValueError is raised, should crash and file bug report + logger.warning( - "Encountered error message when loading cached dataset %d: '%s'. " - "Error message was: %s. " - "This is most likely due to https://github.com/openml/openml-python/issues/918. " + f"{readable_error} when loading dataset {self.id} from '{fpath}'. " + f"{hint}" + f"Error message was: {error_message}. " "We will continue loading data from the arff-file, " "but this will be much slower for big datasets. " "Please manually delete the cache file if you want OpenML-Python " "to attempt to reconstruct it." - "" % (self.dataset_id, self.data_pickle_file, e.args[0]), ) data, categorical, attribute_names = self._parse_data_from_arff(self.data_file) - except ValueError as e: - if "unsupported pickle protocol" in e.args[0]: - logger.warning( - "Encountered unsupported pickle protocol when loading cached dataset %d: '%s'. " - "Error message was: %s. " - "We will continue loading data from the arff-file, " - "but this will be much slower for big datasets. " - "Please manually delete the cache file if you want OpenML-Python " - "to attempt to reconstruct it." 
- "" % (self.dataset_id, self.data_pickle_file, e.args[0]), - ) - data, categorical, attribute_names = self._parse_data_from_arff(self.data_file) - else: - raise + data_up_to_date = isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data) + if self.cache_format == "pickle" and not data_up_to_date: + logger.info("Updating outdated pickle file.") + file_to_load = self.data_file if self.parquet_file is None else self.parquet_file + return self._cache_compressed_file_from_file(file_to_load) return data, categorical, attribute_names @staticmethod @@ -675,6 +639,11 @@ def _encode_if_category(column): @staticmethod def _unpack_categories(series, categories): + # nan-likes can not be explicitly specified as a category + def valid_category(cat): + return isinstance(cat, str) or (cat is not None and not np.isnan(cat)) + + filtered_categories = [c for c in categories if valid_category(c)] col = [] for x in series: try: @@ -683,7 +652,7 @@ def _unpack_categories(series, categories): col.append(np.nan) # We require two lines to create a series of categories as detailed here: # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation # noqa E501 - raw_cat = pd.Categorical(col, ordered=True, categories=categories) + raw_cat = pd.Categorical(col, ordered=True, categories=filtered_categories) return pd.Series(raw_cat, index=series.index, name=series.name) def get_data( @@ -742,7 +711,7 @@ def get_data( to_exclude.extend(self.ignore_attribute) if len(to_exclude) > 0: - logger.info("Going to remove the following attributes:" " %s" % to_exclude) + logger.info("Going to remove the following attributes: %s" % to_exclude) keep = np.array( [True if column not in to_exclude else False for column in attribute_names] ) @@ -809,6 +778,10 @@ def retrieve_class_labels(self, target_name: str = "class") -> Union[None, List[ ------- list """ + if self.features is None: + raise ValueError( + "retrieve_class_labels can only be called if feature information is available." 
+ ) for feature in self.features.values(): if (feature.name == target_name) and (feature.data_type == "nominal"): return feature.nominal_values @@ -937,18 +910,73 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": return data_container -def _check_qualities(qualities): - if qualities is not None: - qualities_ = {} - for xmlquality in qualities: - name = xmlquality["oml:name"] - if xmlquality.get("oml:value", None) is None: - value = float("NaN") - elif xmlquality["oml:value"] == "null": - value = float("NaN") - else: - value = float(xmlquality["oml:value"]) - qualities_[name] = value - return qualities_ - else: - return None +def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]: + features_pickle_file = _get_features_pickle_file(features_file) + try: + with open(features_pickle_file, "rb") as fh_binary: + features = pickle.load(fh_binary) + except: # noqa E722 + with open(features_file, encoding="utf8") as fh: + features_xml_string = fh.read() + xml_dict = xmltodict.parse( + features_xml_string, force_list=("oml:feature", "oml:nominal_value") + ) + features_xml = xml_dict["oml:data_features"] + + features = {} + for idx, xmlfeature in enumerate(features_xml["oml:feature"]): + nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["oml:index"]), + xmlfeature["oml:name"], + xmlfeature["oml:data_type"], + xmlfeature.get("oml:nominal_value"), + int(nr_missing), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + with open(features_pickle_file, "wb") as fh_binary: + pickle.dump(features, fh_binary) + return features + + +def _get_features_pickle_file(features_file: str) -> str: + """This function only exists so it can be mocked during unit testing""" + return features_file + ".pkl" + + +def _read_qualities(qualities_file: str) -> Dict[str, float]: + qualities_pickle_file = _get_qualities_pickle_file(qualities_file) + try: + with open(qualities_pickle_file, "rb") as fh_binary: + qualities = pickle.load(fh_binary) + except: # noqa E722 + with open(qualities_file, encoding="utf8") as fh: + qualities_xml = fh.read() + xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) + qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] + qualities = _check_qualities(qualities) + with open(qualities_pickle_file, "wb") as fh_binary: + pickle.dump(qualities, fh_binary) + return qualities + + +def _get_qualities_pickle_file(qualities_file: str) -> str: + """This function only exists so it can be mocked during unit testing""" + return qualities_file + ".pkl" + + +def _check_qualities(qualities: List[Dict[str, str]]) -> Dict[str, float]: + qualities_ = {} + for xmlquality in qualities: + name = xmlquality["oml:name"] + if xmlquality.get("oml:value", None) is None: + value = float("NaN") + elif xmlquality["oml:value"] == "null": + value = float("NaN") + else: + value = float(xmlquality["oml:value"]) + qualities_[name] = value + return qualities_ diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 84943b244..746285650 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -3,8 +3,7 @@ import io import logging import os -import re -from typing import List, Dict, Union, Optional +from typing import List, Dict, Union, Optional, cast import numpy as np import arff @@ -18,13 +17,11 @@ import openml._api_calls from .dataset import OpenMLDataset from ..exceptions import ( - 
OpenMLCacheException, OpenMLHashException, OpenMLServerException, OpenMLPrivateDatasetError, ) from ..utils import ( - _create_cache_directory, _remove_cache_dir_for_id, _create_cache_directory_for_id, ) @@ -37,118 +34,6 @@ # Local getters/accessors to the cache directory -def _list_cached_datasets(): - """ Return list with ids of all cached datasets. - - Returns - ------- - list - List with IDs of all cached datasets. - """ - datasets = [] - - dataset_cache_dir = _create_cache_directory(DATASETS_CACHE_DIR_NAME) - directory_content = os.listdir(dataset_cache_dir) - directory_content.sort() - - # Find all dataset ids for which we have downloaded the dataset - # description - for directory_name in directory_content: - # First check if the directory name could be an OpenML dataset id - if not re.match(r"[0-9]*", directory_name): - continue - - dataset_id = int(directory_name) - - directory_name = os.path.join(dataset_cache_dir, directory_name) - dataset_directory_content = os.listdir(directory_name) - - if ( - "dataset.arff" in dataset_directory_content - and "description.xml" in dataset_directory_content - ): - if dataset_id not in datasets: - datasets.append(dataset_id) - - datasets.sort() - return datasets - - -def _get_cached_datasets(): - """Searches for all OpenML datasets in the OpenML cache dir. - - Return a dictionary which maps dataset ids to dataset objects""" - dataset_list = _list_cached_datasets() - datasets = OrderedDict() - - for dataset_id in dataset_list: - datasets[dataset_id] = _get_cached_dataset(dataset_id) - - return datasets - - -def _get_cached_dataset(dataset_id: int) -> OpenMLDataset: - """Get cached dataset for ID. - - Returns - ------- - OpenMLDataset - """ - description = _get_cached_dataset_description(dataset_id) - arff_file = _get_cached_dataset_arff(dataset_id) - features = _get_cached_dataset_features(dataset_id) - qualities = _get_cached_dataset_qualities(dataset_id) - dataset = _create_dataset_from_description(description, features, qualities, arff_file) - - return dataset - - -def _get_cached_dataset_description(dataset_id): - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,) - description_file = os.path.join(did_cache_dir, "description.xml") - try: - with io.open(description_file, encoding="utf8") as fh: - dataset_xml = fh.read() - return xmltodict.parse(dataset_xml)["oml:data_set_description"] - except (IOError, OSError): - raise OpenMLCacheException( - "Dataset description for dataset id %d not " "cached" % dataset_id - ) - - -def _get_cached_dataset_features(dataset_id): - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,) - features_file = os.path.join(did_cache_dir, "features.xml") - try: - return _load_features_from_file(features_file) - except (IOError, OSError): - raise OpenMLCacheException("Dataset features for dataset id %d not " "cached" % dataset_id) - - -def _get_cached_dataset_qualities(dataset_id): - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,) - qualities_file = os.path.join(did_cache_dir, "qualities.xml") - try: - with io.open(qualities_file, encoding="utf8") as fh: - qualities_xml = fh.read() - qualities_dict = xmltodict.parse(qualities_xml) - return qualities_dict["oml:data_qualities"]["oml:quality"] - except (IOError, OSError): - raise OpenMLCacheException("Dataset qualities for dataset id %d not " "cached" % dataset_id) - - -def _get_cached_dataset_arff(dataset_id): - did_cache_dir = 
_create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,) - output_file = os.path.join(did_cache_dir, "dataset.arff") - - try: - with io.open(output_file, encoding="utf8"): - pass - return output_file - except (OSError, IOError): - raise OpenMLCacheException("ARFF file for dataset id %d not " "cached" % dataset_id) - - def _get_cache_directory(dataset: OpenMLDataset) -> str: """ Return the cache directory of the OpenMLDataset """ return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id) @@ -183,7 +68,7 @@ def list_datasets( status: Optional[str] = None, tag: Optional[str] = None, output_format: str = "dict", - **kwargs + **kwargs, ) -> Union[Dict, pd.DataFrame]: """ @@ -251,7 +136,7 @@ def list_datasets( size=size, status=status, tag=tag, - **kwargs + **kwargs, ) @@ -326,34 +211,59 @@ def __list_datasets(api_call, output_format="dict"): return datasets -def _load_features_from_file(features_file: str) -> Dict: - with io.open(features_file, encoding="utf8") as fh: - features_xml = fh.read() - xml_dict = xmltodict.parse(features_xml, force_list=("oml:feature", "oml:nominal_value")) - return xml_dict["oml:data_features"] +def _expand_parameter(parameter: Union[str, List[str]]) -> List[str]: + expanded_parameter = [] + if isinstance(parameter, str): + expanded_parameter = [x.strip() for x in parameter.split(",")] + elif isinstance(parameter, list): + expanded_parameter = parameter + return expanded_parameter + + +def _validated_data_attributes( + attributes: List[str], data_attributes: List[str], parameter_name: str +) -> None: + for attribute_ in attributes: + is_attribute_a_data_attribute = any([attr[0] == attribute_ for attr in data_attributes]) + if not is_attribute_a_data_attribute: + raise ValueError( + "all attribute of '{}' should be one of the data attribute. " + " Got '{}' while candidates are {}.".format( + parameter_name, attribute_, [attr[0] for attr in data_attributes] + ) + ) -def check_datasets_active(dataset_ids: List[int]) -> Dict[int, bool]: +def check_datasets_active( + dataset_ids: List[int], raise_error_if_not_exist: bool = True, +) -> Dict[int, bool]: """ Check if the dataset ids provided are active. + Raises an error if a dataset_id in the given list + of dataset_ids does not exist on the server. + Parameters ---------- dataset_ids : List[int] A list of integers representing dataset ids. + raise_error_if_not_exist : bool (default=True) + Flag that if activated can raise an error, if one or more of the + given dataset ids do not exist on the server. Returns ------- dict A dictionary with items {did: bool} """ - dataset_list = list_datasets(status="all") + dataset_list = list_datasets(status="all", data_id=dataset_ids) active = {} for did in dataset_ids: dataset = dataset_list.get(did, None) if dataset is None: - raise ValueError("Could not find dataset {} in OpenML dataset list.".format(did)) + if raise_error_if_not_exist: + raise ValueError(f"Could not find dataset {did} in OpenML dataset list.") else: active[did] = dataset["status"] == "active" @@ -380,6 +290,8 @@ def _name_to_id( error_if_multiple : bool (default=False) If `False`, if multiple datasets match, return the least recent active dataset. If `True`, if multiple datasets match, raise an error. + download_qualities : bool, optional (default=True) + If `True`, also download qualities.xml file. If False it skip the qualities.xml. 
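With the new `raise_error_if_not_exist` flag, `check_datasets_active` can probe a mixed list of ids without a try/except. A short usage sketch (the ids are purely illustrative):

import openml

status = openml.datasets.check_datasets_active(
    dataset_ids=[2, 123456789], raise_error_if_not_exist=False
)
# e.g. {2: True} -- ids unknown to the server are simply omitted instead of raising a ValueError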
Returns ------- @@ -400,7 +312,7 @@ def _name_to_id( def get_datasets( - dataset_ids: List[Union[str, int]], download_data: bool = True, + dataset_ids: List[Union[str, int]], download_data: bool = True, download_qualities: bool = True ) -> List[OpenMLDataset]: """Download datasets. @@ -416,6 +328,8 @@ def get_datasets( make the operation noticeably slower. Metadata is also still retrieved. If False, create the OpenMLDataset and only populate it with the metadata. The data may later be retrieved through the `OpenMLDataset.get_data` method. + download_qualities : bool, optional (default=True) + If True, also download qualities.xml file. If False it skip the qualities.xml. Returns ------- @@ -424,7 +338,9 @@ def get_datasets( """ datasets = [] for dataset_id in dataset_ids: - datasets.append(get_dataset(dataset_id, download_data)) + datasets.append( + get_dataset(dataset_id, download_data, download_qualities=download_qualities) + ) return datasets @@ -435,6 +351,7 @@ def get_dataset( version: int = None, error_if_multiple: bool = False, cache_format: str = "pickle", + download_qualities: bool = True, ) -> OpenMLDataset: """ Download the OpenML dataset representation, optionally also download actual data file. @@ -489,21 +406,28 @@ def get_dataset( did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id,) + remove_dataset_cache = True try: - remove_dataset_cache = True description = _get_dataset_description(did_cache_dir, dataset_id) - features = _get_dataset_features(did_cache_dir, dataset_id) + features_file = _get_dataset_features_file(did_cache_dir, dataset_id) try: - qualities = _get_dataset_qualities(did_cache_dir, dataset_id) + if download_qualities: + qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) + else: + qualities_file = "" except OpenMLServerException as e: if e.code == 362 and str(e) == "No qualities found - None": logger.warning("No qualities found for dataset {}".format(dataset_id)) - qualities = None + qualities_file = None else: raise arff_file = _get_dataset_arff(description) if download_data else None + if "oml:minio_url" in description and download_data: + parquet_file = _get_dataset_parquet(description) + else: + parquet_file = None remove_dataset_cache = False except OpenMLServerException as e: # if there was an exception, @@ -517,7 +441,7 @@ def get_dataset( _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) dataset = _create_dataset_from_description( - description, features, qualities, arff_file, cache_format + description, features_file, qualities_file, arff_file, parquet_file, cache_format ) return dataset @@ -636,6 +560,7 @@ def create_dataset( ignore_attribute : str | list Attributes that should be excluded in modelling, such as identifiers and indexes. + Can have multiple values, comma separated. citation : str Reference(s) that should be cited when building on this data. 
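A usage sketch of the new `download_qualities` switch on `get_dataset`/`get_datasets` introduced above (the dataset id is chosen only for illustration):

import openml

# Fetch only the description and feature metadata; skip qualities.xml and the data file.
dataset = openml.datasets.get_dataset(61, download_data=False, download_qualities=False)
print(dataset.qualities)  # None, because qualities were never downloaded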
version_label : str, optional @@ -687,6 +612,11 @@ def create_dataset( attributes_[attr_idx] = (attr_name, attributes[attr_name]) else: attributes_ = attributes + ignore_attributes = _expand_parameter(ignore_attribute) + _validated_data_attributes(ignore_attributes, attributes_, "ignore_attribute") + + default_target_attributes = _expand_parameter(default_target_attribute) + _validated_data_attributes(default_target_attributes, attributes_, "default_target_attribute") if row_id_attribute is not None: is_row_id_an_attribute = any([attr[0] == row_id_attribute for attr in attributes_]) @@ -943,6 +873,47 @@ def fork_dataset(data_id: int) -> int: return int(data_id) +def _topic_add_dataset(data_id: int, topic: str): + """ + Adds a topic for a dataset. + This API is not available for all OpenML users and is accessible only by admins. + Parameters + ---------- + data_id : int + id of the dataset for which the topic needs to be added + topic : str + Topic to be added for the dataset + """ + if not isinstance(data_id, int): + raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + form_data = {"data_id": data_id, "topic": topic} + result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + + +def _topic_delete_dataset(data_id: int, topic: str): + """ + Removes a topic from a dataset. + This API is not available for all OpenML users and is accessible only by admins. + Parameters + ---------- + data_id : int + id of the dataset to be forked + topic : str + Topic to be deleted + + """ + if not isinstance(data_id, int): + raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id))) + form_data = {"data_id": data_id, "topic": topic} + result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + + def _get_dataset_description(did_cache_dir, dataset_id): """Get the dataset description as xml dictionary. @@ -969,8 +940,9 @@ def _get_dataset_description(did_cache_dir, dataset_id): description_file = os.path.join(did_cache_dir, "description.xml") try: - return _get_cached_dataset_description(dataset_id) - except OpenMLCacheException: + with io.open(description_file, encoding="utf8") as fh: + dataset_xml = fh.read() + except Exception: url_extension = "data/{}".format(dataset_id) dataset_xml = openml._api_calls._perform_api_call(url_extension, "get") with io.open(description_file, "w", encoding="utf8") as fh: @@ -981,6 +953,55 @@ def _get_dataset_description(did_cache_dir, dataset_id): return description +def _get_dataset_parquet( + description: Union[Dict, OpenMLDataset], cache_directory: str = None +) -> Optional[str]: + """ Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. + + Checks if the file is in the cache, if yes, return the path to the file. + If not, downloads the file and caches it, then returns the file path. + The cache directory is generated based on dataset information, but can also be specified. + + This function is NOT thread/multiprocessing safe. + Unlike the ARFF equivalent, checksums are not available/used (for now). + + Parameters + ---------- + description : dictionary or OpenMLDataset + Either a dataset description as dict or OpenMLDataset. 
+ + cache_directory: str, optional (default=None) + Folder to store the parquet file in. + If None, use the default cache directory for the dataset. + + Returns + ------- + output_filename : string, optional + Location of the Parquet file if successfully downloaded, None otherwise. + """ + if isinstance(description, dict): + url = description.get("oml:minio_url") + did = description.get("oml:id") + elif isinstance(description, OpenMLDataset): + url = description._minio_url + did = description.dataset_id + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if cache_directory is None: + cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) + output_file_path = os.path.join(cache_directory, "dataset.pq") + + if not os.path.isfile(output_file_path): + try: + openml._api_calls._download_minio_file( + source=cast(str, url), destination=output_file_path + ) + except FileNotFoundError: + return None + return output_file_path + + def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: str = None) -> str: """ Return the path to the local arff file of the dataset. If is not cached, it is downloaded. @@ -1031,8 +1052,8 @@ def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: return output_file_path -def _get_dataset_features(did_cache_dir, dataset_id): - """API call to get dataset features (cached) +def _get_dataset_features_file(did_cache_dir: str, dataset_id: int) -> str: + """API call to load dataset features. Loads from cache or downloads them. Features are feature descriptions for each column. (name, index, categorical, ...) @@ -1049,8 +1070,8 @@ def _get_dataset_features(did_cache_dir, dataset_id): Returns ------- - features : dict - Dictionary containing dataset feature descriptions, parsed from XML. + str + Path of the cached dataset feature file """ features_file = os.path.join(did_cache_dir, "features.xml") @@ -1061,11 +1082,11 @@ def _get_dataset_features(did_cache_dir, dataset_id): with io.open(features_file, "w", encoding="utf8") as fh: fh.write(features_xml) - return _load_features_from_file(features_file) + return features_file -def _get_dataset_qualities(did_cache_dir, dataset_id): - """API call to get dataset qualities (cached) +def _get_dataset_qualities_file(did_cache_dir, dataset_id): + """API call to load dataset qualities. Loads from cache or downloads them. Features are metafeatures (number of features, number of classes, ...) @@ -1079,10 +1100,12 @@ def _get_dataset_qualities(did_cache_dir, dataset_id): dataset_id : int Dataset ID + download_qualities : bool + whether to download or use the cached version. Returns ------- - qualities : dict - Dictionary containing dataset qualities, parsed from XML.
+ str + Path of the cached qualities file """ # Dataset qualities are subject to change and must be fetched every time qualities_file = os.path.join(did_cache_dir, "qualities.xml") @@ -1092,21 +1115,17 @@ def _get_dataset_qualities(did_cache_dir, dataset_id): except (OSError, IOError): url_extension = "data/qualities/{}".format(dataset_id) qualities_xml = openml._api_calls._perform_api_call(url_extension, "get") - with io.open(qualities_file, "w", encoding="utf8") as fh: fh.write(qualities_xml) - - xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) - qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] - - return qualities + return qualities_file def _create_dataset_from_description( description: Dict[str, str], - features: Dict, - qualities: List, + features_file: str, + qualities_file: str, arff_file: str = None, + parquet_file: str = None, cache_format: str = "pickle", ) -> OpenMLDataset: """Create a dataset object from a description dict. @@ -1115,12 +1134,14 @@ def _create_dataset_from_description( ---------- description : dict Description of a dataset in xml dict. - features : dict - Description of a dataset features. + featuresfile : str + Path of the dataset features as xml file. qualities : list - Description of a dataset qualities. + Path of the dataset qualities as xml file. arff_file : string, optional Path of dataset ARFF file. + parquet_file : string, optional + Path of dataset Parquet file. cache_format: string, optional Caching option for datasets (feather/pickle) @@ -1155,8 +1176,10 @@ def _create_dataset_from_description( md5_checksum=description.get("oml:md5_checksum"), data_file=arff_file, cache_format=cache_format, - features=features, - qualities=qualities, + features_file=features_file, + qualities_file=qualities_file, + minio_url=description.get("oml:minio_url"), + parquet_file=parquet_file, ) diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py index 2d06b69e0..4529ad163 100644 --- a/openml/extensions/extension_interface.py +++ b/openml/extensions/extension_interface.py @@ -229,6 +229,19 @@ def obtain_parameter_values( - ``oml:component`` : int: flow id to which the parameter belongs """ + @abstractmethod + def check_if_model_fitted(self, model: Any) -> bool: + """Returns True/False denoting if the model has already been fitted/trained. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + ################################################################################################ # Abstract methods for hyperparameter optimization diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py index 2003934db..135e5ccf6 100644 --- a/openml/extensions/sklearn/__init__.py +++ b/openml/extensions/sklearn/__init__.py @@ -7,3 +7,31 @@ __all__ = ["SklearnExtension"] register_extension(SklearnExtension) + + +def cont(X): + """Returns True for all non-categorical columns, False for the rest. + + This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling + of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is + required to process each type of columns separately. + This function allows transformations meant for continuous/numeric columns to access the + continuous/numeric columns given the dataset as DataFrame. 
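The `cont` helper above, together with its companion `cat` defined just below, is meant to be passed as a callable column selector to scikit-learn's ColumnTransformer when the task data is requested as a pandas DataFrame. A minimal sketch of the intended usage (the estimator choice is arbitrary):

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from openml.extensions.sklearn import cat, cont

# cat/cont receive the DataFrame and return a boolean mask over its columns,
# so each transformer only sees the column types it can handle.
preprocessing = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore"), cat),
        ("continuous", StandardScaler(), cont),
    ]
)
clf = make_pipeline(preprocessing, DecisionTreeClassifier())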
+ """ + if not hasattr(X, "dtypes"): + raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!") + return X.dtypes != "category" + + +def cat(X): + """Returns True for all categorical columns, False for the rest. + + This is a helper function for OpenML datasets encoded as DataFrames simplifying the handling + of mixed data types. To build sklearn models on mixed data types, a ColumnTransformer is + required to process each type of columns separately. + This function allows transformations meant for categorical columns to access the + categorical columns given the dataset as DataFrame. + """ + if not hasattr(X, "dtypes"): + raise AttributeError("Not a Pandas DataFrame with 'dtypes' as attribute!") + return X.dtypes == "category" diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index edb14487b..3441b4a4e 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -52,7 +52,10 @@ SIMPLE_NUMPY_TYPES = [ - nptype for type_cat, nptypes in np.sctypes.items() for nptype in nptypes if type_cat != "others" + nptype + for type_cat, nptypes in np.sctypes.items() + for nptype in nptypes # type: ignore + if type_cat != "others" ] SIMPLE_TYPES = tuple([bool, int, float, str] + SIMPLE_NUMPY_TYPES) @@ -211,6 +214,61 @@ def remove_all_in_parentheses(string: str) -> str: return short_name.format(pipeline) + @classmethod + def _min_dependency_str(cls, sklearn_version: str) -> str: + """ Returns a string containing the minimum dependencies for the sklearn version passed. + + Parameters + ---------- + sklearn_version : str + A version string of the xx.xx.xx + + Returns + ------- + str + """ + openml_major_version = int(LooseVersion(openml.__version__).version[1]) + # This explicit check is necessary to support existing entities on the OpenML servers + # that used the fixed dependency string (in the else block) + if openml_major_version > 11: + # OpenML v0.11 onwards supports sklearn>=0.24 + # assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with + # variables declared for extracting minimum dependency for that version + if LooseVersion(sklearn_version) >= "0.24": + from sklearn import _min_dependencies as _mindep + + dependency_list = { + "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION), + "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION), + "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION), + "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION), + } + elif LooseVersion(sklearn_version) >= "0.23": + dependency_list = { + "numpy": "1.13.3", + "scipy": "0.19.1", + "joblib": "0.11", + "threadpoolctl": "2.0.0", + } + if LooseVersion(sklearn_version).version[2] == 0: + dependency_list.pop("threadpoolctl") + elif LooseVersion(sklearn_version) >= "0.21": + dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"} + elif LooseVersion(sklearn_version) >= "0.19": + dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"} + else: + dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} + else: + # this is INCORRECT for sklearn versions >= 0.19 and < 0.24 + # given that OpenML has existing flows uploaded with such dependency information, + # we change no behaviour for older sklearn version, however from 0.24 onwards + # the dependency list will be accurately updated for any flow uploaded to OpenML + dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} + + sklearn_dep = "sklearn=={}".format(sklearn_version) + dep_str = "\n".join(["{}>={}".format(k, v) for k, v in 
dependency_list.items()]) + return "\n".join([sklearn_dep, dep_str]) + ################################################################################################ # Methods for flow serialization and de-serialization @@ -491,7 +549,7 @@ def get_version_information(self) -> List[str]: major, minor, micro, _, _ = sys.version_info python_version = "Python_{}.".format(".".join([str(major), str(minor), str(micro)])) sklearn_version = "Sklearn_{}.".format(sklearn.__version__) - numpy_version = "NumPy_{}.".format(numpy.__version__) + numpy_version = "NumPy_{}.".format(numpy.__version__) # type: ignore scipy_version = "SciPy_{}.".format(scipy.__version__) return [python_version, sklearn_version, numpy_version, scipy_version] @@ -508,8 +566,7 @@ def create_setup_string(self, model: Any) -> str: str """ run_environment = " ".join(self.get_version_information()) - # fixme str(model) might contain (...) - return run_environment + " " + str(model) + return run_environment def _is_cross_validator(self, o: Any) -> bool: return isinstance(o, sklearn.model_selection.BaseCrossValidator) @@ -769,20 +826,13 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: tags=tags, extension=self, language="English", - # TODO fill in dependencies! dependencies=dependencies, ) return flow def _get_dependencies(self) -> str: - dependencies = "\n".join( - [ - self._format_external_version("sklearn", sklearn.__version__,), - "numpy>=1.6.1", - "scipy>=0.9", - ] - ) + dependencies = self._min_dependency_str(sklearn.__version__) return dependencies def _get_tags(self) -> List[str]: @@ -1189,11 +1239,11 @@ def _check_dependencies(self, dependencies: str, strict_version: bool = True) -> def _serialize_type(self, o: Any) -> "OrderedDict[str, str]": mapping = { float: "float", - np.float: "np.float", + np.float: "np.float", # type: ignore np.float32: "np.float32", np.float64: "np.float64", int: "int", - np.int: "np.int", + np.int: "np.int", # type: ignore np.int32: "np.int32", np.int64: "np.int64", } @@ -1205,11 +1255,11 @@ def _serialize_type(self, o: Any) -> "OrderedDict[str, str]": def _deserialize_type(self, o: str) -> Any: mapping = { "float": float, - "np.float": np.float, + "np.float": np.float, # type: ignore "np.float32": np.float32, "np.float64": np.float64, "int": int, - "np.int": np.int, + "np.int": np.int, # type: ignore "np.int32": np.int32, "np.int64": np.int64, } @@ -1537,6 +1587,37 @@ def _seed_current_object(current_value): model.set_params(**random_states) return model + def check_if_model_fitted(self, model: Any) -> bool: + """Returns True/False denoting if the model has already been fitted/trained + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + try: + # check if model is fitted + from sklearn.exceptions import NotFittedError + + # Creating random dummy data of arbitrary size + dummy_data = np.random.uniform(size=(10, 3)) + # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more + # robust check that works across sklearn versions and models. 
Internally, 'predict' + # should call 'check_is_fitted' for every concerned attribute, thus offering a more + # assured check than explicit calls to 'check_is_fitted' + model.predict(dummy_data) + # Will reach here if the model was fit on a dataset with 3 features + return True + except NotFittedError: # needs to be the first exception to be caught + # Model is not fitted, as is required + return False + except ValueError: + # Will reach here if the model was fit on a dataset with more or less than 3 features + return True + def _run_model_on_fold( self, model: Any, @@ -1546,7 +1627,9 @@ def _run_model_on_fold( fold_no: int, y_train: Optional[np.ndarray] = None, X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None, - ) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]: + ) -> Tuple[ + np.ndarray, Optional[pd.DataFrame], "OrderedDict[str, float]", Optional[OpenMLRunTrace] + ]: """Run a model on a repeat,fold,subsample triplet of the task and return prediction information. @@ -1581,19 +1664,21 @@ def _run_model_on_fold( ------- pred_y : np.ndarray Predictions on the training/test set, depending on the task type. - For supervised tasks, predicitons are on the test set. - For unsupervised tasks, predicitons are on the training set. - proba_y : pd.DataFrame + For supervised tasks, predictions are on the test set. + For unsupervised tasks, predictions are on the training set. + proba_y : pd.DataFrame, optional Predicted probabilities for the test set. None, if task is not Classification or Learning Curve prediction. user_defined_measures : OrderedDict[str, float] User defined measures that were generated on this fold - trace : Optional[OpenMLRunTrace]] + trace : OpenMLRunTrace, optional arff trace object from a fitted model and the trace content obtained by repeatedly calling ``run_model_on_task`` """ - def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame: + def _prediction_to_probabilities( + y: Union[np.ndarray, List], model_classes: List[Any], class_labels: Optional[List[str]] + ) -> pd.DataFrame: """Transforms predicted probabilities to match with OpenML class indices. Parameters @@ -1603,28 +1688,26 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd. training data). model_classes : list List of classes known_predicted by the model, ordered by their index. + class_labels : list + List of classes as stored in the task object fetched from server. 
Returns ------- pd.DataFrame """ + if class_labels is None: + raise ValueError("The task has no class labels") - if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): - if task.class_labels is not None: - if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str): - # mapping (decoding) the predictions to the categories - # creating a separate copy to not change the expected pred_y type - y = [task.class_labels[pred] for pred in y] - else: - raise ValueError("The task has no class labels") - else: - return None + if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str): + # mapping (decoding) the predictions to the categories + # creating a separate copy to not change the expected pred_y type + y = [class_labels[pred] for pred in y] # list or numpy array of predictions - # y: list or numpy array of predictions # model_classes: sklearn classifier mapping from original array id to # prediction index id if not isinstance(model_classes, list): raise ValueError("please convert model classes to list prior to calling this fn") + # DataFrame allows more accurate mapping of classes as column names result = pd.DataFrame( 0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32 @@ -1639,10 +1722,6 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd. if X_test is None: raise TypeError("argument X_test must not be of type None") - # TODO: if possible, give a warning if model is already fitted (acceptable - # in case of custom experimentation, - # but not desirable if we want to upload to OpenML). - model_copy = sklearn.base.clone(model, safe=True) # sanity check: prohibit users from optimizing n_jobs self._prevent_optimize_n_jobs(model_copy) @@ -1667,6 +1746,8 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd. user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000 + if hasattr(model_copy, "refit_time_"): + modelfit_dur_walltime += model_copy.refit_time_ if can_measure_wallclocktime: user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime @@ -1732,10 +1813,7 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd. proba_y = model_copy.predict_proba(X_test) proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy except AttributeError: # predict_proba is not available when probability=False - if task.class_labels is not None: - proba_y = _prediction_to_probabilities(pred_y, model_classes) - else: - raise ValueError("The task has no class labels") + proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels) if task.class_labels is not None: if proba_y.shape[1] != len(task.class_labels): @@ -1753,12 +1831,13 @@ def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd. proba_y.shape[1], len(task.class_labels), ) warnings.warn(message) - openml.config.logger.warn(message) + openml.config.logger.warning(message) for i, col in enumerate(task.class_labels): # adding missing columns with 0 probability if col not in model_classes: proba_y[col] = 0 + # We re-order the columns to move possibly added missing columns into place. 
proba_y = proba_y[task.class_labels] else: raise ValueError("The task has no class labels") diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 5aaf70a9d..2acbcb0d1 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -229,7 +229,7 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": if not self.description: logger = logging.getLogger(__name__) - logger.warn("Flow % has empty description", self.name) + logger.warning("Flow % has empty description", self.name) flow_parameters = [] for key in self.parameters: diff --git a/openml/runs/functions.py b/openml/runs/functions.py index a08c84df8..92044a1b4 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -10,7 +10,9 @@ import sklearn.metrics import xmltodict +import numpy as np import pandas as pd +from joblib.parallel import Parallel, delayed import openml import openml.utils @@ -53,6 +55,7 @@ def run_model_on_task( upload_flow: bool = False, return_flow: bool = False, dataset_format: str = "dataframe", + n_jobs: Optional[int] = None, ) -> Union[OpenMLRun, Tuple[OpenMLRun, OpenMLFlow]]: """Run the model on the dataset defined by the task. @@ -83,6 +86,10 @@ def run_model_on_task( dataset_format : str (default='dataframe') If 'array', the dataset is passed to the model as a numpy array. If 'dataframe', the dataset is passed to the model as a pandas dataframe. + n_jobs : int (default=None) + The number of processes/threads to distribute the evaluation asynchronously. + If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially. + If `-1`, then the job uses as many cores available. Returns ------- @@ -130,6 +137,7 @@ def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTas add_local_measures=add_local_measures, upload_flow=upload_flow, dataset_format=dataset_format, + n_jobs=n_jobs, ) if return_flow: return run, flow @@ -145,6 +153,7 @@ def run_flow_on_task( add_local_measures: bool = True, upload_flow: bool = False, dataset_format: str = "dataframe", + n_jobs: Optional[int] = None, ) -> OpenMLRun: """Run the model provided by the flow on the dataset defined by task. @@ -180,6 +189,10 @@ def run_flow_on_task( dataset_format : str (default='dataframe') If 'array', the dataset is passed to the model as a numpy array. If 'dataframe', the dataset is passed to the model as a pandas dataframe. + n_jobs : int (default=None) + The number of processes/threads to distribute the evaluation asynchronously. + If `None` or `1`, then the evaluation is treated as synchronous and processed sequentially. + If `-1`, then the job uses as many cores available. Returns ------- @@ -250,14 +263,20 @@ def run_flow_on_task( run_environment = flow.extension.get_version_information() tags = ["openml-python", run_environment[1]] + if flow.extension.check_if_model_fitted(flow.model): + warnings.warn( + "The model is already fitted!" + " This might cause inconsistency in comparison of results." 
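The warning above is driven by the new `check_if_model_fitted` extension method shown earlier. A quick sketch of its behaviour (the data shapes are arbitrary):

import numpy as np
from sklearn.tree import DecisionTreeClassifier
from openml.extensions.sklearn import SklearnExtension

extension = SklearnExtension()
clf = DecisionTreeClassifier()
extension.check_if_model_fitted(clf)  # False: predict() raises NotFittedError

X, y = np.random.uniform(size=(20, 5)), np.random.randint(0, 2, size=20)
# True: predict() on the internal dummy data now raises a feature-count ValueError,
# which the check interprets as "already fitted".
extension.check_if_model_fitted(clf.fit(X, y))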
+ ) + # execute the run res = _run_task_get_arffcontent( - flow=flow, model=flow.model, task=task, extension=flow.extension, add_local_measures=add_local_measures, dataset_format=dataset_format, + n_jobs=n_jobs, ) data_content, trace, fold_evaluations, sample_evaluations = res @@ -412,12 +431,12 @@ def run_exists(task_id: int, setup_id: int) -> Set[int]: def _run_task_get_arffcontent( - flow: OpenMLFlow, model: Any, task: OpenMLTask, extension: "Extension", add_local_measures: bool, dataset_format: str, + n_jobs: int = None, ) -> Tuple[ List[List], Optional[OpenMLRunTrace], @@ -440,55 +459,36 @@ def _run_task_get_arffcontent( # methods, less maintenance, less confusion) num_reps, num_folds, num_samples = task.get_split_dimensions() + jobs = [] for n_fit, (rep_no, fold_no, sample_no) in enumerate( itertools.product(range(num_reps), range(num_folds), range(num_samples),), start=1 ): - - train_indices, test_indices = task.get_train_test_split_indices( - repeat=rep_no, fold=fold_no, sample=sample_no - ) - if isinstance(task, OpenMLSupervisedTask): - x, y = task.get_X_and_y(dataset_format=dataset_format) - if dataset_format == "dataframe": - train_x = x.iloc[train_indices] - train_y = y.iloc[train_indices] - test_x = x.iloc[test_indices] - test_y = y.iloc[test_indices] - else: - train_x = x[train_indices] - train_y = y[train_indices] - test_x = x[test_indices] - test_y = y[test_indices] - elif isinstance(task, OpenMLClusteringTask): - x = task.get_X(dataset_format=dataset_format) - if dataset_format == "dataframe": - train_x = x.iloc[train_indices] - else: - train_x = x[train_indices] - train_y = None - test_x = None - test_y = None - else: - raise NotImplementedError(task.task_type) - - config.logger.info( - "Going to execute flow '%s' on task %d for repeat %d fold %d sample %d.", - flow.name, - task.task_id, - rep_no, - fold_no, - sample_no, - ) - - pred_y, proba_y, user_defined_measures_fold, trace = extension._run_model_on_fold( + jobs.append((n_fit, rep_no, fold_no, sample_no)) + + # The forked child process may not copy the configuration state of OpenML from the parent. + # Current configuration setup needs to be copied and passed to the child processes. 
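Putting the new `n_jobs` option together: the sketch below evaluates all repeats/folds in parallel, with each joblib worker re-initialising the OpenML configuration from the snapshot mentioned above. The task id is purely illustrative; datasets with categorical features additionally need an encoder such as the `cat`/`cont`-based ColumnTransformer shown earlier.

import openml
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from openml.extensions.sklearn import cat, cont

task = openml.tasks.get_task(31)  # assumed to be a supervised classification task
preprocessing = ColumnTransformer(
    transformers=[
        ("categorical", make_pipeline(SimpleImputer(strategy="most_frequent"),
                                      OneHotEncoder(handle_unknown="ignore")), cat),
        ("continuous", make_pipeline(SimpleImputer(), StandardScaler()), cont),
    ]
)
clf = make_pipeline(preprocessing, DecisionTreeClassifier())

# n_jobs=-1 distributes the folds over all cores via joblib; n_jobs=None keeps
# the previous sequential behaviour. Pass an unfitted estimator to avoid the
# "model is already fitted" warning added above.
run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False, n_jobs=-1)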
+ _config = config.get_config_as_dict() + # Execute runs in parallel + # assuming the same number of tasks as workers (n_jobs), the total compute time for this + # statement will be similar to the slowest run + job_rvals = Parallel(verbose=0, n_jobs=n_jobs)( + delayed(_run_task_get_arffcontent_parallel_helper)( + extension=extension, + fold_no=fold_no, model=model, - task=task, - X_train=train_x, - y_train=train_y, rep_no=rep_no, - fold_no=fold_no, - X_test=test_x, + sample_no=sample_no, + task=task, + dataset_format=dataset_format, + configuration=_config, ) + for n_fit, rep_no, fold_no, sample_no in jobs + ) # job_rvals contain the output of all the runs with one-to-one correspondence with `jobs` + + for n_fit, rep_no, fold_no, sample_no in jobs: + pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold = job_rvals[ + n_fit - 1 + ] if trace is not None: traces.append(trace) @@ -502,7 +502,9 @@ def _calculate_local_measure(sklearn_fn, openml_name): for i, tst_idx in enumerate(test_indices): if task.class_labels is not None: prediction = ( - task.class_labels[pred_y[i]] if isinstance(pred_y[i], int) else pred_y[i] + task.class_labels[pred_y[i]] + if isinstance(pred_y[i], (int, np.integer)) + else pred_y[i] ) if isinstance(test_y, pd.Series): test_prediction = ( @@ -513,7 +515,7 @@ def _calculate_local_measure(sklearn_fn, openml_name): else: test_prediction = ( task.class_labels[test_y[i]] - if isinstance(test_y[i], int) + if isinstance(test_y[i], (int, np.integer)) else test_y[i] ) pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i] @@ -606,6 +608,75 @@ def _calculate_local_measure(sklearn_fn, openml_name): ) +def _run_task_get_arffcontent_parallel_helper( + extension: "Extension", + fold_no: int, + model: Any, + rep_no: int, + sample_no: int, + task: OpenMLTask, + dataset_format: str, + configuration: Dict = None, +) -> Tuple[ + np.ndarray, + Optional[pd.DataFrame], + np.ndarray, + Optional[pd.DataFrame], + Optional[OpenMLRunTrace], + "OrderedDict[str, float]", +]: + # Sets up the OpenML instantiated in the child process to match that of the parent's + # if configuration=None, loads the default + config._setup(configuration) + + train_indices, test_indices = task.get_train_test_split_indices( + repeat=rep_no, fold=fold_no, sample=sample_no + ) + + if isinstance(task, OpenMLSupervisedTask): + x, y = task.get_X_and_y(dataset_format=dataset_format) + if dataset_format == "dataframe": + train_x = x.iloc[train_indices] + train_y = y.iloc[train_indices] + test_x = x.iloc[test_indices] + test_y = y.iloc[test_indices] + else: + train_x = x[train_indices] + train_y = y[train_indices] + test_x = x[test_indices] + test_y = y[test_indices] + elif isinstance(task, OpenMLClusteringTask): + x = task.get_X(dataset_format=dataset_format) + if dataset_format == "dataframe": + train_x = x.iloc[train_indices] + else: + train_x = x[train_indices] + train_y = None + test_x = None + test_y = None + else: + raise NotImplementedError(task.task_type) + config.logger.info( + "Going to run model {} on dataset {} for repeat {} fold {} sample {}".format( + str(model), + openml.datasets.get_dataset(task.dataset_id).name, + rep_no, + fold_no, + sample_no, + ) + ) + pred_y, proba_y, user_defined_measures_fold, trace, = extension._run_model_on_fold( + model=model, + task=task, + X_train=train_x, + y_train=train_y, + rep_no=rep_no, + fold_no=fold_no, + X_test=test_x, + ) + return pred_y, proba_y, test_indices, test_y, trace, user_defined_measures_fold + + def get_runs(run_ids): 
"""Gets all runs in run_ids list. @@ -734,6 +805,9 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): flow_name = obtain_field(run, "oml:flow_name", from_server) setup_id = obtain_field(run, "oml:setup_id", from_server, cast=int) setup_string = obtain_field(run, "oml:setup_string", from_server) + # run_details is currently not sent by the server, so we need to retrieve it safely. + # whenever that's resolved, we can enforce it being present (OpenML#1087) + run_details = obtain_field(run, "oml:run_details", from_server=False) if "oml:input_data" in run: dataset_id = int(run["oml:input_data"]["oml:dataset"]["oml:did"]) @@ -756,6 +830,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): if "oml:output_data" not in run: if from_server: raise ValueError("Run does not contain output_data " "(OpenML server error?)") + predictions_url = None else: output_data = run["oml:output_data"] predictions_url = None @@ -840,6 +915,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): sample_evaluations=sample_evaluations, tags=tags, predictions_url=predictions_url, + run_details=run_details, ) diff --git a/openml/runs/run.py b/openml/runs/run.py index 0311272b2..4c1c9907d 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -57,7 +57,9 @@ class OpenMLRun(OpenMLBase): run_id: int description_text: str, optional Description text to add to the predictions file. - If left None, + If left None, is set to the time the arff file is generated. + run_details: str, optional (default=None) + Description of the run stored in the run meta-data. """ def __init__( @@ -86,6 +88,7 @@ def __init__( flow=None, run_id=None, description_text=None, + run_details=None, ): self.uploader = uploader self.uploader_name = uploader_name @@ -112,6 +115,7 @@ def __init__( self.tags = tags self.predictions_url = predictions_url self.description_text = description_text + self.run_details = run_details @property def id(self) -> Optional[int]: @@ -543,11 +547,15 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": description["oml:run"]["@xmlns:oml"] = "http://openml.org/openml" description["oml:run"]["oml:task_id"] = self.task_id description["oml:run"]["oml:flow_id"] = self.flow_id + if self.setup_string is not None: + description["oml:run"]["oml:setup_string"] = self.setup_string if self.error_message is not None: description["oml:run"]["oml:error_message"] = self.error_message + if self.run_details is not None: + description["oml:run"]["oml:run_details"] = self.run_details description["oml:run"]["oml:parameter_setting"] = self.parameter_settings if self.tags is not None: - description["oml:run"]["oml:tag"] = self.tags # Tags describing the run + description["oml:run"]["oml:tag"] = self.tags if (self.fold_evaluations is not None and len(self.fold_evaluations) > 0) or ( self.sample_evaluations is not None and len(self.sample_evaluations) > 0 ): diff --git a/openml/study/functions.py b/openml/study/functions.py index 632581022..ee877ddf2 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -58,7 +58,7 @@ def get_study( "of things have changed since then. Please use `get_suite('OpenML100')` instead." 
) warnings.warn(message, DeprecationWarning) - openml.config.logger.warn(message) + openml.config.logger.warning(message) study = _get_study(study_id, entity_type="task") return cast(OpenMLBenchmarkSuite, study) # type: ignore else: diff --git a/openml/testing.py b/openml/testing.py index da07b0ed7..f8e22bb4c 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -6,18 +6,13 @@ import shutil import sys import time -from typing import Dict +from typing import Dict, Union, cast import unittest -import warnings - -# Currently, importing oslo raises a lot of warning that it will stop working -# under python3.8; remove this once they disappear -with warnings.catch_warnings(): - warnings.simplefilter("ignore") - from oslo_concurrency import lockutils +import pandas as pd import openml from openml.tasks import TaskType +from openml.exceptions import OpenMLServerException import logging @@ -98,13 +93,6 @@ def setUp(self, n_levels: int = 1): openml.config.avoid_duplicate_runs = False openml.config.cache_directory = self.workdir - # If we're on travis, we save the api key in the config file to allow - # the notebook tests to read them. - if os.environ.get("TRAVIS") or os.environ.get("APPVEYOR"): - with lockutils.external_lock("config", lock_path=self.workdir): - with open(openml.config.config_file, "w") as fh: - fh.write("apikey = %s" % openml.config.apikey) - # Increase the number of retries to avoid spurious server failures self.connection_n_retries = openml.config.connection_n_retries openml.config.connection_n_retries = 10 @@ -252,6 +240,55 @@ def _check_fold_timing_evaluations( self.assertLessEqual(evaluation, max_val) + +def check_task_existence( + task_type: TaskType, dataset_id: int, target_name: str, **kwargs +) -> Union[int, None]: + """Checks if a task exists on the test server that matches the given meta-data.
+ + Parameters + ---------- + task_type : openml.tasks.TaskType + dataset_id : int + target_name : str + + Returns + ------- + int or None + """ + return_val = None + tasks = openml.tasks.list_tasks(task_type=task_type, output_format="dataframe") + if len(tasks) == 0: + return None + tasks = cast(pd.DataFrame, tasks).loc[tasks["did"] == dataset_id] + if len(tasks) == 0: + return None + tasks = tasks.loc[tasks["target_feature"] == target_name] + if len(tasks) == 0: + return None + task_match = [] + for task_id in tasks["tid"].to_list(): + task_match.append(task_id) + try: + task = openml.tasks.get_task(task_id) + except OpenMLServerException: + # can fail if the task was deleted by another concurrently running unit test + task_match.pop(-1) + return_val = None + continue + for k, v in kwargs.items(): + if getattr(task, k) != v: + # if any of the meta-data keys mismatches, then task_id is not a match + task_match.pop(-1) + break + # if task_id is still in the task_match list, it passed all meta-data key-value checks + if len(task_match) == 1: + return_val = task_id + break + if len(task_match) == 0: + return_val = None + return return_val + + try: from sklearn.impute import SimpleImputer except ImportError: @@ -267,12 +304,4 @@ class CustomImputer(SimpleImputer): pass -def cont(X): - return X.dtypes != "category" - - -def cat(X): - return X.dtypes == "category" - - -__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "cat", "cont"] +__all__ = ["TestBase", "SimpleImputer", "CustomImputer", "check_task_existence"] diff --git a/openml/utils.py b/openml/utils.py index a402564f9..a482bf0bc 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -9,6 +9,7 @@ from functools import wraps import collections +import openml import openml._api_calls import openml.exceptions from . import config @@ -243,7 +244,7 @@ def _list_all(listing_call, output_format="dict", *args, **filters): limit=batch_size, offset=current_offset, output_format=output_format, - **active_filters + **active_filters, ) except openml.exceptions.OpenMLServerNoResult: # we want to return an empty dict in this case @@ -276,9 +277,11 @@ def _create_cache_directory(key): cache = config.get_cache_directory() cache_dir = os.path.join(cache, key) try: - os.makedirs(cache_dir) - except OSError: - pass + os.makedirs(cache_dir, exist_ok=True) + except Exception as e: + raise openml.exceptions.OpenMLCacheException( + f"Cannot create cache directory {cache_dir}." + ) from e return cache_dir @@ -304,9 +307,9 @@ def _create_cache_directory_for_id(key, id_): Path of the created dataset cache directory. """ cache_dir = os.path.join(_create_cache_directory(key), str(id_)) - if os.path.exists(cache_dir) and os.path.isdir(cache_dir): + if os.path.isdir(cache_dir): pass - elif os.path.exists(cache_dir) and not os.path.isdir(cache_dir): + elif os.path.exists(cache_dir): raise ValueError("%s cache dir exists but is not a directory!"
% key) else: os.makedirs(cache_dir) diff --git a/setup.py b/setup.py index 9e9a093e4..dc1a58863 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ ) ) -with open(os.path.join("README.md")) as fid: +with open(os.path.join("README.md"), encoding="utf-8") as fid: README = fid.read() setuptools.setup( @@ -53,6 +53,8 @@ "pandas>=1.0.0", "scipy>=0.13.3", "numpy>=1.6.2", + "minio", + "pyarrow", ], extras_require={ "test": [ @@ -65,9 +67,9 @@ "nbformat", "oslo.concurrency", "flaky", - "pyarrow", "pre-commit", "pytest-cov", + "pytest-rerunfailures", "mypy", ], "examples": [ @@ -81,7 +83,8 @@ "ipykernel", "seaborn", ], - "examples_unix": ["fanova",], + "examples_unix": ["fanova"], + "docs": ["sphinx", "sphinx-gallery", "sphinx_bootstrap_theme", "numpydoc"], }, test_suite="pytest", classifiers=[ diff --git a/tests/conftest.py b/tests/conftest.py index 461a513fd..c1f728a72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,6 +25,7 @@ import os import logging from typing import List +import pytest import openml from openml.testing import TestBase @@ -34,16 +35,6 @@ logger.setLevel(logging.DEBUG) file_list = [] -directory = None - -# finding the root directory of conftest.py and going up to OpenML main directory -# exploiting the fact that conftest.py always resides in the root directory for tests -static_dir = os.path.dirname(os.path.abspath(__file__)) -logger.info("static directory: {}".format(static_dir)) -while True: - if "openml" in os.listdir(static_dir): - break - static_dir = os.path.join(static_dir, "..") def worker_id() -> str: @@ -65,12 +56,11 @@ def read_file_list() -> List[str]: :return: List[str] """ - directory = os.path.join(static_dir, "tests/files/") - if worker_id() == "master": - logger.info("Collecting file lists from: {}".format(directory)) - files = os.walk(directory) + this_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__))) + directory = os.path.join(this_dir, "..") + logger.info("Collecting file lists from: {}".format(directory)) file_list = [] - for root, _, filenames in files: + for root, _, filenames in os.walk(directory): for filename in filenames: file_list.append(os.path.join(root, filename)) return file_list @@ -125,7 +115,7 @@ def delete_remote_files(tracker) -> None: openml.utils._delete_entity(entity_type, entity) logger.info("Deleted ({}, {})".format(entity_type, entity)) except Exception as e: - logger.warn("Cannot delete ({},{}): {}".format(entity_type, entity, e)) + logger.warning("Cannot delete ({},{}): {}".format(entity_type, entity, e)) def pytest_sessionstart() -> None: @@ -182,3 +172,17 @@ def pytest_sessionfinish() -> None: logger.info("Local files deleted") logger.info("{} is killed".format(worker)) + + +def pytest_addoption(parser): + parser.addoption( + "--long", + action="store_true", + default=False, + help="Run the long version of tests which support both short and long scenarios.", + ) + + +@pytest.fixture(scope="class") +def long_version(request): + request.cls.long_version = request.config.getoption("--long") diff --git a/tests/files/org/openml/test/datasets/30/dataset.pq b/tests/files/org/openml/test/datasets/30/dataset.pq new file mode 100644 index 000000000..b35597281 Binary files /dev/null and b/tests/files/org/openml/test/datasets/30/dataset.pq differ diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 73dbfa133..416fce534 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1,7 +1,8 @@ # License: BSD 3-Clause +import os from time 
import time -from warnings import filterwarnings, catch_warnings +import unittest.mock import numpy as np import pandas as pd @@ -49,6 +50,17 @@ def test_init_string_validation(self): name="somename", description="a description", citation="Something by Müller" ) + def test__unpack_categories_with_nan_likes(self): + # unpack_categories decodes numeric categorical values according to the header + # Containing a 'non' category in the header shouldn't lead to failure. + categories = ["a", "b", None, float("nan"), np.nan] + series = pd.Series([0, 1, None, float("nan"), np.nan, 1, 0]) + clean_series = OpenMLDataset._unpack_categories(series, categories) + + expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"] + self.assertListEqual(list(clean_series.values), expected_values) + self.assertListEqual(list(clean_series.cat.categories.values), list("ab")) + def test_get_data_array(self): # Basic usage rval, _, categorical, attribute_names = self.dataset.get_data(dataset_format="array") @@ -72,13 +84,13 @@ def test_get_data_pandas(self): self.assertEqual(data.shape[1], len(self.titanic.features)) self.assertEqual(data.shape[0], 1309) col_dtype = { - "pclass": "float64", + "pclass": "uint8", "survived": "category", "name": "object", "sex": "category", "age": "float64", - "sibsp": "float64", - "parch": "float64", + "sibsp": "uint8", + "parch": "uint8", "ticket": "object", "fare": "float64", "cabin": "object", @@ -118,21 +130,29 @@ def test_get_data_no_str_data_for_nparrays(self): with pytest.raises(PyOpenMLError, match=err_msg): self.titanic.get_data(dataset_format="array") + def _check_expected_type(self, dtype, is_cat, col): + if is_cat: + expected_type = "category" + elif not col.isna().any() and (col.astype("uint8") == col).all(): + expected_type = "uint8" + else: + expected_type = "float64" + + self.assertEqual(dtype.name, expected_type) + def test_get_data_with_rowid(self): self.dataset.row_id_attribute = "condition" rval, _, categorical, _ = self.dataset.get_data(include_row_id=True) self.assertIsInstance(rval, pd.DataFrame) - for (dtype, is_cat) in zip(rval.dtypes, categorical): - expected_type = "category" if is_cat else "float64" - self.assertEqual(dtype.name, expected_type) + for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval): + self._check_expected_type(dtype, is_cat, rval[col]) self.assertEqual(rval.shape, (898, 39)) self.assertEqual(len(categorical), 39) rval, _, categorical, _ = self.dataset.get_data() self.assertIsInstance(rval, pd.DataFrame) - for (dtype, is_cat) in zip(rval.dtypes, categorical): - expected_type = "category" if is_cat else "float64" - self.assertEqual(dtype.name, expected_type) + for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval): + self._check_expected_type(dtype, is_cat, rval[col]) self.assertEqual(rval.shape, (898, 38)) self.assertEqual(len(categorical), 38) @@ -149,9 +169,8 @@ def test_get_data_with_target_array(self): def test_get_data_with_target_pandas(self): X, y, categorical, attribute_names = self.dataset.get_data(target="class") self.assertIsInstance(X, pd.DataFrame) - for (dtype, is_cat) in zip(X.dtypes, categorical): - expected_type = "category" if is_cat else "float64" - self.assertEqual(dtype.name, expected_type) + for (dtype, is_cat, col) in zip(X.dtypes, categorical, X): + self._check_expected_type(dtype, is_cat, X[col]) self.assertIsInstance(y, pd.Series) self.assertEqual(y.dtype.name, "category") @@ -174,27 +193,17 @@ def test_get_data_rowid_and_ignore_and_target(self): def 
test_get_data_with_ignore_attributes(self): self.dataset.ignore_attribute = ["condition"] rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True) - for (dtype, is_cat) in zip(rval.dtypes, categorical): - expected_type = "category" if is_cat else "float64" - self.assertEqual(dtype.name, expected_type) + for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval): + self._check_expected_type(dtype, is_cat, rval[col]) self.assertEqual(rval.shape, (898, 39)) self.assertEqual(len(categorical), 39) rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False) - for (dtype, is_cat) in zip(rval.dtypes, categorical): - expected_type = "category" if is_cat else "float64" - self.assertEqual(dtype.name, expected_type) + for (dtype, is_cat, col) in zip(rval.dtypes, categorical, rval): + self._check_expected_type(dtype, is_cat, rval[col]) self.assertEqual(rval.shape, (898, 38)) self.assertEqual(len(categorical), 38) - def test_dataset_format_constructor(self): - - with catch_warnings(): - filterwarnings("error") - self.assertRaises( - DeprecationWarning, openml.OpenMLDataset, "Test", "Test", format="arff" - ) - def test_get_data_with_nonexisting_class(self): # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to @@ -350,7 +359,48 @@ def test_get_sparse_categorical_data_id_395(self): self.assertEqual(len(feature.nominal_values), 25) -class OpenMLDatasetQualityTest(TestBase): +class OpenMLDatasetFunctionTest(TestBase): + @unittest.mock.patch("openml.datasets.dataset.pickle") + @unittest.mock.patch("openml.datasets.dataset._get_features_pickle_file") + def test__read_features(self, filename_mock, pickle_mock): + """Test we read the features from the xml if no cache pickle is available. + + This test also does some simple checks to verify that the features are read correctly""" + filename_mock.return_value = os.path.join(self.workdir, "features.xml.pkl") + pickle_mock.load.side_effect = FileNotFoundError + features = openml.datasets.dataset._read_features( + os.path.join( + self.static_cache_dir, "org", "openml", "test", "datasets", "2", "features.xml" + ) + ) + self.assertIsInstance(features, dict) + self.assertEqual(len(features), 39) + self.assertIsInstance(features[0], OpenMLDataFeature) + self.assertEqual(features[0].name, "family") + self.assertEqual(len(features[0].nominal_values), 9) + # pickle.load is never called because the features pickle file didn't exist + self.assertEqual(pickle_mock.load.call_count, 0) + self.assertEqual(pickle_mock.dump.call_count, 1) + + @unittest.mock.patch("openml.datasets.dataset.pickle") + @unittest.mock.patch("openml.datasets.dataset._get_qualities_pickle_file") + def test__read_qualities(self, filename_mock, pickle_mock): + """Test we read the qualities from the xml if no cache pickle is available. 
+ + This test also does some minor checks to ensure that the qualities are read correctly.""" + filename_mock.return_value = os.path.join(self.workdir, "qualities.xml.pkl") + pickle_mock.load.side_effect = FileNotFoundError + qualities = openml.datasets.dataset._read_qualities( + os.path.join( + self.static_cache_dir, "org", "openml", "test", "datasets", "2", "qualities.xml" + ) + ) + self.assertIsInstance(qualities, dict) + self.assertEqual(len(qualities), 106) + # pickle.load is never called because the qualities pickle file didn't exist + self.assertEqual(pickle_mock.load.call_count, 0) + self.assertEqual(pickle_mock.dump.call_count, 1) + def test__check_qualities(self): qualities = [{"oml:name": "a", "oml:value": "0.5"}] qualities = openml.datasets.dataset._check_qualities(qualities) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index c6e6f78f8..ec9dd6c53 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,9 +1,11 @@ # License: BSD 3-Clause import os +import pathlib import random from itertools import product from unittest import mock +import shutil import arff import time @@ -16,8 +18,8 @@ import openml from openml import OpenMLDataset +from openml._api_calls import _download_minio_file from openml.exceptions import ( - OpenMLCacheException, OpenMLHashException, OpenMLPrivateDatasetError, OpenMLServerException, @@ -27,19 +29,19 @@ from openml.datasets.functions import ( create_dataset, attributes_arff_from_df, - _get_cached_dataset, - _get_cached_dataset_features, - _get_cached_dataset_qualities, - _get_cached_datasets, _get_dataset_arff, _get_dataset_description, - _get_dataset_features, - _get_dataset_qualities, + _get_dataset_features_file, + _get_dataset_qualities_file, _get_online_dataset_arff, _get_online_dataset_format, DATASETS_CACHE_DIR_NAME, + _get_dataset_parquet, + _topic_add_dataset, + _topic_delete_dataset, ) from openml.datasets import fork_dataset, edit_dataset +from openml.tasks import TaskType, create_task class TestOpenMLDataset(TestBase): @@ -85,60 +87,6 @@ def _get_empty_param_for_dataset(self): "data": None, } - def test__list_cached_datasets(self): - openml.config.cache_directory = self.static_cache_dir - cached_datasets = openml.datasets.functions._list_cached_datasets() - self.assertIsInstance(cached_datasets, list) - self.assertEqual(len(cached_datasets), 2) - self.assertIsInstance(cached_datasets[0], int) - - @mock.patch("openml.datasets.functions._list_cached_datasets") - def test__get_cached_datasets(self, _list_cached_datasets_mock): - openml.config.cache_directory = self.static_cache_dir - _list_cached_datasets_mock.return_value = [-1, 2] - datasets = _get_cached_datasets() - self.assertIsInstance(datasets, dict) - self.assertEqual(len(datasets), 2) - self.assertIsInstance(list(datasets.values())[0], OpenMLDataset) - - def test__get_cached_dataset(self,): - openml.config.cache_directory = self.static_cache_dir - dataset = _get_cached_dataset(2) - features = _get_cached_dataset_features(2) - qualities = _get_cached_dataset_qualities(2) - self.assertIsInstance(dataset, OpenMLDataset) - self.assertTrue(len(dataset.features) > 0) - self.assertTrue(len(dataset.features) == len(features["oml:feature"])) - self.assertTrue(len(dataset.qualities) == len(qualities)) - - def test_get_cached_dataset_description(self): - openml.config.cache_directory = self.static_cache_dir - description = 
openml.datasets.functions._get_cached_dataset_description(2) - self.assertIsInstance(description, dict) - - def test_get_cached_dataset_description_not_cached(self): - openml.config.cache_directory = self.static_cache_dir - self.assertRaisesRegex( - OpenMLCacheException, - "Dataset description for dataset id 3 not cached", - openml.datasets.functions._get_cached_dataset_description, - dataset_id=3, - ) - - def test_get_cached_dataset_arff(self): - openml.config.cache_directory = self.static_cache_dir - description = openml.datasets.functions._get_cached_dataset_arff(dataset_id=2) - self.assertIsInstance(description, str) - - def test_get_cached_dataset_arff_not_cached(self): - openml.config.cache_directory = self.static_cache_dir - self.assertRaisesRegex( - OpenMLCacheException, - "ARFF file for dataset id 3 not cached", - openml.datasets.functions._get_cached_dataset_arff, - dataset_id=3, - ) - def _check_dataset(self, dataset): self.assertEqual(type(dataset), dict) self.assertGreaterEqual(len(dataset), 2) @@ -227,9 +175,10 @@ def test_list_datasets_empty(self): def test_check_datasets_active(self): # Have to test on live because there is no deactivated dataset on the test server. openml.config.server = self.production_server - active = openml.datasets.check_datasets_active([2, 17]) + active = openml.datasets.check_datasets_active([2, 17, 79], raise_error_if_not_exist=False,) self.assertTrue(active[2]) self.assertFalse(active[17]) + self.assertIsNone(active.get(79)) self.assertRaisesRegex( ValueError, "Could not find dataset 79 in OpenML dataset list.", @@ -369,6 +318,13 @@ def test_get_dataset_by_name(self): openml.config.server = self.production_server self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45) + def test_get_dataset_uint8_dtype(self): + dataset = openml.datasets.get_dataset(1) + self.assertEqual(type(dataset), OpenMLDataset) + self.assertEqual(dataset.name, "anneal") + df, _, _, _ = dataset.get_data() + self.assertEqual(df["carbon"].dtype, "uint8") + def test_get_dataset(self): # This is the only non-lazy load to ensure default behaviour works. 
dataset = openml.datasets.get_dataset(1) @@ -451,11 +407,99 @@ def test__get_dataset_description(self): def test__getarff_path_dataset_arff(self): openml.config.cache_directory = self.static_cache_dir - description = openml.datasets.functions._get_cached_dataset_description(2) + description = _get_dataset_description(self.workdir, 2) arff_path = _get_dataset_arff(description, cache_directory=self.workdir) self.assertIsInstance(arff_path, str) self.assertTrue(os.path.exists(arff_path)) + def test__download_minio_file_object_does_not_exist(self): + self.assertRaisesRegex( + FileNotFoundError, + r"Object at .* does not exist", + _download_minio_file, + source="http://openml1.win.tue.nl/dataset20/i_do_not_exist.pq", + destination=self.workdir, + exists_ok=True, + ) + + def test__download_minio_file_to_directory(self): + _download_minio_file( + source="http://openml1.win.tue.nl/dataset20/dataset_20.pq", + destination=self.workdir, + exists_ok=True, + ) + self.assertTrue( + os.path.isfile(os.path.join(self.workdir, "dataset_20.pq")), + "_download_minio_file can save to a folder by copying the object name", + ) + + def test__download_minio_file_to_path(self): + file_destination = os.path.join(self.workdir, "custom.pq") + _download_minio_file( + source="http://openml1.win.tue.nl/dataset20/dataset_20.pq", + destination=file_destination, + exists_ok=True, + ) + self.assertTrue( + os.path.isfile(file_destination), + "_download_minio_file can save to a folder by copying the object name", + ) + + def test__download_minio_file_raises_FileExists_if_destination_in_use(self): + file_destination = pathlib.Path(self.workdir, "custom.pq") + file_destination.touch() + + self.assertRaises( + FileExistsError, + _download_minio_file, + source="http://openml1.win.tue.nl/dataset20/dataset_20.pq", + destination=str(file_destination), + exists_ok=False, + ) + + def test__download_minio_file_works_with_bucket_subdirectory(self): + file_destination = pathlib.Path(self.workdir, "custom.csv") + _download_minio_file( + source="http://openml1.win.tue.nl/test/subdirectory/test.csv", + destination=file_destination, + exists_ok=True, + ) + self.assertTrue( + os.path.isfile(file_destination), + "_download_minio_file can download from subdirectories", + ) + + def test__get_dataset_parquet_not_cached(self): + description = { + "oml:minio_url": "http://openml1.win.tue.nl/dataset20/dataset_20.pq", + "oml:id": "20", + } + path = _get_dataset_parquet(description, cache_directory=self.workdir) + self.assertIsInstance(path, str, "_get_dataset_parquet returns a path") + self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file") + + @mock.patch("openml._api_calls._download_minio_file") + def test__get_dataset_parquet_is_cached(self, patch): + openml.config.cache_directory = self.static_cache_dir + patch.side_effect = RuntimeError( + "_download_minio_file should not be called when loading from cache" + ) + description = { + "oml:minio_url": "http://openml1.win.tue.nl/dataset30/dataset_30.pq", + "oml:id": "30", + } + path = _get_dataset_parquet(description, cache_directory=None) + self.assertIsInstance(path, str, "_get_dataset_parquet returns a path") + self.assertTrue(os.path.isfile(path), "_get_dataset_parquet returns path to real file") + + def test__get_dataset_parquet_file_does_not_exist(self): + description = { + "oml:minio_url": "http://openml1.win.tue.nl/dataset20/does_not_exist.pq", + "oml:id": "20", + } + path = _get_dataset_parquet(description, cache_directory=self.workdir) + self.assertIsNone(path, 
"_get_dataset_parquet returns None if no file is found") + def test__getarff_md5_issue(self): description = { "oml:id": 5, @@ -464,23 +508,27 @@ def test__getarff_md5_issue(self): } self.assertRaisesRegex( OpenMLHashException, - "Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded file " - "is unequal to the expected checksum abc. " - "Raised when downloading dataset 5.", + "Checksum of downloaded file is unequal to the expected checksum abc when downloading " + "https://www.openml.org/data/download/61. Raised when downloading dataset 5.", _get_dataset_arff, description, ) def test__get_dataset_features(self): - features = _get_dataset_features(self.workdir, 2) - self.assertIsInstance(features, dict) + features_file = _get_dataset_features_file(self.workdir, 2) + self.assertIsInstance(features_file, str) features_xml_path = os.path.join(self.workdir, "features.xml") self.assertTrue(os.path.exists(features_xml_path)) def test__get_dataset_qualities(self): - # Only a smoke check - qualities = _get_dataset_qualities(self.workdir, 2) - self.assertIsInstance(qualities, list) + qualities = _get_dataset_qualities_file(self.workdir, 2) + self.assertIsInstance(qualities, str) + qualities_xml_path = os.path.join(self.workdir, "qualities.xml") + self.assertTrue(os.path.exists(qualities_xml_path)) + + def test__get_dataset_skip_download(self): + qualities = openml.datasets.get_dataset(2, download_qualities=False).qualities + self.assertIsNone(qualities) def test_deletion_of_cache_dir(self): # Simple removal @@ -547,6 +595,7 @@ def test_upload_dataset_with_url(self): ) self.assertIsInstance(dataset.dataset_id, int) + @pytest.mark.flaky() def test_data_status(self): dataset = OpenMLDataset( "%s-UploadTestWithURL" % self._get_sentinel(), @@ -864,6 +913,24 @@ def test_get_online_dataset_arff(self): "ARFF files are not equal", ) + def test_topic_api_error(self): + # Check server exception when non-admin accessses apis + self.assertRaisesRegex( + OpenMLServerException, + "Topic can only be added/removed by admin.", + _topic_add_dataset, + data_id=31, + topic="business", + ) + # Check server exception when non-admin accessses apis + self.assertRaisesRegex( + OpenMLServerException, + "Topic can only be added/removed by admin.", + _topic_delete_dataset, + data_id=31, + topic="business", + ) + def test_get_online_dataset_format(self): # Phoneme dataset @@ -897,7 +964,6 @@ def test_create_dataset_pandas(self): collection_date = "01-01-2018" language = "English" licence = "MIT" - default_target_attribute = "play" citation = "None" original_data_url = "http://openml.github.io/openml-python" paper_url = "http://openml.github.io/openml-python" @@ -909,7 +975,7 @@ def test_create_dataset_pandas(self): collection_date=collection_date, language=language, licence=licence, - default_target_attribute=default_target_attribute, + default_target_attribute="play", row_id_attribute=None, ignore_attribute=None, citation=citation, @@ -944,7 +1010,7 @@ def test_create_dataset_pandas(self): collection_date=collection_date, language=language, licence=licence, - default_target_attribute=default_target_attribute, + default_target_attribute="y", row_id_attribute=None, ignore_attribute=None, citation=citation, @@ -980,7 +1046,7 @@ def test_create_dataset_pandas(self): collection_date=collection_date, language=language, licence=licence, - default_target_attribute=default_target_attribute, + default_target_attribute="rnd_str", row_id_attribute=None, ignore_attribute=None, citation=citation, @@ -1147,27 +1213,31 @@ def 
test_publish_fetch_ignore_attribute(self): # test if publish was successful self.assertIsInstance(dataset.id, int) + downloaded_dataset = self._wait_for_dataset_being_processed(dataset.id) + self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute) + + def _wait_for_dataset_being_processed(self, dataset_id): downloaded_dataset = None # fetching from server # loop till timeout or fetch not successful - max_waiting_time_seconds = 400 + max_waiting_time_seconds = 600 # time.time() works in seconds start_time = time.time() while time.time() - start_time < max_waiting_time_seconds: try: - downloaded_dataset = openml.datasets.get_dataset(dataset.id) + downloaded_dataset = openml.datasets.get_dataset(dataset_id) break except Exception as e: # returned code 273: Dataset not processed yet # returned code 362: No qualities found TestBase.logger.error( - "Failed to fetch dataset:{} with '{}'.".format(dataset.id, str(e)) + "Failed to fetch dataset:{} with '{}'.".format(dataset_id, str(e)) ) time.sleep(10) continue if downloaded_dataset is None: - raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset.id)) - self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute) + raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset_id)) + return downloaded_dataset def test_create_dataset_row_id_attribute_error(self): # meta-information @@ -1303,6 +1373,8 @@ def test_list_qualities(self): def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) + dataset.get_data() + self.assertEqual(type(dataset), OpenMLDataset) self.assertEqual(dataset.name, "anneal") self.assertGreater(len(dataset.features), 1) @@ -1317,6 +1389,7 @@ def test_get_dataset_cache_format_pickle(self): def test_get_dataset_cache_format_feather(self): dataset = openml.datasets.get_dataset(128, cache_format="feather") + dataset.get_data() # Check if dataset is written to cache directory using feather cache_dir = openml.config.get_cache_directory() @@ -1340,7 +1413,7 @@ def test_get_dataset_cache_format_feather(self): self.assertEqual(len(categorical), X.shape[1]) self.assertEqual(len(attribute_names), X.shape[1]) - def test_data_edit(self): + def test_data_edit_non_critical_field(self): # Case 1 # All users can edit non-critical fields of datasets desc = ( @@ -1361,14 +1434,31 @@ def test_data_edit(self): edited_dataset = openml.datasets.get_dataset(did) self.assertEqual(edited_dataset.description, desc) + def test_data_edit_critical_field(self): # Case 2 # only owners (or admin) can edit all critical fields of datasets - # this is a dataset created by CI, so it is editable by this test - did = 315 - result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2") + # for this, we need to first clone a dataset to do changes + did = fork_dataset(1) + self._wait_for_dataset_being_processed(did) + result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil") self.assertEqual(did, result) - edited_dataset = openml.datasets.get_dataset(did) - self.assertEqual(edited_dataset.ignore_attribute, ["col_2"]) + + n_tries = 10 + # we need to wait for the edit to be reflected on the server + for i in range(n_tries): + edited_dataset = openml.datasets.get_dataset(did) + try: + self.assertEqual(edited_dataset.default_target_attribute, "shape", edited_dataset) + self.assertEqual(edited_dataset.ignore_attribute, ["oil"], edited_dataset) + break + except AssertionError as e: + if i == n_tries - 1: + raise e + 
time.sleep(10) + # Delete the cache dir to get the newer version of the dataset + shutil.rmtree( + os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)) + ) def test_data_edit_errors(self): # Check server exception when no field to edit is provided @@ -1379,7 +1469,7 @@ def test_data_edit_errors(self): "original_data_url, default_target_attribute, row_id_attribute, " "ignore_attribute or paper_url to edit.", edit_dataset, - data_id=564, + data_id=64, # blood-transfusion-service-center ) # Check server exception when unknown dataset is provided self.assertRaisesRegex( @@ -1389,15 +1479,32 @@ def test_data_edit_errors(self): data_id=999999, description="xor operation dataset", ) + + # Need to own a dataset to be able to edit meta-data + # Will be creating a forked version of an existing dataset to allow the unit test user + # to edit meta-data of a dataset + did = fork_dataset(1) + self._wait_for_dataset_being_processed(did) + TestBase._mark_entity_for_removal("data", did) + # Need to upload a task attached to this data to test edit failure + task = create_task( + task_type=TaskType.SUPERVISED_CLASSIFICATION, + dataset_id=did, + target_name="class", + estimation_procedure_id=1, + ) + task = task.publish() + TestBase._mark_entity_for_removal("task", task.task_id) # Check server exception when owner/admin edits critical fields of dataset with tasks self.assertRaisesRegex( OpenMLServerException, "Critical features default_target_attribute, row_id_attribute and ignore_attribute " "can only be edited for datasets without any tasks.", edit_dataset, - data_id=223, + data_id=did, default_target_attribute="y", ) + # Check server exception when a non-owner or non-admin tries to edit critical fields self.assertRaisesRegex( OpenMLServerException, @@ -1416,3 +1523,124 @@ def test_data_fork(self): self.assertRaisesRegex( OpenMLServerException, "Unknown dataset", fork_dataset, data_id=999999, ) + + def test_get_dataset_parquet(self): + dataset = openml.datasets.get_dataset(20) + self.assertIsNotNone(dataset._minio_url) + self.assertIsNotNone(dataset.parquet_file) + self.assertTrue(os.path.isfile(dataset.parquet_file)) + + +@pytest.mark.parametrize( + "default_target_attribute,row_id_attribute,ignore_attribute", + [ + ("wrong", None, None), + (None, "wrong", None), + (None, None, "wrong"), + ("wrong,sunny", None, None), + (None, None, "wrong,sunny"), + (["wrong", "sunny"], None, None), + (None, None, ["wrong", "sunny"]), + ], +) +def test_invalid_attribute_validations( + default_target_attribute, row_id_attribute, ignore_attribute +): + data = [ + ["a", "sunny", 85.0, 85.0, "FALSE", "no"], + ["b", "sunny", 80.0, 90.0, "TRUE", "no"], + ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], + ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], + ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], + ] + column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + df = pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df["outlook"] = df["outlook"].astype("category") + df["windy"] = df["windy"].astype("bool") + df["play"] = df["play"].astype("category") + # meta-information + name = "pandas_testing_dataset" + description = "Synthetic dataset created from a Pandas DataFrame" + creator = "OpenML tester" + collection_date = "01-01-2018" + language = "English" + licence = "MIT" + citation = "None" + original_data_url = "http://openml.github.io/openml-python" + paper_url = "http://openml.github.io/openml-python" + with pytest.raises(ValueError, match="should be one of 
the data attribute"): + _ = openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=row_id_attribute, + ignore_attribute=ignore_attribute, + citation=citation, + attributes="auto", + data=df, + version_label="test", + original_data_url=original_data_url, + paper_url=paper_url, + ) + + +@pytest.mark.parametrize( + "default_target_attribute,row_id_attribute,ignore_attribute", + [ + ("outlook", None, None), + (None, "outlook", None), + (None, None, "outlook"), + ("outlook,windy", None, None), + (None, None, "outlook,windy"), + (["outlook", "windy"], None, None), + (None, None, ["outlook", "windy"]), + ], +) +def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): + data = [ + ["a", "sunny", 85.0, 85.0, "FALSE", "no"], + ["b", "sunny", 80.0, 90.0, "TRUE", "no"], + ["c", "overcast", 83.0, 86.0, "FALSE", "yes"], + ["d", "rainy", 70.0, 96.0, "FALSE", "yes"], + ["e", "rainy", 68.0, 80.0, "FALSE", "yes"], + ] + column_names = ["rnd_str", "outlook", "temperature", "humidity", "windy", "play"] + df = pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df["outlook"] = df["outlook"].astype("category") + df["windy"] = df["windy"].astype("bool") + df["play"] = df["play"].astype("category") + # meta-information + name = "pandas_testing_dataset" + description = "Synthetic dataset created from a Pandas DataFrame" + creator = "OpenML tester" + collection_date = "01-01-2018" + language = "English" + licence = "MIT" + citation = "None" + original_data_url = "http://openml.github.io/openml-python" + paper_url = "http://openml.github.io/openml-python" + _ = openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=row_id_attribute, + ignore_attribute=ignore_attribute, + citation=citation, + attributes="auto", + data=df, + version_label="test", + original_data_url=original_data_url, + paper_url=paper_url, + ) diff --git a/tests/test_evaluations/__init__.py b/tests/test_evaluations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index e4de9b03c..70f36ce19 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -1,10 +1,12 @@ # License: BSD 3-Clause +import pytest import openml import openml.evaluations from openml.testing import TestBase +@pytest.mark.usefixtures("long_version") class TestEvaluationFunctions(TestBase): _multiprocess_can_split_ = True @@ -27,6 +29,10 @@ def _check_list_evaluation_setups(self, **kwargs): # Check if output and order of list_evaluations is preserved self.assertSequenceEqual(evals_setups["run_id"].tolist(), evals["run_id"].tolist()) + + if not self.long_version: + evals_setups = evals_setups.head(1) + # Check if the hyper-parameter column is as accurate and flow_id for index, row in evals_setups.iterrows(): params = openml.runs.get_run(row["run_id"]).parameter_settings diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py 
b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index d34dc2ad3..c1f88bcda 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -40,7 +40,8 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont +from openml.testing import TestBase, SimpleImputer, CustomImputer +from openml.extensions.sklearn import cat, cont this_directory = os.path.dirname(os.path.abspath(__file__)) @@ -145,7 +146,7 @@ def test_serialize_model(self): fixture_short_name = "sklearn.DecisionTreeClassifier" # str obtained from self.extension._get_sklearn_description(model) fixture_description = "A decision tree classifier." - version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ + version_fixture = self.extension._min_dependency_str(sklearn.__version__) presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"' # min_impurity_decrease has been introduced in 0.20 @@ -188,6 +189,8 @@ def test_serialize_model(self): if LooseVersion(sklearn.__version__) >= "0.22": fixture_parameters.update({"ccp_alpha": "0.0"}) fixture_parameters.move_to_end("ccp_alpha", last=False) + if LooseVersion(sklearn.__version__) >= "0.24": + del fixture_parameters["presort"] structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []} @@ -224,7 +227,7 @@ def test_serialize_model_clustering(self): fixture_description = "K-Means clustering{}".format( "" if LooseVersion(sklearn.__version__) < "0.22" else "." ) - version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__ + version_fixture = self.extension._min_dependency_str(sklearn.__version__) n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"' @@ -1251,7 +1254,7 @@ def test_paralizable_check(self): # using this param distribution should raise an exception illegal_param_dist = {"base__n_jobs": [-1, 0, 1]} # using this param distribution should not raise an exception - legal_param_dist = {"base__max_depth": [2, 3, 4]} + legal_param_dist = {"n_estimators": [2, 3, 4]} legal_models = [ sklearn.ensemble.RandomForestClassifier(), @@ -1279,12 +1282,19 @@ def test_paralizable_check(self): can_measure_cputime_answers = [True, False, False, True, False, False, True, False, False] can_measure_walltime_answers = [True, True, False, True, True, False, True, True, False] + if LooseVersion(sklearn.__version__) < "0.20": + has_refit_time = [False, False, False, False, False, False, False, False, False] + else: + has_refit_time = [False, False, False, False, False, False, True, True, False] - for model, allowed_cputime, allowed_walltime in zip( - legal_models, can_measure_cputime_answers, can_measure_walltime_answers + X, y = sklearn.datasets.load_iris(return_X_y=True) + for model, allowed_cputime, allowed_walltime, refit_time in zip( + legal_models, can_measure_cputime_answers, can_measure_walltime_answers, has_refit_time ): self.assertEqual(self.extension._can_measure_cputime(model), allowed_cputime) self.assertEqual(self.extension._can_measure_wallclocktime(model), allowed_walltime) + model.fit(X, y) + self.assertEqual(refit_time, hasattr(model, "refit_time_")) for model in illegal_models: with 
self.assertRaises(PyOpenMLError): @@ -1316,12 +1326,18 @@ def test__get_fn_arguments_with_defaults(self): (sklearn.tree.DecisionTreeClassifier.__init__, 14), (sklearn.pipeline.Pipeline.__init__, 2), ] - else: + elif sklearn_version < "0.24": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 18), (sklearn.tree.DecisionTreeClassifier.__init__, 14), (sklearn.pipeline.Pipeline.__init__, 2), ] + else: + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 18), + (sklearn.tree.DecisionTreeClassifier.__init__, 13), + (sklearn.pipeline.Pipeline.__init__, 2), + ] for fn, num_params_with_defaults in fns: defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn) @@ -1464,7 +1480,7 @@ def test_openml_param_name_to_sklearn(self): ) model = sklearn.pipeline.Pipeline(steps=[("scaler", scaler), ("boosting", boosting)]) flow = self.extension.model_to_flow(model) - task = openml.tasks.get_task(115) + task = openml.tasks.get_task(115) # diabetes; crossvalidation run = openml.runs.run_flow_on_task(flow, task) run = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) @@ -1522,7 +1538,7 @@ def test_obtain_parameter_values(self): "bootstrap": [True, False], "criterion": ["gini", "entropy"], }, - cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1), + cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True), n_iter=5, ) flow = self.extension.model_to_flow(model) @@ -1560,7 +1576,7 @@ def setUp(self): # Test methods for performing runs with this extension module def test_run_model_on_task(self): - task = openml.tasks.get_task(1) + task = openml.tasks.get_task(1) # anneal; crossvalidation # using most_frequent imputer since dataset has mixed types and to keep things simple pipe = sklearn.pipeline.Pipeline( [ @@ -1625,7 +1641,7 @@ def test_seed_model_raises(self): self.extension.seed_model(model=clf, seed=42) def test_run_model_on_fold_classification_1_array(self): - task = openml.tasks.get_task(1) + task = openml.tasks.get_task(1) # anneal; crossvalidation X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) @@ -1688,7 +1704,7 @@ def test_run_model_on_fold_classification_1_array(self): def test_run_model_on_fold_classification_1_dataframe(self): from sklearn.compose import ColumnTransformer - task = openml.tasks.get_task(1) + task = openml.tasks.get_task(1) # anneal; crossvalidation # diff test_run_model_on_fold_classification_1_array() X, y = task.get_X_and_y(dataset_format="dataframe") @@ -1752,7 +1768,7 @@ def test_run_model_on_fold_classification_1_dataframe(self): ) def test_run_model_on_fold_classification_2(self): - task = openml.tasks.get_task(7) + task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation X, y = task.get_X_and_y() train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0) @@ -1814,7 +1830,11 @@ def predict_proba(*args, **kwargs): raise AttributeError("predict_proba is not available when " "probability=False") # task 1 (test server) is important: it is a task with an unused class - tasks = [1, 3, 115] + tasks = [ + 1, # anneal; crossvalidation + 3, # anneal; crossvalidation + 115, # diabetes; crossvalidation + ] flow = unittest.mock.Mock() flow.name = "dummy" @@ -1968,7 +1988,7 @@ def test__extract_trace_data(self): "max_iter": [10, 20, 40, 80], } num_iters = 10 - task = openml.tasks.get_task(20) + task = openml.tasks.get_task(20) # balance-scale; crossvalidation clf = 
sklearn.model_selection.RandomizedSearchCV( sklearn.neural_network.MLPClassifier(), param_grid, num_iters, ) @@ -2079,8 +2099,8 @@ def test_run_on_model_with_empty_steps(self): from sklearn.compose import ColumnTransformer # testing 'drop', 'passthrough', None as non-actionable sklearn estimators - dataset = openml.datasets.get_dataset(128) - task = openml.tasks.get_task(59) + dataset = openml.datasets.get_dataset(128) # iris + task = openml.tasks.get_task(59) # mfeat-pixel; crossvalidation X, y, categorical_ind, feature_names = dataset.get_data( target=dataset.default_target_attribute, dataset_format="array" @@ -2183,16 +2203,6 @@ def test_failed_serialization_of_custom_class(self): # for lower versions from sklearn.preprocessing import Imputer as SimpleImputer - class CustomImputer(SimpleImputer): - pass - - def cont(X): - return X.dtypes != "category" - - def cat(X): - return X.dtypes == "category" - - import sklearn.metrics import sklearn.tree from sklearn.pipeline import Pipeline, make_pipeline from sklearn.compose import ColumnTransformer @@ -2207,7 +2217,7 @@ def cat(X): steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] ) # build a sklearn classifier - task = openml.tasks.get_task(253) # data with mixed types from test server + task = openml.tasks.get_task(253) # profb; crossvalidation try: _ = openml.runs.run_model_on_task(clf, task) except AttributeError as e: @@ -2215,3 +2225,38 @@ def cat(X): raise AttributeError(e) else: raise Exception(e) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.20", + reason="columntransformer introduction in 0.20.0", + ) + def test_setupid_with_column_transformer(self): + """Test to check whether inclusion of a ColumnTransformer in a pipeline is treated as a new + flow each time. + """ + import sklearn.compose + from sklearn.svm import SVC + + def column_transformer_pipe(task_id): + task = openml.tasks.get_task(task_id) + # make columntransformer + preprocessor = sklearn.compose.ColumnTransformer( + transformers=[ + ("num", StandardScaler(), cont), + ("cat", OneHotEncoder(handle_unknown="ignore"), cat), + ] + ) + # make pipeline + clf = SVC(gamma="scale", random_state=1) + pipe = make_pipeline(preprocessor, clf) + # run task + run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) + run.publish() + new_run = openml.runs.get_run(run.run_id) + return new_run + + run1 = column_transformer_pipe(11) # only categorical + TestBase._mark_entity_for_removal("run", run1.run_id) + run2 = column_transformer_pipe(23) # only numeric + TestBase._mark_entity_for_removal("run", run2.run_id) + self.assertEqual(run1.setup_id, run2.setup_id) diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 12af05ffe..a65dcbf70 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -2,18 +2,22 @@ from collections import OrderedDict import copy +import functools import unittest +from unittest.mock import patch from distutils.version import LooseVersion import sklearn from sklearn import ensemble import pandas as pd +import pytest import openml from openml.testing import TestBase import openml.extensions.sklearn +@pytest.mark.usefixtures("long_version") class TestFlowFunctions(TestBase): _multiprocess_can_split_ = True @@ -321,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self): # Note that CI does not test against 0.19.1.
openml.config.server = self.production_server _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3] - flow = 8175 - expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied." + if sklearn_major > 23: + flow = 18587 # 18687, 18725 --- flows building random forest on >= 0.23 + flow_sklearn_version = "0.23.1" + else: + flow = 8175 + flow_sklearn_version = "0.19.1" + expected = ( + "Trying to deserialize a model with dependency " + "sklearn=={} not satisfied.".format(flow_sklearn_version) + ) self.assertRaisesRegex( ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True ) @@ -331,23 +343,34 @@ def test_get_flow_reinstantiate_model_wrong_version(self): flow = openml.flows.get_flow(flow_id=flow, reinstantiate=True, strict_version=False) # ensure that a new flow was created assert flow.flow_id is None - assert "0.19.1" not in flow.dependencies + assert "sklearn==0.19.1" not in flow.dependencies + assert "sklearn>=0.19.1" not in flow.dependencies def test_get_flow_id(self): - clf = sklearn.tree.DecisionTreeClassifier() - flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() - - self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id) - flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) - self.assertIn(flow.flow_id, flow_ids) - self.assertGreater(len(flow_ids), 2) - - # Check that the output of get_flow_id is identical if only the name is given, no matter - # whether exact_version is set to True or False. - flow_ids_exact_version_True = openml.flows.get_flow_id(name=flow.name, exact_version=True) - flow_ids_exact_version_False = openml.flows.get_flow_id( - name=flow.name, exact_version=False, - ) - self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False) - self.assertIn(flow.flow_id, flow_ids_exact_version_True) - self.assertGreater(len(flow_ids_exact_version_True), 2) + if self.long_version: + list_all = openml.utils._list_all + else: + list_all = functools.lru_cache()(openml.utils._list_all) + with patch("openml.utils._list_all", list_all): + clf = sklearn.tree.DecisionTreeClassifier() + flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() + TestBase._mark_entity_for_removal("flow", (flow.flow_id, flow.name)) + TestBase.logger.info( + "collected from {}: {}".format(__file__.split("/")[-1], flow.flow_id) + ) + + self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id) + flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) + self.assertIn(flow.flow_id, flow_ids) + self.assertGreater(len(flow_ids), 0) + + # Check that the output of get_flow_id is identical if only the name is given, no matter + # whether exact_version is set to True or False. 
+ flow_ids_exact_version_True = openml.flows.get_flow_id( + name=flow.name, exact_version=True + ) + flow_ids_exact_version_False = openml.flows.get_flow_id( + name=flow.name, exact_version=False, + ) + self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False) + self.assertIn(flow.flow_id, flow_ids_exact_version_True) diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index 8b470a45b..459a0cdf5 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -1,3 +1,5 @@ +import unittest.mock + import openml import openml.testing @@ -8,3 +10,23 @@ def test_too_long_uri(self): openml.exceptions.OpenMLServerError, "URI too long!", ): openml.datasets.list_datasets(data_id=list(range(10000))) + + @unittest.mock.patch("time.sleep") + @unittest.mock.patch("requests.Session") + def test_retry_on_database_error(self, Session_class_mock, _): + response_mock = unittest.mock.Mock() + response_mock.text = ( + "\n" + "107" + "Database connection error. " + "Usually due to high server load. " + "Please wait for N seconds and try again.\n" + "" + ) + Session_class_mock.return_value.__enter__.return_value.get.return_value = response_mock + with self.assertRaisesRegex( + openml.exceptions.OpenMLServerException, "/abc returned code 107" + ): + openml._api_calls._send_request("get", "/abc", {}) + + self.assertEqual(Session_class_mock.return_value.__enter__.return_value.get.call_count, 10) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 88136dbd9..5b15f781e 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -1,15 +1,59 @@ # License: BSD 3-Clause +import tempfile import os +import unittest.mock import openml.config import openml.testing class TestConfig(openml.testing.TestBase): - def test_config_loading(self): - self.assertTrue(os.path.exists(openml.config.config_file)) - self.assertTrue(os.path.isdir(os.path.expanduser("~/.openml"))) + @unittest.mock.patch("os.path.expanduser") + @unittest.mock.patch("openml.config.openml_logger.warning") + @unittest.mock.patch("openml.config._create_log_handlers") + @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") + def test_non_writable_home(self, log_handler_mock, warnings_mock, expanduser_mock): + with tempfile.TemporaryDirectory(dir=self.workdir) as td: + expanduser_mock.side_effect = ( + os.path.join(td, "openmldir"), + os.path.join(td, "cachedir"), + ) + os.chmod(td, 0o444) + openml.config._setup() + + self.assertEqual(warnings_mock.call_count, 2) + self.assertEqual(log_handler_mock.call_count, 1) + self.assertFalse(log_handler_mock.call_args_list[0][1]["create_file_handler"]) + + def test_get_config_as_dict(self): + """ Checks if the current configuration is returned accurately as a dict. """ + config = openml.config.get_config_as_dict() + _config = dict() + _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" + _config["server"] = "https://test.openml.org/api/v1/xml" + _config["cachedir"] = self.workdir + _config["avoid_duplicate_runs"] = False + _config["connection_n_retries"] = 10 + _config["max_retries"] = 20 + self.assertIsInstance(config, dict) + self.assertEqual(len(config), 6) + self.assertDictEqual(config, _config) + + def test_setup_with_config(self): + """ Checks if the OpenML configuration can be updated using _setup(). 
""" + _config = dict() + _config["apikey"] = "610344db6388d9ba34f6db45a3cf71de" + _config["server"] = "https://www.openml.org/api/v1/xml" + _config["cachedir"] = self.workdir + _config["avoid_duplicate_runs"] = True + _config["connection_n_retries"] = 100 + _config["max_retries"] = 1000 + orig_config = openml.config.get_config_as_dict() + openml.config._setup(_config) + updated_config = openml.config.get_config_as_dict() + openml.config._setup(orig_config) # important to not affect other unit tests + self.assertDictEqual(_config, updated_config) class TestConfigurationForExamples(openml.testing.TestBase): diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 864863f4a..dd0da5c00 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -5,11 +5,13 @@ import os from time import time +import xmltodict from sklearn.dummy import DummyClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline +from openml import OpenMLRun from openml.testing import TestBase, SimpleImputer import openml import openml.extensions.sklearn @@ -102,7 +104,7 @@ def test_to_from_filesystem_vanilla(self): ("classifier", DecisionTreeClassifier(max_depth=1)), ] ) - task = openml.tasks.get_task(119) + task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task( model=model, task=task, @@ -142,7 +144,7 @@ def test_to_from_filesystem_search(self): }, ) - task = openml.tasks.get_task(119) + task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task( model=model, task=task, add_local_measures=False, avoid_duplicate_runs=False, ) @@ -163,7 +165,7 @@ def test_to_from_filesystem_no_model(self): model = Pipeline( [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())] ) - task = openml.tasks.get_task(119) + task = openml.tasks.get_task(119) # diabetes; crossvalidation run = openml.runs.run_model_on_task(model=model, task=task, add_local_measures=False) cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) @@ -184,7 +186,7 @@ def test_publish_with_local_loaded_flow(self): model = Pipeline( [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())] ) - task = openml.tasks.get_task(119) + task = openml.tasks.get_task(119) # diabetes; crossvalidation # Make sure the flow does not exist on the server yet. flow = extension.model_to_flow(model) @@ -215,3 +217,19 @@ def test_publish_with_local_loaded_flow(self): # make sure the flow is published as part of publishing the run. self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version)) openml.runs.get_run(loaded_run.run_id) + + def test_run_setup_string_included_in_xml(self): + SETUP_STRING = "setup-string" + run = OpenMLRun( + task_id=0, + flow_id=None, # if not none, flow parameters are required. 
+ dataset_id=0, + setup_string=SETUP_STRING, + ) + xml = run._to_xml() + run_dict = xmltodict.parse(xml)["oml:run"] + assert "oml:setup_string" in run_dict + assert run_dict["oml:setup_string"] == SETUP_STRING + + recreated_run = openml.runs.functions._create_run_from_xml(xml, from_server=False) + assert recreated_run.setup_string == SETUP_STRING diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 89f01c72e..4534f26a4 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1,5 +1,4 @@ # License: BSD 3-Clause -from typing import Tuple, List, Union import arff from distutils.version import LooseVersion @@ -7,10 +6,12 @@ import random import time import sys +import ast import unittest.mock import numpy as np -import pytest +import joblib +from joblib import parallel_backend import openml import openml.exceptions @@ -21,10 +22,13 @@ import pandas as pd import openml.extensions.sklearn -from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont +from openml.testing import TestBase, SimpleImputer, CustomImputer +from openml.extensions.sklearn import cat, cont from openml.runs.functions import _run_task_get_arffcontent, run_exists, format_prediction from openml.runs.trace import OpenMLRunTrace from openml.tasks import TaskType +from openml.testing import check_task_existence +from openml.exceptions import OpenMLServerException from sklearn.naive_bayes import GaussianNB from sklearn.model_selection._search import BaseSearchCV @@ -42,19 +46,45 @@ class TestRun(TestBase): _multiprocess_can_split_ = True - # diabetis dataset, 768 observations, 0 missing vals, 33% holdout set - # (253 test obs), no nominal attributes, all numeric attributes - TEST_SERVER_TASK_SIMPLE: Tuple[Union[int, List], ...] = (119, 0, 253, [], [*range(8)]) - TEST_SERVER_TASK_REGRESSION: Tuple[Union[int, List], ...] 
= (738, 0, 718, [], [*range(8)]) - # credit-a dataset, 690 observations, 67 missing vals, 33% holdout set - # (227 test obs) - TEST_SERVER_TASK_MISSING_VALS = ( - 96, - 67, - 227, - [0, 3, 4, 5, 6, 8, 9, 11, 12], - [1, 2, 7, 10, 13, 14], - ) + TEST_SERVER_TASK_MISSING_VALS = { + "task_id": 96, + "n_missing_vals": 67, + "n_test_obs": 227, + "nominal_indices": [0, 3, 4, 5, 6, 8, 9, 11, 12], + "numeric_indices": [1, 2, 7, 10, 13, 14], + "task_meta_data": { + "task_type": TaskType.SUPERVISED_CLASSIFICATION, + "dataset_id": 16, # credit-a + "estimation_procedure_id": 1, + "target_name": "class", + }, + } + TEST_SERVER_TASK_SIMPLE = { + "task_id": 119, + "n_missing_vals": 0, + "n_test_obs": 253, + "nominal_indices": [], + "numeric_indices": [*range(8)], + "task_meta_data": { + "task_type": TaskType.SUPERVISED_CLASSIFICATION, + "dataset_id": 20, # diabetes + "estimation_procedure_id": 1, + "target_name": "class", + }, + } + TEST_SERVER_TASK_REGRESSION = { + "task_id": 1605, + "n_missing_vals": 0, + "n_test_obs": 2178, + "nominal_indices": [], + "numeric_indices": [*range(8)], + "task_meta_data": { + "task_type": TaskType.SUPERVISED_REGRESSION, + "dataset_id": 123, # quake + "estimation_procedure_id": 7, + "target_name": "richter", + }, + } # Suppress warnings to facilitate testing hide_warnings = True @@ -335,7 +365,7 @@ def _check_sample_evaluations( for sample in range(num_sample_entrees): evaluation = sample_evaluations[measure][rep][fold][sample] self.assertIsInstance(evaluation, float) - if not os.environ.get("CI_WINDOWS"): + if not (os.environ.get("CI_WINDOWS") or os.name == "nt"): # Either Appveyor is much faster than Travis # and/or measurements are not as accurate. # Either way, windows seems to get an eval-time @@ -344,7 +374,7 @@ def _check_sample_evaluations( self.assertLess(evaluation, max_time_allowed) def test_run_regression_on_classif_task(self): - task_id = 115 + task_id = 115 # diabetes; crossvalidation clf = LinearRegression() task = openml.tasks.get_task(task_id) @@ -358,7 +388,7 @@ def test_run_regression_on_classif_task(self): ) def test_check_erronous_sklearn_flow_fails(self): - task_id = 115 + task_id = 115 # diabetes; crossvalidation task = openml.tasks.get_task(task_id) # Invalid parameter values @@ -443,7 +473,7 @@ def determine_grid_size(param_grid): # suboptimal (slow), and not guaranteed to work if evaluation # engine is behind. # TODO: mock this? 
We have the arff already on the server - self._wait_for_processed_run(run.run_id, 400) + self._wait_for_processed_run(run.run_id, 600) try: model_prime = openml.runs.initialize_model_from_trace( run_id=run.run_id, repeat=0, fold=0, @@ -499,7 +529,7 @@ def _run_and_upload_classification( def _run_and_upload_regression( self, clf, task_id, n_missing_vals, n_test_obs, flow_expected_rsv, sentinel=None ): - num_folds = 1 # because of holdout + num_folds = 10 # because of cross-validation num_iterations = 5 # for base search algorithms metric = sklearn.metrics.mean_absolute_error # metric class metric_name = "mean_absolute_error" # openml metric name @@ -520,17 +550,39 @@ def _run_and_upload_regression( ) def test_run_and_upload_logistic_regression(self): - lr = LogisticRegression(solver="lbfgs") - task_id = self.TEST_SERVER_TASK_SIMPLE[0] - n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1] - n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2] + lr = LogisticRegression(solver="lbfgs", max_iter=1000) + task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] + n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] + n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501") def test_run_and_upload_linear_regression(self): lr = LinearRegression() - task_id = self.TEST_SERVER_TASK_REGRESSION[0] - n_missing_vals = self.TEST_SERVER_TASK_REGRESSION[1] - n_test_obs = self.TEST_SERVER_TASK_REGRESSION[2] + task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"] + + task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"] + _task_id = check_task_existence(**task_meta_data) + if _task_id is not None: + task_id = _task_id + else: + new_task = openml.tasks.create_task(**task_meta_data) + # publishes the new task + try: + new_task = new_task.publish() + task_id = new_task.task_id + except OpenMLServerException as e: + if e.code == 614: # Task already exists + # the exception message contains the task_id that was matched in the format + # 'Task already exists. 
- matched id(s): [xxxx]' + task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] + else: + raise Exception(repr(e)) + # mark to remove the uploaded task + TestBase._mark_entity_for_removal("task", task_id) + TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + + n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"] + n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"] self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501") def test_run_and_upload_pipeline_dummy_pipeline(self): @@ -541,9 +593,9 @@ def test_run_and_upload_pipeline_dummy_pipeline(self): ("dummy", DummyClassifier(strategy="prior")), ] ) - task_id = self.TEST_SERVER_TASK_SIMPLE[0] - n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1] - n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2] + task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] + n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] + n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501") @unittest.skipIf( @@ -584,20 +636,26 @@ def get_ct_cf(nominal_indices, numeric_indices): sentinel = self._get_sentinel() self._run_and_upload_classification( - get_ct_cf(self.TEST_SERVER_TASK_SIMPLE[3], self.TEST_SERVER_TASK_SIMPLE[4]), - self.TEST_SERVER_TASK_SIMPLE[0], - self.TEST_SERVER_TASK_SIMPLE[1], - self.TEST_SERVER_TASK_SIMPLE[2], + get_ct_cf( + self.TEST_SERVER_TASK_SIMPLE["nominal_indices"], + self.TEST_SERVER_TASK_SIMPLE["numeric_indices"], + ), + self.TEST_SERVER_TASK_SIMPLE["task_id"], + self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"], + self.TEST_SERVER_TASK_SIMPLE["n_test_obs"], "62501", sentinel=sentinel, ) # Due to #602, it is important to test this model on two tasks # with different column specifications self._run_and_upload_classification( - get_ct_cf(self.TEST_SERVER_TASK_MISSING_VALS[3], self.TEST_SERVER_TASK_MISSING_VALS[4]), - self.TEST_SERVER_TASK_MISSING_VALS[0], - self.TEST_SERVER_TASK_MISSING_VALS[1], - self.TEST_SERVER_TASK_MISSING_VALS[2], + get_ct_cf( + self.TEST_SERVER_TASK_MISSING_VALS["nominal_indices"], + self.TEST_SERVER_TASK_MISSING_VALS["numeric_indices"], + ), + self.TEST_SERVER_TASK_MISSING_VALS["task_id"], + self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"], + self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"], "62501", sentinel=sentinel, ) @@ -606,7 +664,8 @@ def get_ct_cf(nominal_indices, numeric_indices): LooseVersion(sklearn.__version__) < "0.20", reason="columntransformer introduction in 0.20.0", ) - def test_run_and_upload_knn_pipeline(self): + @unittest.mock.patch("warnings.warn") + def test_run_and_upload_knn_pipeline(self, warnings_mock): cat_imp = make_pipeline( SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") @@ -632,19 +691,34 @@ def test_run_and_upload_knn_pipeline(self): ] ) - task_id = self.TEST_SERVER_TASK_MISSING_VALS[0] - n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS[1] - n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS[2] + task_id = self.TEST_SERVER_TASK_MISSING_VALS["task_id"] + n_missing_vals = self.TEST_SERVER_TASK_MISSING_VALS["n_missing_vals"] + n_test_obs = self.TEST_SERVER_TASK_MISSING_VALS["n_test_obs"] self._run_and_upload_classification(pipeline2, task_id, n_missing_vals, n_test_obs, "62501") + # The warning raised is: + # "The total space of parameters 8 is smaller than n_iter=10. + # Running 8 iterations. For exhaustive searches, use GridSearchCV." 
+ # It is raised three times because we once run the model to upload something and then run + # it again twice to compare that the predictions are reproducible. + warning_msg = ( + "The total space of parameters 8 is smaller than n_iter=10. " + "Running 8 iterations. For exhaustive searches, use GridSearchCV." + ) + call_count = 0 + for _warnings in warnings_mock.call_args_list: + if _warnings[0][0] == warning_msg: + call_count += 1 + self.assertEqual(call_count, 3) def test_run_and_upload_gridsearch(self): gridsearch = GridSearchCV( BaggingClassifier(base_estimator=SVC()), {"base_estimator__C": [0.01, 0.1, 10], "base_estimator__gamma": [0.01, 0.1, 10]}, + cv=3, ) - task_id = self.TEST_SERVER_TASK_SIMPLE[0] - n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1] - n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2] + task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] + n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] + n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] run = self._run_and_upload_classification( clf=gridsearch, task_id=task_id, @@ -671,9 +745,9 @@ def test_run_and_upload_randomsearch(self): # The random states for the RandomizedSearchCV is set after the # random state of the RandomForestClassifier is set, therefore, # it has a different value than the other examples before - task_id = self.TEST_SERVER_TASK_SIMPLE[0] - n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1] - n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2] + task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] + n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] + n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] run = self._run_and_upload_classification( clf=randomsearch, task_id=task_id, @@ -682,6 +756,8 @@ def test_run_and_upload_randomsearch(self): flow_expected_rsv="12172", ) self.assertEqual(len(run.trace.trace_iterations), 5) + trace = openml.runs.get_run_trace(run.run_id) + self.assertEqual(len(trace.trace_iterations), 5) def test_run_and_upload_maskedarrays(self): # This testcase is important for 2 reasons: @@ -696,9 +772,9 @@ def test_run_and_upload_maskedarrays(self): # The random states for the GridSearchCV is set after the # random state of the RandomForestClassifier is set, therefore, # it has a different value than the other examples before - task_id = self.TEST_SERVER_TASK_SIMPLE[0] - n_missing_vals = self.TEST_SERVER_TASK_SIMPLE[1] - n_test_obs = self.TEST_SERVER_TASK_SIMPLE[2] + task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] + n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"] + n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification( gridsearch, task_id, n_missing_vals, n_test_obs, "12172" ) @@ -782,7 +858,7 @@ def test_initialize_cv_from_run(self): ] ) - task = openml.tasks.get_task(11) + task = openml.tasks.get_task(11) # kr-vs-kp; holdout run = openml.runs.run_model_on_task( model=randomsearch, task=task, avoid_duplicate_runs=False, seed=1, ) @@ -828,31 +904,12 @@ def _test_local_evaluations(self, run): self.assertGreaterEqual(alt_scores[idx], 0) self.assertLessEqual(alt_scores[idx], 1) - @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", - reason="SimpleImputer doesn't handle mixed type DataFrame as input", - ) def test_local_run_swapped_parameter_order_model(self): + clf = DecisionTreeClassifier() + australian_task = 595 # Australian; crossvalidation + task = openml.tasks.get_task(australian_task) - # construct sci-kit learn classifier - clf = Pipeline( - steps=[ - ( - "imputer", - make_pipeline( - 
SimpleImputer(strategy="most_frequent"), - OneHotEncoder(handle_unknown="ignore"), - ), - ), - # random forest doesn't take categoricals - ("estimator", RandomForestClassifier()), - ] - ) - - # download task - task = openml.tasks.get_task(7) - - # invoke OpenML run + # task and clf are purposely in the old order run = openml.runs.run_model_on_task( task, clf, avoid_duplicate_runs=False, upload_flow=False, ) @@ -876,7 +933,7 @@ def test_local_run_swapped_parameter_order_flow(self): flow = self.extension.model_to_flow(clf) # download task - task = openml.tasks.get_task(7) + task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation # invoke OpenML run run = openml.runs.run_flow_on_task( @@ -901,7 +958,7 @@ def test_local_run_metric_score(self): ) # download task - task = openml.tasks.get_task(7) + task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation # invoke OpenML run run = openml.runs.run_model_on_task( @@ -931,7 +988,33 @@ def test_initialize_model_from_run(self): ("Estimator", GaussianNB()), ] ) - task = openml.tasks.get_task(1198) + task_meta_data = { + "task_type": TaskType.SUPERVISED_CLASSIFICATION, + "dataset_id": 128, # iris + "estimation_procedure_id": 1, + "target_name": "class", + } + _task_id = check_task_existence(**task_meta_data) + if _task_id is not None: + task_id = _task_id + else: + new_task = openml.tasks.create_task(**task_meta_data) + # publishes the new task + try: + new_task = new_task.publish() + task_id = new_task.task_id + except OpenMLServerException as e: + if e.code == 614: # Task already exists + # the exception message contains the task_id that was matched in the format + # 'Task already exists. - matched id(s): [xxxx]' + task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] + else: + raise Exception(repr(e)) + # mark to remove the uploaded task + TestBase._mark_entity_for_removal("task", task_id) + TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + + task = openml.tasks.get_task(task_id) run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=False,) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) @@ -950,55 +1033,6 @@ def test_initialize_model_from_run(self): self.assertEqual(flowS.components["Imputer"].parameters["strategy"], '"most_frequent"') self.assertEqual(flowS.components["VarianceThreshold"].parameters["threshold"], "0.05") - @pytest.mark.flaky() - def test_get_run_trace(self): - # get_run_trace is already tested implicitly in test_run_and_publish - # this test is a bit additional. - num_iterations = 10 - num_folds = 1 - task_id = 119 - - task = openml.tasks.get_task(task_id) - - # IMPORTANT! Do not sentinel this flow. 
is faster if we don't wait - # on openml server - clf = RandomizedSearchCV( - RandomForestClassifier(random_state=42, n_estimators=5), - { - "max_depth": [3, None], - "max_features": [1, 2, 3, 4], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"], - }, - num_iterations, - random_state=42, - cv=3, - ) - - # [SPEED] make unit test faster by exploiting run information - # from the past - try: - # in case the run did not exists yet - run = openml.runs.run_model_on_task(model=clf, task=task, avoid_duplicate_runs=True,) - - self.assertEqual( - len(run.trace.trace_iterations), num_iterations * num_folds, - ) - run = run.publish() - TestBase._mark_entity_for_removal("run", run.run_id) - TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) - self._wait_for_processed_run(run.run_id, 400) - run_id = run.run_id - except openml.exceptions.OpenMLRunsExistError as e: - # The only error we expect, should fail otherwise. - run_ids = [int(run_id) for run_id in e.run_ids] - self.assertGreater(len(run_ids), 0) - run_id = random.choice(list(run_ids)) - - # now the actual unit test ... - run_trace = openml.runs.get_run_trace(run_id) - self.assertEqual(len(run_trace.trace_iterations), num_iterations * num_folds) - @unittest.skipIf( LooseVersion(sklearn.__version__) < "0.20", reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) @@ -1025,7 +1059,7 @@ def test__run_exists(self): ), ] - task = openml.tasks.get_task(115) + task = openml.tasks.get_task(115) # diabetes; crossvalidation for clf in clfs: try: @@ -1055,8 +1089,8 @@ def test__run_exists(self): def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a # non-existing flow - task = openml.tasks.get_task(115) + task = openml.tasks.get_task(115) # diabetes; crossvalidation clf = DecisionTreeClassifier() flow = self.extension.model_to_flow(clf) flow, _ = self._add_sentinel_to_flow_name(flow, None) @@ -1072,7 +1106,7 @@ def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also # caught if the run is stored to and loaded from disk first. - task = openml.tasks.get_task(115) + task = openml.tasks.get_task(115) # diabetes; crossvalidation clf = DecisionTreeClassifier() flow = self.extension.model_to_flow(clf) flow, _ = self._add_sentinel_to_flow_name(flow, None) @@ -1096,7 +1130,7 @@ def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing # flow. Comes to a different value error than the previous test - task = openml.tasks.get_task(115) + task = openml.tasks.get_task(115) # diabetes; crossvalidation clf = DecisionTreeClassifier() flow_orig = self.extension.model_to_flow(clf) try: @@ -1118,7 +1152,7 @@ def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is # also caught if the run is stored to and loaded from disk first.
- task = openml.tasks.get_task(115) + task = openml.tasks.get_task(115) # diabetes; crossvalidation clf = DecisionTreeClassifier() flow_orig = self.extension.model_to_flow(clf) try: @@ -1149,18 +1183,15 @@ def test_run_with_illegal_flow_id_1_after_load(self): reason="OneHotEncoder cannot handle mixed type DataFrame as input", ) def test__run_task_get_arffcontent(self): - task = openml.tasks.get_task(7) + task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation num_instances = 3196 num_folds = 10 num_repeats = 1 - flow = unittest.mock.Mock() - flow.name = "dummy" clf = make_pipeline( OneHotEncoder(handle_unknown="ignore"), SGDClassifier(loss="log", random_state=1) ) res = openml.runs.functions._run_task_get_arffcontent( - flow=flow, extension=self.extension, model=clf, task=task, @@ -1371,9 +1402,7 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the # actual data - flow = unittest.mock.Mock() - flow.name = "dummy" - task = openml.tasks.get_task(2) + task = openml.tasks.get_task(2) # anneal; crossvalidation from sklearn.compose import ColumnTransformer @@ -1387,7 +1416,6 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( - flow=flow, model=model, task=task, extension=self.extension, @@ -1409,9 +1437,7 @@ def test_run_on_dataset_with_missing_labels_array(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the # actual data - flow = unittest.mock.Mock() - flow.name = "dummy" - task = openml.tasks.get_task(2) + task = openml.tasks.get_task(2) # anneal; crossvalidation # task_id=2 on test server has 38 columns with 6 numeric columns cont_idx = [3, 4, 8, 32, 33, 34] cat_idx = list(set(np.arange(38)) - set(cont_idx)) @@ -1432,7 +1458,6 @@ def test_run_on_dataset_with_missing_labels_array(self): ) # build a sklearn classifier data_content, _, _, _ = _run_task_get_arffcontent( - flow=flow, model=model, task=task, extension=self.extension, @@ -1463,7 +1488,7 @@ def test_run_flow_on_task_downloaded_flow(self): TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) downloaded_flow = openml.flows.get_flow(flow.flow_id) - task = openml.tasks.get_task(119) # diabetes + task = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE["task_id"]) run = openml.runs.run_flow_on_task( flow=downloaded_flow, task=task, avoid_duplicate_runs=False, upload_flow=False, ) @@ -1483,20 +1508,26 @@ def test_format_prediction_non_supervised(self): format_prediction(clustering, *ignored_input) def test_format_prediction_classification_no_probabilities(self): - classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + classification = openml.tasks.get_task( + self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + ) ignored_input = [0] * 5 with self.assertRaisesRegex(ValueError, "`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) def test_format_prediction_classification_incomplete_probabilities(self): - classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + classification = openml.tasks.get_task( + self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + ) ignored_input = [0] * 5 incomplete_probabilities = {c: 0.2 for c in 
classification.class_labels[1:]} with self.assertRaisesRegex(ValueError, "Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) def test_format_prediction_task_without_classlabels_set(self): - classification = openml.tasks.get_task(self.TEST_SERVER_TASK_SIMPLE[0], download_data=False) + classification = openml.tasks.get_task( + self.TEST_SERVER_TASK_SIMPLE["task_id"], download_data=False + ) classification.class_labels = None ignored_input = [0] * 5 with self.assertRaisesRegex( @@ -1505,14 +1536,146 @@ def test_format_prediction_task_without_classlabels_set(self): format_prediction(classification, *ignored_input, proba={}) def test_format_prediction_task_learning_curve_sample_not_set(self): - learning_curve = openml.tasks.get_task(801, download_data=False) + learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} ignored_input = [0] * 5 with self.assertRaisesRegex(ValueError, "`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) def test_format_prediction_task_regression(self): - regression = openml.tasks.get_task(self.TEST_SERVER_TASK_REGRESSION[0], download_data=False) + task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"] + _task_id = check_task_existence(**task_meta_data) + if _task_id is not None: + task_id = _task_id + else: + new_task = openml.tasks.create_task(**task_meta_data) + # publishes the new task + try: + new_task = new_task.publish() + task_id = new_task.task_id + except OpenMLServerException as e: + if e.code == 614: # Task already exists + # the exception message contains the task_id that was matched in the format + # 'Task already exists. - matched id(s): [xxxx]' + task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] + else: + raise Exception(repr(e)) + # mark to remove the uploaded task + TestBase._mark_entity_for_removal("task", task_id) + TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + + regression = openml.tasks.get_task(task_id, download_data=False) ignored_input = [0] * 5 res = format_prediction(regression, *ignored_input) self.assertListEqual(res, [0] * 5) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.21", + reason="couldn't perform local tests successfully w/o bloating RAM", + ) + @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") + def test__run_task_get_arffcontent_2(self, parallel_mock): + """ Tests if a run executed in parallel is collated correctly. """ + task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp + x, y = task.get_X_and_y(dataset_format="dataframe") + num_instances = x.shape[0] + line_length = 6 + len(task.class_labels) + clf = SGDClassifier(loss="log", random_state=1) + n_jobs = 2 + backend = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing" + with parallel_backend(backend, n_jobs=n_jobs): + res = openml.runs.functions._run_task_get_arffcontent( + extension=self.extension, + model=clf, + task=task, + add_local_measures=True, + dataset_format="array", # "dataframe" would require handling of categoricals + n_jobs=n_jobs, + ) + # This unit test will fail if joblib is unable to distribute successfully since the + # function _run_model_on_fold is being mocked out. 
However, for a new spawned worker, it + # is not and the mock call_count should remain 0 while the subsequent check of actual + # results should also hold, only on successful distribution of tasks to workers. + # The _prevent_optimize_n_jobs() is a function executed within the _run_model_on_fold() + # block and mocking this function doesn't affect rest of the pipeline, but is adequately + # indicative if _run_model_on_fold() is being called or not. + self.assertEqual(parallel_mock.call_count, 0) + self.assertIsInstance(res[0], list) + self.assertEqual(len(res[0]), num_instances) + self.assertEqual(len(res[0][0]), line_length) + self.assertEqual(len(res[2]), 7) + self.assertEqual(len(res[3]), 7) + expected_scores = [ + 0.965625, + 0.94375, + 0.946875, + 0.953125, + 0.96875, + 0.965625, + 0.9435736677115988, + 0.9467084639498433, + 0.9749216300940439, + 0.9655172413793104, + ] + scores = [v for k, v in res[2]["predictive_accuracy"][0].items()] + np.testing.assert_array_almost_equal( + scores, expected_scores, decimal=2 if os.name == "nt" else 7 + ) + + @unittest.skipIf( + LooseVersion(sklearn.__version__) < "0.21", + reason="couldn't perform local tests successfully w/o bloating RAM", + ) + @unittest.mock.patch("openml.extensions.sklearn.SklearnExtension._prevent_optimize_n_jobs") + def test_joblib_backends(self, parallel_mock): + """ Tests evaluation of a run using various joblib backends and n_jobs. """ + task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp + x, y = task.get_X_and_y(dataset_format="dataframe") + num_instances = x.shape[0] + line_length = 6 + len(task.class_labels) + + backend_choice = "loky" if LooseVersion(joblib.__version__) > "0.11" else "multiprocessing" + for n_jobs, backend, len_time_stats, call_count in [ + (1, backend_choice, 7, 10), + (2, backend_choice, 4, 10), + (-1, backend_choice, 1, 10), + (1, "threading", 7, 20), + (-1, "threading", 1, 30), + (1, "sequential", 7, 40), + ]: + clf = sklearn.model_selection.RandomizedSearchCV( + estimator=sklearn.ensemble.RandomForestClassifier(n_estimators=5), + param_distributions={ + "max_depth": [3, None], + "max_features": [1, 2, 3, 4], + "min_samples_split": [2, 3, 4, 5, 6, 7, 8, 9, 10], + "min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "bootstrap": [True, False], + "criterion": ["gini", "entropy"], + }, + random_state=1, + cv=sklearn.model_selection.StratifiedKFold( + n_splits=2, shuffle=True, random_state=1 + ), + n_iter=5, + n_jobs=n_jobs, + ) + with parallel_backend(backend, n_jobs=n_jobs): + res = openml.runs.functions._run_task_get_arffcontent( + extension=self.extension, + model=clf, + task=task, + add_local_measures=True, + dataset_format="array", # "dataframe" would require handling of categoricals + n_jobs=n_jobs, + ) + self.assertEqual(type(res[0]), list) + self.assertEqual(len(res[0]), num_instances) + self.assertEqual(len(res[0][0]), line_length) + # usercpu_time_millis_* not recorded when n_jobs > 1 + # *_time_millis_* not recorded when n_jobs = -1 + self.assertEqual(len(res[2]), len_time_stats) + self.assertEqual(len(res[3]), len_time_stats) + self.assertEqual(len(res[2]["predictive_accuracy"][0]), 10) + self.assertEqual(len(res[3]["predictive_accuracy"][0]), 10) + self.assertEqual(parallel_mock.call_count, call_count) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index e89318728..538b08821 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -67,7 +67,7 @@ def 
_existing_setup_exists(self, classif): self.assertFalse(setup_id) # now run the flow on an easy task: - task = openml.tasks.get_task(115) # diabetes + task = openml.tasks.get_task(115) # diabetes; crossvalidation run = openml.runs.run_flow_on_task(flow, task) # spoof flow id, otherwise the sentinel is ignored run.flow_id = flow.flow_id diff --git a/tests/test_study/__init__.py b/tests/test_study/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index fdb2747ec..682359a61 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,6 +1,7 @@ # License: BSD 3-Clause -from openml.testing import TestBase, SimpleImputer, CustomImputer, cat, cont +from openml.testing import TestBase +from openml.extensions.sklearn import cat, cont import sklearn import unittest @@ -12,8 +13,8 @@ class TestStudyFunctions(TestBase): """Test the example code of Bischl et al. (2018)""" @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", - reason="columntransformer introduction in 0.20.0", + LooseVersion(sklearn.__version__) < "0.24", + reason="columntransformer introduction in 0.24.0", ) def test_Figure1a(self): """Test listing in Figure 1a on a single task and the old OpenML100 study. @@ -38,15 +39,14 @@ def test_Figure1a(self): import openml import sklearn.metrics import sklearn.tree + from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline, make_pipeline from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder, StandardScaler benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite - cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") - ) - cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + cat_imp = OneHotEncoder(handle_unknown="ignore") + cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) clf = Pipeline( steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())] diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 993771c90..e028ba2bd 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -4,6 +4,7 @@ import openml.study from openml.testing import TestBase import pandas as pd +import pytest class TestStudyFunctions(TestBase): @@ -113,6 +114,7 @@ def test_publish_benchmark_suite(self): self.assertEqual(study_downloaded.status, "deactivated") # can't delete study, now it's not longer in preparation + @pytest.mark.flaky() def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) @@ -133,8 +135,8 @@ def test_publish_study(self): run_ids=list(run_list.keys()), ) study.publish() - # not tracking upload for delete since _delete_entity called end of function - # asserting return status from openml.study.delete_study() + TestBase._mark_entity_for_removal("study", study.id) + TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], study.id)) self.assertGreater(study.id, 0) study_downloaded = openml.study.get_study(study.id) self.assertEqual(study_downloaded.alias, fixt_alias) @@ -213,9 +215,8 @@ def test_study_attach_illegal(self): def test_study_list(self): study_list = 
openml.study.list_studies(status="in_preparation") # might fail if server is recently resetted - self.assertGreater(len(study_list), 2) + self.assertGreaterEqual(len(study_list), 2) def test_study_list_output_format(self): study_list = openml.study.list_studies(status="in_preparation", output_format="dataframe") self.assertIsInstance(study_list, pd.DataFrame) - self.assertGreater(len(study_list), 2) diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 4f03f8bff..c4f74c5ce 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -13,7 +13,7 @@ class OpenMLClassificationTaskTest(OpenMLSupervisedTaskTest): def setUp(self, n_levels: int = 1): super(OpenMLClassificationTaskTest, self).setUp() - self.task_id = 119 + self.task_id = 119 # diabetes self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 1 diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 9f0157187..b1422d308 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -13,7 +13,7 @@ class OpenMLLearningCurveTaskTest(OpenMLSupervisedTaskTest): def setUp(self, n_levels: int = 1): super(OpenMLLearningCurveTaskTest, self).setUp() - self.task_id = 801 + self.task_id = 801 # diabetes self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index e751e63b5..c38d8fa91 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -1,8 +1,13 @@ # License: BSD 3-Clause +import ast import numpy as np +import openml from openml.tasks import TaskType +from openml.testing import TestBase +from openml.testing import check_task_existence +from openml.exceptions import OpenMLServerException from .test_supervised_task import OpenMLSupervisedTaskTest @@ -11,9 +16,34 @@ class OpenMLRegressionTaskTest(OpenMLSupervisedTaskTest): __test__ = True def setUp(self, n_levels: int = 1): - super(OpenMLRegressionTaskTest, self).setUp() - self.task_id = 625 + + task_meta_data = { + "task_type": TaskType.SUPERVISED_REGRESSION, + "dataset_id": 105, # wisconsin + "estimation_procedure_id": 7, + "target_name": "time", + } + _task_id = check_task_existence(**task_meta_data) + if _task_id is not None: + task_id = _task_id + else: + new_task = openml.tasks.create_task(**task_meta_data) + # publishes the new task + try: + new_task = new_task.publish() + task_id = new_task.task_id + # mark to remove the uploaded task + TestBase._mark_entity_for_removal("task", task_id) + TestBase.logger.info("collected from test_run_functions: {}".format(task_id)) + except OpenMLServerException as e: + if e.code == 614: # Task already exists + # the exception message contains the task_id that was matched in the format + # 'Task already exists. 
- matched id(s): [xxxx]' + task_id = ast.literal_eval(e.message.split("matched id(s):")[-1].strip())[0] + else: + raise Exception(repr(e)) + self.task_id = task_id self.task_type = TaskType.SUPERVISED_REGRESSION self.estimation_procedure = 7 diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 5f9b65495..418b21b65 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -66,7 +66,7 @@ def _check_task(self, task): self.assertIn(task["status"], ["in_preparation", "active", "deactivated"]) def test_list_tasks_by_type(self): - num_curves_tasks = 200 # number is flexible, check server if fails + num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid) self.assertGreaterEqual(len(tasks), num_curves_tasks) @@ -110,7 +110,7 @@ def test_list_tasks_paginate(self): self._check_task(tasks[tid]) def test_list_tasks_per_type_paginate(self): - size = 10 + size = 40 max = 100 task_types = [ TaskType.SUPERVISED_CLASSIFICATION, @@ -139,7 +139,7 @@ def test__get_task_live(self): openml.tasks.get_task(34536) def test_get_task(self): - task = openml.tasks.get_task(1) + task = openml.tasks.get_task(1) # anneal; crossvalidation self.assertIsInstance(task, OpenMLTask) self.assertTrue( os.path.exists( @@ -158,7 +158,7 @@ def test_get_task(self): ) def test_get_task_lazy(self): - task = openml.tasks.get_task(2, download_data=False) + task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation self.assertIsInstance(task, OpenMLTask) self.assertTrue( os.path.exists( @@ -198,7 +198,7 @@ def assert_and_raise(*args, **kwargs): get_dataset.side_effect = assert_and_raise try: - openml.tasks.get_task(1) + openml.tasks.get_task(1) # anneal; crossvalidation except WeirdException: pass # Now the file should no longer exist @@ -219,7 +219,7 @@ def test_get_task_different_types(self): openml.tasks.functions.get_task(126033) def test_download_split(self): - task = openml.tasks.get_task(1) + task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() self.assertEqual(type(split), OpenMLSplit) self.assertTrue( diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 137e29fe4..9878feb96 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -15,7 +15,7 @@ def tearDown(self): super(OpenMLTaskMethodsTest, self).tearDown() def test_tagging(self): - task = openml.tasks.get_task(1) + task = openml.tasks.get_task(1) # anneal; crossvalidation tag = "testing_tag_{}_{}".format(self.id(), time()) task_list = openml.tasks.list_tasks(tag=tag) self.assertEqual(len(task_list), 0) @@ -40,9 +40,9 @@ def test_get_train_and_test_split_indices(self): self.assertEqual(681, train_indices[-1]) self.assertEqual(583, test_indices[0]) self.assertEqual(24, test_indices[-1]) - self.assertRaisesRegexp( + self.assertRaisesRegex( ValueError, "Fold 10 not known", task.get_train_test_split_indices, 10, 0 ) - self.assertRaisesRegexp( + self.assertRaisesRegex( ValueError, "Repeat 10 not known", task.get_train_test_split_indices, 0, 10 ) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 9729100bb..4fa08e1ab 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -1,17 +1,15 @@ -from openml.testing import TestBase +import os +import tempfile +import unittest.mock + import numpy as np -import openml 
-import sys -if sys.version_info[0] >= 3: - from unittest import mock -else: - import mock +import openml +from openml.testing import TestBase class OpenMLTaskTest(TestBase): _multiprocess_can_split_ = True - _batch_size = 25 def mocked_perform_api_call(call, request_method): # TODO: JvR: Why is this not a staticmethod? @@ -21,7 +19,7 @@ def mocked_perform_api_call(call, request_method): def test_list_all(self): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) - @mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call) + @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call) def test_list_all_few_results_available(self, _perform_api_call): # we want to make sure that the number of api calls is only 1. # Although we have multiple versions of the iris dataset, there is only @@ -33,7 +31,7 @@ def test_list_all_few_results_available(self, _perform_api_call): def test_list_all_for_datasets(self): required_size = 127 # default test server reset value - datasets = openml.datasets.list_datasets(batch_size=self._batch_size, size=required_size) + datasets = openml.datasets.list_datasets(batch_size=100, size=required_size) self.assertEqual(len(datasets), required_size) for did in datasets: @@ -53,13 +51,13 @@ def test_list_datasets_with_high_size_parameter(self): def test_list_all_for_tasks(self): required_size = 1068 # default test server reset value - tasks = openml.tasks.list_tasks(batch_size=self._batch_size, size=required_size) + tasks = openml.tasks.list_tasks(batch_size=1000, size=required_size) self.assertEqual(len(tasks), required_size) def test_list_all_for_flows(self): required_size = 15 # default test server reset value - flows = openml.flows.list_flows(batch_size=self._batch_size, size=required_size) + flows = openml.flows.list_flows(batch_size=25, size=required_size) self.assertEqual(len(flows), required_size) @@ -73,7 +71,7 @@ def test_list_all_for_setups(self): def test_list_all_for_runs(self): required_size = 21 - runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size) + runs = openml.runs.list_runs(batch_size=25, size=required_size) # might not be on test server after reset, please rerun test at least once if fails self.assertEqual(len(runs), required_size) @@ -87,3 +85,19 @@ def test_list_all_for_evaluations(self): # might not be on test server after reset, please rerun test at least once if fails self.assertEqual(len(evaluations), required_size) + + @unittest.mock.patch("openml.config.get_cache_directory") + @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033") + def test__create_cache_directory(self, config_mock): + with tempfile.TemporaryDirectory(dir=self.workdir) as td: + config_mock.return_value = td + openml.utils._create_cache_directory("abc") + self.assertTrue(os.path.exists(os.path.join(td, "abc"))) + subdir = os.path.join(td, "def") + os.mkdir(subdir) + os.chmod(subdir, 0o444) + config_mock.return_value = subdir + with self.assertRaisesRegex( + openml.exceptions.OpenMLCacheException, r"Cannot create cache directory", + ): + openml.utils._create_cache_directory("ghi")