diff --git a/README.md b/README.md index b5e4d6c0c..63e33155b 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) -A python interface for [OpenML](http://openml.org). You can find the documentation on the [openml-python website](https://openml.github.io/openml-python). - -Please commit to the right branches following the gitflow pattern: -http://nvie.com/posts/a-successful-git-branching-model/ +A python interface for [OpenML](http://openml.org), an online platform for open science collaboration in machine learning. +It can be used to download or upload OpenML data such as datasets and machine learning experiment results. +You can find the documentation on the [openml-python website](https://openml.github.io/openml-python). +If you wish to contribute to the package, please see our [contribution guidelines](https://github.com/openml/openml-python/blob/develop/CONTRIBUTING.md). Master branch: [![Build Status](https://travis-ci.org/openml/openml-python.svg?branch=master)](https://travis-ci.org/openml/openml-python) -[![Code Health](https://landscape.io/github/openml/openml-python/master/landscape.svg)](https://landscape.io/github/openml/openml-python/master) +[![Build status](https://ci.appveyor.com/api/projects/status/blna1eip00kdyr25?svg=true)](https://ci.appveyor.com/project/OpenML/openml-python) [![Coverage Status](https://coveralls.io/repos/github/openml/openml-python/badge.svg?branch=master)](https://coveralls.io/github/openml/openml-python?branch=master) Development branch: [![Build Status](https://travis-ci.org/openml/openml-python.svg?branch=develop)](https://travis-ci.org/openml/openml-python) -[![Code Health](https://landscape.io/github/openml/openml-python/master/landscape.svg)](https://landscape.io/github/openml/openml-python/master) +[![Build status](https://ci.appveyor.com/api/projects/status/blna1eip00kdyr25/branch/develop?svg=true)](https://ci.appveyor.com/project/OpenML/openml-python/branch/develop) [![Coverage Status](https://coveralls.io/repos/github/openml/openml-python/badge.svg?branch=develop)](https://coveralls.io/github/openml/openml-python?branch=develop) diff --git a/appveyor.yml b/appveyor.yml index 8a8da9963..7f0800920 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -43,4 +43,4 @@ build: false test_script: - "cd C:\\projects\\openml-python" - - "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py'" + - "%CMD_IN_ENV% pytest -n 4 --timeout=600 --timeout-method=thread -sv" diff --git a/ci_scripts/install.sh b/ci_scripts/install.sh index ee8ec3b14..a223cf84b 100644 --- a/ci_scripts/install.sh +++ b/ci_scripts/install.sh @@ -36,12 +36,13 @@ pip install -e '.[test]' python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" -if [[ "$EXAMPLES" == "true" ]]; then - pip install -e '.[examples]' -fi if [[ "$DOCTEST" == "true" ]]; then pip install sphinx_bootstrap_theme fi +if [[ "$DOCPUSH" == "true" ]]; then + conda install --yes gxx_linux-64 gcc_linux-64 swig + pip install -e '.[examples,examples_unix]' +fi if [[ "$COVERAGE" == "true" ]]; then pip install codecov pytest-cov fi @@ -52,3 +53,5 @@ fi # Install scikit-learn last to make sure the openml package installation works # from a clean environment without scikit-learn. 
pip install scikit-learn==$SKLEARN_VERSION + +conda list diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh index 1c82591e0..f46b0eecb 100644 --- a/ci_scripts/test.sh +++ b/ci_scripts/test.sh @@ -28,7 +28,7 @@ run_tests() { PYTEST_ARGS='' fi - pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir + pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv $PYTEST_ARGS $test_dir } if [[ "$RUN_FLAKE8" == "true" ]]; then diff --git a/doc/api.rst b/doc/api.rst index 7979c7bfc..0f1329d45 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -85,6 +85,7 @@ Modules list_evaluations list_evaluation_measures + list_evaluations_setups :mod:`openml.flows`: Flow Functions ----------------------------------- diff --git a/doc/contributing.rst b/doc/contributing.rst index e614c8a25..067f2dcad 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -21,12 +21,12 @@ you can use github's assign feature, otherwise you can just leave a comment. Scope of the package ==================== -The scope of the OpenML python package is to provide a python interface to -the OpenML platform which integrates well with pythons scientific stack, most +The scope of the OpenML Python package is to provide a Python interface to +the OpenML platform which integrates well with Python's scientific stack, most notably `numpy `_ and `scipy `_. To reduce opportunity costs and demonstrate the usage of the package, it also implements an interface to the most popular machine learning package written -in python, `scikit-learn `_. +in Python, `scikit-learn `_. Thereby it will automatically be compatible with many machine learning libraries written in Python. @@ -34,7 +34,7 @@ We aim to keep the package as light-weight as possible and we will try to keep the number of potential installation dependencies as low as possible. Therefore, the connection to other machine learning libraries such as *pytorch*, *keras* or *tensorflow* should not be done directly inside this -package, but in a separate package using the OpenML python connector. +package, but in a separate package using the OpenML Python connector. .. _issues: @@ -52,7 +52,7 @@ contains longer-term goals. How to contribute ================= -There are many ways to contribute to the development of the OpenML python +There are many ways to contribute to the development of the OpenML Python connector and OpenML in general. We welcome all kinds of contributions, especially: @@ -158,5 +158,67 @@ Happy testing! Connecting new machine learning libraries ========================================= -Coming soon - please stay tuned! +Content of the Library +~~~~~~~~~~~~~~~~~~~~~~ +To leverage support from the community and to tap into the potential of OpenML, interfacing +with popular machine learning libraries is essential. However, the OpenML-Python team does +not have the capacity to develop and maintain such interfaces on its own. For this, we +have built an extension interface that allows others to contribute back. Building a suitable +extension therefore requires an understanding of the current OpenML-Python support. + +`This example `_ +shows how scikit-learn currently works with OpenML-Python as an extension. The *sklearn* +extension packaged with the `openml-python `_ +repository can be used as a template/benchmark to build the new extension. + + +API ++++ +* The extension scripts must import the `openml` package and be able to interface with + any function from the OpenML-Python `API `_. +* The extension has to be defined as a Python class and must inherit from + :class:`openml.extensions.Extension`. +* This class needs to have all the functions of :class:`openml.extensions.Extension` overloaded as required. +* The redefined functions should have adequate and appropriate docstrings. The + Sklearn Extension API (:class:`openml.extensions.sklearn.SklearnExtension`) + is a good benchmark to follow. + + +Interfacing with OpenML-Python +++++++++++++++++++++++++++++++ +Once the new extension class has been defined, :func:`openml.extensions.register_extension` must be called to allow OpenML-Python to +interface with the new extension.
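The guide above describes these hooks only in prose, so here is a minimal, hypothetical sketch of the pattern. Only :class:`openml.extensions.Extension` and :func:`openml.extensions.register_extension` are real API; ``MyLibExtension`` and ``mylib`` are placeholder names, the two overridden checks are illustrative, and a complete extension must overload many more methods, as the scikit-learn extension does.

```python
import openml
from openml.extensions import Extension, register_extension


class MyLibExtension(Extension):
    """Hypothetical connector between the made-up `mylib` library and OpenML-Python."""

    @classmethod
    def can_handle_flow(cls, flow):
        # Illustrative check: claim flows whose external version string was written by `mylib`.
        return flow.external_version.startswith('mylib==')

    @classmethod
    def can_handle_model(cls, model):
        # Illustrative check: claim model objects defined inside the `mylib` package.
        return type(model).__module__.startswith('mylib')

    # ... every further method required by `openml.extensions.Extension`
    # (for example flow_to_model, model_to_flow, obtain_parameter_values)
    # has to be overloaded here, with docstrings, as described in the bullet
    # points above.


# Make OpenML-Python aware of the new extension class.
register_extension(MyLibExtension)
```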
+ + +Hosting the library +~~~~~~~~~~~~~~~~~~~ + +Each extension created should be a stand-alone repository, compatible with the +`OpenML-Python repository `_. +The extension repository should work off-the-shelf with *OpenML-Python* installed. + +Create a `public Github repo `_ with +the following directory structure: + +:: + +| [repo name] +| |-- [extension name] +| | |-- __init__.py +| | |-- extension.py +| | |-- config.py (optionally) + + + +Recommended +~~~~~~~~~~~ +* Test cases to keep the extension up to date with the `openml-python` upstream changes. +* Documentation of the extension API, especially if any new functionality is added to OpenML-Python's + extension design. +* Examples to show how the new extension interfaces and works with OpenML-Python. +* Create a PR to add the new extension to the OpenML-Python API documentation. + + +Happy contributing! diff --git a/doc/index.rst b/doc/index.rst index 96e534705..8d7cf2243 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -38,7 +38,7 @@ Example # Publish the experiment on OpenML (optional, requires an API key. # You can get your own API key by signing up to OpenML.org) run.publish() - print('View the run online: %s/run/%d' % (openml.config.server, run.run_id)) + print(f'View the run online: {openml.config.server}/run/{run.run_id}') You can find more examples in our `examples gallery `_. diff --git a/doc/progress.rst b/doc/progress.rst index 33db154ef..97fc165a1 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,8 +6,57 @@ Changelog ========= +0.10.1 +~~~~~~ +* ADD #175: Automatically adds the docstring of scikit-learn objects to the flow and its parameters. +* ADD #737: New evaluation listing call that includes the hyperparameter settings. +* ADD #744: It is now possible to only issue a warning and not raise an exception if the package + versions for a flow are not met when deserializing it. +* ADD #783: The URL to download the predictions for a run is now stored in the run object. +* ADD #790: Adds the uploader name and id as new filtering options for ``list_evaluations``. +* ADD #792: New convenience function ``openml.flow.get_flow_id``. +* ADD #861: Debug-level log information is now written to a file in the cache directory (at most 2 MB). +* DOC #778: Introduces instructions on how to publish an extension to support other libraries + than scikit-learn. +* DOC #785: The examples section is completely restructured into simple examples, advanced + examples and examples showcasing the use of OpenML-Python to reproduce papers which were done + with OpenML-Python. +* DOC #788: New example on manually iterating through the split of a task. +* DOC #789: Improve the usage of dataframes in the examples. +* DOC #791: New example for the paper *Efficient and Robust Automated Machine Learning* by Feurer + et al. (2015).
+* DOC #803: New example for the paper *Don’t Rule Out Simple Models Prematurely: + A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML* by Benjamin + Strang et al. (2018). +* DOC #808: New example demonstrating basic use cases of a dataset. +* DOC #810: New example demonstrating the use of benchmarking studies and suites. +* DOC #832: New example for the paper *Scalable Hyperparameter Transfer Learning* by + Valerio Perrone et al. (2019). +* DOC #834: New example showing how to plot the loss surface for a support vector machine. +* FIX #305: Do not require the external version in the flow XML when loading an object. +* FIX #734: Better handling of *"old"* flows. +* FIX #736: Attach a StreamHandler to the openml logger instead of the root logger. +* FIX #758: Fixes an error which made the client API crash when loading sparse data with + categorical variables. +* FIX #779: Do not fail on a corrupt pickle. +* FIX #782: Assign the study id to the correct class attribute. +* FIX #819: Automatically convert column names to type string when uploading a dataset. +* FIX #820: Make ``__repr__`` work for datasets which do not have an id. +* MAINT #796: Rename an argument to make the function ``list_evaluations`` more consistent. +* MAINT #811: Print the full error message given by the server. +* MAINT #828: Create base class for OpenML entity classes. +* MAINT #829: Reduce the number of data conversion warnings. +* MAINT #831: Warn if there's an empty flow description when publishing a flow. +* MAINT #837: Also print the flow XML if a flow fails to validate. +* FIX #838: Fix ``list_evaluations_setups`` to work when the number of evaluations is not a multiple of 100. +* FIX #847: Fixes an issue where the client API would crash when trying to download a dataset + when there are no qualities available on the server. +* MAINT #849: Move logic of most different ``publish`` functions into the base class. +* MAINT #850: Remove outdated test code. + 0.10.0 ~~~~~~ + * ADD #737: Add list_evaluations_setups to return hyperparameters along with list of evaluations. * FIX #261: Test server is cleared of all files uploaded during unit testing. * FIX #447: All files created by unit tests no longer persist in local. @@ -25,6 +74,7 @@ Changelog * ADD #412: The scikit-learn extension populates the short name field for flows. * MAINT #726: Update examples to remove deprecation warnings from scikit-learn * MAINT #752: Update OpenML-Python to be compatible with sklearn 0.21 +* ADD #790: Add user ID and name to list_evaluations 0.9.0 diff --git a/doc/usage.rst b/doc/usage.rst index b607c1433..36c8584ff 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -21,11 +21,11 @@ Installation & Set up ~~~~~~~~~~~~~~~~~~~~~~ The OpenML Python package is a connector to `OpenML `_. -It allows to use and share datasets and tasks, run +It allows you to use and share datasets and tasks, run machine learning algorithms on them and then share the results online. The following tutorial gives a short introduction on how to install and set up -the OpenML python connector, followed up by a simple example. +the OpenML Python connector, followed up by a simple example. * `Introduction `_ @@ -52,7 +52,7 @@ Working with tasks ~~~~~~~~~~~~~~~~~~ You can think of a task as an experimentation protocol, describing how to apply -a machine learning model to a dataset in a way that it is comparable with the +a machine learning model to a dataset in a way that is comparable with the results of others (more on how to do that further down).
Tasks are containers, defining which dataset to use, what kind of task we're solving (regression, classification, clustering, etc...) and which column to predict. Furthermore, @@ -86,7 +86,7 @@ predictions of that run. When a run is uploaded to the server, the server automatically calculates several metrics which can be used to compare the performance of different flows to each other. -So far, the OpenML python connector works only with estimator objects following +So far, the OpenML Python connector works only with estimator objects following the `scikit-learn estimator API `_. Those can be directly run on a task, and a flow will automatically be created or downloaded from the server if it already exists. @@ -114,7 +114,7 @@ requirements and how to download a dataset: OpenML is about sharing machine learning results and the datasets they were obtained on. Learn how to share your datasets in the following tutorial: -* `Upload a dataset `_ +* `Upload a dataset `_ ~~~~~~~~~~~~~~~~~~~~~~~ Extending OpenML-Python diff --git a/examples/20_basic/README.txt b/examples/20_basic/README.txt new file mode 100644 index 000000000..29c787116 --- /dev/null +++ b/examples/20_basic/README.txt @@ -0,0 +1,4 @@ +Introductory Examples +===================== + +Introductory examples to the usage of the OpenML python connector. diff --git a/examples/introduction_tutorial.py b/examples/20_basic/introduction_tutorial.py similarity index 94% rename from examples/introduction_tutorial.py rename to examples/20_basic/introduction_tutorial.py index 9cd88ceba..42537724c 100644 --- a/examples/introduction_tutorial.py +++ b/examples/20_basic/introduction_tutorial.py @@ -1,8 +1,8 @@ """ -Introduction -============ +Setup +===== -An introduction to OpenML, followed up by a simple example. +An example how to set up OpenML-Python followed up by a simple example. """ ############################################################################ # OpenML is an online collaboration platform for machine learning which allows @@ -61,7 +61,7 @@ openml.config.start_using_configuration_for_example() ############################################################################ -# When using the main server, instead make sure your apikey is configured. +# When using the main server instead, make sure your apikey is configured. # This can be done with the following line of code (uncomment it!). # Never share your apikey with others. @@ -96,7 +96,7 @@ # For this tutorial, our configuration publishes to the test server # as to not crowd the main server with runs created by examples. myrun = run.publish() -print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id)) +print(f"kNN on {data.name}: http://test.openml.org/r/{myrun.run_id}") ############################################################################ openml.config.stop_using_configuration_for_example() diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py new file mode 100644 index 000000000..dfefbe1e3 --- /dev/null +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -0,0 +1,68 @@ +""" +======== +Datasets +======== + +A basic tutorial on how to list, load and visualize datasets. +""" +############################################################################ +# In general, we recommend working with tasks, so that the results can +# be easily reproduced. Furthermore, the results can be compared to existing results +# at OpenML. 
However, for the purposes of this tutorial, we are going to work with +# the datasets directly. + +import openml +############################################################################ +# List datasets +# ============= + +datasets_df = openml.datasets.list_datasets(output_format='dataframe') +print(datasets_df.head(n=10)) + +############################################################################ +# Download a dataset +# ================== + +# Iris dataset https://www.openml.org/d/61 +dataset = openml.datasets.get_dataset(61) + +# Print a summary +print(f"This is dataset '{dataset.name}', the target feature is " + f"'{dataset.default_target_attribute}'") +print(f"URL: {dataset.url}") +print(dataset.description[:500]) + +############################################################################ +# Load a dataset +# ============== + +# X - An array/dataframe where each row represents one example with +# the corresponding feature values. +# y - the classes for each example +# categorical_indicator - an array that indicates which feature is categorical +# attribute_names - the names of the features for the examples (X) and +# target feature (y) +X, y, categorical_indicator, attribute_names = dataset.get_data( + dataset_format='dataframe', + target=dataset.default_target_attribute +) +############################################################################ +# Visualize the dataset +# ===================== + +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +sns.set_style("darkgrid") + + +def hide_current_axis(*args, **kwds): + plt.gca().set_visible(False) + + +# We combine all the data so that we can map the different +# examples to different colors according to the classes. +combined_data = pd.concat([X, y], axis=1) +iris_plot = sns.pairplot(combined_data, hue="class") +iris_plot.map_upper(hide_current_axis) +plt.show() diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py new file mode 100644 index 000000000..e3f028418 --- /dev/null +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -0,0 +1,48 @@ +""" +Flows and Runs +============== + +A simple tutorial on how to train/run a model and how to upload the results. +""" + +import openml +from sklearn import ensemble, neighbors + +############################################################################ +# Train a machine learning model +# ============================== +# +# .. warning:: This example uploads data. For that reason, this example +# connects to the test server at test.openml.org. This prevents the main +# server from crowding with example datasets, tasks, runs, and so on. 
+ +openml.config.start_using_configuration_for_example() + +# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20 +dataset = openml.datasets.get_dataset(20) +X, y, categorical_indicator, attribute_names = dataset.get_data( + dataset_format='array', + target=dataset.default_target_attribute +) +clf = neighbors.KNeighborsClassifier(n_neighbors=3) +clf.fit(X, y) + +############################################################################ +# Running a model on a task +# ========================= + +task = openml.tasks.get_task(119) +clf = ensemble.RandomForestClassifier() +run = openml.runs.run_model_on_task(clf, task) +print(run) + +############################################################################ +# Publishing the run +# ================== + +myrun = run.publish() +print("Run was uploaded to http://test.openml.org/r/" + str(myrun.run_id)) +print("The flow can be found at http://test.openml.org/f/" + str(myrun.flow_id)) + +############################################################################ +openml.config.stop_using_configuration_for_example() diff --git a/examples/20_basic/simple_suites_tutorial.py b/examples/20_basic/simple_suites_tutorial.py new file mode 100644 index 000000000..3a555b9d3 --- /dev/null +++ b/examples/20_basic/simple_suites_tutorial.py @@ -0,0 +1,66 @@ +""" +================ +Benchmark suites +================ + +This is a brief showcase of OpenML benchmark suites, which were introduced by +`Bischl et al. (2019) `_. Benchmark suites standardize the +datasets and splits to be used in an experiment or paper. They are fully integrated into OpenML +and simplify both the sharing of the setup and the results. +""" + +import openml + +#################################################################################################### +# OpenML-CC18 +# =========== +# +# As an example we have a look at the OpenML-CC18, which is a suite of 72 classification datasets +# from OpenML which were carefully selected to be usable by many algorithms and also represent +# datasets commonly used in machine learning research. These are all datasets from mid-2018 that +# satisfy a large set of clear requirements for thorough yet practical benchmarking: +# +# 1. the number of observations are between 500 and 100,000 to focus on medium-sized datasets, +# 2. the number of features does not exceed 5,000 features to keep the runtime of the algorithms +# low +# 3. the target attribute has at least two classes with no class having less than 20 observations +# 4. the ratio of the minority class and the majority class is above 0.05 (to eliminate highly +# imbalanced datasets which require special treatment for both algorithms and evaluation +# measures). +# +# A full description can be found in the `OpenML benchmarking docs +# `_. +# +# In this example we'll focus on how to use benchmark suites in practice. + +#################################################################################################### +# Downloading benchmark suites +# ============================ + +suite = openml.study.get_suite(99) +print(suite) + +#################################################################################################### +# The benchmark suite does not download the included tasks and datasets itself, but only contains +# a list of which tasks constitute the study. 
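As a small, hypothetical sketch of what that means in practice (it reuses the ``suite`` object fetched above and only standard calls such as ``openml.tasks.get_task``): each entry of the suite is just a task ID, and the task and its dataset are only downloaded once you explicitly ask for them.

```python
# Resolve the first task ID of the suite on demand; nothing beyond the ID list
# was downloaded when the suite itself was fetched.
first_task = openml.tasks.get_task(suite.tasks[0])
first_dataset = first_task.get_dataset()
print(first_task.task_id, first_dataset.name)
```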
+# +# Tasks can then be accessed via + +tasks = suite.tasks +print(tasks) + +#################################################################################################### +# and iterated over for benchmarking. For speed reasons we only iterate over the first three tasks: + +for task_id in tasks[:3]: + task = openml.tasks.get_task(task_id) + print(task) + +#################################################################################################### +# Further examples +# ================ +# +# * `Advanced benchmarking suites tutorial <../30_extended/suites_tutorial.html>`_ +# * `Benchmarking studies tutorial <../30_extended/study_tutorial.html>`_ +# * `Using studies to compare linear and non-linear classifiers +# <../40_paper/2018_ida_strang_example.html>`_ diff --git a/examples/30_extended/README.txt b/examples/30_extended/README.txt new file mode 100644 index 000000000..432fa68f0 --- /dev/null +++ b/examples/30_extended/README.txt @@ -0,0 +1,4 @@ +In-Depth Examples +================= + +Extended examples for the usage of the OpenML python connector. \ No newline at end of file diff --git a/examples/30_extended/configure_logging.py b/examples/30_extended/configure_logging.py new file mode 100644 index 000000000..e16dfe245 --- /dev/null +++ b/examples/30_extended/configure_logging.py @@ -0,0 +1,52 @@ +""" +======== +Logging +======== + +Explains openml-python logging, and shows how to configure it. +""" +################################################################################## +# Logging +# ^^^^^^^ +# Openml-python uses the `Python logging module `_ +# to provide users with log messages. Each log message is assigned a level of importance, see +# the table in Python's logging tutorial +# `here `_. +# +# By default, openml-python will print log messages of level `WARNING` and above to console. +# All log messages (including `DEBUG` and `INFO`) are also saved in a file, which can be +# found in your cache directory (see also the +# `introduction tutorial <../20_basic/introduction_tutorial.html>`_). +# These file logs are automatically deleted if needed, and use at most 2MB of space. +# +# It is possible to configure what log levels to send to console and file. +# When downloading a dataset from OpenML, a `DEBUG`-level message is written: + +import openml +openml.datasets.get_dataset('iris') + +# With default configuration, the above example will show no output to console. +# However, in your cache directory you should find a file named 'openml_python.log', +# which has a DEBUG message written to it. It should be either like +# "[DEBUG] [10:46:19:openml.datasets.dataset] Saved dataset 61: iris to file ..." +# or like +# "[DEBUG] [10:49:38:openml.datasets.dataset] Data pickle file already exists and is up to date." +# , depending on whether or not you had downloaded iris before. +# The processed log levels can be configured programmatically: + +import logging +openml.config.console_log.setLevel(logging.DEBUG) +openml.config.file_log.setLevel(logging.WARNING) +openml.datasets.get_dataset('iris') + +# Now the log level that was previously written to file should also be shown in the console. +# The message is now no longer written to file as the `file_log` was set to level `WARNING`. +# +# It is also possible to specify the desired log levels through the configuration file. +# This way you will not need to set them on each script separately. 
+# Add the line **verbosity = NUMBER** and/or **file_verbosity = NUMBER** to the config file, +# where 'NUMBER' should be one of: +# +# * 0: `logging.WARNING` and up. +# * 1: `logging.INFO` and up. +# * 2: `logging.DEBUG` and up (i.e. all messages). diff --git a/examples/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py similarity index 92% rename from examples/create_upload_tutorial.py rename to examples/30_extended/create_upload_tutorial.py index cb5506cfd..faca335ea 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/30_extended/create_upload_tutorial.py @@ -119,8 +119,8 @@ ############################################################################ -upload_did = diabetes_dataset.publish() -print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) +diabetes_dataset.publish() +print(f"URL for dataset: {diabetes_dataset.openml_url}") ############################################################################ # Dataset is a list @@ -192,17 +192,17 @@ ############################################################################ -upload_did = weather_dataset.publish() -print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) +weather_dataset.publish() +print(f"URL for dataset: {weather_dataset.openml_url}") ############################################################################ # Dataset is a pandas DataFrame # ============================= -# It might happen that your dataset is made of heterogeneous data which can be -# usually stored as a Pandas DataFrame. DataFrame offers the adavantages to -# store the type of data for each column as well as the attribute names. -# Therefore, when providing a Pandas DataFrame, OpenML can infer those -# information without the need to specifically provide them when calling the +# It might happen that your dataset is made of heterogeneous data which can usually +# be stored as a Pandas DataFrame. DataFrames offer the advantage of +# storing the type of data for each column as well as the attribute names. +# Therefore, when providing a Pandas DataFrame, OpenML can infer this +# information without needing to explicitly provide it when calling the # function :func:`create_dataset`. In this regard, you only need to pass # ``'auto'`` to the ``attributes`` parameter. 
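The hunks below only show the changed ``publish`` lines, so here is a condensed, hypothetical sketch of the pattern just described: creating a dataset from a pandas DataFrame and letting OpenML infer the attribute types by passing ``'auto'``. The DataFrame and all metadata values are placeholders, and the keyword arguments are abridged from the :func:`create_dataset` calls made elsewhere in this tutorial; treat this as a sketch, not the full call.

```python
import pandas as pd
import openml

# A tiny stand-in DataFrame; categorical dtypes mark nominal attributes.
df = pd.DataFrame({
    'outlook': pd.Categorical(['sunny', 'overcast', 'rainy']),
    'temperature': [85.0, 83.0, 70.0],
    'play': pd.Categorical(['no', 'yes', 'yes']),
})

toy_dataset = openml.datasets.create_dataset(
    name='toy_weather',
    description='Toy weather data illustrating attribute inference from a DataFrame.',
    creator=None,
    contributor=None,
    collection_date='2019',
    language='English',
    licence='Public',
    # 'auto' lets OpenML derive attribute names and types from the DataFrame
    # columns and dtypes instead of an explicit list of (name, type) pairs.
    attributes='auto',
    data=df,
    default_target_attribute='play',
    ignore_attribute=None,
    citation=None,
)
```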
@@ -238,8 +238,8 @@ ############################################################################ -upload_did = weather_dataset.publish() -print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) +weather_dataset.publish() +print(f"URL for dataset: {weather_dataset.openml_url}") ############################################################################ # Dataset is a sparse matrix @@ -275,8 +275,8 @@ ############################################################################ -upload_did = xor_dataset.publish() -print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) +xor_dataset.publish() +print(f"URL for dataset: {xor_dataset.openml_url}") ############################################################################ @@ -310,8 +310,8 @@ ############################################################################ -upload_did = xor_dataset.publish() -print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) +xor_dataset.publish() +print(f"URL for dataset: {xor_dataset.openml_url}") ############################################################################ diff --git a/examples/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py similarity index 93% rename from examples/datasets_tutorial.py rename to examples/30_extended/datasets_tutorial.py index 70da03d15..357360f80 100644 --- a/examples/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -14,8 +14,9 @@ # ********** # # * List datasets +# # * Use the output_format parameter to select output type -# * Default gives 'dict' (other option: 'dataframe') +# * Default gives 'dict' (other option: 'dataframe', see below) openml_list = openml.datasets.list_datasets() # returns a dict @@ -26,7 +27,7 @@ 'NumberOfFeatures', 'NumberOfClasses' ]] -print("First 10 of %s datasets..." % len(datalist)) +print(f"First 10 of {len(datalist)} datasets...") datalist.head(n=10) # The same can be done with lesser lines of code @@ -55,9 +56,9 @@ dataset = openml.datasets.get_dataset(1471) # Print a summary -print("This is dataset '%s', the target feature is '%s'" % - (dataset.name, dataset.default_target_attribute)) -print("URL: %s" % dataset.url) +print(f"This is dataset '{dataset.name}', the target feature is " + f"'{dataset.default_target_attribute}'") +print(f"URL: {dataset.url}") print(dataset.description[:500]) ############################################################################ diff --git a/examples/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py similarity index 83% rename from examples/fetch_evaluations_tutorial.py rename to examples/30_extended/fetch_evaluations_tutorial.py index 10511c540..b6e15e221 100644 --- a/examples/fetch_evaluations_tutorial.py +++ b/examples/30_extended/fetch_evaluations_tutorial.py @@ -3,7 +3,7 @@ Fetching Evaluations ==================== -Evalutions contain a concise summary of the results of all runs made. Each evaluation +Evaluations contain a concise summary of the results of all runs made. Each evaluation provides information on the dataset used, the flow applied, the setup used, the metric evaluated, and the result obtained on the metric, for each such run made. 
These collection of results can be used for efficient benchmarking of an algorithm and also allow transparent @@ -16,6 +16,7 @@ * Sort the obtained results in descending order of the metric * Plot a cumulative distribution function for the evaluations * Compare the top 10 performing flows based on the evaluation performance +* Retrieve evaluations with hyperparameter settings """ ############################################################################ @@ -147,3 +148,30 @@ def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'): flow_names = evals.flow_name.unique()[:top_n] for i in range(top_n): print((flow_ids[i], flow_names[i])) + +############################################################################# +# Obtaining evaluations with hyperparameter settings +# ================================================== +# We'll now obtain the evaluations of a task and a flow with the hyperparameters + +# List evaluations in descending order based on predictive_accuracy with +# hyperparameters +evals_setups = openml.evaluations.list_evaluations_setups(function='predictive_accuracy', task=[31], + size=100, sort_order='desc') + +"" +print(evals_setups.head()) + +"" +# Return evaluations for flow_id in descending order based on predictive_accuracy +# with hyperparameters. parameters_in_separate_columns returns parameters in +# separate columns +evals_setups = openml.evaluations.list_evaluations_setups(function='predictive_accuracy', + flow=[6767], + size=100, + parameters_in_separate_columns=True) + +"" +print(evals_setups.head(10)) + +"" diff --git a/examples/30_extended/flow_id_tutorial.py b/examples/30_extended/flow_id_tutorial.py new file mode 100644 index 000000000..5bb001493 --- /dev/null +++ b/examples/30_extended/flow_id_tutorial.py @@ -0,0 +1,68 @@ +""" +================== +Obtaining Flow IDs +================== + +This tutorial discusses different ways to obtain the ID of a flow in order to perform further +analysis. +""" + +#################################################################################################### +import sklearn.tree + +import openml + +clf = sklearn.tree.DecisionTreeClassifier() + +#################################################################################################### +# 1. Obtaining a flow given a classifier +# ====================================== +# + +flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() +flow_id = flow.flow_id +print(flow_id) + +#################################################################################################### +# This piece of code is rather involved. First, it retrieves a +# :class:`~openml.extensions.Extension` which is registered and can handle the given model, +# in our case it is :class:`openml.extensions.sklearn.SklearnExtension`. Second, the extension +# converts the classifier into an instance of :class:`openml.flow.OpenMLFlow`. Third and finally, +# the publish method checks whether the current flow is already present on OpenML. If not, +# it uploads the flow, otherwise, it updates the current instance with all information computed +# by the server (which is obviously also done when uploading/publishing a flow). +# +# To simplify the usage we have created a helper function which automates all these steps: + +flow_id = openml.flows.get_flow_id(model=clf) +print(flow_id) + +#################################################################################################### +# 2. 
Obtaining a flow given its name +# ================================== +# The schema of a flow is given in XSD (`here +# `_). # noqa E501 +# Only two fields are required, a unique name, and an external version. While it should be pretty +# obvious why we need a name, the need for the additional external version information might not +# be immediately clear. However, this information is very important as it allows having multiple +# flows with the same name for different versions of the same software. This might be necessary if an +# algorithm or implementation introduces, renames or drops hyperparameters over time. + +print(flow.name, flow.external_version) + +#################################################################################################### +# The name and external version are automatically added to a flow when constructing it from a +# model. We can then use them to retrieve the flow id as follows: + +flow_id = openml.flows.flow_exists(name=flow.name, external_version=flow.external_version) +print(flow_id) + +#################################################################################################### +# We can also retrieve all flows for a given name: +flow_ids = openml.flows.get_flow_id(name=flow.name) +print(flow_ids) + +#################################################################################################### +# This also works with the actual model (generalizing the first part of this example): +flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) +print(flow_ids) diff --git a/examples/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py similarity index 97% rename from examples/flows_and_runs_tutorial.py rename to examples/30_extended/flows_and_runs_tutorial.py index d65abdf28..d5740e5ab 100644 --- a/examples/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -37,7 +37,7 @@ dataset_format='array', target=dataset.default_target_attribute ) -print("Categorical features: {}".format(categorical_indicator)) +print(f"Categorical features: {categorical_indicator}") transformer = compose.ColumnTransformer( [('one_hot_encoder', preprocessing.OneHotEncoder(categories='auto'), categorical_indicator)]) X = transformer.fit_transform(X) @@ -132,7 +132,7 @@ # The run may be stored offline, and the flow will be stored along with it: run.to_filesystem(directory='myrun') -# They made later be loaded and uploaded +# They may be loaded and uploaded at a later time run = openml.runs.OpenMLRun.from_filesystem(directory='myrun') run.publish() @@ -182,7 +182,7 @@ run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False) myrun = run.publish() - print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id)) + print(f"kNN on {data.name}: http://test.openml.org/r/{myrun.run_id}") ############################################################################ diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py new file mode 100644 index 000000000..714e64221 --- /dev/null +++ b/examples/30_extended/plot_svm_hyperparameters_tutorial.py @@ -0,0 +1,78 @@ +""" +================================ +Plotting hyperparameter surfaces +================================ +""" +import openml +import numpy as np + +#################################################################################################### +# First step - obtaining the data +# =============================== +# First, we need to choose an SVM flow, for example 8353, and a task.
Finding their IDs is not part of this tutorial; this could, for example, be done via the website. +# +# For this we use the function ``list_evaluations_setups`` which can automatically join +# evaluations conducted by the server with the hyperparameter settings extracted from the +# uploaded runs (called *setup*). +df = openml.evaluations.list_evaluations_setups( + function='predictive_accuracy', + flow=[8353], + task=[6], + output_format='dataframe', + # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise, + # the dataframe would contain a field ``parameters`` containing an unparsed dictionary. + parameters_in_separate_columns=True, +) +print(df.head(n=10)) + +#################################################################################################### +# We can see all the hyperparameter names in the columns of the dataframe: +for name in df.columns: + print(name) + +#################################################################################################### +# Next, we cast and transform the hyperparameters of interest (``C`` and ``gamma``) so that we +# can nicely plot them. +hyperparameters = ['sklearn.svm.classes.SVC(16)_C', 'sklearn.svm.classes.SVC(16)_gamma'] +df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log) + +#################################################################################################### +# Option 1 - plotting via the pandas helper functions +# =================================================== +# +df.plot.hexbin( + x='sklearn.svm.classes.SVC(16)_C', + y='sklearn.svm.classes.SVC(16)_gamma', + C='value', + reduce_C_function=np.mean, + gridsize=25, + title='SVM performance landscape', +) + +#################################################################################################### +# Option 2 - plotting via matplotlib +# ================================== +# +import matplotlib.pyplot as plt + +fig, ax = plt.subplots() + +C = df['sklearn.svm.classes.SVC(16)_C'] +gamma = df['sklearn.svm.classes.SVC(16)_gamma'] +score = df['value'] + +# Plotting all evaluations: +ax.plot(C, gamma, 'ko', ms=1) +# Create a contour plot +cntr = ax.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r") +# Adjusting the colorbar +fig.colorbar(cntr, ax=ax, label="accuracy") +# Adjusting the axis limits +ax.set( + xlim=(min(C), max(C)), + ylim=(min(gamma), max(gamma)), + xlabel="C (log10)", + ylabel="gamma (log10)", +) +ax.set_title('SVM performance landscape') diff --git a/examples/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py similarity index 98% rename from examples/run_setup_tutorial.py rename to examples/30_extended/run_setup_tutorial.py index d64f27e62..8ce03f4b6 100644 --- a/examples/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -29,7 +29,6 @@ connects to the test server at test.openml.org. This prevents the main server from crowding with example datasets, tasks, runs, and so on.
""" -import logging import numpy as np import openml import sklearn.ensemble @@ -37,8 +36,6 @@ import sklearn.preprocessing -root = logging.getLogger() -root.setLevel(logging.INFO) openml.config.start_using_configuration_for_example() ############################################################################### diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py new file mode 100644 index 000000000..de2be33f8 --- /dev/null +++ b/examples/30_extended/study_tutorial.py @@ -0,0 +1,112 @@ +""" +================= +Benchmark studies +================= + +How to list, download and upload benchmark studies. + +In contrast to `benchmark suites `_ which +hold a list of tasks, studies hold a list of runs. As runs contain all information on flows and +tasks, all required information about a study can be retrieved. +""" +############################################################################ +import uuid + +import numpy as np +import sklearn.tree +import sklearn.pipeline +import sklearn.impute + +import openml + + +############################################################################ +# .. warning:: This example uploads data. For that reason, this example +# connects to the test server at test.openml.org before doing so. +# This prevents the crowding of the main server with example datasets, +# tasks, runs, and so on. +############################################################################ + + +############################################################################ +# Listing studies +# *************** +# +# * Use the output_format parameter to select output type +# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an +# easier-to-work-with data structure + +studies = openml.study.list_studies(output_format='dataframe', status='all') +print(studies.head(n=10)) + + +############################################################################ +# Downloading studies +# =================== + +############################################################################ +# This is done based on the study ID. +study = openml.study.get_study(123) +print(study) + +############################################################################ +# Studies also feature a description: +print(study.description) + +############################################################################ +# Studies are a container for runs: +print(study.runs) + +############################################################################ +# And we can use the evaluation listing functionality to learn more about +# the evaluations available for the conducted runs: +evaluations = openml.evaluations.list_evaluations( + function='predictive_accuracy', + output_format='dataframe', + study=study.study_id, +) +print(evaluations.head()) + +############################################################################ +# Uploading studies +# ================= +# +# Creating a study is as simple as creating any other kind of OpenML entity. +# In this example we'll create a few runs for the OpenML-100 benchmark +# suite which is available on the OpenML test server.
+ +openml.config.start_using_configuration_for_example() + +# Very simple classifier which ignores the feature type +clf = sklearn.pipeline.Pipeline(steps=[ + ('imputer', sklearn.impute.SimpleImputer()), + ('estimator', sklearn.tree.DecisionTreeClassifier(max_depth=5)), +]) + +suite = openml.study.get_suite(1) +# We'll create a study with one run on three random datasets each +tasks = np.random.choice(suite.tasks, size=3, replace=False) +run_ids = [] +for task_id in tasks: + task = openml.tasks.get_task(task_id) + run = openml.runs.run_model_on_task(clf, task) + run.publish() + run_ids.append(run.run_id) + +# The study needs a machine-readable and unique alias. To obtain this, +# we simply generate a random uuid. +alias = uuid.uuid4().hex + +new_study = openml.study.create_study( + name='Test-Study', + description='Test study for the Python tutorial on studies', + run_ids=run_ids, + alias=alias, + benchmark_suite=suite.study_id, +) +new_study.publish() +print(new_study) + + +############################################################################ +openml.config.stop_using_configuration_for_example() diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py new file mode 100644 index 000000000..c5eb5718f --- /dev/null +++ b/examples/30_extended/suites_tutorial.py @@ -0,0 +1,97 @@ +""" +================ +Benchmark suites +================ + +How to list, download and upload benchmark suites. + +If you want to learn more about benchmark suites, check out our +`brief introductory tutorial <../20_basic/simple_suites_tutorial.html>`_ or the +`OpenML benchmark docs `_. +""" +############################################################################ +import uuid + +import numpy as np + +import openml + + +############################################################################ +# .. warning:: This example uploads data. For that reason, this example +# connects to the test server at test.openml.org before doing so. +# This prevents the main server from crowding with example datasets, +# tasks, runs, and so on. +############################################################################ + + +############################################################################ +# Listing suites +# ************** +# +# * Use the output_format parameter to select output type +# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an +# easier-to-work-with data structure + +suites = openml.study.list_suites(output_format='dataframe', status='all') +print(suites.head(n=10)) + +############################################################################ +# Downloading suites +# ================== + +############################################################################ +# This is done based on the dataset ID. +suite = openml.study.get_suite(99) +print(suite) + +############################################################################ +# Suites also feature a description: +print(suite.description) + +############################################################################ +# Suites are a container for tasks: +print(suite.tasks) + +############################################################################ +# And we can use the task listing functionality to learn more about them: +tasks = openml.tasks.list_tasks(output_format='dataframe') + +# Using ``@`` in `pd.DataFrame.query < +# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_ +# accesses variables outside of the current dataframe. 
+tasks = tasks.query('tid in @suite.tasks') +print(tasks.describe().transpose()) + +############################################################################ +# Uploading suites +# ================ +# +# Uploading suites is as simple as uploading any kind of other OpenML +# entity - the only reason why we need so much code in this example is +# because we upload some random data. + +openml.config.start_using_configuration_for_example() + +# We'll take a random subset of at least ten tasks of all available tasks on +# the test server: +all_tasks = list(openml.tasks.list_tasks().keys()) +task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20)) + +# The study needs a machine-readable and unique alias. To obtain this, +# we simply generate a random uuid. + +alias = uuid.uuid4().hex + +new_suite = openml.study.create_benchmark_suite( + name='Test-Suite', + description='Test suite for the Python tutorial on benchmark suites', + task_ids=task_ids_for_suite, + alias=alias, +) +new_suite.publish() +print(new_suite) + + +############################################################################ +openml.config.stop_using_configuration_for_example() diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py new file mode 100644 index 000000000..e4f070501 --- /dev/null +++ b/examples/30_extended/task_manual_iteration_tutorial.py @@ -0,0 +1,181 @@ +""" +Tasks: retrieving splits +======================== + +Tasks define a target and a train/test split. Normally, they are the input to the function +``openml.runs.run_model_on_task`` which automatically runs the model on all splits of the task. +However, sometimes it is necessary to manually split a dataset to perform experiments outside of +the functions provided by OpenML. One such example is in the benchmark library +`HPOlib2 `_ which extensively uses data from OpenML, +but not OpenML's functionality to conduct runs. +""" + +import openml + +#################################################################################################### +# For this tutorial we will use the famous King+Rook versus King+Pawn on A7 dataset, which has +# the dataset ID 3 (`dataset on OpenML `_), and for which there exist +# tasks with all important estimation procedures. It is small enough (less than 5000 samples) to +# efficiently use it in an example. +# +# We will first start with (`task 233 `_), which is a task with a +# holdout estimation procedure. +task_id = 233 +task = openml.tasks.get_task(task_id) + +#################################################################################################### +# Now that we have a task object we can obtain the number of repetitions, folds and samples as +# defined by the task: + +n_repeats, n_folds, n_samples = task.get_split_dimensions() + +#################################################################################################### +# * ``n_repeats``: Number of times the model quality estimation is performed +# * ``n_folds``: Number of folds per repeat +# * ``n_samples``: How many data points to use. This is only relevant for learning curve tasks +# +# A list of all available estimation procedures is available +# `here `_. 
+# +# Task ``233`` is a simple task using the holdout estimation procedure and therefore has only a +# single repeat, a single fold and a single sample size: + +print( + 'Task {}: number of repeats: {}, number of folds: {}, number of samples {}.'.format( + task_id, n_repeats, n_folds, n_samples, + ) +) + +#################################################################################################### +# We can now retrieve the train/test split for this combination of repeats, folds and number of +# samples (indexing is zero-based). Usually, one would loop over all repeats, folds and sample +# sizes, but we can neglect this here as there is only a single repetition. + +train_indices, test_indices = task.get_train_test_split_indices( + repeat=0, + fold=0, + sample=0, +) + +print(train_indices.shape, train_indices.dtype) +print(test_indices.shape, test_indices.dtype) + +#################################################################################################### +# And then split the data based on this: + +X, y, _, _ = task.get_dataset().get_data(task.target_name) +X_train = X.loc[train_indices] +y_train = y[train_indices] +X_test = X.loc[test_indices] +y_test = y[test_indices] + +print( + 'X_train.shape: {}, y_train.shape: {}, X_test.shape: {}, y_test.shape: {}'.format( + X_train.shape, y_train.shape, X_test.shape, y_test.shape, + ) +) + +#################################################################################################### +# Obviously, we can also retrieve cross-validation versions of the dataset used in task ``233``: + +task_id = 3 +task = openml.tasks.get_task(task_id) +n_repeats, n_folds, n_samples = task.get_split_dimensions() +print( + 'Task {}: number of repeats: {}, number of folds: {}, number of samples {}.'.format( + task_id, n_repeats, n_folds, n_samples, + ) +) + +#################################################################################################### +# And then perform the aforementioned iteration over all splits: +for repeat_idx in range(n_repeats): + for fold_idx in range(n_folds): + for sample_idx in range(n_samples): + train_indices, test_indices = task.get_train_test_split_indices( + repeat=repeat_idx, + fold=fold_idx, + sample=sample_idx, + ) + X_train = X.loc[train_indices] + y_train = y[train_indices] + X_test = X.loc[test_indices] + y_test = y[test_indices] + + print( + 'Repeat #{}, fold #{}, samples {}: X_train.shape: {}, ' + 'y_train.shape {}, X_test.shape {}, y_test.shape {}'.format( + repeat_idx, fold_idx, sample_idx, X_train.shape, y_train.shape, X_test.shape, + y_test.shape, + ) + ) + +#################################################################################################### +# And also versions with multiple repeats: + +task_id = 1767 +task = openml.tasks.get_task(task_id) +n_repeats, n_folds, n_samples = task.get_split_dimensions() +print( + 'Task {}: number of repeats: {}, number of folds: {}, number of samples {}.'.format( + task_id, n_repeats, n_folds, n_samples, + ) +) + +#################################################################################################### +# And then again perform the aforementioned iteration over all splits: +for repeat_idx in range(n_repeats): + for fold_idx in range(n_folds): + for sample_idx in range(n_samples): + train_indices, test_indices = task.get_train_test_split_indices( + repeat=repeat_idx, + fold=fold_idx, + sample=sample_idx, + ) + X_train = X.loc[train_indices] + y_train = y[train_indices] + X_test = X.loc[test_indices] + y_test = y[test_indices] + + print( + 
'Repeat #{}, fold #{}, samples {}: X_train.shape: {}, ' + 'y_train.shape {}, X_test.shape {}, y_test.shape {}'.format( + repeat_idx, fold_idx, sample_idx, X_train.shape, y_train.shape, X_test.shape, + y_test.shape, + ) + ) + +#################################################################################################### +# And finally a task based on learning curves: + +task_id = 1702 +task = openml.tasks.get_task(task_id) +n_repeats, n_folds, n_samples = task.get_split_dimensions() +print( + 'Task {}: number of repeats: {}, number of folds: {}, number of samples {}.'.format( + task_id, n_repeats, n_folds, n_samples, + ) +) + +#################################################################################################### +# And then again perform the aforementioned iteration over all splits: +for repeat_idx in range(n_repeats): + for fold_idx in range(n_folds): + for sample_idx in range(n_samples): + train_indices, test_indices = task.get_train_test_split_indices( + repeat=repeat_idx, + fold=fold_idx, + sample=sample_idx, + ) + X_train = X.loc[train_indices] + y_train = y[train_indices] + X_test = X.loc[test_indices] + y_test = y[test_indices] + + print( + 'Repeat #{}, fold #{}, samples {}: X_train.shape: {}, ' + 'y_train.shape {}, X_test.shape {}, y_test.shape {}'.format( + repeat_idx, fold_idx, sample_idx, X_train.shape, y_train.shape, X_test.shape, + y_test.shape, + ) + ) diff --git a/examples/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py similarity index 75% rename from examples/tasks_tutorial.py rename to examples/30_extended/tasks_tutorial.py index c54ecdbd9..1fb23f63d 100644 --- a/examples/tasks_tutorial.py +++ b/examples/30_extended/tasks_tutorial.py @@ -13,15 +13,14 @@ # Tasks are identified by IDs and can be accessed in two different ways: # # 1. In a list providing basic information on all tasks available on OpenML. -# This function will not download the actual tasks, but will instead download -# meta data that can be used to filter the tasks and retrieve a set of IDs. -# We can filter this list, for example, we can only list tasks having a -# special tag or only tasks for a specific target such as -# *supervised classification*. -# +# This function will not download the actual tasks, but will instead download +# meta data that can be used to filter the tasks and retrieve a set of IDs. +# We can filter this list, for example, we can only list tasks having a +# special tag or only tasks for a specific target such as +# *supervised classification*. # 2. A single task by its ID. It contains all meta information, the target -# metric, the splits and an iterator which can be used to access the -# splits in a useful manner. +# metric, the splits and an iterator which can be used to access the +# splits in a useful manner. 
############################################################################ # Listing tasks @@ -32,16 +31,18 @@ tasks = openml.tasks.list_tasks(task_type_id=1) ############################################################################ -# **openml.tasks.list_tasks()** returns a dictionary of dictionaries, we convert it into a +# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, which we convert +# into a # `pandas dataframe `_ -# to have better visualization and easier access: +# to have better visualization capabilities and easier access: tasks = pd.DataFrame.from_dict(tasks, orient='index') print(tasks.columns) -print("First 5 of %s tasks:" % len(tasks)) +print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) -# The same can be obtained through lesser lines of code +# As conversion to a pandas dataframe is a common task, we have added this functionality to the +# OpenML-Python library which can be used by passing ``output_format='dataframe'``: tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe') print(tasks_df.head()) @@ -74,24 +75,21 @@ # # Similar to listing tasks by task type, we can list tasks by tags: -tasks = openml.tasks.list_tasks(tag='OpenML100') -tasks = pd.DataFrame.from_dict(tasks, orient='index') -print("First 5 of %s tasks:" % len(tasks)) +tasks = openml.tasks.list_tasks(tag='OpenML100', output_format='dataframe') +print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) ############################################################################ # Furthermore, we can list tasks based on the dataset id: -tasks = openml.tasks.list_tasks(data_id=1471) -tasks = pd.DataFrame.from_dict(tasks, orient='index') -print("First 5 of %s tasks:" % len(tasks)) +tasks = openml.tasks.list_tasks(data_id=1471, output_format='dataframe') +print(f"First 5 of {len(tasks)} tasks:") print(tasks.head()) ############################################################################ # In addition, a size limit and an offset can be applied both separately and simultaneously: -tasks = openml.tasks.list_tasks(size=10, offset=50) -tasks = pd.DataFrame.from_dict(tasks, orient='index') +tasks = openml.tasks.list_tasks(size=10, offset=50, output_format='dataframe') print(tasks) ############################################################################ @@ -107,8 +105,7 @@ # Finally, it is also possible to list all tasks on OpenML with: ############################################################################ -tasks = openml.tasks.list_tasks() -tasks = pd.DataFrame.from_dict(tasks, orient='index') +tasks = openml.tasks.list_tasks(output_format='dataframe') print(len(tasks)) ############################################################################ @@ -148,9 +145,9 @@ # # You can also create new tasks. Take the following into account: # -# * You can only create tasks on _active_ datasets +# * You can only create tasks on *active* datasets # * For now, only the following tasks are supported: classification, regression, -# clustering, and learning curve analysis. +# clustering, and learning curve analysis. # * For now, tasks can only be created on a single dataset. # * The exact same task must not already exist. # @@ -158,10 +155,9 @@ # # * task_type_id: The task type ID, required (see below). Required. # * dataset_id: The dataset ID. Required. -# * target_name: The name of the attribute you aim to predict. -# Optional. +# * target_name: The name of the attribute you aim to predict. Optional. 
# * estimation_procedure_id : The ID of the estimation procedure used to create train-test -# splits. Optional. +# splits. Optional. # * evaluation_measure: The name of the evaluation measure. Optional. # * Any additional inputs for specific tasks # @@ -178,8 +174,8 @@ # # Let's create a classification task on a dataset. In this example we will do this on the # Iris dataset (ID=128 (on test server)). We'll use 10-fold cross-validation (ID=1), -# and _predictive accuracy_ as the predefined measure (this can also be left open). -# If a task with these parameters exist, we will get an appropriate exception. +# and *predictive accuracy* as the predefined measure (this can also be left open). +# If a task with these parameters exists, we will get an appropriate exception. # If such a task doesn't exist, a task will be created and the corresponding task_id # will be returned. @@ -200,11 +196,11 @@ # Error code for 'task already exists' if e.code == 614: # Lookup task - tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe').to_numpy() - tasks = tasks[tasks[:, 4] == "Supervised Classification"] - tasks = tasks[tasks[:, 6] == "10-fold Crossvalidation"] - tasks = tasks[tasks[:, 19] == "predictive_accuracy"] - task_id = tasks[0][0] + tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe') + tasks = tasks.query('task_type == "Supervised Classification" ' + 'and estimation_procedure == "10-fold Crossvalidation" ' + 'and evaluation_measures == "predictive_accuracy"') + task_id = tasks.loc[:, "tid"].values[0] print("Task already exists. Task ID is", task_id) # reverting to prod server @@ -212,8 +208,7 @@ ############################################################################ -# [Complete list of task types](https://www.openml.org/search?type=task_type) -# [Complete list of model estimation procedures]( -# https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure) -# [Complete list of evaluation measures]( -# https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure) +# * `Complete list of task types `_. +# * `Complete list of model estimation procedures `_. +# * `Complete list of evaluation measures `_. +# diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py new file mode 100644 index 000000000..8ca2412ba --- /dev/null +++ b/examples/40_paper/2015_neurips_feurer_example.py @@ -0,0 +1,89 @@ +""" +Feurer et al. (2015) +==================== + +A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al.. 
+
+Auto-sklearn website: https://automl.github.io/auto-sklearn/master/
+
+Publication
+~~~~~~~~~~~
+
+| Efficient and Robust Automated Machine Learning
+| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
+| In *Advances in Neural Information Processing Systems 28*, 2015
+| Available at http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
+"""  # noqa F401
+
+import pandas as pd
+
+import openml
+
+####################################################################################################
+# List of dataset IDs given in the supplementary material of Feurer et al.:
+# https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning-supplemental.zip
+dataset_ids = [
+    3, 6, 12, 14, 16, 18, 21, 22, 23, 24, 26, 28, 30, 31, 32, 36, 38, 44, 46,
+    57, 60, 179, 180, 181, 182, 184, 185, 273, 293, 300, 351, 354, 357, 389,
+    390, 391, 392, 393, 395, 396, 398, 399, 401, 554, 679, 715, 718, 720, 722,
+    723, 727, 728, 734, 735, 737, 740, 741, 743, 751, 752, 761, 772, 797, 799,
+    803, 806, 807, 813, 816, 819, 821, 822, 823, 833, 837, 843, 845, 846, 847,
+    849, 866, 871, 881, 897, 901, 903, 904, 910, 912, 913, 914, 917, 923, 930,
+    934, 953, 958, 959, 962, 966, 971, 976, 977, 978, 979, 980, 991, 993, 995,
+    1000, 1002, 1018, 1019, 1020, 1021, 1036, 1040, 1041, 1049, 1050, 1053,
+    1056, 1067, 1068, 1069, 1111, 1112, 1114, 1116, 1119, 1120, 1128, 1130,
+    1134, 1138, 1139, 1142, 1146, 1161, 1166,
+]
+
+####################################################################################################
+# The dataset IDs could be used directly to load the dataset and split the data into a training set
+# and a test set. However, to be reproducible, we will first obtain the respective tasks from
+# OpenML, which define both the target feature and the train/test split.
+#
+# .. note::
+#    It is discouraged to work directly on datasets and only provide dataset IDs in a paper, as
+#    this does not allow reproducibility (the train/test splits remain unspecified). Please do not
+#    use datasets but the respective tasks as the basis for a paper and publish the task IDs. This
+#    example is only given to showcase the use of OpenML-Python for a published paper and as a
+#    warning on how not to do it. Please check the `OpenML documentation of tasks `_ if you
+#    want to learn more about them.
+
+####################################################################################################
+# This lists both active and inactive tasks (because of ``status='all'``). Unfortunately,
+# this is necessary, as some of the datasets were found to contain issues after publication and
+# were deactivated, which also deactivated the tasks defined on them. More information on active or
+# inactive datasets can be found in the `online docs `_.
+tasks = openml.tasks.list_tasks(
+    task_type_id=openml.tasks.TaskTypeEnum.SUPERVISED_CLASSIFICATION,
+    status='all',
+    output_format='dataframe',
+)
+
+# Query only those with holdout as the resampling strategy.
+tasks = tasks.query('estimation_procedure == "33% Holdout set"')
+
+task_ids = []
+for did in dataset_ids:
+    tasks_ = list(tasks.query("did == {}".format(did)).tid)
+    if len(tasks_) >= 1:  # if there are multiple tasks, take the one with the lowest ID (the oldest).
+ task_id = min(tasks_) + else: + raise ValueError(did) + + # Optional - Check that the task has the same target attribute as the + # dataset default target attribute + # (disabled for this example as it needs to run fast to be rendered online) + # task = openml.tasks.get_task(task_id) + # dataset = task.get_dataset() + # if task.target_name != dataset.default_target_attribute: + # raise ValueError( + # (task.target_name, dataset.default_target_attribute) + # ) + + task_ids.append(task_id) + +assert len(task_ids) == 140 +task_ids.sort() + +# These are the tasks to work with: +print(task_ids) diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py new file mode 100644 index 000000000..ef35a4a21 --- /dev/null +++ b/examples/40_paper/2018_ida_strang_example.py @@ -0,0 +1,120 @@ +""" +Strang et al. (2018) +==================== + +A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models +Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*. + +Publication +~~~~~~~~~~~ + +| Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML +| Benjamin Strang, Peter van der Putten, Jan N. van Rijn and Frank Hutter +| In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018 +| Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25 +""" +import matplotlib.pyplot as plt +import openml +import pandas as pd + +############################################################################## +# A basic step for each data-mining or machine learning task is to determine +# which model to choose based on the problem and the data at hand. In this +# work we investigate when non-linear classifiers outperform linear +# classifiers by means of a large scale experiment. +# +# The paper is accompanied with a study object, containing all relevant tasks +# and runs (``study_id=123``). The paper features three experiment classes: +# Support Vector Machines (SVM), Neural Networks (NN) and Decision Trees (DT). +# This example demonstrates how to reproduce the plots, comparing two +# classifiers given the OpenML flow ids. Note that this allows us to reproduce +# the SVM and NN experiment, but not the DT experiment, as this requires a bit +# more effort to distinguish the same flow with different hyperparameter +# values. 
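+##############################################################################
+# The DT experiment is not reproduced here because flow 7725 has to be split
+# by the value of one of its hyperparameters. The snippet below is only a
+# rough sketch of how this could be approached and is not part of the original
+# analysis: it also fetches the setups and derives a column to group the
+# evaluations on. The hyperparameter name used for the split is a placeholder
+# and would have to be looked up in the description of flow 7725; the fetch is
+# limited to 100 records to keep it fast.
+
+dt_evaluations = openml.evaluations.list_evaluations_setups(
+    'predictive_accuracy', flow=[7725], size=100, output_format='dataframe')
+# placeholder name, not checked against flow 7725
+split_hyperparameter = 'criterion'
+dt_evaluations['split_value'] = dt_evaluations['parameters'].apply(
+    lambda parameters: parameters.get(split_hyperparameter))
+print(dt_evaluations['split_value'].value_counts(dropna=False))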
+ +study_id = 123 +# for comparing svms: flow_ids = [7754, 7756] +# for comparing nns: flow_ids = [7722, 7729] +# for comparing dts: flow_ids = [7725], differentiate on hyper-parameter value +classifier_family = 'SVM' +flow_ids = [7754, 7756] +measure = 'predictive_accuracy' +meta_features = ['NumberOfInstances', 'NumberOfFeatures'] +class_values = ['non-linear better', 'linear better', 'equal'] + +# Downloads all evaluation records related to this study +evaluations = openml.evaluations.list_evaluations( + measure, flow=flow_ids, study=study_id, output_format='dataframe') +# gives us a table with columns data_id, flow1_value, flow2_value +evaluations = evaluations.pivot( + index='data_id', columns='flow_id', values='value').dropna() +# downloads all data qualities (for scatter plot) +data_qualities = openml.datasets.list_datasets( + data_id=list(evaluations.index.values), output_format='dataframe') +# removes irrelevant data qualities +data_qualities = data_qualities[meta_features] +# makes a join between evaluation table and data qualities table, +# now we have columns data_id, flow1_value, flow2_value, meta_feature_1, +# meta_feature_2 +evaluations = evaluations.join(data_qualities, how='inner') + +# adds column that indicates the difference between the two classifiers +evaluations['diff'] = evaluations[flow_ids[0]] - evaluations[flow_ids[1]] + + +############################################################################## +# makes the s-plot + +fig_splot, ax_splot = plt.subplots() +ax_splot.plot(range(len(evaluations)), sorted(evaluations['diff'])) +ax_splot.set_title(classifier_family) +ax_splot.set_xlabel('Dataset (sorted)') +ax_splot.set_ylabel('difference between linear and non-linear classifier') +ax_splot.grid(linestyle='--', axis='y') +plt.show() + + +############################################################################## +# adds column that indicates the difference between the two classifiers, +# needed for the scatter plot + +def determine_class(val_lin, val_nonlin): + if val_lin < val_nonlin: + return class_values[0] + elif val_nonlin < val_lin: + return class_values[1] + else: + return class_values[2] + + +evaluations['class'] = evaluations.apply( + lambda row: determine_class(row[flow_ids[0]], row[flow_ids[1]]), axis=1) + +# does the plotting and formatting +fig_scatter, ax_scatter = plt.subplots() +for class_val in class_values: + df_class = evaluations[evaluations['class'] == class_val] + plt.scatter(df_class[meta_features[0]], + df_class[meta_features[1]], + label=class_val) +ax_scatter.set_title(classifier_family) +ax_scatter.set_xlabel(meta_features[0]) +ax_scatter.set_ylabel(meta_features[1]) +ax_scatter.legend() +ax_scatter.set_xscale('log') +ax_scatter.set_yscale('log') +plt.show() + +############################################################################## +# makes a scatter plot where each data point represents the performance of the +# two algorithms on various axis (not in the paper) + +fig_diagplot, ax_diagplot = plt.subplots() +ax_diagplot.grid(linestyle='--') +ax_diagplot.plot([0, 1], ls="-", color="black") +ax_diagplot.plot([0.2, 1.2], ls="--", color="black") +ax_diagplot.plot([-0.2, 0.8], ls="--", color="black") +ax_diagplot.scatter(evaluations[flow_ids[0]], evaluations[flow_ids[1]]) +ax_diagplot.set_xlabel(measure) +ax_diagplot.set_ylabel(measure) +plt.show() diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py new file mode 100644 index 000000000..3302333ae --- /dev/null +++ 
b/examples/40_paper/2018_kdd_rijn_example.py
@@ -0,0 +1,145 @@
+"""
+van Rijn and Hutter (2018)
+==========================
+
+A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*.
+
+This is a Unix-only tutorial, as the requirements cannot be satisfied on a Windows machine (untested on other
+systems).
+
+Publication
+~~~~~~~~~~~
+
+| Hyperparameter importance across datasets
+| Jan N. van Rijn and Frank Hutter
+| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018
+| Available at https://dl.acm.org/citation.cfm?id=3220058
+"""
+import sys
+
+if sys.platform == 'win32':  # noqa
+    print('The pyrfr library (a requirement of fanova) cannot currently be installed on Windows systems')
+    exit()
+
+import json
+import fanova
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+import openml
+
+
+##############################################################################
+# With the advent of automated machine learning, automated hyperparameter
+# optimization methods are by now routinely used in data mining. However, this
+# progress is not yet matched by equal progress on automatic analyses that
+# yield information beyond performance-optimizing hyperparameter settings.
+# In this example, we aim to answer the following question: Given an
+# algorithm, what are generally its most important hyperparameters?
+#
+# This work is carried out on the OpenML-100 benchmark suite, which can be
+# obtained by ``openml.study.get_suite('OpenML100')``. In this example, we
+# conduct the experiment on the Support Vector Machine (``flow_id=7707``)
+# with a specific kernel (we will perform a post-processing filter operation
+# for this). We also have to set some other experimental parameters (the
+# number of results per task, the evaluation measure and the number of trees
+# of the internal functional ANOVA) before the fun can begin.
+#
+# Note that we simplify the example in several ways:
+#
+# 1) We only consider numerical hyperparameters.
+# 2) We consider all hyperparameters that are numerical, even though in
+#    reality some might be inactive (e.g., ``degree``) or irrelevant
+#    (e.g., ``random_state``).
+# 3) We assume all hyperparameters to be on a uniform scale.
+#
+# Any difference in conclusion between the actual paper and the presented
+# results is most likely due to one of these simplifications. For example,
+# the hyperparameter C looks rather insignificant, whereas it is quite
+# important when it is put on a log-scale. All these simplifications can be
+# addressed by defining a ConfigSpace. For a more elaborate example that uses
+# this, please see:
+# https://github.com/janvanrijn/openml-pimp/blob/d0a14f3eb480f2a90008889f00041bdccc7b9265/examples/plot/plot_fanova_aggregates.py # noqa F401

+suite = openml.study.get_suite('OpenML100')
+flow_id = 7707
+parameter_filters = {
+    'sklearn.svm.classes.SVC(17)_kernel': 'sigmoid'
+}
+evaluation_measure = 'predictive_accuracy'
+limit_per_task = 500
+limit_nr_tasks = 15
+n_trees = 16
+
+fanova_results = []
+# We will obtain all results from OpenML per task. Practice has shown that this places the bottleneck on the
+# communication with OpenML, and for iterated experimenting it is better to cache the results in a local file.
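+##############################################################################
+# The loop below queries the server directly each time the script runs. A
+# minimal caching sketch (not part of the original script; the helper name and
+# cache location are arbitrary) is shown here: store each task's evaluations
+# on disk once and re-read them from there on subsequent runs, by calling
+# ``cached_evaluations(task_id)`` instead of the direct
+# ``list_evaluations_setups`` call inside the loop.
+import os
+
+
+def cached_evaluations(task_id, cache_dir='fanova_evaluation_cache'):
+    os.makedirs(cache_dir, exist_ok=True)
+    cache_file = os.path.join(cache_dir, 'task_{}.pkl'.format(task_id))
+    if os.path.exists(cache_file):
+        # re-use the locally stored evaluations instead of querying the server
+        return pd.read_pickle(cache_file)
+    evals = openml.evaluations.list_evaluations_setups(
+        evaluation_measure, flow=[flow_id], task=[task_id],
+        size=limit_per_task, output_format='dataframe')
+    evals.to_pickle(cache_file)
+    return evals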
+for idx, task_id in enumerate(suite.tasks):
+    if limit_nr_tasks is not None and idx >= limit_nr_tasks:
+        continue
+    print('Starting with task %d (%d/%d)'
+          % (task_id, idx+1, len(suite.tasks) if limit_nr_tasks is None else limit_nr_tasks))
+    # note that we explicitly only include tasks from the benchmark suite that was specified (as per the for-loop)
+    evals = openml.evaluations.list_evaluations_setups(
+        evaluation_measure, flow=[flow_id], task=[task_id], size=limit_per_task, output_format='dataframe')
+
+    performance_column = 'value'
+    # Make a DataFrame consisting of all hyperparameters (which is a dict in setup['parameters']) and the performance
+    # value (in setup['value']). The following lines look a bit complicated, but they combine two steps: a) merge the
+    # hyperparameters and the performance value into a single dict, and b) cast the hyperparameter values to the
+    # appropriate format. Note that ``json.loads(...)`` requires the content to be in JSON format, which is only the
+    # case for scikit-learn setups (and even there some legacy setups might violate this requirement). It will work
+    # for the setups that belong to the flows embedded in this example though.
+    try:
+        setups_evals = pd.DataFrame([dict(**{name: json.loads(value) for name, value in setup['parameters'].items()},
+                                          **{performance_column: setup[performance_column]})
+                                     for _, setup in evals.iterrows()])
+    except json.decoder.JSONDecodeError as e:
+        print('Task %d error: %s' % (task_id, e))
+        continue
+    # apply our filters, so that we only keep the setups that comply with the hyperparameter values we want
+    for filter_key, filter_value in parameter_filters.items():
+        setups_evals = setups_evals[setups_evals[filter_key] == filter_value]
+    # In this simplified example, we only keep integer and float hyperparameters. For categorical hyperparameters,
+    # the fanova library needs to be informed by using a configspace object.
+    setups_evals = setups_evals.select_dtypes(include=['int64', 'float64'])
+    # Drop columns that contain only a single unique value; such hyperparameters are by definition not interesting,
+    # e.g., ``axis`` or ``verbose``.
+    setups_evals = setups_evals[[c for c in list(setups_evals)
+                                 if len(setups_evals[c].unique()) > 1 or c == performance_column]]
+    # We are done with processing ``setups_evals``. Note that we might still have some irrelevant hyperparameters,
+    # e.g., ``random_state``, and that we have dropped some relevant hyperparameters, namely the categorical ones.
+
+    # determine x values to pass to fanova library
+    parameter_names = [pname for pname in setups_evals.columns.to_numpy() if pname != performance_column]
+    evaluator = fanova.fanova.fANOVA(
+        X=setups_evals[parameter_names].to_numpy(), Y=setups_evals[performance_column].to_numpy(), n_trees=n_trees)
+    for idx, pname in enumerate(parameter_names):
+        try:
+            fanova_results.append({
+                'hyperparameter': pname.split(".")[-1],
+                'fanova': evaluator.quantify_importance([idx])[(idx,)]['individual importance']
+            })
+        except RuntimeError as e:
+            # functional ANOVA sometimes crashes with a RuntimeError, e.g., on tasks where the performance is constant
+            # for all configurations (there is no variance). We will skip these tasks (like the authors did in the
+            # paper).
+            print('Task %d error: %s' % (task_id, e))
+            continue
+
+# transform ``fanova_results`` from a list of dicts into a DataFrame
+fanova_results = pd.DataFrame(fanova_results)
+
+##############################################################################
+# make the boxplot of the variance contribution.
Obviously, we can also use +# this data to make the Nemenyi plot, but this relies on the rather complex +# ``Orange`` dependency (``pip install Orange3``). For the complete example, +# the reader is referred to the more elaborate script (referred to earlier) +fig, ax = plt.subplots() +sns.boxplot(x='hyperparameter', y='fanova', data=fanova_results, ax=ax) +ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right') +ax.set_ylabel('Variance Contribution') +ax.set_xlabel(None) +plt.tight_layout() +plt.show() diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py new file mode 100644 index 000000000..5513fab30 --- /dev/null +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -0,0 +1,247 @@ +""" +Perrone et al. (2018) +===================== + +A tutorial on how to build a surrogate model based on OpenML data as done for *Scalable +Hyperparameter Transfer Learning* by Perrone et al.. + +Publication +~~~~~~~~~~~ + +| Scalable Hyperparameter Transfer Learning +| Valerio Perrone and Rodolphe Jenatton and Matthias Seeger and Cedric Archambeau +| In *Advances in Neural Information Processing Systems 31*, 2018 +| Available at http://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf + +This example demonstrates how OpenML runs can be used to construct a surrogate model. + +In the following section, we shall do the following: + +* Retrieve tasks and flows as used in the experiments by Perrone et al. (2018). +* Build a tabular data by fetching the evaluations uploaded to OpenML. +* Impute missing values and handle categorical data before building a Random Forest model that + maps hyperparameter values to the area under curve score. +""" + +############################################################################ +import openml +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt +from sklearn.pipeline import Pipeline +from sklearn.impute import SimpleImputer +from sklearn.compose import ColumnTransformer +from sklearn.metrics import mean_squared_error +from sklearn.preprocessing import OneHotEncoder +from sklearn.ensemble import RandomForestRegressor + +flow_type = 'svm' # this example will use the smaller svm flow evaluations +############################################################################ +# The subsequent functions are defined to fetch tasks, flows, evaluations and preprocess them into +# a tabular format that can be used to build models. + + +def fetch_evaluations(run_full=False, + flow_type='svm', + metric='area_under_roc_curve'): + ''' + Fetch a list of evaluations based on the flows and tasks used in the experiments. 
+ + Parameters + ---------- + run_full : boolean + If True, use the full list of tasks used in the paper + If False, use 5 tasks with the smallest number of evaluations available + flow_type : str, {'svm', 'xgboost'} + To select whether svm or xgboost experiments are to be run + metric : str + The evaluation measure that is passed to openml.evaluations.list_evaluations + + Returns + ------- + eval_df : dataframe + task_ids : list + flow_id : int + ''' + # Collecting task IDs as used by the experiments from the paper + if flow_type == 'svm' and run_full: + task_ids = [ + 10101, 145878, 146064, 14951, 34537, 3485, 3492, 3493, 3494, + 37, 3889, 3891, 3899, 3902, 3903, 3913, 3918, 3950, 9889, + 9914, 9946, 9952, 9967, 9971, 9976, 9978, 9980, 9983, + ] + elif flow_type == 'svm' and not run_full: + task_ids = [9983, 3485, 3902, 3903, 145878] + elif flow_type == 'xgboost' and run_full: + task_ids = [ + 10093, 10101, 125923, 145847, 145857, 145862, 145872, 145878, + 145953, 145972, 145976, 145979, 146064, 14951, 31, 3485, + 3492, 3493, 37, 3896, 3903, 3913, 3917, 3918, 3, 49, 9914, + 9946, 9952, 9967, + ] + else: # flow_type == 'xgboost' and not run_full: + task_ids = [3903, 37, 3485, 49, 3913] + + # Fetching the relevant flow + flow_id = 5891 if flow_type == 'svm' else 6767 + + # Fetching evaluations + eval_df = openml.evaluations.list_evaluations_setups(function=metric, + task=task_ids, + flow=[flow_id], + uploader=[2702], + output_format='dataframe', + parameters_in_separate_columns=True) + return eval_df, task_ids, flow_id + + +def create_table_from_evaluations(eval_df, + flow_type='svm', + run_count=np.iinfo(np.int64).max, + task_ids=None): + ''' + Create a tabular data with its ground truth from a dataframe of evaluations. + Optionally, can filter out records based on task ids. + + Parameters + ---------- + eval_df : dataframe + Containing list of runs as obtained from list_evaluations() + flow_type : str, {'svm', 'xgboost'} + To select whether svm or xgboost experiments are to be run + run_count : int + Maximum size of the table created, or number of runs included in the table + task_ids : list, (optional) + List of integers specifying the tasks to be retained from the evaluations dataframe + + Returns + ------- + eval_table : dataframe + values : list + ''' + if task_ids is not None: + eval_df = eval_df[eval_df['task_id'].isin(task_ids)] + if flow_type == 'svm': + colnames = ['cost', 'degree', 'gamma', 'kernel'] + else: + colnames = [ + 'alpha', 'booster', 'colsample_bylevel', 'colsample_bytree', + 'eta', 'lambda', 'max_depth', 'min_child_weight', 'nrounds', + 'subsample', + ] + eval_df = eval_df.sample(frac=1) # shuffling rows + eval_df = eval_df.iloc[:run_count, :] + eval_df.columns = [column.split('_')[-1] for column in eval_df.columns] + eval_table = eval_df.loc[:, colnames] + value = eval_df.loc[:, 'value'] + return eval_table, value + + +def list_categorical_attributes(flow_type='svm'): + if flow_type == 'svm': + return ['kernel'] + return ['booster'] + + +############################################################################# +# Fetching the data from OpenML +# ***************************** +# Now, we read all the tasks and evaluations for them and collate into a table. +# Here, we are reading all the tasks and evaluations for the SVM flow and +# pre-processing all retrieved evaluations. 
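+#############################################################################
+# A small aside (not part of the original walkthrough): because
+# ``fetch_evaluations`` passes ``parameters_in_separate_columns=True``, every
+# hyperparameter ends up in its own column, and those column names may carry a
+# flow-specific prefix. ``create_table_from_evaluations`` therefore keeps only
+# the part after the last underscore of each column name, which is what makes
+# the plain column names (``cost``, ``gamma``, ...) usable below. The column
+# name in the one-line illustration here is made up and only demonstrates that
+# string operation.
+
+print('some.flow.name(1)_cost'.split('_')[-1])  # prints 'cost'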
+ +eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type) +X, y = create_table_from_evaluations(eval_df, flow_type=flow_type) +print(X.head()) +print("Y : ", y[:5]) + +############################################################################# +# Creating pre-processing and modelling pipelines +# *********************************************** +# The two primary tasks are to impute the missing values, that is, account for the hyperparameters +# that are not available with the runs from OpenML. And secondly, to handle categorical variables +# using One-hot encoding prior to modelling. + +# Separating data into categorical and non-categorical (numeric for this example) columns +cat_cols = list_categorical_attributes(flow_type=flow_type) +num_cols = list(set(X.columns) - set(cat_cols)) + +# Missing value imputers +cat_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None') +num_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1) + +# Creating the one-hot encoder +enc = OneHotEncoder(handle_unknown='ignore') + +# Pipeline to handle categorical column transformations +cat_transforms = Pipeline(steps=[('impute', cat_imputer), ('encode', enc)]) + +# Combining column transformers +ct = ColumnTransformer([('cat', cat_transforms, cat_cols), ('num', num_imputer, num_cols)]) + +# Creating the full pipeline with the surrogate model +clf = RandomForestRegressor(n_estimators=50) +model = Pipeline(steps=[('preprocess', ct), ('surrogate', clf)]) + + +############################################################################# +# Building a surrogate model on a task's evaluation +# ************************************************* +# The same set of functions can be used for a single task to retrieve a singular table which can +# be used for the surrogate model construction. We shall use the SVM flow here to keep execution +# time simple and quick. + +# Selecting a task for the surrogate +task_id = task_ids[-1] +print("Task ID : ", task_id) +X, y = create_table_from_evaluations(eval_df, task_ids=[task_id], flow_type='svm') + +model.fit(X, y) +y_pred = model.predict(X) + +print("Training RMSE : {:.5}".format(mean_squared_error(y, y_pred))) + + +############################################################################# +# Evaluating the surrogate model +# ****************************** +# The surrogate model built from a task's evaluations fetched from OpenML will be put into +# trivial action here, where we shall randomly sample configurations and observe the trajectory +# of the area under curve (auc) we can obtain from the surrogate we've built. 
+# +# NOTE: This section is written exclusively for the SVM flow + + +# Sampling random configurations +def random_sample_configurations(num_samples=100): + colnames = ['cost', 'degree', 'gamma', 'kernel'] + ranges = [(0.000986, 998.492437), + (2.0, 5.0), + (0.000988, 913.373845), + (['linear', 'polynomial', 'radial', 'sigmoid'])] + X = pd.DataFrame(np.nan, index=range(num_samples), columns=colnames) + for i in range(len(colnames)): + if len(ranges[i]) == 2: + col_val = np.random.uniform(low=ranges[i][0], high=ranges[i][1], size=num_samples) + else: + col_val = np.random.choice(ranges[i], size=num_samples) + X.iloc[:, i] = col_val + return X + + +configs = random_sample_configurations(num_samples=1000) +print(configs) + +############################################################################# +preds = model.predict(configs) + +# tracking the maximum AUC obtained over the functions evaluations +preds = np.maximum.accumulate(preds) +# computing regret (1 - predicted_auc) +regret = 1 - preds + +# plotting the regret curve +plt.plot(regret) +plt.title('AUC regret for Random Search on surrogate') +plt.xlabel('Numbe of function evaluations') +plt.ylabel('Regret') diff --git a/examples/40_paper/README.txt b/examples/40_paper/README.txt new file mode 100644 index 000000000..9b571d55b --- /dev/null +++ b/examples/40_paper/README.txt @@ -0,0 +1,5 @@ +Usage in research papers +======================== + +These examples demonstrate how OpenML-Python can be used for research purposes by re-implementing +its use in recent publications. diff --git a/examples/README.txt b/examples/README.txt index e41bfd4fc..b90c0e1cb 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -1,4 +1,3 @@ -Introductory Examples -===================== - -General examples for OpenML usage. +======== +Examples +======== diff --git a/examples/sklearn/README.txt b/examples/sklearn/README.txt deleted file mode 100644 index d61578cf1..000000000 --- a/examples/sklearn/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -Experiment Examples -=================== - -OpenML experiment examples using a sklearn classifier/pipeline. diff --git a/examples/sklearn/openml_run_example.py b/examples/sklearn/openml_run_example.py deleted file mode 100644 index 195a0aa77..000000000 --- a/examples/sklearn/openml_run_example.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -OpenML Run Example -================== - -An example of an automated machine learning experiment. -""" -import openml -from sklearn import impute, tree, pipeline - -############################################################################ -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org. This prevents the main -# server from crowding with example datasets, tasks, runs, and so on. - -openml.config.start_using_configuration_for_example() -############################################################################ - -# Uncomment and set your OpenML key. Don't share your key with others. -# openml.config.apikey = 'YOURKEY' - -# Define a scikit-learn pipeline -clf = pipeline.Pipeline( - steps=[ - ('imputer', impute.SimpleImputer()), - ('estimator', tree.DecisionTreeClassifier()) - ] -) -############################################################################ -# Download the OpenML task for the german credit card dataset. -task = openml.tasks.get_task(97) -############################################################################ -# Run the scikit-learn model on the task (requires an API key). 
-run = openml.runs.run_model_on_task(clf, task) -# Publish the experiment on OpenML (optional, requires an API key). -run.publish() - -print('URL for run: %s/run/%d' % (openml.config.server, run.run_id)) - -############################################################################ -openml.config.stop_using_configuration_for_example() diff --git a/openml/__version__.py b/openml/__version__.py index fd6968a5d..30750c80a 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.10.0" +__version__ = "0.10.1" diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 803dc6b42..22223d587 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -1,4 +1,5 @@ import time +from typing import Dict import requests import warnings @@ -80,7 +81,7 @@ def _read_url_files(url, data=None, file_elements=None): files=file_elements, ) if response.status_code != 200: - raise _parse_server_exception(response, url) + raise _parse_server_exception(response, url, file_elements=file_elements) if 'Content-Encoding' not in response.headers or \ response.headers['Content-Encoding'] != 'gzip': warnings.warn('Received uncompressed content from OpenML for {}.' @@ -95,7 +96,7 @@ def _read_url(url, request_method, data=None): response = send_request(request_method=request_method, url=url, data=data) if response.status_code != 200: - raise _parse_server_exception(response, url) + raise _parse_server_exception(response, url, file_elements=None) if 'Content-Encoding' not in response.headers or \ response.headers['Content-Encoding'] != 'gzip': warnings.warn('Received uncompressed content from OpenML for {}.' @@ -137,7 +138,11 @@ def send_request( return response -def _parse_server_exception(response, url): +def _parse_server_exception( + response: requests.Response, + url: str, + file_elements: Dict, +) -> OpenMLServerError: # OpenML has a sophisticated error system # where information about failures is provided. 
try to parse this try: @@ -152,12 +157,29 @@ def _parse_server_exception(response, url): message = server_error['oml:message'] additional_information = server_error.get('oml:additional_information') if code in [372, 512, 500, 482, 542, 674]: + if additional_information: + full_message = '{} - {}'.format(message, additional_information) + else: + full_message = message + # 512 for runs, 372 for datasets, 500 for flows # 482 for tasks, 542 for evaluations, 674 for setups - return OpenMLServerNoResult(code, message, additional_information) + return OpenMLServerNoResult( + code=code, + message=full_message, + ) + # 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow) + if code in [163] and file_elements is not None and 'description' in file_elements: + # file_elements['description'] is the XML file description of the flow + full_message = '\n{}\n{} - {}'.format( + file_elements['description'], + message, + additional_information, + ) + else: + full_message = '{} - {}'.format(message, additional_information) return OpenMLServerException( code=code, - message=message, - additional=additional_information, + message=full_message, url=url ) diff --git a/openml/base.py b/openml/base.py new file mode 100644 index 000000000..9e28bd055 --- /dev/null +++ b/openml/base.py @@ -0,0 +1,157 @@ +from abc import ABC, abstractmethod +from collections import OrderedDict +import re +from typing import Optional, List, Tuple, Union, Dict +import webbrowser + +import xmltodict + +import openml.config +from .utils import _tag_openml_base, _get_rest_api_type_alias + + +class OpenMLBase(ABC): + """ Base object for functionality that is shared across entities. """ + + def __repr__(self): + body_fields = self._get_repr_body_fields() + return self._apply_repr_template(body_fields) + + @property + @abstractmethod + def id(self) -> Optional[int]: + """ The id of the entity, it is unique for its entity type. """ + pass + + @property + def openml_url(self) -> Optional[str]: + """ The URL of the object on the server, if it was uploaded, else None. """ + if self.id is None: + return None + return self.__class__.url_for_id(self.id) + + @classmethod + def url_for_id(cls, id_: int) -> str: + """ Return the OpenML URL for the object of the class entity with the given id. """ + # Sample url for a flow: openml.org/f/123 + return "{}/{}/{}".format(openml.config.server_base_url, cls._entity_letter(), id_) + + @classmethod + def _entity_letter(cls) -> str: + """ Return the letter which represents the entity type in urls, e.g. 'f' for flow.""" + # We take advantage of the class naming convention (OpenMLX), + # which holds for all entities except studies and tasks, which overwrite this method. + return cls.__name__.lower()[len('OpenML'):][0] + + @abstractmethod + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. + + Returns + ------ + body_fields : List[Tuple[str, Union[str, int, List[str]]]] + A list of (name, value) pairs to display in the body of the __repr__. + E.g.: [('metric', 'accuracy'), ('dataset', 'iris')] + If value is a List of str, then each item of the list will appear in a separate row. + """ + # Should be implemented in the base class. + pass + + def _apply_repr_template(self, body_fields: List[Tuple[str, str]]) -> str: + """ Generates the header and formats the body for string representation of the object. 
+ + Parameters + ---------- + body_fields: List[Tuple[str, str]] + A list of (name, value) pairs to display in the body of the __repr__. + """ + # We add spaces between capitals, e.g. ClassificationTask -> Classification Task + name_with_spaces = re.sub(r"(\w)([A-Z])", r"\1 \2", + self.__class__.__name__[len('OpenML'):]) + header_text = 'OpenML {}'.format(name_with_spaces) + header = '{}\n{}\n'.format(header_text, '=' * len(header_text)) + + longest_field_name_length = max(len(name) for name, value in body_fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in body_fields) + return header + body + + @abstractmethod + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. + + Uses OrderedDict to ensure consistent ordering when converting to xml. + The return value (OrderedDict) will be used to create the upload xml file. + The xml file must have the tags in exactly the order of the object's xsd. + (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/). + + Returns + ------- + OrderedDict + Flow represented as OrderedDict. + + """ + # Should be implemented in the base class. + pass + + def _to_xml(self) -> str: + """ Generate xml representation of self for upload to server. """ + dict_representation = self._to_dict() + xml_representation = xmltodict.unparse(dict_representation, pretty=True) + + # A task may not be uploaded with the xml encoding specification: + # + encoding_specification, xml_body = xml_representation.split('\n', 1) + return xml_body + + def _get_file_elements(self) -> Dict: + """ Get file_elements to upload to the server, called during Publish. + + Derived child classes should overwrite this method as necessary. + The description field will be populated automatically if not provided. + """ + return {} + + @abstractmethod + def _parse_publish_response(self, xml_response: Dict): + """ Parse the id from the xml_response and assign it to self. """ + pass + + def publish(self) -> 'OpenMLBase': + file_elements = self._get_file_elements() + + if 'description' not in file_elements: + file_elements['description'] = self._to_xml() + + call = '{}/'.format(_get_rest_api_type_alias(self)) + response_text = openml._api_calls._perform_api_call( + call, 'post', file_elements=file_elements + ) + xml_response = xmltodict.parse(response_text) + + self._parse_publish_response(xml_response) + return self + + def open_in_browser(self): + """ Opens the OpenML web page corresponding to this object in your default browser. """ + webbrowser.open(self.openml_url) + + def push_tag(self, tag: str): + """Annotates this entity with a tag on the server. + + Parameters + ---------- + tag : str + Tag to attach to the flow. + """ + _tag_openml_base(self, tag) + + def remove_tag(self, tag: str): + """Removes a tag from this entity on the server. + + Parameters + ---------- + tag : str + Tag to attach to the flow. 
+ """ + _tag_openml_base(self, tag, untag=True) diff --git a/openml/config.py b/openml/config.py index 91d7345e0..2af1bfef6 100644 --- a/openml/config.py +++ b/openml/config.py @@ -2,23 +2,49 @@ Store module level information like the API key, cache directory and the server """ import logging +import logging.handlers import os +from typing import cast from io import StringIO import configparser from urllib.parse import urlparse - logger = logging.getLogger(__name__) -logging.basicConfig( - format='[%(levelname)s] [%(asctime)s:%(name)s] %(' - 'message)s', datefmt='%H:%M:%S') -# Default values! + +def configure_logging(console_output_level: int, file_output_level: int): + """ Sets the OpenML logger to DEBUG, with attached Stream- and FileHandler. """ + # Verbosity levels as defined (https://github.com/openml/OpenML/wiki/Client-API-Standards) + # don't match Python values directly: + verbosity_map = {0: logging.WARNING, 1: logging.INFO, 2: logging.DEBUG} + + openml_logger = logging.getLogger('openml') + openml_logger.setLevel(logging.DEBUG) + message_format = '[%(levelname)s] [%(asctime)s:%(name)s] %(message)s' + output_formatter = logging.Formatter(message_format, datefmt='%H:%M:%S') + + console_stream = logging.StreamHandler() + console_stream.setFormatter(output_formatter) + console_stream.setLevel(verbosity_map[console_output_level]) + + one_mb = 2**20 + log_path = os.path.join(cache_directory, 'openml_python.log') + file_stream = logging.handlers.RotatingFileHandler(log_path, maxBytes=one_mb, backupCount=1) + file_stream.setLevel(verbosity_map[file_output_level]) + file_stream.setFormatter(output_formatter) + + openml_logger.addHandler(console_stream) + openml_logger.addHandler(file_stream) + return console_stream, file_stream + + +# Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards) _defaults = { 'apikey': None, 'server': "https://www.openml.org/api/v1/xml", - 'verbosity': 0, + 'verbosity': 0, # WARNING + 'file_verbosity': 2, # DEBUG 'cachedir': os.path.expanduser(os.path.join('~', '.openml', 'cache')), 'avoid_duplicate_runs': 'True', 'connection_n_retries': 2, @@ -28,10 +54,11 @@ # Default values are actually added here in the _setup() function which is # called at the end of this module -server = _defaults['server'] +server = str(_defaults['server']) # so mypy knows it is a string +server_base_url = server[:-len('/api/v1/xml')] apikey = _defaults['apikey'] # The current cache directory (without the server name) -cache_directory = _defaults['cachedir'] +cache_directory = str(_defaults['cachedir']) # so mypy knows it is a string avoid_duplicate_runs = True if _defaults['avoid_duplicate_runs'] == 'True' else False # Number of retries if the connection breaks @@ -100,12 +127,14 @@ def _setup(): global cache_directory global avoid_duplicate_runs global connection_n_retries + # read config file, create cache directory try: os.mkdir(os.path.expanduser(os.path.join('~', '.openml'))) - except (IOError, OSError): - # TODO add debug information + except FileExistsError: + # For other errors, we want to propagate the error as openml does not work without cache pass + config = _parse_config() apikey = config.get('FAKE_SECTION', 'apikey') server = config.get('FAKE_SECTION', 'server') @@ -113,6 +142,13 @@ def _setup(): short_cache_dir = config.get('FAKE_SECTION', 'cachedir') cache_directory = os.path.expanduser(short_cache_dir) + # create the cache subdirectory + try: + os.mkdir(cache_directory) + except FileExistsError: + # For other errors, we want 
to propagate the error as openml does not work without cache + pass + avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs') connection_n_retries = config.get('FAKE_SECTION', 'connection_n_retries') @@ -146,7 +182,7 @@ def _parse_config(): config_file_.seek(0) config.read_file(config_file_) except OSError as e: - logging.info("Error opening file %s: %s", config_file, e.message) + logger.info("Error opening file %s: %s", config_file, e.message) return config @@ -203,3 +239,7 @@ def set_cache_directory(cachedir): ] _setup() + +_console_log_level = cast(int, _defaults['verbosity']) +_file_log_level = cast(int, _defaults['file_verbosity']) +console_log, file_log = configure_logging(_console_log_level, _file_log_level) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 630fac35e..26215736d 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -1,28 +1,27 @@ from collections import OrderedDict +import re import gzip import io import logging import os import pickle -from typing import List, Optional, Union, Tuple, Iterable +from typing import List, Optional, Union, Tuple, Iterable, Dict import arff import numpy as np import pandas as pd import scipy.sparse -import xmltodict from warnings import warn -import openml._api_calls +from openml.base import OpenMLBase from .data_feature import OpenMLDataFeature from ..exceptions import PyOpenMLError -from ..utils import _tag_entity logger = logging.getLogger(__name__) -class OpenMLDataset(object): +class OpenMLDataset(OpenMLBase): """Dataset object. Allows fetching and uploading datasets to OpenML. @@ -108,7 +107,18 @@ def __init__(self, name, description, format=None, paper_url=None, update_comment=None, md5_checksum=None, data_file=None, features=None, qualities=None, dataset=None): - + if dataset_id is None: + if description and not re.match("^[\x00-\x7F]*$", description): + # not basiclatin (XSD complains) + raise ValueError("Invalid symbols in description: {}".format( + description)) + if citation and not re.match("^[\x00-\x7F]*$", citation): + # not basiclatin (XSD complains) + raise ValueError("Invalid symbols in citation: {}".format( + citation)) + if not re.match("^[a-zA-Z0-9_\\-\\.\\(\\),]+$", name): + # regex given by server in error message + raise ValueError("Invalid symbols in name: {}".format(name)) # TODO add function to check if the name is casual_string128 # Attributes received by querying the RESTful API self.dataset_id = int(dataset_id) if dataset_id is not None else None @@ -168,15 +178,16 @@ def __init__(self, name, description, format=None, self.qualities = _check_qualities(qualities) if data_file is not None: - self.data_pickle_file = self._data_arff_to_pickle(data_file) + self.data_pickle_file = self._create_pickle_in_cache(data_file) else: self.data_pickle_file = None - def __repr__(self): - header = "OpenML Dataset" - header = '{}\n{}\n'.format(header, '=' * len(header)) + @property + def id(self) -> Optional[int]: + return self.dataset_id - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. 
""" fields = {"Name": self.name, "Version": self.version, "Format": self.format, @@ -184,44 +195,126 @@ def __repr__(self): "Download URL": self.url, "Data file": self.data_file, "Pickle file": self.data_pickle_file, - "# of features": len(self.features)} + "# of features": len(self.features) + if self.features is not None else None} if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace('T', ' ') if self.dataset_id is not None: - fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id) - if self.qualities['NumberOfInstances'] is not None: + fields["OpenML URL"] = self.openml_url + if self.qualities is not None and self.qualities['NumberOfInstances'] is not None: fields["# of instances"] = int(self.qualities['NumberOfInstances']) # determines the order in which the information will be printed order = ["Name", "Version", "Format", "Upload Date", "Licence", "Download URL", "OpenML URL", "Data File", "Pickle File", "# of features", "# of instances"] - fields = [(key, fields[key]) for key in order if key in fields] + return [(key, fields[key]) for key in order if key in fields] - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body + def __eq__(self, other): - def _data_arff_to_pickle(self, data_file): - data_pickle_file = data_file.replace('.arff', '.pkl.py3') - if os.path.exists(data_pickle_file): - with open(data_pickle_file, "rb") as fh: - data, categorical, attribute_names = pickle.load(fh) + if type(other) != OpenMLDataset: + return False - # Between v0.8 and v0.9 the format of pickled data changed from - # np.ndarray to pd.DataFrame. This breaks some backwards compatibility, - # e.g. for `run_model_on_task`. If a local file still exists with - # np.ndarray data, we reprocess the data file to store a pickled - # pd.DataFrame blob. See also #646. - if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data): - logger.debug("Data pickle file already exists.") - return data_pickle_file + server_fields = { + 'dataset_id', + 'version', + 'upload_date', + 'url', + 'dataset', + 'data_file', + } + + # check that the keys are identical + self_keys = set(self.__dict__.keys()) - server_fields + other_keys = set(other.__dict__.keys()) - server_fields + if self_keys != other_keys: + return False + + # check that values of the common keys are identical + return all(self.__dict__[key] == other.__dict__[key] + for key in self_keys) + + def _download_data(self) -> None: + """ Download ARFF data file to standard cache directory. Set `self.data_file`. """ + # import required here to avoid circular import. + from .functions import _get_dataset_arff + self.data_file = _get_dataset_arff(self) + + def _get_arff(self, format: str) -> Dict: + """Read ARFF file and return decoded arff. + + Reads the file referenced in self.data_file. + + Parameters + ---------- + format : str + Format of the ARFF file. + Must be one of 'arff' or 'sparse_arff' or a string that will be either of those + when converted to lower case. + + + + Returns + ------- + dict + Decoded arff. + """ + + # TODO: add a partial read method which only returns the attribute + # headers of the corresponding .arff file! 
+ import struct + + filename = self.data_file + bits = (8 * struct.calcsize("P")) + # Files can be considered too large on a 32-bit system, + # if it exceeds 120mb (slightly more than covtype dataset size) + # This number is somewhat arbitrary. + if bits != 64 and os.path.getsize(filename) > 120000000: + raise NotImplementedError("File {} too big for {}-bit system ({} bytes)." + .format(filename, os.path.getsize(filename), bits)) + + if format.lower() == 'arff': + return_type = arff.DENSE + elif format.lower() == 'sparse_arff': + return_type = arff.COO + else: + raise ValueError('Unknown data format {}'.format(format)) + + def decode_arff(fh): + decoder = arff.ArffDecoder() + return decoder.decode(fh, encode_nominal=True, + return_type=return_type) + + if filename[-3:] == ".gz": + with gzip.open(filename) as fh: + return decode_arff(fh) + else: + with io.open(filename, encoding='utf8') as fh: + return decode_arff(fh) + + def _parse_data_from_arff( + self, + arff_file_path: str + ) -> Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]]: + """ Parse all required data from arff file. + + Parameters + ---------- + arff_file_path : str + Path to the file on disk. + + Returns + ------- + Tuple[Union[pd.DataFrame, scipy.sparse.csr_matrix], List[bool], List[str]] + DataFrame or csr_matrix: dataset + List[bool]: List indicating which columns contain categorical variables. + List[str]: List of column names. + """ try: data = self._get_arff(self.format) except OSError as e: - logger.critical("Please check that the data file %s is " - "there and can be read.", data_file) + logger.critical("Please check that the data file {} is " + "there and can be read.".format(arff_file_path)) raise e ARFF_DTYPES_TO_PD_DTYPE = { @@ -234,13 +327,15 @@ def _data_arff_to_pickle(self, data_file): attribute_names = [] categories_names = {} categorical = [] - for name, type_ in data['attributes']: + for i, (name, type_) in enumerate(data['attributes']): # if the feature is nominal and the a sparse matrix is # requested, the categories need to be numeric if (isinstance(type_, list) and self.format.lower() == 'sparse_arff'): try: - np.array(type_, dtype=np.float32) + # checks if the strings which should be the class labels + # can be encoded into integers + pd.factorize(type_)[0] except ValueError: raise ValueError( "Categorical data needs to be numeric when " @@ -282,7 +377,6 @@ def _data_arff_to_pickle(self, data_file): X = scipy.sparse.coo_matrix( (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32) X = X.tocsr() - elif self.format.lower() == 'arff': X = pd.DataFrame(data['data'], columns=attribute_names) @@ -295,103 +389,72 @@ def _data_arff_to_pickle(self, data_file): else: col.append(X[column_name]) X = pd.concat(col, axis=1) + else: + raise ValueError("Dataset format '{}' is not a valid format.".format(self.format)) + + return X, categorical, attribute_names + + def _create_pickle_in_cache(self, data_file: str) -> str: + """ Parse the arff and pickle the result. Update any old pickle objects. """ + data_pickle_file = data_file.replace('.arff', '.pkl.py3') + if os.path.exists(data_pickle_file): + # Load the data to check if the pickle file is outdated (i.e. contains numpy array) + with open(data_pickle_file, "rb") as fh: + try: + data, categorical, attribute_names = pickle.load(fh) + except EOFError: + # The file is likely corrupt, see #780. + # We deal with this when loading the data in `_load_data`. 
+ return data_pickle_file + + # Between v0.8 and v0.9 the format of pickled data changed from + # np.ndarray to pd.DataFrame. This breaks some backwards compatibility, + # e.g. for `run_model_on_task`. If a local file still exists with + # np.ndarray data, we reprocess the data file to store a pickled + # pd.DataFrame blob. See also #646. + if isinstance(data, pd.DataFrame) or scipy.sparse.issparse(data): + logger.debug("Data pickle file already exists and is up to date.") + return data_pickle_file + + # At this point either the pickle file does not exist, or it had outdated formatting. + # We parse the data from arff again and populate the cache with a recent pickle file. + X, categorical, attribute_names = self._parse_data_from_arff(data_file) - # Pickle the dataframe or the sparse matrix. with open(data_pickle_file, "wb") as fh: - pickle.dump((X, categorical, attribute_names), fh, -1) + pickle.dump((X, categorical, attribute_names), fh, pickle.HIGHEST_PROTOCOL) logger.debug("Saved dataset {did}: {name} to file {path}" .format(did=int(self.dataset_id or -1), name=self.name, path=data_pickle_file) ) - return data_pickle_file - - def push_tag(self, tag): - """Annotates this data set with a tag on the server. - - Parameters - ---------- - tag : str - Tag to attach to the dataset. - """ - _tag_entity('data', self.dataset_id, tag) - - def remove_tag(self, tag): - """Removes a tag from this dataset on the server. - - Parameters - ---------- - tag : str - Tag to attach to the dataset. - """ - _tag_entity('data', self.dataset_id, tag, untag=True) - - def __eq__(self, other): - - if type(other) != OpenMLDataset: - return False - - server_fields = { - 'dataset_id', - 'version', - 'upload_date', - 'url', - 'dataset', - 'data_file', - } - - # check that the keys are identical - self_keys = set(self.__dict__.keys()) - server_fields - other_keys = set(other.__dict__.keys()) - server_fields - if self_keys != other_keys: - return False - - # check that values of the common keys are identical - return all(self.__dict__[key] == other.__dict__[key] - for key in self_keys) - - def _get_arff(self, format): - """Read ARFF file and return decoded arff. - - Reads the file referenced in self.data_file. - - Returns - ------- - dict - Decoded arff. - - """ - - # TODO: add a partial read method which only returns the attribute - # headers of the corresponding .arff file! - import struct - - filename = self.data_file - bits = (8 * struct.calcsize("P")) - # Files can be considered too large on a 32-bit system, - # if it exceeds 120mb (slightly more than covtype dataset size) - # This number is somewhat arbitrary. - if bits != 64 and os.path.getsize(filename) > 120000000: - return NotImplementedError("File too big") - if format.lower() == 'arff': - return_type = arff.DENSE - elif format.lower() == 'sparse_arff': - return_type = arff.COO - else: - raise ValueError('Unknown data format %s' % format) + return data_pickle_file - def decode_arff(fh): - decoder = arff.ArffDecoder() - return decoder.decode(fh, encode_nominal=True, - return_type=return_type) + def _load_data(self): + """ Load data from pickle or arff. Download data first if not present on disk. 
""" + if self.data_pickle_file is None: + if self.data_file is None: + self._download_data() + self.data_pickle_file = self._create_pickle_in_cache(self.data_file) - if filename[-3:] == ".gz": - with gzip.open(filename) as fh: - return decode_arff(fh) - else: - with io.open(filename, encoding='utf8') as fh: - return decode_arff(fh) + try: + with open(self.data_pickle_file, "rb") as fh: + data, categorical, attribute_names = pickle.load(fh) + except EOFError: + logger.warning( + "Detected a corrupt cache file loading dataset %d: '%s'. " + "We will continue loading data from the arff-file, " + "but this will be much slower for big datasets. " + "Please manually delete the cache file if you want openml-python " + "to attempt to reconstruct it." + "" % (self.dataset_id, self.data_pickle_file) + ) + data, categorical, attribute_names = self._parse_data_from_arff(self.data_file) + except FileNotFoundError: + raise ValueError("Cannot find a pickle file for dataset {} at " + "location {} ".format(self.name, self.data_pickle_file)) + + return data, categorical, attribute_names @staticmethod def _convert_array_format(data, array_format, attribute_names): @@ -417,6 +480,7 @@ def _convert_array_format(data, array_format, attribute_names): else returns data as is """ + if array_format == "array" and not scipy.sparse.issparse(data): # We encode the categories such that they are integer to be able # to make a conversion to numeric for backward compatibility @@ -441,11 +505,17 @@ def _encode_if_category(column): 'PyOpenML cannot handle string when returning numpy' ' arrays. Use dataset_format="dataframe".' ) - elif array_format == "dataframe" and scipy.sparse.issparse(data): - return pd.SparseDataFrame(data, columns=attribute_names) + elif array_format == "dataframe": + if scipy.sparse.issparse(data): + return pd.SparseDataFrame(data, columns=attribute_names) + else: + return data else: data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" - warn("Cannot convert {} to '{}'. Returning input data.".format(data_type, array_format)) + logger.warning( + "Cannot convert %s (%s) to '%s'. Returning input data." + % (data_type, type(data), array_format) + ) return data @staticmethod @@ -461,12 +531,6 @@ def _unpack_categories(series, categories): raw_cat = pd.Categorical(col, ordered=True, categories=categories) return pd.Series(raw_cat, index=series.index, name=series.name) - def _download_data(self) -> None: - """ Download ARFF data file to standard cache directory. Set `self.data_file`. """ - # import required here to avoid circular import. - from .functions import _get_dataset_arff - self.data_file = _get_dataset_arff(self) - def get_data( self, target: Optional[Union[List[str], str]] = None, @@ -507,18 +571,7 @@ def get_data( attribute_names : List[str] List of attribute names. 
""" - if self.data_pickle_file is None: - if self.data_file is None: - self._download_data() - self.data_pickle_file = self._data_arff_to_pickle(self.data_file) - - path = self.data_pickle_file - if not os.path.exists(path): - raise ValueError("Cannot find a pickle file for dataset %s at " - "location %s " % (self.name, path)) - else: - with open(path, "rb") as fh: - data, categorical, attribute_names = pickle.load(fh) + data, categorical, attribute_names = self._load_data() to_exclude = [] if not include_row_id and self.row_id_attribute is not None: @@ -673,58 +726,31 @@ def get_features_by_type(self, data_type, exclude=None, result.append(idx - offset) return result - def publish(self): - """Publish the dataset on the OpenML server. - - Upload the dataset description and dataset content to openml. + def _get_file_elements(self) -> Dict: + """ Adds the 'dataset' to file elements. """ + file_elements = {} + path = None if self.data_file is None else os.path.abspath(self.data_file) - Returns - ------- - dataset_id: int - Id of the dataset uploaded to the server. - """ - file_elements = {'description': self._to_xml()} - - # the arff dataset string is available if self._dataset is not None: file_elements['dataset'] = self._dataset - else: - # the path to the arff dataset is given - if self.data_file is not None: - path = os.path.abspath(self.data_file) - if os.path.exists(path): - try: - - with io.open(path, encoding='utf8') as fh: - # check if arff is valid - decoder = arff.ArffDecoder() - decoder.decode(fh, encode_nominal=True) - except arff.ArffException: - raise ValueError("The file you have provided is not " - "a valid arff file.") - - with open(path, 'rb') as fp: - file_elements['dataset'] = fp.read() - else: - if self.url is None: - raise ValueError("No url/path to the data file was given") - - return_value = openml._api_calls._perform_api_call( - "data/", 'post', - file_elements=file_elements, - ) - response = xmltodict.parse(return_value) - self.dataset_id = int(response['oml:upload_data_set']['oml:id']) - return self.dataset_id - - def _to_xml(self): - """ Serialize object to xml for upload - - Returns - ------- - xml_dataset : str - XML description of the data. - """ + elif path is not None and os.path.exists(path): + with open(path, 'rb') as fp: + file_elements['dataset'] = fp.read() + try: + dataset_utf8 = str(file_elements['dataset'], 'utf8') + arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True) + except arff.ArffException: + raise ValueError("The file you have provided is not a valid arff file.") + elif self.url is None: + raise ValueError("No valid url/path to the data file was given.") + return file_elements + + def _parse_publish_response(self, xml_response: Dict): + """ Parse the id from the xml_response and assign it to self. """ + self.dataset_id = int(xml_response['oml:upload_data_set']['oml:id']) + + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. 
""" props = ['id', 'name', 'version', 'description', 'format', 'creator', 'contributor', 'collection_date', 'upload_date', 'language', 'licence', 'url', 'default_target_attribute', @@ -732,7 +758,7 @@ def _to_xml(self): 'citation', 'tag', 'visibility', 'original_data_url', 'paper_url', 'update_comment', 'md5_checksum'] - data_container = OrderedDict() + data_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' data_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')]) data_container['oml:data_set_description'] = data_dict @@ -741,14 +767,7 @@ def _to_xml(self): if content is not None: data_dict["oml:" + prop] = content - xml_string = xmltodict.unparse( - input_dict=data_container, - pretty=True, - ) - # A flow may not be uploaded with the xml encoding specification: - # - xml_string = xml_string.split('\n', 1)[-1] - return xml_string + return data_container def _check_qualities(qualities): diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 1ed888ec1..bc2606506 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1,4 +1,5 @@ import io +import logging import os import re from typing import List, Dict, Union, Optional @@ -28,6 +29,7 @@ DATASETS_CACHE_DIR_NAME = 'datasets' +logger = logging.getLogger(__name__) ############################################################################ # Local getters/accessors to the cache directory @@ -190,6 +192,7 @@ def list_qualities() -> List[str]: def list_datasets( + data_id: Optional[List[int]] = None, offset: Optional[int] = None, size: Optional[int] = None, status: Optional[str] = None, @@ -204,6 +207,9 @@ def list_datasets( Parameters ---------- + data_id : list, optional + A list of data ids, to specify which datasets should be + listed offset : int, optional The number of datasets to skip, starting from the first. size : int, optional @@ -251,7 +257,8 @@ def list_datasets( raise ValueError("Invalid output format selected. " "Only 'dict' or 'dataframe' applicable.") - return openml.utils._list_all(output_format=output_format, + return openml.utils._list_all(data_id=data_id, + output_format=output_format, listing_call=_list_datasets, offset=offset, size=size, @@ -260,12 +267,19 @@ def list_datasets( **kwargs) -def _list_datasets(output_format='dict', **kwargs): +def _list_datasets(data_id: Optional[List] = None, output_format='dict', **kwargs): """ Perform api call to return a list of all datasets. Parameters ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + data_id : list, optional + output_format: str, optional (default='dict') The parameter decides the format of the output. - If 'dict' the output is a dict of dict @@ -285,6 +299,8 @@ def _list_datasets(output_format='dict', **kwargs): if kwargs is not None: for operator, value in kwargs.items(): api_call += "/%s/%s" % (operator, value) + if data_id is not None: + api_call += "/data_id/%s" % ','.join([str(int(i)) for i in data_id]) return __list_datasets(api_call=api_call, output_format=output_format) @@ -444,7 +460,8 @@ def get_dataset( If dataset is retrieved by name, a version may be specified. If no version is specified and multiple versions of the dataset exist, the earliest version of the dataset that is still active will be returned. - This scenario will raise an error instead if `exception_if_multiple` is `True`. 
+ If no version is specified, multiple versions of the dataset exist and + ``exception_if_multiple`` is set to ``True``, this function will raise an exception. Parameters ---------- @@ -459,7 +476,7 @@ def get_dataset( Specifies the version if `dataset_id` is specified by name. If no version is specified, retrieve the least recent still active version. error_if_multiple : bool, optional (default=False) - If `True` raise an error if multiple datasets are found with matching criteria. + If ``True`` raise an error if multiple datasets are found with matching criteria. Returns ------- @@ -483,10 +500,17 @@ def get_dataset( remove_dataset_cache = True description = _get_dataset_description(did_cache_dir, dataset_id) features = _get_dataset_features(did_cache_dir, dataset_id) - qualities = _get_dataset_qualities(did_cache_dir, dataset_id) - arff_file = _get_dataset_arff(description) if download_data else None + try: + qualities = _get_dataset_qualities(did_cache_dir, dataset_id) + except OpenMLServerException as e: + if e.code == 362 and str(e) == 'No qualities found - None': + logger.warning("No qualities found for dataset {}".format(dataset_id)) + qualities = None + else: + raise + arff_file = _get_dataset_arff(description) if download_data else None remove_dataset_cache = False except OpenMLServerException as e: # if there was an exception, @@ -525,10 +549,15 @@ def attributes_arff_from_df(df): 'string': 'STRING' } attributes_arff = [] + + if not all([isinstance(column_name, str) for column_name in df.columns]): + logger.warning("Converting non-str column names to str.") + df.columns = [str(column_name) for column_name in df.columns] + for column_name in df: # skipna=True does not infer properly the dtype. The NA values are # dropped before the inference instead. - column_dtype = pd.api.types.infer_dtype(df[column_name].dropna()) + column_dtype = pd.api.types.infer_dtype(df[column_name].dropna(), skipna=False) if column_dtype == 'categorical': # for categorical feature, arff expects a list string. However, a diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 48b407575..9d8507708 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -26,6 +26,10 @@ class OpenMLEvaluation(object): The evaluation metric of this item (e.g., accuracy). upload_time : str The time of evaluation. + uploader: int + Uploader ID (user ID) + upload_name : str + Name of the uploader of this evaluation value : float The value (score) of this evaluation. 
values : List[float] @@ -35,7 +39,8 @@ class OpenMLEvaluation(object): (e.g., in case of precision, auroc, recall) """ def __init__(self, run_id, task_id, setup_id, flow_id, flow_name, - data_id, data_name, function, upload_time, value, values, + data_id, data_name, function, upload_time, uploader: int, + uploader_name: str, value, values, array_data=None): self.run_id = run_id self.task_id = task_id @@ -46,6 +51,8 @@ def __init__(self, run_id, task_id, setup_id, flow_id, flow_name, self.data_name = data_name self.function = function self.upload_time = upload_time + self.uploader = uploader + self.uploader_name = uploader_name self.value = value self.values = values self.array_data = array_data @@ -54,18 +61,17 @@ def __repr__(self): header = "OpenML Evaluation" header = '{}\n{}\n'.format(header, '=' * len(header)) - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) fields = {"Upload Date": self.upload_time, "Run ID": self.run_id, - "OpenML Run URL": "{}r/{}".format(base_url, self.run_id), + "OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id), "Task ID": self.task_id, - "OpenML Task URL": "{}t/{}".format(base_url, self.task_id), + "OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id), "Flow ID": self.flow_id, - "OpenML Flow URL": "{}f/{}".format(base_url, self.flow_id), + "OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), "Setup ID": self.setup_id, "Data ID": self.data_id, "Data Name": self.data_name, - "OpenML Data URL": "{}d/{}".format(base_url, self.data_id), + "OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id), "Metric Used": self.function, "Result": self.value} diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 55517f3d6..8de69ebc1 100644 --- a/openml/evaluations/functions.py +++ b/openml/evaluations/functions.py @@ -15,12 +15,13 @@ def list_evaluations( function: str, offset: Optional[int] = None, size: Optional[int] = None, - id: Optional[List] = None, task: Optional[List] = None, setup: Optional[List] = None, flow: Optional[List] = None, + run: Optional[List] = None, uploader: Optional[List] = None, tag: Optional[str] = None, + study: Optional[int] = None, per_fold: Optional[bool] = None, sort_order: Optional[str] = None, output_format: str = 'object' @@ -38,18 +39,20 @@ def list_evaluations( size : int, optional the maximum number of runs to show - id : list, optional - task : list, optional setup: list, optional flow : list, optional + run : list, optional + uploader : list, optional tag : str, optional + study : int, optional + per_fold : bool, optional sort_order : str, optional @@ -78,23 +81,25 @@ def list_evaluations( function=function, offset=offset, size=size, - id=id, task=task, setup=setup, flow=flow, + run=run, uploader=uploader, tag=tag, + study=study, sort_order=sort_order, per_fold=per_fold_str) def _list_evaluations( function: str, - id: Optional[List] = None, task: Optional[List] = None, setup: Optional[List] = None, flow: Optional[List] = None, + run: Optional[List] = None, uploader: Optional[List] = None, + study: Optional[int] = None, sort_order: Optional[str] = None, output_format: str = 'object', **kwargs @@ -110,16 +115,18 @@ def _list_evaluations( function : str the evaluation function. e.g., predictive_accuracy - id : list, optional - task : list, optional setup: list, optional flow : list, optional + run : list, optional + uploader : list, optional + study : int, optional + kwargs: dict, optional Legal filter operators: tag, limit, offset. 
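The surrounding hunks replace the old `id` filter with an explicit `run` filter and add `study` filtering plus an uploader-name lookup to the evaluation listing. A minimal usage sketch of the resulting `list_evaluations` call, assuming the signature shown in this diff; the run and study ids below are placeholders:

```python
import openml

# List predictive-accuracy evaluations for a few runs inside a study.
# `run` and `study` are the filters added in this diff; `uploader_name`
# is one of the new fields attached to every listed evaluation.
evals = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    run=[10, 11, 12],        # placeholder run ids
    study=14,                # placeholder study id
    size=100,
    output_format="dataframe",
)
print(evals[["uploader_name", "value"]].head())
```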
@@ -143,16 +150,18 @@ def _list_evaluations( if kwargs is not None: for operator, value in kwargs.items(): api_call += "/%s/%s" % (operator, value) - if id is not None: - api_call += "/run/%s" % ','.join([str(int(i)) for i in id]) if task is not None: api_call += "/task/%s" % ','.join([str(int(i)) for i in task]) if setup is not None: api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup]) if flow is not None: api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow]) + if run is not None: + api_call += "/run/%s" % ','.join([str(int(i)) for i in run]) if uploader is not None: api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader]) + if study is not None: + api_call += "/study/%d" % study if sort_order is not None: api_call += "/sort_order/%s" % sort_order @@ -172,6 +181,12 @@ def __list_evaluations(api_call, output_format='object'): type(evals_dict['oml:evaluations']) evals = collections.OrderedDict() + uploader_ids = list(set([eval_['oml:uploader'] for eval_ in + evals_dict['oml:evaluations']['oml:evaluation']])) + api_users = "user/list/user_id/" + ','.join(uploader_ids) + xml_string_user = openml._api_calls._perform_api_call(api_users, 'get') + users = xmltodict.parse(xml_string_user, force_list=('oml:user',)) + user_dict = {user['oml:id']: user['oml:username'] for user in users['oml:users']['oml:user']} for eval_ in evals_dict['oml:evaluations']['oml:evaluation']: run_id = int(eval_['oml:run_id']) value = None @@ -190,10 +205,12 @@ def __list_evaluations(api_call, output_format='object'): int(eval_['oml:setup_id']), int(eval_['oml:flow_id']), eval_['oml:flow_name'], - eval_['oml:data_id'], + int(eval_['oml:data_id']), eval_['oml:data_name'], eval_['oml:function'], eval_['oml:upload_time'], + int(eval_['oml:uploader']), + user_dict[eval_['oml:uploader']], value, values, array_data) else: # for output_format in ['dict', 'dataframe'] @@ -202,10 +219,12 @@ def __list_evaluations(api_call, output_format='object'): 'setup_id': int(eval_['oml:setup_id']), 'flow_id': int(eval_['oml:flow_id']), 'flow_name': eval_['oml:flow_name'], - 'data_id': eval_['oml:data_id'], + 'data_id': int(eval_['oml:data_id']), 'data_name': eval_['oml:data_name'], 'function': eval_['oml:function'], 'upload_time': eval_['oml:upload_time'], + 'uploader': int(eval_['oml:uploader']), + 'uploader_name': user_dict[eval_['oml:uploader']], 'value': value, 'values': values, 'array_data': array_data} @@ -246,15 +265,16 @@ def list_evaluations_setups( function: str, offset: Optional[int] = None, size: Optional[int] = None, - id: Optional[List] = None, task: Optional[List] = None, setup: Optional[List] = None, flow: Optional[List] = None, + run: Optional[List] = None, uploader: Optional[List] = None, tag: Optional[str] = None, per_fold: Optional[bool] = None, sort_order: Optional[str] = None, - output_format: str = 'dataframe' + output_format: str = 'dataframe', + parameters_in_separate_columns: bool = False ) -> Union[Dict, pd.DataFrame]: """ List all run-evaluation pairs matching all of the given filters @@ -268,16 +288,16 @@ def list_evaluations_setups( the number of runs to skip, starting from the first size : int, optional the maximum number of runs to show - id : list[int], optional - the list of evaluation ID's task : list[int], optional - the list of task ID's + the list of task IDs setup: list[int], optional - the list of setup ID's + the list of setup IDs flow : list[int], optional - the list of flow ID's + the list of flow IDs + run : list[int], optional + the list of run IDs uploader : 
list[int], optional - the list of uploader ID's + the list of uploader IDs tag : str, optional filter evaluation based on given tag per_fold : bool, optional @@ -287,24 +307,34 @@ def list_evaluations_setups( The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame + parameters_in_separate_columns: bool, optional (default= False) + Returns hyperparameters in separate columns if set to True. + Valid only for a single flow Returns ------- dict or dataframe with hyperparameter settings as a list of tuples. """ + if parameters_in_separate_columns and (flow is None or len(flow) != 1): + raise ValueError("Can set parameters_in_separate_columns to true " + "only for single flow_id") + # List evaluations - evals = list_evaluations(function=function, offset=offset, size=size, id=id, task=task, + evals = list_evaluations(function=function, offset=offset, size=size, run=run, task=task, setup=setup, flow=flow, uploader=uploader, tag=tag, per_fold=per_fold, sort_order=sort_order, output_format='dataframe') - # List setups - # Split setups in evals into chunks of N setups as list_setups does not support large size + # list_setups by setup id does not support large sizes (exceeds URL length limit) + # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N df = pd.DataFrame() if len(evals) != 0: - N = 100 - setup_chunks = np.split(evals['setup_id'].unique(), - ((len(evals['setup_id'].unique()) - 1) // N) + 1) + N = 100 # size of section + length = len(evals['setup_id'].unique()) # length of the array we want to split + # array_split - allows indices_or_sections to not equally divide the array + # array_split -length % N sub-arrays of size length//N + 1 and the rest of size length//N. 
+ setup_chunks = np.array_split(ary=evals['setup_id'].unique(), + indices_or_sections=((length - 1) // N) + 1) setups = pd.DataFrame() for setup in setup_chunks: result = pd.DataFrame(openml.setups.list_setups(setup=setup, output_format='dataframe')) @@ -315,14 +345,18 @@ def list_evaluations_setups( # Convert parameters of setup into list of tuples of (hyperparameter, value) for parameter_dict in setups['parameters']: if parameter_dict is not None: - parameters.append([tuple([param['parameter_name'], param['value']]) - for param in parameter_dict.values()]) + parameters.append({param['full_name']: param['value'] + for param in parameter_dict.values()}) else: - parameters.append([]) + parameters.append({}) setups['parameters'] = parameters # Merge setups with evaluations df = pd.merge(evals, setups, on='setup_id', how='left') + if parameters_in_separate_columns: + df = pd.concat([df.drop('parameters', axis=1), + df['parameters'].apply(pd.Series)], axis=1) + if output_format == 'dataframe': return df else: diff --git a/openml/exceptions.py b/openml/exceptions.py index 492587adc..78accd671 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -18,10 +18,9 @@ class OpenMLServerException(OpenMLServerError): # Code needs to be optional to allow the exceptino to be picklable: # https://stackoverflow.com/questions/16244923/how-to-make-a-custom-exception-class-with-multiple-init-args-pickleable # noqa: E501 - def __init__(self, message: str, code: str = None, additional: str = None, url: str = None): + def __init__(self, message: str, code: int = None, url: str = None): self.message = message self.code = code - self.additional = additional self.url = url super().__init__(message) diff --git a/openml/extensions/extension_interface.py b/openml/extensions/extension_interface.py index 6346cb0bf..d963edb1b 100644 --- a/openml/extensions/extension_interface.py +++ b/openml/extensions/extension_interface.py @@ -58,7 +58,9 @@ def can_handle_model(cls, model: Any) -> bool: # Abstract methods for flow serialization and de-serialization @abstractmethod - def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = False) -> Any: + def flow_to_model(self, flow: 'OpenMLFlow', + initialize_with_defaults: bool = False, + strict_version: bool = True) -> Any: """Instantiate a model from the flow representation. Parameters @@ -69,6 +71,9 @@ def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = Fal If this flag is set, the hyperparameter values of flows will be ignored and a flow with its defaults is returned. + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. 
+ Returns ------- Any diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py index c125f51bd..a9d1db37b 100644 --- a/openml/extensions/sklearn/__init__.py +++ b/openml/extensions/sklearn/__init__.py @@ -1,4 +1,7 @@ from .extension import SklearnExtension +from openml.extensions import register_extension __all__ = ['SklearnExtension'] + +register_extension(SklearnExtension) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index d44b61ae7..cc3352a20 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -6,6 +6,7 @@ import json import logging import re +from re import IGNORECASE import sys import time from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union @@ -21,7 +22,7 @@ import openml from openml.exceptions import PyOpenMLError -from openml.extensions import Extension, register_extension +from openml.extensions import Extension from openml.flows import OpenMLFlow from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration, PREFIX from openml.tasks import ( @@ -33,6 +34,8 @@ OpenMLRegressionTask, ) +logger = logging.getLogger(__name__) + if sys.version_info >= (3, 5): from json.decoder import JSONDecodeError @@ -206,7 +209,9 @@ def remove_all_in_parentheses(string: str) -> str: ################################################################################################ # Methods for flow serialization and de-serialization - def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = False) -> Any: + def flow_to_model(self, flow: 'OpenMLFlow', + initialize_with_defaults: bool = False, + strict_version: bool = True) -> Any: """Initializes a sklearn model based on a flow. Parameters @@ -219,11 +224,16 @@ def flow_to_model(self, flow: 'OpenMLFlow', initialize_with_defaults: bool = Fal If this flag is set, the hyperparameter values of flows will be ignored and a flow with its defaults is returned. + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + Returns ------- mixed """ - return self._deserialize_sklearn(flow, initialize_with_defaults=initialize_with_defaults) + return self._deserialize_sklearn( + flow, initialize_with_defaults=initialize_with_defaults, + strict_version=strict_version) def _deserialize_sklearn( self, @@ -231,6 +241,7 @@ def _deserialize_sklearn( components: Optional[Dict] = None, initialize_with_defaults: bool = False, recursion_depth: int = 0, + strict_version: bool = True, ) -> Any: """Recursive function to deserialize a scikit-learn flow. @@ -254,14 +265,16 @@ def _deserialize_sklearn( The depth at which this flow is called, mostly for debugging purposes + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + Returns ------- mixed """ - logging.info('-%s flow_to_sklearn START o=%s, components=%s, ' - 'init_defaults=%s' % ('-' * recursion_depth, o, components, - initialize_with_defaults)) + logger.info('-%s flow_to_sklearn START o=%s, components=%s, init_defaults=%s' + % ('-' * recursion_depth, o, components, initialize_with_defaults)) depth_pp = recursion_depth + 1 # shortcut var, depth plus plus # First, we need to check whether the presented object is a json string. 
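The `strict_version` flag threaded through `flow_to_model` and `_deserialize_sklearn` here is also exposed on `openml.flows.get_flow` later in this diff. A usage sketch assuming that public entry point; the flow id is a placeholder:

```python
import openml

# With strict_version=False a dependency mismatch (for example a different
# scikit-learn version than the one recorded in the flow) only emits a
# warning instead of raising, and a model is still reconstructed.
flow = openml.flows.get_flow(5891, reinstantiate=True, strict_version=False)
model = flow.model  # scikit-learn estimator rebuilt from the flow
print(model)
```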
@@ -290,13 +303,15 @@ def _deserialize_sklearn( rval = self._deserialize_function(value) elif serialized_type == 'component_reference': assert components is not None # Necessary for mypy - value = self._deserialize_sklearn(value, recursion_depth=depth_pp) + value = self._deserialize_sklearn(value, recursion_depth=depth_pp, + strict_version=strict_version) step_name = value['step_name'] key = value['key'] component = self._deserialize_sklearn( components[key], initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp + recursion_depth=depth_pp, + strict_version=strict_version, ) # The component is now added to where it should be used # later. It should not be passed to the constructor of the @@ -310,7 +325,8 @@ def _deserialize_sklearn( rval = (step_name, component, value['argument_1']) elif serialized_type == 'cv_object': rval = self._deserialize_cross_validator( - value, recursion_depth=recursion_depth + value, recursion_depth=recursion_depth, + strict_version=strict_version ) else: raise ValueError('Cannot flow_to_sklearn %s' % serialized_type) @@ -323,12 +339,14 @@ def _deserialize_sklearn( components=components, initialize_with_defaults=initialize_with_defaults, recursion_depth=depth_pp, + strict_version=strict_version ), self._deserialize_sklearn( o=value, components=components, initialize_with_defaults=initialize_with_defaults, recursion_depth=depth_pp, + strict_version=strict_version ) ) for key, value in sorted(o.items()) @@ -340,6 +358,7 @@ def _deserialize_sklearn( components=components, initialize_with_defaults=initialize_with_defaults, recursion_depth=depth_pp, + strict_version=strict_version ) for element in o ] @@ -354,11 +373,11 @@ def _deserialize_sklearn( flow=o, keep_defaults=initialize_with_defaults, recursion_depth=recursion_depth, + strict_version=strict_version ) else: raise TypeError(o) - logging.info('-%s flow_to_sklearn END o=%s, rval=%s' - % ('-' * recursion_depth, o, rval)) + logger.info('-%s flow_to_sklearn END o=%s, rval=%s' % ('-' * recursion_depth, o, rval)) return rval def model_to_flow(self, model: Any) -> 'OpenMLFlow': @@ -471,10 +490,177 @@ def _is_cross_validator(self, o: Any) -> bool: @classmethod def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool: - return ( - flow.external_version.startswith('sklearn==') - or ',sklearn==' in flow.external_version - ) + if (getattr(flow, 'dependencies', None) is not None + and "sklearn" in flow.dependencies): + return True + if flow.external_version is None: + return False + else: + return ( + flow.external_version.startswith('sklearn==') + or ',sklearn==' in flow.external_version + ) + + def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str: + '''Fetches the sklearn function docstring for the flow description + + Retrieves the sklearn docstring available and does the following: + * If length of docstring <= char_lim, then returns the complete docstring + * Else, trims the docstring till it encounters a 'Read more in the :ref:' + * Or till it encounters a 'Parameters\n----------\n' + The final string returned is at most of length char_lim with leading and + trailing whitespaces removed. + + Parameters + ---------- + model : sklearn model + char_lim : int + Specifying the max length of the returned string. + OpenML servers have a constraint of 1024 characters for the 'description' field. 
+ + Returns + ------- + str + ''' + def match_format(s): + return "{}\n{}\n".format(s, len(s) * '-') + s = inspect.getdoc(model) + if s is None: + return '' + try: + # trim till 'Read more' + pattern = "Read more in the :ref:" + index = s.index(pattern) + s = s[:index] + # trimming docstring to be within char_lim + if len(s) > char_lim: + s = "{}...".format(s[:char_lim - 3]) + return s.strip() + except ValueError: + logger.warning("'Read more' not found in descriptions. " + "Trying to trim till 'Parameters' if available in docstring.") + pass + try: + # if 'Read more' doesn't exist, trim till 'Parameters' + pattern = "Parameters" + index = s.index(match_format(pattern)) + except ValueError: + # returning full docstring + logger.warning("'Parameters' not found in docstring. Omitting docstring trimming.") + index = len(s) + s = s[:index] + # trimming docstring to be within char_lim + if len(s) > char_lim: + s = "{}...".format(s[:char_lim - 3]) + return s.strip() + + def _extract_sklearn_parameter_docstring(self, model) -> Union[None, str]: + '''Extracts the part of sklearn docstring containing parameter information + + Fetches the entire docstring and trims just the Parameter section. + The assumption is that 'Parameters' is the first section in sklearn docstrings, + followed by other sections titled 'Attributes', 'See also', 'Note', 'References', + appearing in that order if defined. + Returns a None if no section with 'Parameters' can be found in the docstring. + + Parameters + ---------- + model : sklearn model + + Returns + ------- + str, or None + ''' + def match_format(s): + return "{}\n{}\n".format(s, len(s) * '-') + s = inspect.getdoc(model) + if s is None: + return None + try: + index1 = s.index(match_format("Parameters")) + except ValueError as e: + # when sklearn docstring has no 'Parameters' section + logger.warning("{} {}".format(match_format("Parameters"), e)) + return None + + headings = ["Attributes", "Notes", "See also", "Note", "References"] + for h in headings: + try: + # to find end of Parameters section + index2 = s.index(match_format(h)) + break + except ValueError: + logger.warning("{} not available in docstring".format(h)) + continue + else: + # in the case only 'Parameters' exist, trim till end of docstring + index2 = len(s) + s = s[index1:index2] + return s.strip() + + def _extract_sklearn_param_info(self, model, char_lim=1024) -> Union[None, Dict]: + '''Parses parameter type and description from sklearn dosctring + + Parameters + ---------- + model : sklearn model + char_lim : int + Specifying the max length of the returned string. + OpenML servers have a constraint of 1024 characters string fields. 
+ + Returns + ------- + Dict, or None + ''' + docstring = self._extract_sklearn_parameter_docstring(model) + if docstring is None: + # when sklearn docstring has no 'Parameters' section + return None + + n = re.compile("[.]*\n", flags=IGNORECASE) + lines = n.split(docstring) + p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE) + # The above regular expression is designed to detect sklearn parameter names and type + # in the format of [variable_name][space]:[space][type] + # The expectation is that the parameter description for this detected parameter will + # be all the lines in the docstring till the regex finds another parameter match + + # collecting parameters and their descriptions + description = [] # type: List + for i, s in enumerate(lines): + param = p.findall(s) + if param != []: + # a parameter definition is found by regex + # creating placeholder when parameter found which will be a list of strings + # string descriptions will be appended in subsequent iterations + # till another parameter is found and a new placeholder is created + placeholder = [''] # type: List[str] + description.append(placeholder) + else: + if len(description) > 0: # description=[] means no parameters found yet + # appending strings to the placeholder created when parameter found + description[-1].append(s) + for i in range(len(description)): + # concatenating parameter description strings + description[i] = '\n'.join(description[i]).strip() + # limiting all parameter descriptions to accepted OpenML string length + if len(description[i]) > char_lim: + description[i] = "{}...".format(description[i][:char_lim - 3]) + + # collecting parameters and their types + parameter_docs = OrderedDict() # type: Dict + matches = p.findall(docstring) + for i, param in enumerate(matches): + key, value = str(param).split(':') + parameter_docs[key.strip()] = [value.strip(), description[i]] + + # to avoid KeyError for missing parameters + param_list_true = list(model.get_params().keys()) + param_list_found = list(parameter_docs.keys()) + for param in list(set(param_list_true) - set(param_list_found)): + parameter_docs[param] = [None, None] + + return parameter_docs def _serialize_model(self, model: Any) -> OpenMLFlow: """Create an OpenMLFlow. 
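To make the trimming rules in `_get_sklearn_description` concrete, here is a standalone sketch (not the package code) of the same idea: cut the docstring at the "Read more in the :ref:" marker and respect the 1024-character limit that the OpenML server enforces on the description field.

```python
import inspect

from sklearn.tree import DecisionTreeClassifier


def trim_description(model, char_lim=1024):
    """Derive a short flow description from an estimator's docstring."""
    doc = inspect.getdoc(model) or ""
    marker = "Read more in the :ref:"
    if marker in doc:
        # keep only the summary that precedes the user-guide reference
        doc = doc[:doc.index(marker)]
    if len(doc) > char_lim:
        doc = doc[:char_lim - 3] + "..."
    return doc.strip()


print(trim_description(DecisionTreeClassifier()))
```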
@@ -534,10 +720,12 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: sklearn_version = self._format_external_version('sklearn', sklearn.__version__) sklearn_version_formatted = sklearn_version.replace('==', '_') + + sklearn_description = self._get_sklearn_description(model) flow = OpenMLFlow(name=name, class_name=class_name, custom_name=short_name, - description='Automatically created scikit-learn flow.', + description=sklearn_description, model=model, components=subcomponents, parameters=parameters, @@ -623,6 +811,7 @@ def _extract_information_from_model( sub_components_explicit = set() parameters = OrderedDict() # type: OrderedDict[str, Optional[str]] parameters_meta_info = OrderedDict() # type: OrderedDict[str, Optional[Dict]] + parameters_docs = self._extract_sklearn_param_info(model) model_parameters = model.get_params(deep=False) for k, v in sorted(model_parameters.items(), key=lambda t: t[0]): @@ -743,7 +932,12 @@ def flatten_all(list_): else: parameters[k] = None - parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None))) + if parameters_docs is not None: + data_type, description = parameters_docs[k] + parameters_meta_info[k] = OrderedDict((('description', description), + ('data_type', data_type))) + else: + parameters_meta_info[k] = OrderedDict((('description', None), ('data_type', None))) return parameters, parameters_meta_info, sub_components, sub_components_explicit @@ -779,10 +973,12 @@ def _deserialize_model( flow: OpenMLFlow, keep_defaults: bool, recursion_depth: int, + strict_version: bool = True ) -> Any: - logging.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name)) + logger.info('-%s deserialize %s' % ('-' * recursion_depth, flow.name)) model_name = flow.class_name - self._check_dependencies(flow.dependencies) + self._check_dependencies(flow.dependencies, + strict_version=strict_version) parameters = flow.parameters components = flow.components @@ -797,13 +993,13 @@ def _deserialize_model( for name in parameters: value = parameters.get(name) - logging.info('--%s flow_parameter=%s, value=%s' % - ('-' * recursion_depth, name, value)) + logger.info('--%s flow_parameter=%s, value=%s' % ('-' * recursion_depth, name, value)) rval = self._deserialize_sklearn( value, components=components_, initialize_with_defaults=keep_defaults, recursion_depth=recursion_depth + 1, + strict_version=strict_version, ) parameter_dict[name] = rval @@ -813,11 +1009,11 @@ def _deserialize_model( if name not in components_: continue value = components[name] - logging.info('--%s flow_component=%s, value=%s' - % ('-' * recursion_depth, name, value)) + logger.info('--%s flow_component=%s, value=%s' % ('-' * recursion_depth, name, value)) rval = self._deserialize_sklearn( value, recursion_depth=recursion_depth + 1, + strict_version=strict_version ) parameter_dict[name] = rval @@ -843,7 +1039,8 @@ def _deserialize_model( del parameter_dict[param] return model_class(**parameter_dict) - def _check_dependencies(self, dependencies: str) -> None: + def _check_dependencies(self, dependencies: str, + strict_version: bool = True) -> None: if not dependencies: return @@ -871,9 +1068,13 @@ def _check_dependencies(self, dependencies: str) -> None: else: raise NotImplementedError( 'operation \'%s\' is not supported' % operation) + message = ('Trying to deserialize a model with dependency ' + '%s not satisfied.' % dependency_string) if not check: - raise ValueError('Trying to deserialize a model with dependency ' - '%s not satisfied.' 
% dependency_string) + if strict_version: + raise ValueError(message) + else: + warnings.warn(message) def _serialize_type(self, o: Any) -> 'OrderedDict[str, str]': mapping = {float: 'float', @@ -991,6 +1192,7 @@ def _deserialize_cross_validator( self, value: 'OrderedDict[str, Any]', recursion_depth: int, + strict_version: bool = True ) -> Any: model_name = value['name'] parameters = value['parameters'] @@ -1002,6 +1204,7 @@ def _deserialize_cross_validator( parameters[parameter] = self._deserialize_sklearn( parameters[parameter], recursion_depth=recursion_depth + 1, + strict_version=strict_version ) return model_class(**parameters) @@ -1549,7 +1752,7 @@ def is_subcomponent_specification(values): if len(subcomponent) == 3: if not isinstance(subcomponent[2], list): raise TypeError('Subcomponent argument should be' - 'list') + ' list') current['value']['argument_1'] = subcomponent[2] parsed_values.append(current) parsed_values = json.dumps(parsed_values) @@ -1739,6 +1942,3 @@ def _obtain_arff_trace( trace_attributes, trace_content, ) - - -register_extension(SklearnExtension) diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py index 504c37c1a..3bbf1b21b 100644 --- a/openml/flows/__init__.py +++ b/openml/flows/__init__.py @@ -1,11 +1,12 @@ from .flow import OpenMLFlow -from .functions import get_flow, list_flows, flow_exists, assert_flows_equal +from .functions import get_flow, list_flows, flow_exists, get_flow_id, assert_flows_equal __all__ = [ 'OpenMLFlow', 'get_flow', 'list_flows', + 'get_flow_id', 'flow_exists', 'assert_flows_equal', ] diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 0db69d16f..732f54208 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -1,16 +1,16 @@ from collections import OrderedDict import os -from typing import Dict, List, Union # noqa: F401 +from typing import Dict, List, Union, Tuple, Optional # noqa: F401 +import logging import xmltodict +from openml.base import OpenMLBase from ..extensions import get_extension_by_flow -from ..utils import extract_xml_tags, _tag_entity +from ..utils import extract_xml_tags -import openml.config - -class OpenMLFlow(object): +class OpenMLFlow(OpenMLBase): """OpenML Flow. Stores machine learning models. Flows should not be generated manually, but by the function @@ -136,6 +136,10 @@ def __init__(self, name, description, model, components, parameters, else: self._extension = extension + @property + def id(self) -> Optional[int]: + return self.flow_id + @property def extension(self): if self._extension is not None: @@ -144,20 +148,16 @@ def extension(self): raise RuntimeError("No extension could be found for flow {}: {}" .format(self.flow_id, self.name)) - def __repr__(self): - header = "OpenML Flow" - header = '{}\n{}\n'.format(header, '=' * len(header)) - - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. 
""" fields = {"Flow Name": self.name, "Flow Description": self.description, "Dependencies": self.dependencies} if self.flow_id is not None: + fields["Flow URL"] = self.openml_url + fields["Flow ID"] = str(self.flow_id) if self.version is not None: - fields["Flow ID"] = "{} (version {})".format(self.flow_id, self.version) - else: - fields["Flow ID"] = self.flow_id - fields["Flow URL"] = "{}f/{}".format(base_url, self.flow_id) + fields["Flow ID"] += " (version {})".format(self.version) if self.upload_date is not None: fields["Upload Date"] = self.upload_date.replace('T', ' ') if self.binary_url is not None: @@ -166,48 +166,10 @@ def __repr__(self): # determines the order in which the information will be printed order = ["Flow ID", "Flow URL", "Flow Name", "Flow Description", "Binary URL", "Upload Date", "Dependencies"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body - - def _to_xml(self) -> str: - """Generate xml representation of self for upload to server. - - Returns - ------- - str - Flow represented as XML string. - """ - flow_dict = self._to_dict() - flow_xml = xmltodict.unparse(flow_dict, pretty=True) - - # A flow may not be uploaded with the xml encoding specification: - # - flow_xml = flow_xml.split('\n', 1)[-1] - return flow_xml - - def _to_dict(self) -> dict: - """ Helper function used by _to_xml and itself. - - Creates a dictionary representation of self which can be serialized - to xml by the function _to_xml. Since a flow can contain subflows - (components) this helper function calls itself recursively to also - serialize these flows to dictionaries. - - Uses OrderedDict to ensure consistent ordering when converting to xml. - The return value (OrderedDict) will be used to create the upload xml - file. The xml file must have the tags in exactly the order given in the - xsd schema of a flow (see class docstring). - - Returns - ------- - OrderedDict - Flow represented as OrderedDict. + return [(key, fields[key]) for key in order if key in fields] - """ + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. """ flow_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' flow_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')]) # type: 'OrderedDict[str, Union[List, str]]' # noqa E501 flow_container['oml:flow'] = flow_dict @@ -223,6 +185,10 @@ def _to_dict(self) -> dict: _add_if_nonempty(flow_dict, 'oml:{}'.format(attribute), getattr(self, attribute)) + if not self.description: + logger = logging.getLogger(__name__) + logger.warn("Flow % has empty description", self.name) + flow_parameters = [] for key in self.parameters: param_dict = OrderedDict() # type: 'OrderedDict[str, str]' @@ -280,6 +246,9 @@ def _from_dict(cls, xml_dict): Calls itself recursively to create :class:`OpenMLFlow` objects of subflows (components). 
+ + XML definition of a flow is available at + https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd Parameters ---------- @@ -290,18 +259,29 @@ def _from_dict(cls, xml_dict): ------- OpenMLFlow - """ + """ # noqa E501 arguments = OrderedDict() dic = xml_dict["oml:flow"] # Mandatory parts in the xml file - for key in ['name', 'external_version']: + for key in ['name']: arguments[key] = dic["oml:" + key] # non-mandatory parts in the xml file - for key in ['uploader', 'description', 'upload_date', 'language', - 'dependencies', 'version', 'binary_url', 'binary_format', - 'binary_md5', 'class_name', 'custom_name']: + for key in [ + 'external_version', + 'uploader', + 'description', + 'upload_date', + 'language', + 'dependencies', + 'version', + 'binary_url', + 'binary_format', + 'binary_md5', + 'class_name', + 'custom_name', + ]: arguments[key] = dic.get("oml:" + key) # has to be converted to an int if present and cannot parsed in the @@ -371,6 +351,10 @@ def from_filesystem(cls, input_directory) -> 'OpenMLFlow': xml_string = f.read() return OpenMLFlow._from_dict(xmltodict.parse(xml_string)) + def _parse_publish_response(self, xml_response: Dict): + """ Parse the id from the xml_response and assign it to self. """ + self.flow_id = int(xml_response['oml:upload_flow']['oml:id']) + def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow': """ Publish this flow to OpenML server. @@ -399,15 +383,8 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow': if self.flow_id: raise openml.exceptions.PyOpenMLError("Flow does not exist on the server, " "but 'flow.flow_id' is not None.") - xml_description = self._to_xml() - file_elements = {'description': xml_description} - return_value = openml._api_calls._perform_api_call( - "flow/", - 'post', - file_elements=file_elements, - ) - server_response = xmltodict.parse(return_value) - flow_id = int(server_response['oml:upload_flow']['oml:id']) + super().publish() + flow_id = self.flow_id elif raise_error_if_exists: error_message = "This OpenMLFlow already exists with id: {}.".format(flow_id) raise openml.exceptions.PyOpenMLError(error_message) @@ -487,26 +464,6 @@ def get_subflow(self, structure): structure.pop(0) return self.components[sub_identifier].get_subflow(structure) - def push_tag(self, tag): - """Annotates this flow with a tag on the server. - - Parameters - ---------- - tag : str - Tag to attach to the flow. - """ - _tag_entity('flow', self.flow_id, tag) - - def remove_tag(self, tag): - """Removes a tag from this flow on the server. - - Parameters - ---------- - tag : str - Tag to attach to the flow. 
- """ - _tag_entity('flow', self.flow_id, tag, untag=True) - def _copy_server_fields(source_flow, target_flow): fields_added_by_the_server = ['flow_id', 'uploader', 'version', diff --git a/openml/flows/functions.py b/openml/flows/functions.py index d12bcfe91..4389eb3c0 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -5,7 +5,7 @@ import re import xmltodict import pandas as pd -from typing import Union, Dict, Optional +from typing import Any, Union, Dict, Optional, List from ..exceptions import OpenMLCacheException import openml._api_calls @@ -71,7 +71,8 @@ def _get_cached_flow(fid: int) -> OpenMLFlow: @openml.utils.thread_safe_if_oslo_installed -def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow: +def get_flow(flow_id: int, reinstantiate: bool = False, + strict_version: bool = True) -> OpenMLFlow: """Download the OpenML flow for a given flow ID. Parameters @@ -82,6 +83,9 @@ def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow: reinstantiate: bool Whether to reinstantiate the flow to a model instance. + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + Returns ------- flow : OpenMLFlow @@ -91,7 +95,13 @@ def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow: flow = _get_flow_description(flow_id) if reinstantiate: - flow.model = flow.extension.flow_to_model(flow) + flow.model = flow.extension.flow_to_model( + flow, strict_version=strict_version) + if not strict_version: + # check if we need to return a new flow b/c of version mismatch + new_flow = flow.extension.model_to_flow(flow.model) + if new_flow.dependencies != flow.dependencies: + return new_flow return flow @@ -258,6 +268,72 @@ def flow_exists(name: str, external_version: str) -> Union[int, bool]: return False +def get_flow_id( + model: Optional[Any] = None, + name: Optional[str] = None, + exact_version=True, +) -> Union[int, bool, List[int]]: + """Retrieves the flow id for a model or a flow name. + + Provide either a model or a name to this function. Depending on the input, it does + + * ``model`` and ``exact_version == True``: This helper function first queries for the necessary + extension. Second, it uses that extension to convert the model into a flow. Third, it + executes ``flow_exists`` to potentially obtain the flow id the flow is published to the + server. + * ``model`` and ``exact_version == False``: This helper function first queries for the + necessary extension. Second, it uses that extension to convert the model into a flow. Third + it calls ``list_flows`` and filters the returned values based on the flow name. + * ``name``: Ignores ``exact_version`` and calls ``list_flows``, then filters the returned + values based on the flow name. + + Parameters + ---------- + model : object + Any model. Must provide either ``model`` or ``name``. + name : str + Name of the flow. Must provide either ``model`` or ``name``. + exact_version : bool + Whether to return the ``flow_id`` of the exact version or all ``flow_id``s where the name + of the flow matches. This is only taken into account for a model where a version number + is available. + + Returns + ------- + int or bool, List + flow id iff exists, ``False`` otherwise, List if exact_version is ``False`` + """ + if model is None and name is None: + raise ValueError( + 'Need to provide either argument `model` or argument `name`, but both are `None`.' 
+ ) + elif model is not None and name is not None: + raise ValueError( + 'Must provide either argument `model` or argument `name`, but not both.' + ) + + if model is not None: + extension = openml.extensions.get_extension_by_model(model, raise_if_no_extension=True) + if extension is None: + # This should never happen and is only here to please mypy will be gone soon once the + # whole function is removed + raise TypeError(extension) + flow = extension.model_to_flow(model) + flow_name = flow.name + external_version = flow.external_version + else: + flow_name = name + exact_version = False + + if exact_version: + return flow_exists(name=flow_name, external_version=external_version) + else: + flows = list_flows(output_format='dataframe') + assert isinstance(flows, pd.DataFrame) # Make mypy happy + flows = flows.query('name == "{}"'.format(flow_name)) + return flows['id'].to_list() + + def __list_flows( api_call: str, output_format: str = 'dict' @@ -308,7 +384,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None: def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values_on_older_children: str = None, ignore_parameter_values: bool = False, - ignore_custom_name_if_none: bool = False) -> None: + ignore_custom_name_if_none: bool = False, + check_description: bool = True) -> None: """Check equality of two flows. Two flows are equal if their all keys which are not set by the server @@ -327,8 +404,11 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values : bool Whether to ignore parameter values when comparing flows. - ignore_custom_name_if_none : bool + ignore_custom_name_if_none : bool Whether to ignore the custom name field if either flow has `custom_name` equal to `None`. + + check_description : bool + Whether to ignore matching of flow descriptions. """ if not isinstance(flow1, OpenMLFlow): raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' % @@ -345,7 +425,7 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, # but the uploader has no control over them! 'tags'] ignored_by_python_api = ['binary_url', 'binary_format', 'binary_md5', - 'model'] + 'model', '_entity_id'] for key in set(flow1.__dict__.keys()).union(flow2.__dict__.keys()): if key in generated_by_the_server + ignored_by_python_api: @@ -366,6 +446,10 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_custom_name_if_none) elif key == '_extension': continue + elif check_description and key == 'description': + # to ignore matching of descriptions since sklearn based flows may have + # altering docstrings and is not guaranteed to be consistent + continue else: if key == 'parameters': if ignore_parameter_values or \ @@ -397,6 +481,35 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, # Helps with backwards compatibility as `custom_name` is now auto-generated, but # before it used to be `None`. 
continue + elif key == 'parameters_meta_info': + # this value is a dictionary where each key is a parameter name, containing another + # dictionary with keys specifying the parameter's 'description' and 'data_type' + # checking parameter descriptions can be ignored since that might change + # data type check can also be ignored if one of them is not defined, i.e., None + params1 = set(flow1.parameters_meta_info.keys()) + params2 = set(flow2.parameters_meta_info.keys()) + if params1 != params2: + raise ValueError('Parameter list in meta info for parameters differ ' + 'in the two flows.') + # iterating over the parameter's meta info list + for param in params1: + if isinstance(flow1.parameters_meta_info[param], Dict) and \ + isinstance(flow2.parameters_meta_info[param], Dict) and \ + 'data_type' in flow1.parameters_meta_info[param] and \ + 'data_type' in flow2.parameters_meta_info[param]: + value1 = flow1.parameters_meta_info[param]['data_type'] + value2 = flow2.parameters_meta_info[param]['data_type'] + else: + value1 = flow1.parameters_meta_info[param] + value2 = flow2.parameters_meta_info[param] + if value1 is None or value2 is None: + continue + elif value1 != value2: + raise ValueError("Flow {}: data type for parameter {} in {} differ " + "as {}\nvs\n{}".format(flow1.name, param, key, + value1, value2)) + # the continue is to avoid the 'attr != attr2' check at end of function + continue if attr1 != attr2: raise ValueError("Flow %s: values for attribute '%s' differ: " diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 767a4a48a..95407d517 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -687,10 +687,13 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): '(OpenML server error?)') else: output_data = run['oml:output_data'] + predictions_url = None if 'oml:file' in output_data: # multiple files, the normal case for file_dict in output_data['oml:file']: files[file_dict['oml:name']] = int(file_dict['oml:file_id']) + if file_dict['oml:name'] == 'predictions': + predictions_url = file_dict['oml:url'] if 'oml:evaluation' in output_data: # in normal cases there should be evaluations, but in case there # was an error these could be absent @@ -757,8 +760,8 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): dataset_id=dataset_id, output_files=files, evaluations=evaluations, fold_evaluations=fold_evaluations, - sample_evaluations=sample_evaluations, - tags=tags) + sample_evaluations=sample_evaluations, tags=tags, + predictions_url=predictions_url) def _get_cached_run(run_id): @@ -786,6 +789,7 @@ def list_runs( flow: Optional[List] = None, uploader: Optional[List] = None, tag: Optional[str] = None, + study: Optional[int] = None, display_errors: bool = False, output_format: str = 'dict', **kwargs @@ -813,6 +817,8 @@ def list_runs( tag : str, optional + study : int, optional + display_errors : bool, optional (default=None) Whether to list runs which have an error (for example a missing prediction file). 
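The new `study` filter on `list_runs` mirrors the one added to the evaluation listing above. A usage sketch; the study id is a placeholder:

```python
import openml

# Retrieve up to 50 runs that belong to a given study; the default output
# format is a dict of run descriptions.
runs = openml.runs.list_runs(study=14, size=50)
print(len(runs), "runs found for the study")
```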
@@ -854,6 +860,7 @@ def list_runs( flow=flow, uploader=uploader, tag=tag, + study=study, display_errors=display_errors, **kwargs) @@ -864,6 +871,7 @@ def _list_runs( setup: Optional[List] = None, flow: Optional[List] = None, uploader: Optional[List] = None, + study: Optional[int] = None, display_errors: bool = False, output_format: str = 'dict', **kwargs @@ -889,6 +897,8 @@ def _list_runs( uploader : list, optional + study : int, optional + display_errors : bool, optional (default=None) Whether to list runs which have an error (for example a missing prediction file). @@ -921,6 +931,8 @@ def _list_runs( api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow]) if uploader is not None: api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader]) + if study is not None: + api_call += "/study/%d" % study if display_errors: api_call += "/show_errors/true" return __list_runs(api_call=api_call, output_format=output_format) diff --git a/openml/runs/run.py b/openml/runs/run.py index 6a4818f30..e3df97083 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -1,15 +1,15 @@ from collections import OrderedDict import pickle import time -from typing import Any, IO, TextIO # noqa F401 +from typing import Any, IO, TextIO, List, Union, Tuple, Optional, Dict # noqa F401 import os import arff import numpy as np -import xmltodict import openml import openml._api_calls +from openml.base import OpenMLBase from ..exceptions import PyOpenMLError from ..flows import get_flow from ..tasks import (get_task, @@ -19,10 +19,9 @@ OpenMLClusteringTask, OpenMLRegressionTask ) -from ..utils import _tag_entity -class OpenMLRun(object): +class OpenMLRun(OpenMLBase): """OpenML Run: result of running a model on an openml dataset. Parameters @@ -67,28 +66,30 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None, self.tags = tags self.predictions_url = predictions_url - def __repr__(self): - header = "OpenML Run" - header = '{}\n{}\n'.format(header, '=' * len(header)) + @property + def id(self) -> Optional[int]: + return self.run_id - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. 
""" fields = {"Uploader Name": self.uploader_name, "Metric": self.task_evaluation_measure, "Run ID": self.run_id, "Task ID": self.task_id, "Task Type": self.task_type, - "Task URL": "{}t/{}".format(base_url, self.task_id), + "Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id), "Flow ID": self.flow_id, "Flow Name": self.flow_name, - "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), "Setup ID": self.setup_id, "Setup String": self.setup_string, "Dataset ID": self.dataset_id, - "Dataset URL": "{}d/{}".format(base_url, self.dataset_id)} + "Dataset URL": openml.datasets.OpenMLDataset.url_for_id(self.dataset_id)} if self.uploader is not None: - fields["Uploader Profile"] = "{}u/{}".format(base_url, self.uploader) + fields["Uploader Profile"] = "{}/u/{}".format(openml.config.server_base_url, + self.uploader) if self.run_id is not None: - fields["Run URL"] = "{}r/{}".format(base_url, self.run_id) + fields["Run URL"] = self.openml_url if self.evaluations is not None and self.task_evaluation_measure in self.evaluations: fields["Result"] = self.evaluations[self.task_evaluation_measure] @@ -96,15 +97,7 @@ def __repr__(self): order = ["Uploader Name", "Uploader Profile", "Metric", "Result", "Run ID", "Run URL", "Task ID", "Task Type", "Task URL", "Flow ID", "Flow Name", "Flow URL", "Setup ID", "Setup String", "Dataset ID", "Dataset URL"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body - - def _repr_pretty_(self, pp, cycle): - pp.text(str(self)) + return [(key, fields[key]) for key in order if key in fields] @classmethod def from_filesystem(cls, directory: str, expect_model: bool = True) -> 'OpenMLRun': @@ -201,7 +194,7 @@ def to_filesystem( 'Output directory {} should be empty'.format(os.path.abspath(directory)) ) - run_xml = self._create_description_xml() + run_xml = self._to_xml() predictions_arff = arff.dumps(self._generate_arff_dict()) # It seems like typing does not allow to define the same variable multiple times @@ -434,16 +427,15 @@ def _attribute_list_to_dict(attribute_list): scores.append(sklearn_fn(y_true, y_pred, **kwargs)) return np.array(scores) - def publish(self) -> 'OpenMLRun': - """ Publish a run (and if necessary, its flow) to the OpenML server. + def _parse_publish_response(self, xml_response: Dict): + """ Parse the id from the xml_response and assign it to self. """ + self.run_id = int(xml_response['oml:upload_run']['oml:run_id']) - Uploads the results of a run to OpenML. - If the run is of an unpublished OpenMLFlow, the flow will be uploaded too. - Sets the run_id on self. + def _get_file_elements(self) -> Dict: + """ Get file_elements to upload to the server. - Returns - ------- - self : OpenMLRun + Derived child classes should overwrite this method as necessary. + The description field will be populated automatically if not provided. 
""" if self.model is None: raise PyOpenMLError( @@ -469,8 +461,7 @@ def publish(self) -> 'OpenMLRun': self.model, ) - description_xml = self._create_description_xml() - file_elements = {'description': ("description.xml", description_xml)} + file_elements = {'description': ("description.xml", self._to_xml())} if self.error_message is None: predictions = arff.dumps(self._generate_arff_dict()) @@ -479,123 +470,43 @@ def publish(self) -> 'OpenMLRun': if self.trace is not None: trace_arff = arff.dumps(self.trace.trace_to_arff()) file_elements['trace'] = ("trace.arff", trace_arff) - - return_value = openml._api_calls._perform_api_call( - "/run/", 'post', file_elements=file_elements - ) - result = xmltodict.parse(return_value) - self.run_id = int(result['oml:upload_run']['oml:run_id']) - return self - - def _create_description_xml(self): - """Create xml representation of run for upload. - - Returns - ------- - xml_string : string - XML description of run. - """ - - # as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+ - # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss' - # well_formatted_time = time.strftime("%c").replace( - # ' ', '_').replace('/', '-').replace(':', '.') - # tags = run_environment + [well_formatted_time] + ['run_task'] + \ - # [self.model.__module__ + "." + self.model.__class__.__name__] - description = _to_dict(taskid=self.task_id, flow_id=self.flow_id, - setup_string=self.setup_string, - parameter_settings=self.parameter_settings, - error_message=self.error_message, - fold_evaluations=self.fold_evaluations, - sample_evaluations=self.sample_evaluations, - tags=self.tags) - description_xml = xmltodict.unparse(description, pretty=True) - return description_xml - - def push_tag(self, tag: str) -> None: - """Annotates this run with a tag on the server. - - Parameters - ---------- - tag : str - Tag to attach to the run. - """ - _tag_entity('run', self.run_id, tag) - - def remove_tag(self, tag: str) -> None: - """Removes a tag from this run on the server. - - Parameters - ---------- - tag : str - Tag to attach to the run. 
- """ - _tag_entity('run', self.run_id, tag, untag=True) - - -############################################################################### -# Functions which cannot be in runs/functions due to circular imports - -def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, - tags=None, fold_evaluations=None, sample_evaluations=None): - """ Creates a dictionary corresponding to the desired xml desired by openML - - Parameters - ---------- - taskid : int - the identifier of the task - setup_string : string - a CLI string which can invoke the learning with the correct parameter - settings - parameter_settings : array of dicts - each dict containing keys name, value and component, one per parameter - setting - tags : array of strings - information that give a description of the run, must conform to - regex ``([a-zA-Z0-9_\-\.])+`` - fold_evaluations : dict mapping from evaluation measure to a dict mapping - repeat_nr to a dict mapping from fold nr to a value (double) - sample_evaluations : dict mapping from evaluation measure to a dict - mapping repeat_nr to a dict mapping from fold nr to a dict mapping to - a sample nr to a value (double) - sample_evaluations : - Returns - ------- - result : an array with version information of the above packages - """ # noqa: W605 - description = OrderedDict() - description['oml:run'] = OrderedDict() - description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml' - description['oml:run']['oml:task_id'] = taskid - description['oml:run']['oml:flow_id'] = flow_id - if error_message is not None: - description['oml:run']['oml:error_message'] = error_message - description['oml:run']['oml:parameter_setting'] = parameter_settings - if tags is not None: - description['oml:run']['oml:tag'] = tags # Tags describing the run - if (fold_evaluations is not None and len(fold_evaluations) > 0) or \ - (sample_evaluations is not None and len(sample_evaluations) > 0): - description['oml:run']['oml:output_data'] = OrderedDict() - description['oml:run']['oml:output_data']['oml:evaluation'] = list() - if fold_evaluations is not None: - for measure in fold_evaluations: - for repeat in fold_evaluations[measure]: - for fold, value in fold_evaluations[measure][repeat].items(): - current = OrderedDict([ - ('@repeat', str(repeat)), ('@fold', str(fold)), - ('oml:name', measure), ('oml:value', str(value))]) - description['oml:run']['oml:output_data'][ - 'oml:evaluation'].append(current) - if sample_evaluations is not None: - for measure in sample_evaluations: - for repeat in sample_evaluations[measure]: - for fold in sample_evaluations[measure][repeat]: - for sample, value in sample_evaluations[measure][repeat][ - fold].items(): + return file_elements + + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. 
""" + description = OrderedDict() # type: 'OrderedDict' + description['oml:run'] = OrderedDict() + description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml' + description['oml:run']['oml:task_id'] = self.task_id + description['oml:run']['oml:flow_id'] = self.flow_id + if self.error_message is not None: + description['oml:run']['oml:error_message'] = self.error_message + description['oml:run']['oml:parameter_setting'] = self.parameter_settings + if self.tags is not None: + description['oml:run']['oml:tag'] = self.tags # Tags describing the run + if (self.fold_evaluations is not None and len(self.fold_evaluations) > 0) or \ + (self.sample_evaluations is not None and len(self.sample_evaluations) > 0): + description['oml:run']['oml:output_data'] = OrderedDict() + description['oml:run']['oml:output_data']['oml:evaluation'] = list() + if self.fold_evaluations is not None: + for measure in self.fold_evaluations: + for repeat in self.fold_evaluations[measure]: + for fold, value in self.fold_evaluations[measure][repeat].items(): current = OrderedDict([ ('@repeat', str(repeat)), ('@fold', str(fold)), - ('@sample', str(sample)), ('oml:name', measure), - ('oml:value', str(value))]) + ('oml:name', measure), ('oml:value', str(value))]) description['oml:run']['oml:output_data'][ 'oml:evaluation'].append(current) - return description + if self.sample_evaluations is not None: + for measure in self.sample_evaluations: + for repeat in self.sample_evaluations[measure]: + for fold in self.sample_evaluations[measure][repeat]: + for sample, value in \ + self.sample_evaluations[measure][repeat][fold].items(): + current = OrderedDict([ + ('@repeat', str(repeat)), ('@fold', str(fold)), + ('@sample', str(sample)), ('oml:name', measure), + ('oml:value', str(value))]) + description['oml:run']['oml:output_data'][ + 'oml:evaluation'].append(current) + return description diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 1786120e8..c6ca1f057 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -1,7 +1,7 @@ from collections import OrderedDict import json import os -from typing import List, Tuple # noqa F401 +from typing import List, Tuple, Optional # noqa F401 import arff import xmltodict @@ -381,7 +381,7 @@ def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace': return cls(None, merged_trace) def __repr__(self): - return '[Run id: %d, %d trace iterations]'.format( + return '[Run id: {}, {} trace iterations]'.format( -1 if self.run_id is None else self.run_id, len(self.trace_iterations), ) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index aee1aa0bf..31fdc15a4 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -31,10 +31,9 @@ def __repr__(self): header = "OpenML Setup" header = '{}\n{}\n'.format(header, '=' * len(header)) - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) fields = {"Setup ID": self.setup_id, "Flow ID": self.flow_id, - "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), "# of Parameters": len(self.parameters)} # determines the order in which the information will be printed @@ -86,12 +85,11 @@ def __repr__(self): header = "OpenML Parameter" header = '{}\n{}\n'.format(header, '=' * len(header)) - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) fields = {"ID": self.id, "Flow ID": self.flow_id, # "Flow Name": self.flow_name, "Flow Name": self.full_name, - "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Flow URL": 
openml.flows.OpenMLFlow.url_for_id(self.flow_id), "Parameter Name": self.parameter_name} # indented prints for parameter attributes # indention = 2 spaces + 1 | + 2 underscores diff --git a/openml/study/functions.py b/openml/study/functions.py index ccd523016..25ebea5fd 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -120,7 +120,7 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: if 'oml:setups' in result_dict: setups = [int(x) for x in result_dict['oml:setups']['oml:setup_id']] else: - raise ValueError('No setups attached to study!'.format(id_)) + raise ValueError('No setups attached to study {}!'.format(id_)) if 'oml:runs' in result_dict: runs = [ int(x) for x in result_dict['oml:runs']['oml:run_id'] @@ -130,7 +130,7 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: # Legacy studies did not require runs runs = None else: - raise ValueError('No runs attached to study!'.format(id_)) + raise ValueError('No runs attached to study {}!'.format(id_)) study = OpenMLStudy( study_id=study_id, diff --git a/openml/study/study.py b/openml/study/study.py index 8657749da..64d47dce7 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -1,12 +1,11 @@ -import collections -from typing import Dict, List, Optional - -import xmltodict +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union, Any import openml +from openml.base import OpenMLBase -class BaseStudy(object): +class BaseStudy(OpenMLBase): """ An OpenMLStudy represents the OpenML concept of a study. It contains the following information: name, id, description, creation date, @@ -72,7 +71,7 @@ def __init__( setups: Optional[List[int]], ): - self.id = study_id + self.study_id = study_id self.alias = alias self.main_entity_type = main_entity_type self.benchmark_suite = benchmark_suite @@ -87,19 +86,25 @@ def __init__( self.flows = flows self.setups = setups self.runs = runs - pass - def __repr__(self): - # header is provided by the sub classes - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + @classmethod + def _entity_letter(cls) -> str: + return 's' + + @property + def id(self) -> Optional[int]: + return self.study_id + + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. 
""" fields = {"Name": self.name, "Status": self.status, - "Main Entity Type": self.main_entity_type} - if self.id is not None: - fields["ID"] = self.id - fields["Study URL"] = "{}s/{}".format(base_url, self.id) + "Main Entity Type": self.main_entity_type} # type: Dict[str, Any] + if self.study_id is not None: + fields["ID"] = self.study_id + fields["Study URL"] = self.openml_url if self.creator is not None: - fields["Creator"] = "{}u/{}".format(base_url, self.creator) + fields["Creator"] = "{}/u/{}".format(openml.config.server_base_url, self.creator) if self.creation_date is not None: fields["Upload Time"] = self.creation_date.replace('T', ' ') if self.data is not None: @@ -115,42 +120,14 @@ def __repr__(self): order = ["ID", "Name", "Status", "Main Entity Type", "Study URL", "# of Data", "# of Tasks", "# of Flows", "# of Runs", "Creator", "Upload Time"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return body - - def publish(self) -> int: - """ - Publish the study on the OpenML server. - - Returns - ------- - study_id: int - Id of the study uploaded to the server. - """ - file_elements = { - 'description': self._to_xml() - } - return_value = openml._api_calls._perform_api_call( - "study/", - 'post', - file_elements=file_elements, - ) - study_res = xmltodict.parse(return_value) - self.study_id = int(study_res['oml:study_upload']['oml:id']) - return self.study_id + return [(key, fields[key]) for key in order if key in fields] - def _to_xml(self) -> str: - """Serialize object to xml for upload + def _parse_publish_response(self, xml_response: Dict): + """ Parse the id from the xml_response and assign it to self. """ + self.study_id = int(xml_response['oml:study_upload']['oml:id']) - Returns - ------- - xml_study : str - XML description of the data. - """ + def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': + """ Creates a dictionary representation of self. 
""" # some can not be uploaded, e.g., id, creator, creation_date simple_props = ['alias', 'main_entity_type', 'name', 'description'] # maps from attribute name (which is used as outer tag name) to immer @@ -161,9 +138,9 @@ def _to_xml(self) -> str: 'runs': 'run_id', } - study_container = collections.OrderedDict() # type: 'collections.OrderedDict' + study_container = OrderedDict() # type: 'OrderedDict' namespace_list = [('@xmlns:oml', 'http://openml.org/openml')] - study_dict = collections.OrderedDict(namespace_list) # type: 'collections.OrderedDict' + study_dict = OrderedDict(namespace_list) # type: 'OrderedDict' study_container['oml:study'] = study_dict for prop_name in simple_props: @@ -177,15 +154,13 @@ def _to_xml(self) -> str: 'oml:' + inner_name: content } study_dict["oml:" + prop_name] = sub_dict + return study_container - xml_string = xmltodict.unparse( - input_dict=study_container, - pretty=True, - ) - # A flow may not be uploaded with the xml encoding specification: - # - xml_string = xml_string.split('\n', 1)[-1] - return xml_string + def push_tag(self, tag: str): + raise NotImplementedError("Tags for studies is not (yet) supported.") + + def remove_tag(self, tag: str): + raise NotImplementedError("Tags for studies is not (yet) supported.") class OpenMLStudy(BaseStudy): @@ -268,12 +243,6 @@ def __init__( setups=setups, ) - def __repr__(self): - header = "OpenML Study" - header = '{}\n{}\n'.format(header, '=' * len(header)) - body = super(OpenMLStudy, self).__repr__() - return header + body - class OpenMLBenchmarkSuite(BaseStudy): """ @@ -345,9 +314,3 @@ def __init__( runs=None, setups=None, ) - - def __repr__(self): - header = "OpenML Benchmark Suite" - header = '{}\n{}\n'.format(header, '=' * len(header)) - body = super(OpenMLBenchmarkSuite, self).__repr__() - return header + body diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 83af79373..f415a3fea 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -2,21 +2,21 @@ from collections import OrderedDict import io import os -from typing import Union, Tuple, Dict, List, Optional +from typing import Union, Tuple, Dict, List, Optional, Any from warnings import warn import numpy as np import pandas as pd import scipy.sparse -import xmltodict import openml._api_calls +from openml.base import OpenMLBase from .. import datasets from .split import OpenMLSplit -from ..utils import _create_cache_directory_for_id, _tag_entity +from ..utils import _create_cache_directory_for_id -class OpenMLTask(ABC): +class OpenMLTask(OpenMLBase): """OpenML Task object. Parameters @@ -55,35 +55,36 @@ def __init__( self.estimation_procedure_id = estimation_procedure_id self.split = None # type: Optional[OpenMLSplit] - def __repr__(self): - header = "OpenML Task" - header = '{}\n{}\n'.format(header, '=' * len(header)) + @classmethod + def _entity_letter(cls) -> str: + return 't' - base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) - fields = {"Task Type": self.task_type} + @property + def id(self) -> Optional[int]: + return self.task_id + + def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]: + """ Collect all information to display in the __repr__ body. 
""" + fields = {"Task Type Description": '{}/tt/{}'.format( + openml.config.server_base_url, self.task_type_id)} # type: Dict[str, Any] if self.task_id is not None: fields["Task ID"] = self.task_id - fields["Task URL"] = "{}t/{}".format(base_url, self.task_id) + fields["Task URL"] = self.openml_url if self.evaluation_measure is not None: fields["Evaluation Measure"] = self.evaluation_measure if self.estimation_procedure is not None: fields["Estimation Procedure"] = self.estimation_procedure['type'] - if self.target_name is not None: - fields["Target Feature"] = self.target_name + if getattr(self, 'target_name', None) is not None: + fields["Target Feature"] = getattr(self, 'target_name') if hasattr(self, 'class_labels'): - fields["# of Classes"] = len(self.class_labels) + fields["# of Classes"] = len(getattr(self, 'class_labels')) if hasattr(self, 'cost_matrix'): fields["Cost Matrix"] = "Available" # determines the order in which the information will be printed - order = ["Task Type", "Task ID", "Task URL", "Estimation Procedure", "Evaluation Measure", - "Target Feature", "# of Classes", "Cost Matrix"] - fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, value in fields) - field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) - body = '\n'.join(field_line_format.format(name, value) for name, value in fields) - return header + body + order = ["Task Type Description", "Task ID", "Task URL", "Estimation Procedure", + "Evaluation Measure", "Target Feature", "# of Classes", "Cost Matrix"] + return [(key, fields[key]) for key in order if key in fields] def get_dataset(self) -> datasets.OpenMLDataset: """Download dataset associated with task""" @@ -144,28 +145,8 @@ def get_split_dimensions(self) -> Tuple[int, int, int]: return self.split.repeats, self.split.folds, self.split.samples - def push_tag(self, tag: str): - """Annotates this task with a tag on the server. - - Parameters - ---------- - tag : str - Tag to attach to the task. - """ - _tag_entity('task', self.task_id, tag) - - def remove_tag(self, tag: str): - """Removes a tag from this task on the server. - - Parameters - ---------- - tag : str - Tag to attach to the task. - """ - _tag_entity('task', self.task_id, tag, untag=True) - def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': - + """ Creates a dictionary representation of self. """ task_container = OrderedDict() # type: OrderedDict[str, OrderedDict] task_dict = OrderedDict([ ('@xmlns:oml', 'http://openml.org/openml') @@ -199,47 +180,9 @@ def _to_dict(self) -> 'OrderedDict[str, OrderedDict]': return task_container - def _to_xml(self) -> str: - """Generate xml representation of self for upload to server. - - Returns - ------- - str - Task represented as XML string. - """ - task_dict = self._to_dict() - task_xml = xmltodict.unparse(task_dict, pretty=True) - - # A task may not be uploaded with the xml encoding specification: - # - task_xml = task_xml.split('\n', 1)[-1] - - return task_xml - - def publish(self) -> int: - """Publish task to OpenML server. - - Returns - ------- - task_id: int - Returns the id of the uploaded task - if successful. 
- - """ - - xml_description = self._to_xml() - - file_elements = {'description': xml_description} - - return_value = openml._api_calls._perform_api_call( - "task/", - 'post', - file_elements=file_elements, - ) - - task_id = int(xmltodict.parse(return_value)['oml:upload_task']['oml:id']) - - return task_id + def _parse_publish_response(self, xml_response: Dict): + """ Parse the id from the xml_response and assign it to self. """ + self.task_id = int(xml_response['oml:upload_task']['oml:id']) class OpenMLSupervisedTask(OpenMLTask, ABC): diff --git a/openml/utils.py b/openml/utils.py index f6cc81ff7..a458d3132 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -2,6 +2,7 @@ import hashlib import xmltodict import shutil +from typing import TYPE_CHECKING, List, Tuple, Union, Type import warnings import pandas as pd from functools import wraps @@ -11,6 +12,11 @@ import openml.exceptions from . import config +# Avoid import cycles: https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles +if TYPE_CHECKING: + from openml.base import OpenMLBase + + oslo_installed = False try: # Currently, importing oslo raises a lot of warning that it will stop working @@ -62,6 +68,26 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True): (xml_tag_name, str(node))) +def _get_rest_api_type_alias(oml_object: 'OpenMLBase') -> str: + """ Return the alias of the openml entity as it is defined for the REST API. """ + rest_api_mapping = [ + (openml.datasets.OpenMLDataset, 'data'), + (openml.flows.OpenMLFlow, 'flow'), + (openml.tasks.OpenMLTask, 'task'), + (openml.runs.OpenMLRun, 'run'), + ((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), 'study') + ] # type: List[Tuple[Union[Type, Tuple], str]] + _, api_type_alias = [(python_type, api_alias) + for (python_type, api_alias) in rest_api_mapping + if isinstance(oml_object, python_type)][0] + return api_type_alias + + +def _tag_openml_base(oml_object: 'OpenMLBase', tag: str, untag: bool = False): + api_type_alias = _get_rest_api_type_alias(oml_object) + _tag_entity(api_type_alias, oml_object.id, tag, untag) + + def _tag_entity(entity_type, entity_id, tag, untag=False): """ Function that tags or untags a given entity on OpenML. As the OpenML diff --git a/setup.cfg b/setup.cfg index 726c8fa73..156baa3bb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,11 @@ description-file = README.md [tool:pytest] filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning + +[flake8] +exclude = + # the following file and directory can be removed when the descriptions + # are shortened. 
More info at: + # https://travis-ci.org/openml/openml-python/jobs/590382001 + examples/30_extended/tasks_tutorial.py + examples/40_paper \ No newline at end of file diff --git a/setup.py b/setup.py index 3b271badd..f4fbe7991 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import os import setuptools import sys @@ -12,13 +13,19 @@ .format(sys.version_info.major, sys.version_info.minor, sys.version_info.micro) ) +with open(os.path.join("README.md")) as fid: + README = fid.read() + setuptools.setup(name="openml", - author="Matthias Feurer, Jan van Rijn, Arlind Kadra, Andreas Müller, " - "Pieter Gijsbers and Joaquin Vanschoren", + author="Matthias Feurer, Jan van Rijn, Arlind Kadra, Pieter Gijsbers, " + "Neeratyoy Mallik, Sahithya Ravi, Andreas Müller, Joaquin Vanschoren " + "and Frank Hutter", author_email="feurerm@informatik.uni-freiburg.de", maintainer="Matthias Feurer", maintainer_email="feurerm@informatik.uni-freiburg.de", description="Python API for OpenML", + long_description=README, + long_description_content_type='text/markdown', license="BSD 3-clause", url="http://openml.org/", project_urls={ @@ -48,7 +55,8 @@ 'pytest-xdist', 'pytest-timeout', 'nbformat', - 'oslo.concurrency' + 'oslo.concurrency', + 'flaky', ], 'examples': [ 'matplotlib', @@ -60,6 +68,9 @@ 'ipython', 'ipykernel', 'seaborn' + ], + 'examples_unix': [ + 'fanova', ] }, test_suite="pytest", diff --git a/tests/conftest.py b/tests/conftest.py index 9e08d09a8..056cc7f96 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,7 +37,7 @@ # finding the root directory of conftest.py and going up to OpenML main directory # exploiting the fact that conftest.py always resides in the root directory for tests static_dir = os.path.dirname(os.path.abspath(__file__)) -logging.info("static directory: {}".format(static_dir)) +logger.info("static directory: {}".format(static_dir)) print("static directory: {}".format(static_dir)) while True: if 'openml' in os.listdir(static_dir): @@ -178,4 +178,4 @@ def pytest_sessionfinish() -> None: compare_delete_files(file_list, new_file_list) logger.info("Local files deleted") - logging.info("{} is killed".format(worker)) + logger.info("{} is killed".format(worker)) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index cabad9565..9d1076371 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -9,6 +9,7 @@ import openml from openml.testing import TestBase from openml.exceptions import PyOpenMLError +from openml.datasets import OpenMLDataset, OpenMLDataFeature class OpenMLDatasetTest(TestBase): @@ -26,6 +27,28 @@ def setUp(self): # these datasets have some boolean features self.pc4 = openml.datasets.get_dataset(1049, download_data=False) self.jm1 = openml.datasets.get_dataset(1053, download_data=False) + self.iris = openml.datasets.get_dataset(61, download_data=False) + + def test_repr(self): + # create a bare-bones dataset as would be returned by + # create_dataset + data = openml.datasets.OpenMLDataset(name="somename", + description="a description") + str(data) + + def test_init_string_validation(self): + with pytest.raises(ValueError, match="Invalid symbols in name"): + openml.datasets.OpenMLDataset(name="some name", + description="a description") + + with pytest.raises(ValueError, match="Invalid symbols in description"): + openml.datasets.OpenMLDataset(name="somename", + description="a descriptïon") + + with pytest.raises(ValueError, match="Invalid symbols in citation"): + 
openml.datasets.OpenMLDataset(name="somename", + description="a description", + citation="Something by Müller") def test_get_data_array(self): # Basic usage @@ -190,6 +213,18 @@ def test_get_data_with_nonexisting_class(self): # Check that no label is mapped to 3, since it is reserved for label '4'. self.assertEqual(np.sum(y == 3), 0) + def test_get_data_corrupt_pickle(self): + # Lazy loaded dataset, populate cache. + self.iris.get_data() + # Corrupt pickle file, overwrite as empty. + with open(self.iris.data_pickle_file, "w") as fh: + fh.write("") + # Despite the corrupt file, the data should be loaded from the ARFF file. + # A warning message is written to the python logger. + xy, _, _, _ = self.iris.get_data() + self.assertIsInstance(xy, pd.DataFrame) + self.assertEqual(xy.shape, (150, 5)) + class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): @@ -307,6 +342,16 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.assertListEqual(categorical, [False] * 19998) self.assertEqual(y.shape, (600, )) + def test_get_sparse_categorical_data_id_395(self): + dataset = openml.datasets.get_dataset(395, download_data=True) + feature = dataset.features[3758] + self.assertTrue(isinstance(dataset, OpenMLDataset)) + self.assertTrue(isinstance(feature, OpenMLDataFeature)) + self.assertEqual(dataset.name, 're1.wc') + self.assertEqual(feature.name, 'CLASS_LABEL') + self.assertEqual(feature.data_type, 'nominal') + self.assertEqual(len(feature.nominal_values), 25) + class OpenMLDatasetQualityTest(TestBase): def test__check_qualities(self): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 5726d2442..fb363bcf4 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -265,6 +265,7 @@ def test__name_to_id_with_version(self): def test__name_to_id_with_multiple_active_error(self): """ With multiple active datasets, retrieve the least recent active. """ + openml.config.server = self.production_server self.assertRaisesRegex( ValueError, "Multiple active datasets exist with name iris", @@ -514,10 +515,10 @@ def test_data_status(self): version=1, url="https://www.openml.org/data/download/61/dataset_61_iris.arff") dataset.publish() - TestBase._mark_entity_for_removal('data', dataset.dataset_id) + TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - dataset.dataset_id)) - did = dataset.dataset_id + dataset.id)) + did = dataset.id # admin key for test server (only adminds can activate datasets. 
# all users can deactivate their own datasets) @@ -525,23 +526,23 @@ def test_data_status(self): openml.datasets.status_update(did, 'active') # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=did, status='all') + result = openml.datasets.list_datasets(data_id=[did], status='all') self.assertEqual(len(result), 1) self.assertEqual(result[did]['status'], 'active') openml.datasets.status_update(did, 'deactivated') # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=did, status='all') + result = openml.datasets.list_datasets(data_id=[did], status='all') self.assertEqual(len(result), 1) self.assertEqual(result[did]['status'], 'deactivated') openml.datasets.status_update(did, 'active') # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=did, status='all') + result = openml.datasets.list_datasets(data_id=[did], status='all') self.assertEqual(len(result), 1) self.assertEqual(result[did]['status'], 'active') with self.assertRaises(ValueError): openml.datasets.status_update(did, 'in_preparation') # need to use listing fn, as this is immune to cache - result = openml.datasets.list_datasets(data_id=did, status='all') + result = openml.datasets.list_datasets(data_id=[did], status='all') self.assertEqual(len(result), 1) self.assertEqual(result[did]['status'], 'active') @@ -569,6 +570,12 @@ def test_attributes_arff_from_df(self): self.assertEqual(attributes, [('integer', 'INTEGER'), ('floating', 'REAL')]) + def test_attributes_arff_from_df_numeric_column(self): + # Test column names are automatically converted to str if needed (#819) + df = pd.DataFrame({0: [1, 2, 3], 0.5: [4, 5, 6], 'target': [0, 1, 1]}) + attributes = attributes_arff_from_df(df) + self.assertEqual(attributes, [('0', 'INTEGER'), ('0.5', 'INTEGER'), ('target', 'INTEGER')]) + def test_attributes_arff_from_df_mixed_dtype_categories(self): # liac-arff imposed categorical attributes to be of sting dtype. We # raise an error if this is not the case. 
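A short illustrative sketch (not part of the patch) of the column-name conversion exercised by test_attributes_arff_from_df_numeric_column above; the import path of the helper is assumed here, since the test module's import block is not shown in this hunk.

import pandas as pd

# Assumed location of the helper under test (openml/datasets/functions.py).
from openml.datasets.functions import attributes_arff_from_df

# Non-string column names (0 and 0.5 below) are converted to strings before the
# ARFF attribute list is built, which is exactly what the new test asserts.
df = pd.DataFrame({0: [1, 2, 3], 0.5: [4, 5, 6], 'target': [0, 1, 1]})
print(attributes_arff_from_df(df))
# expected: [('0', 'INTEGER'), ('0.5', 'INTEGER'), ('target', 'INTEGER')]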
@@ -629,18 +636,18 @@ def test_create_dataset_numpy(self): paper_url='http://openml.github.io/openml-python' ) - upload_did = dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) + dataset.id)) self.assertEqual( - _get_online_dataset_arff(upload_did), + _get_online_dataset_arff(dataset.id), dataset._dataset, "Uploaded arff does not match original one" ) self.assertEqual( - _get_online_dataset_format(upload_did), + _get_online_dataset_format(dataset.id), 'arff', "Wrong format for dataset" ) @@ -694,17 +701,17 @@ def test_create_dataset_list(self): paper_url='http://openml.github.io/openml-python' ) - upload_did = dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) + dataset.id)) self.assertEqual( - _get_online_dataset_arff(upload_did), + _get_online_dataset_arff(dataset.id), dataset._dataset, "Uploaded ARFF does not match original one" ) self.assertEqual( - _get_online_dataset_format(upload_did), + _get_online_dataset_format(dataset.id), 'arff', "Wrong format for dataset" ) @@ -740,17 +747,17 @@ def test_create_dataset_sparse(self): version_label='test', ) - upload_did = xor_dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + xor_dataset.publish() + TestBase._mark_entity_for_removal('data', xor_dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) + xor_dataset.id)) self.assertEqual( - _get_online_dataset_arff(upload_did), + _get_online_dataset_arff(xor_dataset.id), xor_dataset._dataset, "Uploaded ARFF does not match original one" ) self.assertEqual( - _get_online_dataset_format(upload_did), + _get_online_dataset_format(xor_dataset.id), 'sparse_arff', "Wrong format for dataset" ) @@ -780,17 +787,17 @@ def test_create_dataset_sparse(self): version_label='test', ) - upload_did = xor_dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + xor_dataset.publish() + TestBase._mark_entity_for_removal('data', xor_dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) + xor_dataset.id)) self.assertEqual( - _get_online_dataset_arff(upload_did), + _get_online_dataset_arff(xor_dataset.id), xor_dataset._dataset, "Uploaded ARFF does not match original one" ) self.assertEqual( - _get_online_dataset_format(upload_did), + _get_online_dataset_format(xor_dataset.id), 'sparse_arff', "Wrong format for dataset" ) @@ -906,12 +913,12 @@ def test_create_dataset_pandas(self): original_data_url=original_data_url, paper_url=paper_url ) - upload_did = dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) + dataset.id)) self.assertEqual( - _get_online_dataset_arff(upload_did), + _get_online_dataset_arff(dataset.id), dataset._dataset, "Uploaded ARFF does not match original one" ) @@ -943,17 +950,17 @@ def test_create_dataset_pandas(self): original_data_url=original_data_url, paper_url=paper_url ) - upload_did = dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + dataset.publish() + 
TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) + dataset.id)) self.assertEqual( - _get_online_dataset_arff(upload_did), + _get_online_dataset_arff(dataset.id), dataset._dataset, "Uploaded ARFF does not match original one" ) self.assertEqual( - _get_online_dataset_format(upload_did), + _get_online_dataset_format(dataset.id), 'sparse_arff', "Wrong format for dataset" ) @@ -982,11 +989,11 @@ def test_create_dataset_pandas(self): original_data_url=original_data_url, paper_url=paper_url ) - upload_did = dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) - downloaded_data = _get_online_dataset_arff(upload_did) + dataset.id)) + downloaded_data = _get_online_dataset_arff(dataset.id) self.assertEqual( downloaded_data, dataset._dataset, @@ -1139,14 +1146,14 @@ def test_publish_fetch_ignore_attribute(self): ) # publish dataset - upload_did = dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) + dataset.id)) # test if publish was successful - self.assertIsInstance(upload_did, int) + self.assertIsInstance(dataset.id, int) - dataset = None + downloaded_dataset = None # fetching from server # loop till timeout or fetch not successful max_waiting_time_seconds = 400 @@ -1154,17 +1161,17 @@ def test_publish_fetch_ignore_attribute(self): start_time = time.time() while time.time() - start_time < max_waiting_time_seconds: try: - dataset = openml.datasets.get_dataset(upload_did) + downloaded_dataset = openml.datasets.get_dataset(dataset.id) break except Exception as e: # returned code 273: Dataset not processed yet # returned code 362: No qualities found - print("Failed to fetch dataset:{} with '{}'.".format(upload_did, str(e))) + print("Failed to fetch dataset:{} with '{}'.".format(dataset.id, str(e))) time.sleep(10) continue - if dataset is None: - raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did)) - self.assertEqual(dataset.ignore_attribute, ignore_attribute) + if downloaded_dataset is None: + raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(dataset.id)) + self.assertEqual(downloaded_dataset.ignore_attribute, ignore_attribute) def test_create_dataset_row_id_attribute_error(self): # meta-information @@ -1254,11 +1261,11 @@ def test_create_dataset_row_id_attribute_inference(self): paper_url=paper_url ) self.assertEqual(dataset.row_id_attribute, output_row_id) - upload_did = dataset.publish() - TestBase._mark_entity_for_removal('data', upload_did) + dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.id) TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], - upload_did)) - arff_dataset = arff.loads(_get_online_dataset_arff(upload_did)) + dataset.id)) + arff_dataset = arff.loads(_get_online_dataset_arff(dataset.id)) arff_data = np.array(arff_dataset['data'], dtype=object) # if we set the name of the index then the index will be added to # the data diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index b25b35391..fe38a5a66 100644 --- 
a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -6,18 +6,20 @@ class TestEvaluationFunctions(TestBase): _multiprocess_can_split_ = True - def _check_list_evaluation_setups(self, size, **kwargs): + def _check_list_evaluation_setups(self, **kwargs): evals_setups = openml.evaluations.list_evaluations_setups("predictive_accuracy", - **kwargs, size=size, + **kwargs, sort_order='desc', output_format='dataframe') evals = openml.evaluations.list_evaluations("predictive_accuracy", - **kwargs, size=size, + **kwargs, sort_order='desc', output_format='dataframe') # Check if list is non-empty self.assertGreater(len(evals_setups), 0) + # Check if length is accurate + self.assertEqual(len(evals_setups), len(evals)) # Check if output from sort is sorted in the right order self.assertSequenceEqual(sorted(evals_setups['value'].tolist(), reverse=True), evals_setups['value'].tolist()) @@ -27,8 +29,11 @@ def _check_list_evaluation_setups(self, size, **kwargs): # Check if the hyper-parameter column is as accurate and flow_id for index, row in evals_setups.iterrows(): params = openml.runs.get_run(row['run_id']).parameter_settings - hyper_params = [tuple([param['oml:name'], param['oml:value']]) for param in params] - self.assertTrue(sorted(row['parameters']) == sorted(hyper_params)) + list1 = [param['oml:value'] for param in params] + list2 = list(row['parameters'].values()) + # check if all values are equal + self.assertSequenceEqual(sorted(list1), sorted(list2)) + return evals_setups def test_evaluation_list_filter_task(self): openml.config.server = self.production_server @@ -51,7 +56,9 @@ def test_evaluation_list_filter_uploader_ID_16(self): uploader_id = 16 evaluations = openml.evaluations.list_evaluations("predictive_accuracy", - uploader=[uploader_id]) + uploader=[uploader_id], + output_format='dataframe') + self.assertEqual(evaluations['uploader'].unique(), [uploader_id]) self.assertGreater(len(evaluations), 50) @@ -92,7 +99,7 @@ def test_evaluation_list_filter_run(self): run_id = 12 evaluations = openml.evaluations.list_evaluations("predictive_accuracy", - id=[run_id]) + run=[run_id]) self.assertEqual(len(evaluations), 1) for run_id in evaluations.keys(): @@ -142,8 +149,9 @@ def test_evaluation_list_per_fold(self): self.assertIsNone(evaluations[run_id].values) def test_evaluation_list_sort(self): + openml.config.server = self.production_server size = 10 - task_id = 115 + task_id = 6 # Get all evaluations of the task unsorted_eval = openml.evaluations.list_evaluations( "predictive_accuracy", offset=0, task=[task_id]) @@ -171,10 +179,20 @@ def test_list_evaluations_setups_filter_flow(self): openml.config.server = self.production_server flow_id = [405] size = 100 - self._check_list_evaluation_setups(size, flow=flow_id) + evals = self._check_list_evaluation_setups(flow=flow_id, size=size) + # check if parameters in separate columns works + evals_cols = openml.evaluations.list_evaluations_setups("predictive_accuracy", + flow=flow_id, size=size, + sort_order='desc', + output_format='dataframe', + parameters_in_separate_columns=True + ) + columns = (list(evals_cols.columns)) + keys = (list(evals['parameters'].values[0].keys())) + self.assertTrue(all(elem in columns for elem in keys)) def test_list_evaluations_setups_filter_task(self): openml.config.server = self.production_server task_id = [6] - size = 100 - self._check_list_evaluation_setups(size, task=task_id) + size = 121 + self._check_list_evaluation_setups(task=task_id, size=size) diff --git 
a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py new file mode 100644 index 000000000..490971c1e --- /dev/null +++ b/tests/test_evaluations/test_evaluations_example.py @@ -0,0 +1,34 @@ +import unittest + + +class TestEvaluationsExample(unittest.TestCase): + + def test_example_python_paper(self): + # Example script which will appear in the upcoming OpenML-Python paper + # This test ensures that the example will keep running! + + import openml + import numpy as np + import matplotlib.pyplot as plt + + df = openml.evaluations.list_evaluations_setups( + 'predictive_accuracy', + flow=[8353], + task=[6], + output_format='dataframe', + parameters_in_separate_columns=True, + ) # Choose an SVM flow, for example 8353, and a task. + + hp_names = ['sklearn.svm.classes.SVC(16)_C', 'sklearn.svm.classes.SVC(16)_gamma'] + df[hp_names] = df[hp_names].astype(float).apply(np.log) + C, gamma, score = df[hp_names[0]], df[hp_names[1]], df['value'] + + cntr = plt.tricontourf(C, gamma, score, levels=12, cmap='RdBu_r') + plt.colorbar(cntr, label='accuracy') + plt.xlim((min(C), max(C))) + plt.ylim((min(gamma), max(gamma))) + plt.xlabel('C (log10)', size=16) + plt.ylabel('gamma (log10)', size=16) + plt.title('SVM performance landscape', size=20) + + plt.tight_layout() diff --git a/tests/test_examples/__init__.py b/tests/test_examples/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_examples/test_OpenMLDemo.py b/tests/test_examples/test_OpenMLDemo.py deleted file mode 100644 index 64c710873..000000000 --- a/tests/test_examples/test_OpenMLDemo.py +++ /dev/null @@ -1,82 +0,0 @@ -import os -import shutil -import sys - -import matplotlib -matplotlib.use('AGG') -import nbformat -from nbconvert.exporters import export -from nbconvert.exporters.python import PythonExporter - -import unittest.mock as mock - -from unittest import skip -import openml._api_calls -import openml.config -from openml.testing import TestBase - -_perform_api_call = openml._api_calls._perform_api_call - - -class OpenMLDemoTest(TestBase): - def setUp(self): - super(OpenMLDemoTest, self).setUp() - - python_version = sys.version_info[0] - self.kernel_name = 'python%d' % python_version - self.this_file_directory = os.path.dirname(__file__) - self.notebook_output_directory = os.path.join( - self.this_file_directory, '.out') - - try: - shutil.rmtree(self.notebook_output_directory) - except OSError: - pass - - try: - os.makedirs(self.notebook_output_directory) - except OSError: - pass - - def _tst_notebook(self, notebook_name): - - notebook_filename = os.path.abspath(os.path.join( - self.this_file_directory, '..', '..', 'examples', notebook_name)) - - with open(notebook_filename) as f: - nb = nbformat.read(f, as_version=4) - - python_nb, metadata = export(PythonExporter, nb) - - # Remove magic lines manually - python_nb = '\n'.join([ - line for line in python_nb.split('\n') - if 'get_ipython().run_line_magic(' not in line - ]) - - exec(python_nb) - - @skip - @mock.patch('openml._api_calls._perform_api_call') - def test_tutorial_openml(self, patch): - def side_effect(*args, **kwargs): - if ( - args[0].endswith('/run/') - and kwargs['file_elements'] is not None - ): - return """ - 1 - - """ - else: - return _perform_api_call(*args, **kwargs) - patch.side_effect = side_effect - - openml.config.server = self.production_server - self._tst_notebook('OpenML_Tutorial.ipynb') - self.assertGreater(patch.call_count, 100) - - @skip("Deleted tutorial file") - def 
test_tutorial_dataset(self): - - self._tst_notebook('Dataset_import.ipynb') diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 8bc615516..a93c79bcd 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -8,6 +8,7 @@ from collections import OrderedDict from unittest import mock import warnings +from packaging import version import numpy as np import scipy.optimize @@ -75,7 +76,8 @@ def test_serialize_model(self): fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier' fixture_short_name = 'sklearn.DecisionTreeClassifier' - fixture_description = 'Automatically created scikit-learn flow.' + # str obtained from self.extension._get_sklearn_description(model) + fixture_description = 'A decision tree classifier.' version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ # min_impurity_decrease has been introduced in 0.20 @@ -137,13 +139,24 @@ def test_serialize_model(self): self.assertEqual(check_dependencies_mock.call_count, 1) + def test_can_handle_flow(self): + openml.config.server = self.production_server + + R_flow = openml.flows.get_flow(6794) + assert not self.extension.can_handle_flow(R_flow) + old_3rd_party_flow = openml.flows.get_flow(7660) + assert self.extension.can_handle_flow(old_3rd_party_flow) + + openml.config.server = self.test_server + def test_serialize_model_clustering(self): with mock.patch.object(self.extension, '_check_dependencies') as check_dependencies_mock: model = sklearn.cluster.KMeans() fixture_name = 'sklearn.cluster.k_means_.KMeans' fixture_short_name = 'sklearn.KMeans' - fixture_description = 'Automatically created scikit-learn flow.' + # str obtained from self.extension._get_sklearn_description(model) + fixture_description = 'K-Means clustering' version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ # n_jobs default has changed to None in 0.20 @@ -207,10 +220,18 @@ def test_serialize_model_with_subcomponent(self): '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)' fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' fixture_short_name = 'sklearn.AdaBoostClassifier' - fixture_description = 'Automatically created scikit-learn flow.' + # str obtained from self.extension._get_sklearn_description(model) + fixture_description = 'An AdaBoost classifier.\n\nAn AdaBoost [1] classifier is a '\ + 'meta-estimator that begins by fitting a\nclassifier on the original'\ + ' dataset and then fits additional copies of the\nclassifier on the '\ + 'same dataset but where the weights of incorrectly\nclassified '\ + 'instances are adjusted such that subsequent classifiers focus\nmore'\ + ' on difficult cases.\n\nThis class implements the algorithm known '\ + 'as AdaBoost-SAMME [2].' fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier' fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier' - fixture_subcomponent_description = 'Automatically created scikit-learn flow.' + # str obtained from self.extension._get_sklearn_description(model.base_estimator) + fixture_subcomponent_description = 'A decision tree classifier.' 
fixture_structure = { fixture_name: [], 'sklearn.tree.tree.DecisionTreeClassifier': ['base_estimator'] @@ -264,7 +285,25 @@ def test_serialize_pipeline(self): 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'dummy=sklearn.dummy.DummyClassifier)' fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)' - fixture_description = 'Automatically created scikit-learn flow.' + + if version.parse(sklearn.__version__) >= version.parse("0.21.0"): + fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially"\ + " apply a list of transforms and a final estimator.\n"\ + "Intermediate steps of the pipeline must be 'transforms', that "\ + "is, they\nmust implement fit and transform methods.\nThe final "\ + "estimator only needs to implement fit.\nThe transformers in "\ + "the pipeline can be cached using ``memory`` argument.\n\nThe "\ + "purpose of the pipeline is to assemble several steps that can "\ + "be\ncross-validated together while setting different parameters"\ + ".\nFor this, it enables setting parameters of the various steps"\ + " using their\nnames and the parameter name separated by a '__',"\ + " as in the example below.\nA step's estimator may be replaced "\ + "entirely by setting the parameter\nwith its name to another "\ + "estimator, or a transformer removed by setting\nit to "\ + "'passthrough' or ``None``." + else: + fixture_description = self.extension._get_sklearn_description(model) + fixture_structure = { fixture_name: [], 'sklearn.preprocessing.data.StandardScaler': ['scaler'], @@ -353,7 +392,24 @@ def test_serialize_pipeline_clustering(self): 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'clusterer=sklearn.cluster.k_means_.KMeans)' fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)' - fixture_description = 'Automatically created scikit-learn flow.' + + if version.parse(sklearn.__version__) >= version.parse("0.21.0"): + fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially"\ + " apply a list of transforms and a final estimator.\n"\ + "Intermediate steps of the pipeline must be 'transforms', that "\ + "is, they\nmust implement fit and transform methods.\nThe final "\ + "estimator only needs to implement fit.\nThe transformers in "\ + "the pipeline can be cached using ``memory`` argument.\n\nThe "\ + "purpose of the pipeline is to assemble several steps that can "\ + "be\ncross-validated together while setting different parameters"\ + ".\nFor this, it enables setting parameters of the various steps"\ + " using their\nnames and the parameter name separated by a '__',"\ + " as in the example below.\nA step's estimator may be replaced "\ + "entirely by setting the parameter\nwith its name to another "\ + "estimator, or a transformer removed by setting\nit to "\ + "'passthrough' or ``None``." + else: + fixture_description = self.extension._get_sklearn_description(model) fixture_structure = { fixture_name: [], 'sklearn.preprocessing.data.StandardScaler': ['scaler'], @@ -445,7 +501,20 @@ def test_serialize_column_transformer(self): 'numeric=sklearn.preprocessing.data.StandardScaler,' \ 'nominal=sklearn.preprocessing._encoders.OneHotEncoder)' fixture_short_name = 'sklearn.ColumnTransformer' - fixture_description = 'Automatically created scikit-learn flow.' 
+ + if version.parse(sklearn.__version__) >= version.parse("0.21.0"): + # str obtained from self.extension._get_sklearn_description(model) + fixture_description = 'Applies transformers to columns of an array or pandas '\ + 'DataFrame.\n\nThis estimator allows different columns or '\ + 'column subsets of the input\nto be transformed separately and '\ + 'the features generated by each transformer\nwill be '\ + 'concatenated to form a single feature space.\nThis is useful '\ + 'for heterogeneous or columnar data, to combine several\nfeature'\ + ' extraction mechanisms or transformations into a single '\ + 'transformer.' + else: + fixture_description = self.extension._get_sklearn_description(model) + fixture_structure = { fixture: [], 'sklearn.preprocessing.data.StandardScaler': ['numeric'], @@ -504,7 +573,25 @@ def test_serialize_column_transformer_pipeline(self): fixture_name: [], } - fixture_description = 'Automatically created scikit-learn flow.' + if version.parse(sklearn.__version__) >= version.parse("0.21.0"): + # str obtained from self.extension._get_sklearn_description(model) + fixture_description = "Pipeline of transforms with a final estimator.\n\nSequentially"\ + " apply a list of transforms and a final estimator.\n"\ + "Intermediate steps of the pipeline must be 'transforms', that "\ + "is, they\nmust implement fit and transform methods.\nThe final"\ + " estimator only needs to implement fit.\nThe transformers in "\ + "the pipeline can be cached using ``memory`` argument.\n\nThe "\ + "purpose of the pipeline is to assemble several steps that can "\ + "be\ncross-validated together while setting different "\ + "parameters.\nFor this, it enables setting parameters of the "\ + "various steps using their\nnames and the parameter name "\ + "separated by a '__', as in the example below.\nA step's "\ + "estimator may be replaced entirely by setting the parameter\n"\ + "with its name to another estimator, or a transformer removed by"\ + " setting\nit to 'passthrough' or ``None``." 
+ else: + fixture_description = self.extension._get_sklearn_description(model) + serialization = self.extension.model_to_flow(model) structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture_name) diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 95b4fa3f0..91c107b3d 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -95,7 +95,6 @@ def test_are_flows_equal(self): # Test most important values that can be set by a user openml.flows.functions.assert_flows_equal(flow, flow) for attribute, new_value in [('name', 'Tes'), - ('description', 'Test flo'), ('external_version', '2'), ('language', 'english'), ('dependencies', 'ab'), @@ -264,6 +263,13 @@ def test_sklearn_to_flow_list_of_lists(self): self.assertEqual(server_flow.parameters['categories'], '[[0, 1], [0, 1]]') self.assertEqual(server_flow.model.categories, flow.model.categories) + def test_get_flow1(self): + # Regression test for issue #305 + # Basically, this checks that a flow without an external version can be loaded + openml.config.server = self.production_server + flow = openml.flows.get_flow(1) + self.assertIsNone(flow.external_version) + def test_get_flow_reinstantiate_model(self): model = ensemble.RandomForestClassifier(n_estimators=33) extension = openml.extensions.get_extension_by_model(model) @@ -290,9 +296,37 @@ def test_get_flow_reinstantiate_model_wrong_version(self): openml.config.server = self.production_server _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3] flow = 8175 - expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.' + expected = ('Trying to deserialize a model with dependency' + ' sklearn==0.19.1 not satisfied.') self.assertRaisesRegex(ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True) + if LooseVersion(sklearn.__version__) > "0.19.1": + # 0.18 actually can't deserialize this because of incompatibility + flow = openml.flows.get_flow(flow_id=flow, reinstantiate=True, + strict_version=False) + # ensure that a new flow was created + assert flow.flow_id is None + assert "0.19.1" not in flow.dependencies + + def test_get_flow_id(self): + clf = sklearn.tree.DecisionTreeClassifier() + flow = openml.extensions.get_extension_by_model(clf).model_to_flow(clf).publish() + + self.assertEqual(openml.flows.get_flow_id(model=clf, exact_version=True), flow.flow_id) + flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False) + self.assertIn(flow.flow_id, flow_ids) + self.assertGreater(len(flow_ids), 2) + + # Check that the output of get_flow_id is identical if only the name is given, no matter + # whether exact_version is set to True or False. 
+        flow_ids_exact_version_True = openml.flows.get_flow_id(name=flow.name, exact_version=True)
+        flow_ids_exact_version_False = openml.flows.get_flow_id(
+            name=flow.name,
+            exact_version=False,
+        )
+        self.assertEqual(flow_ids_exact_version_True, flow_ids_exact_version_False)
+        self.assertIn(flow.flow_id, flow_ids_exact_version_True)
+        self.assertGreater(len(flow_ids_exact_version_True), 2)
diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py
index 88fe8d6ef..0266ca4d9 100644
--- a/tests/test_runs/test_run.py
+++ b/tests/test_runs/test_run.py
@@ -46,8 +46,8 @@ def _test_run_obj_equals(self, run, run_prime):
             other = getattr(run_prime, dictionary)
             if other is not None:
                 self.assertDictEqual(other, dict())
-        self.assertEqual(run._create_description_xml(),
-                         run_prime._create_description_xml())
+        self.assertEqual(run._to_xml(),
+                         run_prime._to_xml())
 
         numeric_part = \
             np.array(np.array(run.data_content)[:, 0:-2], dtype=float)
@@ -134,7 +134,7 @@ def test_to_from_filesystem_vanilla(self):
         TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
                                                             run_prime.run_id))
 
-    @pytest.mark.flaky(reruns=3)
+    @pytest.mark.flaky()
     def test_to_from_filesystem_search(self):
         model = Pipeline([
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index dc35d1f01..4ff39ac6d 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -7,6 +7,7 @@
 import unittest.mock
 
 import numpy as np
+import pytest
 
 import openml
 import openml.exceptions
@@ -78,7 +79,7 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
             if len(run.evaluations) > 0:
                 return
             else:
-                time.sleep(10)
+                time.sleep(3)
 
         raise RuntimeError('Could not find any evaluations! Please check whether run {} was '
                           'evaluated correctly on the server'.format(run_id))
@@ -205,7 +206,7 @@ def _remove_random_state(flow):
 
         # This is only a smoke check right now
         # TODO add a few asserts here
-        run._create_description_xml()
+        run._to_xml()
         if run.trace is not None:
             # This is only a smoke check right now
             # TODO add a few asserts here
@@ -419,7 +420,7 @@ def determine_grid_size(param_grid):
                 fold=0,
             )
         except openml.exceptions.OpenMLServerException as e:
-            e.additional = "%s; run_id %d" % (e.additional, run.run_id)
+            e.message = "%s; run_id %d" % (e.message, run.run_id)
             raise e
 
         self._rerun_model_and_compare_predictions(run.run_id, model_prime,
@@ -826,6 +827,7 @@ def test_initialize_model_from_run(self):
         self.assertEqual(flowS.components['VarianceThreshold'].
                          parameters['threshold'], '0.05')
 
+    @pytest.mark.flaky()
     def test_get_run_trace(self):
         # get_run_trace is already tested implicitly in test_run_and_publish
         # this test is a bit additional.
@@ -1110,10 +1112,21 @@ def test_get_run(self):
             self.assertEqual(run.fold_evaluations['f_measure'][0][i], value)
         assert ('weka' in run.tags)
         assert ('weka_3.7.12' in run.tags)
+        assert (
+            run.predictions_url == (
+                "https://www.openml.org/data/download/1667125/"
+                "weka_generated_predictions4575715871712251329.arff"
+            )
+        )
 
     def _check_run(self, run):
+        # This tests that the API returns seven entries for each run
+        # Check out https://openml.org/api/v1/xml/run/list/flow/1154
+        # They are run_id, task_id, task_type_id, setup_id, flow_id, uploader, upload_time
+        # error_message and run_details exist, too, but are not used so far. We need to update
+        # this check once they are used!
         self.assertIsInstance(run, dict)
-        self.assertEqual(len(run), 7)
+        assert len(run) == 7, str(run)
 
     def test_get_runs_list(self):
         # TODO: comes from live, no such lists on test
@@ -1235,9 +1248,9 @@ def test_get_runs_list_by_filters(self):
 
         runs = openml.runs.list_runs(id=ids, task=tasks, uploader=uploaders_1)
 
-    @unittest.skip("API currently broken: https://github.com/openml/OpenML/issues/948")
     def test_get_runs_list_by_tag(self):
         # TODO: comes from live, no such lists on test
+        # Unit test works on production server only
         openml.config.server = self.production_server
         runs = openml.runs.list_runs(tag='curves')
         self.assertGreaterEqual(len(runs), 1)
diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py
index 33ba0c452..e31a40cd2 100644
--- a/tests/test_study/test_study_functions.py
+++ b/tests/test_study/test_study_functions.py
@@ -34,7 +34,7 @@ def test_get_openml100(self):
         self.assertIsInstance(study, openml.study.OpenMLBenchmarkSuite)
         study_2 = openml.study.get_suite('OpenML100')
         self.assertIsInstance(study_2, openml.study.OpenMLBenchmarkSuite)
-        self.assertEqual(study.id, study_2.id)
+        self.assertEqual(study.study_id, study_2.study_id)
 
     def test_get_study_error(self):
         openml.config.server = self.production_server
@@ -76,14 +76,14 @@ def test_publish_benchmark_suite(self):
             description=fixture_descr,
             task_ids=fixture_task_ids
         )
-        study_id = study.publish()
-        TestBase._mark_entity_for_removal('study', study_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study_id))
+        study.publish()
+        TestBase._mark_entity_for_removal('study', study.id)
+        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study.id))
 
-        self.assertGreater(study_id, 0)
+        self.assertGreater(study.id, 0)
 
         # verify main meta data
-        study_downloaded = openml.study.get_suite(study_id)
+        study_downloaded = openml.study.get_suite(study.id)
         self.assertEqual(study_downloaded.alias, fixture_alias)
         self.assertEqual(study_downloaded.name, fixture_name)
         self.assertEqual(study_downloaded.description, fixture_descr)
@@ -98,34 +98,34 @@ def test_publish_benchmark_suite(self):
 
         # attach more tasks
         tasks_additional = [4, 5, 6]
-        openml.study.attach_to_study(study_id, tasks_additional)
-        study_downloaded = openml.study.get_suite(study_id)
+        openml.study.attach_to_study(study.id, tasks_additional)
+        study_downloaded = openml.study.get_suite(study.id)
         # verify again
         self.assertSetEqual(set(study_downloaded.tasks),
                             set(fixture_task_ids + tasks_additional))
 
         # test detach function
-        openml.study.detach_from_study(study_id, fixture_task_ids)
-        study_downloaded = openml.study.get_suite(study_id)
+        openml.study.detach_from_study(study.id, fixture_task_ids)
+        study_downloaded = openml.study.get_suite(study.id)
         self.assertSetEqual(set(study_downloaded.tasks),
                             set(tasks_additional))
 
         # test status update function
-        openml.study.update_suite_status(study_id, 'deactivated')
-        study_downloaded = openml.study.get_suite(study_id)
+        openml.study.update_suite_status(study.id, 'deactivated')
+        study_downloaded = openml.study.get_suite(study.id)
         self.assertEqual(study_downloaded.status, 'deactivated')
         # can't delete study, now it's not longer in preparation
 
     def test_publish_study(self):
         # get some random runs to attach
-        run_list = openml.runs.list_runs(size=10)
+        run_list = openml.evaluations.list_evaluations('predictive_accuracy', size=10)
         self.assertEqual(len(run_list), 10)
 
         fixt_alias = None
         fixt_name = 'unit tested study'
         fixt_descr = 'bla'
-        fixt_flow_ids = set([run['flow_id'] for run in run_list.values()])
-        fixt_task_ids = set([run['task_id'] for run in run_list.values()])
-        fixt_setup_ids = set([run['setup_id']for run in run_list.values()])
+        fixt_flow_ids = set([evaluation.flow_id for evaluation in run_list.values()])
+        fixt_task_ids = set([evaluation.task_id for evaluation in run_list.values()])
+        fixt_setup_ids = set([evaluation.setup_id for evaluation in run_list.values()])
 
         study = openml.study.create_study(
             alias=fixt_alias,
@@ -134,11 +134,11 @@ def test_publish_study(self):
             description=fixt_descr,
             run_ids=list(run_list.keys())
         )
-        study_id = study.publish()
+        study.publish()
         # not tracking upload for delete since _delete_entity called end of function
         # asserting return status from openml.study.delete_study()
-        self.assertGreater(study_id, 0)
-        study_downloaded = openml.study.get_study(study_id)
+        self.assertGreater(study.id, 0)
+        study_downloaded = openml.study.get_study(study.id)
         self.assertEqual(study_downloaded.alias, fixt_alias)
         self.assertEqual(study_downloaded.name, fixt_name)
         self.assertEqual(study_downloaded.description, fixt_descr)
@@ -149,27 +149,35 @@ def test_publish_study(self):
         self.assertSetEqual(set(study_downloaded.flows), set(fixt_flow_ids))
         self.assertSetEqual(set(study_downloaded.tasks), set(fixt_task_ids))
 
+        # test whether the list run function also handles study data fine
+        run_ids = openml.runs.list_runs(study=study.id)
+        self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
+
+        # test whether the list evaluation function also handles study data fine
+        run_ids = openml.evaluations.list_evaluations('predictive_accuracy', study=study.id)
+        self.assertSetEqual(set(run_ids), set(study_downloaded.runs))
+
         # attach more runs
         run_list_additional = openml.runs.list_runs(size=10, offset=10)
-        openml.study.attach_to_study(study_id,
+        openml.study.attach_to_study(study.id,
                                      list(run_list_additional.keys()))
-        study_downloaded = openml.study.get_study(study_id)
+        study_downloaded = openml.study.get_study(study.id)
         # verify again
         all_run_ids = set(run_list_additional.keys()) | set(run_list.keys())
         self.assertSetEqual(set(study_downloaded.runs), all_run_ids)
 
         # test detach function
-        openml.study.detach_from_study(study_id, list(run_list.keys()))
-        study_downloaded = openml.study.get_study(study_id)
+        openml.study.detach_from_study(study.id, list(run_list.keys()))
+        study_downloaded = openml.study.get_study(study.id)
         self.assertSetEqual(set(study_downloaded.runs),
                             set(run_list_additional.keys()))
 
         # test status update function
-        openml.study.update_study_status(study_id, 'deactivated')
-        study_downloaded = openml.study.get_study(study_id)
+        openml.study.update_study_status(study.id, 'deactivated')
+        study_downloaded = openml.study.get_study(study.id)
         self.assertEqual(study_downloaded.status, 'deactivated')
 
-        res = openml.study.delete_study(study_id)
+        res = openml.study.delete_study(study.id)
         self.assertTrue(res)
 
     def test_study_attach_illegal(self):
@@ -185,21 +193,21 @@ def test_study_attach_illegal(self):
             description='none',
             run_ids=list(run_list.keys())
         )
-        study_id = study.publish()
-        TestBase._mark_entity_for_removal('study', study_id)
-        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study_id))
-        study_original = openml.study.get_study(study_id)
+        study.publish()
+        TestBase._mark_entity_for_removal('study', study.id)
+        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study.id))
+        study_original = openml.study.get_study(study.id)
 
         with self.assertRaisesRegex(openml.exceptions.OpenMLServerException,
                                     'Problem attaching entities.'):
             # run id does not exists
-            openml.study.attach_to_study(study_id, [0])
+            openml.study.attach_to_study(study.id, [0])
 
         with self.assertRaisesRegex(openml.exceptions.OpenMLServerException,
                                     'Problem attaching entities.'):
             # some runs already attached
-            openml.study.attach_to_study(study_id, list(run_list_more.keys()))
-        study_downloaded = openml.study.get_study(study_id)
+            openml.study.attach_to_study(study.id, list(run_list_more.keys()))
+        study_downloaded = openml.study.get_study(study.id)
         self.assertListEqual(study_original.runs, study_downloaded.runs)
 
     def test_study_list(self):
diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py
index 168b798d1..53152acb5 100644
--- a/tests/test_tasks/test_clustering_task.py
+++ b/tests/test_tasks/test_clustering_task.py
@@ -40,10 +40,10 @@ def test_upload_task(self):
                     dataset_id=dataset_id,
                     estimation_procedure_id=self.estimation_procedure
                 )
-                task_id = task.publish()
-                TestBase._mark_entity_for_removal('task', task_id)
+                task = task.publish()
+                TestBase._mark_entity_for_removal('task', task.id)
                 TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                    task_id))
+                                                                    task.id))
                 # success
                 break
             except OpenMLServerException as e:
diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py
index 3066d9ce9..0154dc2a3 100644
--- a/tests/test_tasks/test_task.py
+++ b/tests/test_tasks/test_task.py
@@ -57,10 +57,10 @@ def test_upload_task(self):
                     estimation_procedure_id=self.estimation_procedure
                 )
 
-                task_id = task.publish()
-                TestBase._mark_entity_for_removal('task', task_id)
+                task.publish()
+                TestBase._mark_entity_for_removal('task', task.id)
                 TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
-                                                                    task_id))
+                                                                    task.id))
                 # success
                 break
             except OpenMLServerException as e:
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index f773752d5..fd64f805d 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -84,8 +84,6 @@ def test_list_tasks_empty(self):
 
         self.assertIsInstance(tasks, dict)
 
-    @unittest.skip("Server will currently incorrectly return only 99 tasks."
-                   "See https://github.com/openml/OpenML/issues/980")
     def test_list_tasks_by_tag(self):
         num_basic_tasks = 100  # number is flexible, check server if fails
         tasks = openml.tasks.list_tasks(tag='OpenML100')
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 1f754c23a..de2d18981 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -75,14 +75,14 @@ def test_list_all_for_setups(self):
         self.assertEqual(len(setups), required_size)
 
     def test_list_all_for_runs(self):
-        required_size = 48
+        required_size = 21
         runs = openml.runs.list_runs(batch_size=self._batch_size, size=required_size)
         # might not be on test server after reset, please rerun test at least once if fails
         self.assertEqual(len(runs), required_size)
 
     def test_list_all_for_evaluations(self):
-        required_size = 57
+        required_size = 22
         # TODO apparently list_evaluations function does not support kwargs
         evaluations = openml.evaluations.list_evaluations(function='predictive_accuracy',
                                                           size=required_size)