diff --git a/examples/20_basic/README.txt b/examples/20_basic/README.txt new file mode 100644 index 000000000..29c787116 --- /dev/null +++ b/examples/20_basic/README.txt @@ -0,0 +1,4 @@ +Introductory Examples +===================== + +Introductory examples to the usage of the OpenML python connector. diff --git a/examples/introduction_tutorial.py b/examples/20_basic/introduction_tutorial.py similarity index 97% rename from examples/introduction_tutorial.py rename to examples/20_basic/introduction_tutorial.py index 9cd88ceba..c54bb7b96 100644 --- a/examples/introduction_tutorial.py +++ b/examples/20_basic/introduction_tutorial.py @@ -1,8 +1,8 @@ """ -Introduction -============ +Setup +===== -An introduction to OpenML, followed up by a simple example. +An example of how to set up OpenML-Python, followed by a simple example. """ ############################################################################ # OpenML is an online collaboration platform for machine learning which allows diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py new file mode 100644 index 000000000..6239cbb15 --- /dev/null +++ b/examples/20_basic/simple_datasets_tutorial.py @@ -0,0 +1,29 @@ +""" +======== +Datasets +======== + +A basic tutorial on how to list and download datasets. 
+""" +############################################################################ +import openml + +############################################################################ +# List datasets +# ============= + +datasets_df = openml.datasets.list_datasets(output_format='dataframe') +print(datasets_df.head(n=10)) + +############################################################################ +# Download a dataset +# ================== + +first_dataset_id = int(datasets_df['did'].iloc[0]) +dataset = openml.datasets.get_dataset(first_dataset_id) + +# Print a summary +print("This is dataset '%s', the target feature is '%s'" % + (dataset.name, dataset.default_target_attribute)) +print("URL: %s" % dataset.url) +print(dataset.description[:500]) diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py new file mode 100644 index 000000000..e3f028418 --- /dev/null +++ b/examples/20_basic/simple_flows_and_runs_tutorial.py @@ -0,0 +1,48 @@ +""" +Flows and Runs +============== + +A simple tutorial on how to train/run a model and how to upload the results. +""" + +import openml +from sklearn import ensemble, neighbors + +############################################################################ +# Train a machine learning model +# ============================== +# +# .. warning:: This example uploads data. For that reason, this example +# connects to the test server at test.openml.org. This prevents the main +# server from crowding with example datasets, tasks, runs, and so on. 
+ +openml.config.start_using_configuration_for_example() + +# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20 +dataset = openml.datasets.get_dataset(20) +X, y, categorical_indicator, attribute_names = dataset.get_data( + dataset_format='array', + target=dataset.default_target_attribute +) +clf = neighbors.KNeighborsClassifier(n_neighbors=3) +clf.fit(X, y) + +############################################################################ +# Running a model on a task +# ========================= + +task = openml.tasks.get_task(119) +clf = ensemble.RandomForestClassifier() +run = openml.runs.run_model_on_task(clf, task) +print(run) + +############################################################################ +# Publishing the run +# ================== + +myrun = run.publish() +print("Run was uploaded to http://test.openml.org/r/" + str(myrun.run_id)) +print("The flow can be found at http://test.openml.org/f/" + str(myrun.flow_id)) + +############################################################################ +openml.config.stop_using_configuration_for_example() diff --git a/examples/20_basic/simple_studies_tutorial.py b/examples/20_basic/simple_studies_tutorial.py new file mode 100644 index 000000000..5198edf66 --- /dev/null +++ b/examples/20_basic/simple_studies_tutorial.py @@ -0,0 +1,7 @@ +""" +======= +Studies +======= + +This is only a placeholder so far. +""" diff --git a/examples/30_extended/README.txt b/examples/30_extended/README.txt new file mode 100644 index 000000000..432fa68f0 --- /dev/null +++ b/examples/30_extended/README.txt @@ -0,0 +1,4 @@ +In-Depth Examples +================= + +Extended examples for the usage of the OpenML python connector. 
\ No newline at end of file diff --git a/examples/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py similarity index 100% rename from examples/create_upload_tutorial.py rename to examples/30_extended/create_upload_tutorial.py diff --git a/examples/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py similarity index 99% rename from examples/datasets_tutorial.py rename to examples/30_extended/datasets_tutorial.py index 70da03d15..e94476c2c 100644 --- a/examples/datasets_tutorial.py +++ b/examples/30_extended/datasets_tutorial.py @@ -14,6 +14,7 @@ # ********** # # * List datasets +# # * Use the output_format parameter to select output type # * Default gives 'dict' (other option: 'dataframe') diff --git a/examples/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py similarity index 100% rename from examples/fetch_evaluations_tutorial.py rename to examples/30_extended/fetch_evaluations_tutorial.py diff --git a/examples/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py similarity index 99% rename from examples/flows_and_runs_tutorial.py rename to examples/30_extended/flows_and_runs_tutorial.py index d65abdf28..6603b6621 100644 --- a/examples/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -132,7 +132,7 @@ # The run may be stored offline, and the flow will be stored along with it: run.to_filesystem(directory='myrun') -# They made later be loaded and uploaded +# They may be loaded and uploaded at a later time run = openml.runs.OpenMLRun.from_filesystem(directory='myrun') run.publish() diff --git a/examples/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py similarity index 100% rename from examples/run_setup_tutorial.py rename to examples/30_extended/run_setup_tutorial.py diff --git a/examples/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py similarity index 87% rename from examples/tasks_tutorial.py rename to 
examples/30_extended/tasks_tutorial.py index c54ecdbd9..f7a28ef92 100644 --- a/examples/tasks_tutorial.py +++ b/examples/30_extended/tasks_tutorial.py @@ -13,15 +13,14 @@ # Tasks are identified by IDs and can be accessed in two different ways: # # 1. In a list providing basic information on all tasks available on OpenML. -# This function will not download the actual tasks, but will instead download -# meta data that can be used to filter the tasks and retrieve a set of IDs. -# We can filter this list, for example, we can only list tasks having a -# special tag or only tasks for a specific target such as -# *supervised classification*. -# +# This function will not download the actual tasks, but will instead download +# meta data that can be used to filter the tasks and retrieve a set of IDs. +# We can filter this list, for example, we can only list tasks having a +# special tag or only tasks for a specific target such as +# *supervised classification*. # 2. A single task by its ID. It contains all meta information, the target -# metric, the splits and an iterator which can be used to access the -# splits in a useful manner. +# metric, the splits and an iterator which can be used to access the +# splits in a useful manner. ############################################################################ # Listing tasks @@ -148,9 +147,9 @@ # # You can also create new tasks. Take the following into account: # -# * You can only create tasks on _active_ datasets +# * You can only create tasks on *active* datasets # * For now, only the following tasks are supported: classification, regression, -# clustering, and learning curve analysis. +# clustering, and learning curve analysis. # * For now, tasks can only be created on a single dataset. # * The exact same task must not already exist. # @@ -158,10 +157,9 @@ # # * task_type_id: The task type ID, required (see below). Required. # * dataset_id: The dataset ID. Required. 
-# * target_name: The name of the attribute you aim to predict. -# Optional. +# * target_name: The name of the attribute you aim to predict. Optional. # * estimation_procedure_id : The ID of the estimation procedure used to create train-test -# splits. Optional. +# splits. Optional. # * evaluation_measure: The name of the evaluation measure. Optional. # * Any additional inputs for specific tasks # @@ -178,7 +176,7 @@ # # Let's create a classification task on a dataset. In this example we will do this on the # Iris dataset (ID=128 (on test server)). We'll use 10-fold cross-validation (ID=1), -# and _predictive accuracy_ as the predefined measure (this can also be left open). +# and *predictive accuracy* as the predefined measure (this can also be left open). # If a task with these parameters exist, we will get an appropriate exception. # If such a task doesn't exist, a task will be created and the corresponding task_id # will be returned. @@ -212,8 +210,7 @@ ############################################################################ -# [Complete list of task types](https://www.openml.org/search?type=task_type) -# [Complete list of model estimation procedures]( -# https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure) -# [Complete list of evaluation measures]( -# https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure) +# * `Complete list of task types <https://www.openml.org/search?type=task_type>`_. +# * `Complete list of model estimation procedures <https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure>`_. +# * `Complete list of evaluation measures <https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure>`_. +# diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py new file mode 100644 index 000000000..106d120df --- /dev/null +++ b/examples/40_paper/2015_neurips_feurer_example.py @@ -0,0 +1,18 @@ +""" +Feurer et al. (2015) +==================== + +A tutorial on how to get the datasets used in the paper introducing *Auto-sklearn* by Feurer et al. 
+ +Auto-sklearn website: https://automl.github.io/auto-sklearn/master/ + +Publication +~~~~~~~~~~~ + +| Efficient and Robust Automated Machine Learning +| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter +| In *Advances in Neural Information Processing Systems 28*, 2015 +| Available at http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf + +This is currently a placeholder. +""" diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py new file mode 100644 index 000000000..21165a8ff --- /dev/null +++ b/examples/40_paper/2018_ida_strang_example.py @@ -0,0 +1,17 @@ +""" +Strang et al. (2018) +==================== + +A tutorial on how to reproduce the analysis conducted for *Don't Rule Out Simple Models +Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML*. + +Publication +~~~~~~~~~~~ + +| Don't Rule Out Simple Models Prematurely: A Large Scale Benchmark Comparing Linear and Non-linear Classifiers in OpenML +| Benjamin Strang, Pieter Putten, Jan van Rijn and Frank Hutter +| In *Advances in Intelligent Data Analysis XVII 17th International Symposium*, 2018 +| Available at https://link.springer.com/chapter/10.1007%2F978-3-030-01768-2_25 + +This is currently a placeholder. +""" diff --git a/examples/40_paper/2018_kdd_rijn_example.py b/examples/40_paper/2018_kdd_rijn_example.py new file mode 100644 index 000000000..5e03f09a4 --- /dev/null +++ b/examples/40_paper/2018_kdd_rijn_example.py @@ -0,0 +1,16 @@ +""" +van Rijn and Hutter (2018) +========================== + +A tutorial on how to reproduce the paper *Hyperparameter Importance Across Datasets*. 
+ +Publication +~~~~~~~~~~~ + +| Hyperparameter importance across datasets +| Jan van Rijn and Frank Hutter +| In *Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining*, 2018 +| Available at https://dl.acm.org/citation.cfm?id=3220058 + +This is currently a placeholder. +""" diff --git a/examples/40_paper/2018_neurips_fusi_example.py b/examples/40_paper/2018_neurips_fusi_example.py new file mode 100644 index 000000000..656f617fa --- /dev/null +++ b/examples/40_paper/2018_neurips_fusi_example.py @@ -0,0 +1,17 @@ +""" +Fusi et al. (2018) +================== + +A tutorial on how to get the datasets used in the paper introducing *Probabilistic Matrix +Factorization for Automated Machine Learning* by Fusi et al. + +Publication +~~~~~~~~~~~ + +| Probabilistic Matrix Factorization for Automated Machine Learning +| Nicolo Fusi and Rishit Sheth and Melih Elibol +| In *Advances in Neural Information Processing Systems 31*, 2018 +| Available at http://papers.nips.cc/paper/7595-probabilistic-matrix-factorization-for-automated-machine-learning.pdf + +This is currently a placeholder. +""" diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py new file mode 100644 index 000000000..3262ee4a1 --- /dev/null +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -0,0 +1,17 @@ +""" +Perrone et al. (2018) +===================== + +A tutorial on how to build a surrogate model based on OpenML data as done for *Scalable +Hyperparameter Transfer Learning* by Perrone et al. + +Publication +~~~~~~~~~~~ + +| Scalable Hyperparameter Transfer Learning +| Valerio Perrone and Rodolphe Jenatton and Matthias Seeger and Cedric Archambeau +| In *Advances in Neural Information Processing Systems 31*, 2018 +| Available at http://papers.nips.cc/paper/7917-scalable-hyperparameter-transfer-learning.pdf + +This is currently a placeholder. 
+""" diff --git a/examples/40_paper/README.txt b/examples/40_paper/README.txt new file mode 100644 index 000000000..9b571d55b --- /dev/null +++ b/examples/40_paper/README.txt @@ -0,0 +1,5 @@ +Usage in research papers +======================== + +These examples demonstrate how OpenML-Python can be used for research purposes by re-implementing +its use in recent publications. diff --git a/examples/README.txt b/examples/README.txt index e41bfd4fc..b90c0e1cb 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -1,4 +1,3 @@ -Introductory Examples -===================== - -General examples for OpenML usage. +======== +Examples +======== diff --git a/examples/sklearn/README.txt b/examples/sklearn/README.txt deleted file mode 100644 index d61578cf1..000000000 --- a/examples/sklearn/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -Experiment Examples -=================== - -OpenML experiment examples using a sklearn classifier/pipeline. diff --git a/examples/sklearn/openml_run_example.py b/examples/sklearn/openml_run_example.py deleted file mode 100644 index 195a0aa77..000000000 --- a/examples/sklearn/openml_run_example.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -OpenML Run Example -================== - -An example of an automated machine learning experiment. -""" -import openml -from sklearn import impute, tree, pipeline - -############################################################################ -# .. warning:: This example uploads data. For that reason, this example -# connects to the test server at test.openml.org. This prevents the main -# server from crowding with example datasets, tasks, runs, and so on. - -openml.config.start_using_configuration_for_example() -############################################################################ - -# Uncomment and set your OpenML key. Don't share your key with others. 
-# openml.config.apikey = 'YOURKEY' - -# Define a scikit-learn pipeline -clf = pipeline.Pipeline( - steps=[ - ('imputer', impute.SimpleImputer()), - ('estimator', tree.DecisionTreeClassifier()) - ] -) -############################################################################ -# Download the OpenML task for the german credit card dataset. -task = openml.tasks.get_task(97) -############################################################################ -# Run the scikit-learn model on the task (requires an API key). -run = openml.runs.run_model_on_task(clf, task) -# Publish the experiment on OpenML (optional, requires an API key). -run.publish() - -print('URL for run: %s/run/%d' % (openml.config.server, run.run_id)) - -############################################################################ -openml.config.stop_using_configuration_for_example() diff --git a/setup.cfg b/setup.cfg index 726c8fa73..156baa3bb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,11 @@ description-file = README.md [tool:pytest] filterwarnings = ignore:the matrix subclass:PendingDeprecationWarning + +[flake8] +exclude = + # the following file and directory can be removed when the descriptions + # are shortened. More info at: + # https://travis-ci.org/openml/openml-python/jobs/590382001 + examples/30_extended/tasks_tutorial.py + examples/40_paper \ No newline at end of file