91 changes: 83 additions & 8 deletions examples/tasks_tutorial.py
@@ -7,7 +7,6 @@

import openml
import pandas as pd
from pprint import pprint

############################################################################
#
@@ -40,11 +39,11 @@
tasks = pd.DataFrame.from_dict(tasks, orient='index')
print(tasks.columns)
print("First 5 of %s tasks:" % len(tasks))
pprint(tasks.head())
print(tasks.head())

# The same can be obtained with fewer lines of code:
tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe')
pprint(tasks_df.head())
print(tasks_df.head())

############################################################################
# We can filter the list of tasks to only contain datasets with more than
@@ -78,22 +77,22 @@
tasks = openml.tasks.list_tasks(tag='OpenML100')
tasks = pd.DataFrame.from_dict(tasks, orient='index')
print("First 5 of %s tasks:" % len(tasks))
pprint(tasks.head())
print(tasks.head())

############################################################################
# Furthermore, we can list tasks based on the dataset id:

tasks = openml.tasks.list_tasks(data_id=1471)
tasks = pd.DataFrame.from_dict(tasks, orient='index')
print("First 5 of %s tasks:" % len(tasks))
pprint(tasks.head())
print(tasks.head())

############################################################################
# In addition, a size limit and an offset can be applied both separately and simultaneously:

tasks = openml.tasks.list_tasks(size=10, offset=50)
tasks = pd.DataFrame.from_dict(tasks, orient='index')
pprint(tasks)
print(tasks)
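
############################################################################
# A small sketch of applying only one of the two parameters (``offset`` can be
# used on its own in exactly the same way); ``task_type_id`` is added here
# merely to keep the listing short:

tasks = openml.tasks.list_tasks(task_type_id=1, size=10)
tasks = pd.DataFrame.from_dict(tasks, orient='index')
print(tasks)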

############################################################################
#
@@ -134,11 +133,87 @@
############################################################################
# Properties of the task are stored as member variables:

pprint(vars(task))
print(vars(task))
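
############################################################################
# A minimal sketch of reading a few of those members directly; the attribute
# names are the ones printed by ``vars(task)`` above, and
# ``get_train_test_split_indices`` is assumed to be available because the
# downloaded task is a supervised task:

print(task.task_id, task.dataset_id)
train_indices, test_indices = task.get_train_test_split_indices()
print(len(train_indices), len(test_indices))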

############################################################################
# Several tasks can also be downloaded at once by passing a list of task IDs:

ids = [2, 1891, 31, 9983]
tasks = openml.tasks.get_tasks(ids)
pprint(tasks[0])
print(tasks[0])

############################################################################
# Creating tasks
# ^^^^^^^^^^^^^^
#
# You can also create new tasks. Take the following into account:
#
# * You can only create tasks on *active* datasets.
# * For now, only the following tasks are supported: classification, regression,
# clustering, and learning curve analysis.
# * For now, tasks can only be created on a single dataset.
# * The exact same task must not already exist.
#
# Creating a task requires the following input:
#
# * task_type_id: The task type ID (see below). Required.
# * dataset_id: The dataset ID. Required.
# * target_name: The name of the attribute you aim to predict.
# Optional.
# * estimation_procedure_id: The ID of the estimation procedure used to create train-test
# splits. Optional.
# * evaluation_measure: The name of the evaluation measure. Optional.
# * Any additional inputs for specific tasks
#
# It is best to leave the evaluation measure open if there is no strong prerequisite for a
# specific measure. OpenML will always compute all appropriate measures and you can filter
# or sort results on your favourite measure afterwards. Only add an evaluation measure if
# necessary (e.g. when other measures make no sense), since it will create a new task, which
# scatters results across tasks.
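
############################################################################
# As a minimal sketch, such an "open" task definition simply omits the
# evaluation measure; the dataset and estimation procedure IDs below are the
# same ones used in the example that follows, and the task is only
# constructed here, not published:

open_task = openml.tasks.create_task(
    task_type_id=openml.tasks.TaskTypeEnum.SUPERVISED_CLASSIFICATION,
    dataset_id=128,
    target_name="class",
    estimation_procedure_id=1)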


############################################################################
# Example
# #######
#
# Let's create a classification task on a dataset. In this example we will do this on the
# Iris dataset (ID=128 on the test server). We'll use 10-fold cross-validation
# (estimation procedure ID=1), and *predictive accuracy* as the predefined measure (this
# can also be left open). If a task with these parameters already exists, we will get an
# appropriate exception. If such a task doesn't exist, a task will be created and the
# corresponding task_id will be returned.


# using test server for example uploads
openml.config.start_using_configuration_for_example()

try:
    tasktypes = openml.tasks.TaskTypeEnum
    my_task = openml.tasks.create_task(
        task_type_id=tasktypes.SUPERVISED_CLASSIFICATION,
        dataset_id=128,
        target_name="class",
        evaluation_measure="predictive_accuracy",
        estimation_procedure_id=1)
    my_task.publish()
except openml.exceptions.OpenMLServerException as e:
    # Error code for 'task already exists'
    if e.code == 614:
        # Lookup task
        tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe').to_numpy()
        # The numeric indices select the task type, estimation procedure and
        # evaluation measure columns of the listing, respectively.
        tasks = tasks[tasks[:, 4] == "Supervised Classification"]
        tasks = tasks[tasks[:, 6] == "10-fold Crossvalidation"]
        tasks = tasks[tasks[:, 19] == "predictive_accuracy"]
        task_id = tasks[0][0]
        print("Task already exists. Task ID is", task_id)

# reverting to prod server
openml.config.stop_using_configuration_for_example()


############################################################################
# * `Complete list of task types <https://www.openml.org/search?type=task_type>`_
# * `Complete list of model estimation procedures
#   <https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure>`_
# * `Complete list of evaluation measures
#   <https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure>`_
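
############################################################################
# The numeric task type IDs expected by ``create_task`` can also be inspected
# from within Python. A small sketch; ``list_evaluation_measures`` is assumed
# to be available in the installed version of the package:

task_types = {name: value
              for name, value in vars(openml.tasks.TaskTypeEnum).items()
              if not name.startswith('_')}
print(task_types)
print(openml.evaluations.list_evaluation_measures())
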
1 change: 0 additions & 1 deletion tests/test_tasks/test_clustering_task.py
@@ -40,7 +40,6 @@ def test_upload_task(self):
            dataset_id=dataset_id,
            estimation_procedure_id=self.estimation_procedure
        )

        task_id = task.publish()
        TestBase._mark_entity_for_removal('task', task_id)
        TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],