Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 18 additions & 29 deletions examples/40_paper/2018_neurips_perrone_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
############################################################################
# The following functions fetch tasks, flows and evaluations, and preprocess them into
# a tabular format that can be used to build models.
#


def fetch_evaluations(run_full=False,
flow_type='svm',
Expand Down Expand Up @@ -79,25 +79,25 @@ def fetch_evaluations(run_full=False,
3492, 3493, 37, 3896, 3903, 3913, 3917, 3918, 3, 49, 9914,
9946, 9952, 9967,
]
else: #flow_type == 'xgboost' and not run_full:
else: # flow_type == 'xgboost' and not run_full:
task_ids = [3903, 37, 3485, 49, 3913]

# Fetching the relevant flow
flow_id = 5891 if flow_type == 'svm' else 6767

# Fetching evaluations
eval_df = openml.evaluations.list_evaluations(function=metric,
task=task_ids,
flow=[flow_id],
uploader=[2702],
output_format='dataframe')
eval_df = openml.evaluations.list_evaluations_setups(function=metric,
task=task_ids,
flow=[flow_id],
uploader=[2702],
output_format='dataframe',
parameters_in_separate_columns=True)
return eval_df, task_ids, flow_id


def create_table_from_evaluations(eval_df,
flow_type='svm',
run_count=np.iinfo(np.int64).max,
metric = 'area_under_roc_curve',
task_ids=None):
'''
Create tabular data with its ground truth from a dataframe of evaluations.
Expand All @@ -111,8 +111,6 @@ def create_table_from_evaluations(eval_df,
To select whether svm or xgboost experiments are to be run
run_count : int
Maximum size of the table created, or number of runs included in the table
metric : str
The evaluation measure that is passed to openml.evaluations.list_evaluations
task_ids : list, (optional)
List of integers specifying the tasks to be retained from the evaluations dataframe

Expand All @@ -132,18 +130,11 @@ def create_table_from_evaluations(eval_df,
'subsample',
]
eval_df = eval_df.sample(frac=1) # shuffling rows
run_ids = eval_df["run_id"][:run_count]
eval_table = pd.DataFrame(np.nan, index=run_ids, columns=colnames)
values = []
runs = openml.runs.get_runs(run_ids)
for r in runs:
params = r.parameter_settings
for p in params:
name, value = p['oml:name'], p['oml:value']
if name in colnames:
eval_table.loc[r.run_id, name] = value
values.append(r.evaluations[metric])
return eval_table, values
eval_df = eval_df.iloc[:run_count, :]
eval_df.columns = [column.split('_')[-1] for column in eval_df.columns]
eval_table = eval_df.loc[:, colnames]
value = eval_df.loc[:, 'value']
return eval_table, value


def list_categorical_attributes(flow_type='svm'):
Expand All @@ -160,9 +151,7 @@ def list_categorical_attributes(flow_type='svm'):
# pre-processing all retrieved evaluations.

eval_df, task_ids, flow_id = fetch_evaluations(run_full=False, flow_type=flow_type)
# run_count does not need to be passed if all the results are required;
# it is set to 500 here arbitrarily to get results quickly
X, y = create_table_from_evaluations(eval_df, run_count=500, flow_type=flow_type)
X, y = create_table_from_evaluations(eval_df, flow_type=flow_type)
print(X.head())
print("Y : ", y[:5])

Expand All @@ -176,8 +165,6 @@ def list_categorical_attributes(flow_type='svm'):
# Separating data into categorical and non-categorical (numeric for this example) columns
cat_cols = list_categorical_attributes(flow_type=flow_type)
num_cols = list(set(X.columns) - set(cat_cols))
X_cat = X.loc[:, cat_cols]
X_num = X.loc[:, num_cols]

# Missing value imputers
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None')
Expand All @@ -187,7 +174,7 @@ def list_categorical_attributes(flow_type='svm'):
enc = OneHotEncoder(handle_unknown='ignore')

# Pipeline to handle categorical column transformations
cat_transforms = Pipeline([('impute', cat_imputer), ('encode', enc)])
cat_transforms = Pipeline(steps=[('impute', cat_imputer), ('encode', enc)])

# Combining column transformers
ct = ColumnTransformer([('cat', cat_transforms, cat_cols), ('num', num_imputer, num_cols)])
Expand All @@ -207,7 +194,7 @@ def list_categorical_attributes(flow_type='svm'):
# Selecting a task for the surrogate
task_id = task_ids[-1]
print("Task ID : ", task_id)
X, y = create_table_from_evaluations(eval_df, run_count=1000, task_ids=[task_id], flow_type='svm')
X, y = create_table_from_evaluations(eval_df, task_ids=[task_id], flow_type='svm')

model.fit(X, y)
y_pred = model.predict(X)
Expand All @@ -224,6 +211,7 @@ def list_categorical_attributes(flow_type='svm'):
#
# NOTE: This section is written exclusively for the SVM flow


# Sampling random configurations
def random_sample_configurations(num_samples=100):
colnames = ['cost', 'degree', 'gamma', 'kernel']
Expand All @@ -240,6 +228,7 @@ def random_sample_configurations(num_samples=100):
X.iloc[:, i] = col_val
return X


configs = random_sample_configurations(num_samples=1000)
print(configs)

Expand Down