From 964ae8500bf7d979e9d257694c951bcdb1683440 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 12 Nov 2019 14:32:33 +0100 Subject: [PATCH 1/5] add better error message for too-long URI --- openml/_api_calls.py | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 888afa18e..ce05c62d3 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -3,7 +3,6 @@ import time import logging import requests -import warnings import xmltodict from typing import Dict @@ -47,20 +46,27 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): url = url.replace('=', '%3d') logging.info('Starting [%s] request for the URL %s', request_method, url) start = time.time() + if file_elements is not None: if request_method != 'post': - raise ValueError('request method must be post when file elements ' - 'are present') + raise ValueError('request method must be post when file elements are present') response = _read_url_files(url, data=data, file_elements=file_elements) else: response = _read_url(url, request_method, data) + + if response.status_code != 200: + raise _parse_server_exception(response, url, file_elements=file_elements) + elif 'Content-Encoding' not in response.headers or \ + response.headers['Content-Encoding'] != 'gzip': + logging.warning('Received uncompressed content from OpenML for {}.'.format(url)) + logging.info( '%.7fs taken for [%s] request for the URL %s', time.time() - start, request_method, url, ) - return response + return response.text def _file_id_to_url(file_id, filename=None): @@ -91,13 +97,7 @@ def _read_url_files(url, data=None, file_elements=None): data=data, files=file_elements, ) - if response.status_code != 200: - raise _parse_server_exception(response, url, file_elements=file_elements) - if 'Content-Encoding' not in response.headers or \ - response.headers['Content-Encoding'] != 'gzip': - warnings.warn('Received uncompressed content from OpenML for {}.' - .format(url)) - return response.text + return response def _read_url(url, request_method, data=None): @@ -105,14 +105,7 @@ def _read_url(url, request_method, data=None): if config.apikey is not None: data['api_key'] = config.apikey - response = send_request(request_method=request_method, url=url, data=data) - if response.status_code != 200: - raise _parse_server_exception(response, url, file_elements=None) - if 'Content-Encoding' not in response.headers or \ - response.headers['Content-Encoding'] != 'gzip': - warnings.warn('Received uncompressed content from OpenML for {}.' - .format(url)) - return response.text + return send_request(request_method=request_method, url=url, data=data) def send_request( @@ -159,9 +152,12 @@ def _parse_server_exception( try: server_exception = xmltodict.parse(response.text) except Exception: - raise OpenMLServerError( - 'Unexpected server error when calling {}. Please contact the developers!\n' - 'Status code: {}\n{}'.format(url, response.status_code, response.text)) + if response.status_code == 414: + raise OpenMLServerError('URI too long! ({})'.format(url)) + else: + raise OpenMLServerError( + 'Unexpected server error when calling {}. Please contact the developers!\n' + 'Status code: {}\n{}'.format(url, response.status_code, response.text)) server_error = server_exception['oml:error'] code = int(server_error['oml:code']) From a5da1959deba3ad38c5f9d09cf847189b747c7af Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 12 Nov 2019 15:56:03 +0100 Subject: [PATCH 2/5] improve error handling --- openml/_api_calls.py | 78 +++++++++++++++++++++++++++++++++--- openml/datasets/functions.py | 2 +- openml/tasks/task.py | 10 ++--- openml/utils.py | 51 ----------------------- 4 files changed, 77 insertions(+), 64 deletions(-) diff --git a/openml/_api_calls.py b/openml/_api_calls.py index ce05c62d3..4742f9d98 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -1,6 +1,7 @@ # License: BSD 3-Clause import time +import hashlib import logging import requests import xmltodict @@ -8,7 +9,7 @@ from . import config from .exceptions import (OpenMLServerError, OpenMLServerException, - OpenMLServerNoResult) + OpenMLServerNoResult, OpenMLHashException) def _perform_api_call(call, request_method, data=None, file_elements=None): @@ -54,11 +55,7 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): else: response = _read_url(url, request_method, data) - if response.status_code != 200: - raise _parse_server_exception(response, url, file_elements=file_elements) - elif 'Content-Encoding' not in response.headers or \ - response.headers['Content-Encoding'] != 'gzip': - logging.warning('Received uncompressed content from OpenML for {}.'.format(url)) + _check_response(response, url, file_elements) logging.info( '%.7fs taken for [%s] request for the URL %s', @@ -69,6 +66,75 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): return response.text +def _download_text_file(source: str, + output_path: str, + md5_checksum: str = None, + exists_ok: bool = True, + encoding: str = 'utf8', + ) -> None: + """ Download the text file at `source` and store it in `output_path`. + + By default, do nothing if a file already exists in `output_path`. + The downloaded file can be checked against an expected md5 checksum. + + Parameters + ---------- + source : str + url of the file to be downloaded + output_path : str + full path, including filename, of where the file should be stored. + md5_checksum : str, optional (default=None) + If not None, should be a string of hexidecimal digits of the expected digest value. + exists_ok : bool, optional (default=True) + If False, raise an FileExistsError if there already exists a file at `output_path`. + encoding : str, optional (default='utf8') + The encoding with which the file should be stored. + """ + try: + with open(output_path, encoding=encoding): + if exists_ok: + return + else: + raise FileExistsError + except FileNotFoundError: + pass + + logging.info('Starting [%s] request for the URL %s', 'get', source) + start = time.time() + response = _read_url(source, request_method='get') + _check_response(response, source, None) + downloaded_file = response.text + + if md5_checksum is not None: + md5 = hashlib.md5() + md5.update(downloaded_file.encode('utf-8')) + md5_checksum_download = md5.hexdigest() + if md5_checksum != md5_checksum_download: + raise OpenMLHashException( + 'Checksum {} of downloaded file is unequal to the expected checksum {}.' + .format(md5_checksum_download, md5_checksum)) + + with open(output_path, "w", encoding=encoding) as fh: + fh.write(downloaded_file) + + logging.info( + '%.7fs taken for [%s] request for the URL %s', + time.time() - start, + 'get', + source, + ) + + del downloaded_file + + +def _check_response(response, url, file_elements): + if response.status_code != 200: + raise _parse_server_exception(response, url, file_elements=file_elements) + elif 'Content-Encoding' not in response.headers or \ + response.headers['Content-Encoding'] != 'gzip': + logging.warning('Received uncompressed content from OpenML for {}.'.format(url)) + + def _file_id_to_url(file_id, filename=None): """ Presents the URL how to download a given file id diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index e85c55aa3..07d67717d 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -886,7 +886,7 @@ def _get_dataset_arff(description: Union[Dict, OpenMLDataset], output_file_path = os.path.join(cache_directory, "dataset.arff") try: - openml.utils._download_text_file( + openml._api_calls._download_text_file( source=url, output_path=output_file_path, md5_checksum=md5_checksum_fixture diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 0b79c2eca..fd07df5ab 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -116,12 +116,10 @@ def _download_split(self, cache_file: str): pass except (OSError, IOError): split_url = self.estimation_procedure["data_splits_url"] - split_arff = openml._api_calls._read_url(split_url, - request_method='get') - - with io.open(cache_file, "w", encoding='utf8') as fh: - fh.write(split_arff) - del split_arff + openml._api_calls._download_text_file( + source=split_url, + output_path=cache_file, + ) def download_split(self) -> OpenMLSplit: """Download the OpenML split for a given task. diff --git a/openml/utils.py b/openml/utils.py index 09a0f6a83..2815f1afd 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -1,7 +1,6 @@ # License: BSD 3-Clause import os -import hashlib import xmltodict import shutil from typing import TYPE_CHECKING, List, Tuple, Union, Type @@ -366,53 +365,3 @@ def _create_lockfiles_dir(): except OSError: pass return dir - - -def _download_text_file(source: str, - output_path: str, - md5_checksum: str = None, - exists_ok: bool = True, - encoding: str = 'utf8', - ) -> None: - """ Download the text file at `source` and store it in `output_path`. - - By default, do nothing if a file already exists in `output_path`. - The downloaded file can be checked against an expected md5 checksum. - - Parameters - ---------- - source : str - url of the file to be downloaded - output_path : str - full path, including filename, of where the file should be stored. - md5_checksum : str, optional (default=None) - If not None, should be a string of hexidecimal digits of the expected digest value. - exists_ok : bool, optional (default=True) - If False, raise an FileExistsError if there already exists a file at `output_path`. - encoding : str, optional (default='utf8') - The encoding with which the file should be stored. - """ - try: - with open(output_path, encoding=encoding): - if exists_ok: - return - else: - raise FileExistsError - except FileNotFoundError: - pass - - downloaded_file = openml._api_calls._read_url(source, request_method='get') - - if md5_checksum is not None: - md5 = hashlib.md5() - md5.update(downloaded_file.encode('utf-8')) - md5_checksum_download = md5.hexdigest() - if md5_checksum != md5_checksum_download: - raise openml.exceptions.OpenMLHashException( - 'Checksum {} of downloaded file is unequal to the expected checksum {}.' - .format(md5_checksum_download, md5_checksum)) - - with open(output_path, "w", encoding=encoding) as fh: - fh.write(downloaded_file) - - del downloaded_file From f05ffc3d069f81b30ecfa28a0a14c011410edef0 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 12 Nov 2019 17:39:36 +0100 Subject: [PATCH 3/5] improve data download function, fix bugs --- openml/_api_calls.py | 56 ++++++++++++++++++++++-------------- openml/datasets/functions.py | 6 ++-- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 4742f9d98..5be9a2d48 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -5,7 +5,7 @@ import logging import requests import xmltodict -from typing import Dict +from typing import Dict, Optional from . import config from .exceptions import (OpenMLServerError, OpenMLServerException, @@ -67,11 +67,11 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): def _download_text_file(source: str, - output_path: str, + output_path: Optional[str] = None, md5_checksum: str = None, exists_ok: bool = True, encoding: str = 'utf8', - ) -> None: + ) -> Optional[str]: """ Download the text file at `source` and store it in `output_path`. By default, do nothing if a file already exists in `output_path`. @@ -81,8 +81,9 @@ def _download_text_file(source: str, ---------- source : str url of the file to be downloaded - output_path : str - full path, including filename, of where the file should be stored. + output_path : str, (optional) + full path, including filename, of where the file should be stored. If ``None``, + this function returns the downloaded file as string. md5_checksum : str, optional (default=None) If not None, should be a string of hexidecimal digits of the expected digest value. exists_ok : bool, optional (default=True) @@ -90,14 +91,15 @@ def _download_text_file(source: str, encoding : str, optional (default='utf8') The encoding with which the file should be stored. """ - try: - with open(output_path, encoding=encoding): - if exists_ok: - return - else: - raise FileExistsError - except FileNotFoundError: - pass + if output_path is not None: + try: + with open(output_path, encoding=encoding): + if exists_ok: + return + else: + raise FileExistsError + except FileNotFoundError: + pass logging.info('Starting [%s] request for the URL %s', 'get', source) start = time.time() @@ -114,17 +116,27 @@ def _download_text_file(source: str, 'Checksum {} of downloaded file is unequal to the expected checksum {}.' .format(md5_checksum_download, md5_checksum)) - with open(output_path, "w", encoding=encoding) as fh: - fh.write(downloaded_file) + if output_path is None: + logging.info( + '%.7fs taken for [%s] request for the URL %s', + time.time() - start, + 'get', + source, + ) + return downloaded_file - logging.info( - '%.7fs taken for [%s] request for the URL %s', - time.time() - start, - 'get', - source, - ) + else: + with open(output_path, "w", encoding=encoding) as fh: + fh.write(downloaded_file) + + logging.info( + '%.7fs taken for [%s] request for the URL %s', + time.time() - start, + 'get', + source, + ) - del downloaded_file + del downloaded_file def _check_response(response, url, file_elements): diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 07d67717d..657fbc7c6 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1038,13 +1038,11 @@ def _get_online_dataset_arff(dataset_id): str A string representation of an ARFF file. """ - dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, - 'get') + dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, 'get') # build a dict from the xml. # use the url from the dataset description and return the ARFF string - return openml._api_calls._read_url( + return openml._api_calls._download_text_file( xmltodict.parse(dataset_xml)['oml:data_set_description']['oml:url'], - request_method='get' ) From b673d430fd3c9aea5cbf111dddf47040b00651e8 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 12 Nov 2019 18:10:00 +0100 Subject: [PATCH 4/5] stricter API, more private methods --- openml/_api_calls.py | 29 ++++++++++++++------------- openml/runs/run.py | 3 +-- openml/tasks/task.py | 2 +- tests/test_runs/test_run_functions.py | 3 +-- tests/test_utils/test_utils.py | 2 +- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 5be9a2d48..8f0cf88da 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -51,11 +51,11 @@ def _perform_api_call(call, request_method, data=None, file_elements=None): if file_elements is not None: if request_method != 'post': raise ValueError('request method must be post when file elements are present') - response = _read_url_files(url, data=data, file_elements=file_elements) + response = __read_url_files(url, data=data, file_elements=file_elements) else: - response = _read_url(url, request_method, data) + response = __read_url(url, request_method, data) - _check_response(response, url, file_elements) + __check_response(response, url, file_elements) logging.info( '%.7fs taken for [%s] request for the URL %s', @@ -95,7 +95,7 @@ def _download_text_file(source: str, try: with open(output_path, encoding=encoding): if exists_ok: - return + return None else: raise FileExistsError except FileNotFoundError: @@ -103,8 +103,8 @@ def _download_text_file(source: str, logging.info('Starting [%s] request for the URL %s', 'get', source) start = time.time() - response = _read_url(source, request_method='get') - _check_response(response, source, None) + response = __read_url(source, request_method='get') + __check_response(response, source, None) downloaded_file = response.text if md5_checksum is not None: @@ -137,11 +137,12 @@ def _download_text_file(source: str, ) del downloaded_file + return None -def _check_response(response, url, file_elements): +def __check_response(response, url, file_elements): if response.status_code != 200: - raise _parse_server_exception(response, url, file_elements=file_elements) + raise __parse_server_exception(response, url, file_elements=file_elements) elif 'Content-Encoding' not in response.headers or \ response.headers['Content-Encoding'] != 'gzip': logging.warning('Received uncompressed content from OpenML for {}.'.format(url)) @@ -159,7 +160,7 @@ def _file_id_to_url(file_id, filename=None): return url -def _read_url_files(url, data=None, file_elements=None): +def __read_url_files(url, data=None, file_elements=None): """do a post request to url with data and sending file_elements as files""" @@ -169,7 +170,7 @@ def _read_url_files(url, data=None, file_elements=None): file_elements = {} # Using requests.post sets header 'Accept-encoding' automatically to # 'gzip,deflate' - response = send_request( + response = __send_request( request_method='post', url=url, data=data, @@ -178,15 +179,15 @@ def _read_url_files(url, data=None, file_elements=None): return response -def _read_url(url, request_method, data=None): +def __read_url(url, request_method, data=None): data = {} if data is None else data if config.apikey is not None: data['api_key'] = config.apikey - return send_request(request_method=request_method, url=url, data=data) + return __send_request(request_method=request_method, url=url, data=data) -def send_request( +def __send_request( request_method, url, data, @@ -220,7 +221,7 @@ def send_request( return response -def _parse_server_exception( +def __parse_server_exception( response: requests.Response, url: str, file_elements: Dict, diff --git a/openml/runs/run.py b/openml/runs/run.py index 140347cc4..910801971 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -327,8 +327,7 @@ def get_metric_fn(self, sklearn_fn, kwargs=None): predictions_file_url = openml._api_calls._file_id_to_url( self.output_files['predictions'], 'predictions.arff', ) - response = openml._api_calls._read_url(predictions_file_url, - request_method='get') + response = openml._api_calls._download_text_file(predictions_file_url) predictions_arff = arff.loads(response) # TODO: make this a stream reader else: diff --git a/openml/tasks/task.py b/openml/tasks/task.py index fd07df5ab..72c12bab5 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -117,7 +117,7 @@ def _download_split(self, cache_file: str): except (OSError, IOError): split_url = self.estimation_procedure["data_splits_url"] openml._api_calls._download_text_file( - source=split_url, + source=str(split_url), output_path=cache_file, ) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 2773bc8d9..fe8aab808 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -119,8 +119,7 @@ def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed): # downloads the predictions of the old task file_id = run.output_files['predictions'] predictions_url = openml._api_calls._file_id_to_url(file_id) - response = openml._api_calls._read_url(predictions_url, - request_method='get') + response = openml._api_calls._download_text_file(predictions_url) predictions = arff.loads(response) run_prime = openml.runs.run_model_on_task( model=model_prime, diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index de2d18981..152dd4dba 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -16,7 +16,7 @@ class OpenMLTaskTest(TestBase): def mocked_perform_api_call(call, request_method): # TODO: JvR: Why is this not a staticmethod? url = openml.config.server + '/' + call - return openml._api_calls._read_url(url, request_method=request_method) + return openml._api_calls._download_text_file(url) def test_list_all(self): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) From 9e2f328ce0df3519e5c0f7c11a6ca4a8da6a1c87 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 13 Nov 2019 16:20:34 +0100 Subject: [PATCH 5/5] incorporate Pieter's feedback --- openml/_api_calls.py | 16 ++++++++-------- tests/test_openml/test_api_calls.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 tests/test_openml/test_api_calls.py diff --git a/openml/_api_calls.py b/openml/_api_calls.py index 8f0cf88da..c357dc3d0 100644 --- a/openml/_api_calls.py +++ b/openml/_api_calls.py @@ -226,17 +226,17 @@ def __parse_server_exception( url: str, file_elements: Dict, ) -> OpenMLServerError: - # OpenML has a sophisticated error system - # where information about failures is provided. try to parse this + + if response.status_code == 414: + raise OpenMLServerError('URI too long! ({})'.format(url)) try: server_exception = xmltodict.parse(response.text) except Exception: - if response.status_code == 414: - raise OpenMLServerError('URI too long! ({})'.format(url)) - else: - raise OpenMLServerError( - 'Unexpected server error when calling {}. Please contact the developers!\n' - 'Status code: {}\n{}'.format(url, response.status_code, response.text)) + # OpenML has a sophisticated error system + # where information about failures is provided. try to parse this + raise OpenMLServerError( + 'Unexpected server error when calling {}. Please contact the developers!\n' + 'Status code: {}\n{}'.format(url, response.status_code, response.text)) server_error = server_exception['oml:error'] code = int(server_error['oml:code']) diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py new file mode 100644 index 000000000..1748608bb --- /dev/null +++ b/tests/test_openml/test_api_calls.py @@ -0,0 +1,12 @@ +import openml +import openml.testing + + +class TestConfig(openml.testing.TestBase): + + def test_too_long_uri(self): + with self.assertRaisesRegex( + openml.exceptions.OpenMLServerError, + 'URI too long!', + ): + openml.datasets.list_datasets(data_id=list(range(10000)))