openml / openml-python

Python module to interface with OpenML
https://openml.github.io/openml-python/main/
Other
279 stars 143 forks source link

Problem with get_dataset when dataset does not have qualities #1290

Open LizzAlice opened 10 months ago

LizzAlice commented 10 months ago

Description

I tried calling openml.datasets.get_dataset(202, download_data=False), but got the following error:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~/lib/python3.9/site-packages/openml/datasets/functions.py:1241, in _get_dataset_qualities_file(did_cache_dir, dataset_id)
   1240 try:
-> 1241     with io.open(qualities_file, encoding="utf8") as fh:
   1242         qualities_xml = fh.read()

FileNotFoundError: [Errno 2] No such file or directory: '/home/.cache/openml/org/openml/www/datasets/202/qualities.xml'

During handling of the above exception, another exception occurred:

OpenMLServerException                     Traceback (most recent call last)
Cell In[12], line 2
      1 for did in did_list:
----> 2             ds = openml.datasets.get_dataset(int(did), download_data=False)

File ~/lib/python3.9/site-packages/openml/datasets/functions.py:514, in get_dataset(dataset_id, download_data, version, error_if_multiple, cache_format, download_qualities, download_features_meta_data, download_all_files, force_refresh_cache)
    512         raise OpenMLPrivateDatasetError(e.message) from None
    513     else:
--> 514         raise e
    515 finally:
    516     if remove_dataset_cache:

File ~/lib/python3.9/site-packages/openml/datasets/functions.py:493, in get_dataset(dataset_id, download_data, version, error_if_multiple, cache_format, download_qualities, download_features_meta_data, download_all_files, force_refresh_cache)
    491     features_file = _get_dataset_features_file(did_cache_dir, dataset_id)
    492 if download_qualities:
--> 493     qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
    495 arff_file = _get_dataset_arff(description) if download_data else None
    496 if "oml:minio_url" in description and download_data:

File ~/lib/python3.9/site-packages/openml/datasets/functions.py:1245, in _get_dataset_qualities_file(did_cache_dir, dataset_id)
   1243 except (OSError, IOError):
   1244     try:
-> 1245         qualities_xml = _get_qualities_xml(dataset_id)
   1246         with io.open(qualities_file, "w", encoding="utf8") as fh:
   1247             fh.write(qualities_xml)

File ~/lib/python3.9/site-packages/openml/datasets/functions.py:1205, in _get_qualities_xml(dataset_id)
   1203 def _get_qualities_xml(dataset_id):
   1204     url_extension = f"data/qualities/{dataset_id}"
-> 1205     return openml._api_calls._perform_api_call(url_extension, "get")

File ~/lib/python3.9/site-packages/openml/_api_calls.py:99, in _perform_api_call(call, request_method, data, file_elements)
     97     response = _read_url_files(url, data=data, file_elements=file_elements)
     98 else:
---> 99     response = __read_url(url, request_method, data)
    101 __check_response(response, url, file_elements)
    103 logging.info(
    104     "%.7fs taken for [%s] request for the URL %s",
    105     time.time() - start,
    106     request_method,
    107     url,
    108 )

File ~/lib/python3.9/site-packages/openml/_api_calls.py:308, in __read_url(url, request_method, data, md5_checksum)
    306 if config.apikey:
    307     data["api_key"] = config.apikey
--> 308 return _send_request(
    309     request_method=request_method, url=url, data=data, md5_checksum=md5_checksum
    310 )

File ~/lib/python3.9/site-packages/openml/_api_calls.py:344, in _send_request(request_method, url, data, files, md5_checksum)
    342 else:
    343     raise NotImplementedError()
--> 344 __check_response(response=response, url=url, file_elements=files)
    345 if request_method == "get" and not __is_checksum_equal(
    346     response.text.encode("utf-8"), md5_checksum
    347 ):
    348     # -- Check if encoding is not UTF-8 perhaps
    349     if __is_checksum_equal(response.content, md5_checksum):

File ~/lib/python3.9/site-packages/openml/_api_calls.py:409, in __check_response(response, url, file_elements)
    405 def __check_response(
    406     response: requests.Response, url: str, file_elements: Optional[FILE_ELEMENTS_TYPE]
    407 ) -> None:
    408     if response.status_code != 200:
--> 409         raise __parse_server_exception(response, url, file_elements=file_elements)
    410     elif (
    411         "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip"
    412     ):
    413         logging.warning("Received uncompressed content from OpenML for {}.".format(url))

OpenMLServerException: https://www.openml.org/api/v1/xml/data/qualities/202 returned code 362: No qualities found - None

Versions

Linux-5.10.0-26-amd64-x86_64-with-glibc2.31 Python 3.9.2 (default, Feb 28 2021, 17:03:44) [GCC 10.2.1 20210110] NumPy 1.26.2 SciPy 1.11.4 Scikit-Learn 1.3.2 OpenML 0.14.1

PGijsbers commented 10 months ago

We'll look into that; at the very least, the error message should be nicer. As a workaround, for now you can opt out of downloading qualities altogether (if you do not need them):

import openml
dataset = openml.datasets.get_dataset(202, download_data=False, download_qualities=False)