openml / openml-data

For tracking issues related to OpenML datasets
1 stars 1 forks source link

ValueError on retrieving Penguins data #38

Closed hildeweerts closed 3 years ago

hildeweerts commented 3 years ago

When I try to get the Penguins dataset using the Python API (openml.datasets.get_dataset(dataset_id=42585)), I get a ValueError originating from pandas, because there are some missing values.

In the scikit-learn API (sklearn.datasets.fetch_openml), it's possible to use the as_frame argument to control whether pandas is used. I'm not sure whether I've just missed it in the OpenML Python API, but I couldn't find a similar option there.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-42cadb8ab205> in <module>
----> 1 dataset = get_dataset(dataset_id=42585)

~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/functions.py in get_dataset(dataset_id, download_data, version, error_if_multiple)
    527                                      did_cache_dir)
    528 
--> 529     dataset = _create_dataset_from_description(
    530         description, features, qualities, arff_file
    531     )

~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/functions.py in _create_dataset_from_description(description, features, qualities, arff_file)
    995         Dataset object from dict and ARFF.
    996     """
--> 997     return OpenMLDataset(
    998         description["oml:name"],
    999         description.get("oml:description"),

~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in __init__(self, name, description, format, data_format, dataset_id, version, creator, contributor, collection_date, upload_date, language, licence, url, default_target_attribute, row_id_attribute, ignore_attribute, version_label, citation, tag, visibility, original_data_url, paper_url, update_comment, md5_checksum, data_file, features, qualities, dataset)
    181 
    182         if data_file is not None:
--> 183             self.data_pickle_file = self._create_pickle_in_cache(data_file)
    184         else:
    185             self.data_pickle_file = None

~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in _create_pickle_in_cache(self, data_file)
    421         # At this point either the pickle file does not exist, or it had outdated formatting.
    422         # We parse the data from arff again and populate the cache with a recent pickle file.
--> 423         X, categorical, attribute_names = self._parse_data_from_arff(data_file)
    424 
    425         with open(data_pickle_file, "wb") as fh:

~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in _parse_data_from_arff(self, arff_file_path)
    387                 if attribute_dtype[column_name] in ('categorical',
    388                                                     'boolean'):
--> 389                     col.append(self._unpack_categories(
    390                         X[column_name], categories_names[column_name]))
    391                 else:

~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in _unpack_categories(series, categories)
    531         # We require two lines to create a series of categories as detailed here:
    532         # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation  # noqa E501
--> 533         raw_cat = pd.Categorical(col, ordered=True, categories=categories)
    534         return pd.Series(raw_cat, index=series.index, name=series.name)
    535 

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/categorical.py in __init__(self, values, categories, ordered, dtype, fastpath)
    314     ):
    315 
--> 316         dtype = CategoricalDtype._from_values_or_dtype(
    317             values, categories, ordered, dtype
    318         )

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in _from_values_or_dtype(cls, values, categories, ordered, dtype)
    328             # Note: This could potentially have categories=None and
    329             # ordered=None.
--> 330             dtype = CategoricalDtype(categories, ordered)
    331 
    332         return dtype

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in __init__(self, categories, ordered)
    220 
    221     def __init__(self, categories=None, ordered: Ordered = False):
--> 222         self._finalize(categories, ordered, fastpath=False)
    223 
    224     @classmethod

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in _finalize(self, categories, ordered, fastpath)
    367 
    368         if categories is not None:
--> 369             categories = self.validate_categories(categories, fastpath=fastpath)
    370 
    371         self._categories = categories

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in validate_categories(categories, fastpath)
    541 
    542             if categories.hasnans:
--> 543                 raise ValueError("Categorial categories cannot be null")
    544 
    545             if not categories.is_unique:

ValueError: Categorial categories cannot be null
PGijsbers commented 3 years ago

Thanks for letting us know! I am closing this as it is a duplicate of openml-python#1036; follow that issue to stay up to date.