When I try to get the penguins data set using the python API (openml.datasets.get_dataset(dataset_id=42585)), I get a ValueError originating from pandas, because there are some missing values.
In the scikit-learn API (sklearn.datasets.fetch_openml) it's possible to use the as_frame argument to control whether pandas is used or not. I'm not sure whether I've just missed that in the openml python API but I couldn't find a similar option there.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-9-42cadb8ab205> in <module>
----> 1 dataset = get_dataset(dataset_id=42585)
~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/functions.py in get_dataset(dataset_id, download_data, version, error_if_multiple)
527 did_cache_dir)
528
--> 529 dataset = _create_dataset_from_description(
530 description, features, qualities, arff_file
531 )
~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/functions.py in _create_dataset_from_description(description, features, qualities, arff_file)
995 Dataset object from dict and ARFF.
996 """
--> 997 return OpenMLDataset(
998 description["oml:name"],
999 description.get("oml:description"),
~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in __init__(self, name, description, format, data_format, dataset_id, version, creator, contributor, collection_date, upload_date, language, licence, url, default_target_attribute, row_id_attribute, ignore_attribute, version_label, citation, tag, visibility, original_data_url, paper_url, update_comment, md5_checksum, data_file, features, qualities, dataset)
181
182 if data_file is not None:
--> 183 self.data_pickle_file = self._create_pickle_in_cache(data_file)
184 else:
185 self.data_pickle_file = None
~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in _create_pickle_in_cache(self, data_file)
421 # At this point either the pickle file does not exist, or it had outdated formatting.
422 # We parse the data from arff again and populate the cache with a recent pickle file.
--> 423 X, categorical, attribute_names = self._parse_data_from_arff(data_file)
424
425 with open(data_pickle_file, "wb") as fh:
~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in _parse_data_from_arff(self, arff_file_path)
387 if attribute_dtype[column_name] in ('categorical',
388 'boolean'):
--> 389 col.append(self._unpack_categories(
390 X[column_name], categories_names[column_name]))
391 else:
~/opt/anaconda3/lib/python3.8/site-packages/openml/datasets/dataset.py in _unpack_categories(series, categories)
531 # We require two lines to create a series of categories as detailed here:
532 # https://pandas.pydata.org/pandas-docs/version/0.24/user_guide/categorical.html#series-creation # noqa E501
--> 533 raw_cat = pd.Categorical(col, ordered=True, categories=categories)
534 return pd.Series(raw_cat, index=series.index, name=series.name)
535
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/arrays/categorical.py in __init__(self, values, categories, ordered, dtype, fastpath)
314 ):
315
--> 316 dtype = CategoricalDtype._from_values_or_dtype(
317 values, categories, ordered, dtype
318 )
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in _from_values_or_dtype(cls, values, categories, ordered, dtype)
328 # Note: This could potentially have categories=None and
329 # ordered=None.
--> 330 dtype = CategoricalDtype(categories, ordered)
331
332 return dtype
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in __init__(self, categories, ordered)
220
221 def __init__(self, categories=None, ordered: Ordered = False):
--> 222 self._finalize(categories, ordered, fastpath=False)
223
224 @classmethod
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in _finalize(self, categories, ordered, fastpath)
367
368 if categories is not None:
--> 369 categories = self.validate_categories(categories, fastpath=fastpath)
370
371 self._categories = categories
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py in validate_categories(categories, fastpath)
541
542 if categories.hasnans:
--> 543 raise ValueError("Categorial categories cannot be null")
544
545 if not categories.is_unique:
ValueError: Categorial categories cannot be null
When I try to get the penguins data set using the python API (`openml.datasets.get_dataset(dataset_id=42585)`), I get a ValueError originating from pandas, because there are some missing values.

In the scikit-learn API (`sklearn.datasets.fetch_openml`) it's possible to use the `as_frame` argument to control whether pandas is used or not. I'm not sure whether I've just missed that in the openml python API but I couldn't find a similar option there.