[FEA] Add support for strings in cuml.preprocessing.SimpleImputer

kshitizgupta21 commented 2 years ago

Is your feature request related to a problem? Please describe.

I'm trying to use cuml.preprocessing's SimpleImputer to impute string columns in v22.04. The docs mention that both the constant and most_frequent strategies are supported for string columns, but when I try to use them I get this error:

TypeError: String Arrays is not yet implemented in cudf

Here is the complete output

from cuml.preprocessing import SimpleImputer
# Merchant State and Zip are type object columns
string_cols = ["Merchant State", "Zip"]

for col in string_cols:
    imputer = SimpleImputer(strategy="most_frequent")
    X_train[[col]] = imputer.fit_transform(X_train[[col]])
    X_test[[col]] = imputer.transform(X_test[[col]])

for col in string_cols:
    imputer = SimpleImputer(strategy="constant", fill_value='UNKNOWN')
    X_train[[col]] = imputer.fit_transform(X_train[[col]])
    X_test[[col]] = imputer.transform(X_test[[col]])

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [20], in <cell line: 1>()
      1 for col in string_cols:
      2     imputer = SimpleImputer(strategy="constant", fill_value='UNKNOWN')
----> 3     X_train[[col]] = imputer.fit_transform(X_train[[col]])
      4     X_test[[col]] = imputer.transform(X_test[[col]])

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cuml/_thirdparty/sklearn/utils/skl_dependencies.py:161, in TransformerMixin.fit_transform(self, X, y, **fit_params)
    157 # non-optimized default implementation; override when a better
    158 # method is possible for a given clustering algorithm
    159 if y is None:
    160     # fit method of arity 1 (unsupervised transformation)
--> 161     return self.fit(X, **fit_params).transform(X)
    162 else:
    163     # fit method of arity 2 (supervised transformation)
    164     return self.fit(X, y, **fit_params).transform(X)

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cuml/internals/api_decorators.py:409, in BaseReturnAnyDecorator.__call__.<locals>.inner_with_setters(*args, **kwargs)
    402 self_val, input_val, target_val = \
    403     self.get_arg_values(*args, **kwargs)
    405 self.do_setters(self_val=self_val,
    406                 input_val=input_val,
    407                 target_val=target_val)
--> 409 return func(*args, **kwargs)

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cuml/_thirdparty/sklearn/preprocessing/_imputation.py:310, in SimpleImputer.fit(self, X, y)
    307 if type(X) is list:
    308     X = np.asarray(X)
--> 310 X = self._validate_input(X, in_fit=True)
    311 super()._fit_indicator(X)
    313 # default fill_value is 0 for numerical input and "missing_value"
    314 # otherwise

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cuml/_thirdparty/sklearn/preprocessing/_imputation.py:270, in SimpleImputer._validate_input(self, X, in_fit)
    267     force_all_finite = "allow-nan"
    269 try:
--> 270     X = self._validate_data(X, reset=in_fit,
    271                             accept_sparse='csc', dtype=dtype,
    272                             force_all_finite=force_all_finite,
    273                             copy=self.copy)
    274 except ValueError as ve:
    275     if "could not convert" in str(ve):

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cuml/_thirdparty/sklearn/utils/skl_dependencies.py:110, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    105     if self._get_tags()['requires_y']:
    106         raise ValueError(
    107             f"This {self.__class__.__name__} estimator "
    108             f"requires y to be passed, but the target y is None."
    109         )
--> 110     X = check_array(X, **check_params)
    111     out = X
    112 else:

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cuml/thirdparty_adapters/adapters.py:273, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    271     return new_array
    272 else:
--> 273     X, n_rows, n_cols, dtype = input_to_cupy_array(array,
    274                                                    order=order,
    275                                                    deepcopy=copy,
    276                                                    fail_on_null=False)
    277     if correct_dtype != dtype:
    278         X = X.astype(correct_dtype)

File /opt/conda/envs/rapids/lib/python3.9/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
     76 @wraps(func)
     77 def inner(*args, **kwds):
     78     with self._recreate_cm():
---> 79         return func(*args, **kwds)

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cuml/common/input_utils.py:445, in input_to_cupy_array(X, order, deepcopy, check_dtype, convert_to_dtype, check_cols, check_rows, fail_on_order, force_contiguous, fail_on_null)
    443 if isinstance(X, (cudf.DataFrame, cudf.Series)):
    444     try:
--> 445         X = X.values
    446     except ValueError:
    447         X = X.astype('float64', copy=False)

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cudf/core/frame.py:455, in Frame.values(self)
    442 @property
    443 def values(self):
    444     """
    445     Return a CuPy representation of the DataFrame.
    446 
   (...)
    453         The values of the DataFrame.
    454     """
--> 455     return self.to_cupy()

File /opt/conda/envs/rapids/lib/python3.9/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
     76 @wraps(func)
     77 def inner(*args, **kwds):
     78     with self._recreate_cm():
---> 79         return func(*args, **kwds)

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cudf/core/frame.py:555, in Frame.to_cupy(self, dtype, copy, na_value)
    529 @_cudf_nvtx_annotate
    530 def to_cupy(
    531     self,
   (...)
    534     na_value=None,
    535 ) -> cupy.ndarray:
    536     """Convert the Frame to a CuPy array.
    537 
    538     Parameters
   (...)
    553     cupy.ndarray
    554     """
--> 555     return self._to_array(
    556         (lambda col: col.values.copy())
    557         if copy
    558         else (lambda col: col.values),
    559         cupy.empty,
    560         dtype,
    561         na_value,
    562     )

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cudf/core/frame.py:520, in Frame._to_array(self, get_column_values, make_empty_matrix, dtype, na_value)
    513 matrix = make_empty_matrix(
    514     shape=(len(self), ncol), dtype=dtype, order="F"
    515 )
    516 for i, col in enumerate(self._data.values()):
    517     # TODO: col.values may fail if there is nullable data or an
    518     # unsupported dtype. We may want to catch and provide a more
    519     # suitable error.
--> 520     matrix[:, i] = get_column_values_na(col)
    521 return matrix

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cudf/core/frame.py:499, in Frame._to_array.<locals>.get_column_values_na(col)
    497 if na_value is not None:
    498     col = col.fillna(na_value)
--> 499 return get_column_values(col)

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cudf/core/frame.py:558, in Frame.to_cupy.<locals>.<lambda>(col)
    529 @_cudf_nvtx_annotate
    530 def to_cupy(
    531     self,
   (...)
    534     na_value=None,
    535 ) -> cupy.ndarray:
    536     """Convert the Frame to a CuPy array.
    537 
    538     Parameters
   (...)
    553     cupy.ndarray
    554     """
    555     return self._to_array(
    556         (lambda col: col.values.copy())
    557         if copy
--> 558         else (lambda col: col.values),
    559         cupy.empty,
    560         dtype,
    561         na_value,
    562     )

File /opt/conda/envs/rapids/lib/python3.9/site-packages/cudf/core/column/string.py:5321, in StringColumn.values(self)
   5316 @property
   5317 def values(self) -> cupy.ndarray:
   5318     """
   5319     Return a CuPy representation of the StringColumn.
   5320     """
-> 5321     raise TypeError("String Arrays is not yet implemented in cudf")

TypeError: String Arrays is not yet implemented in cudf

Describe the solution you'd like

I'd like string column imputation to work smoothly with the constant and most_frequent strategies.

beckernick commented 2 years ago

Thanks for raising this issue. This is likely a documentation error, as this functionality is currently designed for numeric data in cuML.

If you share a reproducible example, someone may be able to advise on a workaround.

github-actions[bot] commented 2 years ago

This issue has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d if there is no activity in the next 60 days.

LiamMadigan-EN0107 commented 2 years ago

A reproducible example can be taken from the cuml.compose.make_column_selector() example in the API reference guide:

https://docs.rapids.ai/api/cuml/stable/api.html#text-preprocessing-single-gpu

The original example is:

from cuml.preprocessing import StandardScaler, OneHotEncoder
from cuml.preprocessing import make_column_transformer
from cuml.preprocessing import make_column_selector
import cupy as cp
import cudf

X = cudf.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'], 'rating': [5, 3, 4, 5]})
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=cp.number)),  # rating
    (OneHotEncoder(), make_column_selector(dtype_include=object)))  # city
ct.fit_transform(X)

By changing the example to:

X = cudf.DataFrame({'city': ['London', np.nan, 'Paris', 'Sallisaw'], 'rating': [5, 3, 4, 5]})
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=cp.number)),
    (SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='Other'), make_column_selector(dtype_include=object)),
    (OneHotEncoder(), make_column_selector(dtype_include=object)))  # city
ct.fit_transform(X)

This gives the error message shown by kshitizgupta21 above.

As an aside, I can also get the same error message when I add the remainder='passthrough' argument to the original example, i.e.:

X = cudf.DataFrame({'city': ['London', 'London', 'Paris', 'Sallisaw'], 'rating': [5, 3, 4, 5]})
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=cp.number)),  # rating
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
    remainder='passthrough')  # city
ct.fit_transform(X)

I'm using a Docker container converted into an AWS SageMaker Studio kernel. The Docker image is:

nvcr.io/nvidia/rapidsai/rapidsai:22.08-cuda11.5-runtime-ubuntu20.04-py3.8

beckernick commented 2 years ago

Thanks for providing a reproducible example @LiamMadigan-EN0107 ! We'll evaluate the feasibility of including string support with constant and most_frequent strategies.

In the short term, would you be interested in contributing a PR to update the documentation to indicate strings are not yet supported?

github-actions[bot] commented 2 years ago

This issue has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this issue if no further response or action is needed. Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. This issue will be labeled inactive-90d if there is no activity in the next 60 days.

jacklinsibiyal commented 4 weeks ago

I also got the same error.

TypeError: String Arrays is not yet implemented in cudf

rapidsai / cuml

[FEA] Add support for strings in cuml.preprocessing.SimpleImputer #4786