alteryx / featuretools

An open source python library for automated feature engineering
https://www.featuretools.com
BSD 3-Clause "New" or "Revised" License
7.26k stars 879 forks source link

CumMean and CumSum can fail on all null columns #1682

Open thehomebrewnerd opened 3 years ago

thehomebrewnerd commented 3 years ago

CumMean and CumSum can fail on all null columns

The CumMean and CumSum primitives can fail during `calculate_feature_matrix` if a numeric column containing all pd.NA values is present. The failure happens when Featuretools attempts to initialize Woodwork on the resulting feature matrix.

import pandas as pd
import featuretools as ft

df = pd.DataFrame({
    'id': [0, 1, 2],
    'null_ints': [pd.NA]*3
})

es = ft.EntitySet('test')
es.add_dataframe(dataframe_name="test_df",
                 dataframe=df,
                 index='id',
                 logical_types={'null_ints': 'IntegerNullable'})

ft.dfs(entityset=es, target_dataframe_name='test_df', trans_primitives=['cum_mean'])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/logical_types.py in transform(self, series)
     57             try:
---> 58                 series = series.astype(new_dtype)
     59             except (TypeError, ValueError):

~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5814             # else, only a single dtype is given
-> 5815             new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   5816             return self._constructor(new_data).__finalize__(self, method="astype")

~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    417     def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    419 

~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
    326                 else:
--> 327                     applied = getattr(b, f)(**kwargs)
    328             except (TypeError, NotImplementedError):

~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    591 
--> 592         new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    593 

~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
   1308     try:
-> 1309         new_values = astype_array(values, dtype, copy=copy)
   1310     except (ValueError, TypeError):

~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
   1256     else:
-> 1257         values = astype_nansafe(values, dtype, copy=copy)
   1258 

~/dev/featuretools/env/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
   1200         # Explicit copy, or required since NumPy can't view from / to object.
-> 1201         return arr.astype(dtype, copy=True)
   1202 

TypeError: float() argument must be a string or a number, not 'NAType'

During handling of the above exception, another exception occurred:

TypeConversionError                       Traceback (most recent call last)
<ipython-input-15-8a9a157a6521> in <module>
----> 1 ft.dfs(entityset=es, target_dataframe_name='test_df', trans_primitives=['cum_mean'])

~/dev/featuretools/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
     38                     ep.on_error(error=e,
     39                                 runtime=runtime)
---> 40                 raise e
     41 
     42             # send return value

~/dev/featuretools/featuretools/utils/entry_point.py in function_wrapper(*args, **kwargs)
     30                 # call function
     31                 start = time.time()
---> 32                 return_value = func(*args, **kwargs)
     33                 runtime = time.time() - start
     34             except Exception as e:

~/dev/featuretools/featuretools/synthesis/dfs.py in dfs(dataframes, relationships, entityset, target_dataframe_name, cutoff_time, instance_ids, agg_primitives, trans_primitives, groupby_trans_primitives, allowed_paths, max_depth, ignore_dataframes, ignore_columns, primitive_options, seed_features, drop_contains, drop_exact, where_primitives, max_features, cutoff_time_in_index, save_progress, features_only, training_window, approximate, chunk_size, n_jobs, dask_kwargs, verbose, return_types, progress_callback, include_cutoff_time)
    276         return features
    277 
--> 278     feature_matrix = calculate_feature_matrix(features,
    279                                               entityset=entityset,
    280                                               cutoff_time=cutoff_time,

~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in calculate_feature_matrix(features, entityset, cutoff_time, instance_ids, dataframes, relationships, cutoff_time_in_index, training_window, approximate, save_progress, verbose, chunk_size, n_jobs, dask_kwargs, progress_callback, include_cutoff_time)
    291                                                        include_cutoff_time=include_cutoff_time)
    292         else:
--> 293             feature_matrix = calculate_chunk(cutoff_time=cutoff_time_to_pass,
    294                                              chunk_size=chunk_size,
    295                                              feature_set=feature_set,

~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in calculate_chunk(cutoff_time, chunk_size, feature_set, entityset, approximate, training_window, save_progress, no_unapproximated_aggs, cutoff_df_time_col, target_time, pass_columns, progress_bar, progress_callback, include_cutoff_time)
    487 
    488     ww_init_kwargs = get_ww_types_from_features(feature_set.target_features, entityset, pass_columns, cutoff_time)
--> 489     feature_matrix = init_ww_and_concat_fm(feature_matrix, ww_init_kwargs)
    490     return feature_matrix
    491 

~/dev/featuretools/featuretools/computational_backends/calculate_feature_matrix.py in init_ww_and_concat_fm(feature_matrix, ww_init_kwargs)
    756 def init_ww_and_concat_fm(feature_matrix, ww_init_kwargs):
    757     for fm in feature_matrix:
--> 758         fm.ww.init(**ww_init_kwargs)
    759 
    760     if any(isinstance(fm, dd.DataFrame) for fm in feature_matrix):

~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in init(self, **kwargs)
     95                 Any errors resulting from skipping validation with invalid inputs may not be easily understood.
     96         """
---> 97         self.init_with_partial_schema(**kwargs)
     98 
     99     def init_with_full_schema(self, schema: TableSchema, validate: bool = True, **kwargs) -> None:

~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in init_with_partial_schema(self, schema, index, time_index, logical_types, already_sorted, name, semantic_tags, table_metadata, column_metadata, use_standard_tags, column_descriptions, column_origins, validate, **kwargs)
    202 
    203         # overwrite schema parameters with specified kwargs
--> 204         logical_types = _infer_missing_logical_types(self._dataframe, logical_types, existing_logical_types)
    205         column_descriptions = {**existing_col_descriptions, **(column_descriptions or {})}
    206         column_metadata = {**existing_col_metadata, **(column_metadata or {})}

~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/table_accessor.py in _infer_missing_logical_types(dataframe, force_logical_types, existing_logical_types)
   1037         logical_type = force_logical_types.get(name) if name in force_logical_types else existing_logical_types.get(name)
   1038         parsed_logical_types[name] = _get_column_logical_type(series, logical_type, name)
-> 1039         updated_series = parsed_logical_types[name].transform(series)
   1040         if updated_series is not series:
   1041             dataframe[name] = updated_series

~/dev/featuretools/env/lib/python3.8/site-packages/woodwork/logical_types.py in transform(self, series)
     58                 series = series.astype(new_dtype)
     59             except (TypeError, ValueError):
---> 60                 raise TypeConversionError(series, new_dtype, type(self))
     61         return series
     62 

TypeConversionError: Error converting datatype for CUM_MEAN(null_ints) from type object to type float64. Please confirm the underlying data is consistent with logical type Double.
gsheni commented 3 years ago

@thehomebrewnerd moving this to Woodwork Integration Follow Up Epic

dvreed77 commented 2 years ago

I think the fix for this would be to give the Double logical type the dtype of Float64, but that may cause issues as described here.

gsheni commented 2 years ago

There is an underlying pandas issue — the equivalent cast fails outside of Featuretools as well:

import pandas as pd

df = pd.DataFrame({
    'id': [0, 1, 2],
    'null_ints': [pd.NA] * 3
})

df['null_ints'] = df['null_ints'].astype("object").astype("float64")
dvreed77 commented 2 years ago

Opened a ticket with Pandas.