sdv-dev / SDV

Synthetic data generation for tabular data
https://docs.sdv.dev/sdv
Other
2.37k stars 317 forks source link

PAR model cannot fit if sequence index is type `date` instead of `datetime` #465

Open csala opened 3 years ago

csala commented 3 years ago

Environment Details

Please indicate the following details about the environment in which you found the bug:

Error Description

A sequence index of type `date` (as opposed to `datetime`) makes PAR crash with an `AttributeError`.

Steps to reproduce

Here's a snippet reproducing the error by converting the demo sequence index into a date before running PAR.

>>> from sdv.demo import load_timeseries_demo
>>> from sdv.timeseries import PAR
>>> 
>>> data = load_timeseries_demo()
>>> data['Date'] = data['Date'].dt.date
>>> 
>>> model = PAR(
...     entity_columns=['Symbol'],
...     context_columns=['MarketCap', 'Sector', 'Industry'],
...     sequence_index='Date',
... )
>>> model.fit(data)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/ubuntu/SDV/sdv/timeseries/base.py", line 209, in fit
    self._fit(transformed)
  File "/home/ubuntu/SDV/sdv/timeseries/deepecho.py", line 85, in _fit
    self._model.fit_sequences(sequences, context_types, data_types)
  File "/home/ubuntu/.virtualenvs/SDV/lib/python3.8/site-packages/deepecho/models/par.py", line 310, in fit_sequences
    self._build(sequences, context_types, data_types)
  File "/home/ubuntu/.virtualenvs/SDV/lib/python3.8/site-packages/deepecho/models/par.py", line 186, in _build
    self._data_map, self._data_dims = self._idx_map(data, data_types)
  File "/home/ubuntu/.virtualenvs/SDV/lib/python3.8/site-packages/deepecho/models/par.py", line 129, in _idx_map
    'mu': np.nanmean(x[i]),
  File "<__array_function__ internals>", line 5, in nanmean
  File "/home/ubuntu/.virtualenvs/SDV/lib/python3.8/site-packages/numpy/lib/nanfunctions.py", line 950, in nanmean
    avg = _divide_by_count(tot, cnt, out=out)
  File "/home/ubuntu/.virtualenvs/SDV/lib/python3.8/site-packages/numpy/lib/nanfunctions.py", line 217, in _divide_by_count
    return a.dtype.type(a / b)
AttributeError: 'datetime.timedelta' object has no attribute 'dtype'
npatki commented 2 years ago

I can still replicate this in a newer version of SDV (0.15.0), but the error is different:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
[<ipython-input-2-a0f8e31d5f65>](https://localhost:8080/#) in <module>()
      3     context_columns=['MarketCap', 'Sector', 'Industry'],
      4     sequence_index='Date')
----> 5 model.fit(data)

7 frames
[/usr/local/lib/python3.7/dist-packages/sdv/timeseries/base.py](https://localhost:8080/#) in fit(self, timeseries_data)
    208 
    209         LOGGER.debug('Fitting %s model to table %s', self.__class__.__name__, self._metadata.name)
--> 210         self._fit(transformed)
    211 
    212     def get_metadata(self):
[/usr/local/lib/python3.7/dist-packages/sdv/timeseries/deepecho.py](https://localhost:8080/#) in _fit(self, timeseries_data)
     85 
     86         # Validate and fit
---> 87         self._model.fit_sequences(sequences, context_types, data_types)
     88 
     89     def _sample(self, context=None, sequence_length=None):

[/usr/local/lib/python3.7/dist-packages/deepecho/models/par.py](https://localhost:8080/#) in fit_sequences(self, sequences, context_types, data_types)
    314         """
    315         X, C = [], []
--> 316         self._build(sequences, context_types, data_types)
    317         for sequence in sequences:
    318             X.append(self._data_to_tensor(sequence['data']))

[/usr/local/lib/python3.7/dist-packages/deepecho/models/par.py](https://localhost:8080/#) in _build(self, sequences, context_types, data_types)
    184 
    185         self._ctx_map, self._ctx_dims = self._idx_map(contexts, context_types)
--> 186         self._data_map, self._data_dims = self._idx_map(data, data_types)
    187         self._data_map['<TOKEN>'] = {
    188             'type': 'categorical',
[/usr/local/lib/python3.7/dist-packages/deepecho/models/par.py](https://localhost:8080/#) in _idx_map(self, x, t)
    127                 idx_map[i] = {
    128                     'type': t,
--> 129                     'mu': np.nanmean(x[i]),
    130                     'std': np.nanstd(x[i]),
    131                     'nulls': pd.isnull(x[i]).any(),

<__array_function__ internals> in nanmean(*args, **kwargs)

[/usr/local/lib/python3.7/dist-packages/numpy/lib/nanfunctions.py](https://localhost:8080/#) in nanmean(a, axis, dtype, out, keepdims)
    947 
    948     cnt = np.sum(~mask, axis=axis, dtype=np.intp, keepdims=keepdims)
--> 949     tot = np.sum(arr, axis=axis, dtype=dtype, out=out, keepdims=keepdims)
    950     avg = _divide_by_count(tot, cnt, out=out)
    951 

<__array_function__ internals> in sum(*args, **kwargs)
[/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py](https://localhost:8080/#) in sum(a, axis, dtype, out, keepdims, initial, where)
   2258 
   2259     return _wrapreduction(a, np.add, 'sum', axis, dtype, out, keepdims=keepdims,
-> 2260                           initial=initial, where=where)
   2261 
   2262 

[/usr/local/lib/python3.7/dist-packages/numpy/core/fromnumeric.py](https://localhost:8080/#) in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
     84                 return reduction(axis=axis, out=out, **passkwargs)
     85 
---> 86     return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
     87 
     88 

TypeError: unsupported operand type(s) for +: 'datetime.date' and 'datetime.date'