alteryx / autonormalize

Python library for automated dataset normalization
https://blog.featurelabs.com/automatic-dataset-normalization-for-feature-engineering-in-python/
BSD 3-Clause "New" or "Revised" License
109 stars 16 forks source link

Error Found with AutoNormalize and dataset #23

Open gsheni opened 4 years ago

gsheni commented 4 years ago
import pandas as pd
import autonormalize as an

data = pd.read_csv('dataset.csv.zip')
es = an.auto_entityset(data,
                       name="fraud",
                       index='id',
                       time_index='datetime')

Results in the following error:

ValueError: zero-size array to reduction operation maximum which has no identity
gsheni commented 4 years ago

Full stack trace:

ValueError                                Traceback (most recent call last)
<ipython-input-5-930823aade43> in <module>
      7                        index='id',
      8                        time_index='datetime',
----> 9                        accuracy=0.50)

~/lib/python3.6/site-packages/autonormalize/autonormalize.py in auto_entityset(df, accuracy, index, name, time_index)
    133         entityset (ft.EntitySet) : created entity set
    134     """
--> 135     return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
    136
    137

~/lib/python3.6/site-packages/autonormalize/autonormalize.py in find_dependencies(df, accuracy, index)
     25         within the contraints provided
     26     """
---> 27     deps = Dependencies(dfd.dfd(df, accuracy, index))
     28     if index is None:
     29         prim_key = normalize.choose_index(deps.find_candidate_keys(), df)

~/lib/python3.6/site-packages/autonormalize/dfd.py in dfd(df, accuracy, index)
     45             dependencies.add_unique_lhs(i)
     46     for i in tqdm(non_uniq):
---> 47         lhss = find_LHSs(i, non_uniq, df, partitions, accuracy, masks)
     48         dependencies.add_LHSs(i, lhss)
     49     return dependencies

~/lib/python3.6/site-packages/autonormalize/dfd.py in find_LHSs(rhs, attrs, df, partitions, accuracy, masks)
     97                 node.infer_type()
     98                 if node.category == 0:
---> 99                     if compute_partitions(df, rhs, node.attrs, partitions, accuracy, masks):
    100                         if node.is_minimal():
    101                             min_deps.add_dep(node.attrs)

~/lib/python3.6/site-packages/autonormalize/dfd.py in compute_partitions(df, rhs, lhs_set, partitions, accuracy, masks)
    314     # for approximate dependencies see TANE section 2.3s
    315     if accuracy < 1:
--> 316         return approximate_dependencies(list(lhs_set), rhs, df, accuracy, masks)
    317     part_rhs = partition(lhs_set.union(set([rhs])), df, partitions)
    318     # if part_rhs > df.shape[0] * rep_percent:

~/lib/python3.6/site-packages/autonormalize/dfd.py in approximate_dependencies(lhs_set, rhs, df, accuracy, masks)
    377         options = df[mask]
    378         _, unique_counts = numpy.unique(options[rhs].to_numpy(), return_counts=True)
--> 379         acc += unique_counts.sum() - unique_counts.max()
    380         if acc > limit:
    381             return False

~/lib/python3.6/site-packages/numpy/core/_methods.py in _amax(a, axis, out, keepdims, initial, where)
     28 def _amax(a, axis=None, out=None, keepdims=False,
     29           initial=_NoValue, where=True):
---> 30     return umr_maximum(a, axis, None, out, keepdims, initial, where)
     31
     32 def _amin(a, axis=None, out=None, keepdims=False,

ValueError: zero-size array to reduction operation maximum which has no identity
thehomebrewnerd commented 4 years ago

@gsheni Can you test this out with the issue19 branch of this repo to see if that resolves your issue? I was having similar problems on a different dataset and made some changes to approximate_dependencies that might fix your issue as well.

gsheni commented 4 years ago

The branch does allow autonormalize to progress further (now 13/13), but with the above (attached) dataset I got the following error: KeyError: 'Variable: country not found in entity'

gsheni commented 4 years ago
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [24:53<00:00, 114.87s/it]
2020-04-02 10:18:24,380 featuretools.entityset - WARNING    index id not found in dataframe, creating new integer column
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-1-f1c5c82cc330> in <module>
      8                        index='id',
      9                        time_index='datetime',
---> 10                        accuracy=0.50)

~/autonormalize/autonormalize/autonormalize.py in auto_entityset(df, accuracy, index, name, time_index)
    133         entityset (ft.EntitySet) : created entity set
    134     """
--> 135     return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
    136
    137

~/autonormalize/autonormalize/autonormalize.py in make_entityset(df, dependencies, name, time_index)
    108             relationships.append((child.index[0], child.index[0], current.index[0], child.index[0]))
    109
--> 110     return ft.EntitySet(name, entities, relationships)
    111
    112

~/autonormalize/venv/lib/python3.6/site-packages/featuretools/entityset/entityset.py in __init__(self, id, entities, relationships)
     84         for relationship in relationships:
     85             parent_variable = self[relationship[0]][relationship[1]]
---> 86             child_variable = self[relationship[2]][relationship[3]]
     87             self.add_relationship(Relationship(parent_variable,
     88                                                child_variable))

~/autonormalize/venv/lib/python3.6/site-packages/featuretools/entityset/entity.py in __getitem__(self, variable_id)
    154
    155     def __getitem__(self, variable_id):
--> 156         return self._get_variable(variable_id)
    157
    158     def _get_variable(self, variable_id):

~/autonormalize/venv/lib/python3.6/site-packages/featuretools/entityset/entity.py in _get_variable(self, variable_id)
    172                 return v
    173
--> 174         raise KeyError("Variable: %s not found in entity" % (variable_id))
    175
    176     @property

KeyError: 'Variable: country not found in entity'
thehomebrewnerd commented 4 years ago

@gsheni That was also an error I was seeing that I thought was resolved in the issue19 branch. I'll take another look and run with the dataset you attached to see what is going on.

thehomebrewnerd commented 4 years ago

@gsheni That was also an error I was seeing that I thought was resolved in the issue19 branch. I'll take another look and run with the dataset you attached to see what is going on.

I actually got a different error when I ran. Will continue to investigate both.

Traceback (most recent call last):
  File "test_issue19.py", line 18, in <module>
    time_index='datetime')
  File "/Users/nate.parsons/dev/autonormalize/autonormalize/autonormalize.py", line 135, in auto_entityset
    return make_entityset(df, find_dependencies(df, accuracy, index), name, time_index)
  File "/Users/nate.parsons/dev/autonormalize/autonormalize/autonormalize.py", line 110, in make_entityset
    return ft.EntitySet(name, entities, relationships)
  File "/Users/nate.parsons/dev/autonormalize/env/lib/python3.7/site-packages/featuretools/entityset/entityset.py", line 82, in __init__
    variable_types=variable_types)
  File "/Users/nate.parsons/dev/autonormalize/env/lib/python3.7/site-packages/featuretools/entityset/entityset.py", line 480, in entity_from_dataframe
    raise ValueError("time_index and index cannot be the same value, %s" % (time_index))
ValueError: time_index and index cannot be the same value, datetime
thehomebrewnerd commented 4 years ago

I'm not sure what is going on, but the dependencies that are being found appear to be quite complex for this input dataframe. It will require a more in-depth investigation to really understand and diagnose this problem.

Current dependencies identified with accuracy at 0.98:

--> id
 {expiration_date,currency,lng}  {datetime}  {store_id,expiration_date,currency}  {expiration_date,currency,lat}  {id}  {store_id,currency,customer_present}  --> card_id
 {currency,region,provider}  {id}  {amount,expiration_date}  {country,card_id,customer_present}  {expiration_date,lat,provider}  {lng,card_id}  {amount,currency}  {country,fraud,card_id}  {currency,card_id}  {currency,lat,provider}  {region,card_id}  {lat,card_id}  {amount,card_id}  {expiration_date,currency,lng}  {country,amount}  {expiration_date,currency,lat}  {expiration_date,currency,region}  {lng,currency,provider}  {datetime}  {country,expiration_date,currency,customer_present}  {expiration_date,provider,lng}  {country,expiration_date,currency,provider}  {expiration_date,region,provider}  --> store_id
 {lng,card_id}  {country,amount}  {amount,currency}  {store_id,expiration_date,currency}  {id}  {region,card_id}  {lat,card_id}  {country,card_id,customer_present}  {store_id,currency,customer_present}  {store_id,card_id}  --> datetime
 {datetime}  {lng,card_id}  {store_id,expiration_date,currency}  {id}  {region,card_id}  {lat,card_id}  {country,card_id,customer_present}  {store_id,currency,customer_present}  {store_id,card_id}  --> amount
 {datetime}  {lng,card_id}  {country,amount}  {amount,lat}  {expiration_date,provider,lng}  {id}  --> currency
 {currency,region,provider}  {datetime}  {country,amount}  {store_id,expiration_date,fraud}  {id}  {currency,lat,provider}  {store_id,currency}  --> customer_present
 {currency,region,provider}  {datetime}  {country,amount}  {amount,currency}  {id}  {currency,lat,provider}  {store_id,currency,customer_present}  {card_id}  {lng,currency,provider}  --> expiration_date
 {expiration_date,currency,lng}  {datetime}  {amount,currency}  {store_id,expiration_date,currency}  {expiration_date,currency,lat}  {store_id,expiration_date,customer_present}  {expiration_date,currency,region}  {id}  {card_id}  {store_id,currency,customer_present}  --> provider
 {datetime}  {lng}  {country,amount}  {country,expiration_date,currency,customer_present}  {amount,currency}  {country,fraud,card_id}  {region}  {country,expiration_date,currency,provider}  {currency,card_id}  {store_id}  {id}  {amount,expiration_date}  {country,card_id,customer_present}  {amount,card_id}  --> lat
 {datetime}  {country,amount}  {amount,currency}  {country,expiration_date,currency,customer_present}  {country,fraud,card_id}  {region}  {lat}  {country,expiration_date,currency,provider}  {currency,card_id}  {store_id}  {id}  {amount,expiration_date}  {country,card_id,customer_present}  {amount,card_id}  --> lng
 {datetime}  {lng}  {country,amount}  {country,expiration_date,currency,customer_present}  {amount,currency}  {country,fraud,card_id}  {lat}  {country,expiration_date,currency,provider}  {currency,card_id}  {store_id}  {id}  {amount,expiration_date}  {country,card_id,customer_present}  {amount,card_id}  --> region
 {datetime}  {lng}  {amount,currency}  {region}  {amount,provider}  {lat}  {currency,card_id}  {store_id}  {id}  {amount,expiration_date}  {amount,card_id}  --> country
 {currency,lat,customer_present}  {datetime}  {expiration_date,lat,provider}  {store_id,expiration_date}  {expiration_date,lng,customer_present}  {expiration_date,currency,provider}  {expiration_date,region,customer_present}  {amount}  {currency,card_id}  {expiration_date,lat,customer_present}  {id}  {country,expiration_date,currency}  {currency,lat,provider}  {store_id,currency}  {country,card_id}  {currency,region,customer_present}  {lng,currency,customer_present}  --> fraud