KaveIO / PhiK

Phi_K correlation analyzer library
Other
156 stars 28 forks source link

ZeroDivisionError #20

Closed nateGeorge closed 3 years ago

nateGeorge commented 3 years ago

I thought I would put this up here in case anyone is interested in tracking down what's wrong. You can download the data from here: https://www.kaggle.com/wordsforthewise/loan-data-sample?select=loan_data.csv

Running this:

interval_columns = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'DATE_OF_BIRTH', 'DISBURSAL_DATE', 'PERFORM_CNS_SCORE',
                   'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'NO_OF_INQUIRIES']
df.phik_matrix(interval_cols=interval_columns)

Results in the error:

---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\joblib\externals\loky\process_executor.py", line 431, in _process_worker
    r = call_item()
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\joblib\externals\loky\process_executor.py", line 285, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\joblib\parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\phik\phik.py", line 180, in _calc_phik
    phikvalue = phik_from_hist2d(datahist.values, noise_correction=noise_correction)
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\phik\phik.py", line 99, in phik_from_hist2d
    return phik_from_chi2(chi2, observed.sum(), *observed.shape, pedestal=pedestal)
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\phik\bivariate.py", line 204, in phik_from_chi2
    chi2_one = n * sum([((c1-c0)*(c1-c0)) / c0 for c0,c1 in zip(corr0,corr1)])
  File "C:\Users\words\Anaconda3\envs\datasci\lib\site-packages\phik\bivariate.py", line 204, in <listcomp>
    chi2_one = n * sum([((c1-c0)*(c1-c0)) / c0 for c0,c1 in zip(corr0,corr1)])
ZeroDivisionError: float division by zero
"""

The above exception was the direct cause of the following exception:

ZeroDivisionError                         Traceback (most recent call last)
<ipython-input-77-d2314d5ac2f2> in <module>
      1 interval_columns = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'DATE_OF_BIRTH', 'DISBURSAL_DATE', 'PERFORM_CNS_SCORE',
      2                    'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'NO_OF_INQUIRIES']
----> 3 sample.phik_matrix(interval_cols=interval_columns)

~\Anaconda3\envs\datasci\lib\site-packages\phik\phik.py in phik_matrix(df, interval_cols, bins, quantile, noise_correction, dropna, drop_underflow, drop_overflow)
    214     data_binned, binning_dict = bin_data(df_clean, cols=interval_cols_clean, bins=bins, quantile=quantile, retbins=True)
    215 
--> 216     return phik_from_rebinned_df(
    217         data_binned, noise_correction, dropna=dropna, drop_underflow=drop_underflow, drop_overflow=drop_overflow
    218     )

~\Anaconda3\envs\datasci\lib\site-packages\phik\phik.py in phik_from_rebinned_df(data_binned, noise_correction, dropna, drop_underflow, drop_overflow)
    141         ]
    142     else:
--> 143         phik_list = Parallel(n_jobs=NCORES)(
    144             delayed(_calc_phik)(co, data_binned[list(co)], noise_correction)
    145             for co in itertools.combinations_with_replacement(data_binned.columns.values, 2)

~\Anaconda3\envs\datasci\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1052 
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

~\Anaconda3\envs\datasci\lib\site-packages\joblib\parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

~\Anaconda3\envs\datasci\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

~\Anaconda3\envs\datasci\lib\concurrent\futures\_base.py in result(self, timeout)
    437                 raise CancelledError()
    438             elif self._state == FINISHED:
--> 439                 return self.__get_result()
    440             else:
    441                 raise TimeoutError()

~\Anaconda3\envs\datasci\lib\concurrent\futures\_base.py in __get_result(self)
    386     def __get_result(self):
    387         if self._exception:
--> 388             raise self._exception
    389         else:
    390             return self._result

ZeroDivisionError: float division by zero

My next steps for debugging would be to insert ipdb in the phik code just before it throws an error, and to try dropping columns one-by-one to see which one is causing the problem. Maybe the package author(s) have a hunch as to what's going on.

mbaak commented 3 years ago

Thanks for posting, I will have a look.

mbaak commented 3 years ago

@nateGeorge The link to the dataset is broken for me. Is it correct?

nateGeorge commented 3 years ago

Sorry, the dataset was private. It is public now. By the way, I was able to calculate phik between the variables like so:

df['LOAN_DEFAULT'] = df['LOAN_DEFAULT'].astype('category')

interval_columns = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'DATE_OF_BIRTH', 'DISBURSAL_DATE', 'PERFORM_CNS_SCORE',
                   'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'NO_OF_INQUIRIES']

for c in df.columns:
    print(c)
    if c in interval_columns:
        print(phik.phik_from_array(df[c], df['LOAN_DEFAULT'], [c]))
    else:
        print(phik.phik_from_array(df[c].astype('category'), df['LOAN_DEFAULT'], drop_underflow=False, drop_overflow=False))

Without drop_underflow=False and drop_overflow=False in there, it was throwing an error. It's not clear to me what the under/overflow dropping is doing, and it says in the docs it's only applicable for numeric variables.

It takes a very long time to calculate the full matrix due to the number of columns, so I started doing it one-by-one (since we mostly care about the correlation with the target variable here, LOAN_DEFAULT).

I have not tried running the full df.phik_matrix(interval_cols=interval_columns) after converting the non-interval columns to category type and converting LOAN_DEFAULT to category type. I have also not tried setting the drop overflow and underflow options for calculation of the full matrix.

nateGeorge commented 3 years ago

By the way, if you want the full dataset, it is here: https://www.kaggle.com/avikpaul4u/vehicle-loan-default-prediction

The dataset I shared is a sample with some columns removed.

nateGeorge commented 3 years ago

I also tried converting things to categorical (including the LOAN_DEFAULT column), but got the same ZeroDivisionError:

interval_columns = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'DATE_OF_BIRTH', 'DISBURSAL_DATE', 'PERFORM_CNS_SCORE',
                   'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'NO_OF_INQUIRIES']

for c in df.columns:
    if c not in interval_columns:
        df[c] = df[c].astype('category')

df['LOAN_DEFAULT'] = df['LOAN_DEFAULT'].astype('category')

sample.phik_matrix(interval_cols=interval_columns)

When I disable the dropping of overflow and underflow, it still has the same error:

sample.phik_matrix(interval_cols=interval_columns, drop_underflow=False, drop_overflow=False).

mbaak commented 3 years ago

Thanks again for reporting this. This issue should be solved in phik version 0.11.2. Can you give it a try and let me know in case it fails? pip install phik==0.11.2

The problem boiled down to calculating phik for two categorical variables with 1000+ unique values each.

phik needs to evaluate the integral of a (rotated) 2d normal distribution, in nx * ny bins. This integral can become zero for certain cells when there are too many bins. I've now included a protection against division by zero. So it's working now.

The calculation is expensive though. E.g. data[['CURRENT_PINCODE_ID','SUPPLIER_ID']].phik_matrix(interval_cols=[]) with 6+ million cells takes about a minute on my laptop.

Btw, the drop_underflow=True setting for two arrays was a separate issue that I have fixed as well.

nateGeorge commented 3 years ago

Yes, it is working now, thanks much. I installed via conda which had a slightly older version (0.10 something). It works with latest from pip, although it looks like you just implemented the fix.

Setting drop_underflow=False and drop_overflow=False in the array-by-array calculations is still needed. Without those settings, it is throwing an error. I can open up a separate issue if you want.

mbaak commented 3 years ago

@nateGeorge For me phik.phik_from_array() works both with drop_underflow=False, drop_overflow=False and drop_underflow=True, drop_overflow=True. Can you give an example where it fails? Thanks.

nateGeorge commented 3 years ago

It seems to break on any categorical column. But what I realized looking closer is that it's in the phik.significance.significance_from_array() function. The phik.phik_from_array() actually does seem to work with True/False for drop over/underflow.

Here is my code, starting with the full dataset:

import pandas as pd
import phik

loan_df = pd.read_csv('data/loan_data.csv', parse_dates=[
                      'DATE_OF_BIRTH', 'DISBURSAL_DATE'], infer_datetime_format=True)
loan_df = loan_df.sample(10000, random_state=42)

loan_df.drop('UNIQUEID', axis=1, inplace=True)

loan_df_epoch_time = loan_df.copy()
loan_df_epoch_time['DATE_OF_BIRTH'] = (loan_df_epoch_time['DATE_OF_BIRTH'] - pd.to_datetime('1-1-1970')).dt.total_seconds()
loan_df_epoch_time['DISBURSAL_DATE'] = (loan_df_epoch_time['DISBURSAL_DATE'] - pd.to_datetime('1-1-1970')).dt.total_seconds()

interval_columns = ['DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'DATE_OF_BIRTH', 'DISBURSAL_DATE', 'PERFORM_CNS_SCORE',
                   'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'NO_OF_INQUIRIES']

for c in loan_df_epoch_time.columns:
    if c not in interval_columns:
        loan_df_epoch_time[c] = loan_df_epoch_time[c].astype('category')

phik_correlations = []
phik_significances = []
columns = loan_df_epoch_time.columns
y = loan_df_epoch_time['LOAN_DEFAULT']

for c in columns:
    print(c)
    x = loan_df_epoch_time[c]
    if c in interval_columns:
        phik_correlations.append(phik.phik_from_array(x, y, [c]))
        phik_significances.append(phik.significance.significance_from_array(x, y, [c])[0])
    else:
        phik_correlations.append(phik.phik_from_array(x, y))
        phik_significances.append(phik.significance.significance_from_array(x, y)[0])

I think the default for drop over/underflow is True, but I also tried setting it as true and it gave the same error. When drop over/underflow in the else block is False, it works. Here is the error. The first column it hits and errors on is BRANCH_ID, but I think it would error with any categorical column.

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~\Anaconda3\envs\datasci\lib\site-packages\pandas\core\series.py in __setitem__(self, key, value)
   1000         try:
-> 1001             self._set_with_engine(key, value)
   1002         except (KeyError, ValueError):

~\Anaconda3\envs\datasci\lib\site-packages\pandas\core\series.py in _set_with_engine(self, key, value)
   1033         # fails with AttributeError for IntervalIndex
-> 1034         loc = self.index._engine.get_loc(key)
   1035         validate_numeric_casting(self.dtype, value)

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(array([], dtype=int64),)' is an invalid key

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-12-3620e6f38caa> in <module>
     12     else:
     13         phik_correlations.append(phik.phik_from_array(x, y, drop_overflow=True, drop_underflow=True))
---> 14         phik_significances.append(phik.significance.significance_from_array(x, y)[0])

~\Anaconda3\envs\datasci\lib\site-packages\phik\significance.py in significance_from_array(x, y, num_vars, bins, quantile, lambda_, nsim, significance_method, simulation_method, dropna, drop_underflow, drop_overflow)
    376         x, y = bin_data(df, num_vars, bins=bins, quantile=quantile).T.values
    377 
--> 378     return significance_from_binned_array(
    379         x, y, lambda_=lambda_, significance_method=significance_method, nsim=nsim,
    380         simulation_method=simulation_method, dropna=dropna, drop_underflow=drop_underflow, drop_overflow=drop_overflow

~\Anaconda3\envs\datasci\lib\site-packages\phik\significance.py in significance_from_binned_array(x, y, lambda_, significance_method, nsim, simulation_method, dropna, drop_underflow, drop_overflow)
    414         y = y.copy()
    415         if drop_underflow:
--> 416             x[np.where(x == defs.UF)] = np.nan
    417             y[np.where(y == defs.UF)] = np.nan
    418         if drop_overflow:

~\Anaconda3\envs\datasci\lib\site-packages\pandas\core\series.py in __setitem__(self, key, value)
   1011         except TypeError as err:
   1012             if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
-> 1013                 raise KeyError(
   1014                     "key of type tuple not found and not a MultiIndex"
   1015                 ) from err

KeyError: 'key of type tuple not found and not a MultiIndex'
mbaak commented 3 years ago

Ah right, I see it. Same bug but in a different place. (I should have spotted that.)