capitalone / DataProfiler

What's in your data? Extract schema, statistics and entities from datasets
https://capitalone.github.io/DataProfiler
Apache License 2.0
1.42k stars 158 forks source link

Ability to enforce data types. #480

Open ashwinassysh opened 2 years ago

ashwinassysh commented 2 years ago

I am trying to run DataProfiler on one of my datasets and I am getting the error message below.

OverflowError: (34, 'Numerical result out of range')

The column has mixed data types. Some of the values in the column are numeric, and DataProfiler is picking those out and running numerical analysis on them. These values are large and vary widely, so the moment calculation becomes very large and causes an error. I want to treat this column as a string.

This issue can be replicated by using a pandas Series with mixed data types, for example: '31241AAA9', 999999999, 123456789

Below is the complete stack trace.

OverflowError                             Traceback (most recent call last)
Input In [26], in <cell line: 1>()
----> 1 profile1 = Profiler(df,options=profile_options)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:2186, in Profiler.__new__(cls, data, samples_per_update, min_true_samples, options, profiler_type)
   2184 # Construct based off of initial kwarg input or inference
   2185 if profiler_type == "structured":
-> 2186     return StructuredProfiler(data, samples_per_update,
   2187                               min_true_samples, options)
   2188 elif profiler_type == "unstructured":
   2189     return UnstructuredProfiler(data, samples_per_update,
   2190                                 min_true_samples, options)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:1323, in StructuredProfiler.__init__(self, data, samples_per_update, min_true_samples, options)
   1320 self.chi2_matrix = None
   1322 if data is not None:
-> 1323     self.update_profile(data)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:771, in BaseProfiler.update_profile(self, data, sample_size, min_true_samples)
    768 if not sample_size:
    769     sample_size = self._get_sample_size(data)
--> 771 self._update_profile_from_chunk(data, sample_size, min_true_samples)
    773 # set file properties since data will be processed
    774 if encoding is not None:

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:2102, in StructuredProfiler._update_profile_from_chunk(self, data, sample_size, min_true_samples)
   2099 logger.info(notification_str)
   2101 for prof_idx in tqdm(clean_sampled_dict.keys()):
-> 2102     self._profile[prof_idx].update_column_profilers(
   2103         clean_sampled_dict[prof_idx], pool)
   2105 if pool is not None:
   2106     pool.close()  # Close pool for new tasks

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:130, in StructuredColProfiler.update_column_profilers(self, clean_sampled_df, pool)
    126 # First run, create the compilers
    127 if self.profiles is None or len(self.profiles) == 0:
    128     self.profiles = {
    129         'data_type_profile':
--> 130             ColumnPrimitiveTypeProfileCompiler(
    131                 clean_sampled_df, self.options, pool),
    132         'data_stats_profile':
    133             ColumnStatsProfileCompiler(
    134                 clean_sampled_df, self.options, pool)
    135     }
    137     use_data_labeler = True
    138     if self.options and isinstance(self.options, StructuredOptions):

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py:33, in BaseCompiler.__init__(self, df_series, options, pool)
     31 if df_series is not None:
     32     self.name = df_series.name
---> 33     self._create_profile(df_series, options, pool)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py:76, in BaseCompiler._create_profile(self, df_series, options, pool)
     73             utils.warn_on_profile(profiler.type, e)
     75 # Update profile after creation
---> 76 self.update_profile(df_series, pool)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py:176, in BaseCompiler.update_profile(self, df_series, pool)
    174         # Single process thread to loop through
    175 for profile_type in single_process_list:
--> 176     self._profiles[profile_type].update(df_series)
    177 return self

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/int_column_profile.py:134, in IntColumn.update(self, df_series)
    128 profile = dict(match_count=match_int_count, sample_size=sample_size)
    130 BaseColumnProfiler._perform_property_calcs(
    131     self, self.__calculations, df_series=df_series[is_each_row_int],
    132     prev_dependent_properties={}, subset_properties=profile)
--> 134 self._update_helper(
    135     df_series_clean=df_series[is_each_row_int],
    136     profile=profile
    137 )
    139 return self

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/int_column_profile.py:110, in IntColumn._update_helper(self, df_series_clean, profile)
     99 """
    100 Method for updating the column profile properties with a cleaned
    101 dataset and the known null parameters of the dataset.
   (...)
    107 :return: None
    108 """
    109 if self._NumericStatsMixin__calculations:
--> 110     NumericStatsMixin._update_helper(self, df_series_clean, profile)
    111 self._update_column_base_properties(profile)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/numerical_column_stats.py:1179, in NumericStatsMixin._update_helper(self, df_series_clean, profile)
   1177 subset_properties = copy.deepcopy(profile)
   1178 df_series_clean = df_series_clean.astype(float)
-> 1179 super(NumericStatsMixin, self)._perform_property_calcs(
   1180     self.__calculations,
   1181     df_series=df_series_clean,
   1182     prev_dependent_properties=prev_dependent_properties,
   1183     subset_properties=subset_properties)
   1184 if len(self._batch_history) == 5:
   1185     self._batch_history.pop(0)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/base_column_profilers.py:118, in BaseColumnProfiler._perform_property_calcs(self, calculations, df_series, prev_dependent_properties, subset_properties)
    102 """
    103 Cycles through the properties of the columns and calculate them.
    104 
   (...)
    115 :return: None
    116 """
    117 for prop in calculations:
--> 118     calculations[prop](self,
    119                        df_series,
    120                        prev_dependent_properties,
    121                        subset_properties)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/utils.py:608, in method_timeit.<locals>.decorator.<locals>.wrapper(self, *args, **kw)
    606     name_dec = method.__name__
    607 ts = time.time()
--> 608 result = method(self, *args, **kw)
    609 te = time.time()
    610 self.times[name_dec] += (te - ts)

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/numerical_column_stats.py:1264, in NumericStatsMixin._get_skewness(self, df_series, prev_dependent_properties, subset_properties)
   1260 if np.isinf(self._biased_skewness) or \
   1261         (np.isnan(self._biased_skewness) and self.match_count > 0):
   1262     return
-> 1264 batch_biased_skewness = utils.biased_skew(df_series)
   1265 subset_properties["biased_skewness"] = batch_biased_skewness
   1266 batch_count = subset_properties["match_count"]

File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/utils.py:321, in biased_skew(df_series)
    318 if (M2 == 0):
    319     return 0.0
--> 321 skew = np.sqrt(n) * M3 / M2 ** 1.5
    322 return skew

OverflowError: (34, 'Numerical result out of range')
JGSweets commented 2 years ago

PR for fix: https://github.com/capitalone/DataProfiler/pull/481

JGSweets commented 2 years ago

A separate feature should allow a user to set given columns to be profiled as specific types.

JGSweets commented 2 years ago

@taylorfturner we should spin the last comment into a new issue and close this one, thoughts?