I am trying to run DataProfiler on one of my datasets, and I am getting the error message below.
OverflowError: (34, 'Numerical result out of range')
The column has mixed data types. Some of the values in the column are numeric, and the profiler picks those values out and performs numerical analysis on them. These values are large and highly variable, so the computed moments become very large and cause an overflow error. I want to treat this column as a string column.
This issue can be reproduced by using a pandas Series with mixed data types, for example: '31241AAA9', 999999999, 123456789
Below is the complete stack trace.
OverflowError Traceback (most recent call last)
Input In [26], in <cell line: 1>()
----> 1 profile1 = Profiler(df,options=profile_options)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:2186, in Profiler.__new__(cls, data, samples_per_update, min_true_samples, options, profiler_type)
2184 # Construct based off of initial kwarg input or inference
2185 if profiler_type == "structured":
-> 2186 return StructuredProfiler(data, samples_per_update,
2187 min_true_samples, options)
2188 elif profiler_type == "unstructured":
2189 return UnstructuredProfiler(data, samples_per_update,
2190 min_true_samples, options)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:1323, in StructuredProfiler.__init__(self, data, samples_per_update, min_true_samples, options)
1320 self.chi2_matrix = None
1322 if data is not None:
-> 1323 self.update_profile(data)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:771, in BaseProfiler.update_profile(self, data, sample_size, min_true_samples)
768 if not sample_size:
769 sample_size = self._get_sample_size(data)
--> 771 self._update_profile_from_chunk(data, sample_size, min_true_samples)
773 # set file properties since data will be processed
774 if encoding is not None:
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:2102, in StructuredProfiler._update_profile_from_chunk(self, data, sample_size, min_true_samples)
2099 logger.info(notification_str)
2101 for prof_idx in tqdm(clean_sampled_dict.keys()):
-> 2102 self._profile[prof_idx].update_column_profilers(
2103 clean_sampled_dict[prof_idx], pool)
2105 if pool is not None:
2106 pool.close() # Close pool for new tasks
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/profile_builder.py:130, in StructuredColProfiler.update_column_profilers(self, clean_sampled_df, pool)
126 # First run, create the compilers
127 if self.profiles is None or len(self.profiles) == 0:
128 self.profiles = {
129 'data_type_profile':
--> 130 ColumnPrimitiveTypeProfileCompiler(
131 clean_sampled_df, self.options, pool),
132 'data_stats_profile':
133 ColumnStatsProfileCompiler(
134 clean_sampled_df, self.options, pool)
135 }
137 use_data_labeler = True
138 if self.options and isinstance(self.options, StructuredOptions):
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py:33, in BaseCompiler.__init__(self, df_series, options, pool)
31 if df_series is not None:
32 self.name = df_series.name
---> 33 self._create_profile(df_series, options, pool)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py:76, in BaseCompiler._create_profile(self, df_series, options, pool)
73 utils.warn_on_profile(profiler.type, e)
75 # Update profile after creation
---> 76 self.update_profile(df_series, pool)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/column_profile_compilers.py:176, in BaseCompiler.update_profile(self, df_series, pool)
174 # Single process thread to loop through
175 for profile_type in single_process_list:
--> 176 self._profiles[profile_type].update(df_series)
177 return self
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/int_column_profile.py:134, in IntColumn.update(self, df_series)
128 profile = dict(match_count=match_int_count, sample_size=sample_size)
130 BaseColumnProfiler._perform_property_calcs(
131 self, self.__calculations, df_series=df_series[is_each_row_int],
132 prev_dependent_properties={}, subset_properties=profile)
--> 134 self._update_helper(
135 df_series_clean=df_series[is_each_row_int],
136 profile=profile
137 )
139 return self
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/int_column_profile.py:110, in IntColumn._update_helper(self, df_series_clean, profile)
99 """
100 Method for updating the column profile properties with a cleaned
101 dataset and the known null parameters of the dataset.
(...)
107 :return: None
108 """
109 if self._NumericStatsMixin__calculations:
--> 110 NumericStatsMixin._update_helper(self, df_series_clean, profile)
111 self._update_column_base_properties(profile)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/numerical_column_stats.py:1179, in NumericStatsMixin._update_helper(self, df_series_clean, profile)
1177 subset_properties = copy.deepcopy(profile)
1178 df_series_clean = df_series_clean.astype(float)
-> 1179 super(NumericStatsMixin, self)._perform_property_calcs(
1180 self.__calculations,
1181 df_series=df_series_clean,
1182 prev_dependent_properties=prev_dependent_properties,
1183 subset_properties=subset_properties)
1184 if len(self._batch_history) == 5:
1185 self._batch_history.pop(0)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/base_column_profilers.py:118, in BaseColumnProfiler._perform_property_calcs(self, calculations, df_series, prev_dependent_properties, subset_properties)
102 """
103 Cycles through the properties of the columns and calculate them.
104
(...)
115 :return: None
116 """
117 for prop in calculations:
--> 118 calculations[prop](self,
119 df_series,
120 prev_dependent_properties,
121 subset_properties)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/utils.py:608, in method_timeit.<locals>.decorator.<locals>.wrapper(self, *args, **kw)
606 name_dec = method.__name__
607 ts = time.time()
--> 608 result = method(self, *args, **kw)
609 te = time.time()
610 self.times[name_dec] += (te - ts)
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/numerical_column_stats.py:1264, in NumericStatsMixin._get_skewness(self, df_series, prev_dependent_properties, subset_properties)
1260 if np.isinf(self._biased_skewness) or \
1261 (np.isnan(self._biased_skewness) and self.match_count > 0):
1262 return
-> 1264 batch_biased_skewness = utils.biased_skew(df_series)
1265 subset_properties["biased_skewness"] = batch_biased_skewness
1266 batch_count = subset_properties["match_count"]
File ~/.conda/envs/weekly_download/lib/python3.8/site-packages/dataprofiler/profilers/utils.py:321, in biased_skew(df_series)
318 if (M2 == 0):
319 return 0.0
--> 321 skew = np.sqrt(n) * M3 / M2 ** 1.5
322 return skew
OverflowError: (34, 'Numerical result out of range')
I am trying to run DataProfiler on one of my datasets, and I am getting the error message below.
OverflowError: (34, 'Numerical result out of range')
The column has mixed data types. Some of the values in the column are numeric, and the profiler picks those values out and performs numerical analysis on them. These values are large and highly variable, so the computed moments become very large and cause an overflow error. I want to treat this column as a string column.
This issue can be reproduced by using a pandas Series with mixed data types, for example: '31241AAA9', 999999999, 123456789
Below is the complete stack trace.