Open Loquats opened 2 years ago
I volunteer to look into this and try fixing this issue. Is that okay?
@Sohaib90 yes, of course!
There is a known memory leak issue in numpy when the `auto` value is used
for the bin calculation. See: https://github.com/numpy/numpy/issues/10297
We should be able to handle this with a custom binning method.
Describe the bug: The profile report HTML cannot be generated for dataframes that contain very large integers. It seems to be an overflow issue. Would it be possible to fix this? Even if a report for this series cannot be generated, it would be great if pandas-profiling could at least handle it gracefully so that the code does not crash. For example, if one of the series in a dataframe encounters this issue, would it be possible to omit it from the report and only report on the other series?
To Reproduce
Full stack trace
``` IndexError: index 10 is out of bounds for axis 0 with size 9 --------------------------------------------------------------------------- IndexError Traceback (most recent call last) in
4 df = pd.DataFrame({"col": pd.Series([716277643516076032 + i for i in range(100)])})
5 df_profile = ProfileReport(df)
----> 6 profile_html = df_profile.to_html()
7 profile_html
/databricks/python/lib/python3.8/site-packages/pandas_profiling/profile_report.py in to_html(self)
366
367 """
--> 368 return self.html
369
370 def to_json(self) -> str:
/databricks/python/lib/python3.8/site-packages/pandas_profiling/profile_report.py in html(self)
183 def html(self) -> str:
184 if self._html is None:
--> 185 self._html = self._render_html()
186 return self._html
187
/databricks/python/lib/python3.8/site-packages/pandas_profiling/profile_report.py in _render_html(self)
285 from pandas_profiling.report.presentation.flavours import HTMLReport
286
--> 287 report = self.report
288
289 with tqdm(
/databricks/python/lib/python3.8/site-packages/pandas_profiling/profile_report.py in report(self)
177 def report(self) -> Root:
178 if self._report is None:
--> 179 self._report = get_report_structure(self.config, self.description_set)
180 return self._report
181
/databricks/python/lib/python3.8/site-packages/pandas_profiling/profile_report.py in description_set(self)
159 def description_set(self) -> Dict[str, Any]:
160 if self._description_set is None:
--> 161 self._description_set = describe_df(
162 self.config,
163 self.df,
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/describe.py in describe(config, df, summarizer, typeset, sample)
69 # Variable-specific
70 pbar.total += len(df.columns)
---> 71 series_description = get_series_descriptions(
72 config, df, summarizer, typeset, pbar
73 )
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
301 func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
302 try:
--> 303 return func(*args, **kwargs)
304 except TypeError as ex:
305 raise DispatchError(f"Function {func.__code__}") from ex
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/pandas/summary_pandas.py in pandas_get_series_descriptions(config, df, summarizer, typeset, pbar)
90 # TODO: use `Pool` for Linux-based systems
91 with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 92 for i, (column, description) in enumerate(
93 executor.imap_unordered(multiprocess_1d, args)
94 ):
/usr/lib/python3.8/multiprocessing/pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX
/usr/lib/python3.8/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/pandas/summary_pandas.py in multiprocess_1d(args)
70 """
71 column, series = args
---> 72 return column, describe_1d(config, series, summarizer, typeset)
73
74 pool_size = config.pool_size
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
301 func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
302 try:
--> 303 return func(*args, **kwargs)
304 except TypeError as ex:
305 raise DispatchError(f"Function {func.__code__}") from ex
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/pandas/summary_pandas.py in pandas_describe_1d(config, series, summarizer, typeset)
48 vtype = typeset.detect_type(series)
49
---> 50 return summarizer.summarize(config, series, dtype=vtype)
51
52
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/summarizer.py in summarize(self, config, series, dtype)
35 object:
36 """
---> 37 _, _, summary = self.handle(str(dtype), config, series, {"type": str(dtype)})
38 return summary
39
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/handler.py in handle(self, dtype, *args, **kwargs)
60 funcs = self.mapping.get(dtype, [])
61 op = compose(funcs)
---> 62 return op(*args)
63
64
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/handler.py in func2(*x)
19 return f(*x)
20 else:
---> 21 return f(*res)
22
23 return func2
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/handler.py in func2(*x)
19 return f(*x)
20 else:
---> 21 return f(*res)
22
23 return func2
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/handler.py in func2(*x)
19 return f(*x)
20 else:
---> 21 return f(*res)
22
23 return func2
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/handler.py in func2(*x)
15 def func(f: Callable, g: Callable) -> Callable:
16 def func2(*x) -> Any:
---> 17 res = g(*x)
18 if type(res) == bool:
19 return f(*x)
/databricks/python/lib/python3.8/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
301 func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
302 try:
--> 303 return func(*args, **kwargs)
304 except TypeError as ex:
305 raise DispatchError(f"Function {func.__code__}") from ex
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/summary_algorithms.py in inner(config, series, summary)
63 if not summary["hashable"]:
64 return config, series, summary
---> 65 return fn(config, series, summary)
66
67 return inner
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/summary_algorithms.py in inner(config, series, summary)
80 series = series.dropna()
81
---> 82 return fn(config, series, summary)
83
84 return inner
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/pandas/describe_numeric_pandas.py in pandas_describe_numeric_1d(config, series, summary)
113
114 if chi_squared_threshold > 0.0:
--> 115 stats["chi_squared"] = chi_square(finite_values)
116
117 stats["range"] = stats["max"] - stats["min"]
/databricks/python/lib/python3.8/site-packages/pandas_profiling/model/summary_algorithms.py in chi_square(values, histogram)
50 ) -> dict:
51 if histogram is None:
---> 52 histogram, _ = np.histogram(values, bins="auto")
53 return dict(chisquare(histogram)._asdict())
54
<__array_function__ internals> in histogram(*args, **kwargs)
/databricks/python/lib/python3.8/site-packages/numpy/lib/histograms.py in histogram(a, bins, range, normed, weights, density)
854 # The index computation is not guaranteed to give exactly
855 # consistent results within ~1 ULP of the bin edges.
--> 856 decrement = tmp_a < bin_edges[indices]
857 indices[decrement] -= 1
858 # The last bin includes the right edge. The other bins do not.
IndexError: index 10 is out of bounds for axis 0 with size 9
```
Version information:
Python version: 3.8.10 (default, Nov 26 2021, 20:14:08) Environment: jupyter notebook
Click to expand Version information
``` absl-py==0.11.0 aiohttp==3.8.1 aiosignal==1.2.0 appdirs==1.4.4 argon2-cffi==20.1.0 astor==0.8.1 astunparse==1.6.3 async-generator==1.10 async-timeout==4.0.2 attrs==20.3.0 backcall==0.2.0 bcrypt==3.2.0 bidict==0.21.4 bleach==3.3.0 blis==0.7.4 boto3==1.16.7 botocore==1.19.7 branca==0.4.2 cachetools==4.2.4 catalogue==2.0.6 certifi==2020.12.5 cffi==1.14.5 chardet==4.0.0 charset-normalizer==2.0.10 click==7.1.2 cloudpickle==1.6.0 cmdstanpy==0.9.68 configparser==5.0.1 convertdate==2.3.2 cryptography==3.4.7 cycler==0.10.0 cymem==2.0.5 Cython==0.29.23 databricks-automl-runtime==0.2.5 databricks-cli==0.16.2 dbl-tempo==0.1.2 dbus-python==1.2.16 decorator==5.0.6 defusedxml==0.7.1 dill==0.3.2 diskcache==5.2.1 distlib==0.3.4 distro==1.4.0 distro-info===0.23ubuntu1 eli5==0.11.0 entrypoints==0.3 ephem==4.1.3 facets-overview==1.0.0 fasttext==0.9.2 filelock==3.0.12 Flask==1.1.2 flatbuffers==2.0 folium==0.12.1.post1 frozenlist==1.3.0 fsspec==2022.1.0 future==0.18.2 gast==0.4.0 gitdb==4.0.7 GitPython==3.1.12 google-auth==1.22.1 google-auth-oauthlib==0.4.2 google-pasta==0.2.0 graphviz==0.19.1 grpcio==1.39.0 gunicorn==20.0.4 gviz-api==1.10.0 h5py==3.1.0 hijri-converter==2.2.2 holidays==0.12 horovod==0.23.0 htmlmin==0.1.12 huggingface-hub==0.1.2 idna==2.10 ImageHash==4.2.1 imbalanced-learn==0.8.1 importlib-metadata==3.10.0 ipykernel==5.3.4 ipython==7.22.0 ipython-genutils==0.2.0 ipywidgets==7.6.3 isodate==0.6.0 itsdangerous==1.1.0 jedi==0.17.2 Jinja2==2.11.3 jmespath==0.10.0 joblib==1.0.1 joblibspark==0.3.0 jsonschema==3.2.0 jupyter-client==6.1.12 jupyter-core==4.7.1 jupyterlab-pygments==0.1.2 jupyterlab-widgets==1.0.0 keras==2.7.0 Keras-Preprocessing==1.1.2 kiwisolver==1.3.1 koalas==1.8.2 korean-lunar-calendar==0.2.1 langcodes==3.3.0 libclang==12.0.0 lightgbm==3.3.1 llvmlite==0.38.0 LunarCalendar==0.0.9 Mako==1.1.3 Markdown==3.3.3 MarkupSafe==2.0.1 matplotlib==3.4.2 missingno==0.5.0 mistune==0.8.4 mleap==0.18.1 mlflow-skinny==1.23.0 multidict==6.0.2 multimethod==1.6 
murmurhash==1.0.5 nbclient==0.5.3 nbconvert==6.0.7 nbformat==5.1.3 nest-asyncio==1.5.1 networkx==2.5 nltk==3.6.1 notebook==6.3.0 numba==0.55.0 numpy==1.20.1 oauthlib==3.1.0 opt-einsum==3.3.0 packaging==21.3 pandas==1.2.4 pandas-profiling==3.1.0 pandocfilters==1.4.3 paramiko==2.7.2 parso==0.7.0 pathy==0.6.0 patsy==0.5.1 petastorm==0.11.3 pexpect==4.8.0 phik==0.12.0 pickleshare==0.7.5 Pillow==8.2.0 plotly==5.5.0 pmdarima==1.8.4 preshed==3.0.5 prometheus-client==0.10.1 prompt-toolkit==3.0.17 prophet==1.0.1 protobuf==3.17.2 psutil==5.8.0 psycopg2==2.8.5 ptyprocess==0.7.0 pyarrow==4.0.0 pyasn1==0.4.8 pyasn1-modules==0.2.8 pybind11==2.9.0 pycparser==2.20 pydantic==1.8.2 pyDeprecate==0.3.1 Pygments==2.8.1 PyGObject==3.36.0 PyMeeus==0.5.11 PyNaCl==1.4.0 pyodbc==4.0.30 pyparsing==2.4.7 pyrsistent==0.17.3 pystan==2.19.1.1 python-apt==2.0.0+ubuntu0.20.4.6 python-dateutil==2.8.1 python-editor==1.0.4 python-engineio==4.3.0 python-socketio==5.4.1 pytorch-lightning==1.5.9 pytz==2020.5 PyWavelets==1.1.1 PyYAML==5.4.1 pyzmq==20.0.0 regex==2021.4.4 requests==2.25.1 requests-oauthlib==1.3.0 requests-unixsocket==0.2.0 rsa==4.7.2 s3transfer==0.3.7 sacremoses==0.0.46 scikit-learn==0.24.1 scipy==1.6.2 seaborn==0.11.1 Send2Trash==1.5.0 setuptools-git==1.2 shap==0.40.0 simplejson==3.17.2 six==1.15.0 slicer==0.0.7 smart-open==5.2.0 smmap==3.0.5 spacy==3.2.1 spacy-legacy==3.0.8 spacy-loggers==1.0.1 spark-tensorflow-distributor==1.0.0 sqlparse==0.4.1 srsly==2.4.1 ssh-import-id==5.10 statsmodels==0.12.2 tabulate==0.8.7 tangled-up-in-unicode==0.1.0 tenacity==6.2.0 tensorboard==2.7.0 tensorboard-data-server==0.6.1 tensorboard-plugin-profile==2.5.0 tensorboard-plugin-wit==1.8.1 tensorflow-cpu==2.7.0 tensorflow-estimator==2.7.0 tensorflow-io-gcs-filesystem==0.23.1 termcolor==1.1.0 terminado==0.9.4 testpath==0.4.4 thinc==8.0.12 threadpoolctl==2.1.0 tokenizers==0.10.3 torch==1.10.1+cpu torchmetrics==0.7.0 torchvision==0.11.2+cpu tornado==6.1 tqdm==4.59.0 traitlets==5.0.5 transformers==4.15.0 
typer==0.3.2 typing-extensions==3.7.4.3 ujson==4.0.2 unattended-upgrades==0.1 urllib3==1.25.11 virtualenv==20.4.1 visions==0.7.4 wasabi==0.8.2 wcwidth==0.2.5 webencodings==0.5.1 websocket-client==0.57.0 Werkzeug==1.0.1 widgetsnbextension==3.5.1 wrapt==1.12.1 xgboost==1.5.1 yarl==1.7.2 zipp==3.4.1 ```
Additional context