CODAIT / text-extensions-for-pandas

Natural language processing support for Pandas dataframes.
Apache License 2.0
217 stars 34 forks source link

Error displaying TensorArray column of strings when > 50 rows #132

Closed frreiss closed 4 years ago

frreiss commented 4 years ago

If you try to display a DataFrame of more than 50 rows in a Jupyter notebook, and that DataFrame contains a TensorArray column of string values, the display code crashes while pandas is truncating the DataFrame to its first and last rows for output.

Code to reproduce (paste into a Jupyter notebook after importing `numpy as np`, `pandas as pd`, and `text_extensions_for_pandas as tp`):

pd.DataFrame({"foo": tp.TensorArray(np.array([["Hello", "world"]] * 100))})

Expected output: First and last few rows of a 100-row DataFrame.

Actual output: Crash with the following stack trace:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    392                         if cls is not object \
    393                                 and callable(cls.__dict__.get('__repr__')):
--> 394                             return _repr_pprint(obj, self, cycle)
    395 
    396             return _default_pprint(obj, self, cycle)

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    698     """A pprint that just redirects to the normal repr function."""
    699     # Find newlines and replace them with p.break_()
--> 700     output = repr(obj)
    701     lines = output.splitlines()
    702     with p.group():

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/frame.py in __repr__(self)
    749             line_width=width,
    750             max_colwidth=max_colwidth,
--> 751             show_dimensions=show_dimensions,
    752         )
    753 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, min_rows, max_cols, show_dimensions, decimal, line_width, max_colwidth, encoding)
    879                 show_dimensions=show_dimensions,
    880                 decimal=decimal,
--> 881                 line_width=line_width,
    882             )
    883             return formatter.to_string(buf=buf, encoding=encoding)

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/io/formats/format.py in __init__(self, frame, columns, col_space, header, index, na_rep, formatters, justify, float_format, sparsify, index_names, line_width, max_rows, min_rows, max_cols, show_dimensions, decimal, table_id, render_links, bold_rows, escape)
    628             self.columns = frame.columns
    629 
--> 630         self._chk_truncate()
    631         self.adj = _get_adjustment()
    632 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/io/formats/format.py in _chk_truncate(self)
    714             else:
    715                 row_num = max_rows_adj // 2
--> 716                 frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
    717             self.tr_row_num = row_num
    718         else:

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    285     )
    286 
--> 287     return op.get_result()
    288 
    289 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/reshape/concat.py in get_result(self)
    501 
    502             new_data = concatenate_block_managers(
--> 503                 mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy,
    504             )
    505             if not self.copy:

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
     62                 values = values.view()
     63             b = b.make_block_same_class(values, placement=placement)
---> 64         elif _is_uniform_join_units(join_units):
     65             blk = join_units[0].block
     66             vals = [ju.block.values for ju in join_units]

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in _is_uniform_join_units(join_units)
    478         # no blocks that would get missing values (can lead to type upcasts)
    479         # unless we're an extension dtype.
--> 480         all(not ju.is_na or ju.block.is_extension for ju in join_units)
    481         and
    482         # no blocks with indexers (as then the dimensions do not fit)

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in <genexpr>(.0)
    478         # no blocks that would get missing values (can lead to type upcasts)
    479         # unless we're an extension dtype.
--> 480         all(not ju.is_na or ju.block.is_extension for ju in join_units)
    481         and
    482         # no blocks with indexers (as then the dimensions do not fit)

pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in is_na(self)
    226         chunk_len = max(total_len // 40, 1000)
    227         for i in range(0, total_len, chunk_len):
--> 228             if not isna(values_flat[i : i + chunk_len]).all():
    229                 return False
    230 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/dtypes/missing.py in isna(obj)
    122     Name: 1, dtype: bool
    123     """
--> 124     return _isna(obj)
    125 
    126 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/dtypes/missing.py in _isna(obj, inf_as_na)
    155         return False
    156     elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)):
--> 157         return _isna_ndarraylike(obj, inf_as_na=inf_as_na)
    158     elif isinstance(obj, ABCDataFrame):
    159         return obj.isna()

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/dtypes/missing.py in _isna_ndarraylike(obj, inf_as_na)
    214             result = libmissing.isnaobj_old(values.to_numpy())
    215         else:
--> 216             result = values.isna()
    217     elif is_string_dtype(dtype):
    218         result = _isna_string_dtype(values, dtype, inf_as_na=inf_as_na)

~/pd/tep-website/text_extensions_for_pandas/array/tensor.py in isna(self)
    199         for information about this method.
    200         """
--> 201         return np.all(np.isnan(self._tensor), axis=-1)
    202 
    203     def copy(self) -> "TensorArray":

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
    343             method = get_real_method(obj, self.print_method)
    344             if method is not None:
--> 345                 return method()
    346             return None
    347         else:

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/frame.py in _repr_html_(self)
    794                 decimal=".",
    795                 table_id=None,
--> 796                 render_links=False,
    797             )
    798             return formatter.to_html(notebook=True)

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/io/formats/format.py in __init__(self, frame, columns, col_space, header, index, na_rep, formatters, justify, float_format, sparsify, index_names, line_width, max_rows, min_rows, max_cols, show_dimensions, decimal, table_id, render_links, bold_rows, escape)
    628             self.columns = frame.columns
    629 
--> 630         self._chk_truncate()
    631         self.adj = _get_adjustment()
    632 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/io/formats/format.py in _chk_truncate(self)
    714             else:
    715                 row_num = max_rows_adj // 2
--> 716                 frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
    717             self.tr_row_num = row_num
    718         else:

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    285     )
    286 
--> 287     return op.get_result()
    288 
    289 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/reshape/concat.py in get_result(self)
    501 
    502             new_data = concatenate_block_managers(
--> 503                 mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy,
    504             )
    505             if not self.copy:

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy)
     62                 values = values.view()
     63             b = b.make_block_same_class(values, placement=placement)
---> 64         elif _is_uniform_join_units(join_units):
     65             blk = join_units[0].block
     66             vals = [ju.block.values for ju in join_units]

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in _is_uniform_join_units(join_units)
    478         # no blocks that would get missing values (can lead to type upcasts)
    479         # unless we're an extension dtype.
--> 480         all(not ju.is_na or ju.block.is_extension for ju in join_units)
    481         and
    482         # no blocks with indexers (as then the dimensions do not fit)

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in <genexpr>(.0)
    478         # no blocks that would get missing values (can lead to type upcasts)
    479         # unless we're an extension dtype.
--> 480         all(not ju.is_na or ju.block.is_extension for ju in join_units)
    481         and
    482         # no blocks with indexers (as then the dimensions do not fit)

pandas/_libs/properties.pyx in pandas._libs.properties.CachedProperty.__get__()

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/internals/concat.py in is_na(self)
    226         chunk_len = max(total_len // 40, 1000)
    227         for i in range(0, total_len, chunk_len):
--> 228             if not isna(values_flat[i : i + chunk_len]).all():
    229                 return False
    230 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/dtypes/missing.py in isna(obj)
    122     Name: 1, dtype: bool
    123     """
--> 124     return _isna(obj)
    125 
    126 

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/dtypes/missing.py in _isna(obj, inf_as_na)
    155         return False
    156     elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass, ABCExtensionArray)):
--> 157         return _isna_ndarraylike(obj, inf_as_na=inf_as_na)
    158     elif isinstance(obj, ABCDataFrame):
    159         return obj.isna()

~/opt/miniconda3/envs/pd/lib/python3.7/site-packages/pandas/core/dtypes/missing.py in _isna_ndarraylike(obj, inf_as_na)
    214             result = libmissing.isnaobj_old(values.to_numpy())
    215         else:
--> 216             result = values.isna()
    217     elif is_string_dtype(dtype):
    218         result = _isna_string_dtype(values, dtype, inf_as_na=inf_as_na)

~/pd/tep-website/text_extensions_for_pandas/array/tensor.py in isna(self)
    199         for information about this method.
    200         """
--> 201         return np.all(np.isnan(self._tensor), axis=-1)
    202 
    203     def copy(self) -> "TensorArray":

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
BryanCutler commented 4 years ago

Good catch, @frreiss. TensorArray.isna() wasn't able to handle strings.