mongodb-labs / mongo-arrow

MongoDB integrations for Apache Arrow. Export MongoDB documents to numpy array, parquet files, and pandas dataframes in one line of code.
https://mongo-arrow.readthedocs.io
Apache License 2.0
92 stars 14 forks source link

Handling ObjectID after to_pandas #242

Open frbelotto opened 1 month ago

frbelotto commented 1 month ago

Hello guys, I am importing a big dataset from mongo:

pd_confirmacao_conversao = find_arrow_all(pd_confirmacao_conversao, {'estadoContabilizacaoEvento': { '$lt': 100}})

After that, I've just exported it to a pandas dataframe:

pd_confirmacao_conversao = pd_confirmacao_conversao.to_pandas()

My issue is that my original dataframe contains two columns that contain ObjectIds ('_id' and 'referenciaConversao'). Because of that, when I try to run df.info(), it crashes!

pd_confirmacao_conversao.info()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 pd_confirmacao_conversao.info()

File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3659, in DataFrame.info(self, verbose, buf, max_cols, memory_usage, show_counts)
   3646 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
   3647 def info(
   3648     self,
   (...)
   3653     show_counts: bool | None = None,
   3654 ) -> None:
   3655     info = DataFrameInfo(
   3656         data=self,
   3657         memory_usage=memory_usage,
   3658     )
-> 3659     info.render(
   3660         buf=buf,
   3661         max_cols=max_cols,
   3662         verbose=verbose,
   3663         show_counts=show_counts,
   3664     )

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:512, in DataFrameInfo.render(self, buf, max_cols, verbose, show_counts)
    498 def render(
    499     self,
    500     *,
   (...)
    504     show_counts: bool | None,
    505 ) -> None:
    506     printer = _DataFrameInfoPrinter(
    507         info=self,
    508         max_cols=max_cols,
    509         verbose=verbose,
    510         show_counts=show_counts,
    511     )
--> 512     printer.to_buffer(buf)

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:583, in _InfoPrinterAbstract.to_buffer(self, buf)
    581 """Save dataframe info into buffer."""
    582 table_builder = self._create_table_builder()
--> 583 lines = table_builder.get_lines()
    584 if buf is None:  # pragma: no cover
    585     buf = sys.stdout

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:790, in _DataFrameTableBuilder.get_lines(self)
    788     self._fill_empty_info()
    789 else:
--> 790     self._fill_non_empty_info()
    791 return self._lines

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:960, in _DataFrameTableBuilderVerbose._fill_non_empty_info(self)
    958 self.add_dtypes_line()
    959 if self.display_memory_usage:
--> 960     self.add_memory_usage_line()

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:820, in _DataFrameTableBuilder.add_memory_usage_line(self)
    818 def add_memory_usage_line(self) -> None:
    819     """Add line containing memory usage."""
--> 820     self._lines.append(f"memory usage: {self.memory_usage_string}")

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:750, in _TableBuilderAbstract.memory_usage_string(self)
    747 @property
    748 def memory_usage_string(self) -> str:
    749     """Memory usage string with proper size qualifier."""
--> 750     return self.info.memory_usage_string

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:413, in _BaseInfo.memory_usage_string(self)
    410 @property
    411 def memory_usage_string(self) -> str:
    412     """Memory usage in a form of human readable string."""
--> 413     return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"

File /projeto/libs/lib/python3.11/site-packages/pandas/io/formats/info.py:496, in DataFrameInfo.memory_usage_bytes(self)
    493 @property
    494 def memory_usage_bytes(self) -> int:
    495     deep = self.memory_usage == "deep"
--> 496     return self.data.memory_usage(index=True, deep=deep).sum()

File /projeto/libs/lib/python3.11/site-packages/pandas/core/frame.py:3755, in DataFrame.memory_usage(self, index, deep)
   3666 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
   3667     """
   3668     Return the memory usage of each column in bytes.
   3669 
   (...)
   3753     5244
   3754     """
-> 3755     result = self._constructor_sliced(
   3756         [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
   3757         index=self.columns,
   3758         dtype=np.intp,
   3759     )
   3760     if index:
   3761         index_memory_usage = self._constructor_sliced(
   3762             self.index.memory_usage(deep=deep), index=["Index"]
   3763         )

File /projeto/libs/lib/python3.11/site-packages/pandas/core/series.py:584, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    582         data = data.copy()
    583 else:
--> 584     data = sanitize_array(data, index, dtype, copy)
    586     manager = _get_option("mode.data_manager", silent=True)
    587     if manager == "block":

File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:651, in sanitize_array(data, index, dtype, copy, allow_2d)
    648     subarr = np.array([], dtype=np.float64)
    650 elif dtype is not None:
--> 651     subarr = _try_cast(data, dtype, copy)
    653 else:
    654     subarr = maybe_convert_platform(data)

File /projeto/libs/lib/python3.11/site-packages/pandas/core/construction.py:818, in _try_cast(arr, dtype, copy)
    813 # GH#15832: Check if we are requesting a numeric dtype and
    814 # that we can convert the data to the requested dtype.
    815 elif dtype.kind in "iu":
    816     # this will raise if we have e.g. floats
--> 818     subarr = maybe_cast_to_integer_array(arr, dtype)
    819 elif not copy:
    820     subarr = np.asarray(arr, dtype=dtype)

File /projeto/libs/lib/python3.11/site-packages/pandas/core/dtypes/cast.py:1657, in maybe_cast_to_integer_array(arr, dtype)
   1650         if not np_version_gt2:
   1651             warnings.filterwarnings(
   1652                 "ignore",
   1653                 "NumPy will stop allowing conversion of "
   1654                 "out-of-bound Python int",
   1655                 DeprecationWarning,
   1656             )
-> 1657         casted = np.asarray(arr, dtype=dtype)
   1658 else:
   1659     with warnings.catch_warnings():

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'method'

I can fix it by converting the dtype to strings, but I want to understand what the expected behavior should be without converting it.

Thanks in advance!

pandas                            2.2.2
pyarrow                          17.0.0
pymongo                        4.8.0
pymongoarrow               1.5.1
aclark4life commented 1 month ago

Thank you for the question! Tracking in https://jira.mongodb.org/browse/ARROW-256