modin-project / modin

Modin: Scale your Pandas workflows by changing a single line of code
http://modin.readthedocs.io
Apache License 2.0
9.76k stars 651 forks source link

read_csv raises an error "TypeError: Cannot interpret 'nan' as a data type" #6165

Open tanliwei-coder opened 1 year ago

tanliwei-coder commented 1 year ago

When I use Modin with the Ray engine to read a CSV file with `read_csv`, it raises `TypeError: Cannot interpret 'nan' as a data type`, but the same call works fine in pandas. Can anybody tell me why?

> ---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 data1 = pd1.read_csv(file_path, sep="\t", comment="#", header=0)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/logging/logger_decorator.py:128, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
    113 """
    114 Compute function with logging if Modin logging is enabled.
    115 
   (...)
    125 Any
    126 """
    127 if LogMode.get() == "disable":
--> 128     return obj(*args, **kwargs)
    130 logger = get_logger()
    131 logger_level = getattr(logger, log_level)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/pandas/io.py:215, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    213 f_locals.pop("mangle_dupe_cols", None)
    214 kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature}
--> 215 return _read(**kwargs)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/pandas/io.py:87, in _read(**kwargs)
     84 from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
     86 squeeze = kwargs.pop("squeeze", False)
---> 87 pd_obj = FactoryDispatcher.read_csv(**kwargs)
     88 # This happens when `read_csv` returns a TextFileReader object for iterating through
     89 if isinstance(pd_obj, TextFileReader):

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/execution/dispatching/factories/dispatcher.py:188, in FactoryDispatcher.read_csv(cls, **kwargs)
    185 @classmethod
    186 @_inherit_docstrings(factories.BaseFactory._read_csv)
    187 def read_csv(cls, **kwargs):
--> 188     return cls.get_factory()._read_csv(**kwargs)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/execution/dispatching/factories/factories.py:223, in BaseFactory._read_csv(cls, **kwargs)
    215 @classmethod
    216 @doc(
    217     _doc_io_method_template,
   (...)
    221 )
    222 def _read_csv(cls, **kwargs):
--> 223     return cls.io_cls.read_csv(**kwargs)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/logging/logger_decorator.py:128, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
    113 """
    114 Compute function with logging if Modin logging is enabled.
    115 
   (...)
    125 Any
    126 """
    127 if LogMode.get() == "disable":
--> 128     return obj(*args, **kwargs)
    130 logger = get_logger()
    131 logger_level = getattr(logger, log_level)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/io/file_dispatcher.py:159, in FileDispatcher.read(cls, *args, **kwargs)
    157 query_compiler = cls._read(*args, **kwargs)
    158 # TextFileReader can also be returned from `_read`.
--> 159 if not AsyncReadMode.get() and hasattr(query_compiler, "dtypes"):
    160     # at the moment it is not possible to use `wait_partitions` function;
    161     # in a situation where the reading function is called in a row with the
    162     # same parameters, `wait_partitions` considers that we have waited for
    163     # the end of remote calculations, however, when trying to materialize the
    164     # received data, it is clear that the calculations have not yet ended.
    165     # for example, `test_io_exp.py::test_read_evaluated_dict` is failed because of that.
    166     # see #5944 for details
    167     _ = query_compiler.dtypes
    168 return query_compiler

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/storage_formats/pandas/query_compiler.py:301, in PandasQueryCompiler.dtypes(self)
    299 @property
    300 def dtypes(self):
--> 301     return self._modin_frame.dtypes

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/dataframe/pandas/dataframe/dataframe.py:262, in PandasDataframe.dtypes(self)
    253 """
    254 Compute the data types if they are not cached.
    255 
   (...)
    259     A pandas Series containing the data types for this dataframe.
    260 """
    261 if self.has_dtypes_cache:
--> 262     dtypes = self._dtypes.get()
    263 else:
    264     dtypes = self._compute_dtypes()

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/dataframe/pandas/metadata/dtypes.py:54, in ModinDtypes.get(self)
     52 if not self.is_materialized:
     53     if callable(self._value):
---> 54         self._value = self._value()
     55         if self._value is None:
     56             self._value = pandas.Series([])

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/io/text/text_file_dispatcher.py:937, in TextFileDispatcher._get_new_qc.<locals>.<lambda>()
    895 """
    896 Get new query compiler from data received from workers.
    897 
   (...)
    925     New query compiler, created from `new_frame`.
    926 """
    927 partition_ids = cls.build_partition(
    928     partition_ids, [None] * len(index_ids), column_widths
    929 )
    931 new_frame = cls.frame_cls(
    932     partition_ids,
    933     lambda: cls._define_index(index_ids, index_name),
    934     column_names,
    935     None,
    936     column_widths,
--> 937     dtypes=lambda: cls.get_dtypes(dtypes_ids, column_names),
    938 )
    939 new_query_compiler = cls.query_compiler_cls(new_frame)
    940 skipfooter = kwargs.get("skipfooter", None)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/logging/logger_decorator.py:128, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
    113 """
    114 Compute function with logging if Modin logging is enabled.
    115 
   (...)
    125 Any
    126 """
    127 if LogMode.get() == "disable":
--> 128     return obj(*args, **kwargs)
    130 logger = get_logger()
    131 logger_level = getattr(logger, log_level)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/storage_formats/pandas/parsers.py:261, in PandasParser.get_dtypes(cls, dtypes_ids, columns)
    252     ErrorMessage.missmatch_with_pandas(
    253         operation="read_*",
    254         message="Data types of partitions are different! "
    255         + "Please refer to the troubleshooting section of the Modin documentation "
    256         + "to fix this issue",
    257     )
    259     # concat all elements of `partitions_dtypes` and find common dtype
    260     # for each of the column among all partitions
--> 261     frame_dtypes = combined_part_dtypes.apply(
    262         lambda row: find_common_type_cat(row.values),
    263         axis=1,
    264     ).squeeze(axis=0)
    266 # Set the index for the dtypes to the column names
    267 if isinstance(frame_dtypes, pandas.Series):

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/pandas/core/frame.py:9568, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
   9557 from pandas.core.apply import frame_apply
   9559 op = frame_apply(
   9560     self,
   9561     func=func,
   (...)
   9566     kwargs=kwargs,
   9567 )
-> 9568 return op.apply().__finalize__(self, method="apply")

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/pandas/core/apply.py:764, in FrameApply.apply(self)
    761 elif self.raw:
    762     return self.apply_raw()
--> 764 return self.apply_standard()

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/pandas/core/apply.py:891, in FrameApply.apply_standard(self)
    890 def apply_standard(self):
--> 891     results, res_index = self.apply_series_generator()
    893     # wrap results
    894     return self.wrap_results(results, res_index)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/pandas/core/apply.py:907, in FrameApply.apply_series_generator(self)
    904 with option_context("mode.chained_assignment", None):
    905     for i, v in enumerate(series_gen):
    906         # ignore SettingWithCopy here in case the user mutates
--> 907         results[i] = self.f(v)
    908         if isinstance(results[i], ABCSeries):
    909             # If we have a view on v, we need to make a copy because
    910             #  series_generator will swap out the underlying data
    911             results[i] = results[i].copy(deep=False)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/storage_formats/pandas/parsers.py:262, in PandasParser.get_dtypes.<locals>.<lambda>(row)
    252     ErrorMessage.missmatch_with_pandas(
    253         operation="read_*",
    254         message="Data types of partitions are different! "
    255         + "Please refer to the troubleshooting section of the Modin documentation "
    256         + "to fix this issue",
    257     )
    259     # concat all elements of `partitions_dtypes` and find common dtype
    260     # for each of the column among all partitions
    261     frame_dtypes = combined_part_dtypes.apply(
--> 262         lambda row: find_common_type_cat(row.values),
    263         axis=1,
    264     ).squeeze(axis=0)
    266 # Set the index for the dtypes to the column names
    267 if isinstance(frame_dtypes, pandas.Series):

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/modin/core/storage_formats/pandas/parsers.py:149, in find_common_type_cat(types)
    144     return union_categoricals(
    145         [pandas.Categorical([], dtype=t) for t in types],
    146         sort_categories=all(t.ordered for t in types),
    147     ).dtype
    148 else:
--> 149     return find_common_type(list(types))

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/pandas/core/dtypes/cast.py:1641, in find_common_type(types)
   1638         if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
   1639             return np.dtype("object")
-> 1641 return np.find_common_type(types, [])

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/numpy/core/numerictypes.py:651, in find_common_type(array_types, scalar_types)
    599 @set_module('numpy')
    600 def find_common_type(array_types, scalar_types):
    601     """
    602     Determine common type following standard coercion rules.
    603 
   (...)
    649 
    650     """
--> 651     array_types = [dtype(x) for x in array_types]
    652     scalar_types = [dtype(x) for x in scalar_types]
    654     maxa = _can_coerce_all(array_types)

File /usr/local/miniconda3/envs/stereopy/lib/python3.8/site-packages/numpy/core/numerictypes.py:651, in <listcomp>(.0)
    599 @set_module('numpy')
    600 def find_common_type(array_types, scalar_types):
    601     """
    602     Determine common type following standard coercion rules.
    603 
   (...)
    649 
    650     """
--> 651     array_types = [dtype(x) for x in array_types]
    652     scalar_types = [dtype(x) for x in scalar_types]
    654     maxa = _can_coerce_all(array_types)

TypeError: Cannot interpret 'nan' as a data type

vnlitvinov commented 1 year ago

Hi @tanliwei-coder! Would it be possible to share some of the data you're trying to read (anonymized if it isn't public), or some generated bogus data that reproduces this error? It's hard to tell what is going wrong otherwise...