apache / arrow

Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing
https://arrow.apache.org/
Apache License 2.0

[Python] Reading from hdfs using pyarrow 10.0.0 throws OSError: [Errno 22] Opening HDFS file #33455

Open asfimport opened 1 year ago

asfimport commented 1 year ago

Hey! I am trying to read a CSV file from HDFS using pyarrow together with fsspec. This used to work with pyarrow 9.0.0 and fsspec 2022.7.1; however, after I upgraded to pyarrow 10.0.0, it stopped working.

I am not quite sure whether this is an incompatibility introduced in the new pyarrow version or a bug in fsspec. So if I am in the wrong place here, please let me know.

Apart from pyarrow 10.0.0 and fsspec 2022.7.1, I am using pandas 1.3.3 and Python 3.8.11.

Here is the full stack trace:


pd.read_csv("hdfs://10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    584     kwds.update(kwds_defaults)
    585 
--> 586     return _read(filepath_or_buffer, kwds)
    587 
    588 
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
    480 
    481     # Create the parser.
--> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
    483 
    484     if chunksize or iterator:
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
    809             self.options["has_index_names"] = kwds["has_index_names"]
    810 
--> 811         self._engine = self._make_engine(self.engine)
    812 
    813     def close(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
   1038             )
   1039         # error: Too many arguments for "ParserBase"
-> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1041 
   1042     def _failover_to_python(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
     49 
     50         # open handles
---> 51         self._open_handles(src, kwds)
     52         assert self.handles is not None
     53 
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py in _open_handles(self, src, kwds)
    220         Let the readers open IOHandles after they are done with their potential raises.
    221         """
--> 222         self.handles = get_handle(
    223             src,
    224             "r",
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    607 
    608     # open URLs
--> 609     ioargs = _get_filepath_or_buffer(
    610         path_or_buf,
    611         encoding=encoding,
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    356 
    357         try:
--> 358             file_obj = fsspec.open(
    359                 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
    360             ).open()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in open(self)
    133         during the life of the file-like it generates.
    134         """
--> 135         return self.__enter__()
    136 
    137     def close(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in __enter__(self)
    101         mode = self.mode.replace("t", "").replace("b", "") + "b"
    102 
--> 103         f = self.fs.open(self.path, mode=mode)
    104 
    105         self.fobjects = [f]
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/spec.py in open(self, path, mode, block_size, cache_options, compression, **kwargs)
   1092         else:
   1093             ac = kwargs.pop("autocommit", not self._intrans)
-> 1094             f = self._open(
   1095                 path,
   1096                 mode=mode,
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in wrapper(*args, **kwargs)
     15     def wrapper(*args, **kwargs):
     16         try:
---> 17             return func(*args, **kwargs)
     18         except OSError as exception:
     19             if not exception.args:
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in _open(self, path, mode, block_size, **kwargs)
    157             # disable compression auto-detection
    158             _kwargs["compression"] = None
--> 159         stream = method(path, **_kwargs)
    160 
    161         return ArrowFile(self, stream, path, mode, block_size, **kwargs)
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/_fs.pyx in pyarrow._fs.FileSystem.open_input_stream()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
OSError: [Errno 22] Opening HDFS file '10.0.2.15:8020/Projects/test/test_Training_Datasets/td_1/td/part-00000-acb66eca-636e-4917-9673-e6646f0b4486-c000.csv' failed. Detail: [errno 22] Invalid argument

However, if I leave out the namenode IP and port, it works as expected:


pd.read_csv("hdfs:///Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")

Any help is appreciated, thank you!

Environment: pyarrow 10.0.0, fsspec 2022.7.1, pandas 1.3.3, Python 3.8.11. Reporter: Moritz Meister

Note: This issue was originally created as ARROW-18276. Please see the migration documentation for further details.

asfimport commented 1 year ago

Alenka Frim / @AlenkaF: Hi [~moritzmeister]!

Could you try using pyarrow directly, to see if you get the same error when opening the file? You can instantiate a HadoopFileSystem object from a URI string, or use the class constructor directly (https://arrow.apache.org/docs/dev/python/filesystems.html#hadoop-distributed-file-system-hdfs). Something similar to this:


from pyarrow import fs
hdfs = fs.HadoopFileSystem.from_uri('hdfs://10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/')
hdfs.open_input_file("/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
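
If it helps, here is a rough sketch of the constructor form as well (the host, port and path are simply the same ones as in the URI example above, shown only for illustration):

from pyarrow import fs
# Connect by passing the namenode host and port explicitly instead of a URI
hdfs = fs.HadoopFileSystem("10.0.2.15", port=8020)
# Paths given to the filesystem itself do not include the scheme or authority
hdfs.open_input_file("/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")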

If that works, you can then use hdfs with fsspec (https://arrow.apache.org/docs/python/filesystems.html#using-arrow-filesystems-with-fsspec) and the fsspec API to open the files (https://filesystem-spec.readthedocs.io/en/latest/api.html).

Something similar to this:


from pyarrow import fs
hdfs = fs.HadoopFileSystem.from_uri('hdfs://10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/')
from fsspec.implementations.arrow import ArrowFSWrapper
hdfs_fsspec = ArrowFSWrapper(hdfs)
hdfs_fsspec.open_files(...)
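
A minimal sketch of how that wrapper could then hand a file object to pandas (again using the URI and path from above purely for illustration):

import pandas as pd
from pyarrow import fs
from fsspec.implementations.arrow import ArrowFSWrapper

hdfs = fs.HadoopFileSystem.from_uri('hdfs://10.0.2.15:8020/')
# Wrap the Arrow filesystem so it exposes the fsspec interface
hdfs_fsspec = ArrowFSWrapper(hdfs)
# Open the file through the wrapper and let pandas read from the handle
with hdfs_fsspec.open("/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv", mode="rb") as f:
    df = pd.read_csv(f)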

This way you can see whether pyarrow 10.0.0 itself works or errors. And it is more direct, so there is less that can go wrong :)

Also, do you maybe know whether the Hadoop installation has changed in the meantime?