apache/arrow

Apache Arrow is the universal columnar format and multi-language toolbox for fast data interchange and in-memory analytics
https://arrow.apache.org/
Apache License 2.0

ArrowNotImplementedError: Not implemented type for Arrow list to pandas #37517

Open · PCClimate opened 1 year ago

PCClimate commented 1 year ago

I'm getting the following error when trying to pull in data from a Parquet file. Is this expected for the data structure, and is there a workaround using Arrow?

ArrowNotImplementedError: Not implemented type for Arrow list to pandas: map<string, string ('array_element')>


Full error:

ArrowNotImplementedError                  Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_32648\504770532.py in <module>
----> 1 df.to_csv(r'dataoutput.csv', index=False)

~\Anaconda3\lib\site-packages\dask\dataframe\core.py in to_csv(self, filename, **kwargs)
   1689         from dask.dataframe.io import to_csv
   1690 
-> 1691         return to_csv(self, filename, **kwargs)
   1692 
   1693     def to_sql(

~\Anaconda3\lib\site-packages\dask\dataframe\io\csv.py in to_csv(df, filename, single_file, encoding, mode, name_function, compression, compute, scheduler, storage_options, header_first_partition_only, compute_kwargs, **kwargs)
    970         import dask
    971 
--> 972         return list(dask.compute(*values, **compute_kwargs))
    973     else:
    974         return values

~\Anaconda3\lib\site-packages\dask\base.py in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    601         postcomputes.append(x.__dask_postcompute__())
    602 
--> 603     results = schedule(dsk, keys, **kwargs)
    604     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    605 

~\Anaconda3\lib\site-packages\dask\threaded.py in get(dsk, keys, cache, num_workers, pool, **kwargs)
     87             pool = MultiprocessingPoolExecutor(pool)
     88 
---> 89     results = get_async(
     90         pool.submit,
     91         pool._max_workers,

~\Anaconda3\lib\site-packages\dask\local.py in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
    509                             _execute_task(task, data)  # Re-execute locally
    510                         else:
--> 511                             raise_exception(exc, tb)
    512                     res, worker_id = loads(res_info)
    513                     state["cache"][key] = res

~\Anaconda3\lib\site-packages\dask\local.py in reraise(exc, tb)
    317     if exc.__traceback__ is not tb:
    318         raise exc.with_traceback(tb)
--> 319     raise exc
    320 
    321 

~\Anaconda3\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    222     try:
    223         task, data = loads(task_info)
--> 224         result = _execute_task(task, data)
    225         id = get_id()
    226         result = dumps((result, id))

~\Anaconda3\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
    117         # temporaries by their reference count and can execute certain
    118         # operations in-place.
--> 119         return func(*(_execute_task(a, cache) for a in args))
    120     elif not ishashable(arg):
    121         return arg

~\Anaconda3\lib\site-packages\dask\optimization.py in __call__(self, *args)
    988         if not len(args) == len(self.inkeys):
    989             raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 990         return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
    991 
    992     def __reduce__(self):

~\Anaconda3\lib\site-packages\dask\core.py in get(dsk, out, cache)
    147     for key in toposort(dsk):
    148         task = dsk[key]
--> 149         result = _execute_task(task, cache)
    150         cache[key] = result
    151     result = _execute_task(out, cache)

~\Anaconda3\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk)
    117         # temporaries by their reference count and can execute certain
    118         # operations in-place.
--> 119         return func(*(_execute_task(a, cache) for a in args))
    120     elif not ishashable(arg):
    121         return arg

~\Anaconda3\lib\site-packages\dask\dataframe\io\parquet\core.py in __call__(self, part)
     87             part = [part]
     88 
---> 89         return read_parquet_part(
     90             self.fs,
     91             self.engine,

~\Anaconda3\lib\site-packages\dask\dataframe\io\parquet\core.py in read_parquet_part(fs, engine, meta, part, columns, index, kwargs)
    583             # Part kwargs expected
    584             func = engine.read_partition
--> 585             dfs = [
    586                 func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
    587                 for (rg, kw) in part

~\Anaconda3\lib\site-packages\dask\dataframe\io\parquet\core.py in <listcomp>(.0)
    584             func = engine.read_partition
    585             dfs = [
--> 586                 func(fs, rg, columns.copy(), index, **toolz.merge(kwargs, kw))
    587                 for (rg, kw) in part
    588             ]

~\Anaconda3\lib\site-packages\dask\dataframe\io\parquet\arrow.py in read_partition(cls, fs, pieces, columns, index, categories, partitions, filters, schema, **kwargs)
    451 
    452         # Convert to pandas
--> 453         df = cls._arrow_table_to_pandas(arrow_table, categories, **kwargs)
    454 
    455         # For pyarrow.dataset api, need to convert partition columns

~\Anaconda3\lib\site-packages\dask\dataframe\io\parquet\arrow.py in _arrow_table_to_pandas(cls, arrow_table, categories, **kwargs)
   1552         _kwargs.update({"use_threads": False, "ignore_metadata": False})
   1553 
-> 1554         return arrow_table.to_pandas(categories=categories, **_kwargs)
   1555 
   1556     @classmethod

~\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib._PandasConvertible.to_pandas()

~\Anaconda3\lib\site-packages\pyarrow\table.pxi in pyarrow.lib.Table._to_pandas()

~\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
    818     _check_data_column_metadata_consistency(all_columns)
    819     columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 820     blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
    821 
    822     axes = [columns, index]

~\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in _table_to_blocks(options, block_table, categories, extension_columns)
   1166     # Convert an arrow table to Block from the internal pandas API
   1167     columns = block_table.column_names
-> 1168     result = pa.lib.table_to_blocks(options, block_table, categories,
   1169                                     list(extension_columns.keys()))
   1170     return [_reconstruct_block(item, columns, extension_columns)

~\Anaconda3\lib\site-packages\pyarrow\table.pxi in pyarrow.lib.table_to_blocks()

~\Anaconda3\lib\site-packages\pyarrow\error.pxi in pyarrow.lib.check_status()

ArrowNotImplementedError: Not implemented type for Arrow list to pandas: map<string, string ('array_element')>

Component(s)

Python
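
One possible workaround for the question above is to convert the table column by column and fall back to Python objects via to_pylist(), which supports nested map types that Table.to_pandas() does not. A minimal sketch, assuming a stand-in file name example.parquet:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Stand-in file name; substitute the real Parquet file.
table = pq.read_table("example.parquet")

columns = {}
for name in table.column_names:
    col = table.column(name)
    try:
        # Fast path: native Arrow-to-pandas conversion.
        columns[name] = col.to_pandas()
    except pa.ArrowNotImplementedError:
        # Fallback: to_pylist() handles arbitrarily nested types,
        # at the cost of materializing plain Python objects.
        columns[name] = pd.Series(col.to_pylist(), name=name)

df = pd.DataFrame(columns)

This trades speed for coverage: the fallback columns hold Python dicts and lists rather than Arrow-backed values.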

AlenkaF commented 1 year ago

Could you first verify the pyarrow version you are using (check pyarrow.__version__)? Could you also inspect the columns of the Parquet file? You can use pyarrow for that: table = pq.read_table('example.parquet'), and then check the schema of the table object with table.schema.
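
For example (a minimal sketch, with example.parquet standing in for the real file):

import pyarrow as pa
import pyarrow.parquet as pq

# Report the installed pyarrow version.
print(pa.__version__)

# Read the Parquet file and inspect the column types.
table = pq.read_table("example.parquet")
print(table.schema)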

I tried a couple of examples using the map type and none of them raises an error:

>>> import pyarrow as pa
>>> data = [[{'key': 'a', 'value': "1"}, {'key': 'b', 'value': "2"}], [{'key': 'c', 'value': "3"}]]

>>> map_type = pa.map_(pa.string(), pa.string())
>>> table = pa.table([pa.array(data, type=map_type)], names=["array_element"])
>>> table.schema
array_element: map<string, string>
  child 0, entries: struct<key: string not null, value: string> not null
      child 0, key: string not null
      child 1, value: string
>>> table.to_pandas()
      array_element
0  [(a, 1), (b, 2)]
1          [(c, 3)]

>>> table = pa.table([pa.array(data, type=pa.list_(map_type))], names=["array_element"])
>>> table.schema
array_element: list<item: map<string, string>>
  child 0, item: map<string, string>
      child 0, entries: struct<key: string not null, value: string> not null
          child 0, key: string not null
          child 1, value: string
>>> table.to_pandas()
                                      array_element
0  [[(key, a), (value, 1)], [(key, b), (value, 2)]]
1                          [[(key, c), (value, 3)]]

>>> inner = pa.array(data, type=map_type)
>>> array = pa.MapArray.from_arrays([0, 2], ['a', 'b'], inner)
>>> table = pa.table({'array_element': array})
>>> table.schema
array_element: map<string, map<string, string>>
  child 0, entries: struct<key: string not null, value: map<string, string>> not null
      child 0, key: string not null
      child 1, value: map<string, string>
          child 0, entries: struct<key: string not null, value: string> not null
              child 0, key: string not null
              child 1, value: string
>>> table.to_pandas()
                                       array_element
0  [(a, [('a', '1'), ('b', '2')]), (b, [('c', '3'...

AlenkaF commented 1 year ago

Duplicate of https://github.com/apache/arrow/issues/12396

PCClimate commented 1 year ago

pyarrow version is 12.0.1.

This is the schema:

id: string
updatetime: string
version: int32
names: map<string, list<array_element: map<string, string ('array_element')>> ('names')>
  child 0, names: struct<key: string not null, value: list<array_element: map<string, string ('array_element')>>> not null
      child 0, key: string not null
      child 1, value: list<array_element: map<string, string ('array_element')>>
          child 0, array_element: map<string, string ('array_element')>
              child 0, array_element: struct<key: string not null, value: string> not null
                  child 0, key: string not null
                  child 1, value: string
categories: struct<main: string, alternate: list<array_element: string>>
  child 0, main: string
  child 1, alternate: list<array_element: string>
      child 0, array_element: string
confidence: double
websites: list<array_element: string>
  child 0, array_element: string
socials: list<array_element: string>
  child 0, array_element: string
emails: list<array_element: string>
  child 0, array_element: string
phones: list<array_element: string>
  child 0, array_element: string
brand: struct<names: map<string, list<array_element: map<string, string ('array_element')>> ('names')>, wikidata: string>
  child 0, names: map<string, list<array_element: map<string, string ('array_element')>> ('names')>
      child 0, names: struct<key: string not null, value: list<array_element: map<string, string ('array_element')>>> not null
          child 0, key: string not null
          child 1, value: list<array_element: map<string, string ('array_element')>>
              child 0, array_element: map<string, string ('array_element')>
                  child 0, array_element: struct<key: string not null, value: string> not null
                      child 0, key: string not null
                      child 1, value: string
  child 1, wikidata: string
addresses: list<array_element: map<string, string ('array_element')>>
  child 0, array_element: map<string, string ('array_element')>
      child 0, array_element: struct<key: string not null, value: string> not null
          child 0, key: string not null
          child 1, value: string
sources: list<array_element: map<string, string ('array_element')>>
  child 0, array_element: map<string, string ('array_element')>
      child 0, array_element: struct<key: string not null, value: string> not null
          child 0, key: string not null
          child 1, value: string
bbox: struct<minx: double, maxx: double, miny: double, maxy: double>
  child 0, minx: double
  child 1, maxx: double
  child 2, miny: double
  child 3, maxy: double
geometry: binary
-- schema metadata --
writer.time.zone: 'UTC'

table.to_pandas() returns:

---------------------------------------------------------------------------
ArrowNotImplementedError                  Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_66740\1919897604.py in <module>
----> 1 table.to_pandas()

~\Anaconda3\lib\site-packages\pyarrow\array.pxi in pyarrow.lib._PandasConvertible.to_pandas()

~\Anaconda3\lib\site-packages\pyarrow\table.pxi in pyarrow.lib.Table._to_pandas()

~\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in table_to_blockmanager(options, table, categories, ignore_metadata, types_mapper)
    818     _check_data_column_metadata_consistency(all_columns)
    819     columns = _deserialize_column_index(table, all_columns, column_indexes)
--> 820     blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
    821 
    822     axes = [columns, index]

~\Anaconda3\lib\site-packages\pyarrow\pandas_compat.py in _table_to_blocks(options, block_table, categories, extension_columns)
   1166     # Convert an arrow table to Block from the internal pandas API
   1167     columns = block_table.column_names
-> 1168     result = pa.lib.table_to_blocks(options, block_table, categories,
   1169                                     list(extension_columns.keys()))
   1170     return [_reconstruct_block(item, columns, extension_columns)

~\Anaconda3\lib\site-packages\pyarrow\table.pxi in pyarrow.lib.table_to_blocks()

~\Anaconda3\lib\site-packages\pyarrow\error.pxi in pyarrow.lib.check_status()

ArrowNotImplementedError: Not implemented type for Arrow list to pandas: map<string, string ('array_element')>
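
For reference, the innermost type in that message, map<string, string>, appears in this schema wrapped in lists and outer maps; the names column, for example, is map<string, list<map<string, string>>>. As a sketch, the equivalent pyarrow type constructors would be (the 'array_element' label is just the writer's field name and does not change the type structure):

import pyarrow as pa

inner = pa.map_(pa.string(), pa.string())           # map<string, string>
names_type = pa.map_(pa.string(), pa.list_(inner))  # map<string, list<map<string, string>>>
print(names_type)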

AlenkaF commented 1 year ago

Sorry, I cannot seem to create an example that reproduces the issue. I tried with the dev version of pyarrow:

(pyarrow-dev) alenkafrim@Alenkas-MacBook-Pro python % python
Python 3.10.10 (main, Feb 16 2023, 02:46:59) [Clang 14.0.0 (clang-1400.0.29.202)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import pyarrow as pa
>>> pa.__version__
'14.0.0.dev42+g1af709ff9.d20230830'

>>> data = [[{'key': 'a', 'value': "1"}, {'key': 'b', 'value': "2"}], [{'key': 'c', 'value': "3"}]]

>>> map_type = pa.map_(pa.string(), pa.string())
>>> inner_map = pa.array(data, type=map_type)
>>> inner_list = pa.ListArray.from_arrays([0, 1, 2], inner_map)
>>> array = pa.MapArray.from_arrays([0, 1, 2], ["First", "Second"], inner_list)

>>> table = pa.table({'array_element': array})
>>> table.schema
array_element: map<string, list<item: map<string, string>>>
  child 0, entries: struct<key: string not null, value: list<item: map<string, string>>> not null
      child 0, key: string not null
      child 1, value: list<item: map<string, string>>
          child 0, item: map<string, string>
              child 0, entries: struct<key: string not null, value: string> not null
                  child 0, key: string not null
                  child 1, value: string
>>> table.to_pandas()
                           array_element
0  [(First, [[('a', '1'), ('b', '2')]])]
1             [(Second, [[('c', '3')]])]
>>> table = pa.table({'array_element': inner_list})
>>> table.schema
array_element: list<item: map<string, string>>
  child 0, item: map<string, string>
      child 0, entries: struct<key: string not null, value: string> not null
          child 0, key: string not null
          child 1, value: string
>>> table.to_pandas()
        array_element
0  [[(a, 1), (b, 2)]]
1          [[(c, 3)]]

Can you check which column is giving you the error? Also, does the code above work for you?
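
One way to do that check, as a self-contained sketch (example.parquet stands in for the real file):

import pyarrow as pa
import pyarrow.parquet as pq

table = pq.read_table("example.parquet")

# Attempt the conversion column by column and report failures.
for name in table.column_names:
    try:
        table.column(name).to_pandas()
    except pa.ArrowNotImplementedError as exc:
        print(f"{name}: {exc}")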

PCClimate commented 1 year ago

The code you shared above does not work for me; I get the same error in both instances. [screenshot: pyarrow_to_pandas_issue2]
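
Since the same snippet behaves differently across environments, comparing both library versions may help narrow this down; the Arrow-to-pandas conversion path depends on pandas as well as pyarrow. A minimal check:

import pandas as pd
import pyarrow as pa

print("pyarrow:", pa.__version__)
print("pandas:", pd.__version__)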

The columns "names", "brand", "addresses", and "sources" each give the same error. [screenshot: pyarrow_to_pandas_issue]

PCClimate commented 1 year ago

@AlenkaF Someone using version 10.x of pyarrow was able to reproduce the issue with the code you provided above.