ValueError when running pipe.extract on a pyarrow table with a DictionaryArray field:
Traceback (most recent call last):
File "/home/j/repos/dlt/dlt/pipeline/pipeline.py", line 468, in extract
self._extract_source(
File "/home/j/repos/dlt/dlt/pipeline/pipeline.py", line 1238, in _extract_source
load_id = extract.extract(
File "/home/j/repos/dlt/dlt/extract/extract.py", line 417, in extract
self._extract_single_source(
File "/home/j/repos/dlt/dlt/extract/extract.py", line 349, in _extract_single_source
extractors[item_format].write_items(
File "/home/j/repos/dlt/dlt/extract/extractors.py", line 333, in write_items
super().write_items(resource, items, meta)
File "/home/j/repos/dlt/dlt/extract/extractors.py", line 138, in write_items
self._write_to_static_table(resource, table_name, items, meta)
File "/home/j/repos/dlt/dlt/extract/extractors.py", line 340, in _write_to_static_table
super()._write_to_static_table(resource, table_name, items, meta)
File "/home/j/repos/dlt/dlt/extract/extractors.py", line 217, in _write_to_static_table
items = self._compute_and_update_table(resource, table_name, items, meta)
File "/home/j/repos/dlt/dlt/extract/extractors.py", line 455, in _compute_and_update_table
items = super()._compute_and_update_table(resource, table_name, items, meta)
File "/home/j/repos/dlt/dlt/extract/extractors.py", line 236, in _compute_and_update_table
computed_table = self._compute_table(resource, items, meta)
File "/home/j/repos/dlt/dlt/extract/extractors.py", line 414, in _compute_table
arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(item.schema)
File "/home/j/repos/dlt/dlt/common/libs/pyarrow.py", line 402, in py_arrow_to_table_schema_columns
**get_column_type_from_py_arrow(field.type),
File "/home/j/repos/dlt/dlt/common/libs/pyarrow.py", line 187, in get_column_type_from_py_arrow
raise ValueError(dtype)
ValueError: dictionary<values=string, indices=int8, ordered=0>
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/j/repos/dlt/mre.py", line 9, in <module>
pipe.extract(table, table_name="bug")
File "/home/j/repos/dlt/dlt/pipeline/pipeline.py", line 223, in _wrap
step_info = f(self, *args, **kwargs)
File "/home/j/repos/dlt/dlt/pipeline/pipeline.py", line 177, in _wrap
rv = f(self, *args, **kwargs)
File "/home/j/repos/dlt/dlt/pipeline/pipeline.py", line 163, in _wrap
return f(self, *args, **kwargs)
File "/home/j/repos/dlt/dlt/pipeline/pipeline.py", line 272, in _wrap
return f(self, *args, **kwargs)
File "/home/j/repos/dlt/dlt/pipeline/pipeline.py", line 489, in extract
raise PipelineStepFailed(
dlt.pipeline.exceptions.PipelineStepFailed: Pipeline execution failed at stage extract when processing package 1730705003.55613 with exception:
<class 'ValueError'>
dictionary<values=string, indices=int8, ordered=0>
Expected behavior
Successful extract.
Steps to reproduce
import dlt
import pyarrow as pa
# create pyarrow table with DictionaryArray field
array = pa.array(["a", "b", "c"], type=pa.dictionary(pa.int8(), pa.string()))
table = pa.table({"foo": array})
# try to extract with dlt pipeline
pipe = dlt.pipeline(destination="filesystem")
pipe.extract(table, table_name="bug")
# result: <class 'ValueError'> dictionary<values=string, indices=int8, ordered=0>
dlt version
dlt 1.3.1a1
Describe the problem
ValueError when running pipe.extract on a pyarrow table with a DictionaryArray field.
Expected behavior
Successful extract.
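Steps to reproduce
See the reproduction script given earlier in this report (create a pyarrow table with a DictionaryArray field, then call pipe.extract on it).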
Operating system
Linux
Runtime environment
Local
Python version
3.9
dlt data source
pyarrow table
dlt destination
Filesystem & buckets
Other deployment details
No response
Additional information
No response