vaexio / vaex

Out-of-Core hybrid Apache Arrow/NumPy DataFrame for Python, ML, visualization and exploration of big tabular data at a billion rows per second 🚀
https://vaex.io
MIT License
8.23k stars 590 forks source link

[BUG-REPORT] `.join()` throws `pyarrow.lib.ChunkedArray' object has no attribute 'view'` #2227

Open jmakov opened 1 year ago

jmakov commented 1 year ago

Description: I have two dataframes that I want to merge; the pandas equivalent would be pandas.concat([df_first, df_second], axis=1).sort_index().fillna(method="ffill").fillna(method="bfill"). Following https://github.com/vaexio/vaex/issues/1578, I use .join(), but it raises an exception.

Software information

Additional information

import vaex

df1 = vaex.open(dir1 + "*.feather")
df2 = vaex.open(dir2 + "*.feather")

"""
df1 with x rows:
timestamp_ns (dtype: datetime64[ns]),  col1 (dtype: int64), col2 (dtype: int64)

df2 with y rows and some same timestamps:
timestamp_ns col1 col2
"""
df1.join(df2, on="timestamp_ns", rprefix="second_", allow_duplication=True)
# The same exception is thrown for
# df1.join(df2, left_on="timestamp_ns", right_on="timestamp_ns", rprefix="second_", how="left", allow_duplication=True)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In [39], line 1
----> 1 df.join(df2, on="timestamp_ns", rprefix="second_", allow_duplication=True)

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/dataframe.py:6705, in DataFrameLocal.join(self, other, on, left_on, right_on, lprefix, rprefix, lsuffix, rsuffix, how, allow_duplication, prime_growth, cardinality_other, inplace)
   6703 kwargs['df'] = kwargs.pop('self')
   6704 del kwargs['vaex']
-> 6705 return vaex.join.join(**kwargs)

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/join.py:209, in join(df, other, on, left_on, right_on, lprefix, rprefix, lsuffix, rsuffix, how, allow_duplication, prime_growth, cardinality_other, inplace)
    207 def reduce(a, b):
    208     pass
--> 209 left.map_reduce(map, reduce, [left_on], delay=False, name='fill looking', info=True, to_numpy=False, ignore_filter=True)
    210 if len(lookup_extra_chunks):
    211     # if the right has duplicates, we increase the left of left, and the lookup array
    212     lookup_left = np.concatenate([k[0] for k in lookup_extra_chunks])

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/dataframe.py:438, in DataFrame.map_reduce(self, map, reduce, arguments, progress, delay, info, to_numpy, ignore_filter, pre_filter, name, selection)
    436 progressbar.add_task(task, f'map reduce: {name}')
    437 task = self.executor.schedule(task)
--> 438 return self._delay(delay, task)

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/dataframe.py:1781, in DataFrame._delay(self, delay, task, progressbar)
   1779 else:
   1780     self.execute()
-> 1781     return task.get()

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/aplus/__init__.py:170, in Promise.get(self, timeout)
    168     return self._value
    169 else:
--> 170     raise self._reason

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/execution.py:566, in ExecutorLocal.process_tasks(self, thread_index, i1, i2, chunks, run, df, tasks)
    564         task_part.process(thread_index, i1, i2, filter_mask, selections, blocks)
    565     else:
--> 566         task_part.process(thread_index, i1, i2, filter_mask, selections, blocks)
    567 except Exception as e:
    568     # we cannot call .reject, since then we'll handle fallbacks in this thread
    569     task._toreject = e

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/cpu.py:462, in TaskPartMapReduce.process(self, thread_index, i1, i2, filter_mask, selection_masks, blocks)
    460             blocks = [filter(block, selection_mask) for block in blocks]
    461 if self.info:
--> 462     self.values.append(self._map(thread_index, i1, i2, selection_mask, blocks))
    463 else:
    464     self.values.append(self._map(*blocks))

File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/join.py:193, in join.<locals>.map(thread_index, i1, i2, selection_masks, blocks)
    191     ar = _to_string_sequence(ar)
    192 if dtype.is_datetime:
--> 193     ar = ar.view(np.int64)
    194 if np.ma.isMaskedArray(ar):
    195     mask = np.ma.getmaskarray(ar)

AttributeError: 'pyarrow.lib.ChunkedArray' object has no attribute 'view'