Description
I have 2 dataframes which I want to merge (in pandas: pandas.concat([df_first, df_second], axis=1).sort_index().fillna(method="ffill").fillna(method="bfill")). Based on https://github.com/vaexio/vaex/issues/1578, I use .join(), but get an exception.
Software information
Vaex version (import vaex; vaex.__version__):
vaex==4.14.0
pyarrow==9.0.0
Vaex was installed via: pip
OS: Ubuntu 22.04
Additional information
import vaex
df1 = vaex.open(dir1 + "*.feather")
df2 = vaex.open(dir2 + "*.feather")
"""
df1 with x rows:
timestamp_ns (dtype: datetime64[ns]), col1 (dtype: int64), col2 (dtype: int64)
df2 with y rows and some same timestamps:
timestamp_ns col1 col2
"""
df1.join(df2, on="timestamp_ns", rprefix="second_", allow_duplication=True)
# The same exception is thrown for
# df1.join(df2, left_on="timestamp_ns", right_on="timestamp_ns", rprefix="second_", how="left", allow_duplication=True)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In [39], line 1
----> 1 df.join(df2, on="timestamp_ns", rprefix="second_", allow_duplication=True)
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/dataframe.py:6705, in DataFrameLocal.join(self, other, on, left_on, right_on, lprefix, rprefix, lsuffix, rsuffix, how, allow_duplication, prime_growth, cardinality_other, inplace)
6703 kwargs['df'] = kwargs.pop('self')
6704 del kwargs['vaex']
-> 6705 return vaex.join.join(**kwargs)
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/join.py:209, in join(df, other, on, left_on, right_on, lprefix, rprefix, lsuffix, rsuffix, how, allow_duplication, prime_growth, cardinality_other, inplace)
207 def reduce(a, b):
208 pass
--> 209 left.map_reduce(map, reduce, [left_on], delay=False, name='fill looking', info=True, to_numpy=False, ignore_filter=True)
210 if len(lookup_extra_chunks):
211 # if the right has duplicates, we increase the left of left, and the lookup array
212 lookup_left = np.concatenate([k[0] for k in lookup_extra_chunks])
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/dataframe.py:438, in DataFrame.map_reduce(self, map, reduce, arguments, progress, delay, info, to_numpy, ignore_filter, pre_filter, name, selection)
436 progressbar.add_task(task, f'map reduce: {name}')
437 task = self.executor.schedule(task)
--> 438 return self._delay(delay, task)
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/dataframe.py:1781, in DataFrame._delay(self, delay, task, progressbar)
1779 else:
1780 self.execute()
-> 1781 return task.get()
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/aplus/__init__.py:170, in Promise.get(self, timeout)
168 return self._value
169 else:
--> 170 raise self._reason
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/execution.py:566, in ExecutorLocal.process_tasks(self, thread_index, i1, i2, chunks, run, df, tasks)
564 task_part.process(thread_index, i1, i2, filter_mask, selections, blocks)
565 else:
--> 566 task_part.process(thread_index, i1, i2, filter_mask, selections, blocks)
567 except Exception as e:
568 # we cannot call .reject, since then we'll handle fallbacks in this thread
569 task._toreject = e
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/cpu.py:462, in TaskPartMapReduce.process(self, thread_index, i1, i2, filter_mask, selection_masks, blocks)
460 blocks = [filter(block, selection_mask) for block in blocks]
461 if self.info:
--> 462 self.values.append(self._map(thread_index, i1, i2, selection_mask, blocks))
463 else:
464 self.values.append(self._map(*blocks))
File ~/mambaforge-pypy3/envs/puma-lab/lib/python3.8/site-packages/vaex/join.py:193, in join.<locals>.map(thread_index, i1, i2, selection_masks, blocks)
191 ar = _to_string_sequence(ar)
192 if dtype.is_datetime:
--> 193 ar = ar.view(np.int64)
194 if np.ma.isMaskedArray(ar):
195 mask = np.ma.getmaskarray(ar)
AttributeError: 'pyarrow.lib.ChunkedArray' object has no attribute 'view'
Description
I have 2 dataframes which I want to merge (
pandas.concat([df_first, df_second], axis=1).sort_index().fillna(method="ffill").fillna(method="bfill")
). Looking at https://github.com/vaexio/vaex/issues/1578 I use .join()
, but get an exception.
Software information
Vaex version (import vaex; vaex.__version__):
Additional information