Closed randerzander closed 5 years ago
Note: df['b'].astype('int') works when using cudf directly
df['b'].astype('int')
Repro:
df = cudf.DataFrame()
df['a'] = [0, 1, 2, 3, 4]
dgd = dask_cudf.from_cudf(df, npartitions=1)
dgd['b'] = dgd['b'].astype('int')
Result:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-131-c6dfd269c825> in <module>
      4 dgd = dask_cudf.from_cudf(df, npartitions=1)
      5 #dgd['b'] = dgd['a']+1
----> 6 dgd['b'] = dgd['a'].astype(np.int32)

/conda/envs/cudf/lib/python3.7/site-packages/dask/dataframe/core.py in __setitem__(self, key, value)
   2521 df = self.assign(**{k: value for k in key})
   2522 else:
-> 2523 df = self.assign(**{key: value})
   2524
   2525 self.dask = df.dask

/conda/envs/cudf/lib/python3.7/site-packages/dask/dataframe/core.py in assign(self, **kwargs)
   2718
   2719 # Figure out columns of the output
-> 2720 df2 = self._meta.assign(**_extract_meta(kwargs))
   2721 return elemwise(methods.assign, self, *pairs, meta=df2)
   2722

/conda/envs/cudf/lib/python3.7/site-packages/cudf-0.6.0.dev0+708.gc5b9d3e.dirty-py3.7-linux-x86_64.egg/cudf/dataframe/dataframe.py in assign(self, **kwargs)
    320 new = self.copy()
    321 for k, v in kwargs.items():
--> 322 new[k] = v
    323 return new
    324

/conda/envs/cudf/lib/python3.7/site-packages/cudf-0.6.0.dev0+708.gc5b9d3e.dirty-py3.7-linux-x86_64.egg/cudf/dataframe/dataframe.py in __setitem__(self, name, col)
    272 self._cols[name] = self._prepare_series_for_add(col)
    273 else:
--> 274 self.add_column(name, col)
    275
    276 def __delitem__(self, name):

/conda/envs/cudf/lib/python3.7/site-packages/cudf-0.6.0.dev0+708.gc5b9d3e.dirty-py3.7-linux-x86_64.egg/cudf/dataframe/dataframe.py in add_column(self, name, data, forceindex)
    835 if isinstance(data, GeneratorType):
    836 data = Series(data)
--> 837 series = self._prepare_series_for_add(data, forceindex=forceindex)
    838 series.name = name
    839 self._cols[name] = series

/conda/envs/cudf/lib/python3.7/site-packages/cudf-0.6.0.dev0+708.gc5b9d3e.dirty-py3.7-linux-x86_64.egg/cudf/dataframe/dataframe.py in _prepare_series_for_add(self, col, forceindex)
    807 """
    808 self._sanitize_columns(col)
--> 809 col = self._sanitize_values(col)
    810
    811 empty_index = len(self._index) == 0

/conda/envs/cudf/lib/python3.7/site-packages/cudf-0.6.0.dev0+708.gc5b9d3e.dirty-py3.7-linux-x86_64.egg/cudf/dataframe/dataframe.py in _sanitize_values(self, col)
    791 return Series(arr)
    792 elif len(self) > 0 and len(sind) != len(index):
--> 793 raise ValueError('Length of values does not match index length')
    794 return col
    795

ValueError: Length of values does not match index length
Thanks for the excellent bug report. This should be resolved by https://github.com/rapidsai/dask-cudf/pull/105
Note:
df['b'].astype('int')
works when using cudf directly
Repro:
Result: