I have executed notebook 01 successfully, but an issue appeared in this code block from section "5.4. Grouping interactions into sessions":
workflow = nvt.Workflow(filtered_sessions)
dataset = nvt.Dataset(df)
# Learn the feature statistics necessary for the preprocessing workflow
# The following will generate schema.pbtxt file in the provided folder and export the parquet files.
workflow.fit_transform(dataset).to_parquet(os.path.join(INPUT_DATA_DIR, "processed_nvt"))
The error message shows that workflow.fit_transform(dataset) caused a KeyError: 'category_id_price_sum'. But I didn't find this feature in the tutorial. Maybe I missed something? Could anyone help me out?
The detailed error message is as follows:
KeyError Traceback (most recent call last)
Cell In[48], line 5
2 dataset = nvt.Dataset(df)
3 # Learn features statistics necessary of the preprocessing workflow
4 # The following will generate schema.pbtxt file in the provided folder and export the parquet files.
----> 5 workflow.fit_transform(dataset).to_parquet(os.path.join(INPUT_DATA_DIR, "processed_nvt"))
File ~/miniconda3/lib/python3.10/site-packages/nvtabular/workflow/workflow.py:236, in Workflow.fit_transform(self, dataset)
216 def fit_transform(self, dataset: Dataset) -> Dataset:
217 """Convenience method to both fit the workflow and transform the dataset in a single
218 call. Equivalent to calling workflow.fit(dataset) followed by
219 workflow.transform(dataset)
(...)
234 transform
235 """
--> 236 self.fit(dataset)
237 return self.transform(dataset)
File ~/miniconda3/lib/python3.10/site-packages/nvtabular/workflow/workflow.py:213, in Workflow.fit(self, dataset)
199 def fit(self, dataset: Dataset) -> "Workflow":
200 """Calculates statistics for this workflow on the input dataset
201
202 Parameters
(...)
211 This Workflow with statistics calculated on it
212 """
--> 213 self.executor.fit(dataset, self.graph)
214 return self
File ~/miniconda3/lib/python3.10/site-packages/merlin/dag/executors.py:466, in DaskExecutor.fit(self, dataset, graph, refit)
462 if not current_phase:
463 # this shouldn't happen, but lets not infinite loop just in case
464 raise RuntimeError("failed to find dependency-free StatOperator to fit")
--> 466 self.fit_phase(dataset, current_phase)
468 # Remove all the operators we processed in this phase, and remove
469 # from the dependencies of other ops too
470 for node in current_phase:
File ~/miniconda3/lib/python3.10/site-packages/merlin/dag/executors.py:532, in DaskExecutor.fit_phase(self, dataset, nodes, strict)
530 stats.append(node.op.fit(node.input_columns, Dataset(ddf)))
531 else:
--> 532 stats.append(node.op.fit(node.input_columns, transformed_ddf))
533 except Exception:
534 LOG.exception("Failed to fit operator %s", node.op)
File ~/miniconda3/lib/python3.10/site-packages/nvtabular/ops/join_groupby.py:154, in JoinGroupby.fit(self, col_selector, ddf)
151 # Cannot use "device" caching if the data is pandas-backed
152 self.cat_cache = "host" if self.cat_cache == "device" else self.cat_cache
--> 154 dsk, key = nvt_cat._category_stats(
155 ddf,
156 nvt_cat.FitOptions(
157 col_selector,
158 self.cont_names,
159 self.stats,
160 self.out_path,
161 0,
162 self.split_out,
163 self.on_host,
164 concat_groups=False,
165 name_sep=self.name_sep,
166 split_every=self.split_every,
167 ),
168 )
169 return Delayed(key, dsk)
File ~/miniconda3/lib/python3.10/site-packages/nvtabular/ops/categorify.py:1559, in _category_stats(ddf, options)
1556 if options.agg_list == []:
1557 options.agg_list = ["count"]
-> 1559 return _groupby_to_disk(ddf, None, options)
File ~/miniconda3/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
113 @wraps(func)
114 def inner(*args, **kwargs):
115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
117     libnvtx_pop_range(self.domain.handle)
118     return result
File ~/miniconda3/lib/python3.10/site-packages/cudf/core/dataframe.py:1336, in DataFrame.__getitem__(self, arg)
1274 """
1275 If arg is a str or int type, return the column Series.
1276 If arg is a slice, return a new DataFrame with all columns
(...)
1333 8 8 8 8
1334 """
1335 if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple):
-> 1336 return self._get_columns_by_label(arg, downcast=True)
1338 elif isinstance(arg, slice):
1339 return self._slice(arg)
File ~/miniconda3/lib/python3.10/site-packages/nvtx/nvtx.py:116, in annotate.__call__.<locals>.inner(*args, **kwargs)
113 @wraps(func)
114 def inner(*args, **kwargs):
115     libnvtx_push_range(self.attributes, self.domain.handle)
--> 116     result = func(*args, **kwargs)
117     libnvtx_pop_range(self.domain.handle)
118     return result
File ~/miniconda3/lib/python3.10/site-packages/cudf/core/dataframe.py:1995, in DataFrame._get_columns_by_label(self, labels, downcast)
1986 @_cudf_nvtx_annotate
1987 def _get_columns_by_label(
1988 self, labels, *, downcast=False
1989 ) -> Self | Series:
1990 """
1991 Return columns of dataframe by labels
1992
1993 If downcast is True, try and downcast from a DataFrame to a Series
1994 """
-> 1995 ca = self._data.select_by_label(labels)
1996 if downcast:
1997 if is_scalar(labels):
File ~/miniconda3/lib/python3.10/site-packages/cudf/core/column_accessor.py:351, in ColumnAccessor.select_by_label(self, key)
349 if any(isinstance(k, slice) for k in key):
350 return self._select_by_label_with_wildcard(key)
--> 351 return self._select_by_label_grouped(key)
❓ Questions & Help
Details
I have executed notebook 01 successfully, but an issue appeared in this code block from section "5.4. Grouping interactions into sessions":
Error message shows
workflow.fit_transform(dataset)
caused a KeyError: 'category_id_price_sum'. But I didn't find this feature in the tutorial. Maybe I missed something? Could anyone help me out? The detailed error message is as follows: