Closed hapion1 closed 2 years ago
Have you verified that there are no "NaN" values or similar in your input batch? Which dataset are you using?
No, I did not verify that there are no NaN values etc. in the input, but I am using the preselected MobIS set.
When I choose another set like BPIC 12 or anything else from the EventLog class, the following error occurs:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3079 try:
-> 3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index_class_helper.pxi in pandas._libs.index.Int64Engine._maybe_get_bool_indexer()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'type'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/tmp/ipykernel_7540/287089386.py in <module>
2 cat_names=['activity','type','resource'],
3 cont_names=['cost'],date_names=['timestamp'],
----> 4 splits=splits, y_names=['activity','resource','timestamp_Relative_elapsed_minmax']
5 )
~/Documents/MPPN2/mppn/mppn/preprocessing.py in __init__(self, df, procs, cat_names, cont_names, date_names, y_names, splits, ycat_names, ycont_names, inplace, do_setup)
80 self.procs = Pipeline(procs)
81 self.splits=splits
---> 82 if do_setup: self.setup()
83
84
~/Documents/MPPN2/mppn/mppn/preprocessing.py in setup(self)
91 self.ycat_names,self.ycont_names=(L([i for i in L(y_names) if i in self.cat_names]),
92 L([i for i in L(y_names) if i not in self.cat_names]))
---> 93 def setup(self): self.procs.setup(self)
94 def subset(self, i): return self.new(self.loc[self.splits[i]]) if self.splits else self
95 def __len__(self): return len(np.unique(self.items.index))
~/anaconda3/envs/mppn/lib/python3.7/site-packages/fastcore-1.3.19-py3.7.egg/fastcore/transform.py in setup(self, items, train_setup)
190 tfms = self.fs[:]
191 self.fs.clear()
--> 192 for t in tfms: self.add(t,items, train_setup)
193
194 def add(self,t, items=None, train_setup=False):
~/anaconda3/envs/mppn/lib/python3.7/site-packages/fastcore-1.3.19-py3.7.egg/fastcore/transform.py in add(self, t, items, train_setup)
193
194 def add(self,t, items=None, train_setup=False):
--> 195 t.setup(items, train_setup)
196 self.fs.append(t)
197
~/Documents/MPPN2/mppn/mppn/preprocessing.py in setup(self, items, train_setup)
133 "Base class to write a non-lazy tabular processor for dataframes"
134 def setup(self, items=None, train_setup=False): #TODO: properly deal with train_setup
--> 135 super().setup(getattr(items,'train',items), train_setup=False)
136 #super().setup(items, train_setup=False)
137
~/anaconda3/envs/mppn/lib/python3.7/site-packages/fastcore-1.3.19-py3.7.egg/fastcore/transform.py in setup(self, items, train_setup)
77 def setup(self, items=None, train_setup=False):
78 train_setup = train_setup if self.train_setup is None else self.train_setup
---> 79 return self.setups(getattr(items, 'train', items) if train_setup else items)
80
81 def _call(self, fn, x, split_idx=None, **kwargs):
~/anaconda3/envs/mppn/lib/python3.7/site-packages/fastcore-1.3.19-py3.7.egg/fastcore/dispatch.py in __call__(self, *args, **kwargs)
116 elif self.inst is not None: f = MethodType(f, self.inst)
117 elif self.owner is not None: f = MethodType(f, self.owner)
--> 118 return f(*args, **kwargs)
119
120 def __get__(self, inst, owner):
~/Documents/MPPN2/mppn/mppn/preprocessing.py in setups(self, to)
153 order = 2
154 def setups(self, to):
--> 155 store_attr(classes={n:CategoryMap(to.items.loc[:,n], add_na=True) for n in to.cat_names}, but='to')
156 def encodes(self, to):
157 to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
~/Documents/MPPN2/mppn/mppn/preprocessing.py in <dictcomp>(.0)
153 order = 2
154 def setups(self, to):
--> 155 store_attr(classes={n:CategoryMap(to.items.loc[:,n], add_na=True) for n in to.cat_names}, but='to')
156 def encodes(self, to):
157 to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/indexing.py in __getitem__(self, key)
887 # AttributeError for IntervalTree get_value
888 return self.obj._get_value(*key, takeable=self._takeable)
--> 889 return self._getitem_tuple(key)
890 else:
891 # we by definition only have the 0th axis
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/indexing.py in _getitem_tuple(self, tup)
1058 def _getitem_tuple(self, tup: Tuple):
1059 with suppress(IndexingError):
-> 1060 return self._getitem_lowerdim(tup)
1061
1062 # no multi-index, so validate all of the indexers
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/indexing.py in _getitem_lowerdim(self, tup)
805 # We don't need to check for tuples here because those are
806 # caught by the _is_nested_tuple_indexer check above.
--> 807 section = self._getitem_axis(key, axis=i)
808
809 # We should never have a scalar section here, because
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/indexing.py in _getitem_axis(self, key, axis)
1122 # fall thru to straight lookup
1123 self._validate_key(key, axis)
-> 1124 return self._get_label(key, axis=axis)
1125
1126 def _get_slice_axis(self, slice_obj: slice, axis: int):
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/indexing.py in _get_label(self, label, axis)
1071 def _get_label(self, label, axis: int):
1072 # GH#5667 this will fail if the label is not present in the axis.
-> 1073 return self.obj.xs(label, axis=axis)
1074
1075 def _handle_lowerdim_multi_index_axis0(self, tup: Tuple):
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
3722 if axis == 1:
3723 if drop_level:
-> 3724 return self[key]
3725 index = self.columns
3726 else:
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/frame.py in __getitem__(self, key)
3022 if self.columns.nlevels > 1:
3023 return self._getitem_multilevel(key)
-> 3024 indexer = self.columns.get_loc(key)
3025 if is_integer(indexer):
3026 indexer = [indexer]
~/anaconda3/envs/mppn/lib/python3.7/site-packages/pandas-1.2.3-py3.7-linux-x86_64.egg/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3080 return self._engine.get_loc(casted_key)
3081 except KeyError as err:
-> 3082 raise KeyError(key) from err
3083
3084 if tolerance is not None:
KeyError: 'type'
No matter what dataset you use, you need to make sure that there are no "NaN" (or similar) values in it. The transformations will not be able to scale them properly and will raise the mentioned error. I know that "cost" in MobIS has a lot of NaN values, as the value does not change in each event. You can either set "cost" to some other numeric value wherever it is "NaN", e.g. using this code (assuming data is a pandas DataFrame):
data["cost"] = self.data["cost"].str.replace(",", ".")
data["cost"] = pd.to_numeric(self.data["cost"], downcast="signed", errors="coerce")
data["cost"].fillna(0, inplace=True)
data["cost"] = self.data["cost"].apply(lambda x: int(round(x, 0)))
or you remove "cost" from the cont_names of PPObj.
When using datasets other than MobIS, you need to modify the cat_names, cont_names and date_names in PPObj. They need to match the columns of the dataset you choose: cat for categorical attributes, cont for numerical ones and date for temporal ones.
Hi, this is hopefully the last issue :)
Like the title says, notebook 05 does not run further than (my) ln [11] - for context please refer to the screenshot:
The complete output of this line is the following: