Closed priamai closed 10 months ago
Describe the bug
I am running the anomaly attribution on a pandas dataframe.
Steps to reproduce the behavior
from dowhy import gcm import networkx as nx causal_graph = nx.DiGraph([('Page Views', 'Sold Units'), ('Revenue', 'Profit'), ('Unit Price', 'Sold Units'), ('Unit Price', 'Revenue'), ('Shopping Event?', 'Page Views'), ('Shopping Event?', 'Sold Units'), ('Shopping Event?', 'Unit Price'), ('Shopping Event?', 'Ad Spend'), ('Ad Spend', 'Page Views'), ('Ad Spend', 'Operational Cost'), ('Sold Units', 'Revenue'), ('Sold Units', 'Operational Cost'), ('Operational Cost', 'Profit')]) # Create the structural causal model object scm = gcm.StructuralCausalModel(causal_graph) # Automatically assign generative models to each node based on the given data gcm.auto.assign_causal_mechanisms(scm, data_2021) gcm.fit(scm, data_2021) attributions = gcm.attribute_anomalies(scm, target_node='Profit', anomaly_samples=first_day_2022.head(1))
The dataframe is identical in structure to the one trained by the scm.
Expected behavior: I believe this is related to the latest version of pandas — I am running 2.0.2 — because it works fine on 1.5.0.
Version information:
Additional context: There is not an "object" type column in my dataframe, so I am not sure why it is going down that code path ...
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[114], line 1 ----> 1 attributions = gcm.attribute_anomalies(scm, target_node='Profit', anomaly_samples=first_day_2022.head(1)) 3 #bar_plot({k: v[0] for k, v in attributions.items()}, ylabel='Anomaly attribution score') File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/anomaly.py:141, in attribute_anomalies(causal_model, target_node, anomaly_samples, anomaly_scorer, attribute_mean_deviation, num_distribution_samples, shapley_config) 138 noise_dependent_function, nodes_order = get_noise_dependent_function(causal_model, target_node) 139 anomaly_scorer.fit(node_samples[target_node].to_numpy()) --> 141 attributions = attribute_anomaly_scores( 142 noise_of_anomaly_samples[nodes_order].to_numpy(), 143 noise_samples[nodes_order].to_numpy(), 144 lambda x: anomaly_scorer.score(noise_dependent_function(x)), 145 attribute_mean_deviation, 146 shapley_config, 147 ) 149 return {node: attributions[:, i] for i, node in enumerate(nodes_order)} File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/anomaly.py:190, in attribute_anomaly_scores(anomaly_samples, distribution_samples, anomaly_scoring_func, attribute_mean_deviation, shapley_config) 188 expectation_of_score = np.mean(anomaly_scoring_func(distribution_samples)) 189 else: --> 190 anomaly_scores = anomaly_scoring_func(anomaly_samples) 192 def set_function(subset: np.ndarray) -> Union[np.ndarray, float]: 193 feature_samples = permute_features(distribution_samples, np.arange(0, subset.shape[0])[subset == 0], True) File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/anomaly.py:144, in attribute_anomalies.<locals>.<lambda>(x) 138 noise_dependent_function, nodes_order = get_noise_dependent_function(causal_model, target_node) 139 anomaly_scorer.fit(node_samples[target_node].to_numpy()) 141 attributions = attribute_anomaly_scores( 142 noise_of_anomaly_samples[nodes_order].to_numpy(), 143 
noise_samples[nodes_order].to_numpy(), --> 144 lambda x: anomaly_scorer.score(noise_dependent_function(x)), 145 attribute_mean_deviation, 146 shapley_config, 147 ) 149 return {node: attributions[:, i] for i, node in enumerate(nodes_order)} File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/_noise.py:107, in _get_exact_noise_dependent_function.<locals>.predict_method(noise_samples) 106 def predict_method(noise_samples: np.ndarray) -> np.ndarray: --> 107 return compute_data_from_noise(causal_model, pd.DataFrame(noise_samples, columns=[x for x in nodes_order]))[ 108 target_node 109 ].to_numpy() File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/_noise.py:28, in compute_data_from_noise(causal_model, noise_data) 26 data[node] = noise_data[node].to_numpy() 27 else: ---> 28 data[node] = causal_model.causal_mechanism(node).evaluate( 29 data[get_ordered_predecessors(causal_model.graph, node)].to_numpy(), noise_data[node].to_numpy() 30 ) 32 return data File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:3950, in DataFrame.__setitem__(self, key, value) 3947 self._setitem_array([key], value) 3948 else: 3949 # set column -> 3950 self._set_item(key, value) File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4143, in DataFrame._set_item(self, key, value) 4133 def _set_item(self, key, value) -> None: 4134 """ 4135 Add series to DataFrame in specified column. 4136 (...) 4141 ensure homogeneity. 
4142 """ -> 4143 value = self._sanitize_column(value) 4145 if ( 4146 key in self.columns 4147 and value.ndim == 1 4148 and not is_extension_array_dtype(value) 4149 ): 4150 # broadcast across multiple columns if necessary 4151 if not self.columns.is_unique or isinstance(self.columns, MultiIndex): File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4871, in DataFrame._sanitize_column(self, value) 4869 if is_list_like(value): 4870 com.require_length_match(value, self.index) -> 4871 return sanitize_array(value, self.index, copy=True, allow_2d=True) File /opt/conda/lib/python3.10/site-packages/pandas/core/construction.py:569, in sanitize_array(data, index, dtype, copy, allow_2d) 567 subarr = data 568 if data.dtype == object: --> 569 subarr = maybe_infer_to_datetimelike(data) 571 if subarr is data and copy: 572 subarr = subarr.copy() File /opt/conda/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1181, in maybe_infer_to_datetimelike(value) 1178 raise TypeError(type(value)) # pragma: no cover 1179 if value.ndim != 1: 1180 # Caller is responsible -> 1181 raise ValueError(value.ndim) # pragma: no cover 1183 if not len(value): 1184 return value ValueError: 2
This is a fully replicable notebook
Thanks for flagging this! I will take a look
Opened a PR to fix this. Thanks again for reporting!
Fixed
Describe the bug
I am running the anomaly attribution on a pandas dataframe.
Steps to reproduce the behavior
The dataframe is identical in structure to the one trained by the scm.
Expected behavior: I believe this is related to the latest version of pandas — I am running 2.0.2 — because it works fine on 1.5.0.
Version information:
Additional context: There is not an "object" type column in my dataframe, so I am not sure why it is going down that code path ...