py-why / dowhy

DoWhy is a Python library for causal inference that supports explicit modeling and testing of causal assumptions. DoWhy is based on a unified language for causal inference, combining causal graphical models and potential outcomes frameworks.
https://www.pywhy.org/dowhy
MIT License

attribute_anomalies fails check on pandas 2.0.2 #1020

Closed (priamai closed this issue 10 months ago)

priamai commented 11 months ago

Describe the bug

I am running the anomaly attribution on a pandas dataframe.

Steps to reproduce the behavior


from dowhy import gcm
import networkx as nx

causal_graph = nx.DiGraph([('Page Views', 'Sold Units'),
                           ('Revenue', 'Profit'),
                           ('Unit Price', 'Sold Units'),
                           ('Unit Price', 'Revenue'),
                           ('Shopping Event?', 'Page Views'),
                           ('Shopping Event?', 'Sold Units'),
                           ('Shopping Event?', 'Unit Price'),
                           ('Shopping Event?', 'Ad Spend'),
                           ('Ad Spend', 'Page Views'),
                           ('Ad Spend', 'Operational Cost'),
                           ('Sold Units', 'Revenue'),
                           ('Sold Units', 'Operational Cost'),
                           ('Operational Cost', 'Profit')])

# Create the structural causal model object
scm = gcm.StructuralCausalModel(causal_graph)

# data_2021 and first_day_2022 are pandas DataFrames defined earlier in the
# notebook (a synthetic stand-in is sketched below)

# Automatically assign generative models to each node based on the given data
gcm.auto.assign_causal_mechanisms(scm, data_2021)
gcm.fit(scm, data_2021)

# Attribute the anomalous 'Profit' value in the first anomaly sample
attributions = gcm.attribute_anomalies(scm, target_node='Profit', anomaly_samples=first_day_2022.head(1))

The anomaly dataframe (first_day_2022) has exactly the same columns and structure as the dataframe the SCM was fitted on (data_2021).
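The issue does not include the data itself. Purely as a stand-in so the snippet above can be run, here is a small synthetic dataset with the same column names as the graph (all values are hypothetical and not from the original report):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1000

# Generate columns roughly following the causal graph above (hypothetical values)
shopping_event = rng.integers(0, 2, n).astype(float)
ad_spend = 100 + 50 * shopping_event + rng.normal(0, 10, n)
unit_price = 5 - shopping_event + rng.normal(0, 0.5, n)
page_views = 1000 + 2 * ad_spend + 500 * shopping_event + rng.normal(0, 50, n)
sold_units = 0.5 * page_views - 20 * unit_price + 100 * shopping_event + rng.normal(0, 30, n)
revenue = sold_units * unit_price + rng.normal(0, 10, n)
operational_cost = 0.1 * ad_spend + 0.2 * sold_units + rng.normal(0, 5, n)
profit = revenue - operational_cost + rng.normal(0, 5, n)

data_2021 = pd.DataFrame({
    'Shopping Event?': shopping_event,
    'Ad Spend': ad_spend,
    'Unit Price': unit_price,
    'Page Views': page_views,
    'Sold Units': sold_units,
    'Revenue': revenue,
    'Operational Cost': operational_cost,
    'Profit': profit,
})

# Stand-in for the anomalous sample from 2022
first_day_2022 = data_2021.sample(1, random_state=1)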


Expected behavior

I believe this is related to the pandas version I am running (2.0.2), because the same code works fine on pandas 1.5.0.

Version information:

pandas 2.0.2 (the code works on pandas 1.5.0), Python 3.10 (per the traceback paths)

Additional context

There is no "object"-dtype column in my dataframe, so I am not sure why it ends up in that code path ...
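For reference, a quick way to double-check that no input column has object dtype (variable names as in the snippet above, not part of the original report):

# Inspect the dtypes of the training data and the anomaly sample
print(data_2021.dtypes)
print(data_2021.select_dtypes(include='object').columns)  # expected to be empty
print(first_day_2022.dtypes)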

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[114], line 1
----> 1 attributions = gcm.attribute_anomalies(scm, target_node='Profit', anomaly_samples=first_day_2022.head(1))
      3 #bar_plot({k: v[0] for k, v in attributions.items()}, ylabel='Anomaly attribution score')

File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/anomaly.py:141, in attribute_anomalies(causal_model, target_node, anomaly_samples, anomaly_scorer, attribute_mean_deviation, num_distribution_samples, shapley_config)
    138 noise_dependent_function, nodes_order = get_noise_dependent_function(causal_model, target_node)
    139 anomaly_scorer.fit(node_samples[target_node].to_numpy())
--> 141 attributions = attribute_anomaly_scores(
    142     noise_of_anomaly_samples[nodes_order].to_numpy(),
    143     noise_samples[nodes_order].to_numpy(),
    144     lambda x: anomaly_scorer.score(noise_dependent_function(x)),
    145     attribute_mean_deviation,
    146     shapley_config,
    147 )
    149 return {node: attributions[:, i] for i, node in enumerate(nodes_order)}

File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/anomaly.py:190, in attribute_anomaly_scores(anomaly_samples, distribution_samples, anomaly_scoring_func, attribute_mean_deviation, shapley_config)
    188     expectation_of_score = np.mean(anomaly_scoring_func(distribution_samples))
    189 else:
--> 190     anomaly_scores = anomaly_scoring_func(anomaly_samples)
    192 def set_function(subset: np.ndarray) -> Union[np.ndarray, float]:
    193     feature_samples = permute_features(distribution_samples, np.arange(0, subset.shape[0])[subset == 0], True)

File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/anomaly.py:144, in attribute_anomalies.<locals>.<lambda>(x)
    138 noise_dependent_function, nodes_order = get_noise_dependent_function(causal_model, target_node)
    139 anomaly_scorer.fit(node_samples[target_node].to_numpy())
    141 attributions = attribute_anomaly_scores(
    142     noise_of_anomaly_samples[nodes_order].to_numpy(),
    143     noise_samples[nodes_order].to_numpy(),
--> 144     lambda x: anomaly_scorer.score(noise_dependent_function(x)),
    145     attribute_mean_deviation,
    146     shapley_config,
    147 )
    149 return {node: attributions[:, i] for i, node in enumerate(nodes_order)}

File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/_noise.py:107, in _get_exact_noise_dependent_function.<locals>.predict_method(noise_samples)
    106 def predict_method(noise_samples: np.ndarray) -> np.ndarray:
--> 107     return compute_data_from_noise(causal_model, pd.DataFrame(noise_samples, columns=[x for x in nodes_order]))[
    108         target_node
    109     ].to_numpy()

File /opt/conda/lib/python3.10/site-packages/dowhy/gcm/_noise.py:28, in compute_data_from_noise(causal_model, noise_data)
     26         data[node] = noise_data[node].to_numpy()
     27     else:
---> 28         data[node] = causal_model.causal_mechanism(node).evaluate(
     29             data[get_ordered_predecessors(causal_model.graph, node)].to_numpy(), noise_data[node].to_numpy()
     30         )
     32 return data

File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:3950, in DataFrame.__setitem__(self, key, value)
   3947     self._setitem_array([key], value)
   3948 else:
   3949     # set column
-> 3950     self._set_item(key, value)

File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4143, in DataFrame._set_item(self, key, value)
   4133 def _set_item(self, key, value) -> None:
   4134     """
   4135     Add series to DataFrame in specified column.
   4136 
   (...)
   4141     ensure homogeneity.
   4142     """
-> 4143     value = self._sanitize_column(value)
   4145     if (
   4146         key in self.columns
   4147         and value.ndim == 1
   4148         and not is_extension_array_dtype(value)
   4149     ):
   4150         # broadcast across multiple columns if necessary
   4151         if not self.columns.is_unique or isinstance(self.columns, MultiIndex):

File /opt/conda/lib/python3.10/site-packages/pandas/core/frame.py:4871, in DataFrame._sanitize_column(self, value)
   4869 if is_list_like(value):
   4870     com.require_length_match(value, self.index)
-> 4871 return sanitize_array(value, self.index, copy=True, allow_2d=True)

File /opt/conda/lib/python3.10/site-packages/pandas/core/construction.py:569, in sanitize_array(data, index, dtype, copy, allow_2d)
    567 subarr = data
    568 if data.dtype == object:
--> 569     subarr = maybe_infer_to_datetimelike(data)
    571 if subarr is data and copy:
    572     subarr = subarr.copy()

File /opt/conda/lib/python3.10/site-packages/pandas/core/dtypes/cast.py:1181, in maybe_infer_to_datetimelike(value)
   1178     raise TypeError(type(value))  # pragma: no cover
   1179 if value.ndim != 1:
   1180     # Caller is responsible
-> 1181     raise ValueError(value.ndim)  # pragma: no cover
   1183 if not len(value):
   1184     return value

ValueError: 2
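The last frames point at compute_data_from_noise assigning the output of evaluate() as a single DataFrame column, and the ValueError: 2 indicates that the assigned value is a 2-D, object-dtype array, which pandas 2.x rejects in maybe_infer_to_datetimelike. A minimal sketch (an assumption based on the traceback, not taken from the issue) that triggers the same error on pandas 2.0.2:

import numpy as np
import pandas as pd

df = pd.DataFrame({'Profit': [1.0, 2.0, 3.0]})

# A (3, 1) object-dtype array, similar to what the traceback suggests
# evaluate() returns here
col = np.array([[0.1], [0.2], [0.3]], dtype=object)

# Raises ValueError: 2 on pandas 2.0.2 (the ndim check in maybe_infer_to_datetimelike);
# per the report, the same DoWhy workflow runs fine on pandas 1.5.0
df['Noise'] = col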
priamai commented 11 months ago

This is a fully reproducible notebook.

bloebp commented 11 months ago

Thanks for flagging this! I will take a look.

bloebp commented 10 months ago

Opened a PR to fix this. Thanks again for reporting!
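The PR itself is not quoted in this thread. Purely as an illustration of the kind of change that sidesteps the pandas 2.x restriction (not necessarily DoWhy's actual fix), a hypothetical helper could flatten single-column 2-D outputs before assigning them as a DataFrame column:

import numpy as np

def as_column(values):
    # Hypothetical helper: pandas 2.x only accepts 1-D values when setting a
    # single column, so flatten (n, 1) arrays to shape (n,)
    values = np.asarray(values)
    if values.ndim == 2 and values.shape[1] == 1:
        return values.reshape(-1)
    return values

# e.g. inside compute_data_from_noise:
# data[node] = as_column(causal_model.causal_mechanism(node).evaluate(...))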

bloebp commented 10 months ago

Fixed