mmschlk / iXAI

Fast and incremental explanations for online machine learning models. Works best with the river framework.
MIT License
51 stars 2 forks source link

iPFI and iSAGE with categorical inputs #85

Open rjagtani opened 1 year ago

rjagtani commented 1 year ago

iPFI and iSAGE attribute all importance to categorical variables and return ~0 fi scores for all numerical variables. The issue can be reproduced using this code.

# Load imports
from river import metrics
from river.utils import Rolling
from river.ensemble import AdaptiveRandomForestRegressor
from river.datasets import Bikes
from river import preprocessing
from river import compose
from ixai.explainer import IncrementalPFI, IncrementalSage, IncrementalPDP
from ixai.utils.wrappers import RiverWrapper
from ixai.storage import GeometricReservoirStorage
from ixai.imputer import MarginalImputer
#%%
# Set config variables
RANDOM_SEED = 42
#%%
# Load stream
stream = Bikes()
#%%
# Printing an observation from this stream
for n, (x,y) in enumerate(stream):
    if n>0:
        break
    print(x)
    print(y)
#%%
# Include features that are passed to the model
cat_vars = ['station', 'description']
num_vars = ['clouds','humidity','pressure','temperature','wind']
feature_names = num_vars + ['description']
#feature_names = num_vars
#%%
# Model and training setup
model = compose.Pipeline(
    compose.Select('description') | preprocessing.OneHotEncoder()
    | AdaptiveRandomForestRegressor(seed=RANDOM_SEED)
    )

#model = AdaptiveRandomForestRegressor(seed=RANDOM_SEED)
#%%
# Use River Wrapper around model function to standardize model outputs, Initialize loss and training metric depending on ML task
model_function = RiverWrapper(model.predict_one)
loss_metric = metrics.MAE()
training_metric = Rolling(metrics.MAE(), window_size=1000)
#%%
# Instantiate Storage Object and Imputer
storage = GeometricReservoirStorage(
    size=500,
    store_targets=False
)

imputer = MarginalImputer(
    model_function=model_function,
    storage_object=storage,
    sampling_strategy="joint"
)
#%%
# Instantiate Incremental PFI Explainer
incremental_pfi = IncrementalPFI(
    model_function=model_function,
    loss_function=loss_metric,
    feature_names=feature_names,
    smoothing_alpha=0.01,
    n_inner_samples=4,
    imputer=imputer,
    storage=storage
)
#%%
# Instantiate Incremental SAGE Explainer
incremental_sage = IncrementalSage(
    model_function=model_function,
    loss_function=loss_metric,
    imputer=imputer,
    storage=storage,
    feature_names=feature_names,
    smoothing_alpha=0.01,
    n_inner_samples=4
)
#%%
# Instantiate Incremental PDP Explainer
incremental_pdp = IncrementalPDP(
    model_function=model_function,
    gridsize=8,
    dynamic_setting=True,
    smoothing_alpha=0.01,
    pdp_feature='humidity',
    storage=storage,
    storage_size=100,
    is_classification=False
)
#%%
# Iterate over stream and explain each instance using explainers
for (n, (x_i, y_i)) in enumerate(stream, start=1):
    x_i = dict((k, x_i[k]) for k in feature_names)
    y_i_pred = model.predict_one(x_i)
    #print(y_i_pred)
    training_metric.update(y_true=y_i, y_pred=y_i_pred)

    # explaining
    inc_sage = incremental_sage.explain_one(x_i, y_i)
    inc_fi_pfi = incremental_pfi.explain_one(x_i, y_i, update_storage=False)
    inc_pdp = incremental_pdp.explain_one(x_i, update_storage=False)

    # learning
    model.learn_one(x_i, y_i)
    #print("Here")
    if n % 250 == 0:
        print(f"{n}: perf {training_metric.get()}\n"
              f"{n}: sage  {incremental_sage.importance_values}\n"
              f"{n}: pfi  {incremental_pfi.importance_values}\n")

    if n >= 1000:
        incremental_pdp.plot_pdp()
        break
mmschlk commented 1 year ago

Thank you!

mmschlk commented 1 year ago

I am pretty sure this comes from the pipeline object...