h2oai / h2o-3

H2O is an Open Source, Distributed, Fast & Scalable Machine Learning Platform: Deep Learning, Gradient Boosting (GBM) & XGBoost, Random Forest, Generalized Linear Modeling (GLM with Elastic Net), K-Means, PCA, Generalized Additive Models (GAM), RuleFit, Support Vector Machine (SVM), Stacked Ensembles, Automatic Machine Learning (AutoML), etc.
http://h2o.ai
Apache License 2.0
6.81k stars 1.99k forks source link

Multinomial Logistic Regression Standardized Coefficient Plot Edits #9178

Open exalate-issue-sync[bot] opened 1 year ago

exalate-issue-sync[bot] commented 1 year ago

Issue: The std_coef_plot method of the h2o.model.model_base.ModelBase class plots multinomial logistic regression standardized coefficients as the sum of the absolute values of each class's regression coefficients. As someone who wants to draw proper inferences, I am more interested in a standardized regression coefficient plot for each class, showing whether each coefficient contributes positively or negatively to the likelihood of that class.

Proposed: The new methods below, intended for the h2o.model.model_base.ModelBase class, provide a standardized coefficient plot for a specified response class.

def coef_norm_NEW(self, category=None):
    """
    Return coefficients fitted on the standardized data (requires standardize = True, which is on by default).

    These coefficients can be used to evaluate variable importance.

    :param category: response class whose standardized coefficients are returned. Required when the
        model category is "Multinomial" (it selects the ``std_coefs_class_<category>`` column);
        ignored otherwise.

    :returns: dict mapping coefficient name to standardized coefficient, or None when the model
        output holds no coefficients table.
    :raises ValueError: if the model is multinomial and ``category`` is None.
    """
    output = self._model_json["output"]

    if output["model_category"] == "Multinomial":
        if category is None:
            # Without a class there is no single column to read; fail with a clear message
            # instead of a TypeError from concatenating str and None.
            raise ValueError("category must be specified for multinomial models")
        tbl = output["coefficients_table_multinomials_with_class_names"]
        # Check for a missing table BEFORE converting it, otherwise the guard can never trigger.
        if tbl is None:
            return None
        column = "std_coefs_class_" + str(category)
        frame = tbl.as_data_frame()
        return dict(zip(frame["names"], frame[column]))

    tbl = output["coefficients_table"]
    if tbl is None:
        return None
    return dict(zip(tbl["names"], tbl["standardized_coefficients"]))

def std_coef_plot_NEW(self, category=None, num_of_features=None, server=False):
    """
    Plot a GLM model's standardized coefficient magnitudes.

    The intercept is dropped, the remaining coefficients are sorted by absolute value (largest bar
    on top) and each bar is colored by the sign of its coefficient.

    :param category: response class whose coefficients are plotted; passed through to
        ``coef_norm_NEW``.
    :param num_of_features: the number of features shown in the plot (default: all of them).
    :param server: passed to ``_get_matplotlib_pyplot``; when true the figure is built but
        ``plt.show()`` is not called.

    :returns: None.
    :raises H2OValueError: if the model is not a GLM, or no standardized coefficients are available.
    """
    assert_is_type(num_of_features, None, I(int, lambda x: x > 0))

    # check that model is a glm
    if self._model_json["algo"] != "glm":
        raise H2OValueError("This function is available for GLM models only")

    plt = _get_matplotlib_pyplot(server)
    if not plt:
        return

    coefs = self.coef_norm_NEW(category)
    if coefs is None:
        # coef_norm_NEW returns None when the coefficients table is missing.
        raise H2OValueError("Standardized coefficients are not available for this model")

    # use the same colors as Flow: blue for positive (including zero), orange for negative
    positive_color = "#1F77B4"
    negative_color = "#FF7F0E"

    # drop the intercept, then sort by the coefficient's absolute value
    ranked = sorted(
        ((name, coef) for name, coef in coefs.items() if name != "Intercept"),
        key=lambda item: abs(item[1]),
        reverse=True,
    )

    # default is to show all the features
    if num_of_features is None:
        num_of_features = len(ranked)
    shown = ranked[:num_of_features]

    feature_labels = [name for name, _ in shown]
    magnitudes = [abs(coef) for _, coef in shown]
    bar_colors = [positive_color if coef >= 0 else negative_color for _, coef in shown]
    # bar centers on the y axis, counted down from the top so the largest bar appears first
    positions = [len(ranked) - 1 - rank for rank in range(len(shown))]

    # plot horizontal bars
    fig, ax = plt.subplots(1, 1, figsize=(14, 10))
    plt.barh(positions, magnitudes, align="center", height=0.8, color=bar_colors, edgecolor="none")

    # Hide the right and top spines, color others grey
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.spines["bottom"].set_color("#7B7B7B")
    ax.spines["left"].set_color("#7B7B7B")
    # Only show ticks on the left and bottom spines
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")
    plt.yticks(positions, feature_labels)
    # a single bar gets a wider vertical margin so it does not fill the whole axes
    ax.margins(None, 0.5 if num_of_features == 1 else 0.05)

    # legend: only list the signs that are actually present; if neither or both are present,
    # show both entries (blue is always positive, orange always negative)
    has_positive = positive_color in bar_colors
    has_negative = negative_color in bar_colors
    if has_positive and not has_negative:
        legend_entries = [("Positive", positive_color)]
    elif has_negative and not has_positive:
        legend_entries = [("Negative", negative_color)]
    else:
        legend_entries = [("Positive", positive_color), ("Negative", negative_color)]

    # proxy artists used as legend entries; the marker size is set on the proxy itself so no
    # private legend attributes need to be touched afterwards
    markers = [plt.Line2D([0, 0], [0, 0], color=color, marker="s", linestyle="", markersize=10)
               for _, color in legend_entries]
    plt.legend(markers, [label for label, _ in legend_entries],
               numpoints=1, loc="best", frameon=False, fontsize=13)

    # tick_params expects booleans for these switches
    plt.tick_params(axis="x", which="minor", bottom=False, top=False, labelbottom=False)
    plt.title("Standardized Coef. Magnitudes: H2O GLM", fontsize=20)

    # show plot
    if not server:
        plt.show()

{code}

h2o-ops commented 1 year ago

JIRA Issue Migration Info

Jira Issue: PUBDEV-6448 Assignee: New H2O Bugs Reporter: Former user State: Open Fix Version: N/A Attachments: N/A Development PRs: N/A