H2O is an Open Source, Distributed, Fast & Scalable Machine Learning Platform: Deep Learning, Gradient Boosting (GBM) & XGBoost, Random Forest, Generalized Linear Modeling (GLM with Elastic Net), K-Means, PCA, Generalized Additive Models (GAM), RuleFit, Support Vector Machine (SVM), Stacked Ensembles, Automatic Machine Learning (AutoML), etc.
Issue: std_coef_plot method in h2o.model.model_base.ModelBase class plots multinomial logistic regression standardized coefficients as the sum of the absolute values of each class's regression coefficients. As someone who wants to plot proper inference, I am more interested in the standardized regression coefficient plots for each class, and whether they positively/negatively contribute to the likelihood of that class.
Proposed: New method functions below for h2o.model.model_base.ModelBase class will provide standardized coefficient plots for specified response class.
{code:python}
def coef_norm_NEW(self, category=None):
    """
    Return coefficients fitted on the standardized data (requires standardize = True, which is on by default).

    These coefficients can be used to evaluate variable importance.

    :param category: for multinomial models, the name of the response class whose
        standardized coefficients should be returned (required in that case);
        ignored for all other model categories.
    :returns: a ``{feature_name: standardized_coefficient}`` dict, or None when the
        model has no coefficients table.
    :raises ValueError: if the model is multinomial and ``category`` is not given.
    """
    if self._model_json["output"]["model_category"] == "Multinomial":
        if category is None:
            # Without a class name we cannot select the per-class coefficient column;
            # fail explicitly instead of with an opaque TypeError on string concat.
            raise ValueError("category is required for multinomial models")
        tbl = self._model_json["output"]["coefficients_table_multinomials_with_class_names"]
        # Guard BEFORE converting/indexing the table -- the original checked for None
        # only after indexing it, so the check could never fire.
        if tbl is None:
            return None
        column = "std_coefs_class_" + category
        frame = tbl.as_data_frame()[["names", column]]
        return {name: coef for name, coef in zip(frame["names"], frame[column])}
    else:
        tbl = self._model_json["output"]["coefficients_table"]
        if tbl is None:
            return None
        return {name: coef for name, coef in zip(tbl["names"], tbl["standardized_coefficients"])}
def std_coef_plot_NEW(self, category=None, num_of_features=None, server=False):
    """
    Plot a GLM model's standardized coefficient magnitudes for one response class.

    :param category: for multinomial models, the response class whose standardized
        coefficients are plotted (passed through to ``coef_norm_NEW``); ignored for
        other model categories.
    :param num_of_features: the number of features shown in the plot (default: all).
    :param server: if True, do not display the plot interactively (no ``plt.show()``).
    :returns: None.
    :raises H2OValueError: if the model is not a GLM.
    """
    assert_is_type(num_of_features, None, I(int, lambda x: x > 0))
    # Standardized-coefficient plots only make sense for GLM models.
    if self._model_json["algo"] != "glm":
        raise H2OValueError("This function is available for GLM models only")
    plt = _get_matplotlib_pyplot(server)
    if not plt:
        return
    # Drop the intercept, then sort (label, coef) pairs by |coef|, largest first.
    unsorted_norm_coef = self.coef_norm_NEW(category).items()
    drop_intercept = [tup for tup in unsorted_norm_coef if tup[0] != "Intercept"]
    norm_coef = sorted(drop_intercept, key=lambda x: abs(x[1]), reverse=True)
    # Color-code the sign of each coefficient (same palette as Flow):
    # blue for >= 0, dark orange for < 0.
    signage = ["#1F77B4" if coef >= 0 else "#FF7F0E" for _, coef in norm_coef]
    # Feature labels and corresponding magnitudes.
    feature_labels = [tup[0] for tup in norm_coef]
    norm_coef_magn = [abs(tup[1]) for tup in norm_coef]
    # Bar centers on the y axis, flipped so the largest bar appears at the top.
    pos = range(len(feature_labels))[::-1]
    val = norm_coef_magn
    # Default is to show all the features.
    if num_of_features is None:
        num_of_features = len(val)
    # Horizontal bar plot.
    fig, ax = plt.subplots(1, 1, figsize=(14, 10))
    if num_of_features == 1:
        # Separate case so a single bar is padded instead of stretched.
        plt.barh(pos[0], val[0],
                 align="center", height=0.8, color=signage[0], edgecolor="none")
        ax.margins(None, 0.5)
    else:
        plt.barh(pos[0:num_of_features], val[0:num_of_features],
                 align="center", height=0.8, color=signage[0:num_of_features], edgecolor="none")
        ax.margins(None, 0.05)
    # Hide the right and top spines, color the others grey, and only show ticks on
    # the left and bottom spines. (Done once here -- the original repeated this
    # styling block in both branches and again after the legend.)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.spines["bottom"].set_color("#7B7B7B")
    ax.spines["left"].set_color("#7B7B7B")
    ax.yaxis.set_ticks_position("left")
    ax.xaxis.set_ticks_position("bottom")
    # Legend entries are fake line handles -- one marker per sign actually present.
    # (The original positive-only branch built one marker per *bar*, mis-pairing
    # markers with the single "Positive" label; one marker per sign is correct.)
    shown = signage[0:num_of_features]
    has_pos = "#1F77B4" in shown
    has_neg = "#FF7F0E" in shown
    if has_pos and not has_neg:
        color_ids, colors = ("Positive",), ["#1F77B4"]
    elif has_neg and not has_pos:
        color_ids, colors = ("Negative",), ["#FF7F0E"]
    else:
        # Blue should always be positive, orange negative.
        color_ids, colors = ("Positive", "Negative"), ["#1F77B4", "#FF7F0E"]
    markers = [plt.Line2D([0, 0], [0, 0], color=color, marker="s", linestyle="")
               for color in colors]
    lgnd = plt.legend(markers, color_ids, numpoints=1, loc="best", frameon=False, fontsize=13)
    for handle in lgnd.legendHandles:
        handle._legmarker.set_markersize(10)
    # Final tick labels; this single call matches the original's effective state,
    # where the in-branch yticks calls were overridden by a later one anyway.
    plt.yticks(pos[0:num_of_features], feature_labels[0:num_of_features])
    plt.tick_params(axis="x", which="minor", bottom="off", top="off", labelbottom="off")
    plt.title("Standardized Coef. Magnitudes: H2O GLM", fontsize=20)
    # Show plot unless rendering server-side.
    if not server:
        plt.show()
Issue: std_coef_plot method in h2o.model.model_base.ModelBase class plots multinomial logistic regression standardized coefficients as the sum of the absolute values of each class's regression coefficients. As someone who wants to plot proper inference, I am more interested in the standardized regression coefficient plots for each class, and whether they positively/negatively contribute to the likelihood of that class.
Proposed: New method functions below for h2o.model.model_base.ModelBase class will provide standardized coefficient plots for specified response class.
{code:python} def coef_norm_NEW(self, category=None): """ Return coefficients fitted on the standardized data (requires standardize = True, which is on by default).
def std_coef_plot_NEW(self, category=None, num_of_features=None, server=False): """ Plot a GLM model"s standardized coefficient magnitudes.
{code}