Open linxz-nice opened 2 months ago
Hi @linxz-nice, could you please share the whole code block you ran? From the code you shared, I cannot reproduce the issue.
We don't calculate the exact AUC value. H2O-3 uses the trapezoidal rule to approximate the area under the ROC curve from binned values. We also optimize the threshold to get the best predictions, so the thresholds in scikit-learn can differ from those in H2O-3.
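For intuition, here is a minimal sketch of the difference this makes, using plain NumPy and scikit-learn rather than H2O-3's actual internals; the synthetic data and the choice of 100 quantile bins are assumptions for illustration only:

```python
# Illustrative only: exact AUC vs. a trapezoidal approximation over binned
# ROC points. This mimics the idea described above, not H2O-3's internals.
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(1)
y = rng.integers(0, 2, 10_000)                                # synthetic labels
p = np.clip(y * 0.3 + rng.normal(0.5, 0.25, y.size), 0, 1)    # synthetic scores

exact = roc_auc_score(y, p)   # exact AUC, one threshold per distinct score

# Bin the scores into a limited set of thresholds, then integrate the
# resulting ROC curve with the trapezoidal rule.
thresholds = np.quantile(p, np.linspace(0, 1, 100))
tpr = np.array([(p[y == 1] >= t).mean() for t in thresholds])
fpr = np.array([(p[y == 0] >= t).mean() for t in thresholds])
order = np.argsort(fpr)
fpr = np.concatenate(([0.0], fpr[order]))   # anchor the curve at (0, 0)
tpr = np.concatenate(([0.0], tpr[order]))
approx = np.trapz(tpr, fpr)

print(f"exact AUC:  {exact:.6f}")
print(f"binned AUC: {approx:.6f}")   # close, but generally not identical
```

With enough bins the two values agree closely, but they are generally not bit-identical, which matches the behaviour described above.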
Model training and prediction code:

```python
import h2o
import numpy as np
from h2o.automl import H2OAutoML


def model(train_file, valid_file, outdir):
    train = h2o.upload_file(train_file)
    valid = h2o.upload_file(valid_file)

    # Identify predictors and response
    x = train.columns   # predictor columns
    y = "group"         # response variable
    s = "sample"        # sample-ID column, kept out of the predictors
    x.remove(y)
    x.remove(s)

    # For binary classification, the response should be a factor
    train[y] = train[y].asfactor()
    valid[y] = valid[y].asfactor()
    valid1 = valid[s]
    train1 = train[s]
    domain = train[y].categories()

    aml = H2OAutoML(max_runtime_secs=7200, max_models=20, seed=1,
                    nfolds=10, include_algos=["GLM", "DRF"])  # config
    aml.train(x=x, y=y, training_frame=train, validation_frame=valid)  # fit models

    lb = h2o.make_leaderboard(aml, scoring_data='train')  # leaderboard
    h2o.export_file(lb, f'{outdir}/leaderboard-train.csv', force=True)

    for model_id in list(lb.as_data_frame()['model_id']):
        m = h2o.get_model(model_id)
        # print(m.auc())
        h2o.save_model(m, path=f'{outdir}/{model_id}', force=True)  # save model
        m.varimp_plot(save_plot_path=f'{outdir}/{model_id}/varimp_plot.png')  # variable importance plot
        varimp_df = h2o.H2OFrame(m.varimp(use_pandas=True))  # variable importance as a pandas dataframe
        h2o.export_file(varimp_df, f'{outdir}/{model_id}/variable_importances.csv', force=True)
        m.learning_curve_plot(save_plot_path=f'{outdir}/{model_id}/learn_curve.png')  # learning curve plot

        train_thresholds = m.thresholds_and_metric_scores(train=True)['threshold']
        valid_thresholds = m.thresholds_and_metric_scores(valid=True)['threshold']

        # Training-set performance
        perf = m.model_performance(train=True)

        # Youden's J statistic: pick the threshold maximizing TPR - FPR
        thresholds, fpr = zip(*perf.fpr(thresholds=train_thresholds))
        thresholds, tpr = zip(*perf.tpr(thresholds=train_thresholds))
        youden_index = np.argmax(np.array(tpr) - np.array(fpr))
        best_threshold = thresholds[youden_index]

        # Predict on the training set and export alongside the sample IDs
        train_pre = m.predict(train)
        train_pre = train1.cbind(train_pre)
        h2o.export_file(train_pre, f'{outdir}/{model_id}/train.csv', force=True)

        # Export the ROC curves for plotting
        rocs = m.roc(train=True, valid=True)
        dfs = []
        for name, roc in rocs.items():
            fprs, tprs = roc
            a = h2o.H2OFrame({'fprs': fprs, 'tprs': tprs, 'type': [name] * len(fprs)})
            dfs.append(a)
        df = dfs[0].concat(dfs[1:], axis=0)
        h2o.export_file(df, f'{outdir}/{model_id}/roc.csv', force=True)
```
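For a direct comparison, one could also log H2O-3's own reported AUC and its optimized threshold inside the loop above; a minimal sketch, reusing `m`, `perf`, and `best_threshold` from the code above:

```python
# H2O-3's own view of the training metrics, for side-by-side comparison
perf = m.model_performance(train=True)
print("H2O training AUC (binned, trapezoidal):", m.auc(train=True))
print("H2O max-F1 threshold:", perf.find_threshold_by_max_metric("f1"))
print("Youden threshold from the loop above:", best_threshold)
```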
After the predicted values are output for the training set, the AUC is computed with scikit-learn as follows:

```python
from sklearn.metrics import roc_curve, auc

fprs, tprs, thresholds = roc_curve(y_true, predict)
auc(fprs, tprs)
```
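One thing worth double-checking on the scikit-learn side: `roc_curve` expects a continuous score (the positive-class probability), not the hard predicted label. A hypothetical sketch; `pred_df` (the H2O prediction frame converted to pandas) and `y_true` are assumed to exist, a numeric 0/1 response is assumed, and the position of the probability column depends on your factor levels:

```python
from sklearn.metrics import roc_curve, auc

# AUC from the positive-class probability (last column of H2O's binomial
# prediction frame; its actual name depends on the response's factor levels)
scores = pred_df.iloc[:, -1]
fprs, tprs, _ = roc_curve(y_true, scores)
print("AUC from probabilities:", auc(fprs, tprs))

# Passing the hard label instead collapses the ROC curve to ~3 points and
# usually yields a noticeably different AUC
fprs_l, tprs_l, _ = roc_curve(y_true, pred_df["predict"])
print("AUC from hard labels:  ", auc(fprs_l, tprs_l))
```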
H2O version, Operating System and Environment: H2O 3.46.0.2, Linux