Open choshin84 opened 4 years ago
If none of the features are nominal (categorical), the mlxtend library seems to be a good option: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/
If some features are nominal, the code below works instead.
# Step-forward (sequential forward) feature selection with LightGBM, repeated
# for every hyper-parameter combination in `param_list`.
#
# Names this script section reads but does not define (they must already exist):
#   param_list, k_feature, selected_features, month_list, cv_sample,
#   df_all, view_all, prog_all, member_all, data_sample_rate
#
# Names this script section produces:
#   result_cv   - per-fold scores of the winning candidate of every step,
#                 accumulated over all parameter sets
#   sfs_list    - features selected for the most recent parameter set
#   auc_pr_list - mean AUC-PR reached after each step (most recent parameter set)
result_cv = pd.DataFrame()
t000 = time.time()
param_iter = 0
for param_iter, param in enumerate(param_list, start=1):
    print("Parameter tuning start: ", param_iter, " out of ", len(param_list), param)
    t00 = time.time()
    sfs_list = []
    auc_pr_list = []
    if k_feature > len(selected_features):
        # The condition is "requested > available", so report it with ">".
        print("Too many features selected: ", k_feature, ">", len(selected_features))
        sys.exit()
    # param is indexed as (learning_rate, n_estimators, max_depth).
    model = lgb.LGBMClassifier(
        learning_rate=param[0],
        n_estimators=param[1],
        max_depth=param[2],
        random_state=123,
        n_jobs=-1,
    )
    for _ in range(k_feature):
        print("Step forward feature selection # column: ", len(sfs_list) + 1)
        # Reset the per-step winner so that a step with no usable candidate
        # cannot silently reuse the previous step's column.
        threshold = float("-inf")
        best_col = None
        best_result = None
        t0 = time.time()
        # Keep the order of selected_features (de-duplicated) so that ties are
        # broken deterministically; iterating a set gives an arbitrary order.
        remaining = [c for c in dict.fromkeys(selected_features) if c not in sfs_list]
        for column in remaining:
            candidate_cols = sfs_list + [column]
            # CV with a small sample, 10%
            result = []
            # Train on month i, validate on month i + 2.
            for month_train, month_val in zip(month_list, month_list[2:]):
                print("\tTraining data set: ", month_train, ", Validation data set: ", month_val)
                train_df, val_df = cv_sample(month_train, month_val, df_all, view_all, prog_all, member_all, data_sample_rate)
                X_train = train_df.loc[:, candidate_cols].copy()
                X_val = val_df.loc[:, candidate_cols].copy()
                y_train = train_df['Target']
                # Cast nominal columns to pandas 'category' on BOTH frames; the
                # original cast only the training frame and passed the
                # validation frame to the model uncast.
                for frame in (X_train, X_val):
                    for c in candidate_cols:
                        col_type = frame[c].dtype
                        if col_type == 'object' or col_type.name == 'category':
                            frame[c] = frame[c].astype('category')
                model.fit(X_train, y_train)
                # Average precision (AUC-PR) must be computed from continuous
                # scores, not hard class labels.
                # NOTE(review): assumes a binary 'Target' whose positive class
                # is model.classes_[1] - confirm.
                val_scores = model.predict_proba(X_val)[:, 1]
                auc_pr = average_precision_score(val_df['Target'].values, val_scores)
                result.append([candidate_cols, auc_pr])
            result = pd.DataFrame(result, columns=['Column', 'AUC_PR'])
            result['Param'] = str(param)
            mean_auc_pr = result['AUC_PR'].mean()
            # A NaN mean (no folds) never compares greater, so it is skipped.
            if mean_auc_pr > threshold:
                best_col = column
                threshold = mean_auc_pr
                best_result = result
        if best_col is None:
            # Happens when there are no candidates left or no CV folds
            # (fewer than three entries in month_list).
            print("No candidate feature could be evaluated; stopping selection early.")
            break
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        result_cv = pd.concat([result_cv, best_result])
        sfs_list.append(best_col)
        auc_pr_list.append(threshold)
        print("Selected features: ", sfs_list)
        print("CV-iteration completed...: # columns: ", len(sfs_list), ", ", int(time.time() - t0), "sec")
    print("Step forward feature selection completed...", int(time.time() - t00), "sec")
print("Parameter tuning end...", int(time.time() - t000), "sec")
Tweet summary
sklearn doesn't have a generic forward feature selection module, only RFE (recursive feature elimination). The mlxtend module is worth trying instead.
Useful link
https://github.com/rasbt/mlxtend