choshin84 / learning_memo

personal learning memo
0 stars 0 forks source link

Feature selection: Forward feature selection #16

Open choshin84 opened 4 years ago

choshin84 commented 4 years ago

Tweet summary

sklearn doesn't have a generic forward feature selection module, only RFE (recursive feature elimination). The mlxtend module is worth trying instead.

Useful link

https://github.com/rasbt/mlxtend

choshin84 commented 4 years ago

If no features are nominal, the mlxtend library seems to be a good option: http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/

choshin84 commented 4 years ago

If some features are nominal (categorical), the code below works.

# Greedy step-forward feature selection, repeated for every hyper-parameter set.
#
# Names expected from the enclosing scope (none are defined in this snippet):
#   param_list        - sequence of (learning_rate, n_estimators, max_depth)
#   k_feature         - number of features to pick for each parameter set
#   selected_features - candidate feature column names
#   month_list        - ordered sequence; entry i is used for training and
#                       entry i + 2 for validation
#   cv_sample(...)    - returns (train_df, val_df), each with a 'Target' column
#   df_all, view_all, prog_all, member_all, data_sample_rate
#                     - passed through to cv_sample unchanged
#
# Result: result_cv holds, for every (parameter set, selection step), the
# per-fold AUC_PR rows of the winning candidate.
# NOTE(review): assumes a binary target (column 1 of predict_proba is used as
# the positive-class score) - confirm against the data.
result_cv = pd.DataFrame()
cv_frames = []
t000 = time.time()

# De-duplicate while keeping the caller's order, so ties between candidates
# are broken the same way on every run (set iteration order is not stable).
candidate_pool = list(dict.fromkeys(selected_features))
if k_feature > len(candidate_pool):
    print("Too many features selected: ", k_feature, ">", len(candidate_pool))
    sys.exit(1)  # non-zero status: this is an error, not a normal exit

for param_iter, param in enumerate(param_list, start=1):
    print("Parameter tuning start: ", param_iter, " out of ", len(param_list), param)
    t00 = time.time()
    sfs_list = []       # features selected so far, in selection order
    auc_pr_list = []    # mean AUC_PR reached after each selection step
    model = lgb.LGBMClassifier(learning_rate=param[0], n_estimators=param[1], max_depth=param[2],
                               random_state=123, n_jobs=-1)
    for _ in range(k_feature):
        print("Step forward feature selection # column: ", len(sfs_list) + 1)
        # Start below any achievable score so the best candidate is always
        # recorded, even when every mean AUC_PR is 0.
        threshold = float("-inf")
        best_col = None
        best_result = None
        t0 = time.time()
        for column in [c for c in candidate_pool if c not in sfs_list]:
            trial_features = sfs_list + [column]
            # CV with small sample, 10%
            # NOTE(review): cv_sample is re-run for every candidate; if it is
            # deterministic the folds could be built once per step instead.
            result = []
            for month_train, month_val in zip(month_list, month_list[2:]):
                print("\tTraining data set: ", month_train, ", Validation data set: ", month_val)
                train_df, val_df = cv_sample(month_train, month_val, df_all, view_all, prog_all, member_all, data_sample_rate)
                X = train_df.loc[:, trial_features].copy()
                X_val = val_df.loc[:, trial_features].copy()
                # LightGBM only accepts nominal columns as pandas 'category';
                # convert train and validation alike so predict sees the same dtypes.
                for frame in (X, X_val):
                    for c in frame.columns:
                        if frame[c].dtype == 'object':
                            frame[c] = frame[c].astype('category')
                model.fit(X, train_df['Target'])
                # Average precision needs a ranking score, not hard class labels.
                val_scores = model.predict_proba(X_val)[:, 1]
                auc_pr = average_precision_score(val_df['Target'], val_scores)
                result.append([trial_features, auc_pr])
            result = pd.DataFrame(result, columns=['Column', 'AUC_PR'])
            result['Param'] = str(param)
            mean_auc_pr = result['AUC_PR'].mean()
            if mean_auc_pr > threshold:
                best_col = column
                threshold = mean_auc_pr
                best_result = result
        if best_col is None:
            # Happens when there are no folds (fewer than 3 entries in
            # month_list) or every score is NaN; fail loudly rather than
            # re-adding a stale feature from the previous step.
            raise RuntimeError("No candidate feature produced a valid AUC_PR score")
        cv_frames.append(best_result)
        # DataFrame.append was removed in pandas 2.0; rebuild from the list so
        # result_cv stays current if the run is interrupted.
        result_cv = pd.concat(cv_frames)
        sfs_list.append(best_col)
        auc_pr_list.append(threshold)
        print("Selected features: ", sfs_list)
        print("CV-iteration completed...: # columns: ", len(sfs_list), ", ", int(time.time() - t0), "sec")
    print("Step forward feature selection completed...", int(time.time() - t00), "sec")
print("Parameter tuning end...", int(time.time() - t000), "sec")