WMJi opened 7 years ago
The code below is an analysis of the Kobe Bryant shot-selection data, with the author's data-visualization sections omitted. It runs from data processing and cleaning, through feature selection, to model selection in one smooth, fluent pass.
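A note for anyone running this today: the code targets a pre-0.20 scikit-learn. `sklearn.cross_validation` and `sklearn.grid_search` were removed in 0.20, `KFold` lost its `n`/`n_folds` arguments, and the `'log_loss'` scorer string was renamed. A minimal sketch of the modern equivalents (assuming scikit-learn >= 0.20):

```python
# Modern replacements for the deprecated imports used below (scikit-learn >= 0.20)
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

kfold = KFold(n_splits=3, shuffle=True, random_state=7)  # old: KFold(n=..., n_folds=...)
scoring = 'neg_log_loss'  # old: 'log_loss'; the returned scores are negated
```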
```python
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (BaggingClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, VotingClassifier,
                              RandomForestClassifier, AdaBoostClassifier)

######################################
# Data preprocessing
######################################
pd.set_option('display.max_columns', None)
data = pd.read_csv('./data/data.csv')
data.set_index('shot_id', inplace=True)

data["action_type"] = data["action_type"].astype('object')
data["combined_shot_type"] = data["combined_shot_type"].astype('category')
data["game_event_id"] = data["game_event_id"].astype('category')
data["game_id"] = data["game_id"].astype('category')
data["period"] = data["period"].astype('object')
data["playoffs"] = data["playoffs"].astype('category')
data["season"] = data["season"].astype('category')
data["shot_made_flag"] = data["shot_made_flag"].astype('category')
data["shot_type"] = data["shot_type"].astype('category')
data["team_id"] = data["team_id"].astype('category')

# Rows with a missing shot_made_flag are the ones we must predict
unknown_mask = data['shot_made_flag'].isnull()
data_cl = data.copy()  # create a copy of the data frame
target = data_cl['shot_made_flag'].copy()

# Remove some columns
data_cl.drop('team_id', axis=1, inplace=True)        # Always one number
data_cl.drop('lat', axis=1, inplace=True)            # Correlated with loc_y
data_cl.drop('lon', axis=1, inplace=True)            # Correlated with loc_x
data_cl.drop('game_id', axis=1, inplace=True)        # Independent
data_cl.drop('game_event_id', axis=1, inplace=True)  # Independent
data_cl.drop('team_name', axis=1, inplace=True)      # Always LA Lakers
data_cl.drop('shot_made_flag', axis=1, inplace=True)

# Remaining time in the period
data_cl['seconds_from_period_end'] = 60 * data_cl['minutes_remaining'] + data_cl['seconds_remaining']
data_cl['last_5_sec_in_period'] = data_cl['seconds_from_period_end'] < 5
data_cl.drop('minutes_remaining', axis=1, inplace=True)
data_cl.drop('seconds_remaining', axis=1, inplace=True)
data_cl.drop('seconds_from_period_end', axis=1, inplace=True)

# Matchup - (away/home): 'vs' marks a home game, '@' an away game
data_cl['home_play'] = data_cl['matchup'].str.contains('vs').astype('int')
data_cl.drop('matchup', axis=1, inplace=True)

# Game date
data_cl['game_date'] = pd.to_datetime(data_cl['game_date'])
data_cl['game_year'] = data_cl['game_date'].dt.year
data_cl['game_month'] = data_cl['game_date'].dt.month
data_cl.drop('game_date', axis=1, inplace=True)

# loc_x and loc_y binning
data_cl['loc_x'] = pd.cut(data_cl['loc_x'], 25)
data_cl['loc_y'] = pd.cut(data_cl['loc_y'], 25)

# Replace the 20 least common action types with the value 'Other'
rare_action_types = data_cl['action_type'].value_counts().sort_values().index.values[:20]
data_cl.loc[data_cl['action_type'].isin(rare_action_types), 'action_type'] = 'Other'
```
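The `pd.cut` calls above turn the raw court coordinates into 25 equal-width interval categories each, which is what makes them eligible for the one-hot encoding that follows. A toy illustration (the values here are made up, not from the dataset):

```python
import pandas as pd

# Hypothetical loc_x values, just to show the shape of pd.cut's output
s = pd.Series([-240, -10, 60, 230])
print(pd.cut(s, 5))
# Each value becomes an interval label such as (-240.47, -146.0];
# intervals are categories, so get_dummies can one-hot encode them.
```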
```python
######################################
# One-hot encode the categorical columns
######################################
categorial_cols = [
    'action_type', 'combined_shot_type', 'period', 'season', 'shot_type',
    'shot_zone_area', 'shot_zone_basic', 'shot_zone_range',
    'game_year', 'game_month', 'opponent', 'loc_x', 'loc_y']

for cc in categorial_cols:
    dummies = pd.get_dummies(data_cl[cc])
    dummies = dummies.add_prefix("{}#".format(cc))
    data_cl.drop(cc, axis=1, inplace=True)
    data_cl = data_cl.join(dummies)

# Outlier-detection helper (IQR rule); defined here but not used below
def detect_outliers(series, whis=1.5):
    q75, q25 = np.percentile(series, [75, 25])
    iqr = q75 - q25
    return ~((series - series.median()).abs() <= (whis * iqr))

# Separate dataset for the submission
data_submit = data_cl[unknown_mask]

# Training data
X = data_cl[~unknown_mask]
Y = target[~unknown_mask]

######################################
# Feature selection
######################################

# Low-variance filter
threshold = 0.90
vt = VarianceThreshold().fit(X)
feat_var_threshold = data_cl.columns[vt.variances_ > threshold * (1 - threshold)]

# Top 20 features by RandomForestClassifier importance
model = RandomForestClassifier()
model.fit(X, Y)
feature_imp = pd.DataFrame(model.feature_importances_, index=X.columns, columns=["importance"])
feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(20).index

# Univariate feature selection (chi2 needs non-negative inputs, hence the scaling)
X_minmax = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
X_scored = SelectKBest(score_func=chi2, k='all').fit(X_minmax, Y)
feature_scoring = pd.DataFrame({
    'feature': X.columns,
    'score': X_scored.scores_
})
feat_scored_20 = feature_scoring.sort_values('score', ascending=False).head(20)['feature'].values

# Recursive feature elimination
rfe = RFE(LogisticRegression(), 20)
rfe.fit(X, Y)
feature_rfe_scoring = pd.DataFrame({
    'feature': X.columns,
    'score': rfe.ranking_
})
feat_rfe_20 = feature_rfe_scoring[feature_rfe_scoring['score'] == 1]['feature'].values

# Merge the results of all feature-selection methods
features = np.hstack([feat_var_threshold, feat_imp_20, feat_scored_20, feat_rfe_20])
features = np.unique(features)
print('Final features set:\n')
for f in features:
    print("\t-{}".format(f))

# Keep only the selected features
data_cl = data_cl.ix[:, features]
data_submit = data_submit.ix[:, features]
X = X.ix[:, features]
print('Clean dataset shape: {}'.format(data_cl.shape))
print('Submittable dataset shape: {}'.format(data_submit.shape))
print('Train features shape: {}'.format(X.shape))
print('Target label shape: {}'.format(Y.shape))
```
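Two more version traps in this block: `DataFrame.ix` was deprecated in pandas 0.20 and removed in 1.0, and recent scikit-learn releases want `RFE`'s feature count passed by keyword. A sketch of drop-in replacements, keeping the post's variable names (`data_cl`, `data_submit`, `X`, `features` come from the code above):

```python
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# pandas >= 1.0: .ix is gone; use label-based .loc instead
data_cl = data_cl.loc[:, features]
data_submit = data_submit.loc[:, features]
X = X.loc[:, features]

# Recent scikit-learn: pass the target feature count by keyword
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=20)
```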
```python
######################################
# PCA
######################################
components = 8
pca = PCA(n_components=components).fit(X)
pca_variance_explained_df = pd.DataFrame({
    "component": np.arange(1, components + 1),
    "variance_explained": pca.explained_variance_ratio_
})
ax = sns.barplot(x='component', y='variance_explained', data=pca_variance_explained_df)
ax.set_title("PCA - Variance explained")
plt.show()

######################################
# Evaluation setup
######################################
seed = 7
processors = 1
num_folds = 3
num_instances = len(X)
scoring = 'log_loss'
kfold = KFold(n=num_instances, n_folds=num_folds, random_state=seed)

######################################
# Model selection: spot-check common models
######################################
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('K-NN', KNeighborsClassifier(n_neighbors=5)))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
# models.append(('SVC', SVC(probability=True)))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
    results.append(cv_results)
    names.append(name)
    print("{0}: ({1:.3f}) +/- ({2:.3f})".format(name, cv_results.mean(), cv_results.std()))

######################################
# Bootstrap aggregation (bagging)
######################################
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

######################################
# Random forest
######################################
num_trees = 100
num_features = 10
model = RandomForestClassifier(n_estimators=num_trees, max_features=num_features)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

######################################
# Extra trees
######################################
num_trees = 100
num_features = 10
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=num_features)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

######################################
# AdaBoost
######################################
model = AdaBoostClassifier(n_estimators=100, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

######################################
# Stochastic gradient boosting
######################################
model = GradientBoostingClassifier(n_estimators=100, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))
```
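Since every comparison above is scored by log loss, it may help to see the metric in isolation. `log_loss` lives in `sklearn.metrics`, the same module the post imports `make_scorer` from; a minimal, self-contained example:

```python
from sklearn.metrics import log_loss

# Columns are probabilities for classes [0, 1]; log loss punishes
# confident mistakes far more than hesitant ones.
print(log_loss([1, 0], [[0.1, 0.9], [0.8, 0.2]]))  # ~0.164: both mostly right
print(log_loss([1, 0], [[0.9, 0.1], [0.2, 0.8]]))  # ~1.956: both confidently wrong
```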
```python
######################################
# Hyper-parameter search: logistic regression
######################################
lr_grid = GridSearchCV(
    estimator=LogisticRegression(random_state=seed),
    param_grid={
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 1, 10, 100, 1000]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
lr_grid.fit(X, Y)
print(lr_grid.best_score_)
print(lr_grid.best_params_)

######################################
# Linear discriminant analysis
######################################
lda_grid = GridSearchCV(
    estimator=LinearDiscriminantAnalysis(),
    param_grid={
        'solver': ['lsqr'],
        'shrinkage': [0, 0.25, 0.5, 0.75, 1],
        'n_components': [None, 2, 5, 10]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
lda_grid.fit(X, Y)
print(lda_grid.best_score_)
print(lda_grid.best_params_)

######################################
# KNN
######################################
knn_grid = GridSearchCV(
    estimator=Pipeline([
        ('min_max_scaler', MinMaxScaler()),
        ('knn', KNeighborsClassifier())
    ]),
    param_grid={
        'knn__n_neighbors': [25],
        'knn__algorithm': ['ball_tree'],
        'knn__leaf_size': [2, 3, 4],
        'knn__p': [1]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
knn_grid.fit(X, Y)
print(knn_grid.best_score_)
print(knn_grid.best_params_)

######################################
# Random forest
######################################
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(warm_start=True, random_state=seed),
    param_grid={
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_features': [18, 20],
        'max_depth': [8, 10],
        'bootstrap': [True]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
rf_grid.fit(X, Y)
print(rf_grid.best_score_)
print(rf_grid.best_params_)

######################################
# AdaBoost
######################################
ada_grid = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=seed),
    param_grid={
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [10, 25, 50],
        'learning_rate': [1e-3, 1e-2, 1e-1]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
ada_grid.fit(X, Y)
print(ada_grid.best_score_)
print(ada_grid.best_params_)

######################################
# Gradient boosting
######################################
gbm_grid = GridSearchCV(
    estimator=GradientBoostingClassifier(warm_start=True, random_state=seed),
    param_grid={
        'n_estimators': [100, 200],
        'max_depth': [2, 3, 4],
        'max_features': [10, 15, 20],
        'learning_rate': [1e-1, 1]
    },
    cv=kfold,
    scoring=scoring,
    n_jobs=processors)
gbm_grid.fit(X, Y)
print(gbm_grid.best_score_)
print(gbm_grid.best_params_)

######################################
# Combine the tuned models into a voting ensemble
######################################
estimators = []
estimators.append(('lr', LogisticRegression(penalty='l2', C=1)))
estimators.append(('gbm', GradientBoostingClassifier(n_estimators=200, max_depth=3,
                                                     learning_rate=0.1, max_features=15,
                                                     warm_start=True, random_state=seed)))
estimators.append(('rf', RandomForestClassifier(bootstrap=True, max_depth=8, n_estimators=200,
                                                max_features=20, criterion='entropy',
                                                random_state=seed)))
estimators.append(('ada', AdaBoostClassifier(algorithm='SAMME.R', learning_rate=1e-2,
                                             n_estimators=10, random_state=seed)))

# Create the ensemble model
ensemble = VotingClassifier(estimators, voting='soft', weights=[2, 3, 3, 1])
results = cross_val_score(ensemble, X, Y, cv=kfold, scoring=scoring, n_jobs=processors)
print("({0:.3f}) +/- ({1:.3f})".format(results.mean(), results.std()))

######################################
# Prediction
######################################
model = ensemble
model.fit(X, Y)
preds = model.predict_proba(data_submit)

submission = pd.DataFrame()
submission["shot_id"] = data_submit.index
# predict_proba columns follow sorted class order, so column 1 is P(shot made);
# the original used preds[:, 0], which is the probability of a miss
submission["shot_made_flag"] = preds[:, 1]
submission.to_csv("sub.csv", index=False)
```
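One defensive habit for that final step: instead of hard-coding which `predict_proba` column holds the made-shot probability, look it up from the fitted model's `classes_`. A sketch using the same `model`, `preds`, and `submission` names as above:

```python
import numpy as np

# predict_proba columns follow model.classes_ (sorted class order),
# so find the column for class 1 (shot made) explicitly.
made_col = int(np.where(model.classes_ == 1)[0][0])
submission["shot_made_flag"] = preds[:, made_col]
```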
copied from here