import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Load the data
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
# Concatenate the train and test sets so preprocessing is applied to both at once
full = pd.concat([train, test], axis=0)
# --- Missing values ---
full.isna().sum()
'''
Fill missing values in numeric columns with the column mean,
and in categorical columns with the most frequent value.
'''
full['Age'] = full['Age'].fillna(full['Age'].mean())
full['Fare'] = full['Fare'].fillna(full['Fare'].mean())
full['Embarked'] = full['Embarked'].fillna(full['Embarked'].value_counts().index[0])
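# Optional sanity check (sketch): the three fills above should leave
# zero missing values in these columns.
print(full[['Age', 'Fare', 'Embarked']].isna().sum())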
# Group cabins that share a first letter into one class; missing cabins go into the "unknown" class U
def transform_cabin(t):
    # Cabin is NaN (a float) for most passengers
    if pd.isna(t):
        return 'U'
    else:
        return t[0]
full['cabin'] = full['Cabin'].apply(transform_cabin)
full['cabin'].value_counts()
# --- Feature engineering ---
# Extract the title (social rank) from the name
def transform_name(t):
    # e.g. 'Braund, Mr. Owen Harris' -> 'Mr'
    return t.split('.')[0].split(', ')[1]
full['title'] = full['Name'].apply(transform_name)
# Group passengers into broader classes by title
title_mapDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty"
}
full['title'] = full['title'].map(title_mapDict)
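# Optional check (sketch): Series.map returns NaN for any title missing
# from title_mapDict, so list unmapped names before encoding.
print(full.loc[full['title'].isna(), 'Name'])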
# Family size and family category
'''
Family size = Parch (parents/children aboard) + SibSp (siblings/spouses aboard)
              + 1 (the passenger counts as a family member too)
Family categories:
    single: family size = 1
    small:  2 <= family size <= 4
    large:  family size >= 5
'''
def transform_family(parch, sibsp):
    family_cnt = parch + sibsp + 1
    if family_cnt < 2:
        return 'single'
    elif family_cnt <= 4:
        return 'small'
    else:
        return 'large'
full['family'] = full.apply(lambda x: transform_family(x['Parch'], x['SibSp']), axis=1)
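# Mirroring the cabin check above, a quick look at the class balance (sketch)
full['family'].value_counts()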
# One-hot encode the categorical columns
df = pd.concat([
    full,
    pd.get_dummies(full['Pclass'], prefix='Pclass'),
    pd.get_dummies(full['cabin'], prefix='Cabin'),
    pd.get_dummies(full['Embarked'], prefix='Embarked'),
    pd.get_dummies(full['family'], prefix='family'),
    pd.get_dummies(full['title'], prefix='title'),
], axis=1)
df.set_index(keys='PassengerId', drop=True, inplace=True)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
# Drop the original categorical columns now that they are encoded
df.drop(['Pclass', 'Name', 'Ticket', 'Cabin', 'Embarked', 'title', 'family', 'cabin'], axis=1, inplace=True)
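# Optional refinement (sketch, not in the original): the k dummy columns of a
# category sum to 1 and are collinear, which can destabilize linear models.
# get_dummies(..., drop_first=True) keeps k-1 of the k columns instead, e.g.
# (embarked_dummies is just an illustrative name):
embarked_dummies = pd.get_dummies(full['Embarked'], prefix='Embarked', drop_first=True)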
# Correlation of each feature with the survival label
corr = df.corr()
corr['Survived'].sort_values(ascending=False)
# Split back: rows with a Survived label are the original training set
train_df = df[~np.isnan(df['Survived'])]
test_df = df[np.isnan(df['Survived'])]
X_col = train_df.columns.tolist()
X_col.remove('Survived')
X = train_df[X_col]
y = train_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)
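# Optional (sketch): survivors are the minority class, so a stratified split
# keeps the 0/1 ratio identical in both halves. Left commented out so the
# reports below stay reproducible with the split above.
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.25, random_state=2, stratify=y)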
lr = LogisticRegression()
lr_param = [
    {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear']
    },
    {
        'penalty': ['elasticnet'],
        'C': [0.1, 1, 10],
        'solver': ['saga'],
        'l1_ratio': [0.5]
    }
]
lr_gs = GridSearchCV(estimator=lr, param_grid=lr_param, n_jobs=-1, verbose=10)
lr_gs.fit(X_train, y_train)
y_hat = lr_gs.predict(X_test)
print(classification_report(y_test, y_hat))
'''
              precision    recall  f1-score   support

         0.0       0.79      0.88      0.83       131
         1.0       0.79      0.67      0.73        92

    accuracy                           0.79       223
   macro avg       0.79      0.78      0.78       223
weighted avg       0.79      0.79      0.79       223
'''
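# GridSearchCV refits the winning combination on the whole training split by
# default; best_params_ and best_score_ (mean CV accuracy) show what won.
print(lr_gs.best_params_, lr_gs.best_score_)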
tree = DecisionTreeClassifier()
tree_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(5, 18)
}
tree_gs = GridSearchCV(estimator=tree, param_grid=tree_param, n_jobs=-1, verbose=10)
tree_gs.fit(X_train, y_train)
tree_y_hat = tree_gs.predict(X_test)
print(classification_report(y_test, tree_y_hat))
'''
              precision    recall  f1-score   support

         0.0       0.76      0.91      0.83       131
         1.0       0.82      0.59      0.68        92

    accuracy                           0.78       223
   macro avg       0.79      0.75      0.75       223
weighted avg       0.78      0.78      0.77       223
'''
rdf = RandomForestClassifier()
rdf_param = {'n_estimators': range(100, 1000, 100)}
rdf_gs = GridSearchCV(estimator=rdf, param_grid=rdf_param, n_jobs=-1, verbose=10)
rdf_gs.fit(X_train, y_train)
rdf_y_hat = rdf_gs.predict(X_test)
print(classification_report(y_test, rdf_y_hat))
'''
              precision    recall  f1-score   support

         0.0       0.80      0.87      0.83       131
         1.0       0.79      0.68      0.73        92

    accuracy                           0.79       223
   macro avg       0.79      0.78      0.78       223
weighted avg       0.79      0.79      0.79       223
'''
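# The tuned forest is exposed as best_estimator_; its feature importances
# give a rough ranking of the engineered features (sketch).
importances = pd.Series(rdf_gs.best_estimator_.feature_importances_, index=X_col)
print(importances.sort_values(ascending=False).head(10))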
knn = KNeighborsClassifier()
knn_param = {
    'n_neighbors': range(2, 10),
    'weights': ['uniform', 'distance']
}
knn_gs = GridSearchCV(estimator=knn, param_grid=knn_param, n_jobs=-1, verbose=10)
knn_gs.fit(X_train, y_train)
knn_y_hat = knn_gs.predict(X_test)
print(classification_report(y_test, knn_y_hat))
'''
              precision    recall  f1-score   support

         0.0       0.73      0.82      0.77       131
         1.0       0.69      0.58      0.63        92

    accuracy                           0.72       223
   macro avg       0.71      0.70      0.70       223
weighted avg       0.71      0.72      0.71       223
'''
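# KNN is distance-based, so the unscaled Age/Fare columns dominate the metric.
# A scaling pipeline (sketch, not in the original) usually helps; note the
# grid keys gain the 'knn__' prefix of the pipeline step name.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
knn_pipe = Pipeline([('scale', StandardScaler()), ('knn', KNeighborsClassifier())])
knn_pipe_param = {'knn__n_neighbors': range(2, 10), 'knn__weights': ['uniform', 'distance']}
knn_pipe_gs = GridSearchCV(estimator=knn_pipe, param_grid=knn_pipe_param, n_jobs=-1)
knn_pipe_gs.fit(X_train, y_train)
print(classification_report(y_test, knn_pipe_gs.predict(X_test)))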
nb = BernoulliNB()
nb.fit(X_train, y_train)
nb_y_hat = nb.predict(X_test)
print(classification_report(y_test, nb_y_hat))
'''
              precision    recall  f1-score   support

         0.0       0.83      0.82      0.83       131
         1.0       0.75      0.76      0.76        92

    accuracy                           0.80       223
   macro avg       0.79      0.79      0.79       223
weighted avg       0.80      0.80      0.80       223
'''
# Predict the real test set with BernoulliNB, the best scorer on the hold-out split
test_col = test_df.columns.tolist()
test_col.remove('Survived')
test_df_new = test_df[test_col]
test_y_hat = nb.predict(test_df_new)
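# Build a Kaggle submission file (sketch; the filename is illustrative).
# PassengerId is the index of df after set_index above, and Kaggle expects
# integer 0/1 labels.
submission = pd.DataFrame({'PassengerId': test_df_new.index, 'Survived': test_y_hat.astype(int)})
submission.to_csv('./submission.csv', index=False)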
svc = SVC()
svc_param = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]},
    {'kernel': ['poly'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01],
     'degree': [3, 5, 10], 'coef0': [0, 0.1, 1]},
    {'kernel': ['sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01],
     'coef0': [0, 0.1, 1]}
]
svc_gs = GridSearchCV(estimator=svc, param_grid=svc_param, n_jobs=1, verbose=10, cv=4)
svc_gs.fit(X_train, y_train)
svc_y_hat = svc_gs.predict(X_test)
print(classification_report(y_test, svc_y_hat))
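# Side-by-side hold-out accuracy of everything tuned above (sketch);
# GridSearchCV.score delegates to the refit best estimator.
for name, model in [('logreg', lr_gs), ('tree', tree_gs), ('forest', rdf_gs),
                    ('knn', knn_gs), ('bernoulli_nb', nb), ('svc', svc_gs)]:
    print(name, model.score(X_test, y_test))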
Project overview
The background is the well-known sinking of the Titanic in 1912, a disaster in which 1502 of the 2224 passengers and crew died. Who survived and who perished was not entirely random: a much-adored young lady travelling first class had good odds of making it out alive, while a rough third-class labourer could only hope for the best. In other words, in this life-or-death situation survival was related to sex, age, class and similar factors. Treating those factors as features and the survival outcome as the prediction target yields a classic binary-classification machine-learning problem. The project supplies part of the passenger list, with features of various kinds plus the survived label, in train.csv; this is the training data. The other file, test.csv, is the passenger list to predict and contains only the features. By exploring how the training features relate to survival, we build a suitable machine-learning model and then use it to predict whether the passengers in the test file survived.
Project data: [dataset preview image]
Reference
https://www.jianshu.com/p/06c2ee7e5c68