# 工程实训——泰坦尼克号(摆烂版)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Load both CSV splits and stack them into a single frame so every cleaning
# step below is applied to all rows at once.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
data = pd.concat([train, test])

# Rename the columns to Chinese labels (positional: order must match the CSV).
data.columns = [
    '乘客编号', '存活情况', '票种', '名字', '性别', '年龄',
    '陪同兄弟姐妹', '陪同直系亲属', '票据编号', '票价', '乘仓', '登船口',
]

# Notebook-style inspection: overall summary (not very informative here).
data.describe().T
data.isna().sum()

# 1. Only a handful of rows lack a fare or an embarkation port, so those rows
#    are simply dropped.
required = ['票价', '登船口']
data = data.dropna(subset=required)
data.isna().sum()

# 2. Missing ages are replaced by the sentinel value -1.
data = data.fillna({'年龄': -1})
data.isna().sum()

# 3. Missing cabins are replaced by "U", meaning "not filled in".
data = data.fillna({'乘仓': "U"})
data.isna().sum()
# Encode sex as 0 (male) / 1 (female); any other value becomes NaN via map().
data['性别'] = data['性别'].map({'male': 0, 'female': 1})

# Name, ticket number and passenger id are not used as features.
data = data.drop(columns=['名字', '票据编号', '乘客编号'])

# One-hot encode cabin and embarkation port, then swap the raw columns for
# their indicator columns (cabin indicators first, port indicators last).
df_Cabin = pd.get_dummies(data['乘仓'])
df_Embarked = pd.get_dummies(data['登船口'])
data = pd.concat(
    [data.drop(columns=['乘仓', '登船口']), df_Cabin, df_Embarked],
    axis=1,
)
# #target也用独热码改写,方便之后获取决策树的值
# #存活情况映射成标签,方便独热码处理
# dic = {
# 0:'死亡',
# 1:'存活'
# }
# target_vector = data['存活情况']
# data['存活情况'] = data['存活情况'].map(dic)
# data
# df_target = pd.get_dummies(data['存活情况'])
# df_target
# data = pd.concat((data,df_target),axis=1)
# data.drop(['存活情况'],axis=1,inplace=True)
# feature = data.loc[:,((data.columns != '存活') & (data.columns != '死亡' ))]
# target = data[['存活','死亡']]
# feature
# Build the model inputs from labelled rows only.
# `data` was built by concatenating the train and test CSVs; rows without a
# survival label (presumably every row that came from test.csv — verify against
# the files) hold NaN in '存活情况', and scikit-learn classifiers refuse NaN
# labels, so those rows must not reach train_test_split / fit.
labelled = data[data['存活情况'].notnull()]

# Every column except the label is a feature.
feature = labelled.loc[:, labelled.columns != '存活情况']

# Kept as a single-column DataFrame for compatibility with the code below;
# cast to int because the NaN-padded column was stored as float (0.0 / 1.0).
target = labelled[['存活情况']].astype(int)
feature
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, roc_curve, recall_score, f1_score, roc_auc_score, accuracy_score
import warnings
# Compare six classifiers across several train/test ratios.
# For each training share in `size`, the data is split with a fixed seed, each
# model is fitted on the training part, and its test accuracy is appended to
# the matching list in `scorelist` (same model order as `model_factories`).
# NOTE: classifiers reject NaN labels, so `target` must contain labelled rows
# only.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Training shares 0.6, 0.7, 0.8, 0.9 (the stop value 1 is excluded).
size = np.arange(0.6, 1, 0.1)

# Zero-argument factories so every split gets a freshly constructed model.
model_factories = [
    LogisticRegression,                                  # logistic regression
    lambda: RandomForestClassifier(n_estimators=100),    # random forest
    SVC,                                                 # support vector machine
    lambda: KNeighborsClassifier(n_neighbors=3),         # k-nearest neighbours
    GaussianNB,                                          # Gaussian naive Bayes
    lambda: DecisionTreeClassifier(max_leaf_nodes=15),   # decision tree
]

# scorelist[i] collects the accuracies of model i, one entry per training share.
scorelist = [[] for _ in model_factories]

for siz in size:
    x_train, x_test, y_train, y_test = train_test_split(
        feature, target, train_size=siz, random_state=2021
    )
    # Estimators expect a 1-D label vector; flatten the single-column frame.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    for scores, make_model in zip(scorelist, model_factories):
        model = make_model()
        model.fit(x_train, y_train)
        scores.append(model.score(x_test, y_test))
# Plot test accuracy against training share, one line per model.
# SimHei is selected so the Chinese labels render; unicode_minus is disabled so
# minus signs still display with that font.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

color_list = ('red', 'blue', 'lightgreen', 'cornflowerblue', 'turquoise', 'magenta')
model_labels = ['逻辑回归', '随机森林', '支持向量机', 'KNN最邻近算法', '朴素贝叶斯', '决策树']

# scorelist and color_list are paired positionally (six models, six colours).
for scores, colour in zip(scorelist, color_list):
    plt.plot(size, scores, color=colour)

# Legend entries follow the order in which the lines were drawn.
plt.legend(model_labels)
plt.xlabel('训练集占比')
plt.ylabel('准确率')
plt.title('不同的模型随着训练集占比变化曲线(accuracy)')
plt.show()
# Author's reading of the chart: random forest is the strongest model overall,
# but the decision tree does very well at a 0.8 training share — so fit one
# tree at that ratio and render it.
import graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Names must line up one-to-one with the columns the tree is fitted on, so they
# come from `feature` (taking them from `data` would include the label column).
feature_name = list(feature.columns)

# Flatten the single-column label frame to the 1-D vector estimators expect.
x_train, x_test, y_train, y_test = train_test_split(
    feature, np.ravel(target), train_size=0.8, random_state=2021
)

model = DecisionTreeClassifier(max_leaf_nodes=10)
model.fit(x_train, y_train)
print(model.score(x_test, y_test))

# class_names are matched to the classes in ascending order: 0 = died ('死亡'),
# 1 = survived ('存活').
dot_data = tree.export_graphviz(
    model,
    out_file=None,              # return the DOT source as a string
    feature_names=feature_name,
    class_names=['死亡', '存活'],
    filled=True,                # colour nodes by majority class
)
graph = graphviz.Source(dot_data)
graph

# Imported only here so the inline rendering above does not depend on it.
import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
# Save the rendered tree to a PDF file.
graph.write_pdf("titanic.pdf")