# 工程实训——泰坦尼克号(摆烂版)
# Engineering practicum: Titanic survival prediction (low-effort edition)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Load the two CSV files used by this exercise.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Stack train and test into ONE frame so both receive the same cleaning and
# encoding (this merges the two sets; it does not split them).
# ignore_index=True gives every row a unique label; without it the row labels
# of `train` and `test` overlap, which makes any later label-based alignment
# (for example pd.concat(..., axis=1)) fragile.
data = pd.concat([train, test], ignore_index=True)
# Rename the columns positionally to the Chinese names used in the rest of the
# script.
# NOTE(review): assumes the combined frame has exactly these 12 columns in this
# order - confirm against the CSV headers.
data.columns = ['乘客编号','存活情况','票种','名字','性别','年龄','陪同兄弟姐妹','陪同直系亲属','票据编号','票价','乘仓','登船口']
data.describe().T  # overall summary; not very informative on its own
data.isnull().sum(axis=0)
# 1. Per the original note, only two rows lack a boarding port and one lacks a
#    fare, so those rows are simply dropped. .copy() makes the filtered result
#    an independent frame, so the column assignments below never write into a
#    view of the unfiltered data.
data = data.loc[data['票价'].notnull() & data['登船口'].notnull()].copy()
data.isnull().sum(axis=0)
# 2. Per the original note, age is missing for 263 passengers; mark those with
#    the sentinel value -1 instead of dropping them.
data['年龄'] = data['年龄'].fillna(-1)
data.isnull().sum()
# 3. Cabin is missing for many passengers; use "U" to mean "not filled in".
data['乘仓'] = data['乘仓'].fillna("U")
data.isnull().sum()

# Encode sex as an integer: male -> 0, female -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
dic = dict(male=0, female=1)
data['性别'] = data['性别'].map(dic)

# Name, ticket number and passenger id are not used as features; remove all
# three columns in a single in-place call.
data.drop(columns=['名字', '票据编号', '乘客编号'], inplace=True)

# Replace the cabin and boarding-port columns with their one-hot encodings.
df_Cabin = pd.get_dummies(data['乘仓'])
df_Embarked = pd.get_dummies(data['登船口'])
# Drop the two raw columns and append the indicator columns (cabin first, then
# boarding port) in one column-wise concatenation.
data = pd.concat(
    [data.drop(columns=['乘仓', '登船口']), df_Cabin, df_Embarked],
    axis=1,
)


# #target也用独热码改写,方便之后获取决策树的值

# #存活情况映射成标签,方便独热码处理
# dic = {
#     0:'死亡',
#     1:'存活'
# }
# target_vector = data['存活情况']
# data['存活情况'] = data['存活情况'].map(dic)
# data
# df_target = pd.get_dummies(data['存活情况'])

# df_target

# data = pd.concat((data,df_target),axis=1)

# data.drop(['存活情况'],axis=1,inplace=True)

# feature = data.loc[:,((data.columns != '存活') & (data.columns != '死亡' ))]
# target = data[['存活','死亡']]
# feature

# Split the cleaned frame into predictors (`feature`) and the survival label
# (`target`, kept as a one-column DataFrame).
# `data` is the concatenation of the train and test files; any row whose label
# is missing cannot be used for supervised fitting or scoring, so only labelled
# rows are kept. If every row has a label this filter is a no-op.
# NOTE(review): presumably the test file carries no survival column, which is
# what would produce the missing labels - confirm against the CSV.
has_label = data['存活情况'].notnull()
feature = data.loc[has_label, data.columns != '存活情况']
target = data.loc[has_label, ['存活情况']]
feature
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, roc_curve, recall_score, f1_score, roc_auc_score, accuracy_score
import warnings

# Estimator classes are imported once here instead of on every loop iteration.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


def _build_models():
    """Return fresh, unfitted classifiers in the order used by ``scorelist``.

    Order: logistic regression, random forest, support vector machine,
    k-nearest neighbours, Gaussian naive Bayes, decision tree.
    """
    return [
        LogisticRegression(),
        RandomForestClassifier(n_estimators=100),
        SVC(),
        KNeighborsClassifier(n_neighbors=3),
        GaussianNB(),
        DecisionTreeClassifier(max_leaf_nodes=15),
    ]


# Training-set fractions to evaluate (0.6 up to, but not including, 1.0 in
# steps of 0.1).
size = np.arange(0.6, 1, 0.1)
# scorelist[k] collects the test accuracy of the k-th model, one entry per
# training fraction.
scorelist = [[] for _ in range(6)]

for siz in size:
    # The fixed random_state makes the split itself repeatable for each fraction.
    x_train, x_test, y_train, y_test = train_test_split(
        feature, target, train_size=siz, random_state=2021
    )
    # scikit-learn expects a 1-D label vector; `target` is a one-column frame,
    # so flatten it instead of relying on an implicit (warned-about) conversion.
    y_train_flat = np.ravel(y_train)

    # Fit a new instance of every model on this split and record its accuracy.
    for scores, model in zip(scorelist, _build_models()):
        model.fit(x_train, y_train_flat)
        scores.append(model.score(x_test, y_test))

    
# Select the SimHei font and disable the Unicode minus sign so the Chinese
# labels and negative tick values are drawn with that font.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

# One colour per model, in the same order as `scorelist`.
color_list = ('red', 'blue', 'lightgreen', 'cornflowerblue', 'turquoise', 'magenta')
for model_scores, line_colour in zip(scorelist, color_list):
    plt.plot(size, model_scores, color=line_colour)

# Legend entries are matched to the plotted lines by position.
model_labels = ['逻辑回归', '随机森林', '支持向量机', 'KNN最邻近算法', '朴素贝叶斯', '决策树']
plt.legend(model_labels)

plt.title('不同的模型随着训练集占比变化曲线(accuracy)')
plt.xlabel('训练集占比')
plt.ylabel('准确率')
plt.show()
# Per the original notes on the plot above: the random forest does best
# overall, but the decision tree also scores very well at a training fraction
# of 0.8, so that tree is drawn here.
import graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Only rows with a known label can be used to fit the tree; this is a no-op if
# `target` has no missing values.
labelled_rows = target['存活情况'].notnull()
tree_features = feature.loc[labelled_rows]
# 1-D label vector taken from `target` (the previously referenced
# `target_vector` is never defined in the live code).
tree_labels = target.loc[labelled_rows, '存活情况']

# Names must describe the columns the model is actually fitted on, so they come
# from the feature frame (not `data`, which still contains the label column).
feature_name = [str(column) for column in tree_features.columns]
# export_graphviz expects class names in ascending order of the class values.
# NOTE(review): assumes label 0 means died and 1 means survived, following the
# mapping in the commented-out code earlier in the file - confirm.
class_name = ['死亡', '存活']

x_train,x_test,y_train,y_test = train_test_split(tree_features,tree_labels,train_size=0.8,random_state=2021)
model = DecisionTreeClassifier(max_leaf_nodes=10)
model.fit( x_train , y_train )
print(model.score(x_test,y_test))
dot_data = tree.export_graphviz(model
                                ,out_file = None
                                ,feature_names= feature_name
                                ,class_names=class_name
                                ,filled=True  # colour the nodes by predicted class
                               )
graph = graphviz.Source(dot_data)
graph
import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
# Save the rendered tree to a PDF file.

graph.write_pdf("titanic.pdf")
# posted @ 2021-09-09 21:41  my-island