1 # encoding:utf-8
2
3 import pandas as pd
4 from sklearn.feature_extraction import DictVectorizer
5 from sklearn.model_selection import train_test_split
6 from sklearn import *
7 from sklearn.datasets import *
8 from sklearn.model_selection import *
9 from sklearn.neighbors import *
10 from sklearn.preprocessing import *
11 from sklearn.naive_bayes import *
12 from sklearn.tree import DecisionTreeClassifier,export_graphviz
13
# Default location of the Titanic passenger list read by ``decision``.
_TITANIC_URL = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"


def decision(data_source=_TITANIC_URL) -> None:
    """Train a decision tree on Titanic passenger data and print its accuracy.

    The function loads the passenger table, keeps the ``pclass``, ``age`` and
    ``sex`` columns as features and ``survived`` as the target, fills missing
    ages with the mean age, one-hot encodes the features with
    ``DictVectorizer``, fits a ``DecisionTreeClassifier`` on a 75/25
    train/test split and prints the accuracy on the test part.

    :param data_source: anything ``pandas.read_csv`` accepts (URL, file path
        or file-like object). Defaults to the original remote data set, so
        calling ``decision()`` with no argument behaves as before.
    :return: None; all results are printed to stdout.
    """
    # Load the data.
    titan = pd.read_csv(data_source)

    # Select the feature columns and the target column.  The explicit copy
    # makes ``x`` an independent frame, so the imputation below really
    # modifies it instead of operating on a temporary view of ``titan``
    # (which triggers SettingWithCopyWarning and is a silent no-op when
    # pandas Copy-on-Write is enabled).
    x = titan[['pclass', 'age', 'sex']].copy()
    y = titan['survived']
    print(x)

    # Missing-value handling: fill unknown ages with the mean age.  The mean
    # is taken over all rows, before the train/test split.
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split the data into a training set and a test set.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering: turn every row into a dict so that DictVectorizer
    # one-hot encodes the string-valued columns and passes numeric columns
    # through unchanged.
    vectorizer = DictVectorizer(sparse=False)
    x_train = vectorizer.fit_transform(x_train.to_dict(orient="records"))

    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
    # replacement when present and fall back on older releases.  ``tolist``
    # keeps the printed value a plain list of str, as before.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)

    # The test set is only transformed with the vocabulary learned above.
    x_test = vectorizer.transform(x_test.to_dict(orient="records"))
    print(x_train)

    # Fit the decision tree.
    dec = DecisionTreeClassifier()
    dec.fit(x_train, y_train)

    # Report the accuracy on the held-out test set.
    print("预测的准确率:", dec.score(x_test, y_test))

    # Optional: export the fitted tree structure for Graphviz.
    # export_graphviz(dec, out_file="./tree.dot", feature_names=feature_names)
50
# Run the Titanic decision-tree demo only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    decision()