泰坦尼克号决策树预测 笔记

 1 import matplotlib.pyplot as plt
 2 import random 
 3 import pylab as mpl
 4 import pandas as pd
 5 import numpy as np
 6 from sklearn.feature_extraction import DictVectorizer
 7 from sklearn.model_selection import train_test_split
 8 from sklearn.tree import DecisionTreeClassifier, export_graphviz
 9 
10 mpl.rcParams['font.sans-serif'] = ['SimHei']
11 #mpl.rcParams['axes.unicode_minus'] = False
12 
13 # 1. 获取数据
14 titan = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
15 
16 # 2. 数据的基本处理
17 # 2.1 确定特征值,目标值
18 
19 x = titan[['pclass', 'age', 'sex']]
20 y = titan['survived']
21 
22 # 2.2 缺失值处理
23 x['age'].fillna(x['age'].mean(), inplace=True)
24 
25 # 2.3 数据集的划分
26 x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
27 
28 # 3. 特征工程(字典特征抽取)
29 
30 
31 x_train = x_train.to_dict(orient='records')
32 x_test = x_test.to_dict(orient='records')
33 
34 transfer = DictVectorizer()
35 
36 x_train = transfer.fit_transform(x_train)
37 x_test = transfer.fit_transform(x_test)
38 
39 
40 
41 # 4. 机器学习(决策树)
42 estimator = DecisionTreeClassifier()
43 estimator.fit(x_train, y_train)
44 
45 
46 # 5. 模型评估
47 y_pre = estimator.predict(x_test)
48 
49 ret = estimator.score(x_test, y_test)
50 print(ret)

 

posted on 2020-10-08 15:53  Hrunjie  阅读(211)  评论(0)    收藏  举报

导航