1 import matplotlib.pyplot as plt
2 import random
3 import pylab as mpl
4 import pandas as pd
5 import numpy as np
6 from sklearn.feature_extraction import DictVectorizer
7 from sklearn.model_selection import train_test_split
8 from sklearn.tree import DecisionTreeClassifier, export_graphviz
9
# Use the SimHei font for sans-serif text so that Chinese characters in plot
# labels and titles render as glyphs.
mpl.rcParams.update({'font.sans-serif': ['SimHei']})
# Optional companion setting (left disabled, as in the original script):
# mpl.rcParams['axes.unicode_minus'] = False
12
# 1. Load the data
titan = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# 2. Basic data preparation
# 2.1 Select the feature columns and the target column.
# Take an explicit copy: `titan[[...]]` is a slice of `titan`, and mutating a
# slice triggers SettingWithCopyWarning (and is a silent no-op under pandas
# Copy-on-Write), so the imputation below must operate on its own frame.
x = titan[['pclass', 'age', 'sex']].copy()
y = titan['survived']

# 2.2 Missing-value handling: replace missing ages with the mean age.
# Assign the result back instead of using a chained `inplace=True` call, which
# would only modify a temporary Series when Copy-on-Write is active.
x['age'] = x['age'].fillna(x['age'].mean())

# 2.3 Split into training and test sets (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
27
# 3. Feature engineering (dictionary feature extraction)
# DictVectorizer consumes a list of {column: value} records, one per row.
x_train = x_train.to_dict(orient='records')
x_test = x_test.to_dict(orient='records')

transfer = DictVectorizer()

# Learn the feature-to-column mapping from the training data only, then apply
# that same mapping to the test data. Re-fitting on the test set would leak
# test information and could yield a column layout that differs from the one
# the model is trained on.
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
38
39
40
# 4. Machine learning (decision tree)
# fit() returns the fitted estimator, so construction and training are chained.
estimator = DecisionTreeClassifier().fit(x_train, y_train)

# 5. Model evaluation
# Accuracy of the fitted tree on the held-out test set.
ret = estimator.score(x_test, y_test)
# Per-sample predictions for the test set, kept for inspection.
y_pre = estimator.predict(x_test)

print(ret)