import csv
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import tree
# Train a decision tree on the film CSV, render the tree to a PDF, and run a
# prediction. Layout implied by the slicing below: the first column is
# ignored, the last column is the class label, and every column in between is
# used as a feature.
feature_list = []  # one {header: value} dict per sample
result_list = []  # class label of each sample

# NOTE(review): 'fime.csv' looks like it may be a typo for 'film.csv' -- confirm.
# newline='' is what the csv module recommends for files handed to csv.reader;
# the with-block guarantees the handle is closed even if parsing fails.
with open('fime.csv', 'rt', newline='') as film_data:
    reader = csv.reader(film_data)
    headers = next(reader)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; row[-1] would raise.
            continue
        # Label is the last column.
        result_list.append(row[-1])
        # Drop the first column and the label, keep the rest as features.
        feature_list.append(dict(zip(headers[1:-1], row[1:-1])))

# One-hot encode the string-valued features; binarize the labels likewise.
vec = DictVectorizer()
dummyX = vec.fit_transform(feature_list).toarray()
dummyY = preprocessing.LabelBinarizer().fit_transform(result_list)

# criterion='entropy' splits on information gain; the fixed random_state
# makes the fitted tree reproducible between runs.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
clf = clf.fit(dummyX, dummyY)
print('clf:' + str(clf))

# Visualize the fitted tree.
import pydotplus

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on releases that predate it.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = list(vec.get_feature_names_out())
else:
    feature_names = vec.get_feature_names()

dot_data = tree.export_graphviz(
    clf,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    special_characters=True,
    out_file=None,  # return the DOT source as a string instead of writing a file
)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("film.pdf")

# Prediction. predict() requires a feature matrix; calling it with no
# arguments raises TypeError. The training matrix is used here as a
# placeholder -- for unseen samples, encode them with vec.transform([...])
# and pass that result instead.
predict_result = clf.predict(dummyX)