6. Practicing Data Mining with Python
0. Resources You May Need if Errors Occur
Installation packages: http://www.lfd.uci.edu/~gohlke/pythonlibs/#scipy
Installation files:
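Before running the code below, it is worth confirming that the packages it imports are installed; the prebuilt Windows wheels at the link above cover all of them. A minimal sanity check (assuming the standard numpy, scipy, matplotlib, and scikit-learn distributions):

# Print the version of each dependency the script needs.
import numpy
import scipy
import matplotlib
import sklearn

for mod in (numpy, scipy, matplotlib, sklearn):
    print(mod.__name__, mod.__version__)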

1. Code
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request

from numpy import genfromtxt, zeros, mean
from pylab import plot, show, figure, subplot, hist, xlim
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


# Download the Iris dataset and save it locally
def load_data():
    url = 'http://aima.cs.berkeley.edu/data/iris.csv'
    u = urllib.request.urlopen(url)
    local_file = open('iris.csv', 'wb')  # the response body is bytes, so write in binary mode
    local_file.write(u.read())
    local_file.close()


# Build the feature matrix and the target vector
def get_matrix():
    # read the first 4 columns
    data_ = genfromtxt('iris.csv', delimiter=',', usecols=(0, 1, 2, 3))
    # read the fifth column
    target_ = genfromtxt('iris.csv', delimiter=',', usecols=(4,), dtype=str)
    print(data_.shape)
    print(target_.shape)
    print(set(target_))  # build a collection of unique elements
    return data_, target_


# Scatter plot
def draw_scatter(data, target):
    # plot the first feature dimension against the third
    plot(data[target == 'setosa', 0], data[target == 'setosa', 2], 'bo')          # blue: Iris setosa
    plot(data[target == 'versicolor', 0], data[target == 'versicolor', 2], 'ro')  # red: Iris versicolor
    plot(data[target == 'virginica', 0], data[target == 'virginica', 2], 'go')    # green: Iris virginica
    show()


# Histograms
def draw_column(data, target):
    xmin = min(data[:, 0])
    xmax = max(data[:, 0])
    figure()
    subplot(411)  # distribution of the setosa class (1st, on the top)
    hist(data[target == 'setosa', 0], color='b', alpha=.7)
    xlim(xmin, xmax)
    subplot(412)  # distribution of the versicolor class (2nd)
    hist(data[target == 'versicolor', 0], color='r', alpha=.7)
    xlim(xmin, xmax)
    subplot(413)  # distribution of the virginica class (3rd)
    hist(data[target == 'virginica', 0], color='g', alpha=.7)
    xlim(xmin, xmax)
    subplot(414)  # global histogram (4th, on the bottom)
    hist(data[:, 0], color='y', alpha=.7)
    xlim(xmin, xmax)
    show()


# Classification
def classify(data, target):
    # use a Gaussian Naive Bayes classifier; encode the string labels as numbers
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3
    classifier = GaussianNB()
    classifier.fit(data, t)  # training on the iris dataset
    print(classifier.predict(data)[0])
    print(t[0])

    # Randomly split the data into a training set and a test set: train the
    # classifier on the former, evaluate it on the latter. The test set is
    # 40% of the source data (test_size).
    train, test, t_train, t_test = model_selection.train_test_split(data, t, test_size=0.4, random_state=0)
    classifier.fit(train, t_train)  # train
    # Accuracy: the number of correctly classified samples divided by the
    # total number of samples, i.e. the proportion of correct predictions
    print(classifier.score(test, t_test))  # test

    # Another tool for judging a classifier is the confusion matrix: each row
    # represents the instances of an actual class, each column the instances
    # of a predicted class. All correct guesses lie on the diagonal, so errors
    # are easy to spot as non-zero values off the diagonal.
    print(confusion_matrix(t_test, classifier.predict(test)))

    # Full report on the classifier's performance:
    # Precision: the proportion of predicted positives that are correct
    # Recall (true positive rate): the proportion of actual positives that are identified
    # F1-Score: the harmonic mean of precision and recall
    print(classification_report(t_test, classifier.predict(test),
                                target_names=['setosa', 'versicolor', 'virginica']))

    # Cross validation. The idea is simple: split the data into different
    # training and test sets several times, then report the mean of the
    # per-split scores as the classifier's evaluation.
    scores = cross_val_score(classifier, data, t, cv=6)
    print(scores)
    print(mean(scores))


if __name__ == '__main__':
    load_data()
    data_, target_ = get_matrix()
    # draw_scatter(data_, target_)
    # draw_column(data_, target_)
    classify(data_, target_)
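The download URL above dates from the original post and may no longer be reachable. As a fallback sketch, scikit-learn bundles the same Iris data via sklearn.datasets.load_iris, which already provides a numeric feature matrix and integer labels, so the string-to-number relabelling step is not needed:

# Fallback: load the Iris data that ships with scikit-learn instead of downloading it.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
data, t = iris.data, iris.target  # 150x4 feature matrix, integer labels 0/1/2

train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)
classifier = GaussianNB().fit(train, t_train)
print(classifier.score(test, t_test))  # same accuracy check as in classify() above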
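To make the Precision / Recall / F1-Score columns of the report concrete, here is the arithmetic for a single class. The counts of true positives, false positives, and false negatives below are invented purely for illustration:

# Hypothetical per-class counts (illustrative numbers only):
tp, fp, fn = 13, 2, 3  # true positives, false positives, false negatives

precision = tp / (tp + fp)  # 13/15: share of predicted positives that are correct
recall = tp / (tp + fn)     # 13/16: share of actual positives that are found
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean of the two
print(precision, recall, f1)  # ~0.867, ~0.813, ~0.839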
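cross_val_score hides the loop it performs. A rough manual equivalent is sketched below with KFold (the fold count matches the cv=6 used above); note that for classifiers scikit-learn actually defaults to stratified folds, so the exact scores can differ:

# Manual cross-validation: score a fresh classifier on each of 6 folds.
from numpy import mean
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

def manual_cv_scores(data, t, n_folds=6):
    scores = []
    for train_idx, test_idx in KFold(n_splits=n_folds).split(data):
        clf = GaussianNB().fit(data[train_idx], t[train_idx])
        scores.append(clf.score(data[test_idx], t[test_idx]))
    return scores

# Usage with the arrays built in classify():
# scores = manual_cv_scores(data, t)
# print(mean(scores))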
Source: http://www.cnblogs.com/makexu/
