6. Practicing Data Mining with Python

0. Resources that may help if errors come up along the way

Installation packages: http://www.lfd.uci.edu/~gohlke/pythonlibs/#scipy

Installation files:
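If downloading wheels from that page fails, the packages can usually be installed straight from PyPI instead. A minimal sketch, assuming pip is available on your PATH (package names are the standard PyPI ones):

pip install numpy scipy matplotlib scikit-learn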

1. Code

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib.request import urlopen
from numpy import genfromtxt, zeros, mean
from pylab import plot, figure, subplot, hist, xlim, show
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report


# Load the data: download the iris dataset and save a local copy
def load_data():
    url = 'http://aima.cs.berkeley.edu/data/iris.csv'
    u = urlopen(url)
    local_file = open('iris.csv', 'wb')  # the response body is bytes, so open in binary mode
    local_file.write(u.read())
    local_file.close()


# Build the matrices: a feature matrix and a label vector
def get_matrix():
    # read the first 4 columns (the sepal/petal measurements)
    data_ = genfromtxt('iris.csv', delimiter=',', usecols=(0, 1, 2, 3))
    # read the fifth column (the species label)
    target_ = genfromtxt('iris.csv', delimiter=',', usecols=(4,), dtype=str)
    print(data_.shape)
    print(target_.shape)
    print(set(target_))  # build a collection of unique elements
    return data_, target_


# Draw a scatter plot
def draw_scatter(data, target):
    # plot the first dimension of the data against the third
    plot(data[target == 'setosa', 0], data[target == 'setosa', 2], 'bo')  # blue: Iris setosa
    plot(data[target == 'versicolor', 0], data[target == 'versicolor', 2], 'ro')  # red: Iris versicolor
    plot(data[target == 'virginica', 0], data[target == 'virginica', 2], 'go')  # green: Iris virginica
    show()


# Draw histograms
def draw_column(data, target):
    xmin = min(data[:, 0])
    xmax = max(data[:, 0])
    figure()
    subplot(411)  # distribution of the setosa class (1st, on the top)
    hist(data[target == 'setosa', 0], color='b', alpha=.7)
    xlim(xmin, xmax)
    subplot(412)  # distribution of the versicolor class (2nd)
    hist(data[target == 'versicolor', 0], color='r', alpha=.7)
    xlim(xmin, xmax)
    subplot(413)  # distribution of the virginica class (3rd)
    hist(data[target == 'virginica', 0], color='g', alpha=.7)
    xlim(xmin, xmax)
    subplot(414)  # global histogram (4th, on the bottom)
    hist(data[:, 0], color='y', alpha=.7)
    xlim(xmin, xmax)
    show()


# Classification
def classify(data, target):
    # analyze with a Gaussian Naive Bayes classifier
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3
    classifier = GaussianNB()
    classifier.fit(data, t)  # training on the iris dataset
    print(classifier.predict(data)[0])
    print(t[0])

    # Randomly split the samples into a training set and a test set. We train the
    # classifier on the training set and evaluate it on the test set.
    # The test set is 40% of the source data (the test_size argument); we fit the
    # classifier on the remainder and print its accuracy.
    train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)
    classifier.fit(train, t_train)  # train

    # Accuracy is the number of correctly classified samples divided by the total
    # number of samples, i.e. the fraction of predictions we got right.
    print(classifier.score(test, t_test))  # test

    # Another tool for assessing classifier performance is the confusion matrix.
    # Each row represents the instances of an actual class and each column the
    # instances of a predicted class. All correct guesses lie on the diagonal,
    # so mistakes are easy to spot: they are the non-zero values off the diagonal.
    print(confusion_matrix(t_test, classifier.predict(test)))

    # A complete report of the classifier's performance:
    # Precision: the fraction of predicted positives that are correct
    # Recall (also called the true positive rate): the fraction of actual positives that are identified
    # F1-Score: the harmonic mean of precision and recall
    print(classification_report(t_test, classifier.predict(test), target_names=['setosa', 'versicolor', 'virginica']))

    # Cross validation. The idea behind it is simple: split the data into training
    # and test sets several different times, and score the classifier by averaging
    # the results of the repeated evaluations.
    scores = cross_val_score(classifier, data, t, cv=6)
    print(scores)
    print(mean(scores))


if __name__ == '__main__':
    load_data()
    data_, target_ = get_matrix()
    # draw_scatter(data_, target_)
    # draw_column(data_, target_)
    classify(data_, target_)
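To make the Precision / Recall / F1 definitions in the comments above concrete, here is a minimal sketch that recomputes them for a single class from hypothetical counts (the numbers are made-up for illustration, not output from the program above):

# hypothetical counts for one class -- illustration values only
tp, fp, fn = 18, 2, 4  # true positives, false positives, false negatives
precision = tp / (tp + fp)  # 18 / 20 = 0.90: how many predicted positives are correct
recall = tp / (tp + fn)  # 18 / 22 ~= 0.82: how many actual positives are found
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean ~= 0.86
print(precision, recall, f1)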

 

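Note that the script depends on the Berkeley URL being reachable. If the download fails, scikit-learn bundles the same dataset, so load_data() and get_matrix() can be replaced with something along these lines (sklearn.datasets.load_iris is the standard API; the variable names simply mirror the ones above):

from sklearn.datasets import load_iris

iris = load_iris()
data_ = iris.data  # 150 x 4 feature matrix
target_ = iris.target_names[iris.target]  # string labels: 'setosa', 'versicolor', 'virginica'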