# Naive Bayes classification (贝叶斯实现分类)

# -*- coding: utf-8 -*-

import sys
import os
import numpy as np
import pickle
from sklearn import metrics

# Load the built-in toy data set
def loadDataSet():
    """Return the hard-coded sample corpus and its class labels.

    Returns:
        A ``(documents, flags)`` tuple: ``documents`` is a list of six
        tokenised posts (lists of words) and ``flags`` is the parallel list
        of labels, where 1 marks an abusive post and 0 a normal one.
    """
    # Each entry pairs one tokenised post with its label (1 = abusive, 0 = not).
    labelled = (
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him', 'my'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    )
    documents = [tokens for tokens, _ in labelled]
    flags = [flag for _, flag in labelled]
    return documents, flags


# Read a file
def readfile(path):
    """Return the entire content of the file at *path* as bytes.

    The file is opened in binary mode, so no decoding is performed.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(path, "rb") as fp:
        return fp.read()

"""

#计算分类精度:
def metrics_result(actual,predict):
    print('精度:{0:.3f}'.format(metrics.precision_score(actual,predict)))
    print ('召回:{0:0.3f}'.format(metrics.recall_score(actual,predict)))
    print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual,predict)))

"""

# Read a pickled bunch object
def readbunchobj(path):
    """Load and return the object pickled in the file at *path*.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError: if the content is not a valid pickle.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only call this
    # on files produced by a trusted source (e.g. writebunchobj).
    # The context manager closes the handle even when unpickling fails.
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)


# Write a bunch object
def writebunchobj(path, bunchobj):
    """Pickle *bunchobj* into the file at *path*, overwriting any old content.

    Raises:
        OSError: if the file cannot be opened for writing.
        pickle.PicklingError: if *bunchobj* cannot be pickled.
    """
    # The context manager flushes and closes the handle even when dump() raises.
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)


class NBayes(object):
    """Bag-of-words text classifier built on per-class word distributions.

    Typical use: ``train_set(docs, labels)`` to fit, ``map2vocab(words)`` to
    vectorise one tokenised document into ``self.testset``, then
    ``predict(self.testset)`` to obtain its class label.
    """

    def __init__(self):
        self.vocabulary = []  # distinct words seen in the training set
        self.idf = 0  # per-word document counts (or idf weights), 1 x vocablen
        self.tf = 0  # per-document term weights, doclength x vocablen
        self.tdm = 0  # P(x|yi): one row per class, rows ordered like Pcates
        self.Pcates = {}  # P(yi): class label -> prior probability
        self.labels = []  # class label of every training document
        self.doclength = 0  # number of training documents
        self.vocablen = 0  # vocabulary size
        self.testset = 0  # most recently vectorised test document

    def _word_index(self):
        """Return a ``word -> column`` mapping for the current vocabulary."""
        index = {}
        for col, word in enumerate(self.vocabulary):
            # First occurrence wins, matching what list.index() would return.
            index.setdefault(word, col)
        return index

    def _count_terms(self, trainset):
        """Fill ``self.tf`` with raw term counts and ``self.idf`` with the
        number of documents each word occurs in."""
        index = self._word_index()  # O(1) lookups instead of list.index()
        self.idf = np.zeros([1, self.vocablen])  # 1 x vocabulary size
        self.tf = np.zeros([self.doclength, self.vocablen])  # docs x vocabulary
        for row in range(self.doclength):
            doc = trainset[row]
            for word in doc:
                self.tf[row, index[word]] += 1
            for word in set(doc):  # each document counts once per word
                self.idf[0, index[word]] += 1

    # Load the training set and build the vocabulary, tf and idf values
    def train_set(self, trainset, classVec):
        """Fit the model.

        Args:
            trainset: sequence of tokenised documents (sequences of words).
            classVec: class label of each document, parallel to *trainset*.
        """
        self.cate_prob(classVec)  # class priors P(yi)
        self.doclength = len(trainset)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # column layout is reproducible between runs (Python 3.7+).
        self.vocabulary = list(
            dict.fromkeys(word for doc in trainset for word in doc))
        self.vocablen = len(self.vocabulary)
        # Raw counts are used; call self.calc_tfidf(trainset) here instead to
        # weight terms by tf-idf.
        self.calc_wordfreq(trainset)
        self.build_tdm()  # per-class word distribution P(x|yi)

    # Build tf-idf weights
    def calc_tfidf(self, trainset):
        """Set ``self.tf`` to tf-idf weights and ``self.idf`` to
        ``log(doclength / document_count)`` for *trainset*."""
        self._count_terms(trainset)
        for row in range(self.doclength):
            length = len(trainset[row])
            if length:  # an empty document would otherwise yield 0/0 = NaN
                # Normalise by document length to remove sentence-length bias.
                self.tf[row] /= float(length)
        self.idf = np.log(float(self.doclength) / self.idf)
        self.tf = np.multiply(self.tf, self.idf)  # broadcast idf over rows

    # Build plain term-frequency vectors
    def calc_wordfreq(self, trainset):
        """Set ``self.tf`` to raw term counts and ``self.idf`` to per-word
        document counts for *trainset*."""
        self._count_terms(trainset)

    # Compute the probability of each class in the data set: P(yi)
    def cate_prob(self, classVec):
        """Store the labels and compute the prior of every class.

        ``self.Pcates`` is rebuilt from scratch so that retraining does not
        keep classes from a previous call.
        """
        self.labels = list(classVec)  # also accepts tuples / arrays
        counts = {}
        for label in self.labels:
            counts[label] = counts.get(label, 0) + 1
        try:
            ordered = sorted(counts)
        except TypeError:
            # Labels of mixed, non-comparable types: keep first-seen order.
            ordered = list(counts)
        total = float(len(self.labels))
        self.Pcates = {label: counts[label] / total for label in ordered}

    # Accumulate the vector space per class: P(x|yi)
    def build_tdm(self):
        """Build ``self.tdm``: one row per class (ordered like
        ``self.Pcates``) holding that class's normalised term weights."""
        # Map labels to rows explicitly so labels need not be 0..k-1 integers.
        row_of = {label: row for row, label in enumerate(self.Pcates)}
        self.tdm = np.zeros([len(self.Pcates), self.vocablen])
        for doc_idx in range(self.doclength):
            # Sum the term vectors of all documents of the same class.
            self.tdm[row_of[self.labels[doc_idx]]] += self.tf[doc_idx]
        totals = self.tdm.sum(axis=1, keepdims=True)  # total weight per class
        totals[totals == 0] = 1.0  # leave all-zero rows at zero, not NaN
        self.tdm = self.tdm / totals  # P(x|yi)

    # Map a test document onto the current vocabulary
    def map2vocab(self, testdata):
        """Vectorise the tokenised document *testdata* into ``self.testset``
        (shape 1 x vocablen). Words absent from the vocabulary are ignored."""
        index = self._word_index()
        self.testset = np.zeros([1, self.vocablen])
        for word in testdata:
            col = index.get(word)
            if col is not None:  # skip out-of-vocabulary words
                self.testset[0, col] += 1

    # Output the predicted class
    def predict(self, testset):
        """Return the label of the highest-scoring class for *testset*.

        Each class is scored as ``sum(testset * P(x|yi) * P(yi))``. An empty
        string is returned when no class scores above zero.

        Raises:
            ValueError: if *testset* does not have ``vocablen`` columns.
        """
        vector = np.atleast_2d(np.asarray(testset))  # accept 1-D vectors too
        if vector.shape[1] != self.vocablen:
            # Raise instead of exit(0): a library must not kill the process,
            # least of all with a success status.
            raise ValueError("输入错误")
        predvalue = 0
        predclass = ""
        for tdm_vect, keyclass in zip(self.tdm, self.Pcates):
            # P(x|yi)P(yi)
            temp = np.sum(vector * tdm_vect * self.Pcates[keyclass])
            if temp > predvalue:
                predvalue = temp
                predclass = keyclass
        return predclass

 

# posted @ 2016-10-17 18:49  你若精彩,蝴蝶自来  阅读(316)  评论(0)  编辑  收藏  举报