# -*- coding: utf-8 -*-
import numpy as np
import pickle
from sklearn import metrics  # used by the commented-out metrics_result helper below
# Load the sample data set
def loadDataSet():
postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him', 'my'],
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = not abusive
return postingList, classVec
# Read a file's raw contents
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content
"""
#计算分类精度:
def metrics_result(actual,predict):
print('精度:{0:.3f}'.format(metrics.precision_score(actual,predict)))
print ('召回:{0:0.3f}'.format(metrics.recall_score(actual,predict)))
print ('f1-score:{0:.3f}'.format(metrics.f1_score(actual,predict)))
"""
# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch
# Write a Bunch object with pickle
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
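# A minimal round-trip sketch for the two helpers above; the file name
# "train_set.dat" and the Bunch fields are illustrative assumptions only:
#
#     from sklearn.utils import Bunch
#     bunch = Bunch(vocabulary=[], labels=[], contents=[])
#     writebunchobj("train_set.dat", bunch)
#     restored = readbunchobj("train_set.dat")
#     assert restored.labels == bunch.labels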
class NBayes(object):
    def __init__(self):
        self.vocabulary = []  # vocabulary: list of distinct words
        self.idf = 0          # idf weight vector over the vocabulary
        self.tf = 0           # term-frequency weight matrix of the training set
        self.tdm = 0          # P(x|yi)
        self.Pcates = {}      # P(yi) -- dict mapping each class label to its prior probability
        self.labels = []      # class label of each document, supplied by the caller
        self.doclength = 0    # number of training documents
        self.vocablen = 0     # vocabulary size
        self.testset = 0      # test-set vector
    # Load the training set and build the vocabulary plus tf and idf weights
    def train_set(self, trainset, classVec):
        self.cate_prob(classVec)  # class priors P(yi)
        self.doclength = len(trainset)
        tempset = set()
        for doc in trainset:  # build the vocabulary
            tempset.update(doc)
        self.vocabulary = list(tempset)
        self.vocablen = len(self.vocabulary)
        self.calc_wordfreq(trainset)
        # self.calc_tfidf(trainset)  # alternative: tf-idf weights
        self.build_tdm()  # accumulate each dimension of the vector space per class: P(x|yi)
    # Build tf-idf weights
    def calc_tfidf(self, trainset):
        self.idf = np.zeros([1, self.vocablen])
        self.tf = np.zeros([self.doclength, self.vocablen])
        for indx in range(self.doclength):
            for word in trainset[indx]:
                self.tf[indx, self.vocabulary.index(word)] += 1
            # normalize by document length to remove the bias from unequal lengths
            self.tf[indx] = self.tf[indx] / float(len(trainset[indx]))
            for singleword in set(trainset[indx]):
                self.idf[0, self.vocabulary.index(singleword)] += 1
        self.idf = np.log(float(self.doclength) / self.idf)
        self.tf = np.multiply(self.tf, self.idf)  # element-wise product; idf is broadcast across rows
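    # A worked sketch of the idf formula above, using assumed counts purely
    # for illustration: with doclength = 6 documents and a word appearing in
    # 2 of them, idf = log(6 / 2) ≈ 1.099; a word occurring once in an
    # 8-word document gets tf = 1/8 = 0.125, so tf-idf ≈ 0.125 * 1.099 ≈ 0.137.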
    # Build plain word-frequency vectors
    def calc_wordfreq(self, trainset):
        self.idf = np.zeros([1, self.vocablen])  # 1 x vocabulary size
        self.tf = np.zeros([self.doclength, self.vocablen])  # number of documents x vocabulary size
        for indx in range(self.doclength):  # iterate over all documents
            for word in trainset[indx]:  # iterate over every word in the document
                self.tf[indx, self.vocabulary.index(word)] += 1  # increment the word's position in the vocabulary
            for singleword in set(trainset[indx]):
                self.idf[0, self.vocabulary.index(singleword)] += 1  # document frequency of each distinct word
    # Compute each class's prior probability in the data set: P(yi)
    def cate_prob(self, classVec):
        self.labels = classVec  # the class label of each document
        labeltemps = set(self.labels)  # the distinct class labels, here {0, 1}
        # print('class labels:', labeltemps)
        for labeltemp in labeltemps:
            # count how often the label occurs: self.labels.count(labeltemp)
            self.Pcates[labeltemp] = float(self.labels.count(labeltemp)) / float(len(self.labels))  # the label's share of all documents
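    # For example, with classVec = [0, 1, 0, 1, 0, 1] from loadDataSet(),
    # this produces Pcates = {0: 0.5, 1: 0.5}.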
    # Accumulate each dimension of the vector space per class: P(x|yi)
    def build_tdm(self):
        self.tdm = np.zeros([len(self.Pcates), self.vocablen])  # classes x vocabulary size
        sumlist = np.zeros([len(self.Pcates), 1])  # total weight of each class
        for indx in range(self.doclength):
            self.tdm[self.labels[indx]] += self.tf[indx]  # sum the word vectors of documents in the same class
            sumlist[self.labels[indx]] = np.sum(self.tdm[self.labels[indx]])  # scalar total for the class
        self.tdm = self.tdm / sumlist  # normalize each row: P(x|yi)
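    # After the normalization above each row of tdm sums to 1, so with the
    # word-frequency weighting from calc_wordfreq(), tdm[c, j] is the
    # maximum-likelihood estimate of P(word_j | class c).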
    # Map a test document onto the current vocabulary
    def map2vocab(self, testdata):
        self.testset = np.zeros([1, self.vocablen])
        for word in testdata:
            if word in self.vocabulary:  # skip words never seen during training
                self.testset[0, self.vocabulary.index(word)] += 1
    # Predict the class label
    def predict(self, testset):
        if np.shape(testset)[1] != self.vocablen:
            raise ValueError("input width does not match the vocabulary size")
        predvalue = 0
        predclass = ""
        for keyclass, cateprob in self.Pcates.items():
            # P(x|yi)P(yi); the class label doubles as the row index into tdm
            temp = np.sum(testset * self.tdm[keyclass] * cateprob)
            if temp > predvalue:
                predvalue = temp
                predclass = keyclass
        return predclass
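
# A minimal usage sketch, run only when this module is executed directly;
# the test document is the first entry of the bundled sample data, which
# the scoring above assigns to class 0 (not abusive).
if __name__ == "__main__":
    dataSet, listClasses = loadDataSet()
    nb = NBayes()
    nb.train_set(dataSet, listClasses)  # build vocabulary, tf matrix, priors, and tdm
    nb.map2vocab(dataSet[0])            # map the test document onto the vocabulary
    print(nb.predict(nb.testset))       # prints 0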