import numpy as np
import math
#加载模拟数据
def loaddata():
    """Return the built-in toy corpus and its class labels.

    Returns:
        A tuple ``(postingList, classVec)``: ``postingList`` is a list of six
        tokenised posts (lists of words) and ``classVec`` is the parallel list
        of labels, where 1 marks an abusive post and 0 a non-abusive one.
    """
    # Each entry pairs a label (1 = abusive, 0 = not abusive) with its tokens.
    labelled_posts = [
        (0, ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [tokens for _, tokens in labelled_posts]
    classVec = [label for label, _ in labelled_posts]
    return postingList, classVec
#创建词汇表
def createSet(dataset):
    """Build the vocabulary: every distinct token that appears in ``dataset``.

    Tokens are returned in first-seen order, so the vocabulary (and therefore
    the column layout of every vector built from it) is identical on every
    run. Collecting the tokens in a plain ``set`` would make the order depend
    on string hash randomisation.

    Args:
        dataset: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing each distinct token exactly once; empty when
        ``dataset`` is empty.
    """
    # dict keys are unique and keep insertion order, which gives a stable
    # de-duplication without requiring the tokens to be sortable.
    return list(
        dict.fromkeys(token for document in dataset for token in document)
    )
# dataSet,labels = loaddata()
# vacablist = createSet(dataSet)
# print('外lables',labels)
# print('外dataSet',dataSet)
# print('外vacablist:',vacablist)
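# Build the set-of-words vector for one document: one slot per vocabulary word, set to 1 if the word occurs in the input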
def setofword(vacablist, inputdata):
    """Encode ``inputdata`` as a set-of-words vector over ``vacablist``.

    Args:
        vacablist: Vocabulary as a list of hashable tokens; position ``i`` of
            the result corresponds to ``vacablist[i]``.
        inputdata: Iterable of tokens making up one document.

    Returns:
        A list of ``len(vacablist)`` ints, 1 where the vocabulary word occurs
        in ``inputdata`` and 0 elsewhere. Words that are not in the vocabulary
        are reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once instead of scanning the vocabulary
    # list twice (``in`` + ``.index``) for every input word. ``setdefault``
    # keeps the first position of a repeated word, matching ``list.index``.
    position_of = {}
    for position, token in enumerate(vacablist):
        position_of.setdefault(token, position)

    vector = [0] * len(vacablist)
    for word in inputdata:
        position = position_of.get(word)
        if position is None:
            print('没有 {} 这个词'.format(word))
        else:
            vector[position] = 1
    return vector
# setofdata = setofword(vacablist,dataSet[3])
# print('外setofdata:',setofdata) #[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
# print(vacablist) #['garbage', 'to', 'worthless', 'ate', 'has', 'so', 'take', 'cute', 'dog', 'flea', 'buying', 'help', 'is', 'park', 'I', 'food', 'my', 'licks', 'posting', 'dalmation', 'problem', 'please', 'stop', 'how', 'stupid', 'maybe', 'love', 'steak', 'quit', 'him', 'not', 'mr']
# print(dataSet[3]) #['stop', 'posting', 'stupid', 'worthless', 'garbage']
# trainmat = []
# for i in dataSet:
# trainmat.append(setofword(vacablist,i))
# print('外trainmat:',trainmat)
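# Training function: estimates the per-class conditional word probabilities P(word|0) and P(word|1), and the prior P(class=1)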
def P1(trainmat, labels):
    """Estimate the naive Bayes parameters from encoded training documents.

    Args:
        trainmat: Sequence of equal-length word vectors, one per document.
        labels: Sequence of class labels parallel to ``trainmat``; only rows
            labelled 0 or 1 contribute to the counts.

    Returns:
        A tuple ``(data_0, data_1, plable_1)``: the per-word probability
        arrays for class 0 and class 1, and the fraction of labels that sum
        to class 1 (``sum(labels) / len(labels)``).
    """
    prior_1 = sum(labels) / len(labels)
    vocab_size = len(trainmat[0])

    # Smoothing: every word count starts at 1 and every denominator at 2, so
    # a word never seen in a class does not get probability zero.
    word_totals = {0: np.ones(vocab_size), 1: np.ones(vocab_size)}
    denominators = {0: 2, 1: 2}

    for row_idx, label in enumerate(labels):
        for cls in (0, 1):
            if label == cls:
                row = trainmat[row_idx]
                word_totals[cls] += row
                denominators[cls] += sum(row)

    cond_0 = word_totals[0] / denominators[0]
    cond_1 = word_totals[1] / denominators[1]

    print('data_0:{},count:{}'.format(cond_0, denominators[0]))
    print('data_1:{},count:{}'.format(cond_1, denominators[1]))
    print('plabel_1:', prior_1)
    return cond_0, cond_1, prior_1
# P1(trainmat,labels)
#用得到的概率分类
def classfy(testset, data_0, data_1, plabel_1):
    """Classify one encoded document with the trained naive Bayes parameters.

    The decision is made on log-probability scores. Multiplying many small
    probabilities underflows to 0.0 for both classes on long documents, which
    would turn every such comparison into a tie that falls through to class 0.
    The raw products are still computed and printed as before; they may show
    0.0 when they underflow, while the returned class stays correct.

    Args:
        testset: Word vector of the document; entries equal to 1 mark words
            that are present.
        data_0: Per-word probabilities for class 0, indexable like ``testset``.
        data_1: Per-word probabilities for class 1, indexable like ``testset``.
        plabel_1: Prior probability of class 1.

    Returns:
        1 if class 1 has the strictly higher score, otherwise 0.
    """
    print('开始classfy')
    p1 = 1
    p0 = 1
    log_p1 = 0.0
    log_p0 = 0.0
    # log(0) should quietly become -inf (a class with zero probability can
    # never win) instead of emitting a divide-by-zero warning.
    with np.errstate(divide='ignore'):
        for idx, flag in enumerate(testset):
            if flag == 1:
                p1 = p1 * data_1[idx]
                p0 = p0 * data_0[idx]
                log_p1 += np.log(data_1[idx])
                log_p0 += np.log(data_0[idx])
        p1 = p1 * plabel_1
        p0 = p0 * (1 - plabel_1)
        log_p1 += np.log(plabel_1)
        log_p0 += np.log(1 - plabel_1)
    print('p1:{},p0:{}'.format(p1, p0))
    # Compare in log space: same ordering as the products, but no underflow.
    if log_p1 > log_p0:
        print('该分类为1')
        return 1
    else:
        print('该分类为0')
        return 0
#测试总逻辑代码
def test():
    """Run the whole pipeline on the built-in corpus and classify one sample.

    Loads the toy data, builds the vocabulary, encodes every document, trains
    with ``P1`` and classifies the fixed sample ``['my','love','stupid']``.
    Results are only printed by the functions it calls; nothing is returned.
    """
    documents, doc_labels = loaddata()
    vocabulary = createSet(documents)
    # The training function expects every document encoded against the vocabulary.
    train_vectors = [setofword(vocabulary, document) for document in documents]
    cond_0, cond_1, prior_1 = P1(train_vectors, doc_labels)
    sample_words = ['my','love','stupid']
    sample_vector = setofword(vocabulary, sample_words)
    classfy(sample_vector, cond_0, cond_1, prior_1)
# Run the demo pipeline. This call is not guarded by
# `if __name__ == "__main__":`, so it also runs when the file is imported.
test()