#coding:utf-8
from numpy import *
import re
def createlist(lst):
    """Collect every distinct token from all documents into one vocabulary list.

    lst is an iterable of token sequences (one per document). The result
    contains each distinct token exactly once; because it is derived from a
    set, the order of the returned list is arbitrary.
    """
    vocabulary = set()
    for document in lst:
        # Fold this document's tokens into the running vocabulary.
        vocabulary.update(document)
    return list(vocabulary)
def word2vec(List,inputset):
    """Convert a tokenised document into a bag-of-words count vector.

    List is the vocabulary (a sequence of tokens) and inputset is the token
    sequence of one document. The returned list has the same length as the
    vocabulary; entry i holds how many times List[i] occurs in inputset.
    Tokens that are not in the vocabulary are ignored.

    Tokens must be hashable, which already holds for any vocabulary produced
    by createlist since it is built from a set.
    """
    # Build the token -> position map once so each lookup is O(1) instead of
    # scanning the vocabulary twice per token ("in" followed by ".index").
    # setdefault keeps the first position of a duplicated vocabulary entry,
    # matching what list.index would have returned.
    position = {}
    for idx, term in enumerate(List):
        position.setdefault(term, idx)

    vec = [0] * len(List)
    for word in inputset:
        idx = position.get(word)
        if idx is not None:
            vec[idx] += 1
    return vec
def singprob(trainmatrix,label):
    """Estimate the naive Bayes parameters from a labelled count matrix.

    trainmatrix holds one word-count vector per document and label holds the
    class of each document (1 for class 1, anything else counts as class 0).

    Returns (p1vect, p0vect, pb1):
      p1vect -- log p(w_i | c=1) for every vocabulary word
      p0vect -- log p(w_i | c=0) for every vocabulary word
      pb1    -- sum(label) divided by the number of documents, i.e. p(c=1)
    """
    doc_count = len(trainmatrix)
    vocab_size = len(trainmatrix[0])
    pb1 = sum(label) / float(doc_count)

    # Row 0 accumulates class 0, row 1 accumulates class 1. Word counts start
    # at 1 and the per-class totals at 2 so that no probability is zero.
    word_counts = ones((2, vocab_size))
    class_totals = [2, 2]
    for row_idx, row in enumerate(trainmatrix):
        cls = 1 if label[row_idx] == 1 else 0
        word_counts[cls] += row
        class_totals[cls] += sum(row)

    # Work in log space so later products of many small numbers do not underflow.
    p1vect = log(word_counts[1] / class_totals[1])
    p0vect = log(word_counts[0] / class_totals[0])
    return p1vect, p0vect, pb1
def classifier(vect,p1vect,p0vect,pb1):
    """Pick the more likely class for one document vector.

    Implements ln[p(w1|c) * p(w2|c) * ... * p(wn|c) * p(c)] for c = 1 and
    c = 0 as a sum of logs: vect is the word-count vector, p1vect / p0vect
    are the per-word log likelihoods for each class, and pb1 is the prior
    of class 1. Returns 1 when class 1 scores strictly higher, otherwise 0.
    """
    score_class1 = log(pb1) + sum(vect * p1vect)
    score_class0 = log(1 - pb1) + sum(vect * p0vect)
    return 1 if score_class1 > score_class0 else 0
def testparse(str):
reg=re.compile('\W*')
line=reg.split(str)
List=[tt.lower for tt in line if len(tt)>2]
return List
def _read_tokens(path):
    """Read one e-mail file and return its tokens as produced by testparse."""
    # The context manager closes the file handle instead of leaking it.
    with open(path) as handle:
        return testparse(handle.read())


def Test():
    """Run one random hold-out evaluation of the spam classifier.

    Loads email/spam/1..25.txt (class 1) and email/ham/1..25.txt (class 0),
    sets aside 10 randomly chosen documents as the test set, trains on the
    remaining ones and prints the fraction of test documents misclassified.
    Returns None.
    """
    doc=[]
    label=[]
    for i in range(1,26):
        doc.append(_read_tokens("email/spam/%d.txt" %i))
        label.append(1)
        doc.append(_read_tokens("email/ham/%d.txt" %i))
        label.append(0)
    doclist=createlist(doc)

    # A real list is required because entries are removed below; a Python 3
    # range object does not support item deletion.
    trainingset=list(range(len(doc)))
    testset=[]
    for _ in range(10):
        # "random" is the module-level name available in this file.
        index=int(random.uniform(0,len(trainingset)))
        # Move the document id stored at that position, not the position
        # itself: after earlier removals the two differ, and using the
        # position lets test documents repeat or remain in the training set.
        testset.append(trainingset.pop(index))

    trainmat=[]
    classlabel=[]
    for docindex in trainingset:
        trainmat.append(word2vec(doclist,doc[docindex]))
        classlabel.append(label[docindex])
    p1,p0,pb=singprob(trainmat,classlabel)

    error=0
    for testindex in testset:
        wordvect=word2vec(doclist,doc[testindex])
        if classifier(wordvect,p1,p0,pb)!=label[testindex]:
            error+=1
    # Parenthesised form prints the same value on Python 2 and Python 3.
    print(error/float(len(testset)))
# Run the hold-out evaluation once; as a module-level call it also executes
# whenever this file is imported.
Test()