import numpy
import math
def loadDataSet():
    """Return the built-in toy corpus and its class labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        tokenized posts (each a list of word strings) and ``classVec`` is the
        parallel list of labels: 1 marks abusive text, 0 marks normal speech.
    """
    sentences = (
        'my dog has flea problems help please',
        'maybe not take him to dog park stupid',
        'my dalmation is so cute I love him',
        'stop posting stupid worthless garbage',
        'mr licks ate my steak how to stop him',
        'quit buying worthless dog food stupid',
    )
    # Every token is separated by a single space, so split() reproduces the
    # word lists exactly.
    posts = [sentence.split() for sentence in sentences]
    # 1 = abusive text, 0 = normal speech (one label per post, same order).
    labels = [0, 1, 0, 1, 0, 1]
    return posts, labels
def createVocabList(dataSet):
    """Return the list of unique words appearing in any document.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once, in order of
        first appearance.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # vocabulary (and every vector indexed by it) is the same on every run.
    # Iterating a set of strings instead would yield an order that depends
    # on per-process hash randomization.
    return list(dict.fromkeys(word for document in dataSet for word in document))
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document into a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: List of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: Iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` ints where entry i is 1 if
        ``vocabList[i]`` occurs in ``inputSet`` and 0 otherwise. A notice is
        printed for each word that is not in the vocabulary.
    """
    # Map each word to its first position once, instead of scanning the
    # vocabulary list for every input word.
    positions = {}
    for index, vocabWord in enumerate(vocabList):
        positions.setdefault(vocabWord, index)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        index = positions.get(word)
        if index is None:
            # Format the message before printing; applying % to the result
            # of print() operates on None and raises TypeError.
            print("the word: %s is not in my Vocabulary" % word)
        else:
            returnVec[index] = 1
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: Sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: Sequence of labels parallel to ``trainMatrix``; a
            label equal to 1 is class 1, anything else is class 0.

    Returns:
        A ``(p0Vect, p1Vect, pAbusive)`` tuple. ``p0Vect`` and ``p1Vect`` are
        numpy arrays holding the natural log of the smoothed per-word
        probability for class 0 and class 1; ``pAbusive`` is
        ``sum(trainCategory) / len(trainMatrix)``.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Start numerators at 1 and denominators at 2 so a word never seen in a
    # class does not get probability zero.
    p0Num = numpy.ones(numWords)
    p1Num = numpy.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i, docVec in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += docVec           # element-wise vector addition
            p1Denom += sum(docVec)
        else:
            p0Num += docVec
            p0Denom += sum(docVec)

    # Return log-probabilities: classifyNB adds these to a log prior, so the
    # per-word terms must be in log space too. Logs also avoid underflow
    # when many small probabilities are combined.
    p1Vect = numpy.log(p1Num / p1Denom)
    p0Vect = numpy.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a word vector by comparing the two log-space class scores.

    Args:
        vec2Classify: Numeric word vector for the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0.
    """
    def _logPrior(prob):
        # math.log(0) raises; a prior of exactly zero means the class can
        # never win, which -inf expresses directly.
        if prob == 0:
            return float('-inf')
        return math.log(prob)

    # numpy.dot accepts plain lists as well as arrays, unlike list * list.
    p1 = numpy.dot(vec2Classify, p1Vec) + _logPrior(pClass1)
    p0 = numpy.dot(vec2Classify, p0Vec) + _logPrior(1.0 - pClass1)
    # The higher class-1 score must map to label 1 (not 0).
    if p1 > p0:
        return 1
    return 0
def testingNB():
    """Train on the built-in corpus and print the class of two sample posts.

    Loads the toy data set, builds the vocabulary, vectorizes every post,
    trains with trainNB0, then classifies two hard-coded test entries and
    prints each entry alongside the label returned by classifyNB.
    """
    posts, labels = loadDataSet()
    vocabulary = createVocabList(posts)
    trainingVectors = [setOfWords2Vec(vocabulary, post) for post in posts]
    p0V, p1V, pAb = trainNB0(trainingVectors, labels)

    samples = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
    )
    for testEntry in samples:
        docVector = setOfWords2Vec(vocabulary, testEntry)
        print(testEntry, 'classified as: ', classifyNB(docVector, p0V, p1V, pAb))