# 42. Naive Bayes classification algorithm - source program (贝叶斯分类算法源程序)

import numpy
import math
def loadDataSet():
    """Return the built-in toy corpus and its class labels.

    Returns:
        A tuple (postingList, classVec): postingList is a list of six
        tokenised posts (lists of words) and classVec holds the matching
        labels, where 1 marks abusive text and 0 marks normal speech.
    """
    # Each entry pairs a label (1 = abusive text, 0 = normal speech) with
    # the words of one post.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    postingList = [words for _, words in labelledPosts]
    classVec = [label for label, _ in labelledPosts]
    return postingList, classVec
    
def createVocabList(dataSet):
    """Build the vocabulary: every distinct word found in the documents.

    Args:
        dataSet: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list of the unique words, in order of first appearance.  Returning
        the words in a fixed order (rather than in set iteration order, which
        changes between interpreter runs for strings) keeps the columns of
        the derived feature vectors reproducible.
    """
    seen = set()
    vocabulary = []
    for document in dataSet:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocabulary.append(word)
    return vocabulary
            
def setOfWords2Vec(vocabList,inputSet):
    """Encode a document as a binary word-presence vector.

    Args:
        vocabList: Sequence of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: Iterable of the words in the document to encode.

    Returns:
        A list of len(vocabList) ints holding 1 where the vocabulary word
        occurs in inputSet and 0 elsewhere.  Words that are not in the
        vocabulary are reported on stdout and otherwise ignored.
    """
    # Record the first position of every vocabulary word once, instead of
    # running a linear vocabList.index() scan for each input word.
    positions = {}
    for index, word in enumerate(vocabList):
        positions.setdefault(word, index)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        index = positions.get(word)
        if index is None:
            # Format the message before printing: applying % to the return
            # value of print() (None) raises TypeError on Python 3.
            print("the word: %s is not in my Vocabulary" % (word,))
        else:
            returnVec[index] = 1
    return returnVec

def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from word vectors and class labels.

    Args:
        trainMatrix: Non-empty sequence of equal-length numeric word vectors,
            one per training document.
        trainCategory: Sequence of class labels aligned with trainMatrix;
            documents labelled 1 are counted for class 1, all others for
            class 0.

    Returns:
        A tuple (p0Vect, p1Vect, pAbusive): p0Vect and p1Vect are numpy
        arrays holding the natural log of each word's smoothed conditional
        probability for class 0 and class 1, and pAbusive is
        sum(trainCategory) divided by the number of documents.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)

    # Counts start at 1 and denominators at 2 so that a word never seen in a
    # class does not get probability 0 (whose logarithm would be -inf).
    p0Num = numpy.ones(numWords)
    p1Num = numpy.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i, wordVec in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += wordVec               # element-wise vector addition
            p1Denom += sum(wordVec)
        else:
            p0Num += wordVec
            p0Denom += sum(wordVec)

    # Return log-probabilities: per-word terms can then be summed instead of
    # multiplied, which avoids floating-point underflow and matches a prior
    # expressed as a logarithm.
    p1Vect = numpy.log(p1Num/p1Denom)
    p0Vect = numpy.log(p0Num/p0Denom)
    return p0Vect,p1Vect,pAbusive
    
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more likely class for a word vector.

    Args:
        vec2Classify: Word vector of the document (list or numpy array).
        p0Vec: Per-word log conditional probabilities for class 0.
        p1Vec: Per-word log conditional probabilities for class 1.
        pClass1: Prior probability of class 1, in the range [0, 1].

    Returns:
        1 if the class-1 log score is strictly greater than the class-0 log
        score, otherwise 0.

    Raises:
        ValueError: If pClass1 lies outside [0, 1].
    """
    if not 0.0 <= pClass1 <= 1.0:
        raise ValueError("pClass1 must be a probability in [0, 1]")

    def logPrior(probability):
        # math.log(0) raises; a zero prior simply rules the class out.
        return math.log(probability) if probability > 0 else float('-inf')

    # Convert once so plain Python lists are accepted as well as arrays.
    wordVec = numpy.asarray(vec2Classify, dtype=float)
    p1 = float(numpy.dot(wordVec, p1Vec)) + logPrior(pClass1)
    p0 = float(numpy.dot(wordVec, p0Vec)) + logPrior(1.0 - pClass1)
    # The larger score wins; class 1 must be returned when p1 is larger.
    if p1 > p0:
        return 1
    return 0

def testingNB():
    """Train on the built-in corpus and print the predicted class of two sample posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # One word vector per training post, in corpus order.
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)

    sampleEntries = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
    )
    for testEntry in sampleEntries:
        thisDoc = setOfWords2Vec(myVocabList, testEntry)
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    

 

# posted @ 2016-12-02 08:01  香港胖仔  阅读(185)  评论(0)    收藏  举报