Python 朴素贝叶斯算法原理及其运用
朴素贝叶斯是一种有监督的机器学习算法,其原理是通过计算后验概率进行有效分类。目前常见的贝叶斯分类器有高斯贝叶斯分类器、多项式贝叶斯分类器、伯努利贝叶斯分类器。
官方链接:https://scikit-learn.org/stable/modules/naive_bayes.html
(一)概率论相关概念
条件概率:

全概率:

(二)贝叶斯理论




(三)代码实现
from numpy import *

def loadDataSet():
    """Build a small toy corpus of tokenized posts with abuse labels.

    Returns:
        postingList: list of documents, each a list of word tokens.
        classVec: parallel list of labels, 1 = abusive, 0 = not abusive.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 is abusive, 0 not
    return postingList, classVec
def createVocabList(dataSet):
    """Return the vocabulary: every distinct word across all documents.

    Args:
        dataSet: list of documents, each a list of word tokens.

    Returns:
        A list of unique words (order unspecified, as with the original).
    """
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # set union folds in this document's words
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Encode one document as a 0/1 word-presence vector over vocabList.

    Args:
        vocabList: the vocabulary list (defines vector length and order).
        inputSet: one document as a list of word tokens.

    Returns:
        A list of len(vocabList) ints: 1 where the word occurs, else 0.
        Words missing from the vocabulary are reported and skipped.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word not in vocabList:
            print("the word: %s is not in my Vocabulary!" % word)
            continue
        # .index() gives the first position of word; the output vector
        # always keeps the fixed length len(vocabList)
        returnVec[vocabList.index(word)] = 1
    return returnVec
def train_mat(listOPosts, myVocablist):
    """Vectorize a whole corpus: one 0/1 presence vector per document.

    Args:
        listOPosts: list of documents (each a list of word tokens).
        myVocablist: the vocabulary list shared by all vectors.

    Returns:
        A list of 0/1 vectors, one per document, via setOfWords2Vec.
    """
    return [setOfWords2Vec(myVocablist, doc) for doc in listOPosts]
def trainNB0(trainMatrix, trainCategory):
    """Train a naive Bayes classifier from 0/1 word-presence vectors.

    Args:
        trainMatrix: list of equal-length 0/1 vectors, one per document.
        trainCategory: parallel list of labels (1 = abusive, 0 = not).

    Returns:
        p0Vect: log P(word | class 0) for each vocabulary position.
        p1Vect: log P(word | class 1) for each vocabulary position.
        pAbusive: prior probability P(class 1).
    """
    numTrainDocs = len(trainMatrix)           # number of documents (rows)
    numWords = len(trainMatrix[0])            # vocabulary size (columns)
    # sum(trainCategory) counts the documents labeled 1
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start word counts at 1 and denominators at 2,
    # so a word never seen in one class cannot zero out the whole
    # class-conditional product at classification time.
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Log-probabilities prevent floating-point underflow when many small
    # conditionals are multiplied (they are added in log space instead).
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
if __name__ == '__main__':
    # Smoke-test the full pipeline: load data, build the vocabulary,
    # vectorize the corpus, train, and print the learned parameters.
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainmat = train_mat(listOPosts, myVocabList)
    p0Vect, p1Vect, pAbusive = trainNB0(trainmat, listClasses)
    print(p0Vect, "\n")
    print(p1Vect, "\n")
    print(pAbusive)
    # print(list(p1Vect).index(p1Vect.max()))  # word most indicative of label 1
结果:

参考图书:机器学习实战【美】Peter Harrington 著 李锐 李鹏 曲亚东 王斌 译
浙公网安备 33010602011771号