Python 朴素贝叶斯算法原理及其运用

  朴素贝叶斯是一种有监督机器学习方法,其原理是通过后验概率进行有效的分类。目前常见的贝叶斯分类器有高斯贝叶斯分类器、多项式贝叶斯分类器、伯努利贝叶斯分类器。

官方链接:https://scikit-learn.org/stable/modules/naive_bayes.html

(一)概率论相关概念

条件概率:

全概率:

(二)贝叶斯理论

 (三)代码实现

 1 from numpy import *
 2 
 3 def loadDataSet():
 4     """创建数据集"""
 5     postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 6                  ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 7                  ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
 8                  ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
 9                  ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
10                  ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
11     classVec = [0,1,0,1,0,1]    #1 is abusive, 0 not
12     return postingList,classVec
def createVocabList(dataSet):
    """Collect every distinct word appearing in the corpus.

    dataSet is a list of tokenized documents; the returned vocabulary is a
    list of unique words (order is whatever the set iteration yields).
    """
    vocab = set()
    for document in dataSet:
        # set.update folds each document's words in; duplicates are dropped.
        vocab.update(document)
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Convert one document into a binary bag-of-words vector.

    vocabList: the vocabulary (list of unique words).
    inputSet: the tokens of one document.
    Returns a list of len(vocabList) ints: 1 where the vocabulary word
    occurs in the document, 0 otherwise. Words missing from the
    vocabulary are reported and skipped.
    """
    vector = [0] * len(vocabList)
    for word in inputSet:
        if word not in vocabList:
            print("the word: %s is not in my Vocabulary!" % word)
            continue
        # index() gives the first position of the word in the vocabulary,
        # so every document maps onto a vector of the same fixed length.
        vector[vocabList.index(word)] = 1
    return vector
def train_mat(listOPosts, myVocablist):
    """Vectorize the whole corpus: one 0/1 word vector per document."""
    return [setOfWords2Vec(myVocablist, doc) for doc in listOPosts]
 1 def trainNB0(trainMatrix,trainCategory):
 2     numTrainDocs = len(trainMatrix)  #矩阵的行数
 3     numWords = len(trainMatrix[0])   #列数
 4     pAbusive = sum(trainCategory)/float(numTrainDocs)  #sum(trainCategory)表示文档为标签为1的数量
 5     
 6     ##参数初始化
 7 #     p0Num = ones(numWords); p1Num = ones(numWords)      #change to ones() 
 8 #     p0Denom = 2.0; p1Denom = 2.0                        #change to 2.0
 9 
10     p0Num = zeros(numWords); p1Num = zeros(numWords)      #change to ones() 
11     p0Denom = 0.0; p1Denom = 0.0                        #change to 2.0
12     
13     for i in range(numTrainDocs):
14         if trainCategory[i] == 1:
15             p1Num += trainMatrix[i]   
16             p1Denom += sum(trainMatrix[i])
17         else:
18             p0Num += trainMatrix[i]
19             p0Denom += sum(trainMatrix[i])
20 #     p1Vect = log(p1Num/p1Denom)          #change to log()
21 #     p0Vect = log(p0Num/p0Denom)          #change to log()
22 
23     p1Vect = p1Num/p1Denom        #change to log()
24     p0Vect = p0Num/p0Denom
25     return p0Vect,p1Vect,pAbusive
 1 if __name__=='__main__':
 2     """代码测试"""
 3     listOPosts,listClasses = loadDataSet()
 4     myVocabList = createVocabList( listOPosts)
 5 #     returnVec = setOfWords2Vec(vocabSet,postingList[0])
 6     trainmat = train_mat(listOPosts ,myVocabList )
 7     p0Vect,p1Vect,pAbusive = trainNB0(trainmat,listClasses )
 8     print(p0Vect,"\n")
 9     print(p1Vect,"\n")
10     print(pAbusive)
11 #     print(list(p1Vect).index(p1Vect.max()))  #检验标签1出现概率最大的词。

结果:

参考图书:机器学习实战【美】Peter Harrington 著  李锐 李鹏 曲亚东 王斌 译

posted on 2019-08-29 12:42  LiErRui  阅读(392)  评论(0)    收藏  举报

导航