1 import numpy as np
2
def loadDataSet():
    """Return the small hard-coded training corpus and its class labels.

    Returns:
        A ``(postingList, classVec)`` tuple. ``postingList`` is a list of six
        posts, each a list of word strings. ``classVec`` holds one integer
        label per post (1 is the class whose frequency ``trainNB0`` names
        ``pAbusive``; 0 is the other class).
    """
    # Each post is written as one space-separated string and tokenized below.
    # NOTE(review): "qiut" and "dalmation" look like typos, but they are
    # runtime data (testingNB looks up "dalmation"), so they are kept as-is.
    raw_posts = (
        "my dog has flea problems help please",
        "maybe not take him to dog park stupid",
        "my dalmation is so cute I love him",
        "stop posting stupid worthless garbage",
        "my licks ate my steak how to stop him",
        "qiut buying worthless dog food stupid",
    )
    postingList = [post.split() for post in raw_posts]
    # Labels alternate 0, 1 across the six posts.
    classVec = [0, 1] * 3
    return postingList, classVec
16
def createVocabList(dataSet):
    """Return the vocabulary of ``dataSet`` as a sorted list of unique words.

    Args:
        dataSet: iterable of documents, each an iterable of hashable,
            mutually comparable words (strings in this file).

    Returns:
        A list containing every distinct word exactly once, in sorted order.
        An empty ``dataSet`` yields an empty list.
    """
    vocab = set()
    for document in dataSet:
        # In-place update avoids building a brand-new set per document.
        vocab.update(document)
    # Set iteration order is arbitrary (and for strings may change between
    # interpreter runs), so sort to keep each word's column index stable.
    return sorted(vocab)
22
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position defines the column.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ``len(vocabList)`` integers where entry ``i`` is 1 if
        ``vocabList[i]`` occurs in ``inputSet`` and 0 otherwise. Words that
        are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> column lookup once instead of scanning the list twice
    # per word. setdefault keeps the first position of a duplicated word,
    # matching list.index().
    column_of = {}
    for column, vocab_word in enumerate(vocabList):
        column_of.setdefault(vocab_word, column)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        column = column_of.get(word)
        if column is None:
            # Single-argument call form: valid on both Python 2 and Python 3.
            print("the word: %s is not in my Vocabulary!" % word)
        else:
            returnVec[column] = 1
    return returnVec
31
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from word vectors and their labels.

    Args:
        trainMatrix: sequence of equal-length numeric word vectors, one per
            training document.
        trainCategory: sequence of labels, one per document. A label equal to
            1 is counted as class 1; any other value is counted as class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and
        ``sum(trainCategory) / len(trainMatrix)`` as the class-1 prior.
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_count)

    # Index 0 holds the class-0 accumulators, index 1 the class-1 ones.
    # Counts start at 1 and totals at 2.0, so a word never seen in a class
    # still gets a non-zero probability (and a finite log).
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for doc_idx in range(doc_count):
        label = 1 if trainCategory[doc_idx] == 1 else 0
        word_counts[label] += trainMatrix[doc_idx]
        word_totals[label] += np.sum(trainMatrix[doc_idx])

    # Logs are returned because classifyNB adds these values together.
    p1Vect = np.log(word_counts[1] / word_totals[1])
    p0Vect = np.log(word_counts[0] / word_totals[0])
    return p0Vect, p1Vect, pAbusive
48
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the class with the larger naive Bayes log score.

    Args:
        vec2Classify: numeric word vector of the document to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Score = sum of the log probabilities of the present words + log prior.
    score_class1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    score_class0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0
56
def testingNB():
    """Train on the built-in corpus and print the class of two sample posts.

    Builds the vocabulary and the word vectors from ``loadDataSet()``, trains
    with ``trainNB0`` and prints one ``<words> classified as : <label>`` line
    for each sample entry. Returns ``None``.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

    sample_entries = (
        ["love", "my", "dalmation"],
        ["stupid", "garbage"],
    )
    for testEntry in sample_entries:
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        predicted = classifyNB(thisDoc, p0V, p1V, pAb)
        # One pre-formatted string gives the same text as the old
        # comma-separated print statement on both Python 2 and Python 3.
        print("%s classified as : %s" % (testEntry, predicted))
70
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a word-count vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position defines the column.
        inputSet: iterable of words making up the document (repeats allowed).

    Returns:
        A list of ``len(vocabList)`` integers where entry ``i`` is the number
        of times ``vocabList[i]`` occurs in ``inputSet``. Words that are not
        in the vocabulary are silently ignored.
    """
    # Build the word -> column lookup once instead of scanning the list twice
    # per word. setdefault keeps the first position of a duplicated word,
    # matching list.index().
    column_of = {}
    for column, vocab_word in enumerate(vocabList):
        column_of.setdefault(vocab_word, column)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        column = column_of.get(word)
        if column is not None:
            returnVec[column] += 1
    return returnVec
77
if __name__ == "__main__":
    # testingNB() prints its own results and has no return statement, so
    # printing its return value would only add a stray "None" line.
    testingNB()