Python: Naïve Bayes classification algorithm
假设共有 a、b、c、d 四个类别。
若 p(a) > p(b) > p(c) > p(d),则样本 m 属于 a 类(即取概率最大的类别)。
class Bayes:
    """Naive Bayes classifier for discrete (categorical) feature vectors.

    After ``fit`` the instance holds:

    * ``length``      -- number of features per training vector
                         (``-1`` until ``fit`` has been called),
    * ``labelcount``  -- ``{label: fraction of training samples with it}``,
    * ``vectorcount`` -- ``{label: [training vectors carrying that label]}``.

    No smoothing is applied: a feature value never observed for a label
    makes that label's score ``0``.
    """

    def __init__(self):
        # -1 is the "not trained yet" sentinel checked by bayes_test().
        self.length = -1
        self.labelcount = dict()
        self.vectorcount = dict()

    def fit(self, dataSet, labels):
        """Train on ``dataSet`` (sequence of feature vectors) and ``labels``.

        ``labels[i]`` is the class of ``dataSet[i]``.  Any state left by a
        previous call is discarded.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if ``dataSet`` and ``labels`` differ in length.
            IndexError: if ``dataSet`` is an empty list.
        """
        if len(dataSet) != len(labels):
            raise ValueError ("Wrong imput!")

        # Feature-vector length, taken from the first sample.
        feature_count = len(dataSet[0])
        total = len(labels)

        # Group the training vectors by label in a single pass.
        grouped = {}
        for vector, label in zip(dataSet, labels):
            grouped.setdefault(label, []).append(vector)

        # Build fresh dicts so that refitting never mixes in stale samples.
        self.vectorcount = grouped
        # Prior of each label: its share of all training samples.
        self.labelcount = {
            label: len(rows) / total for label, rows in grouped.items()
        }
        self.length = feature_count

        print ("train finished!")
        return self

    def bayes_test(self, testData, labelSet):
        """Return the label from ``labelSet`` with the highest score.

        The score of a label is its prior multiplied by, for every feature
        of ``testData``, the fraction of that label's training vectors
        holding the same value at that position.  On ties the label that
        appears first in ``labelSet`` wins.

        Raises:
            ValueError: if called before ``fit``, or if ``labelSet`` is empty.
            KeyError: if a label in ``labelSet`` was not seen during ``fit``.
            IndexError: if ``testData`` has more entries than the training
                vectors.
        """
        if self.length == -1:
            raise ValueError("you havn't taining yet!")

        # Score of testData for every candidate label.
        scores = dict()
        for thislb in labelSet:
            prior = self.labelcount[thislb]
            rows = self.vectorcount[thislb]
            vnum = len(rows)
            # Transpose so that columns[i] holds feature i of every sample.
            # Plain tuples keep the original value types intact, so mixed
            # int/str features still compare equal to the test values.
            columns = list(zip(*rows))
            p = 1
            for index, value in enumerate(testData):
                p *= columns[index].count(value) / vnum
            scores[thislb] = p * prior

        # max() returns the first maximal key, i.e. earliest label on ties.
        return max(scores, key=scores.get)
*********************************************************************************************
import numpy as npy
class Bayes:
    """Naive Bayes classifier for discrete (categorical) feature vectors.

    After ``fit`` the instance holds:

    * ``length``      -- number of features per training vector
                         (``-1`` until ``fit`` has been called),
    * ``labelcount``  -- ``{label: fraction of training samples with it}``,
    * ``vectorcount`` -- ``{label: [training vectors carrying that label]}``.

    No smoothing is applied: a feature value never observed for a label
    makes that label's score ``0``.
    """

    def __init__(self):
        # -1 is the "not trained yet" sentinel checked by btest().
        self.length = -1
        self.labelcount = dict()
        self.vectorcount = dict()

    def fit(self, dataSet: list, labels: list):
        """Train on ``dataSet`` (list of feature vectors) and ``labels``.

        ``labels[i]`` is the class of ``dataSet[i]``.  Any state left by a
        previous call is discarded.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: if ``dataSet`` and ``labels`` differ in length.
            IndexError: if ``dataSet`` is an empty list.
        """
        if len(dataSet) != len(labels):
            raise ValueError("您输入的测试数组跟类别数组长度不一致")

        # Feature-vector length, taken from the first sample.
        feature_count = len(dataSet[0])
        total = len(labels)

        # Group the training vectors by label in a single pass.
        grouped = {}
        for vector, label in zip(dataSet, labels):
            grouped.setdefault(label, []).append(vector)

        # Build fresh dicts so that refitting never mixes in stale samples.
        self.vectorcount = grouped
        # Prior of each label: its share of all training samples.
        self.labelcount = {
            label: len(rows) / total for label, rows in grouped.items()
        }
        self.length = feature_count

        print("训练结束")
        return self

    def btest(self, TestData, labelsSet):
        """Return the label from ``labelsSet`` with the highest score.

        The score of a label is its prior multiplied by, for every feature
        of ``TestData``, the fraction of that label's training vectors
        holding the same value at that position.  On ties the label that
        appears first in ``labelsSet`` wins.

        Raises:
            ValueError: if called before ``fit``, or if ``labelsSet`` is
                empty.
            KeyError: if a label in ``labelsSet`` was not seen during ``fit``.
            IndexError: if ``TestData`` has more entries than the training
                vectors.
        """
        if self.length == -1:
            raise ValueError("您还没有进行训练,请先训练")

        # Score of TestData for every candidate label.
        scores = dict()
        for thislb in labelsSet:
            prior = self.labelcount[thislb]
            rows = self.vectorcount[thislb]
            vnum = len(rows)
            # Transpose so that columns[i] holds feature i of every sample.
            # Plain tuples keep the original value types intact, so mixed
            # int/str features still compare equal to the test values.
            columns = list(zip(*rows))
            p = 1
            for index, value in enumerate(TestData):
                p *= columns[index].count(value) / vnum
            scores[thislb] = p * prior

        # max() returns the first maximal key, i.e. earliest label on ties.
        return max(scores, key=scores.get)