import re
import numpy as np
def file_do(filename):
    """Load a labelled corpus and split every line into a label and its words.

    Each line is scanned for maximal runs of characters in the range
    U+4E00..U+9FFF; everything else (ASCII letters, digits, punctuation,
    whitespace) only acts as a separator.  The first run on a line is taken
    as that line's label and the remaining runs as its words.  Lines that
    contain no such run are skipped.

    Args:
        filename: Path of a UTF-8 encoded text file, one sample per line.

    Returns:
        A ``(words, labels)`` tuple of two parallel lists: ``words[i]`` is
        the list of word strings of the i-th kept line and ``labels[i]`` is
        its label string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Compile once; the pattern is applied to every line of the file.
    han_run = re.compile(r'[\u4e00-\u9fff]+')
    words = []
    labels = []
    # Text mode with an explicit encoding streams the file line by line and
    # handles "\n", "\r\n" and "\r" line endings alike.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            tokens = han_run.findall(line)
            if not tokens:
                continue
            labels.append(tokens[0])
            words.append(tokens[1:])
    print('内words:', words)
    print('内labels:', labels)
    return words, labels
# filename = 'C:/Users/cuit/Desktop/文本测试数据/training-1000.txt'
# words,labels = file_do(filename)
#创建训练样本的词汇表
def createvacablist(words):
    """Build the vocabulary: every distinct word found in the given rows.

    Args:
        words: Iterable of rows, each row an iterable of hashable words.

    Returns:
        A list holding each distinct word exactly once.  On Python 3.7+ the
        words appear in order of first occurrence, so the result is
        reproducible between runs.
    """
    # A dict de-duplicates in a single pass without rebuilding the
    # collection for every row, and keeps insertion order.
    seen = {}
    for row in words:
        seen.update(dict.fromkeys(row))
    return list(seen)
# vacablist = createvacablist(words)
# print('外vacablist:',vacablist)
# print(vacablist.index('愤怒'))
# Convert a word list into a vector of per-word occurrence counts over the vocabulary
def setdata(vacablist, inputdata):
    """Turn a list of words into a count vector aligned with the vocabulary.

    Args:
        vacablist: Sequence of vocabulary words; position i of the result
            corresponds to ``vacablist[i]``.
        inputdata: Iterable of words to count.

    Returns:
        A float numpy array of length ``len(vacablist)`` where element i is
        the number of times ``vacablist[i]`` occurs in ``inputdata``.  Words
        that are not in the vocabulary are reported on stdout and otherwise
        ignored.
    """
    # Build the word -> index map once instead of scanning the list twice
    # per word.  setdefault keeps the first index of a repeated entry,
    # which is what list.index() would return.
    position = {}
    for idx, word in enumerate(vacablist):
        position.setdefault(word, idx)

    result = np.zeros(len(vacablist))
    for word in inputdata:
        idx = position.get(word)
        if idx is None:
            print('词汇表里没有 :', word)
        else:
            result[idx] += 1
    return result
# setdata(vacablist,words[0])
# Training function: compute, for every word, its probability under each label
def P1(words, labels, vacablist):
    """Train the classifier: per-word weights for both classes.

    A sample counts as positive when its label equals '好评'; every other
    label is treated as negative.

    Args:
        words: List of word lists, one per training sample.
        labels: List of label strings, parallel to ``words``.
        vacablist: Vocabulary list as accepted by ``setdata``.

    Returns:
        A ``(data_1, data_0)`` tuple of float arrays of length
        ``len(vacablist)``.  ``data_1[i]`` is the smoothed relative
        frequency of word i among positive samples multiplied by the
        positive prior; ``data_0[i]`` is the same for negative samples and
        the negative prior.

    Raises:
        ZeroDivisionError: If ``labels`` is empty.
        IndexError: If ``labels`` is shorter than ``words``.
    """
    # Prior probability of the positive class.
    labels_1count = sum(1 for label in labels if label == '好评')
    plabels_1 = labels_1count / len(labels)
    print('plabels_1:', plabels_1)

    # Add-one smoothing: every word starts with a count of 1 and each class
    # total starts at 2.
    data_1 = np.ones(len(vacablist))
    data_0 = np.ones(len(vacablist))
    count_1 = 2  # total number of word occurrences in positive samples
    count_0 = 2  # total number of word occurrences in negative samples

    # Accumulate one document vector at a time rather than materialising
    # the full samples-by-vocabulary matrix first.
    for i, row in enumerate(words):
        vec = setdata(vacablist, row)
        if labels[i] == '好评':
            data_1 += vec
            count_1 += vec.sum()
        else:
            data_0 += vec
            count_0 += vec.sum()
    print('count_1:{},count_0:{}'.format(count_1, count_0))

    # NOTE(review): the class prior is folded into every per-word value, so
    # a consumer that multiplies these values together applies the prior
    # once per matched word rather than once per document - confirm that
    # this is intended.
    data_1 = (data_1 / count_1) * plabels_1
    data_0 = (data_0 / count_0) * (1 - plabels_1)
    print('data_1:', data_1)
    print('data_0:', data_0)
    return data_1, data_0
# data_1,data_0 = P1(words,labels,vacablist)
# print('data_1:',data_1)
# print('data_0:',data_0)
def classfy(data_1, data_0, vacablist, test_data):
    """Classify one document as positive or negative.

    Only vocabulary words that occur in ``test_data`` take part in the
    decision; each such word contributes its ``data_1`` / ``data_0`` value
    once, regardless of how often it occurs.

    Args:
        data_1: Per-word weights of the positive class (see ``P1``).
        data_0: Per-word weights of the negative class (see ``P1``).
        vacablist: Vocabulary list the weights are aligned with.
        test_data: List of words of the document to classify.

    Returns:
        '好评' if the positive score is larger, '差评' if the negative score
        is larger, and ``-1`` when the scores are equal (including the case
        where no word of the document is in the vocabulary).
    """
    set_tdata = setdata(vacablist, test_data)
    present = set_tdata != 0

    # Compare sums of logarithms instead of raw products: multiplying many
    # small probabilities underflows to 0.0 for long documents, which made
    # both scores equal and forced a spurious -1.  The per-word occurrence
    # count that used to be multiplied into both products is the same
    # positive factor on both sides, so it cancels out of the comparison.
    with np.errstate(divide='ignore'):  # log(0) -> -inf is acceptable here
        p1 = np.log(np.asarray(data_1)[present]).sum()
        p0 = np.log(np.asarray(data_0)[present]).sum()

    if p1 > p0:
        return '好评'
    if p1 < p0:
        return '差评'
    # Tie: the caller gets -1 instead of a label.
    return -1
# mydata = ['这个','酒店','马马虎虎']
# classfy(data_1,data_0,vacablist,mydata)
#测试文档预测
def test(train_file='C:/Users/cuit/Desktop/文本测试数据/training-1000.txt',
         test_file='C:/Users/cuit/Desktop/文本测试数据/test-1000.txt'):
    """Train on one labelled file, evaluate on another, and print accuracy.

    Args:
        train_file: Path of the labelled training corpus (format as read by
            ``file_do``).  Defaults to the original hard-coded location.
        test_file: Path of the labelled evaluation corpus.  Defaults to the
            original hard-coded location.

    Returns:
        None.  The number of correct predictions, the number of test
        samples and the accuracy are printed to stdout.
    """
    # Training data.
    words, labels = file_do(train_file)
    print('len(words):', len(words))
    vacablist = createvacablist(words)
    data_1, data_0 = P1(words, labels, vacablist)

    # Evaluation data.
    print()
    test_words, test_labels = file_do(test_file)
    yes_count = 0
    for sample, expected in zip(test_words, test_labels):
        # classfy returns '好评', '差评' or -1; a tie never matches a label.
        if classfy(data_1, data_0, vacablist, sample) == expected:
            yes_count += 1

    # An empty evaluation file reports 0.0 instead of dividing by zero.
    corect = yes_count / len(test_labels) if test_labels else 0.0
    print('正确了{}个,总共有{}个,准确率为{}'.format(yes_count, len(test_labels), corect))
# Run the train/evaluate pipeline only when executed as a script, so that
# importing this module does not read files or print results.
if __name__ == '__main__':
    test()