cc013陈文朋  
def pre(text):
    """Placeholder preprocessing step: return *text* unchanged.

    Stands in for an NLTK-based cleaning function when nltk is not
    available; downstream code calls it on every message body.
    """
    return text

# Load the SMS dataset: each tab-separated line is "<label>\t<message>".
import csv

file_path = r'd:/SMSSpamCollectionjsn.txt'
sms_data = []    # preprocessed message bodies
sms_label = []   # class labels ('ham' / 'spam')
# Context manager guarantees the file handle is closed even if a line
# fails to parse (the original opened and closed the file manually).
with open(file_path, 'r', encoding="utf-8") as sms:
    reader = csv.reader(sms, delimiter='\t')
    for line in reader:
        sms_label.append(line[0])
        sms_data.append(pre(line[1]))

# Split the labelled data into train/test sets (70/30); stratify so the
# ham/spam ratio is preserved in both splits, fixed seed for repeatability.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    sms_data, sms_label, test_size=0.3, random_state=0, stratify=sms_label)
print(len(sms_data), len(x_train), len(x_test))
# (The original ended with a bare `x_train` expression — a notebook
# leftover with no effect in a script — removed.)


# Extract features: TF-IDF over unigrams + bigrams, English stop words
# removed, accents normalized; keep only terms appearing in >= 2 documents.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                             stop_words='english', strip_accents='unicode')
x_train = vectorizer.fit_transform(x_train)
# Removed the notebook leftover `x_train.toarray().shape`: a bare
# expression that densified the whole sparse matrix for no effect.
# Use `x_train.shape` if the (n_samples, n_features) shape is needed.



(3898, 6649)


# Transform test texts with the vocabulary learned from the training set.
x_test = vectorizer.transform(x_test)

# Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

result = MultinomialNB()
result.fit(x_train, y_train)   # fit() returns self, so `result` is the fitted model
y_pred = result.predict(x_test)


# Show classification results: confusion matrix and a report with
# accuracy, precision, recall and F1 per class.
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(y_pred.shape, y_pred)
print('nb_confusion_matrix:')
print(cm)
print('nb_classification_report:')
print(cr)


(1671,) ['ham' 'ham' 'ham' ... 'ham' 'spam' 'ham']
nb_confusion_matrix:
[[1447    0]
 [  48  176]]
nb_classification_report:
             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1447
       spam       1.00      0.79      0.88       224

avg / total       0.97      0.97      0.97      1671



# Feature ranking: least- and most-indicative terms for the spam class.

# get_feature_names() was removed in scikit-learn >= 1.2; fall back for
# older versions so the script runs on both.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, 'get_feature_names_out')
                 else vectorizer.get_feature_names())
# feature_log_prob_[1] is log P(x_i | y=spam) (classes_ sorts 'ham' before
# 'spam').  For binary problems this equals the old coef_[0] alias, which
# was deprecated and then removed for MultinomialNB.
xgailv = result.feature_log_prob_[1]
intercept = result.class_log_prior_  # log P(y); replaces deprecated intercept_
# Pair each term with its log-probability, sorted ascending by probability.
xgailv_with_fns = sorted(zip(xgailv, feature_names))

n = 10
# The n lowest-probability terms side by side with the n highest.
top = zip(xgailv_with_fns[:n], xgailv_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print('\t%.4f\t%-15s\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2))



-9.1053	10 smth        	-6.1149	free           
	-9.1053	15             	-6.3421	txt            
	-9.1053	2go            	-6.4948	mobile         
	-9.1053	2gthr          	-6.5769	text           
	-9.1053	2gthr drinking 	-6.5780	claim          
	-9.1053	2marrow        	-6.6015	stop           
	-9.1053	2morrow        	-6.6108	ur             
	-9.1053	2mrw           	-6.6352	reply          
	-9.1053	2mrw luv       	-6.7198	www            
	-9.1053	2nd ur         	-6.7481	prize    

vectorizer.get_feature_names()# vocabulary terms kept as features (bare expression; only displays in a notebook)

  

posted on 2018-12-03 16:03  C22C  阅读(215)  评论(0编辑  收藏  举报