13垃圾邮件分类2
1.读取
2.数据预处理
import csv
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
#返回类别
def getLb(data):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Args:
        data: POS tag string as produced by nltk.pos_tag (e.g. "NN", "VBD").

    Returns:
        The WordNet POS constant for adjectives/verbs/nouns/adverbs, or the
        empty string when the tag has no WordNet equivalent (callers test the
        result for truthiness before lemmatizing).
    """
    wn = nltk.corpus.wordnet
    # Dispatch on the first letter of the tag instead of an if/elif chain.
    tag_to_pos = {"J": wn.ADJ, "V": wn.VERB, "N": wn.NOUN, "R": wn.ADV}
    return tag_to_pos.get(data[:1], "")
def preprocessing(data):
    """Clean and normalize one raw email text for feature extraction.

    Pipeline: strip punctuation, lowercase, sentence- and word-tokenize,
    drop English stopwords, then lemmatize each word using its POS tag.

    Args:
        data: raw email body as a single string.

    Returns:
        List of normalized word tokens.
    """
    punctuation = '!,;:?"\''
    # Remove punctuation and lowercase the whole text.
    data = re.sub(r'[{}]+'.format(punctuation), '', data).strip().lower()
    tokens = []
    for sentence in nltk.sent_tokenize(data, "english"):  # split into sentences
        tokens.extend(nltk.word_tokenize(sentence))       # split into words
    # Use a set for O(1) membership tests (the original list made this O(n)
    # per token).
    stops = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stops]  # drop stopwords
    tagged = nltk.pos_tag(tokens)  # POS-tag for accurate lemmatization
    lem = WordNetLemmatizer()
    result = []
    for word, tag in tagged:
        pos = getLb(tag)
        # Lemmatize with the WordNet POS when known; keep the word otherwise.
        result.append(lem.lemmatize(word, pos) if pos else word)
    return result
3.数据划分—训练集和测试集数据划分
from sklearn.model_selection import train_test_split
# Hold out 20% of the data for testing, with a fixed seed for reproducibility.
# BUG FIX: the original passed stratify=y_train, but y_train does not exist
# until this call returns (NameError); stratification must use the full label
# array `target` so class proportions are preserved in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=0, stratify=target)
4.文本特征提取
sklearn.feature_extraction.text.CountVectorizer
sklearn.feature_extraction.text.TfidfVectorizer
# TF-IDF weighting: term frequency scaled down by document frequency, so
# words common to every email contribute less than discriminative ones.
# NOTE(review): this vectorizer is re-created identically a few lines below;
# one of the two instantiations is redundant.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer()
观察邮件与向量的关系
向量还原为邮件
# 4 文本特征选取即向量化,一个单词一种特征
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer() # 把原始文本转成特征矩阵
X_train = tfidf2.fit_transform(x_train)
X_test = tfidf2.transform(x_test)
print(X_train)
print("X_train.toarray()数组向量",X_train.toarray())
print("X_train.toarray()",X_train.toarray().shape)
print("X_test.toarray()",X_test.toarray().shape)
# Vocabulary statistics: gather every token across all emails, then report
# the total count and the number of distinct words.
email_txt = [word for email in email_data for word in email.split()]
print("总共有的单词数:", len(email_txt))
print("不重复的单词数:", len(set(email_txt)))
print("生成词袋:", tfidf2.vocabulary_)  # word -> column index mapping learned by fit
# Reconstruct email 0 from its TF-IDF vector.
import numpy as np
# Densify the first row once; the original recomputed X_train.toarray()
# three times just to look at row 0.
row0 = X_train.toarray()[0]
print("X_train.toarray()[0]:", row0)
# np.flatnonzero returns the flat indices of the non-zero entries.
a = np.flatnonzero(row0)
print("查看返回非零的个数:", a)
print(row0[a])
# Recover the words behind the non-zero features.
b = tfidf2.vocabulary_  # word -> column index
# Membership tests against an ndarray are O(n) each; a set makes the loop
# over the whole vocabulary O(1) per lookup.
nonzero_cols = set(a.tolist())
key_list = [key for key, value in b.items() if value in nonzero_cols]
print("非零中查看有用的词", key_list)
print("x_train[0]", x_train[0])
4.模型选择
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
说明为什么选择这个模型?
5.模型评价:混淆矩阵,分类报告
from sklearn.metrics import confusion_matrix
# BUG FIX: the original assigned the result back onto the name
# `confusion_matrix`, shadowing the imported function and breaking any later
# call to it. Bind the matrix to its own name instead.
# NOTE(review): y_predict is not defined in the visible code — presumably the
# model's predictions on x/X_test; confirm against the training step.
cm = confusion_matrix(y_test, y_predict)
说明混淆矩阵的含义
from sklearn.metrics import classification_report
说明准确率、精确率、召回率、F值分别代表的意义
6.比较与总结
如果用CountVectorizer进行文本特征生成,与TfidfVectorizer相比,效果如何?
CountVectorizer只统计词频,而TfidfVectorizer还要依据逆文档频率(IDF)对词频加权;当训练数据较少时,IDF的估计不稳定,因此此时用CountVectorizer进行文本特征生成往往更精确。
浙公网安备 33010602011771号