机器学习实战
第四章
例题4.6
过滤垃圾邮件
书中 4.6 Page 64
代码如下:
# _*_ coding: utf-8 _*_
# @Time : 2020/11/23 下午7:23
# @Author : 城市就是森林
# @Version:V 0.1
# @File : naive_bayes.py
# @desc : none
# @software: PyCharm
import random
import re
from os import listdir
import numpy as np
from sklearn.naive_bayes import MultinomialNB
def split_text(e_mail: str) -> list:
    """
    Split the text of an e-mail into lower-cased tokens.

    The text is cut at every run of non-word characters (anything that is
    not a letter, a digit or an underscore). Empty pieces produced by the
    split are dropped and the remaining tokens are lower-cased.

    :param e_mail: raw e-mail text
    :return: list of lower-cased tokens, in order of appearance
    """
    # Split on runs of non-word characters only. Consuming digits after a
    # separator as well would truncate tokens such as "1800flowers".
    return [token.lower() for token in re.split(r'\W+', e_mail) if token]
def read_email(e_mail_address: str) -> str:
    """
    Read one e-mail file and return its content as a string.

    The file is opened in text mode with the default encoding. A file that
    cannot be decoded yields the placeholder text 'Hi' instead of raising,
    so one badly encoded file does not abort the whole run.

    :param e_mail_address: path of the e-mail file
    :return: the file content, or 'Hi' when the file cannot be decoded
    """
    try:
        with open(e_mail_address, 'r') as e_mail_file:
            return e_mail_file.read()
    except UnicodeDecodeError:
        # The data set shipped with the book contains a wrongly encoded
        # file (6.txt); substitute a harmless placeholder for it.
        return 'Hi'
def creat_vocab_list(ham_filepath: str, spam_filepath: str) -> list:
    """
    Build the vocabulary from every e-mail in both directories.

    Each file is read with read_email and tokenised with split_text; the
    distinct tokens of all files form the vocabulary.

    :param ham_filepath: directory holding the non-spam (ham) e-mails
    :param spam_filepath: directory holding the spam e-mails
    :return: sorted list of the distinct tokens found in all e-mails
    """
    vocabulary = set()
    for directory in (ham_filepath, spam_filepath):
        for file_name in listdir(directory):
            file_content = read_email(directory + '/' + file_name)
            vocabulary.update(split_text(file_content))
    # The iteration order of a set of strings is not stable between
    # interpreter runs; sorting gives the vocabulary (and therefore the
    # feature columns built from it) a reproducible order.
    return sorted(vocabulary)
def word_to_bagsvector(vocab_list: list, input_list: list) -> list:
    """
    Convert a list of words into a bag-of-words count vector.

    :param vocab_list: vocabulary; position i of the result counts
        occurrences of vocab_list[i]
    :param input_list: words to be counted
    :return: list of counts with the same length as vocab_list

    A message is printed for every word of input_list that is not in the
    vocabulary; such words are not counted.
    """
    # Index the vocabulary once so each lookup below is a dict access
    # instead of two linear scans of the list. setdefault keeps the first
    # position of a repeated word, matching list.index().
    position = {}
    for index, vocab_word in enumerate(vocab_list):
        position.setdefault(vocab_word, index)

    word_bags = [0] * len(vocab_list)
    for word in input_list:
        index = position.get(word)
        if index is None:
            print('存在未检测到的词')
        else:
            word_bags[index] += 1
    return word_bags
def email_data(ham_file_path: str, spam_file_path: str, vocab_list: list) -> dict:
    """
    Turn every e-mail of both directories into a bag-of-words row.

    Ham e-mails fill the first rows of the matrix and spam e-mails the
    rows after them; the label list follows the same order.

    :param ham_file_path: directory holding the non-spam (ham) e-mails
    :param spam_file_path: directory holding the spam e-mails
    :param vocab_list: vocabulary that defines the matrix columns
    :return: dict with 'matrix' (one row per e-mail, one column per
        vocabulary word) and 'labels' (0 for ham, 1 for spam)
    """
    ham_names = listdir(ham_file_path)
    spam_names = listdir(spam_file_path)

    # One row per e-mail, one column per vocabulary word.
    email_matrix = np.zeros((len(ham_names) + len(spam_names), len(vocab_list)))
    labels = []  # 1 marks a spam e-mail, 0 a ham e-mail

    sources = ((ham_file_path, ham_names, 0), (spam_file_path, spam_names, 1))
    row = 0
    for directory, names, label in sources:
        for name in names:
            tokens = split_text(read_email(directory + '/' + name))
            email_matrix[row, :] = word_to_bagsvector(vocab_list=vocab_list,
                                                      input_list=tokens)
            labels.append(label)
            row += 1
    return {'matrix': email_matrix, 'labels': labels}
def classifier_training(traning_matrix: np.ndarray, labels: list) -> MultinomialNB:
    """
    Fit a naive Bayes classifier on bag-of-words data.

    The features are word counts, hence the multinomial variant of naive
    Bayes; the classifier is created with its default settings (Laplace
    smoothing by default).

    :param traning_matrix: training data, one row per e-mail
    :param labels: class label of every training row
    :return: the fitted classifier
    """
    # fit() returns the estimator it was called on.
    return MultinomialNB().fit(traning_matrix, labels)
def classifier_testing(testing_data: np.ndarray, clf: MultinomialNB):
    """
    Classify test data with a trained classifier.

    :param testing_data: test data, one row per e-mail
    :param clf: trained classifier
    :return: whatever clf.predict returns for testing_data
    """
    return clf.predict(testing_data)
def main(test_num=10, spam_mail_address='/Users/***',
         ham_mail_address='/Users/***') -> dict:
    """
    Run one hold-out experiment of the spam filter.

    Builds the vocabulary and the bag-of-words matrix from both e-mail
    directories, holds out test_num randomly chosen e-mails as the test
    set, trains on the rest and measures the misclassifications.

    :param test_num: number of e-mails drawn at random for the test set
    :param spam_mail_address: directory holding the spam e-mails
    :param ham_mail_address: directory holding the non-spam (ham) e-mails
    :return: dict with the keys 'wrong', 'spam_wrong' and 'ham_wrong': the
        number of misclassified test e-mails (all / spam only / ham only),
        each divided by test_num
    :raises ValueError: if test_num is smaller than 1, or (from
        random.sample) larger than the number of e-mails
    """
    if test_num < 1:
        # Fail before any file is read; the rates below divide by test_num.
        raise ValueError('test_num must be a positive integer')

    vocab_list = creat_vocab_list(ham_filepath=ham_mail_address,
                                  spam_filepath=spam_mail_address)
    dic = email_data(ham_file_path=ham_mail_address,
                     spam_file_path=spam_mail_address,
                     vocab_list=vocab_list)
    data_matrix = dic['matrix']
    labels = dic['labels']
    line = data_matrix.shape[0]  # total number of e-mails

    # Random hold-out split: the training rows are the complement of the
    # test rows.
    test_rows = sorted(random.sample(range(line), test_num))
    held_out = set(test_rows)
    train_rows = [row for row in range(line) if row not in held_out]

    test_set = data_matrix[test_rows]
    test_label = [labels[row] for row in test_rows]
    train_set = data_matrix[train_rows]
    train_label = [labels[row] for row in train_rows]

    clf = classifier_training(train_set, train_label)
    result = classifier_testing(test_set, clf)

    count = 0       # all misclassified test e-mails
    ham_wrong = 0   # ham e-mails classified as spam
    spam_wrong = 0  # spam e-mails classified as ham
    for predicted, actual in zip(result, test_label):
        if predicted != actual:
            count += 1
            if actual == 1:
                spam_wrong += 1
            else:
                ham_wrong += 1
    return {'wrong': count / test_num, 'spam_wrong': spam_wrong / test_num,
            'ham_wrong': ham_wrong / test_num}
if __name__ == '__main__':
    # Repeat the experiment 10 times and report the average error rates.
    repeat_num = 10
    error_rate = 0
    spam_rate = 0
    ham_rate = 0
    # The number of runs and the divisor below must be the same value,
    # otherwise the averages are biased.
    for _ in range(repeat_num):
        rate = main()
        error_rate += rate['wrong']
        spam_rate += rate['spam_wrong']
        ham_rate += rate['ham_wrong']
    average_error_rate = error_rate / repeat_num
    average_spam_rate = spam_rate / repeat_num
    average_ham_rate = ham_rate / repeat_num
    print('对10封垃圾邮件进行测试,测试10次的平均错误率是%.2f%%; 垃圾邮件分类错误率是'
          '%.2f%%; 非垃圾邮件分类错误率是%.2f%%' % (average_error_rate * 100,
                                       average_spam_rate * 100,
                                       average_ham_rate * 100))
浙公网安备 33010602011771号