机器学习实战 示例4.6

机器学习实战

第四章

例题4.6

过滤垃圾邮件

书中 4.6 Page 64

代码如下:

# _*_ coding: utf-8 _*_
# @Time : 2020/11/23 下午7:23
# @Author : 城市就是森林
# @Version:V 0.1
# @File : naive_bayes.py
# @desc : none
# @software: PyCharm
import random
import re
from os import listdir

import numpy as np
from sklearn.naive_bayes import MultinomialNB


def split_text(e_mail: str) -> list:
    """
    Break an e-mail body into lower-case tokens.

    The text is cut at every non-word character; a run of digits that
    directly follows such a separator is consumed together with it, so
    numbers that come right after a separator do not appear in the output.
    Empty pieces produced by adjacent separators are dropped.

    :parameter
        e_mail: raw e-mail text
    :return
        list of non-empty, lower-cased tokens in their original order
    """
    pieces = re.split(r'[\W*]\d*', e_mail)
    # re.split yields '' between two neighbouring separators; skip those.
    return [piece.lower() for piece in pieces if piece]


def read_email(e_mail_address: str) -> str:
    """
    Read one e-mail file and return its text.

    The file is opened in text mode with the platform default encoding.
    A file that cannot be decoded is not treated as fatal: the placeholder
    string 'Hi' is returned instead, so one badly encoded sample does not
    abort the whole run.

    :param
        e_mail_address: path of the e-mail file
    :return:
        the file contents, or 'Hi' when the file could not be decoded
    """
    try:
        with open(e_mail_address, 'r') as e:
            return e.read()
    except (UnicodeDecodeError, IndexError):
        # Several exception types must be given as a tuple; the former
        # `except UnicodeDecodeError or IndexError` only ever matched the
        # first type because `A or B` evaluates to `A`.
        return 'Hi'  # fallback for the badly encoded sample shipped with the book data


def creat_vocab_list(ham_filepath: str, spam_filepath: str) -> list:
    """
    Build the vocabulary from every e-mail found in both directories.

    Each file is read with read_email and tokenised with split_text; the
    distinct tokens of all files form the vocabulary.

    :parameter
        ham_filepath: directory holding the non-spam e-mails
        spam_filepath: directory holding the spam e-mails
    :return
        the distinct tokens as a sorted list (sorted so that the column
        order of the resulting feature vectors is the same on every run)
    """
    vocab = set()
    # Ham is scanned before spam, matching the order used elsewhere in the file.
    for folder in (ham_filepath, spam_filepath):
        for file_name in listdir(folder):
            file_content = read_email(folder + '/' + file_name)
            vocab.update(split_text(file_content))
    return sorted(vocab)


def word_to_bagsvector(vocab_list: list, input_list: list) -> list:
    """
    Convert a token list into a bag-of-words count vector.

    :param
        vocab_list: vocabulary; position i of the result counts vocab_list[i]
        input_list: tokens of one e-mail (hashable values, e.g. strings)
    :return
        list of ints with the same length as vocab_list; a token that is
        not in the vocabulary is reported on stdout and otherwise ignored
    """
    # Build the word -> index table once instead of scanning the vocabulary
    # list for every token. setdefault keeps the FIRST position of a word,
    # which is what list.index would have returned for a duplicated entry.
    position = {}
    for idx, vocab_word in enumerate(vocab_list):
        position.setdefault(vocab_word, idx)

    word_bags = [0] * len(vocab_list)
    for word in input_list:
        idx = position.get(word)
        if idx is None:
            print('存在未检测到的词')
        else:
            word_bags[idx] += 1
    return word_bags


def email_data(ham_file_path: str, spam_file_path: str, vocab_list: list) -> dict:
    """
    Turn every e-mail of both directories into one row of a count matrix.

    :param
        ham_file_path: directory holding the non-spam e-mails
        spam_file_path: directory holding the spam e-mails
        vocab_list: vocabulary that defines the matrix columns
    :return:
        dict with 'matrix' (numpy array, one row per e-mail and one column
        per vocabulary word) and 'labels' (list, 0 = non-spam, 1 = spam);
        the non-spam rows come first, followed by the spam rows
    """
    ham_files = listdir(ham_file_path)
    spam_files = listdir(spam_file_path)
    email_matrix = np.zeros((len(ham_files) + len(spam_files), len(vocab_list)))
    labels = []

    # (directory, file names, label) for each class, in row order.
    groups = ((ham_file_path, ham_files, 0), (spam_file_path, spam_files, 1))
    row = 0
    for folder, names, label in groups:
        for name in names:
            tokens = split_text(read_email(folder + '/' + name))
            email_matrix[row, :] = word_to_bagsvector(vocab_list=vocab_list,
                                                      input_list=tokens)
            labels.append(label)
            row += 1
    return {'matrix': email_matrix, 'labels': labels}


def classifier_training(traning_matrix: np.ndarray, labels: list) -> MultinomialNB:
    """
    Fit a multinomial naive Bayes classifier on bag-of-words counts.

    The estimator is created with its default settings (scikit-learn
    documents the default additive smoothing as alpha=1.0).

    :param
        traning_matrix: training count matrix, one row per sample
        labels: class label of each training row
    :return
        the fitted classifier
    """
    # fit() returns the estimator itself, so the call can be chained.
    return MultinomialNB().fit(traning_matrix, labels)


def classifier_testing(testing_data: np.ndarray, clf: MultinomialNB):
    """
    Predict the labels of the given samples with a trained classifier.

    :param
        testing_data: count matrix of the samples to classify
        clf: fitted classifier
    :return
        the result of clf.predict for testing_data
    """
    return clf.predict(testing_data)


def main(test_num=10, spam_mail_address='/Users/***',
         ham_mail_address='/Users/***') -> dict:
    """
    Run one hold-out experiment and report the misclassification rates.

    All e-mails are vectorised, test_num of them are drawn at random as the
    test set, the classifier is trained on the remaining ones and then
    evaluated on the held-out e-mails.

    :param
        test_num: number of e-mails held out for testing; must be at least 1
            and smaller than the total number of e-mails
        spam_mail_address: directory holding the spam e-mails
        ham_mail_address: directory holding the non-spam e-mails
    :return
        dict with 'wrong' (all errors), 'spam_wrong' (spam classified as
        non-spam) and 'ham_wrong' (non-spam classified as spam); each value
        is the error count divided by test_num
    :raises
        ValueError: if test_num does not leave both a non-empty test set
            and a non-empty training set
    """
    vocab_list = creat_vocab_list(ham_filepath=ham_mail_address,
                                  spam_filepath=spam_mail_address)
    dic = email_data(ham_file_path=ham_mail_address,
                     spam_file_path=spam_mail_address,
                     vocab_list=vocab_list)
    data_matrix = dic['matrix']
    labels = dic['labels']
    line = data_matrix.shape[0]  # total number of e-mails

    # Fail early with a clear message instead of a late ZeroDivisionError
    # (test_num == 0) or an error from sampling / fitting.
    if not 0 < test_num < line:
        raise ValueError('test_num must be between 1 and %d, got %r'
                         % (line - 1, test_num))

    # Held-out rows, in ascending order.
    random_email_list = sorted(random.sample(range(line), test_num))
    test_set = data_matrix[random_email_list]
    test_label = [labels[i] for i in random_email_list]

    # Every row that was not drawn for testing is used for training.
    held_out = set(random_email_list)
    train_list = [i for i in range(line) if i not in held_out]
    train_set = data_matrix[train_list]
    train_label = [labels[i] for i in train_list]

    clf = classifier_training(train_set, train_label)
    result = classifier_testing(test_set, clf)

    count = 0  # all misclassified e-mails
    ham_wrong = 0  # non-spam predicted as spam
    spam_wrong = 0  # spam predicted as non-spam
    for predicted, actual in zip(result, test_label):
        if predicted != actual:
            count += 1
            if actual == 1:
                spam_wrong += 1
            else:
                ham_wrong += 1
    return {'wrong': count / test_num, 'spam_wrong': spam_wrong / test_num,
            'ham_wrong': ham_wrong / test_num}


if __name__ == '__main__':
    # Repeat the hold-out experiment and report the average error rates.
    runs = 10  # must match the "10" stated in the printed message
    error_rate = 0
    spam_rate = 0
    ham_rate = 0
    # The loop count and the divisor below share `runs`; previously the loop
    # ran 9 times while the sums were divided by 10, understating every rate.
    for _ in range(runs):
        rate = main()
        error_rate += rate['wrong']
        spam_rate += rate['spam_wrong']
        ham_rate += rate['ham_wrong']

    average_error_rate = error_rate / runs
    average_spam_rate = spam_rate / runs
    average_ham_rate = ham_rate / runs
    print('对10封垃圾邮件进行测试,测试10次的平均错误率是%.2f%%; 垃圾邮件分类错误率是'
          '%.2f%%; 非垃圾邮件分类错误率是%.2f%%' % (average_error_rate * 100,
                                         average_spam_rate * 100,
                                         average_ham_rate * 100))

posted on 2020-11-26 21:22  城市就是森林  阅读(84)  评论(0)    收藏  举报