机器学习之朴素贝叶斯算法
import random
from numpy import ones, log, array
def load_dataset():
    """Return the built-in toy corpus used by the demo.

    Returns:
        A ``(words, class_list)`` tuple. ``words`` is a list of six
        tokenized posts (each a list of str) and ``class_list`` is the
        parallel list of integer labels (0 or 1), one per post.
    """
    # Each post is kept next to its label so the pairing is visible at a glance.
    labelled_posts = [
        (["my", "dog", "has", "flea",
          "problems", "help", "please"], 0),
        (["maybe", "not", "take", "him", "to",
          "dog", "park", "stupid"], 1),
        (["my", "dalmation", "is", "so", "cute", "I", "love", "him"], 0),
        (["stop", "posting", "stupid", "worthless", "garbage"], 1),
        (["mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"], 0),
        (["quit", "buying", "worthless", "dog", "food", "stupid"], 1),
    ]
    words = [post for post, _ in labelled_posts]
    class_list = [label for _, label in labelled_posts]
    return words, class_list
def create_vocab_list(dataset):
    """Collect the distinct items that occur anywhere in ``dataset``.

    Args:
        dataset: Iterable of iterables (e.g. a list of tokenized documents).

    Returns:
        A list holding every distinct element exactly once. The order is
        whatever the underlying set yields, so it is not guaranteed.
    """
    unique_words = set()
    for document in dataset:
        # Deduplicate the document first, then merge it into the vocabulary.
        unique_words.update(set(document))
    return list(unique_words)
def vocab_list2vec(vocab_list, input_set):
    """Count how often each vocabulary word occurs in ``input_set``.

    Args:
        vocab_list: Sequence of vocabulary words. Position ``i`` of the
            result corresponds to ``vocab_list[i]``.
        input_set: Iterable of (hashable) words to count. Words that are
            not in the vocabulary are ignored.

    Returns:
        A list of int counts with the same length as ``vocab_list``.
    """
    # Build the word -> position table once, so each word costs one dict
    # lookup instead of two linear scans of the list (``in`` plus ``index``).
    first_index = {}
    for position, vocab_word in enumerate(vocab_list):
        # setdefault keeps the first occurrence, matching list.index().
        first_index.setdefault(vocab_word, position)

    vec = [0] * len(vocab_list)
    for word in input_set:
        position = first_index.get(word)
        if position is not None:
            vec[position] += 1
    return vec
def train_classify(train_mat, train_category):
    """Estimate per-class log word probabilities and the class-1 prior.

    Args:
        train_mat: Sequence of equal-length word-count vectors, one per
            training document.
        train_category: Sequence of labels parallel to ``train_mat``. Rows
            labelled 1 feed the class-1 estimate; every other row feeds
            the class-0 estimate.

    Returns:
        ``(p0_vect, p1_vect, p_abusive)``: two numpy arrays holding the log
        of each word's smoothed relative frequency in class 0 and class 1,
        and the fraction ``sum(train_category) / len(train_mat)``.
    """
    doc_total = len(train_mat)
    vocab_size = len(train_mat[0])
    p_abusive = sum(train_category) / float(doc_total)

    # Index 0 / 1 hold the accumulators of class 0 / class 1. Word counts
    # start at 1 and totals at 2.0, so no ratio is ever 0 and log() stays
    # finite for words a class never saw.
    word_counts = [ones(vocab_size), ones(vocab_size)]
    class_totals = [2.0, 2.0]
    for row_idx in range(doc_total):
        bucket = 1 if train_category[row_idx] == 1 else 0
        word_counts[bucket] += train_mat[row_idx]
        class_totals[bucket] += sum(train_mat[row_idx])

    p0_vect = log(word_counts[0] / class_totals[0])
    p1_vect = log(word_counts[1] / class_totals[1])
    return p0_vect, p1_vect, p_abusive
def classify(vec2classify, p0_vect, p1_vect, p_class):
    """Pick the class with the larger log score for one document vector.

    Args:
        vec2classify: Word-count vector of the document (numpy array).
        p0_vect: Per-word log probabilities for class 0.
        p1_vect: Per-word log probabilities for class 1.
        p_class: Prior probability of class 1.

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Each score is the count-weighted sum of log word probabilities plus
    # the log prior of that class.
    score_class1 = sum(vec2classify * p1_vect) + log(p_class)
    score_class0 = sum(vec2classify * p0_vect) + log(1 - p_class)
    return 1 if score_class1 > score_class0 else 0
def testing():
    """Train on the toy corpus and print predictions for two sample posts.

    Prints the learned class-0 and class-1 log-probability vectors and the
    class-1 prior, then the predicted class of each sample post.
    """
    words, class_list = load_dataset()
    vocab_list = create_vocab_list(words)
    train_mat = []
    for post in words:
        train_mat.append(vocab_list2vec(vocab_list, post))
    p0_v, p1_v, p_abusive = train_classify(train_mat, class_list)
    print(f"p0: {p0_v}\np1: {p1_v}\np_abusive: {p_abusive}")

    # Classify both sample posts with the same steps instead of repeating them.
    for test_entry in (["love", "my", "dalmation"], ["stupid", "garbage"]):
        doc = array(vocab_list2vec(vocab_list, test_entry))
        print(f"test_entry: {test_entry}, classify: {classify(doc, p0_v, p1_v, p_abusive)}")
def words_bag2vec(vocab_list, input_set):
    """Build a bag-of-words count vector for ``input_set``.

    Args:
        vocab_list: Sequence of vocabulary words. Slot ``i`` of the result
            counts occurrences of ``vocab_list[i]``.
        input_set: Iterable of (hashable) words. Words missing from the
            vocabulary are skipped.

    Returns:
        A list of int counts, one per vocabulary entry.
    """
    # One pass builds word -> slot, replacing the per-word ``in`` and
    # ``index`` scans of the list with a single dict lookup.
    slot_of = {}
    for slot, vocab_word in enumerate(vocab_list):
        if vocab_word not in slot_of:  # first occurrence wins, like list.index()
            slot_of[vocab_word] = slot

    vec = [0] * len(vocab_list)
    for word in input_set:
        if word in slot_of:
            vec[slot_of[word]] += 1
    return vec
def text_parse(string):
    """Split text into lowercase tokens longer than two characters.

    Args:
        string: Raw text to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance. The text is
        split on runs of non-word characters and tokens of length 2 or
        less (including empty strings) are dropped.
    """
    import re

    tokens = []
    for piece in re.split(r'\W+', string):
        if len(piece) > 2:
            tokens.append(piece.lower())
    return tokens
def spam_test():
    """Run a hold-out evaluation of the classifier on the e-mail corpus.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (label 0), holds out 10
    randomly chosen messages as the test set, trains on the remaining
    messages and prints the fraction of held-out messages that were
    misclassified.

    Raises:
        OSError: If any of the expected e-mail files cannot be opened.
    """
    doc_list = []
    class_list = []
    for number in range(1, 26):
        for folder, label in (("spam", 1), ("ham", 0)):
            # The files are only read, so plain "r" is enough; "r+" would
            # also demand write permission.
            with open(f"email/{folder}/{number}.txt", "r") as f:
                # append (not extend): each entry must be one whole
                # document so that doc_list lines up with class_list.
                doc_list.append(text_parse(f.read()))
            class_list.append(label)
    vocab_list = create_vocab_list(doc_list)

    # Move 10 random document indices from the training pool to the test set.
    train_list = list(range(len(doc_list)))
    test_list = []
    for _ in range(10):
        # randrange never returns its upper bound, whereas
        # int(random.uniform(0, n)) can occasionally yield n.
        test_list.append(train_list.pop(random.randrange(len(train_list))))

    train_mat = [vocab_list2vec(vocab_list, doc_list[i]) for i in train_list]
    train_category = [class_list[i] for i in train_list]
    p0_v, p1_v, p_abusive = train_classify(array(train_mat), array(train_category))

    error_count = 0
    for i in test_list:
        word_vec = vocab_list2vec(vocab_list, doc_list[i])
        # An error is a prediction that differs from the true label.
        if classify(array(word_vec), p0_v, p1_v, p_abusive) != class_list[i]:
            error_count += 1
    print(f"error rate: {float(error_count) / len(test_list)}")
if __name__ == "__main__":
    # Run the toy-corpus demo first, then the e-mail hold-out experiment.
    for demo in (testing, spam_test):
        demo()
其他朴素贝叶斯示例或者基于主流机器学习框架实现的朴素贝叶斯代码地址:
https://gitee.com/navysummer/machine-learning/tree/master/bayes

浙公网安备 33010602011771号