李航《统计学习方法》习题:朴素贝叶斯分类器(贝叶斯估计)的实现
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
author: jianbin
time:2022/10/26
"""
import numpy as np
# 构造NB分类器
def Train(X_train, Y_train, feature):
    """Fit a two-class naive Bayes model with Laplace (add-one) smoothing.

    Args:
        X_train: Sequence of samples; each sample is ``[x1, x2]`` where
            ``x1`` is compared against column 0 of ``feature`` and ``x2``
            against column 1.
        Y_train: Sequence of class labels, one per sample. A label equal to
            ``1`` is counted as the positive class; every other label is
            counted as negative for the prior.
        feature: Table with one row per admissible value; ``feature[j][0]``
            is the j-th value of the first feature and ``feature[j][1]`` the
            j-th value of the second. ``None`` selects the textbook table
            ``[[1, 'S'], [2, 'M'], [3, 'L']]``.

    Returns:
        A tuple ``(prior_probability, conditional_probability)``:
        ``prior_probability`` has shape ``(2,)`` (index 0 = label 1,
        index 1 = label -1); ``conditional_probability`` has shape
        ``(2, len(feature), 2)`` and is indexed ``[class][row][feature]``.

    Raises:
        ValueError: If ``X_train`` and ``Y_train`` differ in length.

    Side effects:
        Sets the module-level globals ``class_num`` and ``label`` (read by
        ``Predict``) and prints the priors and the per-class sample counts.
    """
    global class_num, label
    alpha = 1  # Laplace smoothing constant (Bayesian estimation)
    class_num = 2  # number of classes
    label = [1, -1]  # class labels, in the order used for array index 0/1

    if feature is None:
        feature = [[1, 'S'], [2, 'M'], [3, 'L']]
    if len(X_train) != len(Y_train):
        raise ValueError("X_train and Y_train must have the same length")

    # Use the caller's table instead of a hard-coded copy, so the number of
    # admissible values follows the argument.
    feature_len = len(feature)
    sample_total = len(Y_train)

    positive_count = sum(1 for y in Y_train if y == 1)
    negative_count = sample_total - positive_count
    class_label_num = [positive_count, negative_count]

    # Smoothed priors: (N_c + alpha) / (N + K * alpha).
    prior_probability = np.zeros(class_num)
    for i in range(class_num):
        prior_probability[i] = (class_label_num[i] + alpha) / (sample_total + class_num * alpha)
    # For the textbook data with alpha = 1 this prints 10/17 and 7/17.
    print("正负先验概率:", prior_probability[0], prior_probability[1])

    # Raw co-occurrence counts, indexed [class][value row][feature column].
    conditional_probability = np.zeros((class_num, feature_len, 2))
    for i, current_label in enumerate(label):
        for sample, target in zip(X_train, Y_train):
            if target != current_label:
                continue
            for j, row in enumerate(feature):
                if sample[0] == row[0]:
                    conditional_probability[i][j][0] += 1
                if sample[1] == row[1]:
                    conditional_probability[i][j][1] += 1

    print(class_label_num)

    # Smoothed conditionals: (count + alpha) / (N_c + S * alpha). The table
    # has one row per value of each feature, so S == feature_len for both.
    denominators = np.asarray(class_label_num, dtype=float).reshape(class_num, 1, 1)
    conditional_probability = (conditional_probability + alpha) / (denominators + feature_len * alpha)

    return prior_probability, conditional_probability
# 给定数据进行分类
def Predict(testset, prior_probability, conditional_probability, feature, labels=None):
    """Score one sample against every class with a trained naive Bayes model.

    Args:
        testset: The sample to classify, ``[x1, x2]``.
        prior_probability: Per-class priors as returned by ``Train``.
        conditional_probability: Array indexed ``[class][row][feature]`` as
            returned by ``Train``.
        feature: The value table used for training; ``feature[j][0]`` and
            ``feature[j][1]`` are matched against ``x1`` and ``x2``.
        labels: Class labels in the same order as the first axis of the
            probability arrays. When omitted, the module-level ``label``
            published by ``Train`` is used.

    Returns:
        A 2 x K array: row 0 holds ``P(x1|c) * P(x2|c) * P(c)`` for each
        class, row 1 holds the matching class labels.

    Raises:
        ValueError: If either component of ``testset`` does not occur in
            the corresponding column of ``feature``.
    """
    if labels is None:
        # Fall back to the labels Train stored as a module-level global.
        labels = label

    # Locate the table rows describing the sample's two feature values. The
    # whole table is scanned so the last matching row wins.
    first_row = None
    second_row = None
    for j, row in enumerate(feature):
        if row[0] == testset[0]:
            first_row = j
        if row[1] == testset[1]:
            second_row = j
    if first_row is None or second_row is None:
        raise ValueError("testset contains a value that is not listed in feature")

    result = np.zeros(len(labels))
    for i in range(len(labels)):
        conditionalA = conditional_probability[i][first_row][0]
        conditionalB = conditional_probability[i][second_row][1]
        result[i] = conditionalA * conditionalB * prior_probability[i]

    # Stack the scores on top of their labels so callers see both together.
    result = np.vstack([result, labels])
    return result
def main():
    """Train on the 15-sample textbook data set and classify ``[2, 'S']``."""
    # First and second feature of every training sample, kept side by side.
    first_feature = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    second_feature = ['S', 'M', 'M', 'S', 'S', 'S', 'M', 'M', 'L', 'L',
                      'L', 'M', 'M', 'L', 'L']
    X_train = [[number, letter] for number, letter in zip(first_feature, second_feature)]
    Y_train = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]

    # 3 x 2 table: one row per admissible value of each feature.
    feature = [[1, 'S'], [2, 'M'], [3, 'L']]
    testset = [2, 'S']

    model = Train(X_train, Y_train, feature)
    prior_probability, conditional_probability = model
    print(Predict(testset, prior_probability, conditional_probability, feature))
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()
参考博客:https://cloud.tencent.com/developer/article/1505695
浙公网安备 33010602011771号