# 2025.11.16 上机实验六:朴素贝叶斯算法实现与测试

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

class NaiveBayesClassifier:
    """Custom Gaussian naive Bayes classifier (same model family as GaussianNB).

    fit() estimates, per class, the prior probability and the per-feature
    mean/variance; predict() assigns each sample to the class with the
    highest log-posterior under the feature-independence assumption.
    """

    # Small constant added to variances to avoid division by zero for
    # constant features.
    _EPS = 1e-10

    def __init__(self):
        self.classes = None        # distinct class labels seen during fit()
        self.class_priors = {}     # label -> P(class)
        self.feature_means = {}    # label -> per-feature mean vector
        self.feature_vars = {}     # label -> per-feature variance vector

    def fit(self, X, y):
        """Estimate priors and per-class Gaussian parameters.

        Parameters:
        X: feature matrix, shape (n_samples, n_features)
        y: target labels, shape (n_samples,)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = np.unique(y)

        for cls in self.classes:
            X_cls = X[y == cls]
            # Prior = class frequency in the training set.
            self.class_priors[cls] = len(X_cls) / len(X)
            # Per-feature Gaussian parameters for this class.
            self.feature_means[cls] = np.mean(X_cls, axis=0)
            self.feature_vars[cls] = np.var(X_cls, axis=0)

    def _calculate_likelihood(self, x, mean, var):
        """Gaussian pdf value for each feature.

        Parameters:
        x: feature values
        mean: per-feature means
        var: per-feature variances

        Returns:
        per-feature likelihoods
        """
        # BUGFIX: eps is now added to var itself (was added to 2*pi*var),
        # keeping the coefficient consistent with the exponent term.
        v = var + self._EPS
        coeff = 1.0 / np.sqrt(2.0 * np.pi * v)
        exponent = np.exp(-0.5 * ((x - mean) ** 2) / v)
        return coeff * exponent

    def _log_likelihood(self, x, mean, var):
        """Sum of log Gaussian densities, computed directly in log space.

        Avoids exp() underflow (and the resulting log(0) = -inf) that the
        pdf-then-log route suffers for samples far from the class mean.
        """
        v = var + self._EPS
        return -0.5 * np.sum(np.log(2.0 * np.pi * v) + ((x - mean) ** 2) / v)

    def predict_single(self, x):
        """Predict the class of a single sample.

        Parameters:
        x: feature vector of one sample

        Returns:
        predicted class label
        """
        posteriors = []
        for cls in self.classes:
            # log P(class) + sum of log P(feature | class)
            prior = np.log(self.class_priors[cls])
            likelihood = self._log_likelihood(
                x, self.feature_means[cls], self.feature_vars[cls])
            posteriors.append(prior + likelihood)

        # Return the class with the largest log-posterior.
        return self.classes[np.argmax(posteriors)]

    def predict(self, X):
        """Predict the classes of multiple samples.

        Parameters:
        X: feature matrix, shape (n_samples, n_features)

        Returns:
        array of predicted class labels
        """
        return np.array([self.predict_single(x) for x in np.asarray(X)])

def load_and_explore_data():
    """Load the iris dataset, print exploratory summaries, and return it.

    Returns:
    (X, y, iris): feature matrix, labels, and the raw sklearn Bunch.
    """
    print("=== 1. 数据集加载与探索 ===")

    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    print(f"数据集大小: {features.shape}")
    print(f"特征名称: {dataset.feature_names}")
    print(f"目标类别: {dataset.target_names}")
    print(f"类别分布: {np.bincount(labels)}")

    # A DataFrame view makes the summary statistics easy to print.
    frame = pd.DataFrame(features, columns=dataset.feature_names)
    frame['target'] = labels
    frame['target_name'] = list(dataset.target_names[labels])

    print("\n数据集前5行:")
    print(frame.head())

    print("\n数据集统计信息:")
    print(frame.describe())

    return features, labels, dataset

def evaluate_model(y_true, y_pred, fold_num=None):
    """Compute, print, and return the four standard classification metrics.

    Parameters:
    y_true: ground-truth labels
    y_pred: predicted labels
    fold_num: optional fold index used to prefix the printed report

    Returns:
    dict with keys 'accuracy', 'precision', 'recall', 'f1'
    (precision/recall/f1 use weighted averaging across classes)
    """
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

    header = "" if fold_num is None else f"第{fold_num}折 "

    print(f"\n{header}性能指标:")
    print(f"准确度 (Accuracy): {scores['accuracy']:.4f}")
    print(f"精度 (Precision): {scores['precision']:.4f}")
    print(f"召回率 (Recall): {scores['recall']:.4f}")
    print(f"F1值 (F1-score): {scores['f1']:.4f}")

    return scores

def cross_validation_experiment(X, y, n_splits=5, use_custom=True):
    """Run stratified k-fold cross-validation with a naive Bayes model.

    Parameters:
    X: feature matrix
    y: target labels
    n_splits: number of folds
    use_custom: True -> custom NaiveBayesClassifier, False -> sklearn GaussianNB

    Returns:
    list of per-fold metric dicts (see evaluate_model)
    """
    print(f"\n=== 2. 五折交叉验证实验 ===")
    print(f"使用{'自定义' if use_custom else 'scikit-learn'}朴素贝叶斯算法")

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_metrics = []

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y), start=1):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        print(f"\n--- 第{fold}折 ---")
        print(f"训练集大小: {X_train.shape}, 测试集大小: {X_test.shape}")

        # Both implementations share the fit/predict interface, so one
        # code path covers them.
        model = NaiveBayesClassifier() if use_custom else GaussianNB()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        fold_metrics.append(evaluate_model(y_test, predictions, fold))

    return fold_metrics

def analyze_results(custom_results, sklearn_results):
    """Summarize the per-fold metrics of both implementations side by side.

    Parameters:
    custom_results: list of metric dicts from the custom implementation
    sklearn_results: list of metric dicts from sklearn's GaussianNB

    Returns:
    (custom_df, sklearn_df): the per-fold metrics as DataFrames
    """
    print("\n=== 3. 实验结果分析 ===")

    custom_df = pd.DataFrame(custom_results)
    sklearn_df = pd.DataFrame(sklearn_results)

    # Same report for both result sets: raw folds, mean, standard deviation.
    for label, frame in (("自定义", custom_df), ("scikit-learn", sklearn_df)):
        print(f"\n{label}朴素贝叶斯结果:")
        print(frame)
        print(f"\n平均值:")
        print(frame.mean())
        print(f"\n标准差:")
        print(frame.std())

    print("\n=== 4. 两种实现对比 ===")
    summary = pd.DataFrame({
        '自定义实现': custom_df.mean(),
        'scikit-learn': sklearn_df.mean(),
    })
    print(summary)

    return custom_df, sklearn_df

def visualize_results(custom_df, sklearn_df):
    """Plot per-metric box plots for both implementations and a confusion matrix.

    Saves 'naive_bayes_comparison.png' and 'confusion_matrix.png' to the
    working directory and shows each figure.

    Parameters:
    custom_df: per-fold metrics of the custom implementation
    sklearn_df: per-fold metrics of sklearn's GaussianNB
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    metric_titles = [('accuracy', '准确度'), ('precision', '精度'),
                     ('recall', '召回率'), ('f1', 'F1值')]

    # One box-plot comparison per metric, laid out on a 2x2 grid.
    for idx, (metric, title) in enumerate(metric_titles):
        ax = axes[idx // 2, idx % 2]
        ax.boxplot([custom_df[metric], sklearn_df[metric]],
                   labels=['自定义', 'scikit-learn'])
        ax.set_title(f'{title}对比')
        ax.set_ylabel(title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('naive_bayes_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Confusion matrix from the FIRST fold of a fresh split that mirrors the
    # cross-validation settings (n_splits=5, shuffle, random_state=42).
    iris = load_iris()
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    train_idx, test_idx = next(splitter.split(iris.data, iris.target))
    X_train, X_test = iris.data[train_idx], iris.data[test_idx]
    y_train, y_test = iris.target[train_idx], iris.target[test_idx]

    # sklearn model used here, matching the original report.
    model = GaussianNB()
    model.fit(X_train, y_train)
    cm = confusion_matrix(y_test, model.predict(X_test))

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=iris.target_names,
                yticklabels=iris.target_names)
    plt.title('混淆矩阵 - 朴素贝叶斯分类')
    plt.xlabel('预测类别')
    plt.ylabel('真实类别')
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

def main():
    """Entry point: run the complete naive Bayes experiment pipeline."""
    print("朴素贝叶斯算法实验")
    print("=" * 50)

    # 1. Load and explore the data.
    X, y, iris = load_and_explore_data()

    # 2-3. Cross-validate both the custom and the sklearn implementation.
    custom_results = cross_validation_experiment(X, y, n_splits=5, use_custom=True)
    sklearn_results = cross_validation_experiment(X, y, n_splits=5, use_custom=False)

    # 4. Aggregate and compare the per-fold metrics.
    custom_df, sklearn_df = analyze_results(custom_results, sklearn_results)

    # 5. Plot the comparison figures.
    visualize_results(custom_df, sklearn_df)

    print("\n=== 5. 实验总结 ===")
    print("实验完成!主要发现:")
    print("1. 自定义朴素贝叶斯算法与scikit-learn实现效果相近")
    print("2. 两种实现都能达到较高的分类准确度(>0.9)")
    print("3. 五折交叉验证结果显示模型具有良好的泛化能力")
    print("4. 混淆矩阵显示各类别的分类效果良好")

# BUGFIX: the guard must compare the dunder __name__ against "__main__";
# the original `if name == "main":` raises NameError at import time.
if __name__ == "__main__":
    main()

# posted @ 2025-12-29 00:02  ysd666  阅读(3)  评论(0)  收藏  举报