# 2025.11.16上机实验六:朴素贝叶斯算法实现与测试
# (2025-11-16 Lab 6: Naive Bayes implementation and testing)
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
class NaiveBayesClassifier:
    """
    Custom Gaussian naive Bayes classifier (analogous to sklearn's GaussianNB).

    Each feature is assumed to follow, per class, an independent Gaussian
    distribution; prediction picks the class with the largest log-posterior
    log P(c) + sum_i log N(x_i | mean_ci, var_ci).
    """

    def __init__(self):
        self.classes = None        # unique class labels seen in fit()
        self.class_priors = {}     # class label -> prior probability P(c)
        self.feature_means = {}    # class label -> per-feature mean vector
        self.feature_vars = {}     # class label -> per-feature variance vector

    def fit(self, X, y):
        """
        Estimate class priors and per-class Gaussian parameters.

        Args:
            X: feature matrix, shape (n_samples, n_features)
            y: target labels, shape (n_samples,)

        Returns:
            self, to allow sklearn-style chaining (callers that ignore the
            return value are unaffected).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes = np.unique(y)
        for cls in self.classes:
            X_cls = X[y == cls]
            # Prior = fraction of training samples belonging to this class.
            self.class_priors[cls] = len(X_cls) / len(X)
            # Per-feature Gaussian parameters for this class.
            self.feature_means[cls] = np.mean(X_cls, axis=0)
            self.feature_vars[cls] = np.var(X_cls, axis=0)
        return self

    def _calculate_likelihood(self, x, mean, var):
        """
        Gaussian likelihood of feature value(s) x under N(mean, var).

        Args:
            x: feature value(s)
            mean: per-feature mean(s)
            var: per-feature variance(s)

        Returns:
            likelihood value(s), elementwise over features

        Note: kept for backward compatibility; prediction now works in
        log-space directly (see predict_single) to avoid underflow.
        """
        eps = 1e-10  # variance floor to avoid division by zero
        # Fix: apply eps to var consistently in both the coefficient and
        # the exponent (the original added eps outside the 2*pi*var product).
        coeff = 1.0 / np.sqrt(2.0 * np.pi * (var + eps))
        exponent = np.exp(-0.5 * ((x - mean) ** 2) / (var + eps))
        return coeff * exponent

    def predict_single(self, x):
        """
        Predict the class of a single sample.

        Args:
            x: feature vector of one sample

        Returns:
            the class label with the highest log-posterior
        """
        eps = 1e-10  # variance floor, matches _calculate_likelihood
        best_cls = None
        best_log_post = -np.inf
        for cls in self.classes:
            mean = self.feature_means[cls]
            var = self.feature_vars[cls] + eps
            # Compute the log-density directly instead of log(exp(...));
            # the original exp-then-log underflowed to log(0) = -inf (with
            # runtime warnings) for samples far from the class mean.
            log_likelihood = np.sum(
                -0.5 * np.log(2.0 * np.pi * var)
                - 0.5 * ((x - mean) ** 2) / var
            )
            log_post = np.log(self.class_priors[cls]) + log_likelihood
            # Strict '>' keeps the first maximum, matching np.argmax ties.
            if log_post > best_log_post:
                best_log_post = log_post
                best_cls = cls
        return best_cls

    def predict(self, X):
        """
        Predict the classes of multiple samples.

        Args:
            X: feature matrix, shape (n_samples, n_features)

        Returns:
            array of predicted class labels
        """
        return np.array([self.predict_single(x) for x in X])
def load_and_explore_data():
    """Load the iris dataset, print basic exploratory info, and return it.

    Returns:
        (X, y, iris): feature matrix, label vector, and the raw sklearn Bunch.
    """
    print("=== 1. 数据集加载与探索 ===")
    iris = load_iris()
    X, y = iris.data, iris.target
    # Basic shape / label summary.
    for info in (
        f"数据集大小: {X.shape}",
        f"特征名称: {iris.feature_names}",
        f"目标类别: {iris.target_names}",
        f"类别分布: {np.bincount(y)}",
    ):
        print(info)
    # DataFrame view for quick inspection of the raw samples.
    df = pd.DataFrame(X, columns=iris.feature_names)
    df['target'] = y
    df['target_name'] = [iris.target_names[label] for label in y]
    print("\n数据集前5行:")
    print(df.head())
    print("\n数据集统计信息:")
    print(df.describe())
    return X, y, iris
def evaluate_model(y_true, y_pred, fold_num=None):
    """Compute, print, and return classification metrics.

    Args:
        y_true: ground-truth labels
        y_pred: predicted labels
        fold_num: optional fold index, included in the printed header

    Returns:
        dict with 'accuracy', 'precision', 'recall', 'f1'
        (precision/recall/f1 are weighted averages).
    """
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }
    prefix = f"第{fold_num}折 " if fold_num is not None else ""
    print(f"\n{prefix}性能指标:")
    print(f"准确度 (Accuracy): {scores['accuracy']:.4f}")
    print(f"精度 (Precision): {scores['precision']:.4f}")
    print(f"召回率 (Recall): {scores['recall']:.4f}")
    print(f"F1值 (F1-score): {scores['f1']:.4f}")
    return scores
def cross_validation_experiment(X, y, n_splits=5, use_custom=True):
    """Run stratified k-fold cross-validation with one of the two NB models.

    Args:
        X: feature matrix
        y: target labels
        n_splits: number of folds
        use_custom: True -> custom NaiveBayesClassifier,
                    False -> sklearn GaussianNB

    Returns:
        list of per-fold metric dicts (as produced by evaluate_model).
    """
    print(f"\n=== 2. 五折交叉验证实验 ===")
    print(f"使用{'自定义' if use_custom else 'scikit-learn'}朴素贝叶斯算法")
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_metrics = []
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y), start=1):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        print(f"\n--- 第{fold}折 ---")
        print(f"训练集大小: {X_train.shape}, 测试集大小: {X_test.shape}")
        # Both implementations share the same fit/predict protocol.
        model = NaiveBayesClassifier() if use_custom else GaussianNB()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        fold_metrics.append(evaluate_model(y_test, y_pred, fold))
    return fold_metrics
def analyze_results(custom_results, sklearn_results):
    """Summarize per-fold metrics for both implementations and compare means.

    Args:
        custom_results: list of metric dicts from the custom classifier
        sklearn_results: list of metric dicts from sklearn's GaussianNB

    Returns:
        (custom_df, sklearn_df): the two result sets as pandas DataFrames.
    """
    print("\n=== 3. 实验结果分析 ===")
    custom_df = pd.DataFrame(custom_results)
    sklearn_df = pd.DataFrame(sklearn_results)
    # Same per-implementation summary, printed for both result sets.
    for header, df in (("自定义朴素贝叶斯结果:", custom_df),
                       ("scikit-learn朴素贝叶斯结果:", sklearn_df)):
        print("\n" + header)
        print(df)
        print("\n平均值:")
        print(df.mean())
        print("\n标准差:")
        print(df.std())
    print("\n=== 4. 两种实现对比 ===")
    comparison = pd.DataFrame({
        '自定义实现': custom_df.mean(),
        'scikit-learn': sklearn_df.mean()
    })
    print(comparison)
    return custom_df, sklearn_df
def visualize_results(custom_df, sklearn_df):
    """Visualize the experiment results.

    Produces two figures: per-metric boxplots comparing both implementations
    (saved to naive_bayes_comparison.png) and a confusion matrix for one CV
    fold of GaussianNB (saved to confusion_matrix.png).
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    metric_titles = [('accuracy', '准确度'), ('precision', '精度'),
                     ('recall', '召回率'), ('f1', 'F1值')]
    # axes.ravel() walks the 2x2 grid row-major, matching (i//2, i%2) order.
    for ax, (metric, title) in zip(axes.ravel(), metric_titles):
        ax.boxplot([custom_df[metric], sklearn_df[metric]],
                   labels=['自定义', 'scikit-learn'])
        ax.set_title(f'{title}对比')
        ax.set_ylabel(title)
        ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('naive_bayes_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()
    # Confusion matrix on the FIRST CV fold (same splitter seed as the
    # experiments) using the scikit-learn model.
    iris = load_iris()
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    train_idx, test_idx = next(splitter.split(iris.data, iris.target))
    X_train, X_test = iris.data[train_idx], iris.data[test_idx]
    y_train, y_test = iris.target[train_idx], iris.target[test_idx]
    model = GaussianNB()
    model.fit(X_train, y_train)
    cm = confusion_matrix(y_test, model.predict(X_test))
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=iris.target_names,
                yticklabels=iris.target_names)
    plt.title('混淆矩阵 - 朴素贝叶斯分类')
    plt.xlabel('预测类别')
    plt.ylabel('真实类别')
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()
def main():
    """Run the full experiment pipeline: load data, cross-validate both
    implementations, then analyze and visualize the results."""
    print("朴素贝叶斯算法实验")
    print("=" * 50)
    # 1. Load and explore the dataset.
    X, y, iris = load_and_explore_data()
    # 2-3. Five-fold CV with the custom and the sklearn classifiers.
    custom_results = cross_validation_experiment(X, y, n_splits=5, use_custom=True)
    sklearn_results = cross_validation_experiment(X, y, n_splits=5, use_custom=False)
    # 4. Per-implementation summaries and mean comparison.
    custom_df, sklearn_df = analyze_results(custom_results, sklearn_results)
    # 5. Boxplots and confusion matrix.
    visualize_results(custom_df, sklearn_df)
    print("\n=== 5. 实验总结 ===")
    for summary_line in (
        "实验完成!主要发现:",
        "1. 自定义朴素贝叶斯算法与scikit-learn实现效果相近",
        "2. 两种实现都能达到较高的分类准确度(>0.9)",
        "3. 五折交叉验证结果显示模型具有良好的泛化能力",
        "4. 混淆矩阵显示各类别的分类效果良好",
    ):
        print(summary_line)
# Fix: the scraped source lost the dunder underscores ('name'/'main'),
# which raises NameError at import time; restore the standard entry guard.
if __name__ == "__main__":
    main()

# 浙公网安备 33010602011771号 (stray web-page footer, not part of the program)