| 博客班级 https://edu.cnblogs.com/campus/ahgc/machinelearning |
| 作业要求 https://edu.cnblogs.com/campus/ahgc/machinelearning/homework/12085 |
| 作业目标 掌握常见的高斯模型,多项式模型和伯努利模型; |
| 学号 3180701235 |
| 二:实验目的 |
| 1.理解朴素贝叶斯算法原理,掌握朴素贝叶斯算法框架; |
| 2.掌握常见的高斯模型,多项式模型和伯努利模型; |
| 3.能根据不同的数据类型,选择不同的概率模型实现朴素贝叶斯算法; |
| 4.针对特定应用场景及数据,能应用朴素贝叶斯解决实际问题。 |
|
| 三:实验内容 |
| 1.实现高斯朴素贝叶斯算法。 |
| 2.熟悉sklearn库中的朴素贝叶斯算法; |
| 3.针对iris数据集,应用sklearn的朴素贝叶斯算法进行类别预测。 |
| 4.针对iris数据集,利用自编朴素贝叶斯算法进行类别预测。 |
|
| 四:实验报告要求 |
| 1.对照实验内容,撰写实验过程、算法及测试结果; |
| 2.代码规范化:命名规则、注释; |
| 3.分析核心算法的复杂度; |
| 4.查阅文献,讨论各种朴素贝叶斯算法的应用场景; |
| 5.讨论朴素贝叶斯算法的优缺点。 |
|
| 五:实验过程 |
| In [1]: |
|
| import numpy as np |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| %matplotlib inline |
|
| from sklearn.datasets import load_iris |
| from sklearn.model_selection import train_test_split |
|
| from collections import Counter |
| import math |
| In [2]: |
|
def create_data():
    """Load the iris dataset and return its first 100 rows as arrays.

    The rows are placed in a DataFrame whose columns are renamed to short
    names, with the target appended as a trailing ``label`` column.

    Returns:
        A ``(features, labels)`` tuple: every column except the last, and
        the last (``label``) column, of the first 100 rows.
    """
    column_names = [
        'sepal length', 'sepal width', 'petal length', 'petal width', 'label'
    ]
    dataset = load_iris()
    frame = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    frame['label'] = dataset.target
    frame.columns = column_names
    # Keep only the first 100 rows.
    # NOTE(review): presumably this restricts the data to two classes so the
    # task is binary - confirm against the iris row ordering.
    samples = np.array(frame.iloc[:100, :])
    features, labels = samples[:, :-1], samples[:, -1]
    return features, labels
| In [3]: |
|
| X, y = create_data() |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) |
| In [4]: |
|
| X_test[0], y_test[0] |
| In [5]: |
|
class NaiveBayes:
    """Gaussian naive Bayes classifier for numeric feature vectors.

    ``fit`` stores, for every class label, one ``(mean, stdev)`` pair per
    feature column. ``predict`` multiplies the per-feature Gaussian
    densities for each label and returns the label with the largest
    product. No class prior is applied: only the likelihoods are compared.
    """

    def __init__(self):
        # Populated by fit(): {label: [(mean, stdev), ...]}, one pair per
        # feature column.
        self.model = None

    @staticmethod
    def mean(X):
        """Return the arithmetic mean of the values in ``X``.

        Raises:
            ZeroDivisionError: if ``X`` is empty.
        """
        return sum(X) / float(len(X))

    def stdev(self, X):
        """Return the population standard deviation of ``X``.

        The squared deviations are divided by ``len(X)`` (not
        ``len(X) - 1``).
        """
        avg = self.mean(X)
        return math.sqrt(sum(pow(x - avg, 2) for x in X) / float(len(X)))

    def gaussian_probability(self, x, mean, stdev):
        """Return the Gaussian density of ``x`` for the given parameters.

        Raises:
            ZeroDivisionError: if ``stdev`` is 0, which happens when a
                feature is constant within a class.
        """
        exponent = math.exp(-(math.pow(x - mean, 2) /
                              (2 * math.pow(stdev, 2))))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

    def summarize(self, train_data):
        """Return ``[(mean, stdev), ...]`` with one pair per feature column.

        ``train_data`` is a sequence of rows; ``zip(*train_data)``
        transposes it so that each statistic is computed over one column.
        """
        return [(self.mean(column), self.stdev(column))
                for column in zip(*train_data)]

    def fit(self, X, y):
        """Estimate the per-class mean and standard deviation of each feature.

        Args:
            X: sequence of feature rows.
            y: sequence of class labels, aligned with ``X``.

        Returns:
            The status string ``'gaussianNB train done!'``.
        """
        # Group the training rows by their class label.
        data = {label: [] for label in set(y)}
        for row, label in zip(X, y):
            data[label].append(row)
        self.model = {
            label: self.summarize(rows)
            for label, rows in data.items()
        }
        return 'gaussianNB train done!'

    def calculate_probabilities(self, input_data):
        """Return ``{label: likelihood}`` for one feature vector.

        The likelihood of a label is the product of the Gaussian densities
        of each feature under that label's fitted ``(mean, stdev)``.
        ``input_data`` must provide at least as many values as there are
        fitted features; otherwise indexing it raises ``IndexError``.
        """
        probabilities = {}
        for label, summaries in self.model.items():
            likelihood = 1
            for i, (mean, stdev) in enumerate(summaries):
                likelihood *= self.gaussian_probability(
                    input_data[i], mean, stdev)
            probabilities[label] = likelihood
        return probabilities

    def predict(self, X_test):
        """Return the label with the highest likelihood for one sample.

        ``X_test`` is a single feature vector, not a batch. On an exact
        tie the label stored first in the model is returned.
        """
        probabilities = self.calculate_probabilities(X_test)
        return max(probabilities, key=probabilities.get)

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly.

        Raises:
            ZeroDivisionError: if ``X_test`` is empty.
        """
        right = sum(
            1 for X, y in zip(X_test, y_test) if self.predict(X) == y)
        return right / float(len(X_test))
| In [6]: |
|
| model = NaiveBayes() |
| In [7]: |
|
| model.fit(X_train, y_train) |
| In [8]: |
|
| print(model.predict([4.4, 3.2, 1.3, 0.2])) |
| In [9]: |
|
| model.score(X_test, y_test) |
| In [10]: |
|
| from sklearn.naive_bayes import GaussianNB |
| In [11]: |
|
| clf = GaussianNB() |
| clf.fit(X_train, y_train) |
| In [12]: |
|
| clf.score(X_test, y_test) |
| In [13]: |
|
| clf.predict([[4.4, 3.2, 1.3, 0.2]]) |
| In [14]: |
|
| from sklearn.naive_bayes import BernoulliNB, MultinomialNB # 伯努利模型和多项式模型 |
| 六:实验结果 |
![]() |