多项式朴素贝叶斯API
- 
from sklearn.naive_bayes import MultinomialNB
 MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
- 
alpha:拉普拉斯平滑系数,默认1,一般不需要改变 
实战
- 
使用fetch_20newsgroups中的数据,包含了20个主题的18000个新闻组的帖子 - 
流程: from sklearn.naive_bayes import MultinomialNB
 import sklearn.datasets as datasets
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split- 
加载20类新闻数据,并进行样本分隔 news = datasets.fetch_20newsgroups(subset='all')
 feature = news.data # 返回的是列表,列表中为一篇篇文章
 target = news.target # 返回的ndarray,储存的是每一篇文章的类别
 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.2)
- 
生成文章特征词 tf = TfidfVectorizer() # 实例工具类
 x_train = tf.fit_transform(x_train) # 返回训练集所有文章中每个词的重要性
 x_test = tf.transform(x_test)     # 返回测试集所有文章中每个词的重要性（复用训练集的词表，不能再fit）
 # print(x_train)
 # print(tf.get_feature_names()) # 所有文章中出现的词语
 print(x_train.toarray().shape)
 
 (15076, 141989)
 
 Process finished with exit code 0
- 
使用模型进行文章分类 mlt = MultinomialNB(alpha=1)
 mlt.fit(x_train, y_train)
 y_predict = mlt.predict(x_test)
 print('预测文章类别为:', y_predict)
 print('真实文章类别为:', y_test)
 print('准确率为:', mlt.score(x_test, y_test))
 
- 
 
- 
- 
总
 from sklearn.naive_bayes import MultinomialNB
 import sklearn.datasets as datasets
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import train_test_split
 
 
 # Load the 20-newsgroups data (downloads on first use) and split into train/test sets
 news = datasets.fetch_20newsgroups(subset='all')
 feature = news.data # list of raw article texts, one string per article
 target = news.target # ndarray holding each article's class label (0-19)
 print(len(feature))
 print(len(target))
 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.2)
 
 # Build TF-IDF features from the article texts
 tf = TfidfVectorizer() # instantiate the TF-IDF vectorizer
 x_train = tf.fit_transform(x_train) # learn the vocabulary from the training articles and weight each word
 x_test = tf.transform(x_test) # transform only: reuse the training vocabulary so feature dims match
 # print(x_train)
 # print(tf.get_feature_names()) # vocabulary of all articles (NOTE: removed in sklearn>=1.2; use get_feature_names_out())
 print(x_train.shape)
 print(x_test.shape)
 # Train a multinomial naive Bayes classifier and evaluate it on the held-out split
 
 mlt = MultinomialNB(alpha=1)
 mlt.fit(x_train, y_train)
 y_predict = mlt.predict(x_test)
 print('预测文章类别为:', y_predict)
 print('真实文章类别为:', y_test)
 print('准确率为:', mlt.score(x_test, y_test))
 
 18846
 18846
 (15076, 147050)
 (3770, 147050)
 预测文章类别为: [ 0 15 2 ... 18 17 11]
 真实文章类别为: [ 5 19 2 ... 18 17 11]
 准确率为: 0.8562334217506631
 
 Process finished with exit code 0
期间运行时出现问题，排查后发现原因出在生成文章特征词时，
将
x_train = tf.fit_transform(x_train)     # 返回训练集所有文章中每个词的重要性
x_test = tf.transform(x_test)     # 返回测试集所有文章中每个词的重要性
写为
x_train = tf.fit_transform(x_train)     # 返回训练集所有文章中每个词的重要性
x_test = tf.fit_transform(x_test)     # 返回测试集所有文章中每个词的重要性
- 
作用到手写数字数据集 # 作用到手写数字数据集（load_digits 是手写数字而非鸢尾花）
 from sklearn.datasets import load_digits
 
 
 # load_digits is the handwritten-digits dataset (1797 samples of 8x8 grayscale
 # images, 10 classes) -- NOT the iris dataset, despite the original label.
 # Its features are non-negative pixel counts, which suits MultinomialNB.
 digits = load_digits()
 X, Y = digits.data, digits.target
 # Fixed random_state so the reported score is reproducible
 x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=420)
 
 mnb = MultinomialNB()
 mnb.fit(x_train, y_train)
 
 # Accuracy on the held-out 20% split
 acc_score = mnb.score(x_test, y_test)
 print("手写数字模型分数:", acc_score)
 
 手写数字模型分数: 0.9138888888888889
- 
注意: - 
fit_transform()做了两件事:fit找到数据转换规则,并将数据标准化 
- 
transform：将数据按照训练数据上 fit 得到的同一套规则进行转换（如归一化、标准化），得到特征向量。测试数据应直接复用训练集学到的转换规则，因此不需要再调用 fit_transform()，否则两次分别拟合得到的数据格式（如词表维度）就不一致了 
 
- 
 
                    
                 
 
                
            
浙公网安备 33010602011771号