1 #!/usr/bin/python
2 # coding=utf-8
3 from sklearn.datasets import fetch_20newsgroups
4 from sklearn.model_selection import train_test_split
5 from sklearn.feature_extraction.text import TfidfVectorizer
6 from sklearn.naive_bayes import MultinomialNB
7
8
def nb_news():
    """Classify the 20 Newsgroups posts with a multinomial naive Bayes model.

    Loads the full 20 Newsgroups corpus, splits it into train and test
    sets, turns the raw text into TF-IDF features, fits ``MultinomialNB``
    and prints the predictions, an element-wise comparison against the
    true labels, and the accuracy on the test set.

    ``train_test_split`` is called without a ``random_state``, so the split
    (and therefore the printed numbers) may differ between runs.

    Returns:
        None. All results are written to standard output.
    """
    # Load the data (both the "train" and "test" subsets of the corpus).
    news = fetch_20newsgroups(subset="all")

    # Split documents and labels into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target)

    # Feature engineering: TF-IDF text features. The vectorizer is fitted on
    # the training documents only and then reused for the test documents.
    transfer = TfidfVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)

    # Fit the naive Bayes estimator.
    estimator = MultinomialNB()
    estimator.fit(x_train, y_train)

    # Model evaluation, method 1: compare predictions with the true labels.
    # Each message is formatted into a single string so that print(...) with
    # one argument behaves the same under Python 2 and Python 3 (the old
    # `print a, b` statement is a SyntaxError on Python 3).
    y_predict = estimator.predict(x_test)
    print("y_predict:\n%s" % (y_predict,))
    print("对比真实值和预测值:\n%s" % (y_test == y_predict,))

    # Method 2: compute the accuracy directly.
    score = estimator.score(x_test, y_test)
    print("准确率:\n%s" % (score,))
37
# Script entry point: run the classification demo as soon as the file is
# executed. NOTE(review): this also runs on import, since there is no
# `if __name__ == "__main__":` guard -- confirm whether that is intended.
nb_news()