垃圾
#导入邮件数据
1
2
3
4
5
6
7
8
9
10
11
|
import csv file_path = r 'F:\Pycharm\11.22\SMSSpamCollection' sms = open (file_path, 'r' ,encoding = 'utf-8' ) sms_data = [] sms_label = [] csv_reader = csv.reader(sms,delimiter = '\t' ) for line in csv_reader: sms_label.append(line[ 0 ]) sms_data.append(preprocessing(line[ 1 ])) sms.close() sms_data |
#将数据分类并对模型进行类别预测
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
from sklearn.model_selection import train_test_split x_train, x_text, y_train, y_test = train_test_split(sms_data, sms_label, test_size = 0.3 , random_state = 0 , stratify = sms_label) from sklearn.feature_extraction.text import TfidfVectorizer #向量化 vectorizer = TfidfVectorizer(min_df = 2 ,ngram_range = ( 1 , 2 ),stop_words = 'english' ,strip_accents = 'unicode' ,norm = '12' ) X_train = vectorizer.fit_transform(x_train) X_text = vectorizer.transform(x_test) X_train a = X_train.toarray() print (a) for i in range ( 1000 ): for j in range ( 5984 ): if a[i,j]! = 0 : print (i,j,a[i,j]) from sklearn.navie_bayes import MultinomialNB #导入贝叶斯分类器 clf = MultinomialNB().fit(X_train,y_train) y_nb_pred = clf.predict(X_test) from sklearn.metrics import confusion_matrix #导入混淆矩阵 from sklearn.metrics import classification_report #分类报告 print (y_nb_pred.shape,y_nb_pred) #预测结果 print ( 'nb_confusion_matrix:' ) cm = confusion_matrix(y_test,y_nb_pred) print (cm) print ( 'nb_classification_report:' ) cr = classification_report(y_test,y_nb_pred) #分类指标文本报告 print (cr) feature_name = vectorizer.get_feature_name() coefs = clf_coef_ intercept = clf.intercept_ coefs_with_fns = sorted ( zip (coefs[ 0 ],feature_names)) #对数概率p(x_i|y)与单词x_i映射 n = 10 top = zip (coefs_with_fns[:n],coefs_with_fns[: - (n + 1 ): - 1 ]) #最大的10个单词与最小的10个单词 for (coef_1,fn_1),(coef_2,fn_2) in top: print ( '\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1,fn_1,coef_2,fn_2)) |
#运行结果