import os
import jieba
import random
4 """
5 函数说明:中文文本处理
6 Parameters:
7 folder_path - 文本存放的路径
8 test_size - 测试集占比,默认占所有数据集的百分之20
9 Returns:
10 all_words_list - 按词频降序排序的训练集列表
11 train_data_list - 训练集列表
12 test_data_list - 测试集列表
13 train_class_list - 训练集标签列表
14 test_class_list - 测试集标签列表
15 """
def TextProcessing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)  # list the subfolders under folder_path
    data_list = []  # segmented documents
    class_list = []  # labels, one per document

    # iterate over every subfolder (one per category)
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)  # build the path of the subfolder
        files = os.listdir(new_folder_path)  # txt files inside the subfolder

        j = 1
        # iterate over every txt file
        for file in files:
            if j > 100:  # keep at most 100 txt samples per category
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as f:  # open the txt file
                raw = f.read()

            word_cut = jieba.cut(raw, cut_all=False)  # accurate mode, returns an iterable generator
            word_list = list(word_cut)  # convert the generator to a list

            data_list.append(word_list)
            class_list.append(folder)
            j += 1
    data_class_list = list(zip(data_list, class_list))  # zip the documents together with their labels
    random.shuffle(data_class_list)  # shuffle data_class_list
    index = int(len(data_class_list) * test_size) + 1  # index at which to split the training and test sets
    train_list = data_class_list[index:]  # training set
    test_list = data_class_list[:index]  # test set
    train_data_list, train_class_list = zip(*train_list)  # unzip the training set
    test_data_list, test_class_list = zip(*test_list)  # unzip the test set

    all_words_dict = {}  # word-frequency counts over the training set
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict:
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # sort the vocabulary by count in descending order
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list, all_words_nums = zip(*all_words_tuple_list)  # unzip
    all_words_list = list(all_words_list)  # convert to a list
    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list

if __name__ == '__main__':
    # text preprocessing
    folder_path = './SogouC/Sample'  # location of the training corpus
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(
        folder_path, test_size=0.2)
    print(all_words_list)
68 """
69 函数说明:读取文件里的内容,并去重
70 Parameters:
71 words_file - 文件路径
72 Returns:
73 words_set - 读取的内容的set集合
74 """
def MakeWordsSet(words_file):
    words_set = set()  # create the set
    with open(words_file, 'r', encoding='utf-8') as f:  # open the file
        for line in f.readlines():  # read line by line
            word = line.strip()  # strip the trailing newline
            if len(word) > 0:  # non-empty line, add it to words_set
                words_set.add(word)
    return words_set  # return the result
83 """
84 函数说明:文本特征选取
85 Parameters:
86 all_words_list - 训练集所有文本列表
87 deleteN - 删除词频最高的deleteN个词
88 stopwords_set - 指定的结束语
89 Returns:
90 feature_words - 特征集
91 """
def words_dict(all_words_list, deleteN, stopwords_set=set()):
    feature_words = []  # list of feature words
    n = 1
    for t in range(deleteN, len(all_words_list), 1):
        if n > 1000:  # cap feature_words at 1000 dimensions
            break
        # a word qualifies as a feature if it is not a number, not a stop word,
        # and its length is greater than 1 and less than 5
        if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set \
                and 1 < len(all_words_list[t]) < 5:
            feature_words.append(all_words_list[t])
            n += 1
    return feature_words
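
# Quick illustration of how deleteN skips the most frequent words; the vocabulary
# below is made up for the example (real input comes from TextProcessing), so this
# is only a sketch of the behaviour, not part of the news pipeline.
if __name__ == '__main__':
    toy_vocab = ['的', '了', '比赛', '股票', '2023', '经济', '上涨']  # invented, frequency-sorted vocabulary
    print(words_dict(toy_vocab, 2))  # drop the 2 most frequent words
    # expected: ['比赛', '股票', '经济', '上涨']  ('2023' is filtered out as a digit string)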

if __name__ == '__main__':
    # text preprocessing
    folder_path = './SogouC/Sample'  # location of the training corpus
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(
        folder_path, test_size=0.2)

    # build stopwords_set
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)
    feature_words = words_dict(all_words_list, 100, stopwords_set)
    print(feature_words)

    # print(data_list)
    # print(class_list)
if __name__ == '__main__':
    # text preprocessing
    folder_path = './SogouC/Sample'  # location of the training corpus
    TextProcessing(folder_path)
'''
Split all the documents into a training set and a test set, count the frequency of every
word in the training set, and sort the vocabulary in descending order of frequency.
Build a naive Bayes classifier with scikit-learn.
scikit-learn provides three naive Bayes classifier classes:
GaussianNB, MultinomialNB and BernoulliNB.
GaussianNB assumes the features follow a Gaussian distribution,
MultinomialNB assumes multinomially distributed (count) features,
and BernoulliNB assumes binary, Bernoulli-distributed features.
(A small sketch of the three variants follows the imports below.)
'''
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt
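
# Minimal sketch of the three classifiers mentioned above on made-up toy data;
# the arrays below are illustrative only and have nothing to do with the news dataset.
if __name__ == '__main__':
    from sklearn.naive_bayes import GaussianNB, BernoulliNB
    y = [0, 1, 0]  # toy labels
    X_counts = [[2, 0, 1], [0, 3, 0], [1, 1, 4]]  # word counts -> MultinomialNB
    X_binary = [[1, 0, 1], [0, 1, 0], [1, 1, 1]]  # word presence/absence -> BernoulliNB
    X_real = [[0.2, 1.3], [1.5, 0.1], [0.9, 0.8]]  # continuous features -> GaussianNB
    print(MultinomialNB().fit(X_counts, y).predict([[1, 0, 2]]))
    print(BernoulliNB().fit(X_binary, y).predict([[1, 0, 1]]))
    print(GaussianNB().fit(X_real, y).predict([[0.3, 1.0]]))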
134 """
135 函数说明:根据feature_words将文本向量化
136 Parameters:
137 train_data_list - 训练集
138 test_data_list - 测试集
139 feature_words - 特征集
140 Returns:
141 train_feature_list - 训练集向量化列表
142 test_feature_list - 测试集向量化列表
143 """
def TextFeatures(train_data_list, test_data_list, feature_words):
    def text_features(text, feature_words):  # set the entry to 1 if the feature word appears in the document
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list  # return the results
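
# Sanity check of the vectorization above on made-up word lists; the tokens and
# feature words here are invented purely to show the 0/1 presence encoding.
if __name__ == '__main__':
    toy_train = [['股票', '上涨'], ['球队', '比赛', '上涨']]  # two toy "documents"
    toy_test = [['比赛', '下跌']]
    toy_features = ['股票', '比赛', '上涨']
    print(TextFeatures(toy_train, toy_test, toy_features))
    # expected: ([[1, 0, 1], [0, 1, 1]], [[0, 1, 0]])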

"""
Function description: news classifier

Parameters:
    train_feature_list - vectorized feature vectors of the training set
    test_feature_list - vectorized feature vectors of the test set
    train_class_list - training-set labels
    test_class_list - test-set labels
Returns:
    test_accuracy - accuracy of the classifier on the test set
"""
def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list):
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)
    test_accuracy = classifier.score(test_feature_list, test_class_list)
    return test_accuracy

# if __name__ == '__main__':
#     # text preprocessing
#     folder_path = './SogouC/Sample'  # location of the training corpus
#     all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2)
#
#     # build stopwords_set
#     stopwords_file = './stopwords_cn.txt'
#     stopwords_set = MakeWordsSet(stopwords_file)

# sweep deleteN and record the test accuracy; this reuses the variables prepared
# in the main blocks above when the file is run as a script
test_accuracy_list = []
deleteNs = range(0, 1000, 20)  # 0, 20, 40, 60, ..., 980
for deleteN in deleteNs:
    feature_words = words_dict(all_words_list, deleteN, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    test_accuracy_list.append(test_accuracy)

plt.figure()
plt.plot(deleteNs, test_accuracy_list)
plt.title('Relationship of deleteNs and test_accuracy')
plt.xlabel('deleteNs')
plt.ylabel('test_accuracy')
plt.show()
if __name__ == '__main__':
    # text preprocessing
    folder_path = './SogouC/Sample'  # location of the training corpus
    all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(
        folder_path, test_size=0.2)

    # build stopwords_set
    stopwords_file = './stopwords_cn.txt'
    stopwords_set = MakeWordsSet(stopwords_file)

    test_accuracy_list = []
    feature_words = words_dict(all_words_list, 450, stopwords_set)
    train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words)
    test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list)
    test_accuracy_list.append(test_accuracy)
    ave = lambda c: sum(c) / len(c)
    print(ave(test_accuracy_list))  # report the average test accuracy