使用 jieba 分词提取热词、统计词频,并过滤停用词
import jieba
from collections import Counter
def load_stopwords(path="stop.txt"):
    """Read a stop-word list from *path*, one word per line.

    Trailing whitespace (including the newline) is stripped from each line.
    The file is decoded as UTF-8 and undecodable bytes are dropped.

    Args:
        path: Location of the stop-word file.

    Returns:
        A set of stop words, giving O(1) membership tests.

    Raises:
        OSError: If the file cannot be opened.
    """
    # `with` guarantees the handle is closed even if reading fails.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        return {line.rstrip() for line in handle}


def count_words(tokens, stopwords):
    """Count how often each relevant token occurs.

    A token is counted only if it is longer than one character, is not the
    two-character line break ``'\\r\\n'``, and is not in *stopwords*.

    Args:
        tokens: Iterable of string tokens (e.g. the generator returned by
            ``jieba.cut``).
        stopwords: Container of words to exclude.

    Returns:
        A ``Counter`` mapping each retained token to its frequency.
    """
    return Counter(
        token
        for token in tokens
        # '\r\n' has length 2, so it must be excluded explicitly.
        if len(token) > 1 and token != '\r\n' and token not in stopwords
    )


def main(text_path="boke.txt", stop_path="stop.txt", top_n=50):
    """Segment a text file with jieba and print its most frequent words.

    Args:
        text_path: UTF-8 text file to analyse.
        stop_path: UTF-8 stop-word file, one word per line.
        top_n: Number of most frequent words to print.

    Raises:
        OSError: If either input file cannot be opened.
    """
    with open(text_path, "r", encoding="utf-8", errors="ignore") as handle:
        text = handle.read()

    stopwords = load_stopwords(stop_path)

    # jieba.cut returns a lazy generator; called without cut_all it uses
    # the default (accurate) segmentation mode.
    counts = count_words(jieba.cut(text), stopwords)

    print('\n词频统计结果:')
    # Print the top_n most frequent words (50 by default).
    for word, freq in counts.most_common(top_n):
        print("%s:%d" % (word, freq))


if __name__ == '__main__':
    main()
参考了龙哥的代码,因为自己写的代码总是出现编码转换问题。


浙公网安备 33010602011771号