利用第三方库jieba进行中文分词

# -*- coding:utf-8 -*-
import jieba
# Read the whole corpus into memory. The context manager guarantees the file
# is closed even if read() raises.
with open('2018.txt', 'r', encoding="utf-8") as fp:
    text = fp.read()

# Segment the text into a list of tokens with jieba.
ls = jieba.lcut(text)

# Tokens excluded from the report: punctuation, the newline, the empty string
# and a handful of common single-character words. Kept as a frozenset so the
# membership test in the filter below is O(1) and duplicates are irrelevant.
special_char = frozenset([
    '\n', '（', '）', '，', '。', '', '、', '“', '”', '；',
    '较', '好', '了', '把', '上', '对', '等', '与', '为', '的',
    '在', '和', '新', '以',
])

# A token must occur at least this many times to be reported.
MIN_COUNT = 20

# Count occurrences of every token.
d = {}
for item in ls:
    d[item] = d.get(item, 0) + 1

# Keep only frequent tokens that are not in the exclusion set.
res = {k: v for k, v in d.items() if v >= MIN_COUNT and k not in special_char}

# Print "token:count" lines in ascending order of count; sorted() is stable,
# so tokens with equal counts keep their first-seen order.
for item in sorted(res, key=res.__getitem__):
    print("{}:{}".format(item, res[item]))

posted @ 2019-06-20 17:48 n0page404 阅读(133) 评论(0) 收藏举报

刷新页面返回顶部

n0page404

利用第三方库jieba进行中文分词

公告