jieba分词
一、对《西游记》文档的分词
代码如下:
import jieba
import jieba
def takeSecond(elem):
    """Return the item at index 1 of ``elem``.

    Intended as a ``key=`` function, e.g. to order ``(word, count)``
    pairs by their count.

    Args:
        elem: Any indexable object with at least two items.

    Returns:
        ``elem[1]``.
    """
    return elem[1]
# Maps every alias that should be merged to the canonical name it is
# counted under. Words not listed here are counted under their own text.
_ALIASES = {
    "大圣": "孙悟空",
    "老孙": "孙悟空",
    "行者": "孙悟空",
    "孙大圣": "孙悟空",
    "孙行者": "孙悟空",
    "猴王": "孙悟空",
    "悟空": "孙悟空",
    "齐天大圣": "孙悟空",
    "猴子": "孙悟空",
    "师父": "唐僧",
    "三藏": "唐僧",
    "圣僧": "唐僧",
    "呆子": "猪八戒",
    "八戒": "猪八戒",
    "老猪": "猪八戒",
    "沙和尚": "沙僧",
    "妖精": "妖怪",
    "妖魔": "妖怪",
    "妖道": "妖怪",
    "佛祖": "如来",
    "三太子": "白马",
}


def main(path="西游记.txt", top_n=20, cut=None):
    """Print the most frequent words of a UTF-8 text file.

    The text is split into words, single-character words are dropped,
    known aliases are merged into one canonical name, and the ``top_n``
    most frequent names are printed as ``name`` (left-aligned, width 10)
    followed by the count (right-aligned, width 5).

    Args:
        path: Path of the UTF-8 encoded text file to analyse.
        top_n: Maximum number of entries to print. Fewer are printed
            when the text contains fewer distinct words.
        cut: Callable taking the whole text and returning an iterable of
            words. Defaults to ``jieba.lcut``.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    if cut is None:
        # Looked up lazily so the default remains the file-level jieba import.
        cut = jieba.lcut

    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding="utf-8") as file:
        text = file.read()

    counts = {}
    for word in cut(text):
        if len(word) == 1:
            continue
        # Count under the canonical name; counting the raw word here would
        # leave every alias as its own entry and defeat the merge.
        rword = _ALIASES.get(word, word)
        counts[rword] = counts.get(rword, 0) + 1

    # sorted() is stable, so equally frequent words keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # Slicing instead of indexing range(top_n) avoids an IndexError when
    # there are fewer than top_n distinct words.
    for keyWord, count in ranked[:top_n]:
        print("{0:<10}{1:>5}".format(keyWord, count))
# Script entry point: runs the word-frequency analysis as soon as this
# module is loaded (there is no __main__ guard, so importing it runs it too).
main()
运行结果如下(出现次数最多的前 20 个词及其次数)。注意:该输出中“三藏”“行者”“圣僧”“师父”“佛祖”等别名仍被分别统计,并未合并到“唐僧”“孙悟空”“如来”等统一名称下:
一部 35
三藏 30
行者 28
如来 20
唐僧 18
圣僧 18
八戒 15
师父 14
佛祖 14
大仙 12
沙僧 10
正是 9
金刚 9
四众 8
山门 8
东土 8
菩萨 8
论经 8
传经 7
长老 7

浙公网安备 33010602011771号