【作业】jieba分词分析西游记

 1 import jieba
 2 txt=open('D:\桌面\西游记.txt',"r",encoding='utf-8').read()
 3 excludes={"什么","一个","那里","怎么","我们","不知","两个","甚么",\
 4           "不是","只见","原来","不敢","如何","这个","不曾","闻言",\
 5           "正是","只是","出来","一声", "真个", "不得", "这里", \
 6           "今日" ,"那个","不见"}
 7 
 8 words =jieba.lcut(txt)
 9 counts = {}
10 for word in words:
11     if len(word) == 1:
12         continue
13     elif word == "行者" or word == "大圣" or \
14          word =="老孙" or word =="悟空" or word =="孙行者":
15         rword = "孙悟空"
16     elif word == "师父" or word =="三藏" or word =="长老":
17         rword = "唐僧"
18     elif word =="呆子":
19         rword ="八戒"
20     elif word =="那怪" or word =="小妖" :
21         rword ="妖精"
22     else:
23         rword = word
24     counts[rword] = counts.get(rword,0) + 1
25 
26 for word in excludes:
27     del counts[word]
28 items = list(counts.items())
29 items.sort(key=lambda x:x[1], reverse = True)
30 for i in range(20):
31    word,count=items[i]
32    print("{0:<10}{1:>5}".format(word,count))

 

posted @ 2020-11-22 23:21  littlechang  阅读(346)  评论(0)    收藏  举报