jieba分词
一、对《西游记》文档的分词
代码如下:
import jieba
import jieba
def takeSecond(elem):
    """Return the item at index 1 of *elem*.

    Used as a sort key so that (word, count) pairs are ordered by count.
    """
    second = elem[1]
    return second
def main():
    """Print the most frequent multi-character words in ``西游记.txt``.

    The text is read as UTF-8, segmented with ``jieba.lcut``, and
    single-character tokens are dropped.  Known nicknames of the main
    characters are folded into one canonical name before counting, so
    that e.g. "行者" and "悟空" both count towards "孙悟空".  Up to 20
    entries are printed, highest count first.
    """
    path = "西游记.txt"
    # The context manager closes the file even if read() raises.
    with open(path, "r", encoding="utf-8") as file:
        text = file.read()
    words = jieba.lcut(text)

    # Nickname -> canonical name.  Words not listed here count as themselves.
    aliases = {
        "大圣": "孙悟空",
        "老孙": "孙悟空",
        "行者": "孙悟空",
        "孙大圣": "孙悟空",
        "孙行者": "孙悟空",
        "猴王": "孙悟空",
        "悟空": "孙悟空",
        "齐天大圣": "孙悟空",
        "猴子": "孙悟空",
        "师父": "唐僧",
        "三藏": "唐僧",
        "圣僧": "唐僧",
        "呆子": "猪八戒",
        "八戒": "猪八戒",
        "老猪": "猪八戒",
        "沙和尚": "沙僧",
        "妖精": "妖怪",
        "妖魔": "妖怪",
        "妖道": "妖怪",
        "佛祖": "如来",
        "三太子": "白马",
    }

    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        rword = aliases.get(word, word)
        # Count under the canonical name; counting the raw word would
        # discard the alias merging done above.
        counts[rword] = counts.get(rword, 0) + 1

    items = list(counts.items())
    items.sort(key=takeSecond, reverse=True)
    # Slicing tolerates texts with fewer than 20 distinct words.
    for keyWord, count in items[:20]:
        print("{0:<10}{1:>5}".format(keyWord, count))
# Script entry point: run the word-frequency report.
main()
运行结果如下:
  一部           35
  三藏           30
  行者           28
  如来           20
  唐僧           18
  圣僧           18
  八戒           15
  师父           14
  佛祖           14
  大仙           12
  沙僧           10
  正是            9
  金刚            9
  四众            8
  山门            8
  东土            8
  菩萨            8  
  论经            8
  传经            7
  长老            7
 
                    
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号