jieba分词

一、对《西游记》文档的分词

代码如下:

import jieba

import jieba


def takeSecond(elem):
    """Return the item at index 1 of ``elem``.

    Used as the ``key`` function when sorting ``(word, count)`` pairs,
    so that ordering is driven by the count.
    """
    second = elem[1]
    return second


def main():
    """Print the 20 most frequent multi-character words in 西游记.txt.

    The text is read as UTF-8 and segmented with ``jieba.lcut``.
    Single-character tokens are skipped. Known aliases of the main
    characters are folded into one canonical name before counting, so
    that e.g. "行者" and "悟空" both contribute to "孙悟空".

    Each output line is the word left-aligned in 10 columns followed by
    its count right-aligned in 5 columns. If the text yields fewer than
    20 distinct words, only those are printed.
    """
    # Alias -> canonical name. Words not listed here are counted as-is.
    aliases = {
        "大圣": "孙悟空",
        "老孙": "孙悟空",
        "行者": "孙悟空",
        "孙大圣": "孙悟空",
        "孙行者": "孙悟空",
        "猴王": "孙悟空",
        "悟空": "孙悟空",
        "齐天大圣": "孙悟空",
        "猴子": "孙悟空",
        "师父": "唐僧",
        "三藏": "唐僧",
        "圣僧": "唐僧",
        "呆子": "猪八戒",
        "八戒": "猪八戒",
        "老猪": "猪八戒",
        "沙和尚": "沙僧",
        "妖精": "妖怪",
        "妖魔": "妖怪",
        "妖道": "妖怪",
        "佛祖": "如来",
        "三太子": "白马",
    }

    path = "西游记.txt"
    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding="utf-8") as file:
        text = file.read()

    counts = {}
    for word in jieba.lcut(text):
        if len(word) == 1:
            continue
        rword = aliases.get(word, word)
        # Tally under the canonical name; counting the raw word would
        # leave every alias as its own entry and make the mapping useless.
        counts[rword] = counts.get(rword, 0) + 1

    items = sorted(counts.items(), key=takeSecond, reverse=True)

    # Slicing (instead of indexing 0..19) tolerates fewer than 20 entries.
    for keyWord, count in items[:20]:
        print("{0:<10}{1:>5}".format(keyWord, count))

# Script entry point: run the word-frequency report as soon as the file executes.
main()

运行结果如下:

  一部 35
  三藏 30
  行者 28
  如来 20
  唐僧 18
  圣僧 18
  八戒 15
  师父 14
  佛祖 14
  大仙 12
  沙僧 10
  正是 9
  金刚 9
  四众 8
  山门 8
  东土 8
  菩萨 8  
  论经 8
  传经 7
  长老 7

 

posted @ 2021-11-14 09:34  小魏子~  阅读(63)  评论(0)    收藏  举报