【Python小随笔】词频统计

    import collections
    from snownlp import SnowNLP
    def word_counts_action(self, text, top_number):
        """
        Return the most frequent words of ``text`` after stop-word filtering.

        Symbols from the removal list are blanked out of the raw text, the
        text is segmented with SnowNLP, and tokens that are themselves stop
        words are dropped before counting.

        :param text: text to analyse; ``None`` or blank text yields ``[]``
        :param top_number: how many of the most frequent words to return
        :return: list of ``(word, count)`` tuples, most frequent first
        """
        # Custom removal list: stop words and symbols, kept exactly as authored.
        remove_words = [
            u'的', u'，', u'!', u'和', u'是', u'随着', u'对于', u'对', u'等', u'能', u'都', u'。', u'！',
            u'你', u'|', u'一', u'不', u'！,', u'了', u'（', u'我', u'看', u'题', u' ', u'、', u'中', u'在',
            u'】', u',【', u'但', u',', u'通常', u'如果', u'我们', u'需要', u'： ', u'）, ',
            u'：', u'）,', u'｜', u'？', u'-', u'【', u'）', u',：', u'个', u'语', u'最', u'这', u'讲', u'年',
            u'+', u'人', u'/', u'?', u'？,', u'！！！！！', u'。,', u'~,', u'》,'
        ]
        if not text:
            return []

        # Entries holding a letter, digit or ideograph are stop words. They
        # are matched against whole tokens after segmentation: deleting them
        # from the raw text as substrings would mangle longer words that
        # merely contain them, and would make the result depend on list order.
        stop_words = {w for w in remove_words if any(ch.isalnum() for ch in w)}
        # Every remaining character in the list is punctuation or whitespace.
        symbols = {ch for w in remove_words for ch in w if not ch.isalnum()}

        # Blank symbols out with a space instead of deleting them, so the
        # words on either side stay separated for the segmenter.
        text = text.translate({ord(ch): u' ' for ch in symbols})
        if not text.strip():
            # Nothing left to count; also avoids handing SnowNLP a blank
            # document (NOTE(review): presumably it rejects empty input).
            return []

        # Segment, then drop empty tokens and stop words.
        tokens = (w.strip() for w in SnowNLP(text).words)
        clean_words = [w for w in tokens if w and w not in stop_words]
        # Count word frequencies.
        word_counts = collections.Counter(clean_words)
        return word_counts.most_common(top_number)

posted @ 2022-05-06 21:23 PythonNew_Mr.Wang Views(73) Comments(0) 收藏举报

刷新页面返回顶部

PythonNew_Mr.Wang

【Python小随笔】词频统计

公告