ruijiege

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::
#coding=utf-8
import numpy as np
import jieba

class TfIdf:
    """Compute a TF-IDF matrix for a small corpus of raw text documents.

    Documents are tokenized with ``jieba.cut``. The vocabulary is the sorted
    set of tokens that are not listed in the stop-word file.

    Attributes set by the methods below:
        doc:   list of token lists, one per input document (replaces the raw
               strings passed to the constructor).
        dic:   sorted vocabulary (stop words removed).
        tf:    array of term frequencies, one row per document and one
               column per vocabulary word.
        idf:   array of inverse document frequencies, one per vocabulary word.
        tfidf: element-wise product ``tf * idf`` (idf broadcast over rows).
    """

    def __init__(self, doc, stop_path='stop_word.txt'):
        """Tokenize the corpus and build the vocabulary.

        Args:
            doc: iterable of raw document strings.
            stop_path: path of a UTF-8 text file with one stop word per line.

        Raises:
            OSError: if the stop-word file cannot be opened.
        """
        self.doc = doc
        self.stop_path = stop_path
        self.get_dic()

    def get_dic(self):
        """Load stop words, tokenize ``self.doc`` and build ``self.dic``."""
        with open(self.stop_path, encoding="utf-8") as f:
            stop_dic = set(f.read().split("\n"))
        self.doc = [list(jieba.cut(sent)) for sent in self.doc]
        # Stop words are dropped from the vocabulary only; they remain in the
        # token lists and therefore still count toward each document's length.
        self.dic = sorted(
            {word for sent in self.doc for word in sent if word not in stop_dic}
        )

    def cal_tf(self):
        """Compute ``self.tf``: per-document term frequency, rounded to 4 places.

        A document that tokenizes to nothing gets an all-zero row instead of
        triggering a division by zero.
        """
        rows = []
        for sent in self.doc:
            total = len(sent)
            if total == 0:
                rows.append([0.0] * len(self.dic))
            else:
                rows.append([round(sent.count(word) / total, 4) for word in self.dic])
        self.tf = np.array(rows, dtype=float)

    def cal_idf(self):
        """Compute ``self.idf``: ``ln(N / df)`` per word, rounded to 4 places.

        ``N`` is the number of documents and ``df`` the number of documents
        containing the word.
        """
        # Build each document's token set once so membership tests do not
        # re-scan the token list for every vocabulary word.
        token_sets = [set(sent) for sent in self.doc]
        n_docs = len(token_sets)
        self.idf = np.array(
            [
                round(np.log(n_docs / sum(1 for tokens in token_sets if word in tokens)), 4)
                for word in self.dic
            ],
            dtype=float,
        )

    def cal_tfidf(self):
        """Compute ``self.tf``, ``self.idf`` and their product ``self.tfidf``."""
        self.cal_tf()
        self.cal_idf()
        self.tfidf = self.tf * self.idf

if __name__ == "__main__":
    # Demo: print the TF-IDF matrix for three short Chinese sentences.
    corpus = [
        '女排北京奥运会夺冠',
        '北京奥运会的羽毛球男单决赛',
        '中国队女排夺北京奥运会金牌重返巅峰观众欢呼女排女排女排',
    ]
    model = TfIdf(corpus)
    model.cal_tfidf()
    print(model.tfidf)

 

posted on 2021-11-30 17:07  哦哟这个怎么搞  阅读(27)  评论(0)  编辑  收藏  举报