TFIDF代码实现

 1 #!/usr/bin/env python 
 2 # encoding: utf-8 
 3 
 4 """
 5 @author: zkjiang
 6 @site: https://www.github.com
 7 @software: PyCharm
 8 @file: TFIDF.py
 9 @time: 2019/2/2 12:33
10 """
11 
12 import numpy as np
13 
class TFIDF(object):
    """Minimal hand-written TF-IDF calculator.

    Attributes:
        word_id: dict mapping each token to its column index; indices are
            assigned in order of first appearance in the corpus.
        vob: dict mapping each token to its total number of occurrences
            over the whole corpus.
        corpus: the documents. Each document is either a sequence of tokens
            or a string, which is split on single spaces.
        smooth_idf: smoothing constant added to both the document count and
            every document frequency before the IDF is computed.
    """

    def __init__(self, corpus):
        """Store the corpus and initialise empty vocabulary tables.

        :param corpus: iterable of documents (token lists or strings)
        """
        self.word_id = {}
        self.vob = {}
        self.corpus = corpus
        self.smooth_idf = 0.01

    def fit_transform(self, corpus):
        """Replace the stored corpus with ``corpus`` and return its TF-IDF matrix.

        :param corpus: iterable of documents (token lists or strings)
        :return: the array produced by :meth:`get_tf_idf` for ``corpus``
        """
        self.corpus = corpus
        return self.get_tf_idf()

    @staticmethod
    def _tokenize(document):
        """Return the tokens of one document.

        Strings have surrounding newlines stripped and are split on single
        spaces; any other value is assumed to be tokenised already and is
        returned unchanged.
        """
        if isinstance(document, str):
            return document.strip("\n").split(" ")
        return document

    def get_vob_fre(self):
        """Build the vocabulary and the per-document term-count matrix.

        ``self.vob`` and ``self.word_id`` are rebuilt from scratch on every
        call, so calling this method repeatedly does not inflate the counts.

        :return: integer array of shape (n_documents, n_vocabulary) whose
            entry ``[i, j]`` is the number of times token ``j`` occurs in
            document ``i``. Counts are left unnormalised.
        """
        # Start from empty tables; otherwise a second call would add the
        # same corpus on top of the previous totals.
        self.vob.clear()
        self.word_id.clear()

        documents = [self._tokenize(document) for document in self.corpus]

        # First pass: corpus-wide counts and column ids.
        for tokens in documents:
            for word in tokens:
                if word not in self.word_id:
                    self.word_id[word] = len(self.word_id)
                    self.vob[word] = 1
                else:
                    self.vob[word] += 1

        # Second pass: count each token inside its own document. Using the
        # corpus-wide total here would give every row the same value for a
        # token regardless of how often the document contains it.
        X = np.zeros((len(documents), len(self.word_id)), dtype=int)
        for row, tokens in enumerate(documents):
            for word in tokens:
                X[row, self.word_id[word]] += 1
        return X

    def get_tf_idf(self):
        """Compute the IDF weights and return the final TF-IDF matrix.

        The IDF of a token is ``log((n + s) / (df + s)) + 1`` where ``n`` is
        the number of documents, ``df`` the number of documents containing
        the token and ``s`` is ``self.smooth_idf``.

        NOTE(review): the result is scaled by the vocabulary size
        (``len(self.vob)``), not by each document's token count. That
        scaling is kept from the original implementation; confirm it is the
        intended normalisation.

        :return: float array of shape (n_documents, n_vocabulary)
        """
        X = self.get_vob_fre()
        n_samples = X.shape[0]

        # Document frequency: number of rows with a non-zero count per column.
        # Kept as float so the fractional smoothing term is not truncated.
        df = np.count_nonzero(X, axis=0).astype(float)
        df += self.smooth_idf
        idf = np.log((n_samples + self.smooth_idf) / df) + 1  # core formula
        return X * idf / len(self.vob)
91 
92 
93 
if __name__ == '__main__':
    # Demo: three pre-tokenised documents that share the tokens "" and "a"
    # and differ only in their last token.
    demo_documents = [
        ["", "a", "e"],
        ["", "a", "c"],
        ["", "a", "b"],
    ]
    demo_model = TFIDF(demo_documents)
    print(demo_model.get_tf_idf())

 

posted @ 2020-03-10 23:52  博二爷  阅读(645)  评论(1)  编辑  收藏  举报