1 #!/usr/bin/env python
2 # encoding: utf-8
3
4 """
5 @author: zkjiang
6 @site: https://www.github.com
7 @software: PyCharm
8 @file: TFIDF.py
9 @time: 2019/2/2 12:33
10 """
11
12 import numpy as np
13
class TFIDF(object):
    """Minimal hand-written TF-IDF calculator.

    The corpus is a sequence of documents. A document is either an
    already-tokenized sequence of words, or a string, which is tokenized by
    stripping newline characters from both ends and splitting on single
    spaces.

    Attributes:
        vob: dict mapping each word to its total number of occurrences in
            the whole corpus (rebuilt by every ``get_vob_fre`` call).
        word_id: dict mapping each word to its column index, assigned in
            order of first appearance (rebuilt together with ``vob``).
        corpus: the documents passed to the constructor.
        smooth_idf: additive smoothing term used when computing the IDF.
    """

    def __init__(self, corpus, smooth_idf=0.01):
        """Store the corpus and the smoothing term.

        :param corpus: sequence of documents (token sequences or strings).
        :param smooth_idf: value added to both the number of documents and
            every document frequency before the IDF ratio is taken.
        """
        self.word_id = {}
        self.vob = {}
        self.corpus = corpus
        self.smooth_idf = smooth_idf

    def fit_transform(self, corpus):
        """Unimplemented placeholder: ignores ``corpus`` and returns None."""
        pass

    @staticmethod
    def _tokenize(document):
        """Return the tokens of one document (strings are split on spaces)."""
        if isinstance(document, str):
            return document.strip("\n").split(" ")
        return document

    def get_vob_fre(self):
        """Build the vocabulary and the per-document term-count matrix.

        Rebuilds ``self.vob`` (corpus-wide word counts) and ``self.word_id``
        (word -> column index) from scratch, so calling this method several
        times always yields the same result.

        :return: integer array of shape (n_documents, n_words) where entry
            (i, j) is the number of times word j occurs in document i. The
            counts are raw, i.e. not divided by the document length.
        """
        documents = [self._tokenize(document) for document in self.corpus]

        # Start from empty dicts: accumulating into the previous state would
        # double every count on a second call.
        word_counts = {}
        word_ids = {}
        for tokens in documents:
            for word in tokens:
                if word in word_counts:
                    word_counts[word] += 1
                else:
                    word_counts[word] = 1
                    word_ids[word] = len(word_ids)
        self.vob = word_counts
        self.word_id = word_ids

        # Count occurrences per document (not the corpus-wide total).
        counts = np.zeros((len(documents), len(word_ids)), dtype=int)
        for row, tokens in enumerate(documents):
            for word in tokens:
                counts[row, word_ids[word]] += 1
        return counts

    def get_tf_idf(self):
        """Compute the TF-IDF matrix for the stored corpus.

        With ``n`` documents, document frequency ``df`` and smoothing term
        ``s`` the inverse document frequency is
        ``idf = ln((n + s) / (df + s)) + 1``.

        :return: float array of shape (n_documents, n_words) equal to
            ``counts * idf / vocabulary_size``; an empty float array when the
            corpus contains no words.
        """
        counts = self.get_vob_fre()
        if not self.vob:
            # Nothing to weight, and the scaling below would divide by zero.
            return counts.astype(float)

        n_samples = counts.shape[0]
        # Document frequency: number of documents containing each word.
        df = (counts > 0).sum(axis=0).astype(float)
        # Apply the smoothing as a float; truncating it to int would turn
        # the default 0.01 into 0 and disable smoothing entirely.
        df += self.smooth_idf
        idf = np.log((n_samples + self.smooth_idf) / df) + 1
        # NOTE(review): the counts are scaled by the vocabulary size rather
        # than by each document's length; kept as in the original code --
        # confirm this is the intended normalisation.
        return counts * idf / len(self.vob)
91
92
93
if __name__ == '__main__':
    # Small demo: three pre-tokenized documents that share two words and
    # differ in their last one.
    documents = [
        ["我", "a", "e"],
        ["我", "a", "c"],
        ["我", "a", "b"],
    ]
    # Print the TF-IDF matrix (one row per document, one column per word).
    print(TFIDF(documents).get_tf_idf())