Automatically extracting words, and using the tf-idf algorithm to analyze how different texts differ

This program adapts and combines source code from two other authors. It does three things:
1. Automatically extracts candidate words from a large body of raw text.
2. Given several documents, automatically extracts each document's keywords with the tf-idf algorithm.
3. Applied to QQ group chat logs, the extracted keywords summarize what each group mainly discusses.

For the theory behind step 1, see "Sociolinguistics in the Internet Age: Text Data Mining Based on SNS" (in Chinese): http://www.matrix67.com/blog/archives/5044
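
In short, the extractor scores every substring of the text on two signals: internal cohesion (the string occurs far more often than its most probable two-way split would predict) and boundary entropy (the characters seen to its left and right are varied). Below is a minimal sketch of the cohesion test with invented counts; the strings and numbers are purely illustrative, whereas in wordmaker.py they come from the input text.

# -*- coding: utf-8 -*-
# Toy cohesion test (invented counts, illustration only).
freq = {u'电影院': 30, u'电影': 120, u'影院': 40, u'电': 300, u'院': 200}
total = 100000  # assumed total number of counted substrings

def prob(s):
    return float(freq[s]) / total

word = u'电影院'
# The most probable way to explain the string as two independent parts.
best_split = max(prob(word[:i]) * prob(word[i:]) for i in range(1, len(word)))
print prob(word) / best_split  # 125.0 -- make_words keeps words scoring above 100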

wordmaker.py

# -*- coding=utf-8 -*-
import re
import collections
import math
import codecs
  
# adapted from https://gist.github.com/lastland/3322018
  
def info_entropy(words):
    """Shannon entropy of a neighbour-character frequency distribution."""
    result = 0
    total = sum(words.itervalues())
    for word, cnt in words.iteritems():
        p = float(cnt) / total
        result -= p * math.log(p)
    return result
  
max_word_len = 5       # longest candidate word, in characters
entropy_threshold = 1  # minimum left/right neighbour entropy for a real word
  
def make_words(content, filename):
    # Split on whitespace, punctuation and ASCII alphanumerics, leaving
    # runs of CJK characters as the "sentences" to mine.
    sentences = re.split(r"\W+|[a-zA-Z0-9]+", content, 0, re.UNICODE)
    # Count every substring of length 1..max_word_len as a candidate word.
    freq = collections.Counter()
    for sentence in sentences:
        if sentence:
            l = len(sentence)
            wl = min(l, max_word_len)
            for i in range(1, wl + 1):
                for j in range(0, l - i + 1):
                    freq[sentence[j:j + i]] += 1
    total = sum(freq.itervalues())
    ps = collections.defaultdict(float)
    for word, val in freq.iteritems():
        ps[word] = float(val) / total
  
    # Cohesion filter: keep a candidate only if it is far more frequent
    # than its most probable two-way split would predict.
    words = set()
    for word, word_p in ps.items():  # .items(): defaultdict lookups below may grow ps
        if len(word) > 1:
            p = 0
            for i in range(1, len(word)):
                t = ps[word[0:i]] * ps[word[i:]]
                p = max(p, t)
            if freq[word] >= 3 and word_p / p > 100:
                words.add(word)
  
    # Boundary-entropy filter: a free-standing word should have varied
    # neighbours on both sides; a low-entropy side means the candidate is
    # usually glued to the same character, i.e. a fragment of a longer word.
    final_words = set()
    for word in words:
        lf = rf = True
        left_words = collections.Counter()
        right_words = collections.Counter()
        pattern = re.compile(word.join(['.?', '.?']))
        for sentence in sentences:
            l = pattern.findall(sentence)
            if l:
                if l[0][0] != word[0]:
                    left_words[l[0][0]] += 1
                else:
                    lf = False  # matched at a sentence start; skip the left test
                if l[0][-1] != word[-1]:
                    right_words[l[0][-1]] += 1
                else:
                    rf = False  # matched at a sentence end; skip the right test
        left_info_entropy = info_entropy(left_words)
        right_info_entropy = info_entropy(right_words)
        if lf and len(left_words) > 0 and left_info_entropy < entropy_threshold:
            continue
        if rf and len(right_words) > 0 and right_info_entropy < entropy_threshold:
            continue
        final_words.add(word)
    # Sort the surviving words by descending corpus frequency.
    words_list = sorted(final_words, key=lambda w: freq[w], reverse=True)
      
    # Write "word count" lines, most frequent first, and return the counts.
    final_freq = collections.Counter()
    out_file = codecs.open(filename, 'w', 'utf-8')
    for word in words_list:
        v = freq[word]
        out_file.write(u"%s %d\n" % (word, v))
        final_freq[word] = v
    out_file.close()

    return final_freq
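
A minimal way to exercise make_words on its own (sample.txt is just a placeholder name for any UTF-8 text file):

# -*- coding: utf-8 -*-
import codecs
import wordmaker

text = codecs.open('sample.txt', 'r', 'utf-8').read()
freq = wordmaker.make_words(text, 'sample_words.txt')
for word, count in freq.most_common(10):  # ten most frequent extracted words
    print word, count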

tfidf.py

#!/usr/bin/env python
#
# Copyright 2009  Niniane Wang (niniane@gmail.com)
# Reviewed by Alex Mendes da Costa.
#
# This is a simple Tf-idf library.  The algorithm is described in
#   http://en.wikipedia.org/wiki/Tf-idf
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# Tfidf is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details:
#
#   http://www.gnu.org/licenses/lgpl.txt
  
__author__ = "Niniane Wang"
__email__ = "niniane at gmail dot com"
  
import math
import re
from operator import itemgetter
import collections
import codecs
  
# adapted from http://code.google.com/p/tfidf/
  
class TfIdf:
  
  """Tf-idf class implementing http://en.wikipedia.org/wiki/Tf-idf.
    
     The library constructs an IDF corpus and stopword list either from
     documents specified by the client, or by reading from input files.  It
     computes IDF for a specified term based on the corpus, or generates
     keywords ordered by tf-idf for a specified document.
  """
  
  def __init__(self, corpus_filename = None, stopword_filename = None,
               DEFAULT_IDF = 1.5):
    """Initialize the idf dictionary. 
      
       If a corpus file is supplied, reads the idf dictionary from it, in the
       format of:
         # of total documents
         term: # of documents containing the term
  
       If a stopword file is specified, reads the stopword list from it, in
       the format of one stopword per line.
  
       The DEFAULT_IDF value is returned when a query term is not found in the
       idf corpus.
    """
    self.num_docs = 0
    self.term_num_docs = {}     # term : num_docs_containing_term
    self.stopwords = []
    self.idf_default = DEFAULT_IDF
  
    if corpus_filename:
      corpus_file = open(corpus_filename, "r")

      # Load the total number of documents from the first line.
      line = corpus_file.readline()
      self.num_docs = int(line.strip())

      # Read "term: frequency" from each subsequent line, matching the
      # format written by save_corpus_to_file below.
      for line in corpus_file:
        tokens = line.rsplit(":", 1)
        term = tokens[0].strip()
        frequency = int(tokens[1].strip())
        self.term_num_docs[term] = frequency
  
    if stopword_filename:
      stopword_file = open(stopword_filename, "r")
      self.stopwords = [line.strip() for line in stopword_file]
  
  def get_tokens(self, text):
    """Break a string into tokens, preserving URL tags as an entire token.

       This implementation does not preserve case.
       Clients may wish to override this behavior with their own tokenization.
    """
    # re.UNICODE so \w also matches non-ASCII (e.g. CJK) word characters.
    return re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", text.lower(), re.UNICODE)
  
  def add_input_document(self, input):
    """Add terms in the specified document to the idf dictionary."""
    self.num_docs += 1
    words = set(self.get_tokens(input))
    for word in words:
      if word in self.term_num_docs:
        self.term_num_docs[word] += 1
      else:
        self.term_num_docs[word] = 1

  def add_input_corpus(self, corpus_filename):
    """Add a "term frequency" word-list file, as written by wordmaker,
       to the idf corpus. The whole file counts as one document."""
    if corpus_filename:
      corpus_file = codecs.open(corpus_filename, "r", "utf-8")

      # Each word-list file is one document in the corpus.
      self.num_docs += 1

      # Each line is "term frequency"; only the term's presence in the
      # file matters for document frequency, so the count is ignored.
      for line in corpus_file:
        tokens = line.split(" ")
        term = tokens[0].strip()
        if term in self.term_num_docs:
          self.term_num_docs[term] += 1
        else:
          self.term_num_docs[term] = 1
  def save_corpus_to_file(self, idf_filename, stopword_filename,
                          STOPWORD_PERCENTAGE_THRESHOLD = 0.01):
    """Save the idf dictionary and stopword list to the specified files."""
    output_file = open(idf_filename, "w")

    output_file.write(str(self.num_docs) + "\n")
    for term, num_docs in self.term_num_docs.items():
      output_file.write(term + ": " + str(num_docs) + "\n")
    output_file.close()

    # Any term appearing in at least STOPWORD_PERCENTAGE_THRESHOLD of all
    # documents is treated as a stopword.
    sorted_terms = sorted(self.term_num_docs.items(), key=itemgetter(1),
                          reverse=True)
    stopword_file = open(stopword_filename, "w")
    for term, num_docs in sorted_terms:
      if num_docs < STOPWORD_PERCENTAGE_THRESHOLD * self.num_docs:
        break
      stopword_file.write(term + "\n")
    stopword_file.close()
  
  def get_num_docs(self):
    """Return the total number of documents in the IDF corpus."""
    return self.num_docs
  
  def get_idf(self, term):
    """Retrieve the IDF for the specified term.

       Computed as log((1 + number of documents in the corpus) /
       (1 + number of documents containing the term)); the add-one
       smoothing avoids division by zero for unseen terms.
    """
    if term in self.stopwords:
      return 0

    if term not in self.term_num_docs:
      return self.idf_default

    return math.log(float(1 + self.get_num_docs()) /
      (1 + self.term_num_docs[term]))
  
  def get_doc_keywords(self, curr_doc):
    """Retrieve terms and corresponding tf-idf for the specified document.
  
       The returned terms are ordered by decreasing tf-idf.
    """
    tfidf = {}
    tokens = self.get_tokens(curr_doc)
    tokens_set = set(tokens)
    for word in tokens_set:
      mytf = float(tokens.count(word)) / len(tokens_set)
      myidf = self.get_idf(word)
      tfidf[word] = mytf * myidf
  
    return sorted(tfidf.items(), key=itemgetter(1), reverse=True)

  def get_corpus_keywords(self, corpus_filename):
    """Retrieve terms and tf-idf for a "term frequency" word-list file.

       tf is the term's in-file count normalized by the number of distinct
       terms in the file, mirroring get_doc_keywords above.
    """
    tfidf = {}
    tokens_set = collections.Counter()
    tokens_len = 0
    if corpus_filename:
      corpus_file = codecs.open(corpus_filename, "r", "utf-8")
      for line in corpus_file:
        tokens = line.split(" ")
        term = tokens[0].strip()
        frequency = int(tokens[1].strip())
        tokens_set[term] = frequency
        tokens_len += 1
    for word, v in tokens_set.iteritems():
      mytf = float(v) / tokens_len
      myidf = self.get_idf(word)
      tfidf[word] = mytf * myidf

    return sorted(tfidf.items(), key=itemgetter(1), reverse=True)
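
Note that get_idf applies add-one smoothing, idf(t) = ln((1 + N) / (1 + n_t)), so a term appearing in all N documents scores 0 while rare terms score high. A small usage sketch with two in-memory documents (the strings are arbitrary examples):

from tfidf import TfIdf

my_tfidf = TfIdf()
my_tfidf.add_input_document("the cat sat on the mat")
my_tfidf.add_input_document("the dog ate my homework")

# Rank a new document's terms against the tiny two-document corpus.
for term, score in my_tfidf.get_doc_keywords("the cat chased the dog")[:5]:
    print term, score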

test_idf.py

# -*- coding=utf-8 -*-
import sys
# Python 2 hack so unicode strings can be printed and written without
# an explicit encoding at every call site.
reload(sys)
sys.setdefaultencoding('utf-8')
  
import wordmaker
import tfidf
import re
import codecs
  
# Each file is one QQ group's exported chat log.
files = ['qq0', 'qq1', 'qq2', 'qq3', 'qq4']

# Export header lines start with a date such as 2017-04-13; skip them so
# only the message text itself is mined for words.
timestamp_line = re.compile(ur"\d+-\d+-\d+")
for filename in files:
    input_file = codecs.open(filename, 'r', 'utf-8')
    content = ''
    for line in input_file.readlines():
        if not timestamp_line.match(line):
            content += line
    input_file.close()
    # Extract words and write "word count" lines to name_<filename>.
    final_freq = wordmaker.make_words(content, 'name_' + filename)
  
# Build the idf corpus: each group's word list counts as one document.
my_tfidf = tfidf.TfIdf()
for n in files:
    my_tfidf.add_input_corpus('name_' + n)

# Rank the words of the second group (qq1) against the whole corpus.
keywords = my_tfidf.get_corpus_keywords('name_' + files[1])

# Print the top 20 keywords with their tf-idf scores.
for k, v in keywords[:20]:
    print k, v
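
Each run leaves a name_qqN word list per input file and prints the top keywords for qq1; pointing get_corpus_keywords at a different name_qqN file yields that group's main topics instead.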

This post is adapted from 余争's original work.
