python3 LDA主题模型以及TFIDF实现

import codecs  #主题模型
from gensim import corpora
from gensim.models import LdaModel
from gensim import models
from gensim.corpora import Dictionary
te = []
fp = codecs.open('input.txt','r')
for line in fp:
    line = line.split(',')
    te.append([ w  for w in line ])
print ('输入文本数量:',len(te)) 
dictionary = corpora.Dictionary(te)
corpus = [ dictionary.doc2bow(text) for text in te ]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
print(list(corpus_tfidf))#输出词的tfidf
print(list(corpus))#输出文本向量空间
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20,passes=100) 
doc_topic = [a for a in lda[corpus]]
topics_r = lda.print_topics(num_topics = 20, num_words =20)
topic_name = codecs.open('topics_result3.txt','w')
for v in topics_r:
    topic_name.write(str(v)+'\n')
fp2 = codecs.open('documents_result.txt','w')
for t in doc_topic:
    c = []
    c.append([a[1] for a in t])
    print(t)
    m = max(c[0])
    
    for i in range(0, len(t)):
        if m in t[i]:
            #print(t[i])
            fp2.write(str(t[i][0]) + '  ' + str(t[i][1]) + '\n')#输出模型类和概览
            break

 

posted @ 2018-10-16 15:29  六盘水月照  阅读(4787)  评论(0编辑  收藏  举报