NLTK: Building and Using a Corpus (for Novel Recommendation)

Building on the earlier Python articles, we can now take the next step and put natural language processing into practice, applying it to a website to better support SEO.

Step 1: Build the corpus

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import codecs
import jieba
from gensim import corpora

# Source data directory (two levels deep: data/<subdir>/<file>)
sourceDataDir = 'data'

def getSourceFileLists(sourceDataDir):
    # Collect every file found under the two-level source directory
    fileLists = []
    subDirList = os.listdir(sourceDataDir)
    for subDir in subDirList:
        subList = os.listdir(sourceDataDir + '/' + subDir)
        fileList = [sourceDataDir + '/' + subDir + '/' + x for x in subList
                    if os.path.isfile(sourceDataDir + '/' + subDir + '/' + x)]
        fileLists += fileList
    return fileLists

fileLists = getSourceFileLists(sourceDataDir)

if 0 < len(fileLists):
    punctuations = ['', '\n', '\t', ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

    if not os.path.exists('dict'):
        os.mkdir('dict')
    if not os.path.exists('corpus'):
        os.mkdir('corpus')

    for fileName in fileLists:
        print fileName

        hFile = None
        content = None
        try:
            hFile = codecs.open(fileName, 'r', 'gb18030')
            content = hFile.readlines()
        except Exception, e:
            print e
        finally:
            if hFile:
                hFile.close()

        if content:
            # Segment with jieba (full mode) and drop punctuation tokens
            fileFenci = [x for x in jieba.cut(' '.join(content), cut_all=True)]
            fileFenci2 = [word for word in fileFenci if word not in punctuations]

            texts = [fileFenci2]

            # Drop tokens that occur only once in this document
            all_tokens = sum(texts, [])
            tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
            texts = [[word for word in text if word not in tokens_once] for text in texts]

            sFileDir, sFileName = os.path.split(fileName)
            dictFileName = 'dict/' + sFileName + '.dict'
            corpusFileName = 'corpus/' + sFileName + '.mm'

            # Build and save the per-novel dictionary and bag-of-words corpus
            dictionary = corpora.Dictionary(texts)
            dictionary.save_as_text(dictFileName)

            corpus = [dictionary.doc2bow(text) for text in texts]
            corpora.MmCorpus.serialize(corpusFileName, corpus)

print 'Build corpus done'

 

Data source:

83 novels from http://d1.txthj.com/newrar/txthj_264.rar, extracted into the directory ./data/.

When loading, the data is treated as a two-level directory structure (subdirectories under ./data/, novel files inside each subdirectory).
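For illustration, the loader above expects a layout roughly like the following; the subdirectory and file names here are hypothetical, not taken from the archive:

data/
    volume_01/
        novel_a.txt
        novel_b.txt
    volume_02/
        novel_c.txt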

Output:

./dict and ./corpus

In the corresponding directories, xxx.dict and xxx.mm are generated, where xxx is the full name of the original file (excluding the path, including the extension).
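As a quick sanity check, a generated pair can be loaded back with gensim. This is only a minimal sketch; 'xxx.txt' stands in for one of the actual novel file names:

# Minimal check that a generated dictionary/corpus pair loads back correctly.
# 'xxx.txt' is a placeholder for one of the novel file names.
from gensim import corpora

dictionary = corpora.Dictionary.load_from_text('dict/xxx.txt.dict')
corpus = corpora.MmCorpus('corpus/xxx.txt.mm')

print dictionary           # e.g. Dictionary(... unique tokens)
print corpus               # e.g. MmCorpus(1 documents, ... features, ... non-zero entries)
print list(corpus)[0][:5]  # first few (token_id, count) pairs of the single document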

Step 2: Load the corpus and run similarity analysis

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
from gensim import corpora, models, similarities

def getFileList(dir):
    return [dir + x for x in os.listdir(dir)]

dictLists = getFileList('./dict/')


class LoadDictionary(object):
    # Stream the saved dictionaries from disk one by one (memory friendly)
    def __init__(self, dictionary):
        self.dictionary = dictionary

    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield self.dictionary.load_from_text(dictFile)


class LoadCorpus(object):
    # Stream the saved MmCorpus files from disk one by one (memory friendly)
    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield corpora.MmCorpus(corpusFile)


"""
    Preprocessing (easy_install nltk)
"""
# Simplified Chinese + English preprocessing
def pre_process_cn(inputs, low_freq_filter=True):
    """
        1. Remove stop words
        2. Remove punctuation
        3. Stem
        4. Remove low-frequency words
    """
    import nltk
    import jieba.analyse
    from nltk.tokenize import word_tokenize

    texts_tokenized = []
    for document in inputs:
        texts_tokenized_tmp = []
        for word in word_tokenize(document):
            texts_tokenized_tmp += jieba.analyse.extract_tags(word, 10)
        texts_tokenized.append(texts_tokenized_tmp)

    texts_filtered_stopwords = texts_tokenized

    # Remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document if word not in english_punctuations] for document in texts_filtered_stopwords]

    # Stemming
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]

    # Remove words that occur only once
    if low_freq_filter:
        all_stems = sum(texts_stemmed, [])
        stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
        texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
    else:
        texts = texts_stemmed
    return texts

# Load the dictionaries; after the loop, `dictionary` holds the one loaded last
dictionary = corpora.dictionary.Dictionary()
dictionary_memory_friendly = LoadDictionary(dictionary)
for vector in dictionary_memory_friendly:
    dictionary = vector

# Load every per-novel corpus; each MmCorpus holds a single document, so take element 0
corpus = []
corpus_memory_friendly = LoadCorpus()
for vector in corpus_memory_friendly:
    corpus.append(vector[0])

if 0 < len(corpus):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # When id2word=dictionary is not passed, LsiModel rebuilds a dictionary from the corpus internally
    model = models.LsiModel(corpus_tfidf, id2word=None, num_topics=20, chunksize=2000000)
    index = similarities.Similarity('./novel_', model[corpus], num_features=len(corpus))

    # The query text: a passage arbitrarily excerpted from one of the novels
    target_courses = ['男人们的脸上沉重而冷凝,蒙着面纱的女人们则是发出断断续续的哭泣声,他们无比专注地看着前方,见证一场生与死的拉锯战。']
    target_text = pre_process_cn(target_courses, low_freq_filter=False)

    """
    Similarity matching against the query
    """
    # Pick the query document
    ml_course = target_text[0]
    # Convert it to bag-of-words form
    ml_bow = dictionary.doc2bow(ml_course)

    # Project into the LSI space and compute similarity against every indexed document
    ml_lsi = model[ml_bow]   # ml_lsi has the form (topic_id, topic_value)
    sims = index[ml_lsi]     # sims is the final result; index[xxx] calls __getitem__() to score ml_lsi

    # Sort for convenient output
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # Inspect the results
    print sort_sims[0:10]
    print len(dictLists)
    print dictLists[sort_sims[1][0]]
    print dictLists[sort_sims[2][0]]
    print dictLists[sort_sims[3][0]]
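To reuse this on a website (for example, suggesting related novels for a given passage), the query steps can be wrapped in a small helper. This is only a sketch: it assumes dictionary, model, index, dictLists and pre_process_cn from the script above are already in scope, and recommend_novels is not part of the original code:

def recommend_novels(text, top_n=5):
    # Preprocess the query text the same way as the target passage above
    query = pre_process_cn([text], low_freq_filter=False)[0]
    # Bag-of-words against the loaded dictionary, then project into LSI space
    bow = dictionary.doc2bow(query)
    lsi_vec = model[bow]
    # Score the query against every indexed novel and keep the best matches
    sims = sorted(enumerate(index[lsi_vec]), key=lambda item: -item[1])
    return [(dictLists[i], score) for i, score in sims[:top_n]]

# Example call; the argument can be any passage to find related novels for
for name, score in recommend_novels(target_courses[0]):
    print name, score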

 

Notes:

The yield-based classes are there for memory efficiency: dictionaries and corpora are streamed from disk one at a time instead of being held in memory all at once.
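As a minimal illustration of the difference (the list of .mm paths is rebuilt here so the snippet stands on its own):

import os
from gensim import corpora

corpusFiles = ['./corpus/' + x for x in os.listdir('./corpus/')]

# Eager loading keeps every corpus object in memory at the same time
corpora_in_memory = [corpora.MmCorpus(f) for f in corpusFiles]

# The generator pattern used by LoadDictionary/LoadCorpus (via yield) holds only one at a time
def stream_corpora(files):
    for f in files:
        yield corpora.MmCorpus(f)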

Known issue:

Step 2 prints the following warning:

/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:122: UserWarning: indices array has non-integer dtype (float64)

This does not affect the processing.
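If the message is distracting, it can be silenced with Python's standard warnings module; this only hides the message and does not change the computation:

import warnings
# Suppress the scipy.sparse dtype UserWarning seen during similarity indexing
warnings.filterwarnings('ignore', message='indices array has non-integer dtype')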

Original post: 深蓝苹果, https://my.oschina.net/kakablue/home
