nltk-构建和使用语料库-可用于小说的推荐
结合先前的 Python 文章,现在我们可以进一步通过自然语言处理进行实操演练,并将其应用到网站中,更好地辅助 SEO。
步骤1:构建语料库:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Step 1: build one gensim dictionary and one corpus file per novel.

Every regular file found at ``data/<subdir>/<file>`` is decoded as GB18030,
segmented with jieba (full mode), stripped of punctuation and of tokens that
occur only once in that file, and written out as

* ``dict/<file name>.dict``   (gensim text dictionary)
* ``corpus/<file name>.mm``   (Matrix Market corpus holding one document)

The script runs unchanged on Python 2.7 and Python 3.
"""

import codecs
import os
from collections import Counter

# Data source directory; it is scanned exactly two levels deep.
sourceDataDir = 'data'

# Tokens that are thrown away right after segmentation.
punctuations = frozenset([
    '', '\n', '\t', ',', '.', ':', ';', '?', '(', ')', '[', ']',
    '&', '!', '*', '@', '#', '$', '%',
])


def getSourceFileLists(sourceDataDir):
    """Return the regular files located one sub-directory below a root.

    Args:
        sourceDataDir: Root directory laid out as ``<root>/<subdir>/<file>``.

    Returns:
        A list of ``<root>/<subdir>/<file>`` paths (joined with ``/``), sorted
        by sub-directory and then by file name. The list is empty when the
        root does not exist. Regular files lying directly in the root and
        directories nested inside a sub-directory are skipped.
    """
    fileLists = []
    if not os.path.isdir(sourceDataDir):
        return fileLists

    for subDir in sorted(os.listdir(sourceDataDir)):
        subPath = sourceDataDir + '/' + subDir
        # A stray regular file in the root would make os.listdir() raise.
        if not os.path.isdir(subPath):
            continue
        for name in sorted(os.listdir(subPath)):
            path = subPath + '/' + name
            if os.path.isfile(path):
                fileLists.append(path)

    return fileLists


def _readLines(fileName):
    """Return the GB18030-decoded lines of *fileName*, or None on failure.

    Read and decode errors are printed and swallowed so that one bad file
    does not abort the whole batch.
    """
    try:
        with codecs.open(fileName, 'r', 'gb18030') as hFile:
            return hFile.readlines()
    except (IOError, OSError, UnicodeError) as e:
        print(e)
        return None


def _dropSingletons(tokens):
    """Return *tokens* without the tokens that occur exactly once."""
    counts = Counter(tokens)
    return [token for token in tokens if counts[token] > 1]


def _buildCorpusFiles(fileLists):
    """Write a dictionary file and a corpus file for every path in *fileLists*."""
    # Imported here so that the file discovery above stays usable on a
    # machine where jieba / gensim are not installed.
    import jieba
    from gensim import corpora

    for outDir in ('dict', 'corpus'):
        if not os.path.exists(outDir):
            os.mkdir(outDir)

    for fileName in fileLists:
        print(fileName)

        content = _readLines(fileName)
        if not content:
            continue

        words = [word
                 for word in jieba.cut(' '.join(content), cut_all=True)
                 if word not in punctuations]
        # Each novel is stored as a corpus that contains a single document.
        texts = [_dropSingletons(words)]

        # Output names use the base name only, so two source files with the
        # same name in different sub-directories overwrite each other.
        sFileName = os.path.basename(fileName)
        dictFileName = 'dict/' + sFileName + '.dict'
        corpusFileName = 'corpus/' + sFileName + '.mm'

        dictionary = corpora.Dictionary(texts)
        dictionary.save_as_text(dictFileName)

        corpus = [dictionary.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize(corpusFileName, corpus)


# Source file list, collected once at start-up.
fileLists = getSourceFileLists(sourceDataDir)

if fileLists:
    _buildCorpusFiles(fileLists)
    print('Build corpus done')
else:
    print('No source files found under ' + sourceDataDir)
数据源:
来自 http://d1.txthj.com/newrar/txthj_264.rar 的 83 篇小说,解压后将其目录存放在 ./data/ 目录下。
加载时作为二层目录处理
输出:
./dict 和 ./corpus
在对应目录下生成 xxx.dict 和 xxx.mm,xxx为原文件的全称(不包括路径,包括后缀)
步骤2:加载语料库,相似性分析
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Step 2: load the per-novel corpora and rank the novels against a passage.

Reads the ``./dict/*.dict`` and ``./corpus/*.mm`` files, builds a
TF-IDF + LSI model over all novels and prints the novels most similar to a
sample passage.

The script runs unchanged on Python 2.7 and Python 3.
"""

import os
from collections import Counter


def getFileList(dir):
    """Return the path of every entry in directory *dir*, sorted by name.

    Args:
        dir: Directory to list; a trailing path separator is optional.

    Returns:
        A list of ``dir`` joined with each entry name, or an empty list when
        *dir* is not an existing directory.
    """
    if not os.path.isdir(dir):
        return []
    return [os.path.join(dir, x) for x in sorted(os.listdir(dir))]


dictLists = getFileList('./dict/')


def _pairedPaths(dictFile):
    """Map an entry of ``./dict/`` to its (dictionary, corpus) file paths."""
    sFileRaw, _ = os.path.splitext(dictFile)
    sFileName = os.path.basename(sFileRaw)
    return './dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm'


class LoadDictionary(object):
    """Iterable that loads the dictionaries named in ``dictLists`` one by one."""

    def __init__(self, dictionary):
        # Only used to reach ``load_from_text``; it is never modified here.
        self.dictionary = dictionary

    def __iter__(self):
        for dictFile in dictLists:
            dictPath, _ = _pairedPaths(dictFile)
            yield self.dictionary.load_from_text(dictPath)


class LoadCorpus(object):
    """Iterable that opens the corpus matching each entry of ``dictLists``."""

    def __iter__(self):
        from gensim import corpora

        for dictFile in dictLists:
            _, corpusPath = _pairedPaths(dictFile)
            yield corpora.MmCorpus(corpusPath)


# Pre-processing (needs nltk: easy_install nltk)
# Simplified Chinese + English pre-processing.
def pre_process_cn(inputs, low_freq_filter = True):
    """Turn raw documents into lists of stemmed keywords.

    Steps performed on every document:
        1. split it with nltk's ``word_tokenize`` and keep up to 10 jieba
           keywords (``jieba.analyse.extract_tags``) per resulting token;
        2. drop ASCII punctuation tokens;
        3. stem every keyword with nltk's ``LancasterStemmer``;
        4. optionally drop the stems that occur only once across all
           documents.

    Stop words are not removed.

    Args:
        inputs: Iterable of document strings.
        low_freq_filter: When true, apply step 4.

    Returns:
        One list of stems per input document.
    """
    import jieba.analyse
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.tokenize import word_tokenize

    texts_tokenized = []
    for document in inputs:
        keywords = []
        for word in word_tokenize(document):
            keywords += jieba.analyse.extract_tags(word, 10)
        texts_tokenized.append(keywords)

    # Remove punctuation.
    english_punctuations = frozenset([
        ',', '.', ':', ';', '?', '(', ')', '[', ']',
        '&', '!', '*', '@', '#', '$', '%',
    ])
    texts_filtered = [[word for word in document
                       if word not in english_punctuations]
                      for document in texts_tokenized]

    # Stemming.
    st = LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in document]
                     for document in texts_filtered]

    if not low_freq_filter:
        return texts_stemmed

    # Remove the stems that are too rare (a single occurrence overall).
    stem_counts = Counter(stem for text in texts_stemmed for stem in text)
    return [[stem for stem in text if stem_counts[stem] > 1]
            for text in texts_stemmed]


if dictLists:
    from gensim import corpora, models, similarities

    # Step 1 wrote one dictionary per novel, so the same token id means a
    # different word in every corpus file. Merge the dictionaries into one
    # shared vocabulary and re-map each novel's vector onto it.
    dictionary = corpora.dictionary.Dictionary()
    corpus = []
    corpus_memory_friendly = iter(LoadCorpus())
    for novel_dictionary in LoadDictionary(dictionary):
        novel_corpus = next(corpus_memory_friendly)
        transformer = dictionary.merge_with(novel_dictionary)
        bow = novel_corpus[0]  # each .mm file holds exactly one document
        corpus.append(transformer[bow] if bow else [])

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # When id2word is not given, LsiModel derives the id mapping from the
    # corpus itself.
    model = models.LsiModel(corpus_tfidf, id2word=None, num_topics=20, chunksize=2000000)
    # The index stores LSI vectors, so its width is the number of topics and
    # the vectors must go through the same TF-IDF step the model was trained on.
    index = similarities.Similarity('./novel_', model[corpus_tfidf], num_features=model.num_topics)

    # The text to match: a passage picked at random from one of the novels.
    target_courses = ['男人们的脸上沉重而冷凝,蒙着面纱的女人们则是发出断断续续的哭泣声,他们无比专注地看着前方,见证一场生与死的拉锯战。']
    target_text = pre_process_cn(target_courses, low_freq_filter=False)

    # Similarity matching for one concrete target.
    # Pick the reference document.
    ml_course = target_text[0]
    # Bag-of-words conversion.
    ml_bow = dictionary.doc2bow(ml_course)

    # Project the query into LSI space; ml_lsi is a list of
    # (topic_id, topic_value) pairs.
    ml_lsi = model[tfidf[ml_bow]]
    # index[...] goes through __getitem__ and returns one score per novel.
    sims = index[ml_lsi]

    # Sort by descending similarity for display.
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

    # Show the result.
    print(sort_sims[0:10])
    print(len(dictLists))
    # NOTE(review): ranks 1-3 are printed and the best match (rank 0) is
    # skipped, as in the original script - confirm this is intended.
    # Slicing keeps this safe when fewer than four novels are indexed.
    for novel_id, _ in sort_sims[1:4]:
        print(dictLists[novel_id])
说明:
yield的使用是为了更好的内存效率。
遗留问题:
步骤2会有提示:
/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:122: UserWarning: indices array has non-integer dtype (float64)
不影响处理过程
原文:深蓝苹果 https://my.oschina.net/kakablue/home

浙公网安备 33010602011771号