Training a word2vec model with gensim, using the novel 人民的名义 (In the Name of the People) as an example
First, preprocess the novel by segmenting the raw text into words with jieba:
import jieba

# Register the main characters' names so jieba keeps each one as a single
# token instead of splitting it into individual characters
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('田国富', True)
jieba.suggest_freq('高育良', True)
jieba.suggest_freq('侯亮平', True)
jieba.suggest_freq('钟小艾', True)
jieba.suggest_freq('陈岩石', True)
jieba.suggest_freq('欧阳菁', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('蔡成功', True)
jieba.suggest_freq('孙连城', True)
jieba.suggest_freq('季昌明', True)
jieba.suggest_freq('丁义珍', True)
jieba.suggest_freq('郑西坡', True)
jieba.suggest_freq('赵东来', True)
jieba.suggest_freq('高小琴', True)
jieba.suggest_freq('赵瑞龙', True)
jieba.suggest_freq('林华华', True)
jieba.suggest_freq('陆亦可', True)
jieba.suggest_freq('刘新建', True)
jieba.suggest_freq('刘庆祝', True)

with open('./in_the_name_of_people.txt', encoding="utf8") as f:
    document = f.read()

# jieba.cut returns a generator; do not print/consume it before the join,
# or `result` will be empty
document_cut = jieba.cut(document)
result = ' '.join(document_cut)

with open('./in_the_name_of_people_segment.txt', 'w', encoding="utf8") as f2:
    f2.write(result)
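The suggest_freq calls matter because jieba's default dictionary often splits a three-character name into smaller pieces. A quick way to see the effect is to segment a short phrase before and after the hint; the snippet below is a minimal sketch assuming a fresh Python session, and the example sentence is made up for illustration:

import jieba

# Without the hint, the name 易学习 may be broken into pieces
print(jieba.lcut('沙瑞金赞叹易学习的胸怀'))

jieba.suggest_freq('易学习', True)

# After the hint, 易学习 should come out as one token
print(jieba.lcut('沙瑞金赞叹易学习的胸怀'))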
Next, train a word2vec model on the segmented text:
import logging
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# LineSentence expects one sentence per line, tokens separated by whitespace
sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt')

# hs=1: hierarchical softmax; min_count=1: keep every token (the corpus is
# small); window=3: context window of 3 words on each side
model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3)
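Training takes a noticeable amount of time even on a single novel, so it is worth persisting the result with gensim's own save/load. A minimal sketch; the file name in_the_name_of_people.model is just an illustrative choice:

# Save the full model (it can be trained further later) and reload it
model.save('./in_the_name_of_people.model')
model = word2vec.Word2Vec.load('./in_the_name_of_people.model')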
Three typical uses of the trained model:
# 1. Find the words most similar to a given word
print(model.wv.similar_by_key("沙瑞金", topn=5))

# 2. Compute the similarity between two words
print("Similarity between 沙瑞金 and 高育良:", model.wv.similarity("沙瑞金", "高育良"))

# 3. Pick out the word that does not belong with the others
print("Odd one out among 沙瑞金 高育良 李达康 刘庆祝:",
      model.wv.doesnt_match("沙瑞金 高育良 李达康 刘庆祝".split()))
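All three calls raise a KeyError for a word that never appeared in the training corpus (with min_count=1 that is the only way a word can be missing). A small defensive sketch, with 李达康 as an arbitrary example word:

word = "李达康"
if word in model.wv:  # membership test against the model's vocabulary
    print(model.wv.similar_by_key(word, topn=5))
else:
    print(word, "is not in the vocabulary")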
