快速进行词向量训练和读取
1.词向量训练demo
import os

import jieba
import tqdm
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Destination of the trained word vectors (KeyedVectors only, not the full model).
word2vec_path = './resources/word2vec.model'


def word_vector_gener():
    """Train a Word2Vec model on jieba-segmented queries and save its vectors.

    Reads ``./data/seo_search_word_copy.txt``, takes the last tab-separated
    field of every line as the query, segments it with ``jieba.lcut``, trains
    a Word2Vec model on the resulting token lists, and saves only the trained
    word vectors (``model.wv``) to ``word2vec_path``. Finally prints the
    vector of a probe word as a quick sanity check.

    :return: None
    """
    DATA_PATH = './data/seo_search_word_copy.txt'

    with open(DATA_PATH, 'r', encoding='utf8') as file:
        # readlines() is kept on purpose so tqdm knows the total line count
        # and can display a real progress bar.
        sentences = [
            jieba.lcut(each_line.strip().split('\t')[-1])
            for each_line in tqdm.tqdm(file.readlines())
        ]

    # sg=1 selects skip-gram. NOTE: `size` is the gensim 3.x keyword; it was
    # renamed to `vector_size` in gensim 4.0.
    model = Word2Vec(sentences, sg=1, size=10, window=2, min_count=1,
                     negative=1, sample=0.001, workers=4)

    # Make sure the output directory exists before saving; otherwise the
    # save fails after the (potentially long) training has finished.
    out_dir = os.path.dirname(word2vec_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    model.wv.save(word2vec_path)

    # Look the word up through `model.wv`: indexing the model object itself
    # is deprecated in gensim 3.x and removed in 4.0. Guard against the probe
    # word being absent from the training corpus.
    probe = '老师'
    if probe in model.wv:
        print(model.wv[probe])
    else:
        print('{!r} is not in the vocabulary'.format(probe))


if __name__ == '__main__':
    word_vector_gener()
2. 词向量加载 demo(这是获取词向量最快的方法)
# KeyedVectors was used without being imported in the original snippet.
from gensim.models import KeyedVectors

# Path of the word vectors previously written with `model.wv.save(...)`.
word2vec_path = './resources/word2vec.model'

# mmap='r' memory-maps the stored arrays read-only instead of reading them
# fully into memory.
wv = KeyedVectors.load(word2vec_path, mmap='r')

# Vector lookup for a single word.
vector = wv['主管']

# The 30 entries returned by most_similar for the query word, as
# (word, score) pairs.
word = wv.most_similar(['主管'], topn=30)
print(word)
输出:
[('组长', 0.8488447070121765),
('经理', 0.8272342085838318),
('总监', 0.816636323928833),
('副经理', 0.8071938753128052),
('部长', 0.8019827604293823),
('专员', 0.7792257070541382),
('高级专员', 0.7695066332817078),
('主任', 0.7676611542701721),
('负责人', 0.761403501033783),
('部副', 0.7570186853408813),
('及', 0.7355248928070068),
('业务主管', 0.732032299041748),
('岗', 0.7316986322402954),
('副总', 0.7278518676757812),
('科长', 0.72648024559021),
('兼', 0.7262977957725525),
('助理', 0.7255839705467224),
('资深', 0.7252861261367798),
('组', 0.7167786955833435),
('储干', 0.7150581479072571),
('班长', 0.7146369218826294),
('职员', 0.7104721665382385),
('实习生', 0.707991898059845),
('支持', 0.7070707082748413),
('高级', 0.7055947184562683),
('管理人员', 0.7054109573364258),
('初级', 0.7042156457901001),
('副理', 0.7038965821266174),
('小组长', 0.7035383582115173),
('技术主管', 0.7024495601654053)]
时刻记着自己要成为什么样的人!
浙公网安备 33010602011771号