文本聚类
聚类:
1.通过词袋向量 进行聚类
a. word_vectors = model.wv.vectors
b. idx = KMeans(n_clusters=50, n_jobs=NUM_CORES).fit_predict(word_vectors)
c. word_centroid_map = dict(zip(model.wv.index2word, idx))
def create_bag_of_centroids(wordlist, word_centroid_map):
    """Build a bag-of-centroids vector for one sentence.

    The words are expected to have been clustered beforehand. Every word of
    the sentence that appears in ``word_centroid_map`` adds one to the slot
    of its cluster, so the result is a histogram over cluster indices.
    Words missing from the map are ignored.

    :param wordlist: list of words making up the sentence
    :param word_centroid_map: precomputed mapping from word to cluster index
    :return: float32 array of length ``max(cluster index) + 1`` holding the
        number of words that fell into each cluster; an empty array when
        ``word_centroid_map`` is empty
    """
    # max() raises ValueError on an empty sequence, so treat an empty map as
    # "zero clusters" instead of crashing.
    num_centroids = max(word_centroid_map.values(), default=-1) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for word in wordlist:
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1
    return bag_of_centroids
2. 通过词向量得到的句向量
def makeFeatureVec(words, model, num_features=100):
    """Average the word vectors of a sentence into one sentence vector.

    Only words present in the model vocabulary contribute. When none of the
    words is known (or ``words`` is empty) the zero vector is returned
    instead of dividing by zero and producing NaNs.

    :param words: list of words in the sentence
    :param model: word-vector model exposing its keyed vectors as ``model.wv``
    :param num_features: length of the zero-initialised accumulator; assumed
        to equal the dimensionality of the model's word vectors
    :return: array of length ``num_features`` holding the mean of the
        vectors of all in-vocabulary words
    """
    wv = model.wv
    # gensim 4 renamed ``index2word`` to ``index_to_key``; accept either so
    # the function works with both major versions.
    vocab = getattr(wv, "index_to_key", None)
    if vocab is None:
        vocab = wv.index2word
    # A set gives O(1) membership tests inside the loop.
    index2word_set = set(vocab)

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, wv[word])
    # Guard the division: with no known words the sum is still all zeros.
    if nwords:
        featureVec = np.divide(featureVec, nwords)
    return featureVec
3. 通过doc2vec模型得到的向量 进行聚类
a. TaggedDocument = gensim.models.doc2vec.TaggedDocument # 输入输出内容都为 词袋 + tag列表,作用是记录每一篇文章的大致内容
def tag_reviews(content, prefix):
    """Wrap every review in a ``TaggedDocument``.

    Each review string is split on whitespace to obtain its words and is
    given a single tag of the form ``<prefix>_<position>``, where position
    is the review's index within ``content``.

    :param content: iterable of review strings
    :param prefix: string prepended to every tag
    :return: list of tagged documents, in the order of ``content``
    """
    return [
        TaggedDocument(words=text.split(), tags=[prefix + '_%s' % position])
        for position, text in enumerate(content)
    ]
b. 模型构建
# Doc2Vec model with dm=0 and 100-dimensional vectors.
# `vector_size` replaces the `size` keyword, which gensim deprecated in 3.x
# and removed in 4.0.
model_dbow = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-3, negative=5, dm=0, workers=3)
c.建立词典 用全量数据
# Build the vocabulary from the full tagged corpus (all_tagged); the training
# passes below only iterate over train_tagged.
model_dbow.build_vocab(all_tagged)
# Ten manual passes: reshuffle the training documents before each pass and
# keep the learning rate fixed at 0.025 (start_alpha == end_alpha).
for _ in range(10):
    shuffle(train_tagged)
    # total_examples must describe the corpus actually handed to train(),
    # i.e. train_tagged rather than all_tagged.
    model_dbow.train(train_tagged, total_examples=len(train_tagged), epochs=1, start_alpha=0.025, end_alpha=0.025)
d.测试
# print(model_dbow.docvecs.most_similar('TRAIN_1'))
# print(model_dbow.docvecs['TRAIN_1'])
e. 保存
# Persist the trained model to disk.
model_dbow.save("../static/doc2vec/doc2vec_lr100")
# Look up the learned vector of every training document through its first
# tag, then dump the vectors to a text file.
train_array_dbow = [model_dbow.docvecs[doc.tags[0]] for doc in train_tagged]
np.savetxt('../static/doc2vec/train_feature_d2v.txt', train_array_dbow)
f.聚类
# Reload the document vectors that were written out with np.savetxt.
train_data_features_d2v = np.loadtxt('../static/doc2vec/train_feature_d2v.txt')
# K-means with 50 clusters.
# NOTE(review): `n_jobs` was removed from KMeans in scikit-learn 1.0 —
# confirm the installed version still accepts it.
kmeans_clustering = KMeans(n_clusters=50, n_jobs=NUM_CORES)
# fit_predict returns the cluster label assigned to each row (one label per
# document vector), not the cluster centres.
idx = kmeans_clustering.fit_predict(train_data_features_d2v)
posted on 2020-01-06 10:02 nnnnnnnnnnnnnnnn 阅读(509) 评论(0) 收藏 举报
浙公网安备 33010602011771号