Week 5 Study Log
Clustering text content with KMeans: segment the keyword field with jieba, convert it to TF-IDF features, cluster with KMeans, and evaluate the result with the silhouette score.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn import metrics
import jieba
import copy
import DB_fun   # project module: database access
import info     # project module: record wrapper (info.tec)


def easy_get_parameter_k_means():
    res0 = []      # one list of "keyword ———— industry" strings per cluster
    data = []      # raw keyword strings, later replaced by the TF-IDF matrix
    industry = []  # industry label of each record
    results = DB_fun.retrieve_all()
    for row in results:
        tec = info.tec(row[1], row[4], row[6], row[7])
        data = data + [tec.key_word]
        industry = industry + [tec.industry]
    keyword = copy.deepcopy(data)  # snapshot of the raw keyword strings

    # Tokenize with jieba, then turn the word counts into TF-IDF features
    vec = CountVectorizer()
    X = vec.fit_transform([" ".join(jieba.cut(a)) for a in data])
    tf = TfidfTransformer()
    X = tf.fit_transform(X.toarray())
    data = X.toarray()

    test_score = []
    n_clusters_end = 8    # number of clusters (upper bound of the sweep)
    n_clusters_start = 8  # number of clusters (lower bound of the sweep)
    while n_clusters_start <= n_clusters_end:
        km = KMeans(n_clusters=n_clusters_start)
        km.fit(data)
        clusters = km.labels_.tolist()
        # print(type(clusters))
        # print(clusters)
        score = metrics.silhouette_score(X=X, labels=clusters)
        # (size, label) of the largest cluster
        num = sorted([(np.sum([1 for a in clusters if a == i]), i) for i in set(clusters)])[-1]
        test_score.append([n_clusters_start, score, num[0], num[1]])
        print([n_clusters_start, score, num[0], num[1]])  # print the score
        n_clusters_start += 1

    # Group the records by the labels of the last KMeans run
    for i in range(0, 8):
        print("Cluster", i)
        res1 = []  # fresh list for each cluster, so res0[i] holds only cluster i
        for index in range(len(clusters)):
            if clusters[index] == i:
                res = keyword[index] + " ———— " + industry[index]
                res1.append(res)
                print(res)
        res0.append(res1)
    # print(res0[0][11])
    # return pd.DataFrame(test_score, columns=['n_clusters', 'score', 'largest cluster size', 'cluster label']).sort_values(by='score', ascending=False)
    return res0


# easy_get_parameter_k_means()  # find the best parameters
arrs = easy_get_parameter_k_means()
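Side note: the while loop above is written as a sweep over cluster counts, but with n_clusters_start == n_clusters_end == 8 it only ever tries a single value. Below is a minimal, self-contained sketch of the same pipeline, assuming a few made-up toy sentences instead of the DB_fun/info records (variable names like docs and best_k are my own), showing how several values of k can be compared by silhouette score and the best one kept:

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Toy documents standing in for the keyword strings pulled from the database
docs = [
    "机器学习在金融风控中的应用",
    "深度学习图像识别技术",
    "银行信贷风险评估模型",
    "卷积神经网络目标检测",
    "证券市场量化交易策略",
    "自然语言处理文本分类",
]

# Tokenize with jieba and build TF-IDF features in one step
tokenized = [" ".join(jieba.cut(d)) for d in docs]
X = TfidfVectorizer().fit_transform(tokenized)

# Try several cluster counts and keep the one with the highest silhouette score
best_k, best_score = None, -1.0
for k in range(2, 5):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    score = silhouette_score(X, labels)
    print(k, round(score, 4))
    if score > best_score:
        best_k, best_score = k, score

print("best k:", best_k, "silhouette:", round(best_score, 4))

In the function above, the same comparison just means widening the n_clusters_start / n_clusters_end range and sorting test_score by its score column.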
