第五周学习记录

使用 K-means 对文本内容进行聚类：

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn import metrics
import jieba
import DB_fun
import copy
import info


def easy_get_parameter_k_means():
    """Cluster the stored records by keyword text with K-means.

    Every row returned by ``DB_fun.retrieve_all()`` is wrapped in an
    ``info.tec`` object (built from columns 1, 4, 6 and 7 of the row; the
    meaning of those columns is defined by ``info.tec`` -- TODO confirm).
    Each object's ``key_word`` text is segmented with jieba, turned into a
    TF-IDF matrix, and clustered with ``KMeans`` for every cluster count in
    the configured range (currently only 8).

    For each cluster count the function prints
    ``[n_clusters, silhouette score, size of largest cluster, its label]``
    and then, per cluster, a header line followed by one
    ``"<key_word> ———— <industry>"`` line for each member.

    Returns:
        A list of lists of strings: one inner list per cluster (in label
        order, for each cluster count tried) containing the
        ``"<key_word> ———— <industry>"`` lines of that cluster's members.
        An empty list is returned when the database yields no rows.
    """
    keywords = []
    industries = []
    for row in DB_fun.retrieve_all():
        tec = info.tec(row[1], row[4], row[6], row[7])
        keywords.append(tec.key_word)
        industries.append(tec.industry)

    # Nothing to vectorise or cluster.
    if not keywords:
        return []

    # CountVectorizer splits on whitespace, so pre-segment the Chinese text
    # with jieba and re-join the tokens with spaces.
    segmented = [" ".join(jieba.cut(text)) for text in keywords]
    term_counts = CountVectorizer().fit_transform(segmented)
    tfidf = TfidfTransformer().fit_transform(term_counts)
    features = tfidf.toarray()

    n_clusters_start = 8  # smallest number of clusters to try
    n_clusters_end = 8  # largest number of clusters to try (inclusive)

    grouped = []
    for n_clusters in range(n_clusters_start, n_clusters_end + 1):
        km = KMeans(n_clusters=n_clusters)
        km.fit(features)
        clusters = km.labels_.tolist()

        score = metrics.silhouette_score(X=tfidf, labels=clusters)
        # Largest cluster as (size, label); ties resolve to the higher label.
        largest_size, largest_label = max(
            (clusters.count(label), label) for label in set(clusters)
        )
        # Report the score for this cluster count.
        print([n_clusters, score, largest_size, largest_label])

        # A fresh list per cluster, so clusters do not share one
        # accumulating list.
        members = [[] for _ in range(n_clusters)]
        for index, label in enumerate(clusters):
            members[label].append(
                keywords[index] + " ———— " + industries[index]
            )

        for label, lines in enumerate(members):
            print("", label, "")
            for line in lines:
                print(line)

        grouped.extend(members)

    return grouped


# Script entry: run the clustering once at module level and keep the list it
# returns in `arrs`.
# easy_get_parameter_k_means()  # find the best parameters
arrs = easy_get_parameter_k_means()
View Code

posted on 2020-03-21 20:10  丸za  阅读(83)  评论(0)    收藏  举报

导航