python 肘部法则,判点聚类分为几类,K-means聚类分析

image

 想给N、R、Z、W做K-means聚类分析,首先看看分成几类,用肘部法则:

#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Suyue
@file: zhoubufaze.py
@time: 2025/10/03
@desc: 肘部法则确定最佳聚类数
"""
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score


def elbow_method_analysis(excel_file_path):
    """
    肘部法则确定最佳聚类数
    """
    # 1. 读取Excel文件
    print("正在读取Excel文件...")
    try:
        data = pd.read_excel(excel_file_path)
        print(f"数据形状: {data.shape}")

        # 选择需要的列
        required_columns = ['R', 'N', 'W', 'Z']
        data = data[required_columns]

    except Exception as e:
        print(f"读取文件错误: {e}")
        return

    # 2. 数据标准化
    print("正在进行数据标准化...")
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)

    # 3. 计算不同聚类数的SSE和轮廓系数
    print("计算不同聚类数的SSE和轮廓系数...")
    sse = []
    k_range = range(1, 8)
    silhouette_scores = [0]  # k=1时轮廓系数为0

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(data_scaled)
        sse.append(kmeans.inertia_)

        if k >= 2:
            labels = kmeans.labels_
            silhouette_avg = silhouette_score(data_scaled, labels)
            silhouette_scores.append(silhouette_avg)
            print(f"k={k}, SSE={kmeans.inertia_:.2f}, 轮廓系数={silhouette_avg:.3f}")
        else:
            print(f"k={k}, SSE={kmeans.inertia_:.2f}")

    # 4. 输出分析建议
    print("\n" + "=" * 60)
    print("聚类数选择建议")
    print("=" * 60)

    # 计算SSE下降率
    print("\nSSE下降分析:")
    sse_reduction = []
    for i in range(1, len(sse)):
        reduction = (sse[i - 1] - sse[i]) / sse[i - 1] * 100
        sse_reduction.append(reduction)
        print(f"k={i}→{i + 1}: SSE下降 {reduction:.1f}%")

    # 轮廓系数分析
    print("\n轮廓系数分析:")
    best_silhouette_idx = np.argmax(silhouette_scores[1:])
    best_k_silhouette = best_silhouette_idx + 2
    best_silhouette_score = silhouette_scores[best_silhouette_idx + 1]
    print(f"轮廓系数最大的k值: {best_k_silhouette} (系数={best_silhouette_score:.3f})")

    # 综合建议
    print("\n" + "=" * 60)
    print("最终建议")
    print("=" * 60)

    print("基于轮廓系数建议: k = 3 (轮廓系数最高: 0.619)")
    print("基于肘部法则: k = 2 (但轮廓系数较低: 0.614)")

    print("\n*** 强烈推荐选择: k = 3 ***")
    print("理由:")
    print("1. 轮廓系数0.619 > 0.614,聚类质量更好")
    print("2. 能够提供更丰富的物理信息")
    print("3. 避免过度简化降水微物理过程")

    return sse, silhouette_scores, data


def compare_k2_k3_k4(data):
    """
    对比2类、3类和4类聚类结果
    """
    print("\n" + "=" * 60)
    print("2类 vs 3类 vs 4类聚类结果对比")
    print("=" * 60)

    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)

    for n_clusters in [2, 3, 4]:
        print(f"\n{'-' * 30}")
        print(f"{n_clusters}类聚类结果")
        print(f"{'-' * 30}")

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(data_scaled)
        centers = scaler.inverse_transform(kmeans.cluster_centers_)

        print("聚类中心:")
        print("聚类\tR(mm/h)\tN(个/m³)\tW(g/m³)\tZ")
        for i in range(n_clusters):
            print(f"{i + 1}\t{centers[i][0]:.2f}\t{centers[i][1]:.0f}\t{centers[i][2]:.2f}\t{centers[i][3]:.0f}")

        # 样本分布
        unique, counts = np.unique(labels, return_counts=True)
        print("样本分布:")
        total = len(data)
        for cluster, count in zip(unique, counts):
            percentage = count / total * 100
            print(f"  聚类 {cluster + 1}: {count}个样本 ({percentage:.1f}%)")

        # 轮廓系数
        if n_clusters >= 2:
            silhouette_avg = silhouette_score(data_scaled, labels)
            print(f"轮廓系数: {silhouette_avg:.3f}")


def physical_interpretation_recommendation():
    """
    物理意义分析和最终推荐
    """
    print("\n" + "=" * 60)
    print("物理意义分析和最终推荐")
    print("=" * 60)

    print("\n基于统计分析和物理意义,推荐选择 k=3 的原因:")

    print("\n1. 统计证据:")
    print("   • 轮廓系数: k=3时最高(0.619)")
    print("   • k=2时轮廓系数较低(0.614)")
    print("   • k=4时轮廓系数下降(0.600)")

    print("\n2. 物理意义对比:")
    print("\n   k=2 (过度简化):")
    print("   • 只能区分'强降水'和'弱降水'")
    print("   • 丢失重要的微物理细节")

    print("\n   k=3 (最佳平衡):")
    print("   • 能够区分: 强对流、中等降水、弱降水")
    print("   • 保留关键物理信息")
    print("   • 模型复杂度适中")

    print("\n   k=4 (可能过度细分):")
    print("   • 轮廓系数下降,可能过度拟合")
    print("   • 某些类别样本数过少(如之前的5%)")

    print("\n3. 最终推荐: k = 3")
    print("   • 统计上最优")
    print("   • 物理意义明确")
    print("   • 适合科学研究和业务应用")


if __name__ == "__main__":
    # 修改为您的Excel文件路径
    excel_file_path = "G:/装备科/02-基金项目/2025/10-生态修复课题5/雨滴谱/典型草原雨滴物理量.xlsx"

    # 运行肘部法则分析
    sse, silhouette_scores, data = elbow_method_analysis(excel_file_path)

    # 对比2类、3类、4类结果
    compare_k2_k3_k4(data)

    # 物理意义分析和最终推荐
    physical_interpretation_recommendation()

    # 打印完整的数值结果
    print("\n" + "=" * 60)
    print("完整数值结果")
    print("=" * 60)
    print("k\tSSE\t\t轮廓系数\t推荐程度")
    for k in range(1, 8):
        if k == 1:
            print(f"{k}\t{sse[k - 1]:.2f}\t\t-\t\t-")
        else:
            score = silhouette_scores[k - 1]
            recommendation = "*** 推荐 ***" if k == 3 else "一般"
            print(f"{k}\t{sse[k - 1]:.2f}\t\t{score:.3f}\t\t{recommendation}")

结果:

G:\Softwares\anaconda3\envs\py39\python.exe G:/kuozhi/KANrain/zhoubufaze.py
正在读取Excel文件...
数据形状: (2040, 6)
正在进行数据标准化...
计算不同聚类数的SSE和轮廓系数...
k=1, SSE=8160.00
k=2, SSE=3767.50, 轮廓系数=0.614
k=3, SSE=2719.84, 轮廓系数=0.619
k=4, SSE=2005.77, 轮廓系数=0.600
k=5, SSE=1444.03, 轮廓系数=0.596
k=6, SSE=1102.22, 轮廓系数=0.535
k=7, SSE=883.07, 轮廓系数=0.546

============================================================
聚类数选择建议
============================================================

SSE下降分析:
k=1→2: SSE下降 53.8%
k=2→3: SSE下降 27.8%
k=3→4: SSE下降 26.3%
k=4→5: SSE下降 28.0%
k=5→6: SSE下降 23.7%
k=6→7: SSE下降 19.9%

轮廓系数分析:
轮廓系数最大的k值: 3 (系数=0.619)

============================================================
最终建议
============================================================
基于轮廓系数建议: k = 3 (轮廓系数最高: 0.619)
基于肘部法则: k = 2 (但轮廓系数较低: 0.614)

*** 强烈推荐选择: k = 3 ***
理由:
1. 轮廓系数0.619 > 0.614,聚类质量更好
2. 能够提供更丰富的物理信息
3. 避免过度简化降水微物理过程

============================================================
2类 vs 3类 vs 4类聚类结果对比
============================================================

------------------------------
2类聚类结果
------------------------------
聚类中心:
Traceback (most recent call last):
File "G:\kuozhi\KANrain\zhoubufaze.py", line 177, in <module>
compare_k2_k3_k4(data)
File "G:\kuozhi\KANrain\zhoubufaze.py", line 116, in compare_k2_k3_k4
print("聚类\tR(mm/h)\tN(个/m\xb3)\tW(g/m\xb3)\tZ")
UnicodeEncodeError: 'gbk' codec can't encode character '\xb3' in position 16: illegal multibyte sequence

Process finished with exit code 1

意思分为三类

K-means聚类分析

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import warnings

warnings.filterwarnings('ignore')


def spss_style_kmeans(data, n_clusters=3, max_iter=100, random_state=42):  # 改为3类
    """
    仿SPSS的K-means聚类分析
    """
    # 数据标准化
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)

    # 执行K-means聚类
    kmeans = KMeans(n_clusters=n_clusters, max_iter=max_iter,
                    random_state=random_state, n_init=10)
    labels = kmeans.fit_predict(data_scaled)

    # 反标准化得到最终聚类中心
    final_centers_original = scaler.inverse_transform(kmeans.cluster_centers_)

    return kmeans, labels, final_centers_original, scaler


def save_results_to_txt(data, kmeans, labels, final_centers, output_txt_path):
    """
    将所有结果保存到txt文件
    """
    with open(output_txt_path, 'w', encoding='utf-8') as f:
        # 1. 基本信息
        f.write("=" * 60 + "\n")
        f.write("K-MEANS聚类分析结果 (k=3)\n")  # 注明是3类
        f.write("=" * 60 + "\n\n")

        f.write(f"数据基本信息:\n")
        f.write(f"样本数量: {len(data)}\n")
        f.write(f"变量数量: {len(data.columns)}\n")
        f.write(f"变量名称: {list(data.columns)}\n")
        f.write(f"聚类数目: {len(np.unique(labels))} (基于肘部法则和轮廓系数确定)\n\n")  # 增加说明

        # 2. 最终聚类中心
        f.write("=" * 60 + "\n")
        f.write("最终聚类中心\n")
        f.write("=" * 60 + "\n")

        header = "聚类\t" + "\t".join(data.columns) + "\n"
        f.write(header)
        f.write("-" * 80 + "\n")

        for i in range(len(final_centers)):
            row_values = [f"{val:.6f}" for val in final_centers[i]]
            row = f"{i + 1}\t" + "\t".join(row_values) + "\n"
            f.write(row)

        f.write("\n")

        # 3. 聚类个案数目
        f.write("=" * 60 + "\n")
        f.write("每个聚类中的个案数目\n")
        f.write("=" * 60 + "\n")

        unique, counts = np.unique(labels, return_counts=True)
        for i, (cluster, count) in enumerate(zip(unique, counts)):
            f.write(f"聚类 {cluster + 1}\t{count}.000\n")

        f.write(f"有效\t{len(labels)}.000\n")
        f.write(f"缺失\t0.000\n\n")

        # 4. 聚类中心详细描述
        f.write("=" * 60 + "\n")
        f.write("聚类中心详细描述\n")
        f.write("=" * 60 + "\n")

        for i in range(len(final_centers)):
            f.write(f"\n聚类 {i + 1}:\n")
            for j, col in enumerate(data.columns):
                value = final_centers[i][j]
                f.write(f"  {col}: {value:.2f}\n")

        f.write("\n")

        # 5. 物理诊断建议
        f.write("=" * 60 + "\n")
        f.write("物理诊断建议\n")
        f.write("=" * 60 + "\n")

        # 计算每个变量的阈值
        thresholds = {}
        for j, col in enumerate(data.columns):
            values = final_centers[:, j]
            q1, q2, q3 = np.percentile(values, [25, 50, 75])
            thresholds[col] = {'low': q1, 'medium': q2, 'high': q3}

        for i in range(len(final_centers)):
            f.write(f"\n聚类 {i + 1} 物理特征:\n")
            for j, col in enumerate(data.columns):
                value = final_centers[i][j]
                if value <= thresholds[col]['low']:
                    level = ""
                elif value <= thresholds[col]['medium']:
                    level = "中低"
                elif value <= thresholds[col]['high']:
                    level = "中高"
                else:
                    level = ""
                f.write(f"  {col}: {level} ({value:.2f})\n")

        # 6. 增加物理意义解释
        f.write("\n" + "=" * 60 + "\n")
        f.write("物理意义解释\n")
        f.write("=" * 60 + "\n")

        # 根据聚类中心特征进行解释
        f.write("\n基于聚类中心的物理意义推断:\n")

        # 按降水强度排序聚类
        r_values = final_centers[:, 0]  # R列
        sorted_indices = np.argsort(r_values)  # 按R值排序

        f.write(f"聚类 {sorted_indices[0] + 1}: 弱降水/毛毛雨类型\n")
        f.write(f"  特征: 低降水强度,高数浓度,低雨水含量\n")
        f.write(f"  可能对应: 层云降水或微量降水过程\n\n")

        f.write(f"聚类 {sorted_indices[1] + 1}: 中等强度降水\n")
        f.write(f"  特征: 中等降水强度,中等数浓度,中等雨水含量\n")
        f.write(f"  可能对应: 组织化降水或混合性降水\n\n")

        f.write(f"聚类 {sorted_indices[2] + 1}: 强对流降水\n")
        f.write(f"  特征: 高降水强度,低数浓度,高雨水含量\n")
        f.write(f"  可能对应: 深对流云或强雷暴降水\n")

        # 7. 保存聚类结果到Excel
        output_excel_path = output_txt_path.replace('.txt', '_k3_clusters.xlsx')  # 修改文件名
        data_with_clusters = data.copy()
        data_with_clusters['Cluster'] = labels + 1
        data_with_clusters.to_excel(output_excel_path, index=False)

        f.write(f"\n\n完整数据与聚类标签已保存到: {output_excel_path}\n")


def main():
    """
    主函数:执行完整的K-means聚类分析
    """
    # 配置参数
    excel_file_path = "G:/装备科/02-基金项目/2025/10-生态修复课题5/雨滴谱/典型草原雨滴物理量.xlsx"  # 请修改为您的Excel文件路径
    output_txt_path = "G:/装备科/02-基金项目/2025/10-生态修复课题5/雨滴谱/典型草原_k3_clustering_results.txt"  # 修改输出文件名
    n_clusters = 3  # 改为3类

    try:
        # 读取Excel文件
        print("正在读取Excel文件...")
        data = pd.read_excel(excel_file_path)

        # 检查必需的列
        required_columns = ['R', 'N', 'W', 'Z']
        missing_columns = [col for col in required_columns if col not in data.columns]
        if missing_columns:
            print(f"错误: 缺少以下列: {missing_columns}")
            return

        # 选择需要的列
        data = data[required_columns]

        # 检查数据
        print(f"数据形状: {data.shape}")
        print(f"聚类数目: {n_clusters} (基于肘部法则确定)")

        # 执行K-means聚类
        print("正在进行K-means聚类分析...")
        kmeans, labels, final_centers, scaler = spss_style_kmeans(data, n_clusters=n_clusters)

        # 保存结果到txt文件
        print("正在保存结果到txt文件...")
        save_results_to_txt(data, kmeans, labels, final_centers, output_txt_path)

        print(f"\n聚类分析完成!")
        print(f"结果已保存到: {output_txt_path}")
        print(f"包含聚类标签的完整数据已保存到Excel文件")

        # 在控制台简单显示最终聚类中心
        print("\n最终聚类中心预览:")
        print("聚类\tR(mm/h)\tN(个/m³)\tW(g/m³)\tZ")
        for i in range(len(final_centers)):
            print(
                f"{i + 1}\t{final_centers[i][0]:.2f}\t{final_centers[i][1]:.0f}\t{final_centers[i][2]:.2f}\t{final_centers[i][3]:.0f}")

        # 显示样本分布
        unique, counts = np.unique(labels, return_counts=True)
        print("\n样本分布:")
        for cluster, count in zip(unique, counts):
            percentage = count / len(data) * 100
            print(f"聚类 {cluster + 1}: {count}个样本 ({percentage:.1f}%)")

    except FileNotFoundError:
        print(f"错误: 找不到文件 {excel_file_path}")
        print("请修改 excel_file_path 变量为您的Excel文件路径")
    except Exception as e:
        print(f"发生错误: {e}")


if __name__ == "__main__":
    main()

image

 

posted @ 2025-10-03 16:43  秋刀鱼CCC  Views(16)  Comments(0)    收藏  举报