"""Pandas data-cleaning example (pandas数据清洗示例)."""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 1. 加载数据
def load_data(file_path):
    """Load a dataset from a CSV or Excel file.

    The reader is chosen from the file extension, which is compared
    case-insensitively so that ``data.CSV`` is treated like ``data.csv``.

    Args:
        file_path: Location of the file, as a string or ``os.PathLike``
            object. Supported extensions are ``.csv``, ``.xlsx`` and ``.xls``.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the extension is not
        supported or reading fails. In the failure case the error is printed
        instead of being raised.
    """
    import os  # function-scope import; the module header does not import os

    try:
        # os.fspath accepts both str and Path-like inputs; lowering the
        # result makes the extension check case-insensitive.
        path_lower = os.fspath(file_path).lower()
        if path_lower.endswith(".csv"):
            df = pd.read_csv(file_path)
        elif path_lower.endswith((".xlsx", ".xls")):
            df = pd.read_excel(file_path)
        else:
            # Message reads "unsupported file format".
            raise ValueError("不支持的文件格式")
        return df
    except Exception as e:
        # Deliberate catch-all boundary: callers test the result against None
        # rather than handling exceptions themselves.
        print(f"加载数据时出错: {e}")
        return None


# 2. 数据探索
def explore_data(df):
    """Print a textual overview of ``df`` and plot where values are missing.

    The printed report covers, in order: shape, column dtypes, the first
    rows, descriptive statistics and the per-column count of missing values.
    A heatmap of the missing-value mask is then drawn and shown.

    Args:
        df: The ``DataFrame`` to inspect. It is only read, never modified.
    """
    # Headline line: dataset shape.
    print("数据集形状:", df.shape)

    # Remaining sections are computed lazily, one at a time, so each value is
    # produced immediately before it is printed.
    report_sections = (
        ("\n数据类型:\n", lambda: df.dtypes),  # column dtypes
        ("\n数据预览:\n", lambda: df.head()),  # preview of the first rows
        ("\n数据描述性统计:\n", lambda: df.describe()),  # summary statistics
        ("\n缺失值统计:\n", lambda: df.isnull().sum()),  # missing per column
    )
    for heading, compute in report_sections:
        print(heading, compute())

    # Heatmap of the boolean missing-value mask (title: "missing values").
    plt.figure(figsize=(10, 6))
    missing_mask = df.isnull()
    sns.heatmap(missing_mask, cbar=False, yticklabels=False, cmap="viridis")
    plt.title("缺失值可视化")
    plt.tight_layout()
    plt.show()


# 3. 处理缺失值
def handle_missing_values(df):
    """Fill missing values: column mean for numeric data, mode for object data.

    Args:
        df: Input ``DataFrame``. It is copied first and left unmodified.

    Returns:
        A new ``DataFrame`` in which missing entries of numeric columns are
        replaced by the column mean and missing entries of ``object`` columns
        by the column's most frequent value. Columns that consist only of
        missing values are left as they are, because no mean or mode exists.
    """
    df_clean = df.copy()

    # Numeric columns: fill with the mean.
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().any():
            mean_val = df_clean[col].mean()
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selection: the in-place call acts
            # on an intermediate object and does not reliably update the
            # frame (it is a no-op under pandas Copy-on-Write).
            df_clean[col] = df_clean[col].fillna(mean_val)

    # Object (categorical) columns: fill with the most frequent value.
    categorical_cols = df_clean.select_dtypes(include=["object"]).columns
    for col in categorical_cols:
        if df_clean[col].isnull().any():
            modes = df_clean[col].mode()
            if modes.empty:
                # Every value is missing, so there is nothing to fill with.
                continue
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    return df_clean


# 4. 处理异常值
def handle_outliers(df, numeric_cols=None):
    """Cap outliers at the IQR fences.

    For every selected column, the fences are ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``. Values outside them are replaced by the nearest
    fence; missing values are left untouched.

    Args:
        df: Input ``DataFrame``. It is copied first and left unmodified.
        numeric_cols: Columns to process. Defaults to all numeric columns.

    Returns:
        A new ``DataFrame`` with the selected columns capped. An integer
        column may come back as float when a fence is fractional.
    """
    df_clean = df.copy()

    if numeric_cols is None:
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # clip() builds a new column and upcasts when needed, so a fractional
        # fence can be applied to an integer column. Writing the fence through
        # .loc[mask, col] is an incompatible-dtype assignment in that case.
        df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)

    return df_clean


# 5. 数据标准化/归一化
def _safe_scale(scale):
    """Return ``scale``, or 1.0 when dividing by it is undefined (0 or NaN)."""
    if pd.isna(scale) or scale == 0:
        return 1.0
    return scale


def normalize_data(df, method="minmax", numeric_cols=None):
    """Rescale numeric columns with min-max normalization or z-scores.

    Args:
        df: Input ``DataFrame``. It is copied first and left unmodified.
        method: ``"minmax"`` maps each column onto ``[0, 1]`` using
            ``(x - min) / (max - min)``; ``"zscore"`` computes
            ``(x - mean) / std``.
        numeric_cols: Columns to rescale. Defaults to all numeric columns.

    Returns:
        A new ``DataFrame`` with the selected columns rescaled. A column with
        no spread (constant values, or a standard deviation that cannot be
        computed) maps to 0 rather than to NaN.

    Raises:
        ValueError: If ``method`` is neither ``"minmax"`` nor ``"zscore"``.
    """
    if method not in ("minmax", "zscore"):
        # Fail loudly instead of silently returning unscaled data.
        # Message reads "unsupported normalization method".
        raise ValueError(f"不支持的标准化方法: {method}")

    df_norm = df.copy()

    if numeric_cols is None:
        numeric_cols = df_norm.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        column = df_norm[col]
        if method == "minmax":
            offset = column.min()
            scale = column.max() - offset
        else:  # "zscore"
            offset = column.mean()
            scale = column.std()
        # A zero or NaN divisor would turn the whole column into NaN;
        # dividing by 1 leaves the centred values (all zero) in place.
        df_norm[col] = (column - offset) / _safe_scale(scale)

    return df_norm


# 6. 编码分类变量
def encode_categorical_variables(df):
    """One-hot encode every ``object`` column of ``df``.

    Args:
        df: Input ``DataFrame``. It is copied first and left unmodified.

    Returns:
        A ``DataFrame`` in which each ``object`` column is replaced by
        indicator columns from ``pd.get_dummies``. The first level of each
        encoded column is dropped (``drop_first=True``).
    """
    working = df.copy()
    object_columns = working.select_dtypes(include=["object"]).columns
    return pd.get_dummies(working, columns=object_columns, drop_first=True)


# 7. 特征工程
def feature_engineering(df):
    """Add example derived features built from the first two numeric columns.

    This is a demonstration only; real feature engineering depends on the
    dataset at hand.

    Args:
        df: Input ``DataFrame``. It is copied first and left unmodified.

    Returns:
        A new ``DataFrame`` with up to three extra columns:
        ``interaction`` (product of the first two numeric columns, added only
        when at least two exist) and ``<col>_squared`` for each of the first
        two numeric columns.
    """
    enriched = df.copy()

    # The numeric column list is captured before any column is added, so the
    # new features never feed back into the selection.
    numeric_names = enriched.select_dtypes(include=[np.number]).columns
    leading = numeric_names[:2]

    if len(numeric_names) >= 2:
        first, second = leading
        enriched["interaction"] = enriched[first] * enriched[second]

    for name in leading:
        enriched[f"{name}_squared"] = enriched[name] ** 2

    return enriched


# 8. 主函数:整合所有数据清洗步骤
def clean_data(file_path):
    """Run the full cleaning pipeline on the file at ``file_path``.

    Steps, in order: load, explore (prints and plots), fill missing values,
    cap outliers, one-hot encode, add example features, z-score normalize.

    Args:
        file_path: Path of the data file handed to ``load_data``.

    Returns:
        The normalized ``DataFrame``, or ``None`` when loading fails.
    """
    raw = load_data(file_path)
    if raw is None:
        return None

    # Exploration only reports on the raw data; it does not alter it.
    explore_data(raw)

    # Each stage takes a frame and returns the transformed frame.
    stages = (
        handle_missing_values,
        handle_outliers,
        encode_categorical_variables,
        feature_engineering,
    )
    frame = raw
    for stage in stages:
        frame = stage(frame)

    result = normalize_data(frame, method="zscore")

    # Message reads "data cleaning finished".
    print("\n数据清洗完成!")
    return result


# 使用示例
if __name__ == "__main__":
    # Replace this with the path to your own data file.
    file_path = "your_data_file.csv"
    output_path = "cleaned_data.csv"

    clean_df = clean_data(file_path)

    # clean_data returns None when loading fails; nothing is written then.
    if clean_df is not None:
        clean_df.to_csv(output_path, index=False)
        # Message reads "cleaned data saved to cleaned_data.csv".
        print("清洗后的数据已保存到 cleaned_data.csv")
# Source: blog post by 卓能文, posted 2025-05-16 08:38.