import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 1. 加载数据
def load_data(file_path):
    """Load a dataset from a CSV or Excel file into a DataFrame.

    The reader is chosen from the file extension. The extension is compared
    case-insensitively, so ``DATA.CSV`` or ``Report.XLSX`` are accepted, and
    ``file_path`` may be a ``str`` or a path-like object.

    Args:
        file_path: Location of the file to read (``.csv``, ``.xlsx``, ``.xls``).

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the extension is not
        supported or reading fails. Failures are printed, not raised.
    """
    try:
        # Normalise once: str() accepts pathlib.Path, lower() makes the
        # extension test case-insensitive. The original path is still what
        # gets passed to pandas.
        name = str(file_path).lower()
        if name.endswith(".csv"):
            return pd.read_csv(file_path)
        if name.endswith((".xlsx", ".xls")):
            return pd.read_excel(file_path)
        raise ValueError("不支持的文件格式")
    except Exception as e:
        # Deliberately broad: callers rely on a None return for any failure.
        print(f"加载数据时出错: {e}")
        return None
# 2. 数据探索
def explore_data(df):
    """Print a summary of ``df`` and plot a heatmap of its missing values.

    The printed report covers, in order: shape, column dtypes, the first
    rows, descriptive statistics, and the per-column count of missing values.
    A heatmap of the null mask is then drawn and shown.

    Args:
        df: The DataFrame to inspect. It is not modified.
    """
    # Each section is computed right before it is printed, so a failure in a
    # later section does not suppress the earlier output.
    report_sections = (
        ("数据集形状:", lambda: df.shape),
        ("\n数据类型:\n", lambda: df.dtypes),
        ("\n数据预览:\n", lambda: df.head()),
        ("\n数据描述性统计:\n", lambda: df.describe()),
        ("\n缺失值统计:\n", lambda: df.isnull().sum()),
    )
    for heading, compute in report_sections:
        print(heading, compute())

    # Heatmap of the null mask: one cell per value, coloured by missingness.
    plt.figure(figsize=(10, 6))
    null_mask = df.isnull()
    sns.heatmap(null_mask, cmap="viridis", cbar=False, yticklabels=False)
    plt.title("缺失值可视化")
    plt.tight_layout()
    plt.show()
# 3. 处理缺失值
def handle_missing_values(df):
    """Fill missing values: mean for numeric columns, mode for categorical ones.

    Numeric columns are filled with their column mean. Columns of dtype
    ``object``, ``string`` or ``category`` are filled with their most frequent
    value. A categorical column with no non-missing value has no mode and is
    left unchanged.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the missing values filled.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_clean = df.copy()

    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_clean[col].isnull().any():
            # Assign the result back. Calling fillna(inplace=True) on the
            # selected column is a chained update that may act on a temporary
            # and leave df_clean unchanged.
            df_clean[col] = df_clean[col].fillna(df_clean[col].mean())

    categorical_cols = df_clean.select_dtypes(
        include=["object", "string", "category"]
    ).columns
    for col in categorical_cols:
        if not df_clean[col].isnull().any():
            continue
        modes = df_clean[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to fill with.
            continue
        df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    return df_clean
# 4. 处理异常值
def handle_outliers(df, numeric_cols=None, factor=1.5):
    """Cap outliers at the IQR fences.

    For each selected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``. Values outside the fences are replaced by the
    nearest fence; missing values stay missing.

    Args:
        df: Input DataFrame. It is not modified.
        numeric_cols: Columns to process. Defaults to all numeric columns.
        factor: Multiple of the IQR used for the fences (default 1.5).

    Returns:
        A new DataFrame with the selected columns clipped to their fences.
    """
    df_clean = df.copy()
    if numeric_cols is None:
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        margin = factor * (q3 - q1)
        # clip() builds a new Series and the column is replaced as a whole, so
        # an integer column can take a fractional fence. Writing the fence
        # into the existing integer column through .loc is an incompatible-
        # dtype assignment instead.
        df_clean[col] = df_clean[col].clip(lower=q1 - margin, upper=q3 + margin)

    return df_clean
# 5. 数据标准化/归一化
def normalize_data(df, method="minmax", numeric_cols=None):
    """Rescale numeric columns with min-max or z-score normalisation.

    ``"minmax"`` maps each column to ``(x - min) / (max - min)``.
    ``"zscore"`` maps each column to ``(x - mean) / std``, using pandas'
    default sample standard deviation.

    A column with at most one distinct non-missing value has no spread, so
    dividing would yield NaN for every row. Such a column is set to 0.0
    instead, with missing entries kept missing.

    Args:
        df: Input DataFrame. It is not modified.
        method: ``"minmax"`` (default) or ``"zscore"``.
        numeric_cols: Columns to rescale. Defaults to all numeric columns.

    Returns:
        A new DataFrame with the selected columns rescaled.

    Raises:
        ValueError: If ``method`` is not ``"minmax"`` or ``"zscore"``.
    """
    # Validate up front: an unrecognised method would otherwise fall through
    # and return the data unscaled without any signal.
    if method not in ("minmax", "zscore"):
        raise ValueError(f"Unsupported normalization method: {method!r}")

    df_norm = df.copy()
    if numeric_cols is None:
        numeric_cols = df_norm.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        values = df_norm[col]

        if values.nunique(dropna=True) <= 1:
            # Zero spread: avoid the 0/0 division, keep NaN where it was.
            df_norm[col] = pd.Series(
                np.where(values.isna(), np.nan, 0.0), index=values.index
            )
            continue

        if method == "minmax":
            offset = values.min()
            scale = values.max() - offset
        else:
            offset = values.mean()
            scale = values.std()
        df_norm[col] = (values - offset) / scale

    return df_norm
# 6. 编码分类变量
def encode_categorical_variables(df):
    """One-hot encode the categorical columns of ``df``.

    Columns of dtype ``object``, ``string`` or ``category`` are replaced by
    indicator columns via ``pandas.get_dummies``. The first level of each
    column is dropped (``drop_first=True``). All other columns are carried
    over unchanged.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the categorical columns replaced by indicators.
    """
    # Select every dtype that can hold text or categories, not only object,
    # so string- and category-typed columns are encoded as well.
    categorical_cols = df.select_dtypes(
        include=["object", "string", "category"]
    ).columns
    if len(categorical_cols) == 0:
        # Nothing to encode; still hand back an independent frame.
        return df.copy()

    # get_dummies builds a new frame, so no defensive copy is needed here.
    return pd.get_dummies(df, columns=list(categorical_cols), drop_first=True)
# 7. 特征工程
def feature_engineering(df):
    """Add example derived features built from the first two numeric columns.

    This is a template; real feature engineering depends on the dataset.
    With at least two numeric columns, an ``interaction`` column holds the
    product of the first two. Each of the first two numeric columns (or the
    only one, if there is just one) also gets a ``<name>_squared`` column.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame containing the original columns plus the new features.
    """
    result = df.copy()

    # Only the leading (at most two) numeric columns take part.
    leading = list(result.select_dtypes(include=[np.number]).columns[:2])

    # Interaction term needs a pair of columns.
    if len(leading) == 2:
        first, second = leading
        result["interaction"] = result[first] * result[second]

    # Polynomial (degree-2) terms.
    for name in leading:
        result[f"{name}_squared"] = result[name] ** 2

    return result
# 8. 主函数:整合所有数据清洗步骤
def clean_data(file_path):
    """Run the full cleaning pipeline on the file at ``file_path``.

    Steps, in order: load, explore (prints and plots), fill missing values,
    cap outliers, one-hot encode categorical columns, add derived features,
    and z-score normalise.

    Args:
        file_path: Path of the data file handed to ``load_data``.

    Returns:
        The cleaned DataFrame, or ``None`` if the file could not be loaded.
    """
    frame = load_data(file_path)
    if frame is None:
        return None

    # Exploration only reports on the raw data; it does not transform it.
    explore_data(frame)

    # Each stage takes a DataFrame and returns a new one, so they chain.
    stages = (
        handle_missing_values,
        handle_outliers,
        encode_categorical_variables,
        feature_engineering,
    )
    for stage in stages:
        frame = stage(frame)

    normalized = normalize_data(frame, method="zscore")
    print("\n数据清洗完成!")
    return normalized
# 使用示例
# Example usage
if __name__ == "__main__":
    # Replace with the path to your own data file.
    file_path = "your_data_file.csv"
    output_path = "cleaned_data.csv"

    clean_df = clean_data(file_path)
    # clean_data returns None when loading failed; only save a real result.
    if clean_df is not None:
        clean_df.to_csv(output_path, index=False)
        print("清洗后的数据已保存到 cleaned_data.csv")