红酒质量评估:数据理解-模型可视化
描述:基于数据集,进行数据理解、清洗、预处理、分析与建模,通过线性回归模型预测红酒质量,并进行可视化展示。
一、数据理解
目标:了解数据集结构,查看质量评分分布
import pandas as pd
import matplotlib.pyplot as plt
def data_understanding(csv_path='red_wine_s.csv'):
    """Load the red-wine dataset and plot the distribution of quality scores.

    Args:
        csv_path: Path of the CSV file to load. Defaults to the original
            hard-coded 'red_wine_s.csv' so existing callers are unaffected.

    Returns:
        tuple: (df, features) — the raw DataFrame and the list of feature
        column names (Chinese labels matching the dataset's columns).
    """
    # Configure a CJK-capable font so the Chinese titles render correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False
    df = pd.read_csv(csv_path)
    features = ['固定酸度', '挥发性酸度', '柠檬酸', '残糖', '氯化物',
                '游离二氧化硫', '总二氧化硫', '密度', 'pH值', '硫酸盐', '酒精含量']
    # The last column is taken as the quality-score target — assumes the CSV
    # keeps the target in the final position (TODO: confirm for new files).
    target = df.columns[-1]
    plt.hist(df[target], bins=20, alpha=0.7, color='skyblue')
    plt.title('质量评分分布')
    plt.show()
    return df, features
二、数据清洗
目标:处理缺失值与异常数据,使用中位数填充数值型缺失值
from sklearn.impute import SimpleImputer
def data_cleaning(df):
    """Coerce every column to numeric and fill missing values with the median.

    Replaces the previous ``SimpleImputer`` approach, which silently drops a
    column that is entirely NaN and then crashes on the shape-mismatched
    write-back. Per-column pandas median fill is behavior-identical on
    normal data and simply leaves an all-NaN column as NaN.

    Args:
        df: Raw input DataFrame; values may be strings or contain gaps.

    Returns:
        pd.DataFrame: A cleaned copy with numeric dtypes and column medians
        substituted for missing values. The input is not modified.
    """
    df_clean = df.copy()
    # Force numeric dtype; unparseable entries become NaN for imputation.
    for col in df_clean.columns:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
    # Median imputation per numeric column (robust to outliers).
    numeric_cols = df_clean.select_dtypes(include='number').columns
    medians = df_clean[numeric_cols].median()
    df_clean[numeric_cols] = df_clean[numeric_cols].fillna(medians)
    return df_clean
三、数据预处理
目标:标准化特征,划分训练集与测试集
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
def data_preprocessing():
    """Standardize the cleaned features and split into train/test sets.

    Runs the understanding and cleaning steps, z-scores all feature columns,
    and performs a 70/30 split with a fixed random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, y_train, y_test, features) where the X
        arrays are standardized numpy arrays and the y values are the
        quality-score column.
    """
    raw_df, features = data_understanding()
    cleaned = data_cleaning(raw_df)
    # Last column is the target; everything before it is a feature.
    X = cleaned[cleaned.columns[:-1]]
    y = cleaned[cleaned.columns[-1]]
    # Zero-mean / unit-variance scaling so coefficients are comparable.
    standardized = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        standardized, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test, features
四、数据分析
目标:分析特征与目标变量关系,绘制相关性热力图
import seaborn as sns
def data_analysis():
    """Explore feature/target relationships on the training split.

    Shows a scatter plot of the first feature against the quality score,
    then a correlation heatmap over all standardized features plus the
    target column.

    Returns:
        tuple: (X_train, X_test, y_train, y_test, features), passed through
        unchanged for the downstream modeling step.
    """
    X_train, X_test, y_train, y_test, features = data_preprocessing()
    # Scatter: first feature vs. the quality score.
    plt.scatter(X_train[:, 0], y_train, alpha=0.5)
    plt.title(f'{features[0]}与质量评分关系')
    plt.show()
    # Heatmap of pairwise correlations, including the target.
    corr_frame = pd.DataFrame(X_train, columns=features)
    corr_frame['质量评分'] = y_train.values
    sns.heatmap(corr_frame.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('特征相关性')
    plt.show()
    return X_train, X_test, y_train, y_test, features
五、回归分析
目标:建立线性回归模型,评估模型性能(R²、MAE)
绘制预测值与真实值对比图、残差图
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
def regression_analysis():
    """Fit a linear regression on the training split and evaluate on test.

    Prints R² and mean absolute error, then shows a predicted-vs-actual
    scatter (with an ideal y = x reference line) and a residual plot.

    Returns:
        tuple: (model, X_test, y_test, y_pred, features) for the
        visualization step.
    """
    X_train, X_test, y_train, y_test, features = data_analysis()
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"模型性能: R² = {r2:.3f}, 平均绝对误差 = {mae:.3f}")
    # Predicted vs. actual. Derive the reference-line span from the data
    # instead of the previous hard-coded [3, 8], which silently assumed the
    # score range of this particular dataset.
    plt.scatter(y_test, y_pred, alpha=0.6)
    lo = min(y_test.min(), y_pred.min())
    hi = max(y_test.max(), y_pred.max())
    plt.plot([lo, hi], [lo, hi], 'r--')
    plt.title('预测值 vs 真实值')
    plt.show()
    # Residuals vs. predictions; a flat band around 0 suggests a decent fit.
    residuals = y_test - y_pred
    plt.scatter(y_pred, residuals, alpha=0.6, color='green')
    plt.axhline(0, color='red', linestyle='--')
    plt.title('残差分析图')
    plt.show()
    return model, X_test, y_test, y_pred, features
六、数据可视化
目标:展示特征重要性,三维特征空间可视化,预测值与真实值趋势对比
def data_visualization():
    """Visualize the fitted model.

    Produces three figures: absolute-coefficient feature importances, a 3-D
    scatter of the three strongest features colored by prediction, and an
    actual-vs-predicted trend comparison sorted by true score.

    Returns:
        pd.DataFrame: features with their absolute-coefficient importances,
        sorted descending.
    """
    model, X_test, y_test, y_pred, features = regression_analysis()
    # Rank features by the magnitude of their linear coefficients.
    importance = pd.DataFrame({
        'feature': features,
        'importance': abs(model.coef_)
    }).sort_values('importance', ascending=False)
    plt.barh(importance['feature'], importance['importance'])
    plt.title('特征重要性')
    plt.show()
    # 3-D scatter of the three most important features, colored by y_pred.
    top_features = importance['feature'].head(3).tolist()
    idx0, idx1, idx2 = (features.index(name) for name in top_features)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    points = ax.scatter(X_test[:, idx0],
                        X_test[:, idx1],
                        X_test[:, idx2],
                        c=y_pred, cmap='viridis')
    ax.set_xlabel(top_features[0])
    ax.set_ylabel(top_features[1])
    ax.set_zlabel(top_features[2])
    plt.colorbar(points, label='预测质量评分')
    plt.show()
    # Trend comparison with samples ordered by their true score.
    order = y_test.argsort()
    plt.plot(y_test.values[order], label='真实值')
    plt.plot(y_pred[order], label='预测值', linestyle='--')
    plt.title('预测值与真实值趋势对比')
    plt.legend()
    plt.show()
    return importance

浙公网安备 33010602011771号