各种回归预测北京雾霾PM2.5
北京雾霾数据集
数据集:Source/PRSA_data_2010.1.1-2014.12.31.csv at main · ziwenhahaha/Source (github.com)
导包
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
加载数据集
# Load the Beijing PM2.5 dataset (hourly air-quality + weather records,
# 2010-01-01 .. 2014-12-31).
data = pd.read_csv('./datasets/PRSA_data_2010.1.1-2014.12.31.csv')
# Notebook display of the first rows; has no effect when run as a script.
data.head()
特征工程
# Feature engineering
# 1) Drop columns with no predictive value here: the row id 'No' and the
#    time columns 'year', 'day', 'hour'.
data.drop(labels=['No','year','day','hour'],axis=1,inplace=True)
# 2) Drop rows with missing values (the original comment claimed NaN was
#    converted to 0, but dropna removes those rows entirely).
data = data.dropna(axis=0)
# 3) One-hot encode the categorical wind-direction column 'cbwd', then
#    replace the original column with the dummy columns.
one_hot_df = pd.get_dummies(data['cbwd'])
data = pd.concat((data,one_hot_df),axis=1).drop(labels='cbwd',axis=1)
把Target值取Log
# Log-transform the target to reduce the right skew of the PM2.5
# distribution. Rows where pm2.5 == 0 must be removed first, since
# log(0) = -inf would poison the regression metrics.
# (The original also evaluated data[data['pm2.5']==0].count() and
# discarded the result — a notebook display artifact, removed here.)
data = data.drop(data[data['pm2.5'] == 0].index)
data['pm2.5'] = np.log(data['pm2.5'])
特征、标签划分
# Split into feature matrix (everything except the target column) and target.
feature = data.loc[:,(data.columns != 'pm2.5')]
target = data['pm2.5']
#4) PCA (left disabled by the author)
#Project the features into a lower-dimensional space.
#n_components: a float keeps that fraction of explained variance,
#an int keeps that many components.
#pca = PCA(n_components=5)
#feature = pca.fit_transform(feature)
训练集和数据集划分
# NOTE(review): this cell is a verbatim duplicate of the feature/target
# split cell above; re-running it is harmless (idempotent), but one of the
# two copies should be removed.
feature = data.loc[:,(data.columns != 'pm2.5')]
target = data['pm2.5']
#4) PCA (left disabled by the author)
#Project the features into a lower-dimensional space.
#n_components: a float keeps that fraction of explained variance,
#an int keeps that many components.
#pca = PCA(n_components=5)
#feature = pca.fit_transform(feature)
# NOTE(review): numpy is imported here but np.log is already used in an
# earlier cell — in script (non-notebook) execution order this import
# comes too late and should live at the top of the file.
import numpy as np
# Min-max normalization (disabled by the author; the commented lines note
# it as "very important"):
# mm = MinMaxScaler()
# m_feature = mm.fit_transform(feature)
# m_target = mm.fit_transform(np.array(target).reshape(-1, 1))
# x_train,x_test,y_train,y_test = train_test_split(m_feature,m_target,test_size=0.2)
# 80/20 train/test split; random_state fixed for reproducible metrics.
x_train,x_test,y_train,y_test = train_test_split(feature,target,test_size=0.2,random_state=42)
下面开始重头戏!!
千万别眨眼
一、标准化的线性回归
# --- 1) Linear regression on standardized features ---
# The original used LinearRegression(normalize=True); the `normalize`
# parameter was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# crashes on current versions. Standardization is now done explicitly via
# a StandardScaler pipeline (fit on the training fold only, applied to the
# test fold at predict time — no leakage).
linner = make_pipeline(StandardScaler(), LinearRegression())
linner.fit(x_train, y_train)          # train
y_pred = linner.predict(x_test)       # predict on the held-out 20%
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("标准化的线性回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))
二、非标准化的线性回归
# --- 2) Plain (non-standardized) linear regression ---
# normalize=False was already the default and the parameter has been
# removed from scikit-learn (1.2), so no argument is passed.
linner = LinearRegression()
linner.fit(x_train, y_train)          # train
y_pred = linner.predict(x_test)       # predict on the held-out 20%
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("非标准化的线性回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))
三、Ridge回归
# --- 3) Ridge regression (L2-regularized linear model) ---
from sklearn.linear_model import Ridge

# The original fitted the model twice: once in the constructor chain
# (Ridge(...).fit(...)) and again on the next line. One fit is enough.
r = Ridge(alpha=0.9)
r.fit(x_train, y_train)               # train
y_pred = r.predict(x_test)            # predict on the held-out 20%
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("Ridge回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))
四、Logistic回归
from sklearn.linear_model import LogisticRegression  # NOTE: never actually used below

# --- 4) "Logistic regression" section ---
# BUG in the original: despite the import above, the model instantiated
# was LinearRegression, so this section silently repeats section 2 — which
# is why its printed metrics are byte-identical to the non-standardized
# linear run. LogisticRegression is a *classifier* and cannot fit the
# continuous log(pm2.5) target anyway, so plain linear regression is
# retained here; the section title is kept for output compatibility.
# (normalize=False was the default; the parameter was removed in
# scikit-learn 1.2 and is therefore no longer passed.)
l = LinearRegression()
l.fit(x_train, y_train)               # train
y_pred = l.predict(x_test)            # predict on the held-out 20%
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("Logistic回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))
最终结果
标准化的线性回归
MSE: 6324.29522726665 MAE: 57.54255918425556
R2: 0.23999003992539147
非标准化的线性回归
MSE: 6324.238665887426 MAE: 57.54113577957322
R2: 0.2399968370798603
Ridge回归
MSE: 6324.238818536449 MAE: 57.54117316366388
R2: 0.2399968187355589
Logistic回归
MSE: 6324.238665887426 MAE: 57.54113577957322
R2: 0.2399968370798603
结论
R² 仅约 0.24,说明线性模型只能解释 PM2.5 方差的四分之一左右——污染物浓度与气象特征之间的关系高度非线性,单纯的线性/岭回归明显欠拟合。另外注意:上文对目标取了对数,而此处记录的 MSE(数千)与 MAE(数十)量级对应的是原始 PM2.5 量纲,说明该结果来自一次未做对数变换的运行。后续可尝试树模型(随机森林、梯度提升)或加入滞后/时间特征来改进。

浙公网安备 33010602011771号