Loading

各种回归预测北京雾霾PM2.5

北京雾霾数据集

数据集:GitHub 仓库 ziwenhahaha/Source 中的 PRSA_data_2010.1.1-2014.12.31.csv

导包

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

加载数据集

# Load the Beijing PM2.5 dataset (hourly records, 2010-01-01 .. 2014-12-31).
data = pd.read_csv('./datasets/PRSA_data_2010.1.1-2014.12.31.csv')
# Preview the first rows (notebook display; return value intentionally unused).
data.head()

特征工程

# Feature engineering
# 1. Drop the row index plus coarse time columns (month is kept as a feature).
data = data.drop(columns=['No', 'year', 'day', 'hour'])

# 2. Remove rows containing missing values (pm2.5 has NaNs in this dataset).
data = data.dropna()

# 3. Replace the categorical wind-direction column (cbwd) with one-hot
#    indicator columns; the dummies end up appended after the numeric columns.
wind_dummies = pd.get_dummies(data['cbwd'])
data = data.drop(columns='cbwd').join(wind_dummies)

把Target值取Log

# Log-transform the target to reduce its heavy right skew.
# log(0) is undefined, so zero readings must be dropped first.
# FIX: `np` was used here before `import numpy as np` appeared later in the
# file; numpy is now imported at the top of the file.
data[data['pm2.5'] == 0].count()  # inspect how many zero rows exist (notebook display)
data = data.drop(data[data['pm2.5'] == 0].index)
data['pm2.5'] = np.log(data['pm2.5'])

特征、标签划分

# Split the frame into predictors and target (all columns except pm2.5).
feature = data.drop(columns='pm2.5')
target = data['pm2.5']

# 4. Optional PCA dimensionality reduction (kept disabled, as published).
#    n_components: float = fraction of variance to keep, int = component count.
# pca = PCA(n_components=5)
# feature = pca.fit_transform(feature)

训练集和数据集划分

# Train/test split.
# FIX: this cell originally (a) duplicated the feature/target declarations of
# the previous cell and (b) contained a misplaced mid-file `import numpy as np`
# — too late for the np.log call above it. numpy is now imported at the top.
feature = data.drop(columns='pm2.5')
target = data['pm2.5']

# Optional steps, kept disabled as in the original:
#   PCA:     pca = PCA(n_components=5); feature = pca.fit_transform(feature)
#   Scaling: mm = MinMaxScaler(); feature = mm.fit_transform(feature)
#            target = mm.fit_transform(np.array(target).reshape(-1, 1))

# Hold out 20% of the rows; fixed random_state makes the metrics reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.2, random_state=42)

下面开始重头戏!!

千万别眨眼

一、标准化的线性回归

# 1. Linear regression on standardized features.
# FIX: LinearRegression's `normalize` argument was deprecated in scikit-learn
# 1.0 and removed in 1.2 (it raises TypeError on modern versions). The
# supported replacement is explicit scaling via a pipeline.
linner = make_pipeline(StandardScaler(), LinearRegression())
linner.fit(x_train, y_train)        # fit on the training split
y_pred = linner.predict(x_test)     # predict the held-out split
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("标准化的线性回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))

二、非标准化的线性回归

# 2. Linear regression on raw (non-standardized) features.
# FIX: the `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2; False was its default, so omitting it preserves behavior.
linner = LinearRegression()
linner.fit(x_train, y_train)        # fit on the training split
y_pred = linner.predict(x_test)     # predict the held-out split
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("非标准化的线性回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))

三、Ridge回归

from sklearn.linear_model import Ridge

# 3. Ridge regression (L2-regularized least squares).
# FIX: the original fitted the model twice — once via `.fit()` chained on the
# constructor and again explicitly. One fit is enough; the second was a no-op
# that doubled training time.
ridge = Ridge(alpha=0.9)
ridge.fit(x_train, y_train)
y_pred = ridge.predict(x_test)
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("Ridge回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))

四、Logistic回归

from sklearn.linear_model import LogisticRegression  # imported but never used — see note

# 4. "Logistic regression" section.
# NOTE(review): the original imports LogisticRegression but actually trains
# LinearRegression, which is why this section's published metrics are
# byte-identical to the non-standardized linear regression above.
# LogisticRegression is a *classifier* and cannot fit the continuous pm2.5
# target, so LinearRegression is kept here; the section label is misleading.
# FIX: the removed `normalize` argument is dropped (False was the default).
l = LinearRegression()
l.fit(x_train, y_train)
y_pred = l.predict(x_test)
mse = MSE(y_test, y_pred)
mae = MAE(y_test, y_pred)
print("Logistic回归")
print("MSE:", mse, "MAE:", mae)
print("R2:", r2(y_test, y_pred))

最终结果

标准化的线性回归
MSE: 6324.29522726665 MAE: 57.54255918425556
R2: 0.23999003992539147
非标准化的线性回归
MSE: 6324.238665887426 MAE: 57.54113577957322
R2: 0.2399968370798603
Ridge回归
MSE: 6324.238818536449 MAE: 57.54117316366388
R2: 0.2399968187355589
Logistic回归
MSE: 6324.238665887426 MAE: 57.54113577957322
R2: 0.2399968370798603

结论

R² 只有约 0.24,说明仅靠这些气象特征的线性模型对 PM2.5 的解释能力有限——可以尝试加入滞后特征(前几小时的 PM2.5)或非线性模型。另外,结果中 MSE 高达 6324,与已做过 log 变换的目标量级不符,疑似指标是在未做 log 变换的数据上计算的,值得复查。

posted @ 2021-09-03 21:28  my-island  阅读(470)  评论(0)    收藏  举报