LinearRegression的一些简单sample
数据集:Source/house.xlsx at main · ziwenhahaha/Source (github.com)
房价预测
导包
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as r2
读数据,切分数据
# Load the housing dataset and split it into train/test sets.
data = pd.read_excel('./datasets/house.xlsx', engine='openpyxl')
# The 'No' column is just a row counter — drop it in place.
data.drop(columns='No', inplace=True)
target = data['Y house price of unit area']
feature = data.loc[:, data.columns != 'Y house price of unit area']
# Fixed seed so every cell in this post splits the data identically.
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, test_size=0.2, random_state=2021)
开始训练
# Train a plain ordinary-least-squares model.
# FIX: the original passed `normalize=True`; that parameter was deprecated
# in scikit-learn 1.0 and removed in 1.2, so the call crashes on current
# versions.  For OLS the fitted predictions are identical without it — if
# feature scaling is really wanted, use StandardScaler in a Pipeline.
linner = LinearRegression()
linner.fit(x_train, y_train)          # fit on the training split
y_pred = linner.predict(x_test)       # predict on the held-out split

# Test-set evaluation: MSE, target range (for context), and R^2.
print(MSE(y_test, y_pred))
print(y_test.min(), y_test.max())
print(r2(y_test, y_pred))

# Train-set evaluation: the model has already seen these rows, so R^2 is
# closer to 1 than on the test set (MSE alone can be misleading here).
y_pred_train = linner.predict(x_train)
print(MSE(y_train, y_pred_train))
print(r2(y_train, y_pred_train))
房价预测-PolynomialFeatures(增加平方样本、立方样本)
# Degree-2 polynomial-feature experiment.
from sklearn.preprocessing import PolynomialFeatures

# Expand features with all degree-2 terms; no bias column, because
# LinearRegression fits its own intercept.
p = PolynomialFeatures(degree=2, include_bias=False)
target = data['Y house price of unit area']
feature = data.loc[:, data.columns != 'Y house price of unit area']
# After the transform the matrix holds the original columns plus every
# squared/cross term.
p_feature = p.fit_transform(feature)
# NOTE: the split is done on the *expanded* matrix, same seed as before.
x_train, x_test, y_train, y_test = train_test_split(
    p_feature, target, test_size=0.2, random_state=2021)
linner = LinearRegression()
linner.fit(x_train, y_train)
# Train-set metrics first, then test-set metrics.
y_pred_train = linner.predict(x_train)
print(MSE(y_train, y_pred_train), r2(y_train, y_pred_train))
y_pred_test = linner.predict(x_test)
print(MSE(y_test, y_pred_test), r2(y_test, y_pred_test))
# Degree-3 polynomial-feature experiment.
# FIX: the original cell was a verbatim copy-paste of the degree-2 cell
# with only `degree` changed — the shared steps are factored into a helper.
from sklearn.preprocessing import PolynomialFeatures

def _fit_poly_and_report(frame, degree):
    """Expand features to `degree`-order polynomials, fit OLS, print
    train then test (MSE, R^2), and return the model plus the splits."""
    feats = frame.loc[:, frame.columns != 'Y house price of unit area']
    labels = frame['Y house price of unit area']
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    expanded = expander.fit_transform(feats)
    # Split the expanded matrix with the same seed as the other cells.
    tr_x, te_x, tr_y, te_y = train_test_split(
        expanded, labels, test_size=0.2, random_state=2021)
    model = LinearRegression().fit(tr_x, tr_y)
    pred_tr = model.predict(tr_x)
    print(MSE(tr_y, pred_tr), r2(tr_y, pred_tr))
    pred_te = model.predict(te_x)
    print(MSE(te_y, pred_te), r2(te_y, pred_te))
    return model, (tr_x, te_x, tr_y, te_y)

# Keep the same module-level names the rest of the script expects.
linner, (x_train, x_test, y_train, y_test) = _fit_poly_and_report(data, degree=3)
可视化高次项特征的影响
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
# Toy training data: house size (feature) vs. price (target).
x_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
plt.scatter(x_train, y_train)
# FIX: the original reused `linner`, which at this point was last fitted
# on the degree-3 polynomial feature matrix (many columns) — predicting
# on a single-column input would raise a shape-mismatch error.  Fit a
# fresh simple linear model on the toy data first.
linner = LinearRegression().fit(x_train, y_train)
# Dense grid of x values, predicted to draw the fitted straight line.
x = np.linspace(5, 18, num=100)
y = linner.predict(x.reshape((-1, 1)))
# Train-set metrics for the toy fit.
y_pred = linner.predict(x_train)
print(MSE(y_train, y_pred), r2(y_train, y_pred))
plt.scatter(x, y, s=0.5)          # fitted line (as dense points)
plt.scatter(x_train, y_train)     # raw training points
# Degree-2 feature expansion on the toy data.
p = PolynomialFeatures(degree=2, include_bias=False)
# Fit the expander on the training points and expand them.
xx_train = p.fit_transform(x_train)
print(xx_train.shape)
linner = LinearRegression().fit(xx_train, y_train)  # train on expanded features
# Dense grid of x values used to draw the fitted curve.
x = np.linspace(5, 18, num=100).reshape(-1, 1)
# FIX: use transform() here — the expander is already fitted on the
# training data.  (For PolynomialFeatures the result happens to be the
# same either way, but fit_transform on new data misstates the intent.)
x_2 = p.transform(x)
print(x.shape)    # grid shape
print(x_2.shape)  # expanded grid shape
y_2_pred = linner.predict(x_2).reshape(1, -1)
print(y_2_pred.shape)
plt.scatter(x, y, s=0.5)                          # straight-line fit from the previous cell
plt.scatter(x_train, y_train)                     # raw training points
plt.scatter(x.reshape(1, -1), y_2_pred, s=0.5)    # degree-2 curve
Ridge回归
from sklearn.linear_model import Ridge
# Ridge = OLS plus an L2 penalty; alpha controls the regularization strength.
r = Ridge(alpha=0.9)
r.fit(x_train, y_train)
模型的保存
import pickle

# Persist the trained model to disk, then load it back to show the
# round trip works.  NOTE: pickle is only safe for files you created
# yourself — never unpickle untrusted data.
model_path = './123.pkl'
with open(model_path, 'wb') as out_file:
    pickle.dump(linner, out_file)
with open(model_path, 'rb') as in_file:
    linner = pickle.load(in_file)
linner.coef_  # learned coefficients of the restored model
