Linear Regression
An introduction to the linear regression algorithm
- Solves regression problems
- Conceptually simple and easy to implement
- The foundation of many powerful nonlinear models
- Produces results with good interpretability
- Embodies many ideas that are important throughout machine learning
Deriving the simple linear regression cost function
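A quick sketch of the standard least-squares derivation behind the code below. The model is $\hat{y} = ax + b$, and the cost function is the sum of squared residuals over the $m$ training samples:

$$J(a, b) = \sum_{i=1}^{m} \left( y^{(i)} - a x^{(i)} - b \right)^2$$

Setting $\partial J / \partial b = 0$ gives $b = \bar{y} - a\bar{x}$; substituting that into $\partial J / \partial a = 0$ and solving yields

$$a = \frac{\sum_{i=1}^{m} (x^{(i)} - \bar{x})(y^{(i)} - \bar{y})}{\sum_{i=1}^{m} (x^{(i)} - \bar{x})^2}, \qquad b = \bar{y} - a\bar{x}$$

which is exactly the closed-form solution the code computes.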

Implementing simple linear regression
import numpy as np
import matplotlib.pyplot as plt
x = np.array([1,2,3,4,5])
y = np.array([1,3,2,3,5])
x_mean = np.mean(x)
y_mean = np.mean(y)
a = np.sum((x - x_mean) * (y - y_mean)) / np.sum((x - x_mean) ** 2)
b = y_mean-a*x_mean
y_predict = a*x + b
print("a =",a,"\n","b = ",b)
print(y_predict)
plt.scatter(x,y)
plt.plot(x,y_predict)
Implementing SimpleLinearRegression ourselves
class SimpleLinearRegression:
    def __init__(self):
        """Initialize the simple linear regression model."""
        self.a_ = None
        self.b_ = None

    def fit(self, trainX, trainY):
        """Fit the model to the training data."""
        assert trainX.ndim == 1, "SimpleLinearRegression expects 1-D trainX"
        x_mean = np.mean(trainX)
        y_mean = np.mean(trainY)
        # vectorized closed-form solution from the derivation above
        self.a_ = np.dot(trainX - x_mean, trainY - y_mean) / np.sum(np.power(trainX - x_mean, 2))
        self.b_ = y_mean - self.a_ * x_mean
        return self

    def _predict(self, test_x):
        """Return the prediction for a single sample."""
        return self.a_ * test_x + self.b_

    def predict(self, testX):
        """Return the vector of predictions for a dataset."""
        return np.array([self._predict(i) for i in testX])
simpleLinearRegression = SimpleLinearRegression()
simpleLinearRegression.fit(x,y)
print(simpleLinearRegression.a_)
print(simpleLinearRegression.b_)
simpleLinearRegression.predict(x)
Metrics for evaluating linear regression
- Mean squared error (MSE)
- Root mean squared error (RMSE)
- Mean absolute error (MAE)
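With $m$ test samples, true values $y$ and predictions $\hat{y}$, these three metrics are defined as:

$$\mathrm{MSE} = \frac{1}{m}\sum_{i=1}^{m}\left(\hat{y}^{(i)} - y^{(i)}\right)^2, \qquad \mathrm{RMSE} = \sqrt{\mathrm{MSE}}, \qquad \mathrm{MAE} = \frac{1}{m}\sum_{i=1}^{m}\left|\hat{y}^{(i)} - y^{(i)}\right|$$

RMSE is in the same units as $y$, and MAE is less sensitive to outliers than MSE/RMSE.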
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

boston = datasets.load_boston()
x = boston.data[:,5]  # use only the room-count feature (RM)
y = boston.target
x = x[y<50]
y = y[y<50]
plt.scatter(x,y)
plt.xlim(2,10)
plt.ylim(0,60)
trainX,testX,trainY,testY = train_test_split(x,y,test_size=0.2)
simpleLinearRegression = SimpleLinearRegression()
simpleLinearRegression.fit(trainX,trainY)
train_y = simpleLinearRegression.predict(trainX)
test_y = simpleLinearRegression.predict(testX)
plt.subplot(1,2,1)
plt.scatter(trainX,trainY)
plt.plot(trainX,train_y)
plt.subplot(1,2,2)
plt.scatter(testX,testY)
plt.plot(testX,test_y)
MSE
mse = np.sum(np.power(np.subtract(test_y,testY),2))/test_y.shape[0]
mse
>>>40.95154236464736
RMSE
rmse = np.sqrt(mse)
rmse
>>>6.39933921312563
MAE
mae = np.sum(np.abs(test_y-testY))/test_y.shape[0]
mae
>>>4.399066068730565
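For comparison, sklearn.metrics provides these error metrics directly; the calls below should reproduce the hand-computed values:

from sklearn.metrics import mean_squared_error, mean_absolute_error

mean_squared_error(testY, test_y)   # matches mse above
mean_absolute_error(testY, test_y)  # matches mae above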
R Squared
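R² compares the model's squared error against that of the baseline that always predicts $\bar{y}$:

$$R^2 = 1 - \frac{\sum_{i}\left(\hat{y}^{(i)} - y^{(i)}\right)^2}{\sum_{i}\left(\bar{y} - y^{(i)}\right)^2} = 1 - \frac{\mathrm{MSE}(\hat{y}, y)}{\mathrm{Var}(y)}$$

which is the identity the code below exploits.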

r = 1-mse/np.var(testY)
r
>>>0.2541624531120106
from sklearn.metrics import r2_score
r2_score(testY,test_y)
>>>0.2541624531120107
Multiple linear regression
Pros: no normalization or feature scaling of the data is required
Cons: high time complexity (see the normal equation below)
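Writing $X_b$ for the training matrix with a prepended column of ones, the closed-form (normal equation) solution that minimizes the squared error is

$$\theta = \left(X_b^{T} X_b\right)^{-1} X_b^{T} y$$

Inverting the matrix for $n$ features costs on the order of $O(n^3)$, which is the time-complexity drawback noted above.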

Implementing the normal-equation LinearRegressor ourselves
from sklearn.metrics import r2_score

class LinearRegressor:
    def __init__(self):
        self.coef_ = None       # coefficient vector (feature weights)
        self.intercept_ = None  # intercept
        self._theta = None      # full θ vector, intercept first

    def fit(self, trainX, trainY):
        # prepend a column of ones so θ₀ acts as the intercept
        x = np.hstack((np.ones((trainX.shape[0],1)),trainX))
        # normal equation: θ = (XᵀX)⁻¹ Xᵀ y
        self._theta = np.linalg.inv(np.dot(x.T,x)).dot(x.T).dot(trainY)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, testX):
        x = np.hstack((np.ones((testX.shape[0],1)),testX))
        return np.dot(x,self._theta)

    def score(self, testX, testY):
        return r2_score(testY,self.predict(testX))
# reload the Boston dataset, this time keeping all 13 features
boston = datasets.load_boston()
x = boston.data
y = boston.target
x = x[y<50]
y = y[y<50]
trainX,testX,trainY,testY = train_test_split(x,y,test_size=0.2,random_state=666)
lin_reg = LinearRegressor()
lin_reg.fit(trainX,trainY)
print(lin_reg.predict(testX).shape)
print(lin_reg.coef_)
print(lin_reg.intercept_)
print(lin_reg.score(testX,testY))
>>>(98,)
>>>[-1.18919477e-01 3.63991462e-02 -3.56494193e-02 5.66737830e-02
-1.16195486e+01 3.42022185e+00 -2.31470282e-02 -1.19509560e+00
2.59339091e-01 -1.40112724e-02 -8.36521175e-01 7.92283639e-03
-3.81966137e-01]
>>>34.16143549624022
>>>0.8129802602658537
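One design note on the implementation above: explicitly inverting XᵀX with np.linalg.inv can be numerically unstable when features are nearly collinear. A more robust sketch, assuming the same trainX/trainY as above, solves the identical least-squares problem with numpy's built-in solver:

X_b = np.hstack((np.ones((trainX.shape[0], 1)), trainX))
# np.linalg.lstsq returns (solution, residuals, rank, singular values)
theta, residuals, rank, sv = np.linalg.lstsq(X_b, trainY, rcond=None)  # same θ as the normal equation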
Using LinearRegression from sklearn
boston = datasets.load_boston()
x = boston.data
y = boston.target
x = x[y<50]
y = y[y<50]
x.shape
>>>(490, 13)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
trainX,testX,trainY,testY = train_test_split(x,y,test_size=0.2,random_state=666)
lin_reg = LinearRegression()
lin_reg.fit(trainX,trainY)
print(lin_reg.predict(testX).shape)
print(lin_reg.coef_)
print(lin_reg.intercept_)
print(lin_reg.score(testX,testY))
>>>(98,)
>>>[-1.18919477e-01 3.63991462e-02 -3.56494193e-02 5.66737830e-02
-1.16195486e+01 3.42022185e+00 -2.31470282e-02 -1.19509560e+00
2.59339091e-01 -1.40112724e-02 -8.36521175e-01 7.92283639e-03
-3.81966137e-01]
>>>34.16143549624665
>>>0.8129802602658495
KNN Regressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

kneighborsRegressor = KNeighborsRegressor()
kneighborsRegressor.fit(trainX,trainY)
y_predict = kneighborsRegressor.predict(testX)
r2_score(testY,y_predict)
>>>0.5865412198300899
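KNeighborsRegressor is used with its defaults above, so the gap against linear regression is partly a hyperparameter issue. A sketch using sklearn's GridSearchCV (the parameter grid here is an illustrative assumption, not from the original notes):

from sklearn.model_selection import GridSearchCV

param_grid = [
    {"weights": ["uniform"], "n_neighbors": list(range(1, 11))},
    {"weights": ["distance"], "n_neighbors": list(range(1, 11)), "p": [1, 2, 3]},
]
# cross-validated search over the grid, then score the best model on the test set
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, n_jobs=-1)
grid_search.fit(trainX, trainY)
r2_score(testY, grid_search.best_estimator_.predict(testX))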
Further discussion of the linear regression model
- Strong interpretability: the sign of each coefficient shows whether a feature is positively or negatively correlated with the target, and its magnitude shows the size of the influence, which lets us collect data in a more targeted way (see the sketch after this list)
- It makes an assumption about the data: linearity
- When you get a new dataset, trying linear regression first never hurts.
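As a sketch of the interpretability point (assuming the boston dataset and the fitted lin_reg from the sections above are still in scope), sorting the coefficients lines each feature name up with the direction and size of its influence on the predicted price:

sorted_idx = np.argsort(lin_reg.coef_)
# feature names ordered from the most negative to the most positive coefficient
print(boston.feature_names[sorted_idx])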
