线性回归与最小二乘法 python实现
1 以简单线性回归为例

示例代码:
import numpy as np


class SimpleLinearRegressional:
    """Single-feature linear regression y = a*x + b, fitted by least squares."""

    def __init__(self):
        # Slope (a) and intercept (b); both are None until fit() is called.
        self.a = None
        self.b = None

    def fit(self, x_train, y_train):
        """Fit the model from 1-D training arrays.

        Parameters: x_train, y_train -- 1-D numpy arrays of equal length.
        Returns self so calls can be chained.
        Raises ValueError on shape mismatch (instead of assert, which is
        stripped under ``python -O``).
        """
        if x_train.ndim != 1:
            raise ValueError("simple linear regression handles exactly one feature")
        if len(x_train) != len(y_train):
            raise ValueError("x_train and y_train must have the same length")
        x_mean, y_mean = np.mean(x_train), np.mean(y_train)
        # Closed-form least squares: a = cov(x, y) / var(x), written as
        # dot products so the whole computation is vectorized.
        num = (x_train - x_mean).dot(y_train - y_mean)
        d = (x_train - x_mean).dot(x_train - x_mean)
        self.a = num / d
        self.b = y_mean - self.a * x_mean
        return self

    def predict(self, x_predict):
        """Return predictions for a 1-D array of inputs (vectorized,
        replacing the original per-element Python loop)."""
        return self.a * np.asarray(x_predict) + self.b

    def _predict(self, x_single):
        """Predict the target value for a single input."""
        return self.b + self.a * x_single


def main():
    """Demo: fit noisy samples of y = 2x + 3 and plot the fitted line."""
    # Local imports keep the module importable without plotting /
    # scikit-learn installed; they are only needed for this demo.
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split

    m = 1000
    big_x = np.random.random(size=m)
    big_y = big_x * 2 + 3.0 + np.random.normal(size=m)
    x_train, x_test, y_train, y_test = train_test_split(
        big_x, big_y, test_size=0.3, random_state=42)
    reg1 = SimpleLinearRegressional()
    reg1.fit(x_train, y_train)
    plt.plot(big_x, reg1.predict(big_x), label='fitted-curve')
    plt.show()


if __name__ == "__main__":
    main()
输出结果:

2 多元线性回归



多元回归代码实现:
预测数据是 x_b.dot(theta),其中 x_b 表示在原始特征矩阵 x 前加一列全 1,theta 表示多元回归的拟合系数向量(第一个分量为截距,其余为各特征的系数)
import numpy as np


class LinearRegression:
    """Multivariate linear regression fitted by ordinary least squares."""

    def __init__(self):
        self.coeff_ = None         # per-feature weights, shape (n_features,)
        self.interception_ = None  # intercept term (name kept for API compat)
        self._theta = None         # full parameter vector [intercept, *coeffs]

    def fit(self, x_train, y_train):
        """Fit the model on x_train (n_samples, n_features) and y_train (n_samples,).

        Returns self. Raises ValueError when the sample counts disagree.
        """
        if x_train.shape[0] != y_train.shape[0]:
            raise ValueError("x_train and y_train must have the same number of rows")
        # Prepend a column of ones so the intercept is theta[0].
        x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        # lstsq solves the least-squares problem directly; it is numerically
        # safer than explicitly inverting X^T X and also handles rank-deficient X.
        self._theta, *_ = np.linalg.lstsq(x_b, y_train, rcond=None)
        self.interception_ = self._theta[0]
        self.coeff_ = self._theta[1:]
        return self

    def predict(self, x_predict):
        """Return predictions for x_predict of shape (n_samples, n_features)."""
        if self.interception_ is None or self.coeff_ is None:
            raise ValueError("fit() must be called before predict()")
        if x_predict.shape[1] != len(self.coeff_):
            raise ValueError("x_predict has a different feature count than the training data")
        x_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        return x_b.dot(self._theta)


def main():
    """Demo on the Boston housing data, printing the fitted parameters."""
    # NOTE(review): load_boston was removed from scikit-learn 1.2+; this demo
    # only runs on older versions -- confirm the installed sklearn version.
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    boston = datasets.load_boston()
    x = boston.data
    y = boston.target
    # Drop samples whose target was capped at 50 in the original survey.
    x = x[y < 50]
    y = y[y < 50]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=42)
    reg = LinearRegression()
    reg.fit(x_train, y_train)
    print(reg.coeff_)
    print(reg.interception_)


if __name__ == "__main__":
    main()
输出系数:
[-1.23818781e-01 4.03899117e-02 -4.63364280e-02 -2.99732398e-02 -1.46880633e+01 3.33324672e+00 -2.12948682e-02 -1.38818508e+00 2.31608778e-01 -1.24333203e-02 -8.57628626e-01 6.89841247e-03 -3.75313011e-01] 37.55993342611868
若通过 from scipy.optimize import leastsq 用最小二乘法来实现多项式拟合(此时拟合的不再是线性回归模型),则需要显式指定多项式的次数。以下以拟合正弦函数为例:
"""Fit polynomials of increasing degree to noisy samples of y = sin(2*pi*x)
with scipy's least-squares solver, illustrating underfitting vs overfitting."""
import numpy as np
from scipy.optimize import leastsq


def real_func(x):
    """Target function: sin(2*pi*x)."""
    return np.sin(2 * np.pi * x)


def fit_func(p, x):
    """Evaluate the polynomial with coefficients p (highest degree first) at x."""
    return np.poly1d(p)(x)


def residuals_func(p, x, y):
    """Residuals between the polynomial model and the observations."""
    return fit_func(p, x) - y


# Ten sample points, plus a dense grid used only for plotting.
x = np.linspace(0, 1, 10)
x_points = np.linspace(0, 1, 1000)
# Observations: target values corrupted with Gaussian noise (sigma = 0.1).
y_ = real_func(x)
y = [np.random.normal(0, 0.1) + y1 for y1 in y_]


def fitting(M=0):
    """Fit a degree-M polynomial to the noisy samples and plot the result.

    Returns the full tuple from scipy.optimize.leastsq; element 0 holds
    the fitted coefficient array.
    """
    import matplotlib.pyplot as plt  # local: only needed for the demo plot

    # Random starting point for the M + 1 polynomial coefficients.
    p_init = np.random.rand(M + 1)
    # args packs the extra (non-p) arguments of the residual function.
    p_lsq = leastsq(residuals_func, p_init, args=(x, y))
    print('Fitting Parameters:', p_lsq[0])
    plt.plot(x_points, real_func(x_points), label='real')
    plt.plot(x_points, fit_func(p_lsq[0], x_points), label='fitted curve')
    plt.plot(x, y, 'bo', label='noise')
    plt.legend()
    return p_lsq


if __name__ == "__main__":
    import matplotlib.pyplot as plt

    p_lsq_0 = fitting(M=0)
    p_lsq_1 = fitting(M=1)
    p_lsq_3 = fitting(M=3)
    # Degree 9 has as many parameters as sample points: it overfits.
    p_lsq_9 = fitting(M=9)
    plt.show()

如果需要在最小二乘法中加入正则项以防止过拟合,需要先设定一个正则化系数(如 regularization = 0.0001);初始参数 p_init 仍然是 M+1 个多项式系数,数量不变。
需要修改的是优化目标函数(残差函数):在其返回的残差末尾追加正则项对应的残差分量。
"""Fit a degree-9 polynomial to noisy samples of y = sin(2*pi*x) both with
and without an L2 regularization term, using scipy's least-squares solver."""
import numpy as np
from scipy.optimize import leastsq


def real_func(x):
    """Target function: sin(2*pi*x)."""
    return np.sin(2 * np.pi * x)


def fit_func(p, x):
    """Evaluate the polynomial with coefficients p (highest degree first) at x."""
    return np.poly1d(p)(x)


def residuals_func(p, x, y):
    """Residuals between the polynomial model and the observations."""
    return fit_func(p, x) - y


# Ten sample points, plus a dense grid used only for plotting.
x = np.linspace(0, 1, 10)
x_points = np.linspace(0, 1, 1000)
# Observations: target values corrupted with Gaussian noise (sigma = 0.1).
y_ = real_func(x)
y = [np.random.normal(0, 0.1) + y1 for y1 in y_]

# Weight of the L2 penalty on the polynomial coefficients.
regularization = 0.0001


def residuals_func_regularization(p, x, y):
    """Residuals extended with sqrt(0.5 * lambda) * |p_i| terms, so that the
    squared loss minimized by leastsq gains an L2 penalty on p."""
    ret = fit_func(p, x) - y
    ret = np.append(ret, np.sqrt(0.5 * regularization * np.square(p)))
    return ret


def fitting(M=0):
    """Fit a degree-M polynomial (unregularized) to the noisy samples.

    Returns the full tuple from scipy.optimize.leastsq; element 0 holds
    the fitted coefficient array.
    """
    # Random starting point for the M + 1 polynomial coefficients.
    p_init = np.random.rand(M + 1)
    # args packs the extra (non-p) arguments of the residual function.
    p_lsq = leastsq(residuals_func, p_init, args=(x, y))
    print('Fitting Parameters:', p_lsq[0])
    return p_lsq


def main():
    """Compare the overfitted degree-9 fit against its regularized version."""
    import matplotlib.pyplot as plt  # local: only needed for the demo plot

    # Degree 9 has as many parameters as sample points: it overfits.
    p_lsq_9 = fitting(M=9)

    # Regularized fit of the same degree-9 polynomial; p_init is still the
    # 10 polynomial coefficients -- only the residual function changes.
    p_init = np.random.rand(9 + 1)
    p_lsq_regularization = leastsq(
        residuals_func_regularization, p_init, args=(x, y))

    plt.plot(x_points, real_func(x_points), label='real')
    plt.plot(x_points, fit_func(p_lsq_9[0], x_points), label='fitted curve')
    plt.plot(
        x_points,
        fit_func(p_lsq_regularization[0], x_points),
        label='regularization')
    plt.plot(x, y, 'bo', label='noise')
    plt.legend()
    plt.show()


if __name__ == "__main__":
    main()


浙公网安备 33010602011771号