线性回归与最小二乘法 python实现

1 以简单线性回归为例

示例代码:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# 实现SimpleLinearRegressional
class SimpleLinearRegressional:
    """Simple (single-feature) linear regression fitted by ordinary least squares.

    Model: y = a * x + b, where a (slope) and b (intercept) are obtained
    in closed form from the training data.
    """

    def __init__(self):
        # Slope and intercept; remain None until fit() has been called.
        self.a = None
        self.b = None

    def fit(self, x_train, y_train):
        """Fit the model from a 1-D training set; returns self (fluent API)."""
        assert x_train.ndim == 1,"简单线性回归只可以处理一个特征"
        assert len(x_train) == len(y_train),"训练集中x的维度和y的维度必须相等"
        x_mean, y_mean = np.mean(x_train), np.mean(y_train)
        # Method 1 (explicit loop), kept for reference:
        # num = 0.0
        # d = 0.0
        # for x, y in zip(x_train, y_train):
        #     num += (x - x_mean) * (y - y_mean)
        #     d += (x - x_mean) ** 2
        # Method 2: vectorized form of
        #   a = sum((x - x̄)(y - ȳ)) / sum((x - x̄)²)
        num = (x_train - x_mean).dot(y_train - y_mean)
        d = (x_train - x_mean).dot(x_train - x_mean)
        self.a = num/d
        self.b = y_mean - self.a * x_mean
        return self

    def predict(self, x_predict):
        """Return predictions for an array of x values (vectorized)."""
        assert self.a is not None and self.b is not None, \
            "must call fit() before predict()"
        # Broadcasting replaces the per-element Python loop: same result,
        # O(n) NumPy work instead of n Python-level calls.
        return self.a * np.asarray(x_predict) + self.b

    def _predict(self, x_single):
        """Return the prediction for a single x value."""
        return self.b + self.a * x_single

# Demo: generate noisy samples of y = 2x + 3, fit the simple regressor,
# and plot the fitted line over the full data range.
m = 1000
big_x = np.random.random(size=m)
# Ground truth slope 2, intercept 3, plus unit-variance Gaussian noise.
big_y = big_x * 2 + 3.0 + np.random.normal(size=m)
x_train, x_test, y_train, y_test = train_test_split(
    big_x, big_y, test_size=0.3, random_state=42)
reg1 = SimpleLinearRegressional().fit(x_train, y_train)
# Plot predictions over all x (train + test) to visualize the fitted line.
plt.plot(big_x, reg1.predict(big_x), label='fitted-curve')
plt.show()

输出结果:

2 多元线性回归

 

 

 

 多元回归代码实现:

预测数据是: x_b.dot(theta),其中x_b表示原始x前加一列1,theta表示多元回归拟合系数

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
class LinearRegression:
    """Multiple linear regression solved by least squares.

    After fit():
      interception_ -- intercept term (theta[0])
      coeff_        -- per-feature coefficients (theta[1:])
    """

    def __init__(self):
        self.coeff_ = None         # feature coefficients
        self.interception_ = None  # intercept
        self._theta = None         # full parameter vector [b, w1, ..., wn]

    def fit(self, x_train, y_train):
        """Fit on x_train of shape (n_samples, n_features); returns self."""
        assert x_train.shape[0] == y_train.shape[0], "维度必须相同"
        # Prepend a column of ones so theta[0] acts as the intercept.
        x_b = np.hstack([np.ones((len(x_train), 1)), x_train])
        # np.linalg.lstsq is numerically stabler than forming and inverting
        # X^T X: the normal equations square the condition number and
        # inv() fails outright on rank-deficient design matrices. On
        # full-rank inputs it returns the same theta as the original code.
        self._theta, _, _, _ = np.linalg.lstsq(x_b, y_train, rcond=None)
        self.interception_ = self._theta[0]
        self.coeff_ = self._theta[1:]
        return self

    def predict(self, x_predict):
        """Return predictions for x_predict of shape (n_samples, n_features)."""
        assert self.interception_ is not None and self.coeff_ is not None,"预测应已训练好"
        assert x_predict.shape[1] == len(self.coeff_), "特征维度应当相同"
        # Same augmentation as in fit(), then a single matrix-vector product.
        x_b = np.hstack([np.ones((len(x_predict), 1)), x_predict])
        return x_b.dot(self._theta)

# NOTE: datasets.load_boston() was removed in scikit-learn 1.2.
# On newer versions fall back to fetching the same dataset from OpenML.
try:
    boston = datasets.load_boston()
    x = boston.data
    y = boston.target
except AttributeError:
    boston = datasets.fetch_openml(name="boston", version=1, as_frame=False)
    x = boston.data
    y = boston.target.astype(float)
# Drop the censored targets: prices were capped at 50 in the original survey.
x = x[y<50]
y = y[y<50]
x_train, x_test, y_train, y_test = train_test_split(x, y ,test_size=0.3,random_state=42)
reg = LinearRegression()
reg.fit(x_train,y_train)
print(reg.coeff_)
print(reg.interception_)

 

输出系数:

[-1.23818781e-01  4.03899117e-02 -4.63364280e-02 -2.99732398e-02
 -1.46880633e+01  3.33324672e+00 -2.12948682e-02 -1.38818508e+00
  2.31608778e-01 -1.24333203e-02 -8.57628626e-01  6.89841247e-03
 -3.75313011e-01]
37.55993342611868

 若是通过 from scipy.optimize import leastsq 的最小二乘法来实现多项式拟合(不再是原本的线性回归),则需要指定多项式的次数。以下以简单正弦函数拟合为例

import numpy as np
import scipy as sp
from scipy.optimize import leastsq
import matplotlib.pyplot as plt


# Ground-truth target function.
def real_func(x):
    """Return sin(2*pi*x), the curve the polynomials try to approximate."""
    two_pi = 2 * np.pi
    return np.sin(two_pi * x)

# Polynomial model.
def fit_func(p, x):
    """Evaluate the polynomial with coefficients p (highest degree first) at x."""
    return np.poly1d(p)(x)

# Residuals minimized by leastsq.
def residuals_func(p, x, y):
    """Return fit_func(p, x) - y, the pointwise fitting error."""
    return fit_func(p, x) - y

# Ten sample points on [0, 1], plus a dense grid for plotting smooth curves.
x = np.linspace(0, 1, 10)
x_points = np.linspace(0, 1, 1000)
# Observations: the true function values perturbed by N(0, 0.1) noise.
y_ = real_func(x)
y = [v + np.random.normal(0, 0.1) for v in y_]

def fitting(M=0):
    """Least-squares fit of a degree-M polynomial to the global (x, y) samples.

    Prints the fitted coefficients, plots the true curve, the fitted curve
    and the noisy sample points, and returns the full leastsq result.
    """
    # Random starting point for the M+1 polynomial coefficients.
    init_params = np.random.rand(M + 1)
    # leastsq minimizes the residuals; everything in args is forwarded
    # to residuals_func after the parameter vector p.
    p_lsq = leastsq(residuals_func, init_params, args=(x, y))
    # leastsq returns a tuple; element 0 holds the fitted coefficients.
    print('Fitting Parameters:', p_lsq[0])

    # Visualization: ground truth vs. fitted polynomial vs. noisy samples.
    plt.plot(x_points, real_func(x_points), label='real')
    plt.plot(x_points, fit_func(p_lsq[0], x_points), label='fitted curve')
    plt.plot(x, y, 'bo', label='noise')
    plt.legend()
    return p_lsq

# Fit polynomials of increasing degree; with only 10 noisy points,
# M=9 has enough freedom to chase the noise (overfitting).
p_lsq_0 = fitting(M=0)
p_lsq_1 = fitting(M=1)
p_lsq_3 = fitting(M=3)
p_lsq_9 = fitting(M=9)
plt.show()

 

如果需要在最小二乘法中加入正则项以防过拟合,需要在残差函数中为每个系数追加一个正则化惩罚项(正则项系数 regularization 是一个单独设置的超参数,初始参数 p_init 的长度仍为多项式次数加一)。

同时要记得修改优化目标函数

'''用目标函数y=sin2pix, 加上一个正态分布的噪音干扰,用多项式去拟合'''
import numpy as np
import scipy as sp
from scipy.optimize import leastsq
import matplotlib.pyplot as plt

# Ground-truth target function.
def real_func(x):
    """Return sin(2*pi*x), the curve to be approximated."""
    return np.sin(np.pi * 2 * x)

# Polynomial model.
def fit_func(p, x):
    """Evaluate the polynomial with coefficients p (highest degree first) at x."""
    poly = np.poly1d(p)
    return poly(x)

# Residuals minimized by leastsq.
def residuals_func(p, x, y):
    """Return the pointwise error of the polynomial fit against the samples."""
    return fit_func(p, x) - y

# Ten sample points on [0, 1] and a dense grid for plotting.
x = np.linspace(0, 1, 10)
x_points = np.linspace(0, 1, 1000)
# Observations: true function values perturbed by N(0, 0.1) noise.
y_ = real_func(x)
y = [v + np.random.normal(0, 0.1) for v in y_]


def fitting(M=0):
    """Least-squares fit of a degree-M polynomial to the global (x, y) samples.

    Prints the fitted coefficients and returns the full leastsq result tuple.
    """
    # Random starting point for the M+1 polynomial coefficients.
    init_params = np.random.rand(M + 1)
    # Extra data in args is forwarded to residuals_func after p.
    p_lsq = leastsq(residuals_func, init_params, args=(x, y))
    # leastsq returns a tuple; element 0 holds the fitted coefficient array.
    print('Fitting Parameters:', p_lsq[0])
    return p_lsq

# Degree-9 fit: enough freedom to interpolate the noise (overfitting).
p_lsq_9 = fitting(M=9)



# Introduce L2 regularization by extending the residual vector: appending
# sqrt(0.5 * lambda) * |p_i| terms makes leastsq minimize
# SSE + 0.5 * lambda * ||p||^2 (a ridge penalty on the coefficients).
regularization = 0.0001
def residuals_func_regularization(p, x, y):
    """Return the fit residuals with L2 penalty terms appended."""
    data_residuals = fit_func(p, x) - y
    penalty = np.sqrt(0.5 * regularization * np.square(p))
    return np.append(data_residuals, penalty)


# Least squares with the regularized residuals.
# A degree-9 polynomial has 10 coefficients (degree + 1); the
# regularization strength itself is the separate constant above.
p_init = np.random.rand(9 + 1)
# Everything in args is forwarded to the residual function after p.
p_lsq_regularization = leastsq(
    residuals_func_regularization, p_init, args=(x, y))

# Compare the unregularized degree-9 fit against the regularized one.
plt.plot(x_points, real_func(x_points), label='real')
plt.plot(x_points, fit_func(p_lsq_9[0], x_points), label='fitted curve')
plt.plot(
    x_points,
    fit_func(p_lsq_regularization[0], x_points),
    label='regularization')
plt.plot(x, y, 'bo', label='noise')
plt.legend()
plt.show()

 

posted @ 2020-04-07 23:12  牛肉拉面香菜少一点  阅读(584)  评论(0)    收藏  举报