P37 Ridge岭回归分析

http://bilibili.com/video/BV184411Q7Ng?p=37

#python岭回归进行房价预测：

#load_boston里面的数值都是连续的
from sklearn.datasets import load_boston
#从sklearn中的线性模型导入线性回归，SGD随机梯度下降
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

def _report_mse(label, model, x_test, y_test, std_y):
    """Print the test-set mean squared error of a fitted model in original price units.

    :param label: text printed in front of the error value
    :param model: fitted estimator trained on the standardized target
    :param x_test: standardized test features
    :param y_test: test targets on the original (un-standardized) scale, shape (n, 1)
    :param std_y: the StandardScaler that was fitted on the training targets
    :return: the mean squared error that was printed
    """
    # predict() yields a 1-D array here, while inverse_transform() expects a
    # 2-D column, so reshape before mapping back to the original price scale.
    predicted = std_y.inverse_transform(model.predict(x_test).reshape(-1, 1))
    mse = mean_squared_error(y_test, predicted)
    # NOTE(review): the unit text below says "万元" squared; the Boston target
    # is presumably expressed in thousands of US dollars - confirm the unit.
    print(label, mse, '单位是：万元的平方')
    return mse


def mylinear():
    """
    Predict Boston house prices with three linear models and print each test MSE.

    The models are ordinary least squares (LinearRegression, the normal
    equation), stochastic gradient descent (SGDRegressor) and ridge regression
    (Ridge with alpha=1.0). Features and target are standardized with
    statistics learned from the training split only; predictions are mapped
    back to the original price scale before the error is computed.

    NOTE: load_boston was removed in scikit-learn 1.2, so this function needs
    an older scikit-learn release (or a different data source) to run.

    :return: None
    """
    # Load the data and split it into a training and a test set.
    lb = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)

    # Standardize the features. The scaler is fitted on the training split
    # only and then *applied* to the test split; re-fitting it on the test
    # split would scale the test features with different statistics than the
    # ones the models were trained on.
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Standardize the target as well, so that it is on a scale comparable to
    # the standardized features. StandardScaler needs 2-D input, hence the
    # reshape to a single column; the estimators are then fitted on the
    # flattened 1-D target, which is the shape SGDRegressor expects.
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1)).ravel()

    # The test target stays on the original scale: the errors below are
    # computed against predictions that were transformed back to that scale.
    y_test = y_test.reshape(-1, 1)

    # Ordinary least squares (normal equation).
    zhengguifunction = LinearRegression()
    zhengguifunction.fit(x_train, y_train)
    _report_mse("正规方程的均方误差是：", zhengguifunction, x_test, y_test, std_y)

    # Stochastic gradient descent.
    sgdmethod = SGDRegressor()
    sgdmethod.fit(x_train, y_train)
    _report_mse("梯度下降的均方误差是：", sgdmethod, x_test, y_test, std_y)

    # Ridge regression; alpha is the regularization strength.
    sgd_Ridge = Ridge(alpha=1.0)
    sgd_Ridge.fit(x_train, y_train)
    _report_mse("岭回归的均方误差是：", sgd_Ridge, x_test, y_test, std_y)

    return None





# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    mylinear()

运行结果：

C:\Users\TJ\AppData\Local\Programs\Python\Python37\python.exe D:/qcc/python/mnist/fangjia_yuce.py
正规方程的均方误差是： 22.63101875379508 单位是：万元的平方
梯度下降的均方误差是： 23.391171004734396 单位是：万元的平方
岭回归的均方误差是： 22.639158049580825 单位是：万元的平方

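可见，在本次运行中，岭回归的均方误差（22.639）与正规方程（22.631）几乎相同，二者都略好于随机梯度下降（23.391）；由于训练集和测试集是随机划分的，每次运行的结果会有所不同，单次结果不足以说明加了正则化的线性回归一定好于一般的线性回归。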

病态数据：异常数据，有异常数据的话最容易出现过拟合的情况，而岭回归能在一定程度上抵抗这种异常数据。

posted on 2020-12-24 14:31 一杯明月阅读(314) 评论(0) 编辑收藏举报