import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# Load the house-price table from a hard-coded local CSV file.
# NOTE(review): the path points inside a scikit-learn installation; the copy
# shipped with scikit-learn reportedly begins with a sample/feature-count line
# before the column names -- confirm the file parses correctly with the
# default header row.
_BOSTON_CSV_PATH = r'D:\Anaconda\ana\envs\python32\Lib\site-packages\sklearn\datasets\data\boston_house_prices.csv'
data = pd.read_csv(_BOSTON_CSV_PATH)
# Uncomment for a quick preview of the loaded rows:
# print(data.head())
class LinerRegression:
    """Linear regression fitted with batch gradient descent.

    Parameters
    ----------
    alpha : float
        Learning rate (step size of each weight update).
    times : int
        Number of gradient-descent iterations performed by ``fit``.

    Attributes set by ``fit``
    -------------------------
    w_ : numpy.ndarray of shape (n_features + 1,)
        Learned weights; ``w_[0]`` is the bias, ``w_[1:]`` the coefficients.
    loss_ : list of float
        Half the sum of squared errors, recorded once per iteration before
        that iteration's weight update.
    """

    def __init__(self, alpha, times):
        """Store the learning rate and the iteration count."""
        self.alpha = alpha
        self.times = times

    def fit(self, X, y):
        """Fit the weights to the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training targets. A column vector is flattened so that the
            residual stays one-dimensional instead of broadcasting to an
            (n_samples, n_samples) matrix.

        Returns
        -------
        LinerRegression
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                "X and y must contain the same number of samples "
                f"(got {X.shape[0]} and {y.shape[0]})"
            )

        # Weights start at zero; index 0 is reserved for the bias term.
        self.w_ = np.zeros(1 + X.shape[1])
        # Loss history, one entry per iteration.
        self.loss_ = []

        for _ in range(self.times):
            y_hat = np.dot(X, self.w_[1:]) + self.w_[0]
            error = y - y_hat
            self.loss_.append(np.sum(error ** 2) / 2)
            # The gradient is summed (not averaged) over the samples, so the
            # effective step grows with the number of training rows.
            # The bias has no matching feature column and is updated apart.
            self.w_[0] += self.alpha * np.sum(error)
            self.w_[1:] += self.alpha * np.dot(X.T, error)
        return self

    def predict(self, X):
        """Return the predicted target for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict; must be called after ``fit``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
        """
        X = np.array(X)
        result = np.dot(X, self.w_[1:]) + self.w_[0]
        return result
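# Standardize the data: when feature ranges differ too much, gradient descent misbehaves and can even diverge (the loss grows instead of shrinking).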
class StandarScaler:
    """Column-wise standardization to zero mean and unit standard deviation.

    Attributes set by ``fit``
    -------------------------
    std_ : numpy.ndarray or numpy scalar
        Standard deviation of the fitted data along axis 0.
    mean_ : numpy.ndarray or numpy scalar
        Mean of the fitted data along axis 0.
    """

    def fit(self, X):
        """Compute the per-column mean and standard deviation of ``X``.

        Parameters
        ----------
        X : array-like
            Data whose statistics are learned.

        Returns
        -------
        StandarScaler
            ``self``, so calls can be chained.
        """
        X = np.array(X)
        # Standard deviation along axis 0 (per column for 2-D input).
        self.std_ = np.std(X, axis=0)
        # Mean along axis 0 (per column for 2-D input).
        self.mean_ = np.mean(X, axis=0)
        return self

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``.

        Each value is mapped to ``(x - mean) / std``, the usual conversion
        from N(mu, sigma) to the standard normal distribution. Columns whose
        fitted standard deviation is zero (constant columns) are divided by 1
        instead, so they become 0 rather than NaN/inf.

        Parameters
        ----------
        X : array-like or pandas object
            Data to standardize; column layout must match the fitted data.

        Returns
        -------
        Same container kind that ``X - mean_`` produces (for example a
        DataFrame stays a DataFrame).
        """
        # Guard against division by zero without altering the stored std_.
        scale = np.where(self.std_ == 0, 1.0, self.std_)
        if scale.ndim == 0:
            # 1-D input yields a scalar std; hand back a plain float.
            scale = scale.item()
        return (X - self.mean_) / scale

    def fit_transform(self, X):
        """Fit on ``X`` and return the standardized ``X`` in one call."""
        self.fit(X)
        return self.transform(X)
# ---------------------------------------------------------------------------
# Train on a shuffled split, then save and plot predictions vs. real prices.
# ---------------------------------------------------------------------------
lr = LinerRegression(alpha=0.0007, times=50)
ssx = StandarScaler()

# Shuffle every row with a fixed seed so the split is reproducible.
tData = data.sample(n=len(data), random_state=0)

# The last column is the target; every other column is a feature.
features = tData.iloc[:, :-1]
target = tData.iloc[:, -1]

# First 400 shuffled rows train the model, the remainder is held out.
# The scaler is fitted on the training rows only and reused for the test rows.
train_X = ssx.fit_transform(features.iloc[:400])
test_X = ssx.transform(features.iloc[400:])

# A second scaler is created for the target but is not applied: the prices
# are passed to the model unscaled.
ssy = StandarScaler()
train_y = target.iloc[:400]
test_y = target.iloc[400:]

lr.fit(train_X, train_y)
result = lr.predict(test_X)

# Dump predictions and ground truth as text, two decimals per value.
for out_path, values in (
    (r"C:\Users\Y_ch\Desktop\result.txt", result),
    (r"C:\Users\Y_ch\Desktop\testy.txt", test_y),
):
    np.savetxt(out_path, values, fmt="%.2lf")

# Visualization
mpl.rcParams.update({
    "font.family": "sans-serif",
    "axes.unicode_minus": False,
})
fig = plt.figure(figsize=(10, 10))
ax = fig.gca()
ax.plot(result, "ro-", label="predict")
# Plot the underlying numpy array of the pandas Series (test_y.values), so
# points are drawn against their position rather than the shuffled index.
ax.plot(test_y.values, "go--", label="real")
ax.set_xlabel("Sample")
ax.set_ylabel("Price")
ax.set_title("Gradient Descent To Solve Liner Reression")
ax.legend()
fig.savefig(r"C:\Users\Y_ch\Desktop\HousePrice\GradientDescent.png")
plt.show()