Hand-Rolling a Linear Regression Model in Python
## Linear Regression Model
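The full script below builds toy data in several ways (`getData`, `getData2`, `getData3`), generates noisy samples of the line y = 1.2x + 0.2 (`getData4`), recovers (w, b) with a gradient-descent-style loop (`linefit`), brute-forces the loss over a grid of (w, b) values (`getErrorSurface`, `getErrorSurface2`), and finally plots the fitted lines, the error surface with the descent trajectory, and the loss curve.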
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def getData():
    r"""
    Use numpy to build two groups of random points:
    G1 has both coordinates above 50, G2 has both below 50.
    :return: (G1, G2), each an array of shape (2, n)
    """
    dataX = np.random.randint(1, 100, size=(1, 100))
    dataY = np.random.randint(1, 100, size=(1, 100))
    data = np.concatenate((dataX, dataY), axis=0)
    G1tfa = data > 50
    G1tfi = np.logical_and(G1tfa[0, :], G1tfa[1, :])
    G1index = np.nonzero(G1tfi)[0]
    G1 = data[:, G1index]
    G2tfa = data < 50
    G2tfi = np.logical_and(G2tfa[0, :], G2tfa[1, :])
    G2index = np.nonzero(G2tfi)[0]
    G2 = data[:, G2index]
    return (G1, G2)
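# Note: points with either coordinate exactly equal to 50 fall into neither
# group, since both comparisons above are strict.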
def getData2():
    r"""
    Generate random integers with numpy, then use pandas boolean
    indexing to select the two groups.
    :return: (G1, G2), each an array of shape (2, n)
    """
    df = pd.DataFrame()
    df['X'] = np.random.randint(1, 100, size=(100))
    df['Y'] = np.random.randint(1, 100, size=(100))
    G1 = df[(df['X'] > 50) & (df['Y'] > 50)]
    G2 = df[(df['X'] < 50) & (df['Y'] < 50)]
    return (G1.values.T, G2.values.T)
def getData3():
    r"""
    Generate normally distributed random points around two centers.
    :return: (G1, G2), each an array of shape (2, 1000)
    """
    # data = np.random.uniform(0, 1, size=1000)   # uniform random sampling
    # data3 = np.random.rand(1000)                # uniform distribution
    #
    # data2 = np.random.normal(0, 1, size=1000)   # normal distribution, mean 0, std 1
    # data4 = np.random.randn(1000)               # normal distribution, mean 0, std 1
    point1 = (0.5, 0.5)
    point2 = (2.5, 2.5)
    a = np.random.normal(point1[0], 0.3, size=(1000, 1))
    b = np.random.normal(point1[1], 0.2, size=(1000, 1))
    c = np.random.normal(point2[0], 0.2, size=(1000, 1))
    d = np.random.normal(point2[1], 0.3, size=(1000, 1))
    G1 = pd.DataFrame(data=np.concatenate((a, b), axis=1), columns=['X', 'Y'])
    G2 = pd.DataFrame(data=np.concatenate((c, d), axis=1), columns=['X', 'Y'])
    return (G1.values.T, G2.values.T)
def getData4():
    r"""
    Generate data suitable for linear regression: y = w*x + b plus Gaussian noise.
    :return: (x, y)
    """
    w = 1.2
    b = 0.2
    x = np.linspace(1, 100, 100)
    noise = np.random.randn(100) * 10
    y = w * x + b + noise
    return (x, y)
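# getData4 is the generator actually used in __main__ below; the ground-truth
# line y = 1.2*x + 0.2 is what the fitted (w, b) should recover.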
def linefit(x, y):
    r"""
    Update w and b with a gradient-descent-style iteration; the gradient is
    approximated from successive loss values rather than computed analytically.
    :param x:
    :param y:
    :return: (w, b, loss_list, w_list, b_list)
    """
    # Initialize the iteration parameters.
    w = 0
    b = 0
    w_last = 0.001
    b_last = 0.001
    loss_last = 6330    # hard-coded initial loss guess to seed the finite-difference update
    stop = 10000        # maximum number of iterations
    i = 0
    eta = 0.00001       # learning rate; if it is too large the loss oscillates and never converges
    count = 0           # stop early once 100 iterations show no real progress
    loss_list = []      # loss curve, one entry per iteration
    w_list = [0.001, 0] # w after each iteration (plus the two seed values)
    b_list = [0.001, 0] # b after each iteration (plus the two seed values)
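    # The update below replaces the true partial derivatives with secant
    # (finite-difference) estimates built from the previous iterate:
    #   w <- w - eta * (loss - loss_last) / (w - w_last)
    #   b <- b - eta * (loss - loss_last) / (b - b_last)
    # This is why w_last, b_last, and loss_last were seeded with nonzero
    # values above: the very first step would otherwise divide by zero.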
    while i < stop:
        loss = getLoss(w, b, x, y)
        w_t = w - eta * (loss - loss_last) / (w - w_last)
        b_t = b - eta * (loss - loss_last) / (b - b_last)
        w_list.append(w_t)
        b_list.append(b_t)
        w_last = w
        w = w_t
        b_last = b
        b = b_t
        if abs(loss_last - loss) < 0.01:
            count += 1
            if count >= 100:
                print("w is {:.2f}, b is {:.2f}, loss is {:.2f}, i is {}".format(w, b, loss, i))
                break
        print("w is {:.2f}, b is {:.2f}, loss is {:.2f}, i is {}".format(w, b, loss, i))
        loss_list.append(loss)
        loss_last = loss
        i += 1
    # xb = np.mean(x)
    # yb = np.mean(y)
    return w, b, loss_list, w_list, b_list
def getLoss(w, b, x, y, MAE=True):
    r"""
    Total loss of the line y = w*x + b over the data (a sum, not a mean).
    """
    if MAE:
        # L1: sum of absolute errors (MAE up to the 1/n factor)
        totalLoss = np.sum(abs((w * x + b) - y))
    else:
        # L2: sum of squared errors (MSE up to the 1/n factor)
        totalLoss = np.sum(((w * x + b) - y) ** 2)
    return totalLoss
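# Note: because getLoss sums rather than averages, its magnitude grows with
# the number of samples, so the 0.01 early-stop threshold in linefit is tied
# to this 100-point dataset.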
def getErrorSurface(x, y):
    r"""
    Brute-force the error surface: evaluate the loss on a grid of (w, b)
    values and keep the pair with the smallest loss.
    :param x:
    :param y:
    :return: (r, w_f, b_f) -- the loss grid and the best grid point
    """
    minError = 999999
    w_f = 0
    b_f = 0
    size = 200
    r = np.zeros((size, size))
    for i, b in enumerate(np.linspace(-2.1, 2.2, size)):
        for j, w in enumerate(np.linspace(-2.1, 2.2, size)):
            r[i, j] = getLoss(w, b, x, y)
            if r[i, j] < minError:
                minError = r[i, j]
                w_f = w
                b_f = b
    return r, w_f, b_f
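# The (w_f, b_f) found by this grid search is plotted below as the
# 'fit from error surface' baseline for the gradient iteration.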
def getErrorSurface2(x, y):
    r"""
    Build the error surface in the layout expected by contourf.
    :param x:
    :param y:
    :return: (wa, ba, za) -- the w axis, the b axis, and the loss grid
    """
    size = 200
    wa = np.linspace(-2.1, 2.2, size)
    ba = np.linspace(-2.1, 2.2, size)
    za = np.zeros((size, size))
    for i, b in enumerate(ba):
        for j, w in enumerate(wa):
            za[i, j] = getLoss(w, b, x, y)
    return wa, ba, za
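# matplotlib's contourf(X, Y, Z) expects Z to have shape (len(Y), len(X)),
# i.e. rows indexed by the y-axis values; that is why b indexes the rows of
# za above while w indexes the columns.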
if __name__ == "__main__":
    print("Linear regression model")
    np.random.seed(10)
    # Scatter plot of the data with the three lines.
    fig, ax = plt.subplots()
    # G1, G2 = getData3()
    # ax.scatter(G1[0, :], G1[1, :])
    # ax.scatter(G2[0, :], G2[1, :])
    x, y = getData4()
    w1, b1, lossList, w_list, b_list = linefit(x, y)
    r, w, b = getErrorSurface(x, y)
    w2, b2, z2 = getErrorSurface2(x, y)
    ax.scatter(x, y, color="C0")
    ax.plot(np.r_[0, 100], 1.2 * np.r_[0, 100] + 0.2, color="C1", linewidth=3.0, label='Ground Truth')
    ax.plot(np.r_[0, 100], w * np.r_[0, 100] + b, color="C2", linewidth=3.0, label='fit from error surface')
    ax.plot(np.r_[0, 100], w1 * np.r_[0, 100] + b1, color="C3", linewidth=3.0, label='Gradient')
    ax.legend()
    ax.set_title("GT and scatter")
    fig.show()
    # Plot the error surface with the descent trajectory.
    fig, ax = plt.subplots()
    ax.contourf(w2, b2, z2)
    ax.scatter(np.array(w_list), np.array(b_list))
    ax.scatter(1.2, 0.2, color='r')
    # Arrow pointing at the ground-truth (w, b); both ends in data coordinates.
    ax.annotate('', xy=(1.2, 0.2), xycoords='data',
                xytext=(1.2 + 0.4, 0.2 + 0.4), textcoords='data',
                arrowprops=dict(facecolor='white', shrink=0.05),
                horizontalalignment='right', verticalalignment='top',
                )
    # im = ax.imshow(r)
    # cb = plt.colorbar(im)
    ax.set_xlabel('w')
    # ax.set_xlim(xmin=0.2, xmax=2.2)
    ax.set_ylabel('b')
    # ax.set_ylim(ymin=0, ymax=0.4)
    ax.set_title("error surface")
    fig.show()
    # Plot the loss curve.
    fig, ax = plt.subplots()
    ax.plot(lossList)
    ax.set_xlabel('step')
    # ax.set_xlim(xmin=0.2, xmax=2.2)
    ax.set_ylabel('loss')
    ax.set_ylim(ymin=700, ymax=1000)
    ax.set_title("loss")
    fig.show()
    # Histogram scratch code for the samplers commented out in getData3:
    # ax.hist(data, bins=20)   # uniform random sampling
    # ax.hist(data3, bins=20)  # uniform distribution
    # ax.hist(data2, bins=20)  # normal distribution, mean 0, std 1
    # ax.hist(data4, bins=20)  # normal distribution, mean 0, std 1
    # Fix the X and Y axis ranges:
    # ax.set_ylim(ymin=0, ymax=130)
    # ax.set_xlim(xmin=-5, xmax=5)
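    # Sanity check: numpy's closed-form least-squares fit should land near the
    # ground truth (1.2, 0.2) and agree with the grid-search and gradient fits.
    w_ls, b_ls = np.polyfit(x, y, 1)  # returns [slope, intercept]
    print("least squares: w is {:.2f}, b is {:.2f}".format(w_ls, b_ls))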


