向后选择法
向后选择法是一种用于处理多元线性回归问题的变量选择方法
首先要设定一个阈值,即显著性水平(例如 0.05),用来与各变量回归系数的 p 值进行比较
每次去除 p 值最大(最不显著)的变量并重新拟合模型,直到剩余变量的 p 值都不超过该阈值
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
# 向后选择部分
def backwardElimination(x_train, flo, y=None):
    """Select regressors by backward elimination on OLS p-values.

    Repeatedly fits ``sm.OLS`` and drops the column whose coefficient has
    the largest p-value, as long as that p-value exceeds ``flo``. A removal
    that does not increase the adjusted R-squared is undone and the search
    stops there.

    Args:
        x_train: 2-D array of regressors, one column per variable. The
            matrix is passed to ``sm.OLS`` as given; no intercept column is
            added here.
        flo: Significance level. Elimination stops once the largest p-value
            is no longer greater than this value.
        y: Response vector. Defaults to the module-level ``y_train`` so that
            existing two-argument calls keep working.

    Returns:
        A 2-D array with the surviving columns of ``x_train`` in their
        original order. The input array itself is not modified.
    """
    if y is None:
        # Backward-compatible fallback: the original implementation always
        # read the response from the module-level ``y_train``.
        y = y_train

    x_train = np.asarray(x_train)

    # Never remove the last remaining column: an OLS fit needs at least one
    # regressor. The loop ends because every pass removes a column or breaks.
    while x_train.shape[1] > 1:
        fit_before = sm.OLS(endog=y, exog=x_train).fit()
        pvalues = np.asarray(fit_before.pvalues)
        worst = int(np.argmax(pvalues))

        # Written as "not >" so an undefined (NaN) p-value stops the search
        # instead of triggering a removal.
        if not pvalues[worst] > flo:
            break

        reduced = np.delete(x_train, worst, axis=1)
        fit_after = sm.OLS(endog=y, exog=reduced).fit()

        # Roll back: dropping the column did not improve the adjusted
        # R-squared, so keep the matrix as it was before this removal.
        if fit_after.rsquared_adj <= fit_before.rsquared_adj:
            break

        x_train = reduced

    return x_train
# Load the data set.
data = pd.read_csv('house_data.csv')

# Q1: most common number of bedrooms.
# Series.mode() returns the modes in ascending order, so .iloc[0] picks the
# smallest one on ties, and the result does not depend on whether
# scipy.stats.mode returns an array or a scalar for 1-D input.
arr1 = data['bedrooms']
print('Q1:', arr1.mode().iloc[0])

# Q2: the three columns most strongly correlated with price.
arr2 = data[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
             'floors', 'waterfront', 'view', 'condition', 'grade',
             'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
             'zipcode', 'lat', 'long']]
arr2 = arr2.corr()
ans = arr2['price'].abs().sort_values(ascending=False)
# Position 0 is price itself (correlation 1 with itself), so skip it.
print('Q2:', ans.iloc[1:4].index.tolist())

# Q3: scatter plot of latitude against longitude.
arr3 = data[['lat', 'long']]
print('Q3')
arr3.plot(kind='scatter', x='lat', y='long')
plt.show()

# NOTE(review): iloc[:, 2:] presumably excludes the 'price' column; if price
# sits at position 2 or later the target leaks into the features -- confirm
# against the CSV layout.
x_data = data.iloc[:, 2:].values
y_data = data['price'].values
# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

# Run the feature selection BEFORE fitting, so the model that is scored is
# the one trained on the selected columns.
x_selected = backwardElimination(x_train, 0.05)

# backwardElimination returns only the reduced matrix, so recover which of
# the original columns survived by matching column contents. The same
# column subset must then be applied to the test set.
kept = [
    col for col in range(x_train.shape[1])
    if any(np.array_equal(x_train[:, col], chosen) for chosen in x_selected.T)
]
if kept:
    x_train = x_train[:, kept]
    x_test = x_test[:, kept]

# Create and train the linear regression model on the selected columns.
clf = LinearRegression()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
print('score:', clf.score(x_test, y_test))

浙公网安备 33010602011771号