from sklearn.model_selection import train_test_split

# Boston house-price data.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so an
# unconditional import fails with ImportError on current releases. Use the
# built-in loader when it is available; otherwise rebuild the same arrays from
# the original StatLib file, as the scikit-learn deprecation notice suggests.
# NOTE(review): the fallback needs network access, and the dataset was removed
# upstream over ethical concerns about one of its features - consider
# switching datasets for new work.
try:
    from sklearn.datasets import load_boston
except ImportError:
    import numpy as np
    import pandas as pd
    from sklearn.utils import Bunch

    def load_boston():
        """Load the Boston housing data from its original StatLib source.

        Returns:
            A ``Bunch`` with ``data`` (one row per sample, 13 feature
            columns), ``target`` and ``feature_names``, mirroring the
            attributes this script reads from the removed scikit-learn loader.
        """
        raw = pd.read_csv(
            "http://lib.stat.cmu.edu/datasets/boston",
            sep=r"\s+",
            skiprows=22,  # skip the textual header of the file
            header=None,
        ).values
        # Each record spans two physical lines: 11 values on the first line,
        # then 2 more features followed by the target on the second line.
        return Bunch(
            data=np.hstack([raw[::2, :], raw[1::2, :2]]),
            target=raw[1::2, 2],
            feature_names=np.array([
                'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
            ]),
        )

boston = load_boston()
X = boston.data
y = boston.target
print(X.shape)
print(boston.feature_names)  # feature names

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
import time
from sklearn.linear_model import LinearRegression
# model = LinearRegression()
# start = time.process_time()
# model.fit(X_train, y_train)
#
# train_score = model.score(X_train, y_train)
# cv_score = model.score(X_test, y_test)
# print('elaspe:{0:.6f}; train_score:{1:0.6f}; cv_score:{2:.6f}'
# .format(time.process_time() - start, train_score, cv_score)) # elaspe:0.002894;train_score:0.723941;cv_score:0.795262
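# Improve the underfitting linear model above by adding polynomial features.
# Note that normalizing the data only speeds up the algorithm's convergence;
# it does not by itself make the model fit better.
# Factory function that creates the polynomial model: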
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
def polynomial_model(degree=1):
    """Build an unfitted polynomial-regression pipeline.

    Args:
        degree: Highest polynomial degree of the features generated from the
            raw inputs. Defaults to 1 (plain linear regression).

    Returns:
        A ``Pipeline`` with the steps ``polynomial_features``,
        ``standard_scaler`` and ``linear_regression``, in that order.
    """
    # Local import: this block needs StandardScaler, which the file's
    # top-level imports do not provide.
    from sklearn.preprocessing import StandardScaler

    # include_bias=False leaves out the constant column of ones.
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    # `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and
    # removed in 1.2, where passing it raises TypeError. The documented
    # replacement is an explicit scaling step ahead of the regressor.
    # NOTE(review): scores may differ slightly from those recorded for the old
    # `normalize=True` runs, especially when there are more generated features
    # than training samples.
    standard_scaler = StandardScaler(with_mean=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([
        ('polynomial_features', polynomial_features),
        ('standard_scaler', standard_scaler),
        ('linear_regression', linear_regression),
    ])
    return pipeline
# Fit and score a polynomial model of degree 2, then of degree 3.
# Output recorded by the original author:
#   degree 2 -> elaspe:0.032048; train_score:0.930547; cv_score:0.860049
#   degree 3 -> elaspe:0.299934; train_score:1.000000; cv_score:-110.033777
# With degree 3 the training score is 1 while the test score is negative,
# which shows that the model has overfit.
for degree in (2, 3):
    model = polynomial_model(degree)
    start = time.process_time()
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    cv_score = model.score(X_test, y_test)
    # The timer is read after scoring, so the figure covers fit + both scores.
    elapsed = time.process_time() - start
    print('elaspe:{0:.6f}; train_score:{1:0.6f}; cv_score:{2:.6f}'
          .format(elapsed, train_score, cv_score))