【第9章】教材上代码
代码清单9-1 导入scikit-learn工具包¶
In [2]:
import sklearn #导入sklearn工具包
from sklearn import ensemble #从sklearn工具包中导入ensemble模块
from sklearn.ensemble import GradientBoostingRegressor #从sklearn.ensemble模块中导入GradientBoostingRegressor类
代码清单9-2 加载scikit-learn工具包内置的糖尿病数据集¶
In [3]:
from sklearn import datasets #import the datasets module from sklearn
diabetes = datasets.load_diabetes() #load the built-in diabetes dataset
X, y = diabetes.data, diabetes.target #feature matrix X and target vector y
print('X type:',type(X),'shape:',X.shape) #print the type and shape of X
print('y type:',type(y),'shape:',y.shape) #print the type and shape of y
X type: <class 'numpy.ndarray'> shape: (442, 10)
y type: <class 'numpy.ndarray'> shape: (442,)
In [4]:
print(diabetes.DESCR) #print the dataset description (requires `diabetes` loaded in the previous cell)
.. _diabetes_dataset:
Diabetes dataset
----------------
Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.
**Data Set Characteristics:**
:Number of Instances: 442
:Number of Attributes: First 10 columns are numeric predictive values
:Target: Column 11 is a quantitative measure of disease progression one year after baseline
:Attribute Information:
- age age in years
- sex
- bmi body mass index
- bp average blood pressure
- s1 tc, total serum cholesterol
- s2 ldl, low-density lipoproteins
- s3 hdl, high-density lipoproteins
- s4 tch, total cholesterol / HDL
- s5 ltg, possibly log of serum triglycerides level
- s6 glu, blood sugar level
Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).
Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
For more information see:
Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.
(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)
代码清单9-3 分割数据集为训练集和测试集¶
In [5]:
from sklearn.model_selection import train_test_split #import the train_test_split function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13) #70%/30% train/test split, fixed seed for reproducibility
print('X_train samples:', X_train.shape[0]) #number of training samples
print('X_test samples:', X_test.shape[0]) #number of test samples
X_train samples: 309
X_test samples: 133
代码清单9-4 模型训练¶
In [6]:
from sklearn import ensemble  # ensemble module provides RandomForestRegressor

# Hyper-parameters of the random forest regressor.
params = dict(
    n_estimators=10,  # number of regression trees in the forest
    max_depth=5,      # maximum depth of each tree
)

reg = ensemble.RandomForestRegressor(random_state=0, **params)  # create the model
reg.fit(X_train, y_train)  # learn the model parameters from the training data
Out[6]:
RandomForestRegressor(max_depth=5, n_estimators=10, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor
Parameters
| n_estimators | 10 | |
| criterion | 'squared_error' | |
| max_depth | 5 | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | 1.0 | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| bootstrap | True | |
| oob_score | False | |
| n_jobs | None | |
| random_state | 0 | |
| verbose | 0 | |
| warm_start | False | |
| ccp_alpha | 0.0 | |
| max_samples | None | |
| monotonic_cst | None |
代码清单9-5 模型推理¶
In [7]:
import matplotlib.pyplot as plt #import the plotting module
import numpy as np #import numpy
y_pred = reg.predict(X_test) #run inference on the test data
plt.scatter(y_test, y_pred) #scatter plot: true labels vs. predictions
plt.xlabel('label') #x-axis label
plt.ylabel('predict') #y-axis label
plt.show() #display the figure

代码清单9-6 模型的性能评价¶
In [ ]:
from sklearn.metrics import mean_squared_error #import the mean_squared_error function
mse = mean_squared_error(y_test,y_pred) #compute the MSE metric on the test set
print("测试集上的MSE指标: %.4f"%mse) #print the MSE metric (Chinese message text is program output)
代码清单9-7 属性的贡献度分析¶
In [ ]:
from sklearn.inspection import permutation_importance #import the permutation_importance function
result = permutation_importance( #estimate each feature's contribution on the test set
reg, X_test, y_test, n_repeats=10,
random_state=13, scoring='neg_mean_squared_error')
sorted_idx = result.importances_mean.argsort() #indices sorting features by mean importance
plt.rcParams['font.family'] = 'Microsoft YaHei' #font that can render the Chinese labels
plt.boxplot( #box plot of the per-repeat importance distributions
result.importances[sorted_idx].T,
vert=False, #NOTE(review): `vert` and `labels` are deprecated in Matplotlib >= 3.9/3.10 (use `orientation`/`tick_labels`) — confirm target version
labels=np.array(diabetes.feature_names)[sorted_idx],
)
plt.xlabel('贡献度') #x-axis: importance (runtime string kept verbatim)
plt.ylabel('属性') #y-axis: feature
plt.title("基于测试集的属性贡献度分析") #title: importance analysis on the test set
plt.show() #display the figure
代码清单9-8 模型超参数的自动搜索¶
In [ ]:
from sklearn.model_selection import GridSearchCV #导入GridSearchCV类
params = {'n_estimators': [5,10,15],
'max_depth': [3,5,7]} #定义超参数搜索空间
rf = ensemble.RandomForestRegressor(random_state=0)
reg_search = GridSearchCV(estimator=rf,
param_grid=params,
cv=5) # 创建GridSearchCV对象
reg_search.fit(X_train, y_train) #模型训练及超参数寻优
print('最优超参数:', reg_search.best_params_) #输出最优超参数
y_pred = reg_search.predict(X_test) #使用最优超参数模型在测试集上进行推理
mse = mean_squared_error(y_test,y_pred) #计算MAE指标
print("测试集上的MSE指标: %.4f"%mse) #输出MAE指标
代码清单9-9 加载手写数字图像数据集¶
In [ ]:
import matplotlib.pyplot as plt
from sklearn import datasets

digits = datasets.load_digits()  # load the handwritten-digits dataset

# Show the first 10 samples in a 2x5 grid; axes.flat walks the grid row-major,
# which matches the original row = idx // 5, col = idx % 5 mapping.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))
for idx, ax in enumerate(axes.flat):
    ax.set_axis_off()  # hide the axes
    ax.imshow(digits.images[idx], cmap=plt.cm.gray_r)  # draw the digit image
    ax.set_title('Label: %d'%digits.target[idx])  # show its true label
代码清单9-10 数据预处理和数据集划分¶
In [ ]:
from sklearn.model_selection import train_test_split #import the train_test_split function
print('原数据集:', digits.images.shape) #print the shape of the raw image array
samplenum = digits.images.shape[0] #number of samples
X = digits.images.reshape((samplenum, -1)) #flatten each 2-D digit image into a 1-D vector
print('转换数据集:', X.shape) #print the shape after flattening
y = digits.target #labels of the samples
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.5, random_state=13) #50/50 train/test split, fixed seed
代码清单9-11 模型训练¶
In [ ]:
from sklearn import ensemble #import the ensemble module
rfc=ensemble.RandomForestClassifier(n_estimators=50, max_depth=10, random_state=0) #create the model (50 trees, depth <= 10)
rfc.fit(X_train, y_train) #learn the model parameters from the training data
代码清单9-12 模型推理¶
In [ ]:
y_pred = rfc.predict(X_test)  # run inference on the test set

# Show the first 10 test images with their true and predicted labels;
# axes.flat is row-major, matching the original idx // 5, idx % 5 layout.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))
for idx, ax in enumerate(axes.flat):
    ax.set_axis_off()  # hide the axes
    image = X_test[idx].reshape(8, 8)  # restore the flat vector to an 8x8 image
    ax.imshow(image, cmap=plt.cm.gray_r)  # draw the digit
    ax.set_title('Label: %d,Predict:%d'%(y_test[idx], y_pred[idx]))  # true vs. predicted
代码清单9-13 打印模型性能评估报告¶
In [ ]:
from sklearn import metrics #import the metrics module
print(metrics.classification_report(y_test, y_pred, digits=4)) #print per-class precision/recall/F1, 4 decimal places
代码清单9-14 打印混淆矩阵¶
In [ ]:
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_pred) #plot the confusion matrix (original comment said "print"; this draws a figure)
代码清单9-15 数据集的加载和划分¶
In [ ]:
from sklearn import datasets #import the sklearn.datasets module
from sklearn.model_selection import train_test_split #import the train_test_split function
digits = datasets.load_digits() #load the handwritten-digits dataset
X = digits.data #flattened 1-D vectors are available directly via the data attribute
X_train, X_test = train_test_split(X, test_size=0.01, random_state=13) #unsupervised split: 99% train, 1% test (no labels needed)
代码清单9-16 模型的训练及超参数选择¶
In [ ]:
from sklearn.cluster import KMeans  # import the KMeans class
from sklearn import metrics  # silhouette_score lives here
import matplotlib.pyplot as plt  # plotting module
import numpy as np  # numpy

# Try several cluster counts and keep the model with the best silhouette score.
ls_K = list(range(3, 20, 4))  # candidate cluster counts: 3, 7, 11, 15, 19
ls_SC = []                    # silhouette score for each candidate K
# BUG FIX: the original initialised SC_best = 0, but silhouette scores lie in
# [-1, 1]; if every candidate scored <= 0, kmeans_best was never assigned and
# the final print raised NameError. Start from -inf so the first model always wins.
SC_best = float('-inf')       # best silhouette score seen so far
kmeans_best = None            # best model seen so far
for K in ls_K:                # evaluate each candidate cluster count
    kmeans = KMeans(n_clusters=K, random_state=0)  # create the model
    kmeans.fit(X_train)                            # fit on the training data
    SC = metrics.silhouette_score(X_train, kmeans.labels_)  # score this clustering
    ls_SC.append(SC)          # record the score
    if SC > SC_best:          # found a better clustering
        SC_best = SC          # update the best score
        kmeans_best = kmeans  # update the best model

plt.rcParams['font.family'] = 'Microsoft YaHei'  # font that can render Chinese labels
plt.plot(ls_K, ls_SC, marker='o')  # silhouette score vs. number of clusters
plt.xlabel('簇数K')    # x-axis: number of clusters (runtime string kept verbatim)
plt.ylabel('轮廓系数')  # y-axis: silhouette score
plt.show()            # display the figure
print('最优簇数:%d'%kmeans_best.n_clusters)  # report the best cluster count
代码清单9-17 模型推理¶
In [ ]:
preds = kmeans_best.predict(X_test)  # assign each test sample to a cluster of the best model
# BUG FIX: the original indexed with `kmeans.labels_` — the LAST model fitted in the
# search loop — while `preds` comes from `kmeans_best`. Cluster ids from different
# models are unrelated, so the retrieved samples were wrong whenever the best K was
# not the last one tried. Use the labels of the same model that produced `preds`.
X_match = X_train[kmeans_best.labels_ == preds[0]]  # training samples sharing test sample 0's cluster

fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))  # 2x5 grid of sub-plots
for row in range(2):
    for col in range(5):
        axes[row][col].set_axis_off()  # hide all axes

image = X_test[0].reshape(8, 8)  # query image restored to 8x8
axes[0][0].imshow(image, cmap=plt.cm.gray_r)  # show the query image
axes[0][0].set_title('输入图像:')  # "input image" (runtime string kept verbatim)
axes[1][0].set_title('检索结果:')  # "retrieval results"
for idx in range(5):  # show the first 5 matching training samples
    # NOTE(review): assumes the matched cluster holds at least 5 samples — confirm
    image = X_match[idx].reshape(8, 8)  # restore to an 8x8 image
    axes[1][idx].imshow(image, cmap=plt.cm.gray_r)  # draw it
In [ ]:

浙公网安备 33010602011771号