lightgbm

1.

# coding=utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
import re
from sklearn.decomposition import PCA
import joblib
import shap
import time
from lightgbm import plot_importance
import seaborn
import warnings
warnings.filterwarnings("ignore")

#https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html

# Load the workbook: the first 13 columns are the features and the 14th
# column is the regression target.
data = pd.read_excel(r"E:\Desktop\data.xlsx")
X, y = data.iloc[:, :13], data.iloc[:, 13]

# Hold out 20% of the rows for the final evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Base estimator. `boosting` is not a named constructor argument of
# LGBMRegressor; it is forwarded to LightGBM as an extra keyword parameter.
# NOTE(review): device="gpu" presumably needs a GPU-enabled LightGBM build --
# confirm on the target machine. It is also repeated in the grid below.
lgb = LGBMRegressor(boosting="gbdt", device="gpu", random_state=0)

# Search space: only learning_rate and num_leaves have two values each, so
# the grid holds 2 x 2 = 4 candidates; every other entry is a fixed setting.
# NOTE(review): LightGBM documents that bagging is only enabled when
# subsample_freq is non-zero, so subsample=0.8 may have no effect here --
# confirm and add subsample_freq if row subsampling is intended.
param_grid = dict(
    n_estimators=[100],
    max_depth=[5],
    learning_rate=[0.01, 0.1],
    subsample=[0.8],
    colsample_bytree=[0.8],
    num_leaves=[15, 31],
    n_jobs=[-1],
    device=["gpu"],
)

# Each candidate is scored with 10-fold cross-validation on negative MSE.
grid = GridSearchCV(
    estimator=lgb,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
)

def _regression_metrics(y_true, y_hat):
    """Score one set of predictions against the observed targets.

    Args:
        y_true: Observed target values.
        y_hat: Predictions aligned with ``y_true``.

    Returns:
        Tuple ``(mse, rmse, mae, r2, pcc)``; ``pcc`` is the off-diagonal
        entry of ``np.corrcoef(y_true, y_hat)``.
    """
    mse_value = mean_squared_error(y_true, y_hat)
    return (
        mse_value,
        np.sqrt(mse_value),
        mean_absolute_error(y_true, y_hat),
        r2_score(y_true, y_hat),
        np.corrcoef(y_true, y_hat)[0, 1],
    )


def _plot_residuals(y_true, y_hat, hist_title, scatter_title):
    """Show a residual histogram, then a residual-vs-true-value scatter.

    Args:
        y_true: Observed target values.
        y_hat: Predictions aligned with ``y_true``.
        hist_title: Title of the histogram figure.
        scatter_title: Title of the scatter figure.

    Returns:
        The residuals ``y_true - y_hat``.
    """
    resid = y_true - y_hat

    # Residual distribution.
    seaborn.histplot(resid, bins=20, color="orange", kde=True)
    plt.xlabel("Residuals")
    plt.ylabel("Count")
    plt.title(hist_title)
    plt.show()

    # Residuals against the true value, with a zero reference line.
    plt.scatter(y_true, resid)
    plt.hlines(y=0, xmin=y_true.min(), xmax=y_true.max())
    plt.title(scatter_title)
    plt.xlabel('True value')
    plt.ylabel('Residual')
    plt.show()

    return resid


# --- Training ---------------------------------------------------------------
# Time only the grid-search fit. The end timestamp is taken immediately after
# fit() so the reported "Training time" does not also include the plots
# (plt.show() normally blocks until the window is closed), the SHAP
# computation and the Excel round-trip that follow.
start = time.time()
grid.fit(X_train, y_train)
end = time.time()

best_lgb = grid.best_estimator_
y_pred = best_lgb.predict(X_test)

# --- Test-set diagnostics ---------------------------------------------------
residual_test = _plot_residuals(
    y_test,
    y_pred,
    "y_test Residuals Distribution",
    'Residual plot for y_pred',
)
mse, rmse, mae, r2, pcc = _regression_metrics(y_test, y_pred)

# Predicted vs. true values on the test set.
plt.scatter(y_test, y_pred, c="blue")
plt.xlabel("Truth")
plt.ylabel("predict")
plt.title("Truth vs predict")
plt.show()

# --- PCA overview of the full feature matrix --------------------------------
# Project all rows onto the first two principal components, coloured by target.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="rainbow")
plt.xlabel("1st_PCA")
plt.ylabel("2rd_PCA")
plt.title("PCA result")
plt.colorbar()
plt.show()

# --- Training-set diagnostics -----------------------------------------------
y_train_pred = best_lgb.predict(X_train)
residual_train = _plot_residuals(
    y_train,
    y_train_pred,
    "y_train Residuals Distribution",
    'Residual plot for y_train_pred',
)
# Kept for compatibility: the original script left `residuals` holding the
# training residuals at this point.
residuals = residual_train
mse_train, rmse_train, mae_train, r2_train, pcc_train = _regression_metrics(
    y_train, y_train_pred
)

# --- Feature importance -----------------------------------------------------
# LightGBM's built-in importance plot (pass max_num_features to limit how
# many features are shown).
plot_importance(best_lgb)
plt.show()

# The same importances as a plain bar chart, one bar per feature column.
feature_names = X.columns
feature_importances = best_lgb.feature_importances_
plt.bar(feature_names, feature_importances)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature importance")
plt.show()

# --- SHAP -------------------------------------------------------------------
# SHAP values for every row of X, summarised as a bar plot per feature.
explainer = shap.TreeExplainer(best_lgb)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X, plot_type="bar")

# --- Persist the model and batch-predict the workbook -----------------------
joblib.dump(best_lgb, 'best_lgb5.pkl')

# Reload the saved model and the workbook.
model = joblib.load('best_lgb5.pkl')
data = pd.read_excel(r"E:\Desktop\data.xlsx", header=0)
rows, cols = data.shape

# Predict all rows in one call instead of one predict() per row, and store
# the result in a new column. The column label is the integer cols + 1, as
# before. The target Series `y` is deliberately left untouched (the previous
# per-row loop overwrote it with a scalar prediction).
data[cols + 1] = model.predict(data.iloc[:, :13])

# NOTE(review): this writes back to the same workbook that was read above, so
# every run adds another prediction column to it -- confirm this is intended.
data.to_excel(r"E:\Desktop\data.xlsx", index=False)

# --- Report -----------------------------------------------------------------
print("best_params:", grid.best_params_)
print("mse:", mse)
print("rmse:", rmse)
print("mae:", mae)
print("r2:", r2)
print("pcc:", pcc)
print("mae_train:", mae_train)
print("mse_train:", mse_train)
print("rmse_train:", rmse_train)
print("r2_train:", r2_train)
print("pcc_train:", pcc_train)
# Wall-clock duration of grid.fit() only (see the timing note above).
print("Training time: {:.2f} seconds".format(end - start))

  

posted @ 2023-11-18 14:14  kehan  阅读(20)  评论(0编辑  收藏  举报