Using LightGBM
https://bacterous.github.io/2018/09/13/LightGBM%E4%BD%BF%E7%94%A8/
Understanding LightGBM in Depth
https://zhuanlan.zhihu.com/p/99069186
https://github.com/Microstrong0305/WeChat-zhihu-csdnblog-code
https://github.com/microsoft/LightGBM
https://www.csdn.net/tags/MtTaEg5sMjE2MjE0LWJsb2cO0O0O.html
Classifying the iris dataset (Iris.csv) with XGBoost
https://blog.csdn.net/Cyril_KI/article/details/107660210
Splitting a large CSV file into smaller train and test CSV files in Python
https://blog.csdn.net/Findingxu/article/details/86683743?utm_term=csv%E6%96%87%E4%BB%B6%E5%88%86%E8%AE%AD%E7%BB%83%E9%9B%86%E5%92%8C%E6%B5%8B%E8%AF%95%E9%9B%86&utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~all~sobaiduweb~default-0-86683743-null-null&spm=3001.4430
https://wenku.baidu.com/view/9597ad3bc6da50e2524de518964bcf84b9d52db6.html
Program 1
import csv
import json  # needed by split_data below
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris, make_classification
def read_data(test_data='input/train.csv', n=0, label=1):
    '''
    Load data from a CSV file.
    n: index of the first feature column
    label: 1 if the samples are labelled (supervised), 0 otherwise
    '''
    csv.field_size_limit(500 * 1024 * 1024)  # required, otherwise very large fields raise an error
    csv_reader = csv.reader(open(test_data, encoding="utf8", errors="ignore"))
    data_list = []
    for one_line in csv_reader:
        data_list.append(one_line)
    x_list = []
    y_list = []
    for one_line in data_list[1:]:  # skip the header row
        if label == 1:  # labelled (supervised) samples
            y_list.append(int(one_line[-1]))  # the last column is the label
            x_list.append(one_line[n:-1])
        else:
            x_list.append(one_line[n:])
    return x_list, y_list
def split_data(data_list, y_list, ratio=0.30):  # 70% train / 30% test: 914285 and 391837 rows
    '''
    Split the samples into train and test sets at the given ratio.
    ratio: fraction of the data used for testing
    '''
    X_train, X_test, y_train, y_test = train_test_split(data_list, y_list, test_size=ratio, random_state=50)
    # Training set
    with open('input/sub_train.csv', 'w', encoding="utf8", newline="", errors="ignore") as csvfile:  # without newline="" every row is followed by a blank line
        fieldnames = ['qid', 'question_text', 'target']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(len(X_train)):
            writer.writerow({'qid': X_train[i][0], 'question_text': X_train[i][1], 'target': y_train[i]})
    # Test set
    # Label file
    with open('input/sub_test_y', 'w') as fp:
        json.dump(y_test, fp)
    # Test CSV
    with open('input/sub_test_x.csv', 'w', encoding="utf8", newline="", errors="ignore") as csvfile:
        fieldnames = ['qid', 'question_text']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i in range(len(X_test)):
            writer.writerow({'qid': X_test[i][0], 'question_text': X_test[i][1]})
    return X_train, X_test, y_train, y_test
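split_data is defined but never called in this program. A minimal usage sketch, assuming an input/train.csv in the qid,question_text,target layout the function expects (the file names are the defaults used above):

# Hypothetical driver for split_data; input/train.csv is assumed to exist.
text_x, text_y = read_data(test_data='input/train.csv', n=0, label=1)
X_train, X_test, y_train, y_test = split_data(text_x, text_y, ratio=0.30)
print(len(X_train), len(X_test))  # e.g. 914285 / 391837 on the full dataset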
F_feature, F_label = read_data(test_data='D:\\20210706E\\2020-python\\light_GBM\\Iris.csv', n=1, label=1)
print(np.array(F_feature))
print(np.array(F_label))
# read_data returns the feature values as strings, so convert them to float explicitly
X_train, X_test, y_train, y_test = train_test_split(np.array(F_feature, dtype=float), np.array(F_label), test_size=0.2)
print(70 * '*')  # separator
'''Alternative: load Iris.csv with pandas instead of read_data
df_train = pd.read_csv('D:\\20210706E\\2020-python\\light_GBM\\Iris.csv', sep=",")
df_train_label = df_train['Species']
df_train_feature = df_train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
print(df_train_feature)'''
# Load the data
print('Load data...')
iris = load_iris()
data=iris.data
target = iris.target
#X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values
print('Start training...')
# Create and train the model
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1', early_stopping_rounds=5)
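Note: newer LightGBM releases (4.0+) removed the early_stopping_rounds keyword from the scikit-learn fit() API. If the call above raises a TypeError, the callbacks form below is the equivalent; a sketch assuming LightGBM >= 4.0:

# Equivalent early stopping via callbacks (LightGBM >= 4.0), where fit()
# no longer accepts early_stopping_rounds directly.
gbm.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(stopping_rounds=5)],
)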
print('Start predicting...')
# Predict on the test set
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Evaluate the model
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# Feature importances
print('Feature importances:', list(gbm.feature_importances_))
# Grid search for parameter tuning
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)
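GridSearchCV refits the estimator on the whole training set with the best parameters by default (refit=True), so the tuned model is available directly. A short follow-up evaluating it on the held-out test set:

# The model retrained with the best parameters lives in best_estimator_.
best_gbm = gbm.best_estimator_
y_pred_best = best_gbm.predict(X_test)
print('Tuned rmse:', mean_squared_error(y_test, y_pred_best) ** 0.5)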
Data 1 (Iris.csv)
Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
6,5.4,3.9,1.7,0.4,0
7,4.6,3.4,1.4,0.3,0
8,5.0,3.4,1.5,0.2,0
9,4.4,2.9,1.4,0.2,0
10,4.9,3.1,1.5,0.1,0
11,5.4,3.7,1.5,0.2,0
12,4.8,3.4,1.6,0.2,0
13,4.8,3.0,1.4,0.1,0
14,4.3,3.0,1.1,0.1,0
15,5.8,4.0,1.2,0.2,0
16,5.7,4.4,1.5,0.4,0
17,5.4,3.9,1.3,0.4,0
18,5.1,3.5,1.4,0.3,0
19,5.7,3.8,1.7,0.3,0
20,5.1,3.8,1.5,0.3,0
21,5.4,3.4,1.7,0.2,0
22,5.1,3.7,1.5,0.4,0
23,4.6,3.6,1.0,0.2,0
24,5.1,3.3,1.7,0.5,0
25,4.8,3.4,1.9,0.2,0
26,5.0,3.0,1.6,0.2,0
27,5.0,3.4,1.6,0.4,0
28,5.2,3.5,1.5,0.2,0
29,5.2,3.4,1.4,0.2,0
30,4.7,3.2,1.6,0.2,0
31,4.8,3.1,1.6,0.2,0
32,5.4,3.4,1.5,0.4,0
33,5.2,4.1,1.5,0.1,0
34,5.5,4.2,1.4,0.2,0
35,4.9,3.1,1.5,0.1,0
36,5.0,3.2,1.2,0.2,0
37,5.5,3.5,1.3,0.2,0
38,4.9,3.1,1.5,0.1,0
39,4.4,3.0,1.3,0.2,0
40,5.1,3.4,1.5,0.2,0
41,5.0,3.5,1.3,0.3,0
42,4.5,2.3,1.3,0.3,0
43,4.4,3.2,1.3,0.2,0
44,5.0,3.5,1.6,0.6,0
45,5.1,3.8,1.9,0.4,0
46,4.8,3.0,1.4,0.3,0
47,5.1,3.8,1.6,0.2,0
48,4.6,3.2,1.4,0.2,0
49,5.3,3.7,1.5,0.2,0
50,5.0,3.3,1.4,0.2,0
51,7.0,3.2,4.7,1.4,1
52,6.4,3.2,4.5,1.5,1
53,6.9,3.1,4.9,1.5,1
54,5.5,2.3,4.0,1.3,1
55,6.5,2.8,4.6,1.5,1
56,5.7,2.8,4.5,1.3,1
57,6.3,3.3,4.7,1.6,1
58,4.9,2.4,3.3,1.0,1
59,6.6,2.9,4.6,1.3,1
60,5.2,2.7,3.9,1.4,1
61,5.0,2.0,3.5,1.0,1
62,5.9,3.0,4.2,1.5,1
63,6.0,2.2,4.0,1.0,1
64,6.1,2.9,4.7,1.4,1
65,5.6,2.9,3.6,1.3,1
66,6.7,3.1,4.4,1.4,1
67,5.6,3.0,4.5,1.5,1
68,5.8,2.7,4.1,1.0,1
69,6.2,2.2,4.5,1.5,1
70,5.6,2.5,3.9,1.1,1
71,5.9,3.2,4.8,1.8,1
72,6.1,2.8,4.0,1.3,1
73,6.3,2.5,4.9,1.5,1
74,6.1,2.8,4.7,1.2,1
75,6.4,2.9,4.3,1.3,1
76,6.6,3.0,4.4,1.4,1
77,6.8,2.8,4.8,1.4,1
78,6.7,3.0,5.0,1.7,1
79,6.0,2.9,4.5,1.5,1
80,5.7,2.6,3.5,1.0,1
81,5.5,2.4,3.8,1.1,1
82,5.5,2.4,3.7,1.0,1
83,5.8,2.7,3.9,1.2,1
84,6.0,2.7,5.1,1.6,1
85,5.4,3.0,4.5,1.5,1
86,6.0,3.4,4.5,1.6,1
87,6.7,3.1,4.7,1.5,1
88,6.3,2.3,4.4,1.3,1
89,5.6,3.0,4.1,1.3,1
90,5.5,2.5,4.0,1.3,1
91,5.5,2.6,4.4,1.2,1
92,6.1,3.0,4.6,1.4,1
93,5.8,2.6,4.0,1.2,1
94,5.0,2.3,3.3,1.0,1
95,5.6,2.7,4.2,1.3,1
96,5.7,3.0,4.2,1.2,1
97,5.7,2.9,4.2,1.3,1
98,6.2,2.9,4.3,1.3,1
99,5.1,2.5,3.0,1.1,1
100,5.7,2.8,4.1,1.3,1
101,6.3,3.3,6.0,2.5,2
102,5.8,2.7,5.1,1.9,2
103,7.1,3.0,5.9,2.1,2
104,6.3,2.9,5.6,1.8,2
105,6.5,3.0,5.8,2.2,2
106,7.6,3.0,6.6,2.1,2
107,4.9,2.5,4.5,1.7,2
108,7.3,2.9,6.3,1.8,2
109,6.7,2.5,5.8,1.8,2
110,7.2,3.6,6.1,2.5,2
111,6.5,3.2,5.1,2.0,2
112,6.4,2.7,5.3,1.9,2
113,6.8,3.0,5.5,2.1,2
114,5.7,2.5,5.0,2.0,2
115,5.8,2.8,5.1,2.4,2
116,6.4,3.2,5.3,2.3,2
117,6.5,3.0,5.5,1.8,2
118,7.7,3.8,6.7,2.2,2
119,7.7,2.6,6.9,2.3,2
120,6.0,2.2,5.0,1.5,2
121,6.9,3.2,5.7,2.3,2
122,5.6,2.8,4.9,2.0,2
123,7.7,2.8,6.7,2.0,2
124,6.3,2.7,4.9,1.8,2
125,6.7,3.3,5.7,2.1,2
126,7.2,3.2,6.0,1.8,2
127,6.2,2.8,4.8,1.8,2
128,6.1,3.0,4.9,1.8,2
129,6.4,2.8,5.6,2.1,2
130,7.2,3.0,5.8,1.6,2
131,7.4,2.8,6.1,1.9,2
132,7.9,3.8,6.4,2.0,2
133,6.4,2.8,5.6,2.2,2
134,6.3,2.8,5.1,1.5,2
135,6.1,2.6,5.6,1.4,2
136,7.7,3.0,6.1,2.3,2
137,6.3,3.4,5.6,2.4,2
138,6.4,3.1,5.5,1.8,2
139,6.0,3.0,4.8,1.8,2
140,6.9,3.1,5.4,2.1,2
141,6.7,3.1,5.6,2.4,2
142,6.9,3.1,5.1,2.3,2
143,5.8,2.7,5.1,1.9,2
144,6.8,3.2,5.9,2.3,2
145,6.7,3.3,5.7,2.5,2
146,6.7,3.0,5.2,2.3,2
147,6.3,2.5,5.0,1.9,2
148,6.5,3.0,5.2,2.0,2
149,6.2,3.4,5.4,2.3,2
150,5.9,3.0,5.1,1.8,2
Program 2
import joblib  # on old scikit-learn this was "from sklearn.externals import joblib"; see the error note below
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris

# Load the data
iris = load_iris()
data = iris.data
target = iris.target
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
# Train the model
gbm = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5)
# Save the model
joblib.dump(gbm, 'loan_model.pkl')
# Load the model back
gbm = joblib.load('loan_model.pkl')
# Predict on the test set
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Evaluate the model
print('The accuracy of prediction is:', accuracy_score(y_test, y_pred))
# Feature importances
print('Feature importances:', list(gbm.feature_importances_))
# Grid search for parameter tuning
estimator = LGBMClassifier(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)
ImportError: cannot import name 'joblib' from 'sklearn.externals'
This error occurs because scikit-learn 0.23 removed the bundled sklearn.externals.joblib; install and import the standalone joblib package instead.
https://blog.csdn.net/weixin_45031468/article/details/113825131
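A minimal sketch of the fix: import the standalone joblib package directly, falling back to the old location only on very old scikit-learn installs:

# Fix for the ImportError above: sklearn.externals.joblib was removed in
# scikit-learn 0.23, so import the standalone joblib package instead.
try:
    import joblib
except ImportError:  # very old environments that only ship the bundled copy
    from sklearn.externals import joblib

joblib.dump(gbm, 'loan_model.pkl')   # save a trained model
gbm = joblib.load('loan_model.pkl')  # load it back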