LightGBM Usage

https://bacterous.github.io/2018/09/13/LightGBM%E4%BD%BF%E7%94%A8/

In-Depth Understanding of LightGBM

https://zhuanlan.zhihu.com/p/99069186

 

https://github.com/Microstrong0305/WeChat-zhihu-csdnblog-code

https://github.com/microsoft/LightGBM

https://www.csdn.net/tags/MtTaEg5sMjE2MjE0LWJsb2cO0O0O.html

 

Classifying the iris dataset (Iris.csv) with XGBoost

https://blog.csdn.net/Cyril_KI/article/details/107660210
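The post above builds an XGBoost classifier on the iris data; a minimal sketch of the same idea, using the scikit-learn copy of the dataset (parameter values here are illustrative, not taken from the post):

import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Illustrative sketch; the linked post may choose different parameters.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
clf = xgb.XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1)
clf.fit(X_train, y_train)
print('accuracy:', accuracy_score(y_test, clf.predict(X_test)))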

 

Splitting a large CSV file into smaller train/test CSV files with Python

https://blog.csdn.net/Findingxu/article/details/86683743?utm_term=csv%E6%96%87%E4%BB%B6%E5%88%86%E8%AE%AD%E7%BB%83%E9%9B%86%E5%92%8C%E6%B5%8B%E8%AF%95%E9%9B%86&utm_medium=distribute.pc_aggpage_search_result.none-task-blog-2~all~sobaiduweb~default-0-86683743-null-null&spm=3001.4430

https://wenku.baidu.com/view/9597ad3bc6da50e2524de518964bcf84b9d52db6.html
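The two links above cover splitting a large CSV into train and test files; a minimal pandas sketch of that idea (file names here are placeholders):

import pandas as pd
from sklearn.model_selection import train_test_split

# 'big.csv', 'sub_train.csv', 'sub_test.csv' are placeholder file names.
df = pd.read_csv('big.csv')
train_df, test_df = train_test_split(df, test_size=0.3, random_state=50)
train_df.to_csv('sub_train.csv', index=False)
test_df.to_csv('sub_test.csv', index=False)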

Program 1

import csv
import json
import joblib  # used below to save/load the trained model
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_iris, make_classification
def read_data(test_data='input/train.csv', n=0, label=1):
    '''
    Load the data.
    n: index of the first feature column
    label: 1 if this is labelled (supervised) data
    '''
    csv.field_size_limit(500 * 1024 * 1024)  # required, or very large fields raise an error
    csv_reader = csv.reader(open(test_data, encoding="utf8", errors="ignore"))
    data_list = []
    for one_line in csv_reader:
        data_list.append(one_line)
    x_list = []
    y_list = []
    for one_line in data_list[1:]:  # skip the header row
        if label == 1:  # labelled data
            y_list.append(int(one_line[-1]))  # the last column is the label
            x_list.append([float(o) for o in one_line[n:-1]])  # cast features to float
        else:
            x_list.append([float(o) for o in one_line[n:]])
    return x_list, y_list
 
def split_data(data_list, y_list, ratio=0.30):  # 70% train / 30% test, e.g. 914285 / 391837 rows
    '''
    Split the sample data by the given ratio.
    ratio: fraction of the data used for testing
    '''
    X_train, X_test, y_train, y_test = train_test_split(data_list, y_list, test_size=ratio, random_state=50)

    """Training set"""
    with open('input/sub_train.csv', 'w', encoding="utf8", newline="", errors="ignore") as csvfile:  # without newline="" a blank line appears between rows
        fieldnames = ['qid', 'question_text', 'target']
        write = csv.DictWriter(csvfile, fieldnames=fieldnames)
        write.writeheader()  # write the header row
        for i in range(len(X_train)):
            write.writerow({'qid': X_train[i][0], 'question_text': X_train[i][1], 'target': y_train[i]})

    """Test set"""
    # label file
    with open('input/sub_test_y', 'w') as fp:
        json.dump(y_test, fp)
    # test CSV
    with open('input/sub_test_x.csv', 'w', encoding="utf8", newline="", errors="ignore") as csvfile:
        fieldnames = ['qid', 'question_text']
        write = csv.DictWriter(csvfile, fieldnames=fieldnames)
        write.writeheader()  # write the header row
        for i in range(len(X_test)):
            write.writerow({'qid': X_test[i][0], 'question_text': X_test[i][1]})
    return X_train, X_test, y_train, y_test
 
F_feature, F_label = read_data(test_data='D:\\20210706E\\2020-python\\light_GBM\\Iris.csv', n=1, label=1)
print(np.array(F_feature))
print(np.array(F_label))
X_train, X_test, y_train, y_test = train_test_split(np.array(F_feature), np.array(F_label), test_size=0.2)
print(70 * '*\n')

# Alternative: load the same CSV with pandas
'''df_train = pd.read_csv('D:\\20210706E\\2020-python\\light_GBM\\Iris.csv', sep=",")
df_train_label = df_train['Species']
df_train_feature = df_train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
print(df_train_feature)'''


# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values


# Load the built-in iris data (not used below; the CSV loaded above is used instead)

print('Load data...')

iris = load_iris()
data = iris.data
target = iris.target
# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

print('Start training...')
# Build and train the model
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1',
        early_stopping_rounds=5)  # LightGBM >= 4 removed this keyword; use callbacks=[lgb.early_stopping(5)]
# Save the model
joblib.dump(gbm, 'loan_model.pkl')
# Load the model
gbm = joblib.load('loan_model.pkl')

print('Start predicting...')
# Predict on the test set
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Evaluate the model
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# Feature importances
print('Feature importances:', list(gbm.feature_importances_))
# Grid search for parameter tuning
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)

  

from sklearn.metrics import roc_auc_score, accuracy_score

# Predict with the grid-searched model
y_pred = gbm.predict(X_test)
# The regressor outputs continuous values; round them to the nearest class label
y_pred = [int(round(x)) for x in y_pred]
print(y_pred)

# Evaluate the model
print(accuracy_score(y_test, y_pred))
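For comparison, the same training loop can also be written with LightGBM's native API instead of the sklearn wrapper (a minimal sketch reusing X_train/X_test from above; it assumes LightGBM >= 3.3 for the early-stopping callback, and the parameters are illustrative):

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {'objective': 'regression', 'metric': 'l1', 'num_leaves': 31, 'learning_rate': 0.05}
bst = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)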

Data 1 (Iris.csv)

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0
6,5.4,3.9,1.7,0.4,0
7,4.6,3.4,1.4,0.3,0
8,5.0,3.4,1.5,0.2,0
9,4.4,2.9,1.4,0.2,0
10,4.9,3.1,1.5,0.1,0
11,5.4,3.7,1.5,0.2,0
12,4.8,3.4,1.6,0.2,0
13,4.8,3.0,1.4,0.1,0
14,4.3,3.0,1.1,0.1,0
15,5.8,4.0,1.2,0.2,0
16,5.7,4.4,1.5,0.4,0
17,5.4,3.9,1.3,0.4,0
18,5.1,3.5,1.4,0.3,0
19,5.7,3.8,1.7,0.3,0
20,5.1,3.8,1.5,0.3,0
21,5.4,3.4,1.7,0.2,0
22,5.1,3.7,1.5,0.4,0
23,4.6,3.6,1.0,0.2,0
24,5.1,3.3,1.7,0.5,0
25,4.8,3.4,1.9,0.2,0
26,5.0,3.0,1.6,0.2,0
27,5.0,3.4,1.6,0.4,0
28,5.2,3.5,1.5,0.2,0
29,5.2,3.4,1.4,0.2,0
30,4.7,3.2,1.6,0.2,0
31,4.8,3.1,1.6,0.2,0
32,5.4,3.4,1.5,0.4,0
33,5.2,4.1,1.5,0.1,0
34,5.5,4.2,1.4,0.2,0
35,4.9,3.1,1.5,0.1,0
36,5.0,3.2,1.2,0.2,0
37,5.5,3.5,1.3,0.2,0
38,4.9,3.1,1.5,0.1,0
39,4.4,3.0,1.3,0.2,0
40,5.1,3.4,1.5,0.2,0
41,5.0,3.5,1.3,0.3,0
42,4.5,2.3,1.3,0.3,0
43,4.4,3.2,1.3,0.2,0
44,5.0,3.5,1.6,0.6,0
45,5.1,3.8,1.9,0.4,0
46,4.8,3.0,1.4,0.3,0
47,5.1,3.8,1.6,0.2,0
48,4.6,3.2,1.4,0.2,0
49,5.3,3.7,1.5,0.2,0
50,5.0,3.3,1.4,0.2,0
51,7.0,3.2,4.7,1.4,1
52,6.4,3.2,4.5,1.5,1
53,6.9,3.1,4.9,1.5,1
54,5.5,2.3,4.0,1.3,1
55,6.5,2.8,4.6,1.5,1
56,5.7,2.8,4.5,1.3,1
57,6.3,3.3,4.7,1.6,1
58,4.9,2.4,3.3,1.0,1
59,6.6,2.9,4.6,1.3,1
60,5.2,2.7,3.9,1.4,1
61,5.0,2.0,3.5,1.0,1
62,5.9,3.0,4.2,1.5,1
63,6.0,2.2,4.0,1.0,1
64,6.1,2.9,4.7,1.4,1
65,5.6,2.9,3.6,1.3,1
66,6.7,3.1,4.4,1.4,1
67,5.6,3.0,4.5,1.5,1
68,5.8,2.7,4.1,1.0,1
69,6.2,2.2,4.5,1.5,1
70,5.6,2.5,3.9,1.1,1
71,5.9,3.2,4.8,1.8,1
72,6.1,2.8,4.0,1.3,1
73,6.3,2.5,4.9,1.5,1
74,6.1,2.8,4.7,1.2,1
75,6.4,2.9,4.3,1.3,1
76,6.6,3.0,4.4,1.4,1
77,6.8,2.8,4.8,1.4,1
78,6.7,3.0,5.0,1.7,1
79,6.0,2.9,4.5,1.5,1
80,5.7,2.6,3.5,1.0,1
81,5.5,2.4,3.8,1.1,1
82,5.5,2.4,3.7,1.0,1
83,5.8,2.7,3.9,1.2,1
84,6.0,2.7,5.1,1.6,1
85,5.4,3.0,4.5,1.5,1
86,6.0,3.4,4.5,1.6,1
87,6.7,3.1,4.7,1.5,1
88,6.3,2.3,4.4,1.3,1
89,5.6,3.0,4.1,1.3,1
90,5.5,2.5,4.0,1.3,1
91,5.5,2.6,4.4,1.2,1
92,6.1,3.0,4.6,1.4,1
93,5.8,2.6,4.0,1.2,1
94,5.0,2.3,3.3,1.0,1
95,5.6,2.7,4.2,1.3,1
96,5.7,3.0,4.2,1.2,1
97,5.7,2.9,4.2,1.3,1
98,6.2,2.9,4.3,1.3,1
99,5.1,2.5,3.0,1.1,1
100,5.7,2.8,4.1,1.3,1
101,6.3,3.3,6.0,2.5,2
102,5.8,2.7,5.1,1.9,2
103,7.1,3.0,5.9,2.1,2
104,6.3,2.9,5.6,1.8,2
105,6.5,3.0,5.8,2.2,2
106,7.6,3.0,6.6,2.1,2
107,4.9,2.5,4.5,1.7,2
108,7.3,2.9,6.3,1.8,2
109,6.7,2.5,5.8,1.8,2
110,7.2,3.6,6.1,2.5,2
111,6.5,3.2,5.1,2.0,2
112,6.4,2.7,5.3,1.9,2
113,6.8,3.0,5.5,2.1,2
114,5.7,2.5,5.0,2.0,2
115,5.8,2.8,5.1,2.4,2
116,6.4,3.2,5.3,2.3,2
117,6.5,3.0,5.5,1.8,2
118,7.7,3.8,6.7,2.2,2
119,7.7,2.6,6.9,2.3,2
120,6.0,2.2,5.0,1.5,2
121,6.9,3.2,5.7,2.3,2
122,5.6,2.8,4.9,2.0,2
123,7.7,2.8,6.7,2.0,2
124,6.3,2.7,4.9,1.8,2
125,6.7,3.3,5.7,2.1,2
126,7.2,3.2,6.0,1.8,2
127,6.2,2.8,4.8,1.8,2
128,6.1,3.0,4.9,1.8,2
129,6.4,2.8,5.6,2.1,2
130,7.2,3.0,5.8,1.6,2
131,7.4,2.8,6.1,1.9,2
132,7.9,3.8,6.4,2.0,2
133,6.4,2.8,5.6,2.2,2
134,6.3,2.8,5.1,1.5,2
135,6.1,2.6,5.6,1.4,2
136,7.7,3.0,6.1,2.3,2
137,6.3,3.4,5.6,2.4,2
138,6.4,3.1,5.5,1.8,2
139,6.0,3.0,4.8,1.8,2
140,6.9,3.1,5.4,2.1,2
141,6.7,3.1,5.6,2.4,2
142,6.9,3.1,5.1,2.3,2
143,5.8,2.7,5.1,1.9,2
144,6.8,3.2,5.9,2.3,2
145,6.7,3.3,5.7,2.5,2
146,6.7,3.0,5.2,2.3,2
147,6.3,2.5,5.0,1.9,2
148,6.5,3.0,5.2,2.0,2
149,6.2,3.4,5.4,2.3,2
150,5.9,3.0,5.1,1.8,2

 

Program 2

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import joblib  # sklearn.externals.joblib was removed; see the ImportError note at the end

# Load the data
iris = load_iris()
data = iris.data
target = iris.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# Train the model
gbm = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5)  # LightGBM >= 4: use callbacks instead (see the note below)

# Save the model
joblib.dump(gbm, 'loan_model.pkl')
# Load the model
gbm = joblib.load('loan_model.pkl')

# Predict on the test set
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

# Evaluate the model
print('The accuracy of prediction is:', accuracy_score(y_test, y_pred))

# Feature importances
print('Feature importances:', list(gbm.feature_importances_))

# Grid search for parameter tuning
estimator = LGBMClassifier(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)
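One caveat: LightGBM 4.x removed the early_stopping_rounds keyword from the sklearn-style fit(). If the fit() call above raises a TypeError, the callback form (available since roughly 3.3) is the replacement:

import lightgbm

gbm = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)],
        callbacks=[lightgbm.early_stopping(stopping_rounds=5)])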

ImportError: cannot import name 'joblib' from 'sklearn.externals'

https://blog.csdn.net/weixin_45031468/article/details/113825131
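As the linked post explains, joblib was deprecated in scikit-learn 0.21 and removed from sklearn.externals in 0.23; the fix (already applied in Program 2 above) is to use the standalone package:

# Old, fails on recent scikit-learn:
# from sklearn.externals import joblib
# New: pip install joblib, then
import joblib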