Machine Learning Competitions
Start with sklearn's random forest and the xgboost library.
Run these two algorithms first; they usually give a respectable baseline, and both are fast, typically finishing within 30 seconds.
Then move on to feature engineering, deep learning, and so on.
Template:
# xgboost native interface
import xgboost as xgb
# scikit-learn style interface
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.datasets import load_breast_cancer, load_wine
import warnings
warnings.simplefilter("ignore")
cancer = load_breast_cancer()
wine = load_wine()
# data_train, data_test, target_train, target_test = train_test_split(cancer.data, cancer.target, test_size = 0.2, random_state = 0)
# params = {
#     'eta': 0.02,              # learning rate
#     'max_depth': 6,
#     'min_child_weight': 3,    # minimum sum of instance weights in a leaf node
#     'gamma': 0,               # minimum loss reduction required to split a node
#     'subsample': 0.7,         # fraction of rows sampled for each tree
#     'colsample_bytree': 0.3,  # fraction of columns (features) sampled for each tree
#     'lambda': 2,              # L2 regularization
#     'objective': 'binary:logistic',
#     'eval_metric': 'auc',
#     'verbosity': 0,           # 'silent' was removed in xgboost >= 1.0
#     'nthread': -1
# }
# xgb_train = xgb.DMatrix(data_train, target_train)
# xgb_test = xgb.DMatrix(data_test, target_test)
# xgb_model = xgb.train(params=params, dtrain=xgb_train)
# xgb_predict = xgb_model.predict(xgb_test)  # evaluate on the held-out split
# xgb_predict[xgb_predict > .5] = 1
# xgb_predict[xgb_predict <= .5] = 0
data_train, data_test, target_train, target_test = train_test_split(wine.data, wine.target, test_size = 0.2, random_state = 0)
params = {
    'eta': 0.02,              # learning rate
    'num_class': 3,
    'max_depth': 5,
    'min_child_weight': 1,    # minimum sum of instance weights in a leaf node
    'gamma': 0,               # minimum loss reduction required to split a node
    'subsample': 0.7,         # fraction of rows sampled for each tree
    'colsample_bytree': 0.3,  # fraction of columns (features) sampled for each tree
    'lambda': 2,              # L2 regularization
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'verbosity': 0,           # 'silent' was removed in xgboost >= 1.0
    'nthread': -1
    # n_estimators belongs to the scikit-learn interface; with the native API,
    # pass num_boost_round to xgb.train instead (see below)
}
xgb_train = xgb.DMatrix(data_train, target_train)
xgb_test = xgb.DMatrix(data_test, target_test)
xgb_model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=100)
xgb_test_pred = xgb_model.predict(xgb_test)  # multi:softmax returns class labels directly
print('xgboost accuracy', accuracy_score(target_test, xgb_test_pred))
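XGBClassifier is imported above but never used; for reference, a minimal sketch of the same model through the scikit-learn style interface (parameter names mirror the native ones; assumes xgboost >= 1.6, where eval_metric is a constructor argument):
clf = XGBClassifier(n_estimators=100, learning_rate=0.02, max_depth=5,
                    subsample=0.7, colsample_bytree=0.3, reg_lambda=2,
                    eval_metric='mlogloss')  # sketch, hyperparameters not tuned
clf.fit(data_train, target_train)
print('xgboost (sklearn API) accuracy', accuracy_score(target_test, clf.predict(data_test)))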
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, r2_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# import seaborn as sns
import matplotlib.pyplot as plt
import torch                    # PyTorch, for the deep-learning step mentioned at the top
import torch.nn as nn
import torch.optim as optim
# train the random forest baseline on the same wine split
model_tree = RandomForestClassifier(n_estimators=102, random_state=42)  # use RandomForestRegressor for regression
model_tree.fit(data_train, target_train)
print('random forest accuracy', accuracy_score(target_test, model_tree.predict(data_test)))
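KFold is imported at the top but never used; a minimal 5-fold cross-validation sketch on the same wine data, which gives a less noisy estimate than a single split:
scores = []
for train_idx, val_idx in KFold(n_splits=5, shuffle=True, random_state=42).split(wine.data):
    fold_model = RandomForestClassifier(n_estimators=102, random_state=42)
    fold_model.fit(wine.data[train_idx], wine.target[train_idx])
    scores.append(accuracy_score(wine.target[val_idx], fold_model.predict(wine.data[val_idx])))
print('random forest 5-fold accuracy', np.mean(scores))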
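The torch imports above fit the deep-learning step mentioned at the top; a minimal MLP sketch on the same wine split (architecture and hyperparameters are illustrative assumptions, not tuned):
X_t = torch.tensor(data_train, dtype=torch.float32)
y_t = torch.tensor(target_train, dtype=torch.long)
mlp = nn.Sequential(nn.Linear(13, 32), nn.ReLU(), nn.Linear(32, 3))  # wine: 13 features, 3 classes
opt = optim.Adam(mlp.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
for _ in range(200):  # full-batch training; fine for ~140 training rows
    opt.zero_grad()
    loss = loss_fn(mlp(X_t), y_t)
    loss.backward()
    opt.step()
with torch.no_grad():
    test_pred = mlp(torch.tensor(data_test, dtype=torch.float32)).argmax(dim=1).numpy()
print('mlp accuracy', accuracy_score(target_test, test_pred))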