import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 模型处理模块
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
# 常规模型
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# 集成学习和stacking模型
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
import xgboost as xgb
# 评价标准模块
from sklearn import metrics
from sklearn.metrics import accuracy_score,roc_auc_score,recall_score,precision_score, classification_report
import warnings
warnings.filterwarnings('ignore')
# Enable inline matplotlib rendering when running under IPython/Jupyter.
# The bare `%matplotlib inline` magic is not valid Python syntax, so it is
# invoked through the IPython API instead; outside IPython `get_ipython` does
# not exist and the step is skipped so the file still runs as a plain script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass
# Load the merged order training set; the three date columns are parsed as datetimes.
data = pd.read_csv(
    r"E:\Excersise\ML\Trip\order_train_merage.csv",
    parse_dates=["orderdate", "arrival", "etd"],
)
data.head()


def _missing_summary(column):
    """Return [missing count, missing ratio] for one DataFrame column."""
    missing_count = column.isnull().sum()
    return [missing_count, missing_count / column.size]


# Check, for every field, how many values are missing and what share that is.
data.apply(_missing_summary, axis=0)

# Drop every row that has at least one missing value (modifies `data` in place).
data.dropna(inplace=True)

# Quick inspection: class balance of the target, duplicate-row count,
# and summary statistics of the object-typed columns.
data["label"].value_counts()
data.duplicated().sum()
data.describe(include="object")

# One-hot encode the two categorical fields and append the indicator columns.
# The original `hotelbelongto` / `supplierchannel` columns stay in `data`.
dummies = pd.get_dummies(data["hotelbelongto"], prefix="hotelbelongto")
dummies_1 = pd.get_dummies(data["supplierchannel"], prefix="supplierchannel")
data = pd.concat([data, dummies, dummies_1], axis=1)
data.head()
# ADASYN adaptive oversampling of the training data.
from imblearn.over_sampling import ADASYN

sample = ADASYN()

# The sampler is fed plain arrays: every column except "label" as features,
# and the "label" column as the target.
# NOTE(review): at this point `data` still appears to hold the parsed date
# columns and the original categorical columns alongside their dummies —
# confirm they are dropped or encoded before resampling.
X_resampled, y_resampled = sample.fit_resample(
    data.loc[:, data.columns != "label"].values,
    data.label.values,
)
# Candidate classifiers keyed by a short display name; each is created with
# the library's default hyper-parameters.
model_name_param_dict = {
    'LR': LogisticRegression(),
    'DT': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GBDT': GradientBoostingClassifier(),
    'RF': RandomForestClassifier(),
    # xgboost is imported as `xgb`, so the classifier is reached through that
    # alias (a bare `XGBClassifier` name is not in scope).
    'XGBoost': xgb.XGBClassifier(),
}

# Call train_model once per candidate on the same split and store whatever it
# returns under the model's name. A plain loop is kept so that results gathered
# before a failing model remain available in `result`.
# NOTE(review): train_model and X_train / y_train / X_test / y_test are not
# defined in this part of the file — confirm they exist before this runs.
result = {}
for model_name, model in model_name_param_dict.items():
    result[model_name] = train_model(X_train, y_train, X_test, y_test, model, model_name)