Decision Tree Rule Mining

import pandas as pd
import numpy as np
import os
data = pd.read_excel('oil_data_for_tree.xlsx')
data.head()
uid oil_actv_dt create_dt total_oil_cnt pay_amount_total class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
0 A8217710 2018-08-19 2018-08-17 137.0 24147747.2 B 0 1653.78 880040.0 898000.0 865540.0 4327700.0 0.0 100.0 1 3 2 0 3
1 A8217710 2018-08-19 2018-08-16 137.0 24147747.2 B 0 2336.84 1243522.0 1268900.0 1218922.0 6094610.0 0.0 100.0 1 3 2 0 3
2 A8217710 2018-08-19 2018-08-15 137.0 24147747.2 B 0 936.03 488922.0 498900.0 480922.0 2404610.0 0.0 200.0 1 2 2 0 3
3 A8217710 2018-08-19 2018-08-14 137.0 24147747.2 B 0 2418.39 1263220.0 1289000.0 1242220.0 6211100.0 0.0 300.0 1 2 2 0 3
4 A8217710 2018-08-19 2018-08-13 137.0 24147747.2 B 0 1292.69 675220.0 689000.0 664220.0 3321100.0 0.0 100.0 1 2 2 0 3


set(data.class_new)
{'A', 'B', 'C', 'D', 'E', 'F'}

org_lst: time and categorical variables; no special transformation needed, just deduplicate.
agg_lst: numeric variables, to be aggregated.
dstc_lst: code-type variables, to be counted as distinct values.

org_lst = ['uid', 'create_dt', 'oil_actv_dt', 'class_new', 'bad_ind']
agg_lst = ['oil_amount', 'discount_amount', 'sale_amount', 'amount', 'pay_amount', 'coupon_amount', 'payment_coupon_amount']
dstc_lst = ['channel_code', 'oil_code', 'scene', 'source_app', 'call_source']

Reassemble the data

df = data[org_lst].copy()
df.head()
uid create_dt oil_actv_dt class_new bad_ind
0 A8217710 2018-08-17 2018-08-19 B 0
1 A8217710 2018-08-16 2018-08-19 B 0
2 A8217710 2018-08-15 2018-08-19 B 0
3 A8217710 2018-08-14 2018-08-19 B 0
4 A8217710 2018-08-13 2018-08-19 B 0
df[agg_lst] = data[agg_lst].copy()
df[agg_lst].head()
oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount
0 1653.78 880040.0 898000.0 865540.0 4327700.0 0.0 100.0
1 2336.84 1243522.0 1268900.0 1218922.0 6094610.0 0.0 100.0
2 936.03 488922.0 498900.0 480922.0 2404610.0 0.0 200.0
3 2418.39 1263220.0 1289000.0 1242220.0 6211100.0 0.0 300.0
4 1292.69 675220.0 689000.0 664220.0 3321100.0 0.0 100.0
df[dstc_lst] = data[dstc_lst].copy()
df[dstc_lst].head()
channel_code oil_code scene source_app call_source
0 1 3 2 0 3
1 1 3 2 0 3
2 1 2 2 0 3
3 1 2 2 0 3
4 1 2 2 0 3

Check for missing values

df.isna().sum()
uid                         0
create_dt                4944
oil_actv_dt                 0
class_new                   0
bad_ind                     0
oil_amount               4944
discount_amount          4944
sale_amount              4944
amount                   4944
pay_amount               4944
coupon_amount            4944
payment_coupon_amount    4946
channel_code                0
oil_code                    0
scene                       0
source_app                  0
call_source                 0
dtype: int64

Look at describe() for the base variables

df.describe()
bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
count 50609.000000 45665.000000 4.566500e+04 4.566500e+04 4.566500e+04 4.566500e+04 45665.0 45663.000000 50609.000000 50609.000000 50609.000000 50609.000000 50609.000000
mean 0.017764 212.188054 1.091035e+05 1.121195e+05 1.077312e+05 5.386562e+05 0.0 417.055384 1.476378 1.617894 1.906519 0.306072 2.900729
std 0.132093 200.298122 1.010993e+05 1.031804e+05 9.953775e+04 4.976888e+05 0.0 968.250273 1.511470 3.074166 0.367280 0.893682 0.726231
min 0.000000 0.000000 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 87.220000 4.854000e+04 5.000000e+04 4.820000e+04 2.410000e+05 0.0 0.000000 1.000000 0.000000 2.000000 0.000000 3.000000
50% 0.000000 167.580000 8.820000e+04 9.000000e+04 8.709600e+04 4.354800e+05 0.0 100.000000 1.000000 0.000000 2.000000 0.000000 3.000000
75% 0.000000 278.300000 1.391600e+05 1.430000e+05 1.371150e+05 6.855750e+05 0.0 500.000000 1.000000 0.000000 2.000000 0.000000 3.000000
max 1.000000 3975.910000 1.958040e+06 1.998000e+06 1.925540e+06 9.627700e+06 0.0 50000.000000 6.000000 9.000000 2.000000 3.000000 4.000000

Fill missing create_dt values with oil_actv_dt, and keep only the most recent 6 months of data.
When constructing variables, do not simply accumulate over the entire history;
otherwise the variable distributions will shift substantially as time goes on.

def time_isna(x, y):
    # if create_dt is missing (NaT), fall back to oil_actv_dt
    if str(x) == 'NaT':
        x = y
    return x
df2 = df.sort_values(['uid','create_dt'], ascending = False)
df2.head()
uid create_dt oil_actv_dt class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
50608 B96436391985035703 NaT 2018-10-08 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50607 B96436391984693397 NaT 2018-10-11 E 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50606 B96436391977217468 NaT 2018-10-17 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50605 B96436391976480892 NaT 2018-09-28 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50604 B96436391972106043 NaT 2018-10-19 A 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
df2['create_dt'] = df2.apply(lambda x: time_isna(x.create_dt, x.oil_actv_dt), axis = 1)
df2['create_dt'].head()
50608   2018-10-08
50607   2018-10-11
50606   2018-10-17
50605   2018-09-28
50604   2018-10-19
Name: create_dt, dtype: datetime64[ns]
df2['dtn'] = (df2.oil_actv_dt - df2.create_dt).apply(lambda x: x.days)
df2['dtn']
50608      0
50607      0
50606      0
50605      0
50604      0
50603      0
50602      0
50601      0
50600      0
50599      0
50598      0
50597      0
50596      0
50595      0
50594      0
50593      0
50592      0
50591      0
50590      0
50589      0
50588      0
50587      0
50586      0
50585      0
50584      0
50583      0
50582      0
50581      0
50580      0
50579      0
        ... 
12860    255
12865    275
46427      0
35426      2
35425      4
35427      7
35419      9
35418     15
35414     19
35420     38
35421     39
35428     41
35417     42
35415     44
35424     45
35423     46
35422     48
35416     49
40437    183
40440    186
40442    188
40438    189
40436    192
40441    193
40439    194
46426      0
41434     21
41433     36
41432     69
45296     28
Name: dtn, Length: 50609, dtype: int64
df = df2[df2['dtn'] < 180]
df.head()
uid create_dt oil_actv_dt class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source dtn
50608 B96436391985035703 2018-10-08 2018-10-08 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50607 B96436391984693397 2018-10-11 2018-10-11 E 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50606 B96436391977217468 2018-10-17 2018-10-17 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50605 B96436391976480892 2018-09-28 2018-09-28 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50604 B96436391972106043 2018-10-19 2018-10-19 A 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
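
For reference, the same fill-and-filter step can be written without a row-wise apply. A minimal sketch (assuming create_dt and oil_actv_dt are already datetime64 columns, as the dtypes above suggest):

# sketch: vectorized equivalent of time_isna + apply
df2['create_dt'] = df2['create_dt'].fillna(df2['oil_actv_dt'])  # fill NaT with the activation date
df2['dtn'] = (df2['oil_actv_dt'] - df2['create_dt']).dt.days    # gap in days
df = df2[df2['dtn'] < 180]                                       # keep roughly the last 6 months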

For the org_lst variables, compute the maximum gap in days over the loan history, then deduplicate by uid

base = df[org_lst]
base.head()
uid create_dt oil_actv_dt class_new bad_ind
50608 B96436391985035703 2018-10-08 2018-10-08 B 0
50607 B96436391984693397 2018-10-11 2018-10-11 E 0
50606 B96436391977217468 2018-10-17 2018-10-17 B 0
50605 B96436391976480892 2018-09-28 2018-09-28 B 0
50604 B96436391972106043 2018-10-19 2018-10-19 A 0
base['dtn'] = df['dtn']
base = base.sort_values(['uid', 'create_dt'], ascending = False)
# deduplicate: keep one row per uid (the most recent create_dt)
base = base.drop_duplicates(['uid'], keep = 'first')
base.shape
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.





(11099, 6)
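
The SettingWithCopyWarning above appears because base is a slice of df; taking an explicit copy avoids it. A minimal sketch of the same step:

# sketch: avoid SettingWithCopyWarning by copying the slice first
base = df[org_lst].copy()
base['dtn'] = df['dtn']
base = base.sort_values(['uid', 'create_dt'], ascending=False)
base = base.drop_duplicates(['uid'], keep='first')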

Feature derivation (featuretools is also worth trying)

gn = pd.DataFrame()
for i in agg_lst:
    # cnt: number of records per uid
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: len(df[i])).reset_index())
    tp.columns = ['uid', i + '_cnt']
    if gn.empty:
        gn = tp
    else:
        gn = pd.merge(gn, tp, on='uid', how='left')

    # num: number of records with a positive value
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.where(df[i] > 0, 1, 0).sum()).reset_index())
    tp.columns = ['uid', i + '_num']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # tot: nansum, total ignoring NaNs
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nansum(df[i])).reset_index())
    tp.columns = ['uid', i + '_tot']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # avg: nanmean
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmean(df[i])).reset_index())
    tp.columns = ['uid', i + '_avg']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # max: nanmax
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmax(df[i])).reset_index())
    tp.columns = ['uid', i + '_max']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # min: nanmin
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmin(df[i])).reset_index())
    tp.columns = ['uid', i + '_min']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # var: nanvar, variance ignoring NaNs
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanvar(df[i])).reset_index())
    tp.columns = ['uid', i + '_var']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # range: nanmax - nanmin
    # (this and the next feature reuse the '_var' suffix, so after merging they
    #  show up as '_var_x' / '_var_y' / '_var' -- see gn.columns below)
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmax(df[i]) - np.nanmin(df[i])).reset_index())
    tp.columns = ['uid', i + '_var']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # ratio: nanmean / nanvar (variance floored at 1 to avoid division by zero)
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmean(df[i]) / max(np.nanvar(df[i]), 1)).reset_index())
    tp.columns = ['uid', i + '_var']
    gn = pd.merge(gn, tp, on='uid', how='left')
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:24: RuntimeWarning: Mean of empty slice
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:31: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:38: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:45: RuntimeWarning: Degrees of freedom <= 0 for slice.
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:52: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: Mean of empty slice
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: Degrees of freedom <= 0 for slice.
gn.head()
uid oil_amount_cnt oil_amount_num oil_amount_tot oil_amount_avg oil_amount_max oil_amount_min oil_amount_var_x oil_amount_var_y oil_amount_var ... coupon_amount_var payment_coupon_amount_cnt payment_coupon_amount_num payment_coupon_amount_tot payment_coupon_amount_avg payment_coupon_amount_max payment_coupon_amount_min payment_coupon_amount_var_x payment_coupon_amount_var_y payment_coupon_amount_var
0 A10000481 1 1 94.76 94.760000 94.76 94.76 0.000000 0.00 94.760000 ... 0.0 1 1 200.0 200.000000 200.0 200.0 0.000000 0.0 200.000000
1 A1000232 3 3 372.82 124.273333 138.46 109.55 139.442022 28.91 0.891219 ... 0.0 3 2 300.0 100.000000 200.0 0.0 6666.666667 200.0 0.015000
2 A1000455 1 0 0.00 NaN NaN NaN NaN NaN NaN ... NaN 1 0 0.0 NaN NaN NaN NaN NaN NaN
3 A10010042 15 15 742.05 49.470000 76.47 31.62 109.291800 44.85 0.452641 ... 0.0 15 12 1900.0 126.666667 500.0 0.0 13955.555556 500.0 0.009076
4 A1001014 1 0 0.00 NaN NaN NaN NaN NaN NaN ... NaN 1 0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 64 columns

gn.columns
Index(['uid', 'oil_amount_cnt', 'oil_amount_num', 'oil_amount_tot',
       'oil_amount_avg', 'oil_amount_max', 'oil_amount_min',
       'oil_amount_var_x', 'oil_amount_var_y', 'oil_amount_var',
       'discount_amount_cnt', 'discount_amount_num', 'discount_amount_tot',
       'discount_amount_avg', 'discount_amount_max', 'discount_amount_min',
       'discount_amount_var_x', 'discount_amount_var_y', 'discount_amount_var',
       'sale_amount_cnt', 'sale_amount_num', 'sale_amount_tot',
       'sale_amount_avg', 'sale_amount_max', 'sale_amount_min',
       'sale_amount_var_x', 'sale_amount_var_y', 'sale_amount_var',
       'amount_cnt', 'amount_num', 'amount_tot', 'amount_avg', 'amount_max',
       'amount_min', 'amount_var_x', 'amount_var_y', 'amount_var',
       'pay_amount_cnt', 'pay_amount_num', 'pay_amount_tot', 'pay_amount_avg',
       'pay_amount_max', 'pay_amount_min', 'pay_amount_var_x',
       'pay_amount_var_y', 'pay_amount_var', 'coupon_amount_cnt',
       'coupon_amount_num', 'coupon_amount_tot', 'coupon_amount_avg',
       'coupon_amount_max', 'coupon_amount_min', 'coupon_amount_var_x',
       'coupon_amount_var_y', 'coupon_amount_var', 'payment_coupon_amount_cnt',
       'payment_coupon_amount_num', 'payment_coupon_amount_tot',
       'payment_coupon_amount_avg', 'payment_coupon_amount_max',
       'payment_coupon_amount_min', 'payment_coupon_amount_var_x',
       'payment_coupon_amount_var_y', 'payment_coupon_amount_var'],
      dtype='object')
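
Most of these aggregates can also be produced in one call with groupby().agg(). A compact sketch (gn_alt and agg_funcs are illustrative names; note that pandas' count counts non-NaN values and var is the sample variance, so the numbers differ slightly from the len/np.nanvar versions above):

# sketch: compact alternative for the basic aggregates
agg_funcs = ['count', 'sum', 'mean', 'max', 'min', 'var']
gn_alt = df.groupby('uid')[agg_lst].agg(agg_funcs)
gn_alt.columns = ['_'.join(c) for c in gn_alt.columns]  # flatten the (field, func) MultiIndex
gn_alt = gn_alt.reset_index()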

Count distinct values of the dstc_lst variables

# distinct count per uid
gc = pd.DataFrame()
for i in dstc_lst:
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: len(set(df[i]))).reset_index())
    tp.columns = ['uid', i + '_dstc']
    if gc.empty:
        gc = tp
    else:
        gc = pd.merge(gc, tp, on='uid', how='left')
gc.head()
uid channel_code_dstc oil_code_dstc scene_dstc source_app_dstc call_source_dstc
0 A10000481 1 1 1 1 1
1 A1000232 1 1 1 1 1
2 A1000455 1 1 1 1 1
3 A10010042 1 1 1 1 1
4 A1001014 1 1 1 1 1
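
The same distinct counts can be obtained in one line with nunique(); a sketch (gc_alt is an illustrative name; these columns have no missing values, so it matches len(set(...)) above):

# sketch: distinct counts via nunique()
gc_alt = df.groupby('uid')[dstc_lst].nunique().add_suffix('_dstc').reset_index()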

Combine the variables

fn = pd.merge(base,gn,on= 'uid')
fn = pd.merge(fn,gc,on= 'uid') 
fn.shape
(45039, 73)
fn = fn.fillna(0)
fn.head()
uid create_dt oil_actv_dt class_new bad_ind oil_amount_cnt oil_amount_num oil_amount_tot oil_amount_avg oil_amount_max ... payment_coupon_amount_max payment_coupon_amount_min payment_coupon_amount_var_x payment_coupon_amount_var_y payment_coupon_amount_var channel_code_dstc oil_code_dstc scene_dstc source_app_dstc call_source_dstc
0 B96436391985035703 2018-10-08 2018-10-08 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
1 B96436391984693397 2018-10-11 2018-10-11 E 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
2 B96436391977217468 2018-10-17 2018-10-17 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
3 B96436391976480892 2018-09-28 2018-09-28 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
4 B96436391972106043 2018-10-19 2018-10-19 A 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1

5 rows × 73 columns

Train the decision tree model

# features
x = fn.drop(['uid', 'oil_actv_dt', 'create_dt', 'bad_ind', 'class_new'], axis=1)
# label
y = fn.bad_ind.copy()

from sklearn import tree
# a shallow regression tree on the 0/1 label: each leaf's predicted value is that segment's bad rate
dtree = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=500, min_samples_split=5000)
dtree = dtree.fit(x,y)
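
Before plotting, a quick way to see which features the tree actually split on is feature_importances_; a minimal sketch:

# sketch: top features by split importance in the fitted tree
print(pd.Series(dtree.feature_importances_, index=x.columns).sort_values(ascending=False).head(5))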

Render the decision tree and derive the decision rule

import pydotplus 
from IPython.display import Image
# note: sklearn.externals.six was removed in newer scikit-learn releases;
# use `from io import StringIO` there instead
from sklearn.externals.six import StringIO
import os

with open("dt.dot", "w") as f:
    tree.export_graphviz(dtree, out_file=f)

dot_data = StringIO()

tree.export_graphviz(dtree, out_file=dot_data,
                         feature_names=x.columns,
                         class_names=['bad_ind'],
                         filled=True, rounded=True,
                         special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
Image(graph.create_png())
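
If graphviz/pydotplus is not available, scikit-learn 0.21+ can print the same splits as plain text; a minimal sketch:

# sketch: text rendering of the fitted tree (scikit-learn >= 0.21)
from sklearn.tree import export_text
print(export_text(dtree, feature_names=list(x.columns)))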

In the rendered tree, each node's value is the mean of bad_ind for the samples in that node, i.e. value = bad rate.

sum(fn.bad_ind)/len(fn.bad_ind)
0.04658077304261645
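
To read a rule off the tree, compare each leaf's bad rate and size against this overall 4.66%. A sketch using the fitted dtree and fn from above (leaves is an illustrative name):

# sketch: bad rate and size of each leaf the tree assigns customers to
leaves = pd.Series(dtree.apply(x), index=fn.index, name='leaf')
print(fn.groupby(leaves)['bad_ind'].agg(['mean', 'count']))  # mean = per-leaf bad rate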