Decision Tree Rule Mining

import pandas as pd
import numpy as np
import os
data = pd.read_excel('oil_data_for_tree.xlsx')
data.head()
uid oil_actv_dt create_dt total_oil_cnt pay_amount_total class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
0 A8217710 2018-08-19 2018-08-17 137.0 24147747.2 B 0 1653.78 880040.0 898000.0 865540.0 4327700.0 0.0 100.0 1 3 2 0 3
1 A8217710 2018-08-19 2018-08-16 137.0 24147747.2 B 0 2336.84 1243522.0 1268900.0 1218922.0 6094610.0 0.0 100.0 1 3 2 0 3
2 A8217710 2018-08-19 2018-08-15 137.0 24147747.2 B 0 936.03 488922.0 498900.0 480922.0 2404610.0 0.0 200.0 1 2 2 0 3
3 A8217710 2018-08-19 2018-08-14 137.0 24147747.2 B 0 2418.39 1263220.0 1289000.0 1242220.0 6211100.0 0.0 300.0 1 2 2 0 3
4 A8217710 2018-08-19 2018-08-13 137.0 24147747.2 B 0 1292.69 675220.0 689000.0 664220.0 3321100.0 0.0 100.0 1 2 2 0 3


set(data.class_new)
{'A', 'B', 'C', 'D', 'E', 'F'}

org_lst: time and categorical variables; no special transformation needed, just deduplicate.
agg_lst: numeric variables, to be aggregated.
dstc_lst: code-type variables, to be counted as distinct values.

org_lst = ['uid', 'create_dt', 'oil_actv_dt', 'class_new', 'bad_ind']
agg_lst = ['oil_amount', 'discount_amount', 'sale_amount', 'amount', 'pay_amount', 'coupon_amount', 'payment_coupon_amount']
dstc_lst = ['channel_code', 'oil_code', 'scene', 'source_app', 'call_source']

Reassemble the data

df = data[org_lst].copy()
df.head()
uid create_dt oil_actv_dt class_new bad_ind
0 A8217710 2018-08-17 2018-08-19 B 0
1 A8217710 2018-08-16 2018-08-19 B 0
2 A8217710 2018-08-15 2018-08-19 B 0
3 A8217710 2018-08-14 2018-08-19 B 0
4 A8217710 2018-08-13 2018-08-19 B 0
df[agg_lst] = data[agg_lst].copy()
df[agg_lst].head()
oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount
0 1653.78 880040.0 898000.0 865540.0 4327700.0 0.0 100.0
1 2336.84 1243522.0 1268900.0 1218922.0 6094610.0 0.0 100.0
2 936.03 488922.0 498900.0 480922.0 2404610.0 0.0 200.0
3 2418.39 1263220.0 1289000.0 1242220.0 6211100.0 0.0 300.0
4 1292.69 675220.0 689000.0 664220.0 3321100.0 0.0 100.0
df[dstc_lst] = data[dstc_lst].copy()
df[dstc_lst].head()
channel_code oil_code scene source_app call_source
0 1 3 2 0 3
1 1 3 2 0 3
2 1 2 2 0 3
3 1 2 2 0 3
4 1 2 2 0 3

Check for missing values

df.isna().sum()
uid                         0
create_dt                4944
oil_actv_dt                 0
class_new                   0
bad_ind                     0
oil_amount               4944
discount_amount          4944
sale_amount              4944
amount                   4944
pay_amount               4944
coupon_amount            4944
payment_coupon_amount    4946
channel_code                0
oil_code                    0
scene                       0
source_app                  0
call_source                 0
dtype: int64

Look at describe() for the base variables

df.describe()
bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
count 50609.000000 45665.000000 4.566500e+04 4.566500e+04 4.566500e+04 4.566500e+04 45665.0 45663.000000 50609.000000 50609.000000 50609.000000 50609.000000 50609.000000
mean 0.017764 212.188054 1.091035e+05 1.121195e+05 1.077312e+05 5.386562e+05 0.0 417.055384 1.476378 1.617894 1.906519 0.306072 2.900729
std 0.132093 200.298122 1.010993e+05 1.031804e+05 9.953775e+04 4.976888e+05 0.0 968.250273 1.511470 3.074166 0.367280 0.893682 0.726231
min 0.000000 0.000000 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 87.220000 4.854000e+04 5.000000e+04 4.820000e+04 2.410000e+05 0.0 0.000000 1.000000 0.000000 2.000000 0.000000 3.000000
50% 0.000000 167.580000 8.820000e+04 9.000000e+04 8.709600e+04 4.354800e+05 0.0 100.000000 1.000000 0.000000 2.000000 0.000000 3.000000
75% 0.000000 278.300000 1.391600e+05 1.430000e+05 1.371150e+05 6.855750e+05 0.0 500.000000 1.000000 0.000000 2.000000 0.000000 3.000000
max 1.000000 3975.910000 1.958040e+06 1.998000e+06 1.925540e+06 9.627700e+06 0.0 50000.000000 6.000000 9.000000 2.000000 3.000000 4.000000

Fill missing create_dt values with oil_actv_dt, and keep only the most recent 6 months of data.
When constructing variables, do not simply accumulate over the entire history;
otherwise the variable distributions will shift substantially as time goes on.

def time_isna(x, y):
    # if create_dt is missing (NaT), fall back to oil_actv_dt
    if str(x) == 'NaT':
        x = y
    return x
df2 = df.sort_values(['uid','create_dt'], ascending = False)
df2.head()
uid create_dt oil_actv_dt class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source
50608 B96436391985035703 NaT 2018-10-08 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50607 B96436391984693397 NaT 2018-10-11 E 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50606 B96436391977217468 NaT 2018-10-17 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50605 B96436391976480892 NaT 2018-09-28 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
50604 B96436391972106043 NaT 2018-10-19 A 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4
df2['create_dt'] = df2.apply(lambda x: time_isna(x.create_dt, x.oil_actv_dt), axis = 1)
df2['create_dt'].head()
50608   2018-10-08
50607   2018-10-11
50606   2018-10-17
50605   2018-09-28
50604   2018-10-19
Name: create_dt, dtype: datetime64[ns]
df2['dtn'] = (df2.oil_actv_dt - df2.create_dt).apply(lambda x: x.days)
df2['dtn']
50608      0
50607      0
50606      0
50605      0
50604      0
50603      0
50602      0
50601      0
50600      0
50599      0
50598      0
50597      0
50596      0
50595      0
50594      0
50593      0
50592      0
50591      0
50590      0
50589      0
50588      0
50587      0
50586      0
50585      0
50584      0
50583      0
50582      0
50581      0
50580      0
50579      0
        ... 
12860    255
12865    275
46427      0
35426      2
35425      4
35427      7
35419      9
35418     15
35414     19
35420     38
35421     39
35428     41
35417     42
35415     44
35424     45
35423     46
35422     48
35416     49
40437    183
40440    186
40442    188
40438    189
40436    192
40441    193
40439    194
46426      0
41434     21
41433     36
41432     69
45296     28
Name: dtn, Length: 50609, dtype: int64
df = df2[df2['dtn'] < 180]
df.head()
uid create_dt oil_actv_dt class_new bad_ind oil_amount discount_amount sale_amount amount pay_amount coupon_amount payment_coupon_amount channel_code oil_code scene source_app call_source dtn
50608 B96436391985035703 2018-10-08 2018-10-08 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50607 B96436391984693397 2018-10-11 2018-10-11 E 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50606 B96436391977217468 2018-10-17 2018-10-17 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50605 B96436391976480892 2018-09-28 2018-09-28 B 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
50604 B96436391972106043 2018-10-19 2018-10-19 A 0 NaN NaN NaN NaN NaN NaN NaN 6 9 2 3 4 0
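
For reference, the same fill-and-filter step can be written without a row-wise apply. A minimal sketch (assuming create_dt and oil_actv_dt are already datetime64 columns, as the dtypes above suggest):

# sketch: vectorized equivalent of time_isna + apply
df2['create_dt'] = df2['create_dt'].fillna(df2['oil_actv_dt'])  # fill NaT with the activation date
df2['dtn'] = (df2['oil_actv_dt'] - df2['create_dt']).dt.days    # gap in days
df = df2[df2['dtn'] < 180]                                       # keep roughly the last 6 months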

For the org_lst variables, compute the maximum gap in days over the loan history, then deduplicate by uid

base = df[org_lst]
base.head()
uid create_dt oil_actv_dt class_new bad_ind
50608 B96436391985035703 2018-10-08 2018-10-08 B 0
50607 B96436391984693397 2018-10-11 2018-10-11 E 0
50606 B96436391977217468 2018-10-17 2018-10-17 B 0
50605 B96436391976480892 2018-09-28 2018-09-28 B 0
50604 B96436391972106043 2018-10-19 2018-10-19 A 0
base['dtn'] = df['dtn']
base = base.sort_values(['uid', 'create_dt'], ascending = False)
# deduplicate: keep one row per uid (the most recent create_dt)
base = base.drop_duplicates(['uid'], keep = 'first')
base.shape
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.





(11099, 6)
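
The SettingWithCopyWarning above appears because base is a slice of df; taking an explicit copy avoids it. A minimal sketch of the same step:

# sketch: avoid SettingWithCopyWarning by copying the slice first
base = df[org_lst].copy()
base['dtn'] = df['dtn']
base = base.sort_values(['uid', 'create_dt'], ascending=False)
base = base.drop_duplicates(['uid'], keep='first')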

Feature derivation (featuretools is also worth trying)

gn = pd.DataFrame()
for i in agg_lst:
    # cnt: number of records per uid
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: len(df[i])).reset_index())
    tp.columns = ['uid', i + '_cnt']
    if gn.empty:
        gn = tp
    else:
        gn = pd.merge(gn, tp, on='uid', how='left')

    # num: number of records with a positive value
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.where(df[i] > 0, 1, 0).sum()).reset_index())
    tp.columns = ['uid', i + '_num']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # tot: nansum, total ignoring NaNs
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nansum(df[i])).reset_index())
    tp.columns = ['uid', i + '_tot']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # avg: nanmean
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmean(df[i])).reset_index())
    tp.columns = ['uid', i + '_avg']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # max: nanmax
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmax(df[i])).reset_index())
    tp.columns = ['uid', i + '_max']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # min: nanmin
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmin(df[i])).reset_index())
    tp.columns = ['uid', i + '_min']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # var: nanvar, variance ignoring NaNs
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanvar(df[i])).reset_index())
    tp.columns = ['uid', i + '_var']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # range: nanmax - nanmin
    # (this and the next feature reuse the '_var' suffix, so after merging they
    #  show up as '_var_x' / '_var_y' / '_var' -- see gn.columns below)
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmax(df[i]) - np.nanmin(df[i])).reset_index())
    tp.columns = ['uid', i + '_var']
    gn = pd.merge(gn, tp, on='uid', how='left')

    # ratio: nanmean / nanvar (variance floored at 1 to avoid division by zero)
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: np.nanmean(df[i]) / max(np.nanvar(df[i]), 1)).reset_index())
    tp.columns = ['uid', i + '_var']
    gn = pd.merge(gn, tp, on='uid', how='left')
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:24: RuntimeWarning: Mean of empty slice
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:31: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:38: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:45: RuntimeWarning: Degrees of freedom <= 0 for slice.
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:52: RuntimeWarning: All-NaN axis encountered
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: Mean of empty slice
E:\Anaconda3\envs\sklearn\lib\site-packages\ipykernel_launcher.py:59: RuntimeWarning: Degrees of freedom <= 0 for slice.
gn.head()
uid oil_amount_cnt oil_amount_num oil_amount_tot oil_amount_avg oil_amount_max oil_amount_min oil_amount_var_x oil_amount_var_y oil_amount_var ... coupon_amount_var payment_coupon_amount_cnt payment_coupon_amount_num payment_coupon_amount_tot payment_coupon_amount_avg payment_coupon_amount_max payment_coupon_amount_min payment_coupon_amount_var_x payment_coupon_amount_var_y payment_coupon_amount_var
0 A10000481 1 1 94.76 94.760000 94.76 94.76 0.000000 0.00 94.760000 ... 0.0 1 1 200.0 200.000000 200.0 200.0 0.000000 0.0 200.000000
1 A1000232 3 3 372.82 124.273333 138.46 109.55 139.442022 28.91 0.891219 ... 0.0 3 2 300.0 100.000000 200.0 0.0 6666.666667 200.0 0.015000
2 A1000455 1 0 0.00 NaN NaN NaN NaN NaN NaN ... NaN 1 0 0.0 NaN NaN NaN NaN NaN NaN
3 A10010042 15 15 742.05 49.470000 76.47 31.62 109.291800 44.85 0.452641 ... 0.0 15 12 1900.0 126.666667 500.0 0.0 13955.555556 500.0 0.009076
4 A1001014 1 0 0.00 NaN NaN NaN NaN NaN NaN ... NaN 1 0 0.0 NaN NaN NaN NaN NaN NaN

5 rows × 64 columns

gn.columns
Index(['uid', 'oil_amount_cnt', 'oil_amount_num', 'oil_amount_tot',
       'oil_amount_avg', 'oil_amount_max', 'oil_amount_min',
       'oil_amount_var_x', 'oil_amount_var_y', 'oil_amount_var',
       'discount_amount_cnt', 'discount_amount_num', 'discount_amount_tot',
       'discount_amount_avg', 'discount_amount_max', 'discount_amount_min',
       'discount_amount_var_x', 'discount_amount_var_y', 'discount_amount_var',
       'sale_amount_cnt', 'sale_amount_num', 'sale_amount_tot',
       'sale_amount_avg', 'sale_amount_max', 'sale_amount_min',
       'sale_amount_var_x', 'sale_amount_var_y', 'sale_amount_var',
       'amount_cnt', 'amount_num', 'amount_tot', 'amount_avg', 'amount_max',
       'amount_min', 'amount_var_x', 'amount_var_y', 'amount_var',
       'pay_amount_cnt', 'pay_amount_num', 'pay_amount_tot', 'pay_amount_avg',
       'pay_amount_max', 'pay_amount_min', 'pay_amount_var_x',
       'pay_amount_var_y', 'pay_amount_var', 'coupon_amount_cnt',
       'coupon_amount_num', 'coupon_amount_tot', 'coupon_amount_avg',
       'coupon_amount_max', 'coupon_amount_min', 'coupon_amount_var_x',
       'coupon_amount_var_y', 'coupon_amount_var', 'payment_coupon_amount_cnt',
       'payment_coupon_amount_num', 'payment_coupon_amount_tot',
       'payment_coupon_amount_avg', 'payment_coupon_amount_max',
       'payment_coupon_amount_min', 'payment_coupon_amount_var_x',
       'payment_coupon_amount_var_y', 'payment_coupon_amount_var'],
      dtype='object')
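
Most of these aggregates can also be produced in one call with groupby().agg(). A compact sketch (gn_alt and agg_funcs are illustrative names; note that pandas' count counts non-NaN values and var is the sample variance, so the numbers differ slightly from the len/np.nanvar versions above):

# sketch: compact alternative for the basic aggregates
agg_funcs = ['count', 'sum', 'mean', 'max', 'min', 'var']
gn_alt = df.groupby('uid')[agg_lst].agg(agg_funcs)
gn_alt.columns = ['_'.join(c) for c in gn_alt.columns]  # flatten the (field, func) MultiIndex
gn_alt = gn_alt.reset_index()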

Count distinct values of the dstc_lst variables

# distinct count per uid
gc = pd.DataFrame()
for i in dstc_lst:
    tp = pd.DataFrame(df.groupby('uid').apply(lambda df: len(set(df[i]))).reset_index())
    tp.columns = ['uid', i + '_dstc']
    if gc.empty:
        gc = tp
    else:
        gc = pd.merge(gc, tp, on='uid', how='left')
gc.head()
uid channel_code_dstc oil_code_dstc scene_dstc source_app_dstc call_source_dstc
0 A10000481 1 1 1 1 1
1 A1000232 1 1 1 1 1
2 A1000455 1 1 1 1 1
3 A10010042 1 1 1 1 1
4 A1001014 1 1 1 1 1
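
The same distinct counts can be obtained in one line with nunique(); a sketch (gc_alt is an illustrative name; these columns have no missing values, so it matches len(set(...)) above):

# sketch: distinct counts via nunique()
gc_alt = df.groupby('uid')[dstc_lst].nunique().add_suffix('_dstc').reset_index()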

Combine the variables

fn = pd.merge(base,gn,on= 'uid')
fn = pd.merge(fn,gc,on= 'uid') 
fn.shape
(45039, 73)
fn = fn.fillna(0)
fn.head()
uid create_dt oil_actv_dt class_new bad_ind oil_amount_cnt oil_amount_num oil_amount_tot oil_amount_avg oil_amount_max ... payment_coupon_amount_max payment_coupon_amount_min payment_coupon_amount_var_x payment_coupon_amount_var_y payment_coupon_amount_var channel_code_dstc oil_code_dstc scene_dstc source_app_dstc call_source_dstc
0 B96436391985035703 2018-10-08 2018-10-08 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
1 B96436391984693397 2018-10-11 2018-10-11 E 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
2 B96436391977217468 2018-10-17 2018-10-17 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
3 B96436391976480892 2018-09-28 2018-09-28 B 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1
4 B96436391972106043 2018-10-19 2018-10-19 A 0 1 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1 1 1 1 1

5 rows × 73 columns

Train the decision tree model

# features
x = fn.drop(['uid', 'oil_actv_dt', 'create_dt', 'bad_ind', 'class_new'], axis=1)
# label
y = fn.bad_ind.copy()

from sklearn import tree
# a shallow regression tree on the 0/1 label: each leaf's predicted value is that segment's bad rate
dtree = tree.DecisionTreeRegressor(max_depth=2, min_samples_leaf=500, min_samples_split=5000)
dtree = dtree.fit(x,y)
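
Before plotting, a quick way to see which features the tree actually split on is feature_importances_; a minimal sketch:

# sketch: top features by split importance in the fitted tree
print(pd.Series(dtree.feature_importances_, index=x.columns).sort_values(ascending=False).head(5))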

Render the decision tree and derive the decision rule

import pydotplus 
from IPython.display import Image
# note: sklearn.externals.six was removed in newer scikit-learn releases;
# use `from io import StringIO` there instead
from sklearn.externals.six import StringIO
import os

with open("dt.dot", "w") as f:
    tree.export_graphviz(dtree, out_file=f)

dot_data = StringIO()

tree.export_graphviz(dtree, out_file=dot_data,
                         feature_names=x.columns,
                         class_names=['bad_ind'],
                         filled=True, rounded=True,
                         special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue()) 
Image(graph.create_png())
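
If graphviz/pydotplus is not available, scikit-learn 0.21+ can print the same splits as plain text; a minimal sketch:

# sketch: text rendering of the fitted tree (scikit-learn >= 0.21)
from sklearn.tree import export_text
print(export_text(dtree, feature_names=list(x.columns)))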

In the rendered tree, each node's value is the mean of bad_ind for the samples in that node, i.e. value = bad rate.

sum(fn.bad_ind)/len(fn.bad_ind)
0.04658077304261645
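
To read a rule off the tree, compare each leaf's bad rate and size against this overall 4.66%. A sketch using the fitted dtree and fn from above (leaves is an illustrative name):

# sketch: bad rate and size of each leaf the tree assigns customers to
leaves = pd.Series(dtree.apply(x), index=fn.index, name='leaf')
print(fn.groupby(leaves)['bad_ind'].agg(['mean', 'count']))  # mean = per-leaf bad rate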