代码改变世界

阿里云天池新人赛o2o优惠券使用预测-------进阶心得

2020-05-16 17:15  冻雨冷雾  阅读(961)  评论(0)  编辑  收藏  举报

赛题链接: https://tianchi.aliyun.com/competition/entrance/231593/introduction

天池的这个比赛用于学习数据挖掘是一个很好的例子,该题核心是对特征的处理与提取。

从大神代码里习得了xgboost, 学习了特征处理的精髓后,训练调优了一版GBDT。 效果还不错:

最好成绩101:

 

 

数据处理部分:

   1 # -*- coding: utf-8 -*-
   2 """
   3 Created on Wed Dec  4 14:30:36 2019
   4 
   5 @author: wenzhe.tian
   6 
   7 
   8     本赛题提供用户在2016年1月1日至2016年6月30日之间真实线上线下消费行为,预测用户在2016年7月领取优惠券后15天以内的使用情况。
   9 注意: 为了保护用户和商家的隐私,所有数据均作匿名处理,同时采用了有偏采样和必要过滤。
  10 
  11 用户消费日期特征添加:法定节假日?
  12 
  13 """
  14 
  15 import pandas as pd
  16 import numpy as np
  17 from datetime import date
  18 
# --- Data loading ---
# Offline consumption + coupon-collection log (training data).
off_train = pd.read_csv('data/ccf_offline_stage1_train.csv')
off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']

# Offline test set: coupon receipts in July 2016; no 'date' column since usage is the target.
off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']

# Online behavior log (training data); loaded here but not used in this excerpt.
on_train = pd.read_csv('data/ccf_online_stage1_train.csv')
on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']
# --- Date normalization: NaN -> the string 'null', then cast to str so the
# window filters below can compare 'YYYYMMDD' strings lexicographically. ---
off_train['date']= off_train['date'].fillna('null')
off_train['date_received']= off_train['date_received'].fillna('null')
off_train.date= off_train.date.astype('str')
off_train.date_received= off_train.date_received.astype('str')

# --- Sliding-window split into three (label, feature) window pairs. Each
# dataset_i is a window of coupon receipts to predict on; feature_i is the
# preceding window used to build features: rows either consumed inside the
# window, or received-but-never-used inside it (date == 'null'). ---
dataset3 = off_test
feature3 = off_train[((off_train.date>='20160315')&(off_train.date<='20160630'))|((off_train.date=='null')&(off_train.date_received>='20160315')&(off_train.date_received<='20160630'))]
dataset2 = off_train[(off_train.date_received>='20160515')&(off_train.date_received<='20160615')]
feature2 = off_train[(off_train.date>='20160201')&(off_train.date<='20160514')|((off_train.date=='null')&(off_train.date_received>='20160201')&(off_train.date_received<='20160514'))]
dataset1 = off_train[(off_train.date_received>='20160414')&(off_train.date_received<='20160514')]
feature1 = off_train[(off_train.date>='20160101')&(off_train.date<='20160413')|((off_train.date=='null')&(off_train.date_received>='20160101')&(off_train.date_received<='20160413'))]
  42 
  43 
############# other feature ##################
"""
5. other feature:
      this_month_user_receive_all_coupon_count
      this_month_user_receive_same_coupon_count
      this_month_user_receive_same_coupon_lastone
      this_month_user_receive_same_coupon_firstone
      this_day_user_receive_all_coupon_count
      this_day_user_receive_same_coupon_count
      day_gap_before, day_gap_after  (receive the same coupon)
"""

#for dataset3 (the test/label window)
# Total number of coupons each user received in this window.
t = dataset3[['user_id']]
t['this_month_user_receive_all_coupon_count'] = 1  # NOTE(review): writes on a slice of dataset3 (SettingWithCopy); a .copy() would be cleaner
t = t.groupby('user_id').agg('sum').reset_index()

# How many times each (user, coupon) pair was received.
t1 = dataset3[['user_id','coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()

# For each (user, coupon): join all receive dates into one ':'-separated string.
t2 = dataset3[['user_id','coupon_id','date_received']]
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
# Count how many times the coupon was received.
t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
# Keep only pairs received at least twice (first/last flags are only meaningful then).
t2 = t2[t2.receive_number>1]
# Latest and earliest receive date per (user, coupon), as yyyymmdd integers.
t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]

# Attach max/min receive dates back onto every receipt row (left join: rows of
# coupons received only once get NaN here).
t3 = dataset3[['user_id','coupon_id','date_received']]
t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')

t3.max_date_received=t3.max_date_received.astype('float')
t3.date_received=t3.date_received.astype('float')
# Raw gaps (yyyymmdd arithmetic difference) to the latest / earliest receipt.
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received - t3.min_date_received
  88 def is_firstlastone(x):
  89     if x==0:
  90         return 1
  91     elif x>0:
  92         return 0
  93     else:
  94         return -1 #those only receive once
  95 
# Collapse the raw gaps into flags: 1 = this receipt is the last/first one,
# 0 = it is not, -1 = the coupon was received only once (gap was NaN).
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

# Coupons (of any kind) a user received on a given day.
t4 = dataset3[['user_id','date_received']]
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

# Same-coupon receipts per user per day.
t5 = dataset3[['user_id','coupon_id','date_received']]
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

# All receive dates of the same coupon, ':'-joined, renamed to 'dates' for the
# day-gap features below.
t6 = dataset3[['user_id','coupon_id','date_received']]
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t6.rename(columns={'date_received':'dates'},inplace=True)
 112 
 113 def get_day_gap_before(s):
 114     date_received,dates = s.split('-')
 115     dates = dates.split(':')
 116     gaps = []
 117     for d in dates:
 118         this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
 119         if this_gap>0:
 120             gaps.append(this_gap)
 121     if len(gaps)==0:
 122         return -1
 123     else:
 124         return min(gaps)
 125 
 126 def get_day_gap_after(s):
 127     date_received,dates = s.split('-')
 128     dates = dates.split(':')
 129     gaps = []
 130     for d in dates:
 131         this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
 132         if this_gap>0:
 133             gaps.append(this_gap)
 134     if len(gaps)==0:
 135         return -1
 136     else:
 137         return min(gaps)
 138 
 139 
# Day gaps to the nearest earlier/later receipt of the same coupon.
t7 = dataset3[['user_id','coupon_id','date_received']]
t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
# Encode 'current_date-all_dates' into one string so a single apply can parse both.
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

# Assemble all "other" features for the test window and persist them.
other_feature3 = pd.merge(t1,t,on='user_id')
other_feature3 = pd.merge(other_feature3,t3,on=['user_id','coupon_id'])
other_feature3 = pd.merge(other_feature3,t4,on=['user_id','date_received'])
other_feature3 = pd.merge(other_feature3,t5,on=['user_id','coupon_id','date_received'])
other_feature3 = pd.merge(other_feature3,t7,on=['user_id','coupon_id','date_received'])
other_feature3.to_csv('data/other_feature3.csv',index=None)
print (other_feature3.shape)
 154 
 155 
 156 
#for dataset2: same "other" features as the dataset3 branch above
# Total coupons received per user in this window.
t = dataset2[['user_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id').agg('sum').reset_index()

# Same-coupon receive counts per (user, coupon).
t1 = dataset2[['user_id','coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()

# ':'-joined receive dates per (user, coupon); keep only pairs received >= 2 times.
t2 = dataset2[['user_id','coupon_id','date_received']]
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
t2 = t2[t2.receive_number>1]
t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]

# Raw gaps to the latest / earliest receipt (NaN when received once).
# NOTE: unlike the dataset3 branch, date_received is cast inline via float->int here.
t3 = dataset2[['user_id','coupon_id','date_received']]
t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received
 179 def is_firstlastone(x):
 180     if x==0:
 181         return 1
 182     elif x>0:
 183         return 0
 184     else:
 185         return -1 #those only receive once
 186 
# Collapse gaps to first/last flags (1/0/-1) and keep only the needed columns.
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

# Per-day receive counts: all coupons, then the same coupon.
t4 = dataset2[['user_id','date_received']]
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

t5 = dataset2[['user_id','coupon_id','date_received']]
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

# All receive dates per (user, coupon), ':'-joined, for the day-gap features.
t6 = dataset2[['user_id','coupon_id','date_received']]
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t6.rename(columns={'date_received':'dates'},inplace=True)
 203 
 204 def get_day_gap_before(s):
 205     date_received,dates = s.split('-')
 206     dates = dates.split(':')
 207     gaps = []
 208     for d in dates:
 209         this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
 210         if this_gap>0:
 211             gaps.append(this_gap)
 212     if len(gaps)==0:
 213         return -1
 214     else:
 215         return min(gaps)
 216 
 217 def get_day_gap_after(s):
 218     date_received,dates = s.split('-')
 219     dates = dates.split(':')
 220     gaps = []
 221     for d in dates:
 222         this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
 223         if this_gap>0:
 224             gaps.append(this_gap)
 225     if len(gaps)==0:
 226         return -1
 227     else:
 228         return min(gaps)
 229 
 230 
# Nearest earlier/later same-coupon receipt gaps, then assemble and persist.
t7 = dataset2[['user_id','coupon_id','date_received']]
t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

other_feature2 = pd.merge(t1,t,on='user_id')
other_feature2 = pd.merge(other_feature2,t3,on=['user_id','coupon_id'])
other_feature2 = pd.merge(other_feature2,t4,on=['user_id','date_received'])
other_feature2 = pd.merge(other_feature2,t5,on=['user_id','coupon_id','date_received'])
other_feature2 = pd.merge(other_feature2,t7,on=['user_id','coupon_id','date_received'])
other_feature2.to_csv('data/other_feature2.csv',index=None)
print (other_feature2.shape)
 245 
 246 
 247 
#for dataset1: same "other" features as the dataset2/dataset3 branches above
# Total coupons received per user in this window.
t = dataset1[['user_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('user_id').agg('sum').reset_index()

# Same-coupon receive counts per (user, coupon).
t1 = dataset1[['user_id','coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['user_id','coupon_id']).agg('sum').reset_index()

# ':'-joined receive dates per (user, coupon); keep only pairs received >= 2 times.
t2 = dataset1[['user_id','coupon_id','date_received']]
t2.date_received = t2.date_received.astype('str')
t2 = t2.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t2['receive_number'] = t2.date_received.apply(lambda s:len(s.split(':')))
t2 = t2[t2.receive_number>1]
t2['max_date_received'] = t2.date_received.apply(lambda s:max([int(float(d)) for d in s.split(':')]))
t2['min_date_received'] = t2.date_received.apply(lambda s:min([int(float(d)) for d in s.split(':')]))
t2 = t2[['user_id','coupon_id','max_date_received','min_date_received']]

# Raw gaps to the latest / earliest receipt (NaN when received once).
t3 = dataset1[['user_id','coupon_id','date_received']]
t3 = pd.merge(t3,t2,on=['user_id','coupon_id'],how='left')
t3['this_month_user_receive_same_coupon_lastone'] = t3.max_date_received - t3.date_received.astype('float').astype('int')
t3['this_month_user_receive_same_coupon_firstone'] = t3.date_received.astype('float').astype('int') - t3.min_date_received
 270 def is_firstlastone(x):
 271     if x==0:
 272         return 1
 273     elif x>0:
 274         return 0
 275     else:
 276         return -1 #those only receive once
 277 
# Collapse gaps to first/last flags (1/0/-1) and keep only the needed columns.
t3.this_month_user_receive_same_coupon_lastone = t3.this_month_user_receive_same_coupon_lastone.apply(is_firstlastone)
t3.this_month_user_receive_same_coupon_firstone = t3.this_month_user_receive_same_coupon_firstone.apply(is_firstlastone)
t3 = t3[['user_id','coupon_id','date_received','this_month_user_receive_same_coupon_lastone','this_month_user_receive_same_coupon_firstone']]

# Per-day receive counts: all coupons, then the same coupon.
t4 = dataset1[['user_id','date_received']]
t4['this_day_user_receive_all_coupon_count'] = 1
t4 = t4.groupby(['user_id','date_received']).agg('sum').reset_index()

t5 = dataset1[['user_id','coupon_id','date_received']]
t5['this_day_user_receive_same_coupon_count'] = 1
t5 = t5.groupby(['user_id','coupon_id','date_received']).agg('sum').reset_index()

# All receive dates per (user, coupon), ':'-joined, for the day-gap features.
t6 = dataset1[['user_id','coupon_id','date_received']]
t6.date_received = t6.date_received.astype('str')
t6 = t6.groupby(['user_id','coupon_id'])['date_received'].agg(lambda x:':'.join(x)).reset_index()
t6.rename(columns={'date_received':'dates'},inplace=True)
 294 
 295 def get_day_gap_before(s):
 296     date_received,dates = s.split('-')
 297     dates = dates.split(':')
 298     gaps = []
 299     for d in dates:
 300         this_gap = (date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))-date(int(d[0:4]),int(d[4:6]),int(d[6:8]))).days
 301         if this_gap>0:
 302             gaps.append(this_gap)
 303     if len(gaps)==0:
 304         return -1
 305     else:
 306         return min(gaps)
 307 
 308 def get_day_gap_after(s):
 309     date_received,dates = s.split('-')
 310     dates = dates.split(':')
 311     gaps = []
 312     for d in dates:
 313         this_gap = (date(int(d[0:4]),int(d[4:6]),int(d[6:8]))-date(int(date_received[0:4]),int(date_received[4:6]),int(date_received[6:8]))).days
 314         if this_gap>0:
 315             gaps.append(this_gap)
 316     if len(gaps)==0:
 317         return -1
 318     else:
 319         return min(gaps)
 320 
 321 
# Nearest earlier/later same-coupon receipt gaps, then assemble and persist.
t7 = dataset1[['user_id','coupon_id','date_received']]
t7 = pd.merge(t7,t6,on=['user_id','coupon_id'],how='left')
t7['date_received_date'] = t7.date_received.astype('str') + '-' + t7.dates
t7['day_gap_before'] = t7.date_received_date.apply(get_day_gap_before)
t7['day_gap_after'] = t7.date_received_date.apply(get_day_gap_after)
t7 = t7[['user_id','coupon_id','date_received','day_gap_before','day_gap_after']]

other_feature1 = pd.merge(t1,t,on='user_id')
other_feature1 = pd.merge(other_feature1,t3,on=['user_id','coupon_id'])
other_feature1 = pd.merge(other_feature1,t4,on=['user_id','date_received'])
other_feature1 = pd.merge(other_feature1,t5,on=['user_id','coupon_id','date_received'])
other_feature1 = pd.merge(other_feature1,t7,on=['user_id','coupon_id','date_received'])
other_feature1.to_csv('data/other_feature1.csv',index=None)
print (other_feature1.shape)
 336 
 337 
 338 
 339 
 340 
 341 
 342 ############# coupon related feature   #############
 343 """
 344 2.coupon related:
 345       discount_rate. discount_man. discount_jian. is_man_jian
 346       day_of_week,day_of_month. (date_received)
 347 """
 348 def calc_discount_rate(s):
 349     s =str(s)
 350     s = s.split(':')
 351     if len(s)==1:
 352         return float(s[0])
 353     else:
 354         return 1.0-float(s[1])/float(s[0])
 355 
 356 def get_discount_man(s):
 357     s =str(s)
 358     s = s.split(':')
 359     if len(s)==1:
 360         return 'null'
 361     else:
 362         return int(s[0])
 363 
 364 def get_discount_jian(s):
 365     s =str(s)
 366     s = s.split(':')
 367     if len(s)==1:
 368         return 'null'
 369     else:
 370         return int(s[1])
 371 
 372 def is_man_jian(s):
 373     s =str(s)
 374     s = s.split(':')
 375     if len(s)==1:
 376         return 0
 377     else:
 378         return 1
 379 
#dataset3: coupon-level features on the label window
# day_of_week (1=Mon..7=Sun) and day_of_month of the receive date.
dataset3['day_of_week'] = dataset3.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset3['day_of_month'] = dataset3.date_received.astype('str').apply(lambda x:int(x[6:8]))
# Days from the end of the matching feature window (2016-06-30) to the receive date.
dataset3['days_distance'] = dataset3.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,6,30)).days)
# Decompose 'man:jian' discounts; discount_rate is overwritten LAST so the
# man/jian/is_man_jian columns are computed from the raw string first.
dataset3['discount_man'] = dataset3.discount_rate.apply(get_discount_man)
dataset3['discount_jian'] = dataset3.discount_rate.apply(get_discount_jian)
dataset3['is_man_jian'] = dataset3.discount_rate.apply(is_man_jian)
dataset3['discount_rate'] = dataset3.discount_rate.apply(calc_discount_rate)
# How many times each coupon id was issued inside this window.
d = dataset3[['coupon_id']]
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset3 = pd.merge(dataset3,d,on='coupon_id',how='left')
dataset3.to_csv('data/coupon3_feature.csv',index=None)
#dataset2: same features, days_distance anchored at 2016-05-14 (end of feature2)
dataset2['day_of_week'] = dataset2.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset2['day_of_month'] = dataset2.date_received.astype('str').apply(lambda x:int(x[6:8]))
dataset2['days_distance'] = dataset2.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,5,14)).days)
dataset2['discount_man'] = dataset2.discount_rate.apply(get_discount_man)
dataset2['discount_jian'] = dataset2.discount_rate.apply(get_discount_jian)
dataset2['is_man_jian'] = dataset2.discount_rate.apply(is_man_jian)
dataset2['discount_rate'] = dataset2.discount_rate.apply(calc_discount_rate)
d = dataset2[['coupon_id']]
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset2 = pd.merge(dataset2,d,on='coupon_id',how='left')
dataset2.to_csv('data/coupon2_feature.csv',index=None)
#dataset1: same features, days_distance anchored at 2016-04-13 (end of feature1)
dataset1['day_of_week'] = dataset1.date_received.astype('str').apply(lambda x:date(int(x[0:4]),int(x[4:6]),int(x[6:8])).weekday()+1)
dataset1['day_of_month'] = dataset1.date_received.astype('str').apply(lambda x:int(x[6:8]))
dataset1['days_distance'] = dataset1.date_received.astype('str').apply(lambda x:(date(int(x[0:4]),int(x[4:6]),int(x[6:8]))-date(2016,4,13)).days)
dataset1['discount_man'] = dataset1.discount_rate.apply(get_discount_man)
dataset1['discount_jian'] = dataset1.discount_rate.apply(get_discount_jian)
dataset1['is_man_jian'] = dataset1.discount_rate.apply(is_man_jian)
dataset1['discount_rate'] = dataset1.discount_rate.apply(calc_discount_rate)
d = dataset1[['coupon_id']]
d['coupon_count'] = 1
d = d.groupby('coupon_id').agg('sum').reset_index()
dataset1 = pd.merge(dataset1,d,on='coupon_id',how='left')
dataset1.to_csv('data/coupon1_feature.csv',index=None)
 419 
 420 
############# merchant related feature   #############
"""
1.merchant related:
      total_sales. sales_use_coupon.  total_coupon
      coupon_rate = sales_use_coupon/total_sales.
      transfer_rate = sales_use_coupon/total_coupon.
      merchant_avg_distance,merchant_min_distance,merchant_max_distance of those use coupon

"""

#for dataset3: merchant features over the feature3 window
merchant3 = feature3[['merchant_id','coupon_id','distance','date_received','date']]

# Distinct merchants seen in the window.
t = merchant3[['merchant_id']]
t.drop_duplicates(inplace=True)

# total_sales: consumption records (date != 'null') per merchant.
t1 = merchant3[merchant3.date!='null'][['merchant_id']]
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

# NaN coupon ids -> 'null' so the string comparisons below work.
merchant3.coupon_id=merchant3.coupon_id.replace(np.nan,'null')
# sales_use_coupon: purchases made with a coupon.
t2 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id']]
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

# total_coupon: coupons issued per merchant.
t3 = merchant3[merchant3.coupon_id!='null'][['merchant_id']]
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# Distance stats over coupon-redeeming purchases; 'null' distance -> NaN so
# min/max/mean/median skip it.
t4 = merchant3[(merchant3.date!='null')&(merchant3.coupon_id!='null')][['merchant_id','distance']]
t4.replace('null',-1,inplace=True)
#t4.distance = t4.distance.astype('int')
t4.replace(-1,np.nan,inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)

# Left-join all parts onto the merchant list and derive the conversion ratios.
merchant3_feature = pd.merge(t,t1,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t2,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t3,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t5,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t6,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t7,on='merchant_id',how='left')
merchant3_feature = pd.merge(merchant3_feature,t8,on='merchant_id',how='left')
merchant3_feature.sales_use_coupon = merchant3_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
merchant3_feature['merchant_coupon_transfer_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_coupon
merchant3_feature['coupon_rate'] = merchant3_feature.sales_use_coupon.astype('float') / merchant3_feature.total_sales
merchant3_feature.total_coupon = merchant3_feature.total_coupon.replace(np.nan,0) #fillna with 0
merchant3_feature.to_csv('data/merchant3_feature.csv',index=None)
 478 
 479 
#for dataset2: merchant features over feature2 (same pipeline as merchant3 above)
merchant2 = feature2[['merchant_id','coupon_id','distance','date_received','date']]

# Distinct merchants seen in the window.
t = merchant2[['merchant_id']]
t.drop_duplicates(inplace=True)

# total_sales per merchant.
t1 = merchant2[merchant2.date!='null'][['merchant_id']]
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

# NaN coupon ids -> 'null' for the string filters.
merchant2.coupon_id=merchant2.coupon_id.replace(np.nan,'null')
t2 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id']]
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

t3 = merchant2[merchant2.coupon_id!='null'][['merchant_id']]
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# Distance stats over coupon-redeeming purchases ('null' -> NaN).
t4 = merchant2[(merchant2.date!='null')&(merchant2.coupon_id!='null')][['merchant_id','distance']]
t4.replace('null',-1,inplace=True)
#t4.distance = t4.distance.astype('int')
t4.replace(-1,np.nan,inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)

# Left-join onto the merchant list and derive the conversion ratios.
merchant2_feature = pd.merge(t,t1,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t2,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t3,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t5,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t6,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t7,on='merchant_id',how='left')
merchant2_feature = pd.merge(merchant2_feature,t8,on='merchant_id',how='left')
merchant2_feature.sales_use_coupon = merchant2_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
merchant2_feature['merchant_coupon_transfer_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_coupon
merchant2_feature['coupon_rate'] = merchant2_feature.sales_use_coupon.astype('float') / merchant2_feature.total_sales
merchant2_feature.total_coupon = merchant2_feature.total_coupon.replace(np.nan,0) #fillna with 0
merchant2_feature.to_csv('data/merchant2_feature.csv',index=None)
 527 
#for dataset1: merchant features over feature1 (same pipeline as merchant3 above)
merchant1 = feature1[['merchant_id','coupon_id','distance','date_received','date']]

# Distinct merchants seen in the window.
t = merchant1[['merchant_id']]
t.drop_duplicates(inplace=True)

# total_sales per merchant.
t1 = merchant1[merchant1.date!='null'][['merchant_id']]
t1['total_sales'] = 1
t1 = t1.groupby('merchant_id').agg('sum').reset_index()

# NaN coupon ids -> 'null' for the string filters.
merchant1.coupon_id=merchant1.coupon_id.replace(np.nan,'null')
t2 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id']]
t2['sales_use_coupon'] = 1
t2 = t2.groupby('merchant_id').agg('sum').reset_index()

t3 = merchant1[merchant1.coupon_id!='null'][['merchant_id']]
t3['total_coupon'] = 1
t3 = t3.groupby('merchant_id').agg('sum').reset_index()

# Distance stats over coupon-redeeming purchases ('null' -> NaN).
t4 = merchant1[(merchant1.date!='null')&(merchant1.coupon_id!='null')][['merchant_id','distance']]
t4.replace('null',-1,inplace=True)
#t4.distance = t4.distance.astype('int')
t4.replace(-1,np.nan,inplace=True)
t5 = t4.groupby('merchant_id').agg('min').reset_index()
t5.rename(columns={'distance':'merchant_min_distance'},inplace=True)

t6 = t4.groupby('merchant_id').agg('max').reset_index()
t6.rename(columns={'distance':'merchant_max_distance'},inplace=True)

t7 = t4.groupby('merchant_id').agg('mean').reset_index()
t7.rename(columns={'distance':'merchant_mean_distance'},inplace=True)

t8 = t4.groupby('merchant_id').agg('median').reset_index()
t8.rename(columns={'distance':'merchant_median_distance'},inplace=True)


# Left-join onto the merchant list and derive the conversion ratios.
merchant1_feature = pd.merge(t,t1,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t2,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t3,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t5,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t6,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t7,on='merchant_id',how='left')
merchant1_feature = pd.merge(merchant1_feature,t8,on='merchant_id',how='left')
merchant1_feature.sales_use_coupon = merchant1_feature.sales_use_coupon.replace(np.nan,0) #fillna with 0
merchant1_feature['merchant_coupon_transfer_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_coupon
merchant1_feature['coupon_rate'] = merchant1_feature.sales_use_coupon.astype('float') / merchant1_feature.total_sales
merchant1_feature.total_coupon = merchant1_feature.total_coupon.replace(np.nan,0) #fillna with 0
merchant1_feature.to_csv('data/merchant1_feature.csv',index=None)
 576 
 577 
 578 ############# user related feature   #############
 579 """
 580 3.user related:
 581       count_merchant.
 582       user_avg_distance, user_min_distance,user_max_distance.
 583       buy_use_coupon. buy_total. coupon_received.
 584       buy_use_coupon/coupon_received.
 585       buy_use_coupon/buy_total
 586       user_date_datereceived_gap
 587 
 588 
 589 """
 590 
 591 def get_user_date_datereceived_gap(s):
 592     s = s.split(':')
 593     return (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8])) - date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days
 594 
#for dataset3: user-level features over the feature3 window
user3 = feature3[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]

# Distinct users.
t = user3[['user_id']]
t.drop_duplicates(inplace=True)

# count_merchant: number of distinct merchants the user bought from.
t1 = user3[user3.date!='null'][['user_id','merchant_id']]
t1.drop_duplicates(inplace=True)
t1.merchant_id = 1
t1 = t1.groupby('user_id').agg('sum').reset_index()
t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
# NaN coupon ids -> 'null' so the string filters below work.
user3['coupon_id']=user3['coupon_id'].replace(np.nan,'null')
# Distance stats over coupon-redeeming purchases ('null' distance -> NaN).
t2 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id','distance']]
t2.replace('null',-1,inplace=True)
#t2.distance = t2.distance.astype('int')
t2.replace(-1,np.nan,inplace=True)
t3 = t2.groupby('user_id').agg('min').reset_index()
t3.rename(columns={'distance':'user_min_distance'},inplace=True)

t4 = t2.groupby('user_id').agg('max').reset_index()
t4.rename(columns={'distance':'user_max_distance'},inplace=True)

t5 = t2.groupby('user_id').agg('mean').reset_index()
t5.rename(columns={'distance':'user_mean_distance'},inplace=True)

t6 = t2.groupby('user_id').agg('median').reset_index()
t6.rename(columns={'distance':'user_median_distance'},inplace=True)

# buy_use_coupon: purchases made with a coupon.
t7 = user3[(user3.date!='null')&(user3.coupon_id!='null')][['user_id']]
t7['buy_use_coupon'] = 1
t7 = t7.groupby('user_id').agg('sum').reset_index()

# buy_total: all purchases.
t8 = user3[user3.date!='null'][['user_id']]
t8['buy_total'] = 1
t8 = t8.groupby('user_id').agg('sum').reset_index()

# coupon_received: all coupons collected.
t9 = user3[user3.coupon_id!='null'][['user_id']]
t9['coupon_received'] = 1
t9 = t9.groupby('user_id').agg('sum').reset_index()

# Gap in days between receiving a coupon and consuming ('date:date_received').
t10 = user3[(user3.date_received!='null')&(user3.date!='null')][['user_id','date_received','date']]
t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
t10 = t10[['user_id','user_date_datereceived_gap']]

# Aggregate the gap per user: mean / min / max.
t11 = t10.groupby('user_id').agg('mean').reset_index()
t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
t12 = t10.groupby('user_id').agg('min').reset_index()
t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
t13 = t10.groupby('user_id').agg('max').reset_index()
t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)


# Left-join all parts onto the user list, fill counts, derive the two rates.
user3_feature = pd.merge(t,t1,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t3,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t4,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t5,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t6,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t7,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t8,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t9,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t11,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t12,on='user_id',how='left')
user3_feature = pd.merge(user3_feature,t13,on='user_id',how='left')
user3_feature.count_merchant = user3_feature.count_merchant.replace(np.nan,0)
user3_feature.buy_use_coupon = user3_feature.buy_use_coupon.replace(np.nan,0)
user3_feature['buy_use_coupon_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.buy_total.astype('float')
user3_feature['user_coupon_transfer_rate'] = user3_feature.buy_use_coupon.astype('float') / user3_feature.coupon_received.astype('float')
user3_feature.buy_total = user3_feature.buy_total.replace(np.nan,0)
user3_feature.coupon_received = user3_feature.coupon_received.replace(np.nan,0)
user3_feature.to_csv('data/user3_feature.csv',index=None)
 666 
 667 
 668 #for dataset2
 669 user2 = feature2[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
 670 
 671 t = user2[['user_id']]
 672 t.drop_duplicates(inplace=True)
 673 
 674 t1 = user2[user2.date!='null'][['user_id','merchant_id']]
 675 t1.drop_duplicates(inplace=True)
 676 t1.merchant_id = 1
 677 t1 = t1.groupby('user_id').agg('sum').reset_index()
 678 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
 679 user2['coupon_id']=user3['coupon_id'].replace(np.nan,'null')
 680 t2 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id','distance']]
 681 t2.replace('null',-1,inplace=True)
 682 #t2.distance = t2.distance.astype('int')
 683 t2.replace(-1,np.nan,inplace=True)
 684 t3 = t2.groupby('user_id').agg('min').reset_index()
 685 t3.rename(columns={'distance':'user_min_distance'},inplace=True)
 686 
 687 t4 = t2.groupby('user_id').agg('max').reset_index()
 688 t4.rename(columns={'distance':'user_max_distance'},inplace=True)
 689 
 690 t5 = t2.groupby('user_id').agg('mean').reset_index()
 691 t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
 692 
 693 t6 = t2.groupby('user_id').agg('median').reset_index()
 694 t6.rename(columns={'distance':'user_median_distance'},inplace=True)
 695 
 696 t7 = user2[(user2.date!='null')&(user2.coupon_id!='null')][['user_id']]
 697 t7['buy_use_coupon'] = 1
 698 t7 = t7.groupby('user_id').agg('sum').reset_index()
 699 
 700 t8 = user2[user2.date!='null'][['user_id']]
 701 t8['buy_total'] = 1
 702 t8 = t8.groupby('user_id').agg('sum').reset_index()
 703 
 704 t9 = user2[user2.coupon_id!='null'][['user_id']]
 705 t9['coupon_received'] = 1
 706 t9 = t9.groupby('user_id').agg('sum').reset_index()
 707 
 708 t10 = user2[(user2.date_received!='null')&(user2.date!='null')][['user_id','date_received','date']]
 709 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
 710 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
 711 t10 = t10[['user_id','user_date_datereceived_gap']]
 712 
 713 t11 = t10.groupby('user_id').agg('mean').reset_index()
 714 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
 715 t12 = t10.groupby('user_id').agg('min').reset_index()
 716 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
 717 t13 = t10.groupby('user_id').agg('max').reset_index()
 718 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
 719 
 720 user2_feature = pd.merge(t,t1,on='user_id',how='left')
 721 user2_feature = pd.merge(user2_feature,t3,on='user_id',how='left')
 722 user2_feature = pd.merge(user2_feature,t4,on='user_id',how='left')
 723 user2_feature = pd.merge(user2_feature,t5,on='user_id',how='left')
 724 user2_feature = pd.merge(user2_feature,t6,on='user_id',how='left')
 725 user2_feature = pd.merge(user2_feature,t7,on='user_id',how='left')
 726 user2_feature = pd.merge(user2_feature,t8,on='user_id',how='left')
 727 user2_feature = pd.merge(user2_feature,t9,on='user_id',how='left')
 728 user2_feature = pd.merge(user2_feature,t11,on='user_id',how='left')
 729 user2_feature = pd.merge(user2_feature,t12,on='user_id',how='left')
 730 user2_feature = pd.merge(user2_feature,t13,on='user_id',how='left')
 731 user2_feature.count_merchant = user2_feature.count_merchant.replace(np.nan,0)
 732 user2_feature.buy_use_coupon = user2_feature.buy_use_coupon.replace(np.nan,0)
 733 user2_feature['buy_use_coupon_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.buy_total.astype('float')
 734 user2_feature['user_coupon_transfer_rate'] = user2_feature.buy_use_coupon.astype('float') / user2_feature.coupon_received.astype('float')
 735 user2_feature.buy_total = user2_feature.buy_total.replace(np.nan,0)
 736 user2_feature.coupon_received = user2_feature.coupon_received.replace(np.nan,0)
 737 user2_feature.to_csv('data/user2_feature.csv',index=None)
 738 
 739 
 740 #for dataset1
 741 user1 = feature1[['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']]
 742 
 743 t = user1[['user_id']]
 744 t.drop_duplicates(inplace=True)
 745 
 746 t1 = user1[user1.date!='null'][['user_id','merchant_id']]
 747 t1.drop_duplicates(inplace=True)
 748 t1.merchant_id = 1
 749 t1 = t1.groupby('user_id').agg('sum').reset_index()
 750 t1.rename(columns={'merchant_id':'count_merchant'},inplace=True)
 751 user1['coupon_id']=user3['coupon_id'].replace(np.nan,'null')
 752 t2 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id','distance']]
 753 t2.replace('null',-1,inplace=True)
 754 #t2.distance = t2.distance.astype('int')
 755 t2.replace(-1,np.nan,inplace=True)
 756 t3 = t2.groupby('user_id').agg('min').reset_index()
 757 t3.rename(columns={'distance':'user_min_distance'},inplace=True)
 758 
 759 t4 = t2.groupby('user_id').agg('max').reset_index()
 760 t4.rename(columns={'distance':'user_max_distance'},inplace=True)
 761 
 762 t5 = t2.groupby('user_id').agg('mean').reset_index()
 763 t5.rename(columns={'distance':'user_mean_distance'},inplace=True)
 764 
 765 t6 = t2.groupby('user_id').agg('median').reset_index()
 766 t6.rename(columns={'distance':'user_median_distance'},inplace=True)
 767 
 768 t7 = user1[(user1.date!='null')&(user1.coupon_id!='null')][['user_id']]
 769 t7['buy_use_coupon'] = 1
 770 t7 = t7.groupby('user_id').agg('sum').reset_index()
 771 
 772 t8 = user1[user1.date!='null'][['user_id']]
 773 t8['buy_total'] = 1
 774 t8 = t8.groupby('user_id').agg('sum').reset_index()
 775 
 776 t9 = user1[user1.coupon_id!='null'][['user_id']]
 777 t9['coupon_received'] = 1
 778 t9 = t9.groupby('user_id').agg('sum').reset_index()
 779 
 780 t10 = user1[(user1.date_received!='null')&(user1.date!='null')][['user_id','date_received','date']]
 781 t10['user_date_datereceived_gap'] = t10.date + ':' + t10.date_received
 782 t10.user_date_datereceived_gap = t10.user_date_datereceived_gap.apply(get_user_date_datereceived_gap)
 783 t10 = t10[['user_id','user_date_datereceived_gap']]
 784 
 785 t11 = t10.groupby('user_id').agg('mean').reset_index()
 786 t11.rename(columns={'user_date_datereceived_gap':'avg_user_date_datereceived_gap'},inplace=True)
 787 t12 = t10.groupby('user_id').agg('min').reset_index()
 788 t12.rename(columns={'user_date_datereceived_gap':'min_user_date_datereceived_gap'},inplace=True)
 789 t13 = t10.groupby('user_id').agg('max').reset_index()
 790 t13.rename(columns={'user_date_datereceived_gap':'max_user_date_datereceived_gap'},inplace=True)
 791 
 792 user1_feature = pd.merge(t,t1,on='user_id',how='left')
 793 user1_feature = pd.merge(user1_feature,t3,on='user_id',how='left')
 794 user1_feature = pd.merge(user1_feature,t4,on='user_id',how='left')
 795 user1_feature = pd.merge(user1_feature,t5,on='user_id',how='left')
 796 user1_feature = pd.merge(user1_feature,t6,on='user_id',how='left')
 797 user1_feature = pd.merge(user1_feature,t7,on='user_id',how='left')
 798 user1_feature = pd.merge(user1_feature,t8,on='user_id',how='left')
 799 user1_feature = pd.merge(user1_feature,t9,on='user_id',how='left')
 800 user1_feature = pd.merge(user1_feature,t11,on='user_id',how='left')
 801 user1_feature = pd.merge(user1_feature,t12,on='user_id',how='left')
 802 user1_feature = pd.merge(user1_feature,t13,on='user_id',how='left')
 803 user1_feature.count_merchant = user1_feature.count_merchant.replace(np.nan,0)
 804 user1_feature.buy_use_coupon = user1_feature.buy_use_coupon.replace(np.nan,0)
 805 user1_feature['buy_use_coupon_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.buy_total.astype('float')
 806 user1_feature['user_coupon_transfer_rate'] = user1_feature.buy_use_coupon.astype('float') / user1_feature.coupon_received.astype('float')
 807 user1_feature.buy_total = user1_feature.buy_total.replace(np.nan,0)
 808 user1_feature.coupon_received = user1_feature.coupon_received.replace(np.nan,0)
 809 user1_feature.to_csv('data/user1_feature.csv',index=None)
 810 
 811 
 812 
 813 ##################  user_merchant related feature #########################
 814 
 815 """
 816 4.user_merchant:
 817       times_user_buy_merchant_before.
 818 """
 819 #for dataset3
 820 all_user_merchant = feature3[['user_id','merchant_id']]
 821 all_user_merchant.drop_duplicates(inplace=True)
 822 
 823 t = feature3[['user_id','merchant_id','date']]
 824 t = t[t.date!='null'][['user_id','merchant_id']]
 825 t['user_merchant_buy_total'] = 1
 826 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 827 t.drop_duplicates(inplace=True)
 828 
 829 t1 = feature3[['user_id','merchant_id','coupon_id']]
 830 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
 831 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
 832 t1['user_merchant_received'] = 1
 833 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 834 t1.drop_duplicates(inplace=True)
 835 
 836 t2 = feature3[['user_id','merchant_id','date','date_received']]
 837 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
 838 t2['user_merchant_buy_use_coupon'] = 1
 839 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 840 t2.drop_duplicates(inplace=True)
 841 
 842 t3 = feature3[['user_id','merchant_id']]
 843 t3['user_merchant_any'] = 1
 844 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 845 t3.drop_duplicates(inplace=True)
 846 
 847 t4 = feature3[['user_id','merchant_id','date','coupon_id']]
 848 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
 849 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
 850 t4['user_merchant_buy_common'] = 1
 851 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 852 t4.drop_duplicates(inplace=True)
 853 
 854 user_merchant3 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
 855 user_merchant3 = pd.merge(user_merchant3,t1,on=['user_id','merchant_id'],how='left')
 856 user_merchant3 = pd.merge(user_merchant3,t2,on=['user_id','merchant_id'],how='left')
 857 user_merchant3 = pd.merge(user_merchant3,t3,on=['user_id','merchant_id'],how='left')
 858 user_merchant3 = pd.merge(user_merchant3,t4,on=['user_id','merchant_id'],how='left')
 859 user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan,0)
 860 user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan,0)
 861 user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_received.astype('float')
 862 user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
 863 user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype('float') / user_merchant3.user_merchant_any.astype('float')
 864 user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype('float') / user_merchant3.user_merchant_buy_total.astype('float')
 865 user_merchant3.to_csv('data/user_merchant3.csv',index=None)
 866 
 867 #for dataset2
 868 all_user_merchant = feature2[['user_id','merchant_id']]
 869 all_user_merchant.drop_duplicates(inplace=True)
 870 
 871 t = feature2[['user_id','merchant_id','date']]
 872 t = t[t.date!='null'][['user_id','merchant_id']]
 873 t['user_merchant_buy_total'] = 1
 874 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 875 t.drop_duplicates(inplace=True)
 876 
 877 t1 = feature2[['user_id','merchant_id','coupon_id']]
 878 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
 879 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
 880 t1['user_merchant_received'] = 1
 881 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 882 t1.drop_duplicates(inplace=True)
 883 
 884 t2 = feature2[['user_id','merchant_id','date','date_received']]
 885 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
 886 t2['user_merchant_buy_use_coupon'] = 1
 887 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 888 t2.drop_duplicates(inplace=True)
 889 
 890 t3 = feature2[['user_id','merchant_id']]
 891 t3['user_merchant_any'] = 1
 892 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 893 t3.drop_duplicates(inplace=True)
 894 
 895 t4 = feature2[['user_id','merchant_id','date','coupon_id']]
 896 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
 897 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
 898 t4['user_merchant_buy_common'] = 1
 899 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 900 t4.drop_duplicates(inplace=True)
 901 
 902 user_merchant2 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
 903 user_merchant2 = pd.merge(user_merchant2,t1,on=['user_id','merchant_id'],how='left')
 904 user_merchant2 = pd.merge(user_merchant2,t2,on=['user_id','merchant_id'],how='left')
 905 user_merchant2 = pd.merge(user_merchant2,t3,on=['user_id','merchant_id'],how='left')
 906 user_merchant2 = pd.merge(user_merchant2,t4,on=['user_id','merchant_id'],how='left')
 907 user_merchant2.user_merchant_buy_use_coupon = user_merchant2.user_merchant_buy_use_coupon.replace(np.nan,0)
 908 user_merchant2.user_merchant_buy_common = user_merchant2.user_merchant_buy_common.replace(np.nan,0)
 909 user_merchant2['user_merchant_coupon_transfer_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_received.astype('float')
 910 user_merchant2['user_merchant_coupon_buy_rate'] = user_merchant2.user_merchant_buy_use_coupon.astype('float') / user_merchant2.user_merchant_buy_total.astype('float')
 911 user_merchant2['user_merchant_rate'] = user_merchant2.user_merchant_buy_total.astype('float') / user_merchant2.user_merchant_any.astype('float')
 912 user_merchant2['user_merchant_common_buy_rate'] = user_merchant2.user_merchant_buy_common.astype('float') / user_merchant2.user_merchant_buy_total.astype('float')
 913 user_merchant2.to_csv('data/user_merchant2.csv',index=None)
 914 
 915 #for dataset2
 916 all_user_merchant = feature1[['user_id','merchant_id']]
 917 all_user_merchant.drop_duplicates(inplace=True)
 918 
 919 t = feature1[['user_id','merchant_id','date']]
 920 t = t[t.date!='null'][['user_id','merchant_id']]
 921 t['user_merchant_buy_total'] = 1
 922 t = t.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 923 t.drop_duplicates(inplace=True)
 924 
 925 t1 = feature1[['user_id','merchant_id','coupon_id']]
 926 t1.coupon_id=t1.coupon_id.replace(np.nan,'null')
 927 t1 = t1[t1.coupon_id!='null'][['user_id','merchant_id']]
 928 t1['user_merchant_received'] = 1
 929 t1 = t1.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 930 t1.drop_duplicates(inplace=True)
 931 
 932 t2 = feature1[['user_id','merchant_id','date','date_received']]
 933 t2 = t2[(t2.date!='null')&(t2.date_received!='null')][['user_id','merchant_id']]
 934 t2['user_merchant_buy_use_coupon'] = 1
 935 t2 = t2.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 936 t2.drop_duplicates(inplace=True)
 937 
 938 t3 = feature1[['user_id','merchant_id']]
 939 t3['user_merchant_any'] = 1
 940 t3 = t3.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 941 t3.drop_duplicates(inplace=True)
 942 
 943 t4 = feature1[['user_id','merchant_id','date','coupon_id']]
 944 t4.coupon_id=t4.coupon_id.replace(np.nan,'null')
 945 t4 = t4[(t4.date!='null')&(t4.coupon_id=='null')][['user_id','merchant_id']]
 946 t4['user_merchant_buy_common'] = 1
 947 t4 = t4.groupby(['user_id','merchant_id']).agg('sum').reset_index()
 948 t4.drop_duplicates(inplace=True)
 949 
 950 user_merchant1 = pd.merge(all_user_merchant,t,on=['user_id','merchant_id'],how='left')
 951 user_merchant1 = pd.merge(user_merchant1,t1,on=['user_id','merchant_id'],how='left')
 952 user_merchant1 = pd.merge(user_merchant1,t2,on=['user_id','merchant_id'],how='left')
 953 user_merchant1 = pd.merge(user_merchant1,t3,on=['user_id','merchant_id'],how='left')
 954 user_merchant1 = pd.merge(user_merchant1,t4,on=['user_id','merchant_id'],how='left')
 955 user_merchant1.user_merchant_buy_use_coupon = user_merchant1.user_merchant_buy_use_coupon.replace(np.nan,0)
 956 user_merchant1.user_merchant_buy_common = user_merchant1.user_merchant_buy_common.replace(np.nan,0)
 957 user_merchant1['user_merchant_coupon_transfer_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_received.astype('float')
 958 user_merchant1['user_merchant_coupon_buy_rate'] = user_merchant1.user_merchant_buy_use_coupon.astype('float') / user_merchant1.user_merchant_buy_total.astype('float')
 959 user_merchant1['user_merchant_rate'] = user_merchant1.user_merchant_buy_total.astype('float') / user_merchant1.user_merchant_any.astype('float')
 960 user_merchant1['user_merchant_common_buy_rate'] = user_merchant1.user_merchant_buy_common.astype('float') / user_merchant1.user_merchant_buy_total.astype('float')
 961 user_merchant1.to_csv('data/user_merchant1.csv',index=None)
 962 
 963 ##################  generate training and testing set ################
 964 def get_label(s):
 965     s = s.split(':')
 966     if s[0]=='nan':
 967         return 0
 968     elif (date(int(s[0][0:4]),int(s[0][4:6]),int(s[0][6:8]))-date(int(s[1][0:4]),int(s[1][4:6]),int(s[1][6:8]))).days<=15:
 969         return 1
 970     else:
 971         return -1
 972 
 973 
 974 coupon3 = pd.read_csv('data/coupon3_feature.csv')
 975 merchant3 = pd.read_csv('data/merchant3_feature.csv')
 976 user3 = pd.read_csv('data/user3_feature.csv')
 977 user_merchant3 = pd.read_csv('data/user_merchant3.csv')
 978 other_feature3 = pd.read_csv('data/other_feature3.csv')
 979 dataset3 = pd.merge(coupon3,merchant3,on='merchant_id',how='left')
 980 dataset3 = pd.merge(dataset3,user3,on='user_id',how='left')
 981 dataset3 = pd.merge(dataset3,user_merchant3,on=['user_id','merchant_id'],how='left')
 982 dataset3 = pd.merge(dataset3,other_feature3,on=['user_id','coupon_id','date_received'],how='left')
 983 dataset3.drop_duplicates(inplace=True)
 984 print (dataset3.shape)
 985 
 986 dataset3.user_merchant_buy_total = dataset3.user_merchant_buy_total.replace(np.nan,0)
 987 dataset3.user_merchant_any = dataset3.user_merchant_any.replace(np.nan,0)
 988 dataset3.user_merchant_received = dataset3.user_merchant_received.replace(np.nan,0)
 989 dataset3['is_weekend'] = dataset3.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
 990 weekday_dummies = pd.get_dummies(dataset3.day_of_week)
 991 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
 992 dataset3 = pd.concat([dataset3,weekday_dummies],axis=1)
 993 dataset3.drop(['merchant_id','day_of_week','coupon_count'],axis=1,inplace=True)
 994 dataset3 = dataset3.replace('null',np.nan)
 995 dataset3.to_csv('data/dataset3.csv',index=None)
 996 
 997 
 998 coupon2 = pd.read_csv('data/coupon2_feature.csv')
 999 merchant2 = pd.read_csv('data/merchant2_feature.csv')
1000 user2 = pd.read_csv('data/user2_feature.csv')
1001 user_merchant2 = pd.read_csv('data/user_merchant2.csv')
1002 other_feature2 = pd.read_csv('data/other_feature2.csv')
1003 dataset2 = pd.merge(coupon2,merchant2,on='merchant_id',how='left')
1004 dataset2 = pd.merge(dataset2,user2,on='user_id',how='left')
1005 dataset2 = pd.merge(dataset2,user_merchant2,on=['user_id','merchant_id'],how='left')
1006 dataset2 = pd.merge(dataset2,other_feature2,on=['user_id','coupon_id','date_received'],how='left')
1007 dataset2.drop_duplicates(inplace=True)
1008 print( dataset2.shape)
1009 
1010 dataset2.user_merchant_buy_total = dataset2.user_merchant_buy_total.replace(np.nan,0)
1011 dataset2.user_merchant_any = dataset2.user_merchant_any.replace(np.nan,0)
1012 dataset2.user_merchant_received = dataset2.user_merchant_received.replace(np.nan,0)
1013 dataset2['is_weekend'] = dataset2.day_of_week.apply(lambda x:1 if x in (6,7) else 0)
1014 weekday_dummies = pd.get_dummies(dataset2.day_of_week)
1015 weekday_dummies.columns = ['weekday'+str(i+1) for i in range(weekday_dummies.shape[1])]
1016 dataset2 = pd.concat([dataset2,weekday_dummies],axis=1)
1017 dataset2['label'] = dataset2.date.astype('str') + ':' +  dataset2.date_received.astype('str')
1018 dataset2.label = dataset2.label.apply(get_label)
1019 dataset2.drop(['merchant_id','day_of_week','date','date_received','coupon_id','coupon_count'],axis=1,inplace=True)
1020 dataset2 = dataset2.replace('null',np.nan)
1021 dataset2.to_csv('data/dataset2.csv',index=None)
1022 
1023 
# Assemble the first labelled training set (dataset1) from the feature files.
coupon1 = pd.read_csv('data/coupon1_feature.csv')
merchant1 = pd.read_csv('data/merchant1_feature.csv')
user1 = pd.read_csv('data/user1_feature.csv')
user_merchant1 = pd.read_csv('data/user_merchant1.csv')
other_feature1 = pd.read_csv('data/other_feature1.csv')

dataset1 = (coupon1
            .merge(merchant1, on='merchant_id', how='left')
            .merge(user1, on='user_id', how='left')
            .merge(user_merchant1, on=['user_id', 'merchant_id'], how='left')
            .merge(other_feature1, on=['user_id', 'coupon_id', 'date_received'], how='left'))
dataset1.drop_duplicates(inplace=True)
print(dataset1.shape)

# Missing interaction counters mean "no such event": fill with 0.
for col in ('user_merchant_buy_total', 'user_merchant_any', 'user_merchant_received'):
    dataset1[col] = dataset1[col].replace(np.nan, 0)

# Calendar features: weekend flag plus one-hot weekday columns.
dataset1['is_weekend'] = dataset1.day_of_week.apply(lambda d: 1 if d in (6, 7) else 0)
weekday_dummies = pd.get_dummies(dataset1.day_of_week)
weekday_dummies.columns = ['weekday' + str(i + 1) for i in range(weekday_dummies.shape[1])]
dataset1 = pd.concat([dataset1, weekday_dummies], axis=1)

# 1 = coupon used within 15 days, -1 = used later, 0 = never used.
dataset1['label'] = (dataset1.date.astype('str') + ':' + dataset1.date_received.astype('str')).apply(get_label)

dataset1.drop(['merchant_id', 'day_of_week', 'date', 'date_received', 'coupon_id', 'coupon_count'], axis=1, inplace=True)
dataset1 = dataset1.replace('null', np.nan)
dataset1.to_csv('data/dataset1.csv', index=None)
View Code

GBDT训练部分:

  1 # -*- coding: utf-8 -*-
  2 """
  3 Created on Mon Dec  9 13:21:52 2019
  4 
  5 @author: wenzhe.tian
  6 
  7 
  8 GBDT调参顺序 可使用GridSearchCV
  9 1. n_estimators & learning_rate
 10 2. max_depth 或可为1
 11 3. min_samples_split & min_samples_leaf
 12 4. max_features 特征较多时考虑取部分
 13 5. subsample 使用样本的比例
 14 """
 15 from sklearn.preprocessing import MinMaxScaler
 16 import pandas as pd
 17 import numpy as np
 18 from sklearn import ensemble
 19 from sklearn.ensemble import GradientBoostingRegressor
 20 from sklearn.grid_search import GridSearchCV
 21 import pickle
 22 import os
 23 import datetime
 24 dataset1 = pd.read_csv('data/dataset1.csv')
 25 dataset1.label.replace(-1,0,inplace=True)
 26 dataset2 = pd.read_csv('data/dataset2.csv')
 27 dataset2.label.replace(-1,0,inplace=True)
 28 dataset3 = pd.read_csv('data/dataset3.csv')
 29 
 30 dataset1.drop_duplicates(inplace=True)
 31 dataset2.drop_duplicates(inplace=True)
 32 dataset3.drop_duplicates(inplace=True)
 33 
 34 dataset12 = pd.concat([dataset1,dataset2],axis=0)
 35 
 36 dataset1_y = dataset1.label
 37 dataset1_x = dataset1.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)  # 'day_gap_before','day_gap_after' cause overfitting, 0.77
 38 dataset2_y = dataset2.label
 39 dataset2_x = dataset2.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
 40 dataset12_y = dataset12.label
 41 dataset12_x = dataset12.drop(['user_id','label','day_gap_before','day_gap_after'],axis=1)
 42 dataset3_preds = dataset3[['user_id','coupon_id','date_received']]
 43 dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)
 44 
 45 dataset12_x=dataset12_x.fillna(-100)
 46 dataset3_x=dataset3_x.fillna(-100)
 47 
 48 
 49 
 50 
 51 score=[]
 52 # 深度调参
 53 #gbdt=ensemble.GradientBoostingRegressor(
 54 #  loss='ls'
 55 #, learning_rate=0.1
 56 #, n_estimators=1000
 57 #, subsample=1
 58 #, min_samples_split=15
 59 #, min_samples_leaf=10
 60 #, max_depth=3
 61 #, init=None
 62 #, random_state=None
 63 #, max_features=None
 64 #, alpha=0.9
 65 #, verbose=1
 66 #, max_leaf_nodes=None
 67 #, warm_start=False)
 68 #gbdt.fit(dataset12_x,dataset12_y)
 69 #
 70 #score.append(gbdt.score(dataset12_x,dataset12_y))
 71 
 72 #dataset3_preds['label'] = gbdt.predict(dataset3_x)
 73 #dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.reshape(-1, 1))
 74 #dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)
 75 #dataset3_preds.to_csv("test.csv",index=None,header=None)
 76 
 77 # 深度调参
 78 gbdt=ensemble.GradientBoostingRegressor(
 79   loss='ls'
 80 , learning_rate=0.11
 81 , n_estimators=160
 82 , subsample=1
 83 , min_samples_split=4  #4 is best
 84 , min_samples_leaf=1
 85 , max_depth=4
 86 , init=None
 87 , random_state=None
 88 , max_features=None
 89 , alpha=0.9
 90 , verbose=1
 91 , max_leaf_nodes=None
 92 , warm_start=False)
 93 gbdt.fit(dataset12_x,dataset12_y)
 94 
 95 score.append(gbdt.score(dataset12_x,dataset12_y))
 96 
 97 
 98 dataset3_preds['label'] = gbdt.predict(dataset3_x)
 99 dataset3_preds.label = MinMaxScaler().fit_transform(dataset3_preds.label.reshape(-1, 1))
100 dataset3_preds.sort_values(by=['coupon_id','label'],inplace=True)
101 dataset3_preds.to_csv("results"+datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S')+".csv",index=None,header=None)
102 
103 #os.system('shutdown -s -t 10')
View Code

GridSearchCV的一个调参案例:

# Grid over the number of boosting iterations, with all other parameters fixed.
param_test2 = {'n_estimators':[50,75,100,125,150,200,300,400]}
# Tune n_estimators via cross-validated grid search (scored by ROC AUC).
# NOTE(review): `iid` and `grid_scores_` only exist in older scikit-learn
# (`grid_scores_` removed in 0.20 in favour of `cv_results_`, `iid` removed
# in 0.24) — confirm the installed version before running this snippet.
gsearch2 = GridSearchCV(
    estimator=GradientBoostingRegressor(
      loss='ls'
    , learning_rate=0.1
#    , n_estimators=125
    , subsample=1
    , min_samples_split=5 # previously tried: 2
    , min_samples_leaf=1   # previously tried: 1
    , max_depth=5
    , init=None
    , random_state=None
    , max_features=None
    , alpha=0.9
    , verbose=1
    , max_leaf_nodes=None
    , warm_start=False
    ),param_grid = param_test2, scoring='roc_auc',iid=False,cv=None)

gsearch2.fit(dataset12_x,dataset12_y)
# Collect the search results for later inspection.
ans_vid1={}
ans_vid1['grid_scores']=gsearch2.grid_scores_
ans_vid1['best_params']=gsearch2.best_params_
ans_vid1['best_score']=gsearch2.best_score_