http://www.cnblogs.com/sudz/articles/4366275.html
项目简介:统计所有用户使用APP的动作(按操作顺序)
用户点击路径计算:
基础数据:
hive表:desc client_base_new
字段:
datetype:客户端日志标识(newsapp)
ip:ip地址
mos:平台
softv:软件版本
pub:渠道号
userkey:用户标识
ua:用户信息
net:网络
logintime:安装时间
isupdate:是否更新
ct:操作时间
opa:操作
docid:页面ID
pv:无关
start_pv:无关
ch:频道
pagetype:页面类型
other:补充信息
dt:分区
建表语句:
show create table client_base_new CREATE TABLE client_base_new( datatype string, ip string, mos string, softv string, pub string, userkey string, ua string, net string, logintime string, isupdate string, ct string, opa string, docid string, pv int, start_pv int, ch string, pagetype string, other string) PARTITIONED BY ( dt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.RCFileOutputFormat' LOCATION 'hdfs://nameservice1/user/hive/warehouse/client_base_new'
create table client_wen_crumbs( userkey string, mos string, softv string, ua string, cts string, opas string, pgids string, chs string, pgtys string) PARTITIONED BY (dt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '9' LINES TERMINATED BY '10' STORED AS RCFile;
按用户及操作时间排序:
from ( from ( select userkey, mos, ct, opa, docid, ch, pagetype from ( from ( from ( select userkey, getmos(mos) as mos, ct, opa, docid, ch, pagetype from client_base_new where instr(ct, '2015-03-17+') <> 0 and unix_timestamp(ct, 'yyyy-MM-dd+HH:mm:ss') is not null and opa in ('in', 'page', 'end') and dt = '2015-03-17' and userkey <> '#' distribute by userkey sort by userkey, ct) a select userkey, mos, ct, opa, docid, ch, pagetype where a.userkey <> '' and a.mos <> '' and ct <> '' and opa <> '' and docid <> '' and ch <> '' and a.pagetype <> '' ) m select userkey, mos, ct, opa, docid, ch, pagetype) b ) c select transform (userkey, mos, ct, opa, docid, ch, pagetype) USING './client_user_crumbs_ol.py' as (userkey, mos, cts, opas, pgids, chs, pgtys)) d insert overwrite table client_wen_crumbs PARTITION (dt='2015-03-17') select userkey, mos, cts, opas, pgids, chs, pgtys where split(d.cts, '&')[0] > '2015-03-17+01:00:00';
./client_user_crumbs_ol.py
#!/usr/local/python-2.7.2/bin/python import sys # result = ['#uid', '#mos', ['#cts'], ['#opas'], ['#pages'], ['#chs'], ['#pgtys']] result = ['#', '#', '#', '#', ['#'], ['#'], ['#'], ['#'], ['#']] def same_pic_page(pgid1, pgid2): if pgid1.count('_') == 2 and pgid1.count('_') == pgid2.count('_'): return "".join(pgid1.split('_')[:-1]) == "".join(pgid2.split('_')[:-1]) else: return pgid1 == pgid2 # if pgid1.count('_') == 2 and pgid1.count('_') == pgid2.count('_'): # return "".join(pgid1.split('_')[:-1]) == "".join(pgid2.split('_')[:-1]) # elif pgid1.count('_') == 2 and pgid2.count('_') == 1: # return "".join(pgid1.split('_')[:-1]) == "".join(pgid2.split('_')) # elif pgid2.count('_') == 2 and pgid1.count('_') == 1: # return "".join(pgid2.split('_')[:-1]) == "".join(pgid1.split('_')) # elif pgid1.count('_') == 1 and pgid2.count('_') == 1: # return pgid1 == pgid2 # else: # return False for line in sys.stdin: try: tmp = line.split("\t") if len(tmp) != 9: continue items = map(lambda a: a.strip(), tmp) uid, mos, softv, ua, ct, opa, docid, ch, pgty = items uid_ = '_'.join([uid, ua]) if result[0] != uid_: if result[0] != '#': print "\t".join(map(str, [result[0].split('_')[0], result[1], result[2], result[3], '&'.join(result[4]), '&'.join(result[5]), '&'.join(result[6]), '&'.join(result[7]), '&'.join(result[8])])) result = ['#', '#', '#', '#', ['#'], ['#'], ['#'], ['#'], ['#']] if result[0] == '#': result[0] = uid_ result[1] = mos result[2] = softv result[3] = ua result[4][0] = ct result[5][0] = opa result[6][0] = docid result[7][0] = ch result[8][0] = pgty continue elif pgty == 'pic' and result[8][-1] == 'pic' and same_pic_page(docid, result[6][-1]): continue if len(result[4]) > 100: continue result[4].append(ct) result[5].append(opa) result[6].append(docid) result[7].append(ch) result[8].append(pgty) except: continue
生成路径:
insert overwrite table client_wen_crumbs_path partition (dt = '2015-03-17') select transform (userkey, mos, cts, opas, pgids, chs, pgtys) using './client_user_crumbs_step1_ol.py' as (userkey, mos, path, delta) from client_wen_crumbs where dt = '2015-03-17' and mos in ('android', 'iphone');
./client_user_crumbs_step1_ol.py
#!/usr/local/python-2.7.2/bin/python import sys import time # import itertools for line in sys.stdin: try: user_route = [] tm_delta = [] tmp = line.split('\t') if len(tmp) != 9: continue items = map(lambda a: a.strip(), tmp) userkey, mos, softv, ua, cts, opas, pgs, chs, pgtys = items cts = cts.split("&") opas = opas.split("&") pgs = pgs.split("&") chs = chs.split("&") pgtys = pgtys.split("&") tm_stamps = map(lambda a: int(time.mktime(time.strptime(a, '%Y-%m-%d+%H:%M:%S'))), cts) tmp = tm_stamps[0] for item in zip(cts, opas, pgs, chs, pgtys, tm_stamps): ct, opa, pg, ch, pgty, second_delta = item # seconds_delta = tm_stamp - tmp if opa == 'page': if pgty == "ch": user_route.append(pg) else: user_route.append(pgty) else: if len(user_route): if opa == 'in' and user_route[-1] == 'in': tm_delta[-1] = second_delta - tmp tmp = second_delta continue elif opa == 'in' and tmp == second_delta: user_route.insert(len(user_route)-1, opa) tm_delta.append(second_delta - tmp) tmp = second_delta continue user_route.append(opa) tm_delta.append(second_delta - tmp) tmp = second_delta print "\t".join(map(str, [userkey, mos, softv, ua, '/'.join(map(str, user_route)), '/'.join(map(str, tm_delta))])) except: continue
路径统计算法:
import sys import glob import time def user_path(pathdeep, outdeep, prefix = 'in', prefix_tag = True, opr_delta = 2000): filepath = './user_path.2015-03-17' result = {} with open(filepath) as f: for line in f: item = line.split('\t') userkey, mos, path, delta, dt = item del dt if prefix_tag: if not path.startswith(prefix): continue paths = path.split('/') deltas = delta.split('/') for i in range(1, min(pathdeep+2, len(paths)+1)): pg = paths[i-1] # pg, delta_tm, step = item0 if 'icmp' in pg: paths[i-1] = 'article' if pg.count('_') == 2: paths[i-1] = 'pic' pg = paths[i-1] step = i # for item0 in zip(paths, deltas, range(1, len(paths)+1)): delta_tm = deltas[i-1] if any([int(delta_tm) > opr_delta, pg == 'end', pg == 'in']) and step != 1: paths[i-1] = 'end' path = '/'.join(paths[:i]) result.setdefault(step, {}).setdefault(path, [0])[0] += 1 result.setdefault(step, {}).setdefault('all', [0])[0] += 1 break elif len(paths) == i: paths.append('end') path = '/'.join(paths[:i+1]) result.setdefault(step, {}).setdefault(path, [0])[0] += 1 result.setdefault(step, {}).setdefault('all', [0])[0] += 1 break path = '/'.join(paths[:i]) result.setdefault(step, {}).setdefault(path, [0])[0] += 1 result.setdefault(step, {}).setdefault('all', [0])[0] += 1 # for i in range(1, len(result)+1): # print '\t'.join(map(str, [i, result[i]['end'][0], result[i]['all'][0]])) for item in result[outdeep]: try: print '\t'.join(map(str, [item, result[outdeep][item][0]])) except: print item continue if __name__ == '__main__': user_path(3, outdeep = 2)
user_path.2015-03-17文件格式:
000000183318702 android 4.4.2 x909t in/sy:#/mil:#/article:mil/mil:back/article:mil/mil:back/article:mil/mil:back/article:disnews/disnews:#/disnews:back/article:disnews 0/6/10/13/11/4/7/3/13/4/0/8/3
000000183443982 android 4.4.0 x909t in/sy:#/article:disnews/disnews:#/disnews:back/article:disnews/disnews:back/article:disnews/disnews:back/tech:#/article:tech/tech:back/pic:tech/tech:back/piclive:finance/finance:#/finance:back/pic:finance/finance:back/article:sports/sports:#/sports:back/article:sports/sports:back/piclive:live/live:#/live:back/piclive:live/live:back/rcmd:#/pic:rcmd/rcmd:back/article:rcmd/rcmd:back/pic:rcmd 0/13/10/1/4/4/8/19/5/4/16/6/3/12/4/0/13/4/5/4/0/10/4/11/4/0/9/3/13/4/0/10/6/5/4
000000183666412 android 4.4.2 x909t in/sy:#/mil:#/disnews:#/article:disnews/disnews:back/article:disnews/disnews:back/tech:# 0/6/9/27/11/11/4/8/4
000000184233052 android 4.4.1 x909t in/sy:#/article:sy/sy:back/article:sy/sy:back/article:sy/sy:back/sy:back/topic:sy/sy:back/article:sy/sy:back/article:sy/sy:back/article:sy/sy:back/article:sy 0/14/20/13/3/9/4/12/9/4/6/3/5/4/11/3/6/4
最终结果:

对于user_path.2015-03-17中间文件路径计算算法如下:
def user_path(dayStr, pathdeep, outdeep = None, prefix = 'in', prefix_tag = True, opr_delta = 2000): filepath = '/data1/pylogs/newsapp_user_path/user_path.' + dayStr result = {} with open(filepath) as f: for line in f: item = line.split('\t') userkey, mos, ver, ua, path, delta = item if prefix_tag: if not path.startswith(prefix): continue if mos not in ['android']: continue paths = path.split('/') deltas = delta.split('/') for i in range(1, min(pathdeep+2, len(paths)+1)): pgs = paths[i-1].split(":") # pg, delta_tm, step = item0 for pg, index in zip(pgs, range(0, len(pgs))): if 'imcp' in pgs[index]: pgs[index] = 'article' if pgs[index].count('_') == 2: pgs[index] = 'pic' paths[i-1] = ":".join(pgs) pg = paths[i-1] # print pg, pgs, ":".join(pgs) step = i # for item0 in zip(paths, deltas, range(1, len(paths)+1)): delta_tm = deltas[i-1] if any([int(delta_tm) > opr_delta, pg == 'end', pg == 'in']) and step != 1: paths[i-1] = 'end' path = '/'.join(paths[:i]) result.setdefault(step, {}).setdefault(path, [0])[0] += 1 result.setdefault(step, {}).setdefault('all', [0])[0] += 1 break elif len(paths) == i: paths.append('end') path = '/'.join(paths[:i+1]) result.setdefault(step, {}).setdefault(path, [0])[0] += 1 result.setdefault(step, {}).setdefault('all', [0])[0] += 1 break path = '/'.join(paths[:i]) result.setdefault(step, {}).setdefault(path, [0])[0] += 1 result.setdefault(step, {}).setdefault('all', [0])[0] += 1 # for i in range(1, len(result)+1): # print '\t'.join(map(str, [i, result[i]['end'][0], result[i]['all'][0]])) if outdeep: for item in result[outdeep]: try: print '\t'.join(map(str, [item, result[outdeep][item][0]])) except: print item continue else: return result
计算制定路径前缀的前若干路径,入库代码:
def store_userpath(num, pathdeep = 7, path_top = 20): dayStr = time.strftime('%Y-%m-%d', time.localtime(time.time()-86400*num)) result = user_path(dayStr, pathdeep = 6, prefix = 'in/sy') con, cur = db.open_15() sql = "insert into f_newsapp_userpath values(to_date(:1, 'yyyy-mm-dd'), :2, :3, :4, null, :5)" heads = ['in/sy'] for outdeep in range(1, pathdeep): tmp = [] tmp = {} for item in result[outdeep]: if not any([head[2] in item for head in heads]): continue try: # tmp.append((dayStr, outdeep, item, result[outdeep][item][0], "/".join(item.split("/")[:-1]))) tmp.setdefault(item, []).append(result[outdeep][item][0]) except: import traceback print traceback.print_exc() continue for path in tmp: store_list = [] for uv in tmp[path]: store_list.append((dayStr, outdeep, path, uv, "/".join(path.split("/")[:-1]))) heads = sorted(store_list, key = lambda item: item[3], reverse = True)[:path_top] cur.executemany(sql, heads) con.commit()
根据展示需求,并不是所有的路径都入库,只筛选排名前5的展示,另外加一个退出客户端及其他页展示汇总数据;因此上述代码还需根据需求完善,完善以后的代码如下:
1 # -*- encoding: utf-8 -*- 2 import db 3 import analysis_news_pylogs 4 import sys 5 import time 6 import itertools 7 import operator 8 9 10 def store_userpath(num, plat, del_flag = False, pathdeep = 8, chid_num = 7): 11 dayStr = time.strftime('%Y-%m-%d', time.localtime(time.time()-86400*num)) 12 result = analysis_news_pylogs.user_path(dayStr, pathdeep = 6, prefix = 'in') 13 con, cur = db.open_app() 14 if del_flag: 15 del_sql = "delete from f_newsapp_userpath where tm = to_date('%s', 'yyyy-mm-dd')" % dayStr 16 cur.execute(del_sql) 17 sql = "insert into f_newsapp_userpath values(to_date(:1, 'yyyy-mm-dd'), :2, :3, :4, null, :5, :6, :7, :8, :9, :10, :11)" 18 fpaths = set() 19 fpaths.add('in') 20 fpaths.add('all') 21 store_list = [] 22 for outdeep in range(1, pathdeep): 23 tmp = {} 24 for item in result[outdeep]: 25 try: 26 # tmp.append((dayStr, outdeep, item, result[outdeep][item][0], "/".join(item.split("/")[:-1]))) 27 tmp.setdefault("/".join(item.split("/")[:-1] if item.split("/")[:-1] else [item]), {}).setdefault(item, result[outdeep][item][0]) 28 except: 29 import traceback 30 print traceback.print_exc() 31 continue 32 for fpath in tmp: 33 for path in tmp[fpath]: 34 try: 35 tail = path.split('/')[-1] 36 if ':' in tail: 37 cpath, ref = tail.split(':')[:2] 38 else: 39 cpath, ref = tail, tail 40 except: 41 import traceback 42 print traceback.print_exc() 43 if ref == "end": 44 ref = "" 45 store_list.append([dayStr, outdeep, path, tmp[fpath][path], fpath, cpath, plat, ref, "/".join([fpath, cpath])]) 46 sorted_list = sorted(store_list, key = lambda item: (item[1], item[8], item[3]), reverse = True) 47 sorted_list_1 = [] 48 record_repeat = {} 49 tag = "" 50 print len(sorted_list) 51 # delete repeat head path 52 for item in sorted_list: 53 last_one = item[-1] 54 if tag == item[8]: 55 del item[-1] 56 record_repeat.setdefault(item[2], []).append(item) 57 continue 58 else: 59 del item[-1] 60 sorted_list_1.append(item) 61 tag = last_one 62 del sorted_list 63 sorted_list = sorted_list_1 64 print len(sorted_list) 65 print len(record_repeat) 66 del sorted_list_1 67 # soet by outdeep, fpath, uv 68 sorted_list = sorted(sorted_list, key = lambda item: (item[1], item[4], item[3]), reverse = True) 69 sorted_list_2 = [] 70 # add sort num for show 71 i = 0 72 for item in sorted_list: 73 if item[5] == 'end': 74 item.append(len(sorted_list)+100) 75 else: 76 i = i+1 77 item.append(i) 78 item.append(0) 79 sorted_list_2.append(item) 80 del sorted_list 81 sorted_list = sorted_list_2 82 del sorted_list_2 83 sorted_list_bydeep = sorted([[key, [item for item in items]] for key, items in itertools.groupby(sorted_list, key = lambda item: (item[1], item[4]))], key = lambda item: item[0][0]) 84 contains_path = {1: set(["in", "all"])} 85 store_DB = [] 86 other_detail_all = [] 87 for item in sorted_list_bydeep: 88 other_detail = [] 89 outdeep, fpath = item[0] 90 if fpath not in contains_path[outdeep]: 91 continue 92 [contains_path.setdefault(outdeep+1, set()).add(tmp[2]) for tmp in item[1][:chid_num]] 93 contain_end = False 94 for ite in item[1][:chid_num-2]: 95 if outdeep+1 in result: 96 leave_uv = result.get(outdeep+1, {"".join([ite[2], "/", "end"]): [0]}).get("".join([ite[2], "/", "end"]), [0])[0] 97 else: 98 leave_uv = 0 99 if leave_uv > ite[3]: 100 print ite 101 print result.get(outdeep+1, {"".join([ite[2], "/", "end"]): [0]}).get("".join([ite[2], "/", "end"]), [0])[0] 102 ite[9] = leave_uv 103 if ite[5] == 'end': 104 contain_end = True 105 if ite[5] == 'end': 106 store_DB.append(ite) 107 else: 108 store_DB.append(ite) 109 other_uv = 0 110 other_uv_patch = 0 111 if len(item[1]) > chid_num: 112 if contain_end: 113 if outdeep+1 in result: 114 leave_uv = result.get(outdeep+1, {"".join([item[1][chid_num-2][2], "/", "end"]): [0]}).get("".join([item[1][chid_num-2][2], "/", "end"]), [0])[0] 115 else: 116 leave_uv = 0 117 item[1][chid_num-2][9] = leave_uv 118 store_DB.append(item[1][chid_num-2]) 119 if contain_end: 120 for ite in item[1][chid_num-1:]: 121 other_uv += ite[3] 122 if len(other_detail) < 10: 123 other_detail.append(ite) 124 else: 125 for ite in item[1][chid_num-2:]: 126 if ite[5] == 'end': 127 store_DB.append(ite) 128 continue 129 other_uv += ite[3] 130 if len(other_detail) < 10: 131 other_detail.append(ite) 132 for ite in record_repeat.get(fpath, []): 133 other_uv += ite[3] 134 for it in other_detail: 135 other_uv_patch += it[3] 136 it.append(1) 137 other_detail_all.append(it) 138 del other_detail 139 if other_uv != 0: 140 store_DB.append([dayStr, item[0][0], "".join([item[0][1], "/", 'other_total']), other_uv, item[0][1], 'other_total', plat, '', len(sorted_list)+50, 0]) 141 if other_uv > other_uv_patch: 142 other_detail_all.append([dayStr, item[0][0], "".join([item[0][1], "/", 'other_patch_total']), other_uv-other_uv_patch, item[0][1], 'other_patch_total', plat, '', len(sorted_list)+1, 0, 1]) 143 store_DB_lis = [] 144 for item in store_DB: 145 item.append(0) 146 store_DB_lis.append(item) 147 for item in store_DB_lis: 148 if item: 149 cur.execute(sql, item) 150 for item in other_detail_all: 151 if item: 152 cur.execute(sql, item) 153 cur.execute("update f_newsapp_userpath set ref = null where tm = to_date('%s', 'yyyy-mm-dd') and lay = 1 and path = 'in' and ref = 'in'" % (dayStr,)) 154 con.commit() 155 con.close() 156 157 158 if __name__ == '__main__': 159 if 'store' in sys.argv: 160 store_userpath(1, 'android', del_flag = True) 161 if 'daily' in sys.argv: 162 store_userpath(1, 'android')
最终展示效果如下:
浙公网安备 33010602011771号