APP用户路径计算

http://www.cnblogs.com/sudz/articles/4366275.html

项目简介：统计所有用户使用APP的动作（按操作顺序）

用户点击路径计算：

基础数据：

hive表：desc client_base_new

字段：

datetype：客户端日志标识（newsapp）

ip：ip地址

mos：平台

softv：软件版本
pub：渠道号
userkey：用户标识
ua：用户信息
net：网络
logintime：安装时间
isupdate：是否更新
ct：操作时间
opa：操作
docid：页面ID
pv：无关
start_pv：无关
ch：频道
pagetype：页面类型
other：补充信息
dt：分区

建表语句：

show create table client_base_new

CREATE  TABLE client_base_new(
  datatype string, 
  ip string, 
  mos string, 
  softv string, 
  pub string, 
  userkey string, 
  ua string, 
  net string, 
  logintime string, 
  isupdate string, 
  ct string, 
  opa string, 
  docid string, 
  pv int, 
  start_pv int, 
  ch string, 
  pagetype string, 
  other string)
PARTITIONED BY ( 
  dt string)
ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY '\t' 
  LINES TERMINATED BY '\n' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.RCFileInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.RCFileOutputFormat'
LOCATION
  'hdfs://nameservice1/user/hive/warehouse/client_base_new'

View Code

create table client_wen_crumbs(
userkey string,
mos string,
softv string,
ua string,
cts string,
opas string,
pgids string,
chs string,
pgtys string)
PARTITIONED BY (dt string)
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY '9'
LINES TERMINATED BY '10'
STORED AS RCFile;

View Code

按用户及操作时间排序：

from
(
from 
(
select userkey, mos, ct, opa, docid, ch, pagetype
from 
(
from 
(
from
(
select userkey, getmos(mos) as mos, ct, opa, docid, ch, pagetype from client_base_new  where instr(ct, '2015-03-17+') <> 0 and unix_timestamp(ct, 'yyyy-MM-dd+HH:mm:ss') is not null and opa in ('in', 'page', 'end') and dt = '2015-03-17' and userkey <> '#' distribute by userkey sort by userkey, ct) a
select userkey, mos, ct, opa, docid, ch, pagetype where a.userkey <> '' and a.mos <> '' and ct <> '' and opa <> '' and docid <> '' and ch <> '' and a.pagetype <> ''
) m
select userkey, mos, ct, opa, docid, ch, pagetype) b ) c
select transform (userkey, mos, ct, opa, docid, ch, pagetype) USING './client_user_crumbs_ol.py' as (userkey, mos, cts, opas, pgids, chs, pgtys)) d
insert overwrite table client_wen_crumbs PARTITION (dt='2015-03-17')
select userkey, mos, cts, opas, pgids, chs, pgtys where split(d.cts, '&')[0] > '2015-03-17+01:00:00';

View Code

./client_user_crumbs_ol.py

#!/usr/local/python-2.7.2/bin/python
import sys

# result = ['#uid', '#mos', ['#cts'], ['#opas'], ['#pages'], ['#chs'], ['#pgtys']]
result = ['#', '#', '#', '#', ['#'], ['#'], ['#'], ['#'], ['#']]

def same_pic_page(pgid1, pgid2):
    if pgid1.count('_') == 2 and pgid1.count('_') == pgid2.count('_'):
        return "".join(pgid1.split('_')[:-1]) == "".join(pgid2.split('_')[:-1])
    else:
        return pgid1 == pgid2
    # if pgid1.count('_') == 2 and pgid1.count('_') == pgid2.count('_'):
    #     return "".join(pgid1.split('_')[:-1]) == "".join(pgid2.split('_')[:-1])
    # elif pgid1.count('_') == 2 and pgid2.count('_') == 1:
    #     return "".join(pgid1.split('_')[:-1]) == "".join(pgid2.split('_'))
    # elif pgid2.count('_') == 2 and pgid1.count('_') == 1:
    #     return "".join(pgid2.split('_')[:-1]) == "".join(pgid1.split('_'))
    # elif pgid1.count('_') == 1 and pgid2.count('_') == 1:
    #     return pgid1 == pgid2
    # else:
    #     return False

for line in sys.stdin:
    try:
        tmp = line.split("\t")
        if len(tmp) != 9:
            continue
        items = map(lambda a: a.strip(), tmp)
        uid, mos, softv, ua, ct, opa, docid, ch, pgty = items
        uid_ = '_'.join([uid, ua])
        if result[0] != uid_:
            if result[0] != '#':
                print "\t".join(map(str, [result[0].split('_')[0], result[1], result[2], result[3], '&'.join(result[4]), '&'.join(result[5]), '&'.join(result[6]), '&'.join(result[7]), '&'.join(result[8])]))
            result = ['#', '#', '#', '#', ['#'], ['#'], ['#'], ['#'], ['#']]
        if result[0] == '#':
            result[0] = uid_
            result[1] = mos
            result[2] = softv
            result[3] = ua
            result[4][0] = ct
            result[5][0] = opa
            result[6][0] = docid
            result[7][0] = ch
            result[8][0] = pgty
            continue
        elif pgty == 'pic' and result[8][-1] == 'pic' and same_pic_page(docid, result[6][-1]):
            continue
        if len(result[4]) > 100:
            continue
        result[4].append(ct)
        result[5].append(opa)
        result[6].append(docid)
        result[7].append(ch)
        result[8].append(pgty)
    except:
        continue

View Code

生成路径：

insert overwrite table client_wen_crumbs_path partition (dt = '2015-03-17')
select transform (userkey, mos, cts, opas, pgids, chs, pgtys) using './client_user_crumbs_step1_ol.py' as (userkey, mos, path, delta) from client_wen_crumbs where dt = '2015-03-17' and mos in ('android', 'iphone');

View Code

./client_user_crumbs_step1_ol.py

#!/usr/local/python-2.7.2/bin/python
import sys
import time
# import itertools


for line in sys.stdin:
    try:
        user_route = []
        tm_delta = []
        tmp = line.split('\t')
        if len(tmp) != 9:
            continue
        items = map(lambda a: a.strip(), tmp)
        userkey, mos, softv, ua, cts, opas, pgs, chs, pgtys = items
        cts = cts.split("&")
        opas = opas.split("&")
        pgs = pgs.split("&")
        chs = chs.split("&")
        pgtys = pgtys.split("&")
        tm_stamps = map(lambda a: int(time.mktime(time.strptime(a, '%Y-%m-%d+%H:%M:%S'))), cts)
        tmp = tm_stamps[0]
        for item in zip(cts, opas, pgs, chs, pgtys, tm_stamps):
            ct, opa, pg, ch, pgty, second_delta = item

            # seconds_delta = tm_stamp - tmp
            if opa == 'page':
                if pgty == "ch":
                    user_route.append(pg)
                else:
                    user_route.append(pgty)
            else:
                if len(user_route):
                    if opa == 'in' and user_route[-1] == 'in':
                        tm_delta[-1] = second_delta - tmp
                        tmp = second_delta
                        continue
                    elif opa == 'in' and tmp == second_delta:
                        user_route.insert(len(user_route)-1, opa)
                        tm_delta.append(second_delta - tmp)
                        tmp = second_delta
                        continue
                user_route.append(opa)
            tm_delta.append(second_delta - tmp)
            tmp = second_delta
        print "\t".join(map(str, [userkey, mos, softv, ua, '/'.join(map(str, user_route)), '/'.join(map(str, tm_delta))]))
    except:
        continue

View Code

路径统计算法：

import sys
import glob
import time


def user_path(pathdeep, outdeep, prefix = 'in', prefix_tag = True, opr_delta = 2000):
    filepath = './user_path.2015-03-17'
    result = {}
    with open(filepath) as f:
        for line in f:
            item = line.split('\t')
            userkey, mos, path, delta, dt = item
            del dt
            if prefix_tag:
                if not path.startswith(prefix):
                    continue
            paths = path.split('/')
            deltas = delta.split('/')
            for i in range(1, min(pathdeep+2, len(paths)+1)):
                pg = paths[i-1]
                # pg, delta_tm, step = item0
                if 'icmp' in pg:
                    paths[i-1] = 'article'
                if pg.count('_') == 2:
                    paths[i-1] = 'pic'
                pg = paths[i-1]
                step = i
                # for item0 in zip(paths, deltas, range(1, len(paths)+1)):
                delta_tm = deltas[i-1]
                if any([int(delta_tm) > opr_delta, pg == 'end', pg == 'in']) and step != 1:
                    paths[i-1] = 'end'
                    path = '/'.join(paths[:i])
                    result.setdefault(step, {}).setdefault(path, [0])[0] += 1
                    result.setdefault(step, {}).setdefault('all', [0])[0] += 1
                    break
                elif len(paths) == i:
                    paths.append('end')
                    path = '/'.join(paths[:i+1])
                    result.setdefault(step, {}).setdefault(path, [0])[0] += 1
                    result.setdefault(step, {}).setdefault('all', [0])[0] += 1
                    break
                path = '/'.join(paths[:i])
                result.setdefault(step, {}).setdefault(path, [0])[0] += 1
                result.setdefault(step, {}).setdefault('all', [0])[0] += 1
    # for i in range(1, len(result)+1):
    #     print '\t'.join(map(str, [i, result[i]['end'][0], result[i]['all'][0]]))
    for item in result[outdeep]:
        try:
            print '\t'.join(map(str, [item, result[outdeep][item][0]]))
        except:
            print item
            continue


if __name__ == '__main__':
    user_path(3, outdeep = 2)

user_path.2015-03-17文件格式：

000000183318702 android 4.4.2 x909t in/sy:#/mil:#/article:mil/mil:back/article:mil/mil:back/article:mil/mil:back/article:disnews/disnews:#/disnews:back/article:disnews 0/6/10/13/11/4/7/3/13/4/0/8/3
000000183443982 android 4.4.0 x909t in/sy:#/article:disnews/disnews:#/disnews:back/article:disnews/disnews:back/article:disnews/disnews:back/tech:#/article:tech/tech:back/pic:tech/tech:back/piclive:finance/finance:#/finance:back/pic:finance/finance:back/article:sports/sports:#/sports:back/article:sports/sports:back/piclive:live/live:#/live:back/piclive:live/live:back/rcmd:#/pic:rcmd/rcmd:back/article:rcmd/rcmd:back/pic:rcmd 0/13/10/1/4/4/8/19/5/4/16/6/3/12/4/0/13/4/5/4/0/10/4/11/4/0/9/3/13/4/0/10/6/5/4
000000183666412 android 4.4.2 x909t in/sy:#/mil:#/disnews:#/article:disnews/disnews:back/article:disnews/disnews:back/tech:# 0/6/9/27/11/11/4/8/4
000000184233052 android 4.4.1 x909t in/sy:#/article:sy/sy:back/article:sy/sy:back/article:sy/sy:back/sy:back/topic:sy/sy:back/article:sy/sy:back/article:sy/sy:back/article:sy/sy:back/article:sy 0/14/20/13/3/9/4/12/9/4/6/3/5/4/11/3/6/4

最终结果：

对于user_path.2015-03-17中间文件路径计算算法如下：

def user_path(dayStr, pathdeep, outdeep = None, prefix = 'in', prefix_tag = True, opr_delta = 2000):
    filepath = '/data1/pylogs/newsapp_user_path/user_path.' + dayStr
    result = {}
    with open(filepath) as f:
        for line in f:
            item = line.split('\t')
            userkey, mos, ver, ua, path, delta = item
            if prefix_tag:
                if not path.startswith(prefix):
                    continue
            if mos not in ['android']:
                continue
            paths = path.split('/')
            deltas = delta.split('/')
            for i in range(1, min(pathdeep+2, len(paths)+1)):
                pgs = paths[i-1].split(":")
                # pg, delta_tm, step = item0
                for pg, index in zip(pgs, range(0, len(pgs))):
                    if 'imcp' in pgs[index]:
                        pgs[index] = 'article'
                    if pgs[index].count('_') == 2:
                        pgs[index] = 'pic'
                paths[i-1] = ":".join(pgs)
                pg = paths[i-1]
                # print pg, pgs, ":".join(pgs)
                step = i
                # for item0 in zip(paths, deltas, range(1, len(paths)+1)):
                delta_tm = deltas[i-1]
                if any([int(delta_tm) > opr_delta, pg == 'end', pg == 'in']) and step != 1:
                    paths[i-1] = 'end'
                    path = '/'.join(paths[:i])
                    result.setdefault(step, {}).setdefault(path, [0])[0] += 1
                    result.setdefault(step, {}).setdefault('all', [0])[0] += 1
                    break
                elif len(paths) == i:
                    paths.append('end')
                    path = '/'.join(paths[:i+1])
                    result.setdefault(step, {}).setdefault(path, [0])[0] += 1
                    result.setdefault(step, {}).setdefault('all', [0])[0] += 1
                    break
                path = '/'.join(paths[:i])
                result.setdefault(step, {}).setdefault(path, [0])[0] += 1
                result.setdefault(step, {}).setdefault('all', [0])[0] += 1
    # for i in range(1, len(result)+1):
    #     print '\t'.join(map(str, [i, result[i]['end'][0], result[i]['all'][0]]))
    if outdeep:
        for item in result[outdeep]:
            try:
                print '\t'.join(map(str, [item, result[outdeep][item][0]]))
            except:
                print item
                continue
    else:
        return result

计算制定路径前缀的前若干路径，入库代码：

def store_userpath(num, pathdeep = 7, path_top = 20):
    dayStr = time.strftime('%Y-%m-%d', time.localtime(time.time()-86400*num))
    result = user_path(dayStr, pathdeep = 6, prefix = 'in/sy')
    con, cur = db.open_15()
    sql = "insert into f_newsapp_userpath values(to_date(:1, 'yyyy-mm-dd'), :2, :3, :4, null, :5)"
    heads = ['in/sy']
    for outdeep in range(1, pathdeep):
        tmp = []
        tmp = {}
        for item in result[outdeep]:
            if not any([head[2] in item for head in heads]):
                continue
            try:
                # tmp.append((dayStr, outdeep, item, result[outdeep][item][0], "/".join(item.split("/")[:-1])))
                tmp.setdefault(item, []).append(result[outdeep][item][0])
            except:
                import traceback
                print traceback.print_exc()
                continue
        for path in tmp:
            store_list = []
            for uv in tmp[path]:
                store_list.append((dayStr, outdeep, path, uv, "/".join(path.split("/")[:-1])))
            heads = sorted(store_list, key = lambda item: item[3], reverse = True)[:path_top]
            cur.executemany(sql, heads)
    con.commit()

根据展示需求，并不是所有的路径都入库，只筛选排名前5的展示，另外加一个退出客户端及其他页展示汇总数据；因此上述代码还需根据需求完善，完善以后的代码如下：

  1 # -*- encoding: utf-8 -*-
  2 import db
  3 import analysis_news_pylogs
  4 import sys
  5 import time
  6 import itertools
  7 import operator
  8 
  9 
 10 def store_userpath(num, plat, del_flag = False, pathdeep = 8, chid_num = 7):
 11     dayStr = time.strftime('%Y-%m-%d', time.localtime(time.time()-86400*num))
 12     result = analysis_news_pylogs.user_path(dayStr, pathdeep = 6, prefix = 'in')
 13     con, cur = db.open_app()
 14     if del_flag:
 15         del_sql = "delete from f_newsapp_userpath where tm = to_date('%s', 'yyyy-mm-dd')" % dayStr
 16         cur.execute(del_sql)
 17     sql = "insert into f_newsapp_userpath values(to_date(:1, 'yyyy-mm-dd'), :2, :3, :4, null, :5, :6, :7, :8, :9, :10, :11)"
 18     fpaths = set()
 19     fpaths.add('in')
 20     fpaths.add('all')
 21     store_list = []
 22     for outdeep in range(1, pathdeep):
 23         tmp = {}
 24         for item in result[outdeep]:
 25             try:
 26                 # tmp.append((dayStr, outdeep, item, result[outdeep][item][0], "/".join(item.split("/")[:-1])))
 27                 tmp.setdefault("/".join(item.split("/")[:-1] if item.split("/")[:-1] else [item]), {}).setdefault(item, result[outdeep][item][0])
 28             except:
 29                 import traceback
 30                 print traceback.print_exc()
 31                 continue
 32         for fpath in tmp:
 33             for path in tmp[fpath]:
 34                 try:
 35                     tail = path.split('/')[-1]
 36                     if ':' in tail:
 37                         cpath, ref = tail.split(':')[:2]
 38                     else:
 39                         cpath, ref = tail, tail
 40                 except:
 41                     import traceback
 42                     print traceback.print_exc()
 43                 if ref == "end":
 44                     ref = ""
 45                 store_list.append([dayStr, outdeep, path, tmp[fpath][path], fpath, cpath, plat, ref, "/".join([fpath, cpath])])
 46     sorted_list = sorted(store_list, key = lambda item: (item[1], item[8], item[3]), reverse = True)
 47     sorted_list_1 = []
 48     record_repeat = {}
 49     tag = ""
 50     print len(sorted_list)
 51     # delete repeat head path
 52     for item in sorted_list:
 53         last_one = item[-1]
 54         if tag == item[8]:
 55             del item[-1]
 56             record_repeat.setdefault(item[2], []).append(item)
 57             continue
 58         else:
 59             del item[-1]
 60             sorted_list_1.append(item)
 61             tag = last_one
 62     del sorted_list
 63     sorted_list = sorted_list_1
 64     print len(sorted_list)
 65     print len(record_repeat)
 66     del sorted_list_1
 67     # soet by outdeep, fpath, uv
 68     sorted_list = sorted(sorted_list, key = lambda item: (item[1], item[4], item[3]), reverse = True)
 69     sorted_list_2 = []
 70     # add sort num for show
 71     i = 0
 72     for item in sorted_list:
 73         if item[5] == 'end':
 74             item.append(len(sorted_list)+100)
 75         else:
 76             i = i+1
 77             item.append(i)
 78         item.append(0)
 79         sorted_list_2.append(item)
 80     del sorted_list
 81     sorted_list = sorted_list_2
 82     del sorted_list_2
 83     sorted_list_bydeep = sorted([[key, [item for item in items]] for key, items in itertools.groupby(sorted_list, key = lambda item: (item[1], item[4]))], key = lambda item: item[0][0])
 84     contains_path = {1: set(["in", "all"])}
 85     store_DB = []
 86     other_detail_all = []
 87     for item in sorted_list_bydeep:
 88         other_detail = []
 89         outdeep, fpath = item[0]
 90         if fpath not in contains_path[outdeep]:
 91             continue
 92         [contains_path.setdefault(outdeep+1, set()).add(tmp[2]) for tmp in item[1][:chid_num]]
 93         contain_end = False
 94         for ite in item[1][:chid_num-2]:
 95             if outdeep+1 in result:
 96                 leave_uv = result.get(outdeep+1, {"".join([ite[2], "/", "end"]): [0]}).get("".join([ite[2], "/", "end"]), [0])[0]
 97             else:
 98                 leave_uv = 0
 99             if leave_uv > ite[3]:
100                 print ite
101                 print result.get(outdeep+1, {"".join([ite[2], "/", "end"]): [0]}).get("".join([ite[2], "/", "end"]), [0])[0]
102             ite[9] = leave_uv
103             if ite[5] == 'end':
104                 contain_end = True
105             if ite[5] == 'end':
106                 store_DB.append(ite)
107             else:
108                 store_DB.append(ite)
109         other_uv = 0
110         other_uv_patch = 0
111         if len(item[1]) > chid_num:
112             if contain_end:
113                 if outdeep+1 in result:
114                     leave_uv = result.get(outdeep+1, {"".join([item[1][chid_num-2][2], "/", "end"]): [0]}).get("".join([item[1][chid_num-2][2], "/", "end"]), [0])[0]
115                 else:
116                     leave_uv = 0
117                 item[1][chid_num-2][9] = leave_uv
118                 store_DB.append(item[1][chid_num-2])
119             if contain_end:
120                 for ite in item[1][chid_num-1:]:
121                     other_uv += ite[3]
122                     if len(other_detail) < 10:
123                         other_detail.append(ite)
124             else:
125                 for ite in item[1][chid_num-2:]:
126                     if ite[5] == 'end':
127                         store_DB.append(ite)
128                         continue
129                     other_uv += ite[3]
130                     if len(other_detail) < 10:
131                         other_detail.append(ite)
132             for ite in record_repeat.get(fpath, []):
133                 other_uv += ite[3]
134         for it in other_detail:
135             other_uv_patch += it[3]
136             it.append(1)
137             other_detail_all.append(it)
138         del other_detail
139         if other_uv != 0:
140             store_DB.append([dayStr, item[0][0], "".join([item[0][1], "/", 'other_total']), other_uv, item[0][1], 'other_total', plat, '', len(sorted_list)+50, 0])
141             if other_uv > other_uv_patch:
142                 other_detail_all.append([dayStr, item[0][0], "".join([item[0][1], "/", 'other_patch_total']), other_uv-other_uv_patch, item[0][1], 'other_patch_total', plat, '', len(sorted_list)+1, 0, 1])
143     store_DB_lis = []
144     for item in store_DB:
145         item.append(0)
146         store_DB_lis.append(item)
147     for item in store_DB_lis:
148         if item:
149             cur.execute(sql, item)
150     for item in other_detail_all:
151         if item:
152             cur.execute(sql, item)
153     cur.execute("update f_newsapp_userpath set ref = null where tm = to_date('%s', 'yyyy-mm-dd') and lay = 1 and path = 'in' and ref = 'in'" % (dayStr,))
154     con.commit()
155     con.close()
156 
157 
158 if __name__ == '__main__':
159     if 'store' in sys.argv:
160         store_userpath(1, 'android', del_flag = True)
161     if 'daily' in sys.argv:
162         store_userpath(1, 'android')

View Code

最终展示效果如下：

posted on 2015-04-01 14:35 闪电战阅读(863) 评论(0) 收藏举报