03. Spark RDD topN
1. 基本示例
按 key 分组，取出每组中得分最高的 top 2
import heapq

# Sample records laid out as (key, score, label).
tmp = [('a', 1, 'a1'), ('a', 2, 'a2'), ('a', 3, 'a3'), ('b', 2, 'b2'), ('b', 3, 'b3'), ('c', 3, 'c3'), ('c', 4, 'c4')]
rdd_1 = sc.parallelize(tmp)
# Re-key every record as (key, (score, label)) and gather the values per key.
rdd_2 = rdd_1.map(lambda x: (x[0], (x[1], x[2]))).groupByKey()
# Keep only the two highest-scoring values of each key. heapq.nlargest returns
# the same list as sorted(..., key=..., reverse=True)[:2] without sorting the
# whole group.
rdd_3 = rdd_2.map(lambda x: (x[0], heapq.nlargest(2, x[1], key=lambda y: y[0])))
for row in rdd_3.collect():
    # The parenthesized form prints the same text on Python 2 and also parses
    # on Python 3.
    print(row)
2. 生成 JSON 数据的示例
按 role_id 分组，取出每个角色得分最高（同分时取时间最新）的 top 20 场战斗，并输出为 JSON
# Query template: one distinct row per combat result, with the fields pulled
# out of the JSON `source` column. The {placeholders} are filled in below.
sql_base = '''
    select distinct time, fn.json(source, '$.role_id') as role_id, fn.json(source, '$.server') as server,
    int(fn.json(source, '$.rank')) as rank, int(fn.json(source, '$.kill')) as kill,
    fn.json(source, '$.space') as space, fn.json(source, '$.team_guid') as team_guid,
    int(COALESCE(fn.json(source, '$.match_mode'), fn.json(source, '$.game_type'))) as game_type
    from gxx.resultsystem
    where date >= {date_start} and date <= {date_end} and time >= '{time_start}' and time <= '{time_end}'
    and account_id like('%@%') 
'''
# Substitute the date and time bounds of the query window.
sql_base = sql_base.format(
    date_start=_date_start,
    date_end=_date_end,
    time_start=_time_start,
    time_end=_time_end
)
# Echo the final statement before running it.
print(sql_base)
ret_base = para.hsc.sql(sql_base)
def convert_score(df):
    """Flatten one result row into a tuple that carries its computed score.

    `df` is indexed by column name. The score comes from
    get_score(rank, kill); the returned 9-tuple is
    (role_id, score, time, server, rank, kill, space, team_guid, game_type).
    """
    score = get_score(df['rank'], df['kill'])
    trailing_columns = ('time', 'server', 'rank', 'kill', 'space', 'team_guid', 'game_type')
    return (df['role_id'], score) + tuple(df[column] for column in trailing_columns)
import heapq

# Re-key each scored record by role_id, gather every combat of a role, and keep
# that role's 20 best combats ordered by (score, time) descending.
# Index-based lambdas replace the tuple-parameter lambdas, which are a syntax
# error on Python 3 (PEP 3113): record[0] is role_id and record[1:] is
# (score, time, server, rank, kill, space, team_guid, game_type).
# heapq.nlargest returns the same list as sorted(..., reverse=True)[:20]
# without fully sorting every group.
ret_rdd = ret_base.rdd.map(convert_score) \
    .map(lambda record: (record[0], tuple(record[1:]))) \
    .groupByKey() \
    .map(lambda grouped: (grouped[0], heapq.nlargest(20, grouped[1], key=lambda y: (y[0], y[1]))))
def json_data(item):
    """Serialize one (role_id, combats) pair into a JSON string.

    `item[0]` is the role id and `item[1]` is an iterable of combat records
    indexed positionally as (score, time, server, rank, kill, space,
    team_guid, game_type). The resulting JSON object has the keys
    'role_id', 'date' (the module-level `Yesterday` value) and 'combats'
    (one object per combat record, in iteration order).
    """
    role_id, combats = item[0], item[1]
    # One dict per combat; positions map onto the named fields.
    combat_entries = [
        {
            'score': combat[0],
            'time': combat[1],
            'server': combat[2],
            'rank': combat[3],
            'kill': combat[4],
            'space': combat[5],
            'team_guid': combat[6],
            'game_type': combat[7]
        }
        for combat in combats
    ]
    return json.dumps({
        'role_id': role_id,
        'date': Yesterday,
        'combats': combat_entries
    })
# Render every (role_id, top combats) pair as one JSON line and write the
# result to the output directory partitioned by the `Yesterday` date.
json_lines = ret_rdd.map(json_data)
json_lines.saveAsTextFile('/home/workspace/g83/campus_76733/date=%s' % Yesterday)
    http://www.cnblogs.com/makexu/

 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号