36. Handling large Spark RDDs
0. Preface
When computing with Spark and then loading the results into a database, a plain collect() on a large data set pulls everything into driver memory at once and causes an OOM error.
Reference: Spark: Best practice for retrieving big data from RDD to local machine
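To make the failure mode concrete, here is a minimal sketch of the pattern that breaks (assuming big_rdd is some large RDD and insert_into_db is a placeholder write helper, neither of which is from the code below): a single collect() materializes the entire RDD as one Python list on the driver.

# Everything is pulled into one list inside the driver process;
# for a large RDD this is exactly where the OOM occurs.
all_rows = big_rdd.collect()
for row in all_rows:
    insert_into_db(row)  # hypothetical write helper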
1. Reference
Here is the same approach as suggested by @Wildlife, but written in pyspark.
The nice thing about this approach is that it lets the user access records in the RDD in order. I'm using this code to feed data from the RDD into the STDIN of the machine learning tool's process.
rdd = sc.parallelize(range(100), 10)

def make_part_filter(index):
    # Build a mapPartitionsWithIndex function that yields elements
    # only when the partition index matches the requested one.
    def part_filter(split_index, iterator):
        if split_index == index:
            for el in iterator:
                yield el
    return part_filter

# Collect one partition at a time, so the driver never holds the whole RDD.
for part_id in range(rdd.getNumPartitions()):
    part_rdd = rdd.mapPartitionsWithIndex(make_part_filter(part_id), True)
    data_from_part_rdd = part_rdd.collect()
    print "partition id: %s elements: %s" % (part_id, data_from_part_rdd)
Produces output:
partition id: 0 elements: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
partition id: 1 elements: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
partition id: 2 elements: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
partition id: 3 elements: [30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
partition id: 4 elements: [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
partition id: 5 elements: [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
partition id: 6 elements: [60, 61, 62, 63, 64, 65, 66, 67, 68, 69]
partition id: 7 elements: [70, 71, 72, 73, 74, 75, 76, 77, 78, 79]
partition id: 8 elements: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89]
partition id: 9 elements: [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
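If the records only need to be streamed through the driver one by one and the per-partition grouping does not matter, PySpark's RDD.toLocalIterator() achieves a similar effect with less code: it fetches one partition at a time, so driver memory is bounded by the largest partition rather than the whole RDD. A minimal sketch, reusing the rdd from the snippet above (handle_record is a hypothetical placeholder for whatever per-record work you need):

# Iterate over the whole RDD on the driver, one partition at a time.
for el in rdd.toLocalIterator():
    handle_record(el)  # hypothetical per-record handler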
2. Using it in my own code
# Join the per-role combat, kill, top1 and top10 stats into one DataFrame.
rets = rets_total_combat.join(rets_total_kill, [rets_total_combat["role_id"] == rets_total_kill["role_id"]],
                              "left_outer") \
    .join(rets_top1, [rets_top1["role_id"] == rets_total_combat["role_id"]], "left_outer") \
    .join(rets_top10, [rets_top10["role_id"] == rets_total_combat["role_id"]], "left_outer") \
    .select(rets_total_combat.role_id, rets_total_combat.total_num, rets_total_kill.kill_num, rets_top1.top1_num,
            rets_top10.top10_num).distinct()
print '--------------- final result ----------------------'
rets.printSchema()
def make_part_filter(index):
    # Same helper as above: keep only the rows of the requested partition.
    def part_filter(split_index, iterator):
        if split_index == index:
            for el in iterator:
                yield el
    return part_filter

# Repartition so that each partition is small enough to collect on its own.
rdds = rets.rdd.repartition(200)
for part_id in range(rdds.getNumPartitions()):
    part_rdd = rdds.mapPartitionsWithIndex(make_part_filter(part_id), True)
    object_list = []
    for row in part_rdd.collect():
        data = {
            'role_id': logic.role_id_key(row['role_id']),
            'total_num': row['total_num'] if row['total_num'] else 0,
            'top1_num': row['top1_num'] if row['top1_num'] else 0,
            'top10_num': row['top10_num'] if row['top10_num'] else 0,
            'kill_num': row['kill_num'] if row['kill_num'] else 0
        }
        object_list.append(data)
    # Write one partition's worth of rows to the database at a time.
    if object_list:
        insert_into_tb(db['h45_' + Yesterday], object_list)