1

 dataset = """
        role_1,u1,1,1 \n
        role_1,u1,2,2 \n
        role_1,u1,3,3 \n
        role_1,u1,4,4 \n
        role_2,u2,5,5 \n
        role_2,u2,6,6
    """

 def _to_keyed_pair(fields):
     """Turn one parsed record into ((role_id, role_name), (total, logtime)).

     Unpacking in the body replaces the tuple-parameter lambda, which is a
     SyntaxError on Python 3 (PEP 3113). It still raises ValueError when a
     record does not have exactly four fields.
     """
     role_id, role_name, total_num, logtime = fields
     return (role_id, role_name), (int(total_num), str(logtime))

 # Each record ends with an escaped "\n" followed by the literal's own line
 # break, so splitting on "\n" leaves an empty string between records; the
 # filter drops those empty strings before parsing.
 data = para.sc.parallelize(dataset.strip().split("\n")).filter(lambda line: line)
 # Trim the surrounding whitespace and split each line on commas.
 data = data.map(lambda line: line.strip().split(','))
 # Single-argument print(...) gives the same output on Python 2 and Python 3.
 print(data.take(6))

 # Group by (role_id, role_name): sum total_num and keep the smallest
 # logtime, then flatten back to (role_id, role_name, total, logtime).
 # NOTE(review): logtime is compared as a string, so min() is lexicographic
 # ("10" < "9") -- confirm that is intended for real timestamps.
 unweighted = data \
     .map(_to_keyed_pair) \
     .reduceByKey(lambda x, y: (x[0] + y[0], min(x[1], y[1]))) \
     .map(lambda x: (x[0][0], x[0][1], x[1][0], x[1][1]))
 print(unweighted.take(2))

 

posted @ 2018-01-09 17:15  桃源仙居  阅读(216)  评论(0)  编辑  收藏  举报