PySpark partitionBy example

This example reads newline-delimited JSON from HDFS, builds a partition key from the h and l fields of each record, and writes the result back out as JSON partitioned by that key.

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import json


def make_data(line):
    # Build a (partition_key, raw_line) pair from one JSON line.
    # Return None for lines that fail to parse so they can be filtered out.
    try:
        data = json.loads(line)
    except (json.JSONDecodeError, TypeError):
        return None
    h = data.get("h")
    l = data.get("l")
    return (f"{h}_{l}", line)


if __name__ == '__main__':
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    # Wrapping the SparkContext in a SparkSession enables RDD.toDF().
    spark = SparkSession(sc)

    input_path = "/in"
    rdd = sc.textFile(input_path)
    # Drop malformed lines (make_data returns None for them).
    rdd2 = rdd.map(make_data).filter(lambda x: x is not None)

    df = rdd2.toDF(["h_l", "data"])
    # partitionBy creates one subdirectory per distinct h_l value,
    # e.g. /out/h_l=1_2/part-....json
    df.write.partitionBy("h_l").json("/out")

    spark.stop()

 
