PySpark `partitionBy` example
import json


def make_data(line):
    """Parse a JSON line and key it by its "h" and "l" fields.

    Parameters
    ----------
    line : str
        One JSON object serialized on a single line; expected to contain
        "h" and "l" keys (missing keys become the string "None" in the key).

    Returns
    -------
    tuple[str, str] | None
        ("<h>_<l>", original line), or None when the line is not valid
        JSON — returning None lets the caller's filter() drop bad records
        instead of crashing the whole job on one malformed line.
    """
    try:
        data = json.loads(line)
    except ValueError:  # ValueError covers json.JSONDecodeError
        return None
    h = data.get("h")
    low = data.get("l")  # renamed from `l`: ambiguous single-letter name (E741)
    return (f"{h}_{low}", line)


if __name__ == '__main__':
    # Import PySpark only when run as a script, so make_data stays
    # importable (and unit-testable) on machines without Spark installed.
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SparkSession

    conf = SparkConf()
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)  # an active session is required for rdd.toDF()

    input_path = "/in"
    rdd = sc.textFile(input_path)
    # Drop lines that failed to parse (make_data returned None). The
    # original filter(lambda x: x) could never drop anything, because
    # make_data always returned a truthy tuple and raised on bad JSON.
    rdd2 = rdd.map(make_data).filter(lambda pair: pair is not None)
    df = rdd2.toDF(["h_l", "data"])
    # partitionBy writes one /out/h_l=<value>/ directory per distinct key.
    df.write.partitionBy("h_l").json("/out")
Questions? Add WeChat 18179641802 to discuss.