# AWS Glue job: Medicare data cleaning (ETL)
# Clean the Medicare provider-payments table from the Glue Data Catalog and
# write the result to S3 as Parquet:
#   1. cast 'provider id' to long and drop rows where the cast fails,
#   2. strip the leading "$" from 'average covered charges' via a UDF,
#   3. rename/cast/nest the columns with apply_mapping.
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

db_name = "payments"
tbl_name = "medicare"
output_dir = "s3://glue-sample-target/output-dir/medicare_parquet"

# Read the source table from the Glue Data Catalog.
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(
    database=db_name, table_name=tbl_name
)
medicare_dyf.printSchema()

# Force-cast 'provider id' to long; values that cannot be cast become null.
medicare_res = medicare_dyf.resolveChoice(specs=[('provider id', 'cast:long')])

# Drop the rows whose provider id failed the cast.
medicare_df = medicare_res.toDF().where("`provider id` is NOT NULL")

# Strip the first character (the leading "$") from the charge string.
# Guard against null: Spark hands None straight to a Python UDF, and
# None[1:] would raise TypeError and fail the whole job.
chop_f = udf(lambda x: x[1:] if x is not None else None, StringType())

# Glue DynamicFrames do not support UDFs, but Spark DataFrames do:
# add a new column 'ACC' with the dollar sign chopped off.
medicare_df = medicare_df.withColumn(
    "ACC", chop_f(medicare_df["average covered charges"])
)

# Convert back to a DynamicFrame for apply_mapping and the S3 writer.
medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested")

# Rename, cast, and nest the columns with apply_mapping
# (dotted target names produce nested struct fields).
medicare_nest = medicare_tmp.apply_mapping([
    ('provider id', 'long', 'provider.id', 'long'),
    ('ACC', 'string', 'charges.covered', 'double'),
])

# Write the cleaned data out to S3 in Parquet format.
glueContext.write_dynamic_frame.from_options(
    frame=medicare_nest,
    connection_type="s3",
    connection_options={"path": output_dir},
    format="parquet",
)