# Glue 数据清洗 (AWS Glue data cleaning example)

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Build the GlueContext on top of the (singleton) SparkContext, and grab the
# underlying SparkSession for plain-DataFrame operations later on.
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
db_name = "payments"    # Glue Data Catalog database to read from
tbl_name = "medicare"   # table within that database
output_dir = "s3://glue-sample-target/output-dir/medicare_parquet"  # S3 sink for the Parquet output

# Load the medicare table from the Glue Data Catalog as a DynamicFrame.
# (Previously this was read straight from CSV on S3 via spark.read; the
# catalog-backed read replaces that dead, commented-out code path.)
medicare_dyf = glueContext.create_dynamic_frame.from_catalog(
    database=db_name, table_name=tbl_name
)

medicare_dyf.printSchema()

# Force 'provider id' to long; values that cannot be cast become null.
medicare_res = medicare_dyf.resolveChoice(specs=[('provider id', 'cast:long')])

# Convert to a Spark DataFrame and drop rows whose provider id failed the
# cast above (i.e. became null).
medicare_df = medicare_res.toDF().where("`provider id` is NOT NULL")

# Glue DynamicFrames do not support UDFs, but Spark DataFrames do, which is
# why this transformation happens on the DataFrame.
# Strip the first character of the charge string (presumably a leading '$' —
# TODO confirm against the source data). Null-safe: the original
# `lambda x: x[1:]` raised TypeError whenever the column value was None.
chop_f = udf(lambda x: x[1:] if x is not None else None, StringType())

# Add a new column 'ACC' holding the de-prefixed covered-charges string.
medicare_df = medicare_df.withColumn(
    "ACC", chop_f(medicare_df["average covered charges"])
)

# Turn it back to a dynamic frame so Glue transforms (apply_mapping) and the
# Glue S3 writer can be used.
medicare_tmp = DynamicFrame.fromDF(medicare_df, glueContext, "nested")

# Rename, cast, and nest with apply_mapping: each tuple is
# (source_name, source_type, target_name, target_type); dotted target names
# produce nested struct fields in the output schema.
medicare_nest = medicare_tmp.apply_mapping([
    ('provider id', 'long', 'provider.id', 'long'),
    ('ACC', 'string', 'charges.covered', 'double')  # string amount cast to double, nested under 'charges'
])

# Write the nested DynamicFrame out to S3 (output_dir) in Parquet format.
glueContext.write_dynamic_frame.from_options(
    frame=medicare_nest, connection_type="s3",
    connection_options={"path": output_dir}, format="parquet"
)

 

# posted @ 2020-10-16 16:19 慕沁 (blog footer, scraper residue)