1/27

# coding:utf-8

# 导入 PySpark 相关模块
from pyspark.sql import SparkSession # 用于创建 SparkSession 对象
from pyspark.sql.functions import monotonically_increasing_id # 生成唯一 ID
from pyspark.sql.types import StringType, StructField, IntegerType, StructType, FloatType # 数据类型定义
from pyspark.sql.functions import col, sum, when, desc, asc # 数据处理函数

if __name__ == '__main__':
    # Build a local SparkSession with Hive support. The warehouse directory
    # and the metastore URI point at the cluster node used by this project.
    spark = (
        SparkSession.builder
        .appName("sparkSql")
        .master("local[*]")
        .config("spark.sql.shuffle.partitions", 2)
        .config("spark.sql.warehouse.dir", "hdfs://192.168.116.131:8020/user/hive/warehouse")
        .config("hive.metastore.uris", "thrift://192.168.116.131:9083")
        .enableHiveSupport()
        .getOrCreate()
    )

    try:
        # Explicit schema for the scraped CSV: column names and types, all
        # nullable, in the order the columns appear in the file.
        commodity_schema = StructType([
            StructField("type", StringType(), nullable=True),
            StructField("title", StringType(), nullable=True),
            StructField("price", FloatType(), nullable=True),
            StructField("buy_len", IntegerType(), nullable=True),
            StructField("img_src", StringType(), nullable=True),
            StructField("name", StringType(), nullable=True),
            StructField("address", StringType(), nullable=True),
            StructField("isFreeDelivery", StringType(), nullable=True),
            StructField("href", StringType(), nullable=True),
            StructField("nameHref", StringType(), nullable=True),
        ])

        # Read the comma-separated file; its first line is a header row.
        df = (
            spark.read.format("csv")
            .option("sep", ",")
            .option("header", True)
            .option("encoding", "utf-8")
            .schema(commodity_schema)
            .load("../spider/data.csv")
        )

        # Clean first, then assign ids. If the id column were added before
        # de-duplication, every row would carry a distinct id, no two rows
        # would ever compare equal, and dropDuplicates() would remove nothing.
        df = df.na.drop()          # drop rows containing any null value
        df = df.dropDuplicates()   # drop rows that are identical in all columns
        df = df.withColumn("id", monotonically_increasing_id())

        # Uncomment to inspect the cleaned data while debugging.
        # df.show()

        # Writing to MySQL is kept for reference only; the original author
        # noted that it was disabled because it did not run successfully.
        # df.write.mode("overwrite"). \
        #     format("jdbc"). \
        #     option("url", "jdbc:mysql://192.168.116.131:3306/bigdata?useSSL=false&useUnicode=true&charset=utf-8"). \
        #     option("dbtable", "commoditydata"). \
        #     option("user", "root"). \
        #     option("password", ""). \
        #     option("encoding", "utf-8"). \
        #     option("driver", "com.mysql.cj.jdbc.Driver"). \
        #     save()

        # Persist the cleaned data as a parquet-backed Hive table, replacing
        # any previous contents.
        df.write.mode("overwrite").saveAsTable("commoditydata", format="parquet")

        # Read the table back and print a sample to confirm the write.
        spark.sql("select * from commoditydata").show()
    finally:
        # Release the Spark session even if a step above fails.
        spark.stop()

posted @ 2025-01-27 18:10 Hbro 阅读(25) 评论(0) 收藏举报

刷新页面返回顶部

hbro

1/27

公告