Learning Notes 4

1. Spark SQL Core Concepts (a whole new world!)
SparkSession - the unified entry point
scala
    import org.apache.spark.sql.SparkSession

// Create a SparkSession (no need to create a SparkContext separately)
val spark = SparkSession.builder()
  .appName("Spark SQL Learning")
  .master("local[*]")
  .config("spark.sql.adaptive.enabled", "true")  // adaptive query execution
  .config("spark.sql.shuffle.partitions", "4")   // tune the shuffle partition count
  .getOrCreate()

// The SparkContext is still accessible via spark.sparkContext
val sc = spark.sparkContext
Comparing the three APIs: RDD vs DataFrame vs Dataset
scala
// 1. RDD style - type-safe, but optimization is manual
case class Person(name: String, age: Int, city: String)
val rdd = sc.parallelize(Seq(
  Person("Alice", 30, "Beijing"),
  Person("Bob", 25, "Shanghai"),
  Person("Charlie", 35, "Beijing")
))
val rddResult = rdd.filter(_.age > 25).map(_.name).collect()

// 2. DataFrame style - similar to Python/pandas, carries a schema
import spark.implicits._
val df = Seq(
  ("Alice", 30, "Beijing"),
  ("Bob", 25, "Shanghai"),
  ("Charlie", 35, "Beijing")
).toDF("name", "age", "city")

df.filter($"age" > 25).select("name").show()

// 3. Dataset style - combines RDD type safety with DataFrame performance
// (reuses the Person case class defined above)
val ds = Seq(
  Person("Alice", 30, "Beijing"),
  Person("Bob", 25, "Shanghai"),
  Person("Charlie", 35, "Beijing")
).toDS()

ds.filter(_.age > 25).map(_.name).show()
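
One practical consequence of this comparison: a Dataset catches column typos at compile time, while a DataFrame only fails at runtime. A quick sketch with a deliberately misspelled field name:
scala
// Dataset: the typo below would not even compile (field `agee` does not exist)
// ds.filter(_.agee > 25)
// DataFrame: compiles fine, but throws an AnalysisException only at runtime
// df.filter($"agee" > 25).show()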
2. DataFrame Basics
Multiple ways to create a DataFrame
scala
// Method 1: convert from an RDD
val rdd = sc.parallelize(Seq(
  (1, "Apple", 5.99),
  (2, "Banana", 3.50),
  (3, "Orange", 4.20)
))
val df1 = rdd.toDF("id", "product", "price")

// Method 2: create directly from a Seq
val df2 = Seq(
  (1, "Apple", 5.99),
  (2, "Banana", 3.50)
).toDF("id", "product", "price")

// Method 3: read from a data source (the important one)
val jsonDF = spark.read.json("data/people.json")
val csvDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("data/sales.csv")
val parquetDF = spark.read.parquet("data/users.parquet")
val jdbcDF = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/db")
  .option("dbtable", "table")
  .option("user", "root")
  .option("password", "password")
  .load()
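
Note that inferSchema makes Spark scan the file an extra time to guess column types. A minimal sketch of supplying an explicit schema instead (the column names here are assumptions chosen to match the earlier example):
scala
import org.apache.spark.sql.types._

val salesSchema = StructType(Seq(
  StructField("id", IntegerType, nullable = false),
  StructField("product", StringType, nullable = true),
  StructField("price", DoubleType, nullable = true)
))
val typedCsvDF = spark.read
  .option("header", "true")
  .schema(salesSchema)  // no inference pass needed, and the types are guaranteed
  .csv("data/sales.csv")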
Common DataFrame operations (DSL syntax)
scala
// Prepare sample data
val salesDF = Seq(
  ("2024-01-01", "Electronics", 1200.50, "Beijing"),
  ("2024-01-01", "Clothing", 800.00, "Shanghai"),
  ("2024-01-02", "Electronics", 1500.00, "Beijing"),
  ("2024-01-02", "Books", 300.50, "Guangzhou"),
  ("2024-01-03", "Clothing", 950.00, "Shanghai"),
  ("2024-01-03", "Electronics", 800.00, "Beijing")
).toDF("date", "category", "amount", "city")

// 1. Select columns
salesDF.select("category", "amount").show(3)
salesDF.select($"category", $"amount" * 1.1 as "amount_with_tax").show()

// 2. Filter rows
salesDF.filter($"amount" > 1000).show()
salesDF.where($"category" === "Electronics" and $"city" === "Beijing").show()

// 3. Add / rename columns
salesDF.withColumn("tax", $"amount" * 0.1).show()
salesDF.withColumnRenamed("amount", "sales_amount").show()

// 4. Group and aggregate (sum/avg/count come from the functions package)
import org.apache.spark.sql.functions._
salesDF.groupBy("category")
  .agg(
    sum("amount").alias("total_sales"),
    avg("amount").alias("avg_sales"),
    count("*").alias("transaction_count")
  )
  .show()

// 5. Sort
salesDF.orderBy($"amount".desc).show()
salesDF.sort($"category", $"amount".desc).show()

// 6. Distinct values and summary statistics
salesDF.select("category").distinct().show()
salesDF.describe("amount").show() // summary statistics
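
The DSL also handles joins. A small sketch against a hypothetical city-to-region lookup table (regionsDF is invented here purely for illustration):
scala
val regionsDF = Seq(
  ("Beijing", "North"),
  ("Shanghai", "East"),
  ("Guangzhou", "South")
).toDF("city", "region")

// Left join on the shared "city" column, then aggregate per region
salesDF.join(regionsDF, Seq("city"), "left")
  .groupBy("region")
  .agg(sum("amount").alias("region_sales"))
  .show()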
3. Spark SQL Syntax
Registering temporary views
scala
// Create a temp view (scoped to this session)
salesDF.createOrReplaceTempView("sales")

// Create a global temp view (shared across sessions)
salesDF.createGlobalTempView("global_sales")
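// Note: global temp views live in the reserved `global_temp` database,
// so queries must use that prefix:
// spark.sql("SELECT * FROM global_temp.global_sales").show()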

// Query with SQL
val result = spark.sql("""
  SELECT
    category,
    SUM(amount) as total_sales,
    AVG(amount) as avg_sales,
    COUNT(*) as transactions
  FROM sales
  WHERE amount > 500
  GROUP BY category
  HAVING total_sales > 1000
  ORDER BY total_sales DESC
""")

result.show()
Complex SQL examples
scala
// Prepare some more data
val productsDF = Seq(
  (1, "Laptop", "Electronics", 5000),
  (2, "T-shirt", "Clothing", 200),
  (3, "Smartphone", "Electronics", 3000),
  (4, "Jeans", "Clothing", 800)
).toDF("product_id", "product_name", "category", "price")

val ordersDF = Seq(
  (101, 1, 2, "2024-01-01"),
  (102, 3, 1, "2024-01-01"),
  (103, 2, 5, "2024-01-02"),
  (104, 1, 1, "2024-01-02"),
  (105, 4, 3, "2024-01-03")
).toDF("order_id", "product_id", "quantity", "order_date")

// Register views
productsDF.createOrReplaceTempView("products")
ordersDF.createOrReplaceTempView("orders")

// Complex query: revenue ranking within each category
val salesRanking = spark.sql("""
  SELECT
    p.category,
    p.product_name,
    SUM(p.price * o.quantity) as revenue,
    RANK() OVER (PARTITION BY p.category ORDER BY SUM(p.price * o.quantity) DESC) as rank
  FROM products p
  JOIN orders o ON p.product_id = o.product_id
  GROUP BY p.category, p.product_name
""")

salesRanking.show()

// Window function: cumulative sales per product over time
val cumulativeSales = spark.sql("""
  SELECT
    order_date,
    product_name,
    price * quantity as daily_sales,
    SUM(price * quantity) OVER (
      PARTITION BY product_name
      ORDER BY order_date
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) as cumulative_sales
  FROM products p
  JOIN orders o ON p.product_id = o.product_id
  ORDER BY product_name, order_date
""")

cumulativeSales.show()
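
A closely related window pattern is a moving average; only the frame bounds change. A sketch over the same views, using a 2-row window:
scala
val movingAvg = spark.sql("""
  SELECT
    order_date,
    product_name,
    price * quantity as daily_sales,
    AVG(price * quantity) OVER (
      PARTITION BY product_name
      ORDER BY order_date
      ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
    ) as moving_avg_sales
  FROM products p
  JOIN orders o ON p.product_id = o.product_id
  ORDER BY product_name, order_date
""")
movingAvg.show()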
4. Reading and Writing Data Sources
Reading different formats
scala
// JSON files (nested structure)
val jsonDF = spark.read
  .option("multiline", "true")
  .json("data/nested_data.json")
jsonDF.printSchema()
jsonDF.select("user.name", "user.email", "orders.*").show()

// CSV files (with options)
val csvDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("sep", "|")
  .option("mode", "FAILFAST") // fail immediately on malformed records
  .csv("data/data.csv")

// Parquet (columnar storage, recommended)
val parquetDF = spark.read.parquet("data/users.parquet")
parquetDF.explain(true) // inspect the query plan

// Read multiple files at once
val multiDF = spark.read.csv("data/2024/*.csv")

// Read partitioned data
val partitionedDF = spark.read
  .option("basePath", "data/warehouse")
  .parquet("data/warehouse/year=2024/month=01/*")
Writing data
scala
// Write in different formats
resultDF.write
  .mode("overwrite") // overwrite, append, ignore, error
  .option("header", "true")
  .csv("output/csv_result")

// Partitioned write
resultDF.write
  .partitionBy("category", "city")
  .parquet("output/partitioned_result")

// Bucketed write (optimizes joins)
resultDF.write
  .bucketBy(4, "category")
  .sortBy("amount")
  .saveAsTable("bucketed_sales")

// Write via JDBC
resultDF.write
  .mode("overwrite")
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/db")
  .option("dbtable", "sales_summary")
  .option("user", "root")
  .option("password", "password")
  .save()
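
JDBC writes go through one connection per partition, so throughput can be tuned with Spark's documented numPartitions and batchsize options. A sketch with illustrative values, not recommendations:
scala
resultDF.write
  .mode("append")
  .format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/db")
  .option("dbtable", "sales_summary")
  .option("user", "root")
  .option("password", "password")
  .option("numPartitions", "4")  // max parallel write connections
  .option("batchsize", "1000")   // rows per JDBC batch insert
  .save()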
5. Hands-on Project: Sales Analysis System (putting it all together)
scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object SalesAnalysisSystem {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
  .appName("Sales Analysis System")
  .master("local[*]")
  .config("spark.sql.adaptive.enabled", "true")
  .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
  .enableHiveSupport() // only if Hive support is needed
  .getOrCreate()

import spark.implicits._

println("=== Sales Analysis System starting ===")

// 1. Load data from multiple sources
println("\n1. Loading data...")

val salesDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("data/sales/*.csv")
  .withColumn("date", to_date($"date", "yyyy-MM-dd"))

val productsDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("data/products.csv")

val storesDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("data/stores.csv")

// 2. Data cleaning and preprocessing
println("\n2. Cleaning data...")

val cleanSalesDF = salesDF
  .filter($"amount" > 0)  // drop invalid rows
  .na.drop(Seq("product_id", "store_id"))  // drop rows missing key fields
  .withColumn("year", year($"date"))
  .withColumn("month", month($"date"))
  .withColumn("quarter", quarter($"date"))

// 3. Cache the cleaned data
cleanSalesDF.cache()
println(s"Rows after cleaning: ${cleanSalesDF.count()}")

// 4. Register temp views for SQL analysis
cleanSalesDF.createOrReplaceTempView("sales")
productsDF.createOrReplaceTempView("products")
storesDF.createOrReplaceTempView("stores")

// 5. Run the analysis tasks

// 5.1 Sales trend analysis
println("\n3.1 Monthly sales trend")
val monthlyTrend = spark.sql("""
  SELECT 
    year,
    month,
    COUNT(DISTINCT order_id) as orders,
    SUM(amount) as total_sales,
    AVG(amount) as avg_order_value
  FROM sales
  GROUP BY year, month
  ORDER BY year, month
""")
monthlyTrend.show(12)

// 5.2 Product category analysis
println("\n3.2 Category revenue ranking")
val categoryAnalysis = spark.sql("""
  SELECT 
    p.category,
    COUNT(*) as sales_count,
    SUM(s.amount) as total_revenue,
    AVG(s.amount) as avg_price,
    SUM(s.amount) / SUM(SUM(s.amount)) OVER() * 100 as revenue_percentage
  FROM sales s
  JOIN products p ON s.product_id = p.product_id
  GROUP BY p.category
  ORDER BY total_revenue DESC
""")
categoryAnalysis.show()

// 5.3 Store performance analysis
println("\n3.3 Store performance ranking")
val storePerformance = spark.sql("""
  SELECT 
    st.region,
    st.city,
    st.store_name,
    COUNT(*) as transactions,
    SUM(s.amount) as revenue,
    AVG(s.amount) as avg_transaction,
    RANK() OVER (PARTITION BY st.region ORDER BY SUM(s.amount) DESC) as region_rank
  FROM sales s
  JOIN stores st ON s.store_id = st.store_id
  GROUP BY st.region, st.city, st.store_name
  ORDER BY st.region, region_rank
""")
storePerformance.show(20)

// 5.4 Customer purchase behavior (simplified RFM model)
println("\n3.4 Customer value analysis")
val rfmAnalysis = spark.sql("""
  WITH customer_stats AS (
    SELECT 
      customer_id,
      DATEDIFF(CURRENT_DATE(), MAX(date)) as recency,
      COUNT(DISTINCT order_id) as frequency,
      SUM(amount) as monetary
    FROM sales
    GROUP BY customer_id
  )
  SELECT 
    customer_id,
    recency,
    frequency,
    monetary,
    CASE
      WHEN recency <= 30 AND frequency >= 5 AND monetary >= 1000 THEN 'high value'
      WHEN recency <= 60 AND frequency >= 3 THEN 'medium value'
      ELSE 'low value'
    END as customer_segment
  FROM customer_stats
  ORDER BY monetary DESC
""")
rfmAnalysis.show(10)

// 6. Export results
println("\n4. Exporting analysis results...")

// Save results partitioned by year
monthlyTrend.write
  .mode("overwrite")
  .partitionBy("year")
  .parquet("output/monthly_trend")

categoryAnalysis.write
  .mode("overwrite")
  .option("header", "true")
  .csv("output/category_analysis")

// 7. Inspect the query plans
println("\n5. Inspecting the query plans")
println("Simple physical plan:")
monthlyTrend.explain("simple")

println("\nExtended plan (parsed / analyzed / optimized / physical):")
monthlyTrend.explain("extended")

// 8. Release the cache
cleanSalesDF.unpersist()

println("\n=== Analysis complete ===")
println("Results saved to the output/ directory")
println("Spark UI: http://localhost:4040")

Thread.sleep(30000)  // keep the app alive so the UI can be inspected
spark.stop()

}
}
🧠 Learning Takeaways and Understanding
✅ Knowledge points mastered today
Spark SQL core concepts:

SparkSession as the unified entry point

Differences between DataFrame, Dataset, and RDD

How the Catalyst optimizer works

DataFrame API fluency:

The common transformation operations

Using aggregate functions

Applying window functions

SQL support:

Registering temporary views

Writing complex queries

Hive integration basics

Data source handling:

Reading and writing multiple formats

Partitioning strategies

Performance-related options

❗ Key findings
Performance comparison experiment:

scala
// RDD approach vs DataFrame approach
val start = System.currentTimeMillis()
rdd.filter(_.age > 25).count()
val rddTime = System.currentTimeMillis() - start

val start2 = System.currentTimeMillis()
df.filter($"age" > 25).count()
val dfTime = System.currentTimeMillis() - start2

println(s"RDD took: ${rddTime}ms, DataFrame took: ${dfTime}ms")
// Conclusion: in this rough test, the DataFrame version was 3-5x faster than the RDD one!
Observing the query plan:

scala
df.filter($"age" > 25).groupBy("city").count().explain(true)
// The output shows Catalyst's optimization stages:
// - logical plan optimization
// - physical plan selection
// - code generation
Effect of partition pruning:

scala
// Unoptimized: scans the whole table
spark.sql("SELECT * FROM sales WHERE date = '2024-01-01'").explain()

// Optimized: partition pruning on a partitioned table
spark.sql("SELECT * FROM partitioned_sales WHERE date = '2024-01-01'").explain()
// only a single partition is read - a large performance win
🔍 Problems Encountered and Solutions
Problem 1: data skew
scala
// Symptom: a few tasks run much slower than the rest
val skewedDF = df.groupBy("city").count()

// Solution: salting (two-stage aggregation)
// The random salt spreads hot keys across partitions; the second
// aggregation then merges the partial counts per city.
// (rand() and sum come from org.apache.spark.sql.functions._)
val saltedDF = df.withColumn("salt", (rand() * 10).cast("int"))
  .groupBy("city", "salt")
  .count()
  .groupBy("city")
  .agg(sum("count") as "count")
Problem 2: too many small files
scala
// Writing produces a large number of small files
df.write.parquet("output/")

// Solution: merge into fewer output files
df.coalesce(4).write.parquet("output/")    // avoids a full shuffle
// or
df.repartition(4).write.parquet("output/") // shuffles, but balances data evenly
Problem 3: out-of-memory errors
scala
// Solution: tune the configuration
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "10485760") // 10MB
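
Related to the broadcast threshold above, a join can also be forced to broadcast its small side explicitly. A minimal sketch; smallDF and largeDF are placeholder names for this note:
scala
import org.apache.spark.sql.functions.broadcast

// Hint Spark to broadcast the small side regardless of the size threshold,
// turning a shuffle join into a broadcast hash join.
val joined = largeDF.join(broadcast(smallDF), Seq("city"))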
