Controlling batched writes from Spark to Redis
Requirements
A Spark Scala program reads data from Hive, processes it with Spark SQL, and then writes the result to Redis in batches.
Principles:
1. Interact with Redis as sparingly as possible to reduce write pressure on the Redis side (a sketch of collapsing the per-record commands follows this list).
2. Use Redis pipelining for the writes.
3. Do not submit just one pipeline for an entire partition; one oversized pipeline causes:
high network transfer pressure
high memory consumption
heavy processing load on the Redis server
potentially higher command-processing latency
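The sketch below combines principles 1 and 2: instead of issuing three separate HSET commands per record, each record is written with a single hash write inside a pipeline, cutting the per-record command count to one. It is a minimal sketch, assuming Jedis 3.x or newer, where Pipeline exposes an hset overload taking a java.util.Map; the "user:" key prefix and the field names are illustrative placeholders, not part of the original program.

import scala.collection.JavaConverters._
import redis.clients.jedis.Jedis

// Minimal sketch (assumes Jedis 3.x+, where Pipeline has hset(key, java.util.Map)).
// The "user:" key prefix and field names are illustrative placeholders.
object SingleHsetPerRecordSketch {
  def writeBatch(jedis: Jedis, batch: Seq[(String, String, Int, Double)]): Unit = {
    val pipeline = jedis.pipelined()
    batch.foreach { case (userId, name, age, totalSpend) =>
      val fields = Map(
        "name"        -> name,
        "age"         -> age.toString,
        "total_spend" -> totalSpend.toString
      ).asJava
      // One HSET carrying all field/value pairs instead of three separate HSETs
      pipeline.hset(s"user:$userId", fields)
    }
    // A single round trip flushes the whole batch
    pipeline.sync()
  }
}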
Implementation
import org.apache.spark.sql.{Row, SparkSession}
import redis.clients.jedis.{Jedis, Pipeline}

object OptimizedHiveToRedisPipeline {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("OptimizedHiveToRedisPipeline")
      .enableHiveSupport()
      .getOrCreate()

    // Redis connection settings
    val redisHost = "your_redis_host"
    val redisPort = 6379
    val redisPassword = "your_redis_password" // optional; leave empty if auth is disabled

    // Read the source data from Hive via Spark SQL
    val hiveTableDF = spark.sql(
      """
        SELECT
          user_id,
          name,
          age,
          total_spend,
          date_partition
        FROM your_hive_database.your_hive_table
        WHERE date_partition = '2024-01-01'
      """
    )

    // Keep only the columns that will be written to Redis
    val processedDF = hiveTableDF.select(
      "user_id",
      "name",
      "age",
      "total_spend"
    )

    // Batch size constant: flush one pipeline per 1000 records
    val BATCH_SIZE = 1000

    def writeToRedisBatch(records: Iterator[Row]): Unit = {
      // One connection per partition; the partition is split into small pipelined batches
      val jedis = new Jedis(redisHost, redisPort)
      try {
        if (redisPassword.nonEmpty) jedis.auth(redisPassword)
        // Process the records in batches of BATCH_SIZE
        records.grouped(BATCH_SIZE).foreach { batch =>
          val pipeline: Pipeline = jedis.pipelined()
          batch.foreach { record =>
            val userId = record.getString(0)
            val key = s"user:${userId}"
            pipeline.hset(key, "name", record.getString(1))
            pipeline.hset(key, "age", record.getInt(2).toString)
            pipeline.hset(key, "total_spend", record.getDouble(3).toString)
          }
          // Flush each small batch instead of one huge pipeline per partition
          pipeline.sync()
        }
      } catch {
        case e: Exception =>
          println(s"Redis write error: ${e.getMessage}")
          throw e // rethrow so the Spark task fails instead of silently losing data
      } finally {
        jedis.close()
      }
    }

    // Write to Redis partition by partition, controlling the size of each pipelined batch
    processedDF.rdd.foreachPartition(writeToRedisBatch)

    spark.stop()
  }
}
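
One further refinement worth considering: writeToRedisBatch above opens a fresh Jedis connection for every partition. A lazily created JedisPool held in an object lets all partitions running in the same executor JVM share connections. This is only a sketch under assumptions not in the original job: the pool settings, host, and port are placeholders.

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

// Sketch: one pool per executor JVM, created lazily and shared by all partitions on that executor.
// Host, port, and pool settings are placeholders to adapt to the actual environment.
object RedisConnectionPool {
  private lazy val pool: JedisPool = {
    val config = new JedisPoolConfig()
    config.setMaxTotal(8) // cap the number of connections per executor
    new JedisPool(config, "your_redis_host", 6379)
  }

  def withJedis[T](f: Jedis => T): T = {
    val jedis = pool.getResource
    try f(jedis) finally jedis.close() // close() returns the connection to the pool
  }
}

Inside foreachPartition, the batch-writing logic would then run as RedisConnectionPool.withJedis { jedis => ... } instead of constructing a new Jedis per partition.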
