Controlling batched Spark writes to Redis

Requirements

A Spark Scala program reads data from Hive, processes it with Spark SQL, and then writes the result to Redis in batches.

Principles:

1. Interact with Redis as little as possible to reduce write pressure on it.

2. Batch commands with Redis pipelining to cut down round trips.

3. Do not flush just one pipeline for an entire partition (see the anti-pattern sketch after this list), because:

  1. Heavy network transfer pressure: the whole partition goes out as one large payload.

  2. High memory consumption: every queued command and response stays buffered until sync().

  3. Heavy processing pressure on the Redis server.

  4. Potentially higher command-processing latency.
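For contrast, the anti-pattern named in principle 3 would look roughly like the sketch below. This is a hypothetical helper that could sit alongside the implementation further down, assuming the same Jedis client and the same user:{id} hash layout: every command of a partition is queued into a single pipeline and flushed once.

import org.apache.spark.sql.Row
import redis.clients.jedis.Jedis

// Anti-pattern sketch: the whole partition is queued into ONE pipeline before sync().
// Memory use and the size of the single network flush grow with the partition size,
// which is exactly what principle 3 warns against.
def writeWholePartitionInOnePipeline(records: Iterator[Row],
                                     redisHost: String,
                                     redisPort: Int): Unit = {
  val jedis = new Jedis(redisHost, redisPort)
  try {
    val pipeline = jedis.pipelined()
    records.foreach { record =>
      val key = s"user:${record.getString(0)}"
      pipeline.hset(key, "name", record.getString(1))
      pipeline.hset(key, "age", record.getInt(2).toString)
      pipeline.hset(key, "total_spend", record.getDouble(3).toString)
    }
    // One giant flush: a huge request payload and a response buffer to match
    pipeline.sync()
  } finally {
    jedis.close()
  }
}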

Implementation

import org.apache.spark.sql.{DataFrame, SparkSession}
import redis.clients.jedis.{Jedis, Pipeline}

object OptimizedHiveToRedisPipeline {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("OptimizedHiveToRedisPipeline")
      .enableHiveSupport()
      .getOrCreate()

    val redisHost = "your_redis_host"
    val redisPort = 6379
    val redisPassword = "your_redis_password" // optional; set to "" if Redis has no auth

    val hiveTableDF = spark.sql(
      """
      SELECT 
        user_id, 
        name, 
        age, 
        total_spend,
        date_partition
      FROM your_hive_database.your_hive_table
      WHERE date_partition = '2024-01-01'
      """
    )

    val processedDF = hiveTableDF.select(
      "user_id", 
      "name", 
      "age", 
      "total_spend"
    )

    // Batch size: number of records queued into one pipeline before each sync()
    val BATCH_SIZE = 1000

    def writeToRedisBatch(records: Iterator[org.apache.spark.sql.Row]): Unit = {
      // One connection per partition; pipelines are flushed in BATCH_SIZE chunks
      val jedis = new Jedis(redisHost, redisPort)
      // Authenticate only when a password is configured
      if (redisPassword.nonEmpty) jedis.auth(redisPassword)

      try {
        // Process the partition in fixed-size batches rather than all at once
        records.grouped(BATCH_SIZE).foreach { batch =>
          val pipeline = jedis.pipelined()
          
          batch.foreach { record =>
            val userId = record.getString(0)
            val key = s"user:${userId}"
            
            pipeline.hset(key, "name", record.getString(1))
            pipeline.hset(key, "age", record.getInt(2).toString)
            pipeline.hset(key, "total_spend", record.getDouble(3).toString)
          }
          
          // Flush this batch: send the queued commands and collect the replies
          pipeline.sync()
        }
      } catch {
        case e: Exception =>
          // Log and rethrow so the Spark task fails visibly instead of silently dropping data
          println(s"Redis write error: ${e.getMessage}")
          throw e
      } finally {
        jedis.close()
      }
    }

    // Write to Redis partition by partition; each partition flushes pipelines of BATCH_SIZE records
    processedDF.rdd.foreachPartition(writeToRedisBatch)

    spark.stop()
  }
}
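The pipeline size controls how much each flush carries, but the number of partitions controls how many tasks hold Redis connections at the same time. A minimal follow-up sketch, assuming the same processedDF and writeToRedisBatch as above and a hypothetical cap of 20 concurrent writers, would replace the foreachPartition call like this:

// Optional: limit concurrent Redis connections by reducing the partition count.
// 20 is a hypothetical value; tune it to what your Redis instance can absorb.
processedDF
  .coalesce(20)        // at most 20 tasks write to Redis at the same time
  .rdd
  .foreachPartition(writeToRedisBatch)

Each task still opens one connection and flushes BATCH_SIZE-sized pipelines, so the total pressure on Redis is bounded by the partition count times the in-flight batch size.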