March 16 (Spark Streaming: collecting Kafka data into a MySQL database)
Spark Streaming collects Kafka data and writes it into a MySQL database. Spark does not need to be installed locally; adding the relevant dependencies to the project is enough, but be careful that the dependency version matches the local Scala version.
For example, a dependency like this:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.3</version>
</dependency>
The 2.11 here is the local Scala version; the two have to match, otherwise you will run into a lot of errors. If one dependency version does not work, try a few others.
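Besides spark-streaming, the code below also needs the Kafka 0.10 integration module (which provides KafkaUtils, LocationStrategies and ConsumerStrategies under org.apache.spark.streaming.kafka010) and a MySQL JDBC driver for the writes. A minimal sketch of the extra dependencies, assuming the same Spark 2.1.3 / Scala 2.11 pairing as above; the mysql-connector-java version is only an example, pick one that matches your MySQL server:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.1.3</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>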
The complete code is below; there are still quite a few details in it that I do not fully understand.
import java.sql.{Connection, DriverManager, PreparedStatement}
import SparkStreamingKafka.updateFunc // state update function for updateStateByKey, defined in a separate helper object
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Entry point of the Spark program
object jd_ss {
  def main(args: Array[String]): Unit = {
    // Create the SparkConf and StreamingContext
    val conf = new SparkConf().setAppName("Consumer").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(15)) // receive and compute one batch every 15 seconds
    //val kafkaParam = Map("bootstrap.servers" -> "192.168.10.102:9092")
    ssc.checkpoint("./TmpCount") // checkpoint directory, needed by updateStateByKey
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hadoop102:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "SparkKafkaDemo",
      // earliest: if a partition has a committed offset, consume from that offset; otherwise consume from the beginning
      // latest:   if a partition has a committed offset, consume from that offset; otherwise consume only newly produced data
      // none:     if every partition has a committed offset, consume from those offsets; if any partition has none, throw an exception
      // earliest is configured here, so when no offset has been committed the job reads the topic from the beginning
      "auto.offset.reset" -> "earliest",
      // false turns off auto-commit; offsets are tracked through the checkpoint or committed manually
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    //val kafkaParam = Map("metadata.broker.list" -> "192.168.10.10:9092")
    // Receive data from Kafka (the commented-out line below is the old 0.8 direct-stream API)
    //val logDStream: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder](ssc,kafkaParam,topic)
    val data: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](Array("jd3"), kafkaParams))
    data.foreachRDD(_.foreach(row => {
      // Split one record into a list of fields
      val list: List[String] = row.value().split(",").toList
      // If the Weibo membership level is 5, write the record to the vip_rank table
      // if (list(9) == "") {
      //   // call the method that writes the data to MySQL
      //   //saveDataToMysqlVipRankAndLikeStatus(list, "vip_rank")
      // }
      println(list(2))
      // if (list(5).toInt >= 10) {
      //   // call the method that writes the data to MySQL
      //   // saveDataToMysqlVipRankAndLikeStatus(list, "like_status")
      // }
    }))
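    // Note (an assumption read off the code below): each Kafka record is expected to be one
    // comma-separated line whose field at index 2 is a timestamp with a space between the date
    // and the time, for example something like
    //   id,user,2021-03-16 12:30:45,comment text,...
    // The per-day count keeps only the date in front of the space.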
    // Count per day
    val count: DStream[(String, Int)] = data
      .map(_.value().split(",")(2).split(" ")(0)) // keep only the date part of the timestamp field
      .map((_, 1)).updateStateByKey(updateFunc) // running total across batches (historical accumulation)
    // Walk through the per-day counts
    count.foreachRDD(_.foreach(row => {
      // store each result in MySQL
      saveDataToMysqlCountComment(row._1, row._2)
      println((row._1, row._2))
    }))
    // Start the streaming job
    // RDDIP.print()
    ssc.start()
    ssc.awaitTermination()
    // ssc.stop()
  }
  // Open a JDBC connection to the local MySQL database
  def mysqlConnection(): Connection = {
    DriverManager.getConnection("jdbc:mysql://localhost:3306/market_database?characterEncoding=UTF-8", "root", "root")
  }

  // Write one per-day count into the comment_perday table
  def saveDataToMysqlCountComment(time: String, count: Int): Unit = {
    println(s"${time}\t ${count}")
    // Get a connection
    val connection: Connection = mysqlConnection()
    // The SQL statement: insert the day, or update its count if the day already exists
    val sql = "INSERT INTO comment_perday (time,count) VALUES (?,?) ON DUPLICATE KEY UPDATE count = ?"
    // Write one row to MySQL
    val ps: PreparedStatement = connection.prepareStatement(sql)
    ps.setString(1, time)
    ps.setInt(2, count)
    ps.setInt(3, count)
    // Execute
    ps.execute()
    connection.close()
  }
}
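The updateFunc used by updateStateByKey is imported from a SparkStreamingKafka helper object that is not shown in this note. Judging from how it is used above (a value of 1 per comment, accumulated per day), it is probably the usual running-count update function; a sketch of what such a function looks like (the exact body in SparkStreamingKafka may differ):

// Sketch of a running-count update function for updateStateByKey.
// newValues holds the 1s produced by the current batch for one key (one day);
// runningCount is the total accumulated in previous batches, if any.
def updateFunc(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
  Some(newValues.sum + runningCount.getOrElse(0))
}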
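The INSERT ... ON DUPLICATE KEY UPDATE in saveDataToMysqlCountComment only behaves as an upsert if the time column carries a primary or unique key; otherwise every batch would add a new row for the same day. The comment_perday table itself is not shown above, so the following is only a sketch of a one-off helper (hypothetical name, assumed column types) that could sit next to mysqlConnection() inside jd_ss and create a matching table:

// One-off helper (assumed schema): creates the comment_perday table used by the upsert above.
// The PRIMARY KEY on time is what makes ON DUPLICATE KEY UPDATE work.
def createCommentPerdayTable(): Unit = {
  val connection = mysqlConnection()
  val ddl =
    """CREATE TABLE IF NOT EXISTS comment_perday (
      |  time  VARCHAR(20) NOT NULL,
      |  count INT NOT NULL,
      |  PRIMARY KEY (time)
      |)""".stripMargin
  connection.prepareStatement(ddl).execute()
  connection.close()
}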