Two Ways to Use Spark SQL: Introductory Examples
Spark SQL is well suited to structured processing of offline data, and it provides SQL-style data access and queries.
Spark SQL can also be paired with Spark Streaming for real-time analysis: just assemble the Spark Streaming RDDs into DataFrames (a minimal sketch follows the two modes below).
View mode: register a DataFrame as a temporary view, so the analysis is written as SQL statements.
DataFrame mode: call Spark's DataFrame API directly, giving MySQL-like analysis.
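As a minimal sketch of the streaming combination (not part of the project below; the socket source, host, port, and local master are placeholder assumptions), each micro-batch RDD can be turned into a DataFrame and queried with SQL:

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSqlSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Streaming SQL Sketch")
      .master("local[2]") // placeholder; streaming needs at least 2 local threads
      .getOrCreate()
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    val lines = ssc.socketTextStream("localhost", 9999) // hypothetical source
    lines.foreachRDD { rdd =>
      import spark.implicits._
      val df = rdd.toDF("line")           // assemble the RDD into a DataFrame
      df.createOrReplaceTempView("lines") // then query it like a table
      spark.sql("SELECT count(*) FROM lines").show()
    }
    ssc.start()
    ssc.awaitTermination()
  }
}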
Spark view-mode example
Main program entry point, jmspark.scala:
import jmsql.sqlnodefileviruses
import org.apache.spark.sql.SparkSession
import tools.serverproperties

object jmspark {
  def main(args: Array[String]): Unit = {
    // Read the configuration file
    val prop = new serverproperties
    val master = prop.getSparkMaster()
    val dburl = prop.getMysqlServer()
    // Obtain the SparkSession
    val sparkSession = SparkSession.builder()
      .appName("Spark Mysql Example")
      .master(master)
      .getOrCreate()
    // Read the entire contents of the source table
    val jdbcDF_nfv = sparkSession.read.format("jdbc")
      .option("url", dburl)
      .option("dbtable", "node_file_viruses")
      .option("user", "root")
      .option("password", "lovelsl")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      .load()
    // In practice, use only one of the following two calls.
    // Full historical rollup of the MySQL data -- run once:
    new sqlnodefileviruses().getAllInfo(jdbcDF_nfv, sparkSession)
    // Rollup of the previous day's data -- run on a daily schedule:
    //new sqlnodefileviruses().getYestodayInfo(jdbcDF_nfv, sparkSession)
  }
}
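For larger tables, the JDBC read above can be parallelized with Spark's built-in JDBC partitioning options. A sketch, where the id bounds and partition count are assumptions about the data:

    val jdbcDF_nfv = sparkSession.read.format("jdbc")
      .option("url", dburl)
      .option("dbtable", "node_file_viruses")
      .option("user", "root")
      .option("password", "lovelsl")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      .option("partitionColumn", "id") // numeric column to split on
      .option("lowerBound", "1")       // assumed minimum id
      .option("upperBound", "1000000") // assumed maximum id
      .option("numPartitions", "8")    // parallel JDBC readers
      .load()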
sqlnodefileviruses.scala: implements the historical data analysis and the daily rollup of new data
package jmsql

import org.apache.spark.sql.{DataFrame, SparkSession}
import tools.jmdatets
import tools.data2mysql

class sqlnodefileviruses {
  // Earliest find_time in the table, used as the start of the historical scan
  def _getMinFindTime(jdbcDF_nfv: DataFrame, sparkSession: SparkSession): Int = {
    jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
    val df = sparkSession.sql("SELECT min(find_time) from node_file_viruses")
    val min_find_time = df.collect()(0).getInt(0)
    min_find_time
  }

  // Per-node event counts for one [start_ts, end_ts) day window
  def _getClientsInfoEvery(jdbcDF_nfv: DataFrame, sparkSession: SparkSession, start_ts: Long, end_ts: Long): Unit = {
    val df = sparkSession.sql(
      s"""
         |SELECT t.nid as nid, count(virus_name) as events_count, sum(t.total) as effect_count, FROM_UNIXTIME($start_ts) as reg_time FROM
         | (
         |   SELECT nid, virus_name, count(id) as total
         |   FROM node_file_viruses
         |   WHERE find_time >= $start_ts and find_time < $end_ts
         |   GROUP BY nid, virus_name ) AS t
         | GROUP BY nid
         |""".stripMargin)
    data2mysql.data2StatClteventsDays(df)
  }

  // Per-virus event counts for one [start_ts, end_ts) day window
  def _getVirusInfoEveryDay(jdbcDF_nfv: DataFrame, sparkSession: SparkSession, start_ts: Long, end_ts: Long): Unit = {
    val df = sparkSession.sql(
      s"""
         |SELECT virus_name, virus_type, FROM_UNIXTIME($start_ts) as reg_time, count(id) as effect_count FROM node_file_viruses
         |WHERE find_time >= $start_ts AND find_time < $end_ts
         |GROUP BY virus_name, virus_type
         |""".stripMargin)
    data2mysql.data2StatVireventsDays(df)
  }

  // Historical analysis: walk day by day from the earliest record up to now
  def getAllInfo(jdbcDF_nfv: DataFrame, sparkSession: SparkSession): Unit = {
    jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
    var start_dt = jmdatets.getDateTimeFromTimeStamp(_getMinFindTime(jdbcDF_nfv, sparkSession))
    val num = jmdatets.getDateRange2Now(start_dt)
    for (i <- 0 to num) {
      val start_ts = jmdatets.getSpecialDateTimeObjectTimeStamp(start_dt)
      val next_date = jmdatets.getNextDayDateTimeObject(start_dt)
      val end_ts = jmdatets.getSpecialDateTimeObjectTimeStamp(next_date)
      _getClientsInfoEvery(jdbcDF_nfv, sparkSession, start_ts, end_ts)
      _getVirusInfoEveryDay(jdbcDF_nfv, sparkSession, start_ts, end_ts)
      start_dt = jmdatets.getNextDayDateTimeObject(start_dt)
    }
  }

  // Run shortly after midnight to aggregate the previous day's data
  def getYestodayInfo(jdbcDF_nfv: DataFrame, sparkSession: SparkSession): Unit = {
    jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
    // Start-of-yesterday UTC timestamp
    val start_ts = jmdatets.getYestodayUtcTimeStamp()
    // Start-of-today UTC timestamp
    val end_ts = jmdatets.getTodayUtcTimeStamp()
    _getClientsInfoEvery(jdbcDF_nfv, sparkSession, start_ts, end_ts)
    _getVirusInfoEveryDay(jdbcDF_nfv, sparkSession, start_ts, end_ts)
  }
}
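Since getYestodayInfo is meant to run once per day, a hypothetical crontab entry submitting the job shortly after midnight UTC might look like the following (the Spark install path and jar path are placeholders; the --master URL mirrors the one used later in this post):

5 0 * * * /opt/spark/bin/spark-submit --class jmspark --master spark://192.168.6.190:7077 /path/to/jmspark.jar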
data2mysql.scala: the interface for writing analysis results to the database; the target tables must be created in advance (a DDL sketch follows the listing)
package tools

import org.apache.spark.sql.{DataFrame, SaveMode}

object data2mysql {
  def data2StatClteventsDays(jdbcDF: DataFrame): Unit = {
    jdbcDF.write
      .format("jdbc")
      .option("url", "jdbc:mysql://192.168.10.56:33060/testdata")
      .option("dbtable", "stat_cltevents_days")
      .option("user", "root")
      .option("password", "lovelsl")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      .mode(SaveMode.Append)
      .save()
  }

  def data2StatVireventsDays(jdbcDF: DataFrame): Unit = {
    jdbcDF.write
      .format("jdbc")
      .option("url", "jdbc:mysql://192.168.10.56:33060/testdata")
      .option("dbtable", "stat_virevents_days")
      .option("user", "root")
      .option("password", "lovelsl")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      .mode(SaveMode.Append)
      .save()
  }
}
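The column names of the two target tables follow from the SELECT clauses above; the column types are assumptions (the counts as BIGINT, reg_time as DATETIME from FROM_UNIXTIME, nid as decimal(20) matching the hint used in tomysql.scala below). A minimal sketch creating them over plain JDBC:

import java.sql.DriverManager

object CreateStatTables {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection(
      "jdbc:mysql://192.168.10.56:33060/testdata", "root", "lovelsl")
    val stmt = conn.createStatement()
    // Per-node daily rollup target (types are assumptions)
    stmt.executeUpdate(
      """CREATE TABLE IF NOT EXISTS stat_cltevents_days (
        |  nid DECIMAL(20),
        |  events_count BIGINT,
        |  effect_count BIGINT,
        |  reg_time DATETIME)""".stripMargin)
    // Per-virus daily rollup target (types are assumptions)
    stmt.executeUpdate(
      """CREATE TABLE IF NOT EXISTS stat_virevents_days (
        |  virus_name VARCHAR(255),
        |  virus_type INT,
        |  reg_time DATETIME,
        |  effect_count BIGINT)""".stripMargin)
    conn.close()
  }
}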
serverproperties.scala: the interface for reading the configuration file (an example server.properties follows the listing)
package tools

import java.io.FileInputStream
import java.util.Properties

class serverproperties {
  val properties = new Properties()
  val path = Thread.currentThread().getContextClassLoader.getResource("server.properties").getPath
  properties.load(new FileInputStream(path))

  def getSparkMaster(): String = {
    properties.getProperty("sparkmaster")
  }

  def getMysqlServer(): String = {
    properties.getProperty("mysqlserver")
  }
}
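For reference, server.properties is assumed to contain just the two keys read above; the values below are placeholders matching the master URL and JDBC URL used elsewhere in this post:

sparkmaster=spark://192.168.6.190:7077
mysqlserver=jdbc:mysql://192.168.10.56:33060/testdata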
jmdatets.scala: date/time helper functions
package tools

import org.joda.time.{DateTime, DateTimeZone, Days}

object jmdatets {
  // Timestamp (seconds) of the start of the day n days ago, UTC
  def getNdaysBeforeUtcTimeStamp(n: Int): Long = {
    val nowdt = new DateTime(DateTimeZone.UTC)
    nowdt.minusDays(n).withTimeAtStartOfDay().getMillis / 1000
  }

  // Timestamp (seconds) of the start of yesterday, UTC
  def getYestodayUtcTimeStamp(): Long = {
    val nowdt = new DateTime(DateTimeZone.UTC)
    nowdt.minusDays(1).withTimeAtStartOfDay().getMillis / 1000
  }

  // Timestamp (seconds) of the start of tomorrow, UTC
  def getTomorrowUtcTimeStamp(): Long = {
    val nowdt = new DateTime(DateTimeZone.UTC)
    nowdt.plusDays(1).withTimeAtStartOfDay().getMillis / 1000
  }

  // Timestamp (seconds) of the start of today, UTC
  def getTodayUtcTimeStamp(): Long = {
    val nowdt = new DateTime(DateTimeZone.UTC)
    nowdt.withTimeAtStartOfDay().getMillis / 1000
  }

  // Timestamp (seconds) of an arbitrary UTC date and time
  def getSpecialDateTimeStamp(year: Int, mon: Int, day: Int, hour: Int, min: Int, sec: Int): Long = {
    val specialdt = new DateTime(year, mon, day, hour, min, sec, DateTimeZone.UTC)
    specialdt.getMillis / 1000
  }

  def getSpecialDateTimeObject(year: Int, mon: Int, day: Int, hour: Int, min: Int, sec: Int): DateTime = {
    new DateTime(year, mon, day, hour, min, sec, DateTimeZone.UTC)
  }

  // DateTime at the start of the day after dt (UTC)
  def getNextDayDateTimeObject(dt: DateTime): DateTime = {
    dt.plusDays(1).withTimeAtStartOfDay()
  }

  // Timestamp (seconds) of a given DateTime
  def getSpecialDateTimeObjectTimeStamp(dt: DateTime): Long = {
    dt.getMillis / 1000
  }

  // Number of whole days between a given date and now
  def getDateRange2Now(start: DateTime): Int = {
    val nowdt = new DateTime(DateTimeZone.UTC)
    Days.daysBetween(start, nowdt).getDays()
  }

  // Convert a UTC timestamp (seconds) to the DateTime at the start of that day
  def getDateTimeFromTimeStamp(ts: Long): DateTime = {
    new DateTime(ts * 1000, DateTimeZone.UTC).withTimeAtStartOfDay()
  }
}
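A quick usage example of the two helpers that getYestodayInfo relies on (timestamps are in seconds, aligned to UTC midnight):

val start_ts = jmdatets.getYestodayUtcTimeStamp() // start of yesterday, UTC
val end_ts   = jmdatets.getTodayUtcTimeStamp()    // start of today, UTC
// rows with find_time in [start_ts, end_ts) belong to yesterday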
Spark SQL DataFrame-mode example
main.scala:
import org.apache.spark.sql.SparkSession
import my.spark.mysql.ToMysql

// bin/spark-submit --class sparksql --master spark://192.168.6.190:7077 myjar/spark_mysql_2.11.jar
object sparksql {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder()
      .appName("Spark Mysql Example")
      .master("spark://192.168.6.190:7077")
      .getOrCreate()
    val jdbcDF = sparkSession.read.format("jdbc")
      .option("url", "jdbc:mysql://192.168.10.56:33060/testdata")
      .option("dbtable", "node_file_viruses")
      .option("user", "root")
      .option("password", "lovelsl")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      .load()

    // SELECT nid, count(nid) FROM node_file_viruses GROUP BY nid
    println("SELECT nid, count(nid) FROM node_file_viruses GROUP BY nid")
    jdbcDF.groupBy("nid").count().show()
    val df1 = jdbcDF.groupBy("nid").count()
    new ToMysql().write2Database(df1)

    // SELECT nid, virus_name, count(*) FROM node_file_viruses GROUP BY nid, virus_name
    println("SELECT nid, virus_name, count(*) FROM node_file_viruses GROUP BY nid, virus_name")
    jdbcDF.groupBy("nid", "virus_name").count().show()

    // Selecting extra columns first; the groupBy keeps only nid, virus_name and the count
    println("SELECT nid, virus_name, count(*) FROM node_file_viruses GROUP BY nid, virus_name")
    jdbcDF.select(jdbcDF("nid"), jdbcDF("virus_name"),
        jdbcDF("virus_op"), jdbcDF("virus_type"),
        jdbcDF("virus_findby"))
      .groupBy("nid", "virus_name")
      .count()
      .show()

    // SELECT nid, virus_name, virus_op, virus_findby FROM node_file_viruses WHERE virus_type=1 AND virus_op=19
    println("SELECT nid, virus_name, virus_op, virus_findby FROM node_file_viruses WHERE virus_type=1 AND virus_op=19")
    jdbcDF.select(jdbcDF("nid"), jdbcDF("virus_name"),
        jdbcDF("virus_op"), jdbcDF("virus_findby"))
      .where("virus_type=1 and virus_op=19")
      .show()

    // SELECT * FROM node_file_viruses ORDER BY id ASC LIMIT 10
    println("SELECT * FROM node_file_viruses ORDER BY id ASC LIMIT 10")
    jdbcDF.orderBy(jdbcDF("id").asc).limit(10).show()
  }
}
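For comparison, the per-node rollup from the view-mode example (_getClientsInfoEvery) can also be expressed in this DataFrame style. A sketch, assuming the jdbcDF from the listing above and a precomputed [start_ts, end_ts) day window:

import org.apache.spark.sql.functions.{count, sum}

// Inner query: per (nid, virus_name) counts within the day window
val perVirus = jdbcDF
  .where(s"find_time >= $start_ts and find_time < $end_ts")
  .groupBy("nid", "virus_name")
  .agg(count("id").as("total"))
// Outer query: per-nid event and effect counts
val perNode = perVirus
  .groupBy("nid")
  .agg(count("virus_name").as("events_count"), sum("total").as("effect_count"))
perNode.show()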
tomysql.scala:
package my.spark.mysql

import org.apache.spark.sql.{DataFrame, SaveMode}

class ToMysql {
  def write2Database(jdbcDF: DataFrame): Unit = {
    jdbcDF.write
      .format("jdbc")
      .option("url", "jdbc:mysql://192.168.10.56:33060/testdata")
      .option("dbtable", "node_file_viruses_test")
      .option("user", "root")
      .option("password", "lovelsl")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      // column types used if Spark has to create the table
      .option("createTableColumnTypes", "nid decimal(20), count bigint")
      // SaveMode.Append or SaveMode.Overwrite
      .mode(SaveMode.Append)
      .save()
  }
}
