Two Ways to Use Spark SQL: Introductory Examples

 
 
Spark SQL is well suited to structured processing of offline data, and it provides SQL-style data access and queries.
Spark SQL can also be paired with Spark Streaming for real-time analysis: simply assemble the RDDs produced by Spark Streaming into DataFrames, as sketched below.
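A minimal sketch of that hand-off, assuming a socket text source and a hypothetical Event case class (neither appears in the examples below):

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

case class Event(word: String)

object streaming2dataframe {
    def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().appName("Streaming To DataFrame").master("local[2]").getOrCreate()
        import spark.implicits._

        val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
        val lines = ssc.socketTextStream("localhost", 9999)

        lines.foreachRDD { rdd =>
            //Assemble each micro-batch RDD into a DataFrame, then query it like a table
            val df = rdd.flatMap(_.split(" ")).map(w => Event(w)).toDF()
            df.createOrReplaceTempView("events")
            spark.sql("SELECT word, count(*) AS total FROM events GROUP BY word").show()
        }

        ssc.start()
        ssc.awaitTermination()
    }
}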
 
 
View mode: register the DataFrame as a temporary view, so the analysis is written as ordinary SQL statements.
DataFrame mode: call Spark's DataFrame API directly for MySQL-like analysis. A quick contrast of the two follows.
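The same aggregation in both styles, as a sketch (df stands in for any DataFrame with a nid column, sparkSession for the active SparkSession):

//View mode: register a temporary view, then express the query in SQL
df.createOrReplaceTempView("t")
sparkSession.sql("SELECT nid, count(*) AS total FROM t GROUP BY nid").show()

//DataFrame mode: the equivalent query through the DataFrame API
df.groupBy("nid").count().show()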
 
 
Spark view mode example
 
Main program entry point, jmspark.scala:
import jmsql.sqlnodefileviruses
import org.apache.spark.sql.SparkSession
import tools.serverproperties
 
object jmspark {
    def main(args: Array[String]): Unit = {
        //Read the configuration file
        val prop = new serverproperties
        val master = prop.getSparkMaster()
        val dburl = prop.getMysqlServer()

        //Obtain the SparkSession object
        val sparkSession = SparkSession.builder()
            .appName("Spark Mysql Example")
            .master(master)
            .getOrCreate()
 
        //Load the full contents of the source table
        val jdbcDF_nfv = sparkSession.read.format("jdbc")
            .option("url",dburl)
            .option("dbtable", "node_file_viruses")
            .option("user", "root")
            .option("password", "lovelsl")
            .option("driver", "com.mysql.cj.jdbc.Driver")
            .load()
 
        // In practice, call only one of the two methods below
        //Aggregate the historical MySQL data; this needs to run only once
        new sqlnodefileviruses().getAllInfo(jdbcDF_nfv, sparkSession)
 
        //Aggregate the current day's MySQL data; scheduled to run daily
        //new sqlnodefileviruses().getYestodayInfo(jdbcDF_nfv, sparkSession)
 
    }
 
}
 
 
sqlnodefileviruses.scala: provides historical data analysis and daily aggregation of new data
package jmsql
 
import org.apache.spark.sql.{DataFrame, SparkSession}
import tools.jmdatets
import tools.data2mysql
 
class sqlnodefileviruses {
 
    def _getMinFindTime(jdbcDF_nfv:DataFrame, sparkSession: SparkSession): Int ={
        jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
        val df = sparkSession.sql("SELECT min(find_time) from node_file_viruses")
        val min_find_time = df.collect()(0).getInt(0)
        //println(min_find_time)
        min_find_time
    }
 
    def _getClientsInfoEvery(jdbcDF_nfv:DataFrame, sparkSession: SparkSession, start_ts:Long, end_ts:Long): Unit ={
        //jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
        val df = sparkSession.sql(
            s"""
               |SELECT t.nid as nid, count(virus_name) as events_count, sum(t.total) as effect_count, FROM_UNIXTIME($start_ts) as reg_time FROM
               | (
               |     SELECT nid, virus_name, count(id) as total
               |     FROM node_file_viruses
               |     WHERE find_time >= $start_ts and find_time < $end_ts
               |     GROUP BY nid, virus_name ) AS t
               | GROUP BY nid
               |""".stripMargin)
        //df.show()
        data2mysql.data2StatClteventsDays(df)
    }
 
    def _getVirusInfoEveryDay(jdbcDF_nfv:DataFrame, sparkSession: SparkSession, start_ts:Long, end_ts:Long): Unit ={
        //jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
        val df = sparkSession.sql(
            s"""
              |SELECT virus_name, virus_type, FROM_UNIXTIME($start_ts) as reg_time, count(id) as effect_count FROM node_file_viruses
              |WHERE find_time >= $start_ts AND find_time < $end_ts
              |GROUP BY virus_name, virus_type
              |""".stripMargin)
 
        //df.show()
        data2mysql.data2StatVireventsDays(df)
    }
 
    //Historical data analysis
    def getAllInfo(jdbcDF_nfv:DataFrame, sparkSession: SparkSession): Unit ={
        jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
 
        var start_dt = jmdatets.getDateTimeFromTimeStamp(_getMinFindTime(jdbcDF_nfv, sparkSession))
        val num = jmdatets.getDateRange2Now(start_dt)
        for(i <- 0 to num){
            var start_ts = jmdatets.getSpecialDateTimeObjectTimeStamp(start_dt)
            var next_date = jmdatets.getNextDayDateTimeObject(start_dt)
            var end_ts = jmdatets.getSpecialDateTimeObjectTimeStamp(next_date)
 
            _getClientsInfoEvery(jdbcDF_nfv, sparkSession, start_ts, end_ts)
            _getVirusInfoEveryDay(jdbcDF_nfv, sparkSession, start_ts, end_ts)
 
            start_dt = jmdatets.getNextDayDateTimeObject(start_dt)
        }
    }
 
    //Run in the early hours of the next day to aggregate the previous day's data
    def getYestodayInfo(jdbcDF_nfv:DataFrame, sparkSession: SparkSession): Unit ={
        jdbcDF_nfv.createOrReplaceTempView("node_file_viruses")
 
        //Start-of-day UTC timestamp for yesterday
        val start_ts = jmdatets.getYestodayUtcTimeStamp()
        //Start-of-day UTC timestamp for today
        val end_ts = jmdatets.getTodayUtcTimeStamp()
 
        _getClientsInfoEvery(jdbcDF_nfv, sparkSession, start_ts, end_ts)
        _getVirusInfoEveryDay(jdbcDF_nfv, sparkSession, start_ts, end_ts)
 
    }
 
}
 
 
data2mysql.scala: provides the interface that stores the analysis results in the database; the target tables must be created in advance (example DDL follows the code)
package tools
 
import org.apache.spark.sql.{DataFrame, SaveMode}
 
object data2mysql {
 
    def data2StatClteventsDays(jdbcDF:DataFrame): Unit ={
        jdbcDF.write
            .format("jdbc")
            .option("url", "jdbc:mysql://192.168.10.56:33060/testdata")
            .option("dbtable", "stat_cltevents_days")
            .option("user", "root")
            .option("password", "lovelsl")
            .option("driver", "com.mysql.cj.jdbc.Driver")
            .mode(SaveMode.Append)
            .save()
    }
 
    def data2StatVireventsDays(jdbcDF:DataFrame): Unit ={
        jdbcDF.write
            .format("jdbc")
            .option("url", "jdbc:mysql://192.168.10.56:33060/testdata")
            .option("dbtable", "stat_virevents_days")
            .option("user", "root")
            .option("password", "lovelsl")
            .option("driver", "com.mysql.cj.jdbc.Driver")
            .mode(SaveMode.Append)
            .save()
    }
 
}
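Since nothing here creates the result tables, they must exist before the first write. A possible DDL, inferred from the column aliases in the two queries above (the column types are assumptions, not from the original post):

CREATE TABLE stat_cltevents_days (
    nid          BIGINT,
    events_count BIGINT,
    effect_count BIGINT,
    reg_time     DATETIME
);

CREATE TABLE stat_virevents_days (
    virus_name   VARCHAR(255),
    virus_type   INT,
    reg_time     DATETIME,
    effect_count BIGINT
);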
 
serverproperties.scala: provides the interface for reading the configuration file (an example file follows the code)
package tools
 
import java.io.FileInputStream
import java.util.Properties
 
class serverproperties {
 
    val properties = new Properties()
    val path = Thread.currentThread().getContextClassLoader.getResource("server.properties").getPath
    //println(path)
    properties.load(new FileInputStream(path))
    //println(properties.getProperty("ddd"))
 
    def getSparkMaster():String={
        properties.getProperty("sparkmaster")
    }
 
    def getMysqlServer():String={
        properties.getProperty("mysqlserver")
    }
 
}
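A matching server.properties on the classpath might look like this (the values are illustrative, reusing the addresses that appear elsewhere in this post):

sparkmaster=spark://192.168.6.190:7077
mysqlserver=jdbc:mysql://192.168.10.56:33060/testdata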
 
 
jmdatets.scala: provides the date/time helper functions (a simplification note follows the code)
package tools
 
import org.joda.time.{DateTime, DateTimeZone, Days}
import org.joda.time.format.DateTimeFormat

object jmdatets {
 
    //Start-of-day UTC timestamp of n days ago
    def getNdaysBeforeUtcTimeStamp(n:Int): Long ={
        val ndaybefore = new DateTime(DateTimeZone.UTC).minusDays(n)
        val start = "%s 00:00:00".format(ndaybefore.toString("yyyy-MM-dd"))
        val format = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
        DateTime.parse(start, format).getMillis/1000
    }
 
    //Start-of-day UTC timestamp for yesterday
    def getYestodayUtcTimeStamp(): Long={
        val yestoday = new DateTime(DateTimeZone.UTC).minusDays(1)
        val start = "%s 00:00:00".format(yestoday.toString("yyyy-MM-dd"))
        val format = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
        DateTime.parse(start, format).getMillis/1000
    }
 
    //Start-of-day UTC timestamp for tomorrow
    def getTomorrowUtcTimeStamp(): Long={
        val tomorrow = new DateTime(DateTimeZone.UTC).plusDays(1)
        val start = "%s 00:00:00".format(tomorrow.toString("yyyy-MM-dd"))
        val format = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
        DateTime.parse(start, format).getMillis/1000
    }
 
    //Start-of-day UTC timestamp for today
    def getTodayUtcTimeStamp(): Long ={
        val nowdt = new DateTime(DateTimeZone.UTC)
        val start = "%s 00:00:00".format(nowdt.toString("yyyy-MM-dd"))
        val format = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
        DateTime.parse(start, format).getMillis/1000
    }
 
    //UTC timestamp for the given date and time
    def getSpecialDateTimeStamp(year:Int, mon:Int, day:Int, hour:Int, min: Int, sec: Int): Long= {
        val specialdt = new DateTime(year, mon, day, hour, min, sec, DateTimeZone.UTC)
        specialdt.getMillis/1000
    }
 
    def getSpecialDateTimeObject(year:Int, mon:Int, day:Int, hour:Int, min: Int, sec: Int): DateTime = {
        new DateTime(year, mon, day, hour, min, sec, DateTimeZone.UTC)
    }
 
    //DateTime at the start of the next day (UTC)
    def getNextDayDateTimeObject(dt: DateTime): DateTime={
        val next = dt.plusDays(1)
        val start = "%s 00:00:00".format(next.toString("yyyy-MM-dd"))
        val format = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
        DateTime.parse(start, format)
    }
 
    //Timestamp (seconds) of the given DateTime
    def getSpecialDateTimeObjectTimeStamp(dt:DateTime): Long = {
        dt.getMillis/1000
    }
 
    //Number of whole days from the given date to now
    def getDateRange2Now(start:DateTime):Int = {
        var nowdt = new DateTime(DateTimeZone.UTC)
        Days.daysBetween(start, nowdt).getDays()
    }
 
    //Convert a UTC timestamp (seconds) to a DateTime at the start of that day
    def getDateTimeFromTimeStamp(ts:Long): DateTime ={
        val dt = new DateTime(ts*1000, DateTimeZone.UTC)
        val start = "%s 00:00:00".format(dt.toString("yyyy-MM-dd"))
        val format = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss")
        DateTime.parse(start, format)
    }
}
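Every helper above formats a DateTime to a string and parses it back just to zero out the time of day. Joda-Time (2.0+) offers withTimeAtStartOfDay() for exactly this, so an equivalent, simpler helper might look like the sketch below (startOfDayTimestamp is a hypothetical name, not from the original post):

    //Start-of-day timestamp in seconds, in the DateTime's own time zone
    def startOfDayTimestamp(dt: DateTime): Long =
        dt.withTimeAtStartOfDay().getMillis / 1000

    //e.g. yesterday's start-of-day UTC timestamp:
    //startOfDayTimestamp(new DateTime(DateTimeZone.UTC).minusDays(1))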
 
 
 
 
 
Spark SQL DataFrame mode
 
 
main.scala:
 
import org.apache.spark.sql.SparkSession
import my.spark.mysql.ToMysql

// bin/spark-submit  --class sparksql  --master  spark://192.168.6.190:7077  myjar/spark_mysql_2.11.jar
object sparksql {
 
    def main(args: Array[String]): Unit ={
        val sparkSession = SparkSession.builder()
            .appName("Spark Mysql Example")
            .config("master","spark://192.168.6.190:7077")
            .getOrCreate()
 
        val jdbcDF = sparkSession.read.format("jdbc")
            .option("url","jdbc:mysql://192.168.10.56:33060/testdata")
            .option("dbtable", "node_file_viruses")
            .option("user", "root")
            .option("password", "lovelsl")
            .option("driver", "com.mysql.cj.jdbc.Driver")
            .load()
 
        println("SELECT nid, count(nid)  from node_file_viruses group by nid")
        //SELECT nid, count(nid)  from node_file_viruses group by nid
        jdbcDF.groupBy("nid").count().show()
        val df1 =jdbcDF.groupBy("nid").count()
        new ToMysql().write2Database(df1)
 
        println("SELECT nid, virus_name from node_file_viruses group by nid, virus_name")
        //SELECT nid, virus_name from node_file_viruses group by nid, virus_name
        val df2 = jdbcDF.groupBy("nid", "virus_name").count().show()
 
        println("SELECT nid, virus_name, virus_op, virus_type, virus_findby from node_file_viruses group by nid, virus_name")
        //SELECT nid, virus_name, virus_op, virus_type, virus_findby from node_file_viruses group by nid, virus_name
        val df3 = jdbcDF.select(jdbcDF("nid"), jdbcDF("virus_name"),
            jdbcDF("virus_op"), jdbcDF("virus_type"),
            jdbcDF("virus_findby"))
            .groupBy("nid", "virus_name")
            .count()
            .show()
 
        println("SELECT nid, virus_name, virus_op, virus_findby from node_file_viruses where virus_type=1 and virus_op=19")
        //SELECT nid, virus_name, virus_op, virus_findby from node_file_viruses where virus_type=1 and virus_op=19
        val df4 = jdbcDF.select(jdbcDF("nid"), jdbcDF("virus_name"),
            jdbcDF("virus_op"),jdbcDF("virus_findby"))
            .where("virus_type=1 and virus_op=19")
            .show()
 
        println("SELECT * FROM node_file_viruses order by id asc limit 10")
        //SELECT * FROM node_file_viruses order by id asc limit 10
        val df5 = jdbcDF.orderBy(jdbcDF("id").asc).limit(10).show()
    }
 
}
 
 
tomysql.scala: writes a DataFrame back to MySQL
package my.spark.mysql
 
import org.apache.spark.sql.{DataFrame, SaveMode}
 
class ToMysql {
 
 
    def write2Database(jdbcDF:DataFrame): Unit ={
        jdbcDF.write
            .format("jdbc")
            .option("url", "jdbc:mysql://192.168.10.56:33060/testdata")
            .option("dbtable", "node_file_viruses_test")
            .option("user", "root")
            .option("password", "lovelsl")
            .option("driver", "com.mysql.cj.jdbc.Driver")
            //.option("createTableColumnTypes", "nid decimal(20), count bigint")
            .option("createTableColumnTypes", "nid decimal(20), count bigint")
            //SaveMode.Append SaveMode.Overwrite
            .mode(SaveMode.Append)
            .save()
    }
 
}
 
 
 