SparkSQL_03

pom.xml dependencies

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.1.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.1.1</version>
</dependency>

1.RDD

  1. The biggest advantage of RDDs is simplicity: the API is very intuitive.
  2. The drawback of RDDs is performance: an RDD is an in-memory JVM object, and Java serialization is expensive.
  3. RDDs are type-safe at compile time, so type errors are caught before the job runs.

2.DataFrame

  1. A DataFrame is a distributed dataset built on top of RDDs, similar to a two-dimensional table in a traditional database.
  2. The DataFrame API is friendlier than the functional RDD API.
  3. The drawback of DataFrames is the lack of compile-time type checking, so type errors only show up at runtime.

3.DataSet

  1. A user-friendly API style that offers both compile-time type checking and the query optimization of DataFrames;
  2. Dataset is an extension of the DataFrame API and is Spark SQL's newest data abstraction;
  3. A DataFrame is a special case of Dataset: DataFrame = Dataset[Row];
  4. Datasets are strongly typed, e.g. Dataset[Car]; see the sketch below.
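
A minimal sketch of points 3 and 4, using a hypothetical Car case class (not part of the original examples): a DataFrame is just a Dataset[Row], and only the typed Dataset side is checked by the compiler.

import org.apache.spark.sql.SparkSession

case class Car(brand: String, price: Long)

object TypedDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("typedDemo").master("local[*]").getOrCreate()
    import spark.implicits._
    //a strongly typed Dataset[Car]
    val ds = Seq(Car("BYD", 100000L), Car("BMW", 300000L)).toDS()
    //DataFrame is an alias for Dataset[Row]
    val df = ds.toDF()
    //going back requires naming the type: DataFrame => Dataset[Car]
    val ds2 = df.as[Car]
    ds2.show()
    spark.stop()
  }
}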

4. Commonalities and Differences among RDD, DataFrame, and Dataset

Commonalities:

  1. All three are distributed, resilient datasets on the Spark platform;
  2. All three are lazily evaluated;
  3. They share many common operations;
  4. All three cache intermediate results automatically according to Spark's available memory.

Differences:

  1. RDDs are generally used together with Spark MLlib.
  2. RDDs do not support Spark SQL operations.
  3. Unlike RDDs and Datasets, every row of a DataFrame has the fixed type Row.
  4. Both DataFrames and Datasets support Spark SQL operations.
  5. Datasets and DataFrames share exactly the same member functions; the only difference is the type of each row (see the sketch below).
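
A minimal sketch of difference 3, using a hypothetical Person case class over the same (name, age) shape as people.txt: a DataFrame pulls fields out of a generic Row by name at runtime, while a Dataset accesses fields through the case class and is checked at compile time.

import org.apache.spark.sql.SparkSession

case class Person(name: String, age: Int)

object RowVsTyped {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("rowVsTyped").master("local[*]").getOrCreate()
    import spark.implicits._
    val ds = Seq(Person("hello", 18), Person("hello", 20)).toDS()
    val df = ds.toDF()
    //DataFrame: each row is a Row, and the "age" field is looked up by name at runtime
    df.map(row => row.getAs[Int]("age") + 1).show()
    //Dataset: each row is a Person, and p.age is checked by the compiler
    ds.map(p => p.age + 1).show()
    spark.stop()
  }
}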

5. Converting among the Three

people.json

{"name":"M","age":20}
{"name":"A","age":30}
{"name":"J","age":19}

people.txt

hello,18
hello,20

5.1 Converting between DataFrame and RDD

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object demo3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo1").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    println("----DataFrame转换成RDD----")
    //创建DataFrame
    val df01 = spark.read.json("file:///F:/wc/people.json")
    df01.show()
    df01.printSchema()
    println("----------------")
    //|-- age: long (nullable = true)
    //|-- name: string (nullable = true)
    //DataFrame to RDD: each element is of type Row
    val rdd02 = df01.rdd
    rdd02.foreach(println(_))
    println("----------------")
    val rdd03 = rdd02.map {
      row =>
        val name = row.getAs[String]("name")
        val age = row.getAs[Long]("age")
        (name, age)
    }
    rdd03.foreach(println(_))
    println("----------------")
    println("----RDD转换成DataFrame----")
    val baseRdd = sc.textFile("file:///F:/wc/people.txt").map {
      line =>
        val word = line.split(",")
        (word(0), word(1).trim.toInt)
    }
    baseRdd.foreach(println(_))
    println("----------------")
    //RDD to DataFrame
    val df02 = baseRdd.toDF("name", "age")
    df02.show()
    df02.printSchema()
  }
}

5.2 Converting between Dataset and RDD

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

case class People(name: String, age: Int)

object demo3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo2").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    //create an RDD where each element is a tuple
    val rdd01 = sc.textFile("file:///F:/wc/people.txt").map {
      line =>
        val word = line.split(",")
        (word(0), word(1).trim.toInt)
    }
    //RDD to Dataset: requires a case class; each element becomes an instance of People
    val ds01 = rdd01.map(line => People(line._1, line._2)).toDS()
    ds01.show()
    ds01.printSchema()
    //Dataset to RDD
    val rdd02 = ds01.rdd
    rdd02.foreach(println(_))
    rdd02.map {
      people =>
        (people.name, people.age)
    }.foreach(println(_))
  }
}

5.3 Converting between Dataset and DataFrame

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

//age is declared as Long because spark.read.json infers JSON integer fields as LongType
case class Peopl(name: String, age: Long)

object demo3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo1").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    val df01 = spark.read.json("file:///F:/wc/people.json")
    df01.show()
    df01.printSchema()
    //DataFrame to Dataset
    val ds01 = df01.as[Peopl]
    ds01.show()
    ds01.foreach(println(_))
    println("--------------")
    //Dataset to DataFrame
    val df02 = ds01.toDF()
    df02.show()
    df02.foreach(println(_))
  }
}

6. Spark SQL Exercises

File stu.txt

张三	88	98
李四	86	100
王五	68	100
赵六	88	100
田七	90	78

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object demo5 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo5").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    val rdd01 = sc.textFile("file:///F:/wc/stu.txt").map {
      line =>
        val word = line.split("\t")
        (word(0), word(1).toInt, word(2).toInt)
    }
    val df02 = rdd01.toDF("name", "grade_1", "grade_2")
    df02.show()
    df02.printSchema()
    df02.createOrReplaceTempView("t_stu")
    spark.sql("select * from t_stu order by grade_1 desc limit 3").show()
    spark.sql("select * from t_stu order by grade_1 desc ,grade_2 desc ").show()
  }
}

File info.json

{"name":"张三"}
{"name":"李四"}
{"name":"王五","age":49}
{"name":"赵六","age":20}
{"name":"朱七","age":13}

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object demo6 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo4").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //create a DataFrame
    val df01 = spark.read.json("file:///F:/wc/info.json")
    df01.show()
    //    3. Query the rows whose age is greater than or equal to 20
    //    4. Query the rows whose age is not null
    df01.createOrReplaceTempView("info")
    spark.sql("select * from info where age >=20 ").show()
    println("-----------------")
    spark.sql("select * from info where age is not null").show()
  }
}

6.1 Student Score Statistics

scores.txt

上海 1703A 95 90 90 99 98 
上海 1703B 85 80 70 99 88 
上地 1703C 75 60 50 99 78 
上海 1703D 96 70 80 99 68 
上地 1704F 94 95 80 99 90 
上海 1704E 95 96 90 99 92 
上地 1704K 91 97 60 99 93 
上海 1704G 89 70 70 99 94 
北京 1601A 79 80 70 99 88 
北京 1602A 69 60 70 99 78 
北京 1603A 59 80 90 99 68 
北京 1604A 89 95 80 99 98 
北京 1701A 99 95 90 99 78 
北京 1702A 99 98 60 99 88

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

case class Scores(xiaoqu: String, grade: String, ccl: Int, wwjl: Int, wycl: Int, cql: Int, pjf: Int, allscores: Int)

object demo7 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo5").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    val rdd01 = sc.textFile("file:///F:/wc/scores.txt").map {
      line =>
        val word = line.split(" ")
        // campus, class, success rate, non-violation rate, non-anomaly rate, attendance rate, average score
        (word(0), word(1), word(2).toInt, word(3).toInt, word(4).toInt, word(5).toInt, word(6).toInt)

    }
    rdd01.foreach(println(_))
    //Scores(xiaoqu: String, grade: String, ccl: Int, wwjl: Int, wycl: Int, cql: Int, pjf: Int, allscores: Int)
    val ds01 = rdd01.map(line => Scores(line._1, line._2, line._3, line._4, line._5, line._6, line._7,
      line._3 + line._4 + line._5 + line._6 + line._7)).toDS()
    ds01.show()
    ds01.createOrReplaceTempView("scores")

    //(3) Print the campus, class name, and overall score of the top 5 classes by overall score (10 pts)
    spark.sql("select xiaoqu,grade,allscores from scores order by allscores desc limit 5").show()
    println("3--------------")

    //(4) Top 5 classes by overall score within each campus
    spark.sql("select * from " +
      "(select *," +
      "rank() over(partition by xiaoqu sort by allscores) as level " +
      "from scores) as t " +
      "where t.level<=5")
      .show()
    println("4--------------")

    //(5) Classes whose success rate is greater than 60 and whose non-violation rate is greater than 70 (10 pts)
    spark.sql("select * from scores where ccl>60 and wwjl>70").show()
    println("5--------------")

    //(6) Sort by success rate in descending order; if success rates are equal, sort by non-anomaly rate in ascending order (10 pts)
    spark.sql("select * from scores order by ccl desc ,wycl asc").show()
    println("6--------------")

    //(7) Compute the average success rate of the classes in the 上地 and 上海 campuses respectively (10 pts)
    spark.sql("select xiaoqu, round(avg(ccl),2) from scores where xiaoqu in ('上地','上海') group by xiaoqu").show()
  }
}

6.2 Provincial Data Statistics

water.txt

北京市	500	900	300	300
天津市	500	500	600	600
上海市	800	1200	500	500
重庆市	200	300	800	800
河北省	800	1700	200	200
山西省	900	200	800	1200
辽宁省	500	1800	900	300
吉林省	1200	1900	500	1700
黑龙江省	300	900	1200	200
江苏省	1700	700	300	1800
浙江省	200	300	1700	1900
安徽省	1800	300	200	900
福建省	1900	600	1800	700
江西省	900	500	1900	300
山东省	700	800	500	600
河南省	300	200	1200	500
湖北省	600	800	300	800
湖南省	500	900	1700	200
广东省	800	500	200	800
海南省	200	1200	1800	900
四川省	800	300	1900	300
贵州省	900	1700	900	600
云南省	500	200	700	500
陕西省	1200	1800	300	800
甘肃省	300	1900	600	200
青海省	1700	1800	700	800
台湾省	200	1900	300	900
内蒙古自治区	1800	500	600	500
广西壮族自治区	1900	1200	500	1200
西藏自治区	900	300	800	300
宁夏回族自治区	700	1700	200	1700
新疆维吾尔自治区	300	200	800	200
香港特别行政区	600	1800	900	1800
澳门特别行政区	800	1900	300	1900

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object demo8 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo8").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    //    province   Q1    Q2    Q3    Q4
    //    e.g. 北京市  500   900   300   300
    val rdd01 = sc.textFile("file:///F:/wc/water.txt").map {
      line =>
        val word = line.split("\t")
        (word(0), word(1).toInt, word(2).toInt, word(3).toInt, word(4).toInt,
          word(1).toInt + word(2).toInt + word(3).toInt + word(4).toInt)
    }
    //rdd01.mapPartitionsWithIndex((index,item)=>Iterator(index+"~"+item.mkString("|"))).foreach(println(_))

    //    1) Compute each province's full-year GDP total, rank the provinces by that total in descending order, and print them (20 pts)
    val rdd02 = rdd01.coalesce(1).sortBy(_._6, false)
    rdd02.foreach(println(_))
    //rdd02.mapPartitionsWithIndex((index,item)=>Iterator(index+"~"+item.mkString("|"))).foreach(println(_))
    println("1--------------")

    //    2) From the result above, take ranks 5 through 10 and print them in descending order of total GDP (20 pts)
    val arr = rdd02.take(10).takeRight(6)
    arr.foreach(println)
    println("2--------------")

    //    3) Sort with Spark: quarter 1 descending, then quarter 2 ascending, then quarter 3 descending
    //    a single composite sort key applies the secondary and tertiary orderings only on ties
    val rdd03 = rdd01.coalesce(1).sortBy(t => (-t._2, t._3, -t._4))
    rdd03.foreach(println(_))
    println("3--------------")

    //    4) Load the data, create a temporary view, and use Spark SQL to answer the following (30 pts)
    //    A) Provinces whose Q1 value is greater than 500 and whose Q2 value is less than 900
    //    B) Provinces whose Q3 value is greater than 600 or whose Q4 value is greater than 800
    //    C) For each Q2 group, the province with the largest Q3 value.
    //    D) Bucket the provinces by Q4 precipitation: 0-500, 500-1000, 1000-1500, 1500-2000, and above 2000; for each bucket, find the province with the highest full-year total.

    val df01 = rdd01.toDF("province", "first", "second", "third", "fourth", "all")
    df01.createOrReplaceTempView("gdp")

    spark.sql("select * from gdp where first > 500 and second <900").show()
    println("4a--------------")
    spark.sql("select * from gdp where third > 600 or fourth >800").show()
    println("4b--------------")
    spark.sql("select max(third) as t from gdp group by second").show()
    println("4c1--------------")
    spark.sql("select * from " +
      "(select * ,rank() over(partition by second order by third desc) as level  from gdp) as t" +
      " where t.level==1").show()
    println("4c2--------------")
    spark.sql("select * from " +
      "(select * ,rank() over(partition by g_group order by all desc) as level  from" +
      " (select *,(case" +
      " when fourth > 0 and fourth < 500 then 1" +
      " when fourth >= 500 and fourth < 1000 then 2" +
      " when fourth >=1000 and fourth < 1500 then 3" +
      " when fourth >= 1500 and fourth < 2000 then 4" +
      " else 5 end) g_group from gdp) as r" +
      ") as t where t.level==1").show()
    println("--------------")
  }
}

7. User-Defined Functions (UDF and UDAF)

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, DoubleType, LongType, StructField, StructType}

object Demo2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo1").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    val df01 = spark.read.json("file:///f:/wc/people.json")
    df01.createOrReplaceTempView("t_people")
    spark.sql("select * from t_people").show()

    //register a UDF
    spark.udf.register("addname", (name: String) => "hello word " + name)
    spark.udf.register("chu", (x: Int, y: Int) => {
      if (y == 0) {
        0
      } else {
        x / y
      }
    })
    spark.sql("select concat('hello',name), addname(name) ,age,chu(age,2) from t_people").show
    //register a user-defined aggregate function (UDAF)
    spark.udf.register("avgAge", new MyAverage)
    spark.sql("select avg(age),avgAge(age),sum(age),max(age) from  t_people").show()
  }
}

//computes the average age
class MyAverage extends UserDefinedAggregateFunction {

  // data type of the aggregate function's input
  // input: the age column
  override def inputSchema: StructType = {
    StructType(StructField("inputColumn", LongType) :: Nil)
  }

  // data types of the values in the aggregation buffer
  // the running sum of ages and the running count
  override def bufferSchema: StructType = {
    StructType(StructField("sum", LongType) :: StructField("count", LongType) :: Nil)
  }

  // data type of the return value
  override def dataType: DataType = DoubleType

  // whether the function always returns the same output for the same input
  // generally true: it marks that, for a given set of inputs, the UDAF always produces the same result
  override def deterministic: Boolean = true

  // initialize the buffer
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    //running sum of ages
    buffer(0) = 0L
    //running count of people
    buffer(1) = 0L
  }

  // fold a new input row into the buffer (within a single partition)
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) { //skip rows where the age is NULL
      //accumulate the sum of ages
      buffer(0) = buffer.getAs[Long](0) + input.getLong(0)
      //increment the count
      buffer(1) = buffer.getLong(1) + 1
    }
  }

  // merge buffers coming from different partitions/executors
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    //add the age sums
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
    //add the counts
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  //compute the final result: sum of ages / count
  override def evaluate(buffer: Row): Any = {
    buffer.getLong(0).toDouble / buffer.getLong(1)
  }
}

8. Window Functions

The over() clause defines a window: rows are partitioned by one column and ranked (or aggregated) over another column within each partition, which makes "top N per group" queries straightforward.
row_number() over (partition by XXX order by XXX)
rank(): ranking with gaps; when two rows tie for 2nd place, the next rank is 4.
dense_rank(): ranking without gaps; when two rows tie for 2nd place, the next rank is 3.
row_number(): consecutive numbering; rows with equal values still get different numbers. (A comparison sketch follows the example below.)
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

case class Student(name: String, grade: Int, score: Int)

object demoStu {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demoStu").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._
    val arr01 = Array(("a", 1, 88),
      ("b", 1, 78),
      ("c", 1, 95),
      ("d", 2, 74),
      ("e", 2, 92),
      ("f", 3, 99),
      ("g", 3, 99),
      ("h", 3, 45),
      ("i", 3, 53),
      ("j", 3, 78))
    val stuDf = sc.makeRDD(arr01).map {
      x =>
        Student(x._1, x._2, x._3)
    }.toDF()
    stuDf.createOrReplaceTempView("student")

    //班级第一名
    spark.sql(
      "select name , grade , score from" +
        "(select * ," +
        "rank() over (partition by grade order by score desc) as level " +
        "from student)  t1 " +
        "where t1.level=1  order by score desc").show()

    //每个班的前两名
    spark.sql(
      "select name , grade , score from" +
        "(select * ," +
        "rank() over (partition by grade order by score desc) as level " +
        "from student)  t1 " +
        "where t1.level <= 2 ").show()

    //班级最高分
    spark.sql("select name , t1.grade grade , score  from student t1," +
      "(select grade,max(score) as maxscore from student group by grade) t2" +
      " where t1.grade = t2.grade and t1.score=t2.maxscore").show()

    spark.sql(
      "select * ," +
        "rank() over (partition by grade order by score desc) level " +
        "from student")
      .show
  }
}
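
To see the difference among rank(), dense_rank(), and row_number() side by side, here is a minimal sketch meant to run inside the same main method, after the student view above has been registered. For class 3, where two students tie at 99, the three columns come out as 1,1,3 / 1,1,2 / 1,2,3.

    //compare the three ranking functions over the same window
    spark.sql(
      "select name, grade, score, " +
        "rank() over (partition by grade order by score desc) as rk, " +
        "dense_rank() over (partition by grade order by score desc) as drk, " +
        "row_number() over (partition by grade order by score desc) as rn " +
        "from student").show()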

9. Data Loading and Saving

val peopleDF = spark.read.format("json").load("examples/src/main/resources/people.json")

peopleDF.write.format("parquet").save("hdfs://hadoop102:9000/namesAndAges.parquet")

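A slightly fuller sketch of the generic load/save API, with hypothetical local paths: format() chooses the data source, option() passes source-specific settings, and mode() controls what happens when the output path already exists.

import org.apache.spark.sql.{SaveMode, SparkSession}

object LoadSaveSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("loadSave").master("local[*]").getOrCreate()

    //read a CSV file that has a header row (the path is hypothetical)
    val df = spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load("file:///F:/wc/stu.csv")

    //write it back out as JSON, overwriting any existing output
    df.write
      .mode(SaveMode.Overwrite)
      .format("json")
      .save("file:///F:/wc/stu_json")

    spark.stop()
  }
}
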
2021.11.24 14.05
