Spark SQL 练习

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types._
import org.apache.spark.{SparkConf, SparkContext, sql}
import org.apache.spark.SparkContext._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

 

object App {

  /** Silences Spark's verbose console logging so the example output below stays readable. */
  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }

  /** One order line: store id, sales channel, gross merchandise value, quantity.
    * gmv is Double (not BigDecimal) to match the tuple-built DataFrame below,
    * whose inferred schema prints `gmv: double`.
    */
  final case class OrderLine(store_id: Int, channel: String, gmv: Double, qty: Int)

  /** Spark SQL demo on the legacy SQLContext API: builds a small in-memory
    * order table, then runs a GROUP BY aggregation and window functions on it.
    */
  def main(args: Array[String]): Unit = {
    init()

    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Schema is inferred from the tuples; column names are supplied via toDF.
    val df = sqlContext.createDataFrame(List(
      //SNG
      (1094, "SNG", 50.00, 1), (1094, "SNG", 150.00, 3), (1094, "SNG", 250.00, 5),
      //WMDJ
      (1094, "WMDJ", 10.00, 1), (1094, "WMDJ", 20.00, 2), (1094, "WMDJ", 30.00, 3),
      (1094, "WMDJ", 40.00, 4), (1094, "WMDJ", 50.00, 5),
      //JDDJ
      (1094, "JDDJ", 100.00, 1), (1094, "JDDJ", 200.00, 20)
    )).toDF("store_id", "channel", "gmv", "qty")
    // store_id: Int, channel: String, gmv: Double, qty: Int

    // Register as a temp view so it can be queried by name, and cache it:
    // the DataFrame is scanned several times below.
    df.createOrReplaceTempView("cte_order")
    df.cache()
    df.printSchema()
    /*
root
 |-- store_id: integer (nullable = false)
 |-- channel: string (nullable = true)
 |-- gmv: double (nullable = false)
 |-- qty: integer (nullable = false)
    * */

    df.show()
    /*
+--------+-------+-----+---+
|store_id|channel|  gmv|qty|
+--------+-------+-----+---+
|    1094|    SNG| 50.0|  1|
|    1094|    SNG|150.0|  3|
|    1094|    SNG|250.0|  5|
|    1094|   WMDJ| 10.0|  1|
|    1094|   WMDJ| 20.0|  2|
|    1094|   WMDJ| 30.0|  3|
|    1094|   WMDJ| 40.0|  4|
|    1094|   WMDJ| 50.0|  5|
|    1094|   JDDJ|100.0|  1|
|    1094|   JDDJ|200.0| 20|
+--------+-------+-----+---+
    * */

    println(df.collect().toList)
    // List(
    //  [1094,SNG,50.0,1], [1094,SNG,150.0,3], [1094,SNG,250.0,5],
    //  [1094,WMDJ,10.0,1], [1094,WMDJ,20.0,2], [1094,WMDJ,30.0,3], [1094,WMDJ,40.0,4], [1094,WMDJ,50.0,5],
    //  [1094,JDDJ,100.0,1], [1094,JDDJ,200.0,20]
    // )

    // Per-channel totals. sum(qty) widens Int to Long; sum(gmv) stays Double.
    val query = sqlContext.sql(
      """
        |select
        | channel,
        | sum(qty) as sum_qty,
        | sum(gmv) as sales
        |from cte_order
        |group by channel
      """.stripMargin)
    query.printSchema()
    /*
root
 |-- channel: string (nullable = true)
 |-- sum_qty: long (nullable = true)
 |-- sales: double (nullable = true)
    * */
    query.show()
    /*
+-------+-------+-----+
|channel|sum_qty|sales|
+-------+-------+-----+
|   WMDJ|     15|150.0|
|   JDDJ|     21|300.0|
|    SNG|      9|450.0|
+-------+-------+-----+
    * */

    println(query.collect().toList)
    // List([WMDJ,15,150.0], [JDDJ,21,300.0], [SNG,9,450.0])

    // Window functions: a global row_number, a per-channel row_number, and
    // per-channel count/sum without collapsing rows the way GROUP BY does.
    // (Same sqlContext as `query` above; df.sqlContext would be equivalent.)
    val table = sqlContext.sql(
      """
        |select
        | channel,
        | row_number() over(order by gmv desc) as rowIndex,
        | row_number() over(partition by channel order by gmv desc) as par_rowIndex,
        | count(gmv) over(partition by channel) as counts,
        | sum(gmv) over(partition by channel) as sales
        |from cte_order
      """.stripMargin)
    table.printSchema()
    /*
root
 |-- channel: string (nullable = true)
 |-- rowIndex: integer (nullable = true)
 |-- par_rowIndex: integer (nullable = true)
 |-- counts: long (nullable = false)
 |-- sales: double (nullable = true)
    * */

    table.show()
    // NOTE: rows tied on gmv (e.g. the two 50.0 lines) may swap rowIndex
    // between runs — `order by gmv desc` alone does not break ties.
    /*
+-------+--------+------------+------+-----+
|channel|rowIndex|par_rowIndex|counts|sales|
+-------+--------+------------+------+-----+
|   JDDJ|       2|           1|     2|300.0|
|   JDDJ|       4|           2|     2|300.0|
|    SNG|       1|           1|     3|450.0|
|    SNG|       3|           2|     3|450.0|
|    SNG|       5|           3|     3|450.0|
|   WMDJ|       6|           1|     5|150.0|
|   WMDJ|       7|           2|     5|150.0|
|   WMDJ|       8|           3|     5|150.0|
|   WMDJ|       9|           4|     5|150.0|
|   WMDJ|      10|           5|     5|150.0|
+-------+--------+------------+------+-----+
    * */

    println(table.collect().toList)
    // List(
    //  [JDDJ,2,1,2,300.0], [JDDJ,4,2,2,300.0],
    //  [SNG,1,1,3,450.0], [SNG,3,2,3,450.0], [SNG,5,3,3,450.0],
    //  [WMDJ,6,1,5,150.0], [WMDJ,7,2,5,150.0], [WMDJ,8,3,5,150.0], [WMDJ,9,4,5,150.0], [WMDJ,10,5,5,150.0]
    // )

    // Release the cached DataFrame and shut Spark down cleanly.
    df.unpersist()
    sc.stop()
  }
}

 

object App {

  /** Silences Spark's verbose console logging so the example output below stays readable. */
  def init(): Unit = {
    //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache.spark").setLevel(org.apache.log4j.Level.ERROR)
  }

  /** One order line: store id, sales channel, gross merchandise value, quantity.
    * Field names double as DataFrame column names when a Seq of these is
    * passed to createDataFrame.
    */
  final case class OrderLine(store_id: Int, channel: String, gmv: Double, qty: Int)

  /** Same Spark SQL demo as the SQLContext version above, rewritten on the
    * modern SparkSession entry point with a typed case-class source.
    */
  def main(args: Array[String]): Unit = {
    init()

    val conf = new SparkConf().setMaster("local[*]").setAppName("spark-rdd-test")
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()

    // Case classes are constructed via their generated apply method;
    // `new` is redundant. Schema/column names come from OrderLine's fields.
    val df = sparkSession.createDataFrame(List(
      //SNG
      OrderLine(1094, "SNG", 50.00, 1),
      OrderLine(1094, "SNG", 150.00, 3),
      OrderLine(1094, "SNG", 250.00, 5),
      //WMDJ
      OrderLine(1094, "WMDJ", 10.00, 1),
      OrderLine(1094, "WMDJ", 20.00, 2),
      OrderLine(1094, "WMDJ", 30.00, 3),
      OrderLine(1094, "WMDJ", 40.00, 4),
      OrderLine(1094, "WMDJ", 50.00, 5),
      //JDDJ
      OrderLine(1094, "JDDJ", 100.00, 1),
      OrderLine(1094, "JDDJ", 200.00, 20)
    ))

    // Register as a temp view so it can be queried by name, and cache it:
    // the DataFrame is scanned several times below.
    df.createOrReplaceTempView("cte_order")
    df.cache()
    df.printSchema()
    /*
root
 |-- store_id: integer (nullable = false)
 |-- channel: string (nullable = true)
 |-- gmv: double (nullable = false)
 |-- qty: integer (nullable = false)
    * */

    df.show()
    /*
+--------+-------+-----+---+
|store_id|channel|  gmv|qty|
+--------+-------+-----+---+
|    1094|    SNG| 50.0|  1|
|    1094|    SNG|150.0|  3|
|    1094|    SNG|250.0|  5|
|    1094|   WMDJ| 10.0|  1|
|    1094|   WMDJ| 20.0|  2|
|    1094|   WMDJ| 30.0|  3|
|    1094|   WMDJ| 40.0|  4|
|    1094|   WMDJ| 50.0|  5|
|    1094|   JDDJ|100.0|  1|
|    1094|   JDDJ|200.0| 20|
+--------+-------+-----+---+
    * */

    println(df.collect().toList)
    // List(
    //  [1094,SNG,50.0,1], [1094,SNG,150.0,3], [1094,SNG,250.0,5],
    //  [1094,WMDJ,10.0,1], [1094,WMDJ,20.0,2], [1094,WMDJ,30.0,3], [1094,WMDJ,40.0,4], [1094,WMDJ,50.0,5],
    //  [1094,JDDJ,100.0,1], [1094,JDDJ,200.0,20]
    // )

    // Per-channel totals. sum(qty) widens Int to Long; sum(gmv) stays Double.
    val query = sparkSession.sql(
      """
        |select
        | channel,
        | sum(qty) as sum_qty,
        | sum(gmv) as sales
        |from cte_order
        |group by channel
      """.stripMargin)
    query.printSchema()
    /*
root
 |-- channel: string (nullable = true)
 |-- sum_qty: long (nullable = true)
 |-- sales: double (nullable = true)
    * */
    query.show()
    /*
+-------+-------+-----+
|channel|sum_qty|sales|
+-------+-------+-----+
|   WMDJ|     15|150.0|
|   JDDJ|     21|300.0|
|    SNG|      9|450.0|
+-------+-------+-----+
    * */

    println(query.collect().toList)
    // List([WMDJ,15,150.0], [JDDJ,21,300.0], [SNG,9,450.0])

    // Window functions: a global row_number, a per-channel row_number, and
    // per-channel count/sum without collapsing rows the way GROUP BY does.
    // Issued through sparkSession.sql for consistency with `query` above
    // (df.sqlContext.sql reaches the same session).
    val table = sparkSession.sql(
      """
        |select
        | channel,
        | row_number() over(order by gmv desc) as rowIndex,
        | row_number() over(partition by channel order by gmv desc) as par_rowIndex,
        | count(gmv) over(partition by channel) as counts,
        | sum(gmv) over(partition by channel) as sales
        |from cte_order
      """.stripMargin)
    table.printSchema()
    /*
root
 |-- channel: string (nullable = true)
 |-- rowIndex: integer (nullable = true)
 |-- par_rowIndex: integer (nullable = true)
 |-- counts: long (nullable = false)
 |-- sales: double (nullable = true)
    * */

    table.show()
    // NOTE: rows tied on gmv (e.g. the two 50.0 lines) may swap rowIndex
    // between runs — `order by gmv desc` alone does not break ties.
    /*
+-------+--------+------------+------+-----+
|channel|rowIndex|par_rowIndex|counts|sales|
+-------+--------+------------+------+-----+
|   JDDJ|       2|           1|     2|300.0|
|   JDDJ|       4|           2|     2|300.0|
|    SNG|       1|           1|     3|450.0|
|    SNG|       3|           2|     3|450.0|
|    SNG|       6|           3|     3|450.0|
|   WMDJ|       5|           1|     5|150.0|
|   WMDJ|       7|           2|     5|150.0|
|   WMDJ|       8|           3|     5|150.0|
|   WMDJ|       9|           4|     5|150.0|
|   WMDJ|      10|           5|     5|150.0|
+-------+--------+------------+------+-----+
    * */

    println(table.collect().toList)
    // List(
    //  [JDDJ,2,1,2,300.0], [JDDJ,4,2,2,300.0],
    //  [SNG,1,1,3,450.0], [SNG,3,2,3,450.0], [SNG,6,3,3,450.0],
    //  [WMDJ,5,1,5,150.0], [WMDJ,7,2,5,150.0], [WMDJ,8,3,5,150.0], [WMDJ,9,4,5,150.0], [WMDJ,10,5,5,150.0]
    // )

    // Release the cached DataFrame and shut Spark down cleanly.
    df.unpersist()
    sparkSession.stop()
  }
}

 

posted @ 2020-07-02 23:21  茗::流  阅读(552)  评论(0)    收藏  举报
如有雷同,纯属参考。如有侵犯你的版权,请联系我。