package com.shujia.spark.sql
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object Demo1SparkSession {

  /**
   * Demo of the SparkSession API: reads a JSON file of students, shows basic
   * DSL operations (select / filter / groupBy), runs an equivalent SQL query
   * against a temp view, and writes the aggregated result out as CSV.
   */
  def main(args: Array[String]): Unit = {
    /**
     * SparkSession: the unified entry point since Spark 2.0; it replaces
     * both SparkContext and SQLContext.
     */
    val spark: SparkSession = SparkSession
      .builder()
      .master("local")
      .appName("spark")
      // Number of partitions a DataFrame has after a shuffle;
      // defaults to 200 when running on a cluster.
      .config("spark.sql.shuffle.partitions", "1")
      .getOrCreate()

    // Import Spark's implicit conversions (enables $"col" syntax, encoders, ...).
    import spark.implicits._

    // Read JSON-formatted data (one JSON object per line).
    val studentDF: DataFrame = spark.read.json("data/students.json")

    // Preview the data.
    studentDF.show()

    // Print the inferred schema.
    studentDF.printSchema()

    // Column projection.
    studentDF.select("name", "age").show()

    // $ builds a Column object so we can compute on it; `as` gives an alias.
    studentDF.select($"name", $"age" + 1 as "age").show() // DSL (SQL-like) style

    // Row filtering.
    studentDF.filter($"age" > 23).show()

    // Group and count.
    studentDF.groupBy($"clazz").count().show()

    // Register a temporary view so we can query it with SQL.
    studentDF.createOrReplaceTempView("student")

    // Same aggregation expressed in SQL.
    val clazzNumDF: DataFrame = spark.sql("select clazz,count(1) from student group by clazz")
    clazzNumDF.show()

    /**
     * SQL logical execution order:
     * from --> join --> on --> where --> group by --> having --> select --> order by --> limit
     */

    // Persist the aggregated result, overwriting any previous output.
    clazzNumDF
      .write
      .mode(SaveMode.Overwrite)
      .csv("data/json")

    // Release the SparkContext and all associated resources; the original
    // demo leaked the session by never stopping it.
    spark.stop()
  }
}