Spark---DataFrame和Dataset

DataFrame与Dataset使用

1.代码

import org.apache.spark.sql.SparkSession
/**
 * Computes the average salary and average age per department and gender.
 *
 * Requirements:
 *   1. Only employees with age > 20 are counted.
 *   2. Results are grouped by department name and employee gender.
 *   3. For every (department, gender) group the average salary and the
 *      average age are reported.
 *
 * Both inputs are JSON files read from hard-coded local paths; the result is
 * printed to stdout.
 */
object DepartmentAvgSalaryAndAgeStat {

  /**
   * Builds a local SparkSession, runs the aggregation and prints the result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Create the SparkSession.
    val spark = SparkSession
      .builder()
      .appName("DepartmentAvgSalaryAndAgeStat")
      .master("local")
      .config("spark.sql.warehouse.dir", "C:\\Users\\Administrator\\Desktop\\spark-warehouse")
      .getOrCreate()

    try {
      // Implicit conversions of this session; among other things they provide
      // the $"column" string interpolator used in the join condition below.
      import spark.implicits._
      // Built-in Spark SQL functions (avg, ...).
      import org.apache.spark.sql.functions._

      // Load both data files as DataFrames (introduction to untyped operations).
      val employee = spark.read.json("C:\\Users\\Administrator\\Desktop\\employee.json")
      val department = spark.read.json("C:\\Users\\Administrator\\Desktop\\department.json")

      employee
        // Keep only employees older than 20.
        .filter("age > 20")
        // Join with department first so that we can aggregate by department name.
        // Note: in an untyped join, column equality is written with `===`.
        .join(department, $"depId" === $"id")
        // Group by department name and employee gender.
        .groupBy(department("name"), employee("gender"))
        // agg applies the aggregate functions; avg computes the mean.
        .agg(avg(employee("salary")), avg(employee("age")))
        // show() is the action: it triggers the computation and prints the result.
        .show() // DataFrame == Dataset[Row]

      // A DataFrame's element type is Row, hence it is "untyped" (weakly typed).
      // A Dataset's element type is usually a user-defined case class, hence
      // it is "typed" (strongly typed).
      //
      // Dataset development has a lot in common with RDD development: the API
      // is split into transformations (lazy) and actions (which trigger the
      // actual computation). Datasets can also be persisted.
    } finally {
      // Release the session's resources even if the job fails.
      spark.stop()
    }
  }
}

2.测试用例

---department.json
{"id": 1, "name": "Technical Department"}
{"id": 2, "name": "Financial Department"}
{"id": 3, "name": "HR Department"}
---employee.json

{"name": "Leo", "age": 25, "depId": 1, "gender": "male", "salary": 20000}
{"name": "Marry", "age": 30, "depId": 2, "gender": "female", "salary": 25000}
{"name": "Jack", "age": 35, "depId": 1, "gender": "male", "salary": 15000}
{"name": "Tom", "age": 42, "depId": 3, "gender": "male", "salary": 18000}
{"name": "Kattie", "age": 21, "depId": 3, "gender": "female", "salary": 21000}

Action的方法测试

1.代码

package cn.ibeifeng.spark
import org.apache.spark.sql.SparkSession
/**
 * Walk-through of the Dataset actions:
 *
 * collect, count, first, foreach, reduce, show, take
 *
 * The employee data is read from a hard-coded local JSON file and every
 * action's result is printed to stdout.
 */
object ActionOperation {

  /**
   * Builds a local SparkSession and runs each action once on the employee data.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("ActionOperation")
      .master("local")
      .config("spark.sql.warehouse.dir", "C:\\Users\\Administrator\\Desktop\\spark-warehouse")
      .getOrCreate()

    try {
      // Needed for the implicit Encoder[Int] used by map(...) below.
      import spark.implicits._

      val employee = spark.read.json("C:\\Users\\Administrator\\Desktop\\employee.json")

      // collect: fetch all rows of the distributed dataset to the driver.
      employee.collect().foreach { println(_) }
      // count: number of records in the dataset.
      println(employee.count())
      // first: the first record of the dataset.
      println(employee.first())
      // foreach: run an operation on every record. Unlike collect, which
      // brings the data to the driver, foreach pushes the work to the
      // executors. On a real cluster foreach(println(_)) is therefore of
      // little use: the output lands on the executors, not on the driver.
      employee.foreach { println(_) }
      // reduce: fold all records into one; used here to count the records.
      // The row itself is ignored, so the lambda parameter is left unnamed
      // instead of shadowing the outer `employee`.
      println(employee.map(_ => 1).reduce(_ + _))
      // show: prints the first 20 rows by default.
      employee.show()
      // take: fetch the given number of records.
      employee.take(3).foreach { println(_) }
    } finally {
      // Release the session's resources even if an action fails.
      spark.stop()
    }
  }
}

jar包上传集群运行要在maven添加这三个插件

<!-- Assembles an additional jar using the predefined "jar-with-dependencies"
     descriptor; the `single` goal is bound to the `package` phase. -->
<plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <!-- NOTE(review): mainClass is left empty here; presumably the entry
                   point is supplied at submit time instead. Confirm, or fill it in. -->
              <mainClass></mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

      <!-- exec-maven-plugin 1.2.1 with the `exec` goal, configured to launch `java`.
           NOTE(review): mainClass / includeProjectDependencies / includePluginDependencies
           look like parameters of the `exec:java` goal rather than `exec:exec`; confirm
           which goal is intended. -->
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>exec-maven-plugin</artifactId>
        <version>1.2.1</version>
        <executions>
          <execution>
            <goals>
              <goal>exec</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <executable>java</executable>
          <includeProjectDependencies>true</includeProjectDependencies>
          <includePluginDependencies>false</includePluginDependencies>
          <classpathScope>compile</classpathScope>
          <!-- NOTE(review): adjust to the project's real entry-point class. -->
          <mainClass>cn.spark.study.App</mainClass>
        </configuration>
      </plugin>

      <!-- Java source/target level used by the Java compiler.
           NOTE(review): 1.6 predates the Java versions Spark 2.x supports
           (Java 7+, Java 8+ from Spark 2.2); presumably this should be raised
           to match the cluster's JDK. Confirm. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>

在集群执行脚本

# Submit the assembled jar-with-dependencies to the Spark master.
# --class must name the object whose main() should run.
# NOTE(review): --num-executors is documented as a YARN option; with a
# spark:// (standalone) master it is presumably ignored. Confirm.
/usr/local/spark/bin/spark-submit \
--class cn.spark.SparkSQLDemo \
--master spark://spark2upgrade01:7077 \
--num-executors 1 \
--driver-memory 500m \
--executor-memory 500m \
--executor-cores 1 \
/usr/local/test_spark_app/spark2-upgrade-0.0.1-SNAPSHOT-jar-with-dependencies.jar

DataFrame和Dataset相互转化及持久化

import org.apache.spark.sql.SparkSession
/**
 * Basic Dataset operations.
 *
 * Persistence: cache, persist
 * Temporary views: createTempView, createOrReplaceTempView
 * Execution plan: explain
 * Schema inspection: printSchema
 * Writing to external storage: write
 * Dataset <-> DataFrame conversion: as, toDF
 *
 * Only the conversion part is active; the other operations are kept as
 * commented-out examples.
 */
object BasicOperation {

  // Target type for the DataFrame -> Dataset conversion below. It is declared
  // outside main on purpose: as[Employee] needs an implicit Encoder, which
  // Spark cannot derive for a case class defined locally inside a method.
  case class Employee(name: String, age: Long, depId: Long, gender: String, salary: Long)

  /**
   * Builds a local SparkSession, reads the employee JSON file and demonstrates
   * converting between DataFrame and Dataset[Employee].
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("BasicOperation")
      .master("local")
      .config("spark.sql.warehouse.dir", "C:\\Users\\Administrator\\Desktop\\spark-warehouse")
      .getOrCreate()

    try {
      import spark.implicits._

      val employee = spark.read.json("C:\\Users\\Administrator\\Desktop\\employee.json")

      // Persistence: if a dataset is going to be computed more than once,
      // persist it first to avoid recomputing it.
      // employee.cache()
      // println(employee.count())
      // println(employee.count())

      // Temporary view: lets us run SQL statements directly against the data.
      // employee.createOrReplaceTempView("employee")
      // spark.sql("select * from employee where age > 30").show()

      // Execution plan: a DataFrame/Dataset (e.g. the result of a SQL
      // statement) carries a logical plan internally. At execution time the
      // Catalyst optimizer turns it into a physical plan, applying
      // optimizations such as filter push-down, and whole-stage code
      // generation produces code automatically to improve performance.
      // spark.sql("select * from employee where age > 30").explain()

      // Show the source fields and the types Spark inferred for them.
      // employee.printSchema()

      // NOTE(review): the original author reports that this write fails when
      // run against the local path, while writing to HDFS works. Cause not
      // visible here; confirm in the target environment.
      // val employeeWithAgeGreaterThen30DF = spark.sql("select * from employee where age > 30")
      // employeeWithAgeGreaterThen30DF.write.json("C:\\Users\\Administrator\\Desktop\\employeeWithAgeGreaterThen30DF.json")

      // DataFrame -> Dataset[Employee] ...
      val employeeDS = employee.as[Employee]
      employeeDS.show()
      employeeDS.printSchema()
      // ... and back to an untyped DataFrame.
      val employeeDF = employeeDS.toDF()
    } finally {
      // Release the session's resources even if a step fails.
      spark.stop()
    }
  }
}

partition重新分区和去重

1.代码

import org.apache.spark.sql.SparkSession
/**
 * Typed Dataset operations: repartitioning (kept as commented-out examples)
 * and de-duplication with distinct / dropDuplicates.
 */
object TypedOperation {

  // Element type of the typed Dataset; declared outside main so that Spark
  // can derive an implicit Encoder for as[Employee].
  case class Employee(name: String, age: Long, depId: Long, gender: String, salary: Long)

  /**
   * Builds a local SparkSession, reads the employee JSON file as
   * Dataset[Employee] and prints the de-duplicated results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("TypedOperation")
      .master("local")
      .config("spark.sql.warehouse.dir", "C:\\Users\\Administrator\\Desktop\\spark-warehouse")
      .getOrCreate()

    try {
      import spark.implicits._

      val employee = spark.read.json("C:\\Users\\Administrator\\Desktop\\employee.json")
      val employeeDS = employee.as[Employee]

      // Inspect the default number of partitions.
      // println(employeeDS.rdd.partitions.size)

      // coalesce and repartition both redefine the partitioning.
      // coalesce can only reduce the number of partitions and can do so
      // without a shuffle; repartition can increase or decrease the number
      // and always shuffles, i.e. it performs a full re-partitioning.
      // val employeeDSRepartitioned = employeeDS.repartition(7)
      // Check the resulting number of partitions.
      // println(employeeDSRepartitioned.rdd.partitions.size)
      // val employeeDSCoalesced = employeeDSRepartitioned.coalesce(3)
      // println(employeeDSCoalesced.rdd.partitions.size)
      // employeeDSCoalesced.show()

      // distinct and dropDuplicates both remove duplicates:
      //   distinct compares the complete content of each record;
      //   dropDuplicates de-duplicates on the given columns only.
      val distinctEmployeeDS = employeeDS.distinct()
      distinctEmployeeDS.show()

      val dropDuplicatesEmployeeDS = employeeDS.dropDuplicates(Seq("name"))
      dropDuplicatesEmployeeDS.show()
    } finally {
      // Release the session's resources even if a step fails.
      spark.stop()
    }
  }
}

其他函数

   // except：获取在当前dataset中有，但是在另外一个dataset中没有的元素
    // filter：根据我们自己的逻辑，如果返回true，那么就保留该元素，否则就过滤掉该元素
    // intersect：获取两个数据集的交集 
//    employeeDS.except(employeeDS2).show()  
//    employeeDS.filter { employee => employee.age > 30 }.show() 
//    employeeDS.intersect(employeeDS2).show()

posted @ 2018-08-23 17:01 聚云阅读(806) 评论(0) 收藏举报

刷新页面返回顶部

spark---dateframe和dateset

公告