UDAF

package com.bjsxt.scala.spark.UDF_UDAF

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType

object UDAF {
  
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
        .setMaster("local") 
        .setAppName("UDAF")

    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val names = Array("yarn", "Marry", "Jack", "Tom", "Tom", "Tom")

    /**
      * How the aggregation runs in two phases, illustrated with the sample data:
      * map side   : each partition builds a partial result per key
      *              (e.g. one task sees "Tom" three times -> partial count 3,
      *               another sees "Jack" once -> partial count 1)
      * reduce side: partial results for the same key are merged into the final count
      */

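    // Spread the names across 5 partitions so the merge step actually runs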
    val namesRDD = sc.parallelize(names, 5) 
    
    val namesRowRDD = namesRDD.map { name => Row(name) }

    val structType = StructType(Array(StructField("name", StringType, true)))
    val namesDF = sqlContext.createDataFrame(namesRowRDD, structType) 
    
    namesDF.registerTempTable("names")

    // StringCount counts the number of records in each group
    sqlContext.udf.register("strCount", new StringCount)
    
    // Use the custom aggregate function in a SQL query
    sqlContext.sql("select name,strCount(name) from names group by name")
        .collect()
        .foreach(println)  
  }
}
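
The listing registers strCount with new StringCount, but the StringCount class itself is not shown. Below is a minimal sketch of what it could look like, using Spark 1.x's UserDefinedAggregateFunction API; the buffer layout and field names here are assumptions, while the overridden methods are the ones the API requires.

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType

// Sketch of the StringCount UDAF: counts the records in each group
class StringCount extends UserDefinedAggregateFunction {
  // Input schema: one string column (the argument passed to strCount(name))
  override def inputSchema: StructType =
    StructType(Array(StructField("str", StringType, true)))
  // Aggregation buffer: a single running count per group (layout assumed)
  override def bufferSchema: StructType =
    StructType(Array(StructField("count", IntegerType, true)))
  // The function returns an integer
  override def dataType: DataType = IntegerType
  // The same input always yields the same output
  override def deterministic: Boolean = true
  // Start each group's count at zero
  override def initialize(buffer: MutableAggregationBuffer): Unit = buffer(0) = 0
  // Map side: called once per input row within a partition
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    buffer(0) = buffer.getInt(0) + 1
  // Reduce side: merge partial counts produced by different partitions
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    buffer1(0) = buffer1.getInt(0) + buffer2.getInt(0)
  // Final per-group result
  override def evaluate(buffer: Row): Any = buffer.getInt(0)
}

With the sample data above ("Tom" appears three times), the query should print rows like [Tom,3] and [Jack,1]; row order is not guaranteed.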

  
