package com.bjsxt.scala.spark.UDF_UDAF
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
object UDAF {

  /**
   * Demo entry point: registers a custom aggregate function (`StringCount`,
   * defined elsewhere in this package) with a `SQLContext` and runs it in a
   * `GROUP BY` query over a small in-memory name list.
   *
   * `StringCount` counts the number of records in each group, so the expected
   * output is one row per distinct name with its occurrence count
   * (e.g. Tom -> 3).
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local") // single local executor; demo only
      .setAppName("UDAF")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      val names = Array("yarn", "Marry", "Jack", "Tom", "Tom", "Tom")
      /*
       * Shuffle sketch for the aggregation:
       *   map task1 : "Tom", "Tom", "Tom"  -> ~Tom~Tom~Tom
       *   map task2 : ~Jack
       *
       *   reduce task1 : ~Tom~Tom~Tom
       *   reduce task2 : ~Jack
       */
      // 5 partitions so the aggregation actually exercises a shuffle.
      val namesRDD = sc.parallelize(names, 5)
      val namesRowRDD = namesRDD.map { name => Row(name) }
      // Single nullable string column "name".
      val structType = StructType(Array(StructField("name", StringType, true)))
      val namesDF = sqlContext.createDataFrame(namesRowRDD, structType)
      namesDF.registerTempTable("names")
      // StringCount counts the number of records in each group.
      sqlContext.udf.register("strCount", new StringCount)
      // Use the custom aggregate function in SQL.
      sqlContext.sql("select name,strCount(name) from names group by name")
        .collect()
        .foreach(println)
    } finally {
      // Always release the SparkContext, even if the job above fails.
      sc.stop()
    }
  }
}