package com.shujia.spark.sql
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
/**
 * Word-count job intended for cluster submission via spark-submit.
 *
 * Reads tab-separated lines from HDFS, splits each line on commas into
 * words, counts occurrences per word, and writes the result back to HDFS.
 *
 * Submit from the directory containing the jar:
 *   spark-submit --class com.shujia.spark.sql.Demo5Submit \
 *     --master yarn --deploy-mode client spark-1.0.jar
 * (The old "yarn-client" master string is deprecated and was removed in
 * Spark 2.x; use --master yarn with --deploy-mode instead.)
 */
object Demo5Submit {
  def main(args: Array[String]): Unit = {
    // No .master(...) here on purpose: the master is supplied by
    // spark-submit so the same jar runs locally or on YARN.
    val spark: SparkSession = SparkSession
      .builder()
      .appName("submit")
      .getOrCreate()

    // Implicit conversions ($-string column syntax, Dataset encoders).
    import spark.implicits._
    // Built-in SQL functions (split, explode, count, ...).
    import org.apache.spark.sql.functions._

    // Read raw lines; each input row is a single STRING column "lines".
    val linesDF: DataFrame = spark
      .read
      .format("csv")
      .option("sep", "\t")
      .schema("lines STRING")
      .load("/data/words") // HDFS path

    // Split each line on commas, explode to one word per row, then count.
    val wordCountDF: DataFrame = linesDF
      .select(explode(split($"lines", ",")) as "word")
      .groupBy($"word")
      .agg(count($"word") as "c")

    // Persist the counts as tab-separated text, replacing any prior output.
    wordCountDF
      .write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("/data/wc")

    // Release the session (and underlying SparkContext) so the driver
    // exits cleanly instead of leaking the application on YARN.
    spark.stop()
  }
}