Spark Learning, Day 1: Word Count Demo
Dependencies:
<properties>
    <scala.version>2.11.12</scala.version>
    <spark.version>2.3.0</spark.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
</dependencies>
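Note that the spark-core artifact suffix (_2.11) must match the Scala binary version declared above. For sbt users, an equivalent build definition might look like this (a minimal sketch, assuming the same versions as the POM above):

// build.sbt -- a sketch assuming the same Scala/Spark versions as the Maven POM.
// The %% operator appends the Scala binary suffix (_2.11) automatically.
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.0"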
Code:
package com.cslc

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Created by liuzhimin on 2019/5/28.
 */
object word_count {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("word count first scala")
    val sc = new SparkContext(conf)
    // Read the input file from HDFS; each element of the RDD is one line of text.
    val lines = sc.textFile("hdfs://cslcdip/user/dip/word.txt")
    // Split each line into words, pair each word with 1, sum the counts per word,
    // then bring the results back to the driver and print them.
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).collect().foreach(println)
    sc.stop()
  }
}
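To see what each stage of the pipeline produces, here is a local-mode variant that traces it on a tiny in-memory input (a sketch; the sample lines and the WordCountLocal name are made up for illustration):

import org.apache.spark.{SparkConf, SparkContext}

// A minimal local-mode sketch showing what each stage of the pipeline yields.
// The input lines below are invented purely for illustration.
object WordCountLocal {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("wc-local").setMaster("local[*]"))
    val lines  = sc.parallelize(Seq("a b a", "b c"))
    val words  = lines.flatMap(_.split(" "))   // "a", "b", "a", "b", "c"
    val pairs  = words.map((_, 1))             // ("a",1), ("b",1), ("a",1), ("b",1), ("c",1)
    val counts = pairs.reduceByKey(_ + _)      // ("a",2), ("b",2), ("c",1)
    counts.collect().foreach(println)
    sc.stop()
  }
}

Running with setMaster("local[*]") avoids needing a cluster; when submitting to a real cluster with spark-submit, the master is normally supplied on the command line instead of in code.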
Key functions:
Method 1 (groupBy + sum):
a.flatMap(x => x.split(" ")).map(x => (x, 1)).groupBy(_._1).map(x => (x._1, x._2.map(_._2).sum))
Method 2 (groupBy + reduce):
a.flatMap(x => x.split(" ")).map(x => (x, 1)).groupBy(_._1).map(x => (x._1, x._2.map(_._2).reduce(_ + _)))
Method 3 (reduceByKey):
a.flatMap(x => x.split(" ")).map(x => (x, 1)).reduceByKey(_ + _).collect()
Method 3 is the idiomatic choice on RDDs: reduceByKey combines values on each partition before the shuffle, whereas groupBy first ships every (word, 1) pair across the network.
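A fourth variant worth knowing is countByValue, which collects the counts straight to the driver as a Map (a sketch; a here is the same lines RDD as in the methods above):

// Counts every distinct word and returns the result to the driver as a Map[String, Long].
// Suitable only when the set of distinct words fits in driver memory.
val counts: scala.collection.Map[String, Long] =
  a.flatMap(x => x.split(" ")).countByValue()
counts.foreach(println)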
