spark-wordcount
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-03 21:27
 */
public class WordCountLocal {
    public static void main(String[] args) {
        // Step 1: create a SparkConf object and set the application's configuration.
        // setMaster() sets the master the application connects to; "local" means run locally.
        SparkConf conf = new SparkConf().setAppName("WordCountLocal")
                .setMaster("local");

        // Step 2: create a JavaSparkContext object.
        /* In Spark, SparkContext is the entry point to all Spark functionality. Whether the program is
           written in Java, Scala, or Python, it must have a SparkContext. Its main job is to initialize
           the core components the application needs, including the schedulers (DAGScheduler,
           TaskScheduler), and to register with the Spark master, among other things. SparkContext is
           the most important object in Spark, and different kinds of Spark applications use different
           context types:
             plain Java       -> JavaSparkContext
             Spark SQL        -> SQLContext, HiveContext
             Spark Streaming  -> StreamingContext
             Scala            -> SparkContext
        */
        JavaSparkContext sc = new JavaSparkContext(conf);

        /* Step 3: create the initial RDD from the input source. The input data is split up and
           distributed across the RDD's partitions, forming a distributed dataset. The SparkContext
           method that creates an RDD from a local file is textFile(); in Java the plain RDD type is
           JavaRDD. An RDD consists of elements; for HDFS or local files, each element corresponds to
           one line of the file.
        */
        JavaRDD<String> lines = sc.textFile("E:\\spark\\spark.txt");

        /* Step 4: apply transformations to the initial RDD by creating functions and passing them to
           operators such as map and flatMap. For simple logic, an anonymous inner class of the required
           Function type is enough; for complex logic, define a class that implements the interface.
           First, split each line into words.
        */
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
            }
        });

        /**
         * Map each word to a tuple (word, 1). mapToPair is used together with PairFunction: the first
         * type parameter is the input, the second and third are the components of the tuple.
         */
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });

        /**
         * Then reduce over all the tuples, which sums the values for each key.
         */
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        /**
         * Finally, transformations alone do nothing; a Spark program needs an action to trigger
         * execution. foreach is one such action.
         */
        wordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> wordCount) throws Exception {
                System.out.println(wordCount._1 + " appeared " + wordCount._2 + " times");
            }
        });

        sc.close();
    }
}
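For comparison, the same pipeline can be written far more compactly with Java 8 lambdas instead of anonymous inner classes. This is only a sketch, not part of the original project: it assumes Java 8 and the Spark 1.x Java API used above (where FlatMapFunction returns an Iterable; on Spark 2.x+ it returns an Iterator, so Arrays.asList(...).iterator() would be needed), and the class name WordCountLambda is hypothetical.

package cn.spark.study.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

// Hypothetical lambda-based variant of WordCountLocal (illustration only).
public class WordCountLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("WordCountLambda").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // The same four steps as above, expressed with lambdas.
        JavaPairRDD<String, Integer> wordCounts = sc.textFile("E:\\spark\\spark.txt")
                .flatMap(line -> Arrays.asList(line.split(" ")))          // split each line into words
                .mapToPair(word -> new Tuple2<String, Integer>(word, 1))  // map each word to (word, 1)
                .reduceByKey((v1, v2) -> v1 + v2);                        // sum the counts per word

        wordCounts.foreach(wc -> System.out.println(wc._1 + " appeared " + wc._2 + " times"));
        sc.close();
    }
}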
Submitting the Spark job to run on a cluster
1. Modify the code as follows
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-03 21:27
 */
public class WordCountCluster {
    public static void main(String[] args) {
        /**
         * Two changes are needed to run on a Spark cluster:
         * 1. Remove the setMaster() call from SparkConf; the master is supplied at submit time.
         * 2. Point the input at the real file stored on HDFS instead of a local file.
         *
         * Execution steps:
         * 1. Upload spark.txt to HDFS.
         * 2. Package the Spark project with the Maven plugin.
         * 3. Copy the packaged jar to a machine in the Spark cluster and run it.
         */
        SparkConf conf = new SparkConf().setAppName("WordCountCluster");

        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("hdfs://spark1:9000/spark.txt");

        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split(" "));
            }
        });

        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });

        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        wordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> wordCount) throws Exception {
                System.out.println(wordCount._1 + " appeared " + wordCount._2 + " times");
            }
        });

        sc.close();
    }
}
2. Upload the file to HDFS
hadoop fs -put spark.txt /spark.txt
3. Package the Spark project with Maven (the submit script below references the jar-with-dependencies artifact produced by the build)
4. Upload the jar, write a shell script (wordcount.sh) that submits it, and make the script executable
chmod 777 wordcount.sh
/usr/local/spark/bin/spark-submit \
--class cn.spark.study.core.WordCountCluster \
--num-executors 3 \
--driver-memory 100m \
--executor-memory 100m \
--executor-cores 3 \
/usr/local/spark-test/java/spark-java-1.0-SNAPSHOT-jar-with-dependencies.jar
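One thing to keep in mind when the job runs on the cluster: the println inside foreach executes on the executors, so the word counts appear in each executor's stdout/logs rather than in the spark-submit console. If the result is small and you want to see it on the driver, collecting it first is one option. A minimal sketch, reusing the wordCounts JavaPairRDD from WordCountCluster above:

// Bring the (small) result back to the driver and print it there.
// Only safe when the result comfortably fits in driver memory.
for (Tuple2<String, Integer> wc : wordCounts.collect()) {
    System.out.println(wc._1 + " appeared " + wc._2 + " times");
}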
The Scala implementation is as follows
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author: yangchun
 * @description:
 * @date: Created in 2020-05-04 15:41
 */
object WordCountScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("hdfs://spark1:9000/spark.txt")
    val words = lines.flatMap { line => line.split(" ") }
    val pairs = words.map { word => (word, 1) }
    val wordCounts = pairs.reduceByKey { _ + _ }
    wordCounts.foreach(wordCount => println(wordCount._1 + " appeared " + wordCount._2 + " times"))
  }
}
The submit script for the Scala version is as follows
/usr/local/spark/bin/spark-submit \
--class cn.spark.study.core.WordCountScala \
--num-executors 3 \
--driver-memory 100m \
--executor-memory 100m \
--executor-cores 3 \
/usr/local/spark-test/scala/spark-study-1.0-SNAPSHOT.jar
When packaging the Scala code with Maven, pay attention to the source directory (sourceDirectory must point at src/main/scala):
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>cn.spark</groupId>
    <artifactId>spark-study</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <scala.version>2.11.7</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.7</version>
        </dependency>
        <dependency>
            <groupId>spark-hadoop</groupId>
            <artifactId>spark-hadoop</artifactId>
            <version>1.3.0</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/src/lib/spark-assembly-1.3.0-hadoop2.4.0.jar</systemPath>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
You can also develop interactively with spark-shell; note that it is best suited to quick, simple tests.
Start spark-shell, then enter the following statements one by one
val lines = sc.textFile("hdfs://spark1:9000/spark.txt")
val words = lines.flatMap{line=>line.split(" ")}
val pairs = words.map{word=>(word,1)}
val wordCounts = pairs.reduceByKey{_ + _}
wordCounts.foreach(wordCount=>println(wordCount._1+" appeared "+wordCount._2+" times"))