spark wordcount

spark wordcount

 Maven 依赖(pom.xml):

<!-- Spark core library, version 2.3.0; the "_2.11" suffix of the artifactId is the Scala binary version the artifact is built for. -->
<dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>

 

package cn.easyinfo.spark;

import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Spark word-count job: reads text from an input path, counts how often each
 * whitespace-separated word occurs, and writes the resulting (word, count)
 * pairs to an output path.
 *
 * @author
 * @date 2020-04-06 18:26:51
 */
public class MyJavaWordCount {

    /**
     * Runs the word-count job.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the output
     *             path; with fewer than two arguments a usage message is printed to
     *             standard error and the JVM exits with status 1
     */
    public static void main(String[] args) {
        // Argument check
        if (args.length < 2) {
            System.err.println("Usage:MyJavaWordCount input output");
            System.exit(1);
        }

        // Input path
        String inputPath = args[0];
        // Output path
        String outputPath = args[1];

        SparkConf conf = new SparkConf().setAppName("MyJavaWordCount");

        // try-with-resources: the context is closed even when reading, processing
        // or saving throws, instead of only on the success path.
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            // Read the input as an RDD of lines
            JavaRDD<String> inputRDD = jsc.textFile(inputPath);

            // flatMap: split every line into its words
            JavaRDD<String> words = inputRDD.flatMap(new FlatMapFunction<String, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<String> call(String line) throws Exception {
                    String[] tokens = line.split("\\s+");
                    // split() yields an empty first token for an empty line and for a
                    // line that starts with whitespace; skip it so that "" is never
                    // counted as a word.
                    int first = (tokens.length > 0 && tokens[0].isEmpty()) ? 1 : 0;
                    return Arrays.asList(tokens).subList(first, tokens.length).iterator();
                }
            });

            // mapToPair: turn every word into a (word, 1) pair
            JavaPairRDD<String, Integer> pairRDD = words.mapToPair(new PairFunction<String, String, Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Integer> call(String word) throws Exception {
                    return new Tuple2<String, Integer>(word, 1);
                }
            });

            // reduceByKey: sum the 1s of each word to get its total count
            JavaPairRDD<String, Integer> result = pairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x + y;
                }
            });

            // Save the result
            result.saveAsTextFile(outputPath);
        }
    }

}

打包

 

 运行结果:

 

posted @ 2020-04-06 20:21  ptbx1t  阅读(145)  评论(0)    收藏  举报