Spark Transformation 和 Action 例子

包含算子有:

map、filter、flatMap、groupByKey、reduceByKey、sortByKey、join、cogroup

示例代码如下:

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;


import scala.Tuple2;

/**
 * Examples of Spark transformations and actions using the Spark 1.x Java API.
 *
 * <p>Covered operators: map, filter, flatMap, groupByKey, reduceByKey,
 * sortByKey, join and cogroup. Each example builds a small in-memory RDD,
 * applies one transformation and prints the result via the {@code foreach}
 * action (actions trigger execution of the otherwise-lazy pipeline).
 *
 * @version created 2017-08-25 21:09
 */
@SuppressWarnings("unused")
public class SparkTransformAction {

    public static void main(String[] args) {
        // Uncomment exactly one example to run it.
        //mapAction();
        //filterAction();
        //flatMapAction();
        //groupByKeyAction();
        //reduceByKeyAction();
        //sortByKeyAction();
        //joinAction();
        cogroupAction();
    }

    /**
     * map transformation example: doubles every number in the RDD.
     */
    private static void mapAction() {
        SparkConf conf = new SparkConf().setAppName("mapAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> numbers = Arrays.asList(1, 2, 4, 8);
        JavaRDD<Integer> numberRdd = sc.parallelize(numbers);
        // The second generic parameter of Function is the element type of the new RDD.
        JavaRDD<Integer> doubled = numberRdd.map(new Function<Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1) throws Exception {
                return v1 * 2;
            }
        });

        // Action: triggers the computation and prints each element.
        doubled.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer v1) throws Exception {
                System.out.println("new datas..." + v1);
            }
        });
        sc.close();
    }

    /**
     * filter transformation example: keeps only the even numbers.
     */
    private static void filterAction() {
        SparkConf conf = new SparkConf().setAppName("filterAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> numbers = Arrays.asList(1, 2, 4, 8, 11, 23, 45, 55, 44);
        JavaRDD<Integer> numberRdd = sc.parallelize(numbers);
        JavaRDD<Integer> evenNums = numberRdd.filter(new Function<Integer, Boolean>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Boolean call(Integer num1) throws Exception {
                // Keep the element when the predicate returns true.
                return num1 % 2 == 0;
            }
        });
        evenNums.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer t1) throws Exception {
                System.out.println("............" + t1);
            }
        });
        sc.close();
    }

    /**
     * flatMap transformation example: splits each line of a text file into
     * words, producing one output element per word.
     *
     * <p>NOTE: reads a hard-coded local file path; adjust it before running.
     */
    private static void flatMapAction() {
        // App name fixed: the original re-used "filterAction" by copy-paste.
        SparkConf conf = new SparkConf().setAppName("flatMapAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("G://121.txt");
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<String> call(String line) throws Exception {
                // Spark 1.x FlatMapFunction returns an Iterable of output elements.
                return Arrays.asList(line.split(" "));
            }
        });
        words.foreach(new VoidFunction<String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(String word) throws Exception {
                System.out.println("...is..." + word);
            }
        });
        sc.close();
    }

    /**
     * groupByKey transformation example: groups student scores by class name.
     */
    private static void groupByKeyAction() {
        SparkConf conf = new SparkConf().setAppName("groupByKeyAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Mock data: (class name, score) pairs.
        List<Tuple2<String, Integer>> scores = Arrays.asList(
                new Tuple2<String, Integer>("a1", 90),
                new Tuple2<String, Integer>("a1", 80),
                new Tuple2<String, Integer>("a2", 90),
                new Tuple2<String, Integer>("a2", 50));
        JavaPairRDD<String, Integer> scoreRdd = sc.parallelizePairs(scores);
        // groupByKey yields one (key, Iterable<value>) pair per distinct key.
        JavaPairRDD<String, Iterable<Integer>> grouped = scoreRdd.groupByKey();
        grouped.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> entry) throws Exception {
                System.out.println("....." + entry._1);
                Iterator<Integer> iterator = entry._2.iterator();
                while (iterator.hasNext()) {
                    System.out.println("....source...." + iterator.next());
                }
            }
        });
        sc.close();
    }

    /**
     * reduceByKey transformation example: sums the scores of each class.
     * (Renamed from the original misspelled {@code redueByKeyAction}.)
     */
    private static void reduceByKeyAction() {
        SparkConf conf = new SparkConf().setAppName("reduceByKeyAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, Integer> scoreRdd = sc.parallelizePairs(classScores());
        // The reduce function must be associative; values sharing a key are folded pairwise.
        JavaPairRDD<String, Integer> totals = scoreRdd.reduceByKey(new Function2<Integer, Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer left, Integer right) throws Exception {
                return left + right;
            }
        });
        totals.foreach(new VoidFunction<Tuple2<String, Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> entry) throws Exception {
                System.out.println("..." + entry._1 + "sources..." + entry._2);
            }
        });
        sc.close();
    }

    /**
     * sortByKey transformation example: sorts the pairs by class name
     * (ascending by default).
     */
    private static void sortByKeyAction() {
        SparkConf conf = new SparkConf().setAppName("sortByKeyAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, Integer> scoreRdd = sc.parallelizePairs(classScores());
        JavaPairRDD<String, Integer> sorted = scoreRdd.sortByKey();
        sorted.foreach(new VoidFunction<Tuple2<String, Integer>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> entry) throws Exception {
                System.out.println("...." + entry._1 + "sort..." + entry._2);
            }
        });
        sc.close();
    }

    /**
     * cogroup transformation example: for each student id, collects ALL names
     * and ALL scores into two Iterables. Unlike join, a key present in only
     * one of the RDDs still appears in the output (with an empty Iterable).
     */
    private static void cogroupAction() {
        SparkConf conf = new SparkConf().setAppName("cogroupAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<Integer, String> studentRdd = sc.parallelizePairs(studentNames());
        JavaPairRDD<Integer, Integer> scoreRdd = sc.parallelizePairs(studentScores());
        JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> cogrouped =
                studentRdd.cogroup(scoreRdd);
        cogrouped.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> entry)
                    throws Exception {
                System.out.println("...id" + entry._1);
                System.out.println("name...." + entry._2._1);
                System.out.println("....scode" + entry._2._2);
                System.out.println("===================");
            }
        });
        sc.close();
    }

    /**
     * join transformation example: inner join on student id — one output pair
     * per matching (name, score) combination.
     */
    private static void joinAction() {
        SparkConf conf = new SparkConf().setAppName("joinAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<Integer, String> studentRdd = sc.parallelizePairs(studentNames());
        JavaPairRDD<Integer, Integer> scoreRdd = sc.parallelizePairs(studentScores());
        JavaPairRDD<Integer, Tuple2<String, Integer>> joined = studentRdd.join(scoreRdd);
        joined.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> entry) throws Exception {
                System.out.println("...id" + entry._1);
                System.out.println("name...." + entry._2._1);
                System.out.println("....scode" + entry._2._2);
                System.out.println("===================");
            }
        });
        sc.close();
    }

    /** Mock (class name, score) data shared by the reduceByKey and sortByKey examples. */
    private static List<Tuple2<String, Integer>> classScores() {
        return Arrays.asList(
                new Tuple2<String, Integer>("a1", 90),
                new Tuple2<String, Integer>("a1", 80),
                new Tuple2<String, Integer>("a2", 20),
                new Tuple2<String, Integer>("a2", 30),
                new Tuple2<String, Integer>("a2", 40),
                new Tuple2<String, Integer>("a2", 90),
                new Tuple2<String, Integer>("a2", 50));
    }

    /** Mock (student id, name) data shared by the cogroup and join examples. */
    private static List<Tuple2<Integer, String>> studentNames() {
        return Arrays.asList(
                new Tuple2<Integer, String>(1, "zhangsan"),
                new Tuple2<Integer, String>(2, "lisi"),
                new Tuple2<Integer, String>(3, "wangwu"),
                new Tuple2<Integer, String>(4, "laoliu"),
                new Tuple2<Integer, String>(5, "zhangqi"),
                new Tuple2<Integer, String>(6, "wangbli"),
                new Tuple2<Integer, String>(7, "bashi"));
    }

    /** Mock (student id, score) data — two scores per student id. */
    private static List<Tuple2<Integer, Integer>> studentScores() {
        return Arrays.asList(
                new Tuple2<Integer, Integer>(1, 90),
                new Tuple2<Integer, Integer>(2, 80),
                new Tuple2<Integer, Integer>(3, 70),
                new Tuple2<Integer, Integer>(4, 60),
                new Tuple2<Integer, Integer>(5, 50),
                new Tuple2<Integer, Integer>(6, 20),
                new Tuple2<Integer, Integer>(7, 30),
                new Tuple2<Integer, Integer>(1, 40),
                new Tuple2<Integer, Integer>(2, 40),
                new Tuple2<Integer, Integer>(3, 100),
                new Tuple2<Integer, Integer>(4, 50),
                new Tuple2<Integer, Integer>(5, 90),
                new Tuple2<Integer, Integer>(6, 90),
                new Tuple2<Integer, Integer>(7, 50));
    }
}

 

posted on 2017-08-27 19:03  ptbx  阅读(252)  评论(0)    收藏  举报