Spark Transformation and Action Examples

Operators covered: map, filter, flatMap, groupByKey, reduceByKey, sortByKey, join, and cogroup.

All of these operators are transformations, and transformations are lazy: nothing actually runs until an action is invoked, which is why every method below ends with a foreach action to trigger execution. The example code is as follows:
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Spark transformation and action examples.
 * Created: 2017-08-25 21:09
 */
@SuppressWarnings("unused")
public class SparkTransformAction {

    public static void main(String[] args) {
        //mapAction();
        //filterAction();
        //flatMapAction();
        //groupByKeyAction();
        //reduceByKeyAction();
        //sortByKeyAction();
        //joinAction();
        cogroupAction();
    }

    /**
     * map transformation example: apply a function to every element.
     */
    private static void mapAction() {
        SparkConf conf = new SparkConf().setAppName("mapAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> asList = Arrays.asList(1, 2, 4, 8);
        JavaRDD<Integer> javaRDD = sc.parallelize(asList);
        // The second type parameter of Function is the element type of the new RDD.
        JavaRDD<Integer> doubled = javaRDD.map(new Function<Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer v1) throws Exception {
                return v1 * 2;
            }
        });
        // foreach is an action: it triggers the actual execution.
        doubled.foreach(new VoidFunction<Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer v1) throws Exception {
                System.out.println("new data..." + v1);
            }
        });
        sc.close();
    }

    /**
     * filter: keep only the elements for which the predicate returns true
     * (here, the even numbers).
     */
    private static void filterAction() {
        SparkConf conf = new SparkConf().setAppName("filterAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> asList = Arrays.asList(1, 2, 4, 8, 11, 23, 45, 55, 44);
        JavaRDD<Integer> javaRDD = sc.parallelize(asList);
        JavaRDD<Integer> evenNums = javaRDD.filter(new Function<Integer, Boolean>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Boolean call(Integer num1) throws Exception {
                return num1 % 2 == 0;
            }
        });
        evenNums.foreach(new VoidFunction<Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer t1) throws Exception {
                System.out.println("............" + t1);
            }
        });
        sc.close();
    }

    /**
     * flatMap: map each element to zero or more elements and flatten the
     * result (here, split each line of a text file into words).
     */
    private static void flatMapAction() {
        SparkConf conf = new SparkConf().setAppName("flatMapAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> tf = sc.textFile("G:/121.txt");
        JavaRDD<String> words = tf.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" "));
            }
        });
        words.foreach(new VoidFunction<String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(String word) throws Exception {
                System.out.println("...is..." + word);
            }
        });
        sc.close();
    }
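    /*
     * A minimal lambda sketch of our own (not in the original post): Spark's
     * Java function interfaces are single-abstract-method types, so on Java 8
     * the map example above can be written far more compactly. The method
     * name mapActionLambda is ours.
     */
    private static void mapActionLambda() {
        SparkConf conf = new SparkConf().setAppName("mapActionLambda").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // same data and logic as mapAction, with lambdas replacing the
        // anonymous inner classes
        JavaRDD<Integer> doubled = sc.parallelize(Arrays.asList(1, 2, 4, 8)).map(v -> v * 2);
        doubled.foreach(v -> System.out.println("new data..." + v)); // 2, 4, 8, 16 (order may vary)
        sc.close();
    }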
    /**
     * groupByKey: group student scores by class.
     */
    private static void groupByKeyAction() {
        SparkConf conf = new SparkConf().setAppName("groupByKeyAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // mock data: (class, score)
        List<Tuple2<String, Integer>> asList = Arrays.asList(
                new Tuple2<String, Integer>("a1", 90),
                new Tuple2<String, Integer>("a1", 80),
                new Tuple2<String, Integer>("a2", 90),
                new Tuple2<String, Integer>("a2", 50));
        JavaPairRDD<String, Integer> scoreList = sc.parallelizePairs(asList);
        JavaPairRDD<String, Iterable<Integer>> groupByKey = scoreList.groupByKey();
        groupByKey.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Iterable<Integer>> paramT) throws Exception {
                System.out.println("....." + paramT._1);
                Iterator<Integer> iterator = paramT._2.iterator();
                while (iterator.hasNext()) {
                    System.out.println("....score...." + iterator.next());
                }
            }
        });
        sc.close();
    }

    /**
     * reduceByKey: compute the total score per class.
     */
    private static void reduceByKeyAction() {
        SparkConf conf = new SparkConf().setAppName("reduceByKeyAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // mock data: (class, score)
        List<Tuple2<String, Integer>> asList = Arrays.asList(
                new Tuple2<String, Integer>("a1", 90),
                new Tuple2<String, Integer>("a1", 80),
                new Tuple2<String, Integer>("a2", 20),
                new Tuple2<String, Integer>("a2", 30),
                new Tuple2<String, Integer>("a2", 40),
                new Tuple2<String, Integer>("a2", 90),
                new Tuple2<String, Integer>("a2", 50));
        JavaPairRDD<String, Integer> scoreList = sc.parallelizePairs(asList);
        JavaPairRDD<String, Integer> totals = scoreList.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer paramT1, Integer paramT2) throws Exception {
                return paramT1 + paramT2;
            }
        });
        totals.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> paramT) throws Exception {
                System.out.println("..." + paramT._1 + " score..." + paramT._2);
            }
        });
        sc.close();
    }

    /**
     * sortByKey: sort the pairs by key (lexicographically for String keys).
     */
    private static void sortByKeyAction() {
        SparkConf conf = new SparkConf().setAppName("sortByKeyAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // mock data: (class, score)
        List<Tuple2<String, Integer>> asList = Arrays.asList(
                new Tuple2<String, Integer>("a1", 90),
                new Tuple2<String, Integer>("a1", 80),
                new Tuple2<String, Integer>("a2", 20),
                new Tuple2<String, Integer>("a2", 30),
                new Tuple2<String, Integer>("a2", 40),
                new Tuple2<String, Integer>("a2", 90),
                new Tuple2<String, Integer>("a2", 50));
        JavaPairRDD<String, Integer> scoreList = sc.parallelizePairs(asList);
        JavaPairRDD<String, Integer> sortByKey = scoreList.sortByKey();
        sortByKey.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Integer> paramT) throws Exception {
                System.out.println("...." + paramT._1 + " sort..." + paramT._2);
            }
        });
        sc.close();
    }
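    /*
     * A hedged sketch of our own (not in the original post): the reduceByKey
     * example rewritten with lambdas. With the same mock data the expected
     * totals are a1 = 90 + 80 = 170 and a2 = 20 + 30 + 40 + 90 + 50 = 230.
     * The method name reduceByKeyLambda is ours.
     */
    private static void reduceByKeyLambda() {
        SparkConf conf = new SparkConf().setAppName("reduceByKeyLambda").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, Integer> scores = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("a1", 90), new Tuple2<>("a1", 80),
                new Tuple2<>("a2", 20), new Tuple2<>("a2", 30),
                new Tuple2<>("a2", 40), new Tuple2<>("a2", 90),
                new Tuple2<>("a2", 50)));
        // Unlike groupByKey, reduceByKey combines values per key on the map
        // side before the shuffle, so less data crosses the network.
        scores.reduceByKey((a, b) -> a + b)
              .foreach(t -> System.out.println(t._1 + " total..." + t._2));
        sc.close();
    }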
    /**
     * cogroup: for each id, collect all matching names and all matching
     * scores into two iterables.
     */
    private static void cogroupAction() {
        SparkConf conf = new SparkConf().setAppName("cogroupAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // mock data: (id, name) pairs and (id, score) pairs; each id has two scores
        List<Tuple2<Integer, String>> students = Arrays.asList(
                new Tuple2<Integer, String>(1, "zhangsan"),
                new Tuple2<Integer, String>(2, "lisi"),
                new Tuple2<Integer, String>(3, "wangwu"),
                new Tuple2<Integer, String>(4, "laoliu"),
                new Tuple2<Integer, String>(5, "zhangqi"),
                new Tuple2<Integer, String>(6, "wangbli"),
                new Tuple2<Integer, String>(7, "bashi"));
        List<Tuple2<Integer, Integer>> scoreL = Arrays.asList(
                new Tuple2<Integer, Integer>(1, 90),
                new Tuple2<Integer, Integer>(2, 80),
                new Tuple2<Integer, Integer>(3, 70),
                new Tuple2<Integer, Integer>(4, 60),
                new Tuple2<Integer, Integer>(5, 50),
                new Tuple2<Integer, Integer>(6, 20),
                new Tuple2<Integer, Integer>(7, 30),
                new Tuple2<Integer, Integer>(1, 40),
                new Tuple2<Integer, Integer>(2, 40),
                new Tuple2<Integer, Integer>(3, 100),
                new Tuple2<Integer, Integer>(4, 50),
                new Tuple2<Integer, Integer>(5, 90),
                new Tuple2<Integer, Integer>(6, 90),
                new Tuple2<Integer, Integer>(7, 50));
        JavaPairRDD<Integer, String> studentP = sc.parallelizePairs(students);
        JavaPairRDD<Integer, Integer> scoreP = sc.parallelizePairs(scoreL);
        JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> cogroup = studentP.cogroup(scoreP);
        cogroup.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> paramT) throws Exception {
                System.out.println("...id " + paramT._1);
                System.out.println("name...." + paramT._2._1);
                System.out.println("....score " + paramT._2._2);
                System.out.println("===================");
            }
        });
        sc.close();
    }

    /**
     * join: inner join by id; emits one (id, (name, score)) pair for every
     * matching (name, score) combination.
     */
    private static void joinAction() {
        SparkConf conf = new SparkConf().setAppName("joinAction").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // mock data: same students and scores as in cogroupAction
        List<Tuple2<Integer, String>> students = Arrays.asList(
                new Tuple2<Integer, String>(1, "zhangsan"),
                new Tuple2<Integer, String>(2, "lisi"),
                new Tuple2<Integer, String>(3, "wangwu"),
                new Tuple2<Integer, String>(4, "laoliu"),
                new Tuple2<Integer, String>(5, "zhangqi"),
                new Tuple2<Integer, String>(6, "wangbli"),
                new Tuple2<Integer, String>(7, "bashi"));
        List<Tuple2<Integer, Integer>> scoreL = Arrays.asList(
                new Tuple2<Integer, Integer>(1, 90),
                new Tuple2<Integer, Integer>(2, 80),
                new Tuple2<Integer, Integer>(3, 70),
                new Tuple2<Integer, Integer>(4, 60),
                new Tuple2<Integer, Integer>(5, 50),
                new Tuple2<Integer, Integer>(6, 20),
                new Tuple2<Integer, Integer>(7, 30),
                new Tuple2<Integer, Integer>(1, 40),
                new Tuple2<Integer, Integer>(2, 40),
                new Tuple2<Integer, Integer>(3, 100),
                new Tuple2<Integer, Integer>(4, 50),
                new Tuple2<Integer, Integer>(5, 90),
                new Tuple2<Integer, Integer>(6, 90),
                new Tuple2<Integer, Integer>(7, 50));
        JavaPairRDD<Integer, String> studentP = sc.parallelizePairs(students);
        JavaPairRDD<Integer, Integer> scoreP = sc.parallelizePairs(scoreL);
        JavaPairRDD<Integer, Tuple2<String, Integer>> join = studentP.join(scoreP);
        join.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> paramT) throws Exception {
                System.out.println("...id " + paramT._1);
                System.out.println("name...." + paramT._2._1);
                System.out.println("....score " + paramT._2._2);
                System.out.println("===================");
            }
        });
        sc.close();
    }
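    /*
     * A small contrast sketch of our own (not in the original post): with
     * duplicate keys on the score side, join emits one (id, (name, score))
     * pair per matching score, while cogroup emits a single row per id with
     * the values grouped into iterables. The method name joinVersusCogroup
     * is ours.
     */
    private static void joinVersusCogroup() {
        SparkConf conf = new SparkConf().setAppName("joinVersusCogroup").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<Integer, String> students = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>(1, "zhangsan"), new Tuple2<>(2, "lisi")));
        JavaPairRDD<Integer, Integer> scores = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>(1, 90), new Tuple2<>(1, 40), new Tuple2<>(2, 80)));
        // join: three output pairs, e.g. (1,(zhangsan,90)), (1,(zhangsan,40)),
        // (2,(lisi,80)); output order is not guaranteed
        students.join(scores)
                .foreach(t -> System.out.println(t._1 + " " + t._2._1 + " " + t._2._2));
        // cogroup: two output rows, one per id, each holding an iterable of
        // names and an iterable of scores
        students.cogroup(scores)
                .foreach(t -> System.out.println(t._1 + " " + t._2._1 + " " + t._2._2));
        sc.close();
    }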
}
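Note: these examples target the Spark 1.x Java API. From Spark 2.0 onward, FlatMapFunction.call returns an Iterator rather than an Iterable, so on Spark 2.x the return statement in flatMapAction becomes return Arrays.asList(line.split(" ")).iterator();.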