Spark RDD - Task not serializable记录
开始编写的程序:
/**
 * First attempt at a small Spark sorting demo, run on a local master.
 *
 * <p>It sorts an RDD of integers with {@code sortBy} and a pair RDD with
 * {@code sortByKey}, printing each collected result as a comma-separated line.
 *
 * <p>NOTE: this version is the failing example. The comparator handed to
 * {@code sortByKey} is an anonymous {@code Comparator} that does not implement
 * {@code java.io.Serializable}; according to the accompanying write-up, running
 * it ends with Spark's "Task not serializable" error.
 */
public class SortTest {
public static void main(String[] args) {
// Local master using all available cores; the app name matches the class name.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SortTest");
JavaSparkContext sc = new JavaSparkContext(conf);
// Reduce Spark's console logging to warnings and above.
sc.setLogLevel("WARN");
JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(5, 4, 2, 9, 18, 1));
// Sort by the element itself; the flag `true` and the `1` are the ascending
// switch and the number of partitions of the sorted RDD.
JavaRDD<Integer> integerJavaRDD = rdd.sortBy(x -> x, true, 1);
System.out.println(StringUtils.join(integerJavaRDD.collect(), ","));
JavaPairRDD<String, Integer> rdd2 = sc.parallelizePairs(Arrays.asList(new Tuple2<>("b", 1), new Tuple2<>("c", 1), new Tuple2<>("a", 1)));
// The anonymous Comparator below is NOT Serializable. Spark has to serialize
// the comparator to run the sort, which is what the write-up identifies as
// the cause of the "Task not serializable" failure.
rdd2 = rdd2.sortByKey(new Comparator<String>() {
@Override
public int compare(String o1, String o2) {
return o1.compareTo(o2);
}
});
System.out.println(StringUtils.join(rdd2.collect(), ","));
// NOTE(review): the context is never stopped in this version.
}
}
运行时出现如下异常(org.apache.spark.SparkException: Task not serializable):

异常信息的含义是任务无法被序列化(Task not serializable)。根据所学,main函数所在的代码运行在Driver端,而这里的异常提示SortTest没有序列化;但SortTest只是主类,本身并不需要序列化。另一方面,算子内部的代码是在Executor中运行的,因此作为sortByKey参数传入的匿名比较器也会在Executor中执行,这就要求该Comparator必须可以被序列化。由于没有找到让Java匿名类支持序列化的方法,这里改为单独定义一个实现了Serializable接口的具名类来代替,如下:
1 public class SortTest { 2 3 static class IntegerComparator implements Comparator<String>, Serializable { 4 5 @Override 6 public int compare(String o1, String o2) { 7 return o1.compareTo(o2); 8 } 9 } 10 11 public static void main(String[] args) { 12 13 SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SortTest"); 14 JavaSparkContext sc = new JavaSparkContext(conf); 15 sc.setLogLevel("WARN"); 16 17 JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(5, 4, 2, 9, 18, 1)); 18 19 JavaRDD<Integer> integerJavaRDD = rdd.sortBy(x -> x, true, 1); 20 System.out.println(StringUtils.join(integerJavaRDD.collect(), ",")); 21 22 JavaPairRDD<String, Integer> rdd2 = sc.parallelizePairs(Arrays.asList(new Tuple2<>("b", 1), new Tuple2<>("c", 1), new Tuple2<>("a", 1))); 23 rdd2 = rdd2.sortByKey(new IntegerComparator()); 24 System.out.println(StringUtils.join(rdd2.collect(), ",")); 25 26 sc.stop(); 27 } 28 }

浙公网安备 33010602011771号