一、代码范例:

/**
 * Batch word count implemented with Flink's DataSet API.
 *
 * <p>Reads a text file, splits every line into whitespace-separated words, sums the
 * occurrences of each word and writes the resulting {@code (word, count)} pairs as text.
 *
 * <p>Usage: {@code BatchWordCount <input path> <output path>}
 */
public class BatchWordCount {

    /**
     * Builds and runs the word-count job.
     *
     * @param args {@code args[0]} is the path of the text file to read,
     *             {@code args[1]} is the path the result is written to
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws Exception if reading the input, writing the output or executing the job fails
     */
    public static void main(String[] args) throws Exception {
        // Fail fast with a usage hint instead of an ArrayIndexOutOfBoundsException further down.
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: BatchWordCount <input path> <output path>");
        }

        // Batch jobs use ExecutionEnvironment (the name has no "Stream" prefix).
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // Create the DataSet from the input file; each element is one line of text.
        DataSource<String> lines = env.readTextFile(args[0]);

        // Split every line into words and flatten them into (word, 1) pairs.
        FlatMapOperator<String, Tuple2<String, Integer>> wordAndOne = lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line,
                    Collector<Tuple2<String, Integer>> out) throws Exception {
                // Split on any run of whitespace so tabs and repeated spaces also separate words.
                for (String word : line.split("\\s+")) {
                    // A blank line or leading whitespace yields an empty token; it is not a word.
                    if (word.isEmpty()) {
                        continue;
                    }
                    out.collect(Tuple2.of(word, 1));
                }
            }
        });

        // Batch aggregation: group by the word (field 0) and sum the counts (field 1).
        AggregateOperator<Tuple2<String, Integer>> summed = wordAndOne.groupBy(0).sum(1);

        // Write the result as text to the output path (for example a path on HDFS).
        summed.writeAsText(args[1]);

        env.execute("BatchWordCount");
    }
}

输入文件:

spark spark strom flink
flink hadoop spark
jstorm hadoop

输出文件(并行度大于 1 时,输出路径是一个目录,每个并行任务写出一个以序号命名的文件;下面左列是文件名,右列是该文件的内容):

1         (jstorm,1)

2        (hadoop,2)    

3        (flink,2)

4        (spark,3)    (strom,1)