MapReduce实现WordCount
MapReduce实现WC的步骤:
1、创建WC类,继承 Configured 类并实现 Tool 接口
2、实现Mapper内部类
3、实现Reducer内部类
4、设置job相关信息
5、提交job运行
以下是实现代码:
package mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Word-count MapReduce job.
 *
 * <p>The mapper emits {@code (word, 1)} for every whitespace-separated token of each input line,
 * the reducer sums the ones per word. Two custom counters are maintained in the
 * {@link #COUNTER_GROUP} group and reported by {@link #run(String[])} after the job finishes.
 *
 * <p>Usage: {@code Wordcount <input path> <output path>}. An existing output path is deleted
 * before the job is submitted.
 */
public class Wordcount extends Configured implements Tool {

    /**
     * Counter group shared by the mapper, the reducer and the driver report. Keeping it in one
     * constant guarantees that the driver reads the same group the tasks write to.
     */
    private static final String COUNTER_GROUP = "myc";

    /** Counter incremented once per input line seen by the mapper. */
    private static final String LINE_COUNTER = "line num";

    /** Counter incremented once per distinct key seen by the reducer. */
    private static final String KEY_COUNTER = "key types num";

    /** Splits each input line into words and emits {@code (word, 1)} for every word. */
    public static class WordcountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        // Writable instances are reused across map() calls to avoid per-record allocation.
        private final Text keyout = new Text();
        private final LongWritable valueout = new LongWritable(1L);

        /**
         * Processes one input line.
         *
         * @param key     the key supplied by the input format for this line
         * @param value   the text of the line
         * @param context task context used to emit output and update counters
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            System.out.println("---------------------------map()---------------------------");
            // map() is invoked once per line, so this counter yields the number of input lines.
            context.getCounter(COUNTER_GROUP, LINE_COUNTER).increment(1L);
            System.out.println("map input ==> keyin:" + key.get() + "; valuein :" + value.toString());

            String line = value.toString();
            // Split on runs of whitespace; a blank line or leading/repeated separators yield
            // empty tokens, which are not words and must not be counted.
            for (String word : line.split("\\s+")) {
                if (word.isEmpty()) {
                    continue;
                }
                keyout.set(word);
                context.write(keyout, valueout);
                System.out.println("map output ==> key:" + word + "; value :" + valueout.get());
            }
        }
    }

    /** Sums the counts emitted by the mapper for each word. */
    public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        // Reused across reduce() calls to avoid per-key allocation.
        private final LongWritable valueout = new LongWritable();

        /**
         * Aggregates all counts of one word and emits {@code (word, total)}.
         *
         * @param key     the word
         * @param values  the counts emitted for this word
         * @param context task context used to emit output and update counters
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // reduce() is invoked once per distinct key, so this counts the distinct words.
            context.getCounter(COUNTER_GROUP, KEY_COUNTER).increment(1L);

            // Debug trace of the form: key:one values[1,1,1,1]
            StringBuilder sb = new StringBuilder("reduce input ===> key:" + key.toString() + "values[");
            long sum = 0L;
            boolean first = true;
            for (LongWritable w : values) {
                long n = w.get();
                sum += n;
                // Write the separator before every element but the first, so no trailing comma
                // has to be stripped afterwards (which would eat the '[' on an empty iterable).
                if (!first) {
                    sb.append(",");
                }
                sb.append(n);
                first = false;
            }
            sb.append("]");
            System.out.println(sb.toString());

            valueout.set(sum);
            context.write(key, valueout);
        }
    }

    /**
     * Configures, submits and waits for the word-count job, then prints the custom counters.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if the job cannot be set up or submitted, or if an existing output
     *                   directory cannot be deleted
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: Wordcount <input path> <output path>");
            return 2;
        }

        // Configuration injected by ToolRunner; used to create the job.
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "wordcount");
        job.setJarByClass(Wordcount.class);

        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Number of reduce tasks requested for this job.
        job.setNumReduceTasks(3);

        // Intermediate (map output) key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Final (reduce output) key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        Path outputDir = new Path(args[1]);
        // Remove a pre-existing output directory so the job can be re-run. Resolve the file
        // system from the path itself so a fully qualified URI that differs from the default
        // file system is handled correctly.
        FileSystem fs = outputDir.getFileSystem(conf);
        if (fs.exists(outputDir)) {
            if (!fs.delete(outputDir, true)) {
                throw new IOException("Failed to delete existing output directory: " + outputDir);
            }
            System.out.println("删除输出路径:" + outputDir.toString() + "成功。");
        }
        FileOutputFormat.setOutputPath(job, outputDir);

        // Submit the job and block until it finishes.
        boolean status = job.waitForCompletion(true);

        // Report the custom counters once the job is done.
        Counters counters = job.getCounters();
        if (counters != null) {
            CounterGroup group = counters.getGroup(COUNTER_GROUP);
            StringBuilder sb = new StringBuilder();
            sb.append("\t").append(COUNTER_GROUP).append("\n");
            // List every counter in the group.
            for (Counter counter : group) {
                sb.append("\t\t").append(counter.getDisplayName()).append("=")
                        .append(counter.getValue()).append("\n");
            }
            // Look up one specific counter by name.
            long num = group.findCounter(LINE_COUNTER).getValue();
            System.out.println("\tline num " + num);
            System.out.println("--------------------------");
            System.out.println(sb.toString());
        }

        return status ? 0 : 1;
    }

    /**
     * Command-line entry point; the process exit code mirrors the job outcome.
     *
     * @param args input path followed by output path
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Wordcount(), args));
    }
}
注意:
程序中文件的输入、输出路径通过命令行参数传入:args[0] 为输入目录,args[1] 为输出目录;如果输出目录已存在,程序会在提交任务前自动将其删除。