MapReduce实现WordCount

MapReduce实现WC的步骤:

1、创建WC类,继承 Configured 类并实现 Tool 接口

2、实现Mapper内部类

3、实现Reducer内部类

4、设置job相关信息

5、提交job运行

以下是实现代码:

package mr;

import java.io.IOException;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * WordCount implemented as a Hadoop MapReduce job.
 *
 * <p>The driver is a {@link Tool} so that generic Hadoop options are handled by
 * {@link ToolRunner}. It expects two arguments: the input path and the output
 * directory. An existing output directory is deleted before the job starts.
 *
 * <p>Two custom counters are maintained in the counter group {@code "myc"}:
 * {@code "line num"} (one increment per call to the mapper) and
 * {@code "key types num"} (one increment per call to the reducer). After the job
 * finishes, the driver prints the counters of that group.
 */
public class Wordcount extends Configured implements Tool {

    /** Counter group shared by the mapper, the reducer and the driver's report. */
    private static final String COUNTER_GROUP = "myc";
    /** Counter incremented once per map() call, i.e. once per input record. */
    private static final String LINE_COUNTER = "line num";
    /** Counter incremented once per reduce() call, i.e. once per distinct key. */
    private static final String KEY_COUNTER = "key types num";

    /**
     * Splits each input line on single spaces and emits {@code (word, 1)} for every
     * non-empty token.
     */
    public static class WordcountMapper extends Mapper<LongWritable,Text,Text,LongWritable>{
        // Output objects are reused across map() calls instead of being reallocated.
        Text keyout =new Text();
        LongWritable valueout= new LongWritable(1L);

        /**
         * Processes one input record.
         *
         * @param key     input key of the record (printed for tracing only)
         * @param value   the text of the record
         * @param context task context used to emit output and update counters
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            System.out.println("---------------------------map()---------------------------");
            // map() is called once per record, so this counter counts the input lines.
            context.getCounter(COUNTER_GROUP, LINE_COUNTER).increment(1L);
            System.out.println("map input ==> keyin:" +key.get()+"; valuein :" +value.toString());
            String line =value.toString();
            // Split on spaces.
            String[] splits=line.split(" ");
            for (String word : splits) {
                // Blank lines and leading/repeated spaces produce empty tokens;
                // they are not words and must not be counted.
                if (word.isEmpty()) {
                    continue;
                }
                keyout.set(word);
                context.write(keyout,valueout);
                System.out.println("map output ==> key:" +word+"; value :" +valueout.get());
            }
        }
    }

    /** Sums the counts emitted for each word and writes {@code (word, total)}. */
    public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
        // Output object is reused across reduce() calls.
        LongWritable valueout=new LongWritable();

        /**
         * Processes all values of one key.
         *
         * @param key     the word
         * @param values  the counts emitted for that word
         * @param context task context used to emit output and update counters
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Context context) throws IOException, InterruptedException {
            // reduce() is called once per distinct key, so this counts the distinct keys.
            context.getCounter(COUNTER_GROUP, KEY_COUNTER).increment(1L);

            // Trace line, e.g. key "one" with values [1,1,1,1].
            StringBuilder sb =new StringBuilder("reduce input ===> key:"+key.toString()+"values[");
            long sum = 0L;
            boolean first = true;
            for (LongWritable w : values) {
                long n =w.get();
                sum += n;
                // Separator goes before every element except the first, so nothing
                // has to be trimmed afterwards (safe even for an empty iterable).
                if (!first) {
                    sb.append(",");
                }
                sb.append(n);
                first = false;
            }
            sb.append("]");
            System.out.println(sb.toString());
            valueout.set(sum);
            context.write(key, valueout);
        }

    }

    /**
     * Configures, submits and waits for the word-count job, then prints the custom
     * counters.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output directory
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if job setup, output-directory cleanup or job execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: Wordcount <input path> <output path>");
            return 2;
        }

        // Configuration injected by ToolRunner; used to create the job.
        Configuration conf=getConf();
        Job job =Job.getInstance(conf, "wordcount");
        // Class used to locate the job jar.
        job.setJarByClass(Wordcount.class);
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Run three reduce tasks; each one writes its own output file.
        job.setNumReduceTasks(3);
        // Map output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Final (reduce) output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Input path of the job.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outputDir = new Path(args[1]);

        // Delete a pre-existing output directory so the job can be re-run.
        // Resolve the file system from the path itself so that a fully qualified
        // URI on a non-default file system is handled correctly.
        // The FileSystem instance is intentionally not closed here.
        FileSystem fs =outputDir.getFileSystem(conf);
        if(fs.exists(outputDir)){
            if (!fs.delete(outputDir,true)) {
                throw new IOException("Failed to delete output path: " + outputDir);
            }
            System.out.println("删除输出路径:"+outputDir.toString() +"成功。");
        }
        // Output directory of the job.
        FileOutputFormat.setOutputPath(job, outputDir);

        // Submit the job and block until it finishes; 'true' enables progress output.
        boolean status =job.waitForCompletion(true);

        // Report the custom counters after the job has finished.
        Counters counters = job.getCounters();
        if (counters == null) {
            System.err.println("Counters are not available for this job.");
            return status ? 0 : 1;
        }
        // Must be the same group name the mapper and reducer increment.
        CounterGroup group =counters.getGroup(COUNTER_GROUP);
        StringBuilder sb =new StringBuilder();
        sb.append("\t").append(COUNTER_GROUP).append("\n");
        // All counters of the group.
        for (Counter counter : group) {
            sb.append("\t\t").append(counter.getDisplayName()).append("=").append(counter.getValue()).append("\n");
        }
        // Look up one specific counter by name.
        long num =group.findCounter(LINE_COUNTER).getValue();
        System.out.println("\tline num "+ num);
        System.out.println("--------------------------");
        System.out.println(sb.toString());
        return  status ? 0:1;
    }

    /**
     * Command-line entry point; the process exit code is the value returned by
     * {@link #run(String[])}.
     *
     * @param args command-line arguments (generic Hadoop options, input path, output path)
     * @throws Exception if the job cannot be set up or executed
     */
    public static void main(String[] args) throws Exception {
        // Propagate the job status so that scripts can detect a failed job.
        System.exit(ToolRunner.run(new Wordcount(), args));
    }
}

注意:

输入、输出路径通过命令行参数传入:args[0] 为输入路径,args[1] 为输出路径;若输出路径已存在,程序会先将其删除,请勿指向需要保留的目录。

 

posted @ 2020-03-09 16:45  myc513  阅读(238)  评论(0)    收藏  举报