MapReduce编程：词频统计

首先需要在项目的src目录下新建一个log4j.properties文件，其内容为：

log4j.rootLogger=INFO, stdout

log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n

log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n

代码如下：

package org.apache.hadoop.examples;

    import java.io.IOException;
    import java.util.Iterator;
    import java.util.StringTokenizer;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.util.GenericOptionsParser;

    public class WordCount {
        public WordCount() {
        }

        //main函数，MapReduce程序运行的入口
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();   //指定HDFS相关的参数

            //String[] otherArgs = (new GenericOptionsParser(conf, args)).getRemainingArgs();
            String[] otherArgs = new String[]{"input","output"};
            if(otherArgs.length < 2) {
                System.err.println("Usage: wordcount <in> [<in>...] <out>");
                System.exit(2);
            }

            //通过Job类设置Hadoop程序运行时的环境变量
            Job job = Job.getInstance(conf, "word count");  //设置环境参数
            job.setJarByClass(WordCount.class);  //设置整个程序的类名
            job.setMapperClass(WordCount.TokenizerMapper.class); //添加Mapper类
            job.setCombinerClass(WordCount.IntSumReducer.class);
            job.setReducerClass(WordCount.IntSumReducer.class); //添加Reducer类
            job.setOutputKeyClass(Text.class);  //设置输出类型，因为输出的形式是<单词，个数>，所以这里用Text，类似于Java的String，但还是有些区别
            job.setOutputValueClass(IntWritable.class);  //设置输出类型，类似于Java的Int

            for(int i = 0; i < otherArgs.length - 1; ++i) {
                FileInputFormat.addInputPath(job, new Path(otherArgs[i]));    //设置输入文件
            }

            FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));   //设置输出文件
            System.exit(job.waitForCompletion(true)?0:1);  //提交作业
        }

        //Reduce处理逻辑
        public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            private IntWritable result = new IntWritable();

            public IntSumReducer() {
            }

            public void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
                int sum = 0;

                IntWritable val;
                for(Iterator i$ = values.iterator(); i$.hasNext(); sum += val.get()) {
                    val = (IntWritable)i$.next();
                }

                this.result.set(sum);
                context.write(key, this.result);
            }
        }


        //Map处理逻辑
        public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
            private static final IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public TokenizerMapper() {
            }

            public void map(Object key, Text value, Mapper<Object, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
                StringTokenizer itr = new StringTokenizer(value.toString());   //分词器

                while(itr.hasMoreTokens()) {
                    this.word.set(itr.nextToken());
                    context.write(this.word, one);  //输出键值对
                    //这里也可以直接写成context.write(new Text(word), new IntWritable(1));
                }

            }
        }
    }

posted @ 2019-01-09 15:37 Kayden_Cheung 阅读(519) 评论(0) 收藏举报

刷新页面返回顶部

Kayden_Cheung's Blog

对未来的真正慷慨，是把一切献给现在。

MapReduce编程：词频统计

公告