MR案例：倒排索引 && MultipleInputs

本案例采用 MultipleInputs类实现多路径输入的倒排索引。解读：MR多路径输入

package test0820;

import java.io.IOException;
import java.lang.reflect.Method;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Inverted-index MapReduce job that reads two input paths via {@link MultipleInputs}.
 *
 * <p>For every tab-separated word in the input, the job outputs one line of the form
 * {@code word<TAB>fileA@countA:fileB@countB}, listing each file the word occurs in together
 * with the number of occurrences in that file. Postings are ordered by file name.
 *
 * <p>Data flow: the mapper emits {@code word -> file@1}; the combiner and the reducer both sum
 * the counts per file. Because the combiner keeps the key and the value format unchanged, the
 * result is the same whether the framework runs it zero, one or several times.
 */
public class WC0826 {

    /** Separates a file name from its occurrence count inside a posting, e.g. {@code a.txt@3}. */
    private static final String COUNT_SEPARATOR = "@";

    /** Separates the postings of one word in the final output. */
    private static final String POSTING_SEPARATOR = ":";

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} and {@code args[1]} are the input paths,
     *             {@code args[2]} is the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: WC0826 <input path 1> <input path 2> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WC0826.class);

        job.setMapperClass(IIMapper.class);
        job.setCombinerClass(IICombiner.class);
        job.setReducerClass(IIReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Register each input path through MultipleInputs.
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Sums the occurrence counts of a word per file.
     *
     * @param postings values of the form {@code <file>@<count>}
     * @return total count per file name, ordered by file name
     * @throws IOException if a posting does not have the expected format
     */
    private static Map<String, Long> sumCountsPerFile(Iterable<Text> postings) throws IOException {
        Map<String, Long> totals = new TreeMap<String, Long>();
        for (Text posting : postings) {
            String raw = posting.toString();
            // Split on the LAST separator so that file names containing '@' still parse.
            int sep = raw.lastIndexOf(COUNT_SEPARATOR);
            if (sep < 0) {
                throw new IOException("Malformed posting (expected <file>@<count>): " + raw);
            }
            String file = raw.substring(0, sep);
            long count;
            try {
                count = Long.parseLong(raw.substring(sep + 1));
            } catch (NumberFormatException e) {
                throw new IOException("Malformed posting count in: " + raw, e);
            }
            Long previous = totals.get(file);
            totals.put(file, previous == null ? count : previous + count);
        }
        return totals;
    }

    /** Mapper: emits {@code word -> <file>@1} for every non-empty tab-separated token. */
    public static class IIMapper extends Mapper<LongWritable, Text, Text, Text> {

        /** Name of the file backing the split this mapper instance processes. */
        private String fileName;

        private final Text outKey = new Text();
        private final Text outValue = new Text();

        /**
         * Resolves the path of the file behind the current input split.
         *
         * <p>With {@link MultipleInputs} the split handed to the mapper is a
         * {@code TaggedInputSplit}, which is not public, so the wrapped split is
         * obtained reflectively.
         *
         * @param context the mapper context
         * @return path of the file being read
         * @throws IOException if the split cannot be unwrapped or is not file based
         */
        private Path getFilePath(Context context) throws IOException {
            InputSplit split = context.getInputSplit();

            if (split != null && "org.apache.hadoop.mapreduce.lib.input.TaggedInputSplit"
                    .equals(split.getClass().getName())) {
                try {
                    Method getInputSplitMethod = split.getClass().getDeclaredMethod("getInputSplit");
                    getInputSplitMethod.setAccessible(true);
                    split = (InputSplit) getInputSplitMethod.invoke(split);
                } catch (Exception e) {
                    // wrap and re-throw, keeping the cause
                    throw new IOException("Unable to unwrap TaggedInputSplit", e);
                }
            }

            // Fail with a clear message instead of a NullPointerException / ClassCastException.
            if (!(split instanceof FileSplit)) {
                throw new IOException("Unsupported input split type: "
                        + (split == null ? "null" : split.getClass().getName()));
            }
            return ((FileSplit) split).getPath();
        }

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            fileName = getFilePath(context).getName();
            // The value is identical for every record of this split, so build it once.
            outValue.set(fileName + COUNT_SEPARATOR + "1");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            for (String word : value.toString().split("\t")) {
                if (word.isEmpty()) {
                    continue; // blank lines / consecutive tabs are not words
                }
                outKey.set(word);
                context.write(outKey, outValue);
            }
        }
    }

    /**
     * Combiner: pre-aggregates the per-file counts of a word on the map side.
     *
     * <p>Input and output share the same key and the same {@code <file>@<count>} value
     * format, so applying it any number of times does not change the final result.
     */
    public static class IICombiner extends Reducer<Text, Text, Text, Text> {

        private final Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {

            for (Map.Entry<String, Long> entry : sumCountsPerFile(v2s).entrySet()) {
                outValue.set(entry.getKey() + COUNT_SEPARATOR + entry.getValue());
                context.write(key, outValue);
            }
        }
    }

    /** Reducer: writes {@code word -> fileA@countA:fileB@countB}. */
    public static class IIReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void reduce(Text key, Iterable<Text> v2s, Context context)
                throws IOException, InterruptedException {

            // Aggregate again here: the combiner may not have run, or may have run per spill only.
            StringBuilder joined = new StringBuilder();
            for (Map.Entry<String, Long> entry : sumCountsPerFile(v2s).entrySet()) {
                if (joined.length() > 0) {
                    joined.append(POSTING_SEPARATOR);
                }
                joined.append(entry.getKey()).append(COUNT_SEPARATOR).append(entry.getValue());
            }
            context.write(key, new Text(joined.toString()));
        }
    }
}

出现问题01：使用MultipleInputs类指定输入路径后，如果在setup()方法中把getInputSplit()的返回值直接强制转换为FileSplit来获取当前split对应的FileName，会抛出类型转换异常（ClassCastException）：

Error: java.lang.ClassCastException: org.apache.hadoop.mapreduce.lib.input.TaggedInputSplit cannot be cast to org.apache.hadoop.mapreduce.lib.input.FileSplit

问题原因01：使用MultipleInputs时，getInputSplit()返回的是TaggedInputSplit，真正的FileSplit只是它内部的成员变量inputSplit；而TaggedInputSplit类不是public的（没有访问修饰符，即包级私有），在包外无法直接引用该类，所以不能直接获得对应的信息。

解决方案01：

第一种方法：在当前项目中新建对应的TaggedInputSplit类，并声明为public。即覆盖掉原有TaggedInputSplit类的声明类型。然后通过以下代码就可以正确调用：

(FileSplit)((TaggedInputSplit)reporter.getInputSplit()).getInputSplit();

第二种方法：通过反射机制。代码如下：

/**
 * Resolves the path of the file behind the current input split (reflection-based approach).
 *
 * <p>Needed when inputs are registered through MultipleInputs: the split handed to the
 * mapper is then a {@code TaggedInputSplit}, which is not public, so the wrapped split
 * is obtained by calling its {@code getInputSplit()} method reflectively.
 *
 * @param context the mapper context
 * @return path of the file being read
 * @throws IOException if the split cannot be unwrapped or is not file based
 */
private Path getFilePath(Context context) throws IOException {

    InputSplit split = context.getInputSplit();

    if (split != null && "org.apache.hadoop.mapreduce.lib.input.TaggedInputSplit"
            .equals(split.getClass().getName())) {

        // begin reflection hackery...
        try {
            Method getInputSplitMethod = split.getClass().getDeclaredMethod("getInputSplit");
            getInputSplitMethod.setAccessible(true);
            split = (InputSplit) getInputSplitMethod.invoke(split);
        } catch (Exception e) {
            // wrap and re-throw, keeping the cause
            throw new IOException("Unable to unwrap TaggedInputSplit", e);
        }
        // end reflection hackery
    }

    // Previously an unrecognised split type left the local null and ended in a
    // NullPointerException; report the actual type instead.
    if (!(split instanceof FileSplit)) {
        throw new IOException("Unsupported input split type: "
                + (split == null ? "null" : split.getClass().getName()));
    }
    return ((FileSplit) split).getPath();
}

出现问题02：

map<Object,Text,Text,IntWritable>
combiner<Text,IntWritable,Text,Text>
reduce<Text,Text,Text,Text>

这样设置，系统会异常。这是因为Combiner本质上是在map端运行的Reducer：它的输出会作为Reducer的输入，而且框架可能执行Combiner零次、一次或多次，所以Combiner的输入和输出键值类型都必须与map的输出类型（即reduce的输入类型）保持一致。

Combiner实现对map端value的聚合，减少map 到 reduce 间的数据传输，加快 shuffle 速度。牢记：求平均值这类不满足结合律的运算，不能直接把求平均的Reducer当作Combiner使用。

posted @ 2015-08-26 12:54 skyl夜阅读(699) 评论(1) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

MR案例：倒排索引 && MultipleInputs

公告