MR案例:单表关联查询

"单表关联"这个实例要求从给出的数据中寻找所关心的数据,它是对原始数据所包含信息的挖掘。

需求:实例中给出 child-parent(孩子—父母)表,要求输出 grandchild-grandparent(孙子—爷奶)表。

package test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that self-joins a child-parent table to produce a
 * grandchild-grandparent table.
 *
 * <p>Input: one record per line, two whitespace-separated columns
 * (any mix of spaces and tabs):
 * <pre>
 * child        parent
 * 张三          张三的爸爸
 * 张三的爸爸     张三的爷爷
 * </pre>
 *
 * <p>Output: one line per derived pair:
 * <pre>
 * grandChild   grandParent
 * 张三          张三的爷爷
 * </pre>
 *
 * <p>Lines are not filtered by content: a header line, if the input has one,
 * is processed like any other pair. Lines with fewer than two columns are
 * skipped; columns after the second are ignored.
 */
public class MySingle {

    /** Prefix of a map output value that is a parent of the key. */
    private static final String PARENT_TAG = "-";

    /** Prefix of a map output value that is a child of the key. */
    private static final String CHILD_TAG = "+";

    /** Local Hadoop installation used only when none is configured externally. */
    private static final String DEFAULT_HADOOP_HOME = "F:\\JAVA\\hadoop-2.2.0";

    /** Column separator: one or more whitespace characters. Compiled once, not per record. */
    private static final java.util.regex.Pattern WHITESPACE =
            java.util.regex.Pattern.compile("\\s+");

    /**
     * Configures and runs the job, then exits the JVM.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {

        if (args.length < 2) {
            System.err.println("Usage: MySingle <input path> <output path>");
            System.exit(2);
            return;
        }

        // Fall back to the hard-coded installation only when the caller has not
        // supplied one, so -Dhadoop.home.dir / HADOOP_HOME are not overwritten.
        if (System.getProperty("hadoop.home.dir") == null
                && System.getenv("HADOOP_HOME") == null) {
            System.setProperty("hadoop.home.dir", DEFAULT_HADOOP_HOME);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(MySingle.class);

        job.setMapperClass(STMapper.class);
        job.setReducerClass(STReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Emits every (child, parent) record twice so that each person becomes a
     * reduce key together with both their parents and their children:
     * <ul>
     *   <li>{@code child  -> PARENT_TAG + parent}</li>
     *   <li>{@code parent -> CHILD_TAG + child}</li>
     * </ul>
     * Both directions are tagged so that a name which itself starts with a tag
     * character cannot be mistaken for the other kind of value.
     */
    public static class STMapper extends Mapper<LongWritable, Text, Text, Text>{

        // Reused across calls; context.write() serializes the contents immediately.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Trim first so leading whitespace does not produce an empty first column.
            String line = value.toString().trim();
            if (line.isEmpty()) {
                return;
            }

            String[] columns = WHITESPACE.split(line);
            if (columns.length < 2) {
                return;
            }
            String child = columns[0];
            String parent = columns[1];

            // Forward direction: keyed by the child, value is the tagged parent.
            outKey.set(child);
            outValue.set(PARENT_TAG + parent);
            context.write(outKey, outValue);

            // Reverse direction: keyed by the parent, value is the tagged child.
            outKey.set(parent);
            outValue.set(CHILD_TAG + child);
            context.write(outKey, outValue);
        }
    }

    /**
     * For each person (the key), separates the tagged values into that person's
     * children and parents and writes their cross product: every child of the
     * key is a grandchild of every parent of the key.
     */
    public static class STReducer extends Reducer<Text, Text, Text, Text>{

        // Reused across calls; context.write() serializes the contents immediately.
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> v2s,Context context)
                throws IOException, InterruptedException {

            // Children of the key: the grandchild side of the join.
            List<String> grandChild = new ArrayList<String>();
            // Parents of the key: the grandparent side of the join.
            List<String> grandParent = new ArrayList<String>();

            for (Text text : v2s) {
                String tagged = text.toString();
                if (tagged.startsWith(PARENT_TAG)) {
                    grandParent.add(tagged.substring(PARENT_TAG.length()));
                } else if (tagged.startsWith(CHILD_TAG)) {
                    grandChild.add(tagged.substring(CHILD_TAG.length()));
                }
            }

            // The key links two generations only if it has both a child and a parent.
            if (grandChild.isEmpty() || grandParent.isEmpty()) {
                return;
            }

            for (String child : grandChild) {
                outKey.set(child);
                for (String parent : grandParent) {
                    // key: grandchild, value: grandparent
                    outValue.set(parent);
                    context.write(outKey, outValue);
                }
            }
        }
    }
}
  • 在reduce阶段,根据Value的标记区分两种Value,并分别存储到grandChild和grandParent两个集合中
  • 对于reduce阶段的key,只有当它既有孩子又有父母时,grandChild和grandParent两个集合才会都不为空;此时对两个集合做笛卡尔积,即可输出全部"孙子—爷奶"关系
posted @ 2015-08-15 13:30  skyl夜  阅读(580)  评论(0)  编辑  收藏  举报