Hadoop Learning: Multi-Table Join (MapReduce)

package org.bigdata508.util;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.bigdata.util.HadoopCfg;

/*
 * Multi-table join: the mapper tags each record with the table it came
 * from (1 = factory, 2 = address), keyed on addrId; the reducer then
 * emits the Cartesian product of the two groups for each key.
 */

public class Multi {
    
    // Used to emit the header row exactly once; this only works reliably
    // with a single reducer, since each reducer JVM has its own copy.
    public static int time = 0;
    
    public static class MultiMapper extends Mapper<LongWritable, Text, Text, Text>{
        
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String relationType = "";

            // Skip the header line of either input file
            if (line.contains("fname") || line.contains("addressname")) {
                return;
            }

            // Tokenize the input line
            StringTokenizer itr = new StringTokenizer(line);
            String mapKey = "";
            StringBuilder mapValue = new StringBuilder();
            int i = 0;
            while (itr.hasMoreTokens()) {
                String token = itr.nextToken();
                // A token that starts with a digit is the addrId, i.e. the join key
                if (token.charAt(0) >= '0' && token.charAt(0) <= '9') {
                    mapKey = token;
                    // Name tokens seen before the id -> factory record (left table);
                    // id seen first -> address record (right table)
                    relationType = (i > 0) ? "1" : "2";
                    continue;
                }
                // Accumulate the factory/address name
                mapValue.append(token).append(" ");
                i++;
            }
            // Emit key = addrId, value = tableTag + "+" + name
            context.write(new Text(mapKey), new Text(relationType + "+" + mapValue));
        }
    }
    
    /*
     * Reduce: parse the map output, splitting the values into left-table
     * (factory) and right-table (address) buffers, then emit their
     * Cartesian product.
     */
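    /*
     * Worked example (using the sample inputs listed after the code): for
     * key "1" the map side emits ("1", "1+Beijing Red Star "),
     * ("1", "1+Beijing Rising "), ("1", "1+Bank of Beijing ") and
     * ("1", "2+Beijing "), so the reducer below pairs each of the three
     * factory names with "Beijing".
     */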
    
    public static class MultiReduce extends Reducer<Text, Text, Text, Text>{

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Emit the header row once (see the note on the time field above)
            if (time == 0) {
                context.write(new Text("fname"), new Text("addressname"));
                time++;
            }

            int factNum = 0;
            String[] fact = new String[10];   // left table: factory names
            int addrNum = 0;
            String[] addr = new String[10];   // right table: address names
                                              // (assumes at most 10 records per table per key)

            Iterator<Text> ite = values.iterator();

            while (ite.hasNext()) {
                String record = ite.next().toString();
                System.out.println(record);   // debug output
                int len = record.length();
                int i = 2;                    // skip the "1+" / "2+" prefix
                if (len == 0) {
                    continue;
                }
                
                // The first character identifies the source table
                char relationType = record.charAt(0);

                // Left table (factory)
                if (relationType == '1') {
                    fact[factNum] = record.substring(i);
                    factNum++;
                }

                // Right table (address)
                if (relationType == '2') {
                    addr[addrNum] = record.substring(i);
                    addrNum++;
                }
            }
            System.out.print("factNum: " + factNum + "\n");   // debug output
            System.out.print("addrNum: " + addrNum + "\n");   // debug output
            // Emit the Cartesian product of the two tables for this key
            if (factNum != 0 && addrNum != 0) {
                for (int m = 0; m < factNum; m++) {
                    for (int n = 0; n < addrNum; n++) {
                        // One joined (factory, address) pair
                        context.write(new Text(fact[m]), new Text(addr[n]));
                    }
                }
            }
        }
        
    }
    
    public static void main(String[] args) throws Exception{
        Configuration conf = HadoopCfg.getInstance();
//        String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
//        if(otherArgs.length != 2){
//            System.err.println("Usage: Multiple Table Join <in> <out>");
//            System.exit(2);
//        }
        Job job = Job.getInstance(conf,"multi");
        job.setJarByClass(Multi.class);
        job.setMapperClass(MultiMapper.class);
        job.setReducerClass(MultiReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/MultiInput"));   // holds factory.txt and address.txt
        FileOutputFormat.setOutputPath(job, new Path("/output/"));    // must not already exist
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    
    
    
}
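For reference, a minimal sketch of running the job on the sample inputs below (the jar name multi.jar and the local file paths are assumptions, not from the original post):

hdfs dfs -mkdir /MultiInput
hdfs dfs -put factory.txt address.txt /MultiInput
hadoop jar multi.jar org.bigdata508.util.Multi
hdfs dfs -cat /output/part-r-00000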
factory.txt:

fname addrId
Beijing Red Star 1
Shenzhen Thunder 223
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development bank 2
Tencent 223
Bank of Beijing 1

address.txt:

addrId addressname
1 Beijing
2 Guangzhou
223 Shenzhen
114 Xian
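Given these inputs, the output should look roughly like the following (the header row comes from the reducer's time check; row order within a key is not guaranteed, and addrId 114 produces no rows because no factory references it; trailing spaces left by the mapper's string concatenation are omitted here):

fname	addressname
Beijing Red Star	Beijing
Beijing Rising	Beijing
Bank of Beijing	Beijing
Guangzhou Honda	Guangzhou
Guangzhou Development bank	Guangzhou
Shenzhen Thunder	Shenzhen
Tencent	Shenzhen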
===================================================================================

Up next: KMeans --> KNN --> PeopleRank --> Naive Bayes --> HMM --> Apriori --> HBase --> Rsync

===================================================================================
