MapJoin and ReduceJoin

MapJoin loads the small table (here, the product table) into memory and performs the join on the map side, so no reduce phase is needed; ReduceJoin tags records from both tables in the mapper and joins them by product id in the reducer.

I. MapJoin (map-side join)

1. Mapper class

package com.css.mapjoin;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Idea: load the product table into memory, then substitute the product name before the data is emitted on the map side
public class CacheMapper extends Mapper<LongWritable, Text, Text, NullWritable>{

    HashMap<String, String> pdMap = new HashMap<>();
    
    // 1. Load the product table into memory
    @Override
    protected void setup(Context context)throws IOException {
        // Load the cached file; "pd.txt" is symlinked into the task's working directory by the distributed cache
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("pd.txt"), "UTF-8"));
        String line;
        while (StringUtils.isNotEmpty(line = br.readLine())) {
            // Split on tab
            String[] fields = line.split("\t");
            // Cache: product id -> product name
            pdMap.put(fields[0], fields[1]);
        }
        br.close();
    }
    
    // 2. Map: join each order line with its product name
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the order line
        String line = value.toString();
        // Split on tab
        String[] fields = line.split("\t");
        // Get the product id from the order record
        String pid = fields[1];
        // Look up the product name by product id
        String pName = pdMap.get(pid);
        // Append the product name to the order line
        line = line + "\t" + pName;
        // Emit the joined line
        context.write(new Text(line), NullWritable.get());
    }
}
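
A note on the cache lookup: reading "pd.txt" by its bare file name in setup() relies on the distributed cache symlinking the file into the task's working directory. If that symlink is not available in a given environment, the cached file can also be resolved explicitly through the job context. The following setup() is only a minimal alternative sketch under that assumption (it additionally needs imports for java.net.URI, org.apache.hadoop.fs.FileSystem and org.apache.hadoop.fs.Path):

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Resolve the file registered via job.addCacheFile(...) instead of assuming a "pd.txt" symlink
        URI[] cacheFiles = context.getCacheFiles();
        Path pdPath = new Path(cacheFiles[0]); // assumes pd.txt is the only cached file
        FileSystem fs = FileSystem.get(cacheFiles[0], context.getConfiguration());
        try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pdPath), "UTF-8"))) {
            String line;
            while (StringUtils.isNotEmpty(line = br.readLine())) {
                String[] fields = line.split("\t");
                pdMap.put(fields[0], fields[1]); // product id -> product name
            }
        }
    }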

2. Driver class

package com.css.mapjoin;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CacheDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // 1. Get the job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the jar by the driver class
        job.setJarByClass(CacheDriver.class);
        // 3. Set the custom Mapper class (map-only job, so no Reducer)
        job.setMapperClass(CacheMapper.class);
        // 4. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 5. Set the input path and the result output path
        FileInputFormat.setInputPaths(job, new Path("c:/table1029/in"));
        FileOutputFormat.setOutputPath(job, new Path("c:/table1029/out"));
        // 6. Add the product table to the distributed cache
        job.addCacheFile(new URI("file:///c:/inputcache/pd.txt"));
        // 7. Map-only job: set the number of reduce tasks to 0
        job.setNumReduceTasks(0);
        // 8. Submit the job
        boolean rs = job.waitForCompletion(true);
        System.out.println(rs ? 0 : 1);
    }
}
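
Note that the file:///c:/... URI in step 6 only resolves when the job runs locally. On a cluster the product table would normally sit in HDFS, so the cache entry would look roughly like the line below (the HDFS path is illustrative, not from the original code):

    job.addCacheFile(new URI("hdfs://namenode:9000/cache/pd.txt"));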

3. Input files

(1) order.txt
201801    01    1
201802    02    2
201803    03    3
201804    01    4
201805    02    5
201806    03    6

(2) pd.txt
01    苹果
02    华为
03    小米

4. Output file part-m-00000

201801    01    1    苹果
201802    02    2    华为
201803    03    3    小米
201804    01    4    苹果
201805    02    5    华为
201806    03    6    小米

II. ReduceJoin (reduce-side join)

1. Mapper class

package com.css.reducejoin;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean>{

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        
        TableBean v = new TableBean();
        Text k = new Text();
        
        // Distinguish the two tables by the input file name
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();
        
        // Get the line
        String line = value.toString();
        
        // This split comes from the order table
        if (name.contains("order.txt")) {
            // Split the fields
            String[] fields = line.split("\t");
            // Populate the bean
            v.setOrder_id(fields[0]);
            v.setPid(fields[1]);
            v.setAmount(Integer.parseInt(fields[2]));
            v.setpName("");
            v.setFlag("0");
            // Use the product id as the key
            k.set(fields[1]);
        } else { // This split comes from the product table
            // Split the fields
            String[] fields = line.split("\t");
            // Populate the bean
            v.setOrder_id("");
            v.setPid(fields[0]);
            v.setAmount(0);
            v.setpName(fields[1]);
            v.setFlag("1");
            // Use the product id as the key
            k.set(fields[0]);
        }
        context.write(k, v);
    }
}
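
Branching on the split's file name works, but the same job can also be wired with one dedicated mapper per table via MultipleInputs, which keeps the per-table parsing separate. A rough sketch of the driver-side wiring, assuming hypothetical OrderMapper and PdMapper classes that each emit <Text, TableBean> (this replaces the FileInputFormat.setInputPaths(...) call and needs imports for MultipleInputs and TextInputFormat from org.apache.hadoop.mapreduce.lib.input):

    MultipleInputs.addInputPath(job, new Path("c:/reduce1029/in/order.txt"),
            TextInputFormat.class, OrderMapper.class);
    MultipleInputs.addInputPath(job, new Path("c:/reduce1029/in/pd.txt"),
            TextInputFormat.class, PdMapper.class);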

2. Reducer class

package com.css.reducejoin;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable>{

    @Override
    protected void reduce(Text key, Iterable<TableBean> values,
            Context context) throws IOException, InterruptedException {
        // Collection that stores the order records for this product id
        ArrayList<TableBean> orderBean = new ArrayList<TableBean>();
        
        // Product record; its product name is copied into each order bean below
        TableBean pdBean = new TableBean();
        
        for (TableBean v : values) {
            if ("0".equals(v.getFlag())) { // order table record
                // 1. Create a temporary bean to hold a copy of the data
                TableBean tableBean = new TableBean();
                // 2. Copy (Hadoop reuses the value object, so the data must be copied out)
                try {
                    BeanUtils.copyProperties(tableBean, v);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
                orderBean.add(tableBean);
            }else {
                try {
                    BeanUtils.copyProperties(pdBean, v);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        
        // Join: attach the product name to every order record
        for (TableBean tableBean : orderBean) {
            // Set the product name
            tableBean.setpName(pdBean.getpName());
            context.write(tableBean, NullWritable.get());
        }
    }
}
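
BeanUtils.copyProperties gets the copy done, but it pulls in commons-beanutils and uses reflection inside the reduce loop. The copy itself is required because Hadoop reuses the value object passed to reduce(), so references cannot simply be stored in the list. A plain field-by-field helper is enough; the method below is a minimal sketch (not part of the original code) that could replace both copyProperties calls, e.g. orderBean.add(copyOf(v)):

    // Manual deep copy of a TableBean, avoiding the BeanUtils dependency
    private static TableBean copyOf(TableBean src) {
        TableBean dst = new TableBean();
        dst.setOrder_id(src.getOrder_id());
        dst.setPid(src.getPid());
        dst.setAmount(src.getAmount());
        dst.setpName(src.getpName());
        dst.setFlag(src.getFlag());
        return dst;
    }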

3. Bean class (TableBean)

package com.css.reducejoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class TableBean implements Writable{
    
    // Fields from both tables
    private String order_id; // order id
    private String pid; // product id
    private int amount; // quantity
    private String pName; // product name
    private String flag; // "0" = order table record, "1" = product table record
    
    public TableBean() {
        super();
    }
    
    public String getOrder_id() {
        return order_id;
    }
    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }
    public String getPid() {
        return pid;
    }
    public void setPid(String pid) {
        this.pid = pid;
    }
    public int getAmount() {
        return amount;
    }
    public void setAmount(int amount) {
        this.amount = amount;
    }
    public String getpName() {
        return pName;
    }
    public void setpName(String pName) {
        this.pName = pName;
    }
    public String getFlag() {
        return flag;
    }
    public void setFlag(String flag) {
        this.flag = flag;
    }
    
    @Override
    public void write(DataOutput out) throws IOException {
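        // Note: fields must be written here in exactly the same order as readFields() reads them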
        out.writeUTF(order_id);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pName);
        out.writeUTF(flag);
    }
    
    @Override
    public void readFields(DataInput in) throws IOException {
        order_id = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pName = in.readUTF();
        flag = in.readUTF();
    }

    @Override
    public String toString() {
        return order_id + "\t" + pName + "\t" + amount;
    }
}

4. Driver class

package com.css.reducejoin;

import java.io.IOException;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TableDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        
        job.setJarByClass(TableDriver.class);
        
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);
        
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);
        
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);
        
        FileInputFormat.setInputPaths(job, new Path("c:/reduce1029/in"));
        FileOutputFormat.setOutputPath(job, new Path("c:/reduce1029/out"));
        
        boolean rs = job.waitForCompletion(true);
        System.out.println(rs ? 0 : 1);
    }
}

5. Input files

(1) order.txt
201801    01    1
201802    02    2
201803    03    3
201804    01    4
201805    02    5
201806    03    6

(2) pd.txt
01    苹果
02    华为
03    小米

6. Output file part-r-00000

201804    苹果    4
201801    苹果    1
201805    华为    5
201802    华为    2
201806    小米    6
201803    小米    3
