[MapReduce Multi-Table Join Case Study] (1) Reduce-Side Join (Data Skew)

The join condition (the product id) is used as the map output key, so records from both tables that satisfy the join condition, each tagged with the file it came from, are sent to the same reduce task, where the join is stitched together.
Each record is wrapped in a Bean object; the map output key is the product id (ids of the form 0X, e.g. 01, 02).

Each table carries its own flag: 1 for the order table, 0 for the product (pd) table.

Once wrapped in the Bean class, toString() emits the joined output line: o_id, p_name, amount.

After the map phase, the framework automatically sorts and groups records by key.
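For concreteness, assume the two input files look like this (tab-separated; these sample rows are invented for illustration, but the column order matches what the Mapper below expects):

order.txt (o_id, p_id, amount):

1001	01	1
1002	02	2
1003	03	3
1004	01	4

pd.txt (p_id, p_name):

01	Xiaomi
02	Huawei
03	Apple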

1. Encapsulating the fields of both tables in a Bean class

package MapReduce.HeBing.Reduce_HeBing;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Bean class that implements Hadoop's Writable serialization interface
 */
@Getter
@Setter
//deserialization requires a no-argument constructor
@NoArgsConstructor
@AllArgsConstructor
public class ReducerJoinBean implements Writable {
    //order id, from the order table
    private String o_id;
    //product id (the join key, present in both tables)
    private String p_id;
    //quantity ordered, from the order table
    private int amount;
    //product name, from the pd table
    private String p_name;
    //flag marking the source table: "1" = order, "0" = pd
    private String statu;

    //serialization
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(o_id);
        dataOutput.writeUTF(p_id);
        dataOutput.writeInt(amount);
        dataOutput.writeUTF(p_name);
        dataOutput.writeUTF(statu);
    }
    //deserialization: fields must be read in exactly the order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.o_id = dataInput.readUTF();
        this.p_id = dataInput.readUTF();
        this.amount = dataInput.readInt();
        this.p_name = dataInput.readUTF();
        this.statu = dataInput.readUTF();
    }

    //the joined output line: o_id, p_name, amount (tab-separated)
    @Override
    public String toString() {
        return o_id+"\t"+p_name+"\t"+amount;
    }
}
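A quick way to sanity-check the Writable implementation is a round trip through a byte buffer. A minimal sketch (not part of the original post; it relies on the @AllArgsConstructor above):

package MapReduce.HeBing.Reduce_HeBing;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class ReducerJoinBeanTest {
    public static void main(String[] args) throws IOException {
        //field order of the all-args constructor: o_id, p_id, amount, p_name, statu
        ReducerJoinBean in = new ReducerJoinBean("1001", "01", 1, "", "1");

        //serialize into an in-memory buffer
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));

        //deserialize from the same bytes
        ReducerJoinBean out = new ReducerJoinBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        //prints "1001" + tab + empty p_name + tab + "1"
        System.out.println(out);
    }
}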

2. Extending the Mapper class

package MapReduce.HeBing.Reduce_HeBing;

import lombok.Getter;
import lombok.Setter;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

@Getter
@Setter

public class ReducerJoinMapper extends Mapper<LongWritable, Text,Text,ReducerJoinBean> {
    Text k = new Text();
    ReducerJoinBean v = new ReducerJoinBean();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Get the name of the file this split belongs to
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();

        //2. Read the current line
        String line = value.toString();

        //3. Tag the record according to the file it came from
        if(name.startsWith("order")){
            String[] strings = line.split("\t");
            //封装数据
            v.setO_id(strings[0]);
            v.setP_id(strings[1]);
            v.setAmount(Integer.parseInt(strings[2]));
            v.setStatu("1");
            //序列化里不能设为空-null,否则会报异常-NullPointerException
            v.setP_name("");

            k.set(strings[1]);

        }else if(name.startsWith("pd")) {
            String[] strings = line.split("\t");
            //封装数据
            //序列化里不能设为空-null,否则会报异常-NullPointerException
            v.setO_id("");
            v.setP_id(strings[0]);
            v.setAmount(0);
            v.setStatu("0");
            v.setP_name(strings[1]);

            k.set(strings[0]);
        }
        //5.输出
        context.write(k,v);
    }
}
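On the sample files above, the Mapper would emit key/value pairs like these (key = p_id; statu tells the Reducer which table each value came from):

01 -> (o_id=1001, p_id=01, amount=1, p_name=, statu=1)
01 -> (o_id=1004, p_id=01, amount=4, p_name=, statu=1)
01 -> (o_id=, p_id=01, amount=0, p_name=Xiaomi, statu=0)

...and likewise for keys 02 and 03.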

3. Extending the Reducer class

package MapReduce.HeBing.Reduce_HeBing;

import lombok.Getter;
import lombok.Setter;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

@Setter
@Getter

public class ReducerJoin extends Reducer<Text,ReducerJoinBean,ReducerJoinBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<ReducerJoinBean> values, Context context) throws IOException, InterruptedException {
        //1. A List to collect the order-table records for this key
        ArrayList<ReducerJoinBean> reducerJoinBeans = new ArrayList<>();

        //2. A single object to hold the product-table record for this key
        ReducerJoinBean bean = new ReducerJoinBean();

        //3. Separate the incoming values by their source-table flag
        for(ReducerJoinBean value:values){
            if ("1".equals(value.getStatu())) {
                ReducerJoinBean bean1 = new ReducerJoinBean();
                //Hadoop reuses the value object across iterations, so deep-copy it into a fresh bean
                //use the BeanUtils from org.apache.commons (not another package)
                try {
                    BeanUtils.copyProperties(bean1,value);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
                reducerJoinBeans.add(bean1);
            }else {
                //copy the pd-table record into the product bean
                try {
                    BeanUtils.copyProperties(bean,value);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        //4. Join: attach the product name to every order record
        for(ReducerJoinBean reducerJoinBean:reducerJoinBeans){
            //copy the p_name from the pd table into the order bean
            reducerJoinBean.setP_name(bean.getP_name());
            context.write(reducerJoinBean,NullWritable.get());
        }
    }
}
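Continuing the sample, the reduce call for key 01 receives one group holding both order beans plus the pd bean, copies p_name = Xiaomi into each order bean, and writes:

1001	Xiaomi	1
1004	Xiaomi	4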

4. The Driver class

package MapReduce.HeBing.Reduce_HeBing;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URISyntaxException;

public class ReducerJoinDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        //both input files sit in one folder; args are overridden here for a local IDE run
        args = new String[]{"D:/Reducer_Join", "D:/Hadoop-result/HeBing_Reducer"};
        //1. Job configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        //2. Bind the jar, Mapper, and Reducer classes
        job.setJarByClass(ReducerJoinDriver.class);
        job.setMapperClass(ReducerJoinMapper.class);
        job.setReducerClass(ReducerJoin.class);

        //3. Key/value types of the map output and of the final output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ReducerJoinBean.class);
        job.setOutputKeyClass(ReducerJoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        //4. Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //5. Submit the job and wait for completion
        job.waitForCompletion(true);
    }
}
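The driver overwrites args with hard-coded local paths for an IDE run. To run on a cluster you would delete that line, package the classes into a jar, and submit it roughly like this (the jar name and HDFS paths are hypothetical):

hadoop jar reducer-join.jar MapReduce.HeBing.Reduce_HeBing.ReducerJoinDriver /input/join /output/join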

5. Summary

1. Never leave a bean field null: writeUTF(null) throws a NullPointerException during serialization.
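A minimal demonstration of why the Mapper sets unused fields to "" rather than leaving them null (a sketch, not from the original post):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class WriteUtfNullDemo {
    public static void main(String[] args) throws IOException {
        DataOutputStream out = new DataOutputStream(new ByteArrayOutputStream());
        out.writeUTF("");   //fine: writes a zero-length string
        out.writeUTF(null); //throws NullPointerException
    }
}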

2. Notes on the following code in the Reducer class:

//reduce() in the Reducer is called once per key group
//that is: the key here is p_id, and the values are all order and pd records sharing that p_id
//which explains why the order-table records for one key must be collected in a List,
//while a single object suffices for the product-table record.
protected void reduce(Text key, Iterable<ReducerJoinBean> values, Context context)
        //1. A List to collect the order-table records for this key
        ArrayList<ReducerJoinBean> beans = new ArrayList<>();
        //2. A single object for the product-table record
        // each p_name appears only once per key in the pd file, and is replaced with every new key group
        ReducerJoinBean beanPd = new ReducerJoinBean();

3. Drawback:

With this approach, all of the join work happens in the reduce phase, so the reduce side carries a heavy processing load while the map nodes do very little computation; resource utilization is poor, and the reduce phase is highly prone to data skew (a few hot keys with many matching records overload individual reduce tasks).

Solution: perform the join on the map side instead. See https://www.cnblogs.com/liuxinrong/articles/12837314.html
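For reference, a minimal sketch of that map-side approach (class names and paths here are assumptions, not taken from the linked post): the driver registers the small pd table with job.addCacheFile(...) and sets job.setNumReduceTasks(0); each map task loads the table into a HashMap in setup() and joins while mapping, so there is no reduce phase left to skew.

package MapReduce.HeBing.Map_HeBing;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MapperJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private final Map<String, String> pdMap = new HashMap<>();
    private final Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException {
        //the driver must have called job.addCacheFile(new URI("/cache/pd.txt")) beforehand
        URI[] cacheFiles = context.getCacheFiles();
        Path path = new Path(cacheFiles[0]);
        //the cached file is localized into the task's working directory
        try (BufferedReader reader = new BufferedReader(new FileReader(path.getName()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\t");
                pdMap.put(fields[0], fields[1]); //p_id -> p_name
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //the job's real input is only the order file; join against the in-memory pd table
        String[] fields = value.toString().split("\t");
        k.set(fields[0] + "\t" + pdMap.get(fields[1]) + "\t" + fields[2]);
        context.write(k, NullWritable.get());
    }
}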
