MapReduce Multi-Table Join Case Study (1): Reduce-Side Join (Data Skew)
Posted on 2020-05-06 17:53 MissRong
The join key is used as the map output key, so records from both tables that satisfy the join condition, each tagged with the file they came from, are sent to the same reduce task, where they are stitched together.

Each record is wrapped in a bean object; the key is the product id (01, 02, ...).

Each table carries its own flag: 1 for the order table, 0 for the product (pd) table.

After conversion to the bean class, toString() produces o_id, p_name, and amount.

After the Mapper, records are automatically sorted and grouped by key.
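For concreteness, a hypothetical pair of input lines and the joined output they produce might look like the following; the concrete values (1001, 01, Xiaomi) are made up, and only the tab-separated column layout is implied by the code below:

order.txt (o_id \t p_id \t amount):        1001    01    1
pd.txt    (p_id \t p_name):                01      Xiaomi
joined output (o_id \t p_name \t amount):  1001    Xiaomi    1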
1. Encapsulate the fields of both tables in a bean class
package MapReduce.HeBing.Reduce_HeBing;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Bean class -- implements the Writable serialization interface
 */
@Getter
@Setter
//Deserialization requires a no-arg constructor
@NoArgsConstructor
@AllArgsConstructor
public class ReducerJoinBean implements Writable {
    //Order table: order id - o_id
    private String o_id;
    //Product id - p_id
    private String p_id;
    //Quantity ordered - amount
    private int amount;
    //Product name - p_name
    private String p_name;
    //Flag marking which table the record came from - statu
    private String statu;

    //Serialization
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(o_id);
        dataOutput.writeUTF(p_id);
        dataOutput.writeInt(amount);
        dataOutput.writeUTF(p_name);
        dataOutput.writeUTF(statu);
    }

    //Deserialization - fields must be read in the same order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.o_id = dataInput.readUTF();
        this.p_id = dataInput.readUTF();
        this.amount = dataInput.readInt();
        this.p_name = dataInput.readUTF();
        this.statu = dataInput.readUTF();
    }

    @Override
    public String toString() {
        return o_id + "\t" + p_name + "\t" + amount;
    }
}
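As a quick stand-alone sanity check (not part of the original post), the Writable round trip of this bean can be exercised outside Hadoop; the sample field values are made up, and the bean plus Lombok are assumed to be on the classpath:

package MapReduce.HeBing.Reduce_HeBing;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class BeanRoundTripCheck {
    public static void main(String[] args) throws Exception {
        //Hypothetical order record: o_id=1001, p_id=01, amount=1, p_name="", statu="1"
        ReducerJoinBean in = new ReducerJoinBean("1001", "01", 1, "", "1");

        //Serialize exactly the way Hadoop does between map and reduce
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        //Deserialize into a fresh bean; readFields reads in the same field order
        ReducerJoinBean out = new ReducerJoinBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        //Prints "1001", an empty p_name, and 1, tab-separated (the toString format)
        System.out.println(out);
    }
}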
2. Extend the Mapper class
package MapReduce.HeBing.Reduce_HeBing;

import lombok.Getter;
import lombok.Setter;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

@Getter
@Setter
public class ReducerJoinMapper extends Mapper<LongWritable, Text, Text, ReducerJoinBean> {

    Text k = new Text();
    ReducerJoinBean v = new ReducerJoinBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Get the name of the file this split comes from
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();

        //2. Read one line of input
        String line = value.toString();

        //3. Tag the record according to which file it came from
        if (name.startsWith("order")) {
            String[] strings = line.split("\t");
            //Fill in the bean
            v.setO_id(strings[0]);
            v.setP_id(strings[1]);
            v.setAmount(Integer.parseInt(strings[2]));
            v.setStatu("1");
            //Serialized fields must not be null, otherwise a NullPointerException is thrown
            v.setP_name("");
            k.set(strings[1]);
        } else if (name.startsWith("pd")) {
            String[] strings = line.split("\t");
            //Fill in the bean
            //Serialized fields must not be null, otherwise a NullPointerException is thrown
            v.setO_id("");
            v.setP_id(strings[0]);
            v.setAmount(0);
            v.setStatu("0");
            v.setP_name(strings[1]);
            k.set(strings[0]);
        }

        //4. Write out
        context.write(k, v);
    }
}
3. Extend the Reducer class
package MapReduce.HeBing.Reduce_HeBing;

import lombok.Getter;
import lombok.Setter;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

@Setter
@Getter
public class ReducerJoin extends Reducer<Text, ReducerJoinBean, ReducerJoinBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<ReducerJoinBean> values, Context context) throws IOException, InterruptedException {
        //1. A List to hold the order-table records for this key
        ArrayList<ReducerJoinBean> reducerJoinBeans = new ArrayList<>();
        //2. A single object to hold the product-table record for this key
        ReducerJoinBean bean = new ReducerJoinBean();

        //3. Split the incoming records by their flag
        for (ReducerJoinBean value : values) {
            if ("1".equals(value.getStatu())) {
                ReducerJoinBean bean1 = new ReducerJoinBean();
                //Copy the current value into a new bean before storing it
                //(Hadoop reuses the value object across iterations).
                //Use the BeanUtils from commons-beanutils.
                try {
                    BeanUtils.copyProperties(bean1, value);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
                reducerJoinBeans.add(bean1);
            } else {
                //Copy the pd-table record into the product bean
                try {
                    BeanUtils.copyProperties(bean, value);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        //4. Join: attach the product name to every order record and emit
        for (ReducerJoinBean reducerJoinBean : reducerJoinBeans) {
            reducerJoinBean.setP_name(bean.getP_name());
            context.write(reducerJoinBean, NullWritable.get());
        }
    }
}
4. The driver class
package MapReduce.HeBing.Reduce_HeBing;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URISyntaxException;

public class ReducerJoinDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        //Both input files live in the same directory
        args = new String[]{"D:/Reducer_Join", "D:/Hadoop-result/HeBing_Reducer"};

        //1. Configuration and job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        //2. Bind the driver, mapper, and reducer classes
        job.setJarByClass(ReducerJoinDriver.class);
        job.setMapperClass(ReducerJoinMapper.class);
        job.setReducerClass(ReducerJoin.class);

        //3. K/V types of the map output and of the final (reducer) output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ReducerJoinBean.class);
        job.setOutputKeyClass(ReducerJoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        //4. Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //5. Submit the job
        job.waitForCompletion(true);
    }
}
1. Fields that go through serialization must not be left null, otherwise a NullPointerException is thrown.
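A minimal stand-alone sketch (not from the original project) of why that is: DataOutput.writeUTF() dereferences the string it is given, so a null field blows up during serialization, while an empty string is fine.

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;

public class NullWriteDemo {
    public static void main(String[] args) throws Exception {
        DataOutputStream out = new DataOutputStream(new ByteArrayOutputStream());
        out.writeUTF("");      //fine: an empty string is a legal UTF value
        out.writeUTF(null);    //throws NullPointerException inside writeUTF
    }
}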
2. Notes on the following code in the Reducer class:
//reduce() in the Reducer is called once per key group: one key together with all of its values.
//Here the key is the product id (p_id), and each value is a bean carrying o_id, p_name, and amount.
//That is why the order-table records sharing one key are collected in a List,
//while the product-table record for that key only needs a single object.
protected void reduce(Text key, Iterable<ReducerJoinBean> values, Context context)

//1. A List holding the order-table records for this key
ArrayList<ReducerJoinBean> beans = new ArrayList<>();
//2. A single object for the product-table record
//   (each p_name occurs only once in the pd file, so it simply changes from key to key)
ReducerJoinBean beanPd = new ReducerJoinBean();
3. Drawback:
With this approach all of the join work happens in the reduce phase: the reduce side carries a heavy processing load while the map nodes do very little, so resource utilization is poor and the reduce phase is highly prone to data skew.
Solution: do the join on the map side instead — https://www.cnblogs.com/liuxinrong/articles/12837314.html
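A minimal sketch of that map-side alternative (a map join), assuming the pd table is small enough to be cached in memory on every map task; the class name, package, and cache-file handling here are illustrative, not taken from the original post:

package MapReduce.HeBing.Map_HeBing;   //illustrative package name

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    //p_id -> p_name, loaded once per map task from the distributed cache
    private final Map<String, String> pdMap = new HashMap<>();
    private final Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException {
        //The driver is assumed to have registered the pd file with job.addCacheFile(new URI("..."))
        URI[] cacheFiles = context.getCacheFiles();
        Path pdPath = new Path(cacheFiles[0]);
        FileSystem fs = FileSystem.get(context.getConfiguration());
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(pdPath), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\t");   //p_id \t p_name
                pdMap.put(fields[0], fields[1]);
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        //Each order line: o_id \t p_id \t amount; the join happens right here in the map
        String[] fields = value.toString().split("\t");
        String pName = pdMap.getOrDefault(fields[1], "");
        k.set(fields[0] + "\t" + pName + "\t" + fields[2]);
        context.write(k, NullWritable.get());
    }
}

In the driver, only the order directory would be fed to FileInputFormat, the pd file would be added with job.addCacheFile(...), and the reduce phase switched off with job.setNumReduceTasks(0), so there is no shuffle left to skew.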