day43-hadoop-mapreduce
day43-hadoop-mapreduce
hadoop-mapreduce
Join多种应用
Reduce Join
目录
Reduce Join工作原理
Map端的主要工作:为来自不同表和文件的key/value对,打标签以后区别不同来源的记录。然后用链接字段作为key,其余部分和新加的标志作为value,最后进行输出。
Reduce端的主要工作:在Reduce端以链接字段作为key的分组已经完成,我们只需要在每一个分组当中将那些来源于不同文件的记录(在Map阶段已经打标志)分开,最后进行合并就OK了。
Reduce Join案例
问题
实现
- OrderBean.java
package lc.mapreduce.mapper.reducejoin2;
/**
 * Join bean carrying the union of fields from order.txt (orderId, pid, amount)
 * and pd.txt (pid, pname). The {@code title} field tags which file a record
 * came from so the reducer can tell the two sides of the join apart.
 *
 * Hadoop instantiates Writables reflectively, so a public no-arg
 * constructor is required.
 */
public class OrderBean implements Writable {
    private String orderId;  // order id
    private String pid;      // product id (the join key)
    private Integer amount;  // quantity ordered
    private String pname;    // product name
    private String title;    // source tag: "order" for order.txt, "pd" for pd.txt

    public OrderBean() {}

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getPid() {
        return pid;
    }

    public void setPid(String pid) {
        this.pid = pid;
    }

    public Integer getAmount() {
        return amount;
    }

    public void setAmount(Integer amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Output record format: tab-separated orderId, pname, amount.
     * Note: "\t" is a real tab character; "\\t" would print a literal
     * backslash followed by 't'.
     */
    @Override
    public String toString() {
        return orderId + "\t" + pname + "\t" + amount;
    }

    /**
     * Serialization: writes the in-memory object to the stream.
     * Field order here must match {@link #readFields(DataInput)} exactly.
     *
     * @param out destination stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(title);
    }

    /**
     * Deserialization: restores the object from the stream, reading fields
     * in the same order they were written.
     *
     * @param in source stream
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        orderId = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pname = in.readUTF();
        title = in.readUTF();
    }
}
- OrderJoinMapper.java
package lc.mapreduce.mapper.reducejoin2;
// NOTE(review): class name has a typo ("Oreder"); kept as-is because
// OrderJoinDriver references it by this exact name.
public class OrederJoinMapper extends Mapper<LongWritable, Text, Text, OrderBean> {
    // Name (not full path) of this split's source file, resolved once per task.
    private String fileName;
    // Reused output key/value objects to avoid per-record allocation.
    private final Text outk = new Text();
    private final OrderBean outv = new OrderBean();

    /**
     * Runs once before the first map() call of this map task: records which
     * input file the task is reading so map() can tag each record's origin.
     *
     * @param context task context supplying the input split
     * @throws IOException never thrown here; declared by the Mapper contract
     * @throws InterruptedException never thrown here; declared by the Mapper contract
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, OrderBean>.Context context) throws IOException, InterruptedException {
        InputSplit inputSplit = context.getInputSplit();
        FileSplit fileSplit = (FileSplit) inputSplit;
        // Use getName() rather than toString(): the full path string could
        // contain "order" in a parent directory and misclassify every record.
        fileName = fileSplit.getPath().getName();
    }

    /**
     * Emits (pid, OrderBean) pairs keyed on the join field. Records from
     * order.txt carry orderId/amount; records from pd.txt carry pname.
     * Fields not present in a record are filled with neutral values so that
     * OrderBean.write() never sees null.
     *
     * @param key byte offset of the line (unused)
     * @param value one tab-separated input line
     * @param context sink for the (pid, bean) output pair
     * @throws IOException on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, OrderBean>.Context context) throws IOException, InterruptedException {
        String row = value.toString();
        String[] splits = row.split("\t");
        if (fileName.contains("order")) {
            // order.txt line format: orderId  pid  amount
            outk.set(splits[1]); // join key: records sharing a pid group together
            outv.setOrderId(splits[0]);
            outv.setPid(splits[1]);
            outv.setAmount(Integer.parseInt(splits[2]));
            outv.setPname("");
            outv.setTitle("order"); // origin tag; the reducer merges by it
        } else {
            // pd.txt line format: pid  pname
            outk.set(splits[0]);
            outv.setOrderId("");
            outv.setAmount(0);
            outv.setPid(splits[0]);
            outv.setPname(splits[1]);
            outv.setTitle("pd");
        }
        context.write(outk, outv);
    }
}
- OrderJoinReducer.java
package lc.mapreduce.mapper.reducejoin2;
/**
 * Joins the two tagged record streams for each pid: buffers the "order"
 * records, captures the single "pd" record, then stamps the product name
 * onto every order and emits it.
 */
public class OrderJoinReducer extends Reducer<Text, OrderBean, OrderBean, NullWritable> {
    // Buffered "order" records of the current key group.
    private final List<OrderBean> orders = new ArrayList<>();
    // The "pd" record of the current key group.
    private final OrderBean pdOrder = new OrderBean();

    /**
     * Processes one key group (all beans sharing a pid).
     *
     * @param key the join key (pid)
     * @param values beans from both files tagged by title
     * @param context sink for the joined output records
     * @throws IOException on write failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Reducer<Text, OrderBean, OrderBean, NullWritable>.Context context) throws IOException, InterruptedException {
        // Reset per-group state up front. Without resetting pname, a group
        // that lacks a pd record would silently reuse the previous group's
        // product name.
        orders.clear();
        pdOrder.setPname("");
        for (OrderBean value : values) {
            // Hadoop reuses the value object across iterations, so each
            // record must be deep-copied before being retained.
            if ("order".equals(value.getTitle())) {
                try {
                    OrderBean orderBean = new OrderBean();
                    BeanUtils.copyProperties(orderBean, value);
                    orders.add(orderBean);
                } catch (Exception e) {
                    // Fail the task: a half-copied record would corrupt the join.
                    throw new RuntimeException(e);
                }
            } else {
                try {
                    BeanUtils.copyProperties(pdOrder, value);
                } catch (Exception e) {
                    // Previously swallowed with printStackTrace; rethrow so the
                    // failure is not hidden behind silently-empty product names.
                    throw new RuntimeException(e);
                }
            }
        }
        // Merge: stamp the product name onto each order and emit it.
        for (OrderBean order : orders) {
            order.setPname(pdOrder.getPname());
            context.write(order, NullWritable.get());
        }
    }
}
- OrderJoinDriver
package lc.mapreduce.mapper.reducejoin2;
/**
 * Driver: wires the reduce-side join job together and submits it.
 * Input/output paths may be overridden via CLI args; the original
 * hard-coded local Windows paths remain the defaults.
 */
public class OrderJoinDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // CLI override keeps the class usable outside the author's machine.
        String input = args.length > 0 ? args[0] : "D:\\a测试文件\\input\\reducejoin";
        String output = args.length > 1 ? args[1] : "D:\\a测试文件\\output\\reducejoin3";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Register the jar containing this driver.
        job.setJarByClass(OrderJoinDriver.class);
        // Wire up mapper and reducer.
        job.setMapperClass(OrederJoinMapper.class);
        job.setReducerClass(OrderJoinReducer.class);
        // Intermediate (map output) key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);
        // Final output key/value types.
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Input directory and (must-not-exist) output directory.
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // Submit and propagate success/failure as the process exit code;
        // the boolean result was previously discarded.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

浙公网安备 33010602011771号