MapReduce原理深入理解(二)

1. MapReduce程序可以省略reduce阶段(map-only作业)

 

 1 import org.apache.hadoop.conf.Configuration;
 2 import org.apache.hadoop.fs.FileSystem;
 3 import org.apache.hadoop.fs.Path;
 4 import org.apache.hadoop.io.LongWritable;
 5 import org.apache.hadoop.io.NullWritable;
 6 import org.apache.hadoop.io.Text;
 7 import org.apache.hadoop.mapreduce.Job;
 8 import org.apache.hadoop.mapreduce.Mapper;
 9 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
10 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
11 
12 import java.io.IOException;
13 
14 public class WordCount03 {
15     public static class MyMapper extends Mapper<LongWritable, Text,Text, NullWritable>{
16         @Override
17         protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
18             String line = value.toString();
19             String s = line.split(",")[3];
20             if(s.equals("男")){
21                 context.write(new Text(s),NullWritable.get());
22             }
23         }
24     }
25     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
26         Job job= Job.getInstance();
27         job.setNumReduceTasks(0);
28         /**
29          * 有些情况下,不需要reduce(聚合程序),
30          * 在不需要聚合操作的时候,可以不需要reduce
31          * 而reduce默认为1,需要手动设置为0,
32          * 如果没有设置为0,会产生默认的reduce,只不过reduce不处理任何数据
33          */
34         job.setJobName("mr03程序");
35         job.setJarByClass(WordCount03.class);
36         job.setMapOutputKeyClass(Text.class);
37         job.setMapOutputValueClass(NullWritable.class);
38         Path in = new Path("/word");
39         FileInputFormat.addInputPath(job,in);
40         Path out = new Path("/output");
41         FileSystem fs = FileSystem.get(new Configuration());
42         if(fs.exists(out)){
43             fs.delete(out);
44         }
45         FileOutputFormat.setOutputPath(job,out);
46         job.waitForCompletion(true);
47     }
48 }

 

注意:

有些情况下,不需要reduce(聚合程序),
在不需要聚合操作的时候,可以不需要reduce
而reduce默认为1,需要手动设置为0,
如果没有设置为0,会启动默认的reduce任务(恒等Reducer,把map输出原样写出),白白增加一次shuffle开销

2.MapReduce中join操作(数据拼接)
  1 import org.apache.hadoop.conf.Configuration;
  2 import org.apache.hadoop.fs.FileSystem;
  3 import org.apache.hadoop.fs.Path;
  4 import org.apache.hadoop.io.LongWritable;
  5 import org.apache.hadoop.io.NullWritable;
  6 import org.apache.hadoop.io.Text;
  7 import org.apache.hadoop.mapreduce.InputSplit;
  8 import org.apache.hadoop.mapreduce.Job;
  9 import org.apache.hadoop.mapreduce.Mapper;
 10 import org.apache.hadoop.mapreduce.Reducer;
 11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 12 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 13 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 14 
 15 import java.io.IOException;
 16 import java.util.ArrayList;
 17 
 18 public class WordCount04 {
 19     public static class JoinMapper extends Mapper<LongWritable,Text,Text,Text>{
 20         @Override
 21         protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
 22             //1.获取数据的路径 InputSplit
 23             //context 上面是hdfs 下面如果有reduce就是reduce 没有就是hdfs
 24             InputSplit inputSplit = context.getInputSplit();
 25             FileSplit fs=(FileSplit)inputSplit;
 26             String url = fs.getPath().toString();
 27             //2.判断
 28             if(url.contains("students")){//true当前数据为students.txt
 29                 String id = value.toString().split(",")[0];
 30                 //为了方便reduce数据的操作 针对于不同的数据 打一个标签
 31                 String line = "*" + value.toString();
 32                 context.write(new Text(id),new Text(line));
 33             }else {//false 当前数据为score.txt
 34                 //以学号作为k 也是两张数据的关联条件
 35                 String id = value.toString().split(",")[0];
 36                 //为了方便reduce数据的操作 针对于不同的数据 打一个标签
 37                 String line = "#" + value.toString();
 38                 context.write(new Text(id),new Text(line));
 39             }
 40         }
 41     }
 42     public static class JoinReduce extends Reducer<Text,Text,Text,NullWritable>{
 43         @Override
 44         protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
 45             //数据在循环之外保存
 46             String stuInfo="";
 47             ArrayList<String> scores = new ArrayList<String>();
 48             //提取数据
 49             for (Text value : values) {
 50                 //获取一行一行的数据(所有数据包含students.txt和score.txt)
 51                 String line = value.toString();
 52                 if(line.startsWith("*")){//true 为学生数据
 53                     stuInfo= line.substring(1);
 54                 }else {//false  为学生成绩数据
 55                     scores.add(line.substring(1));
 56                 }
 57             }
 58             /**
 59              * 求的是 两张表的拼接
 60              */
 61             //数据拼接
 62             for (String score : scores) {
 63                 String subject = score.split(",")[1];
 64                 String s = score.split(",")[2];
 65                 String end=stuInfo+","+subject+","+s;
 66                 context.write(new Text(end),NullWritable.get());
 67             }
 68             /**
 69              * 求的是 两张表的拼接 拼接过程中对成绩求和
 70              */
 71 //            long sum=0l;
 72 //            for (String s : scores) {
 73 //                Integer sc =Integer.valueOf( s.split(",")[2]);
 74 //                sum+=sc;
 75 //            }
 76 //            String end=stuInfo+","+sum;
 77 //            context.write(new Text(end),NullWritable.get());
 78         }
 79     }
 80     public static void main(String[] args) throws Exception {
 81         Job job = Job.getInstance();
 82         job.setJobName("Join MapReduce");
 83         job.setJarByClass(WordCount04.class);
 84 
 85         job.setMapperClass(JoinMapper.class);
 86         job.setMapOutputKeyClass(Text.class);
 87         job.setMapOutputValueClass(Text.class);
 88 
 89         job.setReducerClass(JoinReduce.class);
 90         job.setOutputKeyClass(Text.class);
 91         job.setOutputValueClass(NullWritable.class);
 92         //指定路径
 93         FileInputFormat.addInputPath(job,new Path("/word"));
 94         Path path = new Path("/output");
 95         FileSystem fs = FileSystem.get(new Configuration());
 96         if(fs.exists(path)){
 97             fs.delete(path);
 98         }
 99         FileOutputFormat.setOutputPath(job,new Path("/output"));
100         job.waitForCompletion(true);
101         System.out.println("join 正在执行");
102     }
103 }

 3.MapReduce添加Combiner

combiner是发生在map端的本地reduce操作。其作用是减少map端的输出量,从而减少shuffle过程中网络传输的数据量,提高作业的执行效率。

combiner仅仅是单个map task的reduce,没有对全部map的输出做reduce。

如果不用combiner,那么,所有的结果都由reduce完成,效率会相对低下。使用combiner,先完成的map会在本地聚合,提升速度。 注意:Combiner的输出是Reducer的输入,Combiner绝不能改变最终的计算结果。所以,Combiner只适合满足结合律和交换律的操作,比如累加、求最大值等;直接求平均数则不适合。

combine操作前的效果:

 

 

 

combine操作后的效果:

 

 

 1 package com.shujia.hadoop;
 2 
 3 import org.apache.hadoop.conf.Configuration;
 4 import org.apache.hadoop.fs.FileSystem;
 5 import org.apache.hadoop.fs.Path;
 6 import org.apache.hadoop.io.LongWritable;
 7 import org.apache.hadoop.io.Text;
 8 import org.apache.hadoop.mapreduce.Job;
 9 import org.apache.hadoop.mapreduce.Mapper;
10 import org.apache.hadoop.mapreduce.Reducer;
11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
13 
14 import java.io.IOException;
15 
16 public class Demo07Combine {
17     public static class CombineMapper extends Mapper<LongWritable,Text,Text,LongWritable>{
18         @Override
19         protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
20             String line = value.toString();
21             String sex = line.split(",")[3];
22             context.write(new Text(sex),new LongWritable(1));
23         }
24     }
25     //Combine(预聚合 预处理)过程在reduce之前 map端之后
26     //
27     public static class  Combine extends Reducer<Text,LongWritable,Text,LongWritable>{
28         @Override
29         protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
30             long sum=0l;
31             for (LongWritable value : values) {
32                 sum+=value.get();
33             }
34             context.write(key,new LongWritable(sum));
35         }
36     }
37     public static class CombineReduce extends Reducer<Text,LongWritable,Text,LongWritable>{
38         @Override
39         protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
40             long sum=0l;
41             for (LongWritable value : values) {
42                 sum+=value.get();
43             }
44             context.write(key,new LongWritable(sum));
45         }
46     }
47     public static void main(String[] args) throws Exception{
48         //创建一个job任务
49         Job job = Job.getInstance();
50 
51         // 指定reduce节点个数
52         job.setNumReduceTasks(2);
53         //指定job名称
54         job.setJobName("Combine,性别统计");
55         //构建mr
56         //指定当前main所在类名(识别具体的类)
57         job.setJarByClass(Demo07Combine.class);
58         //指定map端类
59         job.setMapperClass(CombineMapper.class);
60         // 指定map输出的kv类型
61         job.setMapOutputKeyClass(Text.class);
62         job.setMapOutputValueClass(LongWritable.class);
63 
64         /**
65          * 指定combine
66          */
67         job.setCombinerClass(Combine.class);
68 
69         //指定reduce端类
70         //指定reduce端输出的kv类型
71         job.setReducerClass(CombineReduce.class);
72         job.setOutputKeyClass(Text.class);
73         job.setOutputValueClass(LongWritable.class);
74 
75         // 指定输入路径
76         Path in = new Path("/data");
77         FileInputFormat.addInputPath(job,in);
78         //指定输出
79         Path out = new Path("/output");
80         //如果路径存在 删除
81         FileSystem fs = FileSystem.get(new Configuration());
82         if(fs.exists(out)){
83             fs.delete(out,true);
84         }
85         FileOutputFormat.setOutputPath(job,out);
86 
87         //启动任务
88         job.waitForCompletion(true);
89         /**
90          * 提交任务
91          * 1.通过maven中package将项目打包上传服务器然后执行
92          * 2.执行任务 hadoop jar hadoop-mapreduce-examples-2.7.6.jar com.shujia.hadoop.Demo01WordCount /word  /output
93          *
94          */
95         System.out.println("Combine正在执行");
96 
97     }
98 }

 

 

 

posted @ 2021-09-23 20:25  lmandcc  阅读(72)  评论(0编辑  收藏  举报