- MapReduce的整个过程分为map和reduce两个阶段,通俗的说就是map(映射)用来准备需要处理的数据,reduce(归纳)用来实际处理数据。当然,另外还需要一些用来调度作业的代码。
map
- mapper接口是一个泛型接口,其中有四个参数。前两个参数是输入的一对键与值,后两个参数是输出的一对键与值。其中输入键是一个长整数偏移量(类型为LongWritable),输入值是一行文本;输出键与值类型由开发者自行定义。
- 比如类 public static class Map extends Mapper<LongWritable, Text, Text, Text>
- 其中的map方法定义为 public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException
- 在老版本的hadoop中,map方法包括输入键与值,用来写入输出内容的OutputCollector实例与用来控制的reporter实例。新版本的hadoop把后两者合并为一个context(上下文)实例,完成同样的功能。
reduce
- reducer接口同样是泛型接口,其中的四个参数,前两个是由map的输出产生的输入键与值,类型务必要与map的输出数据(叫做中间数据)类型相同;后两个是reduce处理后得到的结果键与值,类型由开发者自行定义。
- 类的写法为 public static class Reduce extends Reducer<Text, Text, Text, Text>
- 其中的reduce方法定义为 public void reduce(Text key, Iterable&lt;Text&gt; values, Context context) throws IOException, InterruptedException
combine
- combine继承reducer接口,其中要重写的方法是reduce方法。combine本质上就是在本地执行的简单的reduce操作,以减少网络带宽占用。
- 经过实际测试,在集群环境下执行指定程序时,经过combine进行本地处理后程序效率提高10%左右。
main
在主方法中配置的东西如下
- Configuration conf = new Configuration(); //不再使用JobConf类,而是用其父类Configuration进行配置
- FileSystem.get(conf).delete(new Path(args[1]), true); //删除输出路径下的文件
- String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); //从命令行获取参数。GenericOptionsParser是hadoop提供的辅助类,用于解析-D等通用选项并返回剩余的位置参数
- Job job = new Job(conf, "product pv uv"); //配置一个新job
- //以下都是字面含义
- job.setJarByClass(ProductPvUv.class);
- job.setMapperClass(Map.class);
- job.setReducerClass(Reduce.class);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(Text.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
- FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
- //至此准备工作完成
- System.exit(job.waitForCompletion(true) ? 0 : 1); //job完成后exit,失败时返回非零状态码
tips
- map的输入键是一个长整数偏移量,所以不能把输入键类型(第一个参数)定义为Text等非LongWritable类型,否则会出java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.io.Text异常(0.5天)
- 变量名方法名类名要规范。
- 程序要考虑容错。输入数据可能不规范,要做细节处理。
1 package com;
2
3 import java.io.DataInput;
4 import java.io.DataOutput;
5 import java.io.IOException;
6 import java.io.PrintStream;
7 import java.util.ArrayList;
8 import java.util.Collections;
9 import java.util.List;
10 import java.util.regex.Matcher;
11 import java.util.regex.Pattern;
12
13 import org.apache.hadoop.conf.Configuration;
14 import org.apache.hadoop.fs.FileSystem;
15 import org.apache.hadoop.fs.Path;
16 import org.apache.hadoop.io.LongWritable;
17 import org.apache.hadoop.io.Text;
18 import org.apache.hadoop.io.WritableComparable;
19 import org.apache.hadoop.mapreduce.Job;
20 import org.apache.hadoop.mapreduce.Mapper;
21 import org.apache.hadoop.mapreduce.Reducer;
22 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
23 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
24 import org.apache.hadoop.util.GenericOptionsParser;
25
26 public class ProductPvUv {
27 private static String item = new String();
28
29 public static class Map extends Mapper<LongWritable, Text, Text, ValuePair> {
30 @Override
31 public void map(LongWritable key, Text value, Context context)
32 throws IOException, InterruptedException {
33 Text word = new Text();
34 ValuePair text = new ValuePair();
35 String aLine = value.toString();
36 String url = getAField(aLine, 5);
37 Pattern p = Pattern.compile("(?<=\\b[pP]roduct_id=)\\d+\\b");
38 Matcher m = p.matcher(url);
39 if (!m.find()) return;
40 String product_id = m.group();
41 if (!item.isEmpty() && !item.equals(product_id)) return;
42 word.set(product_id);
43 String type = getAField(aLine, 10);
44
45 if ("1".equals(type)) {
46 String permanent_id = getAField(aLine, 7);
47 if (permanent_id.equals("") || permanent_id.equals("0")
48 || permanent_id.equals("null")) {
49 String ip = getAField(aLine, 3);
50 String http_ua = getAField(aLine, 8);
51 permanent_id = ip.concat(http_ua);
52 permanent_id = (permanent_id.hashCode() & 0x7fffffff) + "";
53 } else {
54 Pattern pat = Pattern.compile("^\\d+$");
55 Matcher mat = pat.matcher(permanent_id);
56 if (!mat.find()) {
57 permanent_id = "0" + permanent_id.hashCode();
58 }
59 }
60 text.setPv(1);
61 text.setPermanent_id(permanent_id);
62 context.write(word, text);
63 }
64 }
65
66 }
67
68 public static class Reduce extends Reducer<Text, ValuePair, Text, Text> {
69 @Override
70 public void reduce(Text key, Iterable<ValuePair> values, Context context)
71 throws IOException, InterruptedException {
72
73 Text word = new Text();
74 Text text = new Text();
75 int pv = 0, uv = 0;
76 word = key;
77 //System.out.println("***");
78 List<ValuePair> list = new ArrayList<ValuePair>();
79 for (ValuePair pv_Permanent_id : values) {
80 list.add(pv_Permanent_id);
81 }
82 Collections.sort(list);
83 String lastPermanent_id = "";
84 for (ValuePair pv_Permanent_id : list) {
85
86 String permanent_id = pv_Permanent_id.getPermanent_id();
87 int npv = pv_Permanent_id.getPv();
88 pv = npv;
89 //System.out.println(permanent_id + "\t" + lastPermanent_id);
90 if (!permanent_id.equals(lastPermanent_id)) {
91 uv++;
92 lastPermanent_id = permanent_id;
93 text.set(pv + "\t" + uv);
94 context.write(word, text);
95 }
96 }
97 }
98 }
99
100 public static class Combine extends
101 Reducer<Text, ValuePair, Text, ValuePair> {
102 @Override
103 public void reduce(Text key, Iterable<ValuePair> values, Context context)
104 throws IOException, InterruptedException {
105 int pv = 1;
106 //System.out.println("***");
107 List<ValuePair> list = new ArrayList<ValuePair>();
108 for (ValuePair pv_Permanent_id : values) {
109 list.add(pv_Permanent_id);
110 }
111
112 Collections.sort(list);
113 String lastPermanent_id = "";
114 for (ValuePair pv_Permanent_id : list) {
115 String permanent_id = pv_Permanent_id.getPermanent_id();
116 if (!permanent_id.equals(lastPermanent_id)) {
117 lastPermanent_id = permanent_id;
118 ValuePair result = new ValuePair(pv, permanent_id);
119 context.write(key, result);
120 pv = 1;
121 } else
122 pv++;
123 }
124 }
125 }
126
127 private static class ValuePair implements WritableComparable<ValuePair> {
128 int pv;
129 String permanent_id;
130
131 public ValuePair() {
132 pv = 1;
133 permanent_id = new String();
134 }
135
136 public ValuePair(int npv, String npermanent_id) {
137 pv = npv;
138 permanent_id = npermanent_id;
139 }
140
141 @Override
142 public String toString() {
143 return permanent_id + pv;
144 }
145
146 @Override
147 public int compareTo(ValuePair v) {
148 int i = this.permanent_id.compareTo(v.permanent_id);
149 if (i > 0)
150 return 1;
151 else if (i < 0)
152 return -1;
153 else
154 return 0;
155 }
156
157 @Override
158 public void readFields(DataInput in) throws IOException {
159 // TODO Auto-generated method stub
160 permanent_id = in.readUTF();
161 pv = in.readInt();
162 }
163
164 @Override
165 public void write(DataOutput out) throws IOException {
166 // TODO Auto-generated method stub
167 out.writeUTF(permanent_id);
168 out.writeInt(pv);
169 }
170
171 @Override
172 public int hashCode() {
173 final int prime = 31;
174 int result = 1;
175 result = prime * result + pv;
176 result = prime * result
177 + ((permanent_id == null) ? 0 : permanent_id.hashCode());
178 return result;
179 }
180
181 @Override
182 public boolean equals(Object obj) {
183 if (this == obj)
184 return true;
185 if (obj == null)
186 return false;
187 if (getClass() != obj.getClass())
188 return false;
189 ValuePair other = (ValuePair) obj;
190 if (pv != other.pv)
191 return false;
192 if (permanent_id == null) {
193 if (other.permanent_id != null)
194 return false;
195 } else if (!permanent_id.equals(other.permanent_id))
196 return false;
197 return true;
198 }
199
200 public int getPv() {
201 return pv;
202 }
203
204 public void setPv(int pv) {
205 this.pv = pv;
206 }
207
208 public String getPermanent_id() {
209 return permanent_id;
210 }
211
212 public void setPermanent_id(String permanent_id) {
213 this.permanent_id = permanent_id;
214 }
215 }
216
217 private static final class Timer {
218 private long startTime;
219 private long endTime;
220
221 public Timer() {
222 reset();
223 }
224
225 public void start() {
226 System.gc();
227 startTime = System.currentTimeMillis();
228 }
229
230 public void end() {
231 System.gc();
232 endTime = System.currentTimeMillis();
233 }
234
235 public long duration() {
236 return (endTime - startTime);
237 }
238
239 public void printDuration(PrintStream out) {
240 long elapsedTimeInSecond = duration() / 1000;
241 long remainderInMillis = duration() % 1000;
242 out.println("\nTotal execution time:" + elapsedTimeInSecond + "."
243 + remainderInMillis + " seconds");
244 }
245
246 public void reset() {
247 startTime = 0;
248 endTime = 0;
249 }
250 }
251
252 public static void main(String[] args) throws Exception {
253 Timer timer = new Timer();
254 timer.start();
255 Configuration conf = new Configuration();
256 FileSystem.get(conf).delete(new Path(args[1]), true);
257 String[] otherArgs = new GenericOptionsParser(conf, args)
258 .getRemainingArgs();
259 if (otherArgs.length == 3)
260 item = otherArgs[2];
261 Job job = new Job(conf, "product pv uv");
262 job.setJarByClass(ProductPvUv.class);
263 job.setMapperClass(Map.class);
264 job.setReducerClass(Reduce.class);
265 job.setCombinerClass(Combine.class);
266 job.setMapOutputKeyClass(Text.class);
267 job.setMapOutputValueClass(ValuePair.class);
268 job.setOutputKeyClass(Text.class);
269 job.setOutputValueClass(Text.class);
270 FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
271 FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
272
273 if (job.waitForCompletion(true)) {
274 timer.end();
275 timer.printDuration(System.out);
276 System.exit(0);
277 }
278 }
279 }