Handling multiple output types on the map side
Multiple output types
"Multiple output types" means that the values emitted by a MapReduce job are no longer of a single type.
What this example does:
Run MapReduce over a web-server access log to compute, for each client (keyed by IP address; the original problem statement says "city"), all the URLs it accessed and the total size of the response content it received.
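For concreteness, here is a sketch of one input record and the final output line it would produce (the value layout follows from the reducer code below; fields are tab-separated, and the captured "url" is the whole quoted request string):

Input:
199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245

Output (IP, URLs accessed, :total response size):
199.72.81.55	GET /history/apollo/ HTTP/1.0	:6245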
(1) The log object: LogWritable
package com.simple;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/*
 * Represents one record of the access log, e.g.:
 * 199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
 * where:
 * 199.72.81.55                 client IP address
 * 01/Jul/1995:00:00:01 -0400   access timestamp
 * GET                          HTTP method (GET/POST)
 * /history/apollo/             requested URL
 * 200                          response status code (e.g. 200, 404)
 * 6245                         size of the response content
 */
public class LogWritable implements Writable{
private Text userIP; // client IP address
private Text timestamp; // access timestamp
private Text url; // requested URL
private IntWritable status; // response status code
private IntWritable responseSize; // size of the response content
public LogWritable() {
this.userIP = new Text();
this.timestamp = new Text();
this.url = new Text();
this.status = new IntWritable();
this.responseSize = new IntWritable();
}
public void set(String userIP, String timestamp, String url, int status, int responseSize) {
this.userIP.set(userIP);
this.timestamp.set(timestamp);
this.url.set(url);
this.status.set(status);
this.responseSize.set(responseSize);
}
public Text getUserIP() {
return userIP;
}
public void setUserIP(Text userIP) {
this.userIP = userIP;
}
public Text getTimestamp() {
return timestamp;
}
public void setTimestamp(Text timestamp) {
this.timestamp = timestamp;
}
public Text getUrl() {
return url;
}
public void setUrl(Text url) {
this.url = url;
}
public IntWritable getStatus() {
return status;
}
public void setStatus(IntWritable status) {
this.status = status;
}
public IntWritable getResponseSize() {
return responseSize;
}
public void setResponseSize(IntWritable responseSize) {
this.responseSize = responseSize;
}
// Serialization: write each field in a fixed order
@Override
public void write(DataOutput out) throws IOException {
userIP.write(out);
timestamp.write(out);
url.write(out);
status.write(out);
responseSize.write(out);
}
// Deserialization: read the fields back in the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
userIP.readFields(in);
timestamp.readFields(in);
url.readFields(in);
status.readFields(in);
responseSize.readFields(in);
}
}
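Because the fields are serialized by hand in a fixed order, it is worth sanity-checking the round trip. A minimal sketch, not part of the job itself (the class name is illustrative):

package com.simple;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
// Write a LogWritable to a byte buffer, read it back, and print the fields
public class LogWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        LogWritable original = new LogWritable();
        original.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400",
                "/history/apollo/", 200, 6245);
        // serialize into an in-memory buffer
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // deserialize into a fresh instance
        LogWritable copy = new LogWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.getUrl() + " " + copy.getResponseSize()); // /history/apollo/ 6245
    }
}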
(2) For multiple output types, define a class that extends GenericWritable
package com.simple;
import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
public class MultiValueWritable extends GenericWritable {
// The mapper emits either a LogWritable (which carries the URL) or an
// IntWritable (the response size), so both classes are registered here.
// GenericWritable serializes a value as the index of its class in this
// static array followed by the value itself, so the array order must not change.
private static Class[] CLASSES = { LogWritable.class, IntWritable.class };
// A no-argument constructor is required (Hadoop creates instances via reflection)
public MultiValueWritable() {}
public MultiValueWritable(Writable w) { // convenience constructor: wrap a value directly
this.set(w);
}
@Override
protected Class[] getTypes() {
return CLASSES;
}
}
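A quick illustration of how the wrapper behaves (the demo class name is illustrative). On the wire, GenericWritable writes the index of the wrapped value's class in the CLASSES array as a single byte, followed by the value's own serialization, which is why the array must stay identical on the map and reduce sides:

package com.simple;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
// Wrap a value, then unwrap it with get() and inspect its runtime type
public class MultiValueWritableDemo {
    public static void main(String[] args) {
        MultiValueWritable wrapped = new MultiValueWritable(new IntWritable(6245));
        Writable inner = wrapped.get(); // returns the wrapped value
        System.out.println(inner instanceof IntWritable); // true
        System.out.println(((IntWritable) inner).get()); // 6245
    }
}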
(3) The Mapper class
package com.simple;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class LogMapper extends Mapper<LongWritable,Text,Text,MultiValueWritable>{
// Compile the log-parsing regex once instead of once per record
private static final Pattern LOG_PATTERN = Pattern.compile(
"^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)");
@Override
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
Matcher matcher = LOG_PATTERN.matcher(value.toString());
if(!matcher.matches()) {
System.out.println("不是一个有效的日志记录");
return;
}
String ip = matcher.group(1);
String timestamp = matcher.group(4);
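// note: group(5) is the whole quoted request string, e.g. "GET /history/apollo/ HTTP/1.0", not just the URL path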
String url = matcher.group(5);
int status = Integer.parseInt(matcher.group(6));
int responseSize = Integer.parseInt(matcher.group(7));
LogWritable log = new LogWritable();
log.set(ip, timestamp, url, status, responseSize);
// The output key is the IP; the value wraps either the log object or the response size
context.write(new Text(ip), new MultiValueWritable(log));
context.write(new Text(ip), new MultiValueWritable(new IntWritable(responseSize)));
}
}
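To see exactly what each capture group extracts (the mapper uses groups 1 and 4-7 and skips the two "-" fields), here is a minimal standalone sketch run against the sample record (class name is illustrative):

package com.simple;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// Apply the log-parsing regex to the sample line and print the groups
public class LogRegexDemo {
    public static void main(String[] args) {
        String line = "199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "
                + "\"GET /history/apollo/ HTTP/1.0\" 200 6245";
        Pattern p = Pattern.compile(
                "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)");
        Matcher m = p.matcher(line);
        if (m.matches()) {
            System.out.println("ip        = " + m.group(1)); // 199.72.81.55
            System.out.println("timestamp = " + m.group(4)); // 01/Jul/1995:00:00:01 -0400
            System.out.println("request   = " + m.group(5)); // GET /history/apollo/ HTTP/1.0
            System.out.println("status    = " + m.group(6)); // 200
            System.out.println("size      = " + m.group(7)); // 6245
        }
    }
}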
(4) The Reducer class: for each client IP, collect its URLs and sum the total response size
package com.simple;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
public class LogReducer extends Reducer<Text, MultiValueWritable, Text, Text> {
private Text text = new Text();
@Override
protected void reduce(Text key, Iterable<MultiValueWritable> values, Context context)
throws IOException, InterruptedException {
int total = 0;
StringBuilder sb = new StringBuilder(); // accumulates the URLs accessed by this client
// Each MultiValueWritable received here wraps either a LogWritable or an IntWritable
for(MultiValueWritable mvw : values) {
Writable w = mvw.get(); // unwrap to the underlying Writable and dispatch on its runtime type
if(w instanceof IntWritable) {
// an IntWritable: accumulate the response size
total += ((IntWritable) w).get();
}else {
// a LogWritable: record the URL it carries
sb.append(((LogWritable)w).getUrl());
sb.append("\t");
}
}
// For this IP: the URLs it accessed, followed by the total size of its responses
text.set(sb.toString() + ":" + total);
context.write(key, text);
}
}
(5) The driver class
package com.simple;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LogDriver {
public static void main(String[] args) throws Exception {
String input = "hdfs://localhost:9000/NASA_log_sample.txt";
String output = "hdfs://localhost:9000/log-output";
Configuration conf = new Configuration();
Job job = Job.getInstance(conf,"日志分析");
job.setJarByClass(LogDriver.class);
job.setMapperClass(LogMapper.class);
job.setReducerClass(LogReducer.class);
// Map output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(MultiValueWritable.class);
// Final (reduce) output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// Submit the job and wait for it to finish
System.exit(job.waitForCompletion(true)?0:1);
}
}
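To try the job, package the classes into a jar and run it against the paths hardcoded above (the jar name is illustrative; part-r-00000 is the default output file of a single reducer):

hadoop jar log-analysis.jar com.simple.LogDriver
hdfs dfs -cat /log-output/part-r-00000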
