MapReduce Programming: Implementing and Using a Custom InputFormat
The goal is to pre-process the input text before it reaches the mapper. In this example, a custom InputFormat parses each line of the input log file into a log object, so the map function no longer has to extract the log fields itself.
The map and reduce functions then compute the total response size per client IP (like the earlier examples, this is essentially a word-count-style summation).
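The type flow through the job is: LogFileInputFormat/LogRecordReader turn each line into a (LongWritable byte offset, LogWritable record) pair; LogMapper re-keys each record as (Text userIP, LogWritable record); LogProcessorReduce sums the response sizes for each IP and writes (Text userIP, IntWritable totalResponseSize).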
Complete code:
(1) The log object class LogWritable
package com.simple;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/*
 * Represents one log record, e.g.:
 * 199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
 * where:
 * 199.72.81.55                  the client's IP address
 * 01/Jul/1995:00:00:01 -0400    the access time
 * GET                           the HTTP method (GET/POST)
 * /history/apollo/              the requested URL
 * 200                           the response status code (e.g. 200 or 404)
 * 6245                          the size of the response body
 */
public class LogWritable implements Writable{
private Text userIP; // client IP address
private Text timestamp; // access time
private Text url; // requested URL
private IntWritable status; // HTTP status code
private IntWritable responseSize; // size of the response sent by the server
public LogWritable() {
this.userIP = new Text();
this.timestamp = new Text();
this.url = new Text();
this.status = new IntWritable();
this.responseSize = new IntWritable();
}
public void set(String userIP, String timestamp, String url, int status, int responseSize) {
this.userIP.set(userIP);
this.timestamp.set(timestamp);
this.url.set(url);
this.status.set(status);
this.responseSize.set(responseSize);
}
public Text getUserIP() {
return userIP;
}
public void setUserIP(Text userIP) {
this.userIP = userIP;
}
public Text getTimestamp() {
return timestamp;
}
public void setTimestamp(Text timestamp) {
this.timestamp = timestamp;
}
public Text getUrl() {
return url;
}
public void setUrl(Text url) {
this.url = url;
}
public IntWritable getStatus() {
return status;
}
public void setStatus(IntWritable status) {
this.status = status;
}
public IntWritable getResponseSize() {
return responseSize;
}
public void setResponseSize(IntWritable responseSize) {
this.responseSize = responseSize;
}
// Serialization: write every field to the output stream
@Override
public void write(DataOutput out) throws IOException {
userIP.write(out);
timestamp.write(out);
url.write(out);
status.write(out);
responseSize.write(out);
}
// Deserialization: read the fields back in the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
userIP.readFields(in);
timestamp.readFields(in);
url.readFields(in);
status.readFields(in);
responseSize.readFields(in);
}
}
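Before wiring this class into a job, it can be handy to sanity-check write() and readFields() with a serialization round trip. The following is a minimal standalone sketch (not part of the original example); it uses only the class and getters defined above:
package com.simple;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class LogWritableRoundTrip {
public static void main(String[] args) throws IOException {
LogWritable original = new LogWritable();
original.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400", "GET /history/apollo/ HTTP/1.0", 200, 6245);
// Serialize into an in-memory byte buffer
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
original.write(new DataOutputStream(bytes));
// Deserialize into a fresh instance
LogWritable copy = new LogWritable();
copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
// The fields should survive the round trip unchanged
System.out.println(copy.getUserIP() + " " + copy.getStatus() + " " + copy.getResponseSize());
}
}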
(2) The record reader class LogRecordReader, which does the actual log parsing
package com.simple;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
// Reads the split line by line, parses each line, and generates the key-value pairs fed to the Mapper
public class LogRecordReader extends RecordReader<LongWritable,LogWritable>{
private LineRecordReader rr; // delegate used to read the split one line at a time
private LogWritable value;
// Initialization: create and set up the underlying LineRecordReader
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
rr = new LineRecordReader();
rr.initialize(split, context); // let LineRecordReader handle the split boundaries
}
// nextKeyValue() follows the same parsing idea as the map function in the earlier distributed-cache example:
// each call reads one line of the split, parses it, and fills a LogWritable instance used as the value
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
// Regular expression describing one log record
String logEntryPattern = "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)";
Pattern pattern = Pattern.compile(logEntryPattern);
// Keep reading until a line parses successfully or the split is exhausted
while (rr.nextKeyValue()) {
String line = rr.getCurrentValue().toString(); // current line of text from the split
Matcher matcher = pattern.matcher(line);
if (!matcher.matches()) {
System.out.println("Invalid record: " + line);
continue; // skip malformed lines instead of failing the task
}
// Extract the fields of the line: ip, timestamp, url, status, size
// 199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
String ip = matcher.group(1);
String timestamp = matcher.group(4);
String url = matcher.group(5);
int status = Integer.parseInt(matcher.group(6));
int responseSize = Integer.parseInt(matcher.group(7));
value = new LogWritable();
value.set(ip, timestamp, url, status, responseSize);
return true;
}
return false; // no more lines in this split
}
@Override
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return rr.getCurrentKey();
}
@Override
public LogWritable getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return rr.getProgress();
}
@Override
public void close() throws IOException {
rr.close();
}
}
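The regular expression does the real work here. As a quick standalone illustration (not part of the job itself), the capture groups line up with the sample record like this; groups 2 and 3 match the two "-" fields, which the reader ignores:
package com.simple;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class LogRegexDemo {
public static void main(String[] args) {
String logEntryPattern = "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)";
String line = "199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245";
Matcher m = Pattern.compile(logEntryPattern).matcher(line);
if (m.matches()) {
System.out.println("ip        = " + m.group(1)); // 199.72.81.55
System.out.println("timestamp = " + m.group(4)); // 01/Jul/1995:00:00:01 -0400
System.out.println("request   = " + m.group(5)); // GET /history/apollo/ HTTP/1.0
System.out.println("status    = " + m.group(6)); // 200
System.out.println("size      = " + m.group(7)); // 6245
}
}
}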
(3) The custom InputFormat class LogFileInputFormat
package com.simple;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class LogFileInputFormat extends FileInputFormat<LongWritable,LogWritable>{
@Override
public RecordReader<LongWritable, LogWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Return our custom RecordReader
return new LogRecordReader();
}
}
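FileInputFormat splits every file by default, which is fine for plain-text logs. If the input were compressed (an assumption, not part of this example), one could mirror what TextInputFormat does and only allow splitting when the codec supports it:
// Possible addition inside LogFileInputFormat (a sketch, assuming compressed input is a concern).
// Extra imports needed: org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.JobContext,
// org.apache.hadoop.io.compress.CompressionCodec, CompressionCodecFactory, SplittableCompressionCodec
@Override
protected boolean isSplitable(JobContext context, Path file) {
CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
// Uncompressed files are always splittable; compressed files only if the codec supports splitting
return codec == null || codec instanceof SplittableCompressionCodec;
}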
(4) The mapper class LogMapper. Since the input text has already been formatted into log objects, the map input value can be used as a LogWritable directly, with no extra parsing inside map().
package com.simple;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class LogMapper extends Mapper<LongWritable,LogWritable,Text,LogWritable>{
@Override
protected void map(LongWritable key, LogWritable value, Context context)
throws IOException, InterruptedException {
context.write(value.getUserIP(), value); // emit the client IP as key and the whole log record as value
}
}
(5) The reducer class LogProcessorReduce: sum the response sizes for each IP
package com.simple;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class LogProcessorReduce extends Reducer<Text,LogWritable,Text,IntWritable> {
private IntWritable result = new IntWritable();
@Override
public void reduce(Text key, Iterable<LogWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (LogWritable val : values) {
sum += val.getResponseSize().get();
}
result.set(sum);
context.write(key, result);
}
}
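As a concrete check: if the input consisted only of the sample line shown earlier, the reducer would receive the key 199.72.81.55 with a single LogWritable whose responseSize is 6245, and the job output would contain one line with the IP and 6245 separated by a tab (the default TextOutputFormat separator).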
(6) The driver class LogDriver
package com.simple;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LogDriver {
public static void main(String[] args) throws Exception {
String input = "hdfs://localhost:9000/NASA_log_sample.txt";
String output = "hdfs://localhost:9000/log-output";
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "Log Analysis");
job.setJarByClass(LogDriver.class);
job.setMapperClass(LogMapper.class);
job.setReducerClass(LogProcessorReduce.class);
// Use the custom InputFormat instead of the default TextInputFormat
job.setInputFormatClass(LogFileInputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LogWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output)); // the output directory must not exist yet
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
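To run it, package the classes into a jar (any name works; log-inputformat.jar below is just an example), make sure NASA_log_sample.txt is on HDFS at the path used above and that /log-output does not exist, then submit with hadoop jar log-inputformat.jar com.simple.LogDriver and inspect the result with hdfs dfs -cat /log-output/part-r-00000.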
