Data Cleaning
Purpose: filter the information we actually want out of a large volume of raw log data.
(1) Parse the collected log records
LogParser
package com.simple.mr;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

// Parser for the forum access log
public class LogParser {
    public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyyMMddHHmmss");

    public static void main(String[] args) throws ParseException {
        final String S1 = "27.19.74.143 - - [30/May/2013:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
        LogParser parser = new LogParser();
        final String[] array = parser.parse(S1);
        System.out.println("Sample line: " + S1);
        System.out.format("Parsed result: ip=%s, time=%s, url=%s, status=%s, traffic=%s",
                array[0], array[1], array[2], array[3], array[4]);
    }

    /**
     * Parse the English-locale time string, e.g. 30/May/2013:17:38:20.
     *
     * @param string the raw time string taken from the log line
     * @return the parsed Date, or null if parsing fails
     */
    private Date parseDateFormat(String string) {
        Date parse = null;
        try {
            parse = FORMAT.parse(string);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return parse;
    }

    /**
     * Parse one log record.
     *
     * @param line a raw log line
     * @return an array of 5 elements: ip, time, url, status, traffic
     */
    public String[] parse(String line) {
        String ip = parseIP(line);
        String time = parseTime(line);
        String url = parseURL(line);
        String status = parseStatus(line);
        String traffic = parseTraffic(line);
        return new String[] { ip, time, url, status, traffic };
    }

    private String parseTraffic(String line) {
        final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
        String traffic = trim.split(" ")[1];
        return traffic;
    }

    private String parseStatus(String line) {
        final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
        String status = trim.split(" ")[0];
        return status;
    }

    private String parseURL(String line) {
        final int first = line.indexOf("\"");
        final int last = line.lastIndexOf("\"");
        String url = line.substring(first + 1, last);
        return url;
    }

    private String parseTime(String line) {
        final int first = line.indexOf("[");
        final int last = line.indexOf("+0800]");
        String time = line.substring(first + 1, last).trim();
        Date date = parseDateFormat(time);
        return dateformat1.format(date);
    }

    private String parseIP(String line) {
        String ip = line.split("- -")[0].trim();
        return ip;
    }
}
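Running the main method above against the hard-coded sample line should print something close to the following (reconstructed by hand from the parsing logic rather than captured from a real run):

Sample line: 27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
Parsed result: ip=27.19.74.143, time=20130530173820, url=GET /static/image/common/faq.gif HTTP/1.1, status=200, traffic=1127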
Next comes the core MapReduce part.
(2) Mapper class
Input: the key is the line offset in the file, the value is the raw log line.
Output: the key is the same line offset, the value is the cleaned record.
package com.simple.mr;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class LogCleanMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    LogParser logParser = new LogParser(); // parser for raw log lines
    Text outputValue = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        final String[] parsed = logParser.parse(value.toString()); // split into ip, time, url, status, traffic
        // Step 1: drop requests for static resources
        if (parsed[2].startsWith("GET /static/") || parsed[2].startsWith("GET /uc_server")) {
            return;
        }
        // Step 2: strip the leading "GET /" or "POST /" from the request
        if (parsed[2].startsWith("GET /")) {
            parsed[2] = parsed[2].substring("GET /".length());
        } else if (parsed[2].startsWith("POST /")) {
            parsed[2] = parsed[2].substring("POST /".length());
        }
        // Step 3: strip the trailing " HTTP/1.1"
        if (parsed[2].endsWith(" HTTP/1.1")) {
            parsed[2] = parsed[2].substring(0, parsed[2].length() - " HTTP/1.1".length());
        }
        // Step 4: emit only the first three fields: ip, time, url
        outputValue.set(parsed[0] + "\t" + parsed[1] + "\t" + parsed[2]);
        context.write(key, outputValue);
    }
}
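To see what the four steps actually do, take a hypothetical non-static request line (the URL and byte count here are invented purely for illustration):

27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /forum.php?mod=viewthread&tid=1 HTTP/1.1" 200 5114

Step 1 keeps it because the URL is not under /static/ or /uc_server; steps 2 and 3 strip the leading "GET /" and the trailing " HTTP/1.1"; step 4 then emits the tab-separated record:

27.19.74.143	20130530173820	forum.php?mod=viewthread&tid=1

The sample line built into LogParser.main, by contrast, requests /static/image/common/faq.gif, so the mapper would drop it in step 1.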
(3) Reducer class
This reducer does very little work of its own, but keeping a reduce phase is still useful: the shuffle sorts the mapper output by key before it reaches the reducer.
package com.simple.mr;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class LogCleanReducer extends Reducer<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void reduce(LongWritable k2, Iterable<Text> v2s, Context context)
            throws IOException, InterruptedException {
        // In this simple example every line offset maps to exactly one cleaned line, so the
        // iterator holds a single value; the loop is kept for the general case.
        for (Text v2 : v2s) {
            context.write(v2, NullWritable.get());
        }
    }
}
(4) Driver class
package com.simple.mr;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class LogCleanDriver extends Configured implements Tool {
    static final String INPUT_PATH = "hdfs://localhost:9000/access_2013_05_30.log"; // input path
    static final String OUT_PATH = "hdfs://localhost:9000/output";                  // output path

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            int res = ToolRunner.run(conf, new LogCleanDriver(), args);
            System.exit(res);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        // Remove the output directory if it already exists, otherwise the job refuses to start
        FileSystem fs = FileSystem.get(new URI(INPUT_PATH), getConf());
        Path outPath = new Path(OUT_PATH);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }

        final Job job = Job.getInstance(getConf(), "BBS forum log cleaning");
        // Allow the job to be packaged into a jar and run on the cluster
        job.setJarByClass(LogCleanDriver.class);
        job.setMapperClass(LogCleanMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(LogCleanReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // FileInputFormat.setInputPaths accepts either a String or Path objects, whereas
        // FileOutputFormat.setOutputPath only accepts a Path object, never a plain String.
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));

        boolean success = job.waitForCompletion(true);
        // Report whether the cleaning job succeeded or failed
        if (success) {
            System.out.println("Clean process success!");
        } else {
            System.out.println("Clean process failed!");
        }
        return success ? 0 : 1;
    }
}
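Assuming the project has been packaged into a jar (the name logclean.jar below is made up for illustration) and the log file has been uploaded to HDFS at the INPUT_PATH configured above, the job can be submitted and the result inspected roughly like this:

hdfs dfs -put access_2013_05_30.log /
hadoop jar logclean.jar com.simple.mr.LogCleanDriver
hdfs dfs -cat /output/part-r-00000 | head

Each line of the output file should contain the three tab-separated fields written by the mapper: ip, time, url.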