Handling multiple output types on the map side
Multiple output types
"Multiple output types" means that the values emitted by a MapReduce job are no longer of a single type.
What this example does:
Run MapReduce over a web-server access log to compute, for each client (keyed by IP address; the original problem statement says "city"), all the URLs it accessed and the total size of the response content it received.
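For concreteness, here is a sketch of one input record and the final output line it would produce (the value layout follows from the reducer code below; fields are tab-separated, and the captured "url" is the whole quoted request string):

Input:
199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245

Output (IP, URLs accessed, :total response size):
199.72.81.55	GET /history/apollo/ HTTP/1.0	:6245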
(1) The log object: LogWritable
package com.simple;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/*
 * Represents one record of the access log, e.g.:
 * 199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
 * where:
 * 199.72.81.55                 client IP address
 * 01/Jul/1995:00:00:01 -0400   access timestamp
 * GET                          HTTP method (GET/POST)
 * /history/apollo/             requested URL
 * 200                          response status code (e.g. 200, 404)
 * 6245                         size of the response content
 */
public class LogWritable implements Writable{
private Text userIP; // client IP address
private Text timestamp; // access timestamp
private Text url; // requested URL
private IntWritable status; // response status code
private IntWritable responseSize; // size of the response content
public LogWritable() {
this.userIP = new Text();
this.timestamp = new Text();
this.url = new Text();
this.status = new IntWritable();
this.responseSize = new IntWritable();
}
public void set(String userIP, String timestamp, String url, int status, int responseSize) {
this.userIP.set(userIP);
this.timestamp.set(timestamp);
this.url.set(url);
this.status.set(status);
this.responseSize.set(responseSize);
}
public Text getUserIP() {
return userIP;
}
public void setUserIP(Text userIP) {
this.userIP = userIP;
}
public Text getTimestamp() {
return timestamp;
}
public void setTimestamp(Text timestamp) {
this.timestamp = timestamp;
}
public Text getUrl() {
return url;
}
public void setUrl(Text url) {
this.url = url;
}
public IntWritable getStatus() {
return status;
}
public void setStatus(IntWritable status) {
this.status = status;
}
public IntWritable getResponseSize() {
return responseSize;
}
public void setResponseSize(IntWritable responseSize) {
this.responseSize = responseSize;
}
// Serialization: write each field in a fixed order
@Override
public void write(DataOutput out) throws IOException {
userIP.write(out);
timestamp.write(out);
url.write(out);
status.write(out);
responseSize.write(out);
}
// Deserialization: read the fields back in the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
userIP.readFields(in);
timestamp.readFields(in);
url.readFields(in);
status.readFields(in);
responseSize.readFields(in);
}
}
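Because the fields are serialized by hand in a fixed order, it is worth sanity-checking the round trip. A minimal sketch, not part of the job itself (the class name is illustrative):

package com.simple;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
// Write a LogWritable to a byte buffer, read it back, and print the fields
public class LogWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        LogWritable original = new LogWritable();
        original.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400",
                "/history/apollo/", 200, 6245);
        // serialize into an in-memory buffer
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // deserialize into a fresh instance
        LogWritable copy = new LogWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.getUrl() + " " + copy.getResponseSize()); // /history/apollo/ 6245
    }
}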
(2) For multiple output types, define a class that extends GenericWritable
package com.simple;
import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
public class MultiValueWritable extends GenericWritable {
// The mapper emits either a LogWritable (which carries the URL) or an
// IntWritable (the response size), so both classes are registered here.
// GenericWritable serializes a value as the index of its class in this
// static array followed by the value itself, so the array order must not change.
private static Class[] CLASSES = { LogWritable.class, IntWritable.class };
// A no-argument constructor is required (Hadoop creates instances via reflection)
public MultiValueWritable() {}
public MultiValueWritable(Writable w) { // convenience constructor: wrap a value directly
this.set(w);
}
@Override
protected Class[] getTypes() {
return CLASSES;
}
}
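A quick illustration of how the wrapper behaves (the demo class name is illustrative). On the wire, GenericWritable writes the index of the wrapped value's class in the CLASSES array as a single byte, followed by the value's own serialization, which is why the array must stay identical on the map and reduce sides:

package com.simple;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
// Wrap a value, then unwrap it with get() and inspect its runtime type
public class MultiValueWritableDemo {
    public static void main(String[] args) {
        MultiValueWritable wrapped = new MultiValueWritable(new IntWritable(6245));
        Writable inner = wrapped.get(); // returns the wrapped value
        System.out.println(inner instanceof IntWritable); // true
        System.out.println(((IntWritable) inner).get()); // 6245
    }
}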
(3) The Mapper class
package com.simple;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class LogMapper extends Mapper<LongWritable,Text,Text,MultiValueWritable>{
// Compile the log-parsing regex once instead of once per record
private static final Pattern LOG_PATTERN = Pattern.compile(
"^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)");
@Override
protected void map(LongWritable key, Text value,Context context)
throws IOException, InterruptedException {
Matcher matcher = LOG_PATTERN.matcher(value.toString());
if(!matcher.matches()) {
System.out.println("不是一个有效的日志记录");
return;
}
String ip = matcher.group(1);
String timestamp = matcher.group(4);
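// note: group(5) is the whole quoted request string, e.g. "GET /history/apollo/ HTTP/1.0", not just the URL path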
String url = matcher.group(5);
int status = Integer.parseInt(matcher.group(6));
int responseSize = Integer.parseInt(matcher.group(7));
LogWritable log = new LogWritable();
log.set(ip, timestamp, url, status, responseSize);
// The output key is the IP; the value wraps either the log object or the response size
context.write(new Text(ip), new MultiValueWritable(log));
context.write(new Text(ip), new MultiValueWritable(new IntWritable(responseSize)));
}
}
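To see exactly what each capture group extracts (the mapper uses groups 1 and 4-7 and skips the two "-" fields), here is a minimal standalone sketch run against the sample record (class name is illustrative):

package com.simple;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
// Apply the log-parsing regex to the sample line and print the groups
public class LogRegexDemo {
    public static void main(String[] args) {
        String line = "199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "
                + "\"GET /history/apollo/ HTTP/1.0\" 200 6245";
        Pattern p = Pattern.compile(
                "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)");
        Matcher m = p.matcher(line);
        if (m.matches()) {
            System.out.println("ip        = " + m.group(1)); // 199.72.81.55
            System.out.println("timestamp = " + m.group(4)); // 01/Jul/1995:00:00:01 -0400
            System.out.println("request   = " + m.group(5)); // GET /history/apollo/ HTTP/1.0
            System.out.println("status    = " + m.group(6)); // 200
            System.out.println("size      = " + m.group(7)); // 6245
        }
    }
}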
(4) The Reducer class: for each client IP, collect its URLs and sum the total response size
package com.simple;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
public class LogReducer extends Reducer<Text, MultiValueWritable, Text, Text> {
private Text text = new Text();
@Override
protected void reduce(Text key, Iterable<MultiValueWritable> values, Context context)
throws IOException, InterruptedException {
int total = 0;
StringBuilder sb = new StringBuilder(); // accumulates the URLs accessed by this client
// Each MultiValueWritable received here wraps either a LogWritable or an IntWritable
for(MultiValueWritable mvw : values) {
Writable w = mvw.get(); // unwrap to the underlying Writable and dispatch on its runtime type
if(w instanceof IntWritable) {
// an IntWritable: accumulate the response size
total += ((IntWritable) w).get();
}else {
// a LogWritable: record the URL it carries
sb.append(((LogWritable)w).getUrl());
sb.append("\t");
}
}
// For this IP: the URLs it accessed, followed by the total size of its responses
text.set(sb.toString() + ":" + total);
context.write(key, text);
}
}
(5) The driver class
package com.simple;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LogDriver {
public static void main(String[] args) throws Exception {
String input = "hdfs://localhost:9000/NASA_log_sample.txt";
String output = "hdfs://localhost:9000/log-output";
Configuration conf = new Configuration();
Job job = Job.getInstance(conf,"日志分析");
job.setJarByClass(LogDriver.class);
job.setMapperClass(LogMapper.class);
job.setReducerClass(LogReducer.class);
// Map output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(MultiValueWritable.class);
// Final (reduce) output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// Submit the job and wait for it to finish
System.exit(job.waitForCompletion(true)?0:1);
}
}
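To try the job, package the classes into a jar and run it against the paths hardcoded above (the jar name is illustrative; part-r-00000 is the default output file of a single reducer):

hadoop jar log-analysis.jar com.simple.LogDriver
hdfs dfs -cat /log-output/part-r-00000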
