Handling multi-type output on the map side

Multi-type output

Multi-type output means that a MapReduce job no longer emits values of a single type.

What this example does:

Process a web-server access log with MapReduce to produce, for each client IP, all of the URLs it visited and the total size of the response content it received.

(1) The log object: LogWritable

package com.simple;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/*
 * Represents one log record, e.g.:
 * 199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
 * where:
 * 199.72.81.55                  client IP address
 * 01/Jul/1995:00:00:01 -0400    access timestamp
 * GET                           HTTP method (GET/POST)
 * /history/apollo/              URL requested by the client
 * 200                           response status code (another example: 404)
 * 6245                          size of the response content
 */
public class LogWritable implements Writable{
    private Text userIP;           // client IP address
    private Text timestamp;        // access timestamp
    private Text url;              // URL requested by the client

    private IntWritable status;    // HTTP status code
    private IntWritable responseSize;    // size of the response content

    public LogWritable() {
        this.userIP = new Text();
        this.timestamp = new Text();
        this.url = new Text();
        this.status = new IntWritable();
        this.responseSize = new IntWritable();
    }

    public void set(String userIP, String timestamp, String url, int status, int responseSize) {
        this.userIP.set(userIP);
        this.timestamp.set(timestamp);
        this.url.set(url);
        this.status.set(status);
        this.responseSize.set(responseSize);
    }
  
    public Text getUserIP() {
        return userIP;
    }
  
    public void setUserIP(Text userIP) {
        this.userIP = userIP;
    }
  
    public Text getTimestamp() {
        return timestamp;
    }
  
    public void setTimestamp(Text timestamp) {
        this.timestamp = timestamp;
    }
  
    public Text getUrl() {
        return url;
    }
  
    public void setUrl(Text url) {
        this.url = url;
    }
  
    public IntWritable getStatus() {
        return status;
    }
  
    public void setStatus(IntWritable status) {
        this.status = status;
    }
  
    public IntWritable getResponseSize() {
        return responseSize;
    }
  
    public void setResponseSize(IntWritable responseSize) {
        this.responseSize = responseSize;
    }
  
    // Serialization: write every field in a fixed order
    @Override
    public void write(DataOutput out) throws IOException {
        userIP.write(out);
        timestamp.write(out);
        url.write(out);
        status.write(out);
        responseSize.write(out);
    }
  
    // Deserialization: read the fields back in the same order
    @Override
    public void readFields(DataInput in) throws IOException {
        userIP.readFields(in);
        timestamp.readFields(in);
        url.readFields(in);
        status.readFields(in);
        responseSize.readFields(in);
    }    

}
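As a quick local sanity check of the Writable contract, the object can be round-tripped through Hadoop's in-memory buffers. This is a minimal sketch, not part of the original post; it assumes only that the Hadoop client jars are on the classpath:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class LogWritableRoundTrip {
    public static void main(String[] args) throws Exception {
        LogWritable original = new LogWritable();
        original.set("199.72.81.55", "01/Jul/1995:00:00:01 -0400",
                "/history/apollo/", 200, 6245);

        // Serialize into an in-memory buffer
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // Deserialize the same bytes into a fresh instance
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        LogWritable copy = new LogWritable();
        copy.readFields(in);

        System.out.println(copy.getUrl()); // expected: /history/apollo/
    }
}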

(2) For multi-type output, define a class that extends GenericWritable

package com.simple;

import org.apache.hadoop.io.GenericWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;

public class MultiValueWritable extends GenericWritable {

    // The map function emits either a log object (LogWritable, which carries
    // the URL) or a response size (IntWritable). GenericWritable uses the
    // index into this static array as the serialized reference to the type.
    private static Class[] CLASSES = { LogWritable.class, IntWritable.class };

    // A no-argument constructor is mandatory (used during deserialization)
    public MultiValueWritable() {}

    // Convenience constructor that wraps a value
    public MultiValueWritable(Writable w) {
        this.set(w);
    }

    @Override
    protected Class[] getTypes() {
        return CLASSES;
    }
}
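When GenericWritable serializes, it writes a single byte holding the index of the wrapped value's class within getTypes(), then delegates to the wrapped instance; readFields() reads the index back, instantiates that class, and fills it in. A minimal sketch of that round trip (not in the original post; it assumes the same DataOutputBuffer/DataInputBuffer helpers as the sketch above):

MultiValueWritable wrapped = new MultiValueWritable(new IntWritable(6245));
DataOutputBuffer out = new DataOutputBuffer();
wrapped.write(out); // writes the type index (1 for IntWritable), then the payload

DataInputBuffer in = new DataInputBuffer();
in.reset(out.getData(), out.getLength());
MultiValueWritable read = new MultiValueWritable();
read.readFields(in); // reads the index, instantiates IntWritable, fills it

System.out.println(read.get() instanceof IntWritable); // true

This is also why the order of CLASSES must be identical on the map and reduce sides: the byte index is the only type information on the wire.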

(3) The Mapper class

package com.simple;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class LogMapper extends Mapper<LongWritable, Text, Text, MultiValueWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String regexp = "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+)";

        Pattern pattern = Pattern.compile(regexp);
        Matcher matcher = pattern.matcher(value.toString());
        if (!matcher.matches()) {
            System.out.println("not a valid log record");
            return;
        }

        String ip = matcher.group(1);
        String timestamp = matcher.group(4);
        // group(5) is the full quoted request, e.g. GET /history/apollo/ HTTP/1.0
        String url = matcher.group(5);
        int status = Integer.parseInt(matcher.group(6));
        int responseSize = Integer.parseInt(matcher.group(7));

        LogWritable log = new LogWritable();
        log.set(ip, timestamp, url, status, responseSize);

        // The output key is the IP; the value may be either the log object
        // or the response size, both wrapped in MultiValueWritable
        context.write(new Text(ip), new MultiValueWritable(log));
        context.write(new Text(ip), new MultiValueWritable(new IntWritable(responseSize)));
    }

}
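The regular expression can be checked in isolation against the sample record from the LogWritable comment. This is a hypothetical standalone snippet (not in the original post), with regexp holding the same pattern string as in map():

String sample = "199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "
        + "\"GET /history/apollo/ HTTP/1.0\" 200 6245";
Matcher m = Pattern.compile(regexp).matcher(sample);
if (m.matches()) {
    System.out.println(m.group(1)); // 199.72.81.55
    System.out.println(m.group(4)); // 01/Jul/1995:00:00:01 -0400
    System.out.println(m.group(5)); // GET /history/apollo/ HTTP/1.0
    System.out.println(m.group(6)); // 200
    System.out.println(m.group(7)); // 6245
}

Note that matches() requires the pattern to consume the whole line, and that group(5) carries the full request (method, path, and protocol), not just the path.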

(4) The Reducer class: sum all response sizes for the same IP

package com.simple;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;

public class LogReducer extends Reducer<Text, MultiValueWritable, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<MultiValueWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        StringBuilder sb = new StringBuilder(); // collects every URL for this IP

        // The value wrapped in each MultiValueWritable may be either
        // a LogWritable or an IntWritable
        for (MultiValueWritable mvw : values) {
            // If it is an IntWritable it is a response size, so add it to the
            // total; otherwise it is a log object, so append its URL to sb
            Writable w = mvw.get(); // polymorphic unwrap

            if (w instanceof IntWritable) {
                // An IntWritable: accumulate the responseSize
                total += ((IntWritable) w).get();
            } else {
                // A LogWritable
                sb.append(((LogWritable) w).getUrl());
                sb.append("\t");
            }
        }

        // For this IP: which URLs it visited, and their total response size
        context.write(key, new Text(sb.toString() + ":" + total));
    }

}
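For a single occurrence of the sample record above, the default TextOutputFormat would therefore emit a tab-separated line shaped like this (illustrative, derived from the code rather than from an actual run):

199.72.81.55    GET /history/apollo/ HTTP/1.0   :6245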

(5) The driver class

package com.simple;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LogDriver {

    public static void main(String[] args) throws Exception {
        String input = "hdfs://localhost:9000/NASA_log_sample.txt";
        String output = "hdfs://localhost:9000/log-output";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "log analysis");

        job.setJarByClass(LogDriver.class);

        job.setMapperClass(LogMapper.class);
        job.setReducerClass(LogReducer.class);

        // Map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MultiValueWritable.class);

        // Reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
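One practical caveat: FileOutputFormat refuses to start the job if the output path already exists. A common guard, added here as a sketch of mine (not in the original post), is to delete a stale output directory right after creating the Job:

// Remove a leftover output directory so reruns do not fail
// with FileAlreadyExistsException (sketch, not in the original)
Path outPath = new Path(output);
org.apache.hadoop.fs.FileSystem fs = outPath.getFileSystem(conf);
if (fs.exists(outPath)) {
    fs.delete(outPath, true); // true = delete recursively
}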
