MapReduce框架原理之OutputFormat数据输出

MapReduce 框架原理之 OutputFormat 数据输出

OutputFormat 接口实现类

OutputFormat 是 MapReduce 输出的基类,所有实现 MapReduce 输出都实现了 OutputFormat 接口。

1.OutputFormat 实现类

  • FileOutputFormat
  • SequenceFileOutputFormat
  • TextOutputFormat
  • DBOutputFormat

2.默认输出格式 TextOutputFormat

3.自定义 OutputFormat

  • 应用场景:

    例如:输出数据到 MySQL/HBase/Elasticsearch

  • 自定义 OutputFormat 步骤

    自定义一个类继承 FileOutputFormat

    改写 RecordWriter,具体改写输出数据的方法 write()

自定义 OutputFormat 案例实操

  1. 需求

    过滤输出的日志,包含 atguigu 网站字符的输出到 atguigu.log,不包含的输出到 other.log。

  2. 案例分析

  3. 案例实操

    • 编写 LogMapper 类

      package com.atguigu.mapreduce.outputformat;

      import org.apache.hadoop.io.LongWritable;
      import org.apache.hadoop.io.NullWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.Mapper;

      import java.io.IOException;

      /**
       * @author: fxl
       * @Description:
       * @Data:Create in  2021-11-08
       * @Modified By:
       */

      public class LogMapper extends Mapper<LongWritableTextTextNullWritable{
          @Override
          protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
              context.write(value, NullWritable.get());
          }
      }

    • 编写 LogReducer 类

      package com.atguigu.mapreduce.outputformat;

      import org.apache.hadoop.io.NullWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.Reducer;

      import java.io.IOException;

      public class LogReducer extends Reducer<TextNullWritable,TextNullWritable{
          @Override
          protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
              // 防止有相同的数据,迭代写出
              for (NullWritable value : values) {
                  context.write(key,NullWritable.get());
              }
          }
      }
    • 自定义 LogOutputFormat 类

      package com.atguigu.mapreduce.outputformat;

      import org.apache.hadoop.io.NullWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.RecordWriter;
      import org.apache.hadoop.mapreduce.TaskAttemptContext;
      import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

      import java.io.IOException;

      public class LogOutputFormat extends FileOutputFormat<TextNullWritable{
          @Override
          public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
              //创建一个自定义的RecordWriter返回
              LogRecordWriter logRecordWriter = new LogRecordWriter(job);
              return logRecordWriter;
          }
      }
    • 编写 LogRecordWriter 类

      package com.atguigu.mapreduce.outputformat;

      import org.apache.hadoop.fs.FSDataOutputStream;
      import org.apache.hadoop.fs.FileSystem;
      import org.apache.hadoop.fs.Path;
      import org.apache.hadoop.io.IOUtils;
      import org.apache.hadoop.io.NullWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.RecordWriter;
      import org.apache.hadoop.mapreduce.TaskAttemptContext;

      import java.io.IOException;
      import java.nio.charset.StandardCharsets;

      public class LogRecordWriter extends RecordWriter<TextNullWritable{

          private FSDataOutputStream atguiguOut;
          private FSDataOutputStream otherOut;

          public LogRecordWriter(TaskAttemptContext job) {
              try {
                  //获取文件系统对象
                  FileSystem fs = FileSystem.get(job.getConfiguration());
                  //用文件系统对象创建两个输出流对应不同的目录
                  atguiguOut = fs.create(new Path("d:/hadoop/atguigu.log"));
                  otherOut = fs.create(new Path("d:/hadoop/other.log"));
              } catch (IOException e) {
                  e.printStackTrace();
              }
          }

          @Override
          public void write(Text key, NullWritable value) throws IOException, InterruptedException {
              String log = key.toString();
              //根据一行的log数据是否包含atguigu,判断两条输出流输出的内容
              if (log.contains("atguigu")) {
                  atguiguOut.writeBytes(log + "\n");
              } else {
                  otherOut.writeBytes(log + "\n");
              }
          }

          @Override
          public void close(TaskAttemptContext context) throws IOException, InterruptedException {
              //关流
              IOUtils.closeStream(atguiguOut);
              IOUtils.closeStream(otherOut);
          }
      }
    • 编写 LogDriver 类

      package com.atguigu.mapreduce.outputformat;

      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.fs.Path;
      import org.apache.hadoop.io.NullWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.Job;
      import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
      import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

      import java.io.IOException;

      public class LogDriver {
          public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

              Configuration conf = new Configuration();
              Job job = Job.getInstance(conf);

              job.setJarByClass(LogDriver.class);
              job.setMapperClass(LogMapper.class);
              job.setReducerClass(LogReducer.class);

              job.setMapOutputKeyClass(Text.class);
              job.setMapOutputValueClass(NullWritable.class);

              job.setOutputKeyClass(Text.class);
              job.setOutputValueClass(NullWritable.class);

              //设置自定义的outputformat
              job.setOutputFormatClass(LogOutputFormat.class);

              FileInputFormat.setInputPaths(job, new Path("D:\\input"));
              //虽然我们自定义了outputformat,但是因为我们的outputformat继承自fileoutputformat
              //而fileoutputformat要输出一个_SUCCESS文件,所以在这还得指定一个输出目录
              FileOutputFormat.setOutputPath(job, new Path("D:\\logoutput"));

              boolean b = job.waitForCompletion(true);
              System.exit(b ? 0 : 1);
          }
      }
posted @ 2021-11-08 21:23  逆十字  阅读(58)  评论(0)    收藏  举报