大数据学习之自定义输出 13

二：自定义输出

自定义输出

需求:过滤日志文件

把包含itstaredu的放在一个文件中 d:/itstaredu.log

把不包含itstaredu的放在另外一个文件 d:/other.log

1：自定义编写FileOutputFormate

package it.dawn.YARNPra.自定义.outputformat;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @author Dawn
 * @date 2019年5月11日23:45:47
 * @version 1.0
 * 类似自定义输入，根据源码自己写一个FileOutputFormat
 * 继承FileOutputFormat
 */
public class FuncFileOutputFormat extends FileOutputFormat<Text, NullWritable>{

	@Override
	public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job)
			throws IOException, InterruptedException {
		FileRecordWriter recordWriter = new FileRecordWriter(job);
		return recordWriter;
	}

}

2 : 自定义编写FileRecordWriter类

package it.dawn.YARNPra.自定义.outputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * @author Dawn
 * @date 2019年5月11日23:48:31
 * @version 1.0
 * 继承 RecordWriter
 */
public class FileRecordWriter extends RecordWriter<Text, NullWritable>{
	
	Configuration conf=null;
	FSDataOutputStream itstarlog=null;
	FSDataOutputStream otherlog=null;

	//1.定义数据输出路径
	public FileRecordWriter(TaskAttemptContext job) throws IOException {
		//获取配置信息
		conf=job.getConfiguration();
		
		//获取文件系统
		FileSystem fs=FileSystem.get(conf);
		
		//定义输出路径
		//默认就是那个我们很熟悉的part-r-00000。这里我们把它自定义成itstar.log  other.log
		itstarlog=fs.create(new Path("f:/temp/outputformateSelf/fileoutSelf1/itstar.log"));
		otherlog=fs.create(new Path("f:/temp/outputformateSelf/fileoutSelf2/other.log"));
	}
	
	//2.数据输出
	@Override
	public void write(Text key, NullWritable value) throws IOException, InterruptedException {
		//判断的话根据key
		if(key.toString().contains("itstar")) {
			//写出到文件
			itstarlog.write(key.getBytes());
		}else {
			otherlog.write(key.getBytes());
		}
		
	}

	//3.关闭资源
	@Override
	public void close(TaskAttemptContext context) throws IOException, InterruptedException {
		if(null != itstarlog) {
			itstarlog.close();
		}
		
		if(null != otherlog) {
			otherlog.close();
		}
		
	}

}

3：编写MR

mapper

package it.dawn.YARNPra.自定义.outputformat;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * @author Dawn
 * @date 2019年5月11日23:58:27
 * @version 1.0
 * 直接代码一把梭，写出去
 */
public class FileMapper extends Mapper<LongWritable, Text, Text, NullWritable>{

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		context.write(value, NullWritable.get());
	}
	
	

}

Reduce：

package it.dawn.YARNPra.自定义.outputformat;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FileReducer extends Reducer<Text, NullWritable, Text, NullWritable>{

	@Override
	protected void reduce(Text key, Iterable<NullWritable> values,
				Context context) throws IOException, InterruptedException {
		//换个行吧！
		String k = key.toString()+"\n";
		
		context.write(new Text(k), NullWritable.get());
	}
	
	

}

Driver类：

package it.dawn.YARNPra.自定义.outputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @author Dawn
 * @date 2019年5月12日00:03:03
 * @version 1.0
 * 
 * 这里大家可能有个小疑问？
 * 就是我们已近在自定义输出的时候，已经指定了输出位置。为什么我们这里还是要写输出位置？
 * 
 * 大家可以这样想下，就是我们不进行自定义输出的时候，是不是每次任务之后，
 * 会出现一大堆的文件 ._SUCCESS.crc  .part-r-00000.crc _SUCCESS  part-r-00000这4个的嘛。
 * 而我们再自己写的自定义输出的时候，其实只是对part-r-00000文件指定了位置，而其他的什么 ._SUCCESS.crc ...这些没做处理啊！！
 * 
 */
public class FileDriver {
	
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		// 1.获取job信息
		Configuration conf = new Configuration();
		Job job=Job.getInstance(conf);
		
		// 2.获取jar包
		job.setJarByClass(FileDriver.class);
		
		// 3.获取自定义的mapper与reducer类
		job.setMapperClass(FileMapper.class);
		job.setReducerClass(FileReducer.class);
		
		// 4.设置map输出的数据类型
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(NullWritable.class);
		
		// 5.设置reduce输出的数据类型（最终的数据类型）
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		
		//设置自定outputFormat
		job.setOutputFormatClass(FuncFileOutputFormat.class);
		
		// 6.设置输入存在的路径与处理后的结果路径
		FileInputFormat.setInputPaths(job, new Path("f:/temp/流量日志.dat"));
		FileOutputFormat.setOutputPath(job, new Path("f:/temp/outputformateSelf"));
		
		// 7.提交任务
		boolean rs = job.waitForCompletion(true);
		System.out.println(rs? "成功":"失败");
	}

}