hadoop1-TopK问题实现之优化设计
此次map设计引入了一个自定义的list容器,使map输出时仅输出前几名即可。
下面直接给出程序代码(背景介绍可参考上一篇):
package test; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; public class TopK2 { //改进型map public static class Map extends Mapper<Object, Text, MyKey, NullWritable>{ private static MyList list = null; //初始化list,使用配置容量 protected void setup(Context context) throws IOException ,InterruptedException { list = new MyList(Integer.parseInt(context.getConfiguration().get("top_num"))); }; protected void map(Object key, Text value, Context context) throws java.io.IOException ,InterruptedException { try { list.add(Integer.parseInt(value.toString())); } catch (Exception e) { // TODO: handle exception return ; } }; //Map任务结束时执行 protected void cleanup(Context context) throws IOException ,InterruptedException { for (Integer item : list) { context.write(new MyKey(item), NullWritable.get()); } list.clear(); }; } public static class Reduce extends Reducer<MyKey, NullWritable, Text, NullWritable>{ private static Text k = new Text(); private static MyList list = null; //初始化list,使用配置容量 protected void setup(Context context) throws IOException ,InterruptedException { list = new MyList(Integer.parseInt(context.getConfiguration().get("top_num"))); }; protected void reduce(MyKey key, Iterable<NullWritable> values, Context context) throws IOException ,InterruptedException { //所得到的key是降序输出的,因为是自定义的key try { list.add(key.getNum()); } catch (Exception e) { // TODO: handle exception return ; } }; protected 
void cleanup(Context context) throws IOException ,InterruptedException { for (int i=0; i<list.size(); i++) { k.set(list.get(i)+"\t"+(i+1)); context.write(k, NullWritable.get()); } list.clear(); }; } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs(); if(otherArgs.length != 3){ System.err.println("Usage:TopK"); System.exit(2); } //参数3 为要获取的最大个数 conf.set("top_num", args[2]); Job job = new Job(conf, "TopK2"); job.setJarByClass(TopK2.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setMapOutputKeyClass(MyKey.class); job.setMapOutputValueClass(NullWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } private static class MyKey implements WritableComparable<MyKey>{ private int num; public int getNum() { return num; } public MyKey() { } public MyKey(int num) { super(); this.num = num; } @Override public void readFields(DataInput in) throws IOException { // TODO Auto-generated method stub num = in.readInt(); } @Override public void write(DataOutput out) throws IOException { // TODO Auto-generated method stub out.writeInt(num); } @Override public int compareTo(MyKey o) { // TODO Auto-generated method stub //反序输出 return o.num - this.num; } } private static class MyList extends ArrayList<Integer>{ //默认容量为5 private int cont = 5; public MyList(int num){ super(); this.cont = num; } public void add(int value){ //添加前判断,如果<cont 直接添加,不用判断 if(super.size() < cont){ super.add(value); }else{ //此处还可以进行优化,可以采用动态链表的形式 Collections.sort(this); if(value > this.get(0)){ this.set(0, value); } } } } }
计算结果:
计算结果与上一篇相同,此处不再赘述。