mapreduce自定义分组(map端1.4)
map端分组是指把 key 相同的记录的 value 归入同一组。对如下原始数据进行分组,分组规则是:第一列相同的行归为一组,并求出该组中第二列的最小值。
3 3
3 2
3 1
2 2
2 1
1 1
-----------------期望输出
1 1
2 1
3 1
-------------------
在mapreduce自定义排序(map端1.4步)基础上执行分组:
由于业务要求分组是按照第一列分组,但是NewK2的比较规则决定了不能按照第一列分。只能自定义分组比较器。
job.setGroupingComparatorClass(MyGroupingComparator.class);
-------------------
/**
 * Groups NewK2 keys by their first column only, so that one reduce() call
 * sees all values sharing the same first column (NewK2.compareTo would
 * otherwise also distinguish keys by the second column).
 */
static class MyGroupingComparator implements RawComparator<NewK2>{
	/**
	 * Object comparison: equal iff the first columns are equal.
	 * Uses Long.compare instead of casting (a - b) to int, which
	 * overflows and returns the wrong sign for large differences.
	 */
	@Override
	public int compare(NewK2 o1, NewK2 o2) {
		return Long.compare(o1.first, o2.first);
	}
	/**
	 * Raw byte comparison of serialized keys.
	 *
	 * @param arg0 first key's byte array
	 * @param arg1 start offset of the first key within arg0
	 * @param arg2 length of the first key's serialized form
	 * @param arg3 second key's byte array
	 * @param arg4 start offset of the second key within arg3
	 * @param arg5 length of the second key's serialized form
	 */
	@Override
	public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
			int arg4, int arg5) {
		// Compare only the first 8 bytes: NewK2.write() emits the first
		// column as a long (8 bytes) followed by the second column.
		return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);
	}
}
----------------------------
/**
 * Parses one tab-separated input line "first\tsecond" and emits
 * &lt;NewK2(first, second), second&gt; so the framework can sort/group the keys.
 */
static class MyMapper extends Mapper<LongWritable, Text, NewK2, LongWritable>{
	protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,NewK2,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {
		// Parse each column exactly once.
		final String[] columns = value.toString().split("\t");
		final long firstCol = Long.parseLong(columns[0]);
		final long secondCol = Long.parseLong(columns[1]);
		context.write(new NewK2(firstCol, secondCol), new LongWritable(secondCol));
	}
}
/**
 * Receives one group per distinct first column (thanks to the grouping
 * comparator) and writes &lt;first, min(second)&gt; for that group.
 */
static class MyReducer extends Reducer<NewK2, LongWritable, LongWritable, LongWritable>{
	protected void reduce(NewK2 k2, java.lang.Iterable<LongWritable> v2s, org.apache.hadoop.mapreduce.Reducer<NewK2,LongWritable,LongWritable,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {
		// Fold the group's values down to their minimum.
		long smallest = Long.MAX_VALUE;
		for (LongWritable candidate : v2s) {
			smallest = Math.min(smallest, candidate.get());
		}
		context.write(new LongWritable(k2.first), new LongWritable(smallest));
	}
}
package group;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.util.Objects;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * MapReduce job that groups tab-separated "first\tsecond" rows by the first
 * column and outputs the minimum of the second column per group.
 */
public class GroupApp {
	static final String INPUT_PATH = "hdfs://mlj:9000/sort";
	static final String OUT_PATH = "hdfs://mlj:9000/sort_out";

	public static void main(String[] args) throws Exception {
		final Configuration configuration = new Configuration();

		// Delete a stale output directory so the job can be re-run.
		final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);
		if (fileSystem.exists(new Path(OUT_PATH))) {
			fileSystem.delete(new Path(OUT_PATH), true);
		}

		// NOTE(review): new Job(...) is the old-API constructor; on Hadoop 2+
		// prefer Job.getInstance(configuration, name) — kept for compatibility.
		final Job job = new Job(configuration, GroupApp.class.getSimpleName());

		// 1.1 input path and input format
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		job.setInputFormatClass(TextInputFormat.class);

		// 1.2 mapper and its <k2,v2> output types
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(NewK2.class);
		job.setMapOutputValueClass(LongWritable.class);

		// 1.3 partitioner and reduce-task count
		job.setPartitionerClass(HashPartitioner.class);
		job.setNumReduceTasks(1);

		// 1.4 sort/group: group only by the first column, even though
		// NewK2.compareTo also orders by the second column.
		job.setGroupingComparatorClass(MyGroupingComparator.class);

		// 1.5 (optional) combiner — intentionally not set

		// 2.2 reducer and its <k3,v3> output types
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(LongWritable.class);

		// 2.3 output path and output format
		FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
		job.setOutputFormatClass(TextOutputFormat.class);

		// Submit the job and wait for completion.
		job.waitForCompletion(true);
	}

	/**
	 * Parses one "first\tsecond" line and emits <NewK2(first, second), second>.
	 */
	static class MyMapper extends Mapper<LongWritable, Text, NewK2, LongWritable> {
		protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, NewK2, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
			final String[] splited = value.toString().split("\t");
			final long first = Long.parseLong(splited[0]);
			final long second = Long.parseLong(splited[1]);
			context.write(new NewK2(first, second), new LongWritable(second));
		}
	}

	/**
	 * One call per distinct first column (grouping comparator); writes
	 * <first, min(second)> for the group.
	 */
	static class MyReducer extends Reducer<NewK2, LongWritable, LongWritable, LongWritable> {
		protected void reduce(NewK2 k2, java.lang.Iterable<LongWritable> v2s, org.apache.hadoop.mapreduce.Reducer<NewK2, LongWritable, LongWritable, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
			long min = Long.MAX_VALUE;
			for (LongWritable v2 : v2s) {
				min = Math.min(min, v2.get());
			}
			context.write(new LongWritable(k2.first), new LongWritable(min));
		}
	}

	/**
	 * Composite key wrapping both columns, because the original v2 cannot
	 * take part in sorting; packing k2 and v2 into one key makes the
	 * framework sort on both columns.
	 */
	static class NewK2 implements WritableComparable<NewK2> {
		Long first;
		Long second;

		public NewK2() {}

		public NewK2(long first, long second) {
			this.first = first;
			this.second = second;
		}

		@Override
		public void readFields(DataInput in) throws IOException {
			this.first = in.readLong();
			this.second = in.readLong();
		}

		@Override
		public void write(DataOutput out) throws IOException {
			out.writeLong(first);
			out.writeLong(second);
		}

		/**
		 * Sort ascending by the first column, then by the second.
		 * Long.compare replaces the original (int)(a - b), whose cast
		 * overflows and yields the wrong sign for large differences.
		 */
		@Override
		public int compareTo(NewK2 o) {
			final int byFirst = Long.compare(this.first, o.first);
			if (byFirst != 0) {
				return byFirst;
			}
			return Long.compare(this.second, o.second);
		}

		@Override
		public int hashCode() {
			return Objects.hash(first, second);
		}

		/**
		 * Value equality on both columns. The original compared the boxed
		 * Long fields with ==, which is reference identity and only
		 * accidentally correct inside the small-value cache.
		 */
		@Override
		public boolean equals(Object obj) {
			if (!(obj instanceof NewK2)) {
				return false;
			}
			NewK2 other = (NewK2) obj;
			return Objects.equals(this.first, other.first)
					&& Objects.equals(this.second, other.second);
		}
	}

	/**
	 * Groups keys by the first column only; required because NewK2's natural
	 * ordering also distinguishes keys by the second column.
	 */
	static class MyGroupingComparator implements RawComparator<NewK2> {

		@Override
		public int compare(NewK2 o1, NewK2 o2) {
			// Long.compare avoids int-cast overflow of (a - b).
			return Long.compare(o1.first, o2.first);
		}

		/**
		 * Raw comparison of serialized keys.
		 *
		 * @param arg0 first key's byte array
		 * @param arg1 start offset of the first key within arg0
		 * @param arg2 length of the first key's serialized form
		 * @param arg3 second key's byte array
		 * @param arg4 start offset of the second key within arg3
		 * @param arg5 length of the second key's serialized form
		 */
		@Override
		public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
				int arg4, int arg5) {
			// Compare only the leading 8 bytes: the first column is written
			// first as a long by NewK2.write().
			return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);
		}
	}
}

浙公网安备 33010602011771号