mapreduce自定义分组(map端1.4)
map端分组是指把 key 相同的记录的 value 归入同一组。对如下原始数据进行分组,分组规则是:第一列相同的行归为一组,并求出该组中第二列的最小值。
3 3
3 2
3 1
2 2
2 1
1 1
-----------------期望输出
1 1
2 1
3 1
-------------------
在mapreduce自定义排序(map端1.4步)基础上执行分组:
由于业务要求分组是按照第一列分组,但是NewK2的比较规则决定了不能按照第一列分。只能自定义分组比较器。
job.setGroupingComparatorClass(MyGroupingComparator.class);
-------------------
/**
 * Groups NewK2 keys by their first column only, so that one reduce() call
 * sees all values sharing the same first column (NewK2.compareTo would
 * otherwise also distinguish keys by the second column).
 */
static class MyGroupingComparator implements RawComparator<NewK2>{
	/**
	 * Object comparison: equal iff the first columns are equal.
	 * Uses Long.compare instead of casting (a - b) to int, which
	 * overflows and returns the wrong sign for large differences.
	 */
	@Override
	public int compare(NewK2 o1, NewK2 o2) {
		return Long.compare(o1.first, o2.first);
	}
	/**
	 * Raw byte comparison of serialized keys.
	 *
	 * @param arg0 first key's byte array
	 * @param arg1 start offset of the first key within arg0
	 * @param arg2 length of the first key's serialized form
	 * @param arg3 second key's byte array
	 * @param arg4 start offset of the second key within arg3
	 * @param arg5 length of the second key's serialized form
	 */
	@Override
	public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
			int arg4, int arg5) {
		// Compare only the first 8 bytes: NewK2.write() emits the first
		// column as a long (8 bytes) followed by the second column.
		return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);
	}
}
----------------------------
/**
 * Parses one tab-separated input line "first\tsecond" and emits
 * &lt;NewK2(first, second), second&gt; so the framework can sort/group the keys.
 */
static class MyMapper extends Mapper<LongWritable, Text, NewK2, LongWritable>{
	protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,NewK2,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {
		// Parse each column exactly once.
		final String[] columns = value.toString().split("\t");
		final long firstCol = Long.parseLong(columns[0]);
		final long secondCol = Long.parseLong(columns[1]);
		context.write(new NewK2(firstCol, secondCol), new LongWritable(secondCol));
	}
}
/**
 * Receives one group per distinct first column (thanks to the grouping
 * comparator) and writes &lt;first, min(second)&gt; for that group.
 */
static class MyReducer extends Reducer<NewK2, LongWritable, LongWritable, LongWritable>{
	protected void reduce(NewK2 k2, java.lang.Iterable<LongWritable> v2s, org.apache.hadoop.mapreduce.Reducer<NewK2,LongWritable,LongWritable,LongWritable>.Context context) throws java.io.IOException ,InterruptedException {
		// Fold the group's values down to their minimum.
		long smallest = Long.MAX_VALUE;
		for (LongWritable candidate : v2s) {
			smallest = Math.min(smallest, candidate.get());
		}
		context.write(new LongWritable(k2.first), new LongWritable(smallest));
	}
}
package group;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.util.Objects;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * MapReduce job that groups tab-separated "first\tsecond" rows by the first
 * column and outputs the minimum of the second column per group.
 */
public class GroupApp {
	static final String INPUT_PATH = "hdfs://mlj:9000/sort";
	static final String OUT_PATH = "hdfs://mlj:9000/sort_out";

	public static void main(String[] args) throws Exception {
		final Configuration configuration = new Configuration();

		// Delete a stale output directory so the job can be re-run.
		final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);
		if (fileSystem.exists(new Path(OUT_PATH))) {
			fileSystem.delete(new Path(OUT_PATH), true);
		}

		// NOTE(review): new Job(...) is the old-API constructor; on Hadoop 2+
		// prefer Job.getInstance(configuration, name) — kept for compatibility.
		final Job job = new Job(configuration, GroupApp.class.getSimpleName());

		// 1.1 input path and input format
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		job.setInputFormatClass(TextInputFormat.class);

		// 1.2 mapper and its <k2,v2> output types
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(NewK2.class);
		job.setMapOutputValueClass(LongWritable.class);

		// 1.3 partitioner and reduce-task count
		job.setPartitionerClass(HashPartitioner.class);
		job.setNumReduceTasks(1);

		// 1.4 sort/group: group only by the first column, even though
		// NewK2.compareTo also orders by the second column.
		job.setGroupingComparatorClass(MyGroupingComparator.class);

		// 1.5 (optional) combiner — intentionally not set

		// 2.2 reducer and its <k3,v3> output types
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(LongWritable.class);

		// 2.3 output path and output format
		FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
		job.setOutputFormatClass(TextOutputFormat.class);

		// Submit the job and wait for completion.
		job.waitForCompletion(true);
	}

	/**
	 * Parses one "first\tsecond" line and emits <NewK2(first, second), second>.
	 */
	static class MyMapper extends Mapper<LongWritable, Text, NewK2, LongWritable> {
		protected void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, NewK2, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
			final String[] splited = value.toString().split("\t");
			final long first = Long.parseLong(splited[0]);
			final long second = Long.parseLong(splited[1]);
			context.write(new NewK2(first, second), new LongWritable(second));
		}
	}

	/**
	 * One call per distinct first column (grouping comparator); writes
	 * <first, min(second)> for the group.
	 */
	static class MyReducer extends Reducer<NewK2, LongWritable, LongWritable, LongWritable> {
		protected void reduce(NewK2 k2, java.lang.Iterable<LongWritable> v2s, org.apache.hadoop.mapreduce.Reducer<NewK2, LongWritable, LongWritable, LongWritable>.Context context) throws java.io.IOException, InterruptedException {
			long min = Long.MAX_VALUE;
			for (LongWritable v2 : v2s) {
				min = Math.min(min, v2.get());
			}
			context.write(new LongWritable(k2.first), new LongWritable(min));
		}
	}

	/**
	 * Composite key wrapping both columns, because the original v2 cannot
	 * take part in sorting; packing k2 and v2 into one key makes the
	 * framework sort on both columns.
	 */
	static class NewK2 implements WritableComparable<NewK2> {
		Long first;
		Long second;

		public NewK2() {}

		public NewK2(long first, long second) {
			this.first = first;
			this.second = second;
		}

		@Override
		public void readFields(DataInput in) throws IOException {
			this.first = in.readLong();
			this.second = in.readLong();
		}

		@Override
		public void write(DataOutput out) throws IOException {
			out.writeLong(first);
			out.writeLong(second);
		}

		/**
		 * Sort ascending by the first column, then by the second.
		 * Long.compare replaces the original (int)(a - b), whose cast
		 * overflows and yields the wrong sign for large differences.
		 */
		@Override
		public int compareTo(NewK2 o) {
			final int byFirst = Long.compare(this.first, o.first);
			if (byFirst != 0) {
				return byFirst;
			}
			return Long.compare(this.second, o.second);
		}

		@Override
		public int hashCode() {
			return Objects.hash(first, second);
		}

		/**
		 * Value equality on both columns. The original compared the boxed
		 * Long fields with ==, which is reference identity and only
		 * accidentally correct inside the small-value cache.
		 */
		@Override
		public boolean equals(Object obj) {
			if (!(obj instanceof NewK2)) {
				return false;
			}
			NewK2 other = (NewK2) obj;
			return Objects.equals(this.first, other.first)
					&& Objects.equals(this.second, other.second);
		}
	}

	/**
	 * Groups keys by the first column only; required because NewK2's natural
	 * ordering also distinguishes keys by the second column.
	 */
	static class MyGroupingComparator implements RawComparator<NewK2> {

		@Override
		public int compare(NewK2 o1, NewK2 o2) {
			// Long.compare avoids int-cast overflow of (a - b).
			return Long.compare(o1.first, o2.first);
		}

		/**
		 * Raw comparison of serialized keys.
		 *
		 * @param arg0 first key's byte array
		 * @param arg1 start offset of the first key within arg0
		 * @param arg2 length of the first key's serialized form
		 * @param arg3 second key's byte array
		 * @param arg4 start offset of the second key within arg3
		 * @param arg5 length of the second key's serialized form
		 */
		@Override
		public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3,
				int arg4, int arg5) {
			// Compare only the leading 8 bytes: the first column is written
			// first as a long by NewK2.write().
			return WritableComparator.compareBytes(arg0, arg1, 8, arg3, arg4, 8);
		}
	}
}

浙公网安备 33010602011771号