map/reduce实现数据去重

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
 17 public class Dedpu extends Configured implements Tool {
 18 /**
 19  * 数据去重
 20  * 数据样例：
 21  * 输入数据
 22  * 2006-6-9 a
 23  * 2006-6-10 b
 24  * 2006-6-9 a
 25  * 结果数据
 26  * 2006-6-9 a
 27  * 2006-6-10 b
 28  * 设计思路：
 29  * Map阶段 <时间，字符>
 30  * Reduce阶段输入<时间，list<字符>>，去除重复的字符，输出
 31  * 
 32  * **/
 33     public static class Map extends Mapper<LongWritable,Text,Text,Text>{
 34         public void map(LongWritable key,Text value,Context context)throws IOException, InterruptedException{
 35             String line=value.toString();
 36             Text myvalue=new Text("");
 37             context.write(new Text(line), myvalue);
 38 //            StringTokenizer tokenizer=new StringTokenizer(line);
 39 //            String datestr="",datastr="";
 40 //            while(tokenizer.hasMoreTokens())
 41 //            {
 42 //                datestr=tokenizer.nextToken();
 43 //                datastr=tokenizer.nextToken();
 44 //                context.write(new Text(datestr), new Text(datastr));
 45 //                
 46 //            }
 47         }
 48         
 49     }
 50     
 51     public static class Reduce extends Reducer<Text,Text,Text,Text>{
 52         public void reduce(Text key,Iterable<Text>values,Context context)throws IOException,InterruptedException{
 53 
 54             context.write(key, new Text(""));
 55 //            ArrayList  arr=new ArrayList();
 56 //            Text mykey=key;
 57 //            for(Text txt:values)
 58 //            {
 59 //                
 60 //                if(!arr.contains(txt.toString())){
 61 //                    arr.add(txt.toString());
 62 //                }
 63 //                    
 64 //                
 65 //            }
 66 //            for(int i=0;i<arr.size();i++){
 67 //                context.write(mykey, new Text(arr.get(i).toString()));
 68 //                
 69 //            }
 70             
 71         
 72             
 73         }
 74         
 75     }
 76     
 77     public int run(String[] args)throws Exception
 78     {
 79         Configuration conf=new Configuration();
 80         Job job=new Job(conf,"Data Depution");
 81         job.setJarByClass(Dedpu.class);
 82         
 83         job.setMapperClass(Map.class);
 84         job.setCombinerClass(Reduce.class);
 85         job.setReducerClass(Reduce.class);
 86         
 87         job.setOutputKeyClass(Text.class);
 88         job.setOutputValueClass(Text.class);
 89         
 90         job.setInputFormatClass(TextInputFormat.class);
 91         job.setOutputFormatClass(TextOutputFormat.class);
 92         
 93         FileInputFormat.setInputPaths(job, new Path(args[0]));
 94         FileOutputFormat.setOutputPath(job, new Path(args[1]));
 95         
 96         boolean success=job.waitForCompletion(true);
 97         return success?0:1;
 98         
 99     }
100     
101     public static void main(String[] args) throws Exception{
102         int ret=ToolRunner.run(new Dedpu(), args);
103         System.exit(ret);
104     }
105 }
posted @ 2014-07-07 10:29 林六天 阅读(413) 评论(0) 收藏 举报
刷新页面 返回顶部
林六天

生命不息，学习不止

map/reduce实现数据去重

公告