1 import java.io.IOException;
2
3 import org.apache.hadoop.conf.Configuration;
4 import org.apache.hadoop.conf.Configured;
5 import org.apache.hadoop.fs.Path;
6 import org.apache.hadoop.io.LongWritable;
7 import org.apache.hadoop.io.Text;
8 import org.apache.hadoop.mapreduce.Job;
9 import org.apache.hadoop.mapreduce.Mapper;
10 import org.apache.hadoop.mapreduce.Reducer;
11 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
13 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
14 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
15 import org.apache.hadoop.util.Tool;
16 import org.apache.hadoop.util.ToolRunner;
17 public class Dedpu extends Configured implements Tool {
18 /**
19 * 数据去重
20 * 数据样例:
21 * 输入数据
22 * 2006-6-9 a
23 * 2006-6-10 b
24 * 2006-6-9 a
25 * 结果数据
26 * 2006-6-9 a
27 * 2006-6-10 b
28 * 设计思路:
29 * Map阶段 <时间,字符>
30 * Reduce阶段输入<时间,list<字符>>,去除重复的字符,输出
31 *
32 * **/
33 public static class Map extends Mapper<LongWritable,Text,Text,Text>{
34 public void map(LongWritable key,Text value,Context context)throws IOException, InterruptedException{
35 String line=value.toString();
36 Text myvalue=new Text("");
37 context.write(new Text(line), myvalue);
38 // StringTokenizer tokenizer=new StringTokenizer(line);
39 // String datestr="",datastr="";
40 // while(tokenizer.hasMoreTokens())
41 // {
42 // datestr=tokenizer.nextToken();
43 // datastr=tokenizer.nextToken();
44 // context.write(new Text(datestr), new Text(datastr));
45 //
46 // }
47 }
48
49 }
50
51 public static class Reduce extends Reducer<Text,Text,Text,Text>{
52 public void reduce(Text key,Iterable<Text>values,Context context)throws IOException,InterruptedException{
53
54 context.write(key, new Text(""));
55 // ArrayList arr=new ArrayList();
56 // Text mykey=key;
57 // for(Text txt:values)
58 // {
59 //
60 // if(!arr.contains(txt.toString())){
61 // arr.add(txt.toString());
62 // }
63 //
64 //
65 // }
66 // for(int i=0;i<arr.size();i++){
67 // context.write(mykey, new Text(arr.get(i).toString()));
68 //
69 // }
70
71
72
73 }
74
75 }
76
77 public int run(String[] args)throws Exception
78 {
79 Configuration conf=new Configuration();
80 Job job=new Job(conf,"Data Depution");
81 job.setJarByClass(Dedpu.class);
82
83 job.setMapperClass(Map.class);
84 job.setCombinerClass(Reduce.class);
85 job.setReducerClass(Reduce.class);
86
87 job.setOutputKeyClass(Text.class);
88 job.setOutputValueClass(Text.class);
89
90 job.setInputFormatClass(TextInputFormat.class);
91 job.setOutputFormatClass(TextOutputFormat.class);
92
93 FileInputFormat.setInputPaths(job, new Path(args[0]));
94 FileOutputFormat.setOutputPath(job, new Path(args[1]));
95
96 boolean success=job.waitForCompletion(true);
97 return success?0:1;
98
99 }
100
101 public static void main(String[] args) throws Exception{
102 int ret=ToolRunner.run(new Dedpu(), args);
103 System.exit(ret);
104 }
105 }