
HBase MapReduce Demo

2014-07-21 17:08  programmer022

Read table data from HBase with MapReduce and count how many times each value of one table column appears (a word count over the logdata:TASK_NAME column). The full job is listed after the setup sketch below.
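
The job assumes that the table dsbtasklog already exists with a column family logdata holding a TASK_NAME qualifier. Here is a minimal setup sketch, assuming an HBase 0.96-era client (the HBaseAdmin/HTable/Put.add generation that matches the job's imports); the row key log-000001 and the value demo-task are made up for illustration:

package hadooptest;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class CreateDsbTaskLog {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create(); // reads hbase-site.xml from the classpath

        // Table 'dsbtasklog' with one column family 'logdata', as the job expects.
        HBaseAdmin admin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("dsbtasklog"));
        desc.addFamily(new HColumnDescriptor("logdata"));
        admin.createTable(desc);
        admin.close();

        // One sample row; the row key and value here are made up.
        HTable table = new HTable(conf, "dsbtasklog");
        Put put = new Put(Bytes.toBytes("log-000001"));
        put.add(Bytes.toBytes("logdata"), Bytes.toBytes("TASK_NAME"), Bytes.toBytes("demo-task"));
        table.put(put);
        table.close();
    }
}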

package hadooptest;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class HBaseLogCount {

    // Emits <TASK_NAME value, 1> for every row handed in by TableInputFormat.
    public static class TokenizerMapper extends Mapper<ImmutableBytesWritable, Result, Text, IntWritable> {

        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            byte[] tmp = value.getValue(Bytes.toBytes("logdata"), Bytes.toBytes("TASK_NAME"));
            if (tmp == null) {
                return; // skip rows without a logdata:TASK_NAME cell
            }
            word.set(tmp);
            // System.out.println(Bytes.toString(tmp)); // debug output, lands in the task logs
            context.write(word, one);
        }
    }

    // Standard word-count reducer: sums the per-value 1s emitted by the
    // mapper (also used as the combiner).
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {

        // Workarounds for launching from a Windows development box: force the
        // JDK's built-in XML parsers and point Hadoop at the winutils binaries.
        System.setProperty("javax.xml.parsers.DocumentBuilderFactory",
                "com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl");
        System.setProperty("javax.xml.parsers.SAXParserFactory",
                "com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl");
        System.setProperty("hadoop.home.dir", "D:\\hadoop-common-2.2.0-bin-master");
        Configuration conf = new Configuration();

        // Input comes from the HBase table, so the only argument is the
        // HDFS output directory.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 1) {
            System.err.println("Usage: HBaseLogCount <out>");
            System.exit(2);
        }

        // Cluster endpoints.
        String namenodeHost = "hadoop1.hwdomain";
        String namenodePort = "8020";
        String jobtrackerHost = "hadoop1.hwdomain";
        String jobtrackerPort = "10020";
        String fsDefaultName = "hdfs://" + namenodeHost + ":" + namenodePort;
        String jobTracker = jobtrackerHost + ":" + jobtrackerPort;

        // HBase connection: ZooKeeper quorum and this cluster's znode parent.
        conf.set("hbase.zookeeper.quorum", "hadoop1.hwdomain,hadoop2.hwdomain,hadoop3.hwdomain");
        conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, "/hbase-unsecure");
        conf.set("fs.default.name", fsDefaultName);
        conf.set("mapred.job.tracker", jobTracker);
        // Tell TableInputFormat which table to scan.
        conf.set(TableInputFormat.INPUT_TABLE, "dsbtasklog");

        Job job = Job.getInstance(conf, "hbase log count");
        job.setJarByClass(HBaseLogCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Read rows from HBase instead of files; only the output goes to HDFS.
        job.setInputFormatClass(TableInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[0]));
        // Block until the job finishes so the exit code reflects the result.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
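
For comparison, HBase also ships TableMapReduceUtil, which wires up the input side in one call: it sets TableInputFormat, serializes a column-restricted Scan into the configuration, and ships the HBase dependency jars with the job. Below is a minimal sketch of the same count written that way; the class name HBaseLogCount2 and the scan tuning values (caching of 500, block cache off) are illustrative, the mapper now extends TableMapper, and the reducer is reused from HBaseLogCount:

package hadooptest;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HBaseLogCount2 {

    // TableMapper<KEYOUT, VALUEOUT> is just
    // Mapper<ImmutableBytesWritable, Result, KEYOUT, VALUEOUT>.
    public static class TaskNameMapper extends TableMapper<Text, IntWritable> {

        private static final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context)
                throws IOException, InterruptedException {
            byte[] taskName = value.getValue(Bytes.toBytes("logdata"), Bytes.toBytes("TASK_NAME"));
            if (taskName != null) {
                word.set(taskName);
                context.write(word, one);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // Picks up hbase-site.xml from the classpath; set hbase.zookeeper.quorum
        // there (or via conf.set as in HBaseLogCount) if it is not on the classpath.
        Configuration conf = HBaseConfiguration.create();

        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes("logdata"), Bytes.toBytes("TASK_NAME")); // fetch only the counted column
        scan.setCaching(500);       // fewer scanner RPC round trips per mapper
        scan.setCacheBlocks(false); // a full MR scan should not churn the block cache

        Job job = Job.getInstance(conf, "hbase log count (TableMapReduceUtil)");
        job.setJarByClass(HBaseLogCount2.class);
        // Sets TableInputFormat, serializes the Scan, and adds dependency jars.
        TableMapReduceUtil.initTableMapperJob("dsbtasklog", scan,
                TaskNameMapper.class, Text.class, IntWritable.class, job);
        job.setCombinerClass(HBaseLogCount.IntSumReducer.class);
        job.setReducerClass(HBaseLogCount.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(args[0])); // single argument: HDFS output dir
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Either variant is packaged into a jar and launched with hadoop jar, passing the HDFS output directory as the argument; the output is one "value TAB count" line per distinct TASK_NAME.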