Migrating Big Data into HBase with a MapReduce (MR) Program
Approach:
1. If the raw data does not meet the format requirements, first preprocess it with an MR program.
2. Use a map-only MR job to process the prepared data and write the output in HFile format.
3. Use BulkLoad to move the generated HFiles from their HDFS output directory into the corresponding HBase table (the table must already exist in HBase; a minimal table-creation sketch follows this list).
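Since the bulk load targets an existing table, the table and its column family need to be created up front. Here is a minimal sketch, assuming the table name word_count and column family cf used in the sample code, the same ZooKeeper quorum placeholder, and the HBase 1.x client API; adjust these to your own cluster.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateWordCountTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Assumed ZooKeeper quorum, same placeholder as the sample code below.
        conf.set("hbase.zookeeper.quorum", "zk-02:2181,zk-03:2181,zk-01:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            TableName tableName = TableName.valueOf("word_count");
            if (!admin.tableExists(tableName)) {
                // One column family "cf", matching the family written by the HFile mapper.
                HTableDescriptor desc = new HTableDescriptor(tableName);
                desc.addFamily(new HColumnDescriptor("cf"));
                admin.createTable(desc);
            }
        }
    }
}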
Sample code:
package com.doit.hbase.bulkload;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/*
 * Two jobs:
 * the first job organizes the raw data (an ordinary word count writing text output);
 * the second job converts that output into HFiles, which are then bulk-loaded into the HBase table.
 */
public class GeneratePutHFileAndBulkLoadToHBase {

    /* Mapper for the actual business logic (word count). */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text wordText = new Text();
        private IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] wordArray = line.split(" ");
            for (String word : wordArray) {
                wordText.set(word);
                context.write(wordText, one);
            }
        }
    }

    /* Reducer for the actual business logic (word count). */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> valueList, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : valueList) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /*
     * The output types are fixed, while the input depends on the concrete data:
     * ImmutableBytesWritable is the HBase row key type and Put carries the row's content.
     * HFileOutputFormat2 turns this output into HFiles, the indexed file format HBase stores on HDFS.
     */
    public static class ConvertWordCountOutToHFileMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] wordCountArray = line.split("\t");
            String word = wordCountArray[0];
            int count = Integer.valueOf(wordCountArray[1]);

            // Build the HBase row key; here the word itself is used as the row key.
            byte[] rowKey = Bytes.toBytes(word);
            ImmutableBytesWritable rowKeyWritable = new ImmutableBytesWritable(rowKey); // row key
            byte[] family = Bytes.toBytes("cf");        // column family (create the table in HBase first)
            byte[] qualifier = Bytes.toBytes("count");  // column qualifier
            byte[] hbaseValue = Bytes.toBytes(count);   // column value

            // A Put can carry multiple columns of a column family; for a single column a KeyValue could be used instead:
            // KeyValue keyValue = new KeyValue(rowKey, family, qualifier, hbaseValue);
            Put put = new Put(rowKey);
            put.addColumn(family, qualifier, hbaseValue); // if several columns were parsed, call addColumn once per column
            context.write(rowKeyWritable, put);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration hadoopConfiguration = new Configuration();
        String[] dfsArgs = new GenericOptionsParser(hadoopConfiguration, args).getRemainingArgs();

        // The first job is an ordinary MR word count that writes to the given output directory.
        Job wcJob = Job.getInstance(hadoopConfiguration, "wordCountJob"); // new API
        // Job wcJob = new Job(hadoopConfiguration, "wordCountJob");      // old API
        wcJob.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        wcJob.setMapperClass(WordCountMapper.class);
        wcJob.setReducerClass(WordCountReducer.class);
        wcJob.setOutputKeyClass(Text.class);
        wcJob.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(wcJob, new Path(dfsArgs[0]));
        FileOutputFormat.setOutputPath(wcJob, new Path(dfsArgs[1]));

        // Submit the first job.
        if ("1".equals(dfsArgs[3]) || "all".equals(dfsArgs[3])) {
            wcJob.waitForCompletion(true);
        }

        // The second job takes the first job's output as input. Only a Mapper is needed:
        // it parses the first job's output and converts each record into the row-key/Put form HBase expects.
        Job toHFileJob = Job.getInstance(hadoopConfiguration);
        // Job toHFileJob = new Job(hadoopConfiguration, "wordCount_bulkload"); // old API
        toHFileJob.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        toHFileJob.setMapperClass(ConvertWordCountOutToHFileMapper.class);
        // No reducer class needs to be set: configureIncrementalLoad chooses KeyValueSortReducer
        // or PutSortReducer based on the map output value class.
        // toHFileJob.setReducerClass(KeyValueSortReducer.class);
        toHFileJob.setMapOutputKeyClass(ImmutableBytesWritable.class);
        toHFileJob.setMapOutputValueClass(Put.class);
        // The first job's output directory is the second job's input.
        FileInputFormat.addInputPath(toHFileJob, new Path(dfsArgs[1]));
        FileOutputFormat.setOutputPath(toHFileJob, new Path(dfsArgs[2]));

        // Create the HBase configuration and the target table handle.
        Configuration hbaseConfiguration = HBaseConfiguration.create();
        hbaseConfiguration.set("hbase.zookeeper.quorum", "zk-02:2181,zk-03:2181,zk-01:2181");
        Connection conn = ConnectionFactory.createConnection(hbaseConfiguration);
        HTable table = (HTable) conn.getTable(TableName.valueOf("word_count"));
        HFileOutputFormat2.configureIncrementalLoad(toHFileJob, table, table.getRegionLocator());

        // Submit the second job.
        if ("2".equals(dfsArgs[3]) || "all".equals(dfsArgs[3])) {
            toHFileJob.waitForCompletion(true);
        }

        // After the second job finishes, bulk-load its MR output into HBase.
        if ("load".equals(dfsArgs[3]) || "all".equals(dfsArgs[3])) {
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(hbaseConfiguration);
            // First argument: the second job's output directory (where the HFiles live); second argument: the target table.
            loader.doBulkLoad(new Path(dfsArgs[2]), table);
        }

        // Finally exit.
        System.exit(0);
    }
}
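The driver above expects four arguments: the raw input path, the word-count output path, the HFile output path, and a mode flag ("1", "2", "load", or "all") selecting which stage to run. Once the load stage has completed, the data can be read back through the normal client API. Below is a minimal verification sketch; the row key "hello" is only a hypothetical example word, while the table name, column family, qualifier, and ZooKeeper quorum match the sample code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyBulkLoad {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "zk-02:2181,zk-03:2181,zk-01:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("word_count"))) {
            // The row key is the word itself; "hello" is a hypothetical example.
            Result result = table.get(new Get(Bytes.toBytes("hello")));
            byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("count"));
            if (value != null) {
                // The count was written with Bytes.toBytes(int), so read it back as an int.
                System.out.println("hello -> " + Bytes.toInt(value));
            } else {
                System.out.println("row not found");
            }
        }
    }
}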
posted on 2018-08-14 15:59 by CodeArtist