Migrating Big Data into HBase with a MapReduce (MR) Program
Approach:
1. If the raw data does not meet the format requirements, first preprocess it with an MR program.
2. Use a map-only MR job to process the prepared data and write the output in HFile format.
3. Use BulkLoad to move the generated HFiles from their HDFS output directory into the corresponding HBase table (the table must already exist in HBase; a minimal table-creation sketch follows this list).
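Since the bulk load targets an existing table, the table and its column family need to be created up front. Here is a minimal sketch, assuming the table name word_count and column family cf used in the sample code, the same ZooKeeper quorum placeholder, and the HBase 1.x client API; adjust these to your own cluster.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateWordCountTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Assumed ZooKeeper quorum, same placeholder as the sample code below.
        conf.set("hbase.zookeeper.quorum", "zk-02:2181,zk-03:2181,zk-01:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            TableName tableName = TableName.valueOf("word_count");
            if (!admin.tableExists(tableName)) {
                // One column family "cf", matching the family written by the HFile mapper.
                HTableDescriptor desc = new HTableDescriptor(tableName);
                desc.addFamily(new HColumnDescriptor("cf"));
                admin.createTable(desc);
            }
        }
    }
}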
Sample code:
package com.doit.hbase.bulkload;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/*
 * Two jobs:
 * the first job organizes the raw data (an ordinary word count writing text output);
 * the second job converts that output into HFiles, which are then bulk-loaded into the HBase table.
 */
public class GeneratePutHFileAndBulkLoadToHBase {

    /* Mapper for the actual business logic (word count). */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text wordText = new Text();
        private IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] wordArray = line.split(" ");
            for (String word : wordArray) {
                wordText.set(word);
                context.write(wordText, one);
            }
        }
    }

    /* Reducer for the actual business logic (word count). */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> valueList, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : valueList) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /*
     * The output types are fixed, while the input depends on the concrete data:
     * ImmutableBytesWritable is the HBase row key type and Put carries the row's content.
     * HFileOutputFormat2 turns this output into HFiles, the indexed file format HBase stores on HDFS.
     */
    public static class ConvertWordCountOutToHFileMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] wordCountArray = line.split("\t");
            String word = wordCountArray[0];
            int count = Integer.valueOf(wordCountArray[1]);

            // Build the HBase row key; here the word itself is used as the row key.
            byte[] rowKey = Bytes.toBytes(word);
            ImmutableBytesWritable rowKeyWritable = new ImmutableBytesWritable(rowKey); // row key
            byte[] family = Bytes.toBytes("cf");        // column family (create the table in HBase first)
            byte[] qualifier = Bytes.toBytes("count");  // column qualifier
            byte[] hbaseValue = Bytes.toBytes(count);   // column value

            // A Put can carry multiple columns of a column family; for a single column a KeyValue could be used instead:
            // KeyValue keyValue = new KeyValue(rowKey, family, qualifier, hbaseValue);
            Put put = new Put(rowKey);
            put.addColumn(family, qualifier, hbaseValue); // if several columns were parsed, call addColumn once per column
            context.write(rowKeyWritable, put);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration hadoopConfiguration = new Configuration();
        String[] dfsArgs = new GenericOptionsParser(hadoopConfiguration, args).getRemainingArgs();

        // The first job is an ordinary MR word count that writes to the given output directory.
        Job wcJob = Job.getInstance(hadoopConfiguration, "wordCountJob"); // new API
        // Job wcJob = new Job(hadoopConfiguration, "wordCountJob");      // old API
        wcJob.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        wcJob.setMapperClass(WordCountMapper.class);
        wcJob.setReducerClass(WordCountReducer.class);
        wcJob.setOutputKeyClass(Text.class);
        wcJob.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(wcJob, new Path(dfsArgs[0]));
        FileOutputFormat.setOutputPath(wcJob, new Path(dfsArgs[1]));

        // Submit the first job.
        if ("1".equals(dfsArgs[3]) || "all".equals(dfsArgs[3])) {
            wcJob.waitForCompletion(true);
        }

        // The second job takes the first job's output as input. Only a Mapper is needed:
        // it parses the first job's output and converts each record into the row-key/Put form HBase expects.
        Job toHFileJob = Job.getInstance(hadoopConfiguration);
        // Job toHFileJob = new Job(hadoopConfiguration, "wordCount_bulkload"); // old API
        toHFileJob.setJarByClass(GeneratePutHFileAndBulkLoadToHBase.class);
        toHFileJob.setMapperClass(ConvertWordCountOutToHFileMapper.class);
        // No reducer class needs to be set: configureIncrementalLoad chooses KeyValueSortReducer
        // or PutSortReducer based on the map output value class.
        // toHFileJob.setReducerClass(KeyValueSortReducer.class);
        toHFileJob.setMapOutputKeyClass(ImmutableBytesWritable.class);
        toHFileJob.setMapOutputValueClass(Put.class);
        // The first job's output directory is the second job's input.
        FileInputFormat.addInputPath(toHFileJob, new Path(dfsArgs[1]));
        FileOutputFormat.setOutputPath(toHFileJob, new Path(dfsArgs[2]));

        // Create the HBase configuration and the target table handle.
        Configuration hbaseConfiguration = HBaseConfiguration.create();
        hbaseConfiguration.set("hbase.zookeeper.quorum", "zk-02:2181,zk-03:2181,zk-01:2181");
        Connection conn = ConnectionFactory.createConnection(hbaseConfiguration);
        HTable table = (HTable) conn.getTable(TableName.valueOf("word_count"));
        HFileOutputFormat2.configureIncrementalLoad(toHFileJob, table, table.getRegionLocator());

        // Submit the second job.
        if ("2".equals(dfsArgs[3]) || "all".equals(dfsArgs[3])) {
            toHFileJob.waitForCompletion(true);
        }

        // After the second job finishes, bulk-load its MR output into HBase.
        if ("load".equals(dfsArgs[3]) || "all".equals(dfsArgs[3])) {
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(hbaseConfiguration);
            // First argument: the second job's output directory (where the HFiles live); second argument: the target table.
            loader.doBulkLoad(new Path(dfsArgs[2]), table);
        }

        // Finally exit.
        System.exit(0);
    }
}
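The driver above expects four arguments: the raw input path, the word-count output path, the HFile output path, and a mode flag ("1", "2", "load", or "all") selecting which stage to run. Once the load stage has completed, the data can be read back through the normal client API. Below is a minimal verification sketch; the row key "hello" is only a hypothetical example word, while the table name, column family, qualifier, and ZooKeeper quorum match the sample code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyBulkLoad {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "zk-02:2181,zk-03:2181,zk-01:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("word_count"))) {
            // The row key is the word itself; "hello" is a hypothetical example.
            Result result = table.get(new Get(Bytes.toBytes("hello")));
            byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("count"));
            if (value != null) {
                // The count was written with Bytes.toBytes(int), so read it back as an int.
                System.out.println("hello -> " + Bytes.toInt(value));
            } else {
                System.out.println("row not found");
            }
        }
    }
}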
posted on 2018-08-14 15:59 by CodeArtist