HBase Java API

I. Viewing ZooKeeper from Java

1. Install the ZooKeeper plugin in IDEA

2. Start the cluster

3. Connect to the quorum and browse the znodes (a minimal programmatic alternative is sketched below)
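If the IDE plugin is unavailable, the same znodes can be inspected with the plain ZooKeeper Java client. The sketch below is only illustrative: it assumes the quorum addresses used throughout this post and that the org.apache.zookeeper:zookeeper dependency is on the classpath.

import org.apache.zookeeper.ZooKeeper;

public class ZkBrowse {
    public static void main(String[] args) throws Exception {
        // Connect to the same quorum the HBase examples below use (addresses assumed)
        ZooKeeper zk = new ZooKeeper("master:2181,node1:2181,node2:2181", 30000, event -> {});
        // HBase registers itself under /hbase; list its child znodes
        for (String child : zk.getChildren("/hbase", false)) {
            System.out.println("/hbase/" + child);
        }
        zk.close();
    }
}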

II. HBase Java API

1. Create a new Maven project named hbase

2. Add the HBase dependencies to the project's pom.xml

 <dependencies>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.4.6</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.8.2</version>
        </dependency>
    </dependencies>
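The BulkLoading example in section IV also uses the MapReduce-side HBase classes (HFileOutputFormat2, KeyValueSortReducer, LoadIncrementalHFiles). In the 1.4.x line these ship in the hbase-server artifact, so a dependency along the following lines is likely needed as well (version assumed to match the client above):

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.4.6</version>
        </dependency>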


3. Create a connection to HBase and create a table

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

import java.io.IOException;

public class APIDemo1 {
    public static void main(String[] args) throws IOException {
        // Create the configuration
        Configuration conf = HBaseConfiguration.create();
        // Set the ZK quorum address; HBase is located through ZK
        conf.set("hbase.zookeeper.quorum","master:2181,node1:2181,node2:2181");
        // Create the connection
        Connection conn = ConnectionFactory.createConnection(conf);
        /**
         * Table operations: getAdmin
         */
        Admin admin = conn.getAdmin();
        // Create the textAPI table with column family cf1, keeping up to 3 versions

        HTableDescriptor textAPI = new HTableDescriptor(TableName.valueOf("textAPI"));
        // Create a column family descriptor
        HColumnDescriptor cf1 = new HColumnDescriptor("cf1");
        // Configure the column family
        cf1.setMaxVersions(3);
        // Add the column family to the textAPI table
        textAPI.addFamily(cf1);
        // Create the table
        admin.createTable(textAPI);
        /**
         * Data operations: getTable (see section III)
         */

        // Close resources
        admin.close();
        conn.close();
    }
}
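Re-running APIDemo1 once textAPI exists will fail with a TableExistsException. A minimal guard using the same Admin API (a sketch, not part of the original demo) would be:

        // Hypothetical guard around the createTable call above
        if (!admin.tableExists(TableName.valueOf("textAPI"))) {
            admin.createTable(textAPI);
        }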

III. Operating HBase with the Java API

Table operations with getAdmin

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;

public class APIDemo2 {
    Connection conn;
    Admin admin;

    @Before
    public void createConn() throws IOException {
        // Create the configuration
        Configuration conf = HBaseConfiguration.create();
        // Set the ZK quorum address
        conf.set("hbase.zookeeper.quorum","master:2181,node1:2181,node2:2181");
        // Create the connection
        conn= ConnectionFactory.createConnection(conf);
        // Create the Admin object
        admin = conn.getAdmin();
    }

    @Test
    /**
     *list
     */
    public void list() throws IOException {
        TableName[] tableNames = admin.listTableNames();
        for (TableName tableName : tableNames) {
            System.out.println(tableName.getNameAsString());
        }
    }

    @Test
    /**
     * create table
     */
    public void createTable() throws IOException {
        HTableDescriptor text = new HTableDescriptor(TableName.valueOf("text"));
        HColumnDescriptor info = new HColumnDescriptor("info");
        text.addFamily(info);
        admin.createTable(text);
    }

    @Test
    /**
     * drop table
     */
    public void dropTable() throws IOException {
        // TableName of the table to delete
        TableName text = TableName.valueOf("text");
        // Check whether the table exists first
        if (admin.tableExists(text)){
            // The table must be fully disabled before it can be deleted,
            // so call the synchronous disableTable rather than disableTableAsync
            admin.disableTable(text);
            admin.deleteTable(text);
        }
    }

    @Test
    /**
     * Modify the table schema:
     * for the text table, set the TTL of the info column family to 10000 seconds and add a new column family cf1
     */
    public void modifyTable() throws IOException {
        TableName textAPI = TableName.valueOf("text");
        // Get the table's existing descriptor
        HTableDescriptor tableDescriptor = admin.getTableDescriptor(textAPI);
        // Modify column family attributes on the existing descriptor
        HColumnDescriptor[] columnFamilies = tableDescriptor.getColumnFamilies();
        // Iterate over the existing column families
        for (HColumnDescriptor columnFamily : columnFamilies) {
            // Modify the existing info column family
            if ("info".equals(columnFamily.getNameAsString())){
                columnFamily.setTimeToLive(10000);
            }
        }
        HColumnDescriptor cf1 = new HColumnDescriptor("cf1");
        tableDescriptor.addFamily(cf1);
        admin.modifyTable(textAPI,tableDescriptor);
    }

    @After
    /**
     * Close resources
     */
    public void close() throws IOException {
        admin.close();
        conn.close();
    }
}

Data operations with getTable

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class Demo2API {

    Connection conn;
    Admin admin;

    @Before
    public void createConn() throws IOException {
        // 1. Create the configuration
        Configuration conf = HBaseConfiguration.create();
        // Set the ZK quorum address; HBase is located through ZK
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");

        // 2. Create the connection
        conn = ConnectionFactory.createConnection(conf);
        // 3. Create the Admin object
        admin = conn.getAdmin();
    }

    @Test
    /**
     * put: insert one row
     */
    public void put() throws IOException {
        Table testAPI = conn.getTable(TableName.valueOf("testAPI"));

        Put put = new Put("0002".getBytes());
        // Each addColumn is equivalent to inserting one column (one cell)
        put.addColumn("cf1".getBytes(), "name".getBytes(), "李四".getBytes());
        put.addColumn("cf1".getBytes(), "age".getBytes(), "23".getBytes());
        put.addColumn("cf1".getBytes(), "phone".getBytes(), "18888887".getBytes());

        testAPI.put(put);
    }

    @Test
    /**
     * get: fetch one row by rowkey
     */
    public void get() throws IOException {
        Table testAPI = conn.getTable(TableName.valueOf("testAPI"));

        Get get = new Get("0002".getBytes());

        Result rs = testAPI.get(get);
        // Get the rowkey; printing the raw byte[] only shows the array reference, Bytes.toString decodes it
        byte[] rk = rs.getRow();
        System.out.println(rk);
        System.out.println(Bytes.toString(rk));
        // Get a single cell value
        byte[] name = rs.getValue("cf1".getBytes(), "name".getBytes());
        System.out.println(name);
        System.out.println(Bytes.toString(name));

    }

    @Test
    /**
     * delete: delete one row
     */
    public void deleteLine() throws IOException {
        Delete delete = new Delete("1500100001".getBytes());
        Table student = conn.getTable(TableName.valueOf("student"));
        student.delete(delete);
    }

    @Test
    /**
     * putAll: read student records from a file and batch-write them into HBase
     */
    public void putAll() throws IOException {
        /**
         * Read the student records
         */

        // JUnit and a main method may run with different working directories,
        // so the relative path passed in here may need to be adjusted
        BufferedReader br = new BufferedReader(new FileReader("data/students.txt"));

        // Get a handle to the target HBase table (here split_table_test)
        Table student = conn.getTable(TableName.valueOf("split_table_test"));

        String line = null;

        // Collect Puts into a list for batched writes
        ArrayList<Put> puts = new ArrayList<>();
        int batchSize = 11;
        while ((line = br.readLine()) != null) {
            // Parse the record and build a Put
            String[] splits = line.split(",");
            String id = splits[0];
            String name = splits[1];
            String age = splits[2];
            String gender = splits[3];
            String clazz = splits[4];

            Put put = new Put(id.getBytes());
            byte[] info = "cf".getBytes();
            put.addColumn(info, "name".getBytes(), name.getBytes());
            put.addColumn(info, "age".getBytes(), age.getBytes());
            put.addColumn(info, "gender".getBytes(), gender.getBytes());
            put.addColumn(info, "clazz".getBytes(), clazz.getBytes());


            // Writing one Put per call would be very slow
//            student.put(put);

            // Add each Put to the puts list
            puts.add(put);
            // When the list reaches batchSize, call Table.put(List<Put>) to write the batch
            if (puts.size() == batchSize) {
                student.put(puts);
                // Clear the list
                puts.clear();
            }

        }
        System.out.println(puts.isEmpty());
        System.out.println(puts.size());
        // If the number of records is not a multiple of batchSize, the last few records may still be in the list,
        // so check whether puts is non-empty and flush the remainder to HBase
        if (!puts.isEmpty()) {
            student.put(puts);
        }


        br.close();


    }

    @Test
    /**
     * scan: fetch a range of rows
     * from the student table
     */
    public void getScan() throws IOException {
        Table student = conn.getTable(TableName.valueOf("student"));

        // A scan can restrict the rowkey range and/or limit the number of rows returned
        Scan scan = new Scan();
        scan.withStartRow("1500100100".getBytes());
        scan.withStopRow("1500100111".getBytes());
        scan.setLimit(10);

        for (Result rs : student.getScanner(scan)) {
            String id = Bytes.toString(rs.getRow());
            String name = Bytes.toString(rs.getValue("info".getBytes(), "name".getBytes()));
            String age = Bytes.toString(rs.getValue("info".getBytes(), "age".getBytes()));
            String gender = Bytes.toString(rs.getValue("info".getBytes(), "gender".getBytes()));
            String clazz = Bytes.toString(rs.getValue("info".getBytes(), "clazz".getBytes()));

            System.out.println(id + "," + name + "," + age + "," + gender + "," + clazz);
        }

    }

    @Test
    /**
     * CellUtil
     */
    public void scanWithCellUtil() throws IOException {
        Table student = conn.getTable(TableName.valueOf("student"));

        // A scan can restrict the rowkey range and/or limit the number of rows returned
        Scan scan = new Scan();
        scan.withStartRow("1500100990".getBytes());
//        scan.withStopRow("1500100111".getBytes());

        for (Result rs : student.getScanner(scan)) {
            String id = Bytes.toString(rs.getRow());
            System.out.print(id + " ");
            // List all cells of the row
            // and use CellUtil to extract each value,
            // so the row's structure does not need to be known in advance
            List<Cell> cells = rs.listCells();
            for (Cell cell : cells) {
                String value = Bytes.toString(CellUtil.cloneValue(cell));
                System.out.print(value + " ");
            }
            System.out.println();

        }
    }


    @After
    public void close() throws IOException {
        admin.close();
        conn.close();
    }

}

IV. HBase BulkLoading

Advantages

  1. Writing a huge amount of data into HBase through the normal write path is slow and puts heavy load on the regions. A more efficient and convenient approach is "Bulk Loading", i.e. the HFileOutputFormat class provided by HBase.
  2. It exploits the fact that HBase stores its data on HDFS in a specific file format: the job generates files in that format directly and then moves them into place, which makes it a way to load huge amounts of data quickly. Done with MapReduce, it is efficient and convenient, and it neither consumes region resources nor adds load to the region servers.

Limitations

  1. It is only suitable for an initial data load, i.e. the table is empty, or is empty before every load.
  2. The HBase cluster and the Hadoop cluster must be the same cluster, i.e. the HDFS that HBase runs on is the one the HFile-generating MapReduce job writes to.

Code

package com.shujia;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo10BulkLoading {
    public static class BulkLoadingMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String mdn = splits[0];
            String start_time = splits[1];
            // longitude
            String longitude = splits[4];
            // latitude
            String latitude = splits[5];

            String rowkey = mdn + "_" + start_time;

            KeyValue lg = new KeyValue(rowkey.getBytes(), "info".getBytes(), "lg".getBytes(), longitude.getBytes());
            KeyValue lt = new KeyValue(rowkey.getBytes(), "info".getBytes(), "lt".getBytes(), latitude.getBytes());

            context.write(new ImmutableBytesWritable(rowkey.getBytes()), lg);
            context.write(new ImmutableBytesWritable(rowkey.getBytes()), lt);

        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");


        // Create the Job instance
        Job job = Job.getInstance(conf);
        job.setJarByClass(Demo10BulkLoading.class);
        job.setJobName("Demo10BulkLoading");

        // Guarantee a global ordering of keys across reducers
        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);

        // Set the number of reducers
        job.setNumReduceTasks(4);
        // Configure the map task
        job.setMapperClass(BulkLoadingMapper.class);

        // Configure the reduce task
        // KeyValueSortReducer guarantees the KeyValues are sorted within each reducer
        job.setReducerClass(KeyValueSortReducer.class);

        // Input and output paths
        FileInputFormat.addInputPath(job, new Path("/data/DIANXIN/"));
        FileOutputFormat.setOutputPath(job, new Path("/data/hfile"));

        // Create the HBase connection
        Connection conn = ConnectionFactory.createConnection(conf);
        // create 'dianxin_bulk','info'
        // Get the dianxin_bulk table
        Table dianxin_bulk = conn.getTable(TableName.valueOf("dianxin_bulk"));
        // Get the region locator for the dianxin_bulk table
        RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf("dianxin_bulk"));
        // Use HFileOutputFormat2 to format the job output as HFiles
        HFileOutputFormat2.configureIncrementalLoad(job, dianxin_bulk, regionLocator);

        // Wait for the MapReduce job to finish
        job.waitForCompletion(true);

        // Load the generated HFiles into dianxin_bulk
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
        load.doBulkLoad(new Path("/data/hfile"), conn.getAdmin(), dianxin_bulk, regionLocator);

        /**
         *  create 'dianxin_bulk','info'
         *  hadoop jar HBaseJavaAPI10-1.0-jar-with-dependencies.jar com.shujia.Demo10BulkLoading
         */
    }
}

Notes

  1. The final output key/value types, whether emitted by the map or the reduce side, must be <ImmutableBytesWritable, KeyValue> or <ImmutableBytesWritable, Put>.
  2. Depending on whether the output value type is KeyValue or Put, the corresponding sorter reducer is KeyValueSortReducer or PutSortReducer (a Put-based variant is sketched after this list).
  3. In the MR example, HFileOutputFormat2.configureIncrementalLoad(job, dianxin_bulk, regionLocator) configures the job automatically. SimpleTotalOrderPartitioner first orders the keys globally and then partitions them to the reducers, so the [min, max] key ranges handled by different reducers never overlap. This matters because, when the data is loaded into HBase, the keys within a region must be strictly ordered.
  4. The generated HFiles are stored on HDFS, with one subdirectory per column family under the output path. Loading the HFiles into HBase essentially moves them into the HBase regions, after which the column-family subdirectories no longer contain the data; but you cannot simply mv the files yourself, because a plain move does not update HBase's metadata.
  5. The HFiles are loaded into HBase via the doBulkLoad method of LoadIncrementalHFiles.
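For illustration, a mapper emitting <ImmutableBytesWritable, Put> paired with PutSortReducer could look like the sketch below. This is a hypothetical variant of the BulkLoadingMapper above, not code from the original job; it reuses that example's input format and column names.

    // Extra imports on top of those already in Demo10BulkLoading:
    // import org.apache.hadoop.hbase.client.Put;
    // import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
    public static class BulkLoadingPutMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split(",");
            String rowkey = splits[0] + "_" + splits[1];

            // One Put carries both columns; PutSortReducer sorts the cells inside each Put
            Put put = new Put(rowkey.getBytes());
            put.addColumn("info".getBytes(), "lg".getBytes(), splits[4].getBytes());
            put.addColumn("info".getBytes(), "lt".getBytes(), splits[5].getBytes());
            context.write(new ImmutableBytesWritable(rowkey.getBytes()), put);
        }
    }
    // In main(), job.setMapOutputValueClass(Put.class) and job.setReducerClass(PutSortReducer.class)
    // would replace the KeyValue/KeyValueSortReducer pairing; the rest of the setup, including
    // configureIncrementalLoad, stays the same.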