hive

下载好hive后解压到应用程序目录中

root@s100:/tools# tar  xf  apache-hive-2.1.1-bin.tar.gz   -C  /soft/
root@s100:/soft# ln  -s   /soft/apache-hive-2.1.1-bin  /soft/hive

设置hive环境变量

root@s100:/soft# vim  /etc/environment 
JAVA_HOME=/soft/jdk
HADOOP_HOME=/soft/hadoop
HIVE_HOME=/soft/hive
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/soft/jdk/bin:/soft/hadoop/bin:/soft/hadoop/sbin:/soft/hive/bin"
//使hive环境变量生效
root@s100:/soft# source  /etc/environment 
//验证hive安装是否成功
root@s100:/soft# hive  --version
Hive 2.1.1

由于hive需要hadoop的支持,所以在部署hive之前要先部署hadoop,然后配置hive关于hadoop的家目录。还要有hive-site.xml的配置文件

root@s100:/soft/hive/conf# cp  hive-env.sh.template  hive-env.sh
root@s100:/soft/hive/conf# vim  hive-env.sh
HADOOP_HOME=/soft/hadoop
root@s100:/soft/hive/conf# cp  hive-default.xml.template hive-site.xml

然后启动hive后报错,解决问题的方式为:1.初始化数据库,2.修改hive-site.xml配置文件

//初始化数据库,指定数据库为derby
root@s100:~# schematool   -initSchema -dbType derby
//修改hive-site.xml配置文件,将${system:java.io.tmpdir}全部修改成/root/hive;将${system:user.name}全部修改成ubuntu0
//修改完hive-site.xml配置文件后,创建配置文件中定义的目录
root@s100:~# mkdir  -p  hive/ubuntu0
//最后启动hive后就可以进入hive shell环境了,就可以像操作mysql数据库一样操作hive了
root@s100:~# hive
hive>

让hive连接外部的mysql服务器,让hive的metastore存储在mysql数据库中

 1.修改hive的hive-site.xml配置文件,修改hive的数据库连接信息

root@s100:/soft/hive/conf# vim  hive-site.xml
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://192.168.182.1:3306/myhive</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>Username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123</value>
    <description>password to use against metastore database</description>
  </property>

 2.导入mysql数据库的驱动到hive的classpath中

root@s100:/soft/hive# cd  lib/
#切换到hive的lib库目录中,上传mysql-connector-java-5.0.8-bin.jar到lib目录中即可

 3.删除以前derby数据库连接信息

root@s100:~# ls
derby.log  hadoop  hello.txt  hive  metastore_db
root@s100:~# rm  -fr  metastore_db
root@s100:~# hdfs  dfs -rm -r /user/hive
root@s100:~# hdfs  dfs -rm -r /tmp

 4.初始化hive连接mysql数据库

root@s100:~# schematool   -initSchema -dbType   mysql
Metastore connection URL:        jdbc:mysql://192.168.182.1:3306/myhive
Metastore Connection Driver :    com.mysql.jdbc.Driver
Metastore connection User:       root
Starting metastore schema initialization to 2.1.0
Initialization script hive-schema-2.1.0.mysql.sql
Initialization script completed
schemaTool completed

  连接外部的mysql数据库的时候可能会出现如下的错误:

org.apache.hadoop.hive.metastore.HiveMetaException: Failed to get schema version.
Underlying cause: java.sql.SQLException : null,  message from server: "Host '192.168.1.100' is not allowed to connect to this MySQL server"
SQL Error code: 1130

  这个错误表示连接mysql的权限有问题,解决办法为修改mysql的数据库访问权限

mysql> grant  all  on  myhive.* to  'root'@'%'  identified by  '123' with grant option;

 5.测试在hive中创建库、表后在mysql中是否能查询得到

hive> create database  hive1;
hive> create   table  users(id int,name string,age int);
#在mysql中查询hive创建的库和表还有字段
#查询数据库
mysql> select  *  from  dbs;
+-------+-----------------------+-----------------------------------------------+---------+------------+------------+
| DB_ID | DESC                  | DB_LOCATION_URI                               | NAME    | OWNER_NAME | OWNER_TYPE |
+-------+-----------------------+-----------------------------------------------+---------+------------+------------+
|     1 | Default Hive database | hdfs://s100:8020/user/hive/warehouse          | default | public     | ROLE       |
|     2 | NULL                  | hdfs://s100:8020/user/hive/warehouse/hive1.db | hive1   | root       | USER       |
+-------+-----------------------+-----------------------------------------------+---------+------------+------------+
#查询表
mysql> select  *  from  tbls;
+--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+
| TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE      | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT |
+--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+
|      1 |  1523896224 |     2 |                0 | root  |         0 |     1 | users    | MANAGED_TABLE | NULL               | NULL               |
+--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+
#查询字段
mysql> select  *  from  columns_v2;
+-------+---------+-------------+-----------+-------------+
| CD_ID | COMMENT | COLUMN_NAME | TYPE_NAME | INTEGER_IDX |
+-------+---------+-------------+-----------+-------------+
|     1 | NULL    | age         | int       |           2 |
|     1 | NULL    | id          | int       |           0 |
|     1 | NULL    | name        | string    |           1 |
+-------+---------+-------------+-----------+-------------+

通过hive创建表test,然后向这个表中添加数据

#创建表test结构如下:
hive> create  table  test(id  int,name string)
    > row format delimited
    > fields terminated by  ',' ;
#创建一个测试文件,作为数据插入到test表中
root@s100:~# vim   info
1,zhangsank
2,lisi
3,wangwu
4,wangerma
5,test
#向test表中插入数据info文件
hive> load data  local inpath  '/root/info' into  table  test;
#查看表test中的数据
hive> select *  from  test;
OK
1       zhangsank
2       lisi
3       wangwu
4       wangerma
5       test

我们知道hive存储的数据是放在hdfs分布式文件系统上的,hive的元数据存储在外部关系型数据库mysql中,hive存储在hdfs上的默认根位置为:/user/hive/warehouse/,创建的库、表在hdfs上的表现形式为文件夹,如上面创建的库hive1和表test在hdfs上的表现形式如下:

创建带有分区的分区表

hive> create table  data_partiton(id int,name  string)
    > partitioned  by(country string)
    > row  format  delimited
    > fields  terminated by ',';
#导入数据
hive> load data  local inpath  '/root/info' into  table data_partiton partition(country='china');
hive> load data  local inpath  '/root/info2' into  table data_partiton partition(country='amerecal');
#查看表data_partiton数据
hive> select  *  from  data_partiton;
OK
1       hadeng            amerecal
2       baoluo             amerecal
3       alizha              amerecal
4       gelin                amerecal
1       zhangsank       china
2       lisi                  china
3       wangwu           china
4       wangerma        china
5       test                  china

  在hdfs上存储的分区数据方式为:在country=amerecal目录中存储info2文件,在country=china目录中存储info文件

创建外部表,向外部表中插入的数据存储在指定的目录中,当删除这个外部表后,外部表中的数据不会被删除,还保留在指定的目录中,这点和普通表不同。

hive> create  external   table  fz_external_tb(id int,name string)
    > row  format delimited
    > fields  terminated by ','
    > location  '/user/external_tb';
hive> load data  local inpath  '/root/info2' into  table fz_external_tb;
#外部表不存放在hdfs默认的仓库目录/user/hive/warehouse下,但其确实是存在的(数据存储在location指定的目录中),通过show  tables可以查看
hive> show  tables;
OK
data_partiton
fz_external_tb
#下面测试通过删除外部表来验证hdfs上表的数据是否还保留
hive> drop  table  fz_external_tb;

创建分桶表

#以id字段分桶和排序来创建分桶表test_buck,分桶数为4
hive> create  table  test_buck(id  int,name string)
    > clustered  by(id)
    > sorted  by(id  DESC)
    > into  4 buckets
    > row  format delimited fields  terminated by ',';
#开始向创建的分桶表中插入数据(插入的数据需要是已分桶,且排序的)
hive> insert  into  table test_buck
    > select  id,name from  test cluster by(id); 

 

 

 

 

  

 

posted @ 2018-04-25 17:16  goser  阅读(227)  评论(0)    收藏  举报