hive

下载好hive后解压到应用程序目录中

root@s100:/tools# tar  xf  apache-hive-2.1.1-bin.tar.gz   -C  /soft/
root@s100:/soft# ln  -s   /soft/apache-hive-2.1.1-bin  /soft/hive

设置hive环境变量

root@s100:/soft# vim  /etc/environment 
JAVA_HOME=/soft/jdk
HADOOP_HOME=/soft/hadoop
HIVE_HOME=/soft/hive
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/soft/jdk/bin:/soft/hadoop/bin:/soft/hadoop/sbin:/soft/hive/bin"
//使hive环境变量生效
root@s100:/soft# source  /etc/environment 
//验证hive安装是否成功
root@s100:/soft# hive  --version
Hive 2.1.1

由于hive需要hadoop的支持,所以在部署hive之前要先部署hadoop,然后配置hive关于hadoop的家目录。还要有hive-site.xml的配置文件

root@s100:/soft/hive/conf# cp  hive-env.sh.template  hive-env.sh
root@s100:/soft/hive/conf# vim  hive-env.sh
HADOOP_HOME=/soft/hadoop
root@s100:/soft/hive/conf# cp  hive-default.xml.template hive-site.xml

然后启动hive后报错,解决问题的方式为:1.初始化数据库,2.修改hive-site.xml配置文件

//初始化数据库,指定数据库为derby
root@s100:~# schematool   -initSchema -dbType derby
//修改hive-site.xml配置文件,将${system:java.io.tmpdir}全部修改成/root/hive;将${system:user.name}全部修改成ubuntu0
//修改完hive-site.xml配置文件后,创建配置文件中定义的目录
root@s100:~# mkdir  -p  hive/ubuntu0
//最后启动hive后就可以进入hive shell环境了,就可以像操作mysql数据库一样操作hive了
root@s100:~# hive
hive>

让hive连接外部的mysql服务器,让hive的metastore存储在mysql数据库中

 1.修改hive的hive-site.xml配置文件,修改hive的数据库连接信息

root@s100:/soft/hive/conf# vim  hive-site.xml
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://192.168.182.1:3306/myhive</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>Username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123</value>
    <description>password to use against metastore database</description>
  </property>

 2.导入mysql数据库的驱动到hive的classpath中

root@s100:/soft/hive# cd  lib/
#切换到hive的lib库目录中,上传mysql-connector-java-5.0.8-bin.jar到lib目录中即可

 3.删除以前derby数据库连接信息

root@s100:~# ls
derby.log  hadoop  hello.txt  hive  metastore_db
root@s100:~# rm  -fr  metastore_db
root@s100:~# hdfs  dfs -rm -r /user/hive
root@s100:~# hdfs  dfs -rm -r /tmp

 4.初始化hive连接mysql数据库

root@s100:~# schematool   -initSchema -dbType   mysql
Metastore connection URL:        jdbc:mysql://192.168.182.1:3306/myhive
Metastore Connection Driver :    com.mysql.jdbc.Driver
Metastore connection User:       root
Starting metastore schema initialization to 2.1.0
Initialization script hive-schema-2.1.0.mysql.sql
Initialization script completed
schemaTool completed

  连接外部的mysql数据库的时候可能会出现如下的错误:

org.apache.hadoop.hive.metastore.HiveMetaException: Failed to get schema version.
Underlying cause: java.sql.SQLException : null,  message from server: "Host '192.168.1.100' is not allowed to connect to this MySQL server"
SQL Error code: 1130

  这个错误表示连接mysql的权限有问题,解决办法为修改mysql的数据库访问权限

mysql> grant  all  on  myhive.* to  'root'@'%'  identified by  '123' with grant option;

 5.测试在hive中创建库、表后在mysql中是否能查询得到

hive> create database  hive1;
hive> create   table  users(id int,name string,age int);
#在mysql中查询hive创建的库和表还有字段
#查询数据库
mysql> select  *  from  dbs;
+-------+-----------------------+-----------------------------------------------+---------+------------+------------+
| DB_ID | DESC                  | DB_LOCATION_URI                               | NAME    | OWNER_NAME | OWNER_TYPE |
+-------+-----------------------+-----------------------------------------------+---------+------------+------------+
|     1 | Default Hive database | hdfs://s100:8020/user/hive/warehouse          | default | public     | ROLE       |
|     2 | NULL                  | hdfs://s100:8020/user/hive/warehouse/hive1.db | hive1   | root       | USER       |
+-------+-----------------------+-----------------------------------------------+---------+------------+------------+
#查询表
mysql> select  *  from  tbls;
+--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+
| TBL_ID | CREATE_TIME | DB_ID | LAST_ACCESS_TIME | OWNER | RETENTION | SD_ID | TBL_NAME | TBL_TYPE      | VIEW_EXPANDED_TEXT | VIEW_ORIGINAL_TEXT |
+--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+
|      1 |  1523896224 |     2 |                0 | root  |         0 |     1 | users    | MANAGED_TABLE | NULL               | NULL               |
+--------+-------------+-------+------------------+-------+-----------+-------+----------+---------------+--------------------+--------------------+
#查询字段
mysql> select  *  from  columns_v2;
+-------+---------+-------------+-----------+-------------+
| CD_ID | COMMENT | COLUMN_NAME | TYPE_NAME | INTEGER_IDX |
+-------+---------+-------------+-----------+-------------+
|     1 | NULL    | age         | int       |           2 |
|     1 | NULL    | id          | int       |           0 |
|     1 | NULL    | name        | string    |           1 |
+-------+---------+-------------+-----------+-------------+

通过hive创建表test,然后向这个表中添加数据

#创建表test结构如下:
hive> create  table  test(id  int,name string)
    > row format delimited
    > fields terminated by  ',' ;
#创建一个测试文件,作为数据插入到test表中
root@s100:~# vim   info
1,zhangsank
2,lisi
3,wangwu
4,wangerma
5,test
#向test表中插入数据info文件
hive> load data  local inpath  '/root/info' into  table  test;
#查看表test中的数据
hive> select *  from  test;
OK
1       zhangsank
2       lisi
3       wangwu
4       wangerma
5       test

我们知道hive存储的数据是放在hdfs分布式文件系统上的,hive的元数据存储在外部关系型数据库mysql中,hive存储在hdfs上的默认根位置为:/user/hive/warehouse/,创建的库、表在hdfs上的表现形式为文件夹,如上面创建的库hive1和表test在hdfs上的表现形式如下:

创建带有分区的分区表

hive> create table  data_partiton(id int,name  string)
    > partitioned  by(country string)
    > row  format  delimited
    > fields  terminated by ',';
#导入数据
hive> load data  local inpath  '/root/info' into  table data_partiton partition(country='china');
hive> load data  local inpath  '/root/info2' into  table data_partiton partition(country='amerecal');
#查看表data_partiton数据
hive> select  *  from  data_partiton;
OK
1       hadeng            amerecal
2       baoluo             amerecal
3       alizha              amerecal
4       gelin                amerecal
1       zhangsank       china
2       lisi                  china
3       wangwu           china
4       wangerma        china
5       test                  china

  在hdfs上存储的分区数据方式为:在country=amerecal目录中存储info2文件,在country=china目录中存储info文件

创建外部表,向外部表中插入的数据存储在指定的目录中,当删除这个外部表后,外部表中的数据不会被删除,还保留在指定的目录中,这点和普通表不同。

hive> create  external   table  fz_external_tb(id int,name string)
    > row  format delimited
    > fields  terminated by ','
    > location  '/user/external_tb';
hive> load data  local inpath  '/root/info2' into  table fz_external_tb;
#外部表不存放在hdfs默认的仓库目录/user/hive/warehouse下,但其确实是存在的(数据存储在location指定的目录中),通过show  tables可以查看
hive> show  tables;
OK
data_partiton
fz_external_tb
#下面测试通过删除外部表来验证hdfs上表的数据是否还保留
hive> drop  table  fz_external_tb;

创建分桶表

#以id字段分桶和排序来创建分桶表test_buck,分桶数为4
hive> create  table  test_buck(id  int,name string)
    > clustered  by(id)
    > sorted  by(id  DESC)
    > into  4 buckets
    > row  format delimited fields  terminated by ',';
#开始向创建的分桶表中插入数据(插入的数据需要是已分桶,且排序的)
hive> insert  into  table test_buck
    > select  id,name from  test cluster by(id); 

 

 

 

 

  

 

posted @ 2018-04-25 17:16  goser  阅读(227)  评论(0)    收藏  举报