Getting Started with Hive

Hive is a data warehouse tool built on top of Hadoop. It maps structured data files onto database tables and provides SQL-like querying. This article walks through some basic Hive operations; corrections are welcome.

Common Syntax

# Show tables, databases, partitions, and functions
show tables;
show databases;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;
# Create a database
create database test_db;
# Drop a database
drop database db_name;
# Drop a table
drop table table_name;
# Rename a table
ALTER TABLE table_name RENAME TO new_table_name;
# Truncate a table's data
truncate table table_name;

CREATE TABLE Syntax

CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name 
   [(col_name data_type [COMMENT col_comment], ...)] 
   [COMMENT table_comment] 
   [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] 
   [CLUSTERED BY (col_name, col_name, ...) 
   [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS] 
   [ROW FORMAT row_format] 
   [STORED AS file_format] 
   [LOCATION hdfs_path]
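
As a concrete illustration of the template, here is a hedged sketch that combines several of the optional clauses (the table name demo_logs and the HDFS path are made up for this example):

CREATE EXTERNAL TABLE IF NOT EXISTS demo_logs (
   id  INT    COMMENT 'record id',
   msg STRING COMMENT 'log message')
COMMENT 'example combining the optional clauses'
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/hive/demo_logs';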

Creating a Managed (Internal) Table

create table if not exists my_tb(id int,name string)
row format delimited fields terminated by ',';

Creating an External Table

# An external table must be created with the EXTERNAL keyword and the path to its data
create external table if not exists my_ex_tb(id int,name string)
row format delimited fields terminated by ','
location 'hdfs://192.168.38.3:9000/externdb/my_ex_tb/';

When a table is dropped, a managed (internal) table's metadata and data are deleted together, while dropping an external table removes only the metadata and leaves the data files in place.
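
A quick sketch of the difference, using the two tables created above:

-- Dropping the managed table deletes both its metadata and its data files
drop table my_tb;
-- Dropping the external table deletes only the metadata; the files under
-- hdfs://192.168.38.3:9000/externdb/my_ex_tb/ remain on HDFS
drop table my_ex_tb;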

Loading Data into a Table Directory

# Append to the table with INTO, or replace its contents with OVERWRITE
load data local inpath '/root/1.txt' into table my_ex_tb;
load data local inpath '/root/1.txt' overwrite into table my_ex_tb;

Creating a Partitioned Table

create table if not exists my_par_tb(id int,name string)
partitioned by(country string)
row format delimited fields terminated by ',';

load data local inpath '/root/1.txt' into table my_par_tb partition(country='China');
load data local inpath '/root/1.txt.us' into table my_par_tb partition(country='US');

# Contents of 1.txt

    1,张三
    2,李四
    3,王五
    
# Contents of 1.txt.us

    1,张三
    2,李四
    3,王五

# Output of select * from my_par_tb

    1	张三	China
    2	李四	China
    3	王五	China
    1	张三	US
    2	李四	US
    3	王五	US
    
# Query the data in one partition

select * from my_par_tb where country='China';

    1	张三	China
    2	李四	China
    3	王五	China

Adding and Dropping Partitions

# Add partitions
alter table my_par_tb add partition(country='Eng') partition(country='Ame');
# Drop partitions (note: partition specs are comma-separated when dropping)
alter table my_par_tb drop partition(country='Eng'), partition(country='Ame');

# Show the table's partitions
show partitions my_par_tb;

country=China
country=US

Creating a Bucketed Table

create table if not exists my_buck_tb(id int,name string)
clustered by(id) sorted by(id)
into 4 buckets
row format delimited fields terminated by ',';

# Enable bucketing enforcement
set hive.enforce.bucketing=true;
# Set as many reducers as there are buckets, so the rows selected from the
# source table are split into one output file per bucket.
set mapreduce.job.reduces=4;

# Select from my_tb and insert into the bucketed table
insert into table my_buck_tb
# cluster by(id) partitions and sorts the map output by id
# (cluster by is equivalent to distribute by + sort by on the same column)
select id,name from my_tb cluster by(id);
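
One benefit of bucketing is bucket sampling; a minimal sketch against the my_buck_tb table above:

-- Read only the first of the 4 buckets (roughly a quarter of the rows)
select id,name from my_buck_tb tablesample(bucket 1 out of 4 on id);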

Saving Query Results

By default query results are printed to the screen; they can also be saved into a table or to a directory.

# Save the result into a newly created table
create table tmp_tb as select * from my_tb;

# Save the result into an existing table
insert into table tmp_tb select * from my_tb;

# Save the result to a directory (local or on HDFS)
# Local
insert overwrite local directory '/root/out_tb/'
select * from my_tb;
# HDFS
insert overwrite directory '/out_tb/'
select * from my_tb;
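
The exported files use Hive's default field delimiter (^A). On Hive 0.11 and later a row format clause can be added to the directory export; a hedged sketch:

insert overwrite local directory '/root/out_tb/'
row format delimited fields terminated by ','
select * from my_tb;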

Join Operations

Data in table a:

1,张三
2,李四
3,c
4,a
5,e
6,r

Data in table b:

1,绿间
3,青峰
4,黑子
9,红发

Create the tables:
create table a(id int,name string)
row format delimited fields terminated by ',';

create table b(id int,name string)
row format delimited fields terminated by ',';

Load the data:
load data local inpath '/root/a.txt' into table a;
load data local inpath '/root/b.txt' into table b;

# Inner join (intersection)
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三      | 1     | 绿间      |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
+-------+---------+-------+---------+--+

# Left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三      | 1     | 绿间      |
| 2     | 李四      | NULL  | NULL    |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
| 5     | e       | NULL  | NULL    |
| 6     | r       | NULL  | NULL    |
+-------+---------+-------+---------+--+

# Right join
select * from a right join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三      | 1     | 绿间      |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
| NULL  | NULL    | 9     | 红发      |
+-------+---------+-------+---------+--+

# Full outer join
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 张三      | 1     | 绿间      |
| 2     | 李四      | NULL  | NULL    |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
| 5     | e       | NULL  | NULL    |
| 6     | r       | NULL  | NULL    |
| NULL  | NULL    | 9     | 红发      |
+-------+---------+-------+---------+--+


# Left semi join (returns only the left table's columns from the inner-join matches)
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 1     | 张三      |
| 3     | c       |
| 4     | a       |
+-------+---------+--+
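
A left semi join behaves like an IN/EXISTS subquery on the join key; on Hive 0.13 and later the same rows can be obtained with:

select id,name from a where id in (select id from b);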

SELECT Syntax

SELECT [ALL | DISTINCT] select_expr, select_expr, ... 
FROM table_name [JOIN table_name_2 ON (a.id=b.id)]
[WHERE where_condition] 
[GROUP BY col_list [HAVING condition]] 
[CLUSTER BY col_list 
  | [DISTRIBUTE BY col_list] [SORT BY| ORDER BY col_list] 
] 
[LIMIT number]

# Partition and sort the map output by id (cluster by is equivalent to distribute by + sort by)
select id,name from my_tb cluster by(id);
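
Because cluster by is just shorthand for distribute by plus sort by on the same column, the query above is equivalent to:

select id,name from my_tb distribute by id sort by id;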

User-Defined Functions

Hive built-in functions

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF

The pom file

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.xiaojie.mm</groupId>
    <artifactId>my_hive</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <hadoop.version>2.6.5</hadoop.version>
        <hive.version>1.2.1</hive.version>
    </properties>


    <dependencies>
        <!-- Hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Hive -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-pdk</artifactId>
            <version>0.10.0</version>
        </dependency>
        <dependency>
            <groupId>javax.jdo</groupId>
            <artifactId>jdo2-api</artifactId>
            <version>2.3-eb</version>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.7</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7</version>
            <scope>system</scope>
            <systemPath>/home/miao/apps/install/jdk1.7.0_45/lib/tools.jar</systemPath>
        </dependency>
    </dependencies>
</project>

A custom function that converts uppercase to lowercase

package com.xiaojie.mm;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLower extends UDF{
    // Hive calls this evaluate method once per input value
    public String evaluate(String field) {
        // Guard against NULL input to avoid a NullPointerException
        return field == null ? null : field.toLowerCase();
    }
}

Export the jar and copy it to the machine where Hive runs

scp tolower.jar mini1:/root/apps/

Register the custom function in the Hive client

# Step 1: add the jar
add JAR /root/apps/tolower.jar;
# Step 2: the string in quotes is the fully qualified class name of the UDF
# (a temporary function is only valid in the current session)
create temporary function tolower as 'com.xiaojie.mm.ToLower';
# Step 3: use it
select * from a;                        
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 7     | AAAAA   |
| 1     | 张三      |
| 2     | 李四      |
| 3     | c       |
| 4     | a       |
| 5     | e       |
| 6     | r       |
+-------+---------+--+

select id,tolower(name) from a;
+-----+--------+--+
| id  |  _c1   |
+-----+--------+--+
| 7   | aaaaa  |
| 1   | 张三     |
| 2   | 李四     |
| 3   | c      |
| 4   | a      |
| 5   | e      |
| 6   | r      |
+-----+--------+--+

A custom function that returns a phone number's home region

package com.xiaojie.mm;
import java.util.HashMap;
import org.apache.hadoop.hive.ql.exec.UDF;
public class GetProvince extends UDF{
    public static HashMap<String,String> provinceMap = new HashMap<String,String>();
    static {
        provinceMap.put("183", "hangzhou");
        provinceMap.put("186", "nanjing");
        provinceMap.put("187", "suzhou");
        provinceMap.put("188", "ningbo");
    }
    public String evaluate(int phonenumber) {
        String phone_num = String.valueOf(phonenumber);
        // Take the first three digits of the phone number
        String phone = phone_num.substring(0, 3);
        return provinceMap.get(phone)==null?"未知":provinceMap.get(phone);
    }
}

Original data:
+----------------------+---------------------+--+
| flow_province.phone  | flow_province.flow  |
+----------------------+---------------------+--+
| 1837878              | 12m                 |
| 1868989              | 13m                 |
| 1878989              | 14m                 |
| 1889898              | 15m                 |
| 1897867              | 16m                 |
| 1832323              | 78m                 |
| 1858767              | 88m                 |
| 1862343              | 99m                 |
| 1893454              | 77m                 |
+----------------------+---------------------+--+

After calling the custom function:

select phone,getpro(phone),flow from flow_province;
+----------+-----------+-------+--+
|  phone   |    _c1    | flow  |
+----------+-----------+-------+--+
| 1837878  | hangzhou  | 12m   |
| 1868989  | nanjing   | 13m   |
| 1878989  | suzhou    | 14m   |
| 1889898  | ningbo    | 15m   |
| 1897867  | 未知        | 16m   |
| 1832323  | hangzhou  | 78m   |
| 1858767  | 未知        | 88m   |
| 1862343  | nanjing   | 99m   |
| 1893454  | 未知        | 77m   |
+----------+-----------+-------+--+

A custom function that parses JSON data

# Create the table
create table json_tb(line string);
# Load the data
load data local inpath '/root/test_data/a.json' into table json_tb;
# Show the raw data
select line from json_tb limit 10;
+----------------------------------------------------------------+--+
|                          json_tb.line                          |
+----------------------------------------------------------------+--+
| {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}  |
| {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}   |
| {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}   |
| {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}  |
| {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}  |
| {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}  |
| {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}  |
| {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}  |
| {"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}   |
| {"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"}   |
+----------------------------------------------------------------+--+

# The custom UDF
package com.xiaojie.mm;
import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;

public class JsonParse extends UDF{
    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieBean bean = objectMapper.readValue(jsonLine, MovieBean.class);
            return bean.toString();
        }catch(Exception e){
            // Ignore malformed lines; fall through and return an empty string
        }
        return "";
    }
}

package com.xiaojie.mm;
public class MovieBean {
    // Movie id
    private String movie;
    // Movie rating
    private String rate;
    // Rating timestamp
    private String timeStamp;
    // User id
    private String uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public String getRate() {
        return rate;
    }

    public void setRate(String rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return this.movie + "\t" +this.rate + "\t" + this.timeStamp + "\t" + this.uid;
    }

}

# Build the jar, upload it to the machine where Hive runs, and create the function
add JAR /root/test_data/json_parse.jar;
create temporary function json_parse as 'com.xiaojie.mm.JsonParse';

# Use the custom JSON parsing function
select json_parse(line) from json_tb limit 10;
+---------------------+--+
|         _c0         |
+---------------------+--+
| 1193	5	978300760	1  |
| 661	3	978302109	1   |
| 914	3	978301968	1   |
| 3408	4	978300275	1  |
| 2355	5	978824291	1  |
| 1197	3	978302268	1  |
| 1287	5	978302039	1  |
| 2804	5	978300719	1  |
| 594	4	978302268	1   |
| 919	4	978301368	1   |
+---------------------+--+

# Save the parsed JSON data into a newly created table
create table json_parse_tb as
select split(json_parse(line),'\t')[0] as movieid,
split(json_parse(line),'\t')[1] as rate,
split(json_parse(line),'\t')[2] as time,
split(json_parse(line),'\t')[3] as userid
from json_tb limit 100;

# Built-in JSON function
select get_json_object(line,'$.movie') as moiveid,
get_json_object(line,'$.rate') as rate,
get_json_object(line,'$.timeStamp') as time,
get_json_object(line,'$.uid') as userid
from json_tb limit 10;

TRANSFORM (calling custom scripts)

Hive's TRANSFORM keyword lets a query call a user-written script, which is handy when you need functionality that Hive lacks but do not want to write a UDF.

Custom Python script (vim time_parse.py)

#!/usr/bin/env python
import sys
import datetime

for line in sys.stdin:
  line = line.strip()
  movieid, rate, unixtime,userid = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([movieid, rate, str(weekday),userid])

Add the Python file to Hive's session resources

add file time_parse.py;

Use TRANSFORM to call the custom Python code

create TABLE json_parse_time_tb as
SELECT
# TRANSFORM takes the listed columns of json_parse_tb as the script's input
  TRANSFORM (movieid, rate, time, userid)
  USING 'python time_parse.py'
  AS (movieid, rate, weekday,userid)
FROM json_parse_tb;

View the new table's data

select * from json_parse_time_tb;
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| json_parse_time_tb.movieid  | json_parse_time_tb.rate  | json_parse_time_tb.weekday  | json_parse_time_tb.userid  |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| 1690                        | 3                        | 1                           | 2                          |
| 589                         | 4                        | 1                           | 2                          |
| 3471                        | 5                        | 1                           | 2                          |
| 1834                        | 4                        | 1                           | 2                          |
| 2490                        | 3                        | 1                           | 2                          |
| 2278                        | 3                        | 1                           | 2                          |
| 110                         | 5                        | 1                           | 2                          |
| 3257                        | 3                        | 1                           | 2                          |
| 3256                        | 2                        | 1                           | 2                          |
| 3255                        | 4                        | 1                           | 2                          |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+

Case Study

Original data (username, month, click count)

A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5

Goal: compute each user's click count per month, plus the cumulative click count

Step 1: create the table and load the data

# Create the table
create table click_tb(username string,month string,click int)
row format delimited fields terminated by ',';
# Load the data
load data local inpath '/root/test_data/click.txt' into table click_tb;

Step 2: compute each user's click count per month

select username,month,sum(click) as click_count from click_tb group by username,month;

+-----------+----------+--------------+--+
| username  |  month   | click_count  |
+-----------+----------+--------------+--+
| A         | 2015-01  | 33           |
| A         | 2015-02  | 10           |
| B         | 2015-01  | 30           |
| B         | 2015-02  | 15           |
+-----------+----------+--------------+--+

Step 3: inner join the aggregated result with itself

select * from
(select username,month,sum(click) as click_count from click_tb group by username,month) A
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) B
on
A.username=B.username;

+-------------+----------+----------------+-------------+----------+----------------+--+
| a.username  | a.month  | a.click_count  | b.username  | b.month  | b.click_count  |
+-------------+----------+----------------+-------------+----------+----------------+--+
| A           | 2015-01  | 33             | A           | 2015-01  | 33             |
| A           | 2015-01  | 33             | A           | 2015-02  | 10             |
| A           | 2015-02  | 10             | A           | 2015-01  | 33             |
| A           | 2015-02  | 10             | A           | 2015-02  | 10             |
| B           | 2015-01  | 30             | B           | 2015-01  | 30             |
| B           | 2015-01  | 30             | B           | 2015-02  | 15             |
| B           | 2015-02  | 15             | B           | 2015-01  | 30             |
| B           | 2015-02  | 15             | B           | 2015-02  | 15             |
+-------------+----------+----------------+-------------+----------+----------------+--+

Step 4: produce the final result

select a.username,a.month,min(a.click_count) as click_count,sum(b.click_count) as sum_count from
(select username,month,sum(click) as click_count from click_tb group by username,month) a
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) b
on
a.username=b.username
where b.month<=a.month
group by a.username,a.month
order by a.username,a.month;

+-------------+----------+--------------+------------+--+
| a.username  | a.month  | click_count  | sum_count  |
+-------------+----------+--------------+------------+--+
| A           | 2015-01  | 33           | 33         |
| A           | 2015-02  | 10           | 43         |
| B           | 2015-01  | 30           | 30         |
| B           | 2015-02  | 15           | 45         |
+-------------+----------+--------------+------------+--+
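
For reference, on Hive 0.11 and later the same cumulative total can be computed more directly with a window function instead of the self-join (a sketch, not the approach used above):

select username,month,click_count,
       sum(click_count) over(partition by username order by month) as sum_count
from (select username,month,sum(click) as click_count
      from click_tb group by username,month) t;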