Hive练习

一、基础DDL练习

SHOW DATABASES;

CREATE DATABASE IF NOT EXISTS db1 COMMENT 'Our database db1';

SHOW DATABASES;

DESCRIBE DATABASE db1;

CREATE TABLE db1.table1 (word STRING, count INT);

SHOW TABLES in db1;

DESCRIBE db1.table1;

USE db1;

SHOW TABLES;

SELECT * FROM db1.table1;

DROP TABLE table1;

DROP DATABASE db1;

USE default;

二、基础DML语句

创建表
create table if not exists user_dimension (
 uid STRING,
 name STRING,
 gender STRING,
 birth DATE,
 province STRING
)ROW FORMAT DELIMITED //按行切分的意思
 FIELDS TERMINATED BY ','  //按逗号分隔的
查看表信息
describe user_dimension;

show create table user_dimension;

查看所有表
show tables;

载入本地数据
load data local inpath '/home/orco/tempdata/user.data' overwrite into table user_dimension;

载入HDFS上的数据
load data inpath '/user/orco/practice_1/user.data' overwrite into table user_dimension;

验证
select * from user_dimension;

查看hive在hdfs上的存储目录
hadoop fs -ls /warehouse/
hadoop fs -ls /warehouse/user_dimension

三、复杂数据类型

 

示例2:
CREATE TABLE IF NOT EXISTS employees (
   name         STRING,
   salary       FLOAT,
   subordinates ARRAY<STRING>,
   deductions   MAP<STRING, FLOAT>,
   address      STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
 )
 ROW FORMAT DELIMITED
 FIELDS TERMINATED BY '\001' 
 COLLECTION ITEMS TERMINATED BY '\002' 
 MAP KEYS TERMINATED BY '\003'
 LINES TERMINATED BY '\n'
 STORED AS TEXTFILE; //最后这一行,是默认,可以不写

载入数据
load data local inpath ' /home/orco/tempdata/data/employees.txt' overwrite into table employees ;

查询数据
SELECT name, deductions['Federal Taxes'] FROM employees WHERE deductions['Federal Taxes'] > 0.2;

SELECT name, deductions['Federal Taxes'] FROM employees WHERE deductions['Federal Taxes'] > cast( 0.2 as float);

SELECT name FROM employees WHERE subordinates[1] = 'Todd Jones';

SELECT name, address FROM employees WHERE address.street RLIKE '^.*(Ontario|Chicago).*$';

四、数据模型-分区

为减少不必要的暴力数据扫描,可以对表进行分区,为避免产生过多小文件,建议只对离散字段进行分区

 

建表
CREATE TABLE IF NOT EXISTS stocks (
     ymd             DATE,
     price_open      FLOAT, 
     price_high      FLOAT,
     price_low       FLOAT,
     price_close     FLOAT,
     volume            INT,
     price_adj_close FLOAT
     )
     PARTITIONED BY (exchanger STRING, symbol STRING)
     ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

载入数据
load data local inpath '/home/orco/resources/apache-hive-2.1.1-bin/hivedata/stocks/NASDAQ/AAPL/stocks.csv' overwrite into table stocks partition(exchanger="NASDAQ", symbol="AAPL");

show partitions stocks;

load data local inpath '/home/orco/resources/apache-hive-2.1.1-bin/hivedata/stocks/NASDAQ/INTC/stocks.csv' overwrite into table stocks partition(exchanger="NASDAQ", symbol="INTC");

load data local inpath '/home/orco/resources/apache-hive-2.1.1-bin/hivedata/stocks/NYSE/GE/stocks.csv' overwrite into table stocks partition(exchanger="NYSE", symbol="GE");

show partitions stocks;

查询
SELECT * FROM stocks WHERE exchanger = 'NASDAQ' AND symbol = 'AAPL' LIMIT 10;

SELECT ymd, price_close FROM stocks WHERE exchanger = 'NASDAQ' AND symbol = 'AAPL' LIMIT 10;

查看HDFS文件目录
hadoop fs -ls /warehouse/stocks/

hadoop fs -ls /warehouse/stocks/exchanger=NASDAQ

hadoop fs -ls /warehouse/stocks/exchanger=NASDAQ/symbol=AAPL

六、外部表

external关键字,删除表时,外部表只删除元数据,不删除数据,更加安全

数据
hadoop fs -put stocks /user/orco/

创建外部表
CREATE EXTERNAL TABLE IF NOT EXISTS stocks_external (
  ymd             DATE,
  price_open      FLOAT, 
  price_high      FLOAT,
  price_low       FLOAT,
  price_close     FLOAT,
  volume            INT,
  price_adj_close FLOAT
  )
  PARTITIONED BY (exchanger STRING, symbol STRING)
  ROW FORMAT DELIMITED 
  FIELDS TERMINATED BY ','
  LOCATION '/user/orco/stocks';

select * from stocks_external;

载入数据
alter table stocks_external add partition(exchanger="NASDAQ", symbol="AAPL") location '/user/orco/stocks/NASDAQ/AAPL/'

show partitions stocks_external;

select * from stocks_external limit 10;

alter table stocks_external add partition(exchanger="NASDAQ", symbol="INTC") location '/user/orco/stocks/NASDAQ/INTC/';

alter table stocks_external add partition(exchanger="NYSE", symbol="IBM") location '/user/orco/stocks/NYSE/IBM/';

alter table stocks_external add partition(exchanger="NYSE", symbol="GE") location '/user/orco/stocks/NYSE/GE/';

show partitions stocks_external;

查询
SELECT * FROM stocks_external WHERE exchanger = 'NASDAQ' AND symbol = 'AAPL' LIMIT 10;

SELECT ymd, price_close FROM stocks_external WHERE exchanger = 'NASDAQ' AND symbol = 'AAPL' LIMIT 10;

select exchanger, symbol,count(*) from stocks_external group by exchanger, symbol;

select exchanger, symbol, max(price_high) from stocks_external group by exchanger, symbol;

删除表
删除内部表stocks
drop table stocks;

查看HDFS上文件目录
hadoop fs -ls /warehouse/

删除外部表stocks_external
drop table stocks_external;

查看HDFS上文件目录
hadoop fs -ls /user/orco

hadoop fs -ls /user/stocks

七、列式存储

在Create/Alter表的时候,可以为表以及分区的文件指定不同的格式
• Storage Formats
• Row Formats
• SerDe

STORED AS file_format
– STORED AS PARQUET
– STORED AS ORC
– STORED AS SEQUENCEFILE
– STORED AS AVRO
– STORED AS TEXTFILE

列式存储格式ORC与Parquet:存储空间

列式存储格式ORC与Parquet:性能

如何创建ORC表

create table if not exists record_orc (
  rid STRING,
  uid STRING,
  bid STRING,
  price INT,
  source_province STRING,
  target_province STRING,
  site STRING,
  express_number STRING,
  express_company STRING,
  trancation_date DATE
 )
 stored as orc;

show create table record_orc;

载入数据
select * from record_orc limit 10;

insert into table record_orc select * from record;

select * from record_orc limit 10;

八、Lateral View,行转多列

CREATE TABLE IF NOT EXISTS employees (
   name         STRING,
   salary       FLOAT,
   subordinates ARRAY<STRING>,
   deductions   MAP<STRING, FLOAT>,
   address      STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
 )
 ROW FORMAT DELIMITED
 FIELDS TERMINATED BY '\001' 
 COLLECTION ITEMS TERMINATED BY '\002' 
 MAP KEYS TERMINATED BY '\003'
 LINES TERMINATED BY '\n'
 STORED AS TEXTFILE;

查询
select name,subordinate from employees LATERAL VIEW explode(subordinates) subordinates_table AS subordinate;

九、explain

 

posted @ 2017-06-15 15:07  Super_Orco  阅读(1373)  评论(0编辑  收藏  举报