压缩文件

一、TextFile  

partitioned by (channel string,paid_date date) 
row format delimited
fields terminated by ','
lines terminated by '\n'
stored as textfile ; 

  

create table if not exists cn_uid_dl_secure.test2_order_line_1094_text
(
 store_id          smallint     ,
 order_id          string       ,
 user_id           string       ,
 delivery_phone    string       ,
 paid_time         timestamp    ,
 department        smallint     ,
 category          smallint     ,
 upc               string       ,
 upc_desc          string       ,
 fineline          int          ,
 wm_discount       decimal(14,2),
 merch_sales       decimal(14,2),
 gmv               decimal(14,2),
 units             decimal(14,2)  
)
partitioned by (channel string,paid_date date) 
stored as textfile ; 
hive> show create table cn_uid_dl_secure.test2_order_line_1094_text ;
+----------------------------------------------------+
|                   createtab_stmt                   |
+----------------------------------------------------+
| CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_text`( |
|   `store_id` smallint,                             |
|   `order_id` string,                               |
|   `user_id` string,                                |
|   `delivery_phone` string,                         |
|   `paid_time` timestamp,                           |
|   `department` smallint,                           |
|   `category` smallint,                             |
|   `upc` string,                                    |
|   `upc_desc` string,                               |
|   `fineline` int,                                  |
|   `wm_discount` decimal(14,2),                     |
|   `merch_sales` decimal(14,2),                     |
|   `gmv` decimal(14,2),                             |
|   `units` decimal(14,2))                           |
| PARTITIONED BY (                                   |
|   `channel` string,                                |
|   `paid_date` date)                                |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'  |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.mapred.TextInputFormat'       |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION                                           |
|   'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_text' |
| TBLPROPERTIES (                                    |
|   'bucketing_version'='2',                         |
|   'transactional'='true',                          |
|   'transactional_properties'='insert_only',        |
|   'transient_lastDdlTime'='1586758992')            |
+----------------------------------------------------+

 

二、Parquet  

create table if not exists cn_uid_dl_secure.test2_order_line_1094_parquet
(
 store_id          smallint     ,
 order_id          string       ,
 user_id           string       ,
 delivery_phone    string       ,
 paid_time         timestamp    ,
 department        smallint     ,
 category          smallint     ,
 upc               string       ,
 upc_desc          string       ,
 fineline          int          ,
 wm_discount       decimal(14,2),
 merch_sales       decimal(14,2),
 gmv               decimal(14,2),
 units             decimal(14,2)  
)
partitioned by (channel string,paid_date date) 
stored as parquet ; 
hive> show create table cn_uid_dl_secure.test2_order_line_1094_parquet ;
+----------------------------------------------------+
|                   createtab_stmt                   |
+----------------------------------------------------+
| CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_parquet`( |
|   `store_id` smallint,                             |
|   `order_id` string,                               |
|   `user_id` string,                                |
|   `delivery_phone` string,                         |
|   `paid_time` timestamp,                           |
|   `department` smallint,                           |
|   `category` smallint,                             |
|   `upc` string,                                    |
|   `upc_desc` string,                               |
|   `fineline` int,                                  |
|   `wm_discount` decimal(14,2),                     |
|   `merch_sales` decimal(14,2),                     |
|   `gmv` decimal(14,2),                             |
|   `units` decimal(14,2))                           |
| PARTITIONED BY (                                   |
|   `channel` string,                                |
|   `paid_date` date)                                |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'  |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' |
| LOCATION                                           |
|   'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_parquet' |
| TBLPROPERTIES (                                    |
|   'bucketing_version'='2',                         |
|   'transactional'='true',                          |
|   'transactional_properties'='insert_only',        |
|   'transient_lastDdlTime'='1586761990')            |
+----------------------------------------------------+

 

三、Parquet + GZIP

create table if not exists cn_uid_dl_secure.test2_order_line_1094_parquet_gzip
(
 store_id          smallint     ,
 order_id          string       ,
 user_id           string       ,
 delivery_phone    string       ,
 paid_time         timestamp    ,
 department        smallint     ,
 category          smallint     ,
 upc               string       ,
 upc_desc          string       ,
 fineline          int          ,
 wm_discount       decimal(14,2),
 merch_sales       decimal(14,2),
 gmv               decimal(14,2),
 units             decimal(14,2)  
)
partitioned by (channel string,paid_date date) 
stored as parquet tblproperties('parquet.compression'='GZIP');  --指定压缩算法
hive> show create table cn_uid_dl_secure.test2_order_line_1094_parquet_gzip ;
+----------------------------------------------------+
|                   createtab_stmt                   |
+----------------------------------------------------+
| CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_parquet_gzip`( |
|   `store_id` smallint,                             |
|   `order_id` string,                               |
|   `user_id` string,                                |
|   `delivery_phone` string,                         |
|   `paid_time` timestamp,                           |
|   `department` smallint,                           |
|   `category` smallint,                             |
|   `upc` string,                                    |
|   `upc_desc` string,                               |
|   `fineline` int,                                  |
|   `wm_discount` decimal(14,2),                     |
|   `merch_sales` decimal(14,2),                     |
|   `gmv` decimal(14,2),                             |
|   `units` decimal(14,2))                           |
| PARTITIONED BY (                                   |
|   `channel` string,                                |
|   `paid_date` date)                                |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'  |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' |
| LOCATION                                           |
|   'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_parquet_gzip' |
| TBLPROPERTIES (                                    |
|   'bucketing_version'='2',                         |
|   'parquet.compression'='GZIP',                    |
|   'transactional'='true',                          |
|   'transactional_properties'='insert_only',        |
|   'transient_lastDdlTime'='1586762774')            |
+----------------------------------------------------+
--可以通过修改表属性的方式添加压缩(只会影响后续入库的数据,原来的数据不会被压缩,需要重跑原来的数据)
alter table  cn_uid_dl_secure.test2_order_line_1094_parquet_gzip  SET tblproperties ('parquet.compression'='GZIP');

 

四、Parquet + Snappy 

create table if not exists cn_uid_dl_secure.test2_order_line_1094_parquet_snappy
(
 store_id          smallint     ,
 order_id          string       ,
 user_id           string       ,
 delivery_phone    string       ,
 paid_time         timestamp    ,
 department        smallint     ,
 category          smallint     ,
 upc               string       ,
 upc_desc          string       ,
 fineline          int          ,
 wm_discount       decimal(14,2),
 merch_sales       decimal(14,2),
 gmv               decimal(14,2),
 units             decimal(14,2)  
)
partitioned by (channel string,paid_date date) 
stored as parquet tblproperties('parquet.compression'='SNAPPY'); --指定压缩算法
hive> show create table cn_uid_dl_secure.test2_order_line_1094_parquet_snappy ;
+----------------------------------------------------+
|                   createtab_stmt                   |
+----------------------------------------------------+
| CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_parquet_snappy`( |
|   `store_id` smallint,                             |
|   `order_id` string,                               |
|   `user_id` string,                                |
|   `delivery_phone` string,                         |
|   `paid_time` timestamp,                           |
|   `department` smallint,                           |
|   `category` smallint,                             |
|   `upc` string,                                    |
|   `upc_desc` string,                               |
|   `fineline` int,                                  |
|   `wm_discount` decimal(14,2),                     |
|   `merch_sales` decimal(14,2),                     |
|   `gmv` decimal(14,2),                             |
|   `units` decimal(14,2))                           |
| PARTITIONED BY (                                   |
|   `channel` string,                                |
|   `paid_date` date)                                |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'  |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'  |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' |
| LOCATION                                           |
|   'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_parquet_snappy' |
| TBLPROPERTIES (                                    |
|   'bucketing_version'='2',                         |
|   'parquet.compression'='SNAPPY',                  |
|   'transactional'='true',                          |
|   'transactional_properties'='insert_only',        |
|   'transient_lastDdlTime'='1586763722')            |
+----------------------------------------------------+

 

五、ORC

create table if not exists cn_uid_dl_secure.test2_order_line_1094_orc
(
 store_id          smallint     ,
 order_id          string       ,
 user_id           string       ,
 delivery_phone    string       ,
 paid_time         timestamp    ,
 department        smallint     ,
 category          smallint     ,
 upc               string       ,
 upc_desc          string       ,
 fineline          int          ,
 wm_discount       decimal(14,2),
 merch_sales       decimal(14,2),
 gmv               decimal(14,2),
 units             decimal(14,2)  
)
partitioned by (channel string,paid_date date) 
stored as orc tblproperties('orc.compression'='ZLIB');
hive> show create table cn_uid_dl_secure.test2_order_line_1094_orc ;
+----------------------------------------------------+
|                   createtab_stmt                   |
+----------------------------------------------------+
| CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_orc`( |
|   `store_id` smallint,                             |
|   `order_id` string,                               |
|   `user_id` string,                                |
|   `delivery_phone` string,                         |
|   `paid_time` timestamp,                           |
|   `department` smallint,                           |
|   `category` smallint,                             |
|   `upc` string,                                    |
|   `upc_desc` string,                               |
|   `fineline` int,                                  |
|   `wm_discount` decimal(14,2),                     |
|   `merch_sales` decimal(14,2),                     |
|   `gmv` decimal(14,2),                             |
|   `units` decimal(14,2))                           |
| PARTITIONED BY (                                   |
|   `channel` string,                                |
|   `paid_date` date)                                |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.ql.io.orc.OrcSerde'      |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'  |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' |
| LOCATION                                           |
|   'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_orc' |
| TBLPROPERTIES (                                    |
|   'bucketing_version'='2',                         |
|   'orc.compression'='ZLIB',                        |
|   'transactional'='true',                          |
|   'transactional_properties'='insert_only',        |
|   'transient_lastDdlTime'='1586764700')            |
+----------------------------------------------------+

 

六、ORC + Snappy

create table if not exists cn_uid_dl_secure.test2_order_line_1094_orc_snappy
(
 store_id          smallint     ,
 order_id          string       ,
 user_id           string       ,
 delivery_phone    string       ,
 paid_time         timestamp    ,
 department        smallint     ,
 category          smallint     ,
 upc               string       ,
 upc_desc          string       ,
 fineline          int          ,
 wm_discount       decimal(14,2),
 merch_sales       decimal(14,2),
 gmv               decimal(14,2),
 units             decimal(14,2)  
)
partitioned by (channel string,paid_date date) 
stored as orc tblproperties('orc.compression'='SNAPPY');
hive> show create table cn_uid_dl_secure.test2_order_line_1094_orc_snappy ;
+----------------------------------------------------+
|                   createtab_stmt                   |
+----------------------------------------------------+
| CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_orc_snappy`( |
|   `store_id` smallint,                             |
|   `order_id` string,                               |
|   `user_id` string,                                |
|   `delivery_phone` string,                         |
|   `paid_time` timestamp,                           |
|   `department` smallint,                           |
|   `category` smallint,                             |
|   `upc` string,                                    |
|   `upc_desc` string,                               |
|   `fineline` int,                                  |
|   `wm_discount` decimal(14,2),                     |
|   `merch_sales` decimal(14,2),                     |
|   `gmv` decimal(14,2),                             |
|   `units` decimal(14,2))                           |
| PARTITIONED BY (                                   |
|   `channel` string,                                |
|   `paid_date` date)                                |
| ROW FORMAT SERDE                                   |
|   'org.apache.hadoop.hive.ql.io.orc.OrcSerde'      |
| STORED AS INPUTFORMAT                              |
|   'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'  |
| OUTPUTFORMAT                                       |
|   'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' |
| LOCATION                                           |
|   'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_orc_snappy' |
| TBLPROPERTIES (                                    |
|   'bucketing_version'='2',                         |
|   'orc.compression'='SNAPPY',                      |
|   'transactional'='true',                          |
|   'transactional_properties'='insert_only',        |
|   'transient_lastDdlTime'='1586765252')            |
+----------------------------------------------------+

 

 

SET hive.exec.dynamic.partition =true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=100000;
SET hive.exec.max.dynamic.partitions.pernode=100000;
insert overwrite table cn_uid_dl_secure.test2_order_line_1094_orc_snappy
partition(channel,paid_date)
select 
 store_id,order_id,user_id,delivery_phone,paid_time,
 department,category,upc,upc_desc,fineline,
 wm_discount,merch_sales,gmv,units,
 channel,paid_date
from cn_ec_bi_dl_secure.wm_order_line
where paid_date = '2019-12-01'
--where paid_date >= '2019-12-01' and paid_date <= '2019-12-07'
--where paid_date >= '2019-12-01' and paid_date < '2020-01-01'
and channel='SNG'
and store_id=1094 ;

 

 

posted @ 2020-04-15 01:42  茗::流  阅读(123)  评论(0)    收藏  举报
如有雷同,纯属参考。如有侵犯你的版权,请联系我。