压缩文件
一、TextFile
partitioned by (channel string,paid_date date) row format delimited fields terminated by ',' lines terminated by '\n' stored as textfile ;
create table if not exists cn_uid_dl_secure.test2_order_line_1094_text ( store_id smallint , order_id string , user_id string , delivery_phone string , paid_time timestamp , department smallint , category smallint , upc string , upc_desc string , fineline int , wm_discount decimal(14,2), merch_sales decimal(14,2), gmv decimal(14,2), units decimal(14,2) ) partitioned by (channel string,paid_date date) stored as textfile ;
hive> show create table cn_uid_dl_secure.test2_order_line_1094_text ; +----------------------------------------------------+ | createtab_stmt | +----------------------------------------------------+ | CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_text`( | | `store_id` smallint, | | `order_id` string, | | `user_id` string, | | `delivery_phone` string, | | `paid_time` timestamp, | | `department` smallint, | | `category` smallint, | | `upc` string, | | `upc_desc` string, | | `fineline` int, | | `wm_discount` decimal(14,2), | | `merch_sales` decimal(14,2), | | `gmv` decimal(14,2), | | `units` decimal(14,2)) | | PARTITIONED BY ( | | `channel` string, | | `paid_date` date) | | ROW FORMAT SERDE | | 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' | | STORED AS INPUTFORMAT | | 'org.apache.hadoop.mapred.TextInputFormat' | | OUTPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' | | LOCATION | | 'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_text' | | TBLPROPERTIES ( | | 'bucketing_version'='2', | | 'transactional'='true', | | 'transactional_properties'='insert_only', | | 'transient_lastDdlTime'='1586758992') | +----------------------------------------------------+
二、Parquet
create table if not exists cn_uid_dl_secure.test2_order_line_1094_parquet ( store_id smallint , order_id string , user_id string , delivery_phone string , paid_time timestamp , department smallint , category smallint , upc string , upc_desc string , fineline int , wm_discount decimal(14,2), merch_sales decimal(14,2), gmv decimal(14,2), units decimal(14,2) ) partitioned by (channel string,paid_date date) stored as parquet ;
hive> show create table cn_uid_dl_secure.test2_order_line_1094_parquet ; +----------------------------------------------------+ | createtab_stmt | +----------------------------------------------------+ | CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_parquet`( | | `store_id` smallint, | | `order_id` string, | | `user_id` string, | | `delivery_phone` string, | | `paid_time` timestamp, | | `department` smallint, | | `category` smallint, | | `upc` string, | | `upc_desc` string, | | `fineline` int, | | `wm_discount` decimal(14,2), | | `merch_sales` decimal(14,2), | | `gmv` decimal(14,2), | | `units` decimal(14,2)) | | PARTITIONED BY ( | | `channel` string, | | `paid_date` date) | | ROW FORMAT SERDE | | 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' | | STORED AS INPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' | | OUTPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' | | LOCATION | | 'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_parquet' | | TBLPROPERTIES ( | | 'bucketing_version'='2', | | 'transactional'='true', | | 'transactional_properties'='insert_only', | | 'transient_lastDdlTime'='1586761990') | +----------------------------------------------------+
三、Parquet + GZIP
create table if not exists cn_uid_dl_secure.test2_order_line_1094_parquet_gzip ( store_id smallint , order_id string , user_id string , delivery_phone string , paid_time timestamp , department smallint , category smallint , upc string , upc_desc string , fineline int , wm_discount decimal(14,2), merch_sales decimal(14,2), gmv decimal(14,2), units decimal(14,2) ) partitioned by (channel string,paid_date date) stored as parquet tblproperties('parquet.compression'='GZIP'); --指定压缩算法
hive> show create table cn_uid_dl_secure.test2_order_line_1094_parquet_gzip ; +----------------------------------------------------+ | createtab_stmt | +----------------------------------------------------+ | CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_parquet_gzip`( | | `store_id` smallint, | | `order_id` string, | | `user_id` string, | | `delivery_phone` string, | | `paid_time` timestamp, | | `department` smallint, | | `category` smallint, | | `upc` string, | | `upc_desc` string, | | `fineline` int, | | `wm_discount` decimal(14,2), | | `merch_sales` decimal(14,2), | | `gmv` decimal(14,2), | | `units` decimal(14,2)) | | PARTITIONED BY ( | | `channel` string, | | `paid_date` date) | | ROW FORMAT SERDE | | 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' | | STORED AS INPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' | | OUTPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' | | LOCATION | | 'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_parquet_gzip' | | TBLPROPERTIES ( | | 'bucketing_version'='2', | | 'parquet.compression'='GZIP', | | 'transactional'='true', | | 'transactional_properties'='insert_only', | | 'transient_lastDdlTime'='1586762774') | +----------------------------------------------------+
--可以通过修改表属性的方式添加压缩(只会影响后续入库的数据,原来的数据不会被压缩,需要重跑原来的数据) alter table cn_uid_dl_secure.test2_order_line_1094_parquet_gzip SET tblproperties ('parquet.compression'='GZIP');
四、Parquet + Snappy
create table if not exists cn_uid_dl_secure.test2_order_line_1094_parquet_snappy ( store_id smallint , order_id string , user_id string , delivery_phone string , paid_time timestamp , department smallint , category smallint , upc string , upc_desc string , fineline int , wm_discount decimal(14,2), merch_sales decimal(14,2), gmv decimal(14,2), units decimal(14,2) ) partitioned by (channel string,paid_date date) stored as parquet tblproperties('parquet.compression'='SNAPPY'); --指定压缩算法
hive> show create table cn_uid_dl_secure.test2_order_line_1094_parquet_snappy ; +----------------------------------------------------+ | createtab_stmt | +----------------------------------------------------+ | CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_parquet_snappy`( | | `store_id` smallint, | | `order_id` string, | | `user_id` string, | | `delivery_phone` string, | | `paid_time` timestamp, | | `department` smallint, | | `category` smallint, | | `upc` string, | | `upc_desc` string, | | `fineline` int, | | `wm_discount` decimal(14,2), | | `merch_sales` decimal(14,2), | | `gmv` decimal(14,2), | | `units` decimal(14,2)) | | PARTITIONED BY ( | | `channel` string, | | `paid_date` date) | | ROW FORMAT SERDE | | 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' | | STORED AS INPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' | | OUTPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' | | LOCATION | | 'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_parquet_snappy' | | TBLPROPERTIES ( | | 'bucketing_version'='2', | | 'parquet.compression'='SNAPPY', | | 'transactional'='true', | | 'transactional_properties'='insert_only', | | 'transient_lastDdlTime'='1586763722') | +----------------------------------------------------+
五、ORC
create table if not exists cn_uid_dl_secure.test2_order_line_1094_orc ( store_id smallint , order_id string , user_id string , delivery_phone string , paid_time timestamp , department smallint , category smallint , upc string , upc_desc string , fineline int , wm_discount decimal(14,2), merch_sales decimal(14,2), gmv decimal(14,2), units decimal(14,2) ) partitioned by (channel string,paid_date date) stored as orc tblproperties('orc.compression'='ZLIB');
hive> show create table cn_uid_dl_secure.test2_order_line_1094_orc ; +----------------------------------------------------+ | createtab_stmt | +----------------------------------------------------+ | CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_orc`( | | `store_id` smallint, | | `order_id` string, | | `user_id` string, | | `delivery_phone` string, | | `paid_time` timestamp, | | `department` smallint, | | `category` smallint, | | `upc` string, | | `upc_desc` string, | | `fineline` int, | | `wm_discount` decimal(14,2), | | `merch_sales` decimal(14,2), | | `gmv` decimal(14,2), | | `units` decimal(14,2)) | | PARTITIONED BY ( | | `channel` string, | | `paid_date` date) | | ROW FORMAT SERDE | | 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' | | STORED AS INPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' | | OUTPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' | | LOCATION | | 'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_orc' | | TBLPROPERTIES ( | | 'bucketing_version'='2', | | 'orc.compression'='ZLIB', | | 'transactional'='true', | | 'transactional_properties'='insert_only', | | 'transient_lastDdlTime'='1586764700') | +----------------------------------------------------+
六、ORC + Snappy
create table if not exists cn_uid_dl_secure.test2_order_line_1094_orc_snappy ( store_id smallint , order_id string , user_id string , delivery_phone string , paid_time timestamp , department smallint , category smallint , upc string , upc_desc string , fineline int , wm_discount decimal(14,2), merch_sales decimal(14,2), gmv decimal(14,2), units decimal(14,2) ) partitioned by (channel string,paid_date date) stored as orc tblproperties('orc.compression'='SNAPPY');
hive> show create table cn_uid_dl_secure.test2_order_line_1094_orc_snappy ; +----------------------------------------------------+ | createtab_stmt | +----------------------------------------------------+ | CREATE TABLE `cn_uid_dl_secure.test2_order_line_1094_orc_snappy`( | | `store_id` smallint, | | `order_id` string, | | `user_id` string, | | `delivery_phone` string, | | `paid_time` timestamp, | | `department` smallint, | | `category` smallint, | | `upc` string, | | `upc_desc` string, | | `fineline` int, | | `wm_discount` decimal(14,2), | | `merch_sales` decimal(14,2), | | `gmv` decimal(14,2), | | `units` decimal(14,2)) | | PARTITIONED BY ( | | `channel` string, | | `paid_date` date) | | ROW FORMAT SERDE | | 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' | | STORED AS INPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' | | OUTPUTFORMAT | | 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' | | LOCATION | | 'hdfs://cnprod1ha/user/hive/warehouse/cn_uid_dl_secure.db/test2_order_line_1094_orc_snappy' | | TBLPROPERTIES ( | | 'bucketing_version'='2', | | 'orc.compression'='SNAPPY', | | 'transactional'='true', | | 'transactional_properties'='insert_only', | | 'transient_lastDdlTime'='1586765252') | +----------------------------------------------------+
SET hive.exec.dynamic.partition =true; SET hive.exec.dynamic.partition.mode=nonstrict; SET hive.exec.max.dynamic.partitions=100000; SET hive.exec.max.dynamic.partitions.pernode=100000;
insert overwrite table cn_uid_dl_secure.test2_order_line_1094_orc_snappy partition(channel,paid_date) select store_id,order_id,user_id,delivery_phone,paid_time, department,category,upc,upc_desc,fineline, wm_discount,merch_sales,gmv,units, channel,paid_date from cn_ec_bi_dl_secure.wm_order_line where paid_date = '2019-12-01' --where paid_date >= '2019-12-01' and paid_date <= '2019-12-07' --where paid_date >= '2019-12-01' and paid_date < '2020-01-01' and channel='SNG' and store_id=1094 ;


浙公网安备 33010602011771号