1、hive sql调优

一、非数据倾斜优化

参数设置:

        set hive.execution.engine=tez;

        set hive.auto.convert.join=true;

sql语句:

insert overwrite directory '/user/bi/tmp_suyan/' row format delimited FIELDS TERMINATED BY '\u0001'

SELECT t1.telnumber,t2.web_name,t2.web_classify1_name,t2.web_classify2_name,t1.web_id,t1.web_classify1,t1.web_classify2,

       t1.net_type,count(t1.user_id) usercnt,sum(t1.delaytime) delaytime,sum(t1.upbytes + t1.downbytes) DATA

FROM ( SELECT telnumber,web_id,web_classify1,web_classify2,net_type,user_id,delaytime,upbytes,downbytes

FROM netlog.ft_mid_xdr_domain_daily WHERE sum_date = '20190508' AND user_id IS NOT NULL ) t1

LEFT JOIN netlog.cfg_web_class_name t2 ON t1.web_id = t2.web_id

GROUP BY t1.telnumber,t2.web_name,t2.web_classify1_name,t2.web_classify2_name,t1.web_id,t1.web_classify1,t1.web_classify2,

         t1.net_type;

 

二、 多层Join数据倾斜优化

参数设置:

        set hive.execution.engine=tez;

        set hive.auto.convert.join=true;

        set hive.optimize.skewjoin= true;

sql语句:

 

 

 

三、 group by数据倾斜优化

参数设置:

 set hive.execution.engine=tez;

 set hive.auto.convert.join=true;

 set hive.groupby.skewindata=true;

 set hive.map.aggr=true;

sql语句:

insert into table dwtmp.tmp_mkt_lost_detail_app_list_02     

select t1.apply_class1 ,t.apply_id,t.telnumber          

from netlog.ft_mid_xdr_client_daily t

inner join dwpub.MKT_CFG_app_CLASS

t1 on t.apply_classify2=t1.apply_classify2     

where t.sum_date between '20190430' and '20190506'    

group by t1.apply_class1,

t.apply_id,

t.telnumber;

注意:1)group by和 多层join 引发的数据倾斜,都有调优的参数,但是两者不可同时使用,否则会导致数据条数异常。

 

四、 多层union all数据倾斜优化

参数设置:

set mapred.min.split.size=2147483648;

set mapred.max.split.size=2147483648;  

set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;

set hive.optimize.skewjoin= true;

set hive.auto.convert.join=true;

sql语句:

insert into table dwtmp.suyan_tmp_ta_productinfo_mms_group_day_15

select  'HNSJ'city_code ,'9999'area_code ,'9999'town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value

from dwtmp.tmp_ta_productinfo_mms_day_15

union all

select   city_code ,'9999'area_code ,'9999'town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value

from dwtmp.tmp_ta_productinfo_mms_day_15 group by city_code

union all

select  city_code ,area_code ,'9999'town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value

from dwtmp.tmp_ta_productinfo_mms_day_15 group by city_code,area_code

union all

select  city_code ,area_code ,town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value

from dwtmp.tmp_ta_productinfo_mms_day_15

group by city_code,area_code,town_code; ;

五、 自定义udf数据倾斜优化

sql语句

insert into dwtmp.suyan_tmp_ft_mid_user_work_model_day_02  select t3.msisdn,t3.lon,t3.lat,t3.sta_dur_total,t3.live_user_flag, t3.neighbourhood_code,t3.center_lon_wgs84,t3.center_lat_wgs84, t3.border_poi