1、hive sql调优
一、非数据倾斜优化
参数设置:
set hive.execution.engine=tez;
set hive.auto.convert.join=true;
sql语句:
|
insert overwrite directory '/user/bi/tmp_suyan/' row format delimited FIELDS TERMINATED BY '\u0001' SELECT t1.telnumber,t2.web_name,t2.web_classify1_name,t2.web_classify2_name,t1.web_id,t1.web_classify1,t1.web_classify2, t1.net_type,count(t1.user_id) usercnt,sum(t1.delaytime) delaytime,sum(t1.upbytes + t1.downbytes) DATA FROM ( SELECT telnumber,web_id,web_classify1,web_classify2,net_type,user_id,delaytime,upbytes,downbytes FROM netlog.ft_mid_xdr_domain_daily WHERE sum_date = '20190508' AND user_id IS NOT NULL ) t1 LEFT JOIN netlog.cfg_web_class_name t2 ON t1.web_id = t2.web_id GROUP BY t1.telnumber,t2.web_name,t2.web_classify1_name,t2.web_classify2_name,t1.web_id,t1.web_classify1,t1.web_classify2, t1.net_type; |
二、 多层Join数据倾斜优化
参数设置:
set hive.execution.engine=tez;
set hive.auto.convert.join=true;
set hive.optimize.skewjoin= true;
sql语句:
三、 group by数据倾斜优化
参数设置:
set hive.execution.engine=tez;
set hive.auto.convert.join=true;
set hive.groupby.skewindata=true;
set hive.map.aggr=true;
sql语句:
|
insert into table dwtmp.tmp_mkt_lost_detail_app_list_02 select t1.apply_class1 ,t.apply_id,t.telnumber from netlog.ft_mid_xdr_client_daily t inner join dwpub.MKT_CFG_app_CLASS t1 on t.apply_classify2=t1.apply_classify2 where t.sum_date between '20190430' and '20190506' group by t1.apply_class1, t.apply_id, t.telnumber; |
注意:1)group by和 多层join 引发的数据倾斜,都有调优的参数,但是两者不可同时使用,否则会导致数据条数异常。
四、 多层union all数据倾斜优化
参数设置:
set mapred.min.split.size=2147483648;
set mapred.max.split.size=2147483648;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.optimize.skewjoin= true;
set hive.auto.convert.join=true;
sql语句:
|
insert into table dwtmp.suyan_tmp_ta_productinfo_mms_group_day_15 select 'HNSJ'city_code ,'9999'area_code ,'9999'town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value from dwtmp.tmp_ta_productinfo_mms_day_15 union all select city_code ,'9999'area_code ,'9999'town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value from dwtmp.tmp_ta_productinfo_mms_day_15 group by city_code union all select city_code ,area_code ,'9999'town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value from dwtmp.tmp_ta_productinfo_mms_day_15 group by city_code,area_code union all select city_code ,area_code ,town_code ,round(sum(nvl(FEE_ACC,0))*1.00/10000000,1) as day_value from dwtmp.tmp_ta_productinfo_mms_day_15 group by city_code,area_code,town_code; ; |
五、 自定义udf数据倾斜优化
sql语句
|
insert into dwtmp.suyan_tmp_ft_mid_user_work_model_day_02 select t3.msisdn,t3.lon,t3.lat,t3.sta_dur_total,t3.live_user_flag, t3.neighbourhood_code,t3.center_lon_wgs84,t3.center_lat_wgs84, t3.border_poi |