1. key空值过多导致 任务异常(数据倾斜)处理方案
1. 空值过滤
说明 :
1. 当 key 的空值过多时,这些空值 key 会被分配到同一个 reduce 中处理,导致该 reduceTask 内存不足,进而任务失败
处理 :
在不影响业务的情况下将 key为null的数据处理掉
2. 空值转换
说明 : 大表中 key=null 的值过多,且无法过滤掉空值 key
处理 : 将空值key打散 (nvl(n.id,rand()))
消除数据倾斜,均衡各 reducer 的资源消耗,
使每个 reduceTask 尽量处理相同量的数据
测试 :
-- 测试
-- reduceNum=5,不将空key打散
reduceTask1 : 10sec
reduceTask2 : 4sec
reduceTask3 : 4sec
reduceTask4 : 4sec
reduceTask5 : 5sec
-- reduceNum=5,将空key打散
reduceTask1 : 6sec
reduceTask2 : 6sec
reduceTask3 : 5sec
reduceTask4 : 5sec
reduceTask5 : 6sec
-- 测试sql
-- Create the "null id" table used as the left side of the skew test joins.
create table nullidtable (
    id        bigint,
    t         bigint,
    uid       string,
    keyword   string,
    url_rank  int,
    click_num int,
    click_url string
)
row format delimited
fields terminated by '\t';

-- Load the local file into the Hive table (tab-separated, per the DDL above).
load data local inpath '/root/nullid' into table nullidtable;
-- (Control group) 5 reducers, NULL join keys are NOT scattered.
-- Session settings for this run.
-- NOTE(review): the two yarn.* properties look like cluster-side YARN settings;
-- confirm that a session-level `set` has any effect on them.
set yarn.scheduler.maximum-allocation-mb=118784;
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096;
set yarn.nodemanager.vmem-pmem-ratio=4.2;
set mapreduce.job.reduces = 5;

-- Plain equi-join on id: every row of nullidtable is kept (left join),
-- and only the columns of nullidtable are written to jointable.
insert overwrite table jointable
select
    n.*
from nullidtable as n
left join bigtable as o
    on n.id = o.id;
-- (Experimental group) 5 reducers, NULL join keys ARE scattered.
-- Session settings for this run (same values as the control group).
-- NOTE(review): the two yarn.* properties look like cluster-side YARN settings;
-- confirm that a session-level `set` has any effect on them.
set yarn.scheduler.maximum-allocation-mb=118784;
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096;
set yarn.nodemanager.vmem-pmem-ratio=4.2;
set mapreduce.job.reduces = 5;

-- Rows whose id is NULL get a random placeholder key so they no longer all
-- share one join key.
--
-- Why not nvl(n.id, rand()):
--   * n.id is bigint and rand() is double, so the key is widened to double;
--     comparing ids as doubles can make distinct bigint values above 2^53
--     compare equal and produce false matches.
--   * a bare random number is not guaranteed to lie outside the real id domain.
-- Comparing as strings keeps non-NULL ids exact, and the 'null_' prefix can never
-- equal the decimal text of a numeric id, so placeholder keys never match.
-- Because placeholder keys never match, the left join output for NULL-id rows
-- does not depend on the random value drawn.
-- NOTE(review): assumes o.id is bigint (or already string) - confirm against
-- the bigtable DDL, which is not visible here.
insert overwrite table jointable
select
    n.*
from nullidtable as n
left join bigtable as o
    on coalesce(
           cast(n.id as string),
           concat('null_', cast(rand() as string))
       ) = cast(o.id as string);