1. The three kinds of join in Hive
    1. Reduce-side join, also called Common Join or Shuffle Join
    2. MapJoin (the switches for the first two kinds are sketched after this list)
    3. Sort Merge Bucket Join (join on bucketed tables)
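For reference, the first two kinds are driven by ordinary Hive session settings; a minimal sketch (the values shown are the usual defaults):
-- MapJoin: the small table is broadcast to every map task and joined in
-- memory, so no shuffle or reduce phase is needed.
set hive.auto.convert.join=true;               -- let Hive convert joins to MapJoin automatically
set hive.mapjoin.smalltable.filesize=25000000; -- size threshold (bytes) for the "small" table
-- Common Join / Shuffle Join is the fallback when neither side fits:
-- both tables are shuffled on the join key and matched in the reducers.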
2. SMB (Sort Merge Bucket) Join: join on bucketed tables
Notes: when joining one large table to another, even if the keys are evenly distributed, the sheer data volume alone can make the job fail or run far too long.
In that case, consider bucketing both large tables to optimize the job.
Principle:
hash(join key) % number of buckets = bucket number
Bucket 1 of table A is joined only with bucket 1 of table B, bucket 2 with bucket 2, and so on, so each pair of buckets can be joined independently (a worked example follows the note below).
Note: both table A and table B must be bucketed and sorted on the join key with the same bucketing rule, i.e. the same number of buckets.
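A quick way to see the assignment is to compute it with built-in functions; a sketch using hash() and pmod() (this illustrates the principle, but newer Hive versions use a different internal hash for bucketing, so it need not match byte-for-byte):
-- Which bucket would each id land in, given 2 buckets?
select id, pmod(hash(id), 2) as bucket_no
from bigtable
limit 10;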
Parameters:
set hive.optimize.bucketmapjoin=true;              -- enable bucket map join
set hive.optimize.bucketmapjoin.sortedmerge=true;  -- use sort-merge when the buckets are sorted
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;  -- feed whole bucket files to each task
3. Test case
-- Control group
-- Table A, not bucketed
create table bigtable2(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string)
row format delimited fields terminated by '\t';
-- Table B, not bucketed
create table bigtable(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string)
row format delimited fields terminated by '\t';
-- Load the data
load data local inpath '/root/bigtable' into table bigtable2;
load data local inpath '/root/bigtable' into table bigtable;
-- Join with the non-bucketed tables
-- Note: the two yarn.* settings below are cluster-side (ResourceManager /
-- NodeManager) configs and normally cannot be changed from a Hive session;
-- only the per-job mapreduce.* memory settings actually take effect here.
set yarn.scheduler.maximum-allocation-mb=118784;
set mapreduce.map.memory.mb=4096;    -- container memory per map task
set mapreduce.reduce.memory.mb=4096; -- container memory per reduce task
set yarn.nodemanager.vmem-pmem-ratio=4.2;
insert overwrite table jointable
select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url
from bigtable s
join bigtable2 b
on b.id = s.id;
Time taken: 109.024 seconds
-- Experimental group
-- Create bucketed table 1; keep the bucket count no larger than the number of CPU cores
create table bigtable_buck1(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string)
clustered by(id)
sorted by(id)
into 2 buckets
row format delimited fields terminated by '\t';
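-- Note: whether LOAD DATA actually buckets the data depends on the Hive
-- version: Hive 3.x rewrites a load into a bucketed table as an insert,
-- while older versions may reject it or leave the data unbucketed. If in
-- doubt, populate bucketed tables with INSERT ... SELECT instead (this
-- applies to both loads in this section).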
load data local inpath '/root/bigtable' into table bigtable_buck1;
-- Create bucketed table 2; same bucket count as table 1 (required for SMB), and still no more buckets than CPU cores
create table bigtable_buck2(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string)
clustered by(id)
sorted by(id)
into 2 buckets
row format delimited fields terminated by '\t';
load data local inpath '/root/bigtable' into table bigtable_buck2;
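Before running the join it is worth checking that the bucketing took effect; a quick sketch (the warehouse path below is Hive's default location and is only an assumption about this cluster):
-- Expect two bucket files under each table's directory.
dfs -ls /user/hive/warehouse/bigtable_buck1;
-- TABLESAMPLE can target one bucket; on a properly bucketed table it
-- prunes down to a single bucket file instead of scanning everything.
select count(*) from bigtable_buck1 tablesample(bucket 1 out of 2 on id);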
-- Parameter settings (enable the bucketed join)
set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
insert overwrite table jointable
select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url
from bigtable_buck1 s
join bigtable_buck2 b
on b.id = s.id;
Time taken: 64.895 seconds
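To confirm the speedup really comes from the SMB path rather than other settings, inspect the plan; a sketch (the operator name can vary slightly across Hive versions):
explain
select b.id, b.t, b.uid, b.keyword, b.url_rank, b.click_num, b.click_url
from bigtable_buck1 s
join bigtable_buck2 b
on b.id = s.id;
-- When the conversion succeeds, the plan contains a
-- "Sorted Merge Bucket Map Join Operator" in place of a plain Join
-- Operator fed by a shuffle.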