
UDTF Function Template

-- Sample raw event log record (one value of the `line` column):
/*
1653219286244|{"cm":{"ln":"-39.0","sv":"V2.4.3","os":"8.1.3","g":"QK98H42I@gmai"mid":"998","nw":"WIFI","l":"es","vc":"13","hw":"1080*1920","ar":"MX","uid":"99"1653179531755","la":"-8.1",
"md":"Huawei-14","vn":"1.0.9","ba":"Huawei","sr":"J:"app","et":[{"ett":"1653196354991","en":"notification","kv":{"ap_time":"165316","action":"4","type":"1","content":""}},
{"ett":"1653197089099","en":"active_fod","kv":{"access":"1","push_id":"3"}},{"ett":"1653145320863","en":"active_backg"kv":{"active_source":"2"}},{"ett":"1653137352302","en":"error",
"kv":{"errorDetava.lang.NullPointerException\\n at cn.lift.appIn.web.AbstractBaseControllernbound(AbstractBaseController.java:72)\\n at cn.lift.dfdf.web.AbstractBaseContralidInbound",
"errorBrief":"at cn.lift.appIn.control.CommandUtil.getInfo(Commandva:67)"}},{"ett":"1653136559468","en":"favorites","kv":{"course_id":2,"id":0,"a":"1653179650021","userid":8}},
{"ett":"1653187670283","en":"praise","kv":{"targ1,"id":6,"type":1,"add_time":"1653187575279","userid":0}}]}
*/

-- Create the ODS-layer external partitioned table for event logs
drop table if exists ods_event_log;
create external table if not exists ods_event_log(
line STRING comment "one raw line of the compressed log data on HDFS"
)
partitioned by (dt STRING)
stored as
inputformat 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/outTable/gmall/ods/ods_event_log';

-- Load the raw log data into the 2022-05-25 partition
load data inpath '/origin_data/gmall/log/topic_event/2022-05-25' into table gmall.ods_event_log partition (dt='2022-05-25');

select * from ods_event_log limit 2;

-- Create the DWD-layer event table
drop table if exists dwd_evevt_log;
create external table if not exists dwd_evevt_log(
`mid_id` string,
`user_id` string,
`version_code` string,
`version_name` string,
`lang` string,
`source` string,
`os` string,
`area` string,
`model` string,
`brand` string,
`sdk_version` string,
`gmail` string,
`height_width` string,
`app_time` string,
`network` string,
`lng` string,
`lat` string,
`event_name` string,
`event_json` string,
`server_time` string
)
partitioned by (dt STRING)
stored as parquet
location '/outTable/gmall/dwd/dwd_evevt_log'
TBLPROPERTIES ('parquet.compression'='lzo');

-- Create the custom UDF
create function base_analizer as 'com.bw.udf.MyUDF' using jar
'hdfs://hadoop106:8020/user/hive/jars/hivefunction-1.0-SNAPSHOT.jar';
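The jar above only supplies the class name; the Java source of com.bw.udf.MyUDF is not shown in this post. Below is a minimal hypothetical sketch of what such a field-extracting UDF could look like, assuming each raw line has the server_time|json shape seen in the sample at the top, that the common fields sit in the "cm" object, and that org.json is on the classpath; the real implementation may differ.

package com.bw.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

// Hypothetical sketch of base_analizer: extracts one field from a raw log line.
// Assumes the line is "server_time|json" and that common fields live in the "cm" object.
public class MyUDF extends UDF {

    public String evaluate(String line, String key) {
        if (line == null || key == null) {
            return "";
        }
        String[] parts = line.split("\\|");
        if (parts.length != 2) {
            return "";                                    // malformed line
        }
        try {
            JSONObject json = new JSONObject(parts[1].trim());
            if ("st".equals(key)) {
                return parts[0].trim();                   // server timestamp precedes the '|'
            }
            if ("et".equals(key)) {
                JSONArray et = json.optJSONArray("et");   // event array, handed to the UDTF
                return et == null ? "" : et.toString();
            }
            return json.getJSONObject("cm").optString(key, "");
        } catch (JSONException e) {
            return "";                                    // skip bad JSON instead of failing the query
        }
    }
}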
-- Create the custom UDTF
create function flat_analizer as 'com.bw.udf.MyUDTF' using jar
'hdfs://hadoop106:8020/user/hive/jars/hivefunction-1.0-SNAPSHOT.jar';
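Likewise, here is a minimal hypothetical sketch of what com.bw.udf.MyUDTF (flat_analizer) could look like: it takes the event-array string returned by base_analizer(line,'et') and forwards one (event_name, event_json) row per event. The class name and output column names follow the SQL in this post; the body and the use of org.json are assumptions.

package com.bw.udf;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

// Hypothetical sketch of flat_analizer: explodes the "et" JSON array into one row per event.
public class MyUDTF extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        // Declare the two output columns and their types.
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldOIs = new ArrayList<>();
        fieldNames.add("event_name");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("event_json");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        String input = (args == null || args[0] == null) ? "" : args[0].toString();
        if (input.isEmpty()) {
            return;                                       // nothing to emit for an empty event array
        }
        try {
            JSONArray events = new JSONArray(input);
            for (int i = 0; i < events.length(); i++) {
                JSONObject event = events.getJSONObject(i);
                // One output row per event: its name plus the full event JSON.
                forward(new String[]{event.optString("en", ""), event.toString()});
            }
        } catch (JSONException e) {
            // Skip malformed records rather than failing the whole insert.
        }
    }

    @Override
    public void close() throws HiveException {
        // No resources to release.
    }
}

With both functions registered, the lateral view query below can flatten each raw line's events into separate rows of the DWD table.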

-- Allow all partitions to be dynamic; otherwise at least one static partition is required.
-- (The insert below uses a static partition, so this setting is optional here.)
set hive.exec.dynamic.partition.mode=nonstrict;

-- Parse the raw log lines and insert them into the DWD event table
insert overwrite table dwd_evevt_log partition (dt='2022-05-25')
select
base_analizer(line,'mid') as mid_id,
base_analizer(line,'uid') as user_id,
base_analizer(line,'vc') as version_code,
base_analizer(line,'vn') as version_name,
base_analizer(line,'l') as lang,
base_analizer(line,'sr') as source,
base_analizer(line,'os') as os,
base_analizer(line,'ar') as area,
base_analizer(line,'md') as model,
base_analizer(line,'ba') as brand,
base_analizer(line,'sv') as sdk_version,
base_analizer(line,'g') as gmail,
base_analizer(line,'hw') as height_width,
base_analizer(line,'t') as app_time,
base_analizer(line,'nw') as network,
base_analizer(line,'ln') as lng,
base_analizer(line,'la') as lat,
event_name,
event_json,
base_analizer(line,'st') as server_time
from ods_event_log lateral view flat_analizer(base_analizer(line,'et')) tmp_flat
as event_name,event_json
where dt='2022-05-25' and base_analizer(line,'et') <>'';

select * from dwd_evevt_log;

-- lateral view must be used together with a UDTF, such as explode(). A UDTF (user-defined table-generating function)
-- takes one input row and produces zero or more output rows. lateral view first applies the UDTF to each row; the rows
-- it produces form a virtual table, which is then joined back to the current row of the source table. The resulting
-- virtual table therefore contains the columns generated by the UDTF alongside all of the original table's columns.
-- For example, if a line's 'et' array holds three events, flat_analizer emits three (event_name, event_json) rows,
-- each still paired with that line's common fields.