Flink SQL: ingesting Kafka data into Hudi

-- Parameter settings

-- Set the default parallelism
set 'parallelism.default' = '5';
-- Set the SQL client result display mode to tableau
set 'sql-client.execution.result-mode' = 'tableau';

reset execution.savepoint.path;
--reset execution.checkpoint.path;


set 'yarn.application.queue' = 'realtime';
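
-- Note: the Hudi sink only commits data when a Flink checkpoint completes, so the job needs
-- periodic checkpointing. A minimal sketch, assuming the interval is not already configured
-- in flink-conf.yaml (the 60 s value below is illustrative, not from the original job):
set 'execution.checkpointing.interval' = '60s';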

-- Define the source table (Kafka)


drop table if exists top_bos_login_dtl_kafka;

create table top_bos_login_dtl_kafka(
serial_number String,
bus_time String,
`result`  String,
errcode String,
serialnum_bg String,
bizcode String,
deal_time String,
serialnum_boss String,
errdesc String,
mbosversion String,
clnt_ver String,
bosscode String,
cid String,
push_cid String,
xk String,
channel_code String,
imei String,
ip String,
mb_type_info String,
scr_pix String,
mb_type_brand String,
sys_plat_ver String,
network_type String,
province_code String,
city_code        String,
event_day String
) WITH (
'connector' = 'kafka',
'topic' = 'top-bos-login-dtl',
'properties.bootstrap.servers' = '10.209.77.43:6667,10.209.77.46:6667,10.209.77.49:6667,10.209.77.53:6667,10.209.77.54:6667,10.209.77.57:6667,10.209.77.58:6667,10.209.77.66:6667,10.209.77.71:6667,10.209.77.35:6667,10.209.77.15:6667,10.209.77.75:6667',
'properties.group.id' = 'topicx-groupid',
'scan.startup.mode' = 'latest-offset',  
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true',
'format' = 'json'
);
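
-- Optional sanity check: once the source table is registered, an ad-hoc query in the SQL client
-- (tableau mode) confirms that the JSON messages are being parsed as expected. This is only a
-- preview, not part of the ingestion job; cancel it before submitting the insert further below.
select serial_number, bus_time, `result`, event_day from top_bos_login_dtl_kafka;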

-- Define the sink table (Hudi)


drop table if exists ods_bos_login_dtl_hudi;

create table ods_bos_login_dtl_hudi(
serial_number String,
bus_time String,
result_status  String,
errcode String,
serialnum_bg String,
bizcode String,
deal_time String,
serialnum_boss String,
errdesc String,
mbosversion String,
clnt_ver String,
bosscode String,
cid String,
push_cid String,
xk String,
channel_code String,
imei String,
ip String,
mb_type_info String,
scr_pix String,
mb_type_brand String,
sys_plat_ver String,
network_type String,
province_code String,
city_code        String,
event_day String
) PARTITIONED BY (`event_day`)
with (
'connector' = 'hudi',
'path' = '/apps/hudi/ods_bos_login_dtl_hudi',
'table.type' = 'MERGE_ON_READ',        -- with MERGE_ON_READ, Hive sees no output until parquet files have been generated
'hoodie.datasource.write.recordkey.field' = 'serial_number,bus_time',  -- record key (primary key)
'hoodie.parquet.max.file.size' = '268435456',   -- max size of the parquet files produced by the Hudi write stage: 256 MB
'write.precombine.field' = 'event_day',
'write.task.max.size' = '1024',  -- max memory per write task in MB; when the threshold is reached, the largest data buckets are flushed to avoid OOM; default 1 GB
'write.rate.limit' = '30000',   -- write rate limit in records per second, to smooth traffic spikes and improve stability; default 0 (no limit)
'write.tasks'= '5',
--'write.bucket_assign.tasks'='8',
--'write.insert.deduplicate'='false',
'write.operation' = 'insert', -- bulk_insert, insert, upsert
--'write.bulk_insert.shuffle_by_partition' = 'false',
--'write.bulk_insert.sort_by_partition' = 'false',
'compaction.tasks'= '5', 
'compaction.async.enabled'= 'true',
'compaction.trigger.strategy'= 'num_commits',
'compaction.delta_commits'= '10',
'compaction.delta_seconds' = '3600', 
'compaction.schedule.enabled' = 'true',
'compaction.max_memory' = '1024',
'compaction.target_io' = '5120',
'changelog.enabled'= 'true',
'read.streaming.enabled'= 'true',
'read.streaming.check-interval'= '5',
'hive_sync.enable' = 'true',           -- required, enable Hive sync
'hive_sync.mode' = 'hms',            -- required, set the Hive sync mode to hms (default is jdbc)
'hive_sync.metastore.uris' = 'thrift://hadoop13.nb:9083', -- required, metastore URI (Ningbo cluster)
'hive_sync.table' = 'ods_bos_login_dtl_hudi',                          -- required, name of the Hive table to create
'hive_sync.db' = 'cmbh_real_time'                       -- required, name of the Hive database to use
);
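
-- Optional: because 'read.streaming.enabled' = 'true', the same table definition also supports
-- streaming reads from the SQL client. A hedged example for ad-hoc verification only; new
-- commits surface after each 5 s check interval configured above:
select serial_number, bus_time, result_status, event_day from ods_bos_login_dtl_hudi;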

-- Insert data from Kafka into Hudi


insert into ods_bos_login_dtl_hudi 
select serial_number,bus_time,`result` as result_status,
errcode,serialnum_bg,bizcode,deal_time,serialnum_boss,
errdesc,mbosversion,clnt_ver,bosscode,cid,push_cid,
xk,channel_code,imei,ip,mb_type_info,scr_pix,mb_type_brand,
sys_plat_ver,network_type,province_code,city_code,
event_day  from top_bos_login_dtl_kafka;
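
-- Verification on the Hive side (run in Hive/beeline, not the Flink SQL client). For a
-- MERGE_ON_READ table, Hive sync registers a read-optimized table (suffix _ro) and a
-- real-time table (suffix _rt); the names below assume the sync settings above succeeded.
select count(*) from cmbh_real_time.ods_bos_login_dtl_hudi_ro;  -- compacted parquet data only
select count(*) from cmbh_real_time.ods_bos_login_dtl_hudi_rt;  -- parquet base files merged with log files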
