使用presto 进行跨库数据对比

需求:

  • 在dolphinscheduler上通过sqoop脚本将源库mysql数据同步至doris ,现需要对比 doris 和 源库mysql 表数据是否一致 。

对比粒度:

  • 明细+汇总  

实现:

  • 使用 information_schema.columns 获取相关的表所有字段 (如果是mysql和hive进行对比需要因DDL可能有差异 如 decimal(12,4) 类型,double ,datetime/timestamp(x) 需要根据字段类型做处理 
  • 是map将字段名和值进行组合
  • 使用唯一键进行关联
  • 使用transform函数将数据转为 {kay:[a.value,b.value]} 格式 (参考脚本2)
  • 使用filter函数进行过滤 ,a.value<>b.value (参考脚本2)
  • ...

知识点:

  • map
  • array
  • transform
  • json
  • 官网function:https://trino.io/docs/current/functions/array.html

遗留问题:

  • mysql有写字段是 json 类型的,在json 转 varchar 的时候 会出现异常 ,尝试使用 json_format(json null )/json_format(json CAST('{}' AS json)) /  json_format(json coalesce(....))  都无法解决,最终没有对比json类型的字段  使用这种 cast( json_format(json '{"":""}') as varchar)方式屏蔽json类型的字段 。

 脚本1

 1 SELECT 
 2 concat(
 3 'select a.* from ('
 4 ,concat(' ' ,array_join (TRANSFORM(table_catalog_schema ,_->concat('select id, map(array[' ,column_name_str ,'],split(' ,array_join(TRANSFORM(column_name_list ,_->concat('coalesce(cast('    ,CASE WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'decimal%' OR element_at(column_type_list,array_position(column_name_list,_)) ='double' THEN  concat('REGEXP_REPLACE(cast(',_,' as varchar),','''([0]+$)''',',','''''',')') WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'timestam%' OR element_at(column_type_list,array_position(column_name_list,_)) LIKE 'datetime' THEN  concat('substring( cast(',_,' as varchar) ,1,19)') WHEN element_at(column_type_list,array_position(column_name_list,_)) = 'json'  THEN  concat('json_format(json ','''{"":""}''',')') ELSE _  END    ,' as varchar)',',',chr(39),'NULL',chr(39),')')),concat('||','''^''','||'))  ,' ,''^''))' ,' from ',_,'.',table_name  ,' where '  , CASE WHEN contains(column_name_list,'create_time')  THEN concat( 'create_time',' between ' ,'date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY  )' ,' and ' ,'date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END  ,CASE WHEN array_position(table_catalog_schema,_)<> dcnt THEN ' union all ' ELSE ' ' END  )),' '))                                            
 5 ,') a left join  ('
 6 ,concat(' ',concat('select id, map(array[',column_name_str ,'],split(' ,array_join(TRANSFORM(column_name_list ,_->concat('coalesce(cast('    ,CASE WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'decimal%' OR element_at(column_type_list,array_position(column_name_list,_)) ='double' THEN  concat('REGEXP_REPLACE(cast(',_,' as varchar),','''([0]+$)''',',','''''',')') WHEN element_at(column_type_list,array_position(column_name_list,_)) LIKE 'timestam%' OR element_at(column_type_list,array_position(column_name_list,_)) LIKE 'datetime' THEN  concat('substring( cast(',_,' as varchar) ,1,19)') WHEN element_at(column_type_list,array_position(column_name_list,_)) = 'json'  THEN  concat('json_format(json ','''{"":""}''',')') ELSE _  END    ,' as varchar)',',',chr(39),'NULL',chr(39),')')),concat('||','''^''','||')) ,' ,''^''))' ,' from doris.',doris_tabName  ,' where ' , CASE WHEN contains(column_name_list,'create_time') THEN concat( 'create_time',' between ','date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY  )',' and ','date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END ,' and msg_is_valid =1 ' ))
 7 ,') b on a.id=b.id WHERE b.id IS NULL '  
 8 ) detal_SQL  -- 根据明细对比的SQL
 9 ,concat(
10     'select a.*,b.*,a.cnt-b.cnt from (' 
11     ,array_join (TRANSFORM(table_catalog_schema,_->concat('select  ''',substr(_,strpos(_,'.')+1)||'.'||table_name,''' cate_log_schema, count(1) cnt  from ',_,'.',table_name ,' where ' , CASE WHEN contains(column_name_list,'create_time') THEN concat( 'create_time',' between ','date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY  )',' and ','date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END ,CASE WHEN array_position(table_catalog_schema,_)<> dcnt THEN ' group by 1 union all ' ELSE ' group by 1' END )),' ')
12     ,') a left join ( '    
13     ,concat('select  concat(msg_source_db,''.'',msg_source_table) cate_log_schema',', count(1) cnt  from doris.',doris_tabName ,' where ' , CASE WHEN contains(column_name_list,'create_time') THEN concat( 'create_time',' between ','date_trunc(''day'',current_timestamp -INTERVAL ''1'' DAY  )',' and  ','date_add(''HOUR'',-1,cast(date_format(current_timestamp,''%Y-%m-%d %H:00:00'') AS timestamp))') ELSE '1=1' END ,' and msg_is_valid =1  group by 1' )
14     ,') b on a.cate_log_schema=b.cate_log_schema '
15 ) agg_Sql_Str  -- 根据汇总数据对比的Sql
16 FROM (
17 SELECT table_name
18 ,max(doris_tabName) doris_tabName
19 --,array_agg(DISTINCT doris_tabName) doris_tabName
20 ,array_agg(DISTINCT  concat(table_catalog,'.',table_schema)) table_catalog_schema
21 ,count(DISTINCT  concat(table_catalog,'.',table_schema)) dcnt
22 ,max(column_name_list) column_name_list
23 ,max(column_name_str) column_name_str
24 ,max(columnCnt) columnCnt
25 ,max(column_type_list) column_type_list
26 FROM (
27 SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list FROM oms4_som01.information_schema.columns WHERE table_schema NOT IN ('information_schema','sys','pt','maxwell','db_oms4_som998','db_ecs_oms4_som_nike_prod') GROUP BY 1,2,3
28 UNION ALL  SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list  FROM oms4_som02.information_schema.columns  WHERE table_schema NOT IN ('information_schema','sys','pt','maxwell','db_oms4_som999') GROUP BY 1,2,3
29 UNION ALL  SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list  FROM oms4_baseinfo.information_schema.columns  WHERE table_schema NOT IN ('db_sec_compare_prod') GROUP BY 1,2,3
30 UNION ALL  SELECT table_catalog,table_schema,table_name, array_join(array_agg(concat('''',column_name,'''')),',') column_name_str,array_agg(column_name ORDER BY ordinal_position) column_name_list ,count(column_name) columnCnt ,array_agg(data_type ORDER BY ordinal_position ) column_type_list FROM oms4_finance.information_schema.columns   WHERE table_schema NOT IN ('db_sec_compare_prod') GROUP BY 1,2,3
31 )t INNER JOIN (
32 SELECT *
33 FROM (VALUES -- (doris表名, 源库表明)
34 ('db_ods.oms4_db_base_info_bi_brand','bi_brand'),
35 ('db_ods.oms4_db_base_info_bi_sales_entity_brand_ref','bi_sales_entity_brand_ref'),
36 ('db_ods.oms4_db_base_info_bi_sys_goods_tax_rate','bi_sys_goods_tax_rate'),
37 ('db_ods.oms4_rf_refund','rf_refund'),
38 ('db_ods.oms4_db_oms4_som_trade_order_guide_info','trade_order_guide_info'),
39 ('db_ods.oms4_db_oms4_som_trade_order_item','trade_order_item'),
40 ('db_ods.oms4_db_oms4_som_trade_order_member_info','trade_order_member_info'),
41 ('db_ods.oms4_db_oms4_som_trade_order_payment_info','trade_order_payment_info'),
42 ('db_ods.oms4_db_oms4_som_trade_return_order','trade_return_order')
43 --('db_ods.oms4_db_oms4_som_trade_order_guide_info','trade_order_guide_info'),
44 --('db_ods.oms4_db_oms4_som_trade_order_delivery_info','trade_order_delivery_info'),
45 --('db_ods.oms4_db_oms4_som_so_order_timing_promise','so_order_timing_promise')
46 ) AS tab (doris_tabName,src_db_tabName)
47 ) s ON t.table_name=s.src_db_tabName
48 GROUP BY table_name
49 ) tt
View Code

 脚本2

 1 SELECT hiveTableName,
 2 concat('select t.* from ( SELECT a.*,b.*,  map_filter(transform_values(a.datas,(k,v)->b.datas[k]||array[v]),(k,v)->v[1]<>v[2]) diff_fields   from (' 
 3 ,concat('select ''dmztc'' ds, ''',hiveTableName,''' src_tname , cast(id as varchar) id, map( array[', src_column_name_char,']' ,' , split(',format_column_name2,',','''^''',')) as datas FROM ',catalogs,SrcTableName ,' where ',concat('substring(CAST(',increnmentField,' AS varchar),1,10)'),'=',chr(39),CAST(current_date -INTERVAL '1' day AS varchar),chr(39))
 4 ,') a left join ('
 5 ,concat('select ''hive'' ds,''',hiveTableName,''' hive_tname ,id,map( array[', src_column_name_char,']' ,' , split(',format_column_name2,',','''^''','))  as datas FROM hive.' ,hiveTableName , ' where pt= ',chr(39),cast( current_date -INTERVAL '1' DAY AS varchar),chr(39) ,'and ',concat('substring(CAST(',increnmentField,' AS varchar),1,10)'),'=',chr(39),CAST(current_date -INTERVAL '1' day AS varchar),chr(39))
 6 ,') b on a.id=b.id ) t order by   CARDINALITY(diff_fields) desc    ')
 7 FROM (
 8 SELECT concat(table_catalog,'.',table_schema,'.') catalogs ,table_name SrcTableName, max(hiveTableName) hiveTableName ,max(increnmentField) increnmentField
 9 ,array_agg(column_name) src_column_name_array
10 ,array_join(array_agg(concat(chr(39),column_name,chr(39))),',') src_column_name_char
11 ,ARRAY_join(array_agg(concat('coalesce(cast('
12 ,CASE WHEN data_type LIKE 'decimal%' OR data_type ='double' THEN  concat('REGEXP_REPLACE(cast(',column_name,' as varchar),','''([0]+$)''',',','''''',')')  -- 处理
13       WHEN data_type LIKE 'timestam%' OR data_type LIKE 'datetime' THEN  concat('substring( cast(',column_name,' as varchar) ,1,19)') ELSE column_name  END
14 ,' as varchar)',',',chr(39),'NULL',chr(39),')')),concat('||','''^''','||')) format_column_name2
15 FROM (
16 SELECT table_catalog ,table_schema ,table_name ,column_name ,data_type FROM dmztc.information_schema.columns i 
17 UNION ALL 
18 SELECT table_catalog ,table_schema ,table_name ,column_name ,data_type FROM dm_tencent_ad.information_schema.columns i 
19 ) i
20 INNER JOIN (
21 SELECT * FROM (VALUES  -- (db_ods.或者db_working表明,源库表明,增量更新字段)
22 --('db_ods.dm_db_tencent_ad_prod_mp_adgroup_his_effect',    'mp_adgroup_his_effect',    'last_modify_time'),
23 --('db_ods.dm_db_tencent_ad_prod_tencent_account_hour_his_effect_industry',    'tencent_account_hour_his_effect_industry',    'last_modify_time'),
24 --('db_ods.dm_db_tencent_ad_prod_tencent_account_hour_his_effect_interaction',    'tencent_account_hour_his_effect_interaction',    'last_modify_time'),
25 --('db_ods.dm_db_tencent_ad_prod_tencent_ad_hour_his_effect_basic',    'tencent_ad_hour_his_effect_basic',    'last_modify_time'),
26 --('db_ods.dm_db_tencent_ad_prod_tencent_ad_hour_his_effect_industry',    'tencent_ad_hour_his_effect_industry',    'last_modify_time')
27 ('db_working.dm_db_tencent_ad_prod_mp_ad_his_effect_tmp',    'mp_ad_his_effect',    'last_modify_time'),
28 ('db_working.dm_db_tencent_ad_prod_mp_adertiser_his_effect_tmp',    'mp_adertiser_his_effect',    'last_modify_time'),
29 ('db_working.dm_db_tencent_ad_prod_tencent_account_his_effect_industry_tmp',    'tencent_account_his_effect_industry',    'last_modify_time'),
30 ('db_working.dm_db_tencent_ad_prod_tencent_account_his_effect_interaction_tmp',    'tencent_account_his_effect_interaction',    'last_modify_time'),
31 ('db_working.dm_db_tencent_ad_prod_tencent_ad_his_effect_basic_tmp',    'tencent_ad_his_effect_basic',    'last_modify_time'),
32 ('db_working.dm_db_tencent_ad_prod_tencent_ad_his_effect_industry_tmp',    'tencent_ad_his_effect_industry',    'last_modify_time'),
33 ('db_working.dm_db_tencent_ad_prod_tencent_ad_hour_his_effect_interaction_tmp',    'tencent_ad_hour_his_effect_interaction',    'last_modify_time'),
34 ('db_working.dm_db_tencent_ad_prod_tencent_advertising_his_effect_basic_tmp',    'tencent_advertising_his_effect_basic',    'last_modify_time'),
35 ('db_working.dm_db_tencent_ad_prod_tencent_advertising_hour_his_effect_industry_tmp',    'tencent_advertising_hour_his_effect_industry',    'last_modify_time'),
36 ('db_working.dm_db_tencent_ad_prod_tencent_advertising_hour_his_effect_interaction_tmp',    'tencent_advertising_hour_his_effect_interaction',    'last_modify_time'),
37 ('db_working.dm_db_tencent_ad_prod_tencent_campaign_hour_his_effect_basic_tmp',    'tencent_campaign_hour_his_effect_basic',    'last_modify_time'),
38 ('db_working.dm_db_tencent_ad_prod_tencent_campaign_hour_his_effect_industry_tmp',    'tencent_campaign_hour_his_effect_industry',    'last_modify_time')
39 
40 ) AS config (hiveTableName,SrcTableName,increnmentField)
41 ) c ON i.table_name =c.SrcTableName
42 GROUP BY 1 ,2
43 )
View Code

 

posted @ 2023-06-21 15:19  linbo.yang  阅读(76)  评论(0编辑  收藏  举报