shell 脚本 中的 函数、窗口函数中order by多字段时desc的用法、关于sql中的空值null及处理、增存量对比(1)
shell 脚本 中的 函数
#!/bin/bash # 指定了shell脚本解释器的路径,而且这个指定路径只能放在文件的第一行。第一行写错或者不写时,系统会有一个默认的解释器进行解释。
db_cr_hub1_ip="`get_database_info mysql.cr_hub1.ip`"
db_cr_hub1_database="`get_database_info mysql.cr_hub1.database`"
db_cr_hub1_username="`get_database_info mysql.cr_hub1.username`"
db_cr_hub1_password="`get_database_info mysql.cr_hub1.password`"
db_amarmonitor_ip="`get_database_info other.amarmonitor`"
impala_username="`get_database_info impala.username`"
impalad_host=`choose_impalad`
get_col(){ # shell中的函数
local s=$1 # $1 该脚本的第一个参数
local col_num=$2 # local 声明局部变量
local col_str=$3
let "col_num--" # let 命令和双小括号 (( )) 的用法是类似的,它们都是用来对整数进行运算,对col_num进行--操作
local ss=${s#*${col_str}}
if [[ ${col_num} -eq 0 ]] # if语句
then
echo ${ss%%${col_str}*}
else
local sss=${ss#*${col_str}}
get_col "$sss" $col_num "$col_str"
fi
}
…………
窗口函数中order by多字段时desc的用法
row_number() over(partition by t1.companycode,t1.mainunderwriter,t1.accountingfirm,t1.legaladvisor,t1.status order by t1.enddate,t1.eutime desc) as enddate_number --此时的desc仅对eutime生效
-- 想要desc对enddate也生效
row_number() over(partition by t1.companycode,t1.mainunderwriter,t1.accountingfirm,t1.legaladvisor,t1.status order by t1.enddate desc,t1.eutime desc) as enddate_number
关于sql中的空值null及处理
对于空值字段需要进行预处理 -- null不支持大小/相等判断
增存量对比(1)
use fin_dw;
--将过去第7天的数据插入到历史分区
insert into table app_api_sec_pre_ipo_situation_di partition (dt,data_status) -- qpp层以天和数据状态(新增\更新\失效)分区
select
ent_name,
ent_code,
addr,
ind,
guar_inst,
guar_per,
acct_firm,
sign_acct,
lawyer_firm,
sign_lawyer,
list_status,
is_submit_fin_rpt,
end_date,
pre_list_market,
remark1,
fin_type,
remark2,
update_date,
dw_ins_date,
query_org_id,
'his' as dt,
data_status
from app_api_sec_pre_ipo_situation_di
where dt >=concat(from_unixtime(unix_timestamp(days_sub(now(), 7)), 'yyyyMMdd'),'0000') -- 过去第七天0点
and dt <=concat(from_unixtime(unix_timestamp(days_sub(now(), 7)), 'yyyyMMdd'),'2359');-- 过去第七天23:59
--删除过去第7天的数据
alter table app_api_sec_pre_ipo_situation_di drop partition(dt between concat(from_unixtime(unix_timestamp(days_sub(now(),7)),'yyyyMMdd'),'0000') and concat(from_unixtime(unix_timestamp(days_sub(now(),7)),'yyyyMMdd'),'2359')); -- 删除过去第七天的分区
--拿目标表中最新数据插入临时表
insert overwrite table t_app_api_sec_pre_ipo_situation_di_01
select tmp.ent_name,tmp.ent_code,tmp.guar_inst,tmp.acct_firm,tmp.lawyer_firm,tmp.list_status,tmp.update_date,tmp.query_org_id
from (select ent_name,ent_code,guar_inst,acct_firm,lawyer_firm,list_status,update_date,query_org_id,
row_number() over(partition by ent_code,guar_inst,acct_firm,lawyer_firm,list_status,query_org_id order by nvl(update_date,'1990-01-01') desc) as rn
from app_api_sec_pre_ipo_situation_di ) tmp
where tmp.rn=1; -- 将表中最新的数据插入临时表
compute stats t_app_api_sec_pre_ipo_situation_di_01; -- 收集卷信息, 表列和分区的数据分布情况;收集的信息存储在元数据库中,用于优化Impala查询
--将新增数据插入到目标表
insert overwrite app_api_sec_pre_ipo_situation_di partition (dt,data_status)
select a.ent_name,
a.ent_code,
a.addr,
a.ind,
a.guar_inst,
a.guar_per,
a.acct_firm,
a.sign_acct,
a.lawyer_firm,
a.sign_lawyer,
a.list_status,
a.is_submit_fin_rpt,
a.end_date,
a.pre_list_market,
a.remark1,
a.fin_type,
a.remark2,
a.update_date,
from_unixtime(unix_timestamp(now())) as dw_ins_date,
a.bankid,
from_unixtime(unix_timestamp(now()),'yyyyMMddHHmm') as dt,
case when a.data_status='1' then 'D'
when b.ent_name is null then 'I'
when b.ent_name is not null then 'U' end as data_status -- 判断数据状态
from (
select a1.*,a2.bankid
from(
select *
from dwd_sec_pre_ipo_situation_dd
where dt = regexp_replace(to_date(subdate(now(), 1)), '-', '') -- 昨天的分区
and update_date>=to_timestamp(to_date(subdate(now(), 3)),'yyyy-MM-dd') --为了防止丢数据,拿近三天更新时间的数据进行增存量判断
) a1 cross join (
select distinct bankid
from fin_dw.interface_account_info where bankid = 'zhonghangjiangsuBANK'
) a2
) a
left join t_app_api_sec_pre_ipo_situation_di_01 b
on a.ent_code=b.ent_code and a.guar_inst=b.guar_inst and a.acct_firm=b.acct_firm and a.lawyer_firm=b.lawyer_firm and a.list_status=b.list_status and a.bankid = b.query_org_id -- 按照主键关联,让数据一一对应
where b.ent_name is null or (b.ent_name is not null and a.update_date > b.update_date); -- 过滤新增数据或更新数据