实时接口的shell脚本、数据推到Elasticsearch的python脚本
实时接口sh
#!/bin/bash
db_cr_packdb_url="`get_database_info mysql.packdb.url`"
db_cr_packdb_ip="`get_database_info mysql.packdb.ip`"
db_cr_packdb_database="`get_database_info mysql.packdb.database`"
db_cr_packdb_username="`get_database_info mysql.packdb.username`"
db_cr_packdb_password="`get_database_info mysql.packdb.password`"
db_app_compre_rsk_url="`get_database_info mysql.app_compre_rsk.url`"
db_app_compre_rsk_username="`get_database_info mysql.app_compre_rsk.username`"
db_app_compre_rsk_password="`get_database_info mysql.app_compre_rsk.password`"
hive_fin_dw_url="`get_database_info hive.fin_dw.url`"
hive_fin_dw_username="`get_database_info hive.fin_dw.username`"
impala_username="`get_database_info impala.username`"
impalad_host=`choose_impalad`
#判断临时文件是否存在,存在则删除
get_col(){
local s=$1
local col_num=$2
local col_str=$3
let "col_num--"
local ss=${s#*${col_str}}
if [[ ${col_num} -eq 0 ]]
then
echo ${ss%%${col_str}*}
else
local sss=${ss#*${col_str}}
get_col "$sss" $col_num "$col_str"
fi
}
SOURCE=`which impala-shell`
BIN_DIR="$(dirname "$SOURCE")"
while [ -h "$SOURCE" ]
do
SOURCE="$(readlink "$SOURCE")"
[[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
BIN_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
done
BIN_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
LIB_DIR=$BIN_DIR/../lib
SCRIPT_DIR=$LIB_DIR/impala-shell
SHELL_HOME=${IMPALA_SHELL_HOME:-${SCRIPT_DIR}}
export LC_CTYPE=${LC_CTYPE:-en_US.UTF-8}
PYTHON_EGG_CACHE=/tmp/impala-shell-python-egg-cache-${USER}
if [ ! -d ${PYTHON_EGG_CACHE} ]; then
mkdir ${PYTHON_EGG_CACHE}
fi
EGG_PATH=''
for EGG in $(ls ${SHELL_HOME}/ext-py/*.egg); do
EGG_PATH="${EGG}:${EGG_PATH}"
done
export PYTHONPATH="${EGG_PATH}${SHELL_HOME}/gen-py:${SHELL_HOME}/lib:${PYTHONPATH}"
sdate=`date +%Y-%m-%d' '%H:%M:%S`
lastresult=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -q "select dt from fin_dw.app_api_ent_bond_cash_di where dt !='his' order by dt desc limit 1"`
lastmaxdt=`get_col "$lastresult" 2 "|"`
lastresult=`mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"select count(9) from meta_task_run_log_dd where task_name='app_api_ent_bond_cash_di' and db_ins_date='$lastmaxdt' and task_status='1'"`
lastcountNum=`get_col "$lastresult" 2 "|"|awk '{print $2}'`
if [[ "$lastmaxdt" != "" && "$lastcountNum" != "0" ]] || [ "$lastmaxdt" = "" ]
then
impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -f /home/etl/app/sql/app_api_ent_bond_cash_di.sql
if [ $? -eq 0 ]
then
echo "Impala SQL执行成功!"
else
echo "Impala SQL执行失败!"
exit 1
fi
fi
sql="select count(9) as cnt from fin_dw.t_app_api_ent_bond_cash_di_01"
record_cnt=`impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -B -q "$sql"`
if [ $? -eq 0 ]
then
echo "Impala SQL执行成功!"
else
echo "Impala SQL执行失败!"
exit 1
fi
if [ "$record_cnt" = "0" ]
then
echo "无增量更新数据"
exit 0
fi
executeSql="insert into meta_task_run_log_dd(task_name,tbl_name,biz_date,begin_date,end_date,task_status,db_ins_date,record_cnt) values('app_api_ent_bond_cash_di','app_api_ent_bond_cash_di'"
lastmaxdt=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -B -q "select dt from fin_dw.app_api_ent_bond_cash_di where dt !='his' order by dt desc limit 1"`
impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -f /home/etl/app/sql/app_api_ent_bond_cash_push_di.sql --var=dt=$lastmaxdt
if [ $? -eq 0 ]
then
echo "Impala SQL执行成功!"
echo "导出数据到Mysql开始..."
sudo -u hdfs sqoop export --connect ${db_app_compre_rsk_url} --username ${db_app_compre_rsk_username} --password ${db_app_compre_rsk_password} --table app_api_ent_bond_cash_dd --update-mode allowinsert --update-key "sec_inner_code,cash_date" --hcatalog-database fin_dw --hcatalog-table app_api_ent_bond_cash_push_di --null-string '\\N' --null-non-string '\\N' -m 1;
if [ $? -eq 0 ]
then
echo "导出数据到Mysql成功"
edate=`date +%Y-%m-%d' '%H:%M:%S`
# mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"$executeSql,'$sdate','$sdate','$edate','1','$lastmaxdt','$record_cnt')" #这条打包日志的语句少东西
else
echo "导出数据到Mysql失败"
exit 1
fi
else
echo "Impala SQL执行失败!"
exit 1
fi
recordcnt=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -B -q "select count(1) from fin_dw.app_api_ent_bond_cash_di where dt ='$lastmaxdt' "`
impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -f /home/etl/app/sql/app_api_ent_bond_cash_v2_push_di.sql --var=dt=$lastmaxdt --var=record_cnt=$recordcnt
if [ $? -eq 0 ]
then
echo "Impala SQL执行成功!"
echo "输出到es开始 '"
python /home/etl/app/python/app_api_ent_bond_cash_di.py $lastmaxdt
if [ $? -eq 0 ]
then
echo "推送es成功"
else
echo "执行全量最新状态表sql失败"
exit 1
fi
else
echo "Impala SQL执行失败!"
exit 1
fi
# 新增批量接口日志打包
# 放在后面是为了先执行sql文件
ddate=`date +%Y%m%d%H%M`
result=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -q "select dt from fin_dw.app_api_ent_bond_cash_di where dt !='his' order by dt desc limit 1"`
maxdt=`get_col "$result" 2 "|"`
result=`mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"select count(9) from meta_task_run_log_dd where task_name='app_api_ent_bond_cash_di' and db_ins_date='$maxdt' and dw_src='fin_dw'"`
countNum=`get_col "$result" 2 "|"|awk '{print $2}'`
if [ "$maxdt" != "" ] && [ "$countNum" = "0" ]
then
echo "需要补日志"
result=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -q "select count(1) from fin_dw.app_api_ent_bond_cash_di where dt='$maxdt' and dt !='his'"`
countNum=`get_col "$result" 2 "|"`
mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"insert into meta_task_run_log_dd(task_name,tbl_name,biz_date,begin_date,end_date,task_status,db_ins_date,record_cnt,dw_src) values('app_api_ent_bond_cash_di','app_api_ent_bond_cash_di','$sdate','$sdate','$sdate','1','$maxdt','$countNum','fin_dw')"
else
echo "不需要补日志"
fi
数据推到Elasticsearch的python脚本
# -*- coding=utf-8 -*-
import sys
sys.path.append('/usr/local/bin')
from get_database_info import get_database_info
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import os
from impala.dbapi import connect
import logging
import datetime
import kudu
import resource
import commands
import hashlib
import re
logging.basicConfig(format='<%(asctime)s>[%(levelname)s]-%(filename)s:%(message)s', level=logging.INFO)
soft, hard=resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS,(21474836480, hard))
impalad_host = commands.getoutput('choose_impalad')
if __name__ == '__main__':
impala_username = get_database_info('impala.username')
kudu_master1 = get_database_info('kudu.master1')
kudu_master2 = get_database_info('kudu.master2')
kudu_master3 = get_database_info('kudu.master3')
hub_es = get_database_info('es.es_prod')
hub_es_list = hub_es.split(',')
es = Elasticsearch(hub_es_list, timeout=100, max_retries=3, retry_on_timeout=True)
tbl_dt = sys.argv[1]
class impala_client:
def __init__(self, host=None, impala_user=None, impala_ldap_pass=None):
self.conn = connect(host=host, port=21050, use_ldap=True, ldap_user=impala_user, ldap_password=impala_ldap_pass)
self.host = host
def __del__(self):
if self.conn is not None:
self.conn.close()
def impala_exec_result(self, sql):
'''
执行MySQL语句
:param sql:
:return:
'''
try:
# 执行sql查询,返回字典
cursor = self.conn.cursor()
# 执行语句
cursor.execute(sql)
# 获取全部数据
result = cursor.fetchall()
cursor.close()
logging.info("{}:执行Impala语句:{}".format(self.host, sql))
return result
except Exception as e:
logging.error("{}:执行Impala语句失败: {}".format(self.host, e))
return None
def impala_exec_solr_result(self, sql):
'''
执行MySQL语句
:param sql:
:return:
'''
try:
# 执行sql查询,返回字典
cursor = self.conn.cursor(dictify=True)
# 执行语句
cursor.execute(sql)
# 获取全部数据
result = cursor.fetchall()
cursor.close()
logging.info("{}:执行Impala语句:{}".format(self.host, sql))
return result
except Exception as e:
logging.error("{}:执行Impala语句失败: {}".format(self.host, e))
return None
def impala_exec(self, sql):
'''
执行MySQL语句
:param sql:
'''
try:
# 使用cursor()方法获取操作游标
cursor = self.conn.cursor(dictify=True)
# 执行sql语句
logging.info("{}:执行Impala语句:{}".format(self.host, sql))
cursor.execute(sql)
except Exception as e:
logging.error("{}:执行Impala语句失败: {}".format(self.host, e))
def f_tbl_data_deal(rows):
docs=[]
for row in rows:
index_action = {
"_op_type":'update',
"doc_as_upsert":True,
"_index":'assets_clue_info',
"_type":'_doc',
"_id":row[0],
"doc": {
"formattedEntName": unicode(row[1], 'utf-8'),
"entName": unicode(row[2], 'utf-8'),
"assetFlow": unicode(row[3], 'utf-8'),
"clueDate":row[4],
"clueType": unicode(row[5], 'utf-8'),
"assetsType1": unicode(row[6], 'utf-8'),
"assetsType2": unicode(row[7], 'utf-8'),
"isLitigation": unicode(row[8], 'utf-8'),
"totMoney": row[9],
"execEff": unicode(row[10], 'utf-8'),
"dataDtl": unicode(row[11], 'utf-8'),
"dataStatus": unicode(row[12], 'utf-8'),
"dwInsDate": unicode(row[13], 'utf-8'),
}
}
docs.append(index_action)
for ok, response in helpers.streaming_bulk(es, docs, raise_on_error=True, raise_on_exception=True, chunk_size=2000):
if not ok:
logging.info(response)
dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
logging.info(str(dw_ins_date) + '-开始')
logging.info("创建kudu连接!")
impala_passwd = os.popen("sh /home/etl/impala.sh").readlines()[0].strip()
impala_client = impala_client(host=impalad_host,impala_user=impala_username,impala_ldap_pass=impala_passwd)
batch_ctrl_sql = 'select batch_no,tbl_dt from fin_dw.app_api_data_batch_ctrl where tbl_name="app_api_ent_bond_cash_di" and batch_status="init" and tbl_dt="'+tbl_dt+'"'
logging.info(batch_ctrl_sql)
batch_ctrl_result_set = impala_client.impala_exec_result(batch_ctrl_sql)
kudu_client = kudu.connect(host=[kudu_master1,kudu_master2,kudu_master3], port='7051')
session = kudu_client.new_session(flush_mode='manual', timeout_ms=5000)
tbl_app_api_data_batch_ctrl = kudu_client.table('impala::fin_dw.app_api_data_batch_ctrl')
for row in batch_ctrl_result_set:
app_api_sql = 'select rowkey,formatted_ent_name,ent_name,asset_flow,clue_date,clue_type,asset_type1,asset_type2,is_litigation,tot_money,exec_eff,data_dtl,data_status,dw_ins_date from fin_dw.app_api_ent_bond_cash_v2_push_di where batch_no='+str(row[0])
logging.info(app_api_sql)
app_api_result_set = impala_client.impala_exec_result(app_api_sql)
dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
logging.info(str(dw_ins_date) + '-批次' + str(row[0]) + '-返回sql结果集')
f_tbl_data_deal(app_api_result_set)
update_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
op = tbl_app_api_data_batch_ctrl.new_update({'tbl_name': 'app_api_ent_bond_cash_di', 'batch_no': int(row[0]), 'tbl_dt': row[1], 'batch_status': 'succ', 'update_date': update_date})
session.apply(op)
try:
session.flush()
except kudu.KuduBadStatus as e:
logging.info((session.get_pending_errors()))
dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
logging.info(str(dw_ins_date) + '-批次' + str(row[0]) + '-增量更新es结束')
dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
kudu_client.close()
logging.info(str(dw_ins_date) + '-所有批次增量更新es结束')