实时接口的shell脚本、数据推到Elasticsearch的python脚本

实时接口的shell脚本、数据推到Elasticsearch的python脚本

实时接口sh

#!/bin/bash
db_cr_packdb_url="`get_database_info mysql.packdb.url`"
db_cr_packdb_ip="`get_database_info mysql.packdb.ip`"
db_cr_packdb_database="`get_database_info mysql.packdb.database`"
db_cr_packdb_username="`get_database_info mysql.packdb.username`"
db_cr_packdb_password="`get_database_info mysql.packdb.password`"
db_app_compre_rsk_url="`get_database_info mysql.app_compre_rsk.url`"
db_app_compre_rsk_username="`get_database_info mysql.app_compre_rsk.username`"
db_app_compre_rsk_password="`get_database_info mysql.app_compre_rsk.password`"
hive_fin_dw_url="`get_database_info hive.fin_dw.url`"
hive_fin_dw_username="`get_database_info hive.fin_dw.username`"
impala_username="`get_database_info impala.username`"
impalad_host=`choose_impalad`
#判断临时文件是否存在,存在则删除

get_col(){
local s=$1
local col_num=$2
local col_str=$3

let "col_num--"

local ss=${s#*${col_str}}

if [[ ${col_num} -eq 0 ]]
then
  echo ${ss%%${col_str}*}
else
  local sss=${ss#*${col_str}}
  get_col "$sss" $col_num "$col_str"
fi
}

SOURCE=`which impala-shell`
BIN_DIR="$(dirname "$SOURCE")"
while [ -h "$SOURCE" ]
do
    SOURCE="$(readlink "$SOURCE")"
    [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
    BIN_DIR="$( cd -P "$( dirname "$SOURCE"  )" && pwd )"
done
BIN_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
LIB_DIR=$BIN_DIR/../lib

SCRIPT_DIR=$LIB_DIR/impala-shell
SHELL_HOME=${IMPALA_SHELL_HOME:-${SCRIPT_DIR}}
export LC_CTYPE=${LC_CTYPE:-en_US.UTF-8}

PYTHON_EGG_CACHE=/tmp/impala-shell-python-egg-cache-${USER}
if [ ! -d ${PYTHON_EGG_CACHE} ]; then
    mkdir ${PYTHON_EGG_CACHE}
fi

EGG_PATH=''
for EGG in $(ls ${SHELL_HOME}/ext-py/*.egg); do
    EGG_PATH="${EGG}:${EGG_PATH}"
done

export PYTHONPATH="${EGG_PATH}${SHELL_HOME}/gen-py:${SHELL_HOME}/lib:${PYTHONPATH}"

sdate=`date +%Y-%m-%d' '%H:%M:%S`
lastresult=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -q "select dt from fin_dw.app_api_ent_bond_cash_di where dt !='his' order by dt desc limit 1"`
lastmaxdt=`get_col "$lastresult" 2 "|"`
lastresult=`mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"select count(9) from meta_task_run_log_dd where task_name='app_api_ent_bond_cash_di' and db_ins_date='$lastmaxdt' and task_status='1'"`
lastcountNum=`get_col "$lastresult" 2 "|"|awk '{print $2}'`

if [[ "$lastmaxdt" != "" && "$lastcountNum" != "0" ]] || [ "$lastmaxdt" = "" ]
  then
    impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -f /home/etl/app/sql/app_api_ent_bond_cash_di.sql
    if [ $? -eq 0 ]
      then
        echo "Impala SQL执行成功!"
      else
        echo "Impala SQL执行失败!"
        exit 1
    fi
fi

sql="select count(9) as cnt from fin_dw.t_app_api_ent_bond_cash_di_01"
record_cnt=`impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -B -q "$sql"`
if [ $? -eq 0 ]
  then
    echo "Impala SQL执行成功!"
  else
    echo "Impala SQL执行失败!"
    exit 1
fi
if [ "$record_cnt" = "0" ]
  then
    echo "无增量更新数据"
    exit 0
fi
        
executeSql="insert into meta_task_run_log_dd(task_name,tbl_name,biz_date,begin_date,end_date,task_status,db_ins_date,record_cnt) values('app_api_ent_bond_cash_di','app_api_ent_bond_cash_di'"

lastmaxdt=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -B -q "select dt from fin_dw.app_api_ent_bond_cash_di where dt !='his' order by dt desc limit 1"`


impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -f /home/etl/app/sql/app_api_ent_bond_cash_push_di.sql --var=dt=$lastmaxdt
if [ $? -eq 0 ]
  then
    echo "Impala SQL执行成功!"
    echo "导出数据到Mysql开始..."
sudo -u hdfs sqoop export --connect ${db_app_compre_rsk_url} --username ${db_app_compre_rsk_username} --password ${db_app_compre_rsk_password} --table app_api_ent_bond_cash_dd --update-mode allowinsert --update-key "sec_inner_code,cash_date" --hcatalog-database fin_dw --hcatalog-table app_api_ent_bond_cash_push_di  --null-string '\\N'  --null-non-string '\\N' -m 1;
    if [ $? -eq 0 ]
      then
        echo "导出数据到Mysql成功"
        edate=`date +%Y-%m-%d' '%H:%M:%S`
        # mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"$executeSql,'$sdate','$sdate','$edate','1','$lastmaxdt','$record_cnt')" #这条打包日志的语句少东西
      else
        echo "导出数据到Mysql失败"
        exit 1
     fi
  else
    echo "Impala SQL执行失败!"
    exit 1
fi
recordcnt=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -B -q "select count(1) from fin_dw.app_api_ent_bond_cash_di where dt ='$lastmaxdt' "`

impala-shell -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -i $impalad_host -d fin_dw -f /home/etl/app/sql/app_api_ent_bond_cash_v2_push_di.sql --var=dt=$lastmaxdt --var=record_cnt=$recordcnt
if [ $? -eq 0 ]
  then
    echo "Impala SQL执行成功!"
    echo "输出到es开始 '"
    python /home/etl/app/python/app_api_ent_bond_cash_di.py $lastmaxdt
    if [ $? -eq 0 ]
        then 
            echo "推送es成功"
        else
            echo "执行全量最新状态表sql失败"
            exit 1
    fi   
  else
    echo "Impala SQL执行失败!"
    exit 1
fi

# 新增批量接口日志打包
# 放在后面是为了先执行sql文件

ddate=`date +%Y%m%d%H%M` 

result=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -q "select dt from fin_dw.app_api_ent_bond_cash_di where dt !='his' order by dt desc limit 1"`
maxdt=`get_col "$result" 2 "|"`

result=`mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"select count(9) from meta_task_run_log_dd where task_name='app_api_ent_bond_cash_di' and db_ins_date='$maxdt' and dw_src='fin_dw'"`
countNum=`get_col "$result" 2 "|"|awk '{print $2}'`
if [ "$maxdt" != "" ] && [ "$countNum" = "0" ]
 then
    echo "需要补日志"
    result=`impala-shell -i $impalad_host -l --auth_creds_ok_in_clear -u $impala_username --ldap_password_cmd="sh /home/etl/impala.sh" -q "select count(1) from fin_dw.app_api_ent_bond_cash_di where dt='$maxdt' and dt !='his'"`
    countNum=`get_col "$result" 2 "|"`
    mysql -u$db_cr_packdb_username -p$db_cr_packdb_password -h$db_cr_packdb_ip -D$db_cr_packdb_database -e"insert into meta_task_run_log_dd(task_name,tbl_name,biz_date,begin_date,end_date,task_status,db_ins_date,record_cnt,dw_src) values('app_api_ent_bond_cash_di','app_api_ent_bond_cash_di','$sdate','$sdate','$sdate','1','$maxdt','$countNum','fin_dw')"
 else
    echo "不需要补日志"
fi

数据推到Elasticsearch的python脚本

# -*- coding=utf-8 -*-
import sys

sys.path.append('/usr/local/bin')
from get_database_info import get_database_info
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import os
from impala.dbapi import connect
import logging
import datetime
import kudu
import resource
import commands
import hashlib
import re


logging.basicConfig(format='<%(asctime)s>[%(levelname)s]-%(filename)s:%(message)s', level=logging.INFO)

soft, hard=resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS,(21474836480, hard))

impalad_host = commands.getoutput('choose_impalad')

if __name__ == '__main__':
    impala_username = get_database_info('impala.username')
    kudu_master1 = get_database_info('kudu.master1')
    kudu_master2 = get_database_info('kudu.master2')
    kudu_master3 = get_database_info('kudu.master3')
    hub_es = get_database_info('es.es_prod')
    hub_es_list = hub_es.split(',')    
    es = Elasticsearch(hub_es_list, timeout=100, max_retries=3, retry_on_timeout=True)
    tbl_dt = sys.argv[1] 
        
class impala_client:

    def __init__(self, host=None, impala_user=None, impala_ldap_pass=None):
        self.conn = connect(host=host, port=21050, use_ldap=True, ldap_user=impala_user, ldap_password=impala_ldap_pass)
        self.host = host

    def __del__(self):
        if self.conn is not None:
            self.conn.close()

    def impala_exec_result(self, sql):
        '''
        执行MySQL语句
        :param sql:
        :return:
        '''
        try:
            # 执行sql查询,返回字典
            cursor = self.conn.cursor()
            # 执行语句
            cursor.execute(sql)
            # 获取全部数据
            result = cursor.fetchall()
            cursor.close()
            logging.info("{}:执行Impala语句:{}".format(self.host, sql))
            return result
        except Exception as e:
            logging.error("{}:执行Impala语句失败: {}".format(self.host, e))
            return None

    def impala_exec_solr_result(self, sql):
        '''
        执行MySQL语句
        :param sql:
        :return:
        '''
        try:
            # 执行sql查询,返回字典
            cursor = self.conn.cursor(dictify=True)
            # 执行语句
            cursor.execute(sql)
            # 获取全部数据
            result = cursor.fetchall()
            cursor.close()
            logging.info("{}:执行Impala语句:{}".format(self.host, sql))
            return result
        except Exception as e:
            logging.error("{}:执行Impala语句失败: {}".format(self.host, e))
            return None

    def impala_exec(self, sql):
        '''
           执行MySQL语句
           :param sql:
       '''
        try:
            # 使用cursor()方法获取操作游标
            cursor = self.conn.cursor(dictify=True)
            # 执行sql语句
            logging.info("{}:执行Impala语句:{}".format(self.host, sql))
            cursor.execute(sql)
        except Exception as e:
            logging.error("{}:执行Impala语句失败: {}".format(self.host, e))


def f_tbl_data_deal(rows):
    docs=[]
    for row in rows:
        index_action = {
        	"_op_type":'update',
			"doc_as_upsert":True,
			"_index":'assets_clue_info',
        	"_type":'_doc',
        	"_id":row[0],
            "doc": {
				"formattedEntName": unicode(row[1], 'utf-8'),
                "entName": unicode(row[2], 'utf-8'),
                "assetFlow": unicode(row[3], 'utf-8'),
                "clueDate":row[4],
                "clueType": unicode(row[5], 'utf-8'),
                "assetsType1": unicode(row[6], 'utf-8'),
                "assetsType2": unicode(row[7], 'utf-8'),
                "isLitigation": unicode(row[8], 'utf-8'),
                "totMoney": row[9],
                "execEff": unicode(row[10], 'utf-8'),
                "dataDtl": unicode(row[11], 'utf-8'),
                "dataStatus": unicode(row[12], 'utf-8'),
                "dwInsDate": unicode(row[13], 'utf-8'),
            }
        }
        docs.append(index_action)
    for ok, response in helpers.streaming_bulk(es, docs, raise_on_error=True, raise_on_exception=True, chunk_size=2000):
        if not ok:
            logging.info(response)			

dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
logging.info(str(dw_ins_date) + '-开始')

logging.info("创建kudu连接!")
impala_passwd = os.popen("sh /home/etl/impala.sh").readlines()[0].strip()
impala_client = impala_client(host=impalad_host,impala_user=impala_username,impala_ldap_pass=impala_passwd)
batch_ctrl_sql = 'select batch_no,tbl_dt from fin_dw.app_api_data_batch_ctrl where tbl_name="app_api_ent_bond_cash_di" and batch_status="init" and tbl_dt="'+tbl_dt+'"'
logging.info(batch_ctrl_sql)
batch_ctrl_result_set = impala_client.impala_exec_result(batch_ctrl_sql)

kudu_client = kudu.connect(host=[kudu_master1,kudu_master2,kudu_master3], port='7051')
session = kudu_client.new_session(flush_mode='manual', timeout_ms=5000)
tbl_app_api_data_batch_ctrl = kudu_client.table('impala::fin_dw.app_api_data_batch_ctrl')
for row in batch_ctrl_result_set:
	app_api_sql = 'select rowkey,formatted_ent_name,ent_name,asset_flow,clue_date,clue_type,asset_type1,asset_type2,is_litigation,tot_money,exec_eff,data_dtl,data_status,dw_ins_date from fin_dw.app_api_ent_bond_cash_v2_push_di where batch_no='+str(row[0])
	logging.info(app_api_sql)
	app_api_result_set = impala_client.impala_exec_result(app_api_sql)
	dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
	logging.info(str(dw_ins_date) + '-批次' + str(row[0]) + '-返回sql结果集')
	f_tbl_data_deal(app_api_result_set)

	update_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	op = tbl_app_api_data_batch_ctrl.new_update({'tbl_name': 'app_api_ent_bond_cash_di', 'batch_no': int(row[0]), 'tbl_dt': row[1], 'batch_status': 'succ', 'update_date': update_date})
	session.apply(op)
	try:
		session.flush()
	except kudu.KuduBadStatus as e:
		logging.info((session.get_pending_errors()))

	dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
	logging.info(str(dw_ins_date) + '-批次' + str(row[0]) + '-增量更新es结束')

dw_ins_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
kudu_client.close()
logging.info(str(dw_ins_date) + '-所有批次增量更新es结束')
posted @ 2022-07-11 11:18  赤兔胭脂小吕布  阅读(51)  评论(0)    收藏  举报