Reading data from Hive and pushing it to Kafka
# -*- coding: utf-8 -*-
# Version: 1.0.0
# Description: py_Hive2Kafka
# Author: wqbin
# Create_date: 20191026

import re
import sys
import os
import logging
import string
import datetime
import time
import random
import subprocess as sp

from logging import handlers
from pyhs2.haconnection import HAConnection
from kafka import KafkaProducer


################################ Environment setup ################################
# 1. Character encoding
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'

# 2. FI client environment file
ENV_FILE = '/home/root/ficlient/bigdata_env'

# Load ENV_FILE -- equivalent to `source /home/root/ficlient/bigdata_env`
proc = sp.Popen(['bash', '-c', 'source {0} && env'.format(ENV_FILE)], stdout=sp.PIPE)
for line in proc.stdout:
    if '=' not in line:
        continue
    k, v = line.strip().split('=', 1)
    os.environ[k.strip()] = v.strip()

# 3. Kerberos authentication
KERBEROS_USER = "rootuser"
KERBEROS_KEYTAB = "/home/root/rootuser.keytab"
TGT_PATH = "/home/root/tagidentity.tgt"
os.environ['KRB5CCNAME'] = TGT_PATH
os.system("kinit -kt %s %s" % (KERBEROS_KEYTAB, KERBEROS_USER))

# 4. Script, log and conf paths
# MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
MAIN_PATH = "/ETL/pybin/py_Hive2Kafka"
LOG_PATH = MAIN_PATH + "/log"
CONF_PATH = MAIN_PATH + "/conf"

# 5. Argument 1: batch date, e.g. 20180721
batch_date = sys.argv[1]

################################ Logging ################################
# Timestamp format used in log records
ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S'

# One log file per batch date, rotated at 20 MB
logfile = "%s/%s.log" % (LOG_PATH, batch_date)

LOGGER = logging.getLogger("data_life_manager")

LOGGER_HANDLER = logging.handlers.RotatingFileHandler(logfile, maxBytes=20 * 1024 * 1024, backupCount=10)
FORMATTER = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", ISOTIMEFORMAT)
LOGGER_HANDLER.setFormatter(FORMATTER)

LOGGER.setLevel(logging.INFO)
LOGGER.addHandler(LOGGER_HANDLER)

# Echo log records to the console as well
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(FORMATTER)
LOGGER.addHandler(console)

logger = LOGGER
logger.info(MAIN_PATH)


################################ Read settings from the conf file ################################

def get_conf(conf_file):
    """
    Parse a conf file of KEY=VALUE lines (comment lines starting
    with '#' are skipped) and return the settings as a dict.
    """
    conf = {}
    for line in open(conf_file).readlines():
        # keep only valid config lines
        if '=' not in line or line.startswith('#'):
            continue
        line = line.replace('"', '').replace('\n', '')
        key, value = line.split('=', 1)
        conf[key] = value
    return conf


db_config = get_conf(CONF_PATH + '/database.conf')

# Hive connection settings (conf values are strings, so cast the port)
HIVE_HOSTS = db_config.get('HIVE_HOSTS').split(',')
HIVE_PORT = int(db_config.get('HIVE_PORT'))
queue_name = db_config.get('QUEUE_NAME')


################################ Connect to Hive and run the SQL ################################
# On the 3rd of each month the query returns four columns, otherwise two.
logger.info("batch_date[6:8]:%s" % batch_date[6:8])
if batch_date[6:8] == '03':
    sql = "select column1,column2,column3,column4 from table1 where batch_date=%s ;" % batch_date
else:
    sql = "select column1,column2 from table1 where batch_date=%s ;" % batch_date

database = "dt"
templatecode = "001"
transcode = "002"
orsenderid = "003"
orsenderchannel = "004"


def select_hive(queue_name, database, sql, logger):
    v_queue_name = "set mapred.job.queue.name=%s" % queue_name
    v_database = "use %s" % database
    # strip the trailing semicolon; pyhs2 does not accept it
    v_sql = re.sub(r';$', '', sql.encode('UTF-8'))
    timeout_ms = 3 * 60 * 60 * 1000
    conf = {"krb_host": "hadoop001", "krb_service": "hive"}
    try:
        with HAConnection(hosts=HIVE_HOSTS,
                          port=HIVE_PORT,
                          timeout=timeout_ms,
                          authMechanism="KERBEROS",
                          user='rootuser',
                          configuration=conf) as haConn:
            with haConn.getConnection() as conn:
                with conn.cursor() as cur:
                    logger.info(v_queue_name)
                    cur.execute(v_queue_name)
                    logger.info(v_database)
                    cur.execute(v_database)
                    logger.info(v_sql)
                    cur.execute(v_sql)
                    rows = cur.fetchall()
                    if len(rows) == 0:
                        rows = None
    except Exception as e:
        logger.error(e)
        raise
    return rows


################################ Custom exception ################################

class UserDefException(Exception):
    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return self.msg


################################ Build the JSON message and send it to Kafka ################################

def send_json_to_Kafka(batch_date):
    rows = select_hive(queue_name, database, sql, logger)

    values = []  # renamed from `list` so the builtin is not shadowed
    try:
        # only the first row of the result set is used
        for row in rows:
            if len(row) in (2, 4):
                values.extend(row)
                break
            else:
                raise UserDefException("select returned neither 2 nor 4 columns")
    except Exception as e:
        values = []
        logger.error(e)

    orSenderSN = ''.join(random.sample(string.ascii_letters + string.digits, 22))
    agentSerialNo = ''.join(random.sample(string.ascii_letters + string.digits, 8))
    verison_name = "abc"
    model_plat = "1"

    msg_head = '{"TemplateCode":"%s","TransCode":"%s","orSenderID":"%s","orSenderChannel":"%s","orSenderSN":"%s",' \
               '"orSenderDate":"%s","curTime":"%d","agentSerialNo":"%s"}' \
               % (templatecode, transcode, orsenderid, orsenderchannel, orSenderSN,
                  time.strftime("%Y%m%d", time.localtime()), int(round(time.time() * 1000)), agentSerialNo)

    start_time = batch_date
    end_time = batch_date
    if batch_date[6:8] == '03':
        # on the 3rd, the window end is pushed back 30 days
        end_time = (datetime.date(int(batch_date[0:4]), int(batch_date[4:6]), int(batch_date[6:8]))
                    - datetime.timedelta(days=30)).strftime("%Y%m%d")

    try:
        if batch_date[6:8] == '03':
            msg_result = '{"column1":%s,"column2":%s,"column3":%s,"column4":%s}' \
                         % (values[0], values[1], values[2], values[3])
        else:
            msg_result = '{"column1":%s,"column2":%s}' % (values[0], values[1])
    except Exception as e:
        logger.error(e)
        raise

    msg_body = '{"verison_name":"%s","version":"","model_plat":"%s","event_start_tm":"%s","event_end_tm":"%s","result":%s}' \
               % (verison_name, model_plat, start_time, end_time, msg_result)
    msg = '{"head":%s,"body":%s}' % (msg_head, msg_body)
    logger.info(msg)

    try:
        send_kafka(msg)
    except Exception as e:
        logger.error(e)
        raise


bootstrap_servers = '192.168.164.202:9092,192.168.164.203:9092,192.168.164.204:9092'
topic = 'topic1'
retries = 2


# Send one message to Kafka
def send_kafka(msg):
    try:
        producer = KafkaProducer(bootstrap_servers=bootstrap_servers, retries=retries)
    except Exception as e:
        logger.error(e)
        raise Exception("caught an exception when creating the KafkaProducer")
    try:
        producer.send(topic, msg)
        producer.flush()
    except Exception as e:
        logger.error(e)
        raise Exception("caught an exception when sending message: %s" % msg)
    finally:
        producer.close()


if __name__ == '__main__':
    send_json_to_Kafka(batch_date)
    print "data from hive to kafka finished successfully"
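The script is driven by a single batch-date argument (sys.argv[1]), so a run looks like `python py_hive2kafka.py 20191026` (the file name here is assumed). To check that the message actually reaches the topic, a minimal consumer sketch follows; it assumes the same brokers and topic as above, and the offset and timeout settings are illustrative, not part of the original script.

# -*- coding: utf-8 -*-
# Minimal verification consumer -- a sketch assuming the brokers/topic above.
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'topic1',
    bootstrap_servers='192.168.164.202:9092,192.168.164.203:9092,192.168.164.204:9092',
    auto_offset_reset='earliest',   # start from the oldest message on first run
    consumer_timeout_ms=10000)      # stop iterating after 10s with no new messages

for record in consumer:
    print record.value              # the JSON string built by send_json_to_Kafka()
consumer.close()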
The conf file (conf/database.conf) is as follows:
#Hive
HIVE_HOSTS=192.168.154.201
HIVE_PORT=10000
QUEUE_NAME=NO1
PUB_BIGDATA_ENV_FILE=/home/root/bigdata_env
PUB_HIVE_ENV_FILE=/home/root/Hive/component_env
PUB_COMMON_PATH=/etl/pub
PUB_KEYTAB_PATH=/etl/pub/keytab_file
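For reference, this is the dict that get_conf should produce from the file above -- an illustrative sketch of the parse result, not output captured from a run. Every value comes back as a string, which is why the script casts HIVE_PORT with int().

# Illustrative result of get_conf(CONF_PATH + '/database.conf'):
db_config = {
    'HIVE_HOSTS': '192.168.154.201',   # split(',') turns this into a one-element list
    'HIVE_PORT': '10000',              # a string until the script casts it with int()
    'QUEUE_NAME': 'NO1',
    'PUB_BIGDATA_ENV_FILE': '/home/root/bigdata_env',
    'PUB_HIVE_ENV_FILE': '/home/root/Hive/component_env',
    'PUB_COMMON_PATH': '/etl/pub',
    'PUB_KEYTAB_PATH': '/etl/pub/keytab_file',
}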
