Reading data from Hive and pushing it to Kafka

# -*- coding: utf-8 -*-
# Version: 1.0.0
# Description: py_Hive2Kafka
# Author: wqbin
# Create_date: 20191026

import re
import sys
import os
import logging
import string
import datetime
import time
import random
import subprocess as sp

from logging import handlers
from time import strftime, localtime
from pyhs2.haconnection import HAConnection
from kafka import KafkaProducer

################################ Environment setup ############################################
# 1. Character encoding
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'

# 2. FI client environment variables
ENV_FILE = '/home/root/ficlient/bigdata_env'

# Load ENV_FILE, equivalent to: source /home/root/ficlient/bigdata_env
proc = sp.Popen(['bash', '-c', 'source {0} && env'.format(ENV_FILE)], stdout=sp.PIPE)
for tup in map(lambda s: s.strip().split('=', 1), proc.stdout):
    if len(tup) != 2:
        continue  # skip lines that are not simple KEY=VALUE pairs
    k = tup[0].strip()
    v = tup[1].strip()
    os.environ[k] = v

# 3. Kerberos authentication
KERBEROS_USER = "rootuser"
KERBEROS_KEYTAB = "/home/root/rootuser.keytab"
TGT_PATH = "/home/root/tagidentity.tgt"
os.environ['KRB5CCNAME'] = TGT_PATH
os.system("kinit -kt %s %s" % (KERBEROS_KEYTAB, KERBEROS_USER))

# 4. Script path, log path, config path
#MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname("__file__"), os.path.pardir))
MAIN_PATH = "/ETL/pybin/py_Hive2Kafka"

LOG_PATH = MAIN_PATH + "/log"
CONF_PATH = MAIN_PATH + "/conf"

# 5. Argument 1: batch date, e.g. 20180721
batch_date = sys.argv[1]

################################ Logging ######################################################
# Timestamp format used in log lines
ISOTIMEFORMAT = '%Y-%m-%d %H:%M:%S'

# Log file path
logfile = "%s/%s.log" % (LOG_PATH, batch_date)

# Logger for the integration layer
LOGGER = logging.getLogger("data_life_manager")

LOGGER_HANDLER = logging.handlers.RotatingFileHandler(logfile, maxBytes=20*1024*1024, backupCount=10)
FORMATTER = logging.Formatter("\r%(asctime)s [%(levelname)s] %(message)s", ISOTIMEFORMAT)
LOGGER_HANDLER.setFormatter(FORMATTER)

LOGGER.setLevel(logging.INFO)
LOGGER.addHandler(LOGGER_HANDLER)

# Also echo log records to the console
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(FORMATTER)
LOGGER.addHandler(console)
logger = LOGGER
logger.info(MAIN_PATH)

################################### Read settings from the config file ###################################

def get_conf(conf_file):
    """
    Parse a conf file whose lines are KEY=VALUE pairs
    and return them as a dict.
    """
    conf = {}

    def add_conf(key, value):
        conf[key] = value

    # map() is eager in Python 2, so this chain runs immediately
    map(lambda _: add_conf(_[0:_.index('=')], _[_.index('=') + 1:]),
        map(lambda _: _.replace('"', '').replace('\n', ''),
            # keep only valid config lines
            filter(lambda _: "=" in _ and not _.startswith('#'),
                   open(conf_file).readlines()
                   )
            )
        )
    return conf


db_config = get_conf(MAIN_PATH + '/conf/database.conf')

# Hive connection settings
HIVE_HOSTS = db_config.get('HIVE_HOSTS').split(',')
HIVE_PORT = int(db_config.get('HIVE_PORT'))  # conf values are strings; the Thrift socket needs a numeric port
queue_name = db_config.get('QUEUE_NAME')

################################### Connect to Hive and run SQL ###################################
# SQL for the aggregated results: four columns for a day-03 batch, two otherwise
sql = ''
if batch_date[6:8] == '03':
    print 'batch_date[6:8]:%s' % batch_date[6:8]
    sql = "select column1,column2,column3,column4 from table1 where batch_date=%s ;" % (batch_date)
else:
    print 'batch_date[6:8]:%s' % batch_date[6:8]
    sql = "select column1,column2 from table1 where batch_date=%s ;" % (batch_date)

database = "dt"
templatecode = "001"
transcode = "002"
orsenderid = "003"
orsenderchannel = "004"

def select_hive(queue_name, database, sql, logger):
    v_queue_name = "set mapred.job.queue.name=%s" % queue_name
    v_database = "use %s" % database
    sql = sql.encode('UTF-8')
    v_sql = re.sub(r';$', '', sql)  # strip the trailing semicolon before handing the SQL to pyhs2
    timeout = 3 * 60 * 60 * 1000  # 3 hours, in milliseconds
    conf = {"krb_host": "hadoop001", "krb_service": "hive"}
    try:
        with HAConnection(hosts=HIVE_HOSTS,
                          port=HIVE_PORT,
                          timeout=timeout,
                          authMechanism="KERBEROS",
                          user='rootuser',
                          configuration=conf) as haConn:
            with haConn.getConnection() as conn:
                with conn.cursor() as cur:
                    # the console log handler already echoes these to stdout
                    logger.info(v_queue_name)
                    cur.execute(v_queue_name)
                    logger.info(v_database)
                    cur.execute(v_database)
                    logger.info(v_sql)
                    cur.execute(v_sql)
                    tuple_dic = cur.fetchall()
                    if len(tuple_dic) == 0:
                        tuple_dic = None
    except Exception as e:
        logger.error(e)
        raise Exception(e)
    return tuple_dic

#################################### Custom exception class ###################################

class UserDefException(Exception):
    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        return self.msg

#################################### Build the JSON message and send it to Kafka ###################################

def send_json_to_Kafka(batch_date):
    data_cnt_tuple_dic = select_hive(queue_name, database, sql, logger)
    print data_cnt_tuple_dic

    # Collect the columns of the first result row only (2 or 4 columns,
    # depending on which SQL was chosen above)
    fields = []
    try:
        for a in data_cnt_tuple_dic:
            if len(a) == 2:
                fields.append(a[0])
                fields.append(a[1])
                break
            elif len(a) == 4:
                fields.append(a[0])
                fields.append(a[1])
                fields.append(a[2])
                fields.append(a[3])
                break
            else:
                raise UserDefException("select returned neither 2 nor 4 columns")
    except Exception as e:
        fields = []
        logger.error(e)
    print fields

    orSenderSN = ''.join(random.sample(string.ascii_letters + string.digits, 22))
    agentSerialNo = ''.join(random.sample(string.ascii_letters + string.digits, 8))
    verison_name = "abc"
    model_plat = "1"

    msg_head = '{"TemplateCode":"%s","TransCode":"%s","orSenderID":"%s","orSenderChannel":"%s","orSenderSN":"%s",' \
               '"orSenderDate":"%s","curTime":"%d","agentSerialNo":"%s"}' \
               % (templatecode, transcode, orsenderid, orsenderchannel, orSenderSN,
                  time.strftime("%Y%m%d", time.localtime()), int(round(time.time() * 1000)), agentSerialNo)
    start_time = batch_date
    end_time = batch_date
    if batch_date[6:8] == '03':
        # for a day-03 batch, the window end is 30 days before the batch date
        end_time = datetime.datetime.combine(
            datetime.date(int(batch_date[0:4]), int(batch_date[4:6]), int(batch_date[6:8]))
            - datetime.timedelta(days=30),
            datetime.time.min).strftime("%Y%m%d")
    try:
        # Branch on the number of columns actually fetched, so a bad result
        # set raises here instead of failing later with an IndexError
        if len(fields) == 4:
            msg_result = '{' \
                         '"%s":%s,' \
                         '"%s":%s,' \
                         '"%s":%s,' \
                         '"%s":%s' \
                         '}' % ("column1", fields[0], "column2", fields[1], "column3", fields[2], "column4", fields[3])
        elif len(fields) == 2:
            msg_result = '{' \
                         '"%s":%s,' \
                         '"%s":%s' \
                         '}' % ("column1", fields[0], "column2", fields[1])
        else:
            raise UserDefException("select returned neither 2 nor 4 columns")
    except Exception as e:
        logger.error(e)
        raise Exception(e)

    msg_body = '{"verison_name":"%s","version":"","model_plat":"%s","event_start_tm":"%s","event_end_tm":"%s","result":%s}' \
               % (verison_name, model_plat, start_time, end_time, str(msg_result).replace("'", '"'))
    msg = '{"head":%s,"body":%s}' % (msg_head, msg_body)
    logger.info(msg)

    try:
        send_kafka(msg)
    except Exception as e:
        logger.error(e)
        raise Exception(e)

bootstrap_servers = '192.168.164.202:9092,192.168.164.203:9092,192.168.164.204:9092'
topic = 'topic1'
retries = 2

# Send a message to Kafka
def send_kafka(msg):
    try:
        producer = KafkaProducer(bootstrap_servers=bootstrap_servers, retries=retries)
    except Exception as e:
        logger.error(e)
        raise Exception("caught an exception when creating the KafkaProducer")
    try:
        producer.send(topic, msg)
        producer.flush()
        producer.close()
    except Exception as e:
        logger.error(e)
        if producer:
            producer.close()
        raise Exception("caught an exception when sending message: %s" % msg)


if __name__ == '__main__':
    send_json_to_Kafka(batch_date)
    print "all data from hive to kafka was sent successfully"
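The script takes a single batch-date argument in yyyymmdd form (sys.argv[1] above). Assuming it is saved as py_Hive2Kafka.py under MAIN_PATH (the file name is my assumption; only the directory appears in the script), a typical Python 2 invocation would be:

    # hypothetical file name; a day-03 date exercises the four-column branch
    python /ETL/pybin/py_Hive2Kafka/py_Hive2Kafka.py 20191003

Any other day of the month takes the two-column branch instead.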

The conf file (conf/database.conf) is as follows:

#Hive
HIVE_HOSTS=192.168.154.201
HIVE_PORT=10000
QUEUE_NAME=NO1
PUB_BIGDATA_ENV_FILE=/home/root/bigdata_env
PUB_HIVE_ENV_FILE=/home/root/Hive/component_env
PUB_COMMON_PATH=/etl/pub
PUB_KEYTAB_PATH=/etl/pub/keytab_file
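For reference, a minimal sketch of the dict that get_conf builds from this file (keys and values taken verbatim from the lines above; the parser skips the comment line and strips quotes):

    {'HIVE_HOSTS': '192.168.154.201',
     'HIVE_PORT': '10000',
     'QUEUE_NAME': 'NO1',
     'PUB_BIGDATA_ENV_FILE': '/home/root/bigdata_env',
     'PUB_HIVE_ENV_FILE': '/home/root/Hive/component_env',
     'PUB_COMMON_PATH': '/etl/pub',
     'PUB_KEYTAB_PATH': '/etl/pub/keytab_file'}

Note that every value comes back as a string, which is why the script casts HIVE_PORT to int before handing it to pyhs2.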

Most people think it is intellect that makes a great scientist. They are wrong: it is character. --- Einstein

Reposted from: https://www.cnblogs.com/wqbin/p/11279292.html
