from confluent_kafka import Consumer, TopicPartition
import time
import datetime
def str_to_timestamp(str_time, format_type='%Y-%m-%d %H:%M:%S'):
    """Parse *str_time* (local time, in *format_type*) into a Unix epoch
    value expressed in milliseconds (whole seconds * 1000)."""
    parsed = time.strptime(str_time, format_type)
    return int(time.mktime(parsed)) * 1000
def timestamp_to_str(timestamp):
    """Convert a Unix timestamp (seconds or milliseconds) to a local-time
    string like '2021-07-12 19:25:36'.

    Fixes two defects in the original:
    - it printed the value and returned None, so callers doing
      print(timestamp_to_str(ts)) emitted an extra "None" line; it now
      returns the formatted string instead.
    - scaling used ``10 ** negative`` (a float), whose rounding could shift
      the seconds value by 1; integer arithmetic is used instead.
    """
    # Normalise to a 10-digit (seconds) value: e.g. a 13-digit millisecond
    # timestamp is floor-divided by 1000.
    shift = 10 - len(str(timestamp))
    if shift >= 0:
        seconds = timestamp * (10 ** shift)
    else:
        seconds = timestamp // (10 ** -shift)
    return str(datetime.datetime.fromtimestamp(seconds))
def listdata_to_file(list_data, file='abnormal.logs'):
    """Write every entry of *list_data* to *file*, one entry per line.

    The file is opened in 'w' mode, so any previous contents are replaced.
    """
    with open(file, "w", encoding="utf-8") as out:
        out.writelines(f"{entry}\n" for entry in list_data)
def consumer_data_according_timestamp(topic, time_begin, time_end):
    """Dump all messages on *topic* whose broker timestamps lie in
    [time_begin, time_end) to a local file, one message per line.

    Parameters
    ----------
    topic : str
        Kafka topic to read; also used as the output file-name prefix.
    time_begin, time_end : str
        Local-time bounds in '%Y-%m-%d %H:%M:%S' format.

    NOTE(review): list_data is never cleared after the 5000-message flush,
    so memory grows with the total message count and listdata_to_file
    (mode "w") rewrites the entire accumulated list on every flush; the
    final write at the end is what makes the output file complete. Do not
    "fix" one side without the other.
    NOTE(review): a single empty 1.0 s poll breaks out of a partition's
    read loop, so a slow broker can truncate the dump early — confirm
    this is acceptable.
    """
    # Placeholder connection settings — fill in real brokers/credentials.
    KAFKASERVERS = 'xxxxxxxxxx'
    GROUPNAME = 'xxxxxxxxx'
    c = Consumer({
        'bootstrap.servers': KAFKASERVERS,
        'group.id': GROUPNAME,
        'auto.offset.reset': 'earliest',
        'session.timeout.ms': 6000,
        'security.protocol': 'SASL_PLAINTEXT',
        'sasl.mechanism': 'PLAIN',
        'sasl.username': 'xxxxxxx',
        'sasl.password': 'xxxxxxx',
    })
    # Topic name
    topic = topic
    str_time_begin = time_begin
    str_time_end = time_end
    file_name = topic + str_time_begin.replace(" ", "-").replace(":", "-")
    # Find out how many partitions the topic currently has
    cluster_data = c.list_topics(topic=topic)
    topic_data = cluster_data.topics[topic]
    available_partitions = topic_data.partitions
    # c.subscribe([topic])
    # Seek each partition's offset to the given timestamp, i.e. fetch the
    # records that entered Kafka at or after that timestamp.
    # Note the digit count of the timestamp (milliseconds are expected here).
    timestamp_begin = str_to_timestamp(str_time_begin)
    timestamp_end = str_to_timestamp(str_time_end)
    list_data = []
    len_list_data = 0
    data_total_num=0
    for partition_id in range(len(available_partitions)):
        print("partition_id:%d" %partition_id)
        tps = [TopicPartition(topic, partition_id, timestamp_begin)]
        print(tps)
        start_offsets = c.offsets_for_times(tps)
        print(start_offsets)
        c.assign(start_offsets)
        while True:
            # Maximum time to block waiting for a message
            msg = c.poll(1.0)
            if msg is None:
                break
            if msg.error():
                print("Consumer error: {}".format(msg.error()))
                continue
            # Broker-side timestamp of when this record entered Kafka
            kafka_timestamp = msg.timestamp()[1]
            if(kafka_timestamp >= timestamp_end):
                print(timestamp_to_str(kafka_timestamp))
                break
            list_data.append(msg.value().decode('utf-8'))
            len_list_data=len_list_data+1
            if len_list_data >= 5000:
                listdata_to_file(list_data, file=file_name)
                len_list_data = 0
                data_total_num=data_total_num + 5000
            # Consume the corresponding Kafka data
            #print('Received message: {%s}[%d]' %(msg.value().decode('utf-8'), msg.partition()))
    #print(list_data)
    print(data_total_num+len_list_data)
    listdata_to_file(list_data, file=file_name)
    c.unassign()
    c.close()
if __name__ == '__main__':
    # Example invocation — replace the placeholder topic and the time
    # window with real values before running.
    consumer_data_according_timestamp(topic='xxxxxx', time_begin='2021-07-12 19:25:36', time_end='2021-07-12 19:35:36')
    #consumer_data_according_timestamp(topic='xxxx', time_begin='2021-07-05 18:55:29', time_end='2021-07-05 19:05:29')