from confluent_kafka import Consumer, TopicPartition
import time
import datetime
def str_to_timestamp(str_time, format_type='%Y-%m-%d %H:%M:%S'):
    """Parse *str_time* (local time, in *format_type*) into a Unix epoch
    value expressed in milliseconds (whole seconds * 1000)."""
    parsed = time.strptime(str_time, format_type)
    return int(time.mktime(parsed)) * 1000
def timestamp_to_str(timestamp):
    """Convert a Unix timestamp (seconds or milliseconds) to a local-time
    string like '2021-07-12 19:25:36'.

    Fixes two defects in the original:
    - it printed the value and returned None, so callers doing
      print(timestamp_to_str(ts)) emitted an extra "None" line; it now
      returns the formatted string instead.
    - scaling used ``10 ** negative`` (a float), whose rounding could shift
      the seconds value by 1; integer arithmetic is used instead.
    """
    # Normalise to a 10-digit (seconds) value: e.g. a 13-digit millisecond
    # timestamp is floor-divided by 1000.
    shift = 10 - len(str(timestamp))
    if shift >= 0:
        seconds = timestamp * (10 ** shift)
    else:
        seconds = timestamp // (10 ** -shift)
    return str(datetime.datetime.fromtimestamp(seconds))
def listdata_to_file(list_data, file='abnormal.logs'):
    """Write every entry of *list_data* to *file*, one entry per line.

    The file is opened in 'w' mode, so any previous contents are replaced.
    """
    with open(file, "w", encoding="utf-8") as out:
        out.writelines(f"{entry}\n" for entry in list_data)
def consumer_data_according_timestamp(topic, time_begin, time_end):
    """Dump all messages on *topic* whose broker timestamps lie in
    [time_begin, time_end) to a local file, one message per line.

    Parameters
    ----------
    topic : str
        Kafka topic to read; also used as the output file-name prefix.
    time_begin, time_end : str
        Local-time bounds in '%Y-%m-%d %H:%M:%S' format.

    NOTE(review): list_data is never cleared after the 5000-message flush,
    so memory grows with the total message count and listdata_to_file
    (mode "w") rewrites the entire accumulated list on every flush; the
    final write at the end is what makes the output file complete. Do not
    "fix" one side without the other.
    NOTE(review): a single empty 1.0 s poll breaks out of a partition's
    read loop, so a slow broker can truncate the dump early — confirm
    this is acceptable.
    """
    # Placeholder connection settings — fill in real brokers/credentials.
    KAFKASERVERS = 'xxxxxxxxxx'
    GROUPNAME = 'xxxxxxxxx'
    c = Consumer({
        'bootstrap.servers': KAFKASERVERS,
        'group.id': GROUPNAME,
        'auto.offset.reset': 'earliest',
        'session.timeout.ms': 6000,
        'security.protocol': 'SASL_PLAINTEXT',
        'sasl.mechanism': 'PLAIN',
        'sasl.username': 'xxxxxxx',
        'sasl.password': 'xxxxxxx',
    })
    # Topic name
    topic = topic
    str_time_begin = time_begin
    str_time_end = time_end
    file_name = topic + str_time_begin.replace(" ", "-").replace(":", "-")
    # Find out how many partitions the topic currently has
    cluster_data = c.list_topics(topic=topic)
    topic_data = cluster_data.topics[topic]
    available_partitions = topic_data.partitions
    # c.subscribe([topic])
    # Seek each partition's offset to the given timestamp, i.e. fetch the
    # records that entered Kafka at or after that timestamp.
    # Note the digit count of the timestamp (milliseconds are expected here).
    timestamp_begin = str_to_timestamp(str_time_begin)
    timestamp_end = str_to_timestamp(str_time_end)
    list_data = []
    len_list_data = 0
    data_total_num=0
    for partition_id in range(len(available_partitions)):
        print("partition_id:%d" %partition_id)
        tps = [TopicPartition(topic, partition_id, timestamp_begin)]
        print(tps)
        start_offsets = c.offsets_for_times(tps)
        print(start_offsets)
        c.assign(start_offsets)
        while True:
            # Maximum time to block waiting for a message
            msg = c.poll(1.0)
            if msg is None:
                break
            if msg.error():
                print("Consumer error: {}".format(msg.error()))
                continue
            # Broker-side timestamp of when this record entered Kafka
            kafka_timestamp = msg.timestamp()[1]
            if(kafka_timestamp >= timestamp_end):
                print(timestamp_to_str(kafka_timestamp))
                break
            list_data.append(msg.value().decode('utf-8'))
            len_list_data=len_list_data+1
            if len_list_data >= 5000:
                listdata_to_file(list_data, file=file_name)
                len_list_data = 0
                data_total_num=data_total_num + 5000
            # Consume the corresponding Kafka data
            #print('Received message: {%s}[%d]' %(msg.value().decode('utf-8'), msg.partition()))
    #print(list_data)
    print(data_total_num+len_list_data)
    listdata_to_file(list_data, file=file_name)
    c.unassign()
    c.close()
if __name__ == '__main__':
    # Example invocation — replace the placeholder topic and the time
    # window with real values before running.
    consumer_data_according_timestamp(topic='xxxxxx', time_begin='2021-07-12 19:25:36', time_end='2021-07-12 19:35:36')
    #consumer_data_according_timestamp(topic='xxxx', time_begin='2021-07-05 18:55:29', time_end='2021-07-05 19:05:29')