Python 读 Kafka 示例(使用 kafka-python 按时间区间消费消息)

import ast
import time

import numpy as np
import pandas as pd
from kafka import KafkaConsumer, KafkaProducer, TopicPartition   # pip install kafka-python -i http://pypi.douban.com/simple --trusted-host pypi.douban.com


kafka_servers = ['xxxx:9092']


def subscribe_data(topic='xx', groupid='offline_anly'):
    """Consume *topic* until the observed ``lastTm`` span exceeds 20 minutes.

    Joins consumer group *groupid*, polls messages in a loop, parses each
    payload as a Python dict literal, and stops once the difference between
    the largest and smallest ``lastTm`` value seen is > 20 min (epoch ms).

    Returns:
        pd.DataFrame: one row per successfully parsed message (rows collected
        so far are returned even when a parse error aborts the loop early).
    """
    consumer = KafkaConsumer(bootstrap_servers=kafka_servers, group_id=groupid)
    consumer.subscribe(topics=[topic])
    print(consumer.partitions_for_topic(topic))
    # Collect dicts in a list and build the frame once at the end:
    # DataFrame.append was deprecated and removed in pandas >= 2.0, and
    # per-row append was quadratic anyway.
    records = []
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    while True:
        msg = consumer.poll(10)
        if not msg:  # poll() returns an empty dict when nothing arrived
            continue
        for batch in msg.values():
            for line in batch:
                try:
                    if line.serialized_value_size < 10:  # abnormal / too-short payload
                        continue
                    if line.value is None:  # tombstone / empty message
                        continue
                    dta = line.value.decode('utf-8')
                    # ast.literal_eval instead of eval(): the payload comes from
                    # an external broker and must not be executed as code.
                    records.append(ast.literal_eval(dta))
                except Exception as el:
                    print(el)
                    print(line)
                    return pd.DataFrame(records)
        # Stop once the lastTm window spans more than 20 minutes (values are
        # presumably epoch milliseconds -- TODO confirm with the producer).
        tms = [r.get('lastTm') for r in records if r.get('lastTm') is not None]
        if tms and max(tms) - min(tms) > 20 * 60 * 1000:
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            break
    return pd.DataFrame(records)

def consume_from_time(topic='xx', groupid='offline-analy', begin_time=None, end_time=None):
    """Consume *topic* between two timestamps and return the matching rows.

    Looks up the offsets corresponding to *begin_time* / *end_time*
    (epoch milliseconds) on the first partition of *topic*, seeks there, and
    collects every message whose parsed payload has ``province == 'HaiNan'``
    until the end offset is reached.

    Args:
        topic: Kafka topic name.
        groupid: consumer group id.
        begin_time: start timestamp in epoch ms.
        end_time: end timestamp in epoch ms.

    Returns:
        pd.DataFrame: parsed rows; empty when no offset exists for the
        requested time range. Rows collected so far are returned when a
        parse error aborts the loop.
    """
    consumer = KafkaConsumer(bootstrap_servers=kafka_servers, group_id=groupid)
    partitions = list(consumer.partitions_for_topic(topic))
    # Only the first partition is read, matching the original behaviour;
    # extend to all partitions for real multi-partition topics.
    tp = TopicPartition(topic, partitions[0])
    begin_ot = consumer.offsets_for_times({tp: begin_time})[tp]
    end_ot = consumer.offsets_for_times({tp: end_time})[tp]
    # offsets_for_times returns None when no message exists at/after the
    # timestamp; the original indexed the result before checking, crashing
    # here, and otherwise compared offsets against the string 'null'.
    if begin_ot is None or end_ot is None:
        print(begin_ot, end_ot)
        return pd.DataFrame()
    begin_offset = begin_ot.offset
    end_offset = end_ot.offset
    consumer.assign([tp])           # assign() expects a list of partitions
    consumer.seek(tp, begin_offset)
    print(begin_offset, end_offset)

    records = []  # DataFrame.append is removed in pandas >= 2.0; concat once
    while True:
        msg = consumer.poll(10, max_records=1000)
        if not msg:
            continue
        for batch in msg.values():
            for line in batch:
                try:
                    if line.serialized_value_size < 10:   # abnormal payload
                        continue
                    if line.offset > end_offset:   # reached end_time's offset
                        print(line)
                        return pd.DataFrame(records)
                    if line.value is None:   # tombstone / empty message
                        continue
                    # literal_eval, not eval(): broker data is untrusted input.
                    data = ast.literal_eval(line.value.decode('utf-8'))
                    # Keep only one province here; adjust the filter
                    # (province/region) for real use.
                    if data['province'] == 'HaiNan':
                        records.append(data)
                except Exception as el:
                    print(el)
                    print(line)
                    return pd.DataFrame(records)
        print(len(records))

def produce_data(self,topic):
    """Stub: publish data to *topic*.  Not implemented yet.

    Creates a producer against the module-level ``kafka_servers`` but sends
    nothing.

    NOTE(review): ``self`` looks vestigial -- this is a module-level function,
    not a method; confirm no caller relies on it before removing.
    """
    producer=KafkaProducer(bootstrap_servers=kafka_servers)
    pass

if __name__ == '__main__':
    def _now():
        # Wall-clock timestamp used by the progress prints below.
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    df = pd.DataFrame()
    # df = subscribe_data()
    # print(df.shape[0])
    print(_now())
    df = consume_from_time(begin_time=1627363993627, end_time=1627365193000)
    print(df.shape[0])
    print(np.nanmax(df['lastTm']))
    print(np.nanmin(df['lastTm']))
    print(_now())

 

posted on 2021-08-26 14:32  慢慢前进的蜗牛  阅读(434)  评论(0)    收藏  举报

导航