import ast
import time

import numpy as np
import pandas as pd

from kafka import KafkaConsumer, TopicPartition  # pip install kafka-python -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
from kafka import KafkaProducer
# Kafka bootstrap broker list; 'xxxx' is a placeholder — replace with the real host before running.
kafka_servers = ['xxxx:9092']
def subscribe_data(topic='xx', groupid='offline_anly'):
    """Consume records from *topic* until the collected batch spans more than
    20 minutes of 'lastTm' timestamps, then return them as a DataFrame.

    Each message value is expected to be a UTF-8 encoded Python/JSON literal
    containing a millisecond 'lastTm' field — TODO confirm against producer.

    :param topic: Kafka topic to subscribe to.
    :param groupid: consumer group id.
    :return: pandas.DataFrame of decoded records. May be partial: on the
        first message that fails to decode, the function prints the error
        and returns what it has so far (preserving the original behaviour).
    """
    consumer = KafkaConsumer(bootstrap_servers=kafka_servers, group_id=groupid)
    consumer.subscribe(topics=[topic])
    print(consumer.partitions_for_topic(topic))
    # Accumulate plain dicts; one DataFrame construction at the end replaces
    # per-row DataFrame.append (quadratic, removed in pandas 2.0).
    records = []
    df = pd.DataFrame()
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    while True:
        msg = consumer.poll(10)
        if not msg:  # nothing fetched this round
            continue
        for batch in msg.values():
            for record in batch:
                try:
                    if record.serialized_value_size < 10:  # anomalous / truncated payload
                        continue
                    payload = record.value.decode('utf-8')
                    if not payload:  # anomalous payload
                        continue
                    # literal_eval instead of eval: the payload comes off the
                    # wire and must not be able to execute arbitrary code.
                    records.append(ast.literal_eval(payload))
                except Exception as err:
                    # Keep the original bail-out-on-first-bad-message behaviour.
                    print(err)
                    print(record)
                    return pd.DataFrame(records)
        df = pd.DataFrame(records)
        # Stop once the batch covers more than 20 minutes (lastTm is in ms).
        if records and np.nanmax(df['lastTm']) - np.nanmin(df['lastTm']) > 20 * 60 * 1000:
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            break
    return df
def consume_from_time(topic='xx', groupid='offline-analy', begin_time=None,
                      end_time=None, province='HaiNan'):
    """Consume *topic* between two epoch-millisecond timestamps and return the
    records whose 'province' field matches *province* as a DataFrame.

    Only the first partition of the topic is read (as in the original code);
    extend the TopicPartition handling to cover all partitions if needed.

    :param topic: Kafka topic to read.
    :param groupid: consumer group id.
    :param begin_time: start timestamp in epoch milliseconds.
    :param end_time: end timestamp in epoch milliseconds; if no offset exists
        at/after it (offsets_for_times returns None), consumption is unbounded.
    :param province: value of the 'province' field to keep (default 'HaiNan',
        matching the original hard-coded filter).
    :return: pandas.DataFrame of the matching decoded records. Returns what
        it has collected so far on the first message that fails to decode.
    """
    consumer = KafkaConsumer(bootstrap_servers=kafka_servers, group_id=groupid)
    partitions = list(consumer.partitions_for_topic(topic))
    tp = TopicPartition(topic, partitions[0])  # first partition only
    begin_oat = consumer.offsets_for_times({tp: begin_time}).get(tp)
    end_oat = consumer.offsets_for_times({tp: end_time}).get(tp)
    # offsets_for_times yields OffsetAndTimestamp or None (no message at/after
    # the timestamp). The original stored the string 'null' here, which made
    # `line.offset > end_offset` raise TypeError on Python 3 and silently
    # abort the consume via the broad except — use real sentinels instead.
    begin_offset = 0 if begin_oat is None else begin_oat.offset
    end_offset = None if end_oat is None else end_oat.offset
    # assign() expects an iterable of TopicPartition, not the offsets dict.
    consumer.assign([tp])
    consumer.seek(tp, begin_offset)
    print(begin_offset, end_offset)
    records = []  # dicts collected here; DataFrame built once on return
    while True:
        msg = consumer.poll(10, max_records=1000)
        if not msg:
            continue
        for batch in msg.values():
            for record in batch:
                try:
                    if record.serialized_value_size < 10:  # anomalous payload
                        continue
                    if end_offset is not None and record.offset > end_offset:
                        # Reached the offset of the end timestamp.
                        print(record)
                        return pd.DataFrame(records)
                    payload = record.value.decode('utf-8')
                    if not payload:  # anomalous payload
                        continue
                    # literal_eval instead of eval: wire data must not be able
                    # to execute arbitrary code.
                    data = ast.literal_eval(payload)
                    # Keep only the requested province; adapt the filter for
                    # other regions via the `province` parameter.
                    if data.get('province') == province:
                        records.append(data)
                except Exception as err:
                    print(err)
                    print(record)
                    return pd.DataFrame(records)
def produce_data(self, topic):
    """Placeholder for publishing records to *topic*; not implemented yet.

    NOTE(review): the `self` parameter suggests this function was lifted
    from a class — confirm whether it belongs at module level.
    """
    kafka_writer = KafkaProducer(bootstrap_servers=kafka_servers)
if __name__ == '__main__':
    # Pull a fixed time window from Kafka and report timing plus the
    # 'lastTm' spread of the resulting frame.
    TS_FMT = "%Y-%m-%d %H:%M:%S"
    frame = pd.DataFrame()
    # frame = subscribe_data()
    # print(frame.shape[0])
    print(time.strftime(TS_FMT, time.localtime()))
    frame = consume_from_time(begin_time=1627363993627, end_time=1627365193000)
    print(frame.shape[0])
    print(np.nanmax(frame['lastTm']))
    print(np.nanmin(frame['lastTm']))
    print(time.strftime(TS_FMT, time.localtime()))