Python producer and consumer based on kafka-python
1. Install kafka-python
pip install kafka-python
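After installing, a quick way to confirm the package is importable is to print its version string (kafka-python exposes it as kafka.__version__); this is just a sanity check, not part of the original post:

import kafka

# Prints the installed kafka-python version string
print(kafka.__version__)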
2. Producer
""" 基于kafka-python的kafka生产者 """ import json from typing import List, Optional from kafka import KafkaProducer from kafka.admin import KafkaAdminClient, NewTopic from kafka.producer.future import RecordMetadata from loguru import logger class CustomKafkaProducer: def __init__( self, bootstrap_servers: List[str], async_mode: bool = True, request_timeout_ms: int = 30 * 1000, max_block_ms: int = 60 * 1000, metadata_max_age_ms: int = 5 * 60 * 1000, ): """ async_mode : True表示发消息时采用异步模式,在回调中处理结果/异常.False表示发消息时采用同步模式,在发送线程中处理结果/异常,会阻塞发送线程. request_timeout_ms : 请求超时时间 max_block_ms : 缓冲区等待超时时间 metadata_max_age_ms : 元数据刷新间隔 """ assert bootstrap_servers, "bootstrap_servers不能为空" self.bootstrap_servers = bootstrap_servers self.producer: Optional[KafkaProducer] = KafkaProducer( bootstrap_servers=bootstrap_servers, request_timeout_ms=request_timeout_ms, max_block_ms=max_block_ms, metadata_max_age_ms=metadata_max_age_ms, ) self.topic: str = "" self.async_mode = async_mode def set_topic(self, topic: str): """ 设置topic """ assert topic, "不可设置topic为空" if self.topic == topic: logger.warning(f"topic已是【{topic}】无需再次设置") return assert not self.topic, "topic不可修改" self.topic = topic logger.info(f"指定topic:【{topic}】") def create_topic( self, partitions_num: int = 1, replication_factor: int = 1, validate_only: bool = False, ): """ 创建topic """ assert self.topic, "请先设置topic" admin_client = KafkaAdminClient(bootstrap_servers=self.bootstrap_servers) try: topic_list = admin_client.list_topics() if self.topic in topic_list: logger.warning(f"kafka服务器已存在topic【{self.topic}】") return new_topic = NewTopic( name=self.topic, num_partitions=partitions_num, replication_factor=replication_factor, ) admin_client.create_topics([new_topic], validate_only=validate_only) logger.info(f"在kafka服务器中创建topic【{self.topic}】成功") finally: admin_client.close() def __send_byte_msg( self, value: bytes, partition: Optional[int] = None, key: Optional[bytes] = None, timeout: int = 30, ): """ 发送bytes到kafka partition : 指定分区 key : 跟消息一起发送的值,如果没有指定分区,且指定了key,相同的key的消息会发送到同一个分区 timeout: 同步模式下, 发送消息的最大等待时间 """ assert self.producer, "请先初始化kafka生产者" assert self.topic, "请先设置topic" assert isinstance(value, bytes), "发送数据前,先转换为bytes类型" fut = self.producer.send( self.topic, value, partition=partition, key=key, ) if self.async_mode: # 不等待发送结果,结果/异常在回调中可以得到,发送失败不会中断当前线程 # timeout参数不生效,超时时间需要在初始化的时候指定 fut.add_callback(self.on_success) fut.add_errback(self.on_error) else: # 阻塞当前线程,等待发送完毕,失败会中断当前线程 # timeout参数生效,代表发送消息最大等待时间 try: record_meta_data: RecordMetadata = fut.get(timeout=timeout) logger.info( f"消息发送成功。topic:{record_meta_data.topic}|partition:{record_meta_data.partition}|offset:{record_meta_data.offset}" ) except Exception as e: raise def send_str_msg( self, str_msg: str, partition: Optional[int] = None, key: Optional[bytes] = None, timeout: int = 30, encoding: str = "utf-8", ): """ 发送字符串消息 """ assert isinstance(str_msg, str), "只能输入字符串数据" msg_bytes = str_msg.encode(encoding) self.__send_byte_msg( value=msg_bytes, partition=partition, key=key, timeout=timeout, ) def send_dict_msg( self, dict_msg: dict, partition: Optional[int] = None, key: Optional[bytes] = None, timeout: int = 30, ensure_ascii: bool = False, ): """ 发送字典消息 会转换为json发送 """ assert isinstance(dict_msg, dict), "只能输入字典数据" msg_bytes = json.dumps(dict_msg, ensure_ascii=ensure_ascii).encode() self.__send_byte_msg( value=msg_bytes, partition=partition, key=key, timeout=timeout, ) def on_success(self, record_meta_data: RecordMetadata): """ 异步发送消息成功回调 :return: """ logger.info( 
f"消息发送成功。topic:{record_meta_data.topic}|partition:{record_meta_data.partition}|offset:{record_meta_data.offset}" ) def on_error(self, e: Exception): """ 异步发送消息失败回调 :return: """ logger.error(f"消息发送失败。错误信息:{e}") def list_partitions(self) -> List[int]: """ 获取所有分区 """ assert self.producer, "请先初始化kafka生产者" assert self.topic, "请先指定topic" return list(self.producer.partitions_for(self.topic)) def flush(self, timeout: Optional[int] = 30): """ 将缓存区的数据立刻提交到kafka """ assert self.producer, "请先初始化kafka生产者" self.producer.flush(timeout=timeout) def close(self, timeout: Optional[int] = 10): """ 关闭kafka生产者对象 :return: """ assert self.producer, "请先初始化kafka生产者" self.producer.close(timeout=timeout) self.topic = "" self.producer = None
3. Consumer
""" 基于kafka-python的kafka消费者 """ from typing import Dict, List, Optional from kafka import KafkaConsumer from kafka.consumer.fetcher import ConsumerRecord from kafka.structs import TopicPartition from loguru import logger class CustomKafkaConsumer: def __init__( self, bootstrap_servers: List[str], group_id: str, auto_offset_reset: str, fetch_max_wait_ms: int = 3 * 1000, fetch_max_bytes: int = 50 * 1024 * 1024, enable_auto_commit: bool = True, auto_commit_interval_ms: int = 5 * 1000, max_poll_interval_ms: int = 5 * 60 * 1000, ): """ group_id: 消费组id auto_offset_reset: 消费起始策略 - earliest: 如果消费组在topic有offset,从offset位置开始消费;如果没有,从队列头部开始消费 - latest: 如果消费组在topic有offse,从offset位置开始消费;如果没有,从队列尾部开始消费 fetch_max_wait_ms: 消费者从broker拉取消息最长等待时间,即当次拉取等待数据累积的最大时间,超过这个时间,数据累积不足也会返回 fetch_max_bytes: 单次拉取最大数据量,单位是bytes enable_auto_commit: 是否允许自动提交offset,不允许则需要手动提交offset auto_commit_interval_ms: 自动提交offset的间隔时间 max_poll_interval_ms: 两次poll操作之间的超时间隔时间,取决于单条业务处理时间和批量拉取的数据量,量大或单条消息处理时间长,则需要适当调大这个值 """ assert bootstrap_servers, "bootstrap_servers不能为空" assert group_id, "group_id不能为空" assert auto_offset_reset, "auto_offset_reset不能为空" assert auto_offset_reset in [ "earliest", "latest", ], "auto_offset_reset只能是earliest或latest" self.consumer: Optional[KafkaConsumer] = KafkaConsumer( bootstrap_servers=bootstrap_servers, group_id=group_id, auto_offset_reset=auto_offset_reset, fetch_max_wait_ms=fetch_max_wait_ms, fetch_max_bytes=fetch_max_bytes, enable_auto_commit=enable_auto_commit, auto_commit_interval_ms=auto_commit_interval_ms, max_poll_interval_ms=max_poll_interval_ms, ) self.enable_auto_commit = enable_auto_commit self.topic = "" def subscribe_topic(self, topic: str): """ 订阅主题 """ assert self.consumer, "请先初始化消费者" assert topic, "订阅topic不可设置为空" if self.topic == topic: logger.warning(f"已订阅【{topic}】,无需再次订阅。") return assert not self.topic, "订阅topic不可修改" self.topic = topic # 补充介绍,kafka-python的消费者多次订阅列表不是增量订阅,而是覆盖订阅。 # 这里要求只订阅一个topic,所以不需要考虑这个问题。 self.consumer.subscribe([topic]) logger.info(f"订阅【{topic}】") def poll( self, timeout_ms: int = 5 * 1000, max_records: Optional[int] = None, ) -> Dict[TopicPartition, List[ConsumerRecord]]: """ timeout_ms: 获取数据的最大等待时间 max_records: 拉取的最大条数,默认为None,内置的值是500条 补充解释timeout_ms与fetch_max_wait_ms的区别: 首先需要知道poll和fetch之间是异步的. poll方法会优先从本地缓冲区获取数据(时间很短),缓冲区的数据不足(条数)会触发fetch请求,并等待fetch请求返回的数据,如果等待超过了timeout_ms,会先返回已获得的数据. 而fetch请求受到fetch_max_wait_ms的控制,在fetch_max_wait_ms时间内,等待broker累积足够的数据,如果数据不足也返回,返回的数据如果没有被poll处理,则放到本地缓冲区. """ assert self.consumer, "请先初始化消费者" assert self.topic, "请先订阅topic" records = self.consumer.poll(timeout_ms=timeout_ms, max_records=max_records) return records def commit(self, offsets: Optional[dict] = None): """ 手动提交offset 消费过程中,后台其实保存了已消费数据的offset,这里只要调用一下commit即可 """ assert self.consumer, "请先初始化消费者" assert self.topic, "请先订阅topic" assert not self.enable_auto_commit, "已开启自动提交offset,不能手动提交offset" self.consumer.commit(offsets=offsets) def close(self, autocommit=True): """ 关闭消费者 """ assert self.consumer, "请先初始化消费者" self.consumer.close(autocommit=autocommit) self.topic = "" self.consumer = None