Python Kafka producer and consumer based on kafka-python

1. Install kafka-python

pip install kafka-python

2. Producer

"""
基于kafka-python的kafka生产者
"""

import json
from typing import List, Optional

from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.producer.future import RecordMetadata
from loguru import logger


class CustomKafkaProducer:
    def __init__(
        self,
        bootstrap_servers: List[str],
        async_mode: bool = True,
        request_timeout_ms: int = 30 * 1000,
        max_block_ms: int = 60 * 1000,
        metadata_max_age_ms: int = 5 * 60 * 1000,
    ):
        """
        async_mode : True表示发消息时采用异步模式,在回调中处理结果/异常.False表示发消息时采用同步模式,在发送线程中处理结果/异常,会阻塞发送线程.
        request_timeout_ms : 请求超时时间
        max_block_ms : 缓冲区等待超时时间
        metadata_max_age_ms : 元数据刷新间隔
        """
        assert bootstrap_servers, "bootstrap_servers must not be empty"
        self.bootstrap_servers = bootstrap_servers
        self.producer: Optional[KafkaProducer] = KafkaProducer(
            bootstrap_servers=bootstrap_servers,
            request_timeout_ms=request_timeout_ms,
            max_block_ms=max_block_ms,
            metadata_max_age_ms=metadata_max_age_ms,
        )
        self.topic: str = ""
        self.async_mode = async_mode

    def set_topic(self, topic: str):
        """
        设置topic
        """
        assert topic, "不可设置topic为空"
        if self.topic == topic:
            logger.warning(f"topic已是【{topic}】无需再次设置")
            return
        assert not self.topic, "topic不可修改"
        self.topic = topic
        logger.info(f"指定topic:【{topic}】")

    def create_topic(
        self,
        partitions_num: int = 1,
        replication_factor: int = 1,
        validate_only: bool = False,
    ):
        """
        创建topic
        """
        assert self.topic, "请先设置topic"
        admin_client = KafkaAdminClient(bootstrap_servers=self.bootstrap_servers)
        try:
            topic_list = admin_client.list_topics()
            if self.topic in topic_list:
                logger.warning(f"kafka服务器已存在topic【{self.topic}】")
                return
            new_topic = NewTopic(
                name=self.topic,
                num_partitions=partitions_num,
                replication_factor=replication_factor,
            )
            admin_client.create_topics([new_topic], validate_only=validate_only)
            logger.info(f"在kafka服务器中创建topic【{self.topic}】成功")
        finally:
            admin_client.close()

    def __send_byte_msg(
        self,
        value: bytes,
        partition: Optional[int] = None,
        key: Optional[bytes] = None,
        timeout: int = 30,
    ):
        """
        发送bytes到kafka

        partition : 指定分区
        key : 跟消息一起发送的值,如果没有指定分区,且指定了key,相同的key的消息会发送到同一个分区
        timeout: 同步模式下, 发送消息的最大等待时间
        """
        assert self.producer, "请先初始化kafka生产者"
        assert self.topic, "请先设置topic"
        assert isinstance(value, bytes), "发送数据前,先转换为bytes类型"
        fut = self.producer.send(
            self.topic,
            value,
            partition=partition,
            key=key,
        )
        if self.async_mode:
            # Do not wait for the result; the result/exception is delivered to the callbacks,
            # so a failed send does not interrupt the current thread.
            # The timeout parameter has no effect here; timeouts are configured at init time.
            fut.add_callback(self.on_success)
            fut.add_errback(self.on_error)
        else:
            # Block the current thread until the send completes; a failure raises in this thread.
            # The timeout parameter applies here as the maximum time to wait for the send.
            record_meta_data: RecordMetadata = fut.get(timeout=timeout)
            logger.info(
                f"message sent. topic:{record_meta_data.topic}|partition:{record_meta_data.partition}|offset:{record_meta_data.offset}"
            )

    def send_str_msg(
        self,
        str_msg: str,
        partition: Optional[int] = None,
        key: Optional[bytes] = None,
        timeout: int = 30,
        encoding: str = "utf-8",
    ):
        """
        发送字符串消息
        """
        assert isinstance(str_msg, str), "只能输入字符串数据"
        msg_bytes = str_msg.encode(encoding)
        self.__send_byte_msg(
            value=msg_bytes,
            partition=partition,
            key=key,
            timeout=timeout,
        )

    def send_dict_msg(
        self,
        dict_msg: dict,
        partition: Optional[int] = None,
        key: Optional[bytes] = None,
        timeout: int = 30,
        ensure_ascii: bool = False,
    ):
        """
        发送字典消息
        会转换为json发送
        """
        assert isinstance(dict_msg, dict), "只能输入字典数据"
        msg_bytes = json.dumps(dict_msg, ensure_ascii=ensure_ascii).encode()
        self.__send_byte_msg(
            value=msg_bytes,
            partition=partition,
            key=key,
            timeout=timeout,
        )

    def on_success(self, record_meta_data: RecordMetadata):
        """
        异步发送消息成功回调
        :return:
        """
        logger.info(
            f"消息发送成功。topic:{record_meta_data.topic}|partition:{record_meta_data.partition}|offset:{record_meta_data.offset}"
        )

    def on_error(self, e: Exception):
        """
        异步发送消息失败回调
        :return:
        """
        logger.error(f"消息发送失败。错误信息:{e}")

    def list_partitions(self) -> List[int]:
        """
        获取所有分区
        """
        assert self.producer, "请先初始化kafka生产者"
        assert self.topic, "请先指定topic"
        return list(self.producer.partitions_for(self.topic))

    def flush(self, timeout: Optional[int] = 30):
        """
        将缓存区的数据立刻提交到kafka
        """
        assert self.producer, "请先初始化kafka生产者"
        self.producer.flush(timeout=timeout)

    def close(self, timeout: Optional[int] = 10):
        """
        关闭kafka生产者对象
        :return:
        """
        assert self.producer, "请先初始化kafka生产者"
        self.producer.close(timeout=timeout)
        self.topic = ""
        self.producer = None
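
A minimal usage sketch of the producer class above, assuming a single broker reachable at localhost:9092; the topic name and message contents are placeholders:

producer = CustomKafkaProducer(bootstrap_servers=["localhost:9092"], async_mode=True)
producer.set_topic("demo-topic")
producer.create_topic(partitions_num=3, replication_factor=1)
producer.send_str_msg("hello kafka")
producer.send_dict_msg({"event": "login", "user_id": 1})
# Flush buffered messages before closing so asynchronous sends are not lost
producer.flush()
producer.close()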

3. Consumer

"""
基于kafka-python的kafka消费者
"""

from typing import Dict, List, Optional

from kafka import KafkaConsumer
from kafka.consumer.fetcher import ConsumerRecord
from kafka.structs import TopicPartition
from loguru import logger


class CustomKafkaConsumer:
    def __init__(
        self,
        bootstrap_servers: List[str],
        group_id: str,
        auto_offset_reset: str,
        fetch_max_wait_ms: int = 3 * 1000,
        fetch_max_bytes: int = 50 * 1024 * 1024,
        enable_auto_commit: bool = True,
        auto_commit_interval_ms: int = 5 * 1000,
        max_poll_interval_ms: int = 5 * 60 * 1000,
    ):
        """
        group_id: 消费组id
        auto_offset_reset: 消费起始策略
            - earliest: 如果消费组在topic有offset,从offset位置开始消费;如果没有,从队列头部开始消费
            - latest: 如果消费组在topic有offse,从offset位置开始消费;如果没有,从队列尾部开始消费
        fetch_max_wait_ms:  消费者从broker拉取消息最长等待时间,即当次拉取等待数据累积的最大时间,超过这个时间,数据累积不足也会返回
        fetch_max_bytes: 单次拉取最大数据量,单位是bytes
        enable_auto_commit: 是否允许自动提交offset,不允许则需要手动提交offset
        auto_commit_interval_ms: 自动提交offset的间隔时间
        max_poll_interval_ms: 两次poll操作之间的超时间隔时间,取决于单条业务处理时间和批量拉取的数据量,量大或单条消息处理时间长,则需要适当调大这个值
        """
        assert bootstrap_servers, "bootstrap_servers不能为空"
        assert group_id, "group_id不能为空"
        assert auto_offset_reset, "auto_offset_reset不能为空"
        assert auto_offset_reset in [
            "earliest",
            "latest",
        ], "auto_offset_reset只能是earliest或latest"
        self.consumer: Optional[KafkaConsumer] = KafkaConsumer(
            bootstrap_servers=bootstrap_servers,
            group_id=group_id,
            auto_offset_reset=auto_offset_reset,
            fetch_max_wait_ms=fetch_max_wait_ms,
            fetch_max_bytes=fetch_max_bytes,
            enable_auto_commit=enable_auto_commit,
            auto_commit_interval_ms=auto_commit_interval_ms,
            max_poll_interval_ms=max_poll_interval_ms,
        )
        self.enable_auto_commit = enable_auto_commit
        self.topic = ""

    def subscribe_topic(self, topic: str):
        """
        订阅主题
        """
        assert self.consumer, "请先初始化消费者"
        assert topic, "订阅topic不可设置为空"
        if self.topic == topic:
            logger.warning(f"已订阅【{topic}】,无需再次订阅。")
            return
        assert not self.topic, "订阅topic不可修改"
        self.topic = topic
        # 补充介绍,kafka-python的消费者多次订阅列表不是增量订阅,而是覆盖订阅。
        # 这里要求只订阅一个topic,所以不需要考虑这个问题。
        self.consumer.subscribe([topic])
        logger.info(f"订阅【{topic}】")

    def poll(
        self,
        timeout_ms: int = 5 * 1000,
        max_records: Optional[int] = None,
    ) -> Dict[TopicPartition, List[ConsumerRecord]]:
        """
        timeout_ms: 获取数据的最大等待时间
        max_records: 拉取的最大条数,默认为None,内置的值是500条

        补充解释timeout_ms与fetch_max_wait_ms的区别:
        首先需要知道poll和fetch之间是异步的.
        poll方法会优先从本地缓冲区获取数据(时间很短),缓冲区的数据不足(条数)会触发fetch请求,并等待fetch请求返回的数据,如果等待超过了timeout_ms,会先返回已获得的数据.
        而fetch请求受到fetch_max_wait_ms的控制,在fetch_max_wait_ms时间内,等待broker累积足够的数据,如果数据不足也返回,返回的数据如果没有被poll处理,则放到本地缓冲区.
        """
        assert self.consumer, "请先初始化消费者"
        assert self.topic, "请先订阅topic"
        records = self.consumer.poll(timeout_ms=timeout_ms, max_records=max_records)
        return records

    def commit(self, offsets: Optional[dict] = None):
        """
        手动提交offset
        消费过程中,后台其实保存了已消费数据的offset,这里只要调用一下commit即可
        """
        assert self.consumer, "请先初始化消费者"
        assert self.topic, "请先订阅topic"
        assert not self.enable_auto_commit, "已开启自动提交offset,不能手动提交offset"
        self.consumer.commit(offsets=offsets)

    def close(self, autocommit=True):
        """
        关闭消费者
        """
        assert self.consumer, "请先初始化消费者"
        self.consumer.close(autocommit=autocommit)
        self.topic = ""
        self.consumer = None
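
A minimal consumption loop built on the consumer class above; the broker address, group id, and topic name are placeholders, and auto commit is disabled here to demonstrate a manual commit:

consumer = CustomKafkaConsumer(
    bootstrap_servers=["localhost:9092"],
    group_id="demo-group",
    auto_offset_reset="earliest",
    enable_auto_commit=False,
)
consumer.subscribe_topic("demo-topic")
try:
    while True:
        records = consumer.poll(timeout_ms=5000, max_records=100)
        for tp, msgs in records.items():
            for msg in msgs:
                # msg is a ConsumerRecord; value holds the raw bytes sent by the producer
                print(f"partition:{tp.partition}|offset:{msg.offset}|value:{msg.value}")
        if records:
            # commit the offsets of the records consumed so far
            consumer.commit()
finally:
    consumer.close()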

 
