Syncing MySQL data to Redis

 

The sync is driven by reading the MySQL binlog.
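Reading the binlog this way requires the server to write row-based binlogs and the connecting account to hold the REPLICATION SLAVE and REPLICATION CLIENT privileges. A quick sanity check you can run first (a sketch; the connection values are the same placeholders used in the script below):

import pymysql

# Placeholder credentials, same as in the sync script
conn = pymysql.connect(host='xxx', port=3306, user='root', password='xxx')
with conn.cursor() as cur:
    # Write/Update/DeleteRowsEvent only carry row data when binlog_format is ROW
    cur.execute("SHOW VARIABLES LIKE 'binlog_format'")
    print(cur.fetchone())   # expect ('binlog_format', 'ROW')
    # The binary log itself must be enabled
    cur.execute("SHOW VARIABLES LIKE 'log_bin'")
    print(cur.fetchone())   # expect ('log_bin', 'ON')
conn.close()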

 

import pymysql
import redis
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent
import json
from urllib.parse import quote

# Processing flow:
# 1. Set the mysql_to_redis_sign column of every row to the current timestamp; this generates
#    binlog row events, and only those events are then written into Redis.
# 2. Flush the target Redis database (the db configured in redis_settings).
# 3. Write the data into Redis.

# MySQL / Redis connection settings (credentials are placeholders)
mysql_settings = {'host': 'xxx', 'port': 3306, 'user': 'root', 'password': 'xxx'}
# File that keeps the last processed binlog position so the stream can resume after a restart
binlog_position_file = 'binlog_position.json'
schema = 'analyze'
table = 'web_link_data'
redis_settings = {'host': 'xxx', 'port': 6379, 'db': 9, 'password': 'xxx'}

# The binlog stream reports columns as UNKNOWN_COL<n> when it cannot resolve column names
# from the table metadata, so map them to field names by position here
# (column 14, the mysql_to_redis_sign column, is deliberately not synced).
column_map = {
    'id': 'UNKNOWN_COL0', 'host': 'UNKNOWN_COL1', 'url': 'UNKNOWN_COL2',
    'class': 'UNKNOWN_COL3', 'class_sec': 'UNKNOWN_COL4', 'url_host': 'UNKNOWN_COL5',
    'http': 'UNKNOWN_COL6', 'title': 'UNKNOWN_COL7', 'ip': 'UNKNOWN_COL8',
    'ip_area': 'UNKNOWN_COL9', 'jump_url': 'UNKNOWN_COL10', 'import_source': 'UNKNOWN_COL11',
    'is_gather': 'UNKNOWN_COL12', 'import_time': 'UNKNOWN_COL13', 'gather_time': 'UNKNOWN_COL15',
}
# Every field except the id is indexed in Redis
index_fields = [f for f in column_map if f != 'id']

# Connect to the MySQL database
connection = pymysql.connect(host=mysql_settings['host'],
                             user=mysql_settings['user'],
                             password=mysql_settings['password'],
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # Lock the table while every row's sign column is rewritten
        cursor.execute("LOCK TABLES `analyze`.web_link_data WRITE")
        # Build the update statement; touching mysql_to_redis_sign on every row
        # generates one row event per row in the binlog
        update_query = """
        UPDATE `analyze`.web_link_data
        SET mysql_to_redis_sign = UNIX_TIMESTAMP()
        """
        print(update_query)
        # Run the update
        cursor.execute(update_query)
        # Commit the change
        connection.commit()
        print(f"Updated rows: {cursor.rowcount}")
        # Unlock the table
        cursor.execute("UNLOCK TABLES")
finally:
    # Close the database connection
    connection.close()


stream = None
try:
    r = redis.Redis(**redis_settings)
    # Flush the target Redis database before the full resync
    r.flushdb()
    # Use a pipeline to batch writes for better throughput
    pipeline = r.pipeline()
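    # Resume from the previously saved binlog position, if there is one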
    try:
        with open(binlog_position_file, 'r') as f:
            saved_position = json.load(f)
            binlog_file = saved_position['binlog_file']
            binlog_position = saved_position['binlog_position']
    except FileNotFoundError:
        binlog_file = None
        binlog_position = None

    # Stream row events from the binlog: server_id must not collide with another replica,
    # blocking=True keeps waiting for new events at the end of the log, and
    # resume_stream/log_file/log_pos continue from the saved position.
    stream = BinLogStreamReader(connection_settings=mysql_settings, server_id=100, blocking=True,
                                resume_stream=True, only_events=[WriteRowsEvent, UpdateRowsEvent, DeleteRowsEvent],
                                log_file=binlog_file, log_pos=binlog_position)
    for binlogevent in stream:
        if binlogevent.schema == schema and binlogevent.table == table:
            for row in binlogevent.rows:
                row = dict(row)
                if isinstance(binlogevent, WriteRowsEvent):
                    print('Insert:', row)
                    values = row['values']
                    # Build a named record from the positional columns; None becomes ''
                    # so every value is safe to write into the hash
                    row = {field: ('' if values.get(col) is None else values.get(col))
                           for field, col in column_map.items()}

                    id = row['id']
                    # URL-encode the URL fields before storing them
                    row['url'] = quote(str(row['url']))
                    row['jump_url'] = quote(str(row['jump_url']))
                    pipeline.hset(f"record:{id}", mapping=row)
                    # Maintain one index set per field: "<field>:<value>" holds the ids of
                    # every record carrying that value
                    for field in index_fields:
                        if row.get(field):  # only index fields that are present and non-empty
                            pipeline.sadd(f"{field}:{row[field]}", row['id'])
                    pipeline.execute()
                elif isinstance(binlogevent, DeleteRowsEvent):
                    print('Delete:', row)
                    id = row['values']['UNKNOWN_COL0']
                    # Fetch the stored record so its index entries can be removed as well
                    old_data = r.hgetall(f"record:{id}")
                    if old_data:
                        # Delete the record hash
                        pipeline.delete(f"record:{id}")
                        # Remove the id from each field's index set
                        # (hgetall returns bytes keys and values, hence encode/decode)
                        for field in index_fields:
                            if old_data.get(field.encode()):
                                pipeline.srem(f"{field}:{old_data[field.encode()].decode()}", id)
                        # Flush the queued commands
                        pipeline.execute()
                elif isinstance(binlogevent, UpdateRowsEvent):
                    print('Update:', row)
                    before_values = row['before_values']
                    after_values = row['after_values']
                    id = before_values['UNKNOWN_COL0']
                    old_data = r.hgetall(f"record:{id}")
                    # Rewrite the Redis record from the after-image; None becomes ''
                    new_data = {field: ('' if after_values.get(col) is None else after_values.get(col))
                                for field, col in column_map.items()}
                    new_data['url'] = quote(str(new_data['url']))
                    new_data['jump_url'] = quote(str(new_data['jump_url']))
                    pipeline.hset(f"record:{id}", mapping=new_data)
                    # Move the id from the old index entries to the new ones
                    for field in index_fields:
                        if old_data.get(field.encode()):
                            pipeline.srem(f"{field}:{old_data[field.encode()].decode()}", id)
                        if new_data[field]:
                            pipeline.sadd(f"{field}:{new_data[field]}", id)
                    pipeline.execute()
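        # Persist the current binlog position after each event so a restart resumes from here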
        with open(binlog_position_file, 'w') as f:
            json.dump({'binlog_file': stream.log_file, 'binlog_position': stream.log_pos}, f)

finally:
    if stream is not None:
        stream.close()
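
Once the sync is running, records live in hashes keyed record:<id> and each indexed field has a set <field>:<value> containing the matching ids. A minimal sketch of how a consumer could query the data (the example values 'news' and 'https' are made up):

import redis

r = redis.Redis(host='xxx', port=6379, db=9, password='xxx', decode_responses=True)

# All ids whose class field equals 'news'
ids = r.smembers("class:news")

# Ids matching several indexed fields at once: intersect the index sets
matched = r.sinter("class:news", "http:https")

# Fetch the full record for each id
for record_id in ids:
    print(r.hgetall(f"record:{record_id}"))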