关于列表分组的基本代码
如果有一个列表, 按固定的每批次个数进行分组:
import math

# Items per batch; in a class this would be initialized as self.batch_num = 1000.
# (The original note wrote `self.batch_num` outside any class and used `math`
# and `logger` with no import in scope — made standalone and runnable here.)
batch_num = 1000
codes = [1, 2, 4, 2, 1, 0, 3, 5, 7]
# Split codes into consecutive slices of at most batch_num items each.
batch_groups = [codes[idx:idx + batch_num] for idx in range(0, len(codes), batch_num)]

# Alternative: split into a fixed NUMBER of groups (pool_num) instead of a
# fixed group size — derive the per-group size by rounding up, so no item
# is dropped when the length does not divide evenly.
pool_num = 10
batch_num = math.ceil(len(codes) / pool_num)
batch_groups = [codes[idx:idx + batch_num] for idx in range(0, len(codes), batch_num)]

# Show each group's index, length and contents.
for idx, item in enumerate(batch_groups):
    print("{}-{}-{}".format(idx, len(item), item))
第一种, 使用index
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2016/9/20
# 定义批处理的条数, 如果是类就初始化为self.batch_times = 2
from loguru import logger
# Records per batch; in a class, initialize as self.batch_times = 2.
batch_times = 2


def batch_upsert(batches):
    """Persist one batch of records to the database (placeholder no-op)."""
    pass
def batch_processor(all_data_list):
    """Insert all_data_list into the DB in batches of batch_times records.

    :param all_data_list: sequence of records to persist
    :return: None
    """
    try:
        batches = []
        # Start the index at 1: with enumerate's default start of 0, the very
        # first item would be flushed alone because 0 % batch_times == 0
        # (the exact issue the note below this snippet describes).
        for idx, value in enumerate(all_data_list, 1):
            print("idx: {}, value: {}".format(idx, value))
            batches.append(value)
            # Flush a full batch inside the loop.
            if idx % batch_times == 0:
                batch_upsert(batches)
                logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
                batches = []
        # Flush the remainder after the loop — only when non-empty, so we
        # never issue a pointless empty upsert.
        if batches:
            batch_upsert(batches)
            logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
    except Exception as ex:
        # logger.exception already records the traceback; the original passed
        # ex again as a redundant positional argument.
        logger.exception('批处理异常: {}'.format(ex))
if __name__ == '__main__':
    # Demo run over a small sample list.
    sample_records = ["a", "b", "c", "d", "e", "f", "g", "h"]
    batch_processor(sample_records)
# 基本思路, 连续的列表或者数组数据, 索引除尽批次数的条件下,把那个小循环数据插入数据库, 并把那个小循环列表清空
# 2020-11-27 11:19:26.541 | DEBUG | __main__:batch_processor:28 - update items: 8-1
# 2020-11-27 11:19:26.542 | DEBUG | __main__:batch_processor:28 - update items: 8-2
# 2020-11-27 11:19:26.542 | DEBUG | __main__:batch_processor:28 - update items: 8-2
# 2020-11-27 11:19:26.542 | DEBUG | __main__:batch_processor:28 - update items: 8-2
# 2020-11-27 11:19:26.542 | DEBUG | __main__:batch_processor:36 - update items: 8-1
# 有个问题是索引是从0开始的, 但是0很特殊, 0%任意数都是0, 因此第一条会单独插入, 因此更改为索引从1开始
# for idx, value in enumerate(all_data_list, 1):
第二种, 使用 len(batches) == batch_times 判断
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2016/9/20
# 定义批处理的条数, 如果是类就初始化为self.batch_times = 2
from loguru import logger
# How many records make up one batch (self.batch_times = 2 in class form).
batch_times = 2


def batch_upsert(batches):
    """Write one batch of records to the database — stub implementation."""
    pass
def batch_processor(all_data_list):
    """Insert all_data_list into the DB, flushing every batch_times records.

    :param all_data_list: sequence of records to persist
    :return: None
    """
    try:
        batches = []
        for item in all_data_list:
            print("item: {}".format(item))
            batches.append(item)
            # Flush once the batch reaches the configured size.
            if len(batches) == batch_times:
                batch_upsert(batches)
                logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
                batches = []
        # Flush the remainder only when there is one: the original always
        # called batch_upsert here and logged a spurious "8-0" flush when
        # the list length divided evenly by batch_times.
        if batches:
            batch_upsert(batches)
            logger.debug('update items: {}-{}'.format(len(all_data_list), len(batches)))
    except Exception as ex:
        # logger.exception records the traceback itself; the redundant
        # second positional argument was dropped.
        logger.exception('批处理异常: {}'.format(ex))
if __name__ == '__main__':
    # Exercise the batcher with eight sample items.
    demo_items = ["a", "b", "c", "d", "e", "f", "g", "h"]
    batch_processor(demo_items)
# 基本思路, 连续的列表或者数组数据, 达到每批次的个数时,把那个小循环数据插入数据库, 并把那个小循环列表清空
# 2020-11-27 11:27:17.194 | DEBUG | __main__:batch_processor:27 - update items: 8-2
# 2020-11-27 11:27:17.194 | DEBUG | __main__:batch_processor:27 - update items: 8-2
# 2020-11-27 11:27:17.195 | DEBUG | __main__:batch_processor:27 - update items: 8-2
# 2020-11-27 11:27:17.195 | DEBUG | __main__:batch_processor:27 - update items: 8-2
# 2020-11-27 11:27:17.195 | DEBUG | __main__:batch_processor:35 - update items: 8-0
第三种, 时间和插入批次双维度
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Victor"
# Date: 2019/10/23
import time
def handle_batch_data(batches):
    """Persist one batch of messages (placeholder no-op)."""
    pass


# Flush when either batch_num items have accumulated or more than
# default_interval seconds have elapsed since the last flush — so a slow
# trickle of messages still reaches the DB in bounded time.
batch_num = 100
default_interval = 10

batches = []
time_start = time.time()
for msg in [1, 2, 2, 3, 2, 1, 2, 4, 5, 8, 7, 6, 5, 2, 1]:
    # NOTE: the original wrote `row = (msg)` — plain parentheses do not
    # create a tuple, so that was always just msg itself.
    batches.append(msg)
    # Seconds elapsed since the last flush.
    current_interval = time.time() - time_start
    print("current_interval: ", current_interval)
    if (len(batches) == batch_num) or (current_interval > default_interval):
        handle_batch_data(batches)
        batches = []
        time_start = time.time()
# Flush whatever remains, skipping a pointless empty call.
if batches:
    handle_batch_data(batches)
print("end")
完善版本
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import time
from loguru import logger
from ycyj_zhongtai.service_ref.mdb_client import MdbClient
class MyTable(object):
    """
    Money minute K-line table — thin wrapper around the 'MoneyTable'
    Mongo collection.

    NOTE(review): Example.update_data below calls self.table.update_items,
    which this class does not define — confirm the intended table API.
    """
    def __init__(self):
        # Resolve the collection through the shared multi-mdb client.
        client = MdbClient()
        money_db = client.get_multi_mdb('gongshi2_mdb', 'money')
        self.min_table = money_db['MoneyTable']

    def insert_items(self, data_arr):
        """Bulk-insert the given documents."""
        self.min_table.insert_many(data_arr)

    def remove_items(self, where):
        """Delete every document matching the *where* filter."""
        self.min_table.delete_many(where)
class Example():
    """Demonstrates batched DB writes with a short pause between batches."""

    def __init__(self):
        self.table = MyTable()
        # Maximum number of documents sent to the DB per write.
        self.batch_to_db_num = 1000

    def _batch_write(self, tag, items, write_fn):
        """Write *items* through write_fn in chunks of batch_to_db_num.

        Shared implementation for insert_minutes_data / update_data, whose
        loop bodies were previously duplicated line for line.

        :param tag: label prefixed to log lines
        :param items: sequence of documents to write
        :param write_fn: callable accepting one list of documents
        :return: None
        """
        data_len = len(items)
        batches = []
        accumulator = 0
        for item in items:
            batches.append(item)
            # Flush a full batch inside the loop.
            if len(batches) == self.batch_to_db_num:
                write_fn(batches)
                accumulator += len(batches)
                logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))
                batches = []
                # Brief pause so sustained bulk writes don't saturate the DB.
                time.sleep(0.01)
        # Flush the remainder, if any.
        if batches:
            write_fn(batches)
            accumulator += len(batches)
            logger.debug('{} update items: {}/{}/{}'.format(tag, len(batches), accumulator, data_len))

    def insert_minutes_data(self, tag, data_list):
        """Bulk-insert data_list into the DB in batches.

        :param tag: label used in log lines
        :param data_list: documents to insert
        :return: None
        """
        self._batch_write(tag, data_list, self.table.insert_items)

    def update_data(self, tag, data):
        """Bulk-update *data* in the DB in batches.

        NOTE(review): MyTable defines only insert_items / remove_items, so
        self.table.update_items will raise AttributeError at runtime —
        confirm the intended table method before relying on this.

        :param tag: label used in log lines
        :param data: documents to update
        :return: None
        """
        self._batch_write(tag, data, self.table.update_items)

    def start(self):
        pass