Exporting CSV files from CockroachDB with Python (single-threaded, multi-threaded, multi-process)
Requirements:
1. The table order_sku_manage in a CockroachDB database holds 2.8 billion rows that need to be exported to CSV files so the data can be migrated to Hadoop.
2. The export must be split across multiple files, with at most 500 million rows per file.
Conclusion from testing: a single-threaded export is far too slow; the table should be split into id ranges and exported by multiple threads or processes. I have not yet found a good way to derive the id split points dynamically (one possible approach is sketched after the timing figures below).
Elapsed time:
1. Single-threaded: 12 hours.
2. Multi-threaded, one id range per roughly 200 million rows: 6.5 hours.
3. The same code with the threads simply swapped for processes: done in 1.5 hours (the CSV formatting is CPU-bound Python code, so the threads most likely spend their time serialized on the GIL, which separate processes avoid).
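The id_list split points used by the multi-threaded and multi-process scripts below were picked by hand. One idea for deriving them automatically, sketched here with a made-up helper name (each OFFSET query forces a long index scan, so this is an idea only, not a tuned solution for a 2.8-billion-row table), is to sample every Nth key:

# Hypothetical helper: sample every (total_rows // num_ranges)-th key to get
# split points for the id ranges. Assumes a psycopg2 connection and that the
# key column is compared as a string, like the scripts below.
def make_id_boundaries(conn, table_name, key_column, total_rows, num_ranges):
    cursor = conn.cursor()
    boundaries = ["0000000000000000"]  # sentinel smaller than any real key
    step = max(total_rows // num_ranges, 1)
    for i in range(1, num_ranges):
        # note: an OFFSET this large is itself slow; a sketch only
        cursor.execute(
            "select {0} from {1} order by {0} offset {2} limit 1".format(
                key_column, table_name, i * step))
        row = cursor.fetchone()
        if row:
            boundaries.append(row[0])
    cursor.close()
    return boundaries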
Single-threaded:
# coding:utf-8
import os
import csv
import psycopg2

# CockroachDB connection parameters
class CockDBConfig(object):
    host = "192.168.1.100"
    port = 3359
    database = "test"
    user = "cock_ro"
    password = "abcd1234"

conn = psycopg2.connect(
    port=CockDBConfig.port,
    host=CockDBConfig.host,
    database=CockDBConfig.database,
    user=CockDBConfig.user,
    password=CockDBConfig.password
)
conn.set_session(autocommit=True)
cursor = conn.cursor()

# Output directory: a "result" folder next to this .py file
file_path = "{0}{1}result{1}".format(os.path.dirname(os.path.abspath(__file__)), os.sep)

# Table name, primary key column, an id value smaller than the smallest key
# in the table, and the number of rows to fetch per query
table_name = "order_sku_manage"
key_column = "order_sku"
min_id = "0000000000000000000"
limit_num = 10000
# Maximum number of rows per output file
batch_num = 500000000
file_num = 1
sql = "select * from {0} where {1} > '{2}' order by {1} limit {3}"
sel_times = 1

while True:
    exec_sql = sql.format(table_name, key_column, min_id, limit_num)
    cursor.execute(exec_sql)
    rows = cursor.fetchall()
    # An empty result set means the table has been exhausted
    if not rows:
        break
    else:
        # Keyset pagination: carry the largest id of this page into the next
        # query; once batch_num rows have gone into a file, start a new one
        min_id = rows[-1][0]
        if sel_times * limit_num > batch_num:
            sel_times = 1
            file_num += 1
        data_file = "{}{}_data_{:0>2}.csv".format(file_path, table_name, file_num)
        with open(data_file, "a+", newline="", encoding="utf-8") as csv_file:
            w = csv.writer(csv_file)
            w.writerows(rows)
        sel_times += 1

cursor.close()
conn.close()
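As a side note, psycopg2 also supports server-side (named) cursors, which stream a single ordered result set instead of re-running a LIMIT query per page. Whether this works here depends on the deployed CockroachDB version supporting DECLARE CURSOR, so the following is just a sketch under that assumption (named cursors must live inside a transaction, hence no autocommit; the function name is illustrative):

# Sketch: stream the whole table through a psycopg2 named (server-side)
# cursor. Assumes the server supports DECLARE CURSOR; reuses the
# CockDBConfig class defined above.
def export_with_named_cursor(table_name, key_column, out_file):
    conn = psycopg2.connect(
        port=CockDBConfig.port, host=CockDBConfig.host,
        database=CockDBConfig.database, user=CockDBConfig.user,
        password=CockDBConfig.password)
    cursor = conn.cursor(name="export_cursor")  # server-side cursor
    cursor.itersize = 10000                     # rows per network round trip
    cursor.execute("select * from {0} order by {1}".format(table_name, key_column))
    with open(out_file, "w", newline="", encoding="utf-8") as csv_file:
        w = csv.writer(csv_file)
        for row in cursor:                      # fetches itersize rows at a time
            w.writerow(row)
    cursor.close()
    conn.close()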
Multi-threaded:
# coding:utf-8
import os
import csv
import psycopg2
import time
from threading import Thread

# CockroachDB connection parameters
class CockDBConfig(object):
    def __init__(self):
        self.host = "192.168.1.100"
        self.port = 3359
        self.database = "test"
        self.user = "cock_ro"
        self.password = "abcd1234"

    def get_conn(self):
        conn = psycopg2.connect(
            port=self.port,
            host=self.host,
            database=self.database,
            user=self.user,
            password=self.password
        )
        return conn

class ExportCsvData(Thread):
    def __init__(self, table_name, key_column, min_id, max_id, limit_num, file_num, is_last_batch=False):
        Thread.__init__(self)
        self.table_name = table_name
        self.key_column = key_column
        self.min_id = min_id
        self.max_id = max_id
        self.limit_num = limit_num
        self.file_num = file_num
        self.is_last_batch = is_last_batch
        # sql_a: open-ended range for the last batch; sql_b: bounded range
        self.sql_a = "select * from {0} where {1} > '{2}' order by {1} limit {3}"
        self.sql_b = "select * from {0} where {1} > '{2}' and {1} <= '{3}' order by {1} limit {4}"

    def run(self):
        file_path = "{0}{1}result{1}".format(os.path.dirname(os.path.abspath(__file__)), os.sep)
        cock = CockDBConfig()
        conn = cock.get_conn()
        conn.set_session(autocommit=True)
        cursor = conn.cursor()
        while True:
            if self.is_last_batch:
                exec_sql = self.sql_a.format(self.table_name, self.key_column, self.min_id, self.limit_num)
            else:
                exec_sql = self.sql_b.format(self.table_name, self.key_column, self.min_id, self.max_id, self.limit_num)
            cursor.execute(exec_sql)
            rows = cursor.fetchall()
            if not rows:
                break
            else:
                data_file = "{}{}_data_{:0>2}.csv".format(file_path, self.table_name, self.file_num)
                with open(data_file, "a+", newline="", encoding="utf-8") as csv_file:
                    w = csv.writer(csv_file)
                    w.writerows(rows)
                self.min_id = rows[-1][0]
                # equivalent to: min_id >= max_id and max_id != ""
                if self.min_id >= self.max_id != "":
                    break
        cursor.close()
        conn.close()

def export_data():
    # Hand-picked split points; ids are compared as strings, so the
    # list is in lexicographic order
    id_list = [
        "0000000000000000", "1015486150247804122", "102543377594858419",
        "78220670719885987", "81120637207231415", "817284621641205150",
        "82589201230100000383225", "837699066446811349",
        "84921300134100001598892", "862851277794730571",
        "89469824036845313", "948472046751356504",
        "969484350821370329", "98075200988206799"
    ]
    table_name = "order_sku_manage"
    key_column = "order_sku"
    limit_num = 10000
    thread_list = []
    for index in range(len(id_list)):
        min_id = id_list[index]
        file_num = index + 1
        if min_id == id_list[-1]:
            max_id = ""
            last_batch = True
        else:
            max_id = id_list[index + 1]
            last_batch = False
        thread_item = ExportCsvData(table_name, key_column, min_id, max_id, limit_num, file_num, last_batch)
        thread_list.append(thread_item)
        thread_item.start()
    # Poll until every worker thread has finished
    # (Thread.isAlive() was removed in Python 3.9; use is_alive())
    running_thread_count = len(list(filter(lambda item: item.is_alive(), thread_list)))
    while running_thread_count > 0:
        running_thread_count = len(list(filter(lambda item: item.is_alive(), thread_list)))
        time.sleep(0.01)

if __name__ == '__main__':
    export_data()
Multi-process:
# coding:utf-8
import os
import csv
import psycopg2
from multiprocessing import Process

# CockroachDB connection parameters
class CockDBConfig(object):
    def __init__(self):
        self.host = "192.168.1.100"
        self.port = 3359
        self.database = "test"
        self.user = "cock_ro"
        self.password = "abcd1234"

    def get_conn(self):
        conn = psycopg2.connect(
            port=self.port,
            host=self.host,
            database=self.database,
            user=self.user,
            password=self.password
        )
        return conn

class ExportCsvData(Process):
    def __init__(self, table_name, key_column, min_id, max_id, limit_num, file_num, is_last_batch=False):
        super(ExportCsvData, self).__init__()
        self.table_name = table_name
        self.key_column = key_column
        self.min_id = min_id
        self.max_id = max_id
        self.limit_num = limit_num
        self.file_num = file_num
        self.is_last_batch = is_last_batch
        # sql_a: open-ended range for the last batch; sql_b: bounded range
        self.sql_a = "select * from {0} where {1} > '{2}' order by {1} limit {3}"
        self.sql_b = "select * from {0} where {1} > '{2}' and {1} <= '{3}' order by {1} limit {4}"

    def run(self):
        file_path = "{0}{1}result{1}".format(os.path.dirname(os.path.abspath(__file__)), os.sep)
        cock = CockDBConfig()
        conn = cock.get_conn()
        conn.set_session(autocommit=True)
        cursor = conn.cursor()
        while True:
            if self.is_last_batch:
                exec_sql = self.sql_a.format(self.table_name, self.key_column, self.min_id, self.limit_num)
            else:
                exec_sql = self.sql_b.format(self.table_name, self.key_column, self.min_id, self.max_id, self.limit_num)
            cursor.execute(exec_sql)
            rows = cursor.fetchall()
            if not rows:
                break
            else:
                data_file = "{}{}_data_{:0>2}.csv".format(file_path, self.table_name, self.file_num)
                with open(data_file, "a+", newline="", encoding="utf-8") as csv_file:
                    w = csv.writer(csv_file)
                    w.writerows(rows)
                self.min_id = rows[-1][0]
                # equivalent to: min_id >= max_id and max_id != ""
                if self.min_id >= self.max_id != "":
                    break
        cursor.close()
        conn.close()

def export_data():
    # Same hand-picked split points as the multi-threaded version;
    # ids are compared as strings, so the list is in lexicographic order
    id_list = [
        "0000000000000000", "1015486150247804122", "102543377594858419",
        "78220670719885987", "81120637207231415", "817284621641205150",
        "82589201230100000383225", "837699066446811349",
        "84921300134100001598892", "862851277794730571",
        "89469824036845313", "948472046751356504",
        "969484350821370329", "98075200988206799"
    ]
    table_name = "order_sku_manage"
    key_column = "order_sku"
    limit_num = 10000
    process_list = []
    for index in range(len(id_list)):
        min_id = id_list[index]
        file_num = index + 1
        if min_id == id_list[-1]:
            max_id = ""
            last_batch = True
        else:
            max_id = id_list[index + 1]
            last_batch = False
        process_item = ExportCsvData(table_name, key_column, min_id, max_id, limit_num, file_num, last_batch)
        process_list.append(process_item)
        process_item.start()
    # Unlike the polling loop in the thread version, join() blocks until done
    for proc in process_list:
        proc.join()

if __name__ == '__main__':
    export_data()
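After the export it is worth a sanity check that the files together hold as many rows as select count(*) reports for the table. A minimal sketch against the result directory layout used above (the helper name is made up; note that a raw line count over-counts if any field contains an embedded newline):

# Sketch: total the line counts of all <table>_data_NN.csv files produced
# above so they can be compared with count(*) on the source table.
import os
import glob

def count_exported_rows(result_dir, table_name):
    total = 0
    pattern = os.path.join(result_dir, "{}_data_*.csv".format(table_name))
    for path in glob.glob(pattern):
        with open(path, encoding="utf-8") as f:
            total += sum(1 for _ in f)
    return total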
