Python总结----造一万条数据及多线程、进程写入数据库表(三)
基础知识:
string.ascii_letters:Python 的 string 模块中的一个常量,包含所有 ASCII 字母(大小写):'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'。使用时需导入string模块:import string
string.digits:string 模块中的另一个常量,包含所有数字字符:'0123456789'。使用时需导入string模块:import string
random.choice :Python 的 random 模块中的一个函数,用于从非空序列(如列表、字符串、元组等)中随机选择一个元素。使用时需导入:import random,代码示例如下:
1 # 从字符串中随机选一个字符 2 import random 3 import string 4 5 chars = string.ascii_letters + string.digits # 字母 + 数字 6 random_char = random.choice(chars) # 随机选一个字符 7 print(random_char) # 输出可能是 'a'、'Z'、'7' 等 8 9 10 # 从列表中随机选一个元素 11 import random 12 13 fruits = ["apple", "banana", "cherry", "date"] 14 random_fruit = random.choice(fruits) 15 print(random_fruit) # 输出可能是 "banana"、"date" 等 16 17 18 # 从元组中随机选一个元素 19 import random 20 21 colors = ("red", "green", "blue") 22 random_color = random.choice(colors) 23 print(random_color) # 输出可能是 "green"、"red" 等
random.randint(a, b):生成一个 [a, b] 范围内的随机整数(包括 a 和 b);使用时需导入:import random
random.randrange(start, stop, step):作用:生成一个 [start, stop) 范围内的随机整数(不包括 stop),类似 range();使用时需导入:import random
random.random():生成生成 [0.0, 1.0) 的随机浮点数;使用时需导入:import random
random.uniform(a, b):生成 [a, b] 的随机浮点数;使用时需导入:import random
random.sample(seq, k) :从序列中随机选 k 个不重复元素;使用时需导入:import random
random.shuffle(seq) :打乱序列顺序(原地修改);使用时需导入:import random
造1万条数据代码示例如下:
其中CreateData.py文件中代码如下:
1 # 造一万条数据 数据内容:name、age、sex、email 2 # CreateData.py 3 4 import random 5 import string 6 7 class CreateData(object): 8 def get_name(self): 9 str_char = "" 10 get_chart = string.ascii_letters+string.digits 11 get_len = random.randint(6, 12) 12 for i in range(1, get_len+1): 13 str_char += random.choice(get_chart) 14 return str_char 15 16 def get_sex(self): 17 return random.choice(['男', '女']) 18 19 def get_age(self): 20 return random.randint(18,100) 21 22 def get_email(self): 23 return self.get_name()+random.choice(["@163.com","@126.com"]) 24 25 # 声明一个方法,可以使用一条数据放在一个列表 26 def get_one_data(self): 27 # 此处返回的数据最好和插入数据表中的数据保持一致,以便插入时数据一一对应 28 return [self.get_name(),self.get_age(),self.get_email()] 29 30 if __name__=="__main__": 31 create = CreateData() 32 print("随机得到的姓名是:", create.get_name()) 33 print("随机得到的性别是:", create.get_age()) 34 print("随机得到的email是:", create.get_email())
执行上述代码后,生成胡邮件和姓名不符,原因在创建邮件数据时重新调用了获取姓名的方法。调整代码如下:可以在CreateData类中声明一个初始化方法,该方法中存在一个name属性,初始值设置为None,然后在get_name后将随机生成的值赋值给name属性,最后在get_email方法中不要调用get_name方法,直接调用self.name属性即可。调整后后的代码如下:
1 # 造一万条数据 数据内容:name、age、sex、email 2 # CreateData.py 3 4 import random 5 import string 6 7 class CreateData(object): 8 def __init__(self): 9 self.name = None 10 def get_name(self): 11 str_char = "" 12 get_chart = string.ascii_letters+string.digits 13 get_len = random.randint(6, 12) 14 for i in range(1, get_len+1): 15 str_char += random.choice(get_chart) 16 self.name = str_char 17 return str_char 18 19 def get_sex(self): 20 return random.choice(['男', '女']) 21 22 def get_age(self): 23 return random.randint(18,100) 24 25 def get_email(self): 26 return self.name+random.choice(["@163.com","@126.com"]) 27 28 # 声明一个方法,可以使用一条数据放在一个列表 29 def get_one_data(self): 30 # 此处返回的数据最好和插入数据表中的数据保持一致,以便插入时数据一一对应 31 return [self.get_name(),self.get_age(),self.get_email()] 32 33 if __name__=="__main__": 34 create = CreateData() 35 print("随机得到的姓名是:", create.get_name()) 36 print("随机得到的性别是:", create.get_age()) 37 print("随机得到的email是:", create.get_email())
增加一个自动1的字段代码为:
1 # 造一万条数据 数据内容:name、age、sex、email 2 # CreateData.py 3 4 import random 5 import string 6 7 class CreateData(object): 8 def __init__(self): 9 self.name = None 10 self.counter = 0 11 def get_name(self): 12 str_char = "" 13 get_chart = string.ascii_letters+string.digits 14 get_len = random.randint(6, 12) 15 for i in range(1, get_len+1): 16 str_char += random.choice(get_chart) 17 self.name = str_char 18 return str_char 19 20 def get_sex(self): 21 return random.choice(['男', '女']) 22 23 def get_age(self): 24 return random.randint(18,100) 25 26 def get_email(self): 27 return self.name+random.choice(["@163.com","@126.com"]) 28 29 def get_custom_id(self, prefix="ID"): 30 self.counter += 1 31 return f"{prefix}{self.counter:04d}" 32 33 # 声明一个方法,可以使用一条数据放在一个列表 34 def get_one_data(self): 35 # 此处返回的数据最好和插入数据表中的数据保持一致,以便插入时数据一一对应 36 return [self.get_name(),self.get_age(),self.get_email(),self.get_custom_id()] 37 38 if __name__=="__main__": 39 create = CreateData() 40 print("随机得到的姓名是:", create.get_name()) 41 print("随机得到的性别是:", create.get_age()) 42 print("随机得到的email是:", create.get_email()) 43 print("获取的客户ID为:", create.get_custom_id())
在上述代码中加几个规定字段的代码如下:
1 # 造一万条数据 数据内容:name、age、sex、email 2 # CreateData.py 3 4 import random 5 import string 6 7 class CreateData(object): 8 def __init__(self): 9 self.name = None 10 self.counter = 0 11 def get_name(self): 12 str_char = "" 13 get_chart = string.ascii_letters+string.digits 14 get_len = random.randint(6, 12) 15 for i in range(1, get_len+1): 16 str_char += random.choice(get_chart) 17 self.name = str_char 18 return str_char 19 20 def get_sex(self): 21 return random.choice(['男', '女']) 22 23 def get_age(self): 24 return random.randint(18,100) 25 26 def get_email(self): 27 return self.name+random.choice(["@163.com","@126.com"]) 28 29 def get_custom_id(self, prefix="ID"): 30 self.counter += 1 31 return f"{prefix}{self.counter:04d}" 32 33 def get_fixed_fields(self): 34 """返回4个固定默认值的字段""" 35 return [ 36 "默认地址", # 固定字段1:地址 37 "中国", # 固定字段2:国家 38 "VIP", # 固定字段3:会员等级 39 "2023-01-01" # 固定字段4:注册日期 40 ] 41 42 # 声明一个方法,可以使用一条数据放在一个列表 43 def get_one_data(self): 44 # 此处返回的数据最好和插入数据表中的数据保持一致,以便插入时数据一一对应 45 return [self.get_name(),self.get_age(),self.get_email(),self.get_custom_id()]+ self.get_fixed_fields() 46 47 if __name__=="__main__": 48 create = CreateData() 49 data = create.get_one_data() 50 print("随机得到的姓名是:", create.get_name()) 51 print("随机得到的性别是:", create.get_age()) 52 print("随机得到的email是:", create.get_email()) 53 print("获取的客户ID为:", create.get_custom_id()) 54 print("地址:", data[4]) # 新增固定字段1 55 print("国家:", data[5]) # 新增固定字段2 56 print("会员等级:", data[6]) # 新增固定字段3 57 print("注册日期:", data[7]) # 新增固定字段4
将上述生成的数据存表到Excel文件中:
新建ExcelData.py文件,其具体代码为:
1 from openpyxl import Workbook 2 from CreateData import CreateData 3 class ExcelData(object): 4 def __init__(self): 5 self.data = CreateData() # 实例化 CreateData 类,用于生成数据 6 self.workbook = Workbook() # 创建一个新的 Excel 工作簿 7 self.get_sheet = self.workbook.active # 获取工作簿的活跃工作表(默认第一个表) 8 9 10 def create_data(self): 11 for row in range(1,10001): # 循环 10000 次(从 1 开始,实际写入 1~10000 行) 12 self.get_sheet.append(self.data.get_one_data()) # 生成一条数据并追加到 Excel 行 13 self.workbook.save("data.xlsx") 14 15 16 if __name__ == "__main__": 17 excel = ExcelData() 18 excel.create_data()
调整代码实现向数据库中插入数据:
1 import pymysql 2 from CreateData import CreateData 3 4 5 class DatabaseInserter: 6 def __init__(self, host, user, password, database, table_name="user_data"): 7 """ 8 初始化数据库连接 9 :param host: 数据库主机地址(如 "localhost") 10 :param user: 用户名 11 :param password: 密码 12 :param database: 数据库名 13 :param table_name: 表名(默认 "user_data") 14 """ 15 self.host = host 16 self.user = user 17 self.password = password 18 self.database = database 19 self.table_name = table_name 20 self.connection = None 21 self.cursor = None 22 self.data_generator = CreateData() # 实例化数据生成器 23 24 def connect(self): 25 """连接到数据库""" 26 try: 27 self.connection = pymysql.connect( 28 host=self.host, 29 user=self.user, 30 password=self.password, 31 database=self.database, 32 charset="utf8mb4" 33 ) 34 self.cursor = self.connection.cursor() 35 print("数据库连接成功!") 36 except pymysql.Error as e: 37 print(f"数据库连接失败: {e}") 38 raise 39 40 def create_table(self): 41 """创建数据表(如果不存在)""" 42 create_table_sql = f""" 43 CREATE TABLE IF NOT EXISTS {self.table_name} ( 44 id INT AUTO_INCREMENT PRIMARY KEY, 45 name VARCHAR(50) NOT NULL, 46 age INT NOT NULL, 47 sex ENUM('男', '女') NOT NULL, 48 email VARCHAR(100) NOT NULL, 49 custom_id VARCHAR(20) NOT NULL, 50 address VARCHAR(100) DEFAULT '默认地址', 51 country VARCHAR(20) DEFAULT '中国', 52 member_level VARCHAR(20) DEFAULT 'VIP', 53 register_date DATE DEFAULT '2023-01-01' 54 ) 55 """ 56 try: 57 self.cursor.execute(create_table_sql) 58 self.connection.commit() 59 print(f"表 '{self.table_name}' 已就绪!") 60 except pymysql.Error as e: 61 print(f"创建表失败: {e}") 62 raise 63 64 def insert_one_data(self, data): 65 """插入单条数据""" 66 insert_sql = f""" 67 INSERT INTO {self.table_name} ( 68 name, age, sex, email, custom_id, address, country, member_level, register_date 69 ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) 70 """ 71 try: 72 self.cursor.execute(insert_sql, data) 73 self.connection.commit() 74 except pymysql.Error as e: 75 print(f"插入数据失败: {e}") 76 self.connection.rollback() 77 78 def insert_batch_data(self, batch_size=1000): 79 """批量插入数据(默认每批1000条)""" 80 print(f"开始批量插入数据(每批 {batch_size} 条)...") 81 batch = [] 82 for i in range(10000): # 生成10000条数据 83 data = self.data_generator.get_one_data() 84 # 补充性别字段(原get_one_data()未包含sex,需手动插入到列表的合适位置) 85 full_data = data[:2] + [self.data_generator.get_sex()] + data[2:] 86 batch.append(full_data) 87 88 if len(batch) >= batch_size: 89 self._insert_batch(batch) 90 batch = [] # 清空批次 91 92 # 插入剩余数据 93 if batch: 94 self._insert_batch(batch) 95 96 print("数据插入完成!") 97 98 def _insert_batch(self, batch): 99 """内部方法:执行批量插入""" 100 insert_sql = f""" 101 INSERT INTO {self.table_name} ( 102 name, age, sex, email, custom_id, address, country, member_level, register_date 103 ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) 104 """ 105 try: 106 self.cursor.executemany(insert_sql, batch) 107 self.connection.commit() 108 print(f"已插入 {len(batch)} 条数据") 109 except pymysql.Error as e: 110 print(f"批量插入失败: {e}") 111 self.connection.rollback() 112 113 def close(self): 114 """关闭数据库连接""" 115 if self.cursor: 116 self.cursor.close() 117 if self.connection: 118 self.connection.close() 119 print("数据库连接已关闭") 120 121 122 if __name__ == "__main__": 123 # 示例:连接到本地MySQL数据库(需替换为你的数据库信息) 124 db_inserter = DatabaseInserter( 125 host="localhost", 126 user="your_username", 127 password="your_password", 128 database="your_database", 129 table_name="user_data" 130 ) 131 132 try: 133 db_inserter.connect() 134 db_inserter.create_table() 135 db_inserter.insert_batch_data(batch_size=500) # 每批插入500条 136 finally: 137 db_inserter.close()
调整上述代码实现多线程运行,并记录时间
1 import pymysql 2 import time 3 import random 4 from concurrent.futures import ThreadPoolExecutor 5 from CreateData import CreateData 6 7 8 class DatabaseInserter: 9 def __init__(self, host, user, password, database, table_name="user_data"): 10 self.host = host 11 self.user = user 12 self.password = password 13 self.database = database 14 self.table_name = table_name 15 self.connection = None 16 self.cursor = None 17 self.data_generator = CreateData() 18 19 def connect(self): 20 """连接到数据库""" 21 try: 22 self.connection = pymysql.connect( 23 host=self.host, 24 user=self.user, 25 password=self.password, 26 database=self.database, 27 charset="utf8mb4" 28 ) 29 self.cursor = self.connection.cursor() 30 except pymysql.Error as e: 31 print(f"数据库连接失败: {e}") 32 raise 33 34 def create_table(self): 35 """创建数据表(如果不存在)""" 36 create_table_sql = f""" 37 CREATE TABLE IF NOT EXISTS {self.table_name} ( 38 id INT AUTO_INCREMENT PRIMARY KEY, 39 name VARCHAR(50) NOT NULL, 40 age INT NOT NULL, 41 sex ENUM('男', '女') NOT NULL, 42 email VARCHAR(100) NOT NULL, 43 custom_id VARCHAR(20) NOT NULL, 44 address VARCHAR(100) DEFAULT '默认地址', 45 country VARCHAR(20) DEFAULT '中国', 46 member_level VARCHAR(20) DEFAULT 'VIP', 47 register_date DATE DEFAULT '2023-01-01' 48 ) 49 """ 50 try: 51 self.cursor.execute(create_table_sql) 52 self.connection.commit() 53 except pymysql.Error as e: 54 print(f"创建表失败: {e}") 55 raise 56 57 def insert_batch(self, batch_data): 58 """插入一批数据(线程任务)""" 59 insert_sql = f""" 60 INSERT INTO {self.table_name} ( 61 name, age, sex, email, custom_id, address, country, member_level, register_date 62 ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) 63 """ 64 try: 65 self.cursor.executemany(insert_sql, batch_data) 66 self.connection.commit() 67 return len(batch_data) # 返回成功插入的条数 68 except pymysql.Error as e: 69 print(f"批量插入失败: {e}") 70 self.connection.rollback() 71 return 0 72 73 def close(self): 74 """关闭数据库连接""" 75 if self.cursor: 76 self.cursor.close() 77 if self.connection: 78 self.connection.close() 79 80 81 def generate_data_batch(data_generator, batch_size): 82 """生成一批数据(供线程调用)""" 83 batch = [] 84 for _ in range(batch_size): 85 data = data_generator.get_one_data() 86 full_data = data[:2] + [data_generator.get_sex()] + data[2:] # 插入性别字段 87 batch.append(full_data) 88 return batch 89 90 91 def main(): 92 # 数据库配置(需替换为实际信息) 93 db_config = { 94 "host": "localhost", 95 "user": "your_username", 96 "password": "your_password", 97 "database": "your_database", 98 "table_name": "user_data" 99 } 100 101 # 参数配置 102 TOTAL_RECORDS = 10000 # 总数据量 103 BATCH_SIZE = 500 # 每批数据量 104 THREAD_COUNT = 4 # 线程数(根据CPU核心数调整) 105 106 # 初始化数据库插入器 107 inserter = DatabaseInserter(**db_config) 108 inserter.connect() 109 inserter.create_table() 110 111 # 记录开始时间 112 start_time = time.time() 113 print(f"【开始执行】插入 {TOTAL_RECORDS} 条数据,线程数: {THREAD_COUNT}") 114 115 # 多线程插入数据 116 success_count = 0 117 with ThreadPoolExecutor(max_workers=THREAD_COUNT) as executor: 118 futures = [] 119 for _ in range(0, TOTAL_RECORDS, BATCH_SIZE): 120 # 生成数据批次 121 batch = generate_data_batch(inserter.data_generator, BATCH_SIZE) 122 # 提交线程任务 123 futures.append(executor.submit(inserter.insert_batch, batch)) 124 125 # 等待所有线程完成并统计成功条数 126 for future in futures: 127 success_count += future.result() 128 129 # 记录结束时间 130 end_time = time.time() 131 total_time = end_time - start_time 132 133 # 打印结果 134 print(f"【执行完成】") 135 print(f"成功插入: {success_count}/{TOTAL_RECORDS} 条") 136 print(f"总耗时: {total_time:.2f} 秒") 137 print(f"平均速度: {TOTAL_RECORDS / total_time:.2f} 条/秒") 138 139 # 关闭连接 140 inserter.close() 141 142 143 if __name__ == "__main__": 144 main()
调整上述代码实现多进程完成
1 import pymysql 2 import time 3 import random 4 from multiprocessing import Pool, cpu_count 5 from CreateData import CreateData 6 7 class DatabaseInserter: 8 def __init__(self, host, user, password, database, table_name="user_data"): 9 self.host = host 10 self.user = user 11 self.password = password 12 self.database = database 13 self.table_name = table_name 14 self.connection = None 15 self.cursor = None 16 self.data_generator = CreateData() 17 18 def connect(self): 19 """每个进程独立创建数据库连接""" 20 try: 21 self.connection = pymysql.connect( 22 host=self.host, 23 user=self.user, 24 password=self.password, 25 database=self.database, 26 charset="utf8mb4" 27 ) 28 self.cursor = self.connection.cursor() 29 except pymysql.Error as e: 30 print(f"进程 {id(self)} 数据库连接失败: {e}") 31 raise 32 33 def create_table(self): 34 """创建数据表(如果不存在)""" 35 create_table_sql = f""" 36 CREATE TABLE IF NOT EXISTS {self.table_name} ( 37 id INT AUTO_INCREMENT PRIMARY KEY, 38 name VARCHAR(50) NOT NULL, 39 age INT NOT NULL, 40 sex ENUM('男', '女') NOT NULL, 41 email VARCHAR(100) NOT NULL, 42 custom_id VARCHAR(20) NOT NULL, 43 address VARCHAR(100) DEFAULT '默认地址', 44 country VARCHAR(20) DEFAULT '中国', 45 member_level VARCHAR(20) DEFAULT 'VIP', 46 register_date DATE DEFAULT '2023-01-01' 47 ) 48 """ 49 try: 50 self.cursor.execute(create_table_sql) 51 self.connection.commit() 52 except pymysql.Error as e: 53 print(f"进程 {id(self)} 创建表失败: {e}") 54 raise 55 56 def insert_batch(self, batch_data): 57 """插入一批数据(进程任务)""" 58 if not self.connection: 59 self.connect() # 每个进程独立连接数据库 60 61 insert_sql = f""" 62 INSERT INTO {self.table_name} ( 63 name, age, sex, email, custom_id, address, country, member_level, register_date 64 ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) 65 """ 66 try: 67 self.cursor.executemany(insert_sql, batch_data) 68 self.connection.commit() 69 return len(batch_data) # 返回成功插入的条数 70 except pymysql.Error as e: 71 print(f"进程 {id(self)} 批量插入失败: {e}") 72 self.connection.rollback() 73 return 0 74 finally: 75 self.close() # 每个进程任务完成后关闭连接 76 77 def close(self): 78 """关闭当前进程的数据库连接""" 79 if self.cursor: 80 self.cursor.close() 81 if self.connection: 82 self.connection.close() 83 84 def generate_data_batch(batch_size): 85 """生成一批数据(供进程调用)""" 86 data_generator = CreateData() 87 batch = [] 88 for _ in range(batch_size): 89 data = data_generator.get_one_data() 90 full_data = data[:2] + [data_generator.get_sex()] + data[2:] # 插入性别字段 91 batch.append(full_data) 92 return batch 93 94 def worker_process(args): 95 """工作进程函数(解包参数)""" 96 db_config, batch_data = args 97 inserter = DatabaseInserter(**db_config) 98 return inserter.insert_batch(batch_data) 99 100 def main(): 101 # 数据库配置(需替换为实际信息) 102 db_config = { 103 "host": "localhost", 104 "user": "your_username", 105 "password": "your_password", 106 "database": "your_database", 107 "table_name": "user_data" 108 } 109 110 # 参数配置 111 TOTAL_RECORDS = 10000 # 总数据量 112 BATCH_SIZE = 500 # 每批数据量 113 PROCESS_COUNT = min(cpu_count(), 4) # 进程数(默认不超过CPU核心数或4) 114 115 # 初始化数据库插入器(仅用于生成数据,不共享连接) 116 data_generator = CreateData() 117 118 # 记录开始时间 119 start_time = time.time() 120 print(f"【开始执行】插入 {TOTAL_RECORDS} 条数据,进程数: {PROCESS_COUNT}") 121 122 # 生成所有数据批次(主进程完成) 123 batches = [] 124 for _ in range(0, TOTAL_RECORDS, BATCH_SIZE): 125 batches.append(generate_data_batch(BATCH_SIZE)) 126 127 # 多进程插入数据 128 success_count = 0 129 with Pool(PROCESS_COUNT) as pool: 130 # 将数据库配置和数据批次打包传递给工作进程 131 tasks = [(db_config, batch) for batch in batches] 132 # 使用 map_async 异步执行并获取结果 133 for result in pool.imap_unordered(worker_process, tasks): 134 success_count += result 135 136 # 记录结束时间 137 end_time = time.time() 138 total_time = end_time - start_time 139 140 # 打印结果 141 print(f"【执行完成】") 142 print(f"成功插入: {success_count}/{TOTAL_RECORDS} 条") 143 print(f"总耗时: {total_time:.2f} 秒") 144 print(f"平均速度: {TOTAL_RECORDS / total_time:.2f} 条/秒") 145 146 if __name__ == "__main__": 147 main()
浙公网安备 33010602011771号