Python: write 100M data items to a CSV file in batches

import uuid
import time
from datetime import datetime
import threading
from tkinter import messagebox
import os
import psutil
import pandas as pd



# Module-wide running counter; get_idx() increments it and returns the new value.
idx: int = 0
# Lock that get_idx() holds while it updates `idx`.
idx_lock = threading.Lock()

def get_mem():
    """Return a one-line, comma-separated snapshot of memory usage.

    The line holds, in order: the current timestamp, this process's PID,
    its RSS and VMS (psutil values divided by 1024 twice, labelled ``M``),
    the system's total and available memory (divided by 1024 three times,
    labelled ``G``) and the system's used-memory percentage.
    """
    process_id = os.getpid()
    process_usage = psutil.Process(process_id).memory_info()
    system_usage = psutil.virtual_memory()

    # The timestamp is taken last, after both psutil queries have returned.
    fields = [
        f'{datetime.now()}',
        f'PId:{process_id}',
        f'rss mem:{process_usage.rss/1024/1024:.2f} M',
        f'vms mem:{process_usage.vms/1024/1024:.2f} M',
        f'total mem:{system_usage.total/1024/1024/1024:.2f} G',
        f'avail mem:{system_usage.available/1024/1024/1024:.2f} G',
        f'used percent:{system_usage.percent}%',
    ]
    return ','.join(fields)

def get_idx():
    """Increment the module-level ``idx`` counter and return its new value.

    The increment and the read both happen while ``idx_lock`` is held.
    """
    global idx
    idx_lock.acquire()
    try:
        idx = idx + 1
        return idx
    finally:
        # Runs after the return value has been captured, so the lock is
        # always released even if the increment raises.
        idx_lock.release()

def get_time_uuid():
    """Return a token of the form ``<index>_<timestamp>_<uuid hex>``.

    ``<index>`` comes from ``get_idx()``, ``<timestamp>`` is the current local
    time formatted as ``%Y%m%d%H%M%S%f`` and ``<uuid hex>`` is the hex form of
    a fresh ``uuid.uuid4()``.
    """
    # The strftime pattern uses double quotes: reusing the f-string's own quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    return f'{get_idx()}_{datetime.now().strftime("%Y%m%d%H%M%S%f")}_{uuid.uuid4().hex}'

def get_uuid_time():
    """Return a token of the form ``<index>_<uuid hex>_<timestamp>``.

    ``<index>`` comes from ``get_idx()``, ``<uuid hex>`` is the hex form of a
    fresh ``uuid.uuid4()`` and ``<timestamp>`` is the current local time
    formatted as ``%Y%m%d%H%M%S%f``.
    """
    # The strftime pattern uses double quotes: reusing the f-string's own quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    return f'{get_idx()}_{uuid.uuid4().hex}_{datetime.now().strftime("%Y%m%d%H%M%S%f")}'

class Book:
    """Plain record holding the ten fields of one book row."""

    def __init__(self, id, name, author, abstract, comment, content, isbn, summary, title, topic):
        """Store every argument on the instance under the same name."""
        # `id` shadows the builtin, but it is part of the constructor's
        # keyword interface and therefore kept as-is.
        self.id, self.name, self.author = id, name, author
        self.abstract, self.comment, self.content = abstract, comment, content
        self.isbn, self.summary = isbn, summary
        self.title, self.topic = title, topic

    def to_dict(self):
        """Return the fields as a dict keyed by column name, in column order."""
        return dict(
            Id=self.id,
            Name=self.name,
            Author=self.author,
            Abstract=self.abstract,
            Comment=self.comment,
            Content=self.content,
            ISBN=self.isbn,
            Summary=self.summary,
            Title=self.title,
            Topic=self.topic,
        )
    

# True until the first batch has been written; controls whether the CSV
# header row is emitted.
is_first_headers = True


def write_data_list_to_csv(data_list=None):
    """Append one batch of row dicts to the CSV file and empty the batch.

    Relies on module-level names: ``csv_file`` (output path), ``a`` (current
    loop counter, used only in the progress message) and ``get_mem()``.

    Args:
        data_list: List of row dicts (one per CSV row). It is cleared in
            place once the rows have been copied into a DataFrame, so the
            caller's list is empty on return. ``None`` or an empty list is a
            no-op.
    """
    global is_first_headers

    # Nothing to write. Returning early also avoids flipping the header flag
    # on an empty batch, which would lose the header row for the real data.
    if not data_list:
        return

    df = pd.DataFrame(data_list)
    # Drop the row dicts before the write so they can be reclaimed while the
    # DataFrame is being serialised.
    data_list.clear()

    # newline='' is what pandas asks for when it is handed an open file
    # object; without it, text-mode newline translation can double up line
    # endings on Windows.
    with open(csv_file, 'a+', encoding='utf-8-sig', newline='') as csv_append_file:
        df.to_csv(csv_append_file, index=False, header=is_first_headers)
        # The header has been emitted (or was already); never write it again.
        is_first_headers = False
        print(f'{datetime.now()},a:{a},write to {csv_file},{get_mem()}')

# Rows of the batch currently being built; passed to write_data_list_to_csv().
data_list = []
# Output file name, stamped with the start time. The strftime pattern uses
# double quotes: reusing the f-string's own quote character inside a
# replacement field is a SyntaxError before Python 3.12.
csv_file = f'CSV_{datetime.now().strftime("%Y%m%d%H%M%S%f")}.csv'
arr = range(1, 105000001)
# NOTE: the loop variable must stay named `a`; write_data_list_to_csv() reads
# it as a module-level global for its progress message.
for a in arr:
    bk = Book(a, f'Name_{a}', f'Author_{a}', f'Abstract_{a}', f'Comment_{a}', f'Content_{a}', f'ISBN_{a}_{uuid.uuid4().hex}', f'Summary_{a}', f'Title_{a}', f'Topic_{a}')
    data_list.append(bk.to_dict())
    # Write to disk every 1,000,000 rows so only one batch is held in memory.
    if a % 1000000 == 0:
        write_data_list_to_csv(data_list)
        # The writer already empties the list; clearing again keeps the
        # batch bounded even if that ever changes.
        data_list.clear()

# Write whatever is left when the total is not a multiple of the batch size.
if data_list:
    write_data_list_to_csv(data_list)

print(f'{datetime.now()},write {len(arr)} items to {csv_file}')

 

 

image

 

 

image

 

 

 

image

 

 

 

 

 

 

image

 

posted @ 2026-02-23 16:29  FredGrit  阅读(7)  评论(0)    收藏  举报