Downloading Data Concurrently

If you want to download the data for every point in parallel instead of sequentially, you can introduce parallel processing to improve efficiency. Python offers several ways to do this, including the concurrent.futures module, the multiprocessing module, and asyncio.

Using the concurrent.futures module

concurrent.futures provides a simple interface for parallelizing tasks. You can use ThreadPoolExecutor or ProcessPoolExecutor to create a parallel download task for each piece of data.
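To see the basic pattern in isolation first, here is a minimal, self-contained sketch (download_one is a placeholder standing in for any I/O-bound download function; it is not part of the script below):

import concurrent.futures

def download_one(item):
    # Placeholder for a blocking, I/O-bound download.
    return f"downloaded {item}"

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(download_one, item) for item in ["a", "b", "c"]]
    for future in concurrent.futures.as_completed(futures):
        print(future.result())  # re-raises any exception from the worker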

Here is one way you could modify your full script:

Modified code example (using ThreadPoolExecutor):

import concurrent.futures
import os
import sys
from datetime import datetime
from pathlib import Path

import cdsapi

def read_interpolated_typhoons(filename):
    """Read the python interpolated typhoon trajectory file and return structured data."""
    typhoons = {}
    current_typhoon = None
    
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
                
            if line.startswith('Typhoon'):
                current_typhoon = line.split(' ')[1]
                typhoons[current_typhoon] = []
                continue
                
            if line.startswith('DateTime:'):
                try:
                    # Parse datetime, latitude, and longitude
                    parts = line.split(',')
                    dt_str = parts[0].split('DateTime: ')[1].strip()
                    lat = float(parts[1].split('Lat: ')[1].replace('°N', '').strip())
                    lon = float(parts[2].split('Lon: ')[1].replace('°E', '').strip())
                    
                    # Convert datetime string to datetime object
                    dt = datetime.strptime(dt_str, '%Y-%m-%d %H:00')
                    
                    if current_typhoon:
                        typhoons[current_typhoon].append({
                            'datetime': dt,
                            'lat': lat,
                            'lon': lon
                        })
                except Exception as e:
                    print(f"Error parsing line: {line}")
                    print(f"Error: {e}")
                    
    return typhoons

def check_cdsapi_configuration():
    """Check if CDS API configuration exists and is properly set up."""
    config_path = Path.home() / '.cdsapirc'
    
    # Define expected credentials
    expected_url = "https://cds.climate.copernicus.eu/api"
    expected_key = "bb32da10-3bd6-4909-82e8-7bd08b44ca7f"
    
    if not config_path.exists():
        print("Creating CDS API configuration file...")
        with open(config_path, 'w') as f:
            f.write(f"url: {expected_url}\n")
            f.write(f"key: {expected_key}\n")
        print(f"Created configuration file at {config_path}")
    
    # Check if file contains required fields
    with open(config_path) as f:
        content = f.read().lower()
        if 'url' not in content or 'key' not in content:
            print("Error: CDS API configuration file is incomplete!")
            print("Please ensure it contains both 'url' and 'key' fields.")
            sys.exit(1)
    
    try:
        # Test API connection
        client = cdsapi.Client()
        return client
    except Exception as e:
        print(f"Error connecting to CDS API: {e}")
        print("Please check your API key and internet connection.")
        sys.exit(1)

def download_era5_data(client, dt, save_dir):
    """Download ERA5 data for a specific datetime and location."""
    params = [
        {
            'param': '10m_u_component_of_wind',
            'variable': 'u-wind_10m',
            'dataset': 'reanalysis-era5-single-levels'
        },
        {
            'param': '10m_v_component_of_wind',
            'variable': 'v-wind_10m',
            'dataset': 'reanalysis-era5-single-levels'
        },
        {
            'param': '100m_u_component_of_wind',
            'variable': 'u-wind_100m',
            'dataset': 'reanalysis-era5-single-levels'
        },
        {
            'param': '100m_v_component_of_wind',
            'variable': 'v-wind_100m',
            'dataset': 'reanalysis-era5-single-levels'
        },
        {
            'param': 'geopotential',
            'level': '250',
            'variable': 'geopotential_250hPa',
            'dataset': 'reanalysis-era5-pressure-levels'
        }
    ]
    
    for param in params:
        param_code = param['param']
        variable = param['variable']
        dataset = param['dataset']
        
        request = {
            'product_type': 'reanalysis',
            'variable': [param_code],
            'year': str(dt.year),
            'month': str(dt.month).zfill(2),
            'day': str(dt.day).zfill(2),
            'time': [f"{dt.hour:02d}:00"],
            'format': 'grib'
        }
        
        if 'level' in param:
            request['pressure_level'] = [param['level']]
        
        temp_grib = os.path.join(save_dir, f'temp_{variable}.grib')
        npy_file = os.path.join(save_dir, f'{variable}.npy')  # target for a GRIB-to-.npy conversion step not shown in this excerpt
        
        try:
            print(f"Downloading {variable} for {dt}...")
            client.retrieve(dataset, request, temp_grib)
            
        except Exception as e:
            print(f"Error processing {variable}: {e}")
            if os.path.exists(temp_grib):
                os.remove(temp_grib)

def process_typhoon_data(client, typhoon_name, points):
    """Process and download data for a given typhoon and its points."""
    typhoon_dir = f'/diska/typhoon_data/{typhoon_name}'
    os.makedirs(typhoon_dir, exist_ok=True)

    # Process each time point
    for i, point in enumerate(points):
        dt = point['datetime']
        
        # Create directory for this time point
        time_dir = f'{typhoon_dir}/time_{i:03d}'
        os.makedirs(time_dir, exist_ok=True)

        print(f"\nDownloading data for time point {i} ({dt})")
        download_era5_data(client, dt, time_dir)

def main():
    # First check CDS API configuration
    print("Checking CDS API configuration...")
    client = check_cdsapi_configuration()
    print("CDS API configuration verified successfully!")
    
    # Read interpolated typhoon data
    typhoons = read_interpolated_typhoons('/home/dodo/Desktop/typhoon/interpolated_typhoons.txt')
    
    # Get list of typhoons
    typhoon_names = list(typhoons.keys())
    
    # Create a thread pool to download data concurrently
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # For each typhoon, submit its download process to the executor
        futures = []
        for typhoon_name in typhoon_names:
            points = typhoons[typhoon_name]
            futures.append(executor.submit(process_typhoon_data, client, typhoon_name, points))
        
        # Wait for all futures to complete
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # This will re-raise any exceptions caught in the worker thread
            except Exception as e:
                print(f"Error occurred: {e}")

if __name__ == "__main__":
    main()

Code walkthrough:

  1. ThreadPoolExecutor:

    • ThreadPoolExecutor is used to parallelize the processing of each typhoon. Each typhoon's processing is an independent task, submitted to the thread pool via executor.submit().
    • ThreadPoolExecutor is thread-based, so it suits I/O-bound tasks (such as downloading data) rather than CPU-bound ones. If you have a large number of parallel tasks, ThreadPoolExecutor manages the pool of threads for you automatically.
  2. The futures list:

    • Each typhoon's download task (process_typhoon_data) is submitted to the thread pool; executor.submit() returns a Future object representing the asynchronously running task.
    • concurrent.futures.as_completed() is used to wait for each task and handle it as it completes.
  3. Parallel downloading:

    • process_typhoon_data is called once per typhoon, so the typhoons download in parallel; within each typhoon the time points are still processed sequentially. A per-point variant is sketched just after this list.
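If you want parallelism at the level of individual time points rather than whole typhoons, one possible variant is to flatten every (typhoon, point) pair into its own task. This is a sketch built on the functions above; download_point is a hypothetical helper, and the directory layout mirrors process_typhoon_data:

def download_point(client, typhoon_name, i, point):
    # Hypothetical helper: one task per time point, same layout as above.
    time_dir = f'/diska/typhoon_data/{typhoon_name}/time_{i:03d}'
    os.makedirs(time_dir, exist_ok=True)
    download_era5_data(client, point['datetime'], time_dir)

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(download_point, client, name, i, point)
        for name, points in typhoons.items()
        for i, point in enumerate(points)
    ]
    for future in concurrent.futures.as_completed(futures):
        future.result()  # surface any worker exception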

Notes:

  • You may need to tune the thread pool size to keep a large number of threads from exhausting system resources. ThreadPoolExecutor manages its threads automatically, but you can also cap them explicitly, e.g. ThreadPoolExecutor(max_workers=10).
  • If you instead need efficient parallelism for CPU-bound work, consider replacing ThreadPoolExecutor with ProcessPoolExecutor; a sketch follows below.
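The swap is mostly mechanical, with one caveat: everything submitted to a ProcessPoolExecutor must be picklable, and a cdsapi.Client is unlikely to survive pickling, so each worker process should create its own client. A sketch under that assumption (process_typhoon_in_subprocess is a hypothetical wrapper):

def process_typhoon_in_subprocess(typhoon_name, points):
    # Hypothetical wrapper: build the client inside the worker process,
    # since a cdsapi.Client is unlikely to be picklable.
    client = cdsapi.Client()
    process_typhoon_data(client, typhoon_name, points)

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    futures = [
        executor.submit(process_typhoon_in_subprocess, name, points)
        for name, points in typhoons.items()
    ]
    for future in concurrent.futures.as_completed(futures):
        future.result()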

With this in place, your code downloads the data in parallel instead of strictly one item after another.
