import subprocess
import time
import signal

import requests

from run_vllm.args_235B import experiments  # Import experiment configurations from parameter file

# Configuration section
SERVER_LOG_FILE_TEMPLATE = "server_benchmark_{}.log"  # Each experiment gets its own log file
HEALTH_CHECK_URL = "http://localhost:8001/health"
MAX_RETRY = 1200  # Up to 1200 attempts at 1-second intervals (20-minute startup timeout)
RETRY_INTERVAL = 1  # Check every 1 second
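
# NOTE: the health-check URL hardcodes port 8001, so it must match the --port
# value in each experiment's server_args (all experiments in this run use 8001).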

# Server base command
base_server_cmd = [
    "python3", "-m", "vllm.entrypoints.openai.api_server",
    # "--model", "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
    "--trust-remote-code",
    # "--enable-prefix-caching",
    "--disable-log-requests",
]
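
# Per-experiment server settings (--model, --port, --tensor-parallel-size, etc.)
# come from each experiment's server_args and are merged in by build_command().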

# Client base command
base_client_cmd = [
    "python", "/mnt/ssd/mayiling.myl/vllm/benchmarks/benchmark_serving.py",
    # "--model", "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
    # "--dataset-name", "random",
    # "--random-input-len", "1000",
    # "--random-range-ratio", "0",
    # "--num-prompts", "80",
    "--trust-remote-code",
    "--ignore-eos",
    "--metric-percentiles", "90,99",
]

def build_command(base_cmd, args_dict):
    """Build the final command list from base_cmd plus an argument dictionary.

    A None value marks a boolean flag that takes no argument.
    """
    cmd = base_cmd.copy()
    for key, value in args_dict.items():
        cmd.append(key)
        if value is not None:
            cmd.append(str(value))  # Coerce to str so numeric values are safe for subprocess
    return cmd
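
# Example with hypothetical values:
#   build_command(["vllm"], {"--port": "8001", "--enable-prefix-caching": None})
#   -> ["vllm", "--port", "8001", "--enable-prefix-caching"]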

def start_server(server_cmd, log_file_path):
    """Start the server, redirecting stdout/stderr to a log file.

    Returns the process handle and the open log file; pass both to
    stop_server() for cleanup.
    """
    print("Starting vLLM server...")
    log_file = open(log_file_path, "w")
    server_process = subprocess.Popen(server_cmd, stdout=log_file, stderr=subprocess.STDOUT)
    return server_process, log_file

def wait_for_server():
    """Poll the /health endpoint until the server is ready"""
    print("Waiting for server to start...")
    retry = 0
    while retry < MAX_RETRY:
        try:
            response = requests.get(HEALTH_CHECK_URL, timeout=5)
            if response.status_code == 200:
                print("Server started and ready!")
                return True
        except requests.exceptions.RequestException:
            pass  # Server not up yet; ignore and retry
        time.sleep(RETRY_INTERVAL)
        retry += 1
    print(f"Timeout: server not ready after {MAX_RETRY * RETRY_INTERVAL} seconds.")
    return False

def run_benchmark(client_cmd):
    """Run the benchmark client and report a non-zero exit status"""
    print("Starting benchmark test...")
    result = subprocess.run(client_cmd)
    if result.returncode != 0:
        print(f"Benchmark exited with non-zero status {result.returncode}")

def stop_server(process, log_file):
    """Stop the server gracefully (SIGINT) and close its log file"""
    print("\nStopping vLLM server...")
    process.send_signal(signal.SIGINT)
    try:
        process.wait(timeout=60)  # Grace period for clean shutdown (60 s is an arbitrary choice)
    except subprocess.TimeoutExpired:
        print("Server did not exit after SIGINT; killing it.")
        process.kill()
        process.wait()
    log_file.close()

if __name__ == "__main__":
    for idx, exp in enumerate(experiments):
        print(f"\nRunning experiment group {idx + 1}...")
        log_file_path = SERVER_LOG_FILE_TEMPLATE.format(idx + 1)
        print(f"Current log file: {log_file_path}")
        # Build the server and client commands for the current experiment
        server_cmd = build_command(base_server_cmd, exp["server_args"])
        client_cmd = build_command(base_client_cmd, exp["client_args"])
        print("Server command:", " ".join(server_cmd))
        print("Client command:", " ".join(client_cmd))
        # Start the server
        server_process, log_file = start_server(server_cmd, log_file_path)
        # Wait until the server reports healthy
        if not wait_for_server():
            print(f"Server startup failed; see {log_file_path} for details. Skipping this experiment.")
            stop_server(server_process, log_file)
            continue
        try:
            run_benchmark(client_cmd)
        finally:
            # Always tear the server down, even if the benchmark fails
            stop_server(server_process, log_file)
    print("\nAll experiments completed.")
experiments = [
    # 1
    {
        "server_args": {
            "--max-num-batched-tokens": "16384",
            "--max-model-len": "16384",
            "--long-prefill-token-threshold": "0",
            "--enable-prefix-caching": None,
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            "--tensor-parallel-size": "1",
            "--gpu-memory-utilization": "0.9",
            "--port": "8001",
        },
        "client_args": {
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            # "--dataset-name": "random",
            # "--random-input-len": "200",
            # "--random-output-len": "200",
            # "--random-range-ratio": "0.3",
            # "--seed": "42",
            "--dataset-name": "sharegpt",
            "--dataset-path": "/mnt/ssd/mayiling.myl/dataset/ShareGPT_V3_unfiltered_cleaned_split.json",
            "--max-concurrency": "185",
            "--num-prompts": "80",
            "--append-result": None,
            "--result-filename": "/mnt/ssd/mayiling.myl/output/benchmark_results.json",
            "--port": "8001",
            "--request-rate": "10",
        },
    },
    # 2
    {
        "server_args": {
            "--max-num-batched-tokens": "16384",
            "--max-model-len": "16384",
            "--long-prefill-token-threshold": "0",
            "--enable-prefix-caching": None,
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            "--tensor-parallel-size": "1",
            "--gpu-memory-utilization": "0.9",
            "--port": "8001",
        },
        "client_args": {
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            "--dataset-name": "random",
            "--random-input-len": "200",
            "--random-output-len": "3000",
            "--random-range-ratio": "0.3",
            "--seed": "42",
            "--max-concurrency": "185",
            "--num-prompts": "80",
            "--append-result": None,
            "--result-filename": "/mnt/ssd/mayiling.myl/output/benchmark_results.json",
            "--port": "8001",
        },
    },
    # 2.1
    {
        "server_args": {
            "--max-num-batched-tokens": "16384",
            "--max-model-len": "16384",
            "--long-prefill-token-threshold": "0",
            "--enable-prefix-caching": None,
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            "--tensor-parallel-size": "1",
            "--gpu-memory-utilization": "0.9",
            "--port": "8001",
        },
        "client_args": {
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            "--dataset-name": "random",
            "--random-input-len": "1000",
            "--random-output-len": "200",
            "--random-range-ratio": "0.3",
            "--seed": "42",
            "--max-concurrency": "185",
            "--num-prompts": "80",
            "--append-result": None,
            "--result-filename": "/mnt/ssd/mayiling.myl/output/benchmark_results.json",
            "--port": "8001",
        },
    },
    # 3
    {
        "server_args": {
            "--max-num-batched-tokens": "16384",
            "--max-model-len": "16384",
            "--long-prefill-token-threshold": "0",
            "--enable-prefix-caching": None,
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            "--tensor-parallel-size": "1",
            "--gpu-memory-utilization": "0.9",
            "--port": "8001",
        },
        "client_args": {
            "--model": "/nas_aisw/datasets/checkpoints/LLM/Qwen3-30B-A3B",
            "--dataset-name": "random",
            "--random-input-len": "1000",
            "--random-output-len": "3000",
            "--random-range-ratio": "0.3",
            "--seed": "42",
            "--max-concurrency": "185",
            "--num-prompts": "80",
            "--append-result": None,
            "--result-filename": "/mnt/ssd/mayiling.myl/output/benchmark_results.json",
            "--port": "8001",
        },
    },
]
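
# Each entry follows the same schema: "server_args" extends base_server_cmd and
# "client_args" extends base_client_cmd via build_command(); a None value
# (e.g. --enable-prefix-caching, --append-result) emits the flag without an
# argument. To add an experiment, append another dict with the same two keys.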