LLM Performance Benchmarking

import asyncio
import time
from typing import List

import psutil
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ======================================
# [You only need to change this line: the model path]
# ======================================
MODEL_PATH = "Qwen/Qwen2-7B-Instruct"  # local model path / HuggingFace model name
TEST_PROMPT = "请介绍一下人工智能。"  # test prompt ("Please introduce artificial intelligence.")
GEN_MAX_TOKENS = 512  # maximum number of new tokens to generate
TEST_ROUNDS = 3  # number of test rounds (results are averaged)
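# For a local checkout, point MODEL_PATH at the model directory instead
# (hypothetical path, shown only as an example):
# MODEL_PATH = "/data/models/Qwen2-7B-Instruct"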

class LLMPerfBenchmark:
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = None
        self.model = None

    def load_model(self):
        """Load the model and report initial GPU memory usage."""
        print(f"[+] Loading model: {self.model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype="auto",
            device_map="auto",
            trust_remote_code=True,
        )
        self.print_gpu_memory("model loaded")

    def print_gpu_memory(self, tag: str):
        """Print current GPU memory usage."""
        if torch.cuda.is_available():
            allocated = round(torch.cuda.memory_allocated() / 1024**3, 2)
            reserved = round(torch.cuda.memory_reserved() / 1024**3, 2)
            print(f"📊 [{tag}] GPU memory: {allocated} GB allocated | {reserved} GB reserved")

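    # Optional companion helper: host-RAM usage of the current process via
    # psutil (resident set size only). A minimal sketch; the benchmark below
    # does not call it, but it mirrors print_gpu_memory for CPU-only runs.
    def print_cpu_memory(self, tag: str):
        """Print resident host memory (RSS) of this process."""
        rss = round(psutil.Process().memory_info().rss / 1024**3, 2)
        print(f"📊 [{tag}] host RAM (RSS): {rss} GB")
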
    def generate_sync(self, prompt: str, max_tokens: int):
        """Synchronous inference with end-to-end performance statistics."""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        input_len = inputs["input_ids"].shape[1]

        # Warm-up run so kernel compilation and cache allocation do not skew round 1.
        self.model.generate(**inputs, max_new_tokens=1, pad_token_id=self.tokenizer.eos_token_id)

        total_times: List[float] = []
        output_lens: List[int] = []

        for i in range(TEST_ROUNDS):
            print(f"\n--- Round {i + 1}/{TEST_ROUNDS} ---")
            if self.device == "cuda":
                torch.cuda.synchronize()
            s = time.time()

            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                pad_token_id=self.tokenizer.eos_token_id,
                do_sample=False,  # greedy decoding, so no temperature is needed
            )

            if self.device == "cuda":
                torch.cuda.synchronize()
            total_time = time.time() - s
            output_ids = outputs[0][input_len:]
            output_len = len(output_ids)
            text = self.tokenizer.decode(output_ids, skip_special_tokens=True)

            # Per-round metrics. Note this is whole-request latency; true
            # time-to-first-token requires streaming (see measure_ttft below).
            latency_ms = round(total_time * 1000, 2)
            tps = round(output_len / total_time, 2)
            tpt = round((total_time / output_len) * 1000, 2) if output_len else 0

            print(f"Input tokens: {input_len}")
            print(f"Output tokens: {output_len}")
            print(f"⚡ End-to-end latency: {latency_ms} ms")
            print(f"🚀 Generation speed: {tps} tokens/s")
            print(f"⏱ Per-token latency: {tpt} ms")
            self.print_gpu_memory(f"round {i + 1} inference")

            total_times.append(total_time)
            output_lens.append(output_len)

        # Averages across rounds (the reason TEST_ROUNDS exists).
        avg_ms = round(sum(total_times) / len(total_times) * 1000, 2)
        avg_tps = round(sum(output_lens) / sum(total_times), 2)
        print(f"\n=== Average over {TEST_ROUNDS} rounds: {avg_ms} ms | {avg_tps} tokens/s ===")

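    # generate_sync reports whole-request latency only. Measuring true
    # time-to-first-token needs token streaming; below is a minimal sketch
    # using transformers' TextIteratorStreamer (the method name and return
    # value are our own additions, not part of the original benchmark).
    def measure_ttft(self, prompt: str, max_tokens: int) -> float:
        """Return time-to-first-token in milliseconds for one streamed run."""
        from threading import Thread
        from transformers import TextIteratorStreamer

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        gen_kwargs = dict(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            pad_token_id=self.tokenizer.eos_token_id,
            streamer=streamer,
        )
        # generate() blocks, so run it in a worker thread and consume the
        # streamer here; the first decoded chunk marks the first token.
        thread = Thread(target=self.model.generate, kwargs=gen_kwargs)
        start = time.time()
        thread.start()
        next(iter(streamer))  # blocks until the first chunk arrives
        ttft_ms = (time.time() - start) * 1000
        for _ in streamer:  # drain the remainder so generation can finish
            pass
        thread.join()
        print(f"⚡ TTFT (streamed): {round(ttft_ms, 2)} ms")
        return ttft_ms
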
    async def generate_async(self, prompts: List[str], max_tokens: int):
        """Concurrent load test (optional)."""
        print(f"\n=== Concurrent load test ({len(prompts)} streams) ===")
        tasks = [self._single_async_task(p, max_tokens) for p in prompts]
        await asyncio.gather(*tasks)

    async def _single_async_task(self, prompt, max_tokens):
        # generate_sync is blocking; hand it to a worker thread so the event
        # loop can actually interleave requests (requires Python 3.9+).
        await asyncio.to_thread(self.generate_sync, prompt, max_tokens)

# ======================================
# Main entry point
# ======================================
if __name__ == "__main__":
    benchmark = LLMPerfBenchmark(MODEL_PATH)

    print("=" * 60)
    print(" LLM Performance Benchmark V1.0")
    print("=" * 60)

    # 1. Load the model
    benchmark.load_model()

    # 2. Basic performance test
    print("\n" + "=" * 50)
    print("Starting basic performance test...")
    print("=" * 50)
    benchmark.generate_sync(TEST_PROMPT, GEN_MAX_TOKENS)

    # 3. Concurrency test (optional)
    # asyncio.run(benchmark.generate_async([TEST_PROMPT] * 3, GEN_MAX_TOKENS))

    print("\n✅ Benchmark finished!")
