多线程调用 LLM API 模板
服务器上用 vllm 部署了多个 Qwen2.5-72B-Instruct 节点,都只支持单次推理,不支持批量推理。
要清洗数据,有上万个单轮请求,通过 GPT 写了一个简单的多线程批量推理脚本,基于线程锁,见下面:
import openai
import concurrent.futures
import threading
import time
from tenacity import retry, wait_fixed, stop_after_attempt
# Client wrapper for one OpenAI-compatible vLLM endpoint.
class OpenAIClient:
    """Holds one API client plus a gate allowing a single in-flight request.

    Each deployed vLLM node only supports one inference at a time, so the
    semaphore serializes access to this endpoint.
    """

    def __init__(self, base_url, api_key):
        self.client = openai.OpenAI(base_url=base_url, api_key=api_key)
        # At most one concurrent request per endpoint.
        self.semaphore = threading.Semaphore(1)

    # Retry up to 3 times, waiting 1 s between attempts.
    @retry(wait=wait_fixed(1), stop=stop_after_attempt(3))
    def get_response(self, message):
        """Send a single-turn chat request.

        Returns a tuple (reply_text, elapsed_seconds). Re-raises any API
        error so tenacity can retry it.
        """
        try:
            t0 = time.time()
            completion = self.client.chat.completions.create(
                model="Qwen2.5-72B-Instruct",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": message},
                ],
            )
            elapsed = time.time() - t0
            return completion.choices[0].message.content, elapsed
        except Exception as e:
            print(f"Error: {e}")
            raise
# Addresses of the deployed vLLM nodes (each serves one request at a time).
endpoints = [
    "http://172.30.8.46:50050/v1",
    "http://172.30.24.236:50050/v1",
    "http://172.30.48.158:50050/v1",
    "http://172.30.16.111:50050/v1",
    "http://172.30.56.78:50050/v1",
    "http://172.30.40.134:50050/v1",
]
# One gated client per endpoint.
clients = [OpenAIClient(base_url=url, api_key="sk-xxxxxxx") for url in endpoints]
# Fan a batch of single-turn prompts out across the clients concurrently.
def send_concurrent_requests(messages):
    """Dispatch each message to a free client and collect the replies.

    Returns a list of reply strings in the same order as ``messages``.
    Raises whatever ``get_response`` raised (after retries) for a message.
    """
    def send_request_to(message):
        # Claim a free client. With max_workers == len(endpoints) a free
        # client should always exist, but wait-and-retry instead of letting
        # `client` stay None and crashing on `None.get_response(...)`.
        client = None
        while client is None:
            for c in clients:
                if c.semaphore.acquire(blocking=False):
                    client = c
                    break
            else:
                time.sleep(0.05)  # all endpoints busy; retry shortly
        try:
            return client.get_response(message)
        finally:
            # Release in `finally`: the original released only on success,
            # so a request that exhausted its retries leaked the slot and
            # could leave later threads with no acquirable client.
            client.semaphore.release()

    with concurrent.futures.ThreadPoolExecutor(max_workers=len(endpoints)) as executor:
        # Bind each future to its message index so results keep input order.
        futures = {
            executor.submit(send_request_to, message): idx
            for idx, message in enumerate(messages)
        }
        results = [None] * len(messages)  # pre-sized; filled by original index
        for future in concurrent.futures.as_completed(futures):
            idx = futures[future]
            result, response_time = future.result()
            results[idx] = result
            # print(f"Response time: {response_time:.2f} seconds")
    return results
# Sample single-turn prompts, repeated 3x to simulate a larger batch.
messages = [
    "What's the weather?",
    "Tell me a joke.",
    "How do you make coffee?",
    "What is the capital of France?",
    "Explain quantum physics in simple terms.",
    "What's the latest technology news?",
] * 3
if __name__ == "__main__":
    # Time the whole batch end to end.
    import time

    t_start = time.time()
    responses = send_concurrent_requests(messages)
    t_end = time.time()
    print(f"Time taken: {t_end - t_start:.2f} seconds")
    # Dump every reply in input order.
    for reply in responses:
        print(reply)
转载请注明出处:
转自 Cold_Chair 的博客,并附上原博客地址。

浙公网安备 33010602011771号