【Python】Deploying and Calling Qwen3-VL with vLLM
1. Create the environment
conda create -n vllm python=3.10
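Then activate it before installing anything into it:
conda activate vllm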
2. Install vLLM
pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
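The --pre flag plus the nightly wheel index pulls a pre-release build, since Qwen3-VL support may not have landed in the latest stable release yet. A quick sanity check that the install worked:
python -c "import vllm; print(vllm.__version__)"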
3. Download the model
modelscope download --model Qwen/Qwen3-VL-32B-Instruct
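By default the ModelScope CLI caches the weights under ~/.cache/modelscope. If you prefer an explicit directory (assuming a reasonably recent modelscope version, which supports --local_dir):
modelscope download --model Qwen/Qwen3-VL-32B-Instruct --local_dir ./Qwen3-VL-32B-Instruct
If you do this, point vllm serve at that local path instead of the model ID.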
4. Launch the model server
vllm serve Qwen/Qwen3-VL-32B-Instruct --tensor-parallel-size 2 --max-model-len 16384
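--tensor-parallel-size 2 shards the 32B model across two GPUs, and --max-model-len 16384 caps the context length to keep KV-cache memory in check. Once the server logs that it is ready, you can confirm the OpenAI-compatible endpoint is up:
curl http://127.0.0.1:8000/v1/models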
5. Call the model: read a local image, encode it as base64, and pass it to the model
import time
import base64

from openai import OpenAI

client = OpenAI(
    api_key="EMPTY",  # vLLM does not check the API key by default
    base_url="http://127.0.0.1:8000/v1",
    timeout=3600
)

# Handle multimodal input (example: text + image):
# read a local image and encode it as base64
def image_to_base64(image_path):
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode('utf-8')

messages = [{"role": "system", "content": "You are a helpful assistant. Make sure all information you provide is accurate, truthful, and reliable."}]
messages.append(
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image"},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_to_base64('demo.jpeg')}"
                }
            }
        ]
    }
)

start = time.time()
response = client.chat.completions.create(
    model="Qwen/Qwen3-VL-32B-Instruct",
    messages=messages,
    stream=True,
    max_tokens=4096
)

full_response = ""
for chunk in response:
    if chunk.choices[0].delta.content is not None:
        content = chunk.choices[0].delta.content
        print(content, end="", flush=True)  # print in real time without a trailing newline
        full_response += content

print(f"\n[elapsed: {time.time() - start:.1f}s]")
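Base64 data URLs are convenient for local files, but the image_url field also accepts a plain HTTP(S) URL, which the server fetches itself (assuming it has outbound network access). A minimal non-streaming variant; the image URL below is a placeholder:

response = client.chat.completions.create(
    model="Qwen/Qwen3-VL-32B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image"},
            {"type": "image_url", "image_url": {"url": "https://example.com/demo.jpeg"}}  # placeholder URL
        ]
    }],
    max_tokens=4096
)
print(response.choices[0].message.content)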
