vLLM Learning (2): vLLM Startup Flow, Part 1
- vLLM startup flow
Using the simple offline-inference example below as a representative case, this post briefly walks through vLLM's startup flow:
from vllm import LLM, SamplingParams

def main():
    # Define a list of input prompts
    prompts = [
        "你好"
    ]

    # Define sampling parameters
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Initialize the LLM engine with the Qwen3-0.6B model
    llm = LLM(model="Qwen/Qwen3-0.6B")

    # Generate outputs for the input prompts
    outputs = llm.generate(prompts, sampling_params)

    # Print the generated outputs
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

if __name__ == "__main__":
    main()
The `llm = LLM(model="Qwen/Qwen3-0.6B")` call instantiates the `LLM` class. Its constructor loads the preconfigured engine arguments and initializes the `LLMEngine` class:
class LLM:
    def __init__(self, model: str, **kwargs):
        ...
        self.llm_engine = LLMEngine.from_engine_args(
            engine_args=engine_args, usage_context=UsageContext.LLM_CLASS)
        self.engine_class = type(self.llm_engine)
        ...

def from_engine_args(cls, engine_args, usage_context, stat_loggers=None):
    ...
    if envs.VLLM_USE_V1:
        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
        engine_cls = V1LLMEngine

    # When vLLM V1 is enabled, engine_cls (a class object) is V1LLMEngine
    return engine_cls.from_vllm_config(
        vllm_config=vllm_config,
        usage_context=usage_context,
        stat_loggers=stat_loggers,
        disable_log_stats=engine_args.disable_log_stats,
    )
    ...

def from_vllm_config(
    # cls is passed implicitly and refers to the class on which the
    # classmethod is invoked -- here V1LLMEngine -- not an instance
    cls,
    vllm_config: VllmConfig,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    stat_loggers: Optional[list[StatLoggerFactory]] = None,
    disable_log_stats: bool = False,
) -> "LLMEngine":
    # Calls the V1LLMEngine constructor
    return cls(vllm_config=vllm_config,
               executor_class=Executor.get_class(vllm_config),
               log_stats=(not disable_log_stats),
               usage_context=usage_context,
               stat_loggers=stat_loggers,
               multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING)
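The comment above about `cls` is worth unpacking: in a classmethod, `cls` is bound to whichever class the method is invoked on, so the same factory code can construct different engine types. A minimal toy illustration (not vLLM code):

# Toy example: a factory classmethod returns an instance of whatever
# class it is invoked on, because `cls` is the class itself, not an instance.
class BaseEngine:
    def __init__(self, config: dict):
        self.config = config

    @classmethod
    def from_config(cls, config: dict):
        return cls(config)

class V1Engine(BaseEngine):
    pass

engine = V1Engine.from_config({"model": "Qwen/Qwen3-0.6B"})
print(type(engine).__name__)  # -> V1Engine, not BaseEngine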
class LLMEngine:
    def __init__(self, vllm_config, executor_class, log_stats,
                 multiprocess_mode=False, **kwargs):
        ...
        self.engine_core = EngineCoreClient.make_client(
            multiprocess_mode=multiprocess_mode,
            asyncio_mode=False,
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=self.log_stats,
        )
        ...
class EngineCoreClient(ABC):
    """
    Subclasses:
    * InprocClient: In process EngineCore (for V0-style LLMEngine use)
    * SyncMPClient: ZMQ + background proc EngineCore (for LLM)
    * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM)
    """

    @staticmethod
    def make_client(multiprocess_mode, asyncio_mode, vllm_config,
                    executor_class, log_stats):
        if asyncio_mode and not multiprocess_mode:
            raise NotImplementedError(
                "Running EngineCore in asyncio without multiprocessing "
                "is not currently supported.")

        if multiprocess_mode and asyncio_mode:
            return EngineCoreClient.make_async_mp_client(
                vllm_config, executor_class, log_stats)

        if multiprocess_mode and not asyncio_mode:
            return SyncMPClient(vllm_config, executor_class, log_stats)

        return InprocClient(vllm_config, executor_class, log_stats)
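The dispatch table is easiest to see in isolation. Here is a standalone sketch with stub classes standing in for the real clients (illustrative only, not vLLM code):

# Stub re-creation of make_client's mode table; the three stub classes
# stand in for vLLM's InprocClient / SyncMPClient / AsyncMPClient.
class InprocClient: ...
class SyncMPClient: ...
class AsyncMPClient: ...

def make_client(multiprocess_mode: bool, asyncio_mode: bool):
    if asyncio_mode and not multiprocess_mode:
        raise NotImplementedError("asyncio without multiprocessing")
    if multiprocess_mode and asyncio_mode:
        return AsyncMPClient()   # online serving via AsyncLLM
    if multiprocess_mode:
        return SyncMPClient()    # offline inference via LLM
    return InprocClient()        # engine runs in the same process

assert isinstance(make_client(True, False), SyncMPClient)   # our example's path
assert isinstance(make_client(False, False), InprocClient)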
This differs a little from the vLLM architecture described in the previous article: the code introduces an `EngineCoreClient` class, with different `EngineCoreClient` subclasses for different inference modes. To decouple the inference engine, and in particular to support efficient task dispatch across processes, containers, or even nodes, vLLM's implementation adopts a producer (`EngineCoreClient`) / consumer (`EngineCore`) model. Here is a detailed breakdown of their relationship, with a runnable sketch of the pattern after the two lists:
EngineCoreClient (client / proxy):
- It is the external-facing interface of `EngineCore`, running in the same process as the API server (e.g., the OpenAI API entry point).
- It never touches the GPU directly and does no complex scheduling.
- Its main job is forwarding: it receives incoming user requests (e.g., HTTP), packages and sends them to `EngineCore`, and receives the inference results back.

EngineCore (server / kernel):
- It is the real executor, running in the process that hosts the model executor.
- It holds the `Scheduler` and the `Executor`, directly managing GPU memory blocks (the KV cache) and model inference tasks.
- It handles the heavy lifting of request state management, token generation, and packaging of outputs.
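To make the producer/consumer split concrete, here is a minimal runnable sketch using pyzmq PUSH/PULL sockets across two processes. It mimics only the shape of the interaction; the addresses, message format, and `engine_core` function are made up for illustration and bear no relation to vLLM's real wire protocol:

# Toy producer/consumer over ZMQ: the main process plays the role of
# EngineCoreClient (pushes requests, pulls results), the child process
# plays EngineCore (busy loop pulling requests, pushing results).
import multiprocessing as mp
import zmq

INPUT_ADDR = "tcp://127.0.0.1:15555"
OUTPUT_ADDR = "tcp://127.0.0.1:15556"

def engine_core():
    """Consumer: busy loop pulling requests and pushing results."""
    ctx = zmq.Context()
    inp = ctx.socket(zmq.PULL)
    inp.connect(INPUT_ADDR)
    out = ctx.socket(zmq.PUSH)
    out.connect(OUTPUT_ADDR)
    while True:
        req = inp.recv_json()
        if req.get("stop"):
            break
        out.send_json({"id": req["id"], "text": f"echo: {req['prompt']}"})

def main():
    ctx = zmq.Context()
    inp = ctx.socket(zmq.PUSH)   # producer side: push requests down
    inp.bind(INPUT_ADDR)
    out = ctx.socket(zmq.PULL)   # producer side: pull results back
    out.bind(OUTPUT_ADDR)

    proc = mp.Process(target=engine_core, name="ToyEngineCore")
    proc.start()

    inp.send_json({"id": 0, "prompt": "你好"})
    print(out.recv_json())       # {'id': 0, 'text': 'echo: 你好'}

    inp.send_json({"id": 1, "stop": True})
    proc.join()

if __name__ == "__main__":
    main()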
Since the example code uses offline inference, the code walkthrough below follows the offline-inference path.
class SyncMPClient(MPClient):
    """Synchronous client for multi-proc EngineCore."""

    def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
                 log_stats: bool):
        super().__init__(
            asyncio_mode=False,
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=log_stats,
        )
class MPClient(EngineCoreClient):
    """
    MPClient: base client for multi-proc EngineCore.

    EngineCore runs in a background process busy loop, getting
    new EngineCoreRequests and returning EngineCoreOutputs

    * pushes EngineCoreRequests via input_socket
    * pulls EngineCoreOutputs via output_socket

    * AsyncMPClient subclass for AsyncLLM usage
    * SyncMPClient subclass for LLM usage
    """

    def __init__(
        self,
        asyncio_mode: bool,
        vllm_config: VllmConfig,
        executor_class: type[Executor],
        log_stats: bool,
        client_addresses: Optional[dict[str, str]] = None,
    ):
        ...
        with launch_core_engines(vllm_config, executor_class,
                                 log_stats) as (engine_manager,
                                                coordinator,
                                                addresses):
            self.resources.coordinator = coordinator
            self.resources.engine_manager = engine_manager
        ...
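Note the `with launch_core_engines(...) as (engine_manager, coordinator, addresses):` shape: a context manager that yields a tuple of handles and tears the resources down on exit. A generic sketch of that pattern, with purely illustrative names:

# Sketch of the context-manager shape used above: a generator decorated
# with @contextmanager starts background resources, yields handles as a
# tuple, and cleans up on exit. Names here are stand-ins, not vLLM's.
from contextlib import contextmanager

@contextmanager
def launch_toy_engines(n: int):
    engine_manager = [f"engine-{i}" for i in range(n)]  # stand-in for proc manager
    coordinator = None                                  # stand-in for DP coordinator
    addresses = {"input": "tcp://127.0.0.1:15555",
                 "output": "tcp://127.0.0.1:15556"}
    try:
        yield engine_manager, coordinator, addresses
    finally:
        engine_manager.clear()  # stand-in for shutting the processes down

with launch_toy_engines(2) as (engine_manager, coordinator, addresses):
    print(engine_manager, addresses["input"])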
def launch_core_engines(vllm_config, executor_class, log_stats):
    """Launch engine and DP coordinator processes as needed."""
    ...
    # Start local engines.
    if local_engine_count:
        local_engine_manager = CoreEngineProcManager(
            EngineCoreProc.run_engine_core,
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=log_stats,
            handshake_address=handshake_address,
            client_handshake_address=client_handshake_address,
            local_client=True,
            local_engine_count=local_engine_count,
            start_index=dp_rank,
            local_start_index=local_start_index or 0)
    else:
        local_engine_manager = None
class CoreEngineProcManager:
    """
    Utility class to handle creation, readiness, and shutdown
    of background processes used by the AsyncLLM and LLMEngine.
    """

    def __init__(self, target_fn, **kwargs):
        ...
        self.processes.append(
            # target_fn is the callback passed in from the previous step:
            # EngineCoreProc.run_engine_core
            context.Process(target=target_fn,
                            name=f"EngineCore_{global_index}",
                            kwargs=common_kwargs | {
                                "dp_rank": global_index,
                                "local_dp_rank": local_index,
                            }))
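Two details here: `context` is a multiprocessing context, so the start method (e.g. "spawn") is explicit, and `common_kwargs | {...}` uses the Python 3.9+ dict-union operator to overlay per-process values on shared ones. A self-contained sketch of the same pattern, with a trivial stand-in for `target_fn`:

# Standalone sketch of the spawning pattern above: a "spawn" multiprocessing
# context plus dict-union (|) to build per-process kwargs from shared ones.
import multiprocessing

def target_fn(name: str, dp_rank: int, local_dp_rank: int):
    print(f"{name}: dp_rank={dp_rank}, local_dp_rank={local_dp_rank}")

if __name__ == "__main__":
    context = multiprocessing.get_context("spawn")
    common_kwargs = {"name": "toy-engine"}   # shared by every process
    processes = []
    for global_index, local_index in [(0, 0), (1, 1)]:
        processes.append(
            context.Process(target=target_fn,
                            name=f"EngineCore_{global_index}",
                            # overlay per-process values on the shared kwargs
                            kwargs=common_kwargs | {
                                "dp_rank": global_index,
                                "local_dp_rank": local_index,
                            }))
    for p in processes:
        p.start()
    for p in processes:
        p.join()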
class EngineCoreProc(EngineCore):
    @staticmethod
    def run_engine_core(*args, **kwargs):
        """Launch EngineCore busy loop in background process."""
        engine_core = EngineCoreProc(*args, **kwargs)
        ...
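The "busy loop" named in the docstring is the heart of the child process: it repeatedly pulls requests, steps the engine, and pushes outputs. A schematic stand-in using plain queues (the real EngineCoreProc uses the ZMQ sockets and V1 Scheduler described above):

# Schematic busy loop: block on the input channel, process, emit output,
# and stop on a sentinel. queue.Queue is only a stand-in for the sockets.
import queue

class ToyEngineCoreProc:
    def __init__(self, input_q: queue.Queue, output_q: queue.Queue):
        self.input_q = input_q
        self.output_q = output_q

    def run_busy_loop(self):
        while True:
            req = self.input_q.get()   # blocks until a request arrives
            if req is None:            # sentinel -> shut down
                break
            self.output_q.put(f"processed: {req}")

input_q, output_q = queue.Queue(), queue.Queue()
input_q.put("req-1")
input_q.put(None)
ToyEngineCoreProc(input_q, output_q).run_busy_loop()
print(output_q.get())  # -> processed: req-1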
In the vLLM V1 architecture, `EngineCoreProc` is the process-level implementation of `EngineCore`. Once `launch_core_engines` has spawned the engine processes and completed the handshake, the inference core is fully initialized. The overall call chain:
LLM.__init__
└── LLMEngine.from_engine_args
└── V1LLMEngine.from_vllm_config
└── V1LLMEngine.__init__
└── EngineCoreClient.make_client
├── multiprocess_mode = True
└── new SyncMPClient
└── MPClient.__init__
└── launch_core_engines
└── CoreEngineProcManager
└── Process(
target=EngineCoreProc.run_engine_core
)
↓ child process
EngineCoreProc.run_engine_core
└── EngineCore(...)
The core collaboration:
- Interface layer: `EngineCoreClient` is exposed to the upper-level `LLM` or `AsyncLLM` classes and acts as the proxy for all external access.
- Communication layer: the two sides talk over an efficient RPC mechanism (ZMQ), pushing requests down and streaming results back.
- Execution layer: `EngineCore` instantiates the `Scheduler` and `Executor` in a separate background process, focusing on KV cache maintenance, model task scheduling, and worker management.

Architectural benefit: this out-of-process design decouples the API-serving logic from the inference-scheduling logic. The core scheduler can run its loop at high frequency without being disturbed by front-end HTTP traffic spikes or complex async I/O, which keeps latency low under large-scale concurrency.
┌────────────────────────────────┐
│       Python main process      │
│                                │
│  LLM                           │
│   └── LLMEngine                │
│        └── EngineCoreClient    │ ← producer / proxy
│             (SyncMPClient)     │
│                  │             │
│     ZMQ / IPC communication    │
│                  │             │
└──────────────────┼─────────────┘
                   │
┌──────────────────▼─────────────┐
│    EngineCore child process    │
│                                │
│  EngineCore                    │ ← consumer / executor
│   ├── Scheduler                │
│   ├── Executor                 │
│   ├── KV Cache Manager         │
│   └── Model Runner (GPU)       │
└────────────────────────────────┘