LMdeploy 执行效率高于VLLM探究

VLLM和LMdeploy 各有优劣

VLLM功能多，开发者多，代码比较容易懂也比较容易修改，能适配不同的平台

LMdeploy执行效率高，开发者少，文档几乎没有，只支持NVIDIA

公司实际上线，肯定要博采众长，既要有LMdeploy的执行效率，也要参考VLLM的各种功能实现。

LMdeploy的执行效率是要高于VLLM的，尤其是对于int4量化的模型，效率能相差30%左右。

下面是网上找到的两个测试图。

自己测试没有上图差距那么夸张，差距25%左右，可能是网上的测试VLLM的版本比较老。

lmdeploy

vllm

通过查阅资料和定位代码了解到，lmdeploy的后端turbomind执行效率高于vllm的后端，其中最核心的是两者int4量化实现方式的差别。

VLLM对AWQ的模型没有原生支持[强制使用awq模式性能非常差]，需要转换成AWQ_marlin，然后调用对应的cuda kernel计算，核心的kernel是gptq_marlin_gemm

turbomind，原生集成了AWQ的kernel，都放在GemmPool里面，调用的时候gemm->Run，这里拿出了turbomind的linear模块，测试turbomind对于awq int4量化的性能。

下面放2段测试代码（时间关系，不是很严谨）：turbomind对于awq int4的执行耗时35s，对比之下vllm的int4 gemm执行耗时48s，代码贴在下面了。可以看出两者实现效率确实存在差距。后面文章会通过分析两者底层实现的差距，找到原因。内容会涉及cutlass之类的底层内容。

test_for_turbomind

import torch
import torch.nn as nn
import turbomind as tm
torch.manual_seed(0)


def i32x8_to_i4x8(w):
    """Merge 8 integers (range from 0 to 15) into one 32-bit integer.

    The elements of each row of ``w`` are taken in consecutive groups of
    eight. Every value is masked to its low 4 bits and the eight nibbles are
    OR-ed together: the first element of a group lands in the most
    significant nibble (shift 28), the last one in the least significant
    nibble (shift 0).

    Args:
        w: Integer tensor with two or more dimensions whose last dimension
            is a multiple of 8, e.g. ``(rows, cols // 8, 8)`` or the flat
            ``(rows, cols)`` layout.

    Returns:
        Tensor of shape ``(w.shape[0], w.numel() // (w.shape[0] * 8))`` with
        the dtype and device of ``w``.
    """
    assert w.shape[-1] % 8 == 0
    rows = w.shape[0]
    # Regroup so the last axis holds exactly the 8 nibbles of one packed
    # word. Indexing ``w[..., i]`` directly is only correct when the last
    # dimension is exactly 8, while the assertion above admits any multiple
    # of 8 (a flat (rows, cols) input would otherwise mis-broadcast).
    nibbles = w.reshape(rows, -1, 8)
    packed = torch.zeros(nibbles.shape[:-1], dtype=w.dtype, device=w.device)
    for i in range(8):
        shift = 4 * (7 - i)
        packed |= (nibbles[..., i] & 15) << shift
    return packed.view(rows, -1)


def makeup_weights(in_features: int, out_features: int, group_size: int = 128):
    """Fabricate random 4-bit weights, zero points and scales on CUDA.

    Every intermediate tensor is printed for inspection.

    Args:
        in_features: Number of weight rows; must be a non-zero multiple of
            ``group_size``.
        out_features: Number of weight columns; must be a multiple of 8
            because eight 4-bit values are packed into each integer.
        group_size: Number of consecutive input rows that share one row of
            zero points and scales.

    Returns:
        ``(qweight, qzeros, scales)``. ``qweight`` and ``qzeros`` are the
        outputs of ``i32x8_to_i4x8``; ``scales`` is a float16 tensor of
        shape ``(in_features // group_size, out_features)``.
    """
    cuda = 'cuda'

    # Quantized weights: random nibbles, eight per packed word.
    assert out_features % 8 == 0
    words_per_row = out_features // 8
    raw_weight = torch.randint(
        0, 16, (in_features, words_per_row, 8), dtype=torch.int32, device=cuda)
    print(f'-- makeup qweight: shape {raw_weight.shape}')
    print(raw_weight.view(in_features, -1))
    packed_weight = i32x8_to_i4x8(raw_weight)
    print(f'-- merge qweight: shape {packed_weight.shape}')
    print(packed_weight)

    # Zero points: one row of nibbles per group of input rows.
    assert in_features % group_size == 0 and in_features // group_size >= 1
    n_groups = in_features // group_size
    raw_zeros = torch.randint(
        0, 16, (n_groups, words_per_row, 8), dtype=torch.int32, device=cuda)
    print(f'-- makeup qzero: shape {raw_zeros.shape}')
    print(raw_zeros.view(n_groups, -1))
    packed_zeros = i32x8_to_i4x8(raw_zeros)
    print(f'-- merge qzero: shape {packed_zeros.shape}\n{packed_zeros}')

    # Scales: uniform [0, 1) half-precision values, one row per group.
    scales = torch.rand(
        (n_groups, out_features), dtype=torch.float16, device=cuda)
    print(f'-- makeup scales: shape {scales.shape}\n{scales}')
    return packed_weight, packed_zeros, scales

import time

# Benchmark configuration: a 16384 x 16384 4-bit layer fed with a
# 16384-row float16 batch.
group_size = 128
batch_size = 16384
in_features = 16384
out_features = 16384
qweight, qzeros, scales = makeup_weights(in_features, out_features, group_size)

x = torch.randn((batch_size, in_features),
                device=qweight.device,
                dtype=torch.float16)

model = tm.Linear(in_features=in_features,
                  out_features=out_features,
                  bias=False,
                  quant_method='awq',
                  w_bit=4,
                  group_size=group_size)

# Hand the fabricated tensors to the layer, then let it run its own
# post-initialisation step.
model.qweight = qweight
model.qzeros = qzeros
model.scales = scales

model.post_init()

# `x` and the weights were produced on the current (default) stream. Work
# issued on a side stream is not ordered after them automatically, so make
# the side stream wait before the first call reads those tensors.
stream = torch.cuda.Stream()
stream.wait_stream(torch.cuda.current_stream())

# Warm-up: one untimed call so one-off setup cost stays out of the timing.
with torch.cuda.stream(stream):
    res = model(x)
stream.synchronize()


# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals. The already-created stream is reused so that only the 1000
# forward calls and the final synchronisation are timed.
st = time.perf_counter()

with torch.cuda.stream(stream):
    for _ in range(1000):
        res = model(x)
stream.synchronize()


print('total_time: ', time.perf_counter() - st)  #35s

test for vllm

import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
    MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx,
    marlin_permute_scales, query_marlin_supported_quant_types)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
    MarlinWorkspace, awq_marlin_quantize, get_weight_perm, marlin_quantize,
    marlin_weights)


def compute_max_diff(output, output_ref):
    """Return the mean absolute error relative to the reference magnitude.

    Despite the name, no maximum is taken: the result is
    ``mean(|output - output_ref|) / mean(|output_ref|)``.

    Args:
        output: Tensor under test.
        output_ref: Reference tensor, broadcast-compatible with ``output``.

    Returns:
        A zero-dimensional tensor holding the ratio.
    """
    mean_abs_err = (output - output_ref).abs().mean()
    mean_abs_ref = output_ref.abs().mean()
    return mean_abs_err / mean_abs_ref


def rand_data(shape, dtype=torch.float16, device="cuda"):
    """Return a tensor filled with standard-normal random samples.

    Args:
        shape: Size of the tensor, as accepted by ``torch.randn``.
        dtype: Element type of the result; defaults to ``torch.float16``.
        device: Device to allocate on. Defaults to ``"cuda"``, the value
            that used to be hard-coded, so existing calls are unaffected;
            pass e.g. ``"cpu"`` to use the helper without a GPU.

    Returns:
        The freshly sampled tensor.
    """
    return torch.randn(shape, dtype=dtype, device=device)
  

import time

# First quant type returned for the `True` query; int4 according to the
# original author's note — TODO confirm against the installed vLLM version.
quant_type = query_marlin_supported_quant_types(True)[0]

group_size = 128

is_k_full = True
use_fp32_reduce = True

# GEMM problem size: (size_m x size_k) activations times (size_k x size_n)
# weights.
size_m = 16384
size_k = 16384
size_n = 16384


a_input = rand_data((size_m, size_k))
b_weight = rand_data((size_k, size_n))

# Quantize the random weight with vLLM's test helper; `w_ref` is returned by
# the helper but not used by this benchmark.
w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
    b_weight, quant_type, group_size)

# Empty index tensors are passed for `g_idx` and `sort_indices`.
g_idx = torch.empty(0, dtype=torch.int, device=marlin_q_w.device)
sort_indices = torch.empty(0, dtype=torch.int, device=marlin_q_w.device)
has_zp = True

workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
                            GPTQ_MARLIN_MAX_PARALLEL)


def _run_marlin_gemm():
    """Issue one `gptq_marlin_gemm` call with the benchmark's fixed inputs.

    Having a single definition guarantees that the warm-up call and the
    timed loop use exactly the same arguments.
    """
    return ops.gptq_marlin_gemm(
        a_input,
        marlin_q_w,
        marlin_s,
        marlin_zp,
        g_idx,
        sort_indices,
        workspace.scratch,
        quant_type,
        a_input.shape[0],
        b_weight.shape[1],
        a_input.shape[1],
        is_k_full=is_k_full,
        has_zp=has_zp,
        use_fp32_reduce=use_fp32_reduce,
    )


# Warm-up: one untimed call, then wait for the GPU to drain.
output = _run_marlin_gemm()
torch.cuda.synchronize()

# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; the synchronize() makes sure all queued kernels are included.
st = time.perf_counter()
for _ in range(1000):
    output = _run_marlin_gemm()
torch.cuda.synchronize()
print(time.perf_counter() - st)  # 47.9874222278595

posted on 2025-03-04 14:35 ExplorerMan 阅读(224) 评论(0) 收藏举报

刷新页面返回顶部

ExplorerMan

LMdeploy 执行效率高于VLLM探究

导航

公告