sudo apt update
sudo apt install ubuntu-drivers-common

pip install uv
uv venv --python=3.12
source .venv/bin/activate
git clone https://github.com/sgl-project/mini-sglang.git
cd mini-sglang
uv pip install -e .

nvidia-smi --query-compute-apps=pid --format=csv,noheader

apt-get install psmisc
fuser -v /dev/nvidia*

# Force-kill processes holding the GPU
fuser -k /dev/nvidia0  # kill everything using GPU 0
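If fuser is not available, the same cleanup can be scripted against the nvidia-smi query above. A minimal sketch (kill-by-PID needs the same privileges as fuser -k):

import os
import signal
import subprocess

# Same query as above: PIDs of compute processes on the GPU
out = subprocess.check_output(
    ["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"],
    text=True,
)
for line in out.splitlines():
    pid = line.strip()
    if pid.isdigit():
        print(f"killing PID {pid}")
        os.kill(int(pid), signal.SIGKILL)  # as forceful as `fuser -k`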

python -m sglang.launch_server --model-path Qwen/Qwen2-0.5B-Instruct --port 30000 \
    --mem-fraction-static 0.8 --disable-cuda-graph
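Model loading takes a while, so it helps to poll the server before sending requests. A small sketch using /get_model_info (the same endpoint used for verification at the end of this post; requests is already in the environment per the freeze below):

import time

import requests

URL = "http://127.0.0.1:30000/get_model_info"

# Poll until the server answers; weights can take minutes to load
for _ in range(120):
    try:
        r = requests.get(URL, timeout=2)
        if r.ok:
            print(r.json())
            break
    except requests.ConnectionError:
        pass
    time.sleep(5)
else:
    raise RuntimeError("server did not come up in time")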

https://docs.sglang.com.cn/backend/function_calling.html

apt-get install -y libnuma-dev

# Uninstall pip-installed packages
pip uninstall -y torch torchvision torchaudio
pip uninstall -y flashinfer sglang
# Also clean up conda-installed copies (if present)
conda uninstall -y pytorch torchvision torchaudio cudatoolkit


import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    # Check that the NCCL backend is usable
    print(f"NCCL available: {torch.distributed.is_nccl_available()}")

# Use the latest main branch
git clone https://github.com/sgl-project/sglang.git
cd sglang

pip install --upgrade pip
pip install -e "python[all]"

# Install FlashInfer CUDA kernels
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/

python3 -m sglang.launch_server --model Qwen/Qwen2-1.5B-Instruct --mem-fraction-static 0.7

T4 GPUs should be supported as of v0.3.1.post3.
If the server still fails to start, try --disable-custom-all-reduce.

https://github.com/sgl-project/sglang/issues/1325

https://www.muliao.com/docs/SGLang.html

conda create -n sglang_env python=3.10 -y
conda activate sglang_env

pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

pip install "triton>=2.2.0"

pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu118/torch2.6/flashinfer/

pip install flashinfer-python -i https://flashinfer.ai/whl/cu126/torch2.6/

find / -name "libnvrtc.so.12" 2>/dev/null
/root/miniconda3/envs/sglang_env/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.12

echo "/root/miniconda3/envs/sglang_env/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib" | sudo tee /etc/ld.so.conf.d/cuda_libs.conf
sudo ldconfig
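To confirm the loader now picks up the new path, a quick check with ctypes (stdlib); this raises OSError if libnvrtc.so.12 is still unresolvable:

import ctypes

# Succeeds only if ldconfig now resolves libnvrtc.so.12
print(ctypes.CDLL("libnvrtc.so.12"))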

pip install transformers==4.51.1
pip install compressed-tensors==0.10.2

wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
sudo sh cuda_11.8.0_520.61.05_linux.run

pip install "triton>=2.2.0"


What is CUDA?
CUDA (Compute Unified Device Architecture) is NVIDIA's parallel computing platform and programming model. It lets developers use the GPU directly for general-purpose computation (deep learning, scientific computing, and so on), and it consists of the CUDA Toolkit, the CUDA driver, and the CUDA runtime libraries.
What is nvcc?
nvcc (NVIDIA CUDA Compiler) is the compiler in the CUDA Toolkit, used specifically to compile CUDA code (.cu files).
How nvcc relates to CUDA
nvcc is part of the CUDA Toolkit; installing the Toolkit installs nvcc by default.
nvcc depends on the CUDA Toolkit; its behavior is governed by the Toolkit version.
nvcc does not depend directly on the display driver: compiling only requires the Toolkit, but the compiled program needs a matching NVIDIA driver at run time.
————————————————
Copyright notice: the explanation above is an original article by CSDN blogger "Fox``y", licensed under CC 4.0 BY-SA; reproduction must include the original link and this notice.
Original link: https://blog.csdn.net/2301_79780038/article/details/146324228
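The toolkit/runtime split is easy to see on this machine. A minimal sketch (assumes nvcc is on PATH and the torch build checked earlier) comparing the toolkit version nvcc reports against the CUDA runtime PyTorch was built with:

import subprocess

import torch

# Toolkit side: what nvcc compiles against (11.8 here, per `nvcc -V` below)
nvcc_out = subprocess.check_output(["nvcc", "-V"], text=True)
print(nvcc_out.splitlines()[-2])  # "Cuda compilation tools, release 11.8, V11.8.89"

# Runtime side: the CUDA version this PyTorch wheel links against
print(f"torch built with CUDA: {torch.version.cuda}")

The two do not have to match exactly; the installed NVIDIA driver just has to be new enough for both.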

pip list | grep triton

pip list | grep torch

pip list | grep transformers

pip list | grep compressed-tensors

pip list | grep flashinfer-python

nvcc -V

torch                     2.6.0
torch_memory_saver        0.0.9
torchao                   0.9.0
torchaudio                2.4.1
torchcodec                0.8.0
torchvision               0.21.0
transformers              4.51.1
compressed-tensors        0.10.2
flashinfer-python         0.2.5

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0

sglang                    0.4.6.post5
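The same checks can be done in one pass from Python with importlib.metadata (stdlib), instead of grepping pip list package by package:

from importlib.metadata import PackageNotFoundError, version

# The packages checked with `pip list | grep ...` above
for pkg in ["triton", "torch", "transformers",
            "compressed-tensors", "flashinfer-python", "sglang"]:
    try:
        print(f"{pkg:22s} {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg:22s} NOT INSTALLED")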

python3 -m pip install --upgrade pip

pip install "sglang[all]==0.4.6.post5"

# Enable academic network acceleration (on cloud hosts that provide it)

# Install FlashInfer CUDA kernels

pip install flashinfer-python -i https://flashinfer.ai/whl/cu126/torch2.6/

aiohappyeyeballs==2.6.1
aiohttp==3.13.2
aiosignal==1.4.0
airportsdata==20250909
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.75.0
anyio==4.12.0
apache-tvm-ffi==0.1.6
asttokens==3.0.1
async-timeout==5.0.1
attrs==25.4.0
blobfile==3.0.0
build==1.3.0
certifi==2025.11.12
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
cloudpickle==3.1.2
compressed-tensors==0.10.2
cuda-bindings==13.1.1
cuda-pathfinder==1.3.3
cuda-python==13.1.1
datasets==4.4.2
decorator==5.2.1
decord==0.6.0
decord2==3.0.0
dill==0.4.0
diskcache==5.6.3
distro==1.9.0
docstring_parser==0.17.0
einops==0.8.1
exceptiongroup==1.3.1
executing==2.2.1
fastapi==0.127.0
fastuuid==0.14.0
filelock==3.20.0
flashinfer-cubin==0.5.3
flashinfer-python==0.2.5
frozenlist==1.8.0
fsspec==2025.10.0
gguf==0.17.1
grpcio==1.67.1
grpcio-health-checking==1.75.1
grpcio-reflection==1.75.1
grpcio-tools==1.75.1
h11==0.16.0
hf-xet==1.2.0
hf_transfer==0.1.9
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.36.0
idna==3.11
importlib_metadata==8.7.1
interegular==0.3.3
ipython==8.37.0
jedi==0.19.2
Jinja2==3.1.6
jiter==0.12.0
jsonschema==4.25.1
jsonschema-specifications==2025.9.1
lark==1.3.1
litellm==1.80.11
llguidance==0.7.30
loguru==0.7.3
lxml==6.0.2
MarkupSafe==2.1.5
matplotlib-inline==0.2.1
modelscope==1.33.0
mpmath==1.3.0
msgspec==0.20.0
multidict==6.7.0
multiprocess==0.70.18
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.13.0
numpy==2.2.6
nvidia-cublas-cu11==11.11.3.6
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu11==11.8.87
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu11==11.8.89
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu11==11.8.89
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu11==9.1.0.70
nvidia-cudnn-cu12==9.1.0.70
nvidia-cudnn-frontend==1.17.0
nvidia-cufft-cu11==10.9.0.58
nvidia-cufft-cu12==11.2.1.3
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu11==10.3.0.86
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu11==11.4.1.48
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu11==11.7.5.86
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-cutlass-dsl==4.2.1
nvidia-ml-py==13.590.44
nvidia-nccl-cu11==2.21.5
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu11==11.8.86
nvidia-nvtx-cu12==12.4.127
openai==2.14.0
openai-harmony==0.0.4
orjson==3.11.5
outlines==0.1.11
outlines_core==0.1.26
packaging==25.0
pandas==2.3.3
parso==0.8.5
partial-json-parser==0.2.1.1.post7
pexpect==4.9.0
pillow==12.0.0
prometheus_client==0.23.1
prompt_toolkit==3.0.52
propcache==0.4.1
protobuf==6.33.2
psutil==7.2.0
ptyprocess==0.7.0
pure_eval==0.2.3
py-spy==0.4.1
pyarrow==22.0.0
pybase64==1.4.3
pycountry==24.6.1
pycparser==2.23
pycryptodomex==3.23.0
pydantic==2.12.5
pydantic_core==2.41.5
Pygments==2.19.2
pynvml==13.0.1
pyproject_hooks==1.2.0
python-dateutil==2.9.0.post0
python-dotenv==1.2.1
python-multipart==0.0.21
pytz==2025.2
PyYAML==6.0.3
pyzmq==27.1.0
referencing==0.37.0
regex==2025.11.3
requests==2.32.5
rpds-py==0.30.0
safetensors==0.7.0
scipy==1.15.3
sentencepiece==0.2.1
setproctitle==1.3.7
sgl-kernel==0.1.4
sglang==0.4.6.post5
six==1.17.0
sniffio==1.3.1
soundfile==0.13.1
stack-data==0.6.3
starlette==0.50.0
sympy==1.13.1
tabulate==0.9.0
tiktoken==0.12.0
timm==1.0.16
tokenizers==0.21.4
tomli==2.3.0
torch==2.6.0
torch_memory_saver==0.0.9
torchao==0.9.0
torchaudio==2.4.1
torchcodec==0.8.0
torchvision==0.21.0
tqdm==4.67.1
traitlets==5.14.3
transformers==4.51.1
triton==3.2.0
typing-inspection==0.4.2
typing_extensions==4.15.0
tzdata==2025.3
urllib3==2.6.2
uvicorn==0.40.0
uvloop==0.22.1
wcwidth==0.2.14
xgrammar==0.1.19
xxhash==3.6.0
yarl==1.22.0
zipp==3.23.0

Verify the server's chat endpoint

curl -kv http://127.0.0.1:30000/get_model_info

curl -X POST http://127.0.0.1:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2-0.5B-Instruct",
    "messages": [
      {"role": "system", "content": "You are a helpful AI assistant"},
      {"role": "user", "content": "你是谁"}
    ],
    "temperature": 0.6,
    "max_tokens": 1024
  }'
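The same request through the OpenAI Python client (openai is in the freeze above; sglang serves an OpenAI-compatible /v1 API, and the api_key is just a placeholder for a local server):

from openai import OpenAI

# Local sglang server; the key is not checked by default
client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="Qwen/Qwen2-0.5B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": "你是谁"},
    ],
    temperature=0.6,
    max_tokens=1024,
)
print(resp.choices[0].message.content)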
