sudo apt update
sudo apt install ubuntu-drivers-common
pip install uv
uv venv --python=3.12
source .venv/bin/activate
git clone https://github.com/sgl-project/mini-sglang.git
cd mini-sglang
uv pip install -e .
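After the editable install, a quick sanity check (a minimal sketch, nothing mini-sglang-specific) confirms the interpreter actually comes from the uv venv:
import sys
# Both values should point inside .venv and report Python 3.12.x,
# matching the interpreter requested with `uv venv --python=3.12`.
print(sys.executable)
print(sys.version)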
nvidia-smi --query-compute-apps=pid --format=csv,noheader
apt-get install psmisc
fuser -v /dev/nvidia*   # list processes holding the GPU device files
fuser -k /dev/nvidia0   # force-kill processes using GPU 0
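If fuser is unavailable, the same cleanup can be scripted in Python on top of the nvidia-smi query shown above (a sketch; assumes nvidia-smi is on PATH and that you have permission to signal the processes):
import os
import signal
import subprocess

# Same query as the nvidia-smi command above; then send SIGTERM to each PID.
out = subprocess.check_output(
    ["nvidia-smi", "--query-compute-apps=pid", "--format=csv,noheader"],
    text=True,
)
for line in out.splitlines():
    line = line.strip()
    if line:
        pid = int(line)
        print(f"terminating PID {pid}")
        os.kill(pid, signal.SIGTERM)  # escalate to SIGKILL only as a last resort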
python -m sglang.launch_server --model-path Qwen/Qwen2-0.5B-Instruct --port 30000 --mem-fraction-static 0.8 --disable-cuda-graph
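Once the server reports readiness, a minimal Python probe of the /get_model_info endpoint (the same one exercised with curl further below) confirms it is answering:
import requests

# Port 30000 matches the --port flag above; this raises if the server is not up yet.
resp = requests.get("http://127.0.0.1:30000/get_model_info", timeout=5)
print(resp.status_code, resp.json())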
https://docs.sglang.com.cn/backend/function_calling.html
apt-get install -y libnuma-dev
pip uninstall -y torch torchvision torchaudio
pip uninstall -y flashinfer sglang
# Also clean up any conda-installed versions (if present)
conda uninstall -y pytorch torchvision torchaudio cudatoolkit
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    # Check that the NCCL backend is available
    print(f"NCCL available: {torch.distributed.is_nccl_available()}")
# Use the latest main branch
git clone https://github.com/sgl-project/sglang.git
cd sglang
pip install --upgrade pip
pip install -e "python[all]"
# Install FlashInfer CUDA kernels
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
python3 -m sglang.launch_server --model Qwen/Qwen2-1.5B-Instruct --mem-fraction-static 0.7
T4 should be supported in v0.3.1.post3
Try --disable-custom-all-reduce
https://github.com/sgl-project/sglang/issues/1325
https://www.muliao.com/docs/SGLang.html
conda create -n sglang_env python=3.10 -y
conda activate sglang_env
pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install "triton>=2.2.0"   # quote the spec so the shell does not treat >= as a redirection
pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu118/torch2.6/flashinfer/
pip install flashinfer-python -i https://flashinfer.ai/whl/cu126/torch2.6/
find / -name "libnvrtc.so.12" 2>/dev/null
/root/miniconda3/envs/sglang_env/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.12
echo "/root/miniconda3/envs/sglang_env/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib" | sudo tee /etc/ld.so.conf.d/cuda_libs.conf
sudo ldconfig
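To confirm the linker cache now resolves the library, load it by soname from Python (a minimal sketch using ctypes):
import ctypes

# CDLL resolves the soname through the normal linker search path,
# so this raises OSError if the ldconfig fix above did not take effect.
ctypes.CDLL("libnvrtc.so.12")
print("libnvrtc.so.12 resolved")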
pip install transformers==4.51.1
pip install compressed-tensors==0.10.2
wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
sudo sh cuda_11.8.0_520.61.05_linux.run
pip install "triton>=2.2.0"
What is CUDA?
CUDA (Compute Unified Device Architecture) is NVIDIA's parallel computing platform and programming model. It lets developers use the GPU directly for general-purpose computation (deep learning, scientific computing, and so on), and it comprises the CUDA Toolkit, the CUDA driver, and the CUDA runtime libraries, among other components.
What is nvcc?
nvcc (NVIDIA CUDA Compiler) is the compiler in the CUDA Toolkit, used specifically to compile CUDA code (.cu files).
How nvcc relates to CUDA
nvcc is part of the CUDA Toolkit; installing the Toolkit installs nvcc by default.
nvcc depends on the CUDA Toolkit; its behavior is governed by the Toolkit version.
nvcc does not depend directly on the GPU driver: compiling only needs the Toolkit, but the compiled program needs a matching NVIDIA driver at runtime.
(Source: CSDN blogger "Fox``y", licensed CC 4.0 BY-SA; original post: https://blog.csdn.net/2301_79780038/article/details/146324228)
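The distinction above is easy to verify in practice: the toolkit version and the driver version are reported by different tools and may legitimately differ. A minimal Python sketch (assuming both nvcc and nvidia-smi are on PATH) prints both for comparison:
import subprocess

# nvcc reports the CUDA Toolkit version; nvidia-smi reports the driver version.
# A program built with a newer toolkit can fail at runtime on an older driver.
for cmd in (["nvcc", "--version"],
            ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]):
    try:
        out = subprocess.check_output(cmd, text=True).strip().splitlines()[-1]
        print(" ".join(cmd), "->", out)
    except (OSError, subprocess.CalledProcessError) as exc:
        print(" ".join(cmd), "-> unavailable:", exc)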
pip list | grep triton
pip list | grep torch
pip list | grep transformers
pip list | grep compressed-tensors
pip list | grep flashinfer-python
nvcc -V
torch 2.6.0
torch_memory_saver 0.0.9
torchao 0.9.0
torchaudio 2.4.1
torchcodec 0.8.0
torchvision 0.21.0
transformers 4.51.1
compressed-tensors 0.10.2
flashinfer-python 0.2.5
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
sglang 0.4.6.post5
python3 -m pip install --upgrade pip
pip install "sglang[all]==0.4.6.post5"
# Enable academic network acceleration first (if your cloud platform provides it)
# Install FlashInfer CUDA kernels
pip install flashinfer-python -i https://flashinfer.ai/whl/cu126/torch2.6/
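Before starting the server, it is worth confirming the pins actually resolved; importlib.metadata reads the installed versions without importing the heavy packages (a minimal sketch; distribution names match the installs above):
from importlib.metadata import version

# Confirm the pinned versions resolved as expected.
for pkg in ("sglang", "flashinfer-python", "torch", "transformers",
            "compressed-tensors", "triton"):
    print(pkg, version(pkg))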
Full pip freeze of the working environment:
aiohappyeyeballs==2.6.1 aiohttp==3.13.2 aiosignal==1.4.0 airportsdata==20250909 annotated-doc==0.0.4 annotated-types==0.7.0 anthropic==0.75.0 anyio==4.12.0 apache-tvm-ffi==0.1.6 asttokens==3.0.1 async-timeout==5.0.1 attrs==25.4.0 blobfile==3.0.0 build==1.3.0
certifi==2025.11.12 cffi==2.0.0 charset-normalizer==3.4.4 click==8.3.1 cloudpickle==3.1.2 compressed-tensors==0.10.2 cuda-bindings==13.1.1 cuda-pathfinder==1.3.3 cuda-python==13.1.1
datasets==4.4.2 decorator==5.2.1 decord==0.6.0 decord2==3.0.0 dill==0.4.0 diskcache==5.6.3 distro==1.9.0 docstring_parser==0.17.0 einops==0.8.1 exceptiongroup==1.3.1 executing==2.2.1
fastapi==0.127.0 fastuuid==0.14.0 filelock==3.20.0 flashinfer-cubin==0.5.3 flashinfer-python==0.2.5 frozenlist==1.8.0 fsspec==2025.10.0 gguf==0.17.1 grpcio==1.67.1 grpcio-health-checking==1.75.1 grpcio-reflection==1.75.1 grpcio-tools==1.75.1
h11==0.16.0 hf-xet==1.2.0 hf_transfer==0.1.9 httpcore==1.0.9 httpx==0.28.1 huggingface-hub==0.36.0 idna==3.11 importlib_metadata==8.7.1 interegular==0.3.3 ipython==8.37.0 jedi==0.19.2 Jinja2==3.1.6 jiter==0.12.0 jsonschema==4.25.1 jsonschema-specifications==2025.9.1
lark==1.3.1 litellm==1.80.11 llguidance==0.7.30 loguru==0.7.3 lxml==6.0.2 MarkupSafe==2.1.5 matplotlib-inline==0.2.1 modelscope==1.33.0 mpmath==1.3.0 msgspec==0.20.0 multidict==6.7.0 multiprocess==0.70.18 nest-asyncio==1.6.0 networkx==3.4.2 ninja==1.13.0 numpy==2.2.6
nvidia-cublas-cu11==11.11.3.6 nvidia-cublas-cu12==12.4.5.8 nvidia-cuda-cupti-cu11==11.8.87 nvidia-cuda-cupti-cu12==12.4.127 nvidia-cuda-nvrtc-cu11==11.8.89 nvidia-cuda-nvrtc-cu12==12.4.127 nvidia-cuda-runtime-cu11==11.8.89 nvidia-cuda-runtime-cu12==12.4.127 nvidia-cudnn-cu11==9.1.0.70 nvidia-cudnn-cu12==9.1.0.70 nvidia-cudnn-frontend==1.17.0 nvidia-cufft-cu11==10.9.0.58 nvidia-cufft-cu12==11.2.1.3 nvidia-cufile-cu12==1.13.1.3 nvidia-curand-cu11==10.3.0.86 nvidia-curand-cu12==10.3.5.147 nvidia-cusolver-cu11==11.4.1.48 nvidia-cusolver-cu12==11.6.1.9 nvidia-cusparse-cu11==11.7.5.86 nvidia-cusparse-cu12==12.3.1.170 nvidia-cusparselt-cu12==0.6.2 nvidia-cutlass-dsl==4.2.1 nvidia-ml-py==13.590.44 nvidia-nccl-cu11==2.21.5 nvidia-nccl-cu12==2.21.5 nvidia-nvjitlink-cu12==12.4.127 nvidia-nvshmem-cu12==3.3.20 nvidia-nvtx-cu11==11.8.86 nvidia-nvtx-cu12==12.4.127
openai==2.14.0 openai-harmony==0.0.4 orjson==3.11.5 outlines==0.1.11 outlines_core==0.1.26 packaging==25.0 pandas==2.3.3 parso==0.8.5 partial-json-parser==0.2.1.1.post7 pexpect==4.9.0 pillow==12.0.0 prometheus_client==0.23.1 prompt_toolkit==3.0.52 propcache==0.4.1 protobuf==6.33.2 psutil==7.2.0 ptyprocess==0.7.0 pure_eval==0.2.3 py-spy==0.4.1 pyarrow==22.0.0 pybase64==1.4.3 pycountry==24.6.1 pycparser==2.23 pycryptodomex==3.23.0 pydantic==2.12.5 pydantic_core==2.41.5 Pygments==2.19.2 pynvml==13.0.1 pyproject_hooks==1.2.0 python-dateutil==2.9.0.post0 python-dotenv==1.2.1 python-multipart==0.0.21 pytz==2025.2 PyYAML==6.0.3 pyzmq==27.1.0
referencing==0.37.0 regex==2025.11.3 requests==2.32.5 rpds-py==0.30.0 safetensors==0.7.0 scipy==1.15.3 sentencepiece==0.2.1 setproctitle==1.3.7 sgl-kernel==0.1.4 sglang==0.4.6.post5 six==1.17.0 sniffio==1.3.1 soundfile==0.13.1 stack-data==0.6.3 starlette==0.50.0 sympy==1.13.1
tabulate==0.9.0 tiktoken==0.12.0 timm==1.0.16 tokenizers==0.21.4 tomli==2.3.0 torch==2.6.0 torch_memory_saver==0.0.9 torchao==0.9.0 torchaudio==2.4.1 torchcodec==0.8.0 torchvision==0.21.0 tqdm==4.67.1 traitlets==5.14.3 transformers==4.51.1 triton==3.2.0 typing-inspection==0.4.2 typing_extensions==4.15.0 tzdata==2025.3
urllib3==2.6.2 uvicorn==0.40.0 uvloop==0.22.1 wcwidth==0.2.14 xgrammar==0.1.19 xxhash==3.6.0 yarl==1.22.0 zipp==3.23.0
Verify the server's chat endpoints
curl -kv http://127.0.0.1:30000/get_model_info
curl -X POST http://127.0.0.1:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2-0.5B-Instruct",
    "messages": [
      {"role": "system", "content": "You are a helpful AI assistant"},
      {"role": "user", "content": "Who are you?"}
    ],
    "temperature": 0.6,
    "max_tokens": 1024
  }'
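Since the server exposes an OpenAI-compatible /v1 API, the same request can be issued with the official openai Python client (present in the freeze above); the api_key value is a placeholder, as SGLang does not require one by default:
from openai import OpenAI

# Point the client at the local SGLang server instead of api.openai.com.
client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="Qwen/Qwen2-0.5B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": "Who are you?"},
    ],
    temperature=0.6,
    max_tokens=1024,
)
print(resp.choices[0].message.content)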