Milvus-standalone-GPU版本安装在centos上
前提:Nvidia驱动已经安装好
1.安装 NVIDIA Container Toolkit
# Fetch NVIDIA's repo definition and write it into the yum/dnf repo directory.
# (`curl -O` takes no destination path: it saves under the remote file name in the
# current directory and treats any extra argument as another URL to download.)
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
来源:https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo
附REPO文件内容:
# Stable release channel; this is the repo dnf installs from (enabled=1).
[nvidia-container-toolkit]
name=nvidia-container-toolkit
baseurl=https://nvidia.github.io/libnvidia-container/stable/rpm/$basearch
# repo_gpgcheck=1 verifies the signature on the repository metadata;
# gpgcheck=0 means individual package signatures are not checked.
repo_gpgcheck=1
gpgcheck=0
enabled=1
gpgkey=https://nvidia.github.io/libnvidia-container/gpgkey
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
# Experimental release channel; same settings as above but disabled (enabled=0).
[nvidia-container-toolkit-experimental]
name=nvidia-container-toolkit-experimental
baseurl=https://nvidia.github.io/libnvidia-container/experimental/rpm/$basearch
repo_gpgcheck=1
gpgcheck=0
enabled=0
gpgkey=https://nvidia.github.io/libnvidia-container/gpgkey
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
2.安装工具包
sudo dnf install -y nvidia-container-toolkit
#验证
which nvidia-container-runtime
# 应输出路径:/usr/bin/nvidia-container-runtime
3.配置Docker配置文件
# 创建或修改 /etc/docker/daemon.json,添加以下内容
{
  "runtimes": {
    "nvidia": {
      "path": "/usr/bin/nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}
4.重启使生效
systemctl daemon-reload # 重新加载 systemd 单元配置(并非重启内核);不执行有时会导致docker故障
systemctl restart firewalld # 不重启防火墙, 有时会导致docker0网桥无法启动, 直接删除似乎也可以(未测试)firewall-cmd --zone=trusted --remove-interface=docker0
systemctl restart docker
5.验证Nvidia运行时
docker info | grep -i nvidia
# 输出的 Runtimes 一行中应包含 nvidia(`nvidia.com/gpu` 是 Kubernetes 的资源名,不会出现在 docker info 的输出中)
docker exec -it milvus-standalone nvidia-smi
# 应输出GPU信息(需先用第6步的docker-compose文件启动 milvus-standalone 容器后再执行)
6.完整的GPU版本-docker-compose文件
# Milvus standalone (GPU build) with its etcd and MinIO dependencies.
# Requires the NVIDIA Container Toolkit on the host so Docker can hand a GPU
# to the `standalone` service.
version: '3.5'

services:
  # Metadata store used by Milvus.
  etcd:
    container_name: milvus-etcd
    image: quay.io/coreos/etcd:v3.5.18
    restart: always
    environment:
      - ETCD_AUTO_COMPACTION_MODE=revision
      - ETCD_AUTO_COMPACTION_RETENTION=1000
      - ETCD_QUOTA_BACKEND_BYTES=4294967296
      - ETCD_SNAPSHOT_COUNT=50000
    volumes:
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
    command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
    healthcheck:
      test: ["CMD", "etcdctl", "endpoint", "health"]
      interval: 30s
      timeout: 20s
      retries: 3

  # Object storage used by Milvus.
  minio:
    container_name: milvus-minio
    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
    restart: always
    environment:
      # Default MinIO credentials; change them for anything but local testing.
      MINIO_ACCESS_KEY: minioadmin
      MINIO_SECRET_KEY: minioadmin
    ports:
      # host:container - console on host port 22001, S3 API on host port 22000.
      - "22001:9001"
      - "22000:9000"
    volumes:
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
    # The console must listen on the container-side port of the mapping above
    # (9001); listening on 22001 inside the container would leave the published
    # console port with nothing behind it.
    command: minio server /minio_data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3

  # Milvus itself (GPU image).
  standalone:
    container_name: milvus-standalone
    image: milvusdb/milvus:v2.5.11-gpu
    restart: always
    command: ["milvus", "run", "standalone"]
    security_opt:
      - seccomp:unconfined
    environment:
      ETCD_ENDPOINTS: etcd:2379
      MINIO_ADDRESS: minio:9000
      # Index among the GPUs visible inside the container, not the host index.
      # Only one GPU is passed in via device_ids below, so it is device 0 here.
      CUDA_VISIBLE_DEVICES: "0"
    volumes:
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
      interval: 30s
      start_period: 90s
      timeout: 20s
      retries: 3
    ports:
      - "19530:19530"
      - "29091:9091"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: ["gpu"]
              # Host GPU index as listed by `nvidia-smi`; adjust to your
              # machine (e.g. "0" on a single-GPU host).
              device_ids: ["4"]
    depends_on:
      - "etcd"
      - "minio"

networks:
  default:
    name: milvus
浙公网安备 33010602011771号