在 CentOS 上安装 Milvus Standalone GPU 版本

前提:Nvidia驱动已经安装好
1.添加 NVIDIA Container Toolkit 软件源
# Download the NVIDIA Container Toolkit repo definition into yum's repo directory.
# (curl -O would save the file in the current directory and treat the target path as a second URL.)
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
  sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo

来源:https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.repo

附REPO文件内容:

# Stable channel of the toolkit packages; this repo is switched on (enabled=1).
[nvidia-container-toolkit]
name=nvidia-container-toolkit
baseurl=https://nvidia.github.io/libnvidia-container/stable/rpm/$basearch
repo_gpgcheck=1
gpgcheck=0
enabled=1
gpgkey=https://nvidia.github.io/libnvidia-container/gpgkey
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt

# Experimental channel; same settings as the stable repo but switched off (enabled=0).
[nvidia-container-toolkit-experimental]
name=nvidia-container-toolkit-experimental
baseurl=https://nvidia.github.io/libnvidia-container/experimental/rpm/$basearch
repo_gpgcheck=1
gpgcheck=0
enabled=0
gpgkey=https://nvidia.github.io/libnvidia-container/gpgkey
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt

 

2.安装工具包
# Install the toolkit from the repo configured in the previous step.
sudo dnf install -y nvidia-container-toolkit

# Verify that the runtime binary is on PATH.
which nvidia-container-runtime
# Expected output: /usr/bin/nvidia-container-runtime

 

3.配置Docker配置文件
# Create or edit /etc/docker/daemon.json so that its top-level object contains the "runtimes" key:
{
  "runtimes": {
    "nvidia": {
      "path": "/usr/bin/nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}

 

4.重启使生效
# Reload systemd's unit definitions; the author reports docker sometimes misbehaves when this is skipped.
systemctl daemon-reload
# Restart firewalld before docker; otherwise the docker0 bridge sometimes fails to come up.
# Untested alternative noted by the author: firewall-cmd --zone=trusted --remove-interface=docker0
systemctl restart firewalld
systemctl restart docker

 

5.验证Nvidia运行时
docker info | grep -i nvidia
# Expect the `nvidia` runtime registered in daemon.json to show up in the output.
# NOTE(review): the original note expected `nvidia.com/gpu`, which looks like a Kubernetes resource name — confirm.
docker exec -it milvus-standalone nvidia-smi
# Prints the GPU information. The milvus-standalone container (defined in the compose file of step 6)
# must already be running for this command to work.

 

6.完整的GPU版本-docker-compose文件
# Compose file for a single-node Milvus deployment: etcd + MinIO + the GPU build of Milvus.
version: '3.5'

services:
  # etcd instance that the Milvus service reaches at etcd:2379 (see ETCD_ENDPOINTS).
  etcd:
    container_name: milvus-etcd
    image: quay.io/coreos/etcd:v3.5.18
    restart: always
    environment:
      - ETCD_AUTO_COMPACTION_MODE=revision
      - ETCD_AUTO_COMPACTION_RETENTION=1000
      # 4294967296 bytes = 4 GiB
      - ETCD_QUOTA_BACKEND_BYTES=4294967296
      - ETCD_SNAPSHOT_COUNT=50000
    volumes:
      # Data is kept on the host under ./volumes/etcd unless DOCKER_VOLUME_DIRECTORY is set.
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
    command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
    healthcheck:
      test: ["CMD", "etcdctl", "endpoint", "health"]
      interval: 30s
      timeout: 20s
      retries: 3

  # MinIO instance that the Milvus service reaches at minio:9000 (see MINIO_ADDRESS).
  minio:
    container_name: milvus-minio
    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
    restart: always
    environment:
      # Well-known default credentials; change them before exposing the published ports.
      MINIO_ACCESS_KEY: minioadmin
      MINIO_SECRET_KEY: minioadmin
    ports:
      # host:container — web console on host port 22001, S3 API on host port 22000.
      - "22001:9001"
      - "22000:9000"
    volumes:
      # Data is kept on the host under ./volumes/minio unless DOCKER_VOLUME_DIRECTORY is set.
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
    # The console must listen on container port 9001 so that the "22001:9001" mapping reaches it.
    command: minio server /minio_data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3

  # Milvus GPU image started in standalone mode.
  standalone:
    container_name: milvus-standalone
    image: milvusdb/milvus:v2.5.11-gpu
    restart: always
    command: ["milvus", "run", "standalone"]
    security_opt:
      - seccomp:unconfined
    environment:
      ETCD_ENDPOINTS: etcd:2379
      MINIO_ADDRESS: minio:9000
      # Quoted so the value stays a string.
      # NOTE(review): presumably 0 because only the single GPU reserved below is visible
      # inside the container — confirm.
      CUDA_VISIBLE_DEVICES: "0"
    volumes:
      # Data is kept on the host under ./volumes/milvus unless DOCKER_VOLUME_DIRECTORY is set.
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
      interval: 30s
      start_period: 90s
      timeout: 20s
      retries: 3
    ports:
      # host:container — port 19530 is published unchanged; container port 9091
      # (used by the healthcheck above) is published on host port 29091.
      - "19530:19530"
      - "29091:9091"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: ["gpu"]
              # Host GPU to reserve; "4" is specific to the author's machine — adjust to
              # a device that exists on yours (see nvidia-smi).
              device_ids: ["4"]
    depends_on:
      - "etcd"
      - "minio"

# Give the default network the fixed name "milvus".
networks:
  default:
    name: milvus
posted @ 2025-04-30 11:50  天海沙  阅读(102)  评论(0)    收藏  举报