apiVersion: v1
kind: ConfigMap
metadata:
  name: rings-config-mindx-dls-test  # The suffix must match the name of the Job below; the rings-config- prefix cannot be modified.
  namespace: vcjob                   # Select a proper namespace based on site requirements. The ConfigMap and the Job must be in the same namespace. If the tjm component of MindX-add is deployed, the vcjob namespace cannot be used.
  labels:
    ring-controller.atlas: ascend-910  # The value cannot be modified. Service operations are performed based on this label.
data:
  hccl.json: |
    {
        "status":"initializing"
    }
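# Note: hccl.json is deliberately left as a placeholder. As typically described for MindX DL,
# the HCCL/ring controller watches this ConfigMap and writes the generated rank table into it
# at runtime, changing "status" from "initializing" to "completed" once the device information
# of all pods has been collected.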
---
apiVersion: batch.volcano.sh/v1alpha1  # The value cannot be changed; the Volcano API must be used.
kind: Job                              # Only the Job type is currently supported.
metadata:
  name: mindx-dls-test                 # The value must match the ConfigMap name above without the rings-config- prefix.
  namespace: vcjob                     # Select a proper namespace based on site requirements. The ConfigMap and the Job must be in the same namespace. If the tjm component of MindX-add is deployed, the vcjob namespace cannot be used.
  labels:
    ring-controller.atlas: ascend-910  # The value must be the same as the label in ConfigMap and cannot be changed.
    fault-scheduling: "force"
spec:
  minAvailable: 1                      # The value of minAvailable is 1 in a single-node scenario and N in an N-node distributed scenario.
  schedulerName: volcano               # Use the Volcano scheduler to schedule jobs.
  policies:
    - event: PodEvicted
      action: RestartJob
  plugins:
    ssh: []
    env: []
    svc: []
  maxRetry: 3
  queue: default
  tasks:
    - name: "default-test"
      replicas: 1                      # The value of replicas is 1 in a single-node scenario and N in an N-node scenario. The number of NPUs in the requests field is 8 in an N-node scenario.
      template:
        metadata:
          labels:
            app: pytorch
            ring-controller.atlas: ascend-910  # The value must be the same as the label in ConfigMap and cannot be changed.
        spec:
          affinity:
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: volcano.sh/job-name
                        operator: In
                        values:
                          - mindx-dls-test
                  topologyKey: kubernetes.io/hostname
          hostNetwork: true
          containers:
            - image: torch:b030              # Training framework image; modify it as required.
              imagePullPolicy: IfNotPresent
              name: pytorch
              env:
                - name: mindx-dls-test       # The name must be the same as the job name.
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: XDL_IP               # IP address of the physical node, used to identify the node where the pod is running.
                  valueFrom:
                    fieldRef:
                      fieldPath: status.hostIP
                - name: framework
                  value: "PyTorch"
              command:
                - "/bin/bash"
                - "-c"
                # Commands for running the training script. Ensure that the commands and paths used here exist in the container image.
                - "cd /job/code/ResNet50_for_PyTorch_1.8_code/scripts;chmod +x train_start.sh;bash train_start.sh /job/code/ResNet50_for_PyTorch_1.8_code/ /job/output/ DistributedResnet50/main_apex_d76_npu.py --data=/job/data/resnet50/imagenet --seed=49 --worker=128 --learning-rate=1.6 --warmup=8 --label-smoothing=0.1 --mom=0.9 --weight-decay=1.0e-04 --static-loss-scale=128 --print-freq=1 --dist-url='tcp://127.0.0.1:50000' --dist-backend='hccl' --multiprocessing-distributed --benchmark=0 --device='npu' --epoch=90 --batch-size=1024;"
              #args: [ "while true; do sleep 30000; done;" ]  # To debug, comment out the command above and uncomment this line; you can then run the training script manually inside the container.
              # Enter the container with 'kubectl exec -it -n {namespace} {podname} bash'.
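              # A concrete debugging sketch (the pod name below assumes Volcano's
              # <job-name>-<task-name>-<index> naming; confirm it first with 'kubectl get pods -n vcjob'):
              #   kubectl exec -it -n vcjob mindx-dls-test-default-test-0 -- bash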
              resources:
                requests:
                  huawei.com/Ascend910: 8    # Number of required NPUs. The maximum value is 8. Resources such as memory and CPU can be added as well (see the commented sketch after this block).
                limits:
                  huawei.com/Ascend910: 8    # The value must be consistent with that in requests.
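              # A minimal sketch of requesting memory and CPU alongside the NPUs, as the comment on
              # requests suggests; the quantities are illustrative assumptions, not values from the sample:
              #   requests:
              #     huawei.com/Ascend910: 8
              #     memory: 300Gi
              #     cpu: "96"
              #   limits:
              #     huawei.com/Ascend910: 8
              #     memory: 300Gi
              #     cpu: "96"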
              volumeMounts:
                - name: ascend-910-config
                  mountPath: /user/serverid/devindex/config
                - name: code
                  mountPath: /job/code/      # Path of the training script in the container.
                - name: data
                  mountPath: /job/data       # Path of the training dataset in the container.
                - name: output
                  mountPath: /job/output     # Training output path in the container.
                - name: slog
                  mountPath: /var/log/npu
                - name: ascend-driver
                  mountPath: /usr/local/Ascend/driver
                - name: ascend-add-ons
                  mountPath: /usr/local/Ascend/add-ons
                - name: dshm
                  mountPath: /dev/shm
                - name: localtime
                  mountPath: /etc/localtime
          nodeSelector:
            host-arch: huawei-arm            # Set this selector to match the label on the target training nodes (an example labeling command follows).
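          # The selector above only matches nodes carrying this label. If a node lacks it, the label
          # can be added with kubectl (the node name is a placeholder):
          #   kubectl label node <node-name> host-arch=huawei-arm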
          volumes:
            - name: ascend-910-config
              configMap:
                name: rings-config-mindx-dls-test  # Must match the name of the ConfigMap above.
            - name: code
              nfs:
                server: 127.0.0.1            # IP address of the NFS server. In this example, the shared path is /data/atlas_dls/.
                path: "/data/atlas_dls/public/code/"  # Configure the training script path.
            - name: data
              nfs:
                server: 127.0.0.1
                path: "/data/atlas_dls/public/dataset"  # Configure the path of the training dataset.
            - name: output
              nfs:
                server: 127.0.0.1
                path: "/data/atlas_dls/output/"  # Configure the path for saving the trained model; it depends on the training script.
            - name: slog
              hostPath:
                path: /var/log/npu           # NPU log path on the host, mounted into the container.
            - name: ascend-driver
              hostPath:
                path: /usr/local/Ascend/driver
            - name: ascend-add-ons
              hostPath:
                path: /usr/local/Ascend/add-ons
            - name: localtime
              hostPath:
                path: /etc/localtime         # Mount the host's localtime so the container uses the same time zone as the host.
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: 16Gi
          restartPolicy: OnFailure
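# Usage sketch (the file name is an assumption; adjust it to the actual file):
#   kubectl apply -f mindx-dls-test.yaml
#   kubectl get pods -n vcjob            # verify that the pod reaches the Running state
#   kubectl logs -f -n vcjob <pod-name>  # follow the training logs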