# Placeholder ConfigMap for the mindie-server-0 job. Its name is
# "rings-config-" followed by the AscendJob name defined later in this file.
# NOTE(review): presumably the ring controller locates it by that naming
# convention -- confirm before renaming either resource.
apiVersion: v1
kind: ConfigMap
metadata:
  name: rings-config-mindie-server-0
  namespace: mindie
  labels:
    # Same jobID / ring-controller.atlas values as the AscendJob labels.
    jobID: mindie-ms-test
    ring-controller.atlas: ascend-910b
    # Quoted so the label value stays the string "true", not a boolean.
    mx-consumer-cim: "true"
data:
  # Seed content only: a single "initializing" status.
  # NOTE(review): presumably replaced with the real rank table at runtime --
  # confirm against the controller that owns this ConfigMap.
  hccl.json: |
    {
    "status":"initializing"
    }
---
# AscendJob that runs the MindIE server as one Master pod and one Worker pod.
# Both replica types use the same image, probes, environment and volumes.
apiVersion: mindxdl.gitee.com/v1
kind: AscendJob
metadata:
  name: mindie-server-0
  namespace: mindie
  labels:
    framework: pytorch
    app: mindie-ms-server
    jobID: mindie-ms-test
    ring-controller.atlas: ascend-910b
    fault-scheduling: force
    # Label values must be strings, hence the quotes around the number.
    fault-retry-times: "10000"
  annotations:
    huawei.com/schedule_policy: "chip8-node8"
spec:
  schedulerName: volcano
  runPolicy:
    schedulingPolicy:
      # Equals the total replica count below (1 Master + 1 Worker).
      minAvailable: 2
      queue: default
  successPolicy: AllWorkers
  replicaSpecs:
    Master:
      replicas: 1
      restartPolicy: Always
      template:
        metadata:
          labels:
            ring-controller.atlas: ascend-910b
            app: mindie-ms-server
            jobID: mindie-ms-test
        spec:
          nodeSelector:
            accelerator: huawei-Ascend910
          terminationGracePeriodSeconds: 10
          automountServiceAccountToken: false
          securityContext:
            fsGroup: 1001
          containers:
            - image: mindie:1.0.0-aarch64-800I-A2
              imagePullPolicy: IfNotPresent
              name: ascend
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop: ["ALL"]
                seccompProfile:
                  type: "RuntimeDefault"
              # All three probes run the same probe.sh with a different mode
              # argument. $MIES_INSTALL_PATH is expanded by bash inside the
              # container, so it must be present in the container environment
              # (it is set in `env` below).
              readinessProbe:
                exec:
                  command:
                    - bash
                    - -c
                    - "$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh readiness"
                periodSeconds: 5
              livenessProbe:
                exec:
                  command:
                    - bash
                    - -c
                    - "$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh liveness"
                periodSeconds: 5
              # Up to 100 failures at 5 s intervals are tolerated at startup.
              startupProbe:
                exec:
                  command:
                    - bash
                    - -c
                    - "$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh startup"
                periodSeconds: 5
                failureThreshold: 100
              env:
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                # Points into the directory mount of the global-ranktable
                # volume (/user/serverid/devindex/config) via its "..data"
                # entry.
                - name: GLOBAL_RANK_TABLE_FILE_PATH
                  value: "/user/serverid/devindex/config/..data/global_ranktable.json"
                # NOTE(review): MINDIE_USER_HOME_PATH is not defined in this
                # manifest; presumably it comes from the common-env ConfigMap
                # loaded through envFrom -- verify.
                - name: MIES_INSTALL_PATH
                  value: $(MINDIE_USER_HOME_PATH)/Ascend/mindie/latest/mindie-service
                - name: CONFIG_FROM_CONFIGMAP_PATH
                  value: /mnt/configmap
              envFrom:
                - configMapRef:
                    name: common-env
              # Entry point: run the boot script mounted from the
              # boot-bash-script ConfigMap. The multi-line quoted string is
              # kept exactly as authored.
              command: ["/bin/bash", "-c", "
                /mnt/configmap/boot.sh; \n
                "]
              resources:
                requests:
                  memory: "64Gi"
                  cpu: "16"
                  huawei.com/Ascend910: 8
                limits:
                  memory: "256Gi"
                  cpu: "64"
                  huawei.com/Ascend910: 8
              volumeMounts:
                - name: data
                  mountPath: /data
                  readOnly: true
                # Single files are projected with subPath so that they sit
                # side by side under /mnt/configmap.
                # NOTE(review): subPath mounts are not refreshed when the
                # backing ConfigMap changes -- confirm this is acceptable.
                - name: mindie-server-config
                  mountPath: /mnt/configmap/config.json
                  subPath: config.json
                - name: mindie-http-client-ctl-config
                  mountPath: /mnt/configmap/http_client_ctl.json
                  subPath: http_client_ctl.json
                # Mounted as a whole directory (no subPath).
                - name: global-ranktable
                  mountPath: /user/serverid/devindex/config
                - name: python-script-get-group-id
                  mountPath: /mnt/configmap/get_group_id.py
                  subPath: get_group_id.py
                - name: python-script-update-server-conf
                  mountPath: /mnt/configmap/update_mindie_server_config.py
                  subPath: update_mindie_server_config.py
                - name: boot-bash-script
                  mountPath: /mnt/configmap/boot.sh
                  subPath: boot.sh
                - name: queue-schedule
                  mountPath: /var/queue_schedule
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            - name: data
              hostPath:
                path: /data
            - name: queue-schedule
              hostPath:
                path: /var/queue_schedule
            # defaultMode values are octal literals (0640 = rw-r-----,
            # 0550 = r-xr-x---). Keep them unquoted: the field is an integer,
            # and only YAML 1.1 style parsers read the leading 0 as octal.
            - name: global-ranktable
              configMap:
                name: global-ranktable
                defaultMode: 0640
            - name: mindie-server-config
              configMap:
                name: mindie-server-config
                defaultMode: 0640
            - name: mindie-http-client-ctl-config
              configMap:
                name: mindie-http-client-ctl-config
                defaultMode: 0640
            - name: python-script-get-group-id
              configMap:
                name: python-script-get-group-id
                defaultMode: 0640
            - name: python-script-update-server-conf
              configMap:
                name: python-script-update-server-conf
                defaultMode: 0640
            # The boot script is the only projected file with execute bits.
            - name: boot-bash-script
              configMap:
                name: boot-bash-script
                defaultMode: 0550
            # Memory-backed /dev/shm, capped at 4Gi.
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: 4Gi
    # The Worker pod template mirrors the Master template field for field;
    # keep the two in sync when editing either one.
    Worker:
      replicas: 1
      restartPolicy: Always
      template:
        metadata:
          labels:
            ring-controller.atlas: ascend-910b
            app: mindie-ms-server
            jobID: mindie-ms-test
        spec:
          nodeSelector:
            accelerator: huawei-Ascend910
          terminationGracePeriodSeconds: 10
          automountServiceAccountToken: false
          securityContext:
            fsGroup: 1001
          containers:
            - image: mindie:1.0.0-aarch64-800I-A2
              imagePullPolicy: IfNotPresent
              name: ascend
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop: ["ALL"]
                seccompProfile:
                  type: "RuntimeDefault"
              # Same probe.sh based probes as the Master container.
              readinessProbe:
                exec:
                  command:
                    - bash
                    - -c
                    - "$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh readiness"
                periodSeconds: 5
              livenessProbe:
                exec:
                  command:
                    - bash
                    - -c
                    - "$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh liveness"
                periodSeconds: 5
              startupProbe:
                exec:
                  command:
                    - bash
                    - -c
                    - "$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh startup"
                periodSeconds: 5
                failureThreshold: 100
              env:
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                - name: GLOBAL_RANK_TABLE_FILE_PATH
                  value: "/user/serverid/devindex/config/..data/global_ranktable.json"
                # NOTE(review): MINDIE_USER_HOME_PATH is not defined in this
                # manifest; presumably it comes from the common-env ConfigMap
                # loaded through envFrom -- verify.
                - name: MIES_INSTALL_PATH
                  value: $(MINDIE_USER_HOME_PATH)/Ascend/mindie/latest/mindie-service
                - name: CONFIG_FROM_CONFIGMAP_PATH
                  value: /mnt/configmap
              envFrom:
                - configMapRef:
                    name: common-env
              # Entry point: run the mounted boot script; string kept exactly
              # as authored.
              command: ["/bin/bash", "-c", "
                /mnt/configmap/boot.sh; \n
                "]
              resources:
                requests:
                  memory: "64Gi"
                  cpu: "16"
                  huawei.com/Ascend910: 8
                limits:
                  memory: "256Gi"
                  cpu: "64"
                  huawei.com/Ascend910: 8
              volumeMounts:
                - name: data
                  mountPath: /data
                  readOnly: true
                - name: mindie-server-config
                  mountPath: /mnt/configmap/config.json
                  subPath: config.json
                - name: mindie-http-client-ctl-config
                  mountPath: /mnt/configmap/http_client_ctl.json
                  subPath: http_client_ctl.json
                # Mounted as a whole directory (no subPath).
                - name: global-ranktable
                  mountPath: /user/serverid/devindex/config
                - name: python-script-get-group-id
                  mountPath: /mnt/configmap/get_group_id.py
                  subPath: get_group_id.py
                - name: python-script-update-server-conf
                  mountPath: /mnt/configmap/update_mindie_server_config.py
                  subPath: update_mindie_server_config.py
                - name: boot-bash-script
                  mountPath: /mnt/configmap/boot.sh
                  subPath: boot.sh
                - name: queue-schedule
                  mountPath: /var/queue_schedule
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            - name: data
              hostPath:
                path: /data
            - name: queue-schedule
              hostPath:
                path: /var/queue_schedule
            # Octal file modes; keep unquoted (integer field).
            - name: global-ranktable
              configMap:
                name: global-ranktable
                defaultMode: 0640
            - name: mindie-server-config
              configMap:
                name: mindie-server-config
                defaultMode: 0640
            - name: mindie-http-client-ctl-config
              configMap:
                name: mindie-http-client-ctl-config
                defaultMode: 0640
            - name: python-script-get-group-id
              configMap:
                name: python-script-get-group-id
                defaultMode: 0640
            - name: python-script-update-server-conf
              configMap:
                name: python-script-update-server-conf
                defaultMode: 0640
            - name: boot-bash-script
              configMap:
                name: boot-bash-script
                defaultMode: 0550
            # Memory-backed /dev/shm, capped at 4Gi.
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: 4Gi