apiVersion: v1
kind: ConfigMap
metadata:
  # The name must be "rings-config-" followed by the name of the AscendJob
  # (mindie-server-0 here). The "rings-config-" prefix cannot be modified.
  name: rings-config-mindie-server-0
  namespace: mindie
  labels:
    ring-controller.atlas: ascend-910b
    jobID: mindie-ms-test
    mx-consumer-cim: 'true'
data:
  # Initial placeholder content; the status starts out as "initializing".
  # NOTE(review): presumably rewritten by the ring controller once the job's
  # rank table is available — confirm.
  hccl.json: |
    {
        "status":"initializing"
    }
---
apiVersion: mindxdl.gitee.com/v1
kind: AscendJob
metadata:
  name: mindie-server-0
  namespace: mindie
  labels:
    framework: pytorch
    # Do not modify.
    app: mindie-ms-server
    # UID of the inference job; modify it according to your job.
    jobID: mindie-ms-test
    ring-controller.atlas: ascend-910b
    fault-scheduling: force
    fault-retry-times: '10000'
  annotations:
    # Configure according to the actual AI chip topology of the task.
    huawei.com/schedule_policy: chip8-node8
spec:
  # schedulerName and runPolicy.schedulingPolicy only take effect when
  # enableGangScheduling is true.
  schedulerName: volcano
  runPolicy:
    schedulingPolicy:
      # Must equal Master.replicas + Worker.replicas.
      minAvailable: 2
      queue: default
  successPolicy: AllWorkers
  replicaSpecs:
    # Master replica group: one pod created from the template below.
    Master:
      replicas: 1
      restartPolicy: Always
      template:
        metadata:
          labels:
            app: mindie-ms-server
            jobID: mindie-ms-test
            ring-controller.atlas: ascend-910b
        spec:
          # Optional node anti-affinity: at most one instance is deployed on
          # any single node. Uncomment the block below to enable it.
          # NOTE(review): the selector matches a `deploy-name` label that this
          # pod template does not set — confirm it is added elsewhere first.
          # affinity:
          #   podAntiAffinity:
          #     requiredDuringSchedulingIgnoredDuringExecution:
          #       - labelSelector:
          #           matchExpressions:
          #             - key: deploy-name
          #               operator: In
          #               values:
          #                 - mindie-server
          #         topologyKey: kubernetes.io/hostname
          nodeSelector:
            accelerator: huawei-Ascend910
            # hardware-type: "800I-A2-32G"
          automountServiceAccountToken: false
          terminationGracePeriodSeconds: 10
          securityContext:
            fsGroup: 1001
          containers:
            # The container name must remain "ascend"; do not modify it.
            - name: ascend
              image: mindie:1.0.0-aarch64-800I-A2
              imagePullPolicy: IfNotPresent
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop:
                    - ALL
                seccompProfile:
                  type: RuntimeDefault
              # Entry point: run the boot script mounted at /mnt/configmap/boot.sh.
              command:
                - /bin/bash
                - -c
                - " /mnt/configmap/boot.sh; \n "
              envFrom:
                - configMapRef:
                    name: common-env
              env:
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                # Points into the global-ranktable ConfigMap volume, which is
                # mounted as a directory at /user/serverid/devindex/config.
                - name: GLOBAL_RANK_TABLE_FILE_PATH
                  value: '/user/serverid/devindex/config/..data/global_ranktable.json'
                # NOTE(review): MINDIE_USER_HOME_PATH is not defined in this env
                # list; presumably it comes from the common-env ConfigMap — confirm.
                - name: MIES_INSTALL_PATH
                  value: '$(MINDIE_USER_HOME_PATH)/Ascend/mindie/latest/mindie-service'
                - name: CONFIG_FROM_CONFIGMAP_PATH
                  value: /mnt/configmap
              # All three probes run the same probe.sh every 5 seconds, passing
              # the probe type as its argument.
              readinessProbe:
                periodSeconds: 5
                exec:
                  command:
                    - bash
                    - -c
                    - '$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh readiness'
              livenessProbe:
                periodSeconds: 5
                exec:
                  command:
                    - bash
                    - -c
                    - '$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh liveness'
              # The startup probe tolerates up to 100 consecutive failures.
              startupProbe:
                periodSeconds: 5
                failureThreshold: 100
                exec:
                  command:
                    - bash
                    - -c
                    - '$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh startup'
              resources:
                requests:
                  cpu: '16'
                  memory: 64Gi
                  # Number of required NPUs. The maximum value is 16.
                  huawei.com/Ascend910: 8
                limits:
                  cpu: '64'
                  memory: 256Gi
                  # The value must be consistent with that in requests.
                  huawei.com/Ascend910: 8
              volumeMounts:
                - name: data
                  mountPath: /data
                  readOnly: true
                # Single files are projected into /mnt/configmap via subPath.
                - name: mindie-server-config
                  mountPath: /mnt/configmap/config.json
                  subPath: config.json
                - name: mindie-http-client-ctl-config
                  mountPath: /mnt/configmap/http_client_ctl.json
                  subPath: http_client_ctl.json
                # Mounted as a whole directory (no subPath).
                - name: global-ranktable
                  mountPath: /user/serverid/devindex/config
                - name: python-script-get-group-id
                  mountPath: /mnt/configmap/get_group_id.py
                  subPath: get_group_id.py
                - name: python-script-update-server-conf
                  mountPath: /mnt/configmap/update_mindie_server_config.py
                  subPath: update_mindie_server_config.py
                - name: boot-bash-script
                  mountPath: /mnt/configmap/boot.sh
                  subPath: boot.sh
                - name: queue-schedule
                  mountPath: /var/queue_schedule
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            # Host directories shared with the node.
            - name: data
              hostPath:
                path: /data
            - name: queue-schedule
              hostPath:
                path: /var/queue_schedule
            # ConfigMap-backed volumes; defaultMode values are octal file modes.
            - name: global-ranktable
              configMap:
                name: global-ranktable
                defaultMode: 0640
            - name: mindie-server-config
              configMap:
                name: mindie-server-config
                defaultMode: 0640
            - name: mindie-http-client-ctl-config
              configMap:
                name: mindie-http-client-ctl-config
                defaultMode: 0640
            - name: python-script-get-group-id
              configMap:
                name: python-script-get-group-id
                defaultMode: 0640
            - name: python-script-update-server-conf
              configMap:
                name: python-script-update-server-conf
                defaultMode: 0640
            # The boot script is the only ConfigMap file mounted with execute bits.
            - name: boot-bash-script
              configMap:
                name: boot-bash-script
                defaultMode: 0550
            # Memory-backed emptyDir mounted at /dev/shm, capped at 4Gi.
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: 4Gi
    # Worker replica group: one pod created from the template below.
    Worker:
      replicas: 1
      restartPolicy: Always
      template:
        metadata:
          labels:
            app: mindie-ms-server
            jobID: mindie-ms-test
            ring-controller.atlas: ascend-910b
        spec:
          # Optional node anti-affinity: at most one instance is deployed on
          # any single node. Uncomment the block below to enable it.
          # NOTE(review): the selector matches a `deploy-name` label that this
          # pod template does not set — confirm it is added elsewhere first.
          # affinity:
          #   podAntiAffinity:
          #     requiredDuringSchedulingIgnoredDuringExecution:
          #       - labelSelector:
          #           matchExpressions:
          #             - key: deploy-name
          #               operator: In
          #               values:
          #                 - mindie-server
          #         topologyKey: kubernetes.io/hostname
          nodeSelector:
            accelerator: huawei-Ascend910
            # hardware-type: "800I-A2-32G"
          automountServiceAccountToken: false
          terminationGracePeriodSeconds: 10
          securityContext:
            fsGroup: 1001
          containers:
            # The container name must remain "ascend"; do not modify it.
            - name: ascend
              image: mindie:1.0.0-aarch64-800I-A2
              imagePullPolicy: IfNotPresent
              securityContext:
                allowPrivilegeEscalation: false
                capabilities:
                  drop:
                    - ALL
                seccompProfile:
                  type: RuntimeDefault
              # Entry point: run the boot script mounted at /mnt/configmap/boot.sh.
              command:
                - /bin/bash
                - -c
                - " /mnt/configmap/boot.sh; \n "
              envFrom:
                - configMapRef:
                    name: common-env
              env:
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                # Points into the global-ranktable ConfigMap volume, which is
                # mounted as a directory at /user/serverid/devindex/config.
                - name: GLOBAL_RANK_TABLE_FILE_PATH
                  value: '/user/serverid/devindex/config/..data/global_ranktable.json'
                # NOTE(review): MINDIE_USER_HOME_PATH is not defined in this env
                # list; presumably it comes from the common-env ConfigMap — confirm.
                - name: MIES_INSTALL_PATH
                  value: '$(MINDIE_USER_HOME_PATH)/Ascend/mindie/latest/mindie-service'
                - name: CONFIG_FROM_CONFIGMAP_PATH
                  value: /mnt/configmap
              # All three probes run the same probe.sh every 5 seconds, passing
              # the probe type as its argument.
              readinessProbe:
                periodSeconds: 5
                exec:
                  command:
                    - bash
                    - -c
                    - '$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh readiness'
              livenessProbe:
                periodSeconds: 5
                exec:
                  command:
                    - bash
                    - -c
                    - '$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh liveness'
              # The startup probe tolerates up to 100 consecutive failures.
              startupProbe:
                periodSeconds: 5
                failureThreshold: 100
                exec:
                  command:
                    - bash
                    - -c
                    - '$MIES_INSTALL_PATH/scripts/http_client_ctl/probe.sh startup'
              resources:
                requests:
                  cpu: '16'
                  memory: 64Gi
                  # Number of required NPUs. The maximum value is 16.
                  huawei.com/Ascend910: 8
                limits:
                  cpu: '64'
                  memory: 256Gi
                  # The value must be consistent with that in requests.
                  huawei.com/Ascend910: 8
              volumeMounts:
                - name: data
                  mountPath: /data
                  readOnly: true
                # Single files are projected into /mnt/configmap via subPath.
                - name: mindie-server-config
                  mountPath: /mnt/configmap/config.json
                  subPath: config.json
                - name: mindie-http-client-ctl-config
                  mountPath: /mnt/configmap/http_client_ctl.json
                  subPath: http_client_ctl.json
                # Mounted as a whole directory (no subPath).
                - name: global-ranktable
                  mountPath: /user/serverid/devindex/config
                - name: python-script-get-group-id
                  mountPath: /mnt/configmap/get_group_id.py
                  subPath: get_group_id.py
                - name: python-script-update-server-conf
                  mountPath: /mnt/configmap/update_mindie_server_config.py
                  subPath: update_mindie_server_config.py
                - name: boot-bash-script
                  mountPath: /mnt/configmap/boot.sh
                  subPath: boot.sh
                - name: queue-schedule
                  mountPath: /var/queue_schedule
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            # Host directories shared with the node.
            - name: data
              hostPath:
                path: /data
            - name: queue-schedule
              hostPath:
                path: /var/queue_schedule
            # ConfigMap-backed volumes; defaultMode values are octal file modes.
            - name: global-ranktable
              configMap:
                name: global-ranktable
                defaultMode: 0640
            - name: mindie-server-config
              configMap:
                name: mindie-server-config
                defaultMode: 0640
            - name: mindie-http-client-ctl-config
              configMap:
                name: mindie-http-client-ctl-config
                defaultMode: 0640
            - name: python-script-get-group-id
              configMap:
                name: python-script-get-group-id
                defaultMode: 0640
            - name: python-script-update-server-conf
              configMap:
                name: python-script-update-server-conf
                defaultMode: 0640
            # The boot script is the only ConfigMap file mounted with execute bits.
            - name: boot-bash-script
              configMap:
                name: boot-bash-script
                defaultMode: 0550
            # Memory-backed emptyDir mounted at /dev/shm, capped at 4Gi.
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: 4Gi