# Test case identity: display name and the model under test.
test_name: 'test DeepSeek-R1-W8A8-longseq disaggregated_prefill'
model: 'vllm-ascend/DeepSeek-R1-0528-W8A8'

# Cluster shape: 2 nodes with 16 NPUs on each node.
num_nodes: 2
npu_per_node: 16

# Environment variables shared by every node of this test.
# NOTE(review): presumably exported by the test harness before each
# server_cmd is launched — confirm against the consumer of this file.
# Numeric and boolean values are intentionally left unquoted so their
# parsed types stay exactly as before.
env_common:
  # Port substituted into `--port $SERVER_PORT` in the server commands.
  SERVER_PORT: 8080

  # HCCL (collective communication) settings.
  HCCL_OP_EXPANSION_MODE: 'AIV'
  HCCL_BUFFSIZE: 768
  HCCL_DETERMINISTIC: true
  HCCL_OP_RETRY_ENABLE: 'L0:0, L1:0'

  # OpenMP threading.
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 1

  # PyTorch NPU runtime.
  PYTORCH_NPU_ALLOC_CONF: 'expandable_segments:True'
  TASK_QUEUE_ENABLE: 1

  # vLLM switches.
  VLLM_USE_MODELSCOPE: true
  VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT: 480



# Prefill/decode disaggregation: which hosts act as prefiller and decoder.
# NOTE(review): the values look like zero-based node indices (the test
# declares 2 nodes), presumably matching the order of the `deployment`
# entries — confirm against the harness.
disaggregated_prefill:
  enabled: true
  prefiller_host_index:
    - 0
  decoder_host_index:
    - 1



# Server launch commands, one list entry per command.
# NOTE(review): entries are presumably matched to nodes by position
# (entry 0 -> prefiller host, entry 1 -> decoder host) — confirm against
# the harness that consumes this file.
deployment:

  -

    # Prefill-side vLLM server: the KV-transfer config below declares
    # kv_role "kv_producer", engine_id "0" and kv_port "30000".
    # Parallel layout on the command line: data parallel 1, tensor
    # parallel 8, prefill context parallel 2, decode context parallel 8;
    # it runs with --enforce-eager.
    # $SERVER_PORT is not defined here; env_common sets SERVER_PORT: 8080
    # (presumably expanded by the shell/harness — confirm).
    # CAUTION: `>` starts a folded block scalar. Blank lines and
    # more-indented lines keep their line breaks, so the value is a
    # multi-line string. Do not add `#` comments or change indentation
    # inside it: that text would become part of the command.
    server_cmd: >

      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8

          --host 0.0.0.0

          --port $SERVER_PORT

          --data-parallel-size 1

          --decode-context-parallel-size 8

          --prefill-context-parallel-size 2

          --tensor-parallel-size 8

          --cp-kv-cache-interleave-size 128

          --enforce-eager

          --enable-expert-parallel

          --seed 1024

          --quantization ascend

          --max-num-seqs 3

          --max-model-len 32768

          --max-num-batched-tokens 16384

          --trust-remote-code

          --gpu-memory-utilization 0.85

          --enable-chunked-prefill

          --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'

          --kv-transfer-config

          '{"kv_connector": "MooncakeConnectorV1",

          "kv_role": "kv_producer",

          "kv_port": "30000",

          "engine_id": "0",

          "kv_connector_extra_config": {

                    "prefill": {

                            "dp_size": 1,

                            "tp_size": 8

                    },

                    "decode": {

                            "dp_size": 2,

                            "tp_size": 8

                    }

              }

          }'



  -

    # Decode-side vLLM server: the KV-transfer config below declares
    # kv_role "kv_consumer", engine_id "1" and kv_port "30100".
    # Parallel layout on the command line: data parallel 2, tensor
    # parallel 8, prefill context parallel 1, decode context parallel 2;
    # dp 2 / tp 8 agrees with the "decode" section of
    # kv_connector_extra_config.
    # No --enforce-eager on this command; --compilation_config requests
    # cudagraph_mode "FULL_DECODE_ONLY" with capture sizes 4, 8, 16, 32.
    # $SERVER_PORT is not defined here; env_common sets SERVER_PORT: 8080
    # (presumably expanded by the shell/harness — confirm).
    # CAUTION: `>` starts a folded block scalar. Blank lines and
    # more-indented lines keep their line breaks, so the value is a
    # multi-line string. Do not add `#` comments or change indentation
    # inside it: that text would become part of the command.
    server_cmd: >

      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8

        --host 0.0.0.0

        --port $SERVER_PORT

        --data-parallel-size 2

        --decode-context-parallel-size 2

        --prefill-context-parallel-size 1

        --tensor-parallel-size 8

        --cp-kv-cache-interleave-size 128

        --enable-expert-parallel

        --seed 1024

        --quantization ascend

        --max-num-seqs 8

        --max-model-len 32768

        --max-num-batched-tokens 256

        --trust-remote-code

        --gpu-memory-utilization 0.85

        --compilation_config '{"cudagraph_capture_sizes":[4,8,16,32],"cudagraph_mode": "FULL_DECODE_ONLY"}'

        --enable-chunked-prefill

        --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'

        --kv-transfer-config

        '{"kv_connector": "MooncakeConnectorV1",

        "kv_role": "kv_consumer",

        "kv_port": "30100",

        "engine_id": "1",

        "kv_connector_extra_config": {

                  "prefill": {

                          "dp_size": 1,

                          "tp_size": 8

                  },

                  "decode": {

                          "dp_size": 2,

                          "tp_size": 8

                  }

            }

        }'



# Benchmark cases executed for this test; `acc` is the only case defined.
benchmarks:
  acc:
    # What is run and how requests are issued.
    case_type: accuracy
    request_conf: vllm_api_general_chat

    # Dataset location and prompt configuration.
    dataset_path: 'vllm-ascend/gsm8k'
    dataset_conf: 'gsm8k/gsm8k_gen_0_shot_cot_chat_prompt'

    # Request sizing.
    batch_size: 16
    max_out_len: 24576

    # Expected score and tolerance.
    # NOTE(review): presumably the case passes when the measured accuracy
    # is within `threshold` of `baseline` — confirm against the harness.
    baseline: 95
    threshold: 5