# Test identity and cluster shape.
test_name: 'test Kimi-K2-Instruct-W8A8 2-nodes-dp4-tp8-torchair'
model: 'vllm-ascend/Kimi-K2-Instruct-W8A8'

# 2 nodes x 16 NPUs = 32 devices, which equals the data-parallel size (4)
# times the tensor-parallel size (8) passed to `vllm serve` in this file.
num_nodes: 2
npu_per_node: 16

# Environment variables shared by every deployment entry.
# NOTE(review): the bare true/false values load as YAML booleans and the
# numbers as integers, not strings -- confirm the harness stringifies them
# the way the consuming processes expect.
env_common:
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  # Referenced as $SERVER_PORT by the serving command's --port flag.
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 100
  NUMEXPR_MAX_THREADS: 128



# One entry per server process; presumably entries map to nodes in order
# (TODO confirm against the test harness).
deployment:
  # First entry: exposes the HTTP API on $SERVER_PORT and hosts 2 of the 4
  # data-parallel ranks, starting at rank 0, using $LOCAL_IP as the
  # data-parallel address.
  #
  # Keep the argument lines adjacent: inside a folded scalar (`>`) a blank
  # line becomes a literal newline, which would split the command into
  # separate lines instead of folding it into one.
  - server_cmd: >
      vllm serve "vllm-ascend/Kimi-K2-Instruct-W8A8"
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-start-rank 0
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 8
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 32
      --max-model-len 8192
      --max-num-batched-tokens 8192
      --quantization ascend
      --trust-remote-code
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.9
      --additional-config '{"torchair_graph_config":{"enabled":true}}'



  # Second entry: runs with --headless (no --host/--port) and hosts the
  # remaining 2 data-parallel ranks, starting at rank 2, using $MASTER_IP
  # as the data-parallel address on the same RPC port (13389).
  #
  # Keep the argument lines adjacent: inside a folded scalar (`>`) a blank
  # line becomes a literal newline, which would split the command into
  # separate lines instead of folding it into one.
  - server_cmd: >
      vllm serve "vllm-ascend/Kimi-K2-Instruct-W8A8"
      --headless
      --data-parallel-size 4
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address $MASTER_IP
      --data-parallel-rpc-port 13389
      --tensor-parallel-size 8
      --seed 1024
      --enable-expert-parallel
      --max-num-seqs 32
      --max-model-len 8192
      --max-num-batched-tokens 8192
      --quantization ascend
      --trust-remote-code
      --no-enable-prefix-caching
      --gpu-memory-utilization 0.9
      --additional-config '{"torchair_graph_config":{"enabled":true}}'



# Benchmark cases run against the deployed service.
benchmarks:
  # Performance case: 512 streaming chat prompts at batch size 64 and a
  # request rate of 11.2, with output capped at 256 tokens.
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 512
    max_out_len: 256
    batch_size: 64
    trust_remote_code: true
    request_rate: 11.2
    # NOTE(review): presumably the measured result is compared against
    # baseline scaled by threshold -- confirm against the harness.
    baseline: 1
    threshold: 0.97

  # Accuracy case: non-streaming chat requests on gsm8k-lite at batch size
  # 64, with output capped at 7680 tokens.
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 7680
    batch_size: 64
    # NOTE(review): presumably the score must stay within `threshold` of
    # `baseline` -- confirm against the harness.
    baseline: 95
    threshold: 5