# msmodelslim/docs/en/feature_guide/auto_precision_tuning/example/standing_high.yaml - code preview - MindStudio-ModelSlim: a model compression tool project based on the Ascend ecosystem - AtomGit

### This is a template for an auto-tuning config
### that uses the standing_high strategy, the vllm-ascend inference engine, and the aisbench evaluator.
### Before running this config, make sure you have replaced the necessary fields with your own values.

# Tuning strategy: the search strategy type, the anti-outlier candidates it
# iterates over, the quantization template it applies, and descriptive metadata.
strategy:
  type: standing_high # first fix the anti-outlier method and search rollback layers, then change the anti-outlier method and reduce rollback layers
  # Candidate anti-outlier strategies. Each top-level entry is itself a list
  # of processor configs (note the nested "- -" sequence).
  anti_outlier_strategies:
    - - type: "iter_smooth"
        alpha: 0.5
    - - type: "flex_smooth_quant"
  # Quantization template used by the strategy.
  template:
    runner: auto
    process: # exactly one linear_quant processor is supported (no more, no fewer)
      - type: linear_quant
        qconfig:
          # Activation quantization: per-tensor, asymmetric int8, min-max.
          act:
            scope: per_tensor
            dtype: int8
            symmetric: false
            method: minmax
          # Weight quantization: per-channel, symmetric int8, min-max.
          weight:
            scope: per_channel
            dtype: int8
            symmetric: true
            method: minmax
        include: [ "*" ] # glob is quoted; an unquoted * would be read as a YAML alias
        exclude: [ ] # empty list: nothing is excluded
    save:
      - type: ascendv1_saver
        part_file_size: 4
    # NOTE(review): presumably the calibration dataset file - confirm and
    # replace with your own if needed.
    dataset: mix_calib.jsonl
  metadata:
    config_id: standing_high
    # NOTE(review): w_bit / a_bit presumably describe the weight / activation
    # bit widths and should agree with the int8 qconfig above - confirm.
    label:
      w_bit: 8
      a_bit: 8
      is_sparse: false
      kv_cache: false

# Evaluation setup: the accuracy expectations (demand), the aisbench evaluator
# (nested evaluation), and the vllm-ascend inference engine that serves the model.
# NOTE(review): served_model_name, host and port each appear in more than one
# place below; presumably they must agree with each other - confirm.
evaluation:
  type: service_oriented
  demand:
    expectations:
      - dataset: gsm8k # replace with your own dataset; must match a dataset name defined under the nested evaluation.datasets section below
        target: "83" # replace with your own target (quoted string, matches Decimal JSON encoding)
        tolerance: "2" # replace with your own tolerance (quoted string, matches Decimal JSON encoding)
  evaluation:
    type: aisbench
    precheck: # optional, pre-validation configuration before formal evaluation
      - type: expected_answer # rule type identifier
        test_cases: # English only, must be non-empty strings
          - "What is 2+2?": ["4", "four"] # check if response contains "4" or "four"
          - "What is the capital of China?": "Beijing" # check if response contains "Beijing"
        max_tokens: 256 # optional, default 512
        timeout: 60.0 # optional, default 60.0 seconds
    # Global aisbench settings; per-dataset entries under datasets may override max_out_len.
    aisbench:
      binary: ais_bench
      mode: all
      timeout: 7200
      request_rate: 1.0
      retry: 2
      batch_size: 32 # replace with your own batch size, according to your dataset
      max_out_len: 512 # replace with your own max output length, according to your dataset
      trust_remote_code: false
      pred_postprocessor: extract_non_reasoning_content
      generation_kwargs:
        temperature: 0.5
        top_k: 10
        top_p: 0.9
        seed: null
        repetition_penalty: 1.03
        chat_template_kwargs:
          thinking: true
      model_meta:
        base_name: vllm_api_general_chat
        subdir: vllm_api
        abbr: vllm-api-general-chat
        attr: service
    # Datasets to evaluate, keyed by dataset name.
    datasets:
      gsm8k:
        config_name: "gsm8k_gen_0_shot_cot_str"
        mode: all
        # max_out_len not specified, so the global max_out_len (512) from the aisbench section is used
      aime25:
        config_name: "aime2025_gen_0_shot_chat_prompt"
        mode: all
      bfcl-simple:
        config_name: "BFCL_gen_simple"
        mode: all
        max_out_len: 1024  # dataset-specific max output length; overrides the global max_out_len
        returns_tool_calls: true
        api_chat_type: VLLMFunctionCallAPIChat
      # add other datasets that aisbench supports here
      # Note: max_out_len is optional for each dataset. If not specified, the global max_out_len from the aisbench section is used
    host: localhost
    port: 1234 # replace with your own port, equal to port in inference_engine.port
    served_model_name: served_model_name
  inference_engine:
    type: vllm-ascend
    entrypoint: vllm.entrypoints.openai.api_server
    # NOTE(review): the unquoted values below parse as YAML integers; quote
    # them (e.g. "0") if the consumer requires string environment values - confirm.
    env_vars:
      HCCL_BUFFSIZE: 1024
      ASCEND_RT_VISIBLE_DEVICES: 0 # replace with your own visible devices, e.g. 0,1,2,3 for 4 cards

    served_model_name: served_model_name
    host: localhost
    port: 1234 # replace with your own port, equal to the evaluator port above (the port next to host under the nested evaluation section)
    health_check_endpoint: /v1/models # to check if the inference engine is running
    startup_timeout: 600 # replace with your own startup timeout, e.g. deepseek-v3 needs 1800 seconds
    # Arguments for the inference engine; keys use kebab-case, unlike the rest of this file.
    args:
      enforce-eager: true # use single operator mode
      served-model-name: served_model_name
      trust-remote-code: true
      tensor-parallel-size: 1 # replace with your own tensor parallel size, according to your model and device
      data-parallel-size: 1 # replace with your own data parallel size, according to your model and device
      quantization: ascend
      enable-prefix-caching: false
      max-model-len: 8192 # replace with your own max model length, according to your dataset
      max-num-batched-tokens: 8192 # replace with your own max number of batched tokens, according to your dataset
      gpu-memory-utilization: 0.9
      enable-auto-tool-choice: true
      tool-call-parser: hermes
      additional_config:
        ascend_scheduler_config:
          enable: true
        enable_weight_nz_layout: true