mindspeed-llm/MindSpeed-LLM/tests/pipeline/ut/inference/test_inference.json · Dnisde7/MindSpeed-LLM训练实战_0528 - AtomGit

Dnisde7: Add repository Ascend/MindSpeed-LLM until 26/05/28
{
    "test_llama2_mcore_prompt_greedy_search": [
        {
            "param": {
                "tensor-model-parallel-size": 8,
                "pipeline-model-parallel-size": 1,
                "use-mcore-models": null,
                "use-kv-cache": null,
                "use-flash-attn": null,
                "use-fused-swiglu": null,
                "use-fused-rmsnorm": null,
                "use-fused-rotary-pos-emb": null,
                "num-layers": 32,
                "hidden-size": 4096,
                "ffn-hidden-size": 11008,
                "position-embedding-type": "rope", 
                "seq-length": 4096, 
                "max-new-tokens": 30,
                "micro-batch-size": 1,
                "global-batch-size": 1,
                "num-attention-heads": 32,
                "max-position-embeddings": 4096,
                "make-vocab-size-divisible-by": 1,
                "swiglu": null,
                "normalization": "RMSNorm", 
                "load":"/data/ci/models/llama2/mg/llama2-tp8pp1",
                "tokenizer-type": "PretrainedFromHF",  
                "tokenizer-name-or-path":"/data/ci/models/llama2/hf/llama-2-7b-hf",
                "tokenizer-model": "/data/ci/models/llama2/hf/llama-2-7b-hf/tokenizer.model",
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null,
                "untie-embeddings-and-output-weights": null,
                "no-masked-softmax-fusion": null,
                "no-load-optim": null,
                "no-load-rng": null,
                "fp16": null,
                "task":"greedy",
                "npu-deterministic": null,
                "prompt-type": "llama2",
                "transformer-impl": "local"
            }
        }
    ],

    "test_llama2_mcore_do_sample_search": [
        {
            "param": {
                "tensor-model-parallel-size": 8,
                "pipeline-model-parallel-size": 1,
                "use-mcore-models": null,
                "use-kv-cache": null,
                "use-flash-attn": null,
                "use-fused-swiglu": null,
                "use-fused-rmsnorm": null,
                "use-fused-rotary-pos-emb": null,
                "num-layers": 32,
                "hidden-size": 4096,
                "ffn-hidden-size": 11008,
                "position-embedding-type": "rope",
                "seq-length": 4096,
                "max-new-tokens": 30,
                "micro-batch-size": 1,
                "global-batch-size": 1,
                "num-attention-heads": 32,
                "max-position-embeddings": 4096,
                "make-vocab-size-divisible-by": 1,
                "swiglu": null,
                "normalization": "RMSNorm",
                "load":"/data/ci/models/llama2/mg/llama2-tp8pp1",
                "tokenizer-type": "PretrainedFromHF",
                "tokenizer-name-or-path":"/data/ci/models/llama2/hf/llama-2-7b-hf",
                "tokenizer-model": "/data/ci/models/llama2/hf/llama-2-7b-hf/tokenizer.model",
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null,
                "untie-embeddings-and-output-weights": null,
                "no-masked-softmax-fusion": null,
                "no-load-optim": null,
                "no-load-rng": null,
                "fp16": null,
                "task":"do_sample",
                "npu-deterministic": null,
                "prompt-type": "llama2",
                "transformer-impl": "local"
            }
        }
    ],
    
    "test_llama2_mcore_do_beam_search": [
        {
            "param": {
                "tensor-model-parallel-size": 8,
                "pipeline-model-parallel-size": 1,
                "use-mcore-models": null,
                "use-kv-cache": null,
                "use-flash-attn": null,
                "use-fused-swiglu": null,
                "use-fused-rmsnorm": null,
                "use-fused-rotary-pos-emb": null,
                "num-layers": 32,
                "hidden-size": 4096,
                "ffn-hidden-size": 11008,
                "position-embedding-type": "rope",
                "seq-length": 4096,
                "max-new-tokens": 30,
                "micro-batch-size": 1,
                "global-batch-size": 1,
                "num-attention-heads": 32,
                "max-position-embeddings": 4096,
                "make-vocab-size-divisible-by": 1,
                "swiglu": null,
                "normalization": "RMSNorm",
                "load":"/data/ci/models/llama2/mg/llama2-tp8pp1",
                "tokenizer-type": "PretrainedFromHF",
                "tokenizer-name-or-path":"/data/ci/models/llama2/hf/llama-2-7b-hf",
                "tokenizer-model": "/data/ci/models/llama2/hf/llama-2-7b-hf/tokenizer.model",
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null,
                "untie-embeddings-and-output-weights": null,
                "no-masked-softmax-fusion": null,
                "no-load-optim": null,
                "no-load-rng": null,
                "fp16": null,
                "task":"beam_search",
                "npu-deterministic": null,
                "prompt-type": "llama2",
                "transformer-impl": "local"
            }
        }
    ],


    "test_llama3_mcore_greedy_search_with_tp2pp4sp": [
        {
            "param": {
                "tensor-model-parallel-size": 2,
                "pipeline-model-parallel-size": 4,
                "sequence-parallel": null,
                "use-mcore-models": null,
                "use-fused-swiglu": null,
                "use-fused-rmsnorm": null,
                "use-fused-rotary-pos-emb": null,
                "num-layers": 32,
                "hidden-size": 4096,
                "ffn-hidden-size": 14336,
                "position-embedding-type": "rope",
                "rotary-base": 500000, 
                "seq-length": 8192,
                "max-position-embeddings": 8192,
                "max-new-tokens": 50,
                "micro-batch-size": 1,
                "num-attention-heads": 32,
                "num-query-groups": 8,
                "group-query-attention": null,
                "make-vocab-size-divisible-by": 16032,
                "swiglu": null,
                "normalization": "RMSNorm", 
                "norm-epsilon": 1e-5,
                "hidden-dropout": 0,
                "attention-dropout": 0,
                "load": "/data/ci/models/llama3/mg/llama3-8b-mcore-tp2-pp4",
                "tokenizer-type": "PretrainedFromHF",  
                "tokenizer-name-or-path":"/data/ci/models/llama3/hf/llama-3-8b-hf",
                "tokenizer-model": "/data/ci/models/llama3/hf/llama-3-8b-hf/tokenizer.model", 
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null, 
                "untie-embeddings-and-output-weights": null, 
                "no-masked-softmax-fusion": null, 
                "no-load-optim": null, 
                "no-load-rng": null, 
                "bf16": null,
                "task":"greedy",
                "seed": 42,
                "npu-deterministic": null,
                "transformer-impl": "local"
            }
        }
    ],

    "test_llama3_mcore_beam_search_with_sampling_tp2pp4sp": [
        {
            "param": {
                "tensor-model-parallel-size": 2,
                "pipeline-model-parallel-size": 4,
                "sequence-parallel": null,
                "use-mcore-models": null,
                "use-fused-swiglu": null,
                "use-fused-rmsnorm": null,
                "use-fused-rotary-pos-emb": null,
                "num-layers": 32,
                "hidden-size": 4096,
                "ffn-hidden-size": 14336,
                "position-embedding-type": "rope",
                "rotary-base": 500000,
                "seq-length": 8192,
                "max-position-embeddings": 8192,
                "max-new-tokens": 50,
                "micro-batch-size": 1,
                "num-attention-heads": 32,
                "num-query-groups": 8,
                "group-query-attention": null,
                "make-vocab-size-divisible-by": 16032,
                "swiglu": null,
                "normalization": "RMSNorm",
                "norm-epsilon": 1e-5,
                "hidden-dropout": 0,
                "attention-dropout": 0,
                "load": "/data/ci/models/llama3/mg/llama3-8b-mcore-tp2-pp4",
                "tokenizer-type": "PretrainedFromHF",
                "tokenizer-name-or-path":"/data/ci/models/llama3/hf/llama-3-8b-hf",
                "tokenizer-model": "/data/ci/models/llama3/hf/llama-3-8b-hf/tokenizer.model",
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null,
                "untie-embeddings-and-output-weights": null,
                "no-masked-softmax-fusion": null,
                "no-load-optim": null,
                "no-load-rng": null,
                "bf16": null,
                "task":"beam_search_with_sampling",
                "seed": 42,
                "npu-deterministic": null,
                "transformer-impl": "local"
            }
        }
    ],
    "test_llama3_chat": [
        {
            "param": {
                "tensor-model-parallel-size": 2,
                "pipeline-model-parallel-size": 4,
                "sequence-parallel": null,
                "use-mcore-models": null,
                "use-fused-swiglu": null,
                "use-fused-rmsnorm": null,
                "use-fused-rotary-pos-emb": null,
                "num-layers": 32,
                "hidden-size": 4096,
                "ffn-hidden-size": 14336,
                "position-embedding-type": "rope",
                "rotary-base": 500000,
                "seq-length": 8192,
                "max-position-embeddings": 8192,
                "max-new-tokens": 50,
                "micro-batch-size": 1,
                "num-attention-heads": 32,
                "num-query-groups": 8,
                "group-query-attention": null,
                "make-vocab-size-divisible-by": 16032,
                "swiglu": null,
                "normalization": "RMSNorm",
                "norm-epsilon": 1e-5,
                "hidden-dropout": 0,
                "attention-dropout": 0,
                "load": "/data/ci/models/llama3/mg/llama3-8b-mcore-tp2-pp4",
                "tokenizer-type": "PretrainedFromHF",
                "tokenizer-name-or-path":"/data/ci/models/llama3/hf/llama-3-8b-hf",
                "tokenizer-model": "/data/ci/models/llama3/hf/llama-3-8b-hf/tokenizer.model",
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null,
                "untie-embeddings-and-output-weights": null,
                "no-masked-softmax-fusion": null,
                "no-load-optim": null,
                "no-load-rng": null,
                "bf16": null,
                "task":"chat",
                "seed": 42,
                "npu-deterministic": null,
                "transformer-impl": "local"
            }
        }
    ],
    "test_lingmini2_mcore_greedy_search": [
        {
            "param": {
                "use-mcore-models": null,
                "spec": ["mindspeed_llm.tasks.models.spec.bailing_spec", "layer_spec"],
                "tensor-model-parallel-size": 1,
                "pipeline-model-parallel-size": 1,
                "expert-model-parallel-size": 8,
                "use-flash-attn": null,
                "num-layers": 2,
                "hidden-size": 2048,
                "ffn-hidden-size": 5120,
                "seq-length": 2048,
                "task": "greedy",
                "max-new-tokens": 32,
                "micro-batch-size": 1,
                "global-batch-size": 16,
                "num-attention-heads": 16,
                "max-position-embeddings": 65536,
                "position-embedding-type": "rope",
                "swiglu": null,
                "load": "/data/ci/models/ling_v2/mg/ling-mini-2l",
                "tokenizer-type": "PretrainedFromHF",
                "tokenizer-name-or-path": "/data/ci/models/ling_v2/hf/ling-mini-base-2.0/",
                "bf16": null,
                "normalization": "RMSNorm",
                "untie-embeddings-and-output-weights": null,
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null,
                "no-masked-softmax-fusion": null,
                "use-distributed-optimizer": null,
                "npu-deterministic": null,
                "make-vocab-size-divisible-by": 1,
                "shape-order": "BNSD",
                "use-fused-swiglu": null,
                "use-fused-rmsnorm": null,
                "use-fused-rotary-pos-emb": null,
                "use-rotary-position-embeddings": null,
                "vocab-size": 157184,
                "padded-vocab-size": 157184,
                "rotary-base": 600000,
                "rotary-percent": 0.5,
                "norm-epsilon": 1e-6,
                "attention-dropout": 0.0,
                "hidden-dropout": 0.0,
                "group-query-attention": null,
                "num-query-groups": 4,
                "qk-layernorm": null,
                "moe-grouped-gemm": null,
                "moe-permutation-async-comm": null,
                "moe-token-dispatcher-type": "alltoall_seq",
                "first-k-dense-replace": 1,
                "moe-layer-freq": 1,
                "n-shared-experts": 1,
                "num-experts": 256,
                "moe-router-topk": 8,
                "moe-ffn-hidden-size": 512,
                "moe-router-load-balancing-type": "none",
                "moe-router-group-topk": 4,
                "moe-router-num-groups": 8,
                "moe-aux-loss-coeff": 0.001,
                "moe-router-topk-scaling-factor": 2.5,
                "moe-router-enable-expert-bias": null,
                "router-gating-in-fp32": null,
                "moe-alltoall-overlap-comm": null,
                "moe-router-score-function": "sigmoid",
                "norm-topk-prob": null,
                "transformer-impl": "local"
            }
        }
    ],

    "test_baichuan2_mcore_greedy_search": [
        {
            "param": {
                "tensor-model-parallel-size": 8,
                "pipeline-model-parallel-size": 1,
                "use-mcore-models": null,
                "use-kv-cache": null,
                "use-flash-attn": null,
                "num-layers": 40,
                "hidden-size": 5120,
                "ffn-hidden-size": 13696,
                "seq-length": 1024,
                "max-new-tokens": 30,
                "micro-batch-size": 1,
                "global-batch-size": 16,
                "num-attention-heads": 40,
                "max-position-embeddings": 2048 ,
                "position-embedding-type": "alibi",
                "square-alibi-mask": null,
                "fill-neg-inf": null,
                "swiglu": null,
                "transformer-impl": "local",
                "load": "/data/ci/models/baichuan2/mg/Baichuan2-13B-tp8pp1-mcore-hf",
                "tokenizer-type": "PretrainedFromHF",
                "tokenizer-name-or-path": "/data/ci/models/baichuan2/hf/baichuan2-13B-hf/",
                "tokenizer-not-use-fast": null,
                "fp16": null,
                "normalization": "RMSNorm" ,
                "untie-embeddings-and-output-weights": null,
                "disable-bias-linear": null,
                "attention-softmax-in-fp32": null,
                "no-load-optim": null,
                "no-load-rng": null,
                "no-masked-softmax-fusion": null,
                "no-gradient-accumulation-fusion": null,
                "exit-on-missing-checkpoint": null,
                "task": "greedy",
                "make-vocab-size-divisible-by": 32
            }
        }
    ]
}