MindSpeed-MM/examples/opensoraplan1.3/t2v/eval_t2v_model.json-代码预览-MindSpeed-MM:基于昇腾芯片的多模态大模型训练套件项目 - AtomGit

Ii-robot !1029 [Refactor] configuration file reconstruction
f755d362 创建于 2025年6月9日 · 历史提交
{
    "predictor": {
        "model_id": "videoditsparse",
        "from_pretrained": "./weights/sparsedit/sparsedit_mm.pth",
        "dtype": "fp16",
        "num_layers": 32,
        "num_heads": 24,
        "head_dim": 96,
        "in_channels": 8,
        "out_channels": 8,
        "dropout": 0.0,
        "cross_attention_dim": 2304,
        "attention_q_bias": true,
        "attention_k_bias": true,
        "attention_v_bias": true,
        "fa_layout": "sbh",
        "patch_size_thw": [1, 2, 2],
        "activation_fn": "gelu-approximate",
        "norm_elementwise_affine": false,
        "norm_eps": 1e-06,
        "caption_channels": 4096,
        "interpolation_scale": [1.0, 1.0, 1.0],
        "sparse1d": true,
        "sparse_n": 4
    },
    "ae": {
        "model_id": "wfvae",
        "from_pretrained": "./weights/vae/wfvae_mm.pt",
        "dtype": "fp16",
        "base_channels": 128,
        "decoder_energy_flow_hidden_size": 128,
        "decoder_num_resblocks": 2,
        "dropout": 0.0,
        "encoder_energy_flow_hidden_size": 64,
        "encoder_num_resblocks": 2,
        "latent_dim": 8,
        "use_attention": true,
        "norm_type": "aelayernorm",
        "t_interpolation": "trilinear",
        "vae_scale_factor": [4, 8, 8]
    },
    "text_encoder": {
        "model_id": "MT5",
        "hub_backend": "hf",
        "from_pretrained": "./weights/google/mt5-xxl",
        "dtype": "fp16"
    },
    "tokenizer": {
        "hub_backend": "hf",
        "autotokenizer_name": "AutoTokenizer",
        "from_pretrained": "./weights/google/mt5-xxl",
        "model_max_length": 512
    },

    "diffusion": {
        "model_id": "EulerAncestralDiscrete",
        "num_inference_steps": 100,
        "guidance_scale": 7.5,
        "prediction_type": "v_prediction",
        "rescale_betas_zero_snr": true,
        "device": "npu",
        "timestep_spacing": "trailing"
    },
    "pipeline_config": {
        "version": "v1.3",
        "use_attention_mask": true,
        "input_size": [93, 352, 640],
        "model_type": "t2v"
    },
    "frame_interval": 1,
    "save_path": "examples/opensoraplan1.3/t2v/samples/",
    "fps": 18,
    "prompt": "examples/opensoraplan1.3/t2v/samples_prompt.txt",
    "use_prompt_preprocess": false,
    "pipeline_class": "OpenSoraPlanPipeline",
    "device": "npu",
    "dtype": "fp16",
    "eval_config": {
        "dataset": {
            "type": "vbench_eval",
            "basic_param": {
                "data_path": "t2v-dataset/VBench_full_info.json",
                "data_folder": "vbench/prompts",
                "return_type": "list",
                "data_storage_mode": "standard"
            },
            "extra_param": {
                "augment": false,
                "prompt_file": "all_dimension.txt",
                "augmented_prompt_file": "augmented_prompts/gpt_enhanced_prompts/all_dimension_longer.txt",
                "ratio": "16-9"
            }
        },
        "dataloader_param": {
            "dataloader_mode": "sampler",
            "sampler_type": "SequentialSampler",
            "shuffle": false,
            "drop_last": false,
            "pin_memory": true,
            "group_frame": false,
            "group_resolution": false,
            "collate_param": {},
            "prefetch_factor": 4
        },
        "evaluation_model": "OpenSoraPlan-1.3",
        "evaluation_impl": "vbench_eval",
        "eval_type": "t2v",
        "long_eval_config": "path_to_long_eval_configs",
        "load_ckpt_from_local": true,
        "need_inference": true,
        "dimensions": [
            "subject_consistency",
            "background_consistency",
            "aesthetic_quality",
            "imaging_quality",
            "temporal_style",
            "overall_consistency",
            "human_action",
            "temporal_flickering",
            "motion_smoothness",
            "dynamic_degree",
            "appearance_style"
        ],
        "eval_result_path": "./eval_result"
    }
}