MindSpeed-MM/examples/cogvideox/i2v_1.5/eval_model_i2v_1.5.json-代码预览-MindSpeed-MM:基于昇腾芯片的多模态大模型训练套件项目 - AtomGit

ascend-robotstyle: pre-commit autofix cleancode (base check)
{
    "ae": {
        "model_id": "contextparallelcasualvae",
        "from_pretrained": "1.5b-cogvideo/vae/3d-vae.pt",
        "cp_size": 1,
        "dtype": "bf16",
        "z_channels": 16,
        "conv_padding": 0,
        "num_res_blocks": 3,
        "hidden_size_mult": [1,2,2,4],
        "encoder_attention": "",
        "encoder_nonlinearity": "swish",
        "encoder_conv_in": "ContextParallelCausalConv3d",
        "encoder_conv_out": "ContextParallelCausalConv3d",
        "encoder_mid_resnet": "ContextParallelResnetBlock3D",
        "encoder_resnet_blocks": [
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D"
        ],
        "encoder_spatial_downsample": [
            "DownSample3D",
            "DownSample3D",
            "DownSample3D",
            ""
        ],
        "encoder_temporal_downsample": [
            "",
            "",
            "",
            ""
        ],
        "decoder_attention": "",
        "decoder_nonlinearity": "swish",
        "decoder_conv_in": "ContextParallelCausalConv3d",
        "decoder_conv_out": "ContextParallelCausalConv3d",
        "decoder_mid_resnet": "ContextParallelResnetBlock3D",
        "decoder_resnet_blocks": [
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D"
        ],
        "decoder_spatial_upsample": [
            "",
            "Upsample3D",
            "Upsample3D",
            "Upsample3D"
        ],
        "decoder_temporal_upsample": [
            "",
            "",
            "",
            ""
        ],
        "encoder_gather_norm": true,
        "decoder_gather_norm": true,
        "use_quant_layer": false,
        "vae_scale_factor":[4, 8, 0.7]
    },
    "text_encoder": {
        "model_id": "T5",
        "hub_backend": "hf",
        "from_pretrained": "1.5b-cogvideo/t5-v1_1-xxl",
        "dtype": "bf16",
        "low_cpu_mem_usage": true,
        "use_attention_mask": false
    },
    "tokenizer":{
        "hub_backend": "hf",
        "autotokenizer_name": "T5Tokenizer",
        "from_pretrained": "1.5b-cogvideo/t5-v1_1-xxl",
        "model_max_length": 224
    },
    "predictor": {
        "model_id": "satdit",
        "from_pretrained": null,
        "dtype": "bf16",
        "num_layers": 42,
        "num_heads": 48,
        "head_dim": 64,
        "in_channels": 32,
        "out_channels": 16,
        "dropout": 0.0,
        "cross_attention_dim": null,
        "attention_bias": true,
        "input_size": [22, 96, 170],
        "patch_type": "3D",
        "patch_size": [2, 2, 2],
        "activation_fn": "gelu-approximate",
        "num_embeds_ada_norm": 1000,
        "norm_type": "qk_ln",
        "norm_elementwise_affine": true,
        "norm_eps": 1e-5,
        "norm_out_eps": 1e-5,
        "caption_channels": null,
        "time_embed_dim": 512,
        "text_length": 224,
        "text_hidden_size": 4096,
        "concat_text_embed": true,
        "interpolation_scale": [1.0, 1.0, 1.0],
        "use_rope": true,
        "ofs_embed_dim": 512
    },
    "diffusion": {
        "model_id": "cogvideo_diffusion",
        "num_inference_steps": 50,
        "device":"npu",
        "sigma_sampler_config": {
            "uniform_sampling": true,
            "num_idx": 1000,
            "discretization_config":{
                "shift_scale": 1.0
            }
        },
        "denoiser_config": {
            "num_idx": 1000,
            "quantize_c_noise": false,
            "discretization_config":{
                "shift_scale": 1.0
            }
        },
        "scheduler_config": {
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "beta_start": 0.00085,
            "clip_sample": false,
            "clip_sample_range": 1.0,
            "num_train_timesteps": 1000,
            "prediction_type": "v_prediction",
            "rescale_betas_zero_snr": true,
            "sample_max_value": 1.0,
            "set_alpha_to_one": true,
            "steps_offset": 0,
            "timestep_spacing": "trailing",
            "trained_betas": null,
            "snr_shift_scale": 1.0
        }
    },

    "pipeline_config": {
        "use_dynamic_cfg": true,
        "input_size": [81, 768, 1360],
        "guidance_scale": 6.0,
        "use_tiling": true,
        "version": 1.5,
        "vae_invert_scale_latents": true
    },

    "frame_interval": 1,
    "save_path":"./cogvideox_samples_i2v_1.5/",
    "fps": 16,
    "prompt":"examples/cogvideox/samples_i2v_prompts.txt",
    "image": "examples/cogvideox/samples_i2v_images.txt",
    "use_prompt_preprocess": true,
    "pipeline_class": "CogVideoXPipeline",
    "device":"npu",
    "dtype": "bf16",
    "eval_config": {
        "dataset": {
            "type": "vbench_i2v",
            "basic_param": {
                "data_path": "i2v-dataset/vbench2_i2v_full_info.json",
                "data_folder": "vbench2_i2v/data",
                "return_type": "list",
                "data_storage_mode": "standard"
            },
            "extra_param": {
                "ratio": "16-9"
            }
        },
        "dataloader_param": {
            "dataloader_mode": "sampler",
            "sampler_type": "SequentialSampler",
            "shuffle": true,
            "drop_last": false,
            "pin_memory": true,
            "group_frame": false,
            "group_resolution": false,
            "collate_param": {},
            "prefetch_factor": 4
        },
        "evaluation_model":"cogvideox-1.5",
        "evaluation_impl": "vbench_eval",
        "eval_type": "i2v",
        "load_ckpt_from_local": true,
        "need_inference": true,
        "dimensions": ["subject_consistency"],
        "eval_result_path": "./eval_result",
        "image_path": "vbench2_i2v/data/crop"
    }
}