MindSpeed-MM/examples/cogvideox/i2v_1.0/inference_model_i2v.json-代码预览-MindSpeed-MM:基于昇腾芯片的多模态大模型训练套件项目 - AtomGit

Ii-robot !1029 [Refactor] configuration file reconstruction
f755d362 创建于 2025年6月9日 · 历史提交
{
    "ae": {
        "model_id": "contextparallelcasualvae",
        "from_pretrained": "5b-cogvideo/vae/3d-vae.pt",
        "cp_size": 1,
        "dtype": "float16",
        "z_channels": 16,
        "conv_padding": 0,
        "num_res_blocks": 3,
        "hidden_size_mult": [1,2,2,4],
        "encoder_attention": "",
        "encoder_nonlinearity": "swish",
        "encoder_conv_in": "ContextParallelCausalConv3d",
        "encoder_conv_out": "ContextParallelCausalConv3d",
        "encoder_mid_resnet": "ContextParallelResnetBlock3D",
        "encoder_resnet_blocks": [
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D"
        ],
        "encoder_spatial_downsample": [
            "DownSample3D",
            "DownSample3D",
            "DownSample3D",
            ""
        ],
        "encoder_temporal_downsample": [
            "",
            "",
            "",
            ""
        ],
        "decoder_attention": "",
        "decoder_nonlinearity": "swish",
        "decoder_conv_in": "ContextParallelCausalConv3d",
        "decoder_conv_out": "ContextParallelCausalConv3d",
        "decoder_mid_resnet": "ContextParallelResnetBlock3D",
        "decoder_resnet_blocks": [
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D",
            "ContextParallelResnetBlock3D"
        ],
        "decoder_spatial_upsample": [
            "",
            "Upsample3D",
            "Upsample3D",
            "Upsample3D"
        ],
        "decoder_temporal_upsample": [
            "",
            "",
            "",
            ""
        ],
        "encoder_gather_norm": true,
        "decoder_gather_norm": true,
        "use_quant_layer": false,
        "vae_scale_factor": [4, 8, 0.7]
    },
    "text_encoder": {
        "model_id": "T5",
        "hub_backend": "hf",
        "from_pretrained": "5b-cogvideo",
        "dtype": "float16",
        "low_cpu_mem_usage": true,
        "use_attention_mask": false
    },
    "tokenizer": {
        "hub_backend": "hf",
        "autotokenizer_name": "T5Tokenizer",
        "from_pretrained": "5b-cogvideo",
        "model_max_length": 226
    },
    "predictor": {
        "model_id": "satdit",
        "from_pretrained": null,
        "dtype": "float16",
        "num_layers": 42,
        "num_heads": 48,
        "head_dim": 64,
        "in_channels": 32,
        "out_channels": 16,
        "dropout": 0.0,
        "cross_attention_dim": null,
        "attention_bias": true,
        "input_size": [13, 60, 90],
        "patch_type": "2D",
        "patch_size": [1, 2, 2],
        "activation_fn": "gelu-approximate",
        "num_embeds_ada_norm": 1000,
        "norm_type": "qk_ln",
        "norm_elementwise_affine": true,
        "norm_eps": 1e-5,
        "caption_channels": null,
        "time_embed_dim": 512,
        "text_length": 226,
        "text_hidden_size": 4096,
        "concat_text_embed": true,
        "interpolation_scale": [1.0, 1.0, 1.0],
        "use_rope": true
    },
    "diffusion": {
        "model_id": "cogvideo_diffusion",
        "num_inference_steps": 50,
        "device": "npu",
        "sigma_sampler_config": {
            "uniform_sampling": true,
            "num_idx": 1000,
            "discretization_config": {
                "shift_scale": 1.0
            }
        },
        "denoiser_config": {
            "num_idx": 1000,
            "quantize_c_noise": false,
            "discretization_config": {
                "shift_scale": 1.0
            }
        },
        "scheduler_config": {
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "beta_start": 0.00085,
            "clip_sample": false,
            "clip_sample_range": 1.0,
            "num_train_timesteps": 1000,
            "prediction_type": "v_prediction",
            "rescale_betas_zero_snr": true,
            "sample_max_value": 1.0,
            "set_alpha_to_one": true,
            "steps_offset": 0,
            "timestep_spacing": "trailing",
            "trained_betas": null,
            "snr_shift_scale": 1.0
        }
    },

    "pipeline_config": {
        "use_dynamic_cfg": true,
        "input_size": [49, 480, 720],
        "guidance_scale": 6.0,
        "use_tiling": true
    },

    "frame_interval": 1,
    "save_path": "./cogvideox_samples_i2v/",
    "fps": 8,
    "prompt": "examples/cogvideox/samples_i2v_prompts.txt",
    "image": "examples/cogvideox/samples_i2v_images.txt",
    "use_prompt_preprocess": true,
    "pipeline_class": "CogVideoXPipeline",
    "device": "npu",
    "dtype": "float16"
}