{
"ae": {
"model_id": "contextparallelcasualvae",
"from_pretrained": "1.5b-cogvideo/vae/3d-vae.pt",
"cp_size": 1,
"dtype": "bf16",
"z_channels": 16,
"conv_padding": 0,
"num_res_blocks": 3,
"hidden_size_mult": [1,2,2,4],
"encoder_attention": "",
"encoder_nonlinearity": "swish",
"encoder_conv_in": "ContextParallelCausalConv3d",
"encoder_conv_out": "ContextParallelCausalConv3d",
"encoder_mid_resnet": "ContextParallelResnetBlock3D",
"encoder_resnet_blocks": [
"ContextParallelResnetBlock3D",
"ContextParallelResnetBlock3D",
"ContextParallelResnetBlock3D",
"ContextParallelResnetBlock3D"
],
"encoder_spatial_downsample": [
"DownSample3D",
"DownSample3D",
"DownSample3D",
""
],
"encoder_temporal_downsample": [
"",
"",
"",
""
],
"decoder_attention": "",
"decoder_nonlinearity": "swish",
"decoder_conv_in": "ContextParallelCausalConv3d",
"decoder_conv_out": "ContextParallelCausalConv3d",
"decoder_mid_resnet": "ContextParallelResnetBlock3D",
"decoder_resnet_blocks": [
"ContextParallelResnetBlock3D",
"ContextParallelResnetBlock3D",
"ContextParallelResnetBlock3D",
"ContextParallelResnetBlock3D"
],
"decoder_spatial_upsample": [
"",
"Upsample3D",
"Upsample3D",
"Upsample3D"
],
"decoder_temporal_upsample": [
"",
"",
"",
""
],
"encoder_gather_norm": true,
"decoder_gather_norm": true,
"use_quant_layer": false,
"vae_scale_factor":[4, 8, 0.7]
},
"text_encoder": {
"model_id": "T5",
"hub_backend": "hf",
"from_pretrained": "1.5b-cogvideo/t5-v1_1-xxl",
"dtype": "bf16",
"low_cpu_mem_usage": true,
"use_attention_mask": false
},
"tokenizer":{
"hub_backend": "hf",
"autotokenizer_name": "T5Tokenizer",
"from_pretrained": "1.5b-cogvideo/t5-v1_1-xxl",
"model_max_length": 224
},
"predictor": {
"model_id": "satdit",
"from_pretrained": null,
"dtype": "bf16",
"num_layers": 42,
"num_heads": 48,
"head_dim": 64,
"in_channels": 32,
"out_channels": 16,
"dropout": 0.0,
"cross_attention_dim": null,
"attention_bias": true,
"input_size": [22, 96, 170],
"patch_type": "3D",
"patch_size": [2, 2, 2],
"activation_fn": "gelu-approximate",
"num_embeds_ada_norm": 1000,
"norm_type": "qk_ln",
"norm_elementwise_affine": true,
"norm_eps": 1e-5,
"norm_out_eps": 1e-5,
"caption_channels": null,
"time_embed_dim": 512,
"text_length": 224,
"text_hidden_size": 4096,
"concat_text_embed": true,
"interpolation_scale": [1.0, 1.0, 1.0],
"use_rope": true,
"ofs_embed_dim": 512
},
"diffusion": {
"model_id": "cogvideo_diffusion",
"num_inference_steps": 50,
"device":"npu",
"sigma_sampler_config": {
"uniform_sampling": true,
"num_idx": 1000,
"discretization_config":{
"shift_scale": 1.0
}
},
"denoiser_config": {
"num_idx": 1000,
"quantize_c_noise": false,
"discretization_config":{
"shift_scale": 1.0
}
},
"scheduler_config": {
"beta_end": 0.012,
"beta_schedule": "scaled_linear",
"beta_start": 0.00085,
"clip_sample": false,
"clip_sample_range": 1.0,
"num_train_timesteps": 1000,
"prediction_type": "v_prediction",
"rescale_betas_zero_snr": true,
"sample_max_value": 1.0,
"set_alpha_to_one": true,
"steps_offset": 0,
"timestep_spacing": "trailing",
"trained_betas": null,
"snr_shift_scale": 1.0
}
},
"pipeline_config": {
"use_dynamic_cfg": true,
"input_size": [81, 768, 1360],
"guidance_scale": 6.0,
"use_tiling": true,
"version": 1.5,
"vae_invert_scale_latents": true
},
"frame_interval": 1,
"save_path":"./cogvideox_samples_i2v_1.5/",
"fps": 16,
"prompt":"examples/cogvideox/samples_i2v_prompts.txt",
"image": "examples/cogvideox/samples_i2v_images.txt",
"use_prompt_preprocess": true,
"pipeline_class": "CogVideoXPipeline",
"device":"npu",
"dtype": "bf16",
"eval_config": {
"dataset": {
"type": "vbench_i2v",
"basic_param": {
"data_path": "i2v-dataset/vbench2_i2v_full_info.json",
"data_folder": "vbench2_i2v/data",
"return_type": "list",
"data_storage_mode": "standard"
},
"extra_param": {
"ratio": "16-9"
}
},
"dataloader_param": {
"dataloader_mode": "sampler",
"sampler_type": "SequentialSampler",
"shuffle": true,
"drop_last": false,
"pin_memory": true,
"group_frame": false,
"group_resolution": false,
"collate_param": {},
"prefetch_factor": 4
},
"evaluation_model":"cogvideox-1.5",
"evaluation_impl": "vbench_eval",
"eval_type": "i2v",
"load_ckpt_from_local": true,
"need_inference": true,
"dimensions": ["subject_consistency"],
"eval_result_path": "./eval_result",
"image_path": "vbench2_i2v/data/crop"
}
}