{
"load_video_features": false,
"load_text_features": false,
"enable_encoder_dp": true,
"ae": {
"model_id": "wfvae",
"base_channels": 160,
"connect_res_layer_num": 1,
"decoder_energy_flow_hidden_size": 128,
"decoder_num_resblocks": 2,
"dropout": 0.0,
"encoder_energy_flow_hidden_size": 128,
"encoder_num_resblocks": 2,
"l1_dowmsample_block": "Spatial2xTime2x3DDownsample",
"l1_downsample_wavelet": "HaarWaveletTransform3D",
"l1_upsample_block": "Spatial2xTime2x3DUpsample",
"l1_upsample_wavelet": "InverseHaarWaveletTransform3D",
"l2_dowmsample_block": "Spatial2xTime2x3DDownsample",
"l2_downsample_wavelet": "HaarWaveletTransform3D",
"l2_upsample_block": "Spatial2xTime2x3DUpsample",
"l2_upsample_wavelet": "InverseHaarWaveletTransform3D",
"latent_dim": 32,
"vae_scale_factor": [8, 8, 8],
"norm_type": "aelayernorm",
"scale": [0.7031, 0.7109, 1.5391, 1.2969, 0.7109, 1.4141, 1.3828, 2.1719, 1.7266,
1.8281, 1.9141, 1.2031, 0.6875, 0.9609, 1.6484, 1.1875, 1.5312, 1.1328,
0.8828, 0.6836, 0.8828, 0.9219, 1.6953, 1.4453, 1.5312, 0.6836, 0.7656,
0.8242, 1.2344, 1.0312, 1.7266, 0.9492],
"shift": [-0.2129, 0.1226, 1.6328, 0.6211, -0.8750, 0.6172, -0.5703, 0.1348,
-0.2178, -0.9375, 0.3184, 0.3281, -0.0544, -0.1826, -0.2812, 0.4355,
0.1621, -0.2578, 0.7148, -0.7422, -0.2295, -0.2324, -1.4922, 0.6328,
1.1250, -0.2578, -2.1094, 1.0391, 1.1797, -1.2422, -0.2988, -0.9570],
"t_interpolation": "trilinear",
"use_attention": true,
"use_tiling": true,
"t_chunk_enc": 8,
"t_chunk_dec": 4,
"from_pretrained": "osp1_5_vae.pt",
"dtype": "fp32"
},
"text_encoder": [
{
"hub_backend": "hf",
"model_id": "T5",
"from_pretrained": "t5/t5-v1_1-xl",
"low_cpu_mem_usage": false,
"dtype": "fp16"
},
{
"hub_backend": "hf",
"model_id": "CLIPWithProjection",
"from_pretrained": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
"output_key": "text_embeds",
"low_cpu_mem_usage": false,
"dtype": "fp16"
}],
"tokenizer": [
{
"hub_backend": "hf",
"autotokenizer_name": "AutoTokenizer",
"from_pretrained": "t5/t5-v1_1-xl",
"low_cpu_mem_usage": false,
"model_max_length": 512
},
{
"hub_backend": "hf",
"autotokenizer_name": "AutoTokenizer",
"from_pretrained": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
"low_cpu_mem_usage": false,
"model_max_length": 77
}],
"predictor": {
"model_id": "SparseUMMDiT",
"num_layers": [2, 4, 6, 8, 6, 4, 2],
"sparse_n": [1, 2, 4, 8, 4, 2, 1],
"double_ff": true,
"sparse1d": true,
"num_heads": 24,
"head_dim": 128,
"in_channels": 32,
"out_channels": 32,
"timestep_embed_dim": 1024,
"caption_channels": 2048,
"pooled_projection_dim": 1280,
"skip_connection": true,
"skip_connection_zero_init": true,
"dropout": 0.0,
"attention_bias": true,
"patch_size_thw": [1, 2, 2],
"activation_fn": "gelu-approximate",
"norm_cls": "fp32_layer_norm",
"norm_elementwise_affine": true,
"norm_eps": 1e-06,
"dtype": "fp16"
},
"diffusion": {
"model_id": "OpenSoraPlan",
"weighting_scheme": "logit_normal",
"use_dynamic_shifting": true
},
"patch": {
"ae_float32": true,
"adaptive_clip_grad_norm": {
"clip_grad_ema_decay": 0.99
}
}
}