{
    "load_video_features": false,
    "load_text_features": false,
    "enable_encoder_dp": true,
    "ae": {
        "model_id": "wfvae",
        "base_channels": 160,
        "connect_res_layer_num": 1,
        "decoder_energy_flow_hidden_size": 128,
        "decoder_num_resblocks": 2,
        "dropout": 0.0,
        "encoder_energy_flow_hidden_size": 128,
        "encoder_num_resblocks": 2,
        "l1_dowmsample_block": "Spatial2xTime2x3DDownsample",
        "l1_downsample_wavelet": "HaarWaveletTransform3D",
        "l1_upsample_block": "Spatial2xTime2x3DUpsample",
        "l1_upsample_wavelet": "InverseHaarWaveletTransform3D",
        "l2_dowmsample_block": "Spatial2xTime2x3DDownsample",
        "l2_downsample_wavelet": "HaarWaveletTransform3D",
        "l2_upsample_block": "Spatial2xTime2x3DUpsample",
        "l2_upsample_wavelet": "InverseHaarWaveletTransform3D",
        "latent_dim": 32,
        "vae_scale_factor": [8, 8, 8],
        "norm_type": "aelayernorm",
        "scale": [0.7031, 0.7109, 1.5391, 1.2969, 0.7109, 1.4141, 1.3828, 2.1719, 1.7266,
        1.8281, 1.9141, 1.2031, 0.6875, 0.9609, 1.6484, 1.1875, 1.5312, 1.1328,
        0.8828, 0.6836, 0.8828, 0.9219, 1.6953, 1.4453, 1.5312, 0.6836, 0.7656,
        0.8242, 1.2344, 1.0312, 1.7266, 0.9492],
        "shift": [-0.2129,  0.1226,  1.6328,  0.6211, -0.8750,  0.6172, -0.5703,  0.1348,
        -0.2178, -0.9375,  0.3184,  0.3281, -0.0544, -0.1826, -0.2812,  0.4355,
         0.1621, -0.2578,  0.7148, -0.7422, -0.2295, -0.2324, -1.4922,  0.6328,
         1.1250, -0.2578, -2.1094,  1.0391,  1.1797, -1.2422, -0.2988, -0.9570],
        "t_interpolation": "trilinear",
        "use_attention": true,
        "use_tiling": true,
        "t_chunk_enc": 8,
        "t_chunk_dec": 4,
        "from_pretrained": "osp1_5_vae.pt",
        "dtype": "fp32"
    },
    "text_encoder": [
        {
            "hub_backend": "hf",
            "model_id": "T5",
            "from_pretrained": "t5/t5-v1_1-xl",
            "low_cpu_mem_usage": false,
            "dtype": "fp16"
        },
        {
            "hub_backend": "hf",
            "model_id": "CLIPWithProjection",
            "from_pretrained": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
            "output_key": "text_embeds",
            "low_cpu_mem_usage": false,
            "dtype": "fp16"
        }],
    "tokenizer": [
        {
            "hub_backend": "hf",
            "autotokenizer_name": "AutoTokenizer",
            "from_pretrained": "t5/t5-v1_1-xl",
            "low_cpu_mem_usage": false,
            "model_max_length": 512
        },
        {
            "hub_backend": "hf",
            "autotokenizer_name": "AutoTokenizer",
            "from_pretrained": "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
            "low_cpu_mem_usage": false,
            "model_max_length": 77
        }],
    "predictor": {
        "model_id": "SparseUMMDiT",
        "num_layers": [2, 4, 6, 8, 6, 4, 2],
        "sparse_n": [1, 2, 4, 8, 4, 2, 1],
        "double_ff": true,
        "sparse1d": true,
        "num_heads": 24,
        "head_dim": 128,
        "in_channels": 32,
        "out_channels": 32,
        "timestep_embed_dim": 1024,
        "caption_channels": 2048,
        "pooled_projection_dim": 1280,
        "skip_connection": true,
        "skip_connection_zero_init": true,
        "dropout": 0.0,
        "attention_bias": true,
        "patch_size_thw": [1, 2, 2],
        "activation_fn": "gelu-approximate",
        "norm_cls": "fp32_layer_norm",
        "norm_elementwise_affine": true,
        "norm_eps": 1e-06,
        "dtype": "fp16"
    },
    "diffusion": {
        "model_id": "OpenSoraPlan",
        "weighting_scheme": "logit_normal",
        "use_dynamic_shifting": true
    },
    "patch": {
        "ae_float32": true,
        "adaptive_clip_grad_norm": {
            "clip_grad_ema_decay": 0.99
        }
    }
}