{

    "ae": {

        "model_id": "autoencoder_kl_hunyuanvideo",

        "from_pretrained": "hunyuan-video-t2v-720p/vae/pytorch_model.pt",

        "dtype": "float32",

        "latent_channels": 16,

        "block_out_channels": [128, 256, 512, 512],

        "layers_per_block": 2,

        "in_channels": 3,

        "norm_num_groups": 32,

        "out_channels": 3,

        "sample_size": 256,

        "sample_tsize": 64,

        "down_block_types": [

            "DownEncoderBlockCausal3D",

            "DownEncoderBlockCausal3D",

            "DownEncoderBlockCausal3D",

            "DownEncoderBlockCausal3D"

        ],

        "up_block_types": [

            "UpDecoderBlockCausal3D",

            "UpDecoderBlockCausal3D",

            "UpDecoderBlockCausal3D",

            "UpDecoderBlockCausal3D"

        ],

        "scaling_factor": 0.476986,

        "time_compression_ratio": 4,

        "mid_block_add_attention": true,

        "act_fn": "silu",

        "enable_tiling": true

    },

    "text_encoder": [

        {

            "model_id": "Auto",

            "dtype": "fp16",

            "from_pretrained": "llava-llama-3-8b-text-encoder-tokenizer",

            "hub_backend": "hf",

            "use_attention_mask": true,

            "hidden_state_skip_layer": 2,

            "output_key": "hidden_states"

        },

        {

            "model_id": "CLIP",

            "dtype": "fp16",

            "from_pretrained": "clip-vit-large-patch14",

            "hub_backend": "hf",

            "low_cpu_mem_usage": true,

            "use_attention_mask": true,

            "output_key": "pooler_output"

        }

    ]

}