{
"ae": {
"model_id": "autoencoder_kl_hunyuanvideo",
"from_pretrained": "hunyuan-video-t2v-720p/vae/pytorch_model.pt",
"dtype": "float32",
"latent_channels": 16,
"block_out_channels": [128, 256, 512, 512],
"layers_per_block": 2,
"in_channels": 3,
"norm_num_groups": 32,
"out_channels": 3,
"sample_size": 256,
"sample_tsize": 64,
"down_block_types": [
"DownEncoderBlockCausal3D",
"DownEncoderBlockCausal3D",
"DownEncoderBlockCausal3D",
"DownEncoderBlockCausal3D"
],
"up_block_types": [
"UpDecoderBlockCausal3D",
"UpDecoderBlockCausal3D",
"UpDecoderBlockCausal3D",
"UpDecoderBlockCausal3D"
],
"scaling_factor": 0.476986,
"time_compression_ratio": 4,
"mid_block_add_attention": true,
"act_fn": "silu",
"enable_tiling": true
},
"text_encoder": [
{
"model_id": "Auto",
"dtype": "fp16",
"from_pretrained": "llava-llama-3-8b-text-encoder-tokenizer",
"hub_backend": "hf",
"use_attention_mask": true,
"hidden_state_skip_layer": 2,
"output_key": "hidden_states"
},
{
"model_id": "CLIP",
"dtype": "fp16",
"from_pretrained": "clip-vit-large-patch14",
"hub_backend": "hf",
"low_cpu_mem_usage": true,
"use_attention_mask": true,
"output_key": "pooler_output"
}
]
}