MindSpeed-MM/examples/hunyuanvideo/i2v/model_hunyuanvideo_i2v.json-代码预览-MindSpeed-MM:基于昇腾芯片的多模态大模型训练套件项目 - AtomGit

Ii-robot!1029 [Refactor]configuration file reconstruction
f755d362创建于 2025年6月9日历史提交
{
    "load_video_features": true,
    "load_text_features": true,
    "task": "i2v",
    "ae": {
        "model_id": "autoencoder_kl_hunyuanvideo",
        "from_pretrained": "hunyuan-video-t2v-720p/vae/pytorch_model.pt",
        "dtype": "bf16",
        "latent_channels": 16,
        "block_out_channels": [128, 256, 512, 512],
        "layers_per_block": 2,
        "in_channels": 3,
        "norm_num_groups": 32,
        "out_channels": 3,
        "sample_size": 256,
        "sample_tsize": 64,
        "down_block_types": [
            "DownEncoderBlockCausal3D",
            "DownEncoderBlockCausal3D",
            "DownEncoderBlockCausal3D",
            "DownEncoderBlockCausal3D"
        ],
        "up_block_types": [
            "UpDecoderBlockCausal3D",
            "UpDecoderBlockCausal3D",
            "UpDecoderBlockCausal3D",
            "UpDecoderBlockCausal3D"
        ],
        "scaling_factor": 0.476986,
        "time_compression_ratio": 4,
        "mid_block_add_attention": true,
        "act_fn": "silu",
        "enable_tiling": true,
        "i2v_processor": {
            "processor_id": "hunyuanvideo_i2v_processor",
            "sematic_cond_drop_p": 0.0,
            "processor_path": "llava-llama-3-8b-v1_1-transformers"
        }
    },
    "tokenizer": [
        {
            "autotokenizer_name": "hunyuanMLLmTokenizer",
            "hub_backend": "hf",
            "from_pretrained": "llava-llama-3-8b-v1_1-transformers",
            "model_max_length": 256,
            "template_id": "hyv-llm-encode-video-i2v",
            "template_file_path": "examples/hunyuanvideo/template.json"
        },
        {
            "autotokenizer_name": "CLIPTokenizer",
            "hub_backend": "hf",
            "from_pretrained": "clip-vit-large-patch14",
            "model_max_length": 77
        }
    ],
    "text_encoder": [
        {
            "model_id": "HunyuanMLLmModel",
            "dtype": "bf16",
            "from_pretrained": "llava-llama-3-8b-v1_1-transformers",
            "model_type": "LlavaForConditionalGeneration",
            "hub_backend": "hf",
            "use_attention_mask": true,
            "hidden_state_skip_layer": 2,
            "output_key": "hidden_states",
            "template_id": "hyv-llm-encode-video-i2v",
            "template_file_path": "examples/hunyuanvideo/template.json",
            "using_kwargs": [
                "pixel_values"
            ]
        },
        {
            "model_id": "CLIP",
            "dtype": "fp16",
            "from_pretrained": "clip-vit-large-patch14",
            "hub_backend": "hf",
            "low_cpu_mem_usage": true,
            "use_attention_mask": true,
            "output_key": "pooler_output"
        }
    ],
    "diffusion": {
        "model_id": "hunyuanvideo_i2v_diffusion",
        "num_train_timesteps": 1000,
        "shift": 7.0,
        "reverse": true,
        "solver": "euler"
    },
    "predictor": {
        "model_id": "hunyuanvideodit",
        "from_pretrained": null,
        "dtype": "bf16",
        "patch_size": [1, 2, 2],
        "in_channels": 16,
        "out_channels": 16,
        "num_heads": 24,
        "head_dim": 128,
        "mlp_width_ratio": 4,
        "mlp_act_type": "gelu_tanh",
        "mm_double_blocks_depth": 20,
        "double_stream_full_recompute_layers": 20,
        "mm_single_blocks_depth": 40,
        "single_stream_full_recompute_layers": 40,
        "attention_async_offload": true,
        "rope_dim_list": [16, 56, 56],
        "qkv_bias": true,
        "qk_norm": true,
        "qk_norm_type": "rmsnorm",
        "guidance_embed": true,
        "text_projection": "single_refiner",
        "text_states_dim": [4096, 768],
        "use_attention_mask": true,
        "i2v_condition_type": "token_replace",
        "use_fused_rmsnorm": true
    }
}