MindSpeed-MM/examples/hunyuanvideo/i2v/inference_model.json-代码预览-MindSpeed-MM:基于昇腾芯片的多模态大模型训练套件项目 - AtomGit

Ii-robot!1029 [Refactor]configuration file reconstruction
f755d362 创建于 2025年6月9日 历史提交
{
    "ae": {
        "model_id": "autoencoder_kl_hunyuanvideo",
        "from_pretrained": "hunyuan-video-t2v-720p/vae/pytorch_model.pt",
        "dtype": "float16",
        "latent_channels": 16,
        "block_out_channels": [128, 256, 512, 512],
        "layers_per_block": 2,
        "in_channels": 3,
        "norm_num_groups": 32,
        "out_channels": 3,
        "sample_size": 256,
        "sample_tsize": 64,
        "down_block_types": [
            "DownEncoderBlockCausal3D",
            "DownEncoderBlockCausal3D",
            "DownEncoderBlockCausal3D",
            "DownEncoderBlockCausal3D"
        ],
        "up_block_types": [
            "UpDecoderBlockCausal3D",
            "UpDecoderBlockCausal3D",
            "UpDecoderBlockCausal3D",
            "UpDecoderBlockCausal3D"
        ],
        "scaling_factor": 0.476986,
        "time_compression_ratio": 4,
        "mid_block_add_attention": true,
        "act_fn": "silu"
    },
    "tokenizer": [
        {
            "autotokenizer_name": "hunyuanMLLmTokenizer",
            "hub_backend": "hf",
            "from_pretrained": "llava-llama-3-8b-v1_1-transformers",
            "model_max_length": 256,
            "template_id": "hyv-llm-encode-video-i2v",
            "template_file_path": "examples/hunyuanvideo/template.json"
        },
        {
            "autotokenizer_name": "CLIPTokenizer",
            "hub_backend": "hf",
            "from_pretrained": "clip-vit-large-patch14",
            "model_max_length": 77
        }
    ],
    "text_encoder": [
        {
            "model_id": "HunyuanMLLmModel",
            "dtype": "fp16",
            "from_pretrained": "llava-llama-3-8b-v1_1-transformers",
            "model_type": "LlavaForConditionalGeneration",
            "hub_backend": "hf",
            "use_attention_mask": true,
            "hidden_state_skip_layer": 2,
            "output_key": "hidden_states",
            "template_id": "hyv-llm-encode-video-i2v",
            "template_file_path": "examples/hunyuanvideo/template.json",
            "using_kwargs": [
                "pixel_values"
            ]
        },
        {
            "model_id": "CLIP",
            "dtype": "fp16",
            "from_pretrained": "clip-vit-large-patch14",
            "hub_backend": "hf",
            "low_cpu_mem_usage": true,
            "use_attention_mask": true,
            "output_key": "pooler_output"
        }
    ],
    "predictor": {
        "model_id": "hunyuanvideodit",
        "from_pretrained": null,
        "dtype": "bf16",
        "patch_size": [1, 2, 2],
        "in_channels": 16,
        "out_channels": 16,
        "num_heads": 24,
        "head_dim": 128,
        "mlp_width_ratio": 4,
        "mlp_act_type": "gelu_tanh",
        "mm_double_blocks_depth": 20,
        "mm_single_blocks_depth": 40,
        "rope_dim_list": [16, 56, 56],
        "qkv_bias": true,
        "qk_norm": true,
        "qk_norm_type": "rmsnorm",
        "guidance_embed": true,
        "text_projection": "single_refiner",
        "text_states_dim": [4096, 768],
        "use_attention_mask": true,
        "i2v_condition_type": "token_replace",
        "use_fused_rmsnorm": true
    },
    "diffusion": {
        "model_id": "flow_match_discrete_scheduler",
        "num_inference_timesteps": 50,
        "shift": 7.0,
        "reverse": true,
        "solver": "euler"
    },
    "pipeline_config": {
        "input_size": [65, 928, 512],
        "guidance_scale": 1.0,
        "guidance_rescale": 0.0,
        "embedded_guidance_scale": 6.0,
        "cpu_offload": true
    },
    "save_path": "./hunyuanvideo_i2v_samples/",
    "prompt": "examples/hunyuanvideo/i2v/samples_i2v_prompts.txt",
    "image": "examples/hunyuanvideo/i2v/samples_i2v_images.txt",
    "dtype": "bf16",
    "device": "npu",
    "pipeline_class": "HunyuanVideoPipeline",
    "frame_interval": 1,
    "fps": 24,
    "use_prompt_preprocess": false
}