{
"infer_data_type": "image",
"file_path": "./examples/internvl2.5/view.jpg",
"prompts": "Please describe the image shortly.",
"pipeline_class": "InternVLPipeline",
"from_pretrained": "./ckpt/model_optim_rng.pt",
"template": "internvl2_5",
"dtype": "bf16",
"device": "npu",
"pre_process": true,
"post_process": true,
"add_text_encoder": false,
"img_embedding_idx": 1,
"downsample_ratio": 0.5,
"select_layer": -1,
"ps_version": "v2",
"img_context_token_id": 151677,
"num_segments": 8,
"text_decoder": {
"model_id": "Qwen2.5llm",
"num_layers": 36,
"hidden_size": 2048,
"num_attention_heads": 16,
"num_query_groups": 2,
"ffn_hidden_size": 11008,
"kv_channels": 128,
"hidden_dropout": 0.0,
"attention_dropout": 0.0,
"layernorm_epsilon": 1e-06,
"normalization": "RMSNorm",
"qk_layernorm": false,
"add_bias_linear": false,
"add_qkv_bias": true,
"bias_activation_fusion": false,
"gated_linear_unit": true,
"init_method_std": 0.01,
"attention_softmax_in_fp32": true,
"masked_softmax_fusion": false,
"layernorm_zero_centered_gamma": false,
"bias_dropout_fusion":false,
"apply_rope_fusion": false,
"memory_efficient_layer_norm": false,
"max_position_embeddings": 4096,
"fp16": false,
"bf16": true,
"params_dtype": "bf16",
"fp16_lm_cross_entropy": false,
"rotary_percent": 1.0,
"rotary_base": 1000000,
"position_embedding_type": "rope",
"use_fused_rotary_pos_emb": false,
"rope_scaling": null,
"parallel_output": true,
"initializer_factor": 1.0,
"activation_func": "silu",
"vocab_size": 151674,
"is_encoder_decoder": false
},
"image_encoder": {
"vision_encoder": {
"model_id": "InternViT",
"num_layers": 24,
"hidden_size": 1024,
"ffn_hidden_size": 4096,
"num_attention_heads": 16,
"num_channels": 3,
"patch_size": 14,
"image_size": 448,
"add_qkv_bias": true,
"qk_layernorm": false,
"activation_func": "gelu",
"normalization": "LayerNorm",
"layernorm_epsilon": 1e-6,
"hidden_dropout": 0.0,
"drop_path_rate": 0.0,
"attention_dropout": 0.0,
"init_method_std": 0.02,
"initializer_factor": 1.0,
"output_hidden_states": false,
"use_return_dict": false,
"params_dtype": "bf16",
"post_layer_norm": false,
"downsample_ratio": 0.5,
"fp16": false,
"bf16": true,
"attention_softmax_in_fp32": false,
"select_layer": -1,
"ps_version": "v2",
"pre_tockens": 2147483647,
"next_tockens": 2147483647,
"freeze": true
},
"vision_projector": {
"model_id": "InternVLMLP",
"downsample_ratio": 0.5,
"vit_hidden_size": 1024,
"llm_hidden_size": 2048
}
},
"tokenizer":{
"hub_backend": "hf",
"autotokenizer_name": "AutoTokenizer",
"from_pretrained": "OpenGVLab/InternVL2_5-4B",
"add_eos_token": false,
"use_fast": false
},
"generation_config":{
"do_sample": false,
"bos_token_id": 151643,
"eos_token_id": 151645,
"pad_token_id": null,
"max_length": 20,
"max_new_tokens": 1024,
"temperature": 1.0,
"output_attentions":false,
"output_hidden_states":false,
"use_cache":false,
"decoder_start_token_id":null,
"min_new_tokens":null,
"min_length":0,
"constraints":null,
"num_beams":1,
"force_words_ids":null,
"top_k":50,
"top_p":1.0,
"prompt_lookup_num_tokens":null,
"guidance_scale":null,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"diversity_penalty": 0.0,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"encoder_repetition_penalty": 1.0,
"epsilon_cutoff": 0.0,
"eta_cutoff": 0.0,
"exponential_decay_length_penalty": null,
"forced_bos_token_id": null,
"forced_decoder_ids": null,
"forced_eos_token_id": null,
"length_penalty": 1.0,
"low_memory": null,
"max_time": null,
"no_repeat_ngram_size": 0,
"num_assistant_tokens": 5,
"num_assistant_tokens_schedule": "heuristic",
"num_beam_groups": 1,
"num_return_sequences": 1,
"output_scores": false,
"penalty_alpha": null,
"remove_invalid_values": false,
"renormalize_logits": false,
"repetition_penalty": 1.0,
"return_dict_in_generate": false,
"sequence_bias": null,
"suppress_tokens": null,
"typical_p": 1.0
},
"text_encoder": null,
"video_encoder": null
}