{
"model_id": "Qwen2VlPipeline",
"img_context_token_id": 151655,
"image_encoder": {
"vision_encoder": {
"model_id": "qwen2vit",
"num_layers": 32,
"hidden_size": 1280,
"ffn_hidden_size": 5120,
"llm_hidden_size": 3584,
"num_attention_heads": 16,
"hidden_dropout": 0.0,
"attention_dropout": 0.0,
"in_channels": 3,
"patch_size": 14,
"spatial_merge_size": 2,
"temporal_patch_size": 2,
"layernorm_epsilon": 1e-06,
"normalization": "LayerNorm",
"fp16": false,
"bf16": true,
"params_dtype": "bf16",
"activation_func": "quick_gelu",
"freeze": true,
"use_fused_rotary_pos_emb": true,
"post_layer_norm": false,
"pipeline_num_layers": [32, 0, 0, 0]
},
"vision_projector": {
"model_id": "lnmlp",
"num_layers": 1,
"gated_linear_unit": false,
"bias_activation_fusion": false,
"add_bias_linear": true,
"input_size": 1280,
"hidden_size": 3584,
"ffn_hidden_size": 5120,
"activation_func": "gelu",
"bf16": true,
"params_dtype": "bf16",
"freeze": true,
"layernorm_epsilon": 1e-06,
"normalization": "LayerNorm"
}
},
"text_decoder": {
"model_id": "qwen2lm",
"num_layers": 28,
"pipeline_num_layers": [1, 10, 10, 7],
"hidden_size": 3584,
"ffn_hidden_size": 18944,
"num_attention_heads": 28,
"max_position_embeddings": 32768,
"vocab_size": 152064,
"rope_theta":1000000.0,
"untie_embeddings_and_output_weights": true,
"disable_bias_linear": true,
"attention_dropout": 0.0,
"init_method_std": 0.01,
"hidden_dropout": 0.0,
"position_embedding_type": "mrope",
"normalization": "RMSNorm",
"activation_func": "silu",
"use_fused_rotary_pos_emb": true,
"attention_softmax_in_fp32": true,
"params_dtype": "bf16",
"bf16": true,
"parallel_output": false,
"group_query_attention": true,
"num_query_groups": 4,
"mrope_section": [16, 24, 24],
"rope_scaling": null,
"gated_linear_unit": true,
"layernorm_epsilon": 1e-06,
"add_bias_linear":false,
"add_qkv_bias": true,
"sequence_parallel": false,
"tokenizer_type": "PretrainedFromHF",
"is_encoder_decoder": false,
"use_infer_fa": true
},
"text_encoder": null,
"video_encoder": null,
"dtype": "bf16",
"device": "npu",
"tokenizer": {
"hub_backend": "hf",
"autotokenizer_name": "AutoTokenizer",
"from_pretrained": "./Qwen2-VL-7B-Instruct"
},
"generation_config": {
"bos_token_id": 151643,
"do_sample": true,
"kv_cache":true,
"split_batch":false,
"output_attentions": false,
"output_hidden_states": false,
"max_length": 20,
"min_length": 0,
"min_new_tokens": null,
"constraints": null,
"prompt_lookup_num_tokens": null,
"guidance_scale": null,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"diversity_penalty": 0.0,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"encoder_repetition_penalty": 1.0,
"epsilon_cutoff": 0.0,
"eta_cutoff": 0.0,
"exponential_decay_length_penalty": null,
"forced_bos_token_id": null,
"forced_decoder_ids": null,
"forced_eos_token_id": null,
"length_penalty": 1.0,
"low_memory": null,
"max_time": null,
"no_repeat_ngram_size": 0,
"num_assistant_tokens": 5,
"num_assistant_tokens_schedule": "heuristic",
"num_beam_groups": 1,
"num_return_groups": 1,
"num_return_sequences": 1,
"output_scores": false,
"output_logits": null,
"penalty_alpha": null,
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict_in_generate": false,
"sequence_bias": null,
"suppress_tokens": null,
"typical_p": 1.0,
"force_words_ids": null,
"num_beams": 1,
"renormalize_logits": false,
"use_cache": true,
"eos_token_id": [
151645,
151643
],
"max_new_tokens": 256,
"pad_token_id": 151643,
"vision_start_token_id": 151652,
"image_token_id": 151655,
"video_token_id": 151656,
"temperature": 0.01,
"top_k": 1,
"top_p": 0.001,
"dola_layers": null,
"cache_implementation": null,
"cache_config": null,
"return_legacy_cache": null,
"min_p": null,
"token_healing": false,
"watermarking_config": null,
"decoder_start_token_id": null,
"max_matching_ngram_size": null,
"stop_strings": null
},
"patch":{
"infer_fa": true
},
"min_pixels": 1003520,
"max_pixels": 12845056,
"dataset_path": "./AI2D_TEST.tsv",
"evaluation_dataset":"ai2d_test",
"evaluation_model":"qwen2_vl_7b",
"result_output_path":"./evaluation_outputs/"
}