{
"pipeline_class": "GlmPipeline",
"img_context_token_id": 151343,
"image_encoder": {
"vision_encoder": {
"model_id": "glm4v_vit",
"num_layers": 24,
"pipeline_num_layers": [24,0,0,0],
"hidden_size": 1536,
"attention_bias": false,
"out_hidden_size": 4096,
"ffn_hidden_size": 4096,
"llm_hidden_size": 1280,
"gated_linear_unit": true,
"add_bias_linear": false,
"num_attention_heads": 12,
"hidden_dropout": 0.0,
"attention_dropout": 0.0,
"in_channels": 3,
"patch_size": 14,
"spatial_merge_size": 2,
"temporal_patch_size": 2,
"layernorm_epsilon": 1e-06,
"normalization": "RMSNorm",
"image_size": 336,
"fp16": false,
"bf16": true,
"params_dtype": "bf16",
"activation_func": "silu",
"hidden_act": "silu",
"freeze": true,
"use_fused_rotary_pos_emb": true,
"post_layer_norm": false,
"tokens_per_second": 2,
"rms_norm_eps": 1e-05,
"window_attn_size": 112
},
"vision_projector": {
"model_id": "GlmMLP",
"num_layers": 1,
"gated_linear_unit": false,
"bias_activation_fusion": false,
"add_bias_linear": true,
"input_size": 1280,
"hidden_size": 2048,
"ffn_hidden_size": 5120,
"bf16": true,
"params_dtype": "bf16",
"freeze": true,
"layernorm_epsilon": 1e-06,
"normalization": "RMSNorm",
"out_hidden_size": 4096,
"intermediate_size": 13696,
"hidden_act": "silu"
}
},
"text_decoder": {
"model_id": "glm4v_lm",
"num_layers": 40,
"pipeline_num_layers": [7, 11, 11, 11],
"hidden_size": 4096,
"ffn_hidden_size": 13696,
"num_attention_heads": 32,
"max_position_embeddings": 128000,
"vocab_size": 151552,
"rope_theta": 10000.0,
"untie_embeddings_and_output_weights": true,
"disable_bias_linear": true,
"attention_dropout": 0.0,
"init_method_std": 0.01,
"hidden_dropout": 0.0,
"position_embedding_type": "mrope",
"normalization": "RMSNorm",
"activation_func": "silu",
"use_fused_rotary_pos_emb": true,
"partial_rotary_factor": 0.5,
"attention_softmax_in_fp32": true,
"params_dtype": "bf16",
"hidden_act": "silu",
"bf16": true,
"parallel_output": true,
"group_query_attention": true,
"num_query_groups": 2,
"num_key_value_heads": 2,
"mrope_section": [8, 12, 12],
"rope_scaling": null,
"gated_linear_unit": true,
"layernorm_epsilon": 1e-06,
"add_bias_linear":false,
"add_qkv_bias": true,
"use_remove_padding": false,
"rms_norm_eps": 1e-05,
"is_encoder_decoder": false,
"use_infer_fa": true
},
"text_encoder": null,
"video_encoder": null,
"dtype": "bf16",
"device": "npu",
"tokenizer": {
"hub_backend": "hf",
"autotokenizer_name": "AutoTokenizer",
"from_pretrained": "ckpt/hf_path/GLM4.1V-9B-Instruct"
},
"generation_config": {
"bos_token_id": 151643,
"do_sample": true,
"kv_cache":true,
"output_attentions": false,
"output_hidden_states": false,
"max_length": 20,
"min_length": 0,
"min_new_tokens": null,
"constraints": null,
"prompt_lookup_num_tokens": null,
"guidance_scale": null,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"diversity_penalty": 0.0,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"encoder_repetition_penalty": 1.0,
"epsilon_cutoff": 0.0,
"eta_cutoff": 0.0,
"exponential_decay_length_penalty": null,
"forced_bos_token_id": null,
"forced_decoder_ids": null,
"forced_eos_token_id": null,
"length_penalty": 1.0,
"low_memory": null,
"max_time": null,
"no_repeat_ngram_size": 0,
"num_assistant_tokens": 5,
"num_assistant_tokens_schedule": "heuristic",
"num_beam_groups": 1,
"num_return_groups": 1,
"num_return_sequences": 1,
"output_scores": false,
"output_logits": null,
"penalty_alpha": null,
"remove_invalid_values": false,
"repetition_penalty": 1.1,
"return_dict_in_generate": false,
"sequence_bias": null,
"spatial_merge_size": 2,
"suppress_tokens": null,
"typical_p": 1.0,
"force_words_ids": null,
"num_beams": 1,
"renormalize_logits": false,
"use_cache": true,
"eos_token_id": [
151329,
151336,
151338,
151348
],
"max_new_tokens": 8192,
"pad_token_id": 151329,
"vision_start_token_id": 151341,
"vision_end_token_id": 151342,
"image_token_id": 151343,
"video_token_id": 151656,
"temperature": 0,
"top_k": 2,
"top_p": 0,
"dola_layers": null,
"cache_implementation": null,
"cache_config": null,
"return_legacy_cache": null,
"min_p": null,
"token_healing": false,
"watermarking_config": null,
"decoder_start_token_id": null,
"max_matching_ngram_size": null,
"stop_strings": null
},
"patch":{
"infer_fa": true
},
"min_pixels": 1003520,
"max_pixels": 12845056,
"image_path": "examples/qwen2vl/demo.jpeg",
"prompts": "Describe this image and keep it within 100 words.",
"return_ids": true
}