# Source: ERNIE-4.5-VL-28B-A3B-PT/modeling_ernie4_5_vl.py (code preview on AtomGit).
# ERNIE-4.5-VL-28B-A3B-PT: a MoE-based multimodal model for image-text understanding and generation.
# Commit 65c79c1c, created on 2025-08-08.
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Ernie VL model"""
import re
import math
import itertools
from dataclasses import dataclass
from collections import defaultdict
from copy import deepcopy
from functools import partial
from typing import List, Optional, Tuple, Union

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

from transformers.activations import ACT2FN
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import ModelOutput
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from .configuration_ernie4_5_vl import (
    DFNRopeVisionTransformerConfig,
    Ernie4_5_MoEConfig,
    Ernie4_5_VLMoEConfig,
)

logger = logging.get_logger(__name__)


__all__ = [
    "Ernie4_5_VLMoeForConditionalGeneration",
    "DFNRopeVisionTransformerPreTrainedModel",
    "VariableResolutionResamplerModel",
]


class TokenType:
    """Integer identifiers for the modality a token belongs to."""

    # text tokens -> 0, image tokens -> 1, video tokens -> 2
    text, image, video = 0, 1, 2


class UniqueNameGuard:
    """Context manager that hands out numbered, prefix-qualified names."""

    def __init__(self, prefix=""):
        self.prefix = prefix
        # Maps each base name to the most recent suffix index issued for it.
        self.counter = {}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; exceptions raised inside the block propagate.
        return None

    def get_unique_name(self, name):
        """Return ``f"{prefix}{name}_{k}"`` where ``k`` counts prior requests for ``name``.

        The first request for a given ``name`` yields suffix ``0``; each
        later request for the same ``name`` increments the suffix by one.
        """
        index = self.counter.get(name, -1) + 1
        self.counter[name] = index
        return f"{self.prefix}{name}_{index}"


class RopeEmbedding(nn.Module):
    """
    Rotary Position Embedding (RoPE) implementation for transformer models.

    RoPE encodes absolute positional information with rotation matrices and
    naturally incorporates relative position information in self-attention.

    Args:
        head_dim (int): Dimension size of each attention head
        compression_ratio (float, optional): Sequence length compression ratio. Defaults to 1.0.
        base (int, optional): Base value for frequency calculation. Defaults to 10000.
        freq_allocation (int, optional): Number of rotary frequencies (taken from
            the end of the frequency table) driven by the first position
            component in ``apply_rotary_3d``. Defaults to 0.

    Attributes:
        head_dim (int): Dimension size of each attention head
        compression_ratio (float): Sequence length compression factor
        base (int): Base value for frequency calculation
        freq_allocation (int): Number of frequencies allocated to time
    """

    def __init__(self, head_dim, compression_ratio=1.0, base=10000, freq_allocation=0):
        """
        Initialize RoPE embedding layer.

        Args:
            head_dim: Dimension of each attention head
            compression_ratio: Scaling factor for position indices
            base: Base value for frequency calculation
            freq_allocation: Number of frequencies allocated to time
        """
        super().__init__()
        self.head_dim = head_dim
        self.compression_ratio = compression_ratio
        self.base = base

        # num of freq allocated to time
        self.freq_allocation = freq_allocation

    def forward(self, seq_length, position_ids=None):
        """
        Compute rotary position embeddings for given sequence length.

        Args:
            seq_length (int): Maximum sequence length; only used when
                ``position_ids`` is None.
            position_ids (Tensor, optional): Custom position indices. When
                given, the sequence length is taken from its last dimension.
                Defaults to None.

        Returns:
            Tensor: Detached embeddings of shape [-1, 1, seq_length, head_dim];
            the first half of the last dimension holds the sines, the second
            half the cosines.
        """
        # Build the frequency table on the same device as the supplied
        # position ids so the product below never mixes devices.
        device = None if position_ids is None else position_ids.device
        indices = torch.arange(
            0, self.head_dim, 2, dtype=torch.float32, device=device
        )
        indices = 1 / self.base ** (indices / self.head_dim)
        if position_ids is None:
            position_ids = torch.arange(
                0, seq_length, 1, dtype=torch.float32
            ).unsqueeze(1)
            position_ids = position_ids / self.compression_ratio
            sinusoid_inp = position_ids * indices.unsqueeze(0)
        else:
            position_ids = position_ids / self.compression_ratio
            seq_length = position_ids.shape[-1]
            sinusoid_inp = position_ids.unsqueeze(-1).to(
                torch.float32
            ) * indices.unsqueeze(0)
        pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
        pos_emb = pos_emb.view(-1, 1, seq_length, self.head_dim)
        pos_emb = pos_emb.detach()
        return pos_emb

    @staticmethod
    def _rotate_pairs(x, cos_pos, sin_pos):
        """Rotate adjacent feature pairs of ``x`` and return a float32 tensor."""
        # rotate_half layout: [-x1, x0, -x3, x2, ..., -x(d-1), x(d-2)]
        rotated = torch.stack([-x[..., 1::2], x[..., 0::2]], dim=-1).reshape(x.shape)
        return (x.to(torch.float32) * cos_pos) + (rotated.to(torch.float32) * sin_pos)

    def _gather_thw(self, table, position_ids):
        """Select per-token frequencies from ``table`` for the three position components."""
        batch = position_ids.shape[0]
        batch_indices = torch.arange(end=batch, device=position_ids.device)[..., None]
        table = table.tile(batch, 1, 1, 1).to(device=position_ids.device)

        # The last `freq_allocation` frequencies follow component 0; the
        # remaining ones alternate between component 1 (even slots) and
        # component 2 (odd slots).
        spatial_end = self.head_dim // 2 - self.freq_allocation
        part_t = table[batch_indices, position_ids[..., 0], :, -self.freq_allocation :]
        part_h = table[batch_indices, position_ids[..., 1], :, :spatial_end:2]
        part_w = table[batch_indices, position_ids[..., 2], :, 1:spatial_end:2]
        part_hw = torch.stack([part_h, part_w], dim=-1).reshape(
            part_h.shape[:-1] + (part_h.shape[-1] * 2,)
        )
        return torch.cat([part_hw, part_t], dim=-1)

    def apply_rotary(self, rp, q, k):
        """
        Apply rotary position embeddings to queries and keys.

        Args:
            rp (Tensor): Rotary position embeddings
            q (Tensor): Query tensor [batch, heads, seq_len, dim]
            k (Tensor): Key tensor [batch, heads, seq_len, dim]

        Returns:
            Tuple[Tensor, Tensor]: Rotated queries and keys (float32)
        """
        sin, cos = torch.chunk(rp, 2, dim=-1)
        # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
        sin_pos = torch.stack([sin, sin], dim=-1).reshape(rp.shape)
        # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
        cos_pos = torch.stack([cos, cos], dim=-1).reshape(rp.shape)
        query = self._rotate_pairs(q, cos_pos, sin_pos)
        key = self._rotate_pairs(k, cos_pos, sin_pos)
        return query, key

    def apply_rotary_3d(self, rp, q, k, position_ids):
        """
        Apply rotary embeddings indexed by three-component position ids.

        Args:
            rp: [1, max_seqlen, 1, head_dim]
            q: [bsz, seqlen, head, head_dim]
            k: [bsz, seqlen, head, head_dim]
            position_ids: [bsz, seqlen, 3]

        Returns:
            Tuple[Tensor, Tensor]: Rotated queries and keys (float32), on the
            device of ``q``.
        """
        current_device = q.device
        sin, cos = torch.chunk(rp, 2, dim=-1)
        assert position_ids.shape[:1] == q.shape[:1]
        assert self.freq_allocation != 0

        sin_thw = self._gather_thw(sin, position_ids)
        cos_thw = self._gather_thw(cos, position_ids)

        # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
        sin_pos = (
            torch.stack([sin_thw, sin_thw], dim=-1)
            .reshape(sin_thw.shape[:3] + (sin_thw.shape[-1] * 2,))
            .to(current_device)
        )
        # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1]
        cos_pos = (
            torch.stack([cos_thw, cos_thw], dim=-1)
            .reshape(cos_thw.shape[:3] + (cos_thw.shape[-1] * 2,))
            .to(current_device)
        )

        query = self._rotate_pairs(q, cos_pos, sin_pos)
        key = self._rotate_pairs(k, cos_pos, sin_pos)
        return query, key


class Ernie4_5_MLP(nn.Module):
    """
    Gated feed-forward block used in the Ernie model.

    Computes ``down_proj(silu(gate_proj(x)) * up_proj(x))``.
    """

    def __init__(self, config, layer_idx=0):
        """
        Build the three linear projections of the gated MLP.

        Args:
            config (Ernie4_5_Config): Supplies ``hidden_size``,
                ``intermediate_size`` and ``use_bias``.
            layer_idx (int): Index of current layer (default: 0); not used here.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        with_bias = config.use_bias
        hidden, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(hidden, inner, bias=with_bias)
        self.up_proj = nn.Linear(hidden, inner, bias=with_bias)
        self.down_proj = nn.Linear(inner, hidden, bias=with_bias)

    def forward(self, x):
        """
        Run the gated MLP on ``x``.

        Args:
            x (Tensor): Input whose last dimension is ``hidden_size``.

        Returns:
            Tensor: Output with the same leading dimensions and a last
            dimension of ``hidden_size``.
        """
        # Move the input next to the weights before projecting.
        x = x.to(self.gate_proj.weight.data.device)
        gate = F.silu(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)


class Ernie4_5_Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config, layer_idx=0):
        """Initialize the attention layer.

        Args:
            config (Ernie4_5_Config): Model configuration.
            layer_idx (int, optional): Index in transformer stack. Defaults to 0.
        """
        super().__init__()
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.is_gqa = (
            self.num_key_value_heads is not None
            and self.num_key_value_heads != self.num_heads
        )

        self.freq_allocation = getattr(config, "freq_allocation", 0)
        assert (
            self.freq_allocation is not None
        ), "freq_allocation must be provided if rope_3d is on."

        if config.tensor_parallel_degree > 1:
            assert (
                self.num_heads % config.tensor_parallel_degree == 0
            ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
            self.num_heads = self.num_heads // config.tensor_parallel_degree
            if self.is_gqa:
                assert (
                    self.num_key_value_heads % config.tensor_parallel_degree == 0
                ), f"num_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
                self.num_key_value_heads = (
                    self.num_key_value_heads // config.tensor_parallel_degree
                )
        q_hidden_size = self.head_dim * self.num_heads
        if self.is_gqa:
            logger.info(
                f"use GQA - num_heads: {self.num_heads}- num_key_value_heads: {self.num_key_value_heads}"
            )
            assert (
                self.num_heads % self.num_key_value_heads == 0
            ), f"num_heads: {self.num_heads}, num_key_value_heads: {self.num_key_value_heads}"
            kv_hidden_size = self.head_dim * self.num_key_value_heads
        else:
            kv_hidden_size = self.head_dim * self.num_heads

        self.q_proj = nn.Linear(self.hidden_size, q_hidden_size, bias=config.use_bias)
        self.k_proj = nn.Linear(self.hidden_size, kv_hidden_size, bias=config.use_bias)
        self.v_proj = nn.Linear(self.hidden_size, kv_hidden_size, bias=config.use_bias)

        self.o_proj = nn.Linear(
            self.hidden_size,
            self.hidden_size,
            bias=config.use_bias,
        )

        self.rotary_emb = RopeEmbedding(
            self.head_dim,
            compression_ratio=config.compression_ratio,
            base=config.rope_theta,
            freq_allocation=self.freq_allocation,
        )
        self.config = config
        if self.config.use_flash_attention:
            self.attn_func = self._flash_attention_wrapper
        else:
            self.attn_func = self.core_attn

    def forward(
        self,
        hidden_states,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attn_mask_start_row_indices: Optional[torch.Tensor] = None,
        position_ids: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        token_type_ids: Optional[Tuple[torch.Tensor]] = None,  # MLLM
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute attention outputs.

        Args:
            hidden_states (torch.Tensor): Input tensor [bsz, seq_len, hidden_size]
            past_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): Cached key/value states
            attention_mask (Optional[torch.Tensor]): Attention mask tensor
            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length attention indices
            position_ids (Optional[torch.Tensor]): Position indices for RoPE
            output_attentions (bool): Return attention weights if True
            use_cache (bool): Cache key/value states if True

        Returns:
            Tuple containing:
                - attention_output: [bsz, seq_len, hidden_size]
                - attention_weights: Optional attention probabilities
                - updated_key_value_cache: Optional updated cache
        """
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, :-1]

        bsz, q_len, _ = hidden_states.shape
        query_states = self.q_proj(hidden_states).reshape(
            [bsz, q_len, -1, self.head_dim]
        )
        key_states = self.k_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim])
        value_states = self.v_proj(hidden_states).reshape(
            [bsz, q_len, -1, self.head_dim]
        )

        attn_output, attn_weights, past_key_value = self.rope_attn(
            query_states=query_states,
            key_states=key_states,
            value_states=value_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            past_key_value=past_key_value,
            use_cache=use_cache,
            attn_mask_start_row_indices=attn_mask_start_row_indices,
        )
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def repeat_kv(self, hidden_states, n_rep):
        """
        This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
        num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
        """
        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states[:, :, None, :, :].expand(
            batch, num_key_value_heads, n_rep, slen, head_dim
        )
        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

    def _flash_attention_wrapper(
        self,
        q,
        k,
        v,
        attention_mask=None,
        attn_mask_start_row_indices=None,
        seq_length=None,
    ):
        """Wrapper for flash attention implementation.
        Args:
            q (torch.Tensor): Query tensor
            k (torch.Tensor): Key tensor
            v (torch.Tensor): Value tensor
            attention_mask (Optional[torch.Tensor]): Attention mask
            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
            seq_length (Optional[int]): Sequence length
        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Attention output and weights
        """
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
            out = F.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=None,
                dropout_p=self.config.attention_probs_dropout_prob,
                is_causal=q.shape[-2] == k.shape[-2],
                scale=1
                / (getattr(self.config, "scale_qk_coeff", 1.0) * self.head_dim**0.5),
                enable_gqa=self.is_gqa,
            )
        out = out.transpose(1, 2)
        out = out.contiguous().view(out.size(0), out.size(1), -1)

        return out, None

    def core_attn(
        self,
        q,
        k,
        v,
        attention_mask=None,
        attn_mask_start_row_indices=None,
        seq_length=None,
    ):
        """Standard self-attention implementation.

        Args:
            q (torch.Tensor): Query tensor
            k (torch.Tensor): Key tensor
            v (torch.Tensor): Value tensor
            attention_mask (Optional[torch.Tensor]): Attention mask
            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
            seq_length (Optional[int]): Sequence length

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Attention output and weights
        """
        origin_dtype = q.dtype

        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        scale_qk_coeff = getattr(self.config, "scale_qk_coeff", 1.0) * (
            self.head_dim**0.5
        )

        q = q / scale_qk_coeff

        # Handle GQA case - repeat k and v heads to match q heads
        if self.is_gqa:
            # [batch, num_key_value_heads, seq_len, head_dim] -> [batch, num_heads, seq_len, head_dim]
            repeat_factor = self.num_heads // self.num_key_value_heads
            k = self.repeat_kv(k, repeat_factor)
            v = self.repeat_kv(v, repeat_factor)

        product = torch.matmul(q, k.transpose(-2, -1))

        product = product.to(torch.float32)
        if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
            product = product * getattr(self.config, "scale_qk_coeff", 1.0)

        seq_len = product.size(-1)
        mask = torch.triu(
            torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
            diagonal=1,
        )
        product = product.masked_fill(mask, float("-inf"))
        weights = F.softmax(product, dim=-1)

        weights = weights.to(origin_dtype)

        if getattr(self.config, "attention_probs_dropout_prob", 0.0) > 0:
            weights = F.dropout(
                weights,
                self.config.attention_probs_dropout_prob,
                training=self.training,
            )

        out = torch.matmul(weights, v)

        # combine heads
        out = out.permute(0, 2, 1, 3)
        out = out.contiguous().view(out.size(0), out.size(1), -1)

        return out, weights

    def rope_attn(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        position_ids,
        output_attentions=False,
        past_key_value=None,
        use_cache=False,
        attn_mask_start_row_indices=None,
    ):
        """Attention computation with rotary embeddings.

        Args:
            mix_layer (Optional[torch.Tensor]): Combined QKV projection
            query_states (torch.Tensor): Query states
            key_states (torch.Tensor): Key states
            value_states (torch.Tensor): Value states
            attention_mask (Optional[torch.Tensor]): Attention mask
            position_ids (Optional[torch.Tensor]): Position indices
            output_attentions (bool): Return attention weights
            past_key_value (Optional[Tuple[torch.Tensor, torch.Tensor]]): Cached states
            use_cache (bool): Cache new states
            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices

        Returns:
            Tuple containing:
                - attention_output: Result tensor
                - attention_weights: Optional weights
                - updated_key_value_cache: Optional cache
        """

        query_states_dtype = query_states.dtype

        assert position_ids is not None, "rope3d requires pos-id"
        kv_seq_len = position_ids.max() + 1
        offset = 0
        if past_key_value is not None:
            offset = position_ids.max()
            kv_seq_len = position_ids.max() + 1
            position_ids = position_ids[:, -1:, :]

        cos_sin = self.rotary_emb(kv_seq_len).permute([0, 2, 1, 3])
        if offset > 0 and position_ids is None:
            cos_sin = cos_sin[:, offset:]
        query_states, key_states = self.rotary_emb.apply_rotary_3d(
            cos_sin, query_states, key_states, position_ids
        )

        query_states = query_states.to(query_states_dtype)
        key_states = key_states.to(query_states_dtype)
        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=1)
            value_states = torch.cat([past_key_value[1], value_states], dim=1)

        # shape: [2, b, s, kvh, d]
        past_key_value = [key_states, value_states] if use_cache else None
        seq_length = query_states.shape[1]
        attn_output, attn_weights = self.attn_func(
            query_states,
            key_states,
            value_states,
            attention_mask,
            attn_mask_start_row_indices,
            seq_length,
        )

        return attn_output, attn_weights, past_key_value


class FusedDropoutImpl(nn.Module):
    """
    Dropout followed by a residual addition.

    The layer applies dropout to ``x`` (only when the probability is greater
    than zero) and then adds the residual ``y``.

    Args:
        prob (float): Dropout probability (between 0 and 1)
        mode (str): Dropout mode, either 'upscale_in_train' or 'downscale_in_infer'

    Attributes:
        prob (float): Stores the dropout probability
        mode (str): Stores the dropout mode as given; it is recorded only and
            does not alter the computation, which always goes through
            ``nn.Dropout``
        dropout (nn.Dropout): The actual dropout layer instance
    """

    def __init__(self, prob, mode):
        """
        Initialize the fused dropout layer.

        Args:
            prob (float): Dropout probability (0 means no dropout)
            mode (str): Dropout mode ('upscale_in_train' or 'downscale_in_infer')
        """
        super().__init__()
        self.prob = prob
        # Keep the requested mode so the documented attribute actually exists.
        self.mode = mode
        self.dropout = nn.Dropout(p=prob)

    def forward(self, x, y):
        """
        Forward pass of the fused dropout layer.

        Args:
            x (Tensor): Input tensor to potentially apply dropout on
            y (Tensor): Residual tensor to add to the (possibly dropped out) x

        Returns:
            Tensor: Result of x (with optional dropout) + y
        """
        if self.prob > 0:
            x = self.dropout(x)
        return x + y


class RMSNorm(nn.Module):
    """
    Root Mean Square Layer Normalization (RMSNorm).

    Scales each feature vector by the reciprocal of its root mean square
    (no mean-centering) and then by a learned per-feature weight.
    """

    def __init__(self, config):
        """
        Create the learnable scale, initialised to ones.

        Args:
            config (Ernie4_5_Config): Supplies ``hidden_size`` and ``rms_norm_eps``.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        initial_scale = torch.ones(self.hidden_size, dtype=torch.get_default_dtype())
        self.weight = nn.Parameter(initial_scale)
        self.variance_epsilon = config.rms_norm_eps

    def forward(self, hidden_states):
        """
        Apply RMS normalization over the last dimension.

        Args:
            hidden_states (Tensor): Input whose last dimension is ``hidden_size``.

        Returns:
            Tensor: Normalized tensor of the same shape as the input.

        Note:
            The mean of squares is computed in float32; the normalized values
            are cast to the weight's dtype before being scaled by the weight.
        """
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.variance_epsilon)
        normalized = inv_rms * hidden_states
        return normalized.to(self.weight.dtype) * self.weight


class Ernie4_5_MoeMLP(Ernie4_5_MLP):
    """Mixture of Experts (MoE) variant of ERNIE's MLP layer."""

    def __init__(self, config, layer_idx=0):
        """Initialize the MoE MLP layer.

        Args:
            config (Ernie4_5_MoEConfig): Configuration for MoE architecture.
            layer_idx (int): Index of current layer in transformer stack
        """

        if getattr(config, "disable_ffn_model_parallel", False):
            # Work on a private copy so the caller's config is not mutated.
            config = deepcopy(config)
            config.tensor_parallel_degree = 1

        super().__init__(config, layer_idx=layer_idx)
        self.moe_dropout_prob = config.moe_dropout_prob

    def forward(self, x):
        """Forward pass through MoE MLP layer.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
                              or [seq_len, hidden_size]

        Returns:
            torch.Tensor: Output tensor with same shape as input
        """
        current_device = self.gate_proj.weight.data.device
        x = x.to(current_device)
        x = F.silu(self.gate_proj(x)) * self.up_proj(x)
        if self.moe_dropout_prob > 0:
            # F.dropout defaults to training=True; tie it to the module mode
            # so that evaluation stays deterministic.
            x = F.dropout(input=x, p=self.moe_dropout_prob, training=self.training)
        return self.down_proj(x)


def masked_fill(x, mask, value):
    """
    Return a tensor equal to ``x`` except that positions where ``mask`` is
    True hold ``value`` instead.
    """
    replacement = torch.full_like(x, value)
    return torch.where(mask, replacement, x)


def _squared_l2_norm(x: torch.Tensor) -> torch.Tensor:
    """Return half the sum of the squared elements of ``x``."""
    squared = x * x
    return 0.5 * squared.sum()


@torch.no_grad()
def compute_optimal_transport(M, r, c, lam=1.0, epsilon=1e-8, max_iters: int = 10):
    """
    Approximate an optimal transport plan with Sinkhorn-Knopp style iterations.

    Starting from a row-wise softmax of ``-M / lam``, the plan is alternately
    rescaled so its row sums approach ``r`` and its column sums approach
    ``c``. Iteration stops after ``max_iters`` rounds, or earlier once the row
    sums change by less than ``epsilon`` between rounds.

    Args:
        M (Tensor): 2-D cost matrix.
        r (Tensor): Target row marginals.
        c (Tensor): Target column marginals.
        lam (float): Temperature dividing the costs before the softmax.
        epsilon (float): Convergence threshold on the row sums.
        max_iters (int): Maximum number of scaling rounds.

    Returns:
        Tuple[Tensor, int]: The transport plan with NaNs replaced by zeros,
        and the index of the last loop iteration entered (or the number of
        columns of ``M`` when the loop body never runs).
    """
    num_rows, last_step = M.shape
    plan = F.softmax(-M / lam, dim=1)  # normalise every row
    prev_row_sums = torch.zeros(num_rows, dtype=torch.float32, device=M.device)

    for last_step in range(max_iters):
        row_sums = plan.sum(1)
        drift = (prev_row_sums - row_sums).abs().max()
        if drift < epsilon:
            break
        prev_row_sums = row_sums
        # Match the row marginals first, then the column marginals.
        plan *= (r / (prev_row_sums + 1e-8)).unsqueeze(1)
        plan *= (c / (plan.sum(0) + 1e-8)).unsqueeze(0)

    plan = torch.where(plan.isnan(), torch.zeros_like(plan), plan)
    return plan, last_step


class Top2Gate(nn.Module):
    """
    Gate module implementing Top2Gating as described in Gshard paper.

    The gate projects tokens onto per-expert logits with a float32 weight and
    selects two experts per token, subject to a per-expert capacity.
    """

    def __init__(self, config, layer_idx: int, group=None, gate_weight=None) -> None:
        """
        Initialize the MoE (Mixture of Experts) layer.

        Args:
            config: Model configuration containing MoE parameters
            layer_idx: Index of this layer in the model
            group: Distributed communication group
            gate_weight: Optional pre-existing gate weight tensor
        """
        super().__init__()
        self.config = config

        self.model_dim = config.hidden_size
        # An int, or a per-modality list when `config.multimodel_experts` is set.
        self.num_experts = config.moe_num_experts
        self.num_experts_tensor = (
            sum(config.moe_num_experts)
            if config.multimodel_experts
            else config.moe_num_experts
        )

        # Indexed as cap[0] (training), cap[1] (eval) and cap[2] (eval with
        # fewer tokens than experts) by `get_capacity`.
        self.cap = config.moe_capacity
        self.group = group

        self.layer_idx = layer_idx

        self.sinkhorn_2gate = config.sinkhorn_2gate
        self.sinkhorn_temp = config.sinkhorn_temp
        self.use_correction_bias = config.moe_use_aux_free  # true
        self.use_token_type_bias = config.get("moe_use_token_type_bias", False)

        self.act = partial(F.softmax, dim=-1)  # [S,E]

        self.no_jitter = True
        self.expert_drop = False
        self.eye_matrix = None
        self.eye_matrix_size = None
        self.norm_gate_logits = config.moe_norm_gate_logits  # true
        self.one = torch.ones([], dtype=torch.float32)

        self.moe_aux_loss_lambda = torch.tensor(config.moe_aux_loss_lambda).to(
            dtype=torch.float32
        )
        self.moe_z_loss_lambda = torch.tensor(config.moe_z_loss_lambda).to(
            dtype=torch.float32
        )
        self.moe_orthogonal_loss_lambda = torch.tensor(
            config.moe_orthogonal_loss_lambda
        ).to(dtype=torch.float32)

        # Promote scalar coefficients to 1-D so they can later be expanded to
        # one value per expert group.
        if self.moe_aux_loss_lambda.ndim == 0:
            self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.unsqueeze(0)
        if self.moe_z_loss_lambda.ndim == 0:
            self.moe_z_loss_lambda = self.moe_z_loss_lambda.unsqueeze(0)
        if self.moe_orthogonal_loss_lambda.ndim == 0:
            self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.unsqueeze(
                0
            )

        self.experts_type_ids = None

        self.eps = torch.tensor([1e-12]).to(dtype=torch.float32)
        if config.multimodel_experts:
            if config.get("moe_use_hard_gate", False):
                self.num_experts_list = []
                self.experts_type_mask = []
                # hard-gate + group_experts requires the different parts of
                # gate_logits to be computed separately
                experts_ids = torch.zeros(
                    [sum(self.num_experts)], dtype=torch.int64
                ).reshape((1, -1))
                offset = 0
                for i, expert_num in enumerate(self.num_experts):
                    experts_ids[:, offset : offset + expert_num] = i
                    offset += expert_num
                self.experts_type_ids = experts_ids.reshape([-1])
                logger.info(
                    f"use moe_use_hard_gate, experts_ids: {self.experts_type_ids}"
                )
                for i, expert_num in enumerate(self.num_experts):
                    self.experts_type_mask.append(
                        self.experts_type_ids == i,
                    )
                    self.num_experts_list.append(expert_num)
            else:
                # Without group_experts, hard-gate behaviour relies on
                # token_type_bias.
                assert (
                    not config.moe_group_experts
                ), "group_experts must use hard_gate when multimodel_experts is True"
        else:
            self.num_experts_list = [self.num_experts]

        if gate_weight is not None:
            self.weight = gate_weight

            assert (
                not self.config.moe_use_token_type_bias
            ), "gate_weights is from outside, token_type_bias can't be used"
            logger.info("moe use gate_weight from outside")
            # use fp32 precision in amp (the misspelled attribute is kept as
            # well because it already exists under that name)
            self._cast_to_low_precision = False
            self._cast_to_low_precison = False
        else:
            self._create_gate_parameter()
        logger.info(
            f"{config.moe_gate}: w/ capacity: {self.cap} experts:{self.num_experts} "
            f"use_token_type_bias:{self.use_token_type_bias} "
            f"gate_act:{config.moe_gate_act} "
            f"norm_gate_logits={self.norm_gate_logits} use_correction_bias={self.use_correction_bias}"
        )

    def _create_gate_parameter(self):
        """
        Create gate weight parameter.

        With multimodal experts one float32 parameter of shape
        [model_dim, num_experts_i] is registered per expert group, named
        ``weight`` for group 0 and ``weight_{i}`` otherwise. Without them a
        single ``weight`` of shape [model_dim, num_experts] is created.
        """
        if self.config.multimodel_experts:
            # support setting lambda for each expert group
            self.moe_z_loss_lambda = self.moe_z_loss_lambda.expand(
                len(self.num_experts)
            )
            self.moe_aux_loss_lambda = self.moe_aux_loss_lambda.expand(
                len(self.num_experts)
            )
            self.moe_orthogonal_loss_lambda = self.moe_orthogonal_loss_lambda.expand(
                len(self.num_experts)
            )

            for i, num_experts in enumerate(self.num_experts):
                if i == 1:
                    with UniqueNameGuard(f"mm_gate_{self.layer_idx}_"):
                        p = nn.Parameter(
                            torch.empty(
                                self.model_dim,
                                num_experts,
                                dtype=torch.float32,
                                device="cpu",
                            )
                        )
                        nn.init.xavier_uniform_(p)  # Common initialization
                else:
                    p = nn.Parameter(
                        torch.empty(
                            self.model_dim,
                            num_experts,
                            dtype=torch.float32,
                            device="cpu",
                        )
                    )
                    nn.init.xavier_uniform_(p)  # Common initialization
                self.register_parameter(
                    "weight" if i == 0 else f"weight_{i}",
                    p,
                )
        else:
            self.weight = nn.Parameter(
                torch.empty(self.model_dim, self.num_experts, dtype=torch.float32)
            )
            nn.init.xavier_uniform_(self.weight)  # Common initialization
        # use fp32 precision in amp (the misspelled attribute is kept as well
        # because it already exists under that name)
        self._cast_to_low_precision = False
        self._cast_to_low_precison = False

    def get_gate_weight(self, transform_weight, is_multimodel=True):
        """
        Return the gate weight used to compute the logits.

        When `multimodel_experts` is enabled (and `is_multimodel` is True) the
        per-group weights are concatenated along the last (expert) dimension
        into a single tensor; otherwise `self.weight` is returned.

        Args:
            transform_weight: bool, originally meant to interleave the
                multimodal weights by local-expert id. It is accepted for
                interface compatibility but not used by this implementation.
            is_multimodel: Whether to merge the weights of all expert groups.
        """
        if not is_multimodel or not self.config.multimodel_experts:
            return self.weight
        else:
            return torch.cat(
                [
                    getattr(self, "weight" if i == 0 else f"weight_{i}")
                    for i in range(len(self.num_experts))
                ],
                -1,
            )

    def forward(
        self,
        input: torch.Tensor,
        token_type_ids: torch.Tensor = None,
        transform_weight: bool = True,
        correction_bias: torch.Tensor = None,
    ) -> Tuple[int, torch.Tensor, torch.Tensor, torch.Tensor, None, torch.Tensor]:
        """
        Forward pass through the gate.

        Args:
            input: Input tensor of shape [Seq, Dim]
            token_type_ids: Token type IDs tensor of shape [Seq] (not used by
                this gate)
            transform_weight: Whether to transform weights for multimodal experts
            correction_bias: Bias tensor for correction

        Returns:
            tuple: (capacity, dispatch_mask, combine_weights, scatter_index, router_loss, logits)
            where `router_loss` is always None in this implementation.
        """
        orig_dtype = input.dtype
        current_device = input.device
        weight = self.get_gate_weight(transform_weight)

        # Routing logits are always computed in float32.
        logits = F.linear(
            input.to(dtype=torch.float32, device=current_device),
            weight.T.to(dtype=torch.float32, device=current_device),
        )

        (
            capacity,
            dispatch_mask,
            combine_weights,
            scatter_index,
            l_aux,
            l_zloss,
        ) = self.top2_gating(
            logits,
            correction_bias=(
                correction_bias.to(device=current_device)
                if correction_bias is not None
                else None
            ),
        )

        combine_weights = combine_weights.to(orig_dtype)
        return capacity, dispatch_mask, combine_weights, scatter_index, None, logits

    def get_capacity(self, num_tokens, cap_factor=None, is_multimodel=True):
        """
        Calculate capacity based on number of tokens.

        Args:
            num_tokens: Number of input tokens
            cap_factor: Optional capacity factor override
            is_multimodel: When True and multimodal experts are configured,
                the experts of all groups are counted; otherwise only the
                first group is counted.

        Returns:
            int: Calculated capacity
        """
        if is_multimodel and self.config.multimodel_experts:
            # `num_experts_list` is only populated in hard-gate mode; fall back
            # to the per-group counts so the other multimodal mode does not
            # fail with AttributeError.
            experts_per_group = getattr(self, "num_experts_list", None)
            if experts_per_group is None:
                experts_per_group = self.num_experts
            num_experts = sum(experts_per_group)
        elif isinstance(self.num_experts, (list, tuple)):
            num_experts = self.num_experts[0]
        else:
            num_experts = self.num_experts
        if cap_factor is not None:
            cap = cap_factor
        else:
            if self.training:
                cap = self.cap[0]
            elif num_tokens < num_experts:  # seqlen < num_expert
                cap = self.cap[2]
            else:
                cap = self.cap[1]
        # capacity = 2S/E
        capacity = int(cap * num_tokens // num_experts)
        assert (
            capacity > 0
        ), f"requires capacity to >= 0. cap={cap}, num_tokens={num_tokens}"
        return capacity

    def top2_gating(self, logits, cap=None, correction_bias=None):
        """
        Implement Top2 gating mechanism.

        Args:
            logits: Input logits tensor of shape [Seq, Experts]
            cap: Optional capacity override
            correction_bias: Bias tensor for correction; it is added to the
                scores only when choosing experts, not to the combine weights.

        Returns:
            tuple: (capacity, dispatch_masks, combine_weights, scatter_indexes, loss_aux, loss_z)

        Note:
        capacity: Maximum number of tokens each expert can receive.
        dispatch_masks: [Seq, 2] boolean masks; column 0 belongs to the first
        selected expert and column 1 to the second. False marks a dropped route.
        combine_weights: [Seq, 2] weights used for combining, in the same
        column order.
        scatter_indexes: [Seq, 2] flat slot indexes `expert * capacity + location`,
        in the same column order.
        loss_aux: Auxiliary loss (always None here).
        loss_z: Z loss (always None here).
        """
        gates = self.act(logits)

        # gates has shape of SE
        assert logits.ndim == 2, logits.shape
        num_tokens = gates.shape[0]
        num_experts = gates.shape[1]
        # capacity = 2S/E
        capacity = self.get_capacity(logits.shape[0], cap)
        current_device = logits.device

        # Create a mask for 1st's expert per token
        score_for_argmax = (
            gates + correction_bias.unsqueeze(0)
            if correction_bias is not None
            else gates
        )
        indices1_s = torch.argmax(score_for_argmax, dim=1)
        mask1 = F.one_hot(indices1_s, num_classes=num_experts).to(
            dtype=torch.int64, device=current_device
        )  # [0,1]

        # Create a mask for 2nd's expert per token using Gumbel-max trick
        # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/
        if self.training and not self.no_jitter:
            gumbels = (
                -torch.empty_like(
                    logits,
                    device=current_device,
                )
                .exponential_()
                .log()
            )  # ~Gumbel(0,1)
            logits_w_noise = logits + gumbels
        else:
            logits_w_noise = logits

        # Exclude the already selected first expert from the second choice.
        logits_except1 = masked_fill(
            logits_w_noise,
            mask1.to(dtype=torch.bool, device=current_device),
            float("-inf"),
        )
        score_for_argmax = (
            self.act(logits_except1) + correction_bias.unsqueeze(0)
            if correction_bias is not None
            else logits_except1
        )
        indices2_s_original = torch.argmax(score_for_argmax, dim=1)

        if self.training and self.sinkhorn_2gate:
            r = (
                torch.ones(num_tokens, dtype=torch.float32, device=current_device)
                / num_tokens
            )
            # Column marginals: capacity left on each expert after top-1.
            c_mask_sum = mask1.to(dtype=torch.float32, device=current_device).sum(0)
            c = capacity - c_mask_sum
            c = torch.maximum(c, torch.zeros_like(c, device=current_device))
            c_sum = c.sum()
            if c_sum > 0:
                c = c / c_sum
            else:  # Avoid division by zero if all experts are full from top-1
                c = torch.ones_like(c, device=current_device) / num_experts

            pi, _ = compute_optimal_transport(
                -logits_except1.to(dtype=torch.float32, device=current_device).detach(),
                r,
                c,
                lam=self.sinkhorn_temp,
            )
            pi = masked_fill(
                pi, mask1.to(dtype=torch.bool, device=current_device), float("-inf")
            )
            indices2_s = torch.argmax(pi, dim=1)
        else:
            indices2_s = indices2_s_original

        # Use the logits' expert dimension (as for mask1): `self.num_experts`
        # is a per-group list when multimodal experts are configured.
        mask2 = F.one_hot(indices2_s, num_classes=num_experts).to(
            dtype=torch.int64, device=current_device
        )

        # Compute locations in capacity buffer
        locations1 = (
            torch.cumsum(mask1, dim=0) - 1
        )  # [0,1,1,0,1,0,0] -> [0,0,0,0,1,1,1,]
        locations2 = torch.cumsum(mask2, dim=0) - 1
        # Update 2nd's location by accounting for locations of 1st
        locations2 += torch.sum(mask1, dim=0, keepdim=True)

        # Remove locations outside capacity from mask
        mask1 = mask1 * (locations1 < capacity).to(
            dtype=torch.int64, device=current_device
        )  # [0,1,1,0,0,0,0]
        mask2 = mask2 * (locations2 < capacity).to(
            dtype=torch.int64, device=current_device
        )

        # Store the capacity location for each token
        locations1_s = torch.sum(locations1 * mask1, dim=1)
        locations2_s = torch.sum(locations2 * mask2, dim=1)

        # Normalize gate probabilities
        mask1_float = mask1.to(dtype=torch.float32, device=current_device)
        mask2_float = mask2.to(dtype=torch.float32, device=current_device)
        gates1_s = (gates * mask1_float).sum(dim=-1)
        gates2_s = (gates * mask2_float).sum(dim=-1)

        if self.norm_gate_logits:
            denom_s = gates1_s + gates2_s  # [0.2, 0.3]
            # Avoid divide-by-zero
            denom_s = torch.clamp(denom_s, min=1e-6)
            gates1_s /= denom_s
            gates2_s /= denom_s
        if self.training and self.expert_drop:
            # Randomly drop the second expert; it is zeroed when twice its
            # weight is below a uniform random draw.
            gates2_s = torch.where(
                2 * gates2_s < torch.rand_like(gates2_s, device=current_device),
                torch.zeros_like(gates2_s, device=current_device),
                gates2_s,
            )

        # Calculate combine_weights and dispatch_mask
        gates1 = gates1_s.unsqueeze(1) * mask1_float
        gates2 = gates2_s.unsqueeze(1) * mask2_float

        combine1_weight, expert1_index = torch.max(gates1, dim=-1, keepdim=True)
        scatter1_index = expert1_index.squeeze(-1) * capacity + locations1_s
        scatter1_index = scatter1_index.to(dtype=torch.int64, device=current_device)
        # A route is dispatched only when its combine weight is non-zero.
        dispatch1_mask = combine1_weight.to(
            dtype=torch.bool, device=current_device
        ).detach()

        combine2_weight, expert2_index = torch.max(gates2, dim=-1, keepdim=True)
        scatter2_index = expert2_index.squeeze(-1) * capacity + locations2_s
        scatter2_index = scatter2_index.to(dtype=torch.int64, device=current_device)
        dispatch2_mask = combine2_weight.to(
            dtype=torch.bool, device=current_device
        ).detach()

        return (
            capacity,
            torch.cat((dispatch1_mask, dispatch2_mask), 1),
            torch.cat((combine1_weight, combine2_weight), 1),
            torch.stack((scatter1_index, scatter2_index), 1),
            None,
            None,
        )

    def _cal_orthogonal_loss_opt_each_weight(self, weight, use_group):
        """
        Calculate optimized orthogonal loss for each weight.

        Rows of `weight` are L2-normalized, then the Gram matrix is compared
        against the identity.

        Args:
            weight: Weight tensor
            use_group: Whether to use expert groups; when True the rows are
                split into `config.moe_k` groups and the Gram matrix is
                computed per group.

        Returns:
            Tensor: Calculated orthogonal loss
        """
        if weight.dtype != torch.float32:
            weight = weight.to(torch.float32)

        wnorm = torch.norm(weight, p=2, dim=1)
        weight = weight / torch.maximum(wnorm, self.eps.to(weight.device)).unsqueeze(1)

        if use_group:
            weight = weight.reshape(
                [self.config.moe_k, -1, weight.shape[1]]
            )  # [K, E/K, H]
            eye_matrix = torch.eye(
                weight.shape[1], dtype=weight.dtype, device=weight.device
            ).unsqueeze(0)
        else:
            eye_matrix = torch.eye(
                weight.shape[0], dtype=weight.dtype, device=weight.device
            )

        # Transpose only the last two dims: `.T` reverses every dimension,
        # which is wrong for the grouped 3-D weight. Identical for 2-D input.
        weight_matmul = torch.matmul(weight, weight.transpose(-1, -2))

        orthogonal_loss = weight_matmul - eye_matrix
        orthogonal_loss = _squared_l2_norm(orthogonal_loss) / (
            orthogonal_loss.size(0) * orthogonal_loss.size(1)
        )
        return orthogonal_loss


class TopKGate(Top2Gate):
    """
    Fused version of TopK gate for improved performance.

    Unlike `Top2Gate.forward`, this gate only produces the routing logits;
    expert selection and dispatch are left to the caller.
    """

    def forward(
        self,
        input: torch.Tensor,
        token_type_ids=None,
        transform_weight=True,
        is_multimodel=True,
    ) -> torch.Tensor:
        """
        Forward pass for fused gate.

        Args:
            input: Input tensor
            token_type_ids: Token type IDs; required when
                `use_token_type_bias` is enabled
            transform_weight: Whether to transform weights
            is_multimodel: Whether to use the merged gate weight of all
                expert groups (forwarded to `get_gate_weight`)

        Returns:
            torch.Tensor: float32 gate logits
        """
        current_device = input.device
        weight = self.get_gate_weight(transform_weight, is_multimodel=is_multimodel)

        # Routing logits are always computed in float32.
        logits = F.linear(
            input.to(dtype=torch.float32, device=current_device),
            weight.T.to(dtype=torch.float32, device=current_device),
        )
        if self.use_token_type_bias:
            # NOTE(review): `self.bias` is not created anywhere in the visible
            # gate code — confirm where it is registered before enabling
            # `moe_use_token_type_bias`.
            assert token_type_ids is not None
            assert (
                token_type_ids.max() < self.bias.shape[0]
            ), f"token_type_ids {token_type_ids.max()} >= bias shape {self.bias.shape[0]}"
            bias = self.bias[token_type_ids]  # one bias entry per token
            logits = logits + bias

        return logits


# Registry of available gate implementations, keyed by gate name.
gate_class = {
    "top2": Top2Gate,
    "topk": TopKGate,
}


def get_gate(
    config: Ernie4_5_MoEConfig,
    expert: nn.Module,
    layer_idx: int,
) -> Tuple[nn.Module, nn.ModuleList, Optional[nn.Module], Optional[nn.ModuleList]]:
    """Initialize and distribute MoE (Mixture of Experts) components.

    Creates gate layer and distributed expert network for MoE architecture.

    Args:
        config (Ernie4_5_MoEConfig): Configuration for MoE architecture
        expert: Iterable of ``(experts_num, fc)`` pairs, one per expert group.
            ``fc`` is either a prototype expert that is deep-copied until the
            group holds ``experts_num`` experts, or a sequence of ready-made
            experts.
        layer_idx (int): Index of current layer in transformer stack

    Returns:
        tuple: ``(gate, experts, lm_gate, lm_experts)``
            - gate: Initialized gate layer for routing
            - experts: ModuleList containing expert networks
            - lm_gate / lm_experts: Set only when multimodal experts with
              hard gate are configured (the first expert group when there are
              more than two experts in total, otherwise all experts);
              ``None`` in every other configuration.
    """
    moe_num_experts = (
        sum(config.moe_num_experts)
        if config.multimodel_experts
        else config.moe_num_experts
    )
    experts = nn.ModuleList([])

    for expert_id, (experts_num, fc) in enumerate(expert):
        if not hasattr(fc, "__len__"):
            # A single prototype: replicate it to fill the group.
            experts_to_append = [fc]
            if expert_id == 1:
                with UniqueNameGuard("_mm_deepcopy"):
                    for _ in range(experts_num - 1):
                        experts_to_append.append(deepcopy(fc))
            else:
                for _ in range(experts_num - 1):
                    experts_to_append.append(deepcopy(fc))
        else:
            experts_to_append = fc
        for ex in experts_to_append:
            for p in ex.parameters():
                p.expert_type = f"expert_type_{expert_id}"  # Different `expert_type` can have different intermediate-size
        # Every expert of the group is registered in order.
        for i in range(experts_num):
            experts.append(experts_to_append[i])

    assert (
        len(experts) == moe_num_experts
    ), f"experts.len={len(experts)} != experts_num={moe_num_experts}"
    logger.info(f"MOE-GATE:-{config.moe_gate}")

    gate = gate_class[config.moe_gate.lower()](config, layer_idx=layer_idx)

    if config.multimodel_experts and config.moe_use_hard_gate:
        lm_gate = gate
        if moe_num_experts > 2:
            # Only the first expert group serves the language-model path.
            lm_experts = experts[: config.moe_num_experts[0]]
        else:
            lm_experts = experts
    else:
        lm_gate, lm_experts = None, None

    logger.info(f"LM-experts-{lm_experts} -- experts-{experts}")

    return gate, experts, lm_gate, lm_experts


class MoEStatics(nn.Module):
    """
    Stores MoE (Mixture of Experts) statistics
    and expert usage information.
    """

    def __init__(self, config, layer_idx):
        """
        Initialize MoE statistics tracking.

        Args:
            config: Model configuration containing MoE parameters
            layer_idx: Index of the MoE layer in the model

        Raises:
            AssertionError: If multimodal experts are configured with expert
                groups of different sizes.
        """
        super().__init__()
        # The misspelled attribute is kept as well because it already exists
        # under that name.
        self._cast_to_low_precision = False
        self._cast_to_low_precison = False
        num_experts = (
            config.moe_num_experts[0]
            if config.multimodel_experts
            else config.moe_num_experts
        )
        if config.multimodel_experts:
            assert (
                len(set(config.moe_num_experts)) == 1
            ), f"assume expert group has same size, got: {config.moe_num_experts}"

        with UniqueNameGuard(f"mm_layer_{layer_idx}_"):
            num_experts_groups = (
                len(config.moe_num_experts) if config.multimodel_experts else 1
            )
            # Frozen float32 parameter of shape [groups, experts_per_group].
            p = nn.Parameter(
                torch.zeros(num_experts_groups, num_experts, dtype=torch.float32),
                requires_grad=False,
            )
            self.e_score_correction_bias = p
            # Plain int64 tensor attribute (neither a parameter nor a buffer).
            p = torch.zeros(num_experts_groups, num_experts, dtype=torch.int64)
            self.expert_usage = p


def dispatching(x, dispatch_mask, scatter_index, num_experts, capacity):
    """
    Scatter tokens into a flat per-expert slot buffer for both routes.

    For each of the two routes, tokens whose mask entry is False are zeroed
    and every token is scatter-added into the row given by its scatter index.
    The buffers of the two routes are then summed.

    Args:
        x (Tensor): Input tensor of shape [Seq, Dim]
        dispatch_mask (Tensor): Dispatching mask of shape [Seq, 2]
        scatter_index (Tensor): Scatter indices of shape [Seq, 2]
        num_experts (int): Number of experts
        capacity (int): Capacity per expert

    Returns:
        Tensor: Dispatched output tensor of shape [Expert*Capacity, Dim]
    """
    src_dtype = x.dtype
    hidden = x.shape[-1]
    total_slots = num_experts * capacity

    output = None
    for route in (0, 1):
        route_index = scatter_index[:, route]
        route_mask = dispatch_mask[:, route]

        # Zero out the tokens that are not dispatched on this route.
        routed = x * route_mask.unsqueeze(-1).to(src_dtype)  # [seq, dim]
        rows = route_index.unsqueeze(-1).expand(-1, hidden)  # [seq, dim]
        placed = torch.zeros(
            total_slots, hidden, dtype=src_dtype, device=x.device
        ).scatter_add(0, rows, routed)

        output = placed if output is None else output + placed

    if output.dtype != src_dtype:
        output = output.to(src_dtype)
    return output


def combining(x, combine_weights, scatter_index):
    """
    Gather each token's expert outputs and mix them with the combine weights.

    Args:
        x (Tensor): Input tensor of shape [num_experts * capacity, dim]
        combine_weights (Tensor): Combination weights of shape [seq, 2]
        scatter_index (Tensor): Scatter indices of shape [seq, 2]

    Returns:
        Tensor: Combined output tensor of shape [seq, dim]
    """
    hidden = x.shape[-1]
    target_device = scatter_index.device
    routes_per_token = combine_weights.shape[-1]

    # Pick the slot rows of every (token, route) pair, then regroup per token.
    flat_index = scatter_index.reshape([-1])
    gathered = x.to(target_device)[flat_index].reshape(
        [-1, routes_per_token, hidden]
    )  # [seq, 2, dim]

    weights = combine_weights.unsqueeze(1).to(target_device)  # [seq, 1, 2]

    # [seq, 1, 2] @ [seq, 2, dim] -> [seq, 1, dim] -> [seq, dim]
    mixed = torch.matmul(weights, gathered)
    return mixed.squeeze(1)


class MOELayer(nn.Module):
    """
    Mixture of Experts layer implementation based on GShard paper.
    """

    def __init__(
        self,
        gate: nn.Module,
        experts: List[nn.Module],
        layer_idx: int,
        shared_experts: Optional[List[nn.Module]] = None,
        group=None,
        recompute: bool = False,
        k: int = 2,
        all_to_all_dropout: float = 0,
        group_experts: bool = False,
        moe_statics=None,
        moe_num_experts=None,
    ):
        """
        Initialize MoE layer.

        Args:
            gate: Gate network for expert selection
            experts: List of expert networks
            layer_idx: Index of this layer in the model
            group: Distributed communication group
            recompute: Whether to enable recomputation
            k: Number of experts to select per token
            all_to_all_dropout: Dropout rate for all-to-all communication
            group_experts: Whether to group experts
            moe_statics: MoE statistics tracking object
        """
        super().__init__()
        self.gate = gate
        self.layer_idx = layer_idx

        if isinstance(experts, nn.ModuleList):
            self.experts = experts
        else:
            logger.info(f"using fused experts, type={type(experts)}")
            self.experts = experts
        self.shared_experts = shared_experts

        self.group = group
        self.k = k
        self.all_to_all_dropout = all_to_all_dropout
        self.use_correction_bias = moe_statics is not None
        self.moe_statics = moe_statics
        if self.use_correction_bias:
            logger.info(
                f"using correction bias, aux-coef:{self.gate.config.moe_aux_loss_lambda}"
            )
            assert self.gate.config.moe_use_aux_free

        self.world_size = 1
        self.rank = 0

        self.multimodal_experts = (
            isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1
        )
        self.num_local_experts = len(self.experts) // self.world_size
        if self.multimodal_experts:
            self.num_local_multimodal_experts = [
                num // self.world_size for num in moe_num_experts
            ]
            self.multimodal_expert_index = [0] + list(
                itertools.accumulate(moe_num_experts)
            )

        self.input_preprocess = self.output_postprocess = None
        self.group_experts = group_experts
        self.config = self.gate.config
        self.zero = torch.tensor(0).to(dtype=torch.float32)

    def forward_experts(self, dispatched_input):
        """
        Forward pass through experts sequentially.

        Args:
            dispatched_input: Input tensor of shape [num_experts, capacity, dim]

        Returns:
            Tensor: Expert outputs of shape [num_experts, capacity, dim]
        """

        if not self.multimodal_experts:
            true_experts = self.experts[
                self.rank
                * self.num_local_experts : (self.rank + 1)
                * self.num_local_experts
            ]
        else:
            true_experts = []
            for i, num in enumerate(self.num_local_multimodal_experts):
                current_modal_experts = self.experts[
                    self.multimodal_expert_index[i] : self.multimodal_expert_index[
                        i + 1
                    ]
                ]
                true_experts.extend(
                    current_modal_experts[self.rank * num : (self.rank + 1) * num]
                )

        dispatched_input = dispatched_input.reshape(
            [self.world_size, self.num_local_experts, -1, dispatched_input.shape[-1]]
        )
        current_device = dispatched_input.device
        expert_outputs = []
        if isinstance(self.experts, nn.ModuleList):
            chunks = dispatched_input.permute(1, 0, 2, 3).contiguous().unbind(0)
            assert len(chunks) == len(
                true_experts
            ), f"{len(chunks)}, {len(true_experts)}"
            for chunk, expert in zip(chunks, true_experts):
                expert_outputs.append(expert(chunk))
        else:
            dispatched_input = dispatched_input.permute(1, 0, 2, 3).contiguous()
            orig_shape = dispatched_input.shape
            chunks = dispatched_input.reshape(orig_shape[0], -1, orig_shape[-1])
            chunks = self.experts(chunks)
            chunks = chunks.reshape(orig_shape[:-1] + (chunks.shape[-1],)).unbind(0)
            expert_outputs.extend(chunks)

        for i, expert_output in enumerate(expert_outputs):
            expert_outputs[i] = expert_output.to(current_device)
        expert_output = torch.stack(expert_outputs, dim=1)
        return expert_output

    def moe_gate_dispatch(
        self,
        x: torch.Tensor,  # [S, H]   float16 / float32 / bfloat16
        gate_logits: torch.Tensor,  # [S, E]   float32
        k: int,
        capacity: Optional[int],
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Dispatch tokens to their top-k experts, dropping routes that overflow.

        Routes are served in (token, route) order: the first ``capacity``
        routes that select an expert are kept, later ones are dropped and get
        a combine weight of zero.

        Args:
            x: Token activations of shape [S, H].
            gate_logits: Per-token expert scores of shape [S, E].
            k: Number of experts selected per token.
            capacity: Maximum number of routes an expert accepts. Must be an
                int; ``None`` is not supported by the implementation.

        Returns:
            tuple:
                - y: [E, capacity, H] buffer holding the dispatched activations
                  (unused slots stay zero).
                - combine_weights: [S, k] selected scores, zeroed for dropped routes.
                - scatter_index: [k, S] int32, ``expert * capacity + slot`` for
                  kept routes and -1 for dropped ones.
                - expert_offset: [E] int64 cumulative count of kept routes.
                - expert_id: [S, k] selected expert ids.
        """
        S, H = x.shape
        E = gate_logits.shape[1]
        device = x.device

        if self.use_correction_bias:
            # The bias only influences which experts are picked; the combine
            # weights are read back from the un-biased scores.
            bias = self.moe_statics.e_score_correction_bias[0].detach().to(
                gate_logits.device
            )
            _, topk_idx = torch.topk(gate_logits + bias, k, dim=-1)
            topk_prob = torch.gather(gate_logits, dim=1, index=topk_idx)  # [S, k]
        else:
            topk_prob, topk_idx = torch.topk(gate_logits, k, dim=-1)  # [S, k]

        # Flatten routes in (token, route) order, i.e. the order in which slots
        # are handed out.
        flat_expert = topk_idx.reshape(-1).to(device)  # [S * k]

        # rank[i] = number of earlier routes that picked the same expert.
        # A stable sort groups routes by expert while keeping their order, so
        # the rank is the position inside the group.
        sorted_expert, order = torch.sort(flat_expert, stable=True)
        counts = torch.bincount(flat_expert, minlength=E)  # [E]
        group_start = counts.cumsum(0) - counts
        rank = torch.empty_like(flat_expert)
        rank[order] = (
            torch.arange(flat_expert.numel(), device=device)
            - group_start[sorted_expert]
        )

        # Once an expert is full every later route to it is dropped, so the
        # rank doubles as the slot index for the kept routes.
        keep = rank < capacity
        flat_slot = flat_expert * capacity + rank

        scatter_index = (
            torch.where(keep, flat_slot, torch.full_like(flat_slot, -1))
            .to(torch.int32)
            .reshape(S, k)
            .t()
            .contiguous()
        )  # [k, S]

        token_of_route = torch.arange(S, device=device).repeat_interleave(k)
        y = (
            x.new_zeros((E * capacity, H))
            .index_copy(0, flat_slot[keep], x[token_of_route[keep]])
            .reshape(E, capacity, H)
        )  # [E, C, H]

        combine_weights = topk_prob.masked_fill(
            ~keep.reshape(S, k).to(topk_prob.device), 0.0
        )  # [S, k]
        expert_offset = counts.clamp(max=capacity).cumsum(0)  # int64
        expert_id = topk_idx  # [S, k]

        return y, combine_weights, scatter_index, expert_offset, expert_id

    def gate_and_dispatch(self, input, token_type_ids=None, is_multimodel=True):
        """
        Calculate gate and dispatch inputs.

        Args:
            input: Input tensor of shape [seq, dim]
            token_type_ids: Optional token types; flattened and forwarded to
                the gate when the gate is a ``TopKGate``.
            is_multimodel: Forwarded to the gate and to its capacity
                computation when the gate is a ``TopKGate``.

        Returns:
            tuple: (dispatched_input, combine_weights, dispatch_mask,
            scatter_index, None, gate_logits, gate_prob).
            The fifth slot is where a router loss would go; this
            implementation always returns ``None`` there. ``gate_prob`` is
            ``None`` when the gate is not a ``TopKGate``.
        """
        d_model = input.shape[1]
        if isinstance(self.gate, (TopKGate)):
            capacity = self.gate.get_capacity(
                input.shape[0], is_multimodel=is_multimodel
            )
            if token_type_ids is not None:
                token_type_ids = token_type_ids.reshape([-1])
            gate_logits = self.gate(
                input, token_type_ids=token_type_ids, is_multimodel=is_multimodel
            )
            prob = self.gate.act(gate_logits)
            (
                dispatched_input,
                combine_weights_unnorm,
                scatter_index,
                expert_offset,
                _,
            ) = self.moe_gate_dispatch(input, prob, k=self.k, capacity=capacity)
            # moe_gate_dispatch returns cumulative per-expert counts; the
            # first difference (with a leading zero) recovers the counts.
            dispatch_mask = torch.diff(F.pad(expert_offset, (1, 0)))

            scatter_index = scatter_index.transpose(0, 1)  # [k, s] -> [s, k]
            # Renormalise the selected weights; the clamp guards tokens whose
            # routes were all dropped (sum == 0).
            combine_weights = combine_weights_unnorm / torch.clamp(
                combine_weights_unnorm.sum(dim=-1, keepdim=True), min=1e-12
            )
            combine_weights = combine_weights.to(dtype=dispatched_input.dtype)

        else:
            (
                capacity,
                dispatch_mask,
                combine_weights,
                scatter_index,
                _router_loss,  # produced by the gate but not propagated
                gate_logits,
            ) = self.gate(
                input,
            )
            prob = None
            dispatched_input = dispatching(
                input,
                dispatch_mask,
                scatter_index,
                num_experts=self.world_size * self.num_local_experts,
                capacity=capacity,
            )

        dispatched_input = dispatched_input.reshape(
            [self.world_size * self.num_local_experts, capacity, d_model]
        )

        # Tensor.detach() is not in-place: the result has to be rebound.
        dispatch_mask = dispatch_mask.detach()
        scatter_index = scatter_index.detach()
        return (
            dispatched_input,
            combine_weights,
            dispatch_mask,
            scatter_index,
            None,
            gate_logits,
            prob,
        )

    def combine_expert_output(self, expert_output, combine_weights, scatter_index):
        """
        Merge the per-expert outputs back into one row per token.

        Args:
            expert_output: Expert outputs; every leading dimension is
                flattened so that only the last (feature) dimension is kept.
            combine_weights: Combination weights passed to ``combining``.
            scatter_index: Scatter indices passed to ``combining``.

        Returns:
            Tensor: The result of ``combining``, run through
            ``self.output_postprocess`` when that hook is set.
        """
        feature_dim = expert_output.shape[-1]
        flat_expert_output = expert_output.reshape([-1, feature_dim])  # [e*c, m]

        merged = combining(flat_expert_output, combine_weights, scatter_index)

        if self.output_postprocess is None:
            return merged
        return self.output_postprocess(merged)

    def forward(
        self,
        input: torch.Tensor,
        token_type_ids=None,
        is_multimodel=True,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
        """
        Forward pass through MoE layer.

        Args:
            input: Input tensor of shape [s, d], or a 3-D tensor whose leading
                dimensions are flattened for routing and restored on output.
            token_type_ids: Optional 2-D token types; the last column is
                dropped before it is handed to the gate.
            is_multimodel: Forwarded to ``gate_and_dispatch``.

        Returns:
            tuple: (output, combine_weights, None, gate_logits). The third
            slot is reserved for a router loss and is always ``None`` here.
        """
        orig_shape = None
        if input.dim() == 3:
            orig_shape = input.shape
            input = input.reshape([-1, input.shape[-1]])
        assert (
            input.dim() == 2
        ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}"
        if token_type_ids is not None:
            token_type_ids = token_type_ids.clone()[:, :-1]

        assert self.gate is not None

        (
            dispatched_input,
            combine_weights,
            _,  # dispatch_mask
            scatter_index,
            _,  # router_loss
            gate_logits,
            _,  # gate_prob
        ) = self.gate_and_dispatch(input, token_type_ids, is_multimodel=is_multimodel)

        shared_out = None
        if self.shared_experts is not None:
            shared_out = self.shared_experts(input)

        expert_out = self.forward_experts(dispatched_input)

        combined_output = self.combine_expert_output(
            expert_out, combine_weights, scatter_index
        )

        if shared_out is not None:
            # Align devices before the in-place add, as the all-gather
            # variant of this layer does; a no-op when they already match.
            combined_output += shared_out.to(combined_output.device)

        if orig_shape is not None:
            combined_output = combined_output.clone().reshape(
                orig_shape[:-1] + (combined_output.shape[-1],)
            )
        return combined_output, combine_weights, None, gate_logits


class MOEAllGatherLayerV2(MOELayer):
    """
    MoE Layer with allgather implement.
    """

    def __init__(
        self,
        gate: nn.Module,
        experts: List[nn.Module],
        layer_idx,
        shared_experts: Optional[List[nn.Module]] = None,
        group=None,
        recompute=False,
        k=2,
        enable_reverse_token_drop=False,
        all_to_all_dropout=0,
        group_experts=False,
        use_expert_out_alltoall=True,
        use_expert_alltoall_overlap=False,
        use_padding=True,
        dense_token_type=3,  # considered as dense tokens (no moe)
        moe_statics=None,
        moe_num_experts=None,
    ):
        """Set up the all-gather MoE layer on top of ``MOELayer``.

        ``use_expert_out_alltoall`` and ``use_expert_alltoall_overlap`` are
        only echoed in the start-up log line; every other argument is either
        forwarded positionally to ``MOELayer.__init__`` or stored on the
        instance.
        """
        super().__init__(
            gate,
            experts,
            layer_idx,
            shared_experts,
            group,
            recompute,
            k,
            all_to_all_dropout,
            group_experts,
            moe_statics,
            moe_num_experts,
        )
        # Behaviour switches.
        self.is_allgather_moe_layer = True
        self.enable_reverse_token_drop = enable_reverse_token_drop
        self.use_padding = use_padding
        self.dense_token_type = dense_token_type

        # Filled in lazily by forward() / fused_gate_and_dispatch().
        self.send_rank = None
        self.local_expert_id = None
        self.capacity_tensor = None
        self.dense_experts = None

        logger.info(
            f"uisng MOEAllGatherLayerV2, use_expert_out_alltoall={use_expert_out_alltoall}, "
            f"use_padding={use_padding}, use_expert_alltoall_overlap={use_expert_alltoall_overlap} "
            f"enable_reverse_token_drop={self.enable_reverse_token_drop}"
        )

        # float32 scalar constants.
        self.two = torch.tensor(2, dtype=torch.float32)
        self.zero = torch.tensor(0, dtype=torch.float32)

    def forward(
        self,
        input: torch.Tensor,
        token_type_ids=None,
        use_dense_expert=False,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
        """Run the fused gate/dispatch, the local experts, and the combine step.

        Args:
            input (Tensor): [seq_len, hidden_dim], or a 3-D tensor whose
                leading dimensions are flattened for routing and restored on
                output.
            token_type_ids: Optional 2-D token types; the last column is
                dropped and the rest flattened before routing.
            use_dense_expert: When True (and ``token_type_ids`` is given),
                tokens whose type equals ``self.dense_token_type`` are marked
                via a mask that is handed to ``fused_gate_and_dispatch``.

        Returns:
            tuple: (
                combined_output: Aggregated expert outputs (plus the shared
                    experts' output when configured),
                combine_weights: Expert combination coefficients,
                None: placeholder for a router loss,
                gate_logits: The first element of the gate-output pair
                    returned by ``fused_gate_and_dispatch``,
            )

        Raises:
            RuntimeError: If no expert produced an output.
        """
        use_fuse = isinstance(self.gate, (TopKGate))
        assert use_fuse

        orig_shape = None
        if input.ndim == 3:
            orig_shape = input.shape
            input = input.reshape([-1, input.shape[-1]])

        assert (
            len(input.shape) == 2
        ), f"input Tensor must have dimensions: (s)equence, (d)im, got:{input.shape}"

        global_dense_expert_mask = None
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, :-1].reshape([-1])
            if use_dense_expert:
                global_dense_expert_mask = token_type_ids == self.dense_token_type

        assert self.gate is not None

        (
            dispatched_input,
            _,  # global_hidden_states
            local_combine_weights,
            _,  # expert_num_global_no_token_drop
            _,  # expert_num_global
            _,  # expert_num_global_list
            local_scatter_index,
            _,  # scatter_index_rev
            _,  # router_loss
            (gate_logits, _),
            _,  # (gate_logits_mm, gate_prob_mm)
            _,  # expert_num_local
        ) = self.fused_gate_and_dispatch(
            input, token_type_ids, global_dense_expert_mask
        )

        # Built once and kept on the instance. Nothing in this method reads
        # them back; they are retained because they are instance attributes
        # that other code may inspect.
        if self.send_rank is None:
            capacity = self.gate.get_capacity(input.shape[0])
            self.send_rank = (
                torch.arange(1)
                .repeat_interleave(capacity * self.num_local_experts)
                .to(torch.int32)  # cap
            )
            self.local_expert_id = (
                torch.arange(self.num_local_experts)
                .repeat_interleave(capacity)
                .repeat(1)
                .to(self.send_rank.dtype)
            )

        expert_outs = self.forward_experts(*dispatched_input)
        active_outs = [out for out in expert_outs if out is not None]
        if not active_outs:
            raise RuntimeError("no expert produced an output to combine")
        # Gather every expert output on the device of the first active expert.
        target_device = active_outs[0].device
        expert_outs = torch.cat(
            [out.to(target_device) for out in active_outs], dim=0
        )  # [e*c,m]

        # global -> local
        combined_output = self.combine_expert_output(
            expert_outs, local_combine_weights, local_scatter_index
        )

        if self.shared_experts is not None:
            shared_out = self.shared_experts(input).to(combined_output.device)
            combined_output += shared_out

        if orig_shape is not None:
            combined_output = combined_output.reshape(
                *orig_shape[:-1], combined_output.shape[-1]
            )

        return combined_output, local_combine_weights, None, gate_logits

    def _expand_modality_expert_id(
        self,
        expert_id: torch.Tensor,  # (seqlen, k)
        seqlen: int,
        k: int,
        num_expert_per_modality: int,
        group_size: int,
        modality_offset: int,
        is_group_expert: bool,
    ) -> torch.Tensor:
        """
        expert_id: tensor of shape (seqlen, k), containing expert ids
        Returns: tensor of same shape, with updated expert ids
        """
        device = expert_id.device
        expert_id = expert_id.clone()

        if is_group_expert:
            # idx % k * group_size
            offsets = (torch.arange(k, device=device) * group_size).view(
                1, k
            )  # shape (1, k)
            expert_id += offsets

        if num_expert_per_modality <= 0:
            return expert_id

        # Compute rank and local expert id
        rank = expert_id // num_expert_per_modality
        expert_id_in_rank = expert_id % num_expert_per_modality

        # Compute new expert id with modality-aware adjustment
        expert_id_out = (
            rank * (num_expert_per_modality * 2)  # 2 modalities assumed
            + expert_id_in_rank
            + modality_offset * num_expert_per_modality
        )

        return expert_id_out

    def expand_modality_expert_id(
        self,
        expert_id,
        num_expert_per_modality,
        group_size,
        modality_offset,
        is_group_expert,
    ):
        """Shape-inferring front end for ``_expand_modality_expert_id``.

        Reads ``(seq_len, k)`` from the 2-D ``expert_id`` tensor and forwards
        everything to the underlying implementation.
        """
        n_tokens, n_routes = expert_id.shape
        forwarded = (
            expert_id,
            n_tokens,
            n_routes,
            num_expert_per_modality,
            group_size,
            modality_offset,
            is_group_expert,
        )
        return self._expand_modality_expert_id(*forwarded)

    def fused_gate_logits_process_fused(
        self, gate_logits_lm, gate_logits_mm=None, token_type_ids=None
    ):
        """Turn raw gate logits into per-token combine weights and expert ids.

        The correction bias (when enabled) only steers which experts are
        selected; the returned weights are always read from the un-biased
        probabilities.

        Args:
            gate_logits_lm (Tensor): Text-expert logits, [num_tokens, experts].
            gate_logits_mm (Tensor, optional): Multimodal-expert logits with
                the same layout.
            token_type_ids (Tensor, optional): Flat per-token types; tokens
                with type 0 take the text routing, all others the multimodal
                routing.

        Returns:
            tuple: (
                weight_and_expert: [num_tokens, 2 * top_k]; the first top_k
                    columns are weights, the last top_k are expert ids stored
                    as float32,
                prob_lm: text probabilities reshaped to [num_tokens, -1],
                prob_mm: multimodal probabilities, or None when
                    ``token_type_ids`` or ``gate_logits_mm`` is None,
            )
        """
        top_k = self.k
        num_expert_per_rank_per_modality = gate_logits_lm.shape[-1]
        group_size = gate_logits_lm.shape[-1] // top_k
        if self.group_experts:
            # Grouped routing: split experts into top_k groups and take the
            # single best expert of every group.
            assert not self.use_correction_bias
            gate_logits_lm = gate_logits_lm.reshape(
                [gate_logits_lm.shape[0], top_k, -1]
            )
            prob_lm = self.gate.act(gate_logits_lm)
            weight_lm, expert_id_lm = prob_lm.topk(k=1, dim=-1)
            weight_lm = weight_lm.reshape([gate_logits_lm.shape[0], -1])
            group_size = gate_logits_lm.shape[-1]
            expert_id_lm = expert_id_lm.squeeze(-1)
        else:
            prob_lm = self.gate.act(gate_logits_lm)
            if self.use_correction_bias:
                bias_lm = self.moe_statics.e_score_correction_bias[0].detach().to(
                    prob_lm.device
                )
                _, expert_id_lm = (prob_lm + bias_lm).topk(k=top_k, dim=-1)
                # Weights come from the un-biased probabilities.
                weight_lm = torch.gather(prob_lm, -1, expert_id_lm)
            else:
                weight_lm, expert_id_lm = prob_lm.topk(k=top_k, dim=-1)

        expert_id_lm = self.expand_modality_expert_id(
            expert_id_lm,
            num_expert_per_modality=(
                num_expert_per_rank_per_modality if token_type_ids is not None else 0
            ),
            group_size=group_size,
            modality_offset=0,
            is_group_expert=self.group_experts,
        )
        expert_id_lm = expert_id_lm.reshape(weight_lm.shape)
        lm_weight_and_expert_id = torch.cat(
            [weight_lm, expert_id_lm.to(torch.float32)], -1
        )

        if token_type_ids is None or gate_logits_mm is None:
            return (
                lm_weight_and_expert_id,
                prob_lm.reshape([prob_lm.shape[0], -1]),
                None,
            )

        prob_mm = self.gate.act(gate_logits_mm)
        if self.use_correction_bias:
            bias_mm = self.moe_statics.e_score_correction_bias[1].detach().to(
                prob_mm.device
            )
            _, expert_id_mm = (prob_mm + bias_mm).topk(k=top_k, dim=-1)
            # Index with the multimodal selection itself.
            weight_mm = torch.gather(prob_mm, -1, expert_id_mm)
        else:
            weight_mm, expert_id_mm = prob_mm.topk(k=top_k, dim=-1)

        expert_id_mm = self.expand_modality_expert_id(
            expert_id_mm,
            num_expert_per_modality=num_expert_per_rank_per_modality,
            group_size=group_size,
            modality_offset=1,
            is_group_expert=False,
        )
        expert_id_mm = expert_id_mm.reshape(weight_mm.shape)
        mm_weight_and_expert_id = torch.cat(
            [weight_mm, expert_id_mm.to(torch.float32)], -1
        )
        # Per token: text routing for type 0, multimodal routing otherwise.
        weight_and_expert = torch.where(
            (token_type_ids == 0).unsqueeze(-1),
            lm_weight_and_expert_id.to(token_type_ids.device),
            mm_weight_and_expert_id.to(token_type_ids.device),
        )
        return weight_and_expert, prob_lm.reshape([prob_lm.shape[0], -1]), prob_mm

    def moe_gate_dispatch_partial_nosoftmaxtopk(
        self,
        x,
        combine_weights,
        expert_id,
        k,
        num_experts,
    ):
        """
        MoE Gate Dispatch kernel

        Groups the (token, route) pairs by expert id with a stable sort and
        gathers the matching rows of ``x`` in that order. No capacity limit is
        applied here.

        Args:
            x: Token activations, [num_rows, hidden_size].
            combine_weights: Routing weights, [num_rows, k]; returned as a copy.
            expert_id: Integer expert ids, [num_rows, k].
            k: Ignored; the fan-out is taken from ``expert_id.shape[1]``.
            num_experts: Number of expert buckets to count.

        Returns:
            tuple: (
                y: rows of ``x`` reordered by expert,
                combine_weights_out: clone of ``combine_weights``,
                scatter_index: [k, num_rows] int32; position inside ``y`` of
                    every (route, token) pair, -1 where none was assigned,
                scatter_index_rev: source row in ``x`` of every row of ``y``,
                expert_nums_local: [num_experts] int64 routes per expert,
                expert_nums_local: the same tensor again,
            )
        """
        device = x.device
        num_rows, _ = x.shape
        k = expert_id.shape[1]
        expert_ids_flat = expert_id.reshape(-1)  # [num_rows * k]

        # Stable sort keeps the original (token, route) order inside every
        # expert's group.
        _, sorted_indices = torch.sort(expert_ids_flat, stable=True)
        sorted_indices = sorted_indices.to(device)

        # Count routes per expert; ids outside [0, num_experts) are ignored,
        # exactly as an equality test against each valid id would.
        in_range = (expert_ids_flat >= 0) & (expert_ids_flat < num_experts)
        expert_nums_local = torch.bincount(
            expert_ids_flat[in_range].long(), minlength=num_experts
        ).to(device=device, dtype=torch.int64)
        total_dispatched_tokens = int(expert_nums_local.sum().item())

        scatter_index_rev = sorted_indices // k
        y = x[sorted_indices // k]  # [total_dispatched_tokens, hidden_size]

        # Flat pair index = token * k + route, so inverting the sort
        # permutation in the flat layout and transposing gives [k, num_rows].
        flat_scatter = torch.full(
            (num_rows * k,), -1, dtype=torch.int32, device=device
        )
        flat_scatter[sorted_indices[:total_dispatched_tokens]] = torch.arange(
            total_dispatched_tokens, dtype=torch.int32, device=device
        )
        scatter_index = flat_scatter.reshape(num_rows, k).t().contiguous()

        combine_weights_out = combine_weights.clone()

        return (
            y,  # [total_dispatched_tokens, hidden_size]
            combine_weights_out,  # [num_rows, k]
            scatter_index,  # [k, num_rows]
            scatter_index_rev,  # [total_dispatched_tokens]
            expert_nums_local,  # [num_experts]
            expert_nums_local,  # [num_experts]
        )

    def fused_gate_and_dispatch(
        self, input, token_type_ids=None, global_dense_expert_mask=None
    ):
        """Run the gate, pick experts, and reorder tokens by expert in one pass.

        Args:
            input (Tensor): Input tensor of shape [seq_len, hidden_dim]
            token_type_ids: Optional token types; flattened and forwarded to
                the gate as an extra positional argument.
            global_dense_expert_mask: Optional boolean mask over tokens. Masked
                tokens get zero combine weights and are sent to an extra,
                temporary expert bucket whose count is stripped again before
                returning. When it is given, the modality-aware id expansion
                is skipped (``token_type_ids`` is withheld from
                ``fused_gate_logits_process_fused``).

        Returns:
            tuple (12 items): (
                dispatched_input: sequence with one entry per local expert
                    (a tensor, or None for an expert without tokens when
                    ``use_padding`` is off),
                global_hidden_states: the unmodified ``input``,
                local_combine_weights: combine weights cast to the dispatched
                    dtype, normalised per token when
                    ``self.gate.norm_gate_logits`` is set,
                expert_num_global_notrunc: per-expert token counts before the
                    capacity clamp,
                expert_num_global: per-expert token counts clamped to capacity,
                expert_num_global_list: ``expert_num_global`` as a Python list,
                local_scatter_index: [seq_len, k] positions in the dispatched
                    buffer,
                scatter_index_rev: dispatched row -> source token index,
                router_loss: a zero tensor of shape [1],
                (gate_logits_lm, gate_prob_lm),
                (gate_logits_mm, gate_prob_mm),
                expert_num_local: per-expert token counts (a Python list when
                    ``use_padding`` is off, a tensor otherwise),
            )
        """
        seqlen, d_model = input.shape
        args = ()
        if token_type_ids is not None:
            token_type_ids = token_type_ids.reshape([-1])
            args = (token_type_ids,)

        # Placeholder only: no routing loss is computed in this method.
        router_loss = torch.zeros([1], dtype=torch.float32)
        top_k = self.k

        def build_weights_and_expert_id(input):
            # Runs the gate and converts its logits to (weights | expert ids).
            nonlocal token_type_ids, args
            logits = self.gate(input, *args, transform_weight=False)
            if self.config.multimodel_experts:
                # First half of the logits: text experts; second half: multimodal.
                gate_logits_lm, gate_logits_mm = logits.chunk(2, dim=-1)
            else:
                gate_logits_lm, gate_logits_mm = logits, None

            weigth_and_expert, gate_prob_lm, gate_prob_mm = (
                self.fused_gate_logits_process_fused(
                    gate_logits_lm,
                    gate_logits_mm,
                    token_type_ids if global_dense_expert_mask is None else None,
                )
            )
            return (
                weigth_and_expert,
                gate_logits_lm,
                gate_logits_mm,
                gate_prob_lm,
                gate_prob_mm,
            )

        capacity = self.gate.get_capacity(input.shape[0]) * self.world_size
        global_hidden_states = input
        (
            combine_weights_and_expert_id,
            gate_logits_lm,
            gate_logits_mm,
            gate_prob_lm,
            gate_prob_mm,
        ) = build_weights_and_expert_id(input)

        # The fused tensor is [weights | expert ids stored as float32].
        combine_weights_unnorm, expert_id = combine_weights_and_expert_id.chunk(
            2, dim=-1
        )
        expert_id = expert_id.to(torch.int32)
        num_experts = (
            sum(self.config.moe_num_experts)
            if isinstance(self.config.moe_num_experts, (tuple, list))
            else self.config.moe_num_experts
        )
        if global_dense_expert_mask is not None:
            # Park masked tokens in an extra bucket (id == num_experts) with
            # zero weight; the bucket's count is sliced off further below.
            combine_weights_unnorm[global_dense_expert_mask] = 0.0
            expert_id[global_dense_expert_mask] = num_experts
            num_experts += 1

        (
            dispatched_input,
            combine_weights_unnorm,
            scatter_index,  # input -> dispatched_input
            scatter_index_rev,  # dispatch-input -> input
            expert_num_global,
            expert_num_local,
        ) = self.moe_gate_dispatch_partial_nosoftmaxtopk(
            global_hidden_states,
            combine_weights_unnorm,
            expert_id,
            top_k,
            num_experts,
        )

        if self.use_correction_bias:
            # Accumulate per-expert token counts into the usage statistics.
            if self.gate.config.multimodel_experts:
                # MLLM
                for i in range(len(self.moe_statics.expert_usage)):
                    self.moe_statics.expert_usage[i] += (
                        expert_num_local[self.gate.experts_type_mask[i]]
                        .detach()
                        .to(self.moe_statics.expert_usage.device)
                    )
            else:
                # LLM
                self.moe_statics.expert_usage[0] += expert_num_local.detach().to(
                    self.moe_statics.expert_usage.device
                )

        # Guard for a 0-dim `scatter_index_rev` (only expected without
        # padding): replace it with an empty 1-D tensor.
        if scatter_index_rev.ndim == 0:
            assert not self.use_padding
            scatter_index_rev = torch.empty([0], dtype=scatter_index_rev.dtype)

        expert_num_global_notrunc = expert_num_global
        self.capacity_tensor = torch.tensor(capacity).to(dtype=expert_num_global.dtype)
        # Only the reported counts are clamped to capacity here.
        expert_num_global = torch.minimum(expert_num_global, self.capacity_tensor)

        if global_dense_expert_mask is not None:
            # Drop the temporary dense-token bucket from every count vector.
            expert_num_global = expert_num_global[:-1]
            expert_num_local = expert_num_local[:-1]
            expert_num_global_notrunc = expert_num_global_notrunc[:-1]

        scatter_index = scatter_index.transpose(1, 0)  # [k,s] ->[s,k]
        scatter_index = scatter_index.to(combine_weights_unnorm.device)

        # NOTE(review): `last_local_expert` is fixed at 0, so `offset` below is
        # always 0, and `expert_offset_global` is not read again in this method.
        last_local_expert = 0
        expert_offset_global = expert_num_global.cumsum(-1)

        expert_num_global_list = expert_num_global
        if self.use_padding:
            offset = last_local_expert * capacity
        else:
            offset = 0
        local_combine_weights_unnorm = combine_weights_unnorm.contiguous()
        # With offset == 0 both branches of the where() yield `scatter_index`.
        local_scatter_index = torch.where(
            combine_weights_unnorm > 0.0,
            scatter_index + offset,
            scatter_index,
        )
        if self.gate.norm_gate_logits:
            # Normalise per token; the clip guards against an all-zero row.
            local_combine_weights = local_combine_weights_unnorm / torch.clip(
                local_combine_weights_unnorm.sum(-1, keepdim=True), min=1e-12
            )
        else:
            local_combine_weights = local_combine_weights_unnorm
        local_combine_weights = local_combine_weights.to(dispatched_input.dtype)
        if self.use_padding:
            # NOTE(review): an even reshape presumably expects every local
            # expert to own the same number of rows - confirm against callers.
            dispatched_input = dispatched_input.reshape(
                [self.num_local_experts, -1, d_model]
            )
            dispatched_input = dispatched_input.unbind(0)
        else:
            # Split the expert-sorted rows by the actual per-expert counts;
            # experts that received no token are represented by None.
            s = 0
            e = self.num_local_experts
            expert_num_local = expert_num_local.tolist()[s:e]
            expert_num_local_valid = [i for i in expert_num_local if i > 0]
            valid_pos = [j for j, i in enumerate(expert_num_local) if i > 0]
            if expert_num_local_valid:
                dispatched_input_list = dispatched_input.split(expert_num_local_valid)
                dispatched_input = [None] * len(expert_num_local)
                for p, t in zip(valid_pos, dispatched_input_list):
                    dispatched_input[p] = t
            else:
                # No expert has tokens: hand the dispatched tensor to slot 0
                # so the list still contains one tensor.
                dispatched_input = [dispatched_input] + (
                    [None] * (len(expert_num_local) - 1)
                )

        expert_num_global_list = expert_num_global_list.tolist()

        return (
            dispatched_input,
            global_hidden_states,
            local_combine_weights,
            expert_num_global_notrunc,  # for auxloss calculation.
            expert_num_global,
            expert_num_global_list,
            local_scatter_index,
            scatter_index_rev,
            router_loss,
            (gate_logits_lm, gate_prob_lm),
            (gate_logits_mm, gate_prob_mm),
            expert_num_local,
        )

    def forward_experts(self, *dispatched_input):
        """Run every local expert on its dispatched chunk.

        The experts used are the slice of ``self.experts`` owned by
        ``self.rank`` (``num_local_experts`` consecutive modules). Chunk ``i``
        is fed to local expert ``i``; a ``None`` chunk means that expert
        received no tokens and is skipped.

        Args:
            *dispatched_input: One tensor (or None) per local expert.

        Returns:
            list: One entry per input chunk - the expert's output tensor, or
            None where the chunk was None.
        """
        assert isinstance(self.experts, nn.ModuleList), type(self.experts)

        first_local = self.rank * self.num_local_experts
        true_experts = self.experts[first_local : first_local + self.num_local_experts]

        return [
            None if chunk is None else true_experts[iexpert](chunk.contiguous())
            for iexpert, chunk in enumerate(dispatched_input)
        ]


class Ernie4_5_DecoderLayer(nn.Module):
    """A single transformer decoder layer of the ERNIE-MoE model.

    The layer normalizes its input, runs self-attention, adds the residual,
    normalizes again, runs the feed-forward block and adds the second residual.
    The feed-forward block is either a dense ``Ernie4_5_MLP`` or a
    Mixture-of-Experts layer, depending on the configuration and ``layer_idx``.
    """

    _keep_in_fp32_modules = ["mlp.gate", "e_score_correction_bias"]

    def __init__(self, config, layer_idx):
        """Initialize the decoder layer.

        Args:
            config (Ernie4_5_MoEConfig): Model configuration.
            layer_idx (int): Index of this layer in the transformer stack.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx
        self.config = config
        self.use_moe = config.use_moe
        self.self_attn = Ernie4_5_Attention(config, layer_idx)

        # Start/end indices may be given per modality (tuple/list). The layer is
        # MoE as soon as it lies inside the widest [min(start), max(end)] window.
        moe_layer_start_index = (
            min(config.moe_layer_start_index)
            if isinstance(config.moe_layer_start_index, (tuple, list))
            else config.moe_layer_start_index
        )
        moe_layer_end_index = (
            max(config.moe_layer_end_index)
            if isinstance(config.moe_layer_end_index, (tuple, list))
            else config.moe_layer_end_index
        )

        if (
            self.use_moe
            and ((layer_idx + 1) % config.moe_layer_interval == 0)
            and layer_idx >= moe_layer_start_index
            and layer_idx <= moe_layer_end_index
        ):
            gate, experts, lm_gate, lm_experts, moe_statics = (
                self._init_gate_and_experts(layer_idx)
            )
            shared_experts = (
                self._init_shared_experts()
                if hasattr(config, "moe_num_shared_experts")
                else None
            )

            # Dense experts are not built here; per the original note only
            # `MOEAllGatherLayerV2` would be able to process them.
            moe_cls = MOELayer
            if config.moe_multimodal_dispatch_use_allgather:  # v2
                logger.info("Enable MOEAllGatherLayerV2!")
                moe_cls = partial(
                    MOEAllGatherLayerV2,
                    use_expert_out_alltoall="alltoall"
                    in config.moe_multimodal_dispatch_use_allgather,
                    use_padding=False,
                    enable_reverse_token_drop=config.moe_reverse_token_drop,
                    dense_token_type=config.moe_dense_experts_token_type_id,
                )

            self.mlp = moe_cls(
                gate=gate,
                experts=experts,
                layer_idx=layer_idx,
                shared_experts=shared_experts,
                group=config.moe_group,
                recompute=False,
                k=config.moe_k,
                all_to_all_dropout=config.moe_all_to_all_dropout,
                group_experts=False,
                moe_statics=moe_statics,
                moe_num_experts=config.moe_num_experts,
            )

            _mlp_text = MOELayer(
                gate=lm_gate,
                experts=lm_experts,
                layer_idx=layer_idx,
                shared_experts=shared_experts,
                group=config.moe_group,
                recompute=False,
                k=config.moe_k,
                all_to_all_dropout=config.moe_all_to_all_dropout,
                group_experts=False,
                moe_statics=moe_statics,
                moe_num_experts=config.moe_num_experts,
            )
            self.mlp_text = (
                lambda: _mlp_text
            )  # This lambda prevents the text parameter from being scanned into the state-dict
        else:
            self.mlp = Ernie4_5_MLP(config)

        Norm = RMSNorm

        self.input_layernorm = Norm(config)
        self.post_attention_layernorm = Norm(config)

        self.residual_add1 = FusedDropoutImpl(
            config.hidden_dropout_prob, mode="upscale_in_train"
        )
        self.residual_add2 = FusedDropoutImpl(
            config.hidden_dropout_prob, mode="upscale_in_train"
        )

    def _init_shared_experts(self):
        """Build the shared-expert MLP.

        The shared experts are a single ``Ernie4_5_MoeMLP`` whose intermediate
        size is the per-expert size multiplied by ``moe_num_shared_experts``.
        The per-expert size is ``moe_intermediate_size`` when set (its first
        entry if it is a tuple/list), otherwise ``intermediate_size``.

        Returns:
            Optional[Ernie4_5_MoeMLP]: The shared-expert module, or ``None``
            when ``moe_num_shared_experts`` is not positive.
        """
        # Check before copying: no need to deep-copy the config for a no-op.
        if self.config.moe_num_shared_experts <= 0:
            return None

        cfg = deepcopy(self.config)
        if cfg.moe_intermediate_size:
            inter_size = (
                next(iter(cfg.moe_intermediate_size))
                if isinstance(cfg.moe_intermediate_size, (tuple, list))
                else cfg.moe_intermediate_size
            )
            cfg.intermediate_size = inter_size * cfg.moe_num_shared_experts
        else:
            cfg.intermediate_size = (
                cfg.intermediate_size * cfg.moe_num_shared_experts
            )
        cfg.disable_ffn_model_parallel = False  # split shared expert
        return Ernie4_5_MoeMLP(cfg, True)

    def _init_gate_and_experts(self, layer_idx):
        """Initialize MoE gate and expert networks.

        Builds a list of ``(num_experts, expert_module)`` pairs and hands it to
        ``get_gate``. When ``moe_intermediate_size`` is a tuple/list, one pair
        is built per entry; an entry whose own start/end layer window does not
        contain ``layer_idx`` gets an ``nn.Identity`` placeholder instead of a
        real expert.

        Args:
            layer_idx (int): Current layer index.

        Returns:
            Tuple: ``(gate, experts, lm_gate, lm_experts, moe_statics)`` where
                - gate / experts: values returned by ``get_gate``
                - lm_gate / lm_experts: additional values returned by
                  ``get_gate`` when ``cfg.multimodel_experts`` is set,
                  otherwise ``None``
                - moe_statics: ``MoEStatics`` instance when
                  ``cfg.moe_use_aux_free`` is set, otherwise ``None``
        """
        cfg = deepcopy(self.config)
        fc_cls = Ernie4_5_MoeMLP
        if cfg.moe_intermediate_size:
            if isinstance(cfg.moe_intermediate_size, (tuple, list)):
                assert isinstance(cfg.moe_num_experts, (tuple, list)) and len(
                    cfg.moe_num_experts
                ) == len(cfg.moe_intermediate_size)
                fc = []
                for _i, (num_experts, intermediate_size) in enumerate(
                    zip(cfg.moe_num_experts, cfg.moe_intermediate_size)
                ):
                    ex_cfg = deepcopy(cfg)
                    ex_cfg.intermediate_size = intermediate_size
                    cur_modality_start_layer_idx = (
                        cfg.moe_layer_start_index[_i]
                        if isinstance(cfg.moe_layer_start_index, (tuple, list))
                        else cfg.moe_layer_start_index
                    )
                    cur_modality_end_layer_idx = (
                        cfg.moe_layer_end_index[_i]
                        if isinstance(cfg.moe_layer_end_index, (tuple, list))
                        else cfg.moe_layer_end_index
                    )
                    if (
                        layer_idx >= cur_modality_start_layer_idx
                        and layer_idx <= cur_modality_end_layer_idx
                    ):
                        if _i == 1:
                            # Entry 1 is built under a dedicated name guard.
                            with UniqueNameGuard(f"mm_expert_{layer_idx}_"):
                                fc.append((num_experts, fc_cls(ex_cfg)))
                        else:
                            fc.append((num_experts, fc_cls(ex_cfg)))
                    else:
                        logger.info(
                            f"moe multimodal experts use Identity layer_idx: {layer_idx}"
                        )
                        fc.append((num_experts, nn.Identity()))
            else:
                cfg.intermediate_size = cfg.moe_intermediate_size
                fc = [(cfg.moe_num_experts, fc_cls(cfg, layer_idx))]
        else:
            fc = [(cfg.moe_num_experts, fc_cls(cfg, layer_idx))]
        if cfg.multimodel_experts:
            gate, experts, lm_gate, lm_experts = get_gate(self.config, fc, layer_idx)
        else:
            gate, experts = get_gate(self.config, fc, layer_idx)
            lm_gate, lm_experts = None, None

        # for AuxLoss Free Router:
        if cfg.moe_use_aux_free:
            moe_statics = MoEStatics(cfg, layer_idx)
        else:
            moe_statics = None
        return gate, experts, lm_gate, lm_experts, moe_statics

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        attn_mask_start_row_indices: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        use_cache: Optional[bool] = False,
        output_gate_logits=True,  # PP model should not output gate logits,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """Forward pass through the decoder layer.

        Args:
            hidden_states (torch.Tensor): Input tensor [batch_size, seq_len, hidden_size]
            attention_mask (Optional[torch.Tensor]): Attention mask tensor
            attn_mask_start_row_indices (Optional[torch.Tensor]): Indices for variable length attention
            position_ids (Optional[torch.Tensor]): Position indices for rotary embeddings
            token_type_ids (Optional[torch.Tensor]): Per-token type ids. When any
                entry is non-zero the multimodal MoE branch (``self.mlp``) is used,
                otherwise the text-only branch (``self.mlp_text()``).
            output_attentions (Optional[bool]): Whether to return attention weights
            past_key_value (Optional[Tuple[torch.Tensor]]): Cached key/value states
            use_cache (Optional[bool]): Whether to cache key/value states
            output_gate_logits (bool): Whether to return MoE gate logits

        Returns:
            Either the hidden-states tensor alone, or a tuple built in this order:
                - hidden_states
                - attention weights (only if ``output_attentions``)
                - present key/value (only if ``use_cache``)
                - gate logits (only if ``self.use_moe`` and ``output_gate_logits``;
                  ``None`` for a dense feed-forward block)
        """
        residual = hidden_states

        # Host-side flag selecting the MoE branch below. Only the flag that is
        # actually consumed is copied to the host.
        if token_type_ids is not None:
            is_multimodel_token_cpu = token_type_ids.any().cpu()
        else:
            is_multimodel_token_cpu = None

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        (hidden_states, self_attn_weights, present_key_value, *router_loss_attn) = (
            self.self_attn(
                hidden_states=hidden_states,
                past_key_value=past_key_value,
                attention_mask=attention_mask,
                attn_mask_start_row_indices=attn_mask_start_row_indices,
                position_ids=position_ids,
                output_attentions=output_attentions,
                use_cache=use_cache,
                token_type_ids=token_type_ids,
            )
        )
        hidden_states = self.residual_add1(hidden_states, residual)

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)

        if isinstance(self.mlp, MOELayer):
            if is_multimodel_token_cpu:
                hidden_states, _, router_loss, gate_logits = self.mlp(
                    hidden_states, token_type_ids
                )
            else:
                hidden_states, _, router_loss, gate_logits = self.mlp_text()(
                    hidden_states, None, is_multimodel=False
                )
        else:
            hidden_states = self.mlp(hidden_states)
            gate_logits, router_loss = None, None

        hidden_states = self.residual_add2(hidden_states, residual)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        if self.use_moe:
            # `router_loss_attn` is non-empty only when attention returned an
            # extra value. A dense feed-forward block leaves `router_loss` as
            # None, so guard the addition instead of raising a TypeError.
            # NOTE: `router_loss` is not part of the returned outputs.
            if router_loss_attn:
                attn_router_loss = router_loss_attn[0]
                router_loss = (
                    attn_router_loss
                    if router_loss is None
                    else router_loss + attn_router_loss
                )

            if output_gate_logits:
                outputs += (gate_logits,)

        # remove empty tuple for pipeline parallel
        if len(outputs) == 1:
            outputs = outputs[0]

        return outputs


class Ernie4_5_PretrainedModel(PreTrainedModel):
    """Base class for ERNIE pretrained models.

    Declares class-level attributes only; it adds no methods of its own on top
    of ``PreTrainedModel``.
    """

    # Configuration class associated with this model family.
    config_class = Ernie4_5_MoEConfig
    # NOTE(review): presumably the attribute name under which transformers looks
    # for the backbone; Ernie4_5_MoeForCausalLM stores its backbone as
    # `self.model`, not `self.ernie` - confirm this mismatch is intended.
    base_model_prefix = "ernie"
    # NOTE(review): presumably tells device-map placement to keep each decoder
    # layer on a single device - confirm against the transformers docs.
    _no_split_modules = ["Ernie4_5_DecoderLayer"]


class Ernie4_5_Model(Ernie4_5_PretrainedModel):
    """The core ERNIE transformer model with MoE (Mixture of Experts) support.

    Consists of a token embedding, a stack of ``Ernie4_5_DecoderLayer`` and a
    final ``RMSNorm``.
    """

    def __init__(self, config: Ernie4_5_MoEConfig):
        """Initialize the ERNIE model architecture.

        Args:
            config (Ernie4_5_MoEConfig): Model configuration.
        """
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.config = config

        self.embed_tokens = nn.Embedding(
            self.vocab_size,
            self.hidden_size,
        )

        self.layers = nn.ModuleList(
            [Ernie4_5_DecoderLayer(config, i) for i in range(config.num_hidden_layers)]
        )
        Norm = RMSNorm
        self.norm = Norm(config)

        self.gradient_checkpointing = False

    def get_input_embeddings(self):
        """Get the input embedding layer.

        Returns:
            nn.Embedding: The embedding layer for input tokens
        """
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Set new input embeddings.

        Args:
            value (nn.Embedding): New embedding layer to use
        """
        self.embed_tokens = value

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        token_type_ids=None,
        attention_mask=None,
        attn_mask_start_row_indices=None,
        inputs_embeds=None,
        use_cache=None,
        past_key_values=None,
        output_attentions=False,
        output_hidden_states=None,
        return_dict=False,
    ):
        """Forward pass through the ERNIE model.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given.

        Args:
            input_ids (Optional[torch.Tensor]): Input token IDs, [batch_size, seq_len]
            position_ids (Optional[torch.Tensor]): Position indices
            token_type_ids (Optional[torch.Tensor]): Per-token type ids, forwarded
                unchanged to every decoder layer
            attention_mask (Optional[torch.Tensor]): Attention mask
            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length attention indices
            inputs_embeds (Optional[torch.Tensor]): Precomputed embeddings,
                [batch_size, seq_len, hidden_size]
            use_cache (Optional[bool]): Whether to cache key/value states
            past_key_values (Optional[Tuple[Tuple[torch.Tensor]]]): Cached key/value
                states, one entry per layer
            output_attentions (Optional[bool]): Whether to output attention weights
            output_hidden_states (Optional[bool]): Whether to output all hidden states
            return_dict (Optional[bool]): Whether to return dict or tuple

        Returns:
            Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
                With ``return_dict`` falsy, a tuple holding the non-``None``
                values among, in order:
                - last_hidden_state: Final layer hidden states (after the final norm)
                - past_key_values: Cached key/value states if use_cache=True
                - hidden_states: All hidden states if output_hidden_states=True
                - attentions: Attention weights if output_attentions=True
                - router_loss: Zero scalar if use_moe=True (this method does not
                  accumulate any per-layer loss into it)
                - gate_logits: Tuple of per-layer gate logits (empty without MoE)

        Raises:
            ValueError: If both or neither of ``input_ids`` / ``inputs_embeds``
                are given, or if the given one has an unexpected rank.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # Validate the inputs (the sequence length itself is not needed here).
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        elif input_ids is not None:
            if len(input_ids.shape) != 2:
                raise ValueError(
                    f"input_ids must be 2-D [batch_size, seq_len], got shape {tuple(input_ids.shape)}"
                )
        elif inputs_embeds is not None:
            if len(inputs_embeds.shape) != 3:
                raise ValueError(
                    "inputs_embeds must be 3-D [batch_size, seq_len, hidden_size], "
                    f"got shape {tuple(inputs_embeds.shape)}"
                )
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        if past_key_values is None:
            past_key_values = tuple([None] * len(self.layers))

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        inputs_embeds = inputs_embeds.to(self.embed_tokens.weight.dtype)

        hidden_states = inputs_embeds

        # Read the MoE switch once so both uses below agree.
        use_moe = getattr(self.config, "use_moe", False)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None
        if use_moe:
            all_router_loss = torch.tensor(0.0, device=inputs_embeds.device)
        else:
            all_router_loss = None
        all_gate_logits = ()

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx]
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask,
                attn_mask_start_row_indices,
                position_ids,
                token_type_ids,
                output_attentions,
                past_key_value,
                use_cache,
            )

            if isinstance(layer_outputs, (tuple, list)):
                hidden_states = layer_outputs[0]
            else:
                hidden_states = layer_outputs

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)
            if use_moe:
                # With MoE enabled the layer appends its gate logits last.
                all_gate_logits = all_gate_logits + (layer_outputs[-1],)

            # When this layer had a cache entry, only the last position is
            # carried forward to the next layer.
            if past_key_value is not None:
                hidden_states = hidden_states[:, -1:, :]

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_self_attns,
                    all_router_loss,
                    all_gate_logits,
                ]
                if v is not None
            )

        # assert all_router_loss is None, f'moe not support `return-dict`'
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=None,
            router_loss=all_router_loss,
            gate_logits=all_gate_logits,
        )


def parallel_matmul(
    x,
    y,
    bias=None,
    transpose_y=False,
):
    """
    Multiply ``x`` by a weight matrix and optionally add a bias.

    Despite its name, this implementation performs a plain ``torch.matmul``;
    no tensor-parallel communication happens here.

    Args:
        x (torch.Tensor): Left operand, e.g. [batch_size, seq_len, hidden_size]
        y (torch.Tensor): Weight matrix (right operand)
        bias (Optional[torch.Tensor]): Bias added in place to the product when given
        transpose_y (bool): Use ``y.T`` instead of ``y`` as the right operand

    Returns:
        torch.Tensor: ``x @ y`` (or ``x @ y.T``), plus ``bias`` when provided
    """
    weight = y.T if transpose_y else y
    product = torch.matmul(x, weight)
    if bias is None:
        return product
    # In-place add on the freshly created product tensor.
    product += bias
    return product


def calc_lm_head_logits(
    config, hidden_states, weight, bias, tensor_parallel_output=None, training=True
):
    """
    Calculate language model head logits.

    Projects ``hidden_states`` with ``weight`` (transposed when
    ``config.tie_word_embeddings`` is set) and adds ``bias`` when given, by
    delegating to ``parallel_matmul``.

    Args:
        config (Ernie4_5_Config): Model configuration; only
            ``tie_word_embeddings`` is read here.
        hidden_states (Tensor): Hidden states from the transformer layers
        weight (Tensor): Weight matrix for the language model head
        bias (Tensor): Bias vector for the language model head (may be None)
        tensor_parallel_output (bool, optional): Accepted for interface
            compatibility; ignored by this implementation.
        training (bool, optional): Accepted for interface compatibility;
            ignored by this implementation.

    Returns:
        Tensor: The computed logits for language modeling.
    """
    # `tensor_parallel_output` and `training` do not influence the result, so
    # no config lookup is performed for them.
    return parallel_matmul(
        hidden_states,
        weight,
        bias=bias,
        transpose_y=config.tie_word_embeddings,
    )


def calc_multimodal_logits(
    last_hidden_state: torch.Tensor,
    lm_head_weight: torch.Tensor,
    lm_head_bias: torch.Tensor,
    mm_head_weight: torch.Tensor,
    mm_head_bias: torch.Tensor,
    token_type_ids_shifted: torch.Tensor,
    config: Ernie4_5_VLMoEConfig,
):
    """
    Compute logits separately for text-label and image-label positions.

    Args:
        last_hidden_state: Hidden states of the last layer; its first two
            dimensions must equal the shape of ``token_type_ids_shifted``.
        lm_head_weight: Weight of the text head.
        lm_head_bias: Bias of the text head (may be None).
        mm_head_weight: Weight of the image head, or None when there is no
            separate image head.
        mm_head_bias: Bias of the image head (may be None).
        token_type_ids_shifted: Token type ids aligned with the label positions.
            The head is selected by the type of the token being predicted, not
            by the type of the input token.
        config: Model configuration; ``use_recompute_loss_fn`` is read when
            ``mm_head_weight`` is None.

    Returns:
        Tuple ``(score_text, score_image, None)``:
            - Without an image head: ``score_text`` is the raw hidden states when
              ``config.use_recompute_loss_fn`` is set, otherwise the text logits
              of every position; ``score_image`` is None.
            - With an image head: logits of only the text / image positions
              respectively, or None when no position of that type exists.
    """
    assert last_hidden_state.shape[:2] == token_type_ids_shifted.shape, (
        last_hidden_state.shape,
        token_type_ids_shifted.shape,
    )

    # No dedicated image head: everything goes through the text head.
    if mm_head_weight is None:
        if config.use_recompute_loss_fn:
            return last_hidden_state, None, None
        all_text_scores = parallel_matmul(
            last_hidden_state, lm_head_weight, lm_head_bias
        )
        return all_text_scores, None, None

    def _project_selected(position_mask, head_weight, head_bias):
        # Project only the masked positions; None when the mask selects nothing.
        if not position_mask.any().item():
            return None
        return parallel_matmul(last_hidden_state[position_mask], head_weight, head_bias)

    text_positions = token_type_ids_shifted == TokenType.text
    image_positions = token_type_ids_shifted == TokenType.image

    score_text = _project_selected(text_positions, lm_head_weight, lm_head_bias)
    score_image = _project_selected(image_positions, mm_head_weight, mm_head_bias)

    return score_text, score_image, None


class Ernie4_5_MoeLMHead(nn.Module):
    """Output projection (LM head) mapping hidden states to vocabulary logits."""

    def __init__(self, config):
        """Create the head's weight and optional bias.

        Args:
            config (Ernie4_5_Config): Model configuration. Fields read here:
                - vocab_size: Size of vocabulary
                - hidden_size: Dimension of hidden states
                - tensor_parallel_degree: When > 1 the local vocabulary size is
                  ``vocab_size // tensor_parallel_degree``
                - tie_word_embeddings: Selects the weight layout
                  ([vocab, hidden] when tied, [hidden, vocab] otherwise)
                - weight_share_add_bias / use_bias: A bias is created only when
                  both are truthy
                - use_recompute_loss_fn: Only triggers an informational log here
        """
        super().__init__()
        self.config = config

        tp_degree = config.tensor_parallel_degree
        vocab_size = (
            config.vocab_size // tp_degree if tp_degree > 1 else config.vocab_size
        )

        # Tied weights use the embedding layout; untied weights are stored
        # ready for a direct `hidden @ weight`.
        weight_shape = (
            (vocab_size, config.hidden_size)
            if config.tie_word_embeddings
            else (config.hidden_size, vocab_size)
        )
        self.weight = nn.Parameter(
            torch.empty(*weight_shape, dtype=torch.get_default_dtype())
        )
        nn.init.xavier_uniform_(self.weight)

        logger.info(
            f"output-weight:{self.weight.shape} tie_word_embeddings:{config.tie_word_embeddings}"
        )

        has_bias = config.weight_share_add_bias and config.use_bias
        if has_bias:
            self.bias = nn.Parameter(
                torch.zeros(vocab_size, dtype=torch.get_default_dtype())
            )
        else:
            self.bias = None

        # Must set distributed attr for Tensor Parallel !
        is_sharded = vocab_size != config.vocab_size
        self.weight.is_distributed = is_sharded
        if has_bias:
            self.bias.is_distributed = is_sharded

        if is_sharded:
            self.weight.split_axis = 1
            if has_bias:
                self.bias.split_axis = 0

        if self.config.use_recompute_loss_fn:
            logger.info(
                "Using recompute_loss_fn, the calculation of logits will be moved into "
                "loss_fn for memory optimization"
            )

    def forward(self, hidden_states, tensor_parallel_output=None):
        """Project hidden states to vocabulary logits.

        Args:
            hidden_states (torch.Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
            tensor_parallel_output (Optional[bool]): Forwarded to
                ``calc_lm_head_logits``. Defaults to None.

        Returns:
            The value returned by ``calc_lm_head_logits`` for this head's
            weight and bias.
        """
        logits = calc_lm_head_logits(
            self.config,
            hidden_states,
            self.weight,
            self.bias,
            tensor_parallel_output,
            training=self.training,
        )
        return logits


class Ernie4_5_MoeForCausalLM(Ernie4_5_PretrainedModel, GenerationMixin):
    """ERNIE Mixture of Experts (MoE) model for causal language modeling."""

    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]

    def __init__(self, config):
        """
        Initializes the ERNIE MoE model for causal language modeling.

        Note that ``config.initializer_range`` is overwritten in place with
        ``sqrt(0.3333 / hidden_size)`` before the sub-modules are built.

        Args:
            config: Model configuration.
        """
        super().__init__(config)

        # initialize-trick for big model,
        # see https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/README.md#std-init
        new_initializer_range = math.sqrt(0.3333 / config.hidden_size)
        logger.info(
            f"change initializer-range from {config.initializer_range} to {new_initializer_range}"
        )
        config.initializer_range = new_initializer_range
        self.config = config
        self.model = Ernie4_5_Model(config)
        self.lm_head = Ernie4_5_MoeLMHead(config)

        self.tie_weights()  # maybe weight share

    def get_input_embeddings(self):
        """Returns the input embeddings layer."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Sets the input embeddings layer."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Returns the output embeddings (LM head)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Sets the output embeddings layer."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Sets the ERNIE decoder model."""
        self.model = decoder

    def get_decoder(self):
        """Get the transformer decoder.

        Returns:
            nn.Module: The decoder module
        """
        return self.model

    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder=False):
        """
        Updates model kwargs after one generation step.

        Each per-token kwarg that is present is extended by one position so it
        stays aligned with the sequence after the new token is appended.

        Args:
            outputs (Any): Model outputs of the step just executed.
            model_kwargs (dict): Current model kwargs; modified in place.
            is_encoder_decoder (bool): Whether using encoder-decoder architecture.

        Returns:
            dict: The same ``model_kwargs`` object, updated.
        """
        # update cache
        if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], torch.Tensor):
            model_kwargs["past_key_values"] = outputs[1]

        if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs:
            model_kwargs["past_key_values"] = outputs.past_key_values

        # update token_type_ids with last value
        token_type_ids = model_kwargs.get("token_type_ids", None)
        if token_type_ids is not None:
            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1:]], dim=-1)

        if not is_encoder_decoder and model_kwargs.get("attention_mask", None) is not None:
            # update attention mask: the new position is always attended to
            attention_mask = model_kwargs["attention_mask"]
            model_kwargs["attention_mask"] = torch.cat(
                [
                    attention_mask,
                    torch.ones((attention_mask.shape[0], 1), dtype=torch.int64, device=attention_mask.device),
                ],
                dim=-1,
            )

        # update role_ids with last value
        role_ids = model_kwargs.get("role_ids", None)
        if role_ids is not None:
            model_kwargs["role_ids"] = torch.cat([role_ids, role_ids[:, -1:]], dim=-1)

        if self.config.get('rope_3d', False):
            assert "position_ids" in model_kwargs, "position_ids must be provided if rope_3d is on"
            position_ids = model_kwargs["position_ids"]

            # Max over dim 1, kept as a length-1 axis so it can be concatenated
            # back along that axis.
            # assumes position_ids is [batch_size, seq_len, 3] - TODO confirm
            max_position = position_ids.max(dim=1, keepdim=True)[0]
            new_positions = max_position + 1

            model_kwargs["position_ids"] = torch.cat(
                [position_ids, new_positions],
                dim=1
            )

        return model_kwargs


class VisionMlp(nn.Module):
    """Two-layer feed-forward network: linear -> activation -> linear."""

    def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
        """
        Args:
            dim (int): feature size of both the input and the output.
            hidden_dim (int): feature size between the two linear layers.
            hidden_act (str): key looked up in ``ACT2FN`` to pick the activation.
        """
        super().__init__()
        self.fc1 = nn.Linear(dim, hidden_dim)
        self.act = ACT2FN[hidden_act]
        self.fc2 = nn.Linear(hidden_dim, dim)

    def forward(self, x) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): input features; the last dimension must be ``dim``.

        Returns:
            torch.Tensor: projected features with the same last dimension as ``x``.
        """
        expanded = self.fc1(x)
        activated = self.act(expanded)
        return self.fc2(activated)


class PatchEmbed(nn.Module):
    """Bias-free linear projection of already-flattened patches to ``embed_dim``."""

    def __init__(
        self,
        patch_size: int = 14,
        in_channels: int = 3,
        embed_dim: int = 1152,
    ) -> None:
        """
        Args:
            patch_size (int, optional): side length of a square patch. Defaults to 14.
            in_channels (int, optional): number of channels per pixel. Defaults to 3.
            embed_dim (int, optional): output embedding size. Defaults to 1152.
        """
        super().__init__()
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim
        # One input feature per (channel, row, column) entry of a patch.
        flattened_patch_dim = in_channels * patch_size * patch_size
        self.proj = nn.Linear(flattened_patch_dim, embed_dim, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states (torch.Tensor): flattened patches whose last dimension
                is ``in_channels * patch_size * patch_size``.

        Returns:
            torch.Tensor: patch embeddings with last dimension ``embed_dim``.
        """
        # Cast to the projection's dtype so a differently-typed input is accepted.
        weight_dtype = self.proj.weight.dtype
        return self.proj(hidden_states.to(weight_dtype))


class VisionRotaryEmbedding(nn.Module):
    """Table of rotary angles ``position * inverse_frequency`` for the vision tower."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        """
        Args:
            dim (int): rotary dimension; ``ceil(dim / 2)`` frequencies are created.
            theta (float, optional): base of the geometric frequency progression.
                Defaults to 10000.0.
        """
        super().__init__()
        exponents = torch.arange(start=0, end=dim, step=2, dtype=torch.float32) / dim
        # Stored as a plain attribute (not a parameter or registered buffer).
        self.inv_freq = 1.0 / theta**exponents

    def forward(self, seqlen: int) -> torch.Tensor:
        """
        Args:
            seqlen (int): number of positions ``0 .. seqlen - 1`` to tabulate.

        Returns:
            torch.Tensor: angles of shape ``[seqlen, len(inv_freq)]``.
        """
        positions = torch.arange(seqlen).to(self.inv_freq.dtype)
        return torch.outer(positions, self.inv_freq)


def rotate_half(x):
    """Swap the two halves of the last dimension and negate the half moved to the front."""
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    # Output has the same shape as the input.
    return torch.cat((-back, front), dim=-1)


def apply_rotary_pos_emb_vision(
    tensor: torch.Tensor, freqs: torch.Tensor
) -> torch.Tensor:
    """Rotate ``tensor`` by the rotary angles in ``freqs``.

    The arithmetic is done in float32 and the result is cast back to the
    dtype ``tensor`` had on entry.

    Args:
        tensor (torch.Tensor): values to rotate.
        freqs (torch.Tensor): rotary angles; their cosine and sine are repeated
            twice along the last dimension before being multiplied in.
            NOTE(review): presumably ``[seq, head_dim // 2]`` against a
            ``[1, seq, heads, head_dim]`` tensor - confirm with the caller.
    Returns:
        output (torch.Tensor): the rotated tensor, in the original dtype.
    """
    input_dtype = tensor.dtype
    tensor_fp32 = tensor.type(dtype=torch.float32)

    def broadcastable(table):
        # Add a singleton axis after the first one, duplicate the last axis,
        # then add a leading singleton axis.
        return table.unsqueeze(1).tile(1, 1, 2).unsqueeze(0).type(dtype=torch.float32)

    cos = broadcastable(freqs.cos())
    sin = broadcastable(freqs.sin())
    rotated = tensor_fp32 * cos + rotate_half(tensor_fp32) * sin
    return rotated.to(input_dtype)


class VisionAttention(nn.Module):
    """Multi-head self-attention over a packed sequence of vision tokens.

    Several images/frames are concatenated along the token axis; attention is
    computed independently inside each segment delimited by ``cu_seqlens``.
    """

    def __init__(self, dim: int, num_heads: int = 16) -> None:
        """
        Args:
            dim (int): token feature size.
            num_heads (int, optional): number of attention heads. Defaults to 16.
        """
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.proj = nn.Linear(dim, dim)
        # Used to scale the attention logits by 1 / sqrt(head_dim).
        self.head_dim = dim // num_heads

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run segment-wise self-attention.

        Args:
            hidden_states (torch.Tensor): packed tokens of shape ``[seq, dim]``.
            cu_seqlens (torch.Tensor): 1-D cumulative segment boundaries starting
                at 0; consecutive differences are the segment lengths and must
                sum to ``seq``.
            rotary_pos_emb (Optional[torch.Tensor]): rotary angles applied to
                queries and keys. When ``None`` the rotation is skipped (the
                previous implementation crashed on the declared default).

        Returns:
            torch.Tensor: attention output of shape ``[seq, dim]``.
        """
        seq_length = hidden_states.shape[0]
        qkv = (
            self.qkv(hidden_states)
            .reshape([seq_length, 3, self.num_heads, -1])
            .permute(1, 0, 2, 3)
        )
        # Each of q, k, v: [seq, heads, head_dim].
        q, k, v = qkv.unbind(0)

        if rotary_pos_emb is not None:
            q = apply_rotary_pos_emb_vision(q.unsqueeze(dim=0), rotary_pos_emb).squeeze(
                dim=0
            )
            k = apply_rotary_pos_emb_vision(k.unsqueeze(dim=0), rotary_pos_emb).squeeze(
                dim=0
            )

        # [heads, seq, head_dim] so the token axis can be split into segments.
        q = q.transpose(0, 1)
        k = k.transpose(0, 1)
        v = v.transpose(0, 1)

        segment_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
        q_segments, k_segments, v_segments = (
            torch.split(packed, segment_lengths, dim=1) for packed in (q, k, v)
        )

        scale = math.sqrt(self.head_dim)
        segment_outputs = []
        for q_seg, k_seg, v_seg in zip(q_segments, k_segments, v_segments):
            attn_weights = torch.matmul(q_seg, k_seg.transpose(1, 2)) / scale
            # Softmax in float32 for numerical stability, then back to the input dtype.
            attn_weights = nn.functional.softmax(
                attn_weights, dim=-1, dtype=torch.float32
            ).to(q_seg.dtype)
            # [heads, seg, head_dim] -> [seg, heads, head_dim]
            segment_outputs.append(torch.matmul(attn_weights, v_seg).transpose(0, 1))

        attn_output = torch.cat(segment_outputs, dim=0)
        attn_output = attn_output.reshape(seq_length, -1).contiguous()
        return self.proj(attn_output)


class DFNRopeVisionBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual connection."""

    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
        """
        Args:
            config: vision configuration providing ``embed_dim``, ``mlp_ratio``,
                ``num_heads`` and ``hidden_act``.
            attn_implementation (str, optional): accepted but not read by this
                block. Defaults to "sdpa".
        """
        super().__init__()
        width = config.embed_dim
        self.norm1 = nn.LayerNorm(width, eps=1e-6)
        self.norm2 = nn.LayerNorm(width, eps=1e-6)
        mlp_width = int(width * config.mlp_ratio)

        self.attn = VisionAttention(width, num_heads=config.num_heads)
        self.mlp = VisionMlp(
            dim=width,
            hidden_dim=mlp_width,
            hidden_act=config.hidden_act,
        )
        self.config = config

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
        """
        Args:
            hidden_states (torch.Tensor): input tokens.
            cu_seqlens (torch.Tensor): cumulative sequence lengths, forwarded to
                the attention layer.
            rotary_pos_emb: rotary position embedding, forwarded to the
                attention layer.

        Returns:
            torch.Tensor: tokens after the attention and MLP residual updates.
        """
        attn_update = self.attn(
            self.norm1(hidden_states),
            cu_seqlens=cu_seqlens,
            rotary_pos_emb=rotary_pos_emb,
        )
        hidden_states = hidden_states + attn_update
        mlp_update = self.mlp(self.norm2(hidden_states))
        return hidden_states + mlp_update


class DFNRopeVisionTransformerPreTrainedModel(PreTrainedModel):
    """Vision transformer with 2-D rotary positions over packed patch sequences."""

    config_class = DFNRopeVisionTransformerConfig
    _tp_plan = {}

    def __init__(self, config) -> None:
        """
        Args:
            config: vision configuration; reads ``spatial_merge_size``,
                ``patch_size``, ``in_channels``, ``embed_dim``, ``num_heads``,
                ``depth`` and ``hidden_size``.
        """
        super().__init__(config)
        self.spatial_merge_size = config.spatial_merge_size

        self.patch_embed = PatchEmbed(
            patch_size=config.patch_size,
            in_channels=config.in_channels,
            embed_dim=config.embed_dim,
        )

        # Half of each head's channels carry the row angle, the other half the
        # column angle, so the frequency table is built for head_dim // 2.
        per_head_dim = config.embed_dim // config.num_heads
        self.rotary_pos_emb = VisionRotaryEmbedding(per_head_dim // 2)

        layers = [DFNRopeVisionBlock(config) for _ in range(config.depth)]
        self.blocks = nn.ModuleList(layers)

        assert (
            config.hidden_size == config.embed_dim
        ), "in DFNRope, vit's config.hidden must be equal to config.embed_dim"
        self.ln = nn.LayerNorm(config.hidden_size, eps=1e-6)

    def rot_pos_emb(self, grid_thw, num_pad=0):
        """Build the rotary angles for every patch described by ``grid_thw``.

        Args:
            grid_thw (torch.Tensor): rows of ``(t, h, w)`` patch-grid sizes.
            num_pad (int): number of extra positions appended with
                coordinates ``(0, 0)``.

        Returns:
            torch.Tensor: one row of concatenated (row, column) angles per patch.
        """
        merge = self.spatial_merge_size
        grids = np.array(grid_thw.cpu(), dtype=np.int64)

        coords_per_item = []
        for t, h, w in grids:
            rows, cols = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
            coords = np.stack([rows, cols], axis=-1)  # [h, w, 2]
            # Reorder so the patches of each merge x merge window are
            # consecutive: (window_row, window_col, row_in_window, col_in_window).
            coords = coords.reshape(h // merge, merge, w // merge, merge, 2)
            coords = coords.transpose(0, 2, 1, 3, 4).reshape(-1, 2)
            # Every frame of the item reuses the same spatial coordinates.
            coords_per_item.append(np.tile(coords, (t, 1)))

        pos_ids = np.concatenate(coords_per_item, axis=0)
        if num_pad > 0:
            padding = np.zeros((num_pad, 2), dtype=pos_ids.dtype)
            pos_ids = np.concatenate([pos_ids, padding])

        longest_side = np.amax(grids[:, 1:])
        angle_table = self.rotary_pos_emb(longest_side)
        # Look up row and column angles, then join them on the last axis.
        return angle_table[pos_ids].flatten(start_dim=1)

    def forward(
        self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, num_pad=0
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (torch.Tensor): flattened patches fed to ``patch_embed``.
            grid_thw (torch.Tensor): rows of ``(t, h, w)`` patch-grid sizes.
            num_pad (int): number of padding tokens; they form one extra
                attention segment at the end.

        Returns:
            torch.Tensor: layer-normalised patch features.
        """
        hidden_states = self.patch_embed(hidden_states)

        rotary_pos_emb = self.rot_pos_emb(grid_thw, num_pad=num_pad)
        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device)

        # One attention segment of h * w tokens per frame.
        tokens_per_frame = grid_thw[:, 1] * grid_thw[:, 2]
        cu_seqlens = torch.repeat_interleave(tokens_per_frame, grid_thw[:, 0]).cumsum(
            dim=0, dtype=torch.int32
        )

        has_padding = num_pad > 0
        # Leading 0 always; a trailing slot only when a padding segment exists.
        cu_seqlens = F.pad(cu_seqlens, (1, 1 if has_padding else 0), value=0)
        if has_padding:
            cu_seqlens[-1] = cu_seqlens[-2] + num_pad

        for block in self.blocks:
            hidden_states = block(
                hidden_states,
                cu_seqlens=cu_seqlens,
                rotary_pos_emb=rotary_pos_emb,
            )

        # Final layer norm over the block output.
        return self.ln(hidden_states)


class VariableResolutionResamplerModel(nn.Module):
    """
    Resampler that merges neighbouring vision tokens (spatially, and optionally
    across pairs of frames) and projects them to ``out_dim``; supports variable
    resolution.
    """

    def __init__(self, in_dim, out_dim, spatial_conv_size, temporal_conv_size, config):
        """
        Args:
            in_dim (int): feature size of each incoming vision token.
            out_dim (int): feature size of each produced token.
            spatial_conv_size (int): side of the square token window merged spatially.
            temporal_conv_size (int): number of frames merged temporally.
            config: model configuration; ``use_temporal_conv`` is read here and a
                deep copy with ``hidden_size = out_dim`` is passed to ``RMSNorm``.
        """
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.config = config
        self.spatial_conv_size = spatial_conv_size
        self.temporal_conv_size = temporal_conv_size
        self.use_temporal_conv = config.use_temporal_conv

        window_area = self.spatial_conv_size * self.spatial_conv_size
        # Width of one spatially merged token (2-D window flattened to 1-D).
        self.spatial_dim = self.in_dim * window_area
        # Width of one spatio-temporally merged token (3-D window flattened to 1-D).
        self.temporal_dim = self.in_dim * window_area * self.temporal_conv_size

        def projection_stack(input_dim):
            """Linear -> GELU -> Linear -> LayerNorm, ending at ``spatial_dim``."""
            return nn.Sequential(
                nn.Linear(input_dim, self.spatial_dim),
                nn.GELU(),
                nn.Linear(self.spatial_dim, self.spatial_dim),
                nn.LayerNorm(self.spatial_dim, eps=1e-6),
            )

        # Sub-modules are created inside the "mm_resampler_" name guard.
        with UniqueNameGuard("mm_resampler_"):
            self.spatial_linear = projection_stack(self.spatial_dim)

            if self.use_temporal_conv:
                self.temporal_linear = projection_stack(self.temporal_dim)

            self.mlp = nn.Linear(self.spatial_dim, self.out_dim)

            norm_config = deepcopy(config)
            norm_config.hidden_size = out_dim
            self.after_norm = RMSNorm(norm_config)

    def spatial_conv_reshape(self, x, spatial_conv_size):
        """
        Fold every ``spatial_conv_size**2`` consecutive rows of the 2-D input
        into one row, imitating a strided convolution with a linear layer.
        """
        _, channels = x.shape
        return x.reshape([-1, channels * (spatial_conv_size**2)])

    def forward(self, x, image_mask, token_type_ids, image_type_ids, grid_thw):
        """
        x: image_features
        image_mask: [B] (not read here)
        token_types_ids: [B] (not read here)
        image_type_ids:  [B_image] (only checked for not being None)
        grid_thw: [B_image, 3]
        """
        assert image_type_ids is not None

        def pair_adjacent_frames(features):
            """
            features: [S, H], ordered item by item and, within an item, frame
            by frame. Returns rows made of (even frame token, odd frame token);
            an item with a single frame is paired with itself.
            """
            grid = grid_thw.cpu().numpy()
            frame_counts = grid[:, 0]
            window = self.spatial_conv_size**2
            tokens_per_frame = grid[:, 1:].prod(-1) // window
            tokens_per_item = grid.prod(-1) // window

            # Index of the first merged token of every image/video.
            item_start = np.empty(tokens_per_item.size, dtype=tokens_per_item.dtype)
            item_start[0] = 0
            item_start[1:] = tokens_per_item.cumsum()[:-1]

            assert (
                self.temporal_conv_size == 2
            ), f"Hard Code: temporal_conv_size==2, got:{self.temporal_conv_size}"

            # TODO: support any temporal conv size
            def token_index(first_frame_of):
                """Token indices of every second frame, starting per item at ``first_frame_of``."""
                pieces = [
                    np.arange(
                        start + frame * per_frame,
                        start + (frame + 1) * per_frame,
                    )
                    for n_frames, per_frame, start in zip(
                        frame_counts, tokens_per_frame, item_start
                    )
                    for frame in range(first_frame_of(n_frames), n_frames, 2)
                ]
                return torch.tensor(np.concatenate(pieces, axis=-1)).to(
                    features.device
                )

            even_index = token_index(lambda n_frames: 0)
            odd_index = token_index(lambda n_frames: 1 if n_frames > 1 else 0)

            even_tokens = torch.index_select(features, dim=0, index=even_index)
            odd_tokens = torch.index_select(features, dim=0, index=odd_index)
            return torch.concat([even_tokens, odd_tokens], dim=-1)

        # Spatial merge: x is [S, H] with S ordered as
        # [patch_h * patch_w (row-major traversal)] * patch_time.
        x = self.spatial_conv_reshape(x, self.spatial_conv_size)
        x = self.spatial_linear(x)

        if self.use_temporal_conv:
            x = self.temporal_linear(pair_adjacent_frames(x))

        # Final projection to out_dim followed by normalisation.
        return self.after_norm(self.mlp(x))


class Ernie4_5_MoeVLHead(Ernie4_5_MoeLMHead):
    """Language-model head with an optional second head for the multimodal vocabulary."""

    def __init__(self, config):
        """
        Args:
            config: model configuration. When ``config.mm_vocab_size > 0`` a
                second ``Ernie4_5_MoeLMHead`` is built from a deep copy of the
                config whose ``vocab_size`` is ``mm_vocab_size``; otherwise
                ``mm_head`` is ``None``.
        """
        super().__init__(config)
        self.config = config

        has_mm_vocab = config.mm_vocab_size > 0
        if not has_mm_vocab:
            self.mm_head = None
            return

        mm_vocab_config = deepcopy(config)
        mm_vocab_config.vocab_size = config.mm_vocab_size
        assert mm_vocab_config.vocab_size > 0, mm_vocab_config
        assert (
            mm_vocab_config.im_patch_id >= mm_vocab_config.max_text_id
        ), mm_vocab_config
        self.mm_head = Ernie4_5_MoeLMHead(mm_vocab_config)

    def forward(self, hidden_state, token_type_ids_labels, use_cache=False):
        """
        Args:
            hidden_state(torch.Tensor): hidden state
            token_type_ids_labels(torch.Tensor): token type ids, forwarded to
                ``calc_multimodal_logits`` when ``use_cache`` is falsy
            use_cache(bool): whether to use cache, default is False

        Returns:
            tuple: ``(logits_text, logits_image, None)``. With ``use_cache`` set,
            only the last position is projected by the text head and
            ``logits_image`` is ``None``.
        """
        if use_cache:
            # TODO: support lm_head decode only
            last_position = hidden_state[:, -1:, :]
            text_logits = parallel_matmul(
                last_position,
                self.weight,
                self.bias,
                transpose_y=self.config.tie_word_embeddings,
            )
            return text_logits, None, None

        mm_head = self.mm_head
        mm_head_weight = mm_head.weight if mm_head is not None else None
        mm_head_bias = mm_head.bias if mm_head is not None else None
        logits_text, logits_image, _ = calc_multimodal_logits(
            hidden_state,
            self.weight,
            self.bias,
            mm_head_weight,
            mm_head_bias,
            token_type_ids_labels,
            self.config,
        )
        return logits_text, logits_image, None


class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
    """Vision-language generation model.

    Combines a vision transformer (``vision_model``), a token resampler
    (``model.resampler_model``) and the language model inherited from
    ``Ernie4_5_MoeForCausalLM``. Image features replace the embeddings of the
    ``config.im_patch_id`` placeholder tokens before the language model runs.
    """

    config_class = Ernie4_5_VLMoEConfig
    main_input_name = "pixel_values"
    _keep_in_fp16_modules = ["vision_model"]
    _tp_plan = {}

    def __init__(
        self, config: Ernie4_5_VLMoEConfig, vision_model=None, resampler_model=None
    ):
        """
        initialize Ernie4_5_VLMoeForConditionalGeneration

        Args:
            config(Ernie4_5_VLMoEConfig): Model configuration.
            vision_model(nn.Module): accepted for compatibility; not read, the
                vision model is always built from ``config.vision_config``.
            resampler_model(nn.Module): accepted for compatibility; not read,
                the resampler is always built from ``config``.
        """
        super().__init__(config)

        self.vision_model = DFNRopeVisionTransformerPreTrainedModel(
            config.vision_config
        )

        self.model.resampler_model = VariableResolutionResamplerModel(
            config.pixel_hidden_size,
            config.hidden_size,
            config.spatial_conv_size,
            config.temporal_conv_size,
            config=config,
        )

        # Set later through ``add_image_preprocess``; None means callers must
        # pass already-normalised bfloat16 images.
        self.image_preprocess = None
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    def add_image_preprocess(self, processor):
        """Attach ``processor.image_processor`` so raw uint8 patches can be normalised on device.

        The image processor object is modified in place: ``image_mean_tensor``
        and ``image_std_tensor`` of shape ``[1, 3 * patch_size**2]`` are added,
        and ``rescale_factor`` is replaced by a float32 tensor.
        """
        logger.info("image preprocess is set")

        image_preprocess = processor.image_processor
        values_per_channel = self.config.vision_config.patch_size**2 * 1

        def per_patch_stat(channel_values):
            # [1, 3, 1, 1] -> [1, 3] -> every channel value repeated once per
            # pixel of a patch.
            stat = torch.tensor(channel_values, dtype=torch.float32).reshape(
                [1, 3, 1, 1]
            )
            return stat.squeeze([-2, -1]).repeat_interleave(values_per_channel, -1)

        image_preprocess.image_mean_tensor = per_patch_stat(image_preprocess.image_mean)
        image_preprocess.image_std_tensor = per_patch_stat(image_preprocess.image_std)
        image_preprocess.rescale_factor = torch.tensor(
            image_preprocess.rescale_factor, dtype=torch.float32
        )

        self.image_preprocess = image_preprocess

    def vision_forward(
        self,
        images,
        image_position_ids,
        image_attention_mask,
        grid_thw,
    ):
        """Normalise the patches (if a preprocessor is attached) and run the vision model.

        Args:
            images (torch.Tensor): uint8 patches when ``image_preprocess`` is
                set, otherwise bfloat16 patches.
            image_position_ids: not read.
            image_attention_mask: not read.
            grid_thw (torch.Tensor): ``(t, h, w)`` rows; non-positive entries are
                dropped and every frame becomes its own ``(1, h, w)`` row before
                the vision model is called.

        Returns:
            torch.Tensor: output of ``self.vision_model``.
        """
        preprocess = self.image_preprocess
        if preprocess is not None:
            assert images.dtype == torch.uint8, images.dtype
            current_device = images.device
            preprocess.image_mean_tensor = preprocess.image_mean_tensor.to(
                current_device
            )
            preprocess.image_std_tensor = preprocess.image_std_tensor.to(current_device)
            images = preprocess.rescale_factor * images.to(torch.float32)
            images = (images - preprocess.image_mean_tensor) / preprocess.image_std_tensor
            images = images.to(torch.bfloat16)
        else:
            assert images.dtype == torch.bfloat16, images.dtype

        if grid_thw is not None:
            grid_thw = grid_thw[grid_thw > 0].reshape([-1, 3])
            # Repeat (h, w) once per frame, then prepend t = 1 to every row.
            per_frame_hw = torch.repeat_interleave(grid_thw[:, 1:], grid_thw[:, 0], 0)
            grid_thw = F.pad(per_frame_hw, [1, 0, 0, 0], value=1)
        return self.vision_model(images, grid_thw)

    def vision_mapping_forward(
        self,
        token_type_ids,
        token_type_ids_w_video,
        input_ids,
        mm_input_ids,
        image_features,
        inputs_embeds,
        image_type_ids,
        grid_thw,
    ):
        """Resample the image features and write them over the placeholder embeddings.

        Args:
            token_type_ids: not read.
            token_type_ids_w_video: forwarded to the resampler.
            input_ids (torch.Tensor): token ids; positions equal to
                ``config.im_patch_id`` receive image features.
            mm_input_ids: not read.
            image_features (torch.Tensor): output of ``vision_forward``.
            inputs_embeds (torch.Tensor): text embeddings, modified in place.
            image_type_ids: forwarded to the resampler.
            grid_thw: forwarded to the resampler.

        Returns:
            torch.Tensor: ``inputs_embeds`` with image features scattered in.
        """
        image_mask = input_ids == self.config.im_patch_id
        image_features = self.model.resampler_model(
            image_features,
            image_mask,
            token_type_ids_w_video,
            image_type_ids,
            grid_thw,
        )

        # ``Tensor.dim`` is a method; the former ``image_features.dim == 2``
        # compared the bound method to an int and was always False. The branch
        # unpacks three sizes, so it is meant for batched [B, N, C] features.
        if image_features.dim() == 3:
            B, N, C = image_features.shape
            image_features = image_features.reshape([B * N, C])

        # Overwrite the embeddings at `ids == im_patch_id`; match device and dtype
        # of the destination so the masked assignment is well defined.
        inputs_embeds[image_mask.to(inputs_embeds.device)] = image_features.to(
            device=inputs_embeds.device, dtype=inputs_embeds.dtype
        )
        return inputs_embeds

    def prepare_inputs_for_generation(
        self,
        input_ids,
        images=None,
        use_cache=False,
        past_key_values=None,
        inputs_embeds=None,
        image_position_ids=None,
        image_attention_mask=None,
        token_type_ids=None,
        image_type_ids=None,
        grid_thw=None,
        **kwargs,
    ):
        """
        Prepare inputs for the decoder that can be used for generation.

        Args:
            input_ids (torch.Tensor): Input ids.
            images (torch.Tensor): Images. Default to None.
            use_cache (bool): Accepted for compatibility; the returned dict always
                sets ``use_cache`` to True.
            past_key_values (list): Past key values. Default to None.
            inputs_embeds (torch.Tensor): Input embeddings. Default to None.
            image_position_ids (torch.Tensor): Image position ids. Default to None.
            image_attention_mask (torch.Tensor): Image attention mask. Default to None.
            token_type_ids (torch.Tensor): Token type ids. Default to None, in
                which case ``forward`` derives them from the placeholder mask.
            image_type_ids (torch.Tensor): Image type ids. Default to None.
            grid_thw (torch.Tensor): Grid thw. Default to None.

        Returns:
            dict: keyword arguments for ``forward``.
        """
        if past_key_values:
            # With a cache only the newest position has to be processed.
            input_ids = input_ids[:, -1:]
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1:]
            if image_type_ids is not None:
                image_type_ids = image_type_ids[:, -1:]

        if self.config.use_flash_attention:
            attention_mask = None
        else:
            attention_mask = kwargs.get("attention_mask", None)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        if token_type_ids is not None:
            # ``forward`` expects one more type id than input ids: append a
            # text (0) entry for the token that is about to be generated.
            next_token_type = torch.zeros(
                [len(token_type_ids), 1],
                dtype=token_type_ids.dtype,
                device=token_type_ids.device,
            )
            token_type_ids = torch.cat([token_type_ids, next_token_type], dim=-1)

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": True,
                "attention_mask": attention_mask,
                "images": images,
                "image_position_ids": image_position_ids,
                "image_attention_mask": image_attention_mask,
                "image_type_ids": image_type_ids,
                "token_type_ids": token_type_ids,
                "grid_thw": grid_thw,
            }
        )
        if self.config.rope_3d:
            model_inputs.update({"position_ids": kwargs["position_ids"]})

        return model_inputs

    def _post_init(self, original_init, *args, **kwargs):
        """
        Label all multimodal parameters in the model, only head and Embedding
        Experts parameters are already labeled
        """
        # ``super()`` already binds ``self``; passing it again shifted every
        # argument by one position.
        super()._post_init(original_init, *args, **kwargs)

        # ``lm_head`` is a plain ``nn.Linear`` in this class, so a multimodal
        # sub-head may be absent altogether.
        mm_head = getattr(self.lm_head, "mm_head", None)
        if mm_head is None:
            return
        mm_head.weight.expert_type = "expert_type_1"
        if getattr(mm_head, "bias", None) is not None:
            mm_head.bias.expert_type = "expert_type_1"

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        ignored_index: Optional[int] = 0,
        return_dict: Optional[bool] = None,
        image_position_ids: Optional[torch.Tensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        image_type_ids: Optional[torch.Tensor] = None,
        grid_thw: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        """
        Forward for Ernie4_5_VLMoeForConditionalGeneration

        Args:
            input_ids (torch.Tensor): Input ids.
            position_ids (Optional[torch.Tensor], optional): Position ids. Defaults to None.
            attention_mask (Optional[torch.Tensor], optional): Attention mask. Defaults to None.
            past_key_values (Optional[List[torch.Tensor]], optional): Past key values. When given,
                the vision model is not run. Defaults to None.
            use_cache (Optional[bool], optional): Use cache. When truthy only the last position
                is projected to logits. Defaults to None.
            output_attentions (Optional[bool], optional): Output attentions. Defaults to None.
            output_hidden_states (Optional[bool], optional): Output hidden states. Defaults to None.
            labels (Optional[torch.Tensor], optional): Accepted but not used; no loss is computed.
            images (Optional[torch.Tensor]): Images. Defaults to None.
            ignored_index (Optional[int], optional): Accepted but not used. Defaults to 0.
            return_dict (Optional[bool], optional): Accepted but not used; an output object is
                always returned. Defaults to None.
            image_position_ids (Optional[torch.Tensor], optional): Image position ids. Defaults to None.
            image_attention_mask (Optional[torch.Tensor], optional): Image attention mask. Defaults to None.
            token_type_ids (Optional[torch.Tensor], optional): Token type ids with one more column
                than ``input_ids``. The caller's tensor is not modified. Defaults to None.
            image_type_ids (Optional[torch.Tensor], optional): Image type ids. Defaults to None.
            grid_thw (Optional[torch.Tensor], optional): Grid thw. Defaults to None.

        Returns:
            CausalLMOutputWithCrossAttentions: ``loss`` is always None.
        """
        if grid_thw is not None:
            # Drop non-positive (padding) entries and restore the [N, 3] layout.
            grid_thw = grid_thw[grid_thw > 0].reshape([-1, 3])

        image_mask = input_ids == self.config.im_patch_id

        # The vision model only runs on the first step (no cache yet).
        image_features = None
        if past_key_values is None and images is not None:
            assert (image_mask).any().item(), (
                image_mask.detach().cpu().numpy().tolist(),
                input_ids.detach().cpu().numpy().tolist(),
                self.config.im_patch_id,
                images.shape,
            )
            image_features = self.vision_forward(
                images,
                image_position_ids,
                image_attention_mask,
                grid_thw,
            )

        if token_type_ids is None:
            token_type_ids = image_mask.to(torch.int64)
            token_type_ids_labels = torch.cat(
                [token_type_ids[:, 1:], token_type_ids[:, -1:]], 1
            )
        else:
            assert (
                token_type_ids.shape[1] == input_ids.shape[1] + 1
            ), f"token_type:{token_type_ids.shape}, ids:{input_ids.shape}"
            token_type_ids_labels = token_type_ids[..., 1:]

        inputs_embeds = self.model.embed_tokens(input_ids)

        # Keep the video/image distinction for the resampler, then treat video
        # tokens as image tokens for the language model. ``masked_fill`` is
        # out-of-place so the caller's tensor is left untouched.
        token_type_ids_w_video = token_type_ids[..., :-1]
        token_type_ids = token_type_ids.masked_fill(
            token_type_ids == TokenType.video, TokenType.image
        )

        if image_features is not None:
            inputs_embeds = self.vision_mapping_forward(
                token_type_ids[..., :-1],
                token_type_ids_w_video,
                input_ids,
                input_ids,
                image_features,
                inputs_embeds,
                image_type_ids,
                grid_thw,
            )

        outputs = self.model(
            position_ids=position_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        if not use_cache:
            assert outputs.last_hidden_state.shape[:2] == token_type_ids_labels.shape, (
                outputs.last_hidden_state.shape,
                token_type_ids_labels.shape,
            )
            if self.config.use_recompute_loss_fn:
                # The hidden states are returned in place of logits in this mode.
                logits = outputs.last_hidden_state
            else:
                logits = self.lm_head(outputs.last_hidden_state)
        else:
            # Decoding: only the newest position is needed.
            logits = self.lm_head(outputs.last_hidden_state[:, -1:, :])

        return CausalLMOutputWithCrossAttentions(
            loss=None,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_loss=outputs.router_loss,
        )

    @staticmethod
    def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_error=False):
        """Map every base key to one key of the loaded state dict.

        A real key containing ``"mm_embed_tokens"`` matches only base keys that
        also contain it; any other real key matches when it ends with the base
        key. Each real key is used at most once. Both inputs are converted to
        sets, so the choice among several candidates follows set iteration order.

        Args:
            state_keys_base: iterable of expected key names.
            state_keys_real: iterable of key names found in the loaded state dict.
            ignore_error (bool): when False, an unmatched base key is logged as
                an error; it is left out of the result either way.

        Returns:
            dict: base key -> matching real key.
        """

        def is_match(base_key, real_key):
            if "mm_embed_tokens" in real_key:
                return "mm_embed_tokens" in base_key
            return real_key.endswith(base_key)

        # state_keys_map base to real
        state_keys_map = {}
        unclaimed = set(state_keys_real)

        for key in set(state_keys_base):
            found = next((real for real in unclaimed if is_match(key, real)), None)
            if found is None:
                if not ignore_error:
                    logger.error(f"could not find name {key} in loaded state dict!")
                continue
            state_keys_map[key] = found
            unclaimed.remove(found)

        return state_keys_map


@dataclass
class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
    """
    Base class for model outputs with past key values and cross attention layers,
    with additional support for router components in mixture-of-experts models.

    This extends the base model output to include:
    1. Router-related outputs for expert selection
    2. Maintains all existing functionality from the parent class

    Args:
        last_hidden_state (`torch.Tensor`, *optional*):
            Hidden states produced by the final layer. Consumers in this file
            read it as a single tensor whose first two dimensions are
            `(batch_size, sequence_length)`.
        past_key_values (`tuple(tuple(torch.Tensor))`, *optional*):
            Cache returned by the model; presumably per-layer key/value states
            for incremental decoding (exact layout is not visible here).
        hidden_states (`tuple(torch.Tensor)`, *optional*):
            Per-layer hidden states, populated when requested via
            `output_hidden_states`.
        attentions (`tuple(torch.Tensor)`, *optional*):
            Per-layer attention weights, populated when requested via
            `output_attentions`.
        cross_attentions (`tuple(torch.Tensor)`, *optional*):
            Cross-attention weights; presumably kept for interface parity with
            the upstream output class (confirm against the producer).
        router_loss (`torch.Tensor`, *optional*):
            Routing loss reported by the mixture-of-experts layers.
        gate_logits (`tuple(torch.Tensor)`, *optional*):
            Router outputs; presumably the raw gate logits of each MoE layer
            (confirm against the decoder implementation).
    """

    # NOTE: field order is part of the interface (ModelOutput supports
    # positional/tuple access), so it must not be rearranged.
    last_hidden_state: Optional[torch.Tensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None
    hidden_states: Optional[Tuple[torch.Tensor]] = None
    attentions: Optional[Tuple[torch.Tensor]] = None
    cross_attentions: Optional[Tuple[torch.Tensor]] = None
    router_loss: Optional[torch.Tensor] = None
    gate_logits: Optional[Tuple[torch.Tensor]] = None


@dataclass
class CausalLMOutputWithCrossAttentions(ModelOutput):
    """
    Base class for causal language model (or autoregressive) outputs, extended
    with the routing loss of mixture-of-experts models.

    Args:
        loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.Tensor))`, *optional*):
            Cache forwarded from the underlying model's output; presumably per-layer key/value
            states that can be fed back to speed up sequential decoding (layout not visible here).
        hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True`
            is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or
            when `config.output_attentions=True`):
            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        router_loss (`torch.Tensor`, *optional*):
            The routing loss computed by the gating network in mixture-of-experts models.
            This is typically the load balancing loss that encourages equal expert utilization.
            None when not using mixture-of-experts routing.
    """

    # NOTE: field order is part of the interface (ModelOutput supports
    # positional/tuple access), so it must not be rearranged.
    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None
    hidden_states: Optional[Tuple[torch.Tensor]] = None
    attentions: Optional[Tuple[torch.Tensor]] = None
    # A single tensor, matching the docstring and the base model output class.
    router_loss: Optional[torch.Tensor] = None