import torch
from torch import nn
import torch.nn.functional as F
from megatron.core import parallel_state
from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding, get_pos_emb_on_this_cp_rank
from megatron.training.global_vars import get_args
class PositionGetter3D:
def __init__(self, atten_layout="BSH"):
self.cache_positions = {}
self.atten_layout = atten_layout
@staticmethod
def check_type(param):
if isinstance(param, torch.Tensor):
param = param.item()
return param
def __call__(self, b, t, h, w, device):
b = self.check_type(b)
t = self.check_type(t)
h = self.check_type(h)
w = self.check_type(w)
if not (b, t, h, w) in self.cache_positions:
x = torch.arange(w, device=device)
y = torch.arange(h, device=device)
z = torch.arange(t, device=device)
pos = torch.cartesian_prod(z, y, x)
if self.atten_layout == "SBH":
pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, -1, 1).contiguous().expand(3, -1, b).clone()
elif self.atten_layout == "BSH":
pos = pos.reshape(t * h * w, 3).transpose(0, 1).reshape(3, 1, -1).contiguous().expand(3, b, -1).clone()
else:
raise ValueError(f"Unsupported layout type: {self.atten_layout}")
poses = (pos[0].contiguous(), pos[1].contiguous(), pos[2].contiguous())
max_poses = (int(poses[0].max()), int(poses[1].max()), int(poses[2].max()))
self.cache_positions[b, t, h, w] = (poses, max_poses)
pos = self.cache_positions[b, t, h, w]
return pos
class RoPE3D(nn.Module):
def __init__(self, freq=10000.0, interpolation_scale=(1, 1, 1)):
super().__init__()
self.base = freq
self.interpolation_scale_t, self.interpolation_scale_h, self.interpolation_scale_w = interpolation_scale
self.cache = {}
def get_cos_sin(self, dim, seq_len, device, dtype, interpolation_scale=1):
if (dim, seq_len, device, dtype) not in self.cache:
inv_freq = 1.0 / (self.base ** (torch.arange(0, dim, 2).float().to(device) / dim))
t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) / interpolation_scale
freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
freqs = torch.cat((freqs, freqs), dim=-1)
cos = freqs.cos()
sin = freqs.sin()
self.cache[dim, seq_len, device, dtype] = (cos, sin)
return self.cache[dim, seq_len, device, dtype]
@staticmethod
def rotate_half(x):
x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
return torch.cat((-x2, x1), dim=-1)
def apply_rope1d(self, tokens, pos1d, cos, sin):
if pos1d.ndim != 2:
raise AssertionError("pos1d.ndim must be 2")
cos = F.embedding(pos1d, cos)[:, :, None, :]
sin = F.embedding(pos1d, sin)[:, :, None, :]
return (tokens * cos) + (self.rotate_half(tokens) * sin)
def forward(self, tokens, positions):
"""
input:
tokens: batch_size x nheads x ntokens x dim
positions: batch_size x ntokens x 3 (t, y and x position of each token)
output:
tokens after applying RoPE3D (batch_size x nheads x ntokens x x dim)
"""
if tokens.size(3) % 3 != 0:
raise AssertionError("number of dimensions should be a multiple of three")
dim = tokens.size(3) // 3
poses, max_poses = positions
if len(poses) != 3 or poses[0].ndim != 2:
raise AssertionError("poses shape error")
cos_t, sin_t = self.get_cos_sin(dim, max_poses[0] + 1, tokens.device, tokens.dtype, self.interpolation_scale_t)
cos_y, sin_y = self.get_cos_sin(dim, max_poses[1] + 1, tokens.device, tokens.dtype, self.interpolation_scale_h)
cos_x, sin_x = self.get_cos_sin(dim, max_poses[2] + 1, tokens.device, tokens.dtype, self.interpolation_scale_w)
t, y, x = tokens.chunk(3, dim=-1)
t = self.apply_rope1d(t, poses[0], cos_t, sin_t)
y = self.apply_rope1d(y, poses[1], cos_y, sin_y)
x = self.apply_rope1d(x, poses[2], cos_x, sin_x)
tokens = torch.cat((t, y, x), dim=-1)
return tokens
class DynamicRotaryEmbedding(RotaryEmbedding):
def __init__(self,
config,
kv_channels,
rotary_percent,
rotary_interleaved=False,
seq_len_interpolation_factor=None,
rotary_base=10000,
use_cpu_initialization=False,
seq_len=None
):
super().__init__(
kv_channels=kv_channels,
rotary_percent=rotary_percent,
rotary_interleaved=rotary_interleaved,
seq_len_interpolation_factor=seq_len_interpolation_factor,
rotary_base=rotary_base,
)
dim = kv_channels
if rotary_percent < 1.0:
dim = int(dim * rotary_percent)
max_position_embeddings = config.max_position_embeddings
factor = config.rope_scaling.factor if hasattr(config.rope_scaling, 'factor') else 1.0
seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings
device = "cpu" if use_cpu_initialization else torch.cuda.current_device()
rotary_base = rotary_base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
self.inv_freq = 1.0 / (rotary_base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
def forward(self, max_seq_len: int, offset: int = 0):
"""Forward pass of RoPE embedding.
Args:
max_seq_len (int): Maximum size of sequence
offset (int, optional): _description_. Defaults to 0.
Returns:
Tensor: Embeddings after applying RoPE.
"""
if self.inv_freq.device.type == 'cpu':
self.inv_freq = self.inv_freq.to(device=torch.cuda.current_device())
seq = (
torch.arange(max_seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+ offset
)
if self.seq_len_interpolation_factor is not None:
seq *= 1 / self.seq_len_interpolation_factor
freqs = torch.outer(seq, self.inv_freq)
if not self.rotary_interleaved:
emb = torch.cat((freqs, freqs), dim=-1)
else:
emb = torch.stack((freqs.view(-1, 1), freqs.view(-1, 1)), dim=-1).view(
freqs.shape[0], -1
)
emb = emb[:, None, None, :]
if parallel_state.get_context_parallel_world_size() > 1 and get_args().context_parallel_algo == "ulysses_cp_algo":
emb = get_pos_emb_on_this_cp_rank(emb, 0)
return emb
def get_rotary_seq_len(
self,
inference_context,
transformer,
transformer_input,
transformer_config,
inference_params=None,
) -> float:
"""Function to get the rotary sequence length.
Args:
inference_params : Used during Inference time
transformer (TransformerBlock): The transformer block (decoder/encoder) used by the model
transformer_input (Tensor): _description_
transformer_config (TransformerConfig): Transformer config used by the model
Returns:
float: The rotary sequence length
"""
if inference_params is not None:
rotary_seq_len = inference_params.max_sequence_length
else:
if transformer.input_tensor is not None:
rotary_seq_len = transformer.input_tensor.size(0)
else:
rotary_seq_len = transformer_input.size(0)
if transformer_config.sequence_parallel:
rotary_seq_len = torch.Tensor([rotary_seq_len]).cuda()
torch.distributed.all_reduce(rotary_seq_len, group=parallel_state.get_tensor_model_parallel_group())
rotary_seq_len = rotary_seq_len.item()
if get_args().context_parallel_algo == "ulysses_cp_algo":
rotary_seq_len *= transformer_config.context_parallel_size
return rotary_seq_len