import math
import torch
from megatron.core import mpu
from megatron.training import get_args
from megatron.inference.text_generation.communication import broadcast_int_list, broadcast_tensor
from mindspeed_llm.tasks.preprocess.templates import Template, get_model_template
def _encode_no_template(tokenizer, prompts):
prompts_tokens = [[]]
if isinstance(prompts, str):
prompts_tokens = [tokenizer.encode(prompts)]
elif torch.is_tensor(prompts):
if len(prompts.shape) == 1:
prompts_tokens = prompts.unsqueeze(0).numpy().tolist()
elif len(prompts.shape) == 2:
prompts_tokens = prompts.numpy().tolist()
elif isinstance(prompts, (tuple, list)):
if len(prompts) and isinstance(prompts[0], (tuple, list)):
prompts_tokens = prompts
elif len(prompts) and isinstance(prompts[0], int):
prompts_tokens = [prompts]
elif len(prompts) and isinstance(prompts[0], str):
prompts_tokens = [tokenizer.encode(val) for val in prompts]
else:
raise TypeError("Please check input_ids in correct type.")
return prompts_tokens
def _encode_by_template(template, tokenizer, prompts):
prompts_tokens = []
if prompts is None:
return [[]]
response_prompt = [{"role": "assistant", "content": ""}]
if len(prompts) and isinstance(prompts, str):
paired_messages = [{"role": "user", "content": "{}".format(prompts)}] + response_prompt
tokens, _ = template.encode_oneturn(tokenizer=tokenizer, messages=paired_messages, tools="")
prompts_tokens.append(tokens)
elif len(prompts) and isinstance(prompts[0], (dict)):
paired_messages = prompts + response_prompt
tokens, _ = template.encode_oneturn(tokenizer=tokenizer, messages=paired_messages, tools="")
prompts_tokens.append(tokens)
elif len(prompts) and isinstance(prompts[0], (str)):
for query in prompts:
paired_messages = [{"role": "user", "content": "{}".format(query)}] + response_prompt
tokens, _ = template.encode_oneturn(tokenizer=tokenizer, messages=paired_messages, tools="")
prompts_tokens.append(tokens)
elif len(prompts) and isinstance(prompts[0], (tuple, list)):
for val in prompts:
if len(val) and isinstance(val, (tuple, list)):
paired_messages = val + response_prompt
tokens, _ = template.encode_oneturn(tokenizer=tokenizer, messages=paired_messages, tools="")
prompts_tokens.append(tokens)
else:
raise TypeError("Please check input_ids in correct type.")
return prompts_tokens if len(prompts_tokens) > 0 else [prompts_tokens]
def tokenize_prompts(tokenizer=None, prompts=None, tokens_to_generate=None, max_generate_length=128,
add_BOS=None, rank=0, broadcast=False):
"""Tokenize prompts and make them avaiable on all ranks."""
if broadcast:
sizes_list = None
prompts_tokens_cuda_long_tensor = None
prompts_length_cuda_long_tensor = None
if torch.distributed.get_rank() == rank:
if prompts is None:
raise ValueError("prompts must not be None")
if tokens_to_generate is None:
raise ValueError("tokens_to_generate must not be None")
prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
_tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, max_generate_length, add_BOS)
sizes_list = [prompts_tokens_cuda_long_tensor.size(0),
prompts_tokens_cuda_long_tensor.size(1)]
sizes_tensor = broadcast_int_list(2, int_list=sizes_list, rank=rank)
sizes = sizes_tensor.tolist()
prompts_tokens_cuda_long_tensor = broadcast_tensor(
sizes, torch.int64, tensor=prompts_tokens_cuda_long_tensor, rank=rank)
prompts_length_cuda_long_tensor = broadcast_tensor(
sizes[0], torch.int64, tensor=prompts_length_cuda_long_tensor,
rank=rank)
else:
if prompts is None:
raise ValueError("prompts must not be None")
if tokens_to_generate is None:
raise ValueError("tokens_to_generate must not be None")
prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor = \
_tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, max_generate_length, add_BOS)
return prompts_tokens_cuda_long_tensor, prompts_length_cuda_long_tensor
def _tokenize_prompts_and_batch(tokenizer, prompts, tokens_to_generate, max_generate_length, add_BOS):
"""Given a set of prompts and number of tokens to generate:
- tokenize prompts
- set the sequence length to be the max of length of prompts
plus the number of tokens we would like to generate
- pad all the sequences to this length so we can convert them
into a 2D tensor.
"""
args = get_args()
template = None
if hasattr(args, "prompt_type") and args.prompt_type is not None:
template = get_model_template(args.prompt_type.strip(), args.prompt_type_path.strip(), args.enable_thinking)
if args.hf_chat_template:
if not hasattr(tokenizer, "apply_chat_template"):
raise AssertionError('The tokenizer has no Huggingface chat template, Please use chat model.')
prompts_tokens = [tokenizer.apply_chat_template(
prompts,
tokenize=True,
add_generation_prompt=True
)]
elif template is None:
prompts_tokens = _encode_no_template(tokenizer, prompts)
else:
prompts_tokens = _encode_by_template(template, tokenizer, prompts)
if add_BOS:
prompts_tokens = [[tokenizer.eod] + prompt
for prompt in prompts_tokens]
prompts_length = [len(prompt_tokens) for prompt_tokens in prompts_tokens]
max_prompt_len = max(prompts_length)
if tokens_to_generate > 0:
max_samples_length = max_prompt_len + tokens_to_generate
else:
max_samples_length = max_generate_length
for prompt_tokens, prompt_length in zip(prompts_tokens, prompts_length):
if args.sequence_parallel:
padding_size = math.ceil(max_samples_length / mpu.get_tensor_model_parallel_world_size()) * mpu.get_tensor_model_parallel_world_size() - prompt_length
else:
padding_size = max_samples_length - prompt_length
prompt_tokens.extend([tokenizer.pad_token_id] * padding_size)
prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.long, device='cuda')
prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.long, device='cuda')
return prompts_tokens_tensor, prompts_length_tensor