import json
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import Any
import numpy
from transformers import AutoTokenizer
class MegatronTokenizer(ABC):
    """Base interface every Megatron tokenizer implements.

    Because there is no config object (and no per-class bookkeeping) saying
    which constructor arguments distinguish one tokenizer from another, every
    positional path and every keyword option is recorded as part of the
    tokenizer's identity.

    Args:
        tokenizer_paths (Tuple[str]): All tokenizer source paths or prefixes

        tokenizer_options (Dict[str, Any]): All tokenizer options
    """

    def __init__(self, *tokenizer_paths: str, **tokenizer_options: Any):
        # The attribute is bound first and then filled through a local alias.
        identifiers = self.unique_identifiers = OrderedDict()
        identifiers["class"] = type(self).__name__
        identifiers["tokenizer_path"] = list(tokenizer_paths)
        # Option values are stored as strings so the mapping can always be
        # serialized to JSON below.
        for option_name, option_value in tokenizer_options.items():
            identifiers[option_name] = str(option_value)

        self.unique_description = json.dumps(identifiers, indent=4)

        super().__init__()

    @abstractmethod
    def tokenize(self, text: str) -> numpy.ndarray:
        """Turn a piece of text into embedding ids.

        Args:
            text (str): The text to convert

        Returns:
            numpy.ndarray: The converted embedding ids
        """

    def detokenize(self, ids: numpy.ndarray) -> str:
        """Turn embedding ids back into text.

        Args:
            ids (numpy.ndarray): The ids to convert

        Returns:
            str: The converted text

        Raises:
            NotImplementedError: Non-abstract, optional method
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no method 'detokenize'".format(owner))

    def offsets(self, ids: list[int], text: str) -> list[int]:
        """Map embedding ids to offsets into the text.

        Args:
            ids (list[int]): The ids to convert

            text (str): The text to convert

        Returns:
            list[int]: The converted offsets

        Raises:
            NotImplementedError: Non-abstract, optional method
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no method 'offsets'".format(owner))

    @property
    @abstractmethod
    def vocab(self):
        """Mapping from vocabulary text token to id token."""

    @property
    @abstractmethod
    def inv_vocab(self):
        """Mapping from vocabulary id token to text token."""

    @property
    @abstractmethod
    def vocab_size(self):
        """Size of the vocabulary."""

    @property
    def cls(self):
        """Id of the CLS token.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no attribute 'cls'".format(owner))

    @property
    def sep(self):
        """Id of the SEP token.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no attribute 'sep'".format(owner))

    @property
    def pad(self):
        """Id of the PAD token.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no attribute 'pad'".format(owner))

    @property
    def eod(self):
        """Id of the EOD token.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no attribute 'eod'".format(owner))

    @property
    def bos(self):
        """Id of the BOS token.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no attribute 'bos'".format(owner))

    @property
    def eos(self):
        """Id of the EOS token.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no attribute 'eos'".format(owner))

    @property
    def mask(self):
        """Id of the MASK token.

        Raises:
            NotImplementedError: Non-abstract, optional attribute
        """
        owner = type(self).__name__
        raise NotImplementedError("{} has no attribute 'mask'".format(owner))
class _AutoTokenizer(MegatronTokenizer):
    """Megatron tokenizer that wraps a Hugging Face ``AutoTokenizer``.

    The wrapped tokenizer is stored on ``self.tokenizer``; the special-token
    properties below delegate to it.
    """

    def __init__(self, tokenizer_name_or_path, vocab_extra_ids, model_max_length, use_fast, prompt_type=None, **kwargs):
        """Load the Hugging Face tokenizer and cache its vocabulary.

        Args:
            tokenizer_name_or_path: Name or path handed to
                ``AutoTokenizer.from_pretrained``; also the only value
                recorded in the base class's unique identifiers.
            vocab_extra_ids: When greater than zero, that many
                ``<extra_id_N>`` tokens are passed as
                ``additional_special_tokens`` (replacing any value of that
                key supplied through ``kwargs``).
            model_max_length: Forwarded as ``model_max_length``.
            use_fast: Forwarded as ``use_fast``.
            prompt_type: When ``None`` and the loaded tokenizer has no pad
                token id, the pad token id is set to the EOS token id.
            **kwargs: Extra keyword arguments forwarded to
                ``AutoTokenizer.from_pretrained``. The keys
                ``model_max_length``, ``use_fast``, ``trust_remote_code`` and
                ``local_files_only`` are always overwritten.
        """
        name = tokenizer_name_or_path
        super().__init__(name)

        hf_tokenizer_kwargs = kwargs
        if vocab_extra_ids > 0:
            hf_tokenizer_kwargs["additional_special_tokens"] = [f"<extra_id_{_id}>" for _id in range(vocab_extra_ids)]

        hf_tokenizer_kwargs["model_max_length"] = model_max_length
        hf_tokenizer_kwargs["use_fast"] = use_fast
        # NOTE(security): trust_remote_code=True lets the tokenizer source
        # execute its own Python code; only point this at trusted paths.
        hf_tokenizer_kwargs["trust_remote_code"] = True
        # Set inside the dict (instead of as a second explicit keyword) so a
        # caller that also supplies ``local_files_only`` does not trigger a
        # "multiple values for keyword argument" TypeError. The value is
        # still always True, exactly as before.
        hf_tokenizer_kwargs["local_files_only"] = True
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs)

        if (prompt_type is None) and (self.tokenizer.pad_token_id is None):
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        # Snapshot of the token -> id mapping and its inverse at load time.
        self.encoder = self.tokenizer.get_vocab()
        self.decoder = {v: k for k, v in self.encoder.items()}

    @property
    def vocab_size(self):
        """Length of the wrapped tokenizer (``len(self.tokenizer)``)."""
        return len(self.tokenizer)

    @property
    def vocab(self):
        """Token-to-id mapping including the additional special tokens.

        The additional special tokens are listed first and the tokenizer's
        own vocabulary is merged on top, so on a key collision the id from
        the tokenizer's vocabulary wins.
        """
        # ``get_vocab()`` is used instead of the ``.vocab`` attribute because
        # not every tokenizer class defines ``.vocab``, whereas ``get_vocab``
        # is the accessor already relied on in ``__init__``.
        return {
            **{special_token: self.tokenizer.convert_tokens_to_ids(special_token)
               for special_token in self.tokenizer.additional_special_tokens},
            **self.tokenizer.get_vocab(),
        }

    @property
    def inv_vocab(self):
        """Id-to-token mapping, built by inverting :attr:`vocab`."""
        return {v: k for k, v in self.vocab.items()}

    def tokenize(self, text):
        """Encode ``text`` with the wrapped tokenizer's ``encode`` method."""
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        """Decode ``token_ids`` with the wrapped tokenizer's ``decode`` method."""
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        """The EOD token id; this tokenizer uses the EOS token id for it."""
        return self.eos

    @property
    def eos_token_id(self):
        """The wrapped tokenizer's ``eos_token_id``, returned unchecked."""
        return self.tokenizer.eos_token_id

    @property
    def pad_token_id(self):
        """The wrapped tokenizer's ``pad_token_id``, returned unchecked."""
        return self.tokenizer.pad_token_id

    @property
    def cls(self):
        """The CLS token id.

        Raises:
            AttributeError: The wrapped tokenizer has no CLS token id.
        """
        candidate = self.tokenizer.cls_token_id
        return self._check_token_candidate(candidate)

    @property
    def sep(self):
        """The SEP token id.

        Raises:
            AttributeError: The wrapped tokenizer has no SEP token id.
        """
        candidate = self.tokenizer.sep_token_id
        return self._check_token_candidate(candidate)

    @property
    def pad(self):
        """The PAD token id, falling back to the EOS token id when unset.

        Raises:
            AttributeError: Neither a PAD nor an EOS token id exists.
        """
        candidate = self.tokenizer.pad_token_id

        if candidate is None:
            candidate = self.tokenizer.eos_token_id
        return self._check_token_candidate(candidate)

    @property
    def mask(self):
        """The MASK token id.

        Raises:
            AttributeError: The wrapped tokenizer has no MASK token id.
        """
        candidate = self.tokenizer.mask_token_id
        return self._check_token_candidate(candidate)

    @property
    def bos(self):
        """The BOS token id; not provided by this tokenizer.

        Raises:
            NotImplementedError: Always.
        """
        # NOTE(review): unlike the other special-token properties this never
        # consults the wrapped tokenizer; left as-is in case callers rely on
        # the exception - confirm whether that is intended.
        raise NotImplementedError("Missing <bos>")

    @property
    def eos(self):
        """The EOS token id.

        Raises:
            AttributeError: The wrapped tokenizer has no EOS token id.
        """
        candidate = self.tokenizer.eos_token_id
        return self._check_token_candidate(candidate)

    @property
    def additional_special_tokens_ids(self):
        """The wrapped tokenizer's ``additional_special_tokens_ids``."""
        return self.tokenizer.additional_special_tokens_ids

    @staticmethod
    def _check_token_candidate(candidate):
        """Return ``candidate`` unchanged, rejecting ``None``.

        Args:
            candidate: A token id looked up on the wrapped tokenizer.

        Returns:
            The same ``candidate`` value.

        Raises:
            AttributeError: ``candidate`` is ``None``.
        """
        if candidate is None:
            raise AttributeError("Token doesn't exist")
        return candidate