Source code for mfai.tokenizers

"""
Module with various LLM tokenizers wrapped in a common interface.
"""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, List

import sentencepiece as spm
import tiktoken  # noqa
from huggingface_hub import hf_hub_download, login

from mfai.encoding import get_tiktoken_encoding


class Tokenizer(ABC):
    @abstractmethod
    def name(self) -> str:
        pass

    @abstractmethod
    def encode(self, text: str, *args: Any, **kwargs: Any) -> List[int]:
        pass

    @abstractmethod
    def decode(self, tokens: list, *args: Any, **kwargs: Any) -> str:
        pass

    @property
    @abstractmethod
    def eot_token(self) -> int:
        pass

    @property
    @abstractmethod
    def vocab_size(self) -> int:
        pass


class GPT2Tokenizer(Tokenizer):
    def __init__(self) -> None:
        self.base_tokenizer = get_tiktoken_encoding("gpt2")
        self.special_tokens: list[str] = []
        self.tokenizer = self.base_tokenizer

    def add_special_tokens(self, new_special_tokens: list[str]) -> None:
        """
        Method to add some special tokens to the tokenizer.
        For more details about extending a tiktoken.Encoding:
        https://github.com/openai/tiktoken/tree/main?tab=readme-ov-file#extending-tiktoken
        """
        for tok in new_special_tokens:
            if (
                tok not in self.special_tokens
                and tok not in self.base_tokenizer._special_tokens
            ):
                self.special_tokens.append(tok)
        special_tokens = {
            tok: self.base_tokenizer.n_vocab + i
            for i, tok in enumerate(self.special_tokens)
        }
        self.tokenizer = tiktoken.Encoding(
            name=f"custom_{self.name()}",
            pat_str=self.base_tokenizer._pat_str,
            mergeable_ranks=self.base_tokenizer._mergeable_ranks,
            special_tokens={**self.base_tokenizer._special_tokens} | special_tokens,
        )

    def name(self) -> str:
        return "gpt2"

    def encode(self, text: str, *args: Any, **kwargs: Any) -> List[int]:
        return self.tokenizer.encode(text, allowed_special="all", *args, **kwargs)

    def decode(self, tokens: list, *args: Any, **kwargs: Any) -> str:
        return self.tokenizer.decode(tokens, *args, **kwargs)

    @property
    def eot_token(self) -> int:
        return self.tokenizer.eot_token

    @property
    def vocab_size(self) -> int:
        return self.tokenizer.n_vocab


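# Illustrative usage sketch, not part of the original module. It round-trips text
# through GPT2Tokenizer after registering "<|sep|>", a hypothetical special token.
def _example_gpt2_usage() -> None:
    tokenizer = GPT2Tokenizer()
    tokenizer.add_special_tokens(["<|sep|>"])
    # With allowed_special="all" in encode(), "<|sep|>" is mapped to a single id
    # appended after the base GPT-2 vocabulary (50257 here).
    ids = tokenizer.encode("Hello world <|sep|> goodbye")
    assert tokenizer.decode(ids) == "Hello world <|sep|> goodbye"

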
class LlamaTokenizer(Tokenizer):
    def __init__(self) -> None:
        sp = spm.SentencePieceProcessor()
        folderpath = Path(__file__).parent / "Llama-2-7B"
        if not folderpath.exists():
            login()
            tokenizer_file = hf_hub_download(
                repo_id="meta-llama/Llama-2-7b",
                filename="tokenizer.model",
                local_dir=folderpath,
            )
        else:
            tokenizer_file = str(folderpath / "tokenizer.model")
        sp.load(tokenizer_file)
        self.tokenizer = sp

    def name(self) -> str:
        return "llama"

    def encode(self, text: str, *args: Any, **kwargs: Any) -> List[int]:
        return self.tokenizer.encode_as_ids(text)

    def decode(self, tokens: list, *args: Any, **kwargs: Any) -> str:
        return self.tokenizer.decode_pieces(tokens)

    @property
    def eot_token(self) -> int:
        return self.tokenizer.eos_id()

    @property
    def vocab_size(self) -> int:
        return self.tokenizer.vocab_size()


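# Illustrative usage sketch, not part of the original module. Instantiating
# LlamaTokenizer downloads tokenizer.model from the gated meta-llama/Llama-2-7b
# repository on the first run, so Hugging Face access to that repo is assumed.
def _example_llama_usage() -> None:
    tokenizer = LlamaTokenizer()
    ids = tokenizer.encode("Hello world")
    print(tokenizer.name(), tokenizer.vocab_size, tokenizer.decode(ids))

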
class MiniGPT2Tokenizer(Tokenizer, ABC):
    """
    A Tokenizer using a reduced set of tokens from a base GPT2Tokenizer.
    Typical use case is for narrow vocab problems with only 1000 tokens
    out of a vocab of 50000.
    To use this class, you only have to implement the method 'tokens'.
    """

    def __init__(self) -> None:
        self.gpt2_tokenizer = GPT2Tokenizer()
        self.token_to_id: dict[int, int] = dict()
        self.id_to_token: dict[int, int] = dict()
        for idx, token_id in enumerate(self.tokens()):
            self.token_to_id[token_id] = idx
            self.id_to_token[idx] = token_id

        # Manually add the EOT token if needed
        mini_eot_id = self.vocab_size
        base_eot_id = self.gpt2_tokenizer.encode("<|endoftext|>")[0]
        if base_eot_id not in self.token_to_id.keys():
            self.token_to_id[base_eot_id] = mini_eot_id
            self.id_to_token[mini_eot_id] = base_eot_id

    @abstractmethod
    def tokens(self) -> set:
        """
        Method that returns a set of token ids.

        Example:
            def tokens(self) -> set:
                unique_tokens = set()
                texts: list[str] = ...  # Load all texts you want to encode
                for text in texts:
                    tokens = self.gpt2_tokenizer.encode(text)
                    unique_tokens.update(tokens)
                return unique_tokens
        """

    def add_special_tokens(self, special_tokens: list[str]) -> None:
        """
        Method to add some special tokens to the tokenizer.
        For more details about extending a tiktoken.Encoding:
        https://github.com/openai/tiktoken/tree/main?tab=readme-ov-file#extending-tiktoken
        """
        self.gpt2_tokenizer.add_special_tokens(special_tokens)
        vocab_size = self.vocab_size
        for i, special_token in enumerate(self.gpt2_tokenizer.special_tokens):
            mini_tok_id = vocab_size + i
            base_tok_id = self.gpt2_tokenizer.encode(special_token)[0]
            if base_tok_id in self.token_to_id.keys():
                vocab_size -= 1
            else:
                self.token_to_id[base_tok_id] = mini_tok_id
                self.id_to_token[mini_tok_id] = base_tok_id

    def name(self) -> str:
        return "mini_" + self.gpt2_tokenizer.name()

    def encode(self, text: str, *args: Any, **kwargs: Any) -> List[int]:
        base_token_ids = self.gpt2_tokenizer.encode(text)
        return [self.token_to_id[x] for x in base_token_ids]

    def decode(self, tokens: list, *args: Any, **kwargs: Any) -> str:
        base_tokens = [self.id_to_token[x] for x in tokens]
        return self.gpt2_tokenizer.decode(base_tokens)

    @property
    def eot_token(self) -> int:
        return self.token_to_id[self.gpt2_tokenizer.eot_token]

    @property
    def vocab_size(self) -> int:
        return len(self.token_to_id)


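# Illustrative sketch, not part of the original module: a concrete subclass built
# from a small in-memory corpus (hypothetical texts). tokens() returns the set of
# base GPT-2 token ids seen in the corpus, so the mini vocabulary stays narrow.
class _ExampleMiniTokenizer(MiniGPT2Tokenizer):
    _corpus = ["hello world", "hello tokenizer"]

    def tokens(self) -> set:
        unique_tokens: set[int] = set()
        for text in self._corpus:
            unique_tokens.update(self.gpt2_tokenizer.encode(text))
        return unique_tokens


# Usage: _ExampleMiniTokenizer().vocab_size is only a handful of ids (plus EOT),
# far below the ~50k ids of the full GPT-2 vocabulary.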