| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """Tokenization classes for Qwen2.""" |
|
|
| from typing import Optional, Tuple |
|
|
| from transformers.tokenization_utils import AddedToken |
| from transformers.tokenization_utils_fast import PreTrainedTokenizerFast |
| from transformers.utils import logging |
| from .tokenization_qwen2 import Qwen2Tokenizer |
|
|
|
|
| logger = logging.get_logger(__name__) |
|
|
# Resource kind -> file name used for that resource.
VOCAB_FILES_NAMES = dict(
    vocab_file="vocab.json",
    merges_file="merges.txt",
    tokenizer_file="tokenizer.json",
)
|
|
# Resource kind -> {model identifier -> URL of that resource file}.
PRETRAINED_VOCAB_FILES_MAP = dict(
    vocab_file={
        "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json",
    },
    merges_file={
        "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt",
    },
    tokenizer_file={
        "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/tokenizer.json",
    },
)
|
|
# Model identifier -> maximum model input size (32 * 1024 == 32768).
MAX_MODEL_INPUT_SIZES = {
    "qwen/qwen-tokenizer": 32 * 1024,
}
|
|
|
|
class Qwen2TokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from transformers import Qwen2TokenizerFast

    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. Not applicable to this tokenizer.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = MAX_MODEL_INPUT_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = Qwen2Tokenizer

    @staticmethod
    def _as_special_token(token):
        """Wrap a plain `str` in a special, non-normalized `AddedToken`; return anything else unchanged."""
        if isinstance(token, str):
            return AddedToken(token, lstrip=False, rstrip=False, special=True, normalized=False)
        # `None` and values that are not `str` (e.g. an already-built `AddedToken`) pass through as given.
        return token

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        **kwargs,
    ):
        """
        Promote string special tokens to `AddedToken` objects and delegate to `PreTrainedTokenizerFast.__init__`.

        `vocab_file` and `merges_file` are forwarded positionally, everything else (including `**kwargs`) by
        keyword. See the class docstring for the meaning of each argument.
        """
        # Every special token given as a plain string gets the same flags:
        # no stripping on either side, marked special, not normalized.
        bos_token = self._as_special_token(bos_token)
        eos_token = self._as_special_token(eos_token)
        unk_token = self._as_special_token(unk_token)
        pad_token = self._as_special_token(pad_token)

        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
        """
        Save the backend tokenizer's model files into `save_directory`.

        Args:
            save_directory (`str`):
                Directory handed to the backend model's `save` method.
            filename_prefix (`str`, *optional*):
                Forwarded to the backend model's `save` method as its `name` argument.

        Returns:
            `Tuple[str, ...]`: Whatever the backend `save` call returned, converted to a tuple
            (presumably the paths of the files written -- confirm against the *tokenizers* documentation).
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
|
|