Upload 7 files
- requirements.txt +8 -0
- setup.py +14 -0
- spm.model +3 -0
- tokenization_spark.py +433 -0
- tokenization_spark_fast.py +142 -0
- tokenizer.json +0 -0
- tokenizer_config.json +41 -0
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch==2.6.0
transformers==4.56.1
tokenizers==0.22.0
huggingface-hub==0.34.4
safetensors==0.6.2
accelerate==1.10.1
sentencepiece==0.2.0
modelscope==1.30.0
setup.py
ADDED
@@ -0,0 +1,14 @@

from setuptools import setup

setup(
    name="Spark-Chemistry-X1-13B",
    version="0.1.0",
    py_modules=["tokenization_spark", "tokenization_spark_fast", "configuration_spark"],
    install_requires=[
        "transformers>=4.30.0",
        "sentencepiece",
    ],
    python_requires=">=3.8",
    description="Installs tokenization_spark and tokenization_spark_fast as top-level modules.",
)
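The setup script only registers the tokenizer modules as top-level imports (it also lists a configuration_spark module not included in this upload). A minimal usage sketch, assuming the repository has been cloned and installed locally:

# after `pip install .` in the repository root, the modules resolve as plain imports
from tokenization_spark import SparkTokenizer
from tokenization_spark_fast import SparkTokenizerFast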
spm.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3a2b5663c4f9af0106eea393c61762d569899b321648ca71021119db139c78a
size 2231916
tokenization_spark.py
ADDED
@@ -0,0 +1,433 @@
# coding=utf-8
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for model Spark."""

import os
import unicodedata
from typing import Any, Optional

import sentencepiece as sp

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
from transformers.utils.import_utils import requires


logger = logging.get_logger(__name__)


VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}

user_token = "<User>"
bot_token = "<Bot>"


@requires(backends=("sentencepiece",))
class SparkTokenizer(PreTrainedTokenizer):

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        split_by_punct=False,
        bos_token="",
        eos_token="<end>",
        unk_token="[UNK]",
        pad_token="[PAD]",
        sp_model_kwargs: Optional[dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.vocab_file = vocab_file
        self._tokenizer = SPMTokenizer(
            vocab_file,
            None,
            split_by_punct=split_by_punct,
            sp_model_kwargs=self.sp_model_kwargs,
        )
        unk_token = (
            AddedToken(unk_token, normalized=True, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )
        super().__init__(
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            split_by_punct=split_by_punct,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )
        self._tokenizer.special_tokens = self.all_special_tokens

    @property
    def vocab_size(self):
        return len(self.vocab)

    @property
    def vocab(self):
        return self._tokenizer.vocab

    def get_vocab(self):
        vocab = self.vocab.copy()
        vocab.update(self.get_added_vocab())
        return vocab

    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        if self.do_lower_case:
            text = text.lower()
        return self._tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self._tokenizer.spm.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return (
            self._tokenizer.spm.IdToPiece(index)
            if index < self.vocab_size
            else self.unk_token
        )

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        return self._tokenizer.decode(tokens)

    def _get_special_tokens_ids(self):
        eos_id = [
            self._convert_token_to_id(token) for token in self.tokenize(self.eos_token)
        ]
        user_id = [
            self._convert_token_to_id(token) for token in self.tokenize(user_token)
        ]
        bot_id = [
            self._convert_token_to_id(token) for token in self.tokenize(bot_token)
        ]
        blank_id = [self._convert_token_to_id(token) for token in self.tokenize(" ")]
        return eos_id, user_id, bot_id, blank_id

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):

        eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()
        if token_ids_1 is None:
            return user_id + blank_id + token_ids_0 + eos_id + bot_id + blank_id
        return (
            user_id + blank_id + token_ids_0 + token_ids_1 + eos_id + bot_id + blank_id
        )

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens=False
    ):
        eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is not None:
            return (
                [1] * len(user_id)
                + ([0] * (len(blank_id) + len(token_ids_0) + len(token_ids_1)))
                + [1] * (len(eos_id) + len(bot_id))
                + [0] * len(blank_id)
            )

        return (
            [1] * len(user_id)
            + ([0] * (len(blank_id) + len(token_ids_0)))
            + [1] * (len(eos_id) + len(bot_id))
            + [0] * len(blank_id)
        )

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        add_prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> tuple[str]:
        return self._tokenizer.save_pretrained(
            save_directory, filename_prefix=filename_prefix
        )


class SPMTokenizer:

    def __init__(
        self,
        vocab_file,
        special_tokens,
        split_by_punct=False,
        sp_model_kwargs: Optional[dict[str, Any]] = None,
    ):
        self.split_by_punct = split_by_punct
        self.vocab_file = vocab_file
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
        if not os.path.exists(vocab_file):
            raise FileNotFoundError(f"{vocab_file} does not exist!")
        spm.load(vocab_file)
        bpe_vocab_size = spm.GetPieceSize()

        self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]

        self.spm = spm
        self.special_tokens = special_tokens

    def __getstate__(self):
        state = self.__dict__.copy()
        state["spm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
        self.spm.Load(self.vocab_file)

    def tokenize(self, text):
        return self._encode_as_pieces(text)

    def convert_ids_to_tokens(self, ids):
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    def decode(self, tokens, start=-1, end=-1, raw_text=None):
        if raw_text is None:
            current_sub_tokens = []
            out_string = ""
            prev_is_special = False
            for token in tokens:
                # make sure that special tokens are not decoded using sentencepiece model
                if token in self.special_tokens:
                    if not prev_is_special:
                        out_string += " "
                    out_string += self.spm.decode_pieces(current_sub_tokens) + token
                    prev_is_special = True
                    current_sub_tokens = []
                else:
                    current_sub_tokens.append(token)
                    prev_is_special = False
            out_string += self.spm.decode_pieces(current_sub_tokens)
            return out_string.strip()
        else:
            words = self.split_to_words(raw_text)
            word_tokens = [self.tokenize(w) for w in words]
            token2words = [0] * len(tokens)
            tid = 0
            for i, w in enumerate(word_tokens):
                for k, t in enumerate(w):
                    token2words[tid] = i
                    tid += 1
            word_start = token2words[start]
            word_end = token2words[end] if end < len(tokens) else len(words)
            text = "".join(words[word_start:word_end])
            return text

    def part_of_whole_word(self, token, is_bos=False):
        logger.warning_once(
            "The `SparkTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
        )
        if is_bos:
            return True
        if (
            len(token) == 1
            and (
                _is_whitespace(list(token)[0])
                or _is_control(list(token)[0])
                or _is_punctuation(list(token)[0])
            )
        ) or token in self.special_tokens:
            return False

        word_start = b"\xe2\x96\x81".decode("utf-8")
        return not token.startswith(word_start)

    def pad(self):
        return "[PAD]"

    def bos(self):
        return ""

    def eos(self):
        return "<end>"

    def unk(self):
        return "[UNK]"

    def sym(self, id):
        return self.ids_to_tokens[id]

    def id(self, sym):
        logger.warning_once(
            "The `SparkTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
        )
        return self.vocab[sym] if sym in self.vocab else 1

    def _encode_as_pieces(self, text):
        text = convert_to_unicode(text)
        if self.split_by_punct:
            words = self._run_split_on_punc(text)
            pieces = [self.spm.encode(w, out_type=str) for w in words]
            return [p for w in pieces for p in w]
        else:
            return self.spm.encode(text, out_type=str)

    def split_to_words(self, text):
        pieces = self._encode_as_pieces(text)
        word_start = b"\xe2\x96\x81".decode("utf-8")
        words = []
        offset = 0
        prev_end = 0
        for i, p in enumerate(pieces):
            if p.startswith(word_start):
                if offset > prev_end:
                    words.append(text[prev_end:offset])
                    prev_end = offset
                w = p.replace(word_start, "")
            else:
                w = p
            try:
                s = text.index(w, offset)
                pn = ""
                k = i + 1
                while k < len(pieces):
                    pn = pieces[k].replace(word_start, "")
                    if len(pn) > 0:
                        break
                    k += 1

                if len(pn) > 0 and pn in text[offset:s]:
                    offset = offset + 1
                else:
                    offset = s + len(w)
            except Exception:
                offset = offset + 1

        if prev_end < offset:
            words.append(text[prev_end:offset])

        return words

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def save_pretrained(self, path: str, filename_prefix: Optional[str] = None):
        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
        if filename_prefix is not None:
            filename = filename_prefix + "-" + filename
        full_path = os.path.join(path, filename)
        with open(full_path, "wb") as fs:
            fs.write(self.spm.serialized_model_proto())
        return (full_path,)


def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if (
        (cp >= 33 and cp <= 47)
        or (cp >= 58 and cp <= 64)
        or (cp >= 91 and cp <= 96)
        or (cp >= 123 and cp <= 126)
    ):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise TypeError(f"Unsupported string type: {type(text)}")


__all__ = ["SparkTokenizer"]
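A minimal sketch of the slow tokenizer used on its own, assuming the actual spm.model (the LFS object, not the pointer above) sits in the working directory; the sample text is illustrative:

from tokenization_spark import SparkTokenizer

tok = SparkTokenizer(vocab_file="spm.model")
# SentencePiece pieces and ids for a plain string
pieces = tok.tokenize("What is the boiling point of ethanol?")
ids = tok.convert_tokens_to_ids(pieces)
# build_inputs_with_special_tokens wraps a single sequence as <User> ... <end><Bot> ...
wrapped = tok.build_inputs_with_special_tokens(ids)
print(tok.convert_ids_to_tokens(wrapped))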
tokenization_spark_fast.py
ADDED
@@ -0,0 +1,142 @@
# coding=utf-8
# Copyright 2020 Microsoft and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization class for model Spark."""

import os
from shutil import copyfile
from typing import Optional


from transformers.file_utils import is_sentencepiece_available
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import logging


if is_sentencepiece_available():
    from tokenization_spark import SparkTokenizer
else:
    SparkTokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}

# Role-marker tokens used when building chat-style inputs; these mirror the
# definitions in tokenization_spark.py, which this module does not import.
user_token = "<User>"
bot_token = "<Bot>"


class SparkTokenizerFast(PreTrainedTokenizerFast):

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = SparkTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=False,
        split_by_punct=False,
        bos_token="",
        eos_token="<end>",
        unk_token="[UNK]",
        pad_token="[PAD]",
        **kwargs,
    ) -> None:
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            split_by_punct=split_by_punct,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        self.vocab_file = vocab_file

    def _get_special_tokens_ids(self):
        eos_id = [
            self._convert_token_to_id(token) for token in self.tokenize(self.eos_token)
        ]
        user_id = [
            self._convert_token_to_id(token) for token in self.tokenize(user_token)
        ]
        bot_id = [
            self._convert_token_to_id(token) for token in self.tokenize(bot_token)
        ]
        blank_id = [self._convert_token_to_id(token) for token in self.tokenize(" ")]
        return eos_id, user_id, bot_id, blank_id

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):

        eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()
        if token_ids_1 is None:
            return user_id + blank_id + token_ids_0 + eos_id + bot_id + blank_id
        return (
            user_id + blank_id + token_ids_0 + token_ids_1 + eos_id + bot_id + blank_id
        )

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens=False
    ):
        eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is not None:
            return (
                [1] * len(user_id)
                + ([0] * (len(blank_id) + len(token_ids_0) + len(token_ids_1)))
                + [1] * (len(eos_id) + len(bot_id))
                + [0] * len(blank_id)
            )

        return (
            [1] * len(user_id)
            + ([0] * (len(blank_id) + len(token_ids_0)))
            + [1] * (len(eos_id) + len(bot_id))
            + [0] * len(blank_id)
        )

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)


__all__ = ["SparkTokenizerFast"]
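Correspondingly, a sketch of the fast variant loaded straight from the bundled tokenizer.json (file names as in this commit; an unverified example):

from tokenization_spark_fast import SparkTokenizerFast

tok_fast = SparkTokenizerFast(vocab_file="spm.model", tokenizer_file="tokenizer.json")
# standard __call__ from PreTrainedTokenizerFast; special-token handling comes from tokenizer.json
enc = tok_fast("CCO is the SMILES string for ethanol.")
print(enc["input_ids"])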
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,41 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "5": {
      "content": "<end>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": null,
  "clean_up_tokenization_spaces": false,
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<System> ' + system_message + '<end>' }}{% endif %}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<User> ' + message['content'] + '<end><Bot> ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '<end>' }}{% endif %}{% endfor %}",
  "do_lower_case": false,
  "eos_token": "<end>",
  "model_max_length": 32768,
  "pad_token": "<pad>",
  "sp_model_kwargs": {},
  "split_by_punct": false,
  "tokenizer_class": "SparkTokenizer",
  "auto_map": {
    "AutoTokenizer": ["tokenization_spark.SparkTokenizer", "tokenization_spark_fast.SparkTokenizerFast"]
  },
  "unk_token": "<unk>"
}
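The auto_map entry lets AutoTokenizer resolve the two custom classes when remote code is trusted, and the chat_template renders conversations into the <System>/<User>/<Bot> format. A sketch, with a placeholder repo id:

from transformers import AutoTokenizer

# "your-org/Spark-Chemistry-X1-13B" is a placeholder; substitute the actual Hub repo id
tok = AutoTokenizer.from_pretrained("your-org/Spark-Chemistry-X1-13B", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a chemistry assistant."},
    {"role": "user", "content": "Balance H2 + O2 -> H2O."},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
# per the template above this renders to:
# <System> You are a chemistry assistant.<end><User> Balance H2 + O2 -> H2O.<end><Bot> 
print(prompt)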