licyk committed on
Commit
f8c0d54
·
verified ·
1 Parent(s): f665901

Upload 7 files

Browse files
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch==2.6.0
2
+ transformers==4.56.1
3
+ tokenizers==0.22.0
4
+ huggingface-hub==0.34.4
5
+ safetensors==0.6.2
6
+ accelerate==1.10.1
7
+ sentencepiece==0.2.0
8
+ modelscope==1.30.0
setup.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from setuptools import setup
3
+
4
+ setup(
5
+ name="Spark-Chemistry-X1-13B",
6
+ version="0.1.0",
7
+ py_modules=["tokenization_spark", "tokenization_spark_fast","configuration_spark"],
8
+ install_requires=[
9
+ "transformers>=4.30.0",
10
+ "sentencepiece",
11
+ ],
12
+ python_requires=">=3.8",
13
+ description="Installs tokenization_spark and tokenization_spark_fast as top-level modules.",
14
+ )
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3a2b5663c4f9af0106eea393c61762d569899b321648ca71021119db139c78a
3
+ size 2231916
tokenization_spark.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2020 Microsoft and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization class for model Spark."""
16
+
17
+ import os
18
+ import unicodedata
19
+ from typing import Any, Optional
20
+
21
+ import sentencepiece as sp
22
+
23
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
24
+ from transformers.utils import logging
25
+ from transformers.utils.import_utils import requires
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
32
+
33
+ user_token = "<User>"
34
+ bot_token = "<Bot>"
35
+
36
+
37
+ @requires(backends=("sentencepiece",))
38
+ class SparkTokenizer(PreTrainedTokenizer):
39
+
40
+ vocab_files_names = VOCAB_FILES_NAMES
41
+
42
+ def __init__(
43
+ self,
44
+ vocab_file,
45
+ do_lower_case=False,
46
+ split_by_punct=False,
47
+ bos_token="",
48
+ eos_token="<end>",
49
+ unk_token="[UNK]",
50
+ pad_token="[PAD]",
51
+ sp_model_kwargs: Optional[dict[str, Any]] = None,
52
+ **kwargs,
53
+ ) -> None:
54
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
55
+
56
+ if not os.path.isfile(vocab_file):
57
+ raise ValueError(
58
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
59
+ " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
60
+ )
61
+ self.do_lower_case = do_lower_case
62
+ self.split_by_punct = split_by_punct
63
+ self.vocab_file = vocab_file
64
+ self._tokenizer = SPMTokenizer(
65
+ vocab_file,
66
+ None,
67
+ split_by_punct=split_by_punct,
68
+ sp_model_kwargs=self.sp_model_kwargs,
69
+ )
70
+ unk_token = (
71
+ AddedToken(unk_token, normalized=True, special=True)
72
+ if isinstance(unk_token, str)
73
+ else unk_token
74
+ )
75
+ super().__init__(
76
+ do_lower_case=do_lower_case,
77
+ bos_token=bos_token,
78
+ eos_token=eos_token,
79
+ unk_token=unk_token,
80
+ pad_token=pad_token,
81
+ split_by_punct=split_by_punct,
82
+ sp_model_kwargs=self.sp_model_kwargs,
83
+ **kwargs,
84
+ )
85
+ self._tokenizer.special_tokens = self.all_special_tokens
86
+
87
+ @property
88
+ def vocab_size(self):
89
+ return len(self.vocab)
90
+
91
+ @property
92
+ def vocab(self):
93
+ return self._tokenizer.vocab
94
+
95
+ def get_vocab(self):
96
+ vocab = self.vocab.copy()
97
+ vocab.update(self.get_added_vocab())
98
+ return vocab
99
+
100
+ def _tokenize(self, text: str) -> list[str]:
101
+ """Take as input a string and return a list of strings (tokens) for words/sub-words"""
102
+ if self.do_lower_case:
103
+ text = text.lower()
104
+ return self._tokenizer.tokenize(text)
105
+
106
+ def _convert_token_to_id(self, token):
107
+ """Converts a token (str) in an id using the vocab."""
108
+ return self._tokenizer.spm.PieceToId(token)
109
+
110
+ def _convert_id_to_token(self, index):
111
+ """Converts an index (integer) in a token (str) using the vocab."""
112
+ return (
113
+ self._tokenizer.spm.IdToPiece(index)
114
+ if index < self.vocab_size
115
+ else self.unk_token
116
+ )
117
+
118
+ def convert_tokens_to_string(self, tokens):
119
+ """Converts a sequence of tokens (string) in a single string."""
120
+ return self._tokenizer.decode(tokens)
121
+
122
+ def _get_special_tokens_ids(self):
123
+ eos_id = [
124
+ self._convert_token_to_id(token) for token in self.tokenize(self.eos_token)
125
+ ]
126
+ user_id = [
127
+ self._convert_token_to_id(token) for token in self.tokenize(user_token)
128
+ ]
129
+ bot_id = [
130
+ self._convert_token_to_id(token) for token in self.tokenize(bot_token)
131
+ ]
132
+ blank_id = [self._convert_token_to_id(token) for token in self.tokenize(" ")]
133
+ return eos_id, user_id, bot_id, blank_id
134
+
135
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
136
+
137
+ eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()
138
+ if token_ids_1 is None:
139
+ return user_id + blank_id + token_ids_0 + eos_id + bot_id + blank_id
140
+ return (
141
+ user_id + blank_id + token_ids_0 + token_ids_1 + eos_id + bot_id + blank_id
142
+ )
143
+
144
+ def get_special_tokens_mask(
145
+ self, token_ids_0, token_ids_1=None, already_has_special_tokens=False
146
+ ):
147
+ eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()
148
+
149
+ if already_has_special_tokens:
150
+ return super().get_special_tokens_mask(
151
+ token_ids_0=token_ids_0,
152
+ token_ids_1=token_ids_1,
153
+ already_has_special_tokens=True,
154
+ )
155
+ if token_ids_1 is not None:
156
+ return (
157
+ [1] * len(user_id)
158
+ + ([0] * (len(blank_id) + len(token_ids_0) + len(token_ids_1)))
159
+ + [1] * (len(eos_id) + len(bot_id))
160
+ + [0] * len(blank_id)
161
+ )
162
+
163
+ return (
164
+ [1] * len(user_id)
165
+ + ([0] * (len(blank_id) + len(token_ids_0)))
166
+ + [1] * (len(eos_id) + len(bot_id))
167
+ + [0] * len(blank_id)
168
+ )
169
+
170
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
171
+ add_prefix_space = kwargs.pop("add_prefix_space", False)
172
+ if is_split_into_words or add_prefix_space:
173
+ text = " " + text
174
+ return (text, kwargs)
175
+
176
+ def save_vocabulary(
177
+ self, save_directory: str, filename_prefix: Optional[str] = None
178
+ ) -> tuple[str]:
179
+ return self._tokenizer.save_pretrained(
180
+ save_directory, filename_prefix=filename_prefix
181
+ )
182
+
183
+
184
+ class SPMTokenizer:
185
+
186
+ def __init__(
187
+ self,
188
+ vocab_file,
189
+ special_tokens,
190
+ split_by_punct=False,
191
+ sp_model_kwargs: Optional[dict[str, Any]] = None,
192
+ ):
193
+ self.split_by_punct = split_by_punct
194
+ self.vocab_file = vocab_file
195
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
196
+ spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
197
+ if not os.path.exists(vocab_file):
198
+ raise FileNotFoundError(f"{vocab_file} does not exist!")
199
+ spm.load(vocab_file)
200
+ bpe_vocab_size = spm.GetPieceSize()
201
+
202
+ self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
203
+ self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
204
+
205
+ self.spm = spm
206
+ self.special_tokens = special_tokens
207
+
208
+ def __getstate__(self):
209
+ state = self.__dict__.copy()
210
+ state["spm"] = None
211
+ return state
212
+
213
+ def __setstate__(self, d):
214
+ self.__dict__ = d
215
+
216
+ # for backward compatibility
217
+ if not hasattr(self, "sp_model_kwargs"):
218
+ self.sp_model_kwargs = {}
219
+
220
+ self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
221
+ self.spm.Load(self.vocab_file)
222
+
223
+ def tokenize(self, text):
224
+ return self._encode_as_pieces(text)
225
+
226
+ def convert_ids_to_tokens(self, ids):
227
+ tokens = []
228
+ for i in ids:
229
+ tokens.append(self.ids_to_tokens[i])
230
+ return tokens
231
+
232
+ def decode(self, tokens, start=-1, end=-1, raw_text=None):
233
+ if raw_text is None:
234
+ current_sub_tokens = []
235
+ out_string = ""
236
+ prev_is_special = False
237
+ for token in tokens:
238
+ # make sure that special tokens are not decoded using sentencepiece model
239
+ if token in self.special_tokens:
240
+ if not prev_is_special:
241
+ out_string += " "
242
+ out_string += self.spm.decode_pieces(current_sub_tokens) + token
243
+ prev_is_special = True
244
+ current_sub_tokens = []
245
+ else:
246
+ current_sub_tokens.append(token)
247
+ prev_is_special = False
248
+ out_string += self.spm.decode_pieces(current_sub_tokens)
249
+ return out_string.strip()
250
+ else:
251
+ words = self.split_to_words(raw_text)
252
+ word_tokens = [self.tokenize(w) for w in words]
253
+ token2words = [0] * len(tokens)
254
+ tid = 0
255
+ for i, w in enumerate(word_tokens):
256
+ for k, t in enumerate(w):
257
+ token2words[tid] = i
258
+ tid += 1
259
+ word_start = token2words[start]
260
+ word_end = token2words[end] if end < len(tokens) else len(words)
261
+ text = "".join(words[word_start:word_end])
262
+ return text
263
+
264
+ def part_of_whole_word(self, token, is_bos=False):
265
+ logger.warning_once(
266
+ "The `SparkTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
267
+ )
268
+ if is_bos:
269
+ return True
270
+ if (
271
+ len(token) == 1
272
+ and (
273
+ _is_whitespace(list(token)[0])
274
+ or _is_control(list(token)[0])
275
+ or _is_punctuation(list(token)[0])
276
+ )
277
+ ) or token in self.special_tokens:
278
+ return False
279
+
280
+ word_start = b"\xe2\x96\x81".decode("utf-8")
281
+ return not token.startswith(word_start)
282
+
283
+ def pad(self):
284
+ return "[PAD]"
285
+
286
+ def bos(self):
287
+ return ""
288
+
289
+ def eos(self):
290
+ return "<end>"
291
+
292
+ def unk(self):
293
+ return "[UNK]"
294
+
295
+ def sym(self, id):
296
+ return self.ids_to_tokens[id]
297
+
298
+ def id(self, sym):
299
+ logger.warning_once(
300
+ "The `SparkTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
301
+ )
302
+ return self.vocab[sym] if sym in self.vocab else 1
303
+
304
+ def _encode_as_pieces(self, text):
305
+ text = convert_to_unicode(text)
306
+ if self.split_by_punct:
307
+ words = self._run_split_on_punc(text)
308
+ pieces = [self.spm.encode(w, out_type=str) for w in words]
309
+ return [p for w in pieces for p in w]
310
+ else:
311
+ return self.spm.encode(text, out_type=str)
312
+
313
+ def split_to_words(self, text):
314
+ pieces = self._encode_as_pieces(text)
315
+ word_start = b"\xe2\x96\x81".decode("utf-8")
316
+ words = []
317
+ offset = 0
318
+ prev_end = 0
319
+ for i, p in enumerate(pieces):
320
+ if p.startswith(word_start):
321
+ if offset > prev_end:
322
+ words.append(text[prev_end:offset])
323
+ prev_end = offset
324
+ w = p.replace(word_start, "")
325
+ else:
326
+ w = p
327
+ try:
328
+ s = text.index(w, offset)
329
+ pn = ""
330
+ k = i + 1
331
+ while k < len(pieces):
332
+ pn = pieces[k].replace(word_start, "")
333
+ if len(pn) > 0:
334
+ break
335
+ k += 1
336
+
337
+ if len(pn) > 0 and pn in text[offset:s]:
338
+ offset = offset + 1
339
+ else:
340
+ offset = s + len(w)
341
+ except Exception:
342
+ offset = offset + 1
343
+
344
+ if prev_end < offset:
345
+ words.append(text[prev_end:offset])
346
+
347
+ return words
348
+
349
+ def _run_split_on_punc(self, text):
350
+ """Splits punctuation on a piece of text."""
351
+ chars = list(text)
352
+ i = 0
353
+ start_new_word = True
354
+ output = []
355
+ while i < len(chars):
356
+ char = chars[i]
357
+ if _is_punctuation(char):
358
+ output.append([char])
359
+ start_new_word = True
360
+ else:
361
+ if start_new_word:
362
+ output.append([])
363
+ start_new_word = False
364
+ output[-1].append(char)
365
+ i += 1
366
+
367
+ return ["".join(x) for x in output]
368
+
369
+ def save_pretrained(self, path: str, filename_prefix: Optional[str] = None):
370
+ filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
371
+ if filename_prefix is not None:
372
+ filename = filename_prefix + "-" + filename
373
+ full_path = os.path.join(path, filename)
374
+ with open(full_path, "wb") as fs:
375
+ fs.write(self.spm.serialized_model_proto())
376
+ return (full_path,)
377
+
378
+
379
+ def _is_whitespace(char):
380
+ """Checks whether `chars` is a whitespace character."""
381
+ # \t, \n, and \r are technically control characters but we treat them
382
+ # as whitespace since they are generally considered as such.
383
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
384
+ return True
385
+ cat = unicodedata.category(char)
386
+ if cat == "Zs":
387
+ return True
388
+ return False
389
+
390
+
391
+ def _is_control(char):
392
+ """Checks whether `chars` is a control character."""
393
+ # These are technically control characters but we count them as whitespace
394
+ # characters.
395
+ if char == "\t" or char == "\n" or char == "\r":
396
+ return False
397
+ cat = unicodedata.category(char)
398
+ if cat.startswith("C"):
399
+ return True
400
+ return False
401
+
402
+
403
+ def _is_punctuation(char):
404
+ """Checks whether `chars` is a punctuation character."""
405
+ cp = ord(char)
406
+ # We treat all non-letter/number ASCII as punctuation.
407
+ # Characters such as "^", "$", and "`" are not in the Unicode
408
+ # Punctuation class but we treat them as punctuation anyways, for
409
+ # consistency.
410
+ if (
411
+ (cp >= 33 and cp <= 47)
412
+ or (cp >= 58 and cp <= 64)
413
+ or (cp >= 91 and cp <= 96)
414
+ or (cp >= 123 and cp <= 126)
415
+ ):
416
+ return True
417
+ cat = unicodedata.category(char)
418
+ if cat.startswith("P"):
419
+ return True
420
+ return False
421
+
422
+
423
def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        # Undecodable byte sequences are silently dropped.
        return text.decode("utf-8", "ignore")
    raise TypeError(f"Unsupported string type: {type(text)}")
431
+
432
+
433
+ __all__ = ["SparkTokenizer"]
tokenization_spark_fast.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2020 Microsoft and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Fast Tokenization class for model Spark."""
16
+
17
+ import os
18
+ from shutil import copyfile
19
+ from typing import Optional
20
+
21
+
22
+ from transformers.file_utils import is_sentencepiece_available
23
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
24
+ from transformers.utils import logging
25
+
26
+
27
+ if is_sentencepiece_available():
28
+ from tokenization_spark import SparkTokenizer
29
+ else:
30
+ SparkTokenizer = None
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}
35
+
36
+
37
class SparkTokenizerFast(PreTrainedTokenizerFast):
    """Fast tokenizer for the Spark model, backed by a `tokenizer.json`.

    Mirrors :class:`SparkTokenizer`: encoded sequences are wrapped in the
    chat layout ``<User> ... <end><Bot> `` by
    ``build_inputs_with_special_tokens``.
    """

    # Maps init arguments to on-disk file names.
    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = SparkTokenizer

    # Chat-role marker tokens.
    # Fix: the original code referenced bare names `user_token`/`bot_token`,
    # which are module-level constants only in tokenization_spark.py; in this
    # module they were undefined, so `_get_special_tokens_ids` raised
    # NameError at runtime. Defining them here (same values as the slow
    # tokenizer's module) keeps behavior identical and makes this module
    # self-contained.
    user_token = "<User>"
    bot_token = "<Bot>"

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=False,
        split_by_punct=False,
        bos_token="",
        eos_token="<end>",
        unk_token="[UNK]",
        pad_token="[PAD]",
        **kwargs,
    ) -> None:
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            split_by_punct=split_by_punct,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.split_by_punct = split_by_punct
        # Path to the SentencePiece model; needed by save_vocabulary.
        self.vocab_file = vocab_file

    def _get_special_tokens_ids(self):
        """Return id lists for ``<end>``, ``<User>``, ``<Bot>`` and a space.

        Each marker is tokenized and converted through the vocabulary on
        every call; results are lists of ids (normally of length 1 each).
        """
        eos_id = [
            self._convert_token_to_id(token) for token in self.tokenize(self.eos_token)
        ]
        user_id = [
            self._convert_token_to_id(token)
            for token in self.tokenize(self.user_token)
        ]
        bot_id = [
            self._convert_token_to_id(token)
            for token in self.tokenize(self.bot_token)
        ]
        blank_id = [self._convert_token_to_id(token) for token in self.tokenize(" ")]
        return eos_id, user_id, bot_id, blank_id

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Wrap token ids into the chat layout ``<User> ids <end><Bot> ``.

        A second sequence, when given, is simply concatenated after the first
        before the trailing markers.
        """
        eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()
        if token_ids_1 is None:
            return user_id + blank_id + token_ids_0 + eos_id + bot_id + blank_id
        return (
            user_id + blank_id + token_ids_0 + token_ids_1 + eos_id + bot_id + blank_id
        )

    def get_special_tokens_mask(
        self, token_ids_0, token_ids_1=None, already_has_special_tokens=False
    ):
        """Return 1 for marker positions and 0 for content positions.

        Matches the layout of ``build_inputs_with_special_tokens``; the blank
        after ``<Bot>`` is marked 0.
        """
        eos_id, user_id, bot_id, blank_id = self._get_special_tokens_ids()

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is not None:
            return (
                [1] * len(user_id)
                + ([0] * (len(blank_id) + len(token_ids_0) + len(token_ids_1)))
                + [1] * (len(eos_id) + len(bot_id))
                + [0] * len(blank_id)
            )

        return (
            [1] * len(user_id)
            + ([0] * (len(blank_id) + len(token_ids_0)))
            + [1] * (len(eos_id) + len(bot_id))
            + [0] * len(blank_id)
        )

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> tuple[str]:
        """Copy the SentencePiece model into *save_directory*.

        Returns a one-element tuple with the written path, or None (after
        logging an error) when *save_directory* is not a directory. Raises
        ValueError when the fast tokenizer was loaded without the original
        `spm.model`, in which case the slow vocabulary cannot be
        reconstructed.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        # Avoid copying the file onto itself when saving in place.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
140
+
141
+
142
+ __all__ = ["SparkTokenizerFast"]
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "3": {
12
+ "content": "<unk>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "5": {
20
+ "content": "<end>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "bos_token": null,
29
+ "clean_up_tokenization_spaces": false,
30
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<System> ' + system_message + '<end>' }}{% endif %}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<User> ' + message['content'] + '<end><Bot> ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + '<end>' }}{% endif %}{% endfor %}",
31
+ "do_lower_case": false,
32
+ "eos_token": "<end>",
33
+ "model_max_length": 32768,
34
+ "pad_token": "<pad>",
35
+ "sp_model_kwargs": {},
36
+ "split_by_punct": false,
37
+ "tokenizer_class": "SparkTokenizer",
38
+ "auto_map": {
39
+ "AutoTokenizer":["tokenization_spark.SparkTokenizer", "tokenization_spark_fast.SparkTokenizerFast"]},
40
+ "unk_token": "<unk>"
41
+ }