lukemerrick committed on
Commit
4e1600b
·
verified ·
1 Parent(s): 04a36b9

Upload folder using huggingface_hub

Browse files
artifacts/luxical-one/config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "auto_map": {
4
+ "AutoConfig": "luxical_hf_wrapper.LuxicalOneConfig",
5
+ "AutoModel": "luxical_hf_wrapper.LuxicalOneModel"
6
+ },
7
+ "embedding_dim": 192,
8
+ "max_ngram_length": 5,
9
+ "model_type": "luxical-one",
10
+ "transformers_version": "4.51.3"
11
+ }
artifacts/luxical-one/luxical_hf_wrapper.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections import OrderedDict
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Sequence
7
+
8
+ import numpy as np
9
+ import pyarrow as pa
10
+ import torch
11
+ from torch import Tensor
12
+ from transformers import PreTrainedModel, PretrainedConfig
13
+ from transformers.modeling_outputs import ModelOutput
14
+
15
+ from luxical.embedder import Embedder, _pack_int_dict, _unpack_int_dict
16
+ from luxical.sparse_to_dense_neural_nets import SparseToDenseEmbedder
17
+ from luxical.tokenization import ArrowTokenizer
18
+
19
+ DEFAULT_EMBEDDER_FILENAME = "luxical_one_embedder.npz" # deprecated; no longer used
20
+
21
+
22
class LuxicalOneConfig(PretrainedConfig):
    """Hugging Face configuration for the Luxical wrapper model.

    Generic for any Luxical `Embedder` serialized in format version 1.
    Carries two Luxical-specific fields, ``max_ngram_length`` and
    ``embedding_dim``; every other keyword is forwarded unchanged to
    ``PretrainedConfig``.
    """

    model_type = "luxical-one"

    def __init__(
        self,
        *,
        max_ngram_length: int | None = None,
        embedding_dim: int | None = None,
        **kwargs,
    ) -> None:
        """Store the Luxical fields after the base class consumes ``kwargs``.

        Args:
            max_ngram_length: Value stored as ``self.max_ngram_length``.
            embedding_dim: Value stored as ``self.embedding_dim``.
            **kwargs: Passed through to ``PretrainedConfig.__init__``.
        """
        # Base initialisation runs first, exactly as in the original ordering.
        super().__init__(**kwargs)
        self.max_ngram_length, self.embedding_dim = max_ngram_length, embedding_dim
40
+
41
+
42
@dataclass
class LuxicalOneModelOutput(ModelOutput):
    """Output container returned by ``LuxicalOneModel.forward``."""

    # Tensor that ``forward`` builds from the embedder's NumPy result.
    embeddings: Tensor
45
+
46
+
47
class LuxicalOneModel(PreTrainedModel):
    """Huggingface `PreTrainedModel` wrapper around a Luxical `Embedder`.

    Not tied to a specific checkpoint; reconstructs the `Embedder` from
    serialized state stored in the weights. Safetensors-only export.

    The wrapped embedder lives in ``self._embedder``. The model has no torch
    parameters of its own here: ``forward`` delegates to the embedder and
    wraps its NumPy result in a tensor.
    """

    config_class = LuxicalOneConfig

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):  # type: ignore[override]
        """Load model and reconstruct the Luxical embedder from safetensors.

        Keeps logic minimal and safetensors-only to avoid legacy branches.

        Loading stays best-effort: every failure while locating or decoding
        ``model.safetensors`` is logged instead of raised, and the model
        produced by the base ``from_pretrained`` is returned regardless.

        Args:
            pretrained_model_name_or_path: Hub repo id or local directory.
            *model_args: Forwarded to ``PreTrainedModel.from_pretrained``.
            **kwargs: Forwarded to ``PreTrainedModel.from_pretrained``; the
                ``revision``, ``cache_dir``, ``force_download``, ``proxies``,
                ``token`` and ``local_files_only`` entries are also reused to
                resolve the weights file.

        Returns:
            The loaded model, with ``_embedder`` set when decoding succeeded.
        """
        import logging

        log = logging.getLogger(__name__)
        model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        try:
            from transformers.utils import SAFE_WEIGHTS_NAME, cached_file
            from safetensors.torch import load_file as load_safetensors  # type: ignore
        except Exception as err:
            log.warning(
                "Safetensors loading utilities are unavailable (%s); returning the "
                "model without re-decoding the Luxical embedder.",
                err,
            )
            return model

        weight_path = None
        try:
            weight_path = cached_file(
                pretrained_model_name_or_path,
                SAFE_WEIGHTS_NAME,
                revision=kwargs.get("revision"),
                cache_dir=kwargs.get("cache_dir"),
                force_download=kwargs.get("force_download", False),
                proxies=kwargs.get("proxies"),
                token=kwargs.get("token"),
                local_files_only=kwargs.get("local_files_only", False),
            )
        except Exception as err:
            # Not fatal: the plain local-directory lookup below is the fallback.
            log.debug("cached_file lookup for %s failed: %s", pretrained_model_name_or_path, err)
        if weight_path is None:
            cand = Path(pretrained_model_name_or_path) / "model.safetensors"
            if cand.exists():
                weight_path = str(cand)

        if weight_path is None:
            if getattr(model, "_embedder", None) is None:
                log.warning(
                    "No model.safetensors found for %s; the Luxical embedder was not loaded.",
                    pretrained_model_name_or_path,
                )
            return model

        try:
            model._embedder = _embedder_from_state_dict(load_safetensors(weight_path))
            model._embedder_path = None
        except Exception:
            log.warning(
                "Failed to decode a Luxical embedder from %s.",
                weight_path,
                exc_info=True,
            )
        return model

    def __init__(
        self,
        config: LuxicalOneConfig,
        *,
        embedder: Embedder | None = None,
        embedder_path: str | Path | None = None,
    ) -> None:
        """Create the wrapper, optionally around an existing embedder.

        Args:
            config: Model configuration.
            embedder: Ready-to-use embedder, or ``None`` to attach one later.
            embedder_path: Optional path, stored resolved in
                ``self._embedder_path``; nothing in this class reads it back.
        """
        # Both attributes are assigned before the base initializer runs.
        self._embedder: Embedder | None = embedder
        self._embedder_path: Path | None = (
            Path(embedder_path).resolve() if embedder_path is not None else None
        )
        super().__init__(config)

    def post_init(self) -> None:
        """Run the base ``post_init`` and mirror embedder sizes into the config."""
        super().post_init()
        if self._embedder is not None:
            self.config.embedding_dim = self._embedder.embedding_dim
            self.config.max_ngram_length = self._embedder.max_ngram_length

    def forward(
        self,
        input_texts: Sequence[str] | pa.StringArray | None = None,
        *,
        batch_size: int = 4096,
        progress_bars: bool = False,
    ) -> LuxicalOneModelOutput:
        """Embed ``input_texts`` with the wrapped embedder.

        Args:
            input_texts: Texts to embed; required despite the ``None`` default.
            batch_size: Passed through to the embedder call.
            progress_bars: Passed through to the embedder call.

        Returns:
            ``LuxicalOneModelOutput`` whose ``embeddings`` tensor is built from
            the embedder's NumPy output via ``torch.from_numpy``.

        Raises:
            ValueError: If ``input_texts`` is ``None``.
            RuntimeError: If no embedder has been attached.
        """
        if input_texts is None:
            msg = "input_texts must be provided"
            raise ValueError(msg)
        embedder = self._ensure_embedder_loaded()
        embeddings_np = embedder(
            texts=input_texts,
            batch_size=batch_size,
            progress_bars=progress_bars,
        )
        embeddings = torch.from_numpy(embeddings_np)
        return LuxicalOneModelOutput(embeddings=embeddings)

    def save_pretrained(
        self,
        save_directory: str | Path,
        *args,
        **kwargs,
    ) -> tuple[OrderedDict[str, Tensor], LuxicalOneConfig]:
        """Write weights, this module's source and the config to a directory.

        Args:
            save_directory: Target directory; created if missing.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.

        Returns:
            The serialized state dict and the (updated) config.

        Raises:
            RuntimeError: If no embedder has been attached.
        """
        save_path = Path(save_directory)
        save_path.mkdir(parents=True, exist_ok=True)
        # Prepare config with auto_map so AutoModel can import this module when
        # loading from a Hub/local repo with trust_remote_code=True.
        self.config.auto_map = {
            "AutoConfig": "luxical_hf_wrapper.LuxicalOneConfig",
            "AutoModel": "luxical_hf_wrapper.LuxicalOneModel",
        }
        # Persist the embedder inside a single Safetensors file.
        embedder = self._ensure_embedder_loaded()
        state_dict = _embedder_to_state_dict(embedder)
        from safetensors.torch import save_file as save_safetensors  # type: ignore

        save_safetensors(state_dict, str(save_path / "model.safetensors"))
        # Copy this module alongside to support remote code loading.
        import inspect
        import shutil

        module_src = Path(inspect.getsourcefile(LuxicalOneModel) or __file__).resolve()
        shutil.copyfile(module_src, save_path / "luxical_hf_wrapper.py")
        # Save config.json last.
        self.config.save_pretrained(save_path)
        return state_dict, self.config

    def load_state_dict(
        self, state_dict: OrderedDict[str, Tensor], strict: bool = True
    ):  # type: ignore[override]
        """Rebuild the embedder from ``state_dict`` instead of loading tensors.

        Args:
            state_dict: Mapping expected to hold the ``embedder.*`` entries.
            strict: When true, a missing entry raises; otherwise all supplied
                keys are reported back as unexpected.

        Returns:
            ``_IncompatibleKeys`` with empty lists on success, or with every
            supplied key listed as unexpected when decoding failed and
            ``strict`` is false.

        Raises:
            NotImplementedError: If a required entry is absent and ``strict``
                is true (the triggering ``KeyError`` is chained as the cause),
                or if the serialized version is unsupported.
        """
        # Interpret the state dict as a serialized Luxical Embedder and rebuild it.
        try:
            embedder = _embedder_from_state_dict(state_dict)
        except KeyError as err:
            unexpected = list(state_dict.keys())
            if strict:
                raise NotImplementedError(
                    "LuxicalOneModel expected serialized embedder tensors; "
                    f"unexpected keys: {unexpected}"
                ) from err
            return torch.nn.modules.module._IncompatibleKeys([], unexpected)
        self._embedder = embedder
        self._embedder_path = None
        # Keep the config in sync with the freshly decoded embedder.
        self.config.embedding_dim = embedder.embedding_dim
        self.config.max_ngram_length = embedder.max_ngram_length
        return torch.nn.modules.module._IncompatibleKeys([], [])

    def get_input_embeddings(self) -> torch.nn.Module:
        """Always raise: this model has no token-embedding module to return."""
        msg = "LuxicalOneModel does not expose token embeddings."
        raise NotImplementedError(msg)

    def set_input_embeddings(self, value: torch.nn.Module) -> None:
        """Always raise: token embeddings cannot be replaced on this model."""
        msg = "LuxicalOneModel does not support replacing token embeddings."
        raise NotImplementedError(msg)

    def resize_token_embeddings(self, *args, **kwargs) -> None:
        """Always raise: there is no token-embedding table to resize."""
        msg = "LuxicalOneModel does not use token embeddings."
        raise NotImplementedError(msg)

    def _ensure_embedder_loaded(self) -> Embedder:
        """Return the attached embedder or raise ``RuntimeError`` if absent."""
        if self._embedder is not None:
            return self._embedder
        raise RuntimeError(
            "Luxical embedder is not initialized. Load this model via "
            "AutoModel/LuxicalOneModel.from_pretrained so weights can be "
            "decoded into an Embedder."
        )
211
+
212
+ # No legacy file-based loader; all state lives in model.safetensors.
213
+
214
+
215
def export_embedder_to_huggingface_directory(
    embedder: Embedder,
    save_directory: str | Path,
    *,
    config_overrides: dict[str, object] | None = None,
) -> Path:
    """Wrap ``embedder`` in a ``LuxicalOneModel`` and save it to a directory.

    Args:
        embedder: Embedder to export; its ``max_ngram_length`` and
            ``embedding_dim`` seed the config.
        save_directory: Destination handed to ``save_pretrained``.
        config_overrides: Extra keyword arguments for ``LuxicalOneConfig``.

    Returns:
        ``save_directory`` as a ``Path``.
    """
    target_dir = Path(save_directory)
    extra_config = config_overrides if config_overrides else {}
    hf_config = LuxicalOneConfig(
        max_ngram_length=embedder.max_ngram_length,
        embedding_dim=embedder.embedding_dim,
        **extra_config,
    )
    hf_config.name_or_path = str(target_dir.resolve())
    LuxicalOneModel(config=hf_config, embedder=embedder).save_pretrained(target_dir)
    return target_dir
231
+
232
+
233
+ # No global Auto* registration; exports include `auto_map` in config.json.
234
+
235
+
236
def _embedder_to_state_dict(embedder: Embedder) -> OrderedDict[str, Tensor]:
    """Flatten an ``Embedder`` into an ordered mapping of named tensors.

    Entries, in insertion order: format version (``1``), tokenizer string as
    UTF-8 bytes, recognized n-grams, hash-map keys and values, IDF values,
    layer count, then one ``embedder.nn_layer_{i}`` tensor per dense layer.
    """

    def _as_tensor(array, dtype) -> Tensor:
        # ``copy=False`` avoids a copy when the dtype already matches.
        return torch.from_numpy(array.astype(dtype, copy=False))

    tensors: "OrderedDict[str, Tensor]" = OrderedDict()
    tensors["embedder.version"] = torch.tensor([1], dtype=torch.long)

    # ``np.frombuffer`` over ``bytes`` yields a read-only view, hence the copy
    # before handing the buffer to torch.
    tokenizer_json = embedder.tokenizer.to_str().encode("utf-8")
    tensors["embedder.tokenizer"] = torch.from_numpy(
        np.frombuffer(tokenizer_json, dtype=np.uint8).copy()
    )

    tensors["embedder.recognized_ngrams"] = _as_tensor(embedder.recognized_ngrams, np.int64)

    hash_keys, hash_vals = _unpack_int_dict(embedder.ngram_hash_to_ngram_idx)
    tensors["embedder.ngram_keys"] = _as_tensor(hash_keys, np.int64)
    tensors["embedder.ngram_vals"] = _as_tensor(hash_vals, np.int64)

    tensors["embedder.idf_values"] = _as_tensor(embedder.idf_values, np.float32)

    dense_layers = embedder.bow_to_dense_embedder.layers
    tensors["embedder.num_layers"] = torch.tensor([len(dense_layers)], dtype=torch.long)
    tensors.update(
        (f"embedder.nn_layer_{idx}", _as_tensor(weights, np.float32))
        for idx, weights in enumerate(dense_layers)
    )
    return tensors
257
+
258
+
259
def _embedder_from_state_dict(state_dict: OrderedDict[str, Tensor]) -> Embedder:
    """Rebuild an ``Embedder`` from the tensors of a serialized state dict.

    Args:
        state_dict: Mapping holding the ``embedder.*`` entries.

    Returns:
        The reconstructed ``Embedder``.

    Raises:
        KeyError: If a required entry is absent.
        NotImplementedError: If ``embedder.version`` is not ``1``.
    """
    version = int(state_dict["embedder.version"][0].item())
    if version != 1:
        raise NotImplementedError(f"Unsupported embedder version: {version}")

    def _to_numpy(key: str, dtype):
        return state_dict[key].cpu().numpy().astype(dtype, copy=False)

    # ``tobytes`` yields the same bytes as ``bytes(arr.tolist())`` without
    # materialising one Python int per byte of the tokenizer JSON.
    tok_bytes = _to_numpy("embedder.tokenizer", np.uint8).tobytes()
    tokenizer = ArrowTokenizer(tok_bytes.decode("utf-8"))
    recognized_ngrams = _to_numpy("embedder.recognized_ngrams", np.int64)
    keys = _to_numpy("embedder.ngram_keys", np.int64)
    vals = _to_numpy("embedder.ngram_vals", np.int64)
    ngram_map = _pack_int_dict(keys, vals)
    idf_values = _to_numpy("embedder.idf_values", np.float32)
    num_layers = int(state_dict["embedder.num_layers"][0].item())
    layers = [_to_numpy(f"embedder.nn_layer_{i}", np.float32) for i in range(num_layers)]
    return Embedder(
        tokenizer=tokenizer,
        recognized_ngrams=recognized_ngrams,
        ngram_hash_to_ngram_idx=ngram_map,
        idf_values=idf_values,
        bow_to_dense_embedder=SparseToDenseEmbedder(layers=layers),
    )
283
+
284
+
285
def _parse_cli_args() -> tuple[str, dict[str, object]]:
    """Parse ``sys.argv`` for the ``export`` and ``verify`` sub-commands.

    Returns:
        The chosen sub-command name and all parsed options as a dict (the
        dict also contains the ``cmd`` entry).
    """
    import argparse

    default_checkpoint = str(Path("/tmp/luxical_one_rc4.npz"))
    default_hf_dir = str(Path(__file__).resolve().parent / "artifacts" / "luxical_one_hf")
    checkpoint_help = "Path to Luxical embedder .npz checkpoint"

    root = argparse.ArgumentParser(
        description="Luxical One Huggingface wrapper: export and verify utilities.",
    )
    commands = root.add_subparsers(dest="cmd", required=True)

    export_cmd = commands.add_parser(
        "export",
        help="Export a HF-formatted directory from a Luxical embedder .npz checkpoint",
    )
    export_cmd.add_argument(
        "--checkpoint", type=str, default=default_checkpoint, help=checkpoint_help
    )
    export_cmd.add_argument(
        "--output-dir",
        type=str,
        default=default_hf_dir,
        help="Directory to write the Huggingface-formatted model",
    )

    verify_cmd = commands.add_parser(
        "verify",
        help="Verify HF-loaded model matches native Embedder outputs",
    )
    verify_cmd.add_argument(
        "--checkpoint", type=str, default=default_checkpoint, help=checkpoint_help
    )
    verify_cmd.add_argument(
        "--export-dir",
        type=str,
        default=default_hf_dir,
        help="HF directory to create/use for verification",
    )
    verify_cmd.add_argument(
        "--batch-size", type=int, default=3, help="Batch size for verification"
    )

    namespace = root.parse_args()
    return namespace.cmd, vars(namespace)
330
+
331
+
332
def _sample_texts() -> list[str]:
    """Return a fresh list of the three fixed sentences used for verification."""
    samples = (
        "Luxical embeddings make tf-idf sparkle.",
        "This sentence tests the Huggingface wrapper path.",
        "Short.",
    )
    return list(samples)
338
+
339
+
340
def _cmd_export(checkpoint: str, output_dir: str) -> None:
    """Load a checkpoint and export it as a Huggingface-formatted directory.

    Args:
        checkpoint: Path to the embedder checkpoint; ``~`` is expanded.
        output_dir: Destination directory; created if missing.

    Raises:
        FileNotFoundError: If the checkpoint does not exist.
    """
    source = Path(checkpoint).expanduser().resolve()
    if source.exists():
        destination = Path(output_dir).expanduser().resolve()
        destination.mkdir(parents=True, exist_ok=True)
        export_embedder_to_huggingface_directory(Embedder.load(source), destination)
        print(f"Huggingface directory written to {destination}")
        return
    # Nothing is created on disk when the checkpoint is missing.
    raise FileNotFoundError(
        f"Checkpoint not found at {source}. Download with: aws s3 cp "
        "s3://datology-external-artifacts/luxical/luxical_one_rc4.npz "
        "/tmp/luxical_one_rc4.npz"
    )
353
+
354
+
355
def _cmd_verify(checkpoint: str, export_dir: str, batch_size: int) -> None:
    """Export a checkpoint, reload it through ``AutoModel`` and compare outputs.

    Args:
        checkpoint: Path to the embedder checkpoint; ``~`` is expanded.
        export_dir: Directory used for the export; created if missing.
        batch_size: Batch size used for both the reference and reloaded runs.

    Raises:
        FileNotFoundError: If the checkpoint does not exist.
        AssertionError: If the two outputs differ beyond
            ``rtol=1e-5, atol=1e-6``.
    """
    source = Path(checkpoint).expanduser().resolve()
    if not source.exists():
        message = (
            f"Checkpoint not found at {source}. Download with: aws s3 cp "
            "s3://datology-external-artifacts/luxical/luxical_one_rc4.npz "
            "/tmp/luxical_one_rc4.npz"
        )
        raise FileNotFoundError(message)
    hf_dir = Path(export_dir).expanduser().resolve()
    hf_dir.mkdir(parents=True, exist_ok=True)

    # Reference output straight from the native embedder.
    texts = _sample_texts()
    native = Embedder.load(source)
    expected = native(texts, batch_size=batch_size)

    export_embedder_to_huggingface_directory(native, hf_dir)
    # Load using AutoModel so this mirrors user experience, with remote code.
    from transformers import AutoModel  # local import to keep top-level light

    reloaded = AutoModel.from_pretrained(hf_dir, trust_remote_code=True)
    reloaded.eval()
    with torch.inference_mode():
        result = reloaded(texts, batch_size=batch_size, progress_bars=False)
        actual = result.embeddings.cpu().numpy()
    import numpy as np

    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=1e-6)
    print("Verification succeeded: Huggingface model matches embedder output.")
385
+
386
+
387
if __name__ == "__main__":
    # Script entry point: parse the CLI, then dispatch on the sub-command.
    cmd, kv = _parse_cli_args()
    if cmd not in ("export", "verify"):
        raise SystemExit(f"Unknown command: {cmd}")
    if cmd == "export":
        _cmd_export(
            checkpoint=str(kv["checkpoint"]),
            output_dir=str(kv["output_dir"]),
        )
    else:
        _cmd_verify(
            checkpoint=str(kv["checkpoint"]),
            export_dir=str(kv["export_dir"]),
            batch_size=int(kv["batch_size"]),
        )
artifacts/luxical-one/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ec24f66cba56eb308214cd5078fffa37bf3316c70ca6c3f455d4ab60d7d2a95
3
+ size 929754793
config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "auto_map": {
4
+ "AutoConfig": "luxical_hf_wrapper.LuxicalOneConfig",
5
+ "AutoModel": "luxical_hf_wrapper.LuxicalOneModel"
6
+ },
7
+ "embedding_dim": 192,
8
+ "max_ngram_length": 5,
9
+ "model_type": "luxical-one",
10
+ "transformers_version": "4.51.3"
11
+ }
luxical_hf_wrapper.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections import OrderedDict
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Sequence
7
+
8
+ import numpy as np
9
+ import pyarrow as pa
10
+ import torch
11
+ from torch import Tensor
12
+ from transformers import PreTrainedModel, PretrainedConfig
13
+ from transformers.modeling_outputs import ModelOutput
14
+
15
+ from luxical.embedder import Embedder, _pack_int_dict, _unpack_int_dict
16
+ from luxical.sparse_to_dense_neural_nets import SparseToDenseEmbedder
17
+ from luxical.tokenization import ArrowTokenizer
18
+
19
+ DEFAULT_EMBEDDER_FILENAME = "luxical_one_embedder.npz" # deprecated; no longer used
20
+
21
+
22
class LuxicalOneConfig(PretrainedConfig):
    """Hugging Face configuration for the Luxical wrapper model.

    Generic for any Luxical `Embedder` serialized in format version 1.
    Carries two Luxical-specific fields, ``max_ngram_length`` and
    ``embedding_dim``; every other keyword is forwarded unchanged to
    ``PretrainedConfig``.
    """

    model_type = "luxical-one"

    def __init__(
        self,
        *,
        max_ngram_length: int | None = None,
        embedding_dim: int | None = None,
        **kwargs,
    ) -> None:
        """Store the Luxical fields after the base class consumes ``kwargs``.

        Args:
            max_ngram_length: Value stored as ``self.max_ngram_length``.
            embedding_dim: Value stored as ``self.embedding_dim``.
            **kwargs: Passed through to ``PretrainedConfig.__init__``.
        """
        # Base initialisation runs first, exactly as in the original ordering.
        super().__init__(**kwargs)
        self.max_ngram_length, self.embedding_dim = max_ngram_length, embedding_dim
40
+
41
+
42
@dataclass
class LuxicalOneModelOutput(ModelOutput):
    """Output container returned by ``LuxicalOneModel.forward``."""

    # Tensor that ``forward`` builds from the embedder's NumPy result.
    embeddings: Tensor
45
+
46
+
47
class LuxicalOneModel(PreTrainedModel):
    """Huggingface `PreTrainedModel` wrapper around a Luxical `Embedder`.

    Not tied to a specific checkpoint; reconstructs the `Embedder` from
    serialized state stored in the weights. Safetensors-only export.

    The wrapped embedder lives in ``self._embedder``. The model has no torch
    parameters of its own here: ``forward`` delegates to the embedder and
    wraps its NumPy result in a tensor.
    """

    config_class = LuxicalOneConfig

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):  # type: ignore[override]
        """Load model and reconstruct the Luxical embedder from safetensors.

        Keeps logic minimal and safetensors-only to avoid legacy branches.

        Loading stays best-effort: every failure while locating or decoding
        ``model.safetensors`` is logged instead of raised, and the model
        produced by the base ``from_pretrained`` is returned regardless.

        Args:
            pretrained_model_name_or_path: Hub repo id or local directory.
            *model_args: Forwarded to ``PreTrainedModel.from_pretrained``.
            **kwargs: Forwarded to ``PreTrainedModel.from_pretrained``; the
                ``revision``, ``cache_dir``, ``force_download``, ``proxies``,
                ``token`` and ``local_files_only`` entries are also reused to
                resolve the weights file.

        Returns:
            The loaded model, with ``_embedder`` set when decoding succeeded.
        """
        import logging

        log = logging.getLogger(__name__)
        model = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        try:
            from transformers.utils import SAFE_WEIGHTS_NAME, cached_file
            from safetensors.torch import load_file as load_safetensors  # type: ignore
        except Exception as err:
            log.warning(
                "Safetensors loading utilities are unavailable (%s); returning the "
                "model without re-decoding the Luxical embedder.",
                err,
            )
            return model

        weight_path = None
        try:
            weight_path = cached_file(
                pretrained_model_name_or_path,
                SAFE_WEIGHTS_NAME,
                revision=kwargs.get("revision"),
                cache_dir=kwargs.get("cache_dir"),
                force_download=kwargs.get("force_download", False),
                proxies=kwargs.get("proxies"),
                token=kwargs.get("token"),
                local_files_only=kwargs.get("local_files_only", False),
            )
        except Exception as err:
            # Not fatal: the plain local-directory lookup below is the fallback.
            log.debug("cached_file lookup for %s failed: %s", pretrained_model_name_or_path, err)
        if weight_path is None:
            cand = Path(pretrained_model_name_or_path) / "model.safetensors"
            if cand.exists():
                weight_path = str(cand)

        if weight_path is None:
            if getattr(model, "_embedder", None) is None:
                log.warning(
                    "No model.safetensors found for %s; the Luxical embedder was not loaded.",
                    pretrained_model_name_or_path,
                )
            return model

        try:
            model._embedder = _embedder_from_state_dict(load_safetensors(weight_path))
            model._embedder_path = None
        except Exception:
            log.warning(
                "Failed to decode a Luxical embedder from %s.",
                weight_path,
                exc_info=True,
            )
        return model

    def __init__(
        self,
        config: LuxicalOneConfig,
        *,
        embedder: Embedder | None = None,
        embedder_path: str | Path | None = None,
    ) -> None:
        """Create the wrapper, optionally around an existing embedder.

        Args:
            config: Model configuration.
            embedder: Ready-to-use embedder, or ``None`` to attach one later.
            embedder_path: Optional path, stored resolved in
                ``self._embedder_path``; nothing in this class reads it back.
        """
        # Both attributes are assigned before the base initializer runs.
        self._embedder: Embedder | None = embedder
        self._embedder_path: Path | None = (
            Path(embedder_path).resolve() if embedder_path is not None else None
        )
        super().__init__(config)

    def post_init(self) -> None:
        """Run the base ``post_init`` and mirror embedder sizes into the config."""
        super().post_init()
        if self._embedder is not None:
            self.config.embedding_dim = self._embedder.embedding_dim
            self.config.max_ngram_length = self._embedder.max_ngram_length

    def forward(
        self,
        input_texts: Sequence[str] | pa.StringArray | None = None,
        *,
        batch_size: int = 4096,
        progress_bars: bool = False,
    ) -> LuxicalOneModelOutput:
        """Embed ``input_texts`` with the wrapped embedder.

        Args:
            input_texts: Texts to embed; required despite the ``None`` default.
            batch_size: Passed through to the embedder call.
            progress_bars: Passed through to the embedder call.

        Returns:
            ``LuxicalOneModelOutput`` whose ``embeddings`` tensor is built from
            the embedder's NumPy output via ``torch.from_numpy``.

        Raises:
            ValueError: If ``input_texts`` is ``None``.
            RuntimeError: If no embedder has been attached.
        """
        if input_texts is None:
            msg = "input_texts must be provided"
            raise ValueError(msg)
        embedder = self._ensure_embedder_loaded()
        embeddings_np = embedder(
            texts=input_texts,
            batch_size=batch_size,
            progress_bars=progress_bars,
        )
        embeddings = torch.from_numpy(embeddings_np)
        return LuxicalOneModelOutput(embeddings=embeddings)

    def save_pretrained(
        self,
        save_directory: str | Path,
        *args,
        **kwargs,
    ) -> tuple[OrderedDict[str, Tensor], LuxicalOneConfig]:
        """Write weights, this module's source and the config to a directory.

        Args:
            save_directory: Target directory; created if missing.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.

        Returns:
            The serialized state dict and the (updated) config.

        Raises:
            RuntimeError: If no embedder has been attached.
        """
        save_path = Path(save_directory)
        save_path.mkdir(parents=True, exist_ok=True)
        # Prepare config with auto_map so AutoModel can import this module when
        # loading from a Hub/local repo with trust_remote_code=True.
        self.config.auto_map = {
            "AutoConfig": "luxical_hf_wrapper.LuxicalOneConfig",
            "AutoModel": "luxical_hf_wrapper.LuxicalOneModel",
        }
        # Persist the embedder inside a single Safetensors file.
        embedder = self._ensure_embedder_loaded()
        state_dict = _embedder_to_state_dict(embedder)
        from safetensors.torch import save_file as save_safetensors  # type: ignore

        save_safetensors(state_dict, str(save_path / "model.safetensors"))
        # Copy this module alongside to support remote code loading.
        import inspect
        import shutil

        module_src = Path(inspect.getsourcefile(LuxicalOneModel) or __file__).resolve()
        shutil.copyfile(module_src, save_path / "luxical_hf_wrapper.py")
        # Save config.json last.
        self.config.save_pretrained(save_path)
        return state_dict, self.config

    def load_state_dict(
        self, state_dict: OrderedDict[str, Tensor], strict: bool = True
    ):  # type: ignore[override]
        """Rebuild the embedder from ``state_dict`` instead of loading tensors.

        Args:
            state_dict: Mapping expected to hold the ``embedder.*`` entries.
            strict: When true, a missing entry raises; otherwise all supplied
                keys are reported back as unexpected.

        Returns:
            ``_IncompatibleKeys`` with empty lists on success, or with every
            supplied key listed as unexpected when decoding failed and
            ``strict`` is false.

        Raises:
            NotImplementedError: If a required entry is absent and ``strict``
                is true (the triggering ``KeyError`` is chained as the cause),
                or if the serialized version is unsupported.
        """
        # Interpret the state dict as a serialized Luxical Embedder and rebuild it.
        try:
            embedder = _embedder_from_state_dict(state_dict)
        except KeyError as err:
            unexpected = list(state_dict.keys())
            if strict:
                raise NotImplementedError(
                    "LuxicalOneModel expected serialized embedder tensors; "
                    f"unexpected keys: {unexpected}"
                ) from err
            return torch.nn.modules.module._IncompatibleKeys([], unexpected)
        self._embedder = embedder
        self._embedder_path = None
        # Keep the config in sync with the freshly decoded embedder.
        self.config.embedding_dim = embedder.embedding_dim
        self.config.max_ngram_length = embedder.max_ngram_length
        return torch.nn.modules.module._IncompatibleKeys([], [])

    def get_input_embeddings(self) -> torch.nn.Module:
        """Always raise: this model has no token-embedding module to return."""
        msg = "LuxicalOneModel does not expose token embeddings."
        raise NotImplementedError(msg)

    def set_input_embeddings(self, value: torch.nn.Module) -> None:
        """Always raise: token embeddings cannot be replaced on this model."""
        msg = "LuxicalOneModel does not support replacing token embeddings."
        raise NotImplementedError(msg)

    def resize_token_embeddings(self, *args, **kwargs) -> None:
        """Always raise: there is no token-embedding table to resize."""
        msg = "LuxicalOneModel does not use token embeddings."
        raise NotImplementedError(msg)

    def _ensure_embedder_loaded(self) -> Embedder:
        """Return the attached embedder or raise ``RuntimeError`` if absent."""
        if self._embedder is not None:
            return self._embedder
        raise RuntimeError(
            "Luxical embedder is not initialized. Load this model via "
            "AutoModel/LuxicalOneModel.from_pretrained so weights can be "
            "decoded into an Embedder."
        )
211
+
212
+ # No legacy file-based loader; all state lives in model.safetensors.
213
+
214
+
215
def export_embedder_to_huggingface_directory(
    embedder: Embedder,
    save_directory: str | Path,
    *,
    config_overrides: dict[str, object] | None = None,
) -> Path:
    """Wrap ``embedder`` in a ``LuxicalOneModel`` and save it to a directory.

    Args:
        embedder: Embedder to export; its ``max_ngram_length`` and
            ``embedding_dim`` seed the config.
        save_directory: Destination handed to ``save_pretrained``.
        config_overrides: Extra keyword arguments for ``LuxicalOneConfig``.

    Returns:
        ``save_directory`` as a ``Path``.
    """
    target_dir = Path(save_directory)
    extra_config = config_overrides if config_overrides else {}
    hf_config = LuxicalOneConfig(
        max_ngram_length=embedder.max_ngram_length,
        embedding_dim=embedder.embedding_dim,
        **extra_config,
    )
    hf_config.name_or_path = str(target_dir.resolve())
    LuxicalOneModel(config=hf_config, embedder=embedder).save_pretrained(target_dir)
    return target_dir
231
+
232
+
233
+ # No global Auto* registration; exports include `auto_map` in config.json.
234
+
235
+
236
def _embedder_to_state_dict(embedder: Embedder) -> OrderedDict[str, Tensor]:
    """Flatten an ``Embedder`` into an ordered mapping of named tensors.

    Entries, in insertion order: format version (``1``), tokenizer string as
    UTF-8 bytes, recognized n-grams, hash-map keys and values, IDF values,
    layer count, then one ``embedder.nn_layer_{i}`` tensor per dense layer.
    """

    def _as_tensor(array, dtype) -> Tensor:
        # ``copy=False`` avoids a copy when the dtype already matches.
        return torch.from_numpy(array.astype(dtype, copy=False))

    tensors: "OrderedDict[str, Tensor]" = OrderedDict()
    tensors["embedder.version"] = torch.tensor([1], dtype=torch.long)

    # ``np.frombuffer`` over ``bytes`` yields a read-only view, hence the copy
    # before handing the buffer to torch.
    tokenizer_json = embedder.tokenizer.to_str().encode("utf-8")
    tensors["embedder.tokenizer"] = torch.from_numpy(
        np.frombuffer(tokenizer_json, dtype=np.uint8).copy()
    )

    tensors["embedder.recognized_ngrams"] = _as_tensor(embedder.recognized_ngrams, np.int64)

    hash_keys, hash_vals = _unpack_int_dict(embedder.ngram_hash_to_ngram_idx)
    tensors["embedder.ngram_keys"] = _as_tensor(hash_keys, np.int64)
    tensors["embedder.ngram_vals"] = _as_tensor(hash_vals, np.int64)

    tensors["embedder.idf_values"] = _as_tensor(embedder.idf_values, np.float32)

    dense_layers = embedder.bow_to_dense_embedder.layers
    tensors["embedder.num_layers"] = torch.tensor([len(dense_layers)], dtype=torch.long)
    tensors.update(
        (f"embedder.nn_layer_{idx}", _as_tensor(weights, np.float32))
        for idx, weights in enumerate(dense_layers)
    )
    return tensors
257
+
258
+
259
def _embedder_from_state_dict(state_dict: OrderedDict[str, Tensor]) -> Embedder:
    """Rebuild an ``Embedder`` from the tensors of a serialized state dict.

    Args:
        state_dict: Mapping holding the ``embedder.*`` entries.

    Returns:
        The reconstructed ``Embedder``.

    Raises:
        KeyError: If a required entry is absent.
        NotImplementedError: If ``embedder.version`` is not ``1``.
    """
    version = int(state_dict["embedder.version"][0].item())
    if version != 1:
        raise NotImplementedError(f"Unsupported embedder version: {version}")

    def _to_numpy(key: str, dtype):
        return state_dict[key].cpu().numpy().astype(dtype, copy=False)

    # ``tobytes`` yields the same bytes as ``bytes(arr.tolist())`` without
    # materialising one Python int per byte of the tokenizer JSON.
    tok_bytes = _to_numpy("embedder.tokenizer", np.uint8).tobytes()
    tokenizer = ArrowTokenizer(tok_bytes.decode("utf-8"))
    recognized_ngrams = _to_numpy("embedder.recognized_ngrams", np.int64)
    keys = _to_numpy("embedder.ngram_keys", np.int64)
    vals = _to_numpy("embedder.ngram_vals", np.int64)
    ngram_map = _pack_int_dict(keys, vals)
    idf_values = _to_numpy("embedder.idf_values", np.float32)
    num_layers = int(state_dict["embedder.num_layers"][0].item())
    layers = [_to_numpy(f"embedder.nn_layer_{i}", np.float32) for i in range(num_layers)]
    return Embedder(
        tokenizer=tokenizer,
        recognized_ngrams=recognized_ngrams,
        ngram_hash_to_ngram_idx=ngram_map,
        idf_values=idf_values,
        bow_to_dense_embedder=SparseToDenseEmbedder(layers=layers),
    )
283
+
284
+
285
def _parse_cli_args() -> tuple[str, dict[str, object]]:
    """Parse ``sys.argv`` for the ``export`` and ``verify`` sub-commands.

    Returns:
        The chosen sub-command name and all parsed options as a dict (the
        dict also contains the ``cmd`` entry).
    """
    import argparse

    default_checkpoint = str(Path("/tmp/luxical_one_rc4.npz"))
    default_hf_dir = str(Path(__file__).resolve().parent / "artifacts" / "luxical_one_hf")
    checkpoint_help = "Path to Luxical embedder .npz checkpoint"

    root = argparse.ArgumentParser(
        description="Luxical One Huggingface wrapper: export and verify utilities.",
    )
    commands = root.add_subparsers(dest="cmd", required=True)

    export_cmd = commands.add_parser(
        "export",
        help="Export a HF-formatted directory from a Luxical embedder .npz checkpoint",
    )
    export_cmd.add_argument(
        "--checkpoint", type=str, default=default_checkpoint, help=checkpoint_help
    )
    export_cmd.add_argument(
        "--output-dir",
        type=str,
        default=default_hf_dir,
        help="Directory to write the Huggingface-formatted model",
    )

    verify_cmd = commands.add_parser(
        "verify",
        help="Verify HF-loaded model matches native Embedder outputs",
    )
    verify_cmd.add_argument(
        "--checkpoint", type=str, default=default_checkpoint, help=checkpoint_help
    )
    verify_cmd.add_argument(
        "--export-dir",
        type=str,
        default=default_hf_dir,
        help="HF directory to create/use for verification",
    )
    verify_cmd.add_argument(
        "--batch-size", type=int, default=3, help="Batch size for verification"
    )

    namespace = root.parse_args()
    return namespace.cmd, vars(namespace)
330
+
331
+
332
def _sample_texts() -> list[str]:
    """Return a fresh list of the three fixed sentences used for verification."""
    samples = (
        "Luxical embeddings make tf-idf sparkle.",
        "This sentence tests the Huggingface wrapper path.",
        "Short.",
    )
    return list(samples)
338
+
339
+
340
def _cmd_export(checkpoint: str, output_dir: str) -> None:
    """Load a checkpoint and export it as a Huggingface-formatted directory.

    Args:
        checkpoint: Path to the embedder checkpoint; ``~`` is expanded.
        output_dir: Destination directory; created if missing.

    Raises:
        FileNotFoundError: If the checkpoint does not exist.
    """
    source = Path(checkpoint).expanduser().resolve()
    if source.exists():
        destination = Path(output_dir).expanduser().resolve()
        destination.mkdir(parents=True, exist_ok=True)
        export_embedder_to_huggingface_directory(Embedder.load(source), destination)
        print(f"Huggingface directory written to {destination}")
        return
    # Nothing is created on disk when the checkpoint is missing.
    raise FileNotFoundError(
        f"Checkpoint not found at {source}. Download with: aws s3 cp "
        "s3://datology-external-artifacts/luxical/luxical_one_rc4.npz "
        "/tmp/luxical_one_rc4.npz"
    )
353
+
354
+
355
def _cmd_verify(checkpoint: str, export_dir: str, batch_size: int) -> None:
    """Export a checkpoint, reload it through ``AutoModel`` and compare outputs.

    Args:
        checkpoint: Path to the embedder checkpoint; ``~`` is expanded.
        export_dir: Directory used for the export; created if missing.
        batch_size: Batch size used for both the reference and reloaded runs.

    Raises:
        FileNotFoundError: If the checkpoint does not exist.
        AssertionError: If the two outputs differ beyond
            ``rtol=1e-5, atol=1e-6``.
    """
    source = Path(checkpoint).expanduser().resolve()
    if not source.exists():
        message = (
            f"Checkpoint not found at {source}. Download with: aws s3 cp "
            "s3://datology-external-artifacts/luxical/luxical_one_rc4.npz "
            "/tmp/luxical_one_rc4.npz"
        )
        raise FileNotFoundError(message)
    hf_dir = Path(export_dir).expanduser().resolve()
    hf_dir.mkdir(parents=True, exist_ok=True)

    # Reference output straight from the native embedder.
    texts = _sample_texts()
    native = Embedder.load(source)
    expected = native(texts, batch_size=batch_size)

    export_embedder_to_huggingface_directory(native, hf_dir)
    # Load using AutoModel so this mirrors user experience, with remote code.
    from transformers import AutoModel  # local import to keep top-level light

    reloaded = AutoModel.from_pretrained(hf_dir, trust_remote_code=True)
    reloaded.eval()
    with torch.inference_mode():
        result = reloaded(texts, batch_size=batch_size, progress_bars=False)
        actual = result.embeddings.cpu().numpy()
    import numpy as np

    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=1e-6)
    print("Verification succeeded: Huggingface model matches embedder output.")
385
+
386
+
387
if __name__ == "__main__":
    # Script entry point: parse the CLI, then dispatch on the sub-command.
    cmd, kv = _parse_cli_args()
    if cmd not in ("export", "verify"):
        raise SystemExit(f"Unknown command: {cmd}")
    if cmd == "export":
        _cmd_export(
            checkpoint=str(kv["checkpoint"]),
            output_dir=str(kv["output_dir"]),
        )
    else:
        _cmd_verify(
            checkpoint=str(kv["checkpoint"]),
            export_dir=str(kv["export_dir"]),
            batch_size=int(kv["batch_size"]),
        )
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ec24f66cba56eb308214cd5078fffa37bf3316c70ca6c3f455d4ab60d7d2a95
3
+ size 929754793