abeat committed on
Commit
f2f3e97
·
verified ·
1 Parent(s): 60e4f20

Switch to sharded safetensors

Browse files
config.json CHANGED
@@ -1,30 +1,29 @@
1
  {
2
- "model_type": "shallowmind",
3
- "architectures": ["MyCustomModelForCausalLM"],
4
-
 
5
  "auto_map": {
6
  "AutoConfig": "blahblahtron_1_3B.HFWrapperConfig",
7
  "AutoModelForCausalLM": "blahblahtron_1_3B.MyCustomModelForCausalLM"
8
  },
9
-
10
- "vocab_size": 50257,
11
  "embedding_dim": 2048,
12
- "num_layers": 22,
 
 
 
 
13
  "num_heads": 16,
14
  "num_kv_heads": 4,
15
- "ffn_dim_multiplier": 4.0,
16
- "max_length": 1024,
17
- "dropout": 0.1,
18
-
19
- "bos_token_id": 50256,
20
- "eos_token_id": 50256,
21
  "pad_token_id": 50256,
22
-
 
23
  "torch_dtype": "bfloat16",
24
-
25
- "_notes": "Extra training-only keys kept below; HF ignores them but they’re here for provenance.",
26
  "use_flash_attention_2": true,
27
- "learning_rate": 0.0003,
28
- "weight_decay": 0.1,
29
- "tokenizer_name_or_path": "gpt2"
30
  }
 
1
  {
2
+ "_notes": "Extra training-only keys kept below; HF ignores them but they\u2019re here for provenance.",
3
+ "architectures": [
4
+ "MyCustomModelForCausalLM"
5
+ ],
6
  "auto_map": {
7
  "AutoConfig": "blahblahtron_1_3B.HFWrapperConfig",
8
  "AutoModelForCausalLM": "blahblahtron_1_3B.MyCustomModelForCausalLM"
9
  },
10
+ "bos_token_id": 50256,
11
+ "dropout": 0.1,
12
  "embedding_dim": 2048,
13
+ "eos_token_id": 50256,
14
+ "ffn_dim_multiplier": 4.0,
15
+ "learning_rate": 0.0003,
16
+ "max_length": null,
17
+ "model_type": "shallowmind",
18
  "num_heads": 16,
19
  "num_kv_heads": 4,
20
+ "num_layers": 22,
 
 
 
 
 
21
  "pad_token_id": 50256,
22
+ "rms_norm_eps": 1e-06,
23
+ "tokenizer_name_or_path": "gpt2",
24
  "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.55.2",
 
26
  "use_flash_attention_2": true,
27
+ "vocab_size": 50257,
28
+ "weight_decay": 0.1
 
29
  }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "max_length": 1024,
6
+ "pad_token_id": 50256,
7
+ "transformers_version": "4.55.2"
8
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1931627ce28c6e6089d40c3fbd720315c8aeafe854d897feea6265fb3b78aae5
3
+ size 1986502536
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b9c248a5c2bf1d0ff6ea9da1e2a20af52a24d4f3ffeac92a48768ae6952ed03
3
+ size 409306816
model.safetensors.index.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 1197897728,
4
+ "total_size": 2395795456
5
+ },
6
+ "weight_map": {
7
+ "blocks.0.attn.out_proj.weight": "model-00001-of-00002.safetensors",
8
+ "blocks.0.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
9
+ "blocks.0.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
10
+ "blocks.0.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
11
+ "blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
12
+ "blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
13
+ "blocks.1.attn.out_proj.weight": "model-00001-of-00002.safetensors",
14
+ "blocks.1.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
15
+ "blocks.1.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
16
+ "blocks.1.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
17
+ "blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
18
+ "blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
19
+ "blocks.10.attn.out_proj.weight": "model-00001-of-00002.safetensors",
20
+ "blocks.10.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
21
+ "blocks.10.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
22
+ "blocks.10.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
23
+ "blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
24
+ "blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
25
+ "blocks.11.attn.out_proj.weight": "model-00001-of-00002.safetensors",
26
+ "blocks.11.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
27
+ "blocks.11.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
28
+ "blocks.11.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
29
+ "blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
30
+ "blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
31
+ "blocks.12.attn.out_proj.weight": "model-00001-of-00002.safetensors",
32
+ "blocks.12.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
33
+ "blocks.12.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
34
+ "blocks.12.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
35
+ "blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
36
+ "blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
37
+ "blocks.13.attn.out_proj.weight": "model-00001-of-00002.safetensors",
38
+ "blocks.13.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
39
+ "blocks.13.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
40
+ "blocks.13.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
41
+ "blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
42
+ "blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
43
+ "blocks.14.attn.out_proj.weight": "model-00001-of-00002.safetensors",
44
+ "blocks.14.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
45
+ "blocks.14.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
46
+ "blocks.14.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
47
+ "blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
48
+ "blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
49
+ "blocks.15.attn.out_proj.weight": "model-00001-of-00002.safetensors",
50
+ "blocks.15.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
51
+ "blocks.15.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
52
+ "blocks.15.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
53
+ "blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
54
+ "blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
55
+ "blocks.16.attn.out_proj.weight": "model-00001-of-00002.safetensors",
56
+ "blocks.16.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
57
+ "blocks.16.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
58
+ "blocks.16.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
59
+ "blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
60
+ "blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
61
+ "blocks.17.attn.out_proj.weight": "model-00001-of-00002.safetensors",
62
+ "blocks.17.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
63
+ "blocks.17.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
64
+ "blocks.17.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
65
+ "blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
66
+ "blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
67
+ "blocks.18.attn.out_proj.weight": "model-00001-of-00002.safetensors",
68
+ "blocks.18.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
69
+ "blocks.18.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
70
+ "blocks.18.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
71
+ "blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
72
+ "blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
73
+ "blocks.19.attn.out_proj.weight": "model-00001-of-00002.safetensors",
74
+ "blocks.19.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
75
+ "blocks.19.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
76
+ "blocks.19.ffn.linear_2.weight": "model-00002-of-00002.safetensors",
77
+ "blocks.19.norm1.weight": "model-00002-of-00002.safetensors",
78
+ "blocks.19.norm2.weight": "model-00002-of-00002.safetensors",
79
+ "blocks.2.attn.out_proj.weight": "model-00001-of-00002.safetensors",
80
+ "blocks.2.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
81
+ "blocks.2.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
82
+ "blocks.2.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
83
+ "blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
84
+ "blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
85
+ "blocks.20.attn.out_proj.weight": "model-00002-of-00002.safetensors",
86
+ "blocks.20.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
87
+ "blocks.20.ffn.linear_1.weight": "model-00002-of-00002.safetensors",
88
+ "blocks.20.ffn.linear_2.weight": "model-00002-of-00002.safetensors",
89
+ "blocks.20.norm1.weight": "model-00002-of-00002.safetensors",
90
+ "blocks.20.norm2.weight": "model-00002-of-00002.safetensors",
91
+ "blocks.21.attn.out_proj.weight": "model-00002-of-00002.safetensors",
92
+ "blocks.21.attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
93
+ "blocks.21.ffn.linear_1.weight": "model-00002-of-00002.safetensors",
94
+ "blocks.21.ffn.linear_2.weight": "model-00002-of-00002.safetensors",
95
+ "blocks.21.norm1.weight": "model-00002-of-00002.safetensors",
96
+ "blocks.21.norm2.weight": "model-00002-of-00002.safetensors",
97
+ "blocks.3.attn.out_proj.weight": "model-00001-of-00002.safetensors",
98
+ "blocks.3.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
99
+ "blocks.3.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
100
+ "blocks.3.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
101
+ "blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
102
+ "blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
103
+ "blocks.4.attn.out_proj.weight": "model-00001-of-00002.safetensors",
104
+ "blocks.4.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
105
+ "blocks.4.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
106
+ "blocks.4.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
107
+ "blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
108
+ "blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
109
+ "blocks.5.attn.out_proj.weight": "model-00001-of-00002.safetensors",
110
+ "blocks.5.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
111
+ "blocks.5.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
112
+ "blocks.5.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
113
+ "blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
114
+ "blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
115
+ "blocks.6.attn.out_proj.weight": "model-00001-of-00002.safetensors",
116
+ "blocks.6.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
117
+ "blocks.6.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
118
+ "blocks.6.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
119
+ "blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
120
+ "blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
121
+ "blocks.7.attn.out_proj.weight": "model-00001-of-00002.safetensors",
122
+ "blocks.7.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
123
+ "blocks.7.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
124
+ "blocks.7.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
125
+ "blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
126
+ "blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
127
+ "blocks.8.attn.out_proj.weight": "model-00001-of-00002.safetensors",
128
+ "blocks.8.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
129
+ "blocks.8.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
130
+ "blocks.8.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
131
+ "blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
132
+ "blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
133
+ "blocks.9.attn.out_proj.weight": "model-00001-of-00002.safetensors",
134
+ "blocks.9.attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
135
+ "blocks.9.ffn.linear_1.weight": "model-00001-of-00002.safetensors",
136
+ "blocks.9.ffn.linear_2.weight": "model-00001-of-00002.safetensors",
137
+ "blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
138
+ "blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
139
+ "final_norm.weight": "model-00002-of-00002.safetensors",
140
+ "lm_head.weight": "model-00002-of-00002.safetensors",
141
+ "token_embeddings.weight": "model-00001-of-00002.safetensors"
142
+ }
143
+ }
special_tokens_map.json CHANGED
@@ -1,6 +1,30 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "pad_token": "<|endoftext|>",
5
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
  }