wxnfifth commited on
Commit
ada336f
·
verified ·
1 Parent(s): fd80ec2

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wxnfifth/huggingface/runs/wgko6wgr)
31
 
32
 
33
  This model was trained with SFT.
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wxnfifth/huggingface/runs/2miq5tco)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
- "train_loss": 0.7871017519727305,
5
- "train_runtime": 4713.7692,
6
  "train_samples": 16610,
7
- "train_samples_per_second": 4.584,
8
  "train_steps_per_second": 0.071
9
  }
 
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
+ "train_loss": 0.7870961399389658,
5
+ "train_runtime": 4760.8197,
6
  "train_samples": 16610,
7
+ "train_samples_per_second": 4.539,
8
  "train_steps_per_second": 0.071
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edc815fe6e6c82998edf9da2ec19888049d8c4542e361cde669b14ef26d3906e
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64c543243153dc1ee263f1596a24ae1ccf7b7fb4a217284f5bd70c4a4fa99cf5
3
  size 3087467144
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
- "train_loss": 0.7871017519727305,
5
- "train_runtime": 4713.7692,
6
  "train_samples": 16610,
7
- "train_samples_per_second": 4.584,
8
  "train_steps_per_second": 0.071
9
  }
 
1
  {
2
  "epoch": 0.997779422649889,
3
  "total_flos": 76745898196992.0,
4
+ "train_loss": 0.7870961399389658,
5
+ "train_runtime": 4760.8197,
6
  "train_samples": 16610,
7
+ "train_samples_per_second": 4.539,
8
  "train_steps_per_second": 0.071
9
  }
trainer_state.json CHANGED
@@ -10,493 +10,493 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.014803849000740192,
13
- "grad_norm": 0.6499719023704529,
14
  "learning_rate": 2.9411764705882355e-06,
15
  "loss": 1.09,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.029607698001480384,
20
- "grad_norm": 0.38171473145484924,
21
  "learning_rate": 5.882352941176471e-06,
22
  "loss": 1.0792,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.04441154700222058,
27
- "grad_norm": 0.3942464590072632,
28
  "learning_rate": 8.823529411764707e-06,
29
- "loss": 1.0223,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.05921539600296077,
34
- "grad_norm": 0.28095921874046326,
35
  "learning_rate": 1.1764705882352942e-05,
36
  "loss": 0.9451,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.07401924500370097,
41
- "grad_norm": 0.22764872014522552,
42
  "learning_rate": 1.4705882352941179e-05,
43
  "loss": 0.9125,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.08882309400444116,
48
- "grad_norm": 0.1783059984445572,
49
  "learning_rate": 1.7647058823529414e-05,
50
  "loss": 0.893,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.10362694300518134,
55
- "grad_norm": 0.17370979487895966,
56
  "learning_rate": 1.9999462497359468e-05,
57
- "loss": 0.8652,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.11843079200592153,
62
- "grad_norm": 0.14947360754013062,
63
  "learning_rate": 1.9980655971335944e-05,
64
  "loss": 0.8452,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.13323464100666174,
69
- "grad_norm": 0.12460564076900482,
70
  "learning_rate": 1.993503206718859e-05,
71
  "loss": 0.8228,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.14803849000740193,
76
- "grad_norm": 0.14311614632606506,
77
  "learning_rate": 1.986271337340182e-05,
78
  "loss": 0.8277,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.16284233900814213,
83
- "grad_norm": 0.12113290280103683,
84
  "learning_rate": 1.976389420563607e-05,
85
- "loss": 0.8105,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.17764618800888232,
90
- "grad_norm": 0.12570306658744812,
91
  "learning_rate": 1.9638840084614182e-05,
92
- "loss": 0.7964,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.19245003700962252,
97
- "grad_norm": 0.12238704413175583,
98
  "learning_rate": 1.9487887022684336e-05,
99
- "loss": 0.8062,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.20725388601036268,
104
- "grad_norm": 0.13958358764648438,
105
  "learning_rate": 1.9311440620976597e-05,
106
  "loss": 0.7989,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22205773501110287,
111
- "grad_norm": 0.1243973821401596,
112
  "learning_rate": 1.9109974979578852e-05,
113
  "loss": 0.7899,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.23686158401184307,
118
- "grad_norm": 0.12657789885997772,
119
  "learning_rate": 1.8884031423660492e-05,
120
  "loss": 0.8185,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25166543301258326,
125
- "grad_norm": 0.12268061190843582,
126
  "learning_rate": 1.8634217048966638e-05,
127
  "loss": 0.801,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.2664692820133235,
132
- "grad_norm": 0.1173299178481102,
133
  "learning_rate": 1.836120309059107e-05,
134
- "loss": 0.7838,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28127313101406365,
139
- "grad_norm": 0.1265975385904312,
140
  "learning_rate": 1.8065723119410885e-05,
141
- "loss": 0.7809,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.29607698001480387,
146
- "grad_norm": 0.13352826237678528,
147
  "learning_rate": 1.77485710710289e-05,
148
  "loss": 0.7879,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.29607698001480387,
153
- "eval_loss": 0.8042058944702148,
154
- "eval_runtime": 5.8168,
155
- "eval_samples_per_second": 22.005,
156
- "eval_steps_per_second": 1.375,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 0.31088082901554404,
161
- "grad_norm": 0.12055955082178116,
162
  "learning_rate": 1.741059911251997e-05,
163
  "loss": 0.7786,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 0.32568467801628426,
168
- "grad_norm": 0.12688305974006653,
169
  "learning_rate": 1.7052715352713076e-05,
170
- "loss": 0.7726,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 0.3404885270170244,
175
- "grad_norm": 0.12400885671377182,
176
  "learning_rate": 1.667588140216154e-05,
177
- "loss": 0.7996,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 0.35529237601776464,
182
- "grad_norm": 0.13515259325504303,
183
  "learning_rate": 1.628110978935756e-05,
184
  "loss": 0.774,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 0.3700962250185048,
189
- "grad_norm": 0.1352948397397995,
190
  "learning_rate": 1.586946124013354e-05,
191
  "loss": 0.7734,
192
  "step": 125
193
  },
194
  {
195
  "epoch": 0.38490007401924503,
196
- "grad_norm": 0.12024685740470886,
197
  "learning_rate": 1.5442041827560274e-05,
198
  "loss": 0.7498,
199
  "step": 130
200
  },
201
  {
202
  "epoch": 0.3997039230199852,
203
- "grad_norm": 0.11641304194927216,
204
  "learning_rate": 1.5000000000000002e-05,
205
- "loss": 0.7608,
206
  "step": 135
207
  },
208
  {
209
  "epoch": 0.41450777202072536,
210
- "grad_norm": 0.12957507371902466,
211
  "learning_rate": 1.4544523495299843e-05,
212
  "loss": 0.7669,
213
  "step": 140
214
  },
215
  {
216
  "epoch": 0.4293116210214656,
217
- "grad_norm": 0.13438324630260468,
218
  "learning_rate": 1.4076836149416889e-05,
219
  "loss": 0.7829,
220
  "step": 145
221
  },
222
  {
223
  "epoch": 0.44411547002220575,
224
- "grad_norm": 0.17085708677768707,
225
  "learning_rate": 1.3598194608050011e-05,
226
- "loss": 0.7677,
227
  "step": 150
228
  },
229
  {
230
  "epoch": 0.45891931902294597,
231
- "grad_norm": 0.11604870110750198,
232
  "learning_rate": 1.3109884950114007e-05,
233
  "loss": 0.7567,
234
  "step": 155
235
  },
236
  {
237
  "epoch": 0.47372316802368614,
238
- "grad_norm": 0.12267672270536423,
239
  "learning_rate": 1.2613219232128608e-05,
240
  "loss": 0.7568,
241
  "step": 160
242
  },
243
  {
244
  "epoch": 0.48852701702442636,
245
- "grad_norm": 0.11862842738628387,
246
  "learning_rate": 1.2109531962807333e-05,
247
  "loss": 0.7583,
248
  "step": 165
249
  },
250
  {
251
  "epoch": 0.5033308660251665,
252
- "grad_norm": 0.1191529631614685,
253
  "learning_rate": 1.1600176517318742e-05,
254
  "loss": 0.7631,
255
  "step": 170
256
  },
257
  {
258
  "epoch": 0.5181347150259067,
259
- "grad_norm": 0.12247402966022491,
260
  "learning_rate": 1.1086521500854746e-05,
261
- "loss": 0.75,
262
  "step": 175
263
  },
264
  {
265
  "epoch": 0.532938564026647,
266
- "grad_norm": 0.12190617620944977,
267
  "learning_rate": 1.0569947071276847e-05,
268
  "loss": 0.7708,
269
  "step": 180
270
  },
271
  {
272
  "epoch": 0.5477424130273871,
273
- "grad_norm": 0.13005486130714417,
274
  "learning_rate": 1.0051841230721065e-05,
275
- "loss": 0.764,
276
  "step": 185
277
  },
278
  {
279
  "epoch": 0.5625462620281273,
280
- "grad_norm": 0.13146714866161346,
281
  "learning_rate": 9.533596096125826e-06,
282
- "loss": 0.7706,
283
  "step": 190
284
  },
285
  {
286
  "epoch": 0.5773501110288675,
287
- "grad_norm": 0.1219043880701065,
288
  "learning_rate": 9.016604158703654e-06,
289
- "loss": 0.7444,
290
  "step": 195
291
  },
292
  {
293
  "epoch": 0.5921539600296077,
294
- "grad_norm": 0.13030683994293213,
295
  "learning_rate": 8.502254542407186e-06,
296
  "loss": 0.7423,
297
  "step": 200
298
  },
299
  {
300
  "epoch": 0.5921539600296077,
301
- "eval_loss": 0.7782207131385803,
302
- "eval_runtime": 5.771,
303
- "eval_samples_per_second": 22.18,
304
- "eval_steps_per_second": 1.386,
305
  "step": 200
306
  },
307
  {
308
  "epoch": 0.6069578090303479,
309
- "grad_norm": 0.1178601086139679,
310
  "learning_rate": 7.991929271442817e-06,
311
  "loss": 0.7461,
312
  "step": 205
313
  },
314
  {
315
  "epoch": 0.6217616580310881,
316
- "grad_norm": 0.1156802773475647,
317
  "learning_rate": 7.48699955686089e-06,
318
  "loss": 0.7483,
319
  "step": 210
320
  },
321
  {
322
  "epoch": 0.6365655070318282,
323
- "grad_norm": 0.11491943150758743,
324
  "learning_rate": 6.988822112200157e-06,
325
  "loss": 0.7566,
326
  "step": 215
327
  },
328
  {
329
  "epoch": 0.6513693560325685,
330
- "grad_norm": 0.12633360922336578,
331
  "learning_rate": 6.498735508086094e-06,
332
  "loss": 0.7597,
333
  "step": 220
334
  },
335
  {
336
  "epoch": 0.6661732050333087,
337
- "grad_norm": 0.11288498342037201,
338
  "learning_rate": 6.018056575578075e-06,
339
  "loss": 0.7536,
340
  "step": 225
341
  },
342
  {
343
  "epoch": 0.6809770540340488,
344
- "grad_norm": 0.10684435814619064,
345
  "learning_rate": 5.548076867929331e-06,
346
  "loss": 0.7503,
347
  "step": 230
348
  },
349
  {
350
  "epoch": 0.695780903034789,
351
- "grad_norm": 0.11590610444545746,
352
  "learning_rate": 5.090059190266779e-06,
353
  "loss": 0.7384,
354
  "step": 235
355
  },
356
  {
357
  "epoch": 0.7105847520355293,
358
- "grad_norm": 0.10783125460147858,
359
  "learning_rate": 4.645234206515171e-06,
360
- "loss": 0.7435,
361
  "step": 240
362
  },
363
  {
364
  "epoch": 0.7253886010362695,
365
- "grad_norm": 0.11286304891109467,
366
  "learning_rate": 4.214797132682597e-06,
367
  "loss": 0.7401,
368
  "step": 245
369
  },
370
  {
371
  "epoch": 0.7401924500370096,
372
- "grad_norm": 0.11659123748540878,
373
  "learning_rate": 3.799904525392251e-06,
374
  "loss": 0.747,
375
  "step": 250
376
  },
377
  {
378
  "epoch": 0.7549962990377498,
379
- "grad_norm": 0.11270508170127869,
380
  "learning_rate": 3.401671174289469e-06,
381
  "loss": 0.7371,
382
  "step": 255
383
  },
384
  {
385
  "epoch": 0.7698001480384901,
386
- "grad_norm": 0.11058636754751205,
387
  "learning_rate": 3.021167106673928e-06,
388
- "loss": 0.7532,
389
  "step": 260
390
  },
391
  {
392
  "epoch": 0.7846039970392302,
393
- "grad_norm": 0.10753703117370605,
394
  "learning_rate": 2.6594147124053983e-06,
395
- "loss": 0.7419,
396
  "step": 265
397
  },
398
  {
399
  "epoch": 0.7994078460399704,
400
- "grad_norm": 0.10991961508989334,
401
  "learning_rate": 2.317385996808195e-06,
402
- "loss": 0.7536,
403
  "step": 270
404
  },
405
  {
406
  "epoch": 0.8142116950407106,
407
- "grad_norm": 0.10330849140882492,
408
  "learning_rate": 1.9959999689556407e-06,
409
  "loss": 0.7463,
410
  "step": 275
411
  },
412
  {
413
  "epoch": 0.8290155440414507,
414
- "grad_norm": 0.10575641691684723,
415
  "learning_rate": 1.6961201723520248e-06,
416
  "loss": 0.732,
417
  "step": 280
418
  },
419
  {
420
  "epoch": 0.843819393042191,
421
- "grad_norm": 0.10046100616455078,
422
  "learning_rate": 1.4185523646469822e-06,
423
  "loss": 0.757,
424
  "step": 285
425
  },
426
  {
427
  "epoch": 0.8586232420429312,
428
- "grad_norm": 0.10767965763807297,
429
  "learning_rate": 1.1640423526166987e-06,
430
- "loss": 0.7348,
431
  "step": 290
432
  },
433
  {
434
  "epoch": 0.8734270910436713,
435
- "grad_norm": 0.10156064480543137,
436
  "learning_rate": 9.332739882292752e-07,
437
  "loss": 0.7608,
438
  "step": 295
439
  },
440
  {
441
  "epoch": 0.8882309400444115,
442
- "grad_norm": 0.10109930485486984,
443
  "learning_rate": 7.268673311786378e-07,
444
  "loss": 0.7508,
445
  "step": 300
446
  },
447
  {
448
  "epoch": 0.8882309400444115,
449
- "eval_loss": 0.7698501348495483,
450
- "eval_runtime": 5.8951,
451
- "eval_samples_per_second": 21.713,
452
- "eval_steps_per_second": 1.357,
453
  "step": 300
454
  },
455
  {
456
  "epoch": 0.9030347890451518,
457
- "grad_norm": 0.0981217697262764,
458
  "learning_rate": 5.453769828241872e-07,
459
  "loss": 0.7343,
460
  "step": 305
461
  },
462
  {
463
  "epoch": 0.9178386380458919,
464
- "grad_norm": 0.10373206436634064,
465
  "learning_rate": 3.8929059601275463e-07,
466
  "loss": 0.7668,
467
  "step": 310
468
  },
469
  {
470
  "epoch": 0.9326424870466321,
471
- "grad_norm": 0.09866725653409958,
472
  "learning_rate": 2.5902756478688674e-07,
473
  "loss": 0.749,
474
  "step": 315
475
  },
476
  {
477
  "epoch": 0.9474463360473723,
478
- "grad_norm": 0.10060535371303558,
479
  "learning_rate": 1.5493789750014032e-07,
480
  "loss": 0.7509,
481
  "step": 320
482
  },
483
  {
484
  "epoch": 0.9622501850481125,
485
- "grad_norm": 0.10740893334150314,
486
  "learning_rate": 7.730127636723539e-08,
487
  "loss": 0.7315,
488
  "step": 325
489
  },
490
  {
491
  "epoch": 0.9770540340488527,
492
- "grad_norm": 0.10483107715845108,
493
  "learning_rate": 2.6326305976001054e-08,
494
  "loss": 0.7362,
495
  "step": 330
496
  },
497
  {
498
  "epoch": 0.9918578830495929,
499
- "grad_norm": 0.11270838230848312,
500
  "learning_rate": 2.149952780321485e-09,
501
  "loss": 0.7575,
502
  "step": 335
@@ -505,9 +505,9 @@
505
  "epoch": 0.997779422649889,
506
  "step": 337,
507
  "total_flos": 76745898196992.0,
508
- "train_loss": 0.7871017519727305,
509
- "train_runtime": 4713.7692,
510
- "train_samples_per_second": 4.584,
511
  "train_steps_per_second": 0.071
512
  }
513
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.014803849000740192,
13
+ "grad_norm": 0.6503643989562988,
14
  "learning_rate": 2.9411764705882355e-06,
15
  "loss": 1.09,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.029607698001480384,
20
+ "grad_norm": 0.3810490071773529,
21
  "learning_rate": 5.882352941176471e-06,
22
  "loss": 1.0792,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.04441154700222058,
27
+ "grad_norm": 0.3948555886745453,
28
  "learning_rate": 8.823529411764707e-06,
29
+ "loss": 1.0222,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.05921539600296077,
34
+ "grad_norm": 0.2805577516555786,
35
  "learning_rate": 1.1764705882352942e-05,
36
  "loss": 0.9451,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.07401924500370097,
41
+ "grad_norm": 0.2285250872373581,
42
  "learning_rate": 1.4705882352941179e-05,
43
  "loss": 0.9125,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.08882309400444116,
48
+ "grad_norm": 0.1779119372367859,
49
  "learning_rate": 1.7647058823529414e-05,
50
  "loss": 0.893,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.10362694300518134,
55
+ "grad_norm": 0.17365849018096924,
56
  "learning_rate": 1.9999462497359468e-05,
57
+ "loss": 0.8651,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.11843079200592153,
62
+ "grad_norm": 0.14850875735282898,
63
  "learning_rate": 1.9980655971335944e-05,
64
  "loss": 0.8452,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.13323464100666174,
69
+ "grad_norm": 0.1241055279970169,
70
  "learning_rate": 1.993503206718859e-05,
71
  "loss": 0.8228,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.14803849000740193,
76
+ "grad_norm": 0.14281505346298218,
77
  "learning_rate": 1.986271337340182e-05,
78
  "loss": 0.8277,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.16284233900814213,
83
+ "grad_norm": 0.12098833918571472,
84
  "learning_rate": 1.976389420563607e-05,
85
+ "loss": 0.8106,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.17764618800888232,
90
+ "grad_norm": 0.12549127638339996,
91
  "learning_rate": 1.9638840084614182e-05,
92
+ "loss": 0.7963,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.19245003700962252,
97
+ "grad_norm": 0.12255293875932693,
98
  "learning_rate": 1.9487887022684336e-05,
99
+ "loss": 0.8063,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.20725388601036268,
104
+ "grad_norm": 0.1393764466047287,
105
  "learning_rate": 1.9311440620976597e-05,
106
  "loss": 0.7989,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22205773501110287,
111
+ "grad_norm": 0.12465377151966095,
112
  "learning_rate": 1.9109974979578852e-05,
113
  "loss": 0.7899,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 0.23686158401184307,
118
+ "grad_norm": 0.12621091306209564,
119
  "learning_rate": 1.8884031423660492e-05,
120
  "loss": 0.8185,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25166543301258326,
125
+ "grad_norm": 0.12274689227342606,
126
  "learning_rate": 1.8634217048966638e-05,
127
  "loss": 0.801,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 0.2664692820133235,
132
+ "grad_norm": 0.11692527681589127,
133
  "learning_rate": 1.836120309059107e-05,
134
+ "loss": 0.7837,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28127313101406365,
139
+ "grad_norm": 0.12686526775360107,
140
  "learning_rate": 1.8065723119410885e-05,
141
+ "loss": 0.7808,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 0.29607698001480387,
146
+ "grad_norm": 0.13229221105575562,
147
  "learning_rate": 1.77485710710289e-05,
148
  "loss": 0.7879,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.29607698001480387,
153
+ "eval_loss": 0.8044255375862122,
154
+ "eval_runtime": 5.859,
155
+ "eval_samples_per_second": 21.847,
156
+ "eval_steps_per_second": 1.365,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 0.31088082901554404,
161
+ "grad_norm": 0.11952897906303406,
162
  "learning_rate": 1.741059911251997e-05,
163
  "loss": 0.7786,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 0.32568467801628426,
168
+ "grad_norm": 0.1261080503463745,
169
  "learning_rate": 1.7052715352713076e-05,
170
+ "loss": 0.7727,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 0.3404885270170244,
175
+ "grad_norm": 0.12398428469896317,
176
  "learning_rate": 1.667588140216154e-05,
177
+ "loss": 0.7995,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 0.35529237601776464,
182
+ "grad_norm": 0.13477057218551636,
183
  "learning_rate": 1.628110978935756e-05,
184
  "loss": 0.774,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 0.3700962250185048,
189
+ "grad_norm": 0.1344415843486786,
190
  "learning_rate": 1.586946124013354e-05,
191
  "loss": 0.7734,
192
  "step": 125
193
  },
194
  {
195
  "epoch": 0.38490007401924503,
196
+ "grad_norm": 0.12036354839801788,
197
  "learning_rate": 1.5442041827560274e-05,
198
  "loss": 0.7498,
199
  "step": 130
200
  },
201
  {
202
  "epoch": 0.3997039230199852,
203
+ "grad_norm": 0.11690913140773773,
204
  "learning_rate": 1.5000000000000002e-05,
205
+ "loss": 0.7607,
206
  "step": 135
207
  },
208
  {
209
  "epoch": 0.41450777202072536,
210
+ "grad_norm": 0.1300031542778015,
211
  "learning_rate": 1.4544523495299843e-05,
212
  "loss": 0.7669,
213
  "step": 140
214
  },
215
  {
216
  "epoch": 0.4293116210214656,
217
+ "grad_norm": 0.13464853167533875,
218
  "learning_rate": 1.4076836149416889e-05,
219
  "loss": 0.7829,
220
  "step": 145
221
  },
222
  {
223
  "epoch": 0.44411547002220575,
224
+ "grad_norm": 0.13280808925628662,
225
  "learning_rate": 1.3598194608050011e-05,
226
+ "loss": 0.7678,
227
  "step": 150
228
  },
229
  {
230
  "epoch": 0.45891931902294597,
231
+ "grad_norm": 0.11657247692346573,
232
  "learning_rate": 1.3109884950114007e-05,
233
  "loss": 0.7567,
234
  "step": 155
235
  },
236
  {
237
  "epoch": 0.47372316802368614,
238
+ "grad_norm": 0.1221541091799736,
239
  "learning_rate": 1.2613219232128608e-05,
240
  "loss": 0.7568,
241
  "step": 160
242
  },
243
  {
244
  "epoch": 0.48852701702442636,
245
+ "grad_norm": 0.11861217021942139,
246
  "learning_rate": 1.2109531962807333e-05,
247
  "loss": 0.7583,
248
  "step": 165
249
  },
250
  {
251
  "epoch": 0.5033308660251665,
252
+ "grad_norm": 0.11882445961236954,
253
  "learning_rate": 1.1600176517318742e-05,
254
  "loss": 0.7631,
255
  "step": 170
256
  },
257
  {
258
  "epoch": 0.5181347150259067,
259
+ "grad_norm": 0.12230531871318817,
260
  "learning_rate": 1.1086521500854746e-05,
261
+ "loss": 0.7499,
262
  "step": 175
263
  },
264
  {
265
  "epoch": 0.532938564026647,
266
+ "grad_norm": 0.12119855731725693,
267
  "learning_rate": 1.0569947071276847e-05,
268
  "loss": 0.7708,
269
  "step": 180
270
  },
271
  {
272
  "epoch": 0.5477424130273871,
273
+ "grad_norm": 0.12843738496303558,
274
  "learning_rate": 1.0051841230721065e-05,
275
+ "loss": 0.7639,
276
  "step": 185
277
  },
278
  {
279
  "epoch": 0.5625462620281273,
280
+ "grad_norm": 0.13090410828590393,
281
  "learning_rate": 9.533596096125826e-06,
282
+ "loss": 0.7705,
283
  "step": 190
284
  },
285
  {
286
  "epoch": 0.5773501110288675,
287
+ "grad_norm": 0.12135408818721771,
288
  "learning_rate": 9.016604158703654e-06,
289
+ "loss": 0.7443,
290
  "step": 195
291
  },
292
  {
293
  "epoch": 0.5921539600296077,
294
+ "grad_norm": 0.13062784075737,
295
  "learning_rate": 8.502254542407186e-06,
296
  "loss": 0.7423,
297
  "step": 200
298
  },
299
  {
300
  "epoch": 0.5921539600296077,
301
+ "eval_loss": 0.7782678604125977,
302
+ "eval_runtime": 5.8957,
303
+ "eval_samples_per_second": 21.711,
304
+ "eval_steps_per_second": 1.357,
305
  "step": 200
306
  },
307
  {
308
  "epoch": 0.6069578090303479,
309
+ "grad_norm": 0.11821803450584412,
310
  "learning_rate": 7.991929271442817e-06,
311
  "loss": 0.7461,
312
  "step": 205
313
  },
314
  {
315
  "epoch": 0.6217616580310881,
316
+ "grad_norm": 0.11570143699645996,
317
  "learning_rate": 7.48699955686089e-06,
318
  "loss": 0.7483,
319
  "step": 210
320
  },
321
  {
322
  "epoch": 0.6365655070318282,
323
+ "grad_norm": 0.11539588123559952,
324
  "learning_rate": 6.988822112200157e-06,
325
  "loss": 0.7566,
326
  "step": 215
327
  },
328
  {
329
  "epoch": 0.6513693560325685,
330
+ "grad_norm": 0.12663376331329346,
331
  "learning_rate": 6.498735508086094e-06,
332
  "loss": 0.7597,
333
  "step": 220
334
  },
335
  {
336
  "epoch": 0.6661732050333087,
337
+ "grad_norm": 0.11310730874538422,
338
  "learning_rate": 6.018056575578075e-06,
339
  "loss": 0.7536,
340
  "step": 225
341
  },
342
  {
343
  "epoch": 0.6809770540340488,
344
+ "grad_norm": 0.10681577771902084,
345
  "learning_rate": 5.548076867929331e-06,
346
  "loss": 0.7503,
347
  "step": 230
348
  },
349
  {
350
  "epoch": 0.695780903034789,
351
+ "grad_norm": 0.11569629609584808,
352
  "learning_rate": 5.090059190266779e-06,
353
  "loss": 0.7384,
354
  "step": 235
355
  },
356
  {
357
  "epoch": 0.7105847520355293,
358
+ "grad_norm": 0.10818663239479065,
359
  "learning_rate": 4.645234206515171e-06,
360
+ "loss": 0.7436,
361
  "step": 240
362
  },
363
  {
364
  "epoch": 0.7253886010362695,
365
+ "grad_norm": 0.1134616956114769,
366
  "learning_rate": 4.214797132682597e-06,
367
  "loss": 0.7401,
368
  "step": 245
369
  },
370
  {
371
  "epoch": 0.7401924500370096,
372
+ "grad_norm": 0.1172085627913475,
373
  "learning_rate": 3.799904525392251e-06,
374
  "loss": 0.747,
375
  "step": 250
376
  },
377
  {
378
  "epoch": 0.7549962990377498,
379
+ "grad_norm": 0.11327671259641647,
380
  "learning_rate": 3.401671174289469e-06,
381
  "loss": 0.7371,
382
  "step": 255
383
  },
384
  {
385
  "epoch": 0.7698001480384901,
386
+ "grad_norm": 0.11018561571836472,
387
  "learning_rate": 3.021167106673928e-06,
388
+ "loss": 0.7531,
389
  "step": 260
390
  },
391
  {
392
  "epoch": 0.7846039970392302,
393
+ "grad_norm": 0.10802847146987915,
394
  "learning_rate": 2.6594147124053983e-06,
395
+ "loss": 0.742,
396
  "step": 265
397
  },
398
  {
399
  "epoch": 0.7994078460399704,
400
+ "grad_norm": 0.11024806648492813,
401
  "learning_rate": 2.317385996808195e-06,
402
+ "loss": 0.7537,
403
  "step": 270
404
  },
405
  {
406
  "epoch": 0.8142116950407106,
407
+ "grad_norm": 0.10315828770399094,
408
  "learning_rate": 1.9959999689556407e-06,
409
  "loss": 0.7463,
410
  "step": 275
411
  },
412
  {
413
  "epoch": 0.8290155440414507,
414
+ "grad_norm": 0.10550706088542938,
415
  "learning_rate": 1.6961201723520248e-06,
416
  "loss": 0.732,
417
  "step": 280
418
  },
419
  {
420
  "epoch": 0.843819393042191,
421
+ "grad_norm": 0.10042756050825119,
422
  "learning_rate": 1.4185523646469822e-06,
423
  "loss": 0.757,
424
  "step": 285
425
  },
426
  {
427
  "epoch": 0.8586232420429312,
428
+ "grad_norm": 0.10768315196037292,
429
  "learning_rate": 1.1640423526166987e-06,
430
+ "loss": 0.7347,
431
  "step": 290
432
  },
433
  {
434
  "epoch": 0.8734270910436713,
435
+ "grad_norm": 0.10151806473731995,
436
  "learning_rate": 9.332739882292752e-07,
437
  "loss": 0.7608,
438
  "step": 295
439
  },
440
  {
441
  "epoch": 0.8882309400444115,
442
+ "grad_norm": 0.10100872814655304,
443
  "learning_rate": 7.268673311786378e-07,
444
  "loss": 0.7508,
445
  "step": 300
446
  },
447
  {
448
  "epoch": 0.8882309400444115,
449
+ "eval_loss": 0.7698503732681274,
450
+ "eval_runtime": 5.9196,
451
+ "eval_samples_per_second": 21.623,
452
+ "eval_steps_per_second": 1.351,
453
  "step": 300
454
  },
455
  {
456
  "epoch": 0.9030347890451518,
457
+ "grad_norm": 0.09824325144290924,
458
  "learning_rate": 5.453769828241872e-07,
459
  "loss": 0.7343,
460
  "step": 305
461
  },
462
  {
463
  "epoch": 0.9178386380458919,
464
+ "grad_norm": 0.1036561131477356,
465
  "learning_rate": 3.8929059601275463e-07,
466
  "loss": 0.7668,
467
  "step": 310
468
  },
469
  {
470
  "epoch": 0.9326424870466321,
471
+ "grad_norm": 0.09889034926891327,
472
  "learning_rate": 2.5902756478688674e-07,
473
  "loss": 0.749,
474
  "step": 315
475
  },
476
  {
477
  "epoch": 0.9474463360473723,
478
+ "grad_norm": 0.10057399421930313,
479
  "learning_rate": 1.5493789750014032e-07,
480
  "loss": 0.7509,
481
  "step": 320
482
  },
483
  {
484
  "epoch": 0.9622501850481125,
485
+ "grad_norm": 0.10745055228471756,
486
  "learning_rate": 7.730127636723539e-08,
487
  "loss": 0.7315,
488
  "step": 325
489
  },
490
  {
491
  "epoch": 0.9770540340488527,
492
+ "grad_norm": 0.10501035302877426,
493
  "learning_rate": 2.6326305976001054e-08,
494
  "loss": 0.7362,
495
  "step": 330
496
  },
497
  {
498
  "epoch": 0.9918578830495929,
499
+ "grad_norm": 0.11271944642066956,
500
  "learning_rate": 2.149952780321485e-09,
501
  "loss": 0.7575,
502
  "step": 335
 
505
  "epoch": 0.997779422649889,
506
  "step": 337,
507
  "total_flos": 76745898196992.0,
508
+ "train_loss": 0.7870961399389658,
509
+ "train_runtime": 4760.8197,
510
+ "train_samples_per_second": 4.539,
511
  "train_steps_per_second": 0.071
512
  }
513
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7e03557d5ccbfcde45badb3bdd2307a19e7b1d15f3e6ea0f661a780e2ee06c2c
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3484b08fd15098ca238578495a3c10611d2633e5c2273701f1d6a8ba5f1d6a79
3
  size 7352