MartinJYHuang commited on
Commit
f9b106a
·
verified ·
1 Parent(s): b0284ce

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -25,8 +25,8 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "q_proj",
29
- "v_proj"
30
  ],
31
  "target_parameters": null,
32
  "task_type": "CAUSAL_LM",
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "v_proj",
29
+ "q_proj"
30
  ],
31
  "target_parameters": null,
32
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c061a3917cf3a04e7dc66124efb6c9935579d2e24d2430c9be83832a4b443797
3
  size 20992792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c17095f6292d23f4591cde75ce6b9dee47c35287cff99daa7bf7d565c19918b7
3
  size 20992792
trainer_state.json CHANGED
@@ -4,728 +4,878 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
- "global_step": 471,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.06369426751592357,
14
- "grad_norm": 7.661201477050781,
15
- "learning_rate": 9.80891719745223e-06,
16
- "logits/chosen": -1.8285623788833618,
17
- "logits/rejected": -1.613466501235962,
18
- "logps/chosen": -513.4370727539062,
19
- "logps/rejected": -476.92266845703125,
20
- "loss": 0.7134,
21
- "rewards/accuracies": 0.30000001192092896,
22
- "rewards/chosen": -0.010444183833897114,
23
- "rewards/margins": -0.03570808097720146,
24
- "rewards/rejected": 0.02526390179991722,
25
  "step": 10
26
  },
27
  {
28
- "epoch": 0.12738853503184713,
29
- "grad_norm": 8.578171730041504,
30
- "learning_rate": 9.596602972399152e-06,
31
- "logits/chosen": -1.8371143341064453,
32
- "logits/rejected": -1.5572088956832886,
33
- "logps/chosen": -495.640869140625,
34
- "logps/rejected": -517.6268310546875,
35
- "loss": 0.6337,
36
- "rewards/accuracies": 0.875,
37
- "rewards/chosen": 0.12352059036493301,
38
- "rewards/margins": 0.12701454758644104,
39
- "rewards/rejected": -0.003493957919999957,
40
  "step": 20
41
  },
42
  {
43
- "epoch": 0.1910828025477707,
44
- "grad_norm": 8.120169639587402,
45
- "learning_rate": 9.384288747346073e-06,
46
- "logits/chosen": -1.794507622718811,
47
- "logits/rejected": -1.6136707067489624,
48
- "logps/chosen": -512.5538940429688,
49
- "logps/rejected": -523.3352661132812,
50
- "loss": 0.5442,
51
  "rewards/accuracies": 1.0,
52
- "rewards/chosen": 0.2799207270145416,
53
- "rewards/margins": 0.33490151166915894,
54
- "rewards/rejected": -0.05498077720403671,
55
  "step": 30
56
  },
57
  {
58
- "epoch": 0.25477707006369427,
59
- "grad_norm": 6.508784770965576,
60
- "learning_rate": 9.171974522292994e-06,
61
- "logits/chosen": -1.7962682247161865,
62
- "logits/rejected": -1.6274349689483643,
63
- "logps/chosen": -480.62139892578125,
64
- "logps/rejected": -474.36273193359375,
65
- "loss": 0.4204,
66
- "rewards/accuracies": 1.0,
67
- "rewards/chosen": 0.4761812686920166,
68
- "rewards/margins": 0.6683631539344788,
69
- "rewards/rejected": -0.19218185544013977,
70
  "step": 40
71
  },
72
  {
73
- "epoch": 0.3184713375796178,
74
- "grad_norm": 5.781623363494873,
75
- "learning_rate": 8.959660297239916e-06,
76
- "logits/chosen": -1.7765535116195679,
77
- "logits/rejected": -1.591133713722229,
78
- "logps/chosen": -507.0015563964844,
79
- "logps/rejected": -491.17242431640625,
80
- "loss": 0.327,
81
  "rewards/accuracies": 1.0,
82
- "rewards/chosen": 0.6968984007835388,
83
- "rewards/margins": 0.9841921925544739,
84
- "rewards/rejected": -0.28729376196861267,
85
  "step": 50
86
  },
87
  {
88
- "epoch": 0.3821656050955414,
89
- "grad_norm": 2.6110823154449463,
90
- "learning_rate": 8.747346072186837e-06,
91
- "logits/chosen": -1.7369922399520874,
92
- "logits/rejected": -1.5202242136001587,
93
- "logps/chosen": -490.0065002441406,
94
- "logps/rejected": -475.16748046875,
95
- "loss": 0.2115,
96
  "rewards/accuracies": 1.0,
97
- "rewards/chosen": 0.994107723236084,
98
- "rewards/margins": 1.5735362768173218,
99
- "rewards/rejected": -0.579428493976593,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.445859872611465,
104
- "grad_norm": 3.15047287940979,
105
- "learning_rate": 8.53503184713376e-06,
106
- "logits/chosen": -1.6807092428207397,
107
- "logits/rejected": -1.5059581995010376,
108
- "logps/chosen": -487.39208984375,
109
- "logps/rejected": -473.4630432128906,
110
- "loss": 0.1456,
111
  "rewards/accuracies": 1.0,
112
- "rewards/chosen": 1.3263133764266968,
113
- "rewards/margins": 2.0210459232330322,
114
- "rewards/rejected": -0.6947325468063354,
115
  "step": 70
116
  },
117
  {
118
- "epoch": 0.5095541401273885,
119
- "grad_norm": 2.4517059326171875,
120
- "learning_rate": 8.32271762208068e-06,
121
- "logits/chosen": -1.6148916482925415,
122
- "logits/rejected": -1.3740451335906982,
123
- "logps/chosen": -476.02166748046875,
124
- "logps/rejected": -508.15887451171875,
125
- "loss": 0.0904,
126
  "rewards/accuracies": 1.0,
127
- "rewards/chosen": 1.7351124286651611,
128
- "rewards/margins": 2.557111978530884,
129
- "rewards/rejected": -0.8219996690750122,
130
  "step": 80
131
  },
132
  {
133
- "epoch": 0.5732484076433121,
134
- "grad_norm": 0.685248851776123,
135
- "learning_rate": 8.110403397027601e-06,
136
- "logits/chosen": -1.7143325805664062,
137
- "logits/rejected": -1.481576919555664,
138
- "logps/chosen": -483.890625,
139
- "logps/rejected": -489.4288024902344,
140
- "loss": 0.0508,
141
  "rewards/accuracies": 1.0,
142
- "rewards/chosen": 1.919224500656128,
143
- "rewards/margins": 3.2962067127227783,
144
- "rewards/rejected": -1.3769820928573608,
145
  "step": 90
146
  },
147
  {
148
- "epoch": 0.6369426751592356,
149
- "grad_norm": 1.927983045578003,
150
- "learning_rate": 7.898089171974524e-06,
151
- "logits/chosen": -1.7103115320205688,
152
- "logits/rejected": -1.4516280889511108,
153
- "logps/chosen": -453.33721923828125,
154
- "logps/rejected": -513.4710693359375,
155
- "loss": 0.0314,
156
  "rewards/accuracies": 1.0,
157
- "rewards/chosen": 2.1619415283203125,
158
- "rewards/margins": 3.929858446121216,
159
- "rewards/rejected": -1.7679170370101929,
160
  "step": 100
161
  },
162
  {
163
- "epoch": 0.7006369426751592,
164
- "grad_norm": 0.39471063017845154,
165
- "learning_rate": 7.685774946921445e-06,
166
- "logits/chosen": -1.6923818588256836,
167
- "logits/rejected": -1.4913740158081055,
168
- "logps/chosen": -487.2040100097656,
169
- "logps/rejected": -534.4457397460938,
170
- "loss": 0.0199,
171
  "rewards/accuracies": 1.0,
172
- "rewards/chosen": 2.3453280925750732,
173
- "rewards/margins": 4.448094844818115,
174
- "rewards/rejected": -2.102766513824463,
175
  "step": 110
176
  },
177
  {
178
- "epoch": 0.7643312101910829,
179
- "grad_norm": 0.1494450569152832,
180
- "learning_rate": 7.473460721868365e-06,
181
- "logits/chosen": -1.7227827310562134,
182
- "logits/rejected": -1.5164122581481934,
183
- "logps/chosen": -466.625244140625,
184
- "logps/rejected": -496.8236389160156,
185
- "loss": 0.0137,
186
  "rewards/accuracies": 1.0,
187
- "rewards/chosen": 2.604698419570923,
188
- "rewards/margins": 4.873368740081787,
189
- "rewards/rejected": -2.2686705589294434,
190
  "step": 120
191
  },
192
  {
193
- "epoch": 0.8280254777070064,
194
- "grad_norm": 0.979215681552887,
195
- "learning_rate": 7.261146496815287e-06,
196
- "logits/chosen": -1.6731090545654297,
197
- "logits/rejected": -1.4788906574249268,
198
- "logps/chosen": -489.17041015625,
199
- "logps/rejected": -481.39910888671875,
200
- "loss": 0.012,
201
  "rewards/accuracies": 1.0,
202
- "rewards/chosen": 2.6250174045562744,
203
- "rewards/margins": 5.168362617492676,
204
- "rewards/rejected": -2.5433454513549805,
205
  "step": 130
206
  },
207
  {
208
- "epoch": 0.89171974522293,
209
- "grad_norm": 0.5591831207275391,
210
- "learning_rate": 7.0488322717622086e-06,
211
- "logits/chosen": -1.7174581289291382,
212
- "logits/rejected": -1.491135835647583,
213
- "logps/chosen": -494.3304748535156,
214
- "logps/rejected": -539.1906127929688,
215
- "loss": 0.0144,
216
  "rewards/accuracies": 1.0,
217
- "rewards/chosen": 2.630114793777466,
218
- "rewards/margins": 5.534493923187256,
219
- "rewards/rejected": -2.904378890991211,
220
  "step": 140
221
  },
222
  {
223
- "epoch": 0.9554140127388535,
224
- "grad_norm": 0.10819283127784729,
225
- "learning_rate": 6.836518046709129e-06,
226
- "logits/chosen": -1.7261998653411865,
227
- "logits/rejected": -1.5176118612289429,
228
- "logps/chosen": -505.73779296875,
229
- "logps/rejected": -508.22442626953125,
230
- "loss": 0.0097,
231
  "rewards/accuracies": 1.0,
232
- "rewards/chosen": 2.8433146476745605,
233
- "rewards/margins": 5.865386009216309,
234
- "rewards/rejected": -3.022071123123169,
235
  "step": 150
236
  },
237
  {
238
- "epoch": 1.019108280254777,
239
- "grad_norm": 0.16133908927440643,
240
- "learning_rate": 6.624203821656051e-06,
241
- "logits/chosen": -1.8000129461288452,
242
- "logits/rejected": -1.5112035274505615,
243
- "logps/chosen": -472.0714416503906,
244
- "logps/rejected": -514.2125854492188,
245
- "loss": 0.0098,
246
  "rewards/accuracies": 1.0,
247
- "rewards/chosen": 2.4917571544647217,
248
- "rewards/margins": 5.523527145385742,
249
- "rewards/rejected": -3.0317697525024414,
250
  "step": 160
251
  },
252
  {
253
- "epoch": 1.0828025477707006,
254
- "grad_norm": 0.14261530339717865,
255
- "learning_rate": 6.411889596602973e-06,
256
- "logits/chosen": -1.873085379600525,
257
- "logits/rejected": -1.4990111589431763,
258
- "logps/chosen": -462.21563720703125,
259
- "logps/rejected": -520.8255004882812,
260
- "loss": 0.0052,
261
  "rewards/accuracies": 1.0,
262
- "rewards/chosen": 2.830996036529541,
263
- "rewards/margins": 6.318862438201904,
264
- "rewards/rejected": -3.4878666400909424,
265
  "step": 170
266
  },
267
  {
268
- "epoch": 1.1464968152866242,
269
- "grad_norm": 0.11651387810707092,
270
- "learning_rate": 6.199575371549894e-06,
271
- "logits/chosen": -1.7707847356796265,
272
- "logits/rejected": -1.548455834388733,
273
- "logps/chosen": -516.4500732421875,
274
- "logps/rejected": -549.6249389648438,
275
- "loss": 0.0065,
276
  "rewards/accuracies": 1.0,
277
- "rewards/chosen": 3.196195125579834,
278
- "rewards/margins": 6.9291205406188965,
279
- "rewards/rejected": -3.7329254150390625,
280
  "step": 180
281
  },
282
  {
283
- "epoch": 1.2101910828025477,
284
- "grad_norm": 2.9312522411346436,
285
- "learning_rate": 5.987261146496816e-06,
286
- "logits/chosen": -1.7615554332733154,
287
- "logits/rejected": -1.5339546203613281,
288
- "logps/chosen": -497.7105407714844,
289
- "logps/rejected": -507.69140625,
290
- "loss": 0.009,
291
  "rewards/accuracies": 1.0,
292
- "rewards/chosen": 2.932077407836914,
293
- "rewards/margins": 6.43866491317749,
294
- "rewards/rejected": -3.506587505340576,
295
  "step": 190
296
  },
297
  {
298
- "epoch": 1.2738853503184713,
299
- "grad_norm": 0.11866319924592972,
300
- "learning_rate": 5.774946921443737e-06,
301
- "logits/chosen": -1.8983533382415771,
302
- "logits/rejected": -1.535003662109375,
303
- "logps/chosen": -460.1544494628906,
304
- "logps/rejected": -529.9498291015625,
305
- "loss": 0.0018,
306
  "rewards/accuracies": 1.0,
307
- "rewards/chosen": 3.1939971446990967,
308
- "rewards/margins": 7.317962646484375,
309
- "rewards/rejected": -4.123966217041016,
310
  "step": 200
311
  },
312
  {
313
- "epoch": 1.3375796178343948,
314
- "grad_norm": 0.009582897648215294,
315
- "learning_rate": 5.562632696390658e-06,
316
- "logits/chosen": -1.8325570821762085,
317
- "logits/rejected": -1.5961052179336548,
318
- "logps/chosen": -457.8462829589844,
319
- "logps/rejected": -511.46881103515625,
320
- "loss": 0.0036,
321
  "rewards/accuracies": 1.0,
322
- "rewards/chosen": 3.2415060997009277,
323
- "rewards/margins": 7.01214599609375,
324
- "rewards/rejected": -3.770639419555664,
325
  "step": 210
326
  },
327
  {
328
- "epoch": 1.4012738853503186,
329
- "grad_norm": 0.2067570984363556,
330
- "learning_rate": 5.35031847133758e-06,
331
- "logits/chosen": -1.8559471368789673,
332
- "logits/rejected": -1.4903216361999512,
333
- "logps/chosen": -444.54339599609375,
334
- "logps/rejected": -536.953369140625,
335
- "loss": 0.0017,
336
  "rewards/accuracies": 1.0,
337
- "rewards/chosen": 3.150261163711548,
338
- "rewards/margins": 7.365521430969238,
339
- "rewards/rejected": -4.2152605056762695,
340
  "step": 220
341
  },
342
  {
343
- "epoch": 1.4649681528662422,
344
- "grad_norm": 0.1383599489927292,
345
- "learning_rate": 5.1380042462845016e-06,
346
- "logits/chosen": -1.8445074558258057,
347
- "logits/rejected": -1.6155668497085571,
348
- "logps/chosen": -467.86846923828125,
349
- "logps/rejected": -537.4169311523438,
350
- "loss": 0.0023,
351
  "rewards/accuracies": 1.0,
352
- "rewards/chosen": 3.124246835708618,
353
- "rewards/margins": 7.138210296630859,
354
- "rewards/rejected": -4.013963222503662,
355
  "step": 230
356
  },
357
  {
358
- "epoch": 1.5286624203821657,
359
- "grad_norm": 0.14673951268196106,
360
- "learning_rate": 4.925690021231423e-06,
361
- "logits/chosen": -1.7636497020721436,
362
- "logits/rejected": -1.5512115955352783,
363
- "logps/chosen": -480.15203857421875,
364
- "logps/rejected": -529.8653564453125,
365
- "loss": 0.0022,
366
  "rewards/accuracies": 1.0,
367
- "rewards/chosen": 3.0038843154907227,
368
- "rewards/margins": 7.056528568267822,
369
- "rewards/rejected": -4.052645206451416,
370
  "step": 240
371
  },
372
  {
373
- "epoch": 1.5923566878980893,
374
- "grad_norm": 0.19075194001197815,
375
- "learning_rate": 4.713375796178344e-06,
376
- "logits/chosen": -1.777608871459961,
377
- "logits/rejected": -1.5940536260604858,
378
- "logps/chosen": -492.18280029296875,
379
- "logps/rejected": -511.8550720214844,
380
- "loss": 0.0028,
381
  "rewards/accuracies": 1.0,
382
- "rewards/chosen": 2.5837788581848145,
383
- "rewards/margins": 6.94525146484375,
384
- "rewards/rejected": -4.361472129821777,
385
  "step": 250
386
  },
387
  {
388
- "epoch": 1.6560509554140128,
389
- "grad_norm": 0.10456167906522751,
390
- "learning_rate": 4.501061571125266e-06,
391
- "logits/chosen": -1.8056246042251587,
392
- "logits/rejected": -1.5386626720428467,
393
- "logps/chosen": -488.4483337402344,
394
- "logps/rejected": -552.2747192382812,
395
- "loss": 0.0017,
396
  "rewards/accuracies": 1.0,
397
- "rewards/chosen": 3.2254958152770996,
398
- "rewards/margins": 7.798495292663574,
399
- "rewards/rejected": -4.572999477386475,
400
  "step": 260
401
  },
402
  {
403
- "epoch": 1.7197452229299364,
404
- "grad_norm": 0.08734221756458282,
405
- "learning_rate": 4.288747346072187e-06,
406
- "logits/chosen": -1.8739216327667236,
407
- "logits/rejected": -1.5936236381530762,
408
- "logps/chosen": -481.37353515625,
409
- "logps/rejected": -506.7474670410156,
410
- "loss": 0.0041,
411
  "rewards/accuracies": 1.0,
412
- "rewards/chosen": 2.923518180847168,
413
- "rewards/margins": 7.2085113525390625,
414
- "rewards/rejected": -4.284992218017578,
415
  "step": 270
416
  },
417
  {
418
- "epoch": 1.78343949044586,
419
- "grad_norm": 0.06190163269639015,
420
- "learning_rate": 4.076433121019109e-06,
421
- "logits/chosen": -1.8493846654891968,
422
- "logits/rejected": -1.535466194152832,
423
- "logps/chosen": -465.69134521484375,
424
- "logps/rejected": -524.5106201171875,
425
- "loss": 0.0013,
426
  "rewards/accuracies": 1.0,
427
- "rewards/chosen": 2.9931843280792236,
428
- "rewards/margins": 7.830008506774902,
429
- "rewards/rejected": -4.836824893951416,
430
  "step": 280
431
  },
432
  {
433
- "epoch": 1.8471337579617835,
434
- "grad_norm": 0.056305430829524994,
435
- "learning_rate": 3.8641188959660305e-06,
436
- "logits/chosen": -1.9110437631607056,
437
- "logits/rejected": -1.5337971448898315,
438
- "logps/chosen": -461.9651794433594,
439
- "logps/rejected": -535.5386962890625,
440
- "loss": 0.0009,
441
  "rewards/accuracies": 1.0,
442
- "rewards/chosen": 3.1686594486236572,
443
- "rewards/margins": 7.800524711608887,
444
- "rewards/rejected": -4.63186502456665,
445
  "step": 290
446
  },
447
  {
448
- "epoch": 1.910828025477707,
449
- "grad_norm": 0.07272353023290634,
450
- "learning_rate": 3.6518046709129513e-06,
451
- "logits/chosen": -1.8243738412857056,
452
- "logits/rejected": -1.5750045776367188,
453
- "logps/chosen": -457.2413635253906,
454
- "logps/rejected": -554.6361083984375,
455
- "loss": 0.0011,
456
  "rewards/accuracies": 1.0,
457
- "rewards/chosen": 3.1141717433929443,
458
- "rewards/margins": 7.966392517089844,
459
- "rewards/rejected": -4.852221488952637,
460
  "step": 300
461
  },
462
  {
463
- "epoch": 1.9745222929936306,
464
- "grad_norm": 0.1764669567346573,
465
- "learning_rate": 3.4394904458598725e-06,
466
- "logits/chosen": -1.8024390935897827,
467
- "logits/rejected": -1.529760479927063,
468
- "logps/chosen": -482.1455078125,
469
- "logps/rejected": -523.8703002929688,
470
- "loss": 0.0026,
471
  "rewards/accuracies": 1.0,
472
- "rewards/chosen": 2.9039502143859863,
473
- "rewards/margins": 7.471547603607178,
474
- "rewards/rejected": -4.56759786605835,
475
  "step": 310
476
  },
477
  {
478
- "epoch": 2.038216560509554,
479
- "grad_norm": 0.06449217349290848,
480
- "learning_rate": 3.227176220806794e-06,
481
- "logits/chosen": -1.8698956966400146,
482
- "logits/rejected": -1.5352425575256348,
483
- "logps/chosen": -474.93487548828125,
484
- "logps/rejected": -555.2481689453125,
485
- "loss": 0.003,
486
  "rewards/accuracies": 1.0,
487
- "rewards/chosen": 2.7326927185058594,
488
- "rewards/margins": 7.607143402099609,
489
- "rewards/rejected": -4.874450206756592,
490
  "step": 320
491
  },
492
  {
493
- "epoch": 2.1019108280254777,
494
- "grad_norm": 0.15638966858386993,
495
- "learning_rate": 3.0148619957537158e-06,
496
- "logits/chosen": -1.8096567392349243,
497
- "logits/rejected": -1.5850298404693604,
498
- "logps/chosen": -498.39483642578125,
499
- "logps/rejected": -536.6939086914062,
500
  "loss": 0.0011,
501
  "rewards/accuracies": 1.0,
502
- "rewards/chosen": 3.083881139755249,
503
- "rewards/margins": 8.246601104736328,
504
- "rewards/rejected": -5.162720203399658,
505
  "step": 330
506
  },
507
  {
508
- "epoch": 2.1656050955414012,
509
- "grad_norm": 0.022478261962532997,
510
- "learning_rate": 2.802547770700637e-06,
511
- "logits/chosen": -1.8860228061676025,
512
- "logits/rejected": -1.5795822143554688,
513
- "logps/chosen": -471.2503356933594,
514
- "logps/rejected": -555.7251586914062,
515
- "loss": 0.001,
516
  "rewards/accuracies": 1.0,
517
- "rewards/chosen": 2.8919870853424072,
518
- "rewards/margins": 8.094644546508789,
519
- "rewards/rejected": -5.2026591300964355,
520
  "step": 340
521
  },
522
  {
523
- "epoch": 2.229299363057325,
524
- "grad_norm": 0.006824295036494732,
525
- "learning_rate": 2.5902335456475586e-06,
526
- "logits/chosen": -1.840765357017517,
527
- "logits/rejected": -1.6004451513290405,
528
- "logps/chosen": -496.37908935546875,
529
- "logps/rejected": -516.0117797851562,
530
- "loss": 0.0012,
531
  "rewards/accuracies": 1.0,
532
- "rewards/chosen": 3.2698845863342285,
533
- "rewards/margins": 8.031291007995605,
534
- "rewards/rejected": -4.761406898498535,
535
  "step": 350
536
  },
537
  {
538
- "epoch": 2.2929936305732483,
539
- "grad_norm": 0.041482195258140564,
540
- "learning_rate": 2.3779193205944802e-06,
541
- "logits/chosen": -1.9090330600738525,
542
- "logits/rejected": -1.5509936809539795,
543
- "logps/chosen": -476.31884765625,
544
- "logps/rejected": -530.485595703125,
545
- "loss": 0.0014,
546
  "rewards/accuracies": 1.0,
547
- "rewards/chosen": 3.3065025806427,
548
- "rewards/margins": 8.122949600219727,
549
- "rewards/rejected": -4.816446304321289,
550
  "step": 360
551
  },
552
  {
553
- "epoch": 2.356687898089172,
554
- "grad_norm": 0.02308792807161808,
555
- "learning_rate": 2.1656050955414015e-06,
556
- "logits/chosen": -1.887399435043335,
557
- "logits/rejected": -1.5576483011245728,
558
- "logps/chosen": -468.9479064941406,
559
- "logps/rejected": -581.8629150390625,
560
- "loss": 0.0014,
561
  "rewards/accuracies": 1.0,
562
- "rewards/chosen": 3.292541980743408,
563
- "rewards/margins": 8.23143196105957,
564
- "rewards/rejected": -4.938891410827637,
565
  "step": 370
566
  },
567
  {
568
- "epoch": 2.4203821656050954,
569
- "grad_norm": 0.06411473453044891,
570
- "learning_rate": 1.953290870488323e-06,
571
- "logits/chosen": -1.82333242893219,
572
- "logits/rejected": -1.67373788356781,
573
- "logps/chosen": -483.3836975097656,
574
- "logps/rejected": -511.4134216308594,
575
- "loss": 0.001,
576
  "rewards/accuracies": 1.0,
577
- "rewards/chosen": 3.0308749675750732,
578
- "rewards/margins": 8.01519775390625,
579
- "rewards/rejected": -4.984322547912598,
580
  "step": 380
581
  },
582
  {
583
- "epoch": 2.484076433121019,
584
- "grad_norm": 0.20943918824195862,
585
- "learning_rate": 1.740976645435244e-06,
586
- "logits/chosen": -1.8794472217559814,
587
- "logits/rejected": -1.5557177066802979,
588
- "logps/chosen": -476.9034118652344,
589
- "logps/rejected": -540.9918212890625,
590
- "loss": 0.0015,
591
  "rewards/accuracies": 1.0,
592
- "rewards/chosen": 2.99308443069458,
593
- "rewards/margins": 7.622200965881348,
594
- "rewards/rejected": -4.629117012023926,
595
  "step": 390
596
  },
597
  {
598
- "epoch": 2.5477707006369426,
599
- "grad_norm": 0.15412284433841705,
600
- "learning_rate": 1.5286624203821657e-06,
601
- "logits/chosen": -1.8760645389556885,
602
- "logits/rejected": -1.596841812133789,
603
- "logps/chosen": -473.80413818359375,
604
- "logps/rejected": -522.9478759765625,
605
- "loss": 0.0007,
606
  "rewards/accuracies": 1.0,
607
- "rewards/chosen": 3.289515972137451,
608
- "rewards/margins": 8.1244478225708,
609
- "rewards/rejected": -4.834931373596191,
610
  "step": 400
611
  },
612
  {
613
- "epoch": 2.611464968152866,
614
- "grad_norm": 0.020231205970048904,
615
- "learning_rate": 1.3163481953290871e-06,
616
- "logits/chosen": -1.9735329151153564,
617
- "logits/rejected": -1.6164305210113525,
618
- "logps/chosen": -445.96075439453125,
619
- "logps/rejected": -513.9588623046875,
620
- "loss": 0.0014,
621
  "rewards/accuracies": 1.0,
622
- "rewards/chosen": 3.1464686393737793,
623
- "rewards/margins": 8.033473014831543,
624
- "rewards/rejected": -4.887004375457764,
625
  "step": 410
626
  },
627
  {
628
- "epoch": 2.6751592356687897,
629
- "grad_norm": 0.06227200850844383,
630
- "learning_rate": 1.1040339702760086e-06,
631
- "logits/chosen": -1.883143424987793,
632
- "logits/rejected": -1.5652716159820557,
633
- "logps/chosen": -454.7635803222656,
634
- "logps/rejected": -521.7005004882812,
635
- "loss": 0.0018,
636
  "rewards/accuracies": 1.0,
637
- "rewards/chosen": 2.9739201068878174,
638
- "rewards/margins": 7.8255157470703125,
639
- "rewards/rejected": -4.851595878601074,
640
  "step": 420
641
  },
642
  {
643
- "epoch": 2.738853503184713,
644
- "grad_norm": 0.20386821031570435,
645
- "learning_rate": 8.9171974522293e-07,
646
- "logits/chosen": -1.8953783512115479,
647
- "logits/rejected": -1.6141021251678467,
648
- "logps/chosen": -455.4198303222656,
649
- "logps/rejected": -504.2140197753906,
650
- "loss": 0.0017,
651
  "rewards/accuracies": 1.0,
652
- "rewards/chosen": 3.1860811710357666,
653
- "rewards/margins": 7.73357629776001,
654
- "rewards/rejected": -4.547495365142822,
655
  "step": 430
656
  },
657
  {
658
- "epoch": 2.802547770700637,
659
- "grad_norm": 0.036846332252025604,
660
- "learning_rate": 6.794055201698514e-07,
661
- "logits/chosen": -1.8413372039794922,
662
- "logits/rejected": -1.507743000984192,
663
- "logps/chosen": -455.23681640625,
664
- "logps/rejected": -575.4034423828125,
665
- "loss": 0.0006,
666
  "rewards/accuracies": 1.0,
667
- "rewards/chosen": 3.36517596244812,
668
- "rewards/margins": 8.604715347290039,
669
- "rewards/rejected": -5.239538192749023,
670
  "step": 440
671
  },
672
  {
673
- "epoch": 2.8662420382165603,
674
- "grad_norm": 0.0049489871598780155,
675
- "learning_rate": 4.6709129511677283e-07,
676
- "logits/chosen": -1.9412555694580078,
677
- "logits/rejected": -1.5940210819244385,
678
- "logps/chosen": -454.66143798828125,
679
- "logps/rejected": -539.107177734375,
680
- "loss": 0.0019,
681
  "rewards/accuracies": 1.0,
682
- "rewards/chosen": 2.9391071796417236,
683
- "rewards/margins": 8.104011535644531,
684
- "rewards/rejected": -5.1649041175842285,
685
  "step": 450
686
  },
687
  {
688
- "epoch": 2.9299363057324843,
689
- "grad_norm": 0.014217167161405087,
690
- "learning_rate": 2.547770700636943e-07,
691
- "logits/chosen": -1.8514007329940796,
692
- "logits/rejected": -1.5744428634643555,
693
- "logps/chosen": -488.06982421875,
694
- "logps/rejected": -548.7665405273438,
695
- "loss": 0.0012,
696
  "rewards/accuracies": 1.0,
697
- "rewards/chosen": 3.1351876258850098,
698
- "rewards/margins": 8.055967330932617,
699
- "rewards/rejected": -4.920780181884766,
700
  "step": 460
701
  },
702
  {
703
- "epoch": 2.9936305732484074,
704
- "grad_norm": 0.030622508376836777,
705
- "learning_rate": 4.246284501061571e-08,
706
- "logits/chosen": -1.9187486171722412,
707
- "logits/rejected": -1.607115387916565,
708
- "logps/chosen": -481.8365173339844,
709
- "logps/rejected": -552.6552734375,
710
- "loss": 0.001,
711
  "rewards/accuracies": 1.0,
712
- "rewards/chosen": 3.1875436305999756,
713
- "rewards/margins": 8.592950820922852,
714
- "rewards/rejected": -5.405407905578613,
715
  "step": 470
716
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
717
  {
718
  "epoch": 3.0,
719
- "step": 471,
720
  "total_flos": 0.0,
721
- "train_loss": 0.07041998074037535,
722
- "train_runtime": 1050.6186,
723
- "train_samples_per_second": 1.79,
724
- "train_steps_per_second": 0.448
725
  }
726
  ],
727
  "logging_steps": 10,
728
- "max_steps": 471,
729
  "num_input_tokens_seen": 0,
730
  "num_train_epochs": 3,
731
  "save_steps": 100,
 
4
  "best_model_checkpoint": null,
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 579,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.05181347150259067,
14
+ "grad_norm": 9.140412330627441,
15
+ "learning_rate": 9.844559585492228e-06,
16
+ "logits/chosen": -1.9414482116699219,
17
+ "logits/rejected": -1.7225751876831055,
18
+ "logps/chosen": -486.18267822265625,
19
+ "logps/rejected": -488.115478515625,
20
+ "loss": 0.6963,
21
+ "rewards/accuracies": 0.5249999761581421,
22
+ "rewards/chosen": 0.023636549711227417,
23
+ "rewards/margins": -0.0014519490068778396,
24
+ "rewards/rejected": 0.02508850023150444,
25
  "step": 10
26
  },
27
  {
28
+ "epoch": 0.10362694300518134,
29
+ "grad_norm": 8.910807609558105,
30
+ "learning_rate": 9.671848013816928e-06,
31
+ "logits/chosen": -1.8969703912734985,
32
+ "logits/rejected": -1.611445665359497,
33
+ "logps/chosen": -473.8232421875,
34
+ "logps/rejected": -541.8896484375,
35
+ "loss": 0.6356,
36
+ "rewards/accuracies": 0.824999988079071,
37
+ "rewards/chosen": 0.09762275218963623,
38
+ "rewards/margins": 0.12276230752468109,
39
+ "rewards/rejected": -0.025139544159173965,
40
  "step": 20
41
  },
42
  {
43
+ "epoch": 0.15544041450777202,
44
+ "grad_norm": 8.894061088562012,
45
+ "learning_rate": 9.499136442141624e-06,
46
+ "logits/chosen": -1.7466604709625244,
47
+ "logits/rejected": -1.5235388278961182,
48
+ "logps/chosen": -502.09619140625,
49
+ "logps/rejected": -496.20867919921875,
50
+ "loss": 0.5309,
51
  "rewards/accuracies": 1.0,
52
+ "rewards/chosen": 0.2354348599910736,
53
+ "rewards/margins": 0.36666831374168396,
54
+ "rewards/rejected": -0.13123343884944916,
55
  "step": 30
56
  },
57
  {
58
+ "epoch": 0.20725388601036268,
59
+ "grad_norm": 7.1635966300964355,
60
+ "learning_rate": 9.326424870466322e-06,
61
+ "logits/chosen": -1.8220049142837524,
62
+ "logits/rejected": -1.5709198713302612,
63
+ "logps/chosen": -486.42291259765625,
64
+ "logps/rejected": -516.5711669921875,
65
+ "loss": 0.4401,
66
+ "rewards/accuracies": 0.949999988079071,
67
+ "rewards/chosen": 0.45994535088539124,
68
+ "rewards/margins": 0.6302233934402466,
69
+ "rewards/rejected": -0.17027801275253296,
70
  "step": 40
71
  },
72
  {
73
+ "epoch": 0.25906735751295334,
74
+ "grad_norm": 4.4454874992370605,
75
+ "learning_rate": 9.153713298791019e-06,
76
+ "logits/chosen": -1.7100648880004883,
77
+ "logits/rejected": -1.5425455570220947,
78
+ "logps/chosen": -521.1805419921875,
79
+ "logps/rejected": -486.8834533691406,
80
+ "loss": 0.2647,
81
  "rewards/accuracies": 1.0,
82
+ "rewards/chosen": 0.9771218299865723,
83
+ "rewards/margins": 1.2796553373336792,
84
+ "rewards/rejected": -0.3025336265563965,
85
  "step": 50
86
  },
87
  {
88
+ "epoch": 0.31088082901554404,
89
+ "grad_norm": 4.048921585083008,
90
+ "learning_rate": 8.981001727115718e-06,
91
+ "logits/chosen": -1.6415131092071533,
92
+ "logits/rejected": -1.490295648574829,
93
+ "logps/chosen": -529.8136596679688,
94
+ "logps/rejected": -503.9295349121094,
95
+ "loss": 0.1952,
96
  "rewards/accuracies": 1.0,
97
+ "rewards/chosen": 1.1363840103149414,
98
+ "rewards/margins": 1.6918432712554932,
99
+ "rewards/rejected": -0.5554592609405518,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.3626943005181347,
104
+ "grad_norm": 2.3416619300842285,
105
+ "learning_rate": 8.808290155440415e-06,
106
+ "logits/chosen": -1.6617555618286133,
107
+ "logits/rejected": -1.4604771137237549,
108
+ "logps/chosen": -514.0736694335938,
109
+ "logps/rejected": -523.543701171875,
110
+ "loss": 0.1252,
111
  "rewards/accuracies": 1.0,
112
+ "rewards/chosen": 1.635059118270874,
113
+ "rewards/margins": 2.2532262802124023,
114
+ "rewards/rejected": -0.6181670427322388,
115
  "step": 70
116
  },
117
  {
118
+ "epoch": 0.41450777202072536,
119
+ "grad_norm": 3.538511276245117,
120
+ "learning_rate": 8.635578583765113e-06,
121
+ "logits/chosen": -1.653364896774292,
122
+ "logits/rejected": -1.4298722743988037,
123
+ "logps/chosen": -474.07470703125,
124
+ "logps/rejected": -507.8360900878906,
125
+ "loss": 0.0758,
126
  "rewards/accuracies": 1.0,
127
+ "rewards/chosen": 1.8394428491592407,
128
+ "rewards/margins": 2.778176784515381,
129
+ "rewards/rejected": -0.9387339353561401,
130
  "step": 80
131
  },
132
  {
133
+ "epoch": 0.46632124352331605,
134
+ "grad_norm": 1.8926947116851807,
135
+ "learning_rate": 8.462867012089811e-06,
136
+ "logits/chosen": -1.6914196014404297,
137
+ "logits/rejected": -1.4301296472549438,
138
+ "logps/chosen": -478.93353271484375,
139
+ "logps/rejected": -479.542724609375,
140
+ "loss": 0.0446,
141
  "rewards/accuracies": 1.0,
142
+ "rewards/chosen": 2.3641176223754883,
143
+ "rewards/margins": 3.542463779449463,
144
+ "rewards/rejected": -1.1783459186553955,
145
  "step": 90
146
  },
147
  {
148
+ "epoch": 0.5181347150259067,
149
+ "grad_norm": 0.7793521881103516,
150
+ "learning_rate": 8.290155440414507e-06,
151
+ "logits/chosen": -1.6339620351791382,
152
+ "logits/rejected": -1.449350118637085,
153
+ "logps/chosen": -477.76446533203125,
154
+ "logps/rejected": -540.86328125,
155
+ "loss": 0.0393,
156
  "rewards/accuracies": 1.0,
157
+ "rewards/chosen": 2.5429415702819824,
158
+ "rewards/margins": 3.8789615631103516,
159
+ "rewards/rejected": -1.336020588874817,
160
  "step": 100
161
  },
162
  {
163
+ "epoch": 0.5699481865284974,
164
+ "grad_norm": 0.5772690176963806,
165
+ "learning_rate": 8.117443868739207e-06,
166
+ "logits/chosen": -1.6833152770996094,
167
+ "logits/rejected": -1.3989613056182861,
168
+ "logps/chosen": -465.7315368652344,
169
+ "logps/rejected": -520.9325561523438,
170
+ "loss": 0.0211,
171
  "rewards/accuracies": 1.0,
172
+ "rewards/chosen": 2.534696102142334,
173
+ "rewards/margins": 4.387997150421143,
174
+ "rewards/rejected": -1.85330069065094,
175
  "step": 110
176
  },
177
  {
178
+ "epoch": 0.6217616580310881,
179
+ "grad_norm": 0.4226108491420746,
180
+ "learning_rate": 7.944732297063904e-06,
181
+ "logits/chosen": -1.6814254522323608,
182
+ "logits/rejected": -1.4467347860336304,
183
+ "logps/chosen": -468.86248779296875,
184
+ "logps/rejected": -499.3148498535156,
185
+ "loss": 0.0157,
186
  "rewards/accuracies": 1.0,
187
+ "rewards/chosen": 2.5799293518066406,
188
+ "rewards/margins": 4.886545181274414,
189
+ "rewards/rejected": -2.3066158294677734,
190
  "step": 120
191
  },
192
  {
193
+ "epoch": 0.6735751295336787,
194
+ "grad_norm": 0.3847591280937195,
195
+ "learning_rate": 7.772020725388602e-06,
196
+ "logits/chosen": -1.6606528759002686,
197
+ "logits/rejected": -1.418936014175415,
198
+ "logps/chosen": -479.26446533203125,
199
+ "logps/rejected": -526.3353271484375,
200
+ "loss": 0.0154,
201
  "rewards/accuracies": 1.0,
202
+ "rewards/chosen": 2.75547456741333,
203
+ "rewards/margins": 5.223846435546875,
204
+ "rewards/rejected": -2.468371868133545,
205
  "step": 130
206
  },
207
  {
208
+ "epoch": 0.7253886010362695,
209
+ "grad_norm": 0.2281242161989212,
210
+ "learning_rate": 7.599309153713299e-06,
211
+ "logits/chosen": -1.7155954837799072,
212
+ "logits/rejected": -1.4960612058639526,
213
+ "logps/chosen": -468.90350341796875,
214
+ "logps/rejected": -520.2388305664062,
215
+ "loss": 0.0087,
216
  "rewards/accuracies": 1.0,
217
+ "rewards/chosen": 2.7697935104370117,
218
+ "rewards/margins": 5.687216281890869,
219
+ "rewards/rejected": -2.917421817779541,
220
  "step": 140
221
  },
222
  {
223
+ "epoch": 0.7772020725388601,
224
+ "grad_norm": 0.508161187171936,
225
+ "learning_rate": 7.426597582037998e-06,
226
+ "logits/chosen": -1.689793348312378,
227
+ "logits/rejected": -1.4585435390472412,
228
+ "logps/chosen": -440.17803955078125,
229
+ "logps/rejected": -500.63897705078125,
230
+ "loss": 0.0107,
231
  "rewards/accuracies": 1.0,
232
+ "rewards/chosen": 2.7548987865448,
233
+ "rewards/margins": 5.646756649017334,
234
+ "rewards/rejected": -2.891857624053955,
235
  "step": 150
236
  },
237
  {
238
+ "epoch": 0.8290155440414507,
239
+ "grad_norm": 0.1661311537027359,
240
+ "learning_rate": 7.253886010362695e-06,
241
+ "logits/chosen": -1.749815583229065,
242
+ "logits/rejected": -1.5069010257720947,
243
+ "logps/chosen": -484.00640869140625,
244
+ "logps/rejected": -507.42742919921875,
245
+ "loss": 0.0055,
246
  "rewards/accuracies": 1.0,
247
+ "rewards/chosen": 2.803175449371338,
248
+ "rewards/margins": 6.165472984313965,
249
+ "rewards/rejected": -3.362297534942627,
250
  "step": 160
251
  },
252
  {
253
+ "epoch": 0.8808290155440415,
254
+ "grad_norm": 0.19041810929775238,
255
+ "learning_rate": 7.0811744386873925e-06,
256
+ "logits/chosen": -1.694772481918335,
257
+ "logits/rejected": -1.460770606994629,
258
+ "logps/chosen": -479.5777282714844,
259
+ "logps/rejected": -535.0095825195312,
260
+ "loss": 0.007,
261
  "rewards/accuracies": 1.0,
262
+ "rewards/chosen": 2.6985023021698,
263
+ "rewards/margins": 6.264751434326172,
264
+ "rewards/rejected": -3.5662498474121094,
265
  "step": 170
266
  },
267
  {
268
+ "epoch": 0.9326424870466321,
269
+ "grad_norm": 0.2673930823802948,
270
+ "learning_rate": 6.90846286701209e-06,
271
+ "logits/chosen": -1.8051176071166992,
272
+ "logits/rejected": -1.5203473567962646,
273
+ "logps/chosen": -481.57830810546875,
274
+ "logps/rejected": -542.889404296875,
275
+ "loss": 0.0058,
276
  "rewards/accuracies": 1.0,
277
+ "rewards/chosen": 3.1937761306762695,
278
+ "rewards/margins": 6.7014665603637695,
279
+ "rewards/rejected": -3.507689952850342,
280
  "step": 180
281
  },
282
  {
283
+ "epoch": 0.9844559585492227,
284
+ "grad_norm": 0.5000354647636414,
285
+ "learning_rate": 6.735751295336788e-06,
286
+ "logits/chosen": -1.760645866394043,
287
+ "logits/rejected": -1.4444111585617065,
288
+ "logps/chosen": -480.0152282714844,
289
+ "logps/rejected": -560.7935791015625,
290
+ "loss": 0.0061,
291
  "rewards/accuracies": 1.0,
292
+ "rewards/chosen": 3.096393585205078,
293
+ "rewards/margins": 6.873165130615234,
294
+ "rewards/rejected": -3.7767722606658936,
295
  "step": 190
296
  },
297
  {
298
+ "epoch": 1.0362694300518134,
299
+ "grad_norm": 0.028729300945997238,
300
+ "learning_rate": 6.563039723661486e-06,
301
+ "logits/chosen": -1.768303632736206,
302
+ "logits/rejected": -1.4567553997039795,
303
+ "logps/chosen": -460.3399353027344,
304
+ "logps/rejected": -527.8641357421875,
305
+ "loss": 0.0041,
306
  "rewards/accuracies": 1.0,
307
+ "rewards/chosen": 2.6775355339050293,
308
+ "rewards/margins": 6.8611650466918945,
309
+ "rewards/rejected": -4.183629512786865,
310
  "step": 200
311
  },
312
  {
313
+ "epoch": 1.0880829015544042,
314
+ "grad_norm": 0.03302822262048721,
315
+ "learning_rate": 6.390328151986184e-06,
316
+ "logits/chosen": -1.7313588857650757,
317
+ "logits/rejected": -1.4315208196640015,
318
+ "logps/chosen": -494.01141357421875,
319
+ "logps/rejected": -582.895751953125,
320
+ "loss": 0.0016,
321
  "rewards/accuracies": 1.0,
322
+ "rewards/chosen": 3.1437973976135254,
323
+ "rewards/margins": 7.43361759185791,
324
+ "rewards/rejected": -4.289820194244385,
325
  "step": 210
326
  },
327
  {
328
+ "epoch": 1.1398963730569949,
329
+ "grad_norm": 0.2988791763782501,
330
+ "learning_rate": 6.217616580310881e-06,
331
+ "logits/chosen": -1.7174772024154663,
332
+ "logits/rejected": -1.4190794229507446,
333
+ "logps/chosen": -483.44439697265625,
334
+ "logps/rejected": -574.5506591796875,
335
+ "loss": 0.0049,
336
  "rewards/accuracies": 1.0,
337
+ "rewards/chosen": 2.8517348766326904,
338
+ "rewards/margins": 7.021934509277344,
339
+ "rewards/rejected": -4.170199394226074,
340
  "step": 220
341
  },
342
  {
343
+ "epoch": 1.1917098445595855,
344
+ "grad_norm": 0.1659945398569107,
345
+ "learning_rate": 6.044905008635579e-06,
346
+ "logits/chosen": -1.8686014413833618,
347
+ "logits/rejected": -1.55970299243927,
348
+ "logps/chosen": -448.3949279785156,
349
+ "logps/rejected": -565.4742431640625,
350
+ "loss": 0.0038,
351
  "rewards/accuracies": 1.0,
352
+ "rewards/chosen": 3.227496385574341,
353
+ "rewards/margins": 7.409262180328369,
354
+ "rewards/rejected": -4.181765556335449,
355
  "step": 230
356
  },
357
  {
358
+ "epoch": 1.2435233160621761,
359
+ "grad_norm": 0.0639355331659317,
360
+ "learning_rate": 5.872193436960278e-06,
361
+ "logits/chosen": -1.8060436248779297,
362
+ "logits/rejected": -1.545507550239563,
363
+ "logps/chosen": -478.74609375,
364
+ "logps/rejected": -555.3989868164062,
365
+ "loss": 0.0028,
366
  "rewards/accuracies": 1.0,
367
+ "rewards/chosen": 3.2223987579345703,
368
+ "rewards/margins": 7.58017110824585,
369
+ "rewards/rejected": -4.357772350311279,
370
  "step": 240
371
  },
372
  {
373
+ "epoch": 1.2953367875647668,
374
+ "grad_norm": 1.1631646156311035,
375
+ "learning_rate": 5.699481865284975e-06,
376
+ "logits/chosen": -1.8868166208267212,
377
+ "logits/rejected": -1.5805160999298096,
378
+ "logps/chosen": -430.96728515625,
379
+ "logps/rejected": -526.8347778320312,
380
+ "loss": 0.0034,
381
  "rewards/accuracies": 1.0,
382
+ "rewards/chosen": 3.246020555496216,
383
+ "rewards/margins": 8.115084648132324,
384
+ "rewards/rejected": -4.869065284729004,
385
  "step": 250
386
  },
387
  {
388
+ "epoch": 1.3471502590673574,
389
+ "grad_norm": 0.03497570380568504,
390
+ "learning_rate": 5.526770293609672e-06,
391
+ "logits/chosen": -1.7296884059906006,
392
+ "logits/rejected": -1.4742090702056885,
393
+ "logps/chosen": -495.16925048828125,
394
+ "logps/rejected": -553.0023803710938,
395
+ "loss": 0.0025,
396
  "rewards/accuracies": 1.0,
397
+ "rewards/chosen": 3.5322353839874268,
398
+ "rewards/margins": 7.728882789611816,
399
+ "rewards/rejected": -4.196647644042969,
400
  "step": 260
401
  },
402
  {
403
+ "epoch": 1.3989637305699483,
404
+ "grad_norm": 0.07264388352632523,
405
+ "learning_rate": 5.3540587219343694e-06,
406
+ "logits/chosen": -1.8094289302825928,
407
+ "logits/rejected": -1.5321930646896362,
408
+ "logps/chosen": -463.7430725097656,
409
+ "logps/rejected": -537.8118896484375,
410
+ "loss": 0.0008,
411
  "rewards/accuracies": 1.0,
412
+ "rewards/chosen": 3.354356288909912,
413
+ "rewards/margins": 8.136880874633789,
414
+ "rewards/rejected": -4.782524585723877,
415
  "step": 270
416
  },
417
  {
418
+ "epoch": 1.450777202072539,
419
+ "grad_norm": 0.008589176461100578,
420
+ "learning_rate": 5.1813471502590676e-06,
421
+ "logits/chosen": -1.7734190225601196,
422
+ "logits/rejected": -1.5649479627609253,
423
+ "logps/chosen": -458.47503662109375,
424
+ "logps/rejected": -526.0906372070312,
425
+ "loss": 0.0023,
426
  "rewards/accuracies": 1.0,
427
+ "rewards/chosen": 3.3651351928710938,
428
+ "rewards/margins": 8.263564109802246,
429
+ "rewards/rejected": -4.898428916931152,
430
  "step": 280
431
  },
432
  {
433
+ "epoch": 1.5025906735751295,
434
+ "grad_norm": 0.04104425758123398,
435
+ "learning_rate": 5.008635578583766e-06,
436
+ "logits/chosen": -1.878761649131775,
437
+ "logits/rejected": -1.4834644794464111,
438
+ "logps/chosen": -461.1991271972656,
439
+ "logps/rejected": -540.1414794921875,
440
+ "loss": 0.0012,
441
  "rewards/accuracies": 1.0,
442
+ "rewards/chosen": 3.5035297870635986,
443
+ "rewards/margins": 8.194310188293457,
444
+ "rewards/rejected": -4.690779685974121,
445
  "step": 290
446
  },
447
  {
448
+ "epoch": 1.5544041450777202,
449
+ "grad_norm": 0.01376403495669365,
450
+ "learning_rate": 4.835924006908464e-06,
451
+ "logits/chosen": -1.888736367225647,
452
+ "logits/rejected": -1.5105645656585693,
453
+ "logps/chosen": -470.4767150878906,
454
+ "logps/rejected": -569.6588745117188,
455
+ "loss": 0.0021,
456
  "rewards/accuracies": 1.0,
457
+ "rewards/chosen": 3.319579601287842,
458
+ "rewards/margins": 8.164112091064453,
459
+ "rewards/rejected": -4.8445329666137695,
460
  "step": 300
461
  },
462
  {
463
+ "epoch": 1.6062176165803108,
464
+ "grad_norm": 0.3114416003227234,
465
+ "learning_rate": 4.663212435233161e-06,
466
+ "logits/chosen": -1.85044264793396,
467
+ "logits/rejected": -1.5581977367401123,
468
+ "logps/chosen": -475.26690673828125,
469
+ "logps/rejected": -520.6973876953125,
470
+ "loss": 0.0018,
471
  "rewards/accuracies": 1.0,
472
+ "rewards/chosen": 3.5499215126037598,
473
+ "rewards/margins": 8.229494094848633,
474
+ "rewards/rejected": -4.679572105407715,
475
  "step": 310
476
  },
477
  {
478
+ "epoch": 1.6580310880829017,
479
+ "grad_norm": 0.08180935680866241,
480
+ "learning_rate": 4.490500863557859e-06,
481
+ "logits/chosen": -1.8729044198989868,
482
+ "logits/rejected": -1.6026229858398438,
483
+ "logps/chosen": -452.8746032714844,
484
+ "logps/rejected": -530.5598754882812,
485
+ "loss": 0.0007,
486
  "rewards/accuracies": 1.0,
487
+ "rewards/chosen": 3.1103663444519043,
488
+ "rewards/margins": 8.12248420715332,
489
+ "rewards/rejected": -5.012118339538574,
490
  "step": 320
491
  },
492
  {
493
+ "epoch": 1.709844559585492,
494
+ "grad_norm": 0.03724868223071098,
495
+ "learning_rate": 4.3177892918825564e-06,
496
+ "logits/chosen": -1.8127027750015259,
497
+ "logits/rejected": -1.5433104038238525,
498
+ "logps/chosen": -510.96234130859375,
499
+ "logps/rejected": -539.6664428710938,
500
  "loss": 0.0011,
501
  "rewards/accuracies": 1.0,
502
+ "rewards/chosen": 3.440533399581909,
503
+ "rewards/margins": 8.182977676391602,
504
+ "rewards/rejected": -4.742444038391113,
505
  "step": 330
506
  },
507
  {
508
+ "epoch": 1.761658031088083,
509
+ "grad_norm": 0.024448685348033905,
510
+ "learning_rate": 4.145077720207254e-06,
511
+ "logits/chosen": -1.9371646642684937,
512
+ "logits/rejected": -1.5605006217956543,
513
+ "logps/chosen": -457.09521484375,
514
+ "logps/rejected": -562.9553833007812,
515
+ "loss": 0.0005,
516
  "rewards/accuracies": 1.0,
517
+ "rewards/chosen": 3.8051505088806152,
518
+ "rewards/margins": 8.683636665344238,
519
+ "rewards/rejected": -4.878485679626465,
520
  "step": 340
521
  },
522
  {
523
+ "epoch": 1.8134715025906736,
524
+ "grad_norm": 0.007909624837338924,
525
+ "learning_rate": 3.972366148531952e-06,
526
+ "logits/chosen": -1.7813365459442139,
527
+ "logits/rejected": -1.5386850833892822,
528
+ "logps/chosen": -472.1241760253906,
529
+ "logps/rejected": -526.0204467773438,
530
+ "loss": 0.0008,
531
  "rewards/accuracies": 1.0,
532
+ "rewards/chosen": 3.475656032562256,
533
+ "rewards/margins": 8.600740432739258,
534
+ "rewards/rejected": -5.125083923339844,
535
  "step": 350
536
  },
537
  {
538
+ "epoch": 1.8652849740932642,
539
+ "grad_norm": 0.01100204512476921,
540
+ "learning_rate": 3.7996545768566495e-06,
541
+ "logits/chosen": -1.8551228046417236,
542
+ "logits/rejected": -1.6169090270996094,
543
+ "logps/chosen": -468.3399353027344,
544
+ "logps/rejected": -521.0477294921875,
545
+ "loss": 0.0022,
546
  "rewards/accuracies": 1.0,
547
+ "rewards/chosen": 3.692009449005127,
548
+ "rewards/margins": 8.327119827270508,
549
+ "rewards/rejected": -4.635110855102539,
550
  "step": 360
551
  },
552
  {
553
+ "epoch": 1.917098445595855,
554
+ "grad_norm": 0.002755044959485531,
555
+ "learning_rate": 3.6269430051813476e-06,
556
+ "logits/chosen": -1.920689582824707,
557
+ "logits/rejected": -1.5247467756271362,
558
+ "logps/chosen": -458.05718994140625,
559
+ "logps/rejected": -549.6717529296875,
560
+ "loss": 0.0007,
561
  "rewards/accuracies": 1.0,
562
+ "rewards/chosen": 3.8437609672546387,
563
+ "rewards/margins": 8.824178695678711,
564
+ "rewards/rejected": -4.980417251586914,
565
  "step": 370
566
  },
567
  {
568
+ "epoch": 1.9689119170984455,
569
+ "grad_norm": 0.001801456674002111,
570
+ "learning_rate": 3.454231433506045e-06,
571
+ "logits/chosen": -1.819500207901001,
572
+ "logits/rejected": -1.5786559581756592,
573
+ "logps/chosen": -478.22540283203125,
574
+ "logps/rejected": -571.024658203125,
575
+ "loss": 0.0006,
576
  "rewards/accuracies": 1.0,
577
+ "rewards/chosen": 3.27850079536438,
578
+ "rewards/margins": 8.988458633422852,
579
+ "rewards/rejected": -5.709958076477051,
580
  "step": 380
581
  },
582
  {
583
+ "epoch": 2.0207253886010363,
584
+ "grad_norm": 0.1213519424200058,
585
+ "learning_rate": 3.281519861830743e-06,
586
+ "logits/chosen": -1.7965360879898071,
587
+ "logits/rejected": -1.503381609916687,
588
+ "logps/chosen": -484.7544860839844,
589
+ "logps/rejected": -558.208740234375,
590
+ "loss": 0.0019,
591
  "rewards/accuracies": 1.0,
592
+ "rewards/chosen": 3.11090087890625,
593
+ "rewards/margins": 8.451322555541992,
594
+ "rewards/rejected": -5.340420722961426,
595
  "step": 390
596
  },
597
  {
598
+ "epoch": 2.0725388601036268,
599
+ "grad_norm": 0.033907659351825714,
600
+ "learning_rate": 3.1088082901554407e-06,
601
+ "logits/chosen": -1.9653284549713135,
602
+ "logits/rejected": -1.5873863697052002,
603
+ "logps/chosen": -445.7735290527344,
604
+ "logps/rejected": -544.8649291992188,
605
+ "loss": 0.0003,
606
  "rewards/accuracies": 1.0,
607
+ "rewards/chosen": 3.4412612915039062,
608
+ "rewards/margins": 9.170848846435547,
609
+ "rewards/rejected": -5.729588031768799,
610
  "step": 400
611
  },
612
  {
613
+ "epoch": 2.1243523316062176,
614
+ "grad_norm": 0.006869920063763857,
615
+ "learning_rate": 2.936096718480139e-06,
616
+ "logits/chosen": -1.8688383102416992,
617
+ "logits/rejected": -1.613979697227478,
618
+ "logps/chosen": -463.5596618652344,
619
+ "logps/rejected": -509.37579345703125,
620
+ "loss": 0.0013,
621
  "rewards/accuracies": 1.0,
622
+ "rewards/chosen": 3.1676273345947266,
623
+ "rewards/margins": 8.486566543579102,
624
+ "rewards/rejected": -5.318938732147217,
625
  "step": 410
626
  },
627
  {
628
+ "epoch": 2.1761658031088085,
629
+ "grad_norm": 0.016242429614067078,
630
+ "learning_rate": 2.763385146804836e-06,
631
+ "logits/chosen": -1.8480716943740845,
632
+ "logits/rejected": -1.5438343286514282,
633
+ "logps/chosen": -480.6341857910156,
634
+ "logps/rejected": -562.0118408203125,
635
+ "loss": 0.0004,
636
  "rewards/accuracies": 1.0,
637
+ "rewards/chosen": 3.927326202392578,
638
+ "rewards/margins": 9.179906845092773,
639
+ "rewards/rejected": -5.252579689025879,
640
  "step": 420
641
  },
642
  {
643
+ "epoch": 2.227979274611399,
644
+ "grad_norm": 0.01917063444852829,
645
+ "learning_rate": 2.5906735751295338e-06,
646
+ "logits/chosen": -1.8709628582000732,
647
+ "logits/rejected": -1.5309154987335205,
648
+ "logps/chosen": -454.34661865234375,
649
+ "logps/rejected": -585.8658447265625,
650
+ "loss": 0.0011,
651
  "rewards/accuracies": 1.0,
652
+ "rewards/chosen": 3.5671658515930176,
653
+ "rewards/margins": 8.807685852050781,
654
+ "rewards/rejected": -5.2405195236206055,
655
  "step": 430
656
  },
657
  {
658
+ "epoch": 2.2797927461139897,
659
+ "grad_norm": 0.011940844357013702,
660
+ "learning_rate": 2.417962003454232e-06,
661
+ "logits/chosen": -1.8891462087631226,
662
+ "logits/rejected": -1.5786548852920532,
663
+ "logps/chosen": -476.75482177734375,
664
+ "logps/rejected": -550.165771484375,
665
+ "loss": 0.001,
666
  "rewards/accuracies": 1.0,
667
+ "rewards/chosen": 3.5058720111846924,
668
+ "rewards/margins": 8.823278427124023,
669
+ "rewards/rejected": -5.317407131195068,
670
  "step": 440
671
  },
672
  {
673
+ "epoch": 2.33160621761658,
674
+ "grad_norm": 0.052170418202877045,
675
+ "learning_rate": 2.2452504317789296e-06,
676
+ "logits/chosen": -1.851637840270996,
677
+ "logits/rejected": -1.586336374282837,
678
+ "logps/chosen": -470.76483154296875,
679
+ "logps/rejected": -554.5698852539062,
680
+ "loss": 0.0008,
681
  "rewards/accuracies": 1.0,
682
+ "rewards/chosen": 2.9407029151916504,
683
+ "rewards/margins": 8.625204086303711,
684
+ "rewards/rejected": -5.6845011711120605,
685
  "step": 450
686
  },
687
  {
688
+ "epoch": 2.383419689119171,
689
+ "grad_norm": 0.003704603761434555,
690
+ "learning_rate": 2.072538860103627e-06,
691
+ "logits/chosen": -1.9217126369476318,
692
+ "logits/rejected": -1.5532411336898804,
693
+ "logps/chosen": -446.28118896484375,
694
+ "logps/rejected": -548.06982421875,
695
+ "loss": 0.0006,
696
  "rewards/accuracies": 1.0,
697
+ "rewards/chosen": 3.3570828437805176,
698
+ "rewards/margins": 8.866881370544434,
699
+ "rewards/rejected": -5.509798049926758,
700
  "step": 460
701
  },
702
  {
703
+ "epoch": 2.4352331606217614,
704
+ "grad_norm": 0.12578149139881134,
705
+ "learning_rate": 1.8998272884283248e-06,
706
+ "logits/chosen": -1.7785208225250244,
707
+ "logits/rejected": -1.5287562608718872,
708
+ "logps/chosen": -507.2691955566406,
709
+ "logps/rejected": -557.8547973632812,
710
+ "loss": 0.0017,
711
  "rewards/accuracies": 1.0,
712
+ "rewards/chosen": 3.180009365081787,
713
+ "rewards/margins": 8.626852989196777,
714
+ "rewards/rejected": -5.446843147277832,
715
  "step": 470
716
  },
717
+ {
718
+ "epoch": 2.4870466321243523,
719
+ "grad_norm": 0.04742634296417236,
720
+ "learning_rate": 1.7271157167530224e-06,
721
+ "logits/chosen": -1.7439069747924805,
722
+ "logits/rejected": -1.504612922668457,
723
+ "logps/chosen": -492.09442138671875,
724
+ "logps/rejected": -556.8635864257812,
725
+ "loss": 0.0006,
726
+ "rewards/accuracies": 1.0,
727
+ "rewards/chosen": 3.334986448287964,
728
+ "rewards/margins": 8.468341827392578,
729
+ "rewards/rejected": -5.133355140686035,
730
+ "step": 480
731
+ },
732
+ {
733
+ "epoch": 2.538860103626943,
734
+ "grad_norm": 0.005521442741155624,
735
+ "learning_rate": 1.5544041450777204e-06,
736
+ "logits/chosen": -1.9542961120605469,
737
+ "logits/rejected": -1.5555959939956665,
738
+ "logps/chosen": -452.74365234375,
739
+ "logps/rejected": -573.016357421875,
740
+ "loss": 0.0005,
741
+ "rewards/accuracies": 1.0,
742
+ "rewards/chosen": 4.025125026702881,
743
+ "rewards/margins": 9.435617446899414,
744
+ "rewards/rejected": -5.410491466522217,
745
+ "step": 490
746
+ },
747
+ {
748
+ "epoch": 2.5906735751295336,
749
+ "grad_norm": 0.004860945511609316,
750
+ "learning_rate": 1.381692573402418e-06,
751
+ "logits/chosen": -1.9969520568847656,
752
+ "logits/rejected": -1.6214845180511475,
753
+ "logps/chosen": -430.89959716796875,
754
+ "logps/rejected": -567.4713134765625,
755
+ "loss": 0.001,
756
+ "rewards/accuracies": 1.0,
757
+ "rewards/chosen": 3.1888136863708496,
758
+ "rewards/margins": 8.952463150024414,
759
+ "rewards/rejected": -5.763648509979248,
760
+ "step": 500
761
+ },
762
+ {
763
+ "epoch": 2.6424870466321244,
764
+ "grad_norm": 0.015202810987830162,
765
+ "learning_rate": 1.208981001727116e-06,
766
+ "logits/chosen": -1.8306211233139038,
767
+ "logits/rejected": -1.5266889333724976,
768
+ "logps/chosen": -476.01239013671875,
769
+ "logps/rejected": -548.576171875,
770
+ "loss": 0.0005,
771
+ "rewards/accuracies": 1.0,
772
+ "rewards/chosen": 3.322007417678833,
773
+ "rewards/margins": 8.947591781616211,
774
+ "rewards/rejected": -5.625583648681641,
775
+ "step": 510
776
+ },
777
+ {
778
+ "epoch": 2.694300518134715,
779
+ "grad_norm": 0.0031490169931203127,
780
+ "learning_rate": 1.0362694300518134e-06,
781
+ "logits/chosen": -1.7937307357788086,
782
+ "logits/rejected": -1.5639219284057617,
783
+ "logps/chosen": -476.2064514160156,
784
+ "logps/rejected": -544.3578491210938,
785
+ "loss": 0.0009,
786
+ "rewards/accuracies": 1.0,
787
+ "rewards/chosen": 3.444533586502075,
788
+ "rewards/margins": 8.885669708251953,
789
+ "rewards/rejected": -5.441136837005615,
790
+ "step": 520
791
+ },
792
+ {
793
+ "epoch": 2.7461139896373057,
794
+ "grad_norm": 0.009058469906449318,
795
+ "learning_rate": 8.635578583765112e-07,
796
+ "logits/chosen": -1.9383313655853271,
797
+ "logits/rejected": -1.6197090148925781,
798
+ "logps/chosen": -452.0750427246094,
799
+ "logps/rejected": -558.096435546875,
800
+ "loss": 0.0006,
801
+ "rewards/accuracies": 1.0,
802
+ "rewards/chosen": 3.907386064529419,
803
+ "rewards/margins": 9.26667594909668,
804
+ "rewards/rejected": -5.359290599822998,
805
+ "step": 530
806
+ },
807
+ {
808
+ "epoch": 2.7979274611398965,
809
+ "grad_norm": 0.001649867626838386,
810
+ "learning_rate": 6.90846286701209e-07,
811
+ "logits/chosen": -1.7749992609024048,
812
+ "logits/rejected": -1.5281288623809814,
813
+ "logps/chosen": -506.68621826171875,
814
+ "logps/rejected": -570.96630859375,
815
+ "loss": 0.001,
816
+ "rewards/accuracies": 1.0,
817
+ "rewards/chosen": 3.9603734016418457,
818
+ "rewards/margins": 9.308938026428223,
819
+ "rewards/rejected": -5.348565101623535,
820
+ "step": 540
821
+ },
822
+ {
823
+ "epoch": 2.849740932642487,
824
+ "grad_norm": 0.1349392831325531,
825
+ "learning_rate": 5.181347150259067e-07,
826
+ "logits/chosen": -1.9128179550170898,
827
+ "logits/rejected": -1.5522171258926392,
828
+ "logps/chosen": -459.3995666503906,
829
+ "logps/rejected": -547.3722534179688,
830
+ "loss": 0.0008,
831
+ "rewards/accuracies": 1.0,
832
+ "rewards/chosen": 3.1473488807678223,
833
+ "rewards/margins": 8.742547988891602,
834
+ "rewards/rejected": -5.595198631286621,
835
+ "step": 550
836
+ },
837
+ {
838
+ "epoch": 2.901554404145078,
839
+ "grad_norm": 0.0068402839824557304,
840
+ "learning_rate": 3.454231433506045e-07,
841
+ "logits/chosen": -1.833030104637146,
842
+ "logits/rejected": -1.5440208911895752,
843
+ "logps/chosen": -476.62060546875,
844
+ "logps/rejected": -545.6508178710938,
845
+ "loss": 0.0005,
846
+ "rewards/accuracies": 1.0,
847
+ "rewards/chosen": 3.1847169399261475,
848
+ "rewards/margins": 9.365403175354004,
849
+ "rewards/rejected": -6.180685997009277,
850
+ "step": 560
851
+ },
852
+ {
853
+ "epoch": 2.9533678756476682,
854
+ "grad_norm": 0.009579150937497616,
855
+ "learning_rate": 1.7271157167530226e-07,
856
+ "logits/chosen": -1.7974345684051514,
857
+ "logits/rejected": -1.507267713546753,
858
+ "logps/chosen": -483.2386779785156,
859
+ "logps/rejected": -540.8428955078125,
860
+ "loss": 0.0014,
861
+ "rewards/accuracies": 1.0,
862
+ "rewards/chosen": 2.946927070617676,
863
+ "rewards/margins": 8.679272651672363,
864
+ "rewards/rejected": -5.732344627380371,
865
+ "step": 570
866
+ },
867
  {
868
  "epoch": 3.0,
869
+ "step": 579,
870
  "total_flos": 0.0,
871
+ "train_loss": 0.055241224655177,
872
+ "train_runtime": 1305.5765,
873
+ "train_samples_per_second": 1.774,
874
+ "train_steps_per_second": 0.443
875
  }
876
  ],
877
  "logging_steps": 10,
878
+ "max_steps": 579,
879
  "num_input_tokens_seen": 0,
880
  "num_train_epochs": 3,
881
  "save_steps": 100,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:103731271f099b914bbf8a25222d8c2b595695ca7764d33196520cc1ba53a4df
3
  size 6929
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e903857df414eaf4d2e8deceb20c7a0c7f1ac3a16d6cd4a85bd6c37fa86ea8
3
  size 6929