diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6329 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2000, + "global_step": 4168, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002399232245681382, + "grad_norm": 8.049803993229359, + "learning_rate": 1.199040767386091e-09, + "logits/chosen": -1.6984773874282837, + "logits/rejected": -1.652543067932129, + "logps/chosen": -177.60882568359375, + "logps/rejected": -183.90121459960938, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0023992322456813818, + "grad_norm": 8.750131214617003, + "learning_rate": 1.199040767386091e-08, + "logits/chosen": -1.804961919784546, + "logits/rejected": -1.80453622341156, + "logps/chosen": -423.9947814941406, + "logps/rejected": -346.7825012207031, + "loss": 0.6933, + "rewards/accuracies": 0.3333333432674408, + "rewards/chosen": -0.0014058154774829745, + "rewards/margins": -0.0022073043510317802, + "rewards/rejected": 0.0008014890481717885, + "step": 10 + }, + { + "epoch": 0.0047984644913627635, + "grad_norm": 8.998429571520733, + "learning_rate": 2.398081534772182e-08, + "logits/chosen": -1.8381140232086182, + "logits/rejected": -1.8816156387329102, + "logps/chosen": -286.90869140625, + "logps/rejected": -251.70315551757812, + "loss": 0.6929, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0007559108780696988, + "rewards/margins": 0.0014887532452121377, + "rewards/rejected": -0.0007328423671424389, + "step": 20 + }, + { + "epoch": 0.007197696737044146, + "grad_norm": 8.318610335466628, + "learning_rate": 3.597122302158273e-08, + "logits/chosen": -1.6924991607666016, + "logits/rejected": -1.693213701248169, + "logps/chosen": -279.1492004394531, + "logps/rejected": -281.7704162597656, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.85531859844923e-06, + "rewards/margins": 0.0011773824226111174, + "rewards/rejected": -0.0011802377412095666, + "step": 30 + }, + { + "epoch": 0.009596928982725527, + "grad_norm": 8.147876496687319, + "learning_rate": 4.796163069544364e-08, + "logits/chosen": -1.8182836771011353, + "logits/rejected": -1.788740873336792, + "logps/chosen": -281.30780029296875, + "logps/rejected": -273.28167724609375, + "loss": 0.6935, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00014996049867477268, + "rewards/margins": -0.0007198990788310766, + "rewards/rejected": 0.0008698596502654254, + "step": 40 + }, + { + "epoch": 0.01199616122840691, + "grad_norm": 8.87090064769473, + "learning_rate": 5.995203836930455e-08, + "logits/chosen": -1.8855431079864502, + "logits/rejected": -1.8159040212631226, + "logps/chosen": -316.3466796875, + "logps/rejected": -269.6546936035156, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0015332108596339822, + "rewards/margins": 0.0016623124247416854, + "rewards/rejected": -0.00012910175428260118, + "step": 50 + }, + { + "epoch": 0.014395393474088292, + "grad_norm": 9.378840909866367, + "learning_rate": 7.194244604316546e-08, + "logits/chosen": -1.7035154104232788, + "logits/rejected": -1.7044557332992554, + "logps/chosen": -328.5385437011719, + "logps/rejected": -298.63531494140625, + "loss": 0.6932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0005593986716121435, + "rewards/margins": -0.0008079404942691326, + "rewards/rejected": 0.0013673391658812761, + "step": 60 + }, + { + "epoch": 0.016794625719769675, + "grad_norm": 8.296213072753831, + "learning_rate": 8.393285371702638e-08, + "logits/chosen": -1.774336814880371, + "logits/rejected": -1.7971302270889282, + "logps/chosen": -320.2900695800781, + "logps/rejected": -307.18408203125, + "loss": 0.6932, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0004135914205107838, + "rewards/margins": 0.00020718287851195782, + "rewards/rejected": -0.0006207742262631655, + "step": 70 + }, + { + "epoch": 0.019193857965451054, + "grad_norm": 8.510851873886816, + "learning_rate": 9.592326139088728e-08, + "logits/chosen": -1.7330459356307983, + "logits/rejected": -1.752417802810669, + "logps/chosen": -236.32534790039062, + "logps/rejected": -273.98394775390625, + "loss": 0.6931, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0007922305958345532, + "rewards/margins": 0.0012075863778591156, + "rewards/rejected": -0.0019998166244477034, + "step": 80 + }, + { + "epoch": 0.021593090211132437, + "grad_norm": 8.689288930584961, + "learning_rate": 1.0791366906474819e-07, + "logits/chosen": -1.7583744525909424, + "logits/rejected": -1.74367356300354, + "logps/chosen": -381.2481994628906, + "logps/rejected": -330.4988708496094, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.000420374795794487, + "rewards/margins": 0.0009064264595508575, + "rewards/rejected": -0.0013268012553453445, + "step": 90 + }, + { + "epoch": 0.02399232245681382, + "grad_norm": 8.556373022379947, + "learning_rate": 1.199040767386091e-07, + "logits/chosen": -1.7948977947235107, + "logits/rejected": -1.8129732608795166, + "logps/chosen": -298.7731018066406, + "logps/rejected": -312.20770263671875, + "loss": 0.6927, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0014683209592476487, + "rewards/margins": 0.0018057385459542274, + "rewards/rejected": -0.003274058923125267, + "step": 100 + }, + { + "epoch": 0.026391554702495202, + "grad_norm": 7.674645359699389, + "learning_rate": 1.3189448441247004e-07, + "logits/chosen": -1.7367664575576782, + "logits/rejected": -1.7342132329940796, + "logps/chosen": -263.44598388671875, + "logps/rejected": -262.49713134765625, + "loss": 0.6927, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0006875266553834081, + "rewards/margins": 0.0018852024804800749, + "rewards/rejected": -0.0025727287866175175, + "step": 110 + }, + { + "epoch": 0.028790786948176585, + "grad_norm": 8.703203021899013, + "learning_rate": 1.4388489208633092e-07, + "logits/chosen": -1.7811676263809204, + "logits/rejected": -1.7650413513183594, + "logps/chosen": -351.61053466796875, + "logps/rejected": -314.179931640625, + "loss": 0.692, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0011009855661541224, + "rewards/margins": 0.002133227651938796, + "rewards/rejected": -0.0032342136837542057, + "step": 120 + }, + { + "epoch": 0.031190019193857964, + "grad_norm": 7.740616810291473, + "learning_rate": 1.5587529976019183e-07, + "logits/chosen": -1.7301524877548218, + "logits/rejected": -1.7630159854888916, + "logps/chosen": -247.3012237548828, + "logps/rejected": -335.6864013671875, + "loss": 0.6918, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0020864324178546667, + "rewards/margins": 0.0025339422281831503, + "rewards/rejected": -0.00462037418037653, + "step": 130 + }, + { + "epoch": 0.03358925143953935, + "grad_norm": 8.996324509623863, + "learning_rate": 1.6786570743405277e-07, + "logits/chosen": -1.7372894287109375, + "logits/rejected": -1.7900381088256836, + "logps/chosen": -317.2925720214844, + "logps/rejected": -304.47003173828125, + "loss": 0.6916, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00546858087182045, + "rewards/margins": 0.0015602625207975507, + "rewards/rejected": -0.007028843276202679, + "step": 140 + }, + { + "epoch": 0.03598848368522073, + "grad_norm": 8.476448197373312, + "learning_rate": 1.7985611510791365e-07, + "logits/chosen": -1.8100560903549194, + "logits/rejected": -1.783705711364746, + "logps/chosen": -263.62109375, + "logps/rejected": -255.08297729492188, + "loss": 0.6914, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00241624447517097, + "rewards/margins": 0.004475831054151058, + "rewards/rejected": -0.0068920752964913845, + "step": 150 + }, + { + "epoch": 0.03838771593090211, + "grad_norm": 8.422913971718383, + "learning_rate": 1.9184652278177456e-07, + "logits/chosen": -1.7406810522079468, + "logits/rejected": -1.6746171712875366, + "logps/chosen": -337.25140380859375, + "logps/rejected": -262.32965087890625, + "loss": 0.6909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.004762451164424419, + "rewards/margins": 0.004000225104391575, + "rewards/rejected": -0.008762676268815994, + "step": 160 + }, + { + "epoch": 0.040786948176583494, + "grad_norm": 7.894373729717768, + "learning_rate": 2.038369304556355e-07, + "logits/chosen": -1.8033506870269775, + "logits/rejected": -1.8009182214736938, + "logps/chosen": -391.51910400390625, + "logps/rejected": -383.02490234375, + "loss": 0.6908, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0060736616142094135, + "rewards/margins": 0.007023586425930262, + "rewards/rejected": -0.0130972471088171, + "step": 170 + }, + { + "epoch": 0.04318618042226487, + "grad_norm": 8.987823337983558, + "learning_rate": 2.1582733812949638e-07, + "logits/chosen": -1.78794264793396, + "logits/rejected": -1.7560735940933228, + "logps/chosen": -265.91082763671875, + "logps/rejected": -259.29266357421875, + "loss": 0.6905, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.005672088824212551, + "rewards/margins": 0.0077647133730351925, + "rewards/rejected": -0.013436801731586456, + "step": 180 + }, + { + "epoch": 0.04558541266794626, + "grad_norm": 9.837905770088794, + "learning_rate": 2.278177458033573e-07, + "logits/chosen": -1.7344732284545898, + "logits/rejected": -1.7171862125396729, + "logps/chosen": -345.572509765625, + "logps/rejected": -281.0559387207031, + "loss": 0.6899, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006667067296802998, + "rewards/margins": 0.008044905960559845, + "rewards/rejected": -0.014711974188685417, + "step": 190 + }, + { + "epoch": 0.04798464491362764, + "grad_norm": 7.67650953223531, + "learning_rate": 2.398081534772182e-07, + "logits/chosen": -1.8390235900878906, + "logits/rejected": -1.809404730796814, + "logps/chosen": -350.47918701171875, + "logps/rejected": -329.0401611328125, + "loss": 0.6883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.006954600103199482, + "rewards/margins": 0.009905180893838406, + "rewards/rejected": -0.016859780997037888, + "step": 200 + }, + { + "epoch": 0.05038387715930902, + "grad_norm": 9.0533724066681, + "learning_rate": 2.517985611510791e-07, + "logits/chosen": -1.838897466659546, + "logits/rejected": -1.85344660282135, + "logps/chosen": -265.4543762207031, + "logps/rejected": -286.2290344238281, + "loss": 0.687, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.009936061687767506, + "rewards/margins": 0.010199328884482384, + "rewards/rejected": -0.020135391503572464, + "step": 210 + }, + { + "epoch": 0.052783109404990404, + "grad_norm": 8.012931923793188, + "learning_rate": 2.637889688249401e-07, + "logits/chosen": -1.8423738479614258, + "logits/rejected": -1.8508341312408447, + "logps/chosen": -354.40838623046875, + "logps/rejected": -349.57476806640625, + "loss": 0.6877, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01530354656279087, + "rewards/margins": 0.00864893477410078, + "rewards/rejected": -0.023952480405569077, + "step": 220 + }, + { + "epoch": 0.05518234165067178, + "grad_norm": 8.477662927787723, + "learning_rate": 2.7577937649880093e-07, + "logits/chosen": -1.8172448873519897, + "logits/rejected": -1.884603500366211, + "logps/chosen": -273.24102783203125, + "logps/rejected": -312.290771484375, + "loss": 0.6851, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01350661925971508, + "rewards/margins": 0.024537406861782074, + "rewards/rejected": -0.0380440279841423, + "step": 230 + }, + { + "epoch": 0.05758157389635317, + "grad_norm": 9.259460223265522, + "learning_rate": 2.8776978417266184e-07, + "logits/chosen": -1.7631242275238037, + "logits/rejected": -1.746927261352539, + "logps/chosen": -335.8398742675781, + "logps/rejected": -288.20123291015625, + "loss": 0.6819, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01196887157857418, + "rewards/margins": 0.023810286074876785, + "rewards/rejected": -0.035779163241386414, + "step": 240 + }, + { + "epoch": 0.05998080614203455, + "grad_norm": 9.099248186160446, + "learning_rate": 2.997601918465228e-07, + "logits/chosen": -1.7817866802215576, + "logits/rejected": -1.763625144958496, + "logps/chosen": -273.6108703613281, + "logps/rejected": -265.0091247558594, + "loss": 0.6818, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02008233033120632, + "rewards/margins": 0.022292112931609154, + "rewards/rejected": -0.042374443262815475, + "step": 250 + }, + { + "epoch": 0.06238003838771593, + "grad_norm": 8.576523972151957, + "learning_rate": 3.1175059952038366e-07, + "logits/chosen": -1.7974933385849, + "logits/rejected": -1.793474793434143, + "logps/chosen": -295.26904296875, + "logps/rejected": -294.5319519042969, + "loss": 0.6788, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.01993853785097599, + "rewards/margins": 0.03191729635000229, + "rewards/rejected": -0.05185583978891373, + "step": 260 + }, + { + "epoch": 0.0647792706333973, + "grad_norm": 9.183763780102984, + "learning_rate": 3.2374100719424457e-07, + "logits/chosen": -1.6876914501190186, + "logits/rejected": -1.641826868057251, + "logps/chosen": -325.13641357421875, + "logps/rejected": -264.55670166015625, + "loss": 0.6781, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021591879427433014, + "rewards/margins": 0.01398774515837431, + "rewards/rejected": -0.0355796255171299, + "step": 270 + }, + { + "epoch": 0.0671785028790787, + "grad_norm": 9.308232945425354, + "learning_rate": 3.3573141486810554e-07, + "logits/chosen": -1.6838023662567139, + "logits/rejected": -1.7079963684082031, + "logps/chosen": -335.07940673828125, + "logps/rejected": -321.0735778808594, + "loss": 0.6723, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.026867201551795006, + "rewards/margins": 0.040614087134599686, + "rewards/rejected": -0.06748128682374954, + "step": 280 + }, + { + "epoch": 0.06957773512476008, + "grad_norm": 8.28284151492571, + "learning_rate": 3.477218225419664e-07, + "logits/chosen": -1.7532460689544678, + "logits/rejected": -1.7269245386123657, + "logps/chosen": -324.70703125, + "logps/rejected": -296.92889404296875, + "loss": 0.6736, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03919905424118042, + "rewards/margins": 0.05154905468225479, + "rewards/rejected": -0.09074810147285461, + "step": 290 + }, + { + "epoch": 0.07197696737044146, + "grad_norm": 9.050119295228424, + "learning_rate": 3.597122302158273e-07, + "logits/chosen": -1.7930822372436523, + "logits/rejected": -1.8066909313201904, + "logps/chosen": -293.43310546875, + "logps/rejected": -310.6910705566406, + "loss": 0.6687, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.04249059036374092, + "rewards/margins": 0.0662965252995491, + "rewards/rejected": -0.10878710448741913, + "step": 300 + }, + { + "epoch": 0.07437619961612284, + "grad_norm": 8.443455234272207, + "learning_rate": 3.7170263788968827e-07, + "logits/chosen": -1.7562000751495361, + "logits/rejected": -1.7601397037506104, + "logps/chosen": -307.3702087402344, + "logps/rejected": -265.32757568359375, + "loss": 0.6738, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.046549711376428604, + "rewards/margins": 0.06525909900665283, + "rewards/rejected": -0.11180879920721054, + "step": 310 + }, + { + "epoch": 0.07677543186180422, + "grad_norm": 7.795456695576063, + "learning_rate": 3.836930455635491e-07, + "logits/chosen": -1.7414474487304688, + "logits/rejected": -1.7113139629364014, + "logps/chosen": -316.69781494140625, + "logps/rejected": -282.4702453613281, + "loss": 0.6694, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06451869010925293, + "rewards/margins": 0.043876685202121735, + "rewards/rejected": -0.10839537531137466, + "step": 320 + }, + { + "epoch": 0.07917466410748561, + "grad_norm": 8.462069845426832, + "learning_rate": 3.9568345323741003e-07, + "logits/chosen": -1.6318508386611938, + "logits/rejected": -1.672313928604126, + "logps/chosen": -287.7787170410156, + "logps/rejected": -332.0728454589844, + "loss": 0.6642, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07303015887737274, + "rewards/margins": 0.07465466111898422, + "rewards/rejected": -0.14768482744693756, + "step": 330 + }, + { + "epoch": 0.08157389635316699, + "grad_norm": 8.05742553524233, + "learning_rate": 4.07673860911271e-07, + "logits/chosen": -1.775557279586792, + "logits/rejected": -1.7472593784332275, + "logps/chosen": -278.8173828125, + "logps/rejected": -304.4925231933594, + "loss": 0.6637, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07761813700199127, + "rewards/margins": 0.08343236148357391, + "rewards/rejected": -0.161050483584404, + "step": 340 + }, + { + "epoch": 0.08397312859884837, + "grad_norm": 9.291989006383519, + "learning_rate": 4.1966426858513185e-07, + "logits/chosen": -1.7488950490951538, + "logits/rejected": -1.7599961757659912, + "logps/chosen": -314.4501953125, + "logps/rejected": -316.56805419921875, + "loss": 0.6681, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09962528944015503, + "rewards/margins": 0.05370550602674484, + "rewards/rejected": -0.15333080291748047, + "step": 350 + }, + { + "epoch": 0.08637236084452975, + "grad_norm": 9.786314458966762, + "learning_rate": 4.3165467625899276e-07, + "logits/chosen": -1.7737061977386475, + "logits/rejected": -1.7398583889007568, + "logps/chosen": -308.25323486328125, + "logps/rejected": -258.8381652832031, + "loss": 0.6589, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12272640317678452, + "rewards/margins": 0.08457072079181671, + "rewards/rejected": -0.20729713141918182, + "step": 360 + }, + { + "epoch": 0.08877159309021113, + "grad_norm": 9.472324858494085, + "learning_rate": 4.436450839328537e-07, + "logits/chosen": -1.7392994165420532, + "logits/rejected": -1.7540203332901, + "logps/chosen": -282.6414794921875, + "logps/rejected": -301.9410705566406, + "loss": 0.656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1569872945547104, + "rewards/margins": 0.09970203042030334, + "rewards/rejected": -0.2566893398761749, + "step": 370 + }, + { + "epoch": 0.09117082533589252, + "grad_norm": 8.945522303809742, + "learning_rate": 4.556354916067146e-07, + "logits/chosen": -1.6825107336044312, + "logits/rejected": -1.7148678302764893, + "logps/chosen": -287.05340576171875, + "logps/rejected": -287.68377685546875, + "loss": 0.6442, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10992908477783203, + "rewards/margins": 0.08619780838489532, + "rewards/rejected": -0.19612689316272736, + "step": 380 + }, + { + "epoch": 0.0935700575815739, + "grad_norm": 8.713684926919127, + "learning_rate": 4.676258992805755e-07, + "logits/chosen": -1.7045574188232422, + "logits/rejected": -1.6891578435897827, + "logps/chosen": -313.6225891113281, + "logps/rejected": -286.752197265625, + "loss": 0.6464, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1495663821697235, + "rewards/margins": 0.06755375862121582, + "rewards/rejected": -0.21712012588977814, + "step": 390 + }, + { + "epoch": 0.09596928982725528, + "grad_norm": 9.63808942427421, + "learning_rate": 4.796163069544364e-07, + "logits/chosen": -1.782142996788025, + "logits/rejected": -1.792083978652954, + "logps/chosen": -314.01641845703125, + "logps/rejected": -308.6452331542969, + "loss": 0.644, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1659998595714569, + "rewards/margins": 0.17172598838806152, + "rewards/rejected": -0.33772581815719604, + "step": 400 + }, + { + "epoch": 0.09836852207293666, + "grad_norm": 9.78181722006622, + "learning_rate": 4.916067146282974e-07, + "logits/chosen": -1.735081434249878, + "logits/rejected": -1.744166374206543, + "logps/chosen": -297.74920654296875, + "logps/rejected": -331.5750427246094, + "loss": 0.6355, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19746237993240356, + "rewards/margins": 0.12206967175006866, + "rewards/rejected": -0.3195320665836334, + "step": 410 + }, + { + "epoch": 0.10076775431861804, + "grad_norm": 10.973904389838358, + "learning_rate": 4.999992108529978e-07, + "logits/chosen": -1.6683883666992188, + "logits/rejected": -1.660827398300171, + "logps/chosen": -400.2232971191406, + "logps/rejected": -380.44244384765625, + "loss": 0.6377, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22093410789966583, + "rewards/margins": 0.200751394033432, + "rewards/rejected": -0.42168551683425903, + "step": 420 + }, + { + "epoch": 0.10316698656429943, + "grad_norm": 10.093697115080502, + "learning_rate": 4.999851817115532e-07, + "logits/chosen": -1.762123465538025, + "logits/rejected": -1.7475643157958984, + "logps/chosen": -319.5059814453125, + "logps/rejected": -318.26104736328125, + "loss": 0.647, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.25812965631484985, + "rewards/margins": 0.19176968932151794, + "rewards/rejected": -0.4498993456363678, + "step": 430 + }, + { + "epoch": 0.10556621880998081, + "grad_norm": 8.089888318648988, + "learning_rate": 4.999536171027889e-07, + "logits/chosen": -1.726788878440857, + "logits/rejected": -1.724705457687378, + "logps/chosen": -322.8064270019531, + "logps/rejected": -320.8489685058594, + "loss": 0.64, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.29395708441734314, + "rewards/margins": 0.11490349471569061, + "rewards/rejected": -0.40886059403419495, + "step": 440 + }, + { + "epoch": 0.10796545105566219, + "grad_norm": 9.781033765252214, + "learning_rate": 4.999045192408369e-07, + "logits/chosen": -1.6206414699554443, + "logits/rejected": -1.5667846202850342, + "logps/chosen": -308.5359191894531, + "logps/rejected": -297.2460632324219, + "loss": 0.6376, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.31390589475631714, + "rewards/margins": 0.1268848180770874, + "rewards/rejected": -0.44079071283340454, + "step": 450 + }, + { + "epoch": 0.11036468330134357, + "grad_norm": 10.092022945165324, + "learning_rate": 4.998378915697171e-07, + "logits/chosen": -1.7103846073150635, + "logits/rejected": -1.7098249197006226, + "logps/chosen": -341.14385986328125, + "logps/rejected": -354.93634033203125, + "loss": 0.6202, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24522802233695984, + "rewards/margins": 0.224385067820549, + "rewards/rejected": -0.4696131646633148, + "step": 460 + }, + { + "epoch": 0.11276391554702495, + "grad_norm": 9.541263126777299, + "learning_rate": 4.997537387630958e-07, + "logits/chosen": -1.6393098831176758, + "logits/rejected": -1.6209900379180908, + "logps/chosen": -283.2040100097656, + "logps/rejected": -303.1927795410156, + "loss": 0.6117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3058682382106781, + "rewards/margins": 0.19925224781036377, + "rewards/rejected": -0.5051204562187195, + "step": 470 + }, + { + "epoch": 0.11516314779270634, + "grad_norm": 11.184930518818184, + "learning_rate": 4.996520667239582e-07, + "logits/chosen": -1.6991548538208008, + "logits/rejected": -1.7450830936431885, + "logps/chosen": -310.8371887207031, + "logps/rejected": -389.68011474609375, + "loss": 0.6176, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29672256112098694, + "rewards/margins": 0.26733893156051636, + "rewards/rejected": -0.5640615224838257, + "step": 480 + }, + { + "epoch": 0.11756238003838772, + "grad_norm": 10.843161831038772, + "learning_rate": 4.995328825841939e-07, + "logits/chosen": -1.653541922569275, + "logits/rejected": -1.6684210300445557, + "logps/chosen": -296.4989013671875, + "logps/rejected": -316.607666015625, + "loss": 0.6125, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2654375433921814, + "rewards/margins": 0.30850234627723694, + "rewards/rejected": -0.573939859867096, + "step": 490 + }, + { + "epoch": 0.1199616122840691, + "grad_norm": 10.23433288963883, + "learning_rate": 4.993961947040967e-07, + "logits/chosen": -1.6451704502105713, + "logits/rejected": -1.6391490697860718, + "logps/chosen": -367.9749755859375, + "logps/rejected": -340.9283447265625, + "loss": 0.6179, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.48997989296913147, + "rewards/margins": 0.12087050825357437, + "rewards/rejected": -0.61085045337677, + "step": 500 + }, + { + "epoch": 0.12236084452975048, + "grad_norm": 9.19004004428547, + "learning_rate": 4.992420126717784e-07, + "logits/chosen": -1.6687390804290771, + "logits/rejected": -1.6875911951065063, + "logps/chosen": -327.46588134765625, + "logps/rejected": -360.41552734375, + "loss": 0.6146, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3033837676048279, + "rewards/margins": 0.42626506090164185, + "rewards/rejected": -0.729648768901825, + "step": 510 + }, + { + "epoch": 0.12476007677543186, + "grad_norm": 15.16716980236411, + "learning_rate": 4.990703473024958e-07, + "logits/chosen": -1.622243881225586, + "logits/rejected": -1.6250410079956055, + "logps/chosen": -370.47137451171875, + "logps/rejected": -377.8720703125, + "loss": 0.6308, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4499855637550354, + "rewards/margins": 0.20455200970172882, + "rewards/rejected": -0.654537558555603, + "step": 520 + }, + { + "epoch": 0.12715930902111325, + "grad_norm": 11.167342534460241, + "learning_rate": 4.98881210637893e-07, + "logits/chosen": -1.5651328563690186, + "logits/rejected": -1.6187912225723267, + "logps/chosen": -289.76129150390625, + "logps/rejected": -348.9803771972656, + "loss": 0.6258, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3775269687175751, + "rewards/margins": 0.2683083415031433, + "rewards/rejected": -0.6458353400230408, + "step": 530 + }, + { + "epoch": 0.1295585412667946, + "grad_norm": 9.673391216100024, + "learning_rate": 4.986746159451553e-07, + "logits/chosen": -1.6248515844345093, + "logits/rejected": -1.6473302841186523, + "logps/chosen": -328.5186767578125, + "logps/rejected": -339.1051025390625, + "loss": 0.6084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3727303743362427, + "rewards/margins": 0.29726287722587585, + "rewards/rejected": -0.6699932217597961, + "step": 540 + }, + { + "epoch": 0.131957773512476, + "grad_norm": 9.441216851120826, + "learning_rate": 4.984505777160795e-07, + "logits/chosen": -1.5983726978302002, + "logits/rejected": -1.6185548305511475, + "logps/chosen": -374.35858154296875, + "logps/rejected": -409.3572998046875, + "loss": 0.6263, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4160590171813965, + "rewards/margins": 0.3183329403400421, + "rewards/rejected": -0.7343919277191162, + "step": 550 + }, + { + "epoch": 0.1343570057581574, + "grad_norm": 10.885244677737745, + "learning_rate": 4.982091116660574e-07, + "logits/chosen": -1.7161201238632202, + "logits/rejected": -1.7018096446990967, + "logps/chosen": -270.3515930175781, + "logps/rejected": -255.3383331298828, + "loss": 0.6344, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4239963889122009, + "rewards/margins": 0.11987926810979843, + "rewards/rejected": -0.5438756942749023, + "step": 560 + }, + { + "epoch": 0.13675623800383876, + "grad_norm": 12.53005655363697, + "learning_rate": 4.979502347329732e-07, + "logits/chosen": -1.6089760065078735, + "logits/rejected": -1.6298621892929077, + "logps/chosen": -365.46435546875, + "logps/rejected": -424.795654296875, + "loss": 0.6191, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5458365082740784, + "rewards/margins": 0.3269902467727661, + "rewards/rejected": -0.8728267550468445, + "step": 570 + }, + { + "epoch": 0.13915547024952016, + "grad_norm": 12.599522291383552, + "learning_rate": 4.976739650760151e-07, + "logits/chosen": -1.7041542530059814, + "logits/rejected": -1.6849644184112549, + "logps/chosen": -338.31207275390625, + "logps/rejected": -345.06427001953125, + "loss": 0.6027, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45573872327804565, + "rewards/margins": 0.2169903963804245, + "rewards/rejected": -0.6727291345596313, + "step": 580 + }, + { + "epoch": 0.14155470249520152, + "grad_norm": 17.592461761068698, + "learning_rate": 4.97380322074402e-07, + "logits/chosen": -1.705082893371582, + "logits/rejected": -1.6792469024658203, + "logps/chosen": -314.9473876953125, + "logps/rejected": -337.86419677734375, + "loss": 0.6321, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.58912193775177, + "rewards/margins": 0.29743272066116333, + "rewards/rejected": -0.8865545392036438, + "step": 590 + }, + { + "epoch": 0.14395393474088292, + "grad_norm": 13.741852019538134, + "learning_rate": 4.970693263260237e-07, + "logits/chosen": -1.5637818574905396, + "logits/rejected": -1.5617700815200806, + "logps/chosen": -378.72540283203125, + "logps/rejected": -363.66973876953125, + "loss": 0.6205, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3927595019340515, + "rewards/margins": 0.42179447412490845, + "rewards/rejected": -0.8145539164543152, + "step": 600 + }, + { + "epoch": 0.1463531669865643, + "grad_norm": 11.43978087398807, + "learning_rate": 4.967409996459966e-07, + "logits/chosen": -1.6840118169784546, + "logits/rejected": -1.7043100595474243, + "logps/chosen": -339.77081298828125, + "logps/rejected": -339.2174987792969, + "loss": 0.6071, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46397480368614197, + "rewards/margins": 0.2957712709903717, + "rewards/rejected": -0.7597460746765137, + "step": 610 + }, + { + "epoch": 0.14875239923224567, + "grad_norm": 10.153123358958398, + "learning_rate": 4.963953650651326e-07, + "logits/chosen": -1.6245992183685303, + "logits/rejected": -1.5920209884643555, + "logps/chosen": -443.45947265625, + "logps/rejected": -369.91705322265625, + "loss": 0.6018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4939720034599304, + "rewards/margins": 0.32629066705703735, + "rewards/rejected": -0.8202627301216125, + "step": 620 + }, + { + "epoch": 0.15115163147792707, + "grad_norm": 11.83638408275226, + "learning_rate": 4.960324468283248e-07, + "logits/chosen": -1.668398141860962, + "logits/rejected": -1.7101606130599976, + "logps/chosen": -291.8460388183594, + "logps/rejected": -328.39447021484375, + "loss": 0.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4231700301170349, + "rewards/margins": 0.3685685694217682, + "rewards/rejected": -0.7917385697364807, + "step": 630 + }, + { + "epoch": 0.15355086372360843, + "grad_norm": 11.889326252509655, + "learning_rate": 4.956522703928451e-07, + "logits/chosen": -1.6359535455703735, + "logits/rejected": -1.6370385885238647, + "logps/chosen": -305.43170166015625, + "logps/rejected": -346.13885498046875, + "loss": 0.5855, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3926054835319519, + "rewards/margins": 0.3129440248012543, + "rewards/rejected": -0.7055495977401733, + "step": 640 + }, + { + "epoch": 0.15595009596928983, + "grad_norm": 17.134964168150876, + "learning_rate": 4.952548624265606e-07, + "logits/chosen": -1.5452715158462524, + "logits/rejected": -1.5080727338790894, + "logps/chosen": -367.8688049316406, + "logps/rejected": -384.73480224609375, + "loss": 0.6276, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6589107513427734, + "rewards/margins": 0.28292521834373474, + "rewards/rejected": -0.9418359994888306, + "step": 650 + }, + { + "epoch": 0.15834932821497122, + "grad_norm": 9.9373193764272, + "learning_rate": 4.948402508060607e-07, + "logits/chosen": -1.5830329656600952, + "logits/rejected": -1.578583002090454, + "logps/chosen": -314.216064453125, + "logps/rejected": -347.6146240234375, + "loss": 0.6282, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5390394926071167, + "rewards/margins": 0.4961690902709961, + "rewards/rejected": -1.0352084636688232, + "step": 660 + }, + { + "epoch": 0.16074856046065258, + "grad_norm": 12.293455166160937, + "learning_rate": 4.944084646147038e-07, + "logits/chosen": -1.6564319133758545, + "logits/rejected": -1.628400444984436, + "logps/chosen": -405.21832275390625, + "logps/rejected": -395.16168212890625, + "loss": 0.6474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5265312790870667, + "rewards/margins": 0.1809079349040985, + "rewards/rejected": -0.7074393033981323, + "step": 670 + }, + { + "epoch": 0.16314779270633398, + "grad_norm": 9.68433958884952, + "learning_rate": 4.939595341405754e-07, + "logits/chosen": -1.627952218055725, + "logits/rejected": -1.6237417459487915, + "logps/chosen": -331.4278869628906, + "logps/rejected": -332.4441833496094, + "loss": 0.6252, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4106570780277252, + "rewards/margins": 0.2862716615200043, + "rewards/rejected": -0.6969286799430847, + "step": 680 + }, + { + "epoch": 0.16554702495201534, + "grad_norm": 11.047058239262977, + "learning_rate": 4.93493490874365e-07, + "logits/chosen": -1.5566327571868896, + "logits/rejected": -1.5497907400131226, + "logps/chosen": -314.609375, + "logps/rejected": -351.512451171875, + "loss": 0.5778, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4623290002346039, + "rewards/margins": 0.2775939106941223, + "rewards/rejected": -0.7399229407310486, + "step": 690 + }, + { + "epoch": 0.16794625719769674, + "grad_norm": 12.568033206214066, + "learning_rate": 4.93010367507156e-07, + "logits/chosen": -1.516019582748413, + "logits/rejected": -1.517661690711975, + "logps/chosen": -296.07818603515625, + "logps/rejected": -312.55023193359375, + "loss": 0.5829, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46684932708740234, + "rewards/margins": 0.4299789071083069, + "rewards/rejected": -0.8968281745910645, + "step": 700 + }, + { + "epoch": 0.17034548944337813, + "grad_norm": 13.127287004059248, + "learning_rate": 4.925101979281332e-07, + "logits/chosen": -1.5096848011016846, + "logits/rejected": -1.4953901767730713, + "logps/chosen": -378.8152160644531, + "logps/rejected": -356.99029541015625, + "loss": 0.6192, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.40125903487205505, + "rewards/margins": 0.41182875633239746, + "rewards/rejected": -0.8130879402160645, + "step": 710 + }, + { + "epoch": 0.1727447216890595, + "grad_norm": 11.101693810449992, + "learning_rate": 4.919930172222054e-07, + "logits/chosen": -1.6166894435882568, + "logits/rejected": -1.638384222984314, + "logps/chosen": -331.99139404296875, + "logps/rejected": -376.6008605957031, + "loss": 0.5784, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5163809061050415, + "rewards/margins": 0.4893184304237366, + "rewards/rejected": -1.0056993961334229, + "step": 720 + }, + { + "epoch": 0.1751439539347409, + "grad_norm": 13.45918805950352, + "learning_rate": 4.914588616675445e-07, + "logits/chosen": -1.7260338068008423, + "logits/rejected": -1.7507747411727905, + "logps/chosen": -321.32391357421875, + "logps/rejected": -318.60687255859375, + "loss": 0.6348, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.46452635526657104, + "rewards/margins": 0.23227731883525848, + "rewards/rejected": -0.6968036890029907, + "step": 730 + }, + { + "epoch": 0.17754318618042225, + "grad_norm": 12.655943206664292, + "learning_rate": 4.909077687330404e-07, + "logits/chosen": -1.647907018661499, + "logits/rejected": -1.594866394996643, + "logps/chosen": -346.6475524902344, + "logps/rejected": -333.36041259765625, + "loss": 0.5849, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.48434191942214966, + "rewards/margins": 0.15891483426094055, + "rewards/rejected": -0.643256664276123, + "step": 740 + }, + { + "epoch": 0.17994241842610365, + "grad_norm": 11.813532511500838, + "learning_rate": 4.903397770756729e-07, + "logits/chosen": -1.5877163410186768, + "logits/rejected": -1.615849494934082, + "logps/chosen": -335.9526672363281, + "logps/rejected": -378.492919921875, + "loss": 0.5884, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4574219286441803, + "rewards/margins": 0.44713205099105835, + "rewards/rejected": -0.9045540690422058, + "step": 750 + }, + { + "epoch": 0.18234165067178504, + "grad_norm": 10.499908038333208, + "learning_rate": 4.897549265378004e-07, + "logits/chosen": -1.6060655117034912, + "logits/rejected": -1.5771276950836182, + "logps/chosen": -426.05816650390625, + "logps/rejected": -438.35565185546875, + "loss": 0.6012, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6486039161682129, + "rewards/margins": 0.18572869896888733, + "rewards/rejected": -0.8343325853347778, + "step": 760 + }, + { + "epoch": 0.1847408829174664, + "grad_norm": 12.24486140616409, + "learning_rate": 4.891532581443643e-07, + "logits/chosen": -1.554749846458435, + "logits/rejected": -1.5595489740371704, + "logps/chosen": -381.22540283203125, + "logps/rejected": -434.6144104003906, + "loss": 0.5928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41755789518356323, + "rewards/margins": 0.5910860896110535, + "rewards/rejected": -1.0086439847946167, + "step": 770 + }, + { + "epoch": 0.1871401151631478, + "grad_norm": 15.00359467506474, + "learning_rate": 4.885348141000122e-07, + "logits/chosen": -1.5866012573242188, + "logits/rejected": -1.6440818309783936, + "logps/chosen": -323.55377197265625, + "logps/rejected": -370.2029724121094, + "loss": 0.5881, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4863899350166321, + "rewards/margins": 0.4499354362487793, + "rewards/rejected": -0.9363254308700562, + "step": 780 + }, + { + "epoch": 0.18953934740882916, + "grad_norm": 12.680892626173055, + "learning_rate": 4.878996377861367e-07, + "logits/chosen": -1.6492054462432861, + "logits/rejected": -1.6732165813446045, + "logps/chosen": -314.70477294921875, + "logps/rejected": -354.80084228515625, + "loss": 0.5606, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6022621989250183, + "rewards/margins": 0.4218460023403168, + "rewards/rejected": -1.0241084098815918, + "step": 790 + }, + { + "epoch": 0.19193857965451055, + "grad_norm": 13.244012514119087, + "learning_rate": 4.872477737578327e-07, + "logits/chosen": -1.521303415298462, + "logits/rejected": -1.5321872234344482, + "logps/chosen": -378.40533447265625, + "logps/rejected": -446.744873046875, + "loss": 0.5552, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6043397188186646, + "rewards/margins": 0.7816438674926758, + "rewards/rejected": -1.3859835863113403, + "step": 800 + }, + { + "epoch": 0.19433781190019195, + "grad_norm": 21.823585323859376, + "learning_rate": 4.865792677407718e-07, + "logits/chosen": -1.6469818353652954, + "logits/rejected": -1.6233971118927002, + "logps/chosen": -336.48065185546875, + "logps/rejected": -357.3442687988281, + "loss": 0.595, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6294270753860474, + "rewards/margins": 0.40707144141197205, + "rewards/rejected": -1.0364984273910522, + "step": 810 + }, + { + "epoch": 0.1967370441458733, + "grad_norm": 20.67495363829826, + "learning_rate": 4.858941666279955e-07, + "logits/chosen": -1.6830040216445923, + "logits/rejected": -1.650560736656189, + "logps/chosen": -366.665283203125, + "logps/rejected": -366.9881896972656, + "loss": 0.6396, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6258559226989746, + "rewards/margins": 0.16562694311141968, + "rewards/rejected": -0.7914828658103943, + "step": 820 + }, + { + "epoch": 0.1991362763915547, + "grad_norm": 11.049911207022598, + "learning_rate": 4.851925184766247e-07, + "logits/chosen": -1.6162611246109009, + "logits/rejected": -1.5697580575942993, + "logps/chosen": -349.7427673339844, + "logps/rejected": -363.89215087890625, + "loss": 0.5924, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6203683614730835, + "rewards/margins": 0.37688732147216797, + "rewards/rejected": -0.9972556829452515, + "step": 830 + }, + { + "epoch": 0.20153550863723607, + "grad_norm": 13.52059533188069, + "learning_rate": 4.844743725044897e-07, + "logits/chosen": -1.6768954992294312, + "logits/rejected": -1.5711721181869507, + "logps/chosen": -350.21795654296875, + "logps/rejected": -347.0363464355469, + "loss": 0.6024, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6108129620552063, + "rewards/margins": 0.2568574845790863, + "rewards/rejected": -0.8676705360412598, + "step": 840 + }, + { + "epoch": 0.20393474088291746, + "grad_norm": 10.9943035407888, + "learning_rate": 4.837397790866774e-07, + "logits/chosen": -1.521465539932251, + "logits/rejected": -1.5211381912231445, + "logps/chosen": -373.3265686035156, + "logps/rejected": -408.146728515625, + "loss": 0.5893, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.45706066489219666, + "rewards/margins": 0.632278323173523, + "rewards/rejected": -1.089339017868042, + "step": 850 + }, + { + "epoch": 0.20633397312859886, + "grad_norm": 13.156524850125006, + "learning_rate": 4.829887897519974e-07, + "logits/chosen": -1.5032659769058228, + "logits/rejected": -1.5388790369033813, + "logps/chosen": -296.7011413574219, + "logps/rejected": -349.8589172363281, + "loss": 0.6089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3782670795917511, + "rewards/margins": 0.35436898469924927, + "rewards/rejected": -0.732636034488678, + "step": 860 + }, + { + "epoch": 0.20873320537428022, + "grad_norm": 11.12500628175069, + "learning_rate": 4.82221457179368e-07, + "logits/chosen": -1.4876271486282349, + "logits/rejected": -1.5096347332000732, + "logps/chosen": -339.2798767089844, + "logps/rejected": -392.35260009765625, + "loss": 0.5778, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3709019720554352, + "rewards/margins": 0.6803408265113831, + "rewards/rejected": -1.0512428283691406, + "step": 870 + }, + { + "epoch": 0.21113243761996162, + "grad_norm": 10.328517431427507, + "learning_rate": 4.814378351941206e-07, + "logits/chosen": -1.5637714862823486, + "logits/rejected": -1.5632487535476685, + "logps/chosen": -338.38262939453125, + "logps/rejected": -354.84307861328125, + "loss": 0.6047, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.42736753821372986, + "rewards/margins": 0.37091171741485596, + "rewards/rejected": -0.7982791662216187, + "step": 880 + }, + { + "epoch": 0.21353166986564298, + "grad_norm": 11.098993061818858, + "learning_rate": 4.806379787642241e-07, + "logits/chosen": -1.5101463794708252, + "logits/rejected": -1.5444588661193848, + "logps/chosen": -328.9061279296875, + "logps/rejected": -380.6397399902344, + "loss": 0.6089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47589534521102905, + "rewards/margins": 0.5617541074752808, + "rewards/rejected": -1.037649393081665, + "step": 890 + }, + { + "epoch": 0.21593090211132437, + "grad_norm": 12.11003262957795, + "learning_rate": 4.798219439964293e-07, + "logits/chosen": -1.529482126235962, + "logits/rejected": -1.591817855834961, + "logps/chosen": -348.2182312011719, + "logps/rejected": -373.36297607421875, + "loss": 0.5677, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5653911232948303, + "rewards/margins": 0.15761128067970276, + "rewards/rejected": -0.7230023145675659, + "step": 900 + }, + { + "epoch": 0.21833013435700577, + "grad_norm": 10.880088934573154, + "learning_rate": 4.78989788132333e-07, + "logits/chosen": -1.619689702987671, + "logits/rejected": -1.6337318420410156, + "logps/chosen": -290.1446228027344, + "logps/rejected": -341.4112854003906, + "loss": 0.5492, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3265039920806885, + "rewards/margins": 0.5540784597396851, + "rewards/rejected": -0.8805824518203735, + "step": 910 + }, + { + "epoch": 0.22072936660268713, + "grad_norm": 12.09695055606614, + "learning_rate": 4.781415695443631e-07, + "logits/chosen": -1.5185593366622925, + "logits/rejected": -1.4719544649124146, + "logps/chosen": -410.1792907714844, + "logps/rejected": -434.14581298828125, + "loss": 0.5834, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7186017632484436, + "rewards/margins": 0.27431511878967285, + "rewards/rejected": -0.9929169416427612, + "step": 920 + }, + { + "epoch": 0.22312859884836853, + "grad_norm": 10.674219128353606, + "learning_rate": 4.772773477316836e-07, + "logits/chosen": -1.5478754043579102, + "logits/rejected": -1.5427707433700562, + "logps/chosen": -368.37359619140625, + "logps/rejected": -410.17071533203125, + "loss": 0.5672, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6923310160636902, + "rewards/margins": 0.38585516810417175, + "rewards/rejected": -1.078186273574829, + "step": 930 + }, + { + "epoch": 0.2255278310940499, + "grad_norm": 21.542325902976465, + "learning_rate": 4.7639718331602117e-07, + "logits/chosen": -1.48305344581604, + "logits/rejected": -1.489986538887024, + "logps/chosen": -377.23419189453125, + "logps/rejected": -443.7740173339844, + "loss": 0.561, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6387566328048706, + "rewards/margins": 0.8118622899055481, + "rewards/rejected": -1.4506187438964844, + "step": 940 + }, + { + "epoch": 0.22792706333973128, + "grad_norm": 11.804420977675692, + "learning_rate": 4.7550113803741275e-07, + "logits/chosen": -1.446746587753296, + "logits/rejected": -1.420434832572937, + "logps/chosen": -392.1725769042969, + "logps/rejected": -348.671142578125, + "loss": 0.599, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6413872838020325, + "rewards/margins": 0.3373781442642212, + "rewards/rejected": -0.9787653088569641, + "step": 950 + }, + { + "epoch": 0.23032629558541268, + "grad_norm": 14.567890880215932, + "learning_rate": 4.7458927474987454e-07, + "logits/chosen": -1.4230117797851562, + "logits/rejected": -1.3921786546707153, + "logps/chosen": -409.1028747558594, + "logps/rejected": -374.42120361328125, + "loss": 0.5784, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5606304407119751, + "rewards/margins": 0.3094702661037445, + "rewards/rejected": -0.8701007962226868, + "step": 960 + }, + { + "epoch": 0.23272552783109404, + "grad_norm": 12.52816994155686, + "learning_rate": 4.7366165741699347e-07, + "logits/chosen": -1.6109126806259155, + "logits/rejected": -1.6241203546524048, + "logps/chosen": -426.0888671875, + "logps/rejected": -440.35626220703125, + "loss": 0.5702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6009902954101562, + "rewards/margins": 0.5381742119789124, + "rewards/rejected": -1.1391645669937134, + "step": 970 + }, + { + "epoch": 0.23512476007677544, + "grad_norm": 13.967412792433594, + "learning_rate": 4.727183511074401e-07, + "logits/chosen": -1.5793912410736084, + "logits/rejected": -1.5574654340744019, + "logps/chosen": -388.32098388671875, + "logps/rejected": -388.9101257324219, + "loss": 0.5938, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6677371859550476, + "rewards/margins": 0.1685403734445572, + "rewards/rejected": -0.8362776041030884, + "step": 980 + }, + { + "epoch": 0.2375239923224568, + "grad_norm": 12.836867506913693, + "learning_rate": 4.717594219904043e-07, + "logits/chosen": -1.5590293407440186, + "logits/rejected": -1.482609510421753, + "logps/chosen": -364.401611328125, + "logps/rejected": -365.7212829589844, + "loss": 0.5931, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6549380421638489, + "rewards/margins": 0.44590234756469727, + "rewards/rejected": -1.1008403301239014, + "step": 990 + }, + { + "epoch": 0.2399232245681382, + "grad_norm": 13.358991634768257, + "learning_rate": 4.7078493733095393e-07, + "logits/chosen": -1.5059032440185547, + "logits/rejected": -1.526222825050354, + "logps/chosen": -360.82525634765625, + "logps/rejected": -425.3335876464844, + "loss": 0.5393, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7353821992874146, + "rewards/margins": 0.6038982272148132, + "rewards/rejected": -1.339280366897583, + "step": 1000 + }, + { + "epoch": 0.2423224568138196, + "grad_norm": 13.878916500110618, + "learning_rate": 4.6979496548531614e-07, + "logits/chosen": -1.3570879697799683, + "logits/rejected": -1.439981460571289, + "logps/chosen": -375.0430603027344, + "logps/rejected": -477.24981689453125, + "loss": 0.5821, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7085081338882446, + "rewards/margins": 0.5321518182754517, + "rewards/rejected": -1.2406599521636963, + "step": 1010 + }, + { + "epoch": 0.24472168905950095, + "grad_norm": 12.115955071642206, + "learning_rate": 4.6878957589608293e-07, + "logits/chosen": -1.4926813840866089, + "logits/rejected": -1.5305813550949097, + "logps/chosen": -372.43353271484375, + "logps/rejected": -514.0599975585938, + "loss": 0.5564, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7485749125480652, + "rewards/margins": 0.9881008863449097, + "rewards/rejected": -1.7366756200790405, + "step": 1020 + }, + { + "epoch": 0.24712092130518235, + "grad_norm": 11.682900261894703, + "learning_rate": 4.6776883908733956e-07, + "logits/chosen": -1.4231523275375366, + "logits/rejected": -1.356755018234253, + "logps/chosen": -397.92718505859375, + "logps/rejected": -390.314208984375, + "loss": 0.5533, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6578688025474548, + "rewards/margins": 0.6580942273139954, + "rewards/rejected": -1.3159630298614502, + "step": 1030 + }, + { + "epoch": 0.2495201535508637, + "grad_norm": 22.740710848943593, + "learning_rate": 4.667328266597178e-07, + "logits/chosen": -1.4792171716690063, + "logits/rejected": -1.4608910083770752, + "logps/chosen": -362.1708068847656, + "logps/rejected": -393.90234375, + "loss": 0.5386, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7270316481590271, + "rewards/margins": 0.40492597222328186, + "rewards/rejected": -1.1319576501846313, + "step": 1040 + }, + { + "epoch": 0.2519193857965451, + "grad_norm": 11.334535969250396, + "learning_rate": 4.6568161128537354e-07, + "logits/chosen": -1.5162358283996582, + "logits/rejected": -1.3798558712005615, + "logps/chosen": -382.86981201171875, + "logps/rejected": -379.1867980957031, + "loss": 0.5333, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8386465907096863, + "rewards/margins": 0.6335374712944031, + "rewards/rejected": -1.4721840620040894, + "step": 1050 + }, + { + "epoch": 0.2543186180422265, + "grad_norm": 15.483666546539576, + "learning_rate": 4.6461526670288877e-07, + "logits/chosen": -1.4406474828720093, + "logits/rejected": -1.4159528017044067, + "logps/chosen": -377.63421630859375, + "logps/rejected": -390.0697326660156, + "loss": 0.6314, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7359471321105957, + "rewards/margins": 0.44949039816856384, + "rewards/rejected": -1.1854374408721924, + "step": 1060 + }, + { + "epoch": 0.2567178502879079, + "grad_norm": 15.686430191237005, + "learning_rate": 4.635338677120994e-07, + "logits/chosen": -1.3936102390289307, + "logits/rejected": -1.4112733602523804, + "logps/chosen": -355.63232421875, + "logps/rejected": -450.8311462402344, + "loss": 0.5489, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5721979141235352, + "rewards/margins": 0.7742630839347839, + "rewards/rejected": -1.3464610576629639, + "step": 1070 + }, + { + "epoch": 0.2591170825335892, + "grad_norm": 12.004038773352622, + "learning_rate": 4.6243749016884835e-07, + "logits/chosen": -1.3307225704193115, + "logits/rejected": -1.3662413358688354, + "logps/chosen": -375.4669189453125, + "logps/rejected": -517.7915649414062, + "loss": 0.5593, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7291714549064636, + "rewards/margins": 1.0086160898208618, + "rewards/rejected": -1.7377876043319702, + "step": 1080 + }, + { + "epoch": 0.2615163147792706, + "grad_norm": 16.88316778824853, + "learning_rate": 4.613262109796645e-07, + "logits/chosen": -1.4649088382720947, + "logits/rejected": -1.5016465187072754, + "logps/chosen": -358.3075256347656, + "logps/rejected": -470.97393798828125, + "loss": 0.5612, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7826167345046997, + "rewards/margins": 0.7904613018035889, + "rewards/rejected": -1.5730780363082886, + "step": 1090 + }, + { + "epoch": 0.263915547024952, + "grad_norm": 15.603270534105851, + "learning_rate": 4.602001080963678e-07, + "logits/chosen": -1.468883991241455, + "logits/rejected": -1.4197354316711426, + "logps/chosen": -383.10491943359375, + "logps/rejected": -448.2427673339844, + "loss": 0.5741, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.794638991355896, + "rewards/margins": 0.9283525347709656, + "rewards/rejected": -1.722991704940796, + "step": 1100 + }, + { + "epoch": 0.2663147792706334, + "grad_norm": 22.933794465121334, + "learning_rate": 4.590592605106017e-07, + "logits/chosen": -1.6002753973007202, + "logits/rejected": -1.5913156270980835, + "logps/chosen": -391.6163635253906, + "logps/rejected": -417.20928955078125, + "loss": 0.591, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6840261220932007, + "rewards/margins": 0.5603744387626648, + "rewards/rejected": -1.2444005012512207, + "step": 1110 + }, + { + "epoch": 0.2687140115163148, + "grad_norm": 12.563358168881198, + "learning_rate": 4.5790374824829165e-07, + "logits/chosen": -1.3435630798339844, + "logits/rejected": -1.380367398262024, + "logps/chosen": -283.7012939453125, + "logps/rejected": -354.51690673828125, + "loss": 0.5772, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6330208778381348, + "rewards/margins": 0.6005650758743286, + "rewards/rejected": -1.233586072921753, + "step": 1120 + }, + { + "epoch": 0.27111324376199614, + "grad_norm": 12.231162002380573, + "learning_rate": 4.5673365236403216e-07, + "logits/chosen": -1.5042026042938232, + "logits/rejected": -1.5108054876327515, + "logps/chosen": -295.96514892578125, + "logps/rejected": -418.58197021484375, + "loss": 0.5657, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6303598284721375, + "rewards/margins": 0.9518154263496399, + "rewards/rejected": -1.5821754932403564, + "step": 1130 + }, + { + "epoch": 0.27351247600767753, + "grad_norm": 11.50399002885248, + "learning_rate": 4.5554905493540075e-07, + "logits/chosen": -1.4057085514068604, + "logits/rejected": -1.4004013538360596, + "logps/chosen": -321.76025390625, + "logps/rejected": -403.93304443359375, + "loss": 0.5522, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7167822122573853, + "rewards/margins": 0.7921913266181946, + "rewards/rejected": -1.5089735984802246, + "step": 1140 + }, + { + "epoch": 0.2759117082533589, + "grad_norm": 13.047950837607836, + "learning_rate": 4.5435003905720074e-07, + "logits/chosen": -1.4048264026641846, + "logits/rejected": -1.376259446144104, + "logps/chosen": -396.1643981933594, + "logps/rejected": -411.42041015625, + "loss": 0.5653, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8003758192062378, + "rewards/margins": 0.5425890684127808, + "rewards/rejected": -1.3429648876190186, + "step": 1150 + }, + { + "epoch": 0.2783109404990403, + "grad_norm": 19.1972621609012, + "learning_rate": 4.531366888356324e-07, + "logits/chosen": -1.4351606369018555, + "logits/rejected": -1.4640175104141235, + "logps/chosen": -304.96466064453125, + "logps/rejected": -468.926025390625, + "loss": 0.551, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8590118288993835, + "rewards/margins": 1.2381832599639893, + "rewards/rejected": -2.0971951484680176, + "step": 1160 + }, + { + "epoch": 0.2807101727447217, + "grad_norm": 15.284909550684677, + "learning_rate": 4.519090893823931e-07, + "logits/chosen": -1.387416124343872, + "logits/rejected": -1.3746330738067627, + "logps/chosen": -362.2872619628906, + "logps/rejected": -420.45587158203125, + "loss": 0.5541, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8361701965332031, + "rewards/margins": 0.7261063456535339, + "rewards/rejected": -1.5622767210006714, + "step": 1170 + }, + { + "epoch": 0.28310940499040305, + "grad_norm": 10.014173257118145, + "learning_rate": 4.5066732680870734e-07, + "logits/chosen": -1.2802355289459229, + "logits/rejected": -1.2717955112457275, + "logps/chosen": -363.96148681640625, + "logps/rejected": -428.07952880859375, + "loss": 0.5089, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7651786804199219, + "rewards/margins": 1.0245896577835083, + "rewards/rejected": -1.7897684574127197, + "step": 1180 + }, + { + "epoch": 0.28550863723608444, + "grad_norm": 15.835254380159547, + "learning_rate": 4.494114882192862e-07, + "logits/chosen": -1.4262115955352783, + "logits/rejected": -1.4117512702941895, + "logps/chosen": -377.26959228515625, + "logps/rejected": -497.1695861816406, + "loss": 0.5485, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9193848371505737, + "rewards/margins": 1.441838026046753, + "rewards/rejected": -2.3612232208251953, + "step": 1190 + }, + { + "epoch": 0.28790786948176583, + "grad_norm": 21.138236050686288, + "learning_rate": 4.4814166170621735e-07, + "logits/chosen": -1.3948581218719482, + "logits/rejected": -1.4087920188903809, + "logps/chosen": -368.2770080566406, + "logps/rejected": -437.8465270996094, + "loss": 0.5604, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8511501550674438, + "rewards/margins": 0.9589170217514038, + "rewards/rejected": -1.8100671768188477, + "step": 1200 + }, + { + "epoch": 0.2903071017274472, + "grad_norm": 15.593695424526608, + "learning_rate": 4.468579363427858e-07, + "logits/chosen": -1.4883145093917847, + "logits/rejected": -1.4821951389312744, + "logps/chosen": -368.0526428222656, + "logps/rejected": -431.11602783203125, + "loss": 0.5448, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7449411749839783, + "rewards/margins": 0.931291937828064, + "rewards/rejected": -1.6762330532073975, + "step": 1210 + }, + { + "epoch": 0.2927063339731286, + "grad_norm": 15.150159523452388, + "learning_rate": 4.4556040217722555e-07, + "logits/chosen": -1.4894287586212158, + "logits/rejected": -1.5455225706100464, + "logps/chosen": -347.5841979980469, + "logps/rejected": -446.6749572753906, + "loss": 0.5461, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6967313885688782, + "rewards/margins": 0.8318474888801575, + "rewards/rejected": -1.5285788774490356, + "step": 1220 + }, + { + "epoch": 0.29510556621880996, + "grad_norm": 13.99103135806943, + "learning_rate": 4.442491502264033e-07, + "logits/chosen": -1.3990120887756348, + "logits/rejected": -1.4201018810272217, + "logps/chosen": -339.461669921875, + "logps/rejected": -365.3882141113281, + "loss": 0.5496, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7837417721748352, + "rewards/margins": 0.4338766932487488, + "rewards/rejected": -1.217618465423584, + "step": 1230 + }, + { + "epoch": 0.29750479846449135, + "grad_norm": 12.54939871596996, + "learning_rate": 4.429242724694338e-07, + "logits/chosen": -1.416416883468628, + "logits/rejected": -1.460663080215454, + "logps/chosen": -347.73980712890625, + "logps/rejected": -431.86590576171875, + "loss": 0.5656, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6948519945144653, + "rewards/margins": 0.6938368082046509, + "rewards/rejected": -1.3886888027191162, + "step": 1240 + }, + { + "epoch": 0.29990403071017274, + "grad_norm": 11.877302633667899, + "learning_rate": 4.4158586184122817e-07, + "logits/chosen": -1.3603636026382446, + "logits/rejected": -1.3027294874191284, + "logps/chosen": -438.68280029296875, + "logps/rejected": -489.89923095703125, + "loss": 0.556, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1044422388076782, + "rewards/margins": 0.9364460110664368, + "rewards/rejected": -2.0408880710601807, + "step": 1250 + }, + { + "epoch": 0.30230326295585414, + "grad_norm": 14.912670848514633, + "learning_rate": 4.4023401222597443e-07, + "logits/chosen": -1.3858801126480103, + "logits/rejected": -1.3739614486694336, + "logps/chosen": -394.9843444824219, + "logps/rejected": -426.37628173828125, + "loss": 0.5564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9061829447746277, + "rewards/margins": 0.5654848217964172, + "rewards/rejected": -1.471667766571045, + "step": 1260 + }, + { + "epoch": 0.30470249520153553, + "grad_norm": 18.22216737775854, + "learning_rate": 4.3886881845055235e-07, + "logits/chosen": -1.3496434688568115, + "logits/rejected": -1.365230917930603, + "logps/chosen": -357.98931884765625, + "logps/rejected": -517.0890502929688, + "loss": 0.5402, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8049036264419556, + "rewards/margins": 1.6912431716918945, + "rewards/rejected": -2.4961466789245605, + "step": 1270 + }, + { + "epoch": 0.30710172744721687, + "grad_norm": 14.137278738589544, + "learning_rate": 4.374903762778814e-07, + "logits/chosen": -1.377090334892273, + "logits/rejected": -1.3758434057235718, + "logps/chosen": -417.33856201171875, + "logps/rejected": -486.22735595703125, + "loss": 0.5356, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2525304555892944, + "rewards/margins": 1.0212395191192627, + "rewards/rejected": -2.2737700939178467, + "step": 1280 + }, + { + "epoch": 0.30950095969289826, + "grad_norm": 16.93423395812018, + "learning_rate": 4.3609878240020356e-07, + "logits/chosen": -1.4830983877182007, + "logits/rejected": -1.4273139238357544, + "logps/chosen": -433.70556640625, + "logps/rejected": -457.117919921875, + "loss": 0.5527, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0427515506744385, + "rewards/margins": 0.9059782028198242, + "rewards/rejected": -1.9487298727035522, + "step": 1290 + }, + { + "epoch": 0.31190019193857965, + "grad_norm": 18.22238353135756, + "learning_rate": 4.346941344323005e-07, + "logits/chosen": -1.413414716720581, + "logits/rejected": -1.31984281539917, + "logps/chosen": -433.405029296875, + "logps/rejected": -448.2005920410156, + "loss": 0.5854, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.468630075454712, + "rewards/margins": 0.7335187792778015, + "rewards/rejected": -2.202148914337158, + "step": 1300 + }, + { + "epoch": 0.31429942418426104, + "grad_norm": 16.302508181573693, + "learning_rate": 4.332765309046467e-07, + "logits/chosen": -1.1935665607452393, + "logits/rejected": -1.1890474557876587, + "logps/chosen": -397.90740966796875, + "logps/rejected": -473.9424743652344, + "loss": 0.5572, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0285379886627197, + "rewards/margins": 1.1598167419433594, + "rewards/rejected": -2.188354969024658, + "step": 1310 + }, + { + "epoch": 0.31669865642994244, + "grad_norm": 18.87851668691691, + "learning_rate": 4.3184607125649754e-07, + "logits/chosen": -1.395169973373413, + "logits/rejected": -1.4038825035095215, + "logps/chosen": -391.494140625, + "logps/rejected": -527.1102905273438, + "loss": 0.552, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.7730218172073364, + "rewards/margins": 1.2049500942230225, + "rewards/rejected": -1.9779720306396484, + "step": 1320 + }, + { + "epoch": 0.3190978886756238, + "grad_norm": 13.330818134593827, + "learning_rate": 4.304028558289141e-07, + "logits/chosen": -1.394945502281189, + "logits/rejected": -1.4087629318237305, + "logps/chosen": -406.9620056152344, + "logps/rejected": -464.66943359375, + "loss": 0.515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7917548418045044, + "rewards/margins": 0.814030647277832, + "rewards/rejected": -1.6057853698730469, + "step": 1330 + }, + { + "epoch": 0.32149712092130517, + "grad_norm": 13.787070591715754, + "learning_rate": 4.28946985857725e-07, + "logits/chosen": -1.369894027709961, + "logits/rejected": -1.3589582443237305, + "logps/chosen": -439.8268127441406, + "logps/rejected": -550.6514892578125, + "loss": 0.521, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.226813554763794, + "rewards/margins": 1.2747678756713867, + "rewards/rejected": -2.5015816688537598, + "step": 1340 + }, + { + "epoch": 0.32389635316698656, + "grad_norm": 12.959527833175532, + "learning_rate": 4.2747856346642445e-07, + "logits/chosen": -1.4751712083816528, + "logits/rejected": -1.4413162469863892, + "logps/chosen": -333.6115417480469, + "logps/rejected": -390.6005859375, + "loss": 0.5163, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.788160502910614, + "rewards/margins": 0.7655519247055054, + "rewards/rejected": -1.5537126064300537, + "step": 1350 + }, + { + "epoch": 0.32629558541266795, + "grad_norm": 16.408516686123168, + "learning_rate": 4.2599769165900933e-07, + "logits/chosen": -1.3842442035675049, + "logits/rejected": -1.3772562742233276, + "logps/chosen": -423.4775390625, + "logps/rejected": -458.16632080078125, + "loss": 0.5685, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4563066959381104, + "rewards/margins": 0.6131476163864136, + "rewards/rejected": -2.0694539546966553, + "step": 1360 + }, + { + "epoch": 0.32869481765834935, + "grad_norm": 14.256511067024702, + "learning_rate": 4.245044743127535e-07, + "logits/chosen": -1.328401803970337, + "logits/rejected": -1.3963744640350342, + "logps/chosen": -411.93212890625, + "logps/rejected": -488.36492919921875, + "loss": 0.5437, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1901071071624756, + "rewards/margins": 0.8442772626876831, + "rewards/rejected": -2.034384250640869, + "step": 1370 + }, + { + "epoch": 0.3310940499040307, + "grad_norm": 15.32110503914299, + "learning_rate": 4.229990161709214e-07, + "logits/chosen": -1.3082549571990967, + "logits/rejected": -1.3658416271209717, + "logps/chosen": -353.73846435546875, + "logps/rejected": -495.6495056152344, + "loss": 0.5553, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8750725984573364, + "rewards/margins": 1.2727442979812622, + "rewards/rejected": -2.1478168964385986, + "step": 1380 + }, + { + "epoch": 0.3334932821497121, + "grad_norm": 14.471791173710773, + "learning_rate": 4.214814228354204e-07, + "logits/chosen": -1.374188780784607, + "logits/rejected": -1.3083152770996094, + "logps/chosen": -401.9841613769531, + "logps/rejected": -544.5491943359375, + "loss": 0.5332, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.8819853663444519, + "rewards/margins": 1.6415786743164062, + "rewards/rejected": -2.523563861846924, + "step": 1390 + }, + { + "epoch": 0.33589251439539347, + "grad_norm": 13.193627569558467, + "learning_rate": 4.1995180075939375e-07, + "logits/chosen": -1.2617089748382568, + "logits/rejected": -1.2713496685028076, + "logps/chosen": -403.77313232421875, + "logps/rejected": -472.1922302246094, + "loss": 0.5176, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9630982279777527, + "rewards/margins": 0.9366270899772644, + "rewards/rejected": -1.899725317955017, + "step": 1400 + }, + { + "epoch": 0.33829174664107486, + "grad_norm": 13.251871039227273, + "learning_rate": 4.1841025723975297e-07, + "logits/chosen": -1.384151816368103, + "logits/rejected": -1.3977984189987183, + "logps/chosen": -393.83984375, + "logps/rejected": -468.3504333496094, + "loss": 0.5117, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.752223014831543, + "rewards/margins": 1.0345475673675537, + "rewards/rejected": -1.7867704629898071, + "step": 1410 + }, + { + "epoch": 0.34069097888675626, + "grad_norm": 13.335816756093026, + "learning_rate": 4.168569004096516e-07, + "logits/chosen": -1.2721823453903198, + "logits/rejected": -1.3577107191085815, + "logps/chosen": -362.2651062011719, + "logps/rejected": -512.0301513671875, + "loss": 0.5129, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.968246340751648, + "rewards/margins": 1.3080164194107056, + "rewards/rejected": -2.2762627601623535, + "step": 1420 + }, + { + "epoch": 0.3430902111324376, + "grad_norm": 13.504356924537085, + "learning_rate": 4.152918392308997e-07, + "logits/chosen": -1.236984133720398, + "logits/rejected": -1.2587682008743286, + "logps/chosen": -418.0523376464844, + "logps/rejected": -496.4715270996094, + "loss": 0.5118, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3280388116836548, + "rewards/margins": 0.9170624017715454, + "rewards/rejected": -2.245100975036621, + "step": 1430 + }, + { + "epoch": 0.345489443378119, + "grad_norm": 35.157195838808185, + "learning_rate": 4.137151834863213e-07, + "logits/chosen": -1.3294508457183838, + "logits/rejected": -1.4013631343841553, + "logps/chosen": -426.08953857421875, + "logps/rejected": -624.5795288085938, + "loss": 0.5543, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5735900402069092, + "rewards/margins": 1.709838628768921, + "rewards/rejected": -3.283428907394409, + "step": 1440 + }, + { + "epoch": 0.3478886756238004, + "grad_norm": 19.755753209301176, + "learning_rate": 4.121270437720526e-07, + "logits/chosen": -1.4114878177642822, + "logits/rejected": -1.4613209962844849, + "logps/chosen": -365.6165466308594, + "logps/rejected": -445.4151306152344, + "loss": 0.5537, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2835686206817627, + "rewards/margins": 0.4420824944972992, + "rewards/rejected": -1.7256511449813843, + "step": 1450 + }, + { + "epoch": 0.3502879078694818, + "grad_norm": 16.56180946656553, + "learning_rate": 4.105275314897852e-07, + "logits/chosen": -1.2359731197357178, + "logits/rejected": -1.3098690509796143, + "logps/chosen": -398.44317626953125, + "logps/rejected": -597.1376953125, + "loss": 0.5212, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3910930156707764, + "rewards/margins": 1.7130632400512695, + "rewards/rejected": -3.104156017303467, + "step": 1460 + }, + { + "epoch": 0.35268714011516317, + "grad_norm": 14.537832682193645, + "learning_rate": 4.089167588389508e-07, + "logits/chosen": -1.4110276699066162, + "logits/rejected": -1.3408496379852295, + "logps/chosen": -491.8135681152344, + "logps/rejected": -528.15087890625, + "loss": 0.54, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0991634130477905, + "rewards/margins": 0.8685483932495117, + "rewards/rejected": -1.9677116870880127, + "step": 1470 + }, + { + "epoch": 0.3550863723608445, + "grad_norm": 18.28450008374322, + "learning_rate": 4.072948388088515e-07, + "logits/chosen": -1.2855981588363647, + "logits/rejected": -1.248203992843628, + "logps/chosen": -434.02093505859375, + "logps/rejected": -519.0634155273438, + "loss": 0.5606, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.252666711807251, + "rewards/margins": 0.8734865188598633, + "rewards/rejected": -2.1261532306671143, + "step": 1480 + }, + { + "epoch": 0.3574856046065259, + "grad_norm": 17.455233314945, + "learning_rate": 4.056618851707334e-07, + "logits/chosen": -1.2739641666412354, + "logits/rejected": -1.2918890714645386, + "logps/chosen": -421.3681640625, + "logps/rejected": -571.5971069335938, + "loss": 0.5079, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.0211762189865112, + "rewards/margins": 1.5285167694091797, + "rewards/rejected": -2.5496933460235596, + "step": 1490 + }, + { + "epoch": 0.3598848368522073, + "grad_norm": 21.806523126830065, + "learning_rate": 4.0401801246980675e-07, + "logits/chosen": -1.3441916704177856, + "logits/rejected": -1.3473713397979736, + "logps/chosen": -424.82958984375, + "logps/rejected": -508.1918029785156, + "loss": 0.5602, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7232739925384521, + "rewards/margins": 1.1312460899353027, + "rewards/rejected": -2.854520082473755, + "step": 1500 + }, + { + "epoch": 0.3622840690978887, + "grad_norm": 19.612284663970076, + "learning_rate": 4.0236333601721043e-07, + "logits/chosen": -1.2525314092636108, + "logits/rejected": -1.3246116638183594, + "logps/chosen": -444.3692321777344, + "logps/rejected": -531.5400390625, + "loss": 0.5727, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3166108131408691, + "rewards/margins": 0.7165418863296509, + "rewards/rejected": -2.0331528186798096, + "step": 1510 + }, + { + "epoch": 0.3646833013435701, + "grad_norm": 17.847368107609864, + "learning_rate": 4.0069797188192364e-07, + "logits/chosen": -1.2792822122573853, + "logits/rejected": -1.2443560361862183, + "logps/chosen": -468.7420959472656, + "logps/rejected": -575.7607421875, + "loss": 0.5269, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.394544005393982, + "rewards/margins": 1.4964808225631714, + "rewards/rejected": -2.891024589538574, + "step": 1520 + }, + { + "epoch": 0.3670825335892514, + "grad_norm": 17.840550166983498, + "learning_rate": 3.9902203688262417e-07, + "logits/chosen": -1.2861576080322266, + "logits/rejected": -1.2852250337600708, + "logps/chosen": -399.22601318359375, + "logps/rejected": -450.94927978515625, + "loss": 0.5295, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0338513851165771, + "rewards/margins": 0.6771272420883179, + "rewards/rejected": -1.7109787464141846, + "step": 1530 + }, + { + "epoch": 0.3694817658349328, + "grad_norm": 18.31964270052022, + "learning_rate": 3.9733564857949365e-07, + "logits/chosen": -1.2296676635742188, + "logits/rejected": -1.2087959051132202, + "logps/chosen": -457.4476013183594, + "logps/rejected": -486.99127197265625, + "loss": 0.5033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2284425497055054, + "rewards/margins": 0.7259224653244019, + "rewards/rejected": -1.9543651342391968, + "step": 1540 + }, + { + "epoch": 0.3718809980806142, + "grad_norm": 19.617733556006073, + "learning_rate": 3.9563892526597177e-07, + "logits/chosen": -1.2289972305297852, + "logits/rejected": -1.250226378440857, + "logps/chosen": -349.59490966796875, + "logps/rejected": -436.00518798828125, + "loss": 0.5468, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0080279111862183, + "rewards/margins": 0.4189421236515045, + "rewards/rejected": -1.4269700050354004, + "step": 1550 + }, + { + "epoch": 0.3742802303262956, + "grad_norm": 14.218555536178721, + "learning_rate": 3.9393198596045795e-07, + "logits/chosen": -1.2645368576049805, + "logits/rejected": -1.367915391921997, + "logps/chosen": -387.63916015625, + "logps/rejected": -499.51849365234375, + "loss": 0.5396, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1759942770004272, + "rewards/margins": 1.025880217552185, + "rewards/rejected": -2.2018744945526123, + "step": 1560 + }, + { + "epoch": 0.376679462571977, + "grad_norm": 15.248110894694868, + "learning_rate": 3.922149503979628e-07, + "logits/chosen": -1.2674332857131958, + "logits/rejected": -1.3409755229949951, + "logps/chosen": -480.2740783691406, + "logps/rejected": -645.871826171875, + "loss": 0.5327, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6748710870742798, + "rewards/margins": 1.680478811264038, + "rewards/rejected": -3.3553500175476074, + "step": 1570 + }, + { + "epoch": 0.3790786948176583, + "grad_norm": 16.531059307099905, + "learning_rate": 3.904879390217095e-07, + "logits/chosen": -1.443718671798706, + "logits/rejected": -1.4373607635498047, + "logps/chosen": -413.67626953125, + "logps/rejected": -450.90399169921875, + "loss": 0.534, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.243098497390747, + "rewards/margins": 0.6606565713882446, + "rewards/rejected": -1.9037549495697021, + "step": 1580 + }, + { + "epoch": 0.3814779270633397, + "grad_norm": 17.7872167866301, + "learning_rate": 3.8875107297468463e-07, + "logits/chosen": -1.3334364891052246, + "logits/rejected": -1.4180471897125244, + "logps/chosen": -386.6700134277344, + "logps/rejected": -562.6341552734375, + "loss": 0.5296, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.1550489664077759, + "rewards/margins": 1.2504987716674805, + "rewards/rejected": -2.405547618865967, + "step": 1590 + }, + { + "epoch": 0.3838771593090211, + "grad_norm": 17.180508133509086, + "learning_rate": 3.87004474091141e-07, + "logits/chosen": -1.2263559103012085, + "logits/rejected": -1.2715394496917725, + "logps/chosen": -373.009765625, + "logps/rejected": -479.8453674316406, + "loss": 0.5483, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.171359896659851, + "rewards/margins": 0.8991461992263794, + "rewards/rejected": -2.0705060958862305, + "step": 1600 + }, + { + "epoch": 0.3862763915547025, + "grad_norm": 16.037651776374364, + "learning_rate": 3.8524826488805114e-07, + "logits/chosen": -1.2601364850997925, + "logits/rejected": -1.2469688653945923, + "logps/chosen": -494.4532775878906, + "logps/rejected": -547.1580200195312, + "loss": 0.5768, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7097291946411133, + "rewards/margins": 1.066457986831665, + "rewards/rejected": -2.7761871814727783, + "step": 1610 + }, + { + "epoch": 0.3886756238003839, + "grad_norm": 18.382482372312047, + "learning_rate": 3.834825685565133e-07, + "logits/chosen": -1.2483876943588257, + "logits/rejected": -1.222490906715393, + "logps/chosen": -373.1767578125, + "logps/rejected": -396.866455078125, + "loss": 0.525, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0781358480453491, + "rewards/margins": 0.5979953408241272, + "rewards/rejected": -1.676131248474121, + "step": 1620 + }, + { + "epoch": 0.39107485604606523, + "grad_norm": 15.211289842210352, + "learning_rate": 3.8170750895311007e-07, + "logits/chosen": -1.425077199935913, + "logits/rejected": -1.4005056619644165, + "logps/chosen": -427.9413146972656, + "logps/rejected": -507.6681213378906, + "loss": 0.4923, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1004465818405151, + "rewards/margins": 1.0908915996551514, + "rewards/rejected": -2.191338062286377, + "step": 1630 + }, + { + "epoch": 0.3934740882917466, + "grad_norm": 15.86186494666157, + "learning_rate": 3.7992321059122045e-07, + "logits/chosen": -1.2644860744476318, + "logits/rejected": -1.247738003730774, + "logps/chosen": -441.6012268066406, + "logps/rejected": -545.8192138671875, + "loss": 0.5157, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5572662353515625, + "rewards/margins": 1.2428439855575562, + "rewards/rejected": -2.800110340118408, + "step": 1640 + }, + { + "epoch": 0.395873320537428, + "grad_norm": 17.55361161687906, + "learning_rate": 3.7812979863228576e-07, + "logits/chosen": -1.2765741348266602, + "logits/rejected": -1.3307077884674072, + "logps/chosen": -417.1150817871094, + "logps/rejected": -496.439697265625, + "loss": 0.499, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.710998773574829, + "rewards/margins": 0.6753867864608765, + "rewards/rejected": -2.386385440826416, + "step": 1650 + }, + { + "epoch": 0.3982725527831094, + "grad_norm": 22.437672556954325, + "learning_rate": 3.763273988770296e-07, + "logits/chosen": -1.1883518695831299, + "logits/rejected": -1.2056753635406494, + "logps/chosen": -455.9244079589844, + "logps/rejected": -573.9075927734375, + "loss": 0.506, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7948402166366577, + "rewards/margins": 1.2663710117340088, + "rewards/rejected": -3.061211109161377, + "step": 1660 + }, + { + "epoch": 0.4006717850287908, + "grad_norm": 18.746526832997848, + "learning_rate": 3.7451613775663405e-07, + "logits/chosen": -1.3334394693374634, + "logits/rejected": -1.3598014116287231, + "logps/chosen": -431.07183837890625, + "logps/rejected": -600.5681762695312, + "loss": 0.536, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5167427062988281, + "rewards/margins": 1.6998612880706787, + "rewards/rejected": -3.216604232788086, + "step": 1670 + }, + { + "epoch": 0.40307101727447214, + "grad_norm": 19.534673796965414, + "learning_rate": 3.726961423238706e-07, + "logits/chosen": -1.337828516960144, + "logits/rejected": -1.4223301410675049, + "logps/chosen": -405.55279541015625, + "logps/rejected": -555.9108276367188, + "loss": 0.5286, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4641406536102295, + "rewards/margins": 1.3517651557922363, + "rewards/rejected": -2.8159055709838867, + "step": 1680 + }, + { + "epoch": 0.40547024952015354, + "grad_norm": 18.628825827681126, + "learning_rate": 3.708675402441882e-07, + "logits/chosen": -1.380622148513794, + "logits/rejected": -1.3093476295471191, + "logps/chosen": -451.6861877441406, + "logps/rejected": -485.03173828125, + "loss": 0.5429, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3787542581558228, + "rewards/margins": 0.7608411312103271, + "rewards/rejected": -2.1395952701568604, + "step": 1690 + }, + { + "epoch": 0.40786948176583493, + "grad_norm": 20.390708191804787, + "learning_rate": 3.6903045978675775e-07, + "logits/chosen": -1.309200406074524, + "logits/rejected": -1.3193695545196533, + "logps/chosen": -437.64434814453125, + "logps/rejected": -600.7709350585938, + "loss": 0.5118, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6412197351455688, + "rewards/margins": 1.9034411907196045, + "rewards/rejected": -3.544661045074463, + "step": 1700 + }, + { + "epoch": 0.4102687140115163, + "grad_norm": 19.4839567581448, + "learning_rate": 3.6718502981547474e-07, + "logits/chosen": -1.3340691328048706, + "logits/rejected": -1.4177117347717285, + "logps/chosen": -435.882568359375, + "logps/rejected": -559.0894775390625, + "loss": 0.5346, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.532526969909668, + "rewards/margins": 0.8194723129272461, + "rewards/rejected": -2.351999282836914, + "step": 1710 + }, + { + "epoch": 0.4126679462571977, + "grad_norm": 19.09104720437955, + "learning_rate": 3.6533137977991986e-07, + "logits/chosen": -1.461211919784546, + "logits/rejected": -1.4429075717926025, + "logps/chosen": -465.922119140625, + "logps/rejected": -541.3279418945312, + "loss": 0.5674, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4032350778579712, + "rewards/margins": 0.5128957629203796, + "rewards/rejected": -1.9161306619644165, + "step": 1720 + }, + { + "epoch": 0.41506717850287905, + "grad_norm": 16.509359329189778, + "learning_rate": 3.6346963970627865e-07, + "logits/chosen": -1.2633116245269775, + "logits/rejected": -1.3003978729248047, + "logps/chosen": -445.17559814453125, + "logps/rejected": -564.2251586914062, + "loss": 0.5067, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7176477909088135, + "rewards/margins": 1.0899405479431152, + "rewards/rejected": -2.8075881004333496, + "step": 1730 + }, + { + "epoch": 0.41746641074856045, + "grad_norm": 23.579226986168432, + "learning_rate": 3.615999401882207e-07, + "logits/chosen": -1.1746143102645874, + "logits/rejected": -1.2127888202667236, + "logps/chosen": -451.49456787109375, + "logps/rejected": -599.2230224609375, + "loss": 0.518, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8967907428741455, + "rewards/margins": 1.3978122472763062, + "rewards/rejected": -3.294602870941162, + "step": 1740 + }, + { + "epoch": 0.41986564299424184, + "grad_norm": 21.642239516918913, + "learning_rate": 3.597224123777389e-07, + "logits/chosen": -1.318002462387085, + "logits/rejected": -1.3342430591583252, + "logps/chosen": -438.5196228027344, + "logps/rejected": -583.038330078125, + "loss": 0.5196, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.5318729877471924, + "rewards/margins": 1.3291748762130737, + "rewards/rejected": -2.8610479831695557, + "step": 1750 + }, + { + "epoch": 0.42226487523992323, + "grad_norm": 19.977178630652975, + "learning_rate": 3.5783718797595e-07, + "logits/chosen": -1.351927638053894, + "logits/rejected": -1.2586073875427246, + "logps/chosen": -482.8525390625, + "logps/rejected": -535.1614990234375, + "loss": 0.5038, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4627388715744019, + "rewards/margins": 1.0629637241363525, + "rewards/rejected": -2.525702476501465, + "step": 1760 + }, + { + "epoch": 0.4246641074856046, + "grad_norm": 20.042806842677365, + "learning_rate": 3.559443992238558e-07, + "logits/chosen": -1.3026657104492188, + "logits/rejected": -1.3041784763336182, + "logps/chosen": -438.8772888183594, + "logps/rejected": -646.4869995117188, + "loss": 0.5412, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6175000667572021, + "rewards/margins": 1.8692903518676758, + "rewards/rejected": -3.486790418624878, + "step": 1770 + }, + { + "epoch": 0.42706333973128596, + "grad_norm": 17.70811036723579, + "learning_rate": 3.540441788930673e-07, + "logits/chosen": -1.2043460607528687, + "logits/rejected": -1.2459949254989624, + "logps/chosen": -507.71197509765625, + "logps/rejected": -598.0993041992188, + "loss": 0.4945, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.716043472290039, + "rewards/margins": 1.347996473312378, + "rewards/rejected": -3.064039945602417, + "step": 1780 + }, + { + "epoch": 0.42946257197696736, + "grad_norm": 18.602075356483514, + "learning_rate": 3.5213666027649123e-07, + "logits/chosen": -1.2834551334381104, + "logits/rejected": -1.2271353006362915, + "logps/chosen": -526.9220581054688, + "logps/rejected": -561.8515014648438, + "loss": 0.5203, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.126675844192505, + "rewards/margins": 0.8078119158744812, + "rewards/rejected": -2.934487819671631, + "step": 1790 + }, + { + "epoch": 0.43186180422264875, + "grad_norm": 25.358193518418922, + "learning_rate": 3.5022197717898017e-07, + "logits/chosen": -1.3789910078048706, + "logits/rejected": -1.2962301969528198, + "logps/chosen": -447.68328857421875, + "logps/rejected": -578.26171875, + "loss": 0.4826, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9742257595062256, + "rewards/margins": 1.7503993511199951, + "rewards/rejected": -3.7246251106262207, + "step": 1800 + }, + { + "epoch": 0.43426103646833014, + "grad_norm": 18.041080936481364, + "learning_rate": 3.4830026390794633e-07, + "logits/chosen": -1.419255018234253, + "logits/rejected": -1.4106934070587158, + "logps/chosen": -553.8700561523438, + "logps/rejected": -644.7379760742188, + "loss": 0.5233, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2685706615448, + "rewards/margins": 1.4727586507797241, + "rewards/rejected": -3.7413299083709717, + "step": 1810 + }, + { + "epoch": 0.43666026871401153, + "grad_norm": 17.513224897241464, + "learning_rate": 3.4637165526394104e-07, + "logits/chosen": -1.416473627090454, + "logits/rejected": -1.4340416193008423, + "logps/chosen": -444.02001953125, + "logps/rejected": -546.510986328125, + "loss": 0.5211, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8143060207366943, + "rewards/margins": 0.9672557711601257, + "rewards/rejected": -2.781561851501465, + "step": 1820 + }, + { + "epoch": 0.43905950095969287, + "grad_norm": 16.760580488941674, + "learning_rate": 3.4443628653119814e-07, + "logits/chosen": -1.3158726692199707, + "logits/rejected": -1.3169821500778198, + "logps/chosen": -495.0303649902344, + "logps/rejected": -762.5018310546875, + "loss": 0.5476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8864179849624634, + "rewards/margins": 2.289444923400879, + "rewards/rejected": -4.175862789154053, + "step": 1830 + }, + { + "epoch": 0.44145873320537427, + "grad_norm": 20.13318056836045, + "learning_rate": 3.424942934681453e-07, + "logits/chosen": -1.3486835956573486, + "logits/rejected": -1.296865463256836, + "logps/chosen": -416.56048583984375, + "logps/rejected": -543.5489501953125, + "loss": 0.5081, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4626445770263672, + "rewards/margins": 1.3315180540084839, + "rewards/rejected": -2.7941625118255615, + "step": 1840 + }, + { + "epoch": 0.44385796545105566, + "grad_norm": 24.19694764760871, + "learning_rate": 3.405458122978804e-07, + "logits/chosen": -1.311305046081543, + "logits/rejected": -1.344865083694458, + "logps/chosen": -469.0625, + "logps/rejected": -519.7864990234375, + "loss": 0.5213, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6095072031021118, + "rewards/margins": 0.907385528087616, + "rewards/rejected": -2.516892433166504, + "step": 1850 + }, + { + "epoch": 0.44625719769673705, + "grad_norm": 27.962865827768972, + "learning_rate": 3.3859097969861633e-07, + "logits/chosen": -1.3778144121170044, + "logits/rejected": -1.3863356113433838, + "logps/chosen": -516.6919555664062, + "logps/rejected": -578.032958984375, + "loss": 0.5201, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8337581157684326, + "rewards/margins": 1.010434865951538, + "rewards/rejected": -2.8441929817199707, + "step": 1860 + }, + { + "epoch": 0.44865642994241844, + "grad_norm": 16.82356054576497, + "learning_rate": 3.366299327940936e-07, + "logits/chosen": -1.4125055074691772, + "logits/rejected": -1.5038913488388062, + "logps/chosen": -518.5601806640625, + "logps/rejected": -647.3727416992188, + "loss": 0.5009, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.953787088394165, + "rewards/margins": 1.0619028806686401, + "rewards/rejected": -3.0156893730163574, + "step": 1870 + }, + { + "epoch": 0.4510556621880998, + "grad_norm": 19.60058880839149, + "learning_rate": 3.3466280914396117e-07, + "logits/chosen": -1.3978347778320312, + "logits/rejected": -1.4119714498519897, + "logps/chosen": -484.0753479003906, + "logps/rejected": -630.961181640625, + "loss": 0.4954, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1772611141204834, + "rewards/margins": 1.285409688949585, + "rewards/rejected": -3.4626708030700684, + "step": 1880 + }, + { + "epoch": 0.4534548944337812, + "grad_norm": 18.114326621341235, + "learning_rate": 3.326897467341281e-07, + "logits/chosen": -1.438785433769226, + "logits/rejected": -1.424668312072754, + "logps/chosen": -447.465087890625, + "logps/rejected": -570.6567993164062, + "loss": 0.5181, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0972611904144287, + "rewards/margins": 1.1276103258132935, + "rewards/rejected": -3.2248713970184326, + "step": 1890 + }, + { + "epoch": 0.45585412667946257, + "grad_norm": 24.418842954600578, + "learning_rate": 3.3071088396708335e-07, + "logits/chosen": -1.4327937364578247, + "logits/rejected": -1.4392555952072144, + "logps/chosen": -414.9593200683594, + "logps/rejected": -611.3997802734375, + "loss": 0.5306, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.844822883605957, + "rewards/margins": 1.6931211948394775, + "rewards/rejected": -3.5379443168640137, + "step": 1900 + }, + { + "epoch": 0.45825335892514396, + "grad_norm": 18.222911016897314, + "learning_rate": 3.2872635965218824e-07, + "logits/chosen": -1.22509765625, + "logits/rejected": -1.2552540302276611, + "logps/chosen": -504.212890625, + "logps/rejected": -604.2361450195312, + "loss": 0.5372, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0499117374420166, + "rewards/margins": 0.9568948745727539, + "rewards/rejected": -3.0068068504333496, + "step": 1910 + }, + { + "epoch": 0.46065259117082535, + "grad_norm": 25.971539727826382, + "learning_rate": 3.2673631299593905e-07, + "logits/chosen": -1.4114099740982056, + "logits/rejected": -1.3771132230758667, + "logps/chosen": -498.02203369140625, + "logps/rejected": -574.3489379882812, + "loss": 0.5197, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.870059609413147, + "rewards/margins": 0.9292176365852356, + "rewards/rejected": -2.799276828765869, + "step": 1920 + }, + { + "epoch": 0.4630518234165067, + "grad_norm": 23.709749340805438, + "learning_rate": 3.247408835922024e-07, + "logits/chosen": -1.3196581602096558, + "logits/rejected": -1.3696314096450806, + "logps/chosen": -569.2474365234375, + "logps/rejected": -684.6192016601562, + "loss": 0.52, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2145485877990723, + "rewards/margins": 1.1213747262954712, + "rewards/rejected": -3.335923433303833, + "step": 1930 + }, + { + "epoch": 0.4654510556621881, + "grad_norm": 22.127147131838182, + "learning_rate": 3.2274021141242306e-07, + "logits/chosen": -1.27199387550354, + "logits/rejected": -1.2790254354476929, + "logps/chosen": -485.20294189453125, + "logps/rejected": -603.9547119140625, + "loss": 0.5052, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.964191198348999, + "rewards/margins": 1.092337727546692, + "rewards/rejected": -3.0565292835235596, + "step": 1940 + }, + { + "epoch": 0.4678502879078695, + "grad_norm": 31.60721032113732, + "learning_rate": 3.2073443679580613e-07, + "logits/chosen": -1.4323736429214478, + "logits/rejected": -1.4212892055511475, + "logps/chosen": -469.6571350097656, + "logps/rejected": -532.6696166992188, + "loss": 0.525, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7428505420684814, + "rewards/margins": 0.67039954662323, + "rewards/rejected": -2.4132497310638428, + "step": 1950 + }, + { + "epoch": 0.47024952015355087, + "grad_norm": 20.41880434164777, + "learning_rate": 3.1872370043947194e-07, + "logits/chosen": -1.398626685142517, + "logits/rejected": -1.3881211280822754, + "logps/chosen": -492.39892578125, + "logps/rejected": -643.0196533203125, + "loss": 0.4767, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9299356937408447, + "rewards/margins": 1.617673635482788, + "rewards/rejected": -3.547609806060791, + "step": 1960 + }, + { + "epoch": 0.47264875239923226, + "grad_norm": 24.52462230483932, + "learning_rate": 3.167081433885874e-07, + "logits/chosen": -1.2932389974594116, + "logits/rejected": -1.336774230003357, + "logps/chosen": -570.3352661132812, + "logps/rejected": -696.5318603515625, + "loss": 0.4902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1901650428771973, + "rewards/margins": 0.9020577669143677, + "rewards/rejected": -3.0922226905822754, + "step": 1970 + }, + { + "epoch": 0.4750479846449136, + "grad_norm": 21.483127918632597, + "learning_rate": 3.14687907026472e-07, + "logits/chosen": -1.3658959865570068, + "logits/rejected": -1.354203462600708, + "logps/chosen": -430.875244140625, + "logps/rejected": -566.8677978515625, + "loss": 0.4874, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.818810224533081, + "rewards/margins": 1.2019588947296143, + "rewards/rejected": -3.0207691192626953, + "step": 1980 + }, + { + "epoch": 0.477447216890595, + "grad_norm": 17.998772051669235, + "learning_rate": 3.126631330646801e-07, + "logits/chosen": -1.4112380743026733, + "logits/rejected": -1.4499212503433228, + "logps/chosen": -548.3865356445312, + "logps/rejected": -625.7604370117188, + "loss": 0.5424, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.11478853225708, + "rewards/margins": 0.8033573031425476, + "rewards/rejected": -2.9181458950042725, + "step": 1990 + }, + { + "epoch": 0.4798464491362764, + "grad_norm": 21.197580700344048, + "learning_rate": 3.1063396353306097e-07, + "logits/chosen": -1.3862745761871338, + "logits/rejected": -1.3461328744888306, + "logps/chosen": -483.9371643066406, + "logps/rejected": -552.8273315429688, + "loss": 0.4998, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8819984197616577, + "rewards/margins": 1.1292860507965088, + "rewards/rejected": -3.011284351348877, + "step": 2000 + }, + { + "epoch": 0.4798464491362764, + "eval_logits/chosen": -1.2474218606948853, + "eval_logits/rejected": -1.2682809829711914, + "eval_logps/chosen": -500.0430908203125, + "eval_logps/rejected": -631.640380859375, + "eval_loss": 0.49905869364738464, + "eval_rewards/accuracies": 0.7910714149475098, + "eval_rewards/chosen": -2.1060636043548584, + "eval_rewards/margins": 1.30270254611969, + "eval_rewards/rejected": -3.408766269683838, + "eval_runtime": 44.8932, + "eval_samples_per_second": 99.369, + "eval_steps_per_second": 1.559, + "step": 2000 + }, + { + "epoch": 0.4822456813819578, + "grad_norm": 28.496015832743325, + "learning_rate": 3.0860054076979535e-07, + "logits/chosen": -1.339032530784607, + "logits/rejected": -1.3408501148223877, + "logps/chosen": -523.7064208984375, + "logps/rejected": -616.0767822265625, + "loss": 0.4846, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1367874145507812, + "rewards/margins": 1.2577636241912842, + "rewards/rejected": -3.3945510387420654, + "step": 2010 + }, + { + "epoch": 0.4846449136276392, + "grad_norm": 30.58407186929948, + "learning_rate": 3.065630074114115e-07, + "logits/chosen": -1.3784754276275635, + "logits/rejected": -1.3436543941497803, + "logps/chosen": -535.7599487304688, + "logps/rejected": -653.6842651367188, + "loss": 0.5285, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.186112642288208, + "rewards/margins": 1.6430168151855469, + "rewards/rejected": -3.829129457473755, + "step": 2020 + }, + { + "epoch": 0.4870441458733205, + "grad_norm": 17.565034281041136, + "learning_rate": 3.0452150638277947e-07, + "logits/chosen": -1.3619015216827393, + "logits/rejected": -1.376144528388977, + "logps/chosen": -474.21807861328125, + "logps/rejected": -562.9456176757812, + "loss": 0.5071, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.250333070755005, + "rewards/margins": 0.7430413961410522, + "rewards/rejected": -2.9933745861053467, + "step": 2030 + }, + { + "epoch": 0.4894433781190019, + "grad_norm": 14.89788424027872, + "learning_rate": 3.024761808870856e-07, + "logits/chosen": -1.2471539974212646, + "logits/rejected": -1.3330951929092407, + "logps/chosen": -475.13409423828125, + "logps/rejected": -689.24658203125, + "loss": 0.4669, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.0299086570739746, + "rewards/margins": 2.1725423336029053, + "rewards/rejected": -4.202450752258301, + "step": 2040 + }, + { + "epoch": 0.4918426103646833, + "grad_norm": 29.603352320287087, + "learning_rate": 3.004271743957875e-07, + "logits/chosen": -1.4348108768463135, + "logits/rejected": -1.488304853439331, + "logps/chosen": -524.4188232421875, + "logps/rejected": -627.2877807617188, + "loss": 0.4988, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3159594535827637, + "rewards/margins": 0.7349545359611511, + "rewards/rejected": -3.0509142875671387, + "step": 2050 + }, + { + "epoch": 0.4942418426103647, + "grad_norm": 21.567447373645205, + "learning_rate": 2.983746306385499e-07, + "logits/chosen": -1.321178913116455, + "logits/rejected": -1.3384603261947632, + "logps/chosen": -514.88623046875, + "logps/rejected": -694.174560546875, + "loss": 0.5084, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.3894028663635254, + "rewards/margins": 1.695604681968689, + "rewards/rejected": -4.085007190704346, + "step": 2060 + }, + { + "epoch": 0.4966410748560461, + "grad_norm": 38.86124535507397, + "learning_rate": 2.963186935931628e-07, + "logits/chosen": -1.406170129776001, + "logits/rejected": -1.4607737064361572, + "logps/chosen": -499.363525390625, + "logps/rejected": -617.50146484375, + "loss": 0.4921, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.097320556640625, + "rewards/margins": 1.222407579421997, + "rewards/rejected": -3.319728136062622, + "step": 2070 + }, + { + "epoch": 0.4990403071017274, + "grad_norm": 22.210238133746437, + "learning_rate": 2.9425950747544176e-07, + "logits/chosen": -1.3239015340805054, + "logits/rejected": -1.351828932762146, + "logps/chosen": -563.4287719726562, + "logps/rejected": -716.6170654296875, + "loss": 0.463, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.364441394805908, + "rewards/margins": 1.8510831594467163, + "rewards/rejected": -4.215524673461914, + "step": 2080 + }, + { + "epoch": 0.5014395393474088, + "grad_norm": 39.63884037214758, + "learning_rate": 2.921972167291119e-07, + "logits/chosen": -1.4712202548980713, + "logits/rejected": -1.4773231744766235, + "logps/chosen": -539.4713134765625, + "logps/rejected": -648.0231323242188, + "loss": 0.4855, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2277395725250244, + "rewards/margins": 1.014095425605774, + "rewards/rejected": -3.2418346405029297, + "step": 2090 + }, + { + "epoch": 0.5038387715930902, + "grad_norm": 23.16911181282567, + "learning_rate": 2.9013196601567567e-07, + "logits/chosen": -1.494103193283081, + "logits/rejected": -1.4985407590866089, + "logps/chosen": -482.7745666503906, + "logps/rejected": -591.887939453125, + "loss": 0.5678, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9699678421020508, + "rewards/margins": 0.9908300638198853, + "rewards/rejected": -2.9607980251312256, + "step": 2100 + }, + { + "epoch": 0.5062380038387716, + "grad_norm": 21.16062545979024, + "learning_rate": 2.8806390020426555e-07, + "logits/chosen": -1.478158712387085, + "logits/rejected": -1.519151210784912, + "logps/chosen": -507.43487548828125, + "logps/rejected": -634.43359375, + "loss": 0.487, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9820287227630615, + "rewards/margins": 1.2364323139190674, + "rewards/rejected": -3.21846079826355, + "step": 2110 + }, + { + "epoch": 0.508637236084453, + "grad_norm": 33.9734188119477, + "learning_rate": 2.8599316436148187e-07, + "logits/chosen": -1.2816598415374756, + "logits/rejected": -1.2937710285186768, + "logps/chosen": -478.5577697753906, + "logps/rejected": -558.8751831054688, + "loss": 0.5017, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9709914922714233, + "rewards/margins": 0.8311678171157837, + "rewards/rejected": -2.802159070968628, + "step": 2120 + }, + { + "epoch": 0.5110364683301344, + "grad_norm": 23.09409690656908, + "learning_rate": 2.8391990374121723e-07, + "logits/chosen": -1.3798530101776123, + "logits/rejected": -1.403876543045044, + "logps/chosen": -503.7960510253906, + "logps/rejected": -709.4195556640625, + "loss": 0.4936, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.278823137283325, + "rewards/margins": 1.7539228200912476, + "rewards/rejected": -4.032745361328125, + "step": 2130 + }, + { + "epoch": 0.5134357005758158, + "grad_norm": 15.790089353036674, + "learning_rate": 2.818442637744669e-07, + "logits/chosen": -1.3342841863632202, + "logits/rejected": -1.4296324253082275, + "logps/chosen": -512.0535278320312, + "logps/rejected": -618.1871948242188, + "loss": 0.5038, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.323772430419922, + "rewards/margins": 1.0147329568862915, + "rewards/rejected": -3.338505268096924, + "step": 2140 + }, + { + "epoch": 0.5158349328214972, + "grad_norm": 19.718603748751267, + "learning_rate": 2.797663900591284e-07, + "logits/chosen": -1.4657460451126099, + "logits/rejected": -1.42764413356781, + "logps/chosen": -523.40966796875, + "logps/rejected": -587.4160766601562, + "loss": 0.5002, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2640388011932373, + "rewards/margins": 0.9329341053962708, + "rewards/rejected": -3.1969728469848633, + "step": 2150 + }, + { + "epoch": 0.5182341650671785, + "grad_norm": 22.217489569895896, + "learning_rate": 2.776864283497874e-07, + "logits/chosen": -1.3839603662490845, + "logits/rejected": -1.3685497045516968, + "logps/chosen": -480.69390869140625, + "logps/rejected": -636.3984985351562, + "loss": 0.4948, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.181882381439209, + "rewards/margins": 1.6789003610610962, + "rewards/rejected": -3.860783338546753, + "step": 2160 + }, + { + "epoch": 0.5206333973128598, + "grad_norm": 24.79384115508211, + "learning_rate": 2.756045245474943e-07, + "logits/chosen": -1.5136412382125854, + "logits/rejected": -1.5565202236175537, + "logps/chosen": -502.5581970214844, + "logps/rejected": -612.2777099609375, + "loss": 0.5011, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0902390480041504, + "rewards/margins": 0.9772912859916687, + "rewards/rejected": -3.067530393600464, + "step": 2170 + }, + { + "epoch": 0.5230326295585412, + "grad_norm": 20.593651930438703, + "learning_rate": 2.7352082468952977e-07, + "logits/chosen": -1.3378188610076904, + "logits/rejected": -1.4043794870376587, + "logps/chosen": -557.2437744140625, + "logps/rejected": -814.534912109375, + "loss": 0.5384, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8106279373168945, + "rewards/margins": 2.4125828742980957, + "rewards/rejected": -5.223210334777832, + "step": 2180 + }, + { + "epoch": 0.5254318618042226, + "grad_norm": 32.109088399513396, + "learning_rate": 2.7143547493916e-07, + "logits/chosen": -1.3619160652160645, + "logits/rejected": -1.4087274074554443, + "logps/chosen": -518.8702392578125, + "logps/rejected": -732.2639770507812, + "loss": 0.5077, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2629292011260986, + "rewards/margins": 2.1367053985595703, + "rewards/rejected": -4.39963436126709, + "step": 2190 + }, + { + "epoch": 0.527831094049904, + "grad_norm": 19.538012177296135, + "learning_rate": 2.693486215753853e-07, + "logits/chosen": -1.3854752779006958, + "logits/rejected": -1.417136549949646, + "logps/chosen": -542.5548706054688, + "logps/rejected": -683.5465698242188, + "loss": 0.495, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5847525596618652, + "rewards/margins": 1.603996992111206, + "rewards/rejected": -4.188749313354492, + "step": 2200 + }, + { + "epoch": 0.5302303262955854, + "grad_norm": 20.462323208965326, + "learning_rate": 2.6726041098267805e-07, + "logits/chosen": -1.4916956424713135, + "logits/rejected": -1.4895212650299072, + "logps/chosen": -565.2971801757812, + "logps/rejected": -600.5526123046875, + "loss": 0.5437, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4255809783935547, + "rewards/margins": 0.7856418490409851, + "rewards/rejected": -3.2112224102020264, + "step": 2210 + }, + { + "epoch": 0.5326295585412668, + "grad_norm": 52.5384797240852, + "learning_rate": 2.6517098964071507e-07, + "logits/chosen": -1.2951313257217407, + "logits/rejected": -1.3209047317504883, + "logps/chosen": -498.617431640625, + "logps/rejected": -545.1934814453125, + "loss": 0.5789, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1173582077026367, + "rewards/margins": 0.31116223335266113, + "rewards/rejected": -2.4285202026367188, + "step": 2220 + }, + { + "epoch": 0.5350287907869482, + "grad_norm": 27.216085783703008, + "learning_rate": 2.630805041141023e-07, + "logits/chosen": -1.2584030628204346, + "logits/rejected": -1.2679502964019775, + "logps/chosen": -497.410400390625, + "logps/rejected": -728.3447875976562, + "loss": 0.4799, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3796539306640625, + "rewards/margins": 2.1445538997650146, + "rewards/rejected": -4.524207592010498, + "step": 2230 + }, + { + "epoch": 0.5374280230326296, + "grad_norm": 23.756284108770863, + "learning_rate": 2.609891010420941e-07, + "logits/chosen": -1.3816049098968506, + "logits/rejected": -1.3730615377426147, + "logps/chosen": -573.5911865234375, + "logps/rejected": -712.1728515625, + "loss": 0.4651, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.6385135650634766, + "rewards/margins": 1.3951257467269897, + "rewards/rejected": -4.033638954162598, + "step": 2240 + }, + { + "epoch": 0.539827255278311, + "grad_norm": 27.555669029609934, + "learning_rate": 2.5889692712830674e-07, + "logits/chosen": -1.501206398010254, + "logits/rejected": -1.4833787679672241, + "logps/chosen": -502.5823669433594, + "logps/rejected": -608.8131103515625, + "loss": 0.4699, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4075236320495605, + "rewards/margins": 1.1593475341796875, + "rewards/rejected": -3.566871166229248, + "step": 2250 + }, + { + "epoch": 0.5422264875239923, + "grad_norm": 24.440656357116737, + "learning_rate": 2.5680412913042843e-07, + "logits/chosen": -1.264974594116211, + "logits/rejected": -1.2851756811141968, + "logps/chosen": -579.0538330078125, + "logps/rejected": -750.8895263671875, + "loss": 0.5023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.963130474090576, + "rewards/margins": 1.7768911123275757, + "rewards/rejected": -4.740021705627441, + "step": 2260 + }, + { + "epoch": 0.5446257197696737, + "grad_norm": 28.10821282759682, + "learning_rate": 2.5471085384992404e-07, + "logits/chosen": -1.3328526020050049, + "logits/rejected": -1.4160661697387695, + "logps/chosen": -585.4767456054688, + "logps/rejected": -807.6009521484375, + "loss": 0.5074, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.05895733833313, + "rewards/margins": 2.211324691772461, + "rewards/rejected": -5.270281791687012, + "step": 2270 + }, + { + "epoch": 0.5470249520153551, + "grad_norm": 19.32312755127493, + "learning_rate": 2.526172481217381e-07, + "logits/chosen": -1.2847373485565186, + "logits/rejected": -1.3442981243133545, + "logps/chosen": -521.7784423828125, + "logps/rejected": -687.6416625976562, + "loss": 0.5011, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9379782676696777, + "rewards/margins": 1.4609719514846802, + "rewards/rejected": -4.39894962310791, + "step": 2280 + }, + { + "epoch": 0.5494241842610365, + "grad_norm": 24.275573203714735, + "learning_rate": 2.5052345880399456e-07, + "logits/chosen": -1.3244855403900146, + "logits/rejected": -1.3225276470184326, + "logps/chosen": -491.2483825683594, + "logps/rejected": -609.5865478515625, + "loss": 0.4687, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.298245429992676, + "rewards/margins": 1.192508339881897, + "rewards/rejected": -3.4907538890838623, + "step": 2290 + }, + { + "epoch": 0.5518234165067178, + "grad_norm": 25.143500975202826, + "learning_rate": 2.4842963276769555e-07, + "logits/chosen": -1.2658333778381348, + "logits/rejected": -1.3050744533538818, + "logps/chosen": -505.6437072753906, + "logps/rejected": -682.5819091796875, + "loss": 0.5167, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.630180597305298, + "rewards/margins": 1.306947946548462, + "rewards/rejected": -3.9371285438537598, + "step": 2300 + }, + { + "epoch": 0.5542226487523992, + "grad_norm": 16.944553092121332, + "learning_rate": 2.463359168864189e-07, + "logits/chosen": -1.4239953756332397, + "logits/rejected": -1.4199182987213135, + "logps/chosen": -552.4425048828125, + "logps/rejected": -612.037353515625, + "loss": 0.5184, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.300503730773926, + "rewards/margins": 0.9222415685653687, + "rewards/rejected": -3.222745418548584, + "step": 2310 + }, + { + "epoch": 0.5566218809980806, + "grad_norm": 56.62050293339828, + "learning_rate": 2.4424245802601555e-07, + "logits/chosen": -1.4122536182403564, + "logits/rejected": -1.47800874710083, + "logps/chosen": -477.4188537597656, + "logps/rejected": -607.6446533203125, + "loss": 0.5125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.330216407775879, + "rewards/margins": 0.7497684359550476, + "rewards/rejected": -3.079984664916992, + "step": 2320 + }, + { + "epoch": 0.559021113243762, + "grad_norm": 22.422195639060202, + "learning_rate": 2.421494030343072e-07, + "logits/chosen": -1.3285491466522217, + "logits/rejected": -1.2832763195037842, + "logps/chosen": -510.07733154296875, + "logps/rejected": -534.5994262695312, + "loss": 0.5572, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.1058852672576904, + "rewards/margins": 0.7594197392463684, + "rewards/rejected": -2.865304946899414, + "step": 2330 + }, + { + "epoch": 0.5614203454894434, + "grad_norm": 25.303517266929582, + "learning_rate": 2.400568987307861e-07, + "logits/chosen": -1.3156358003616333, + "logits/rejected": -1.2751775979995728, + "logps/chosen": -461.8431091308594, + "logps/rejected": -486.8683166503906, + "loss": 0.4738, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9566274881362915, + "rewards/margins": 0.45843505859375, + "rewards/rejected": -2.415062427520752, + "step": 2340 + }, + { + "epoch": 0.5638195777351248, + "grad_norm": 28.70975327071874, + "learning_rate": 2.379650918963156e-07, + "logits/chosen": -1.4588607549667358, + "logits/rejected": -1.4819213151931763, + "logps/chosen": -468.0003356933594, + "logps/rejected": -612.2194213867188, + "loss": 0.5079, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.45204496383667, + "rewards/margins": 1.3688656091690063, + "rewards/rejected": -3.820910692214966, + "step": 2350 + }, + { + "epoch": 0.5662188099808061, + "grad_norm": 23.693010150257162, + "learning_rate": 2.3587412926283438e-07, + "logits/chosen": -1.447228193283081, + "logits/rejected": -1.4866100549697876, + "logps/chosen": -562.5271606445312, + "logps/rejected": -724.3071899414062, + "loss": 0.5266, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1487090587615967, + "rewards/margins": 2.082533597946167, + "rewards/rejected": -4.231243133544922, + "step": 2360 + }, + { + "epoch": 0.5686180422264875, + "grad_norm": 21.713640263630566, + "learning_rate": 2.337841575030642e-07, + "logits/chosen": -1.4346665143966675, + "logits/rejected": -1.458919644355774, + "logps/chosen": -527.04833984375, + "logps/rejected": -640.2484130859375, + "loss": 0.5221, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1272292137145996, + "rewards/margins": 0.9690892100334167, + "rewards/rejected": -3.0963187217712402, + "step": 2370 + }, + { + "epoch": 0.5710172744721689, + "grad_norm": 20.835416590684012, + "learning_rate": 2.316953232202206e-07, + "logits/chosen": -1.294693112373352, + "logits/rejected": -1.2616034746170044, + "logps/chosen": -473.155029296875, + "logps/rejected": -469.6883850097656, + "loss": 0.4773, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9840710163116455, + "rewards/margins": 0.7281392812728882, + "rewards/rejected": -2.712210178375244, + "step": 2380 + }, + { + "epoch": 0.5734165067178503, + "grad_norm": 19.666367928073647, + "learning_rate": 2.2960777293772958e-07, + "logits/chosen": -1.2634376287460327, + "logits/rejected": -1.2736718654632568, + "logps/chosen": -477.8089904785156, + "logps/rejected": -592.3357543945312, + "loss": 0.4813, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3676700592041016, + "rewards/margins": 1.3964430093765259, + "rewards/rejected": -3.764112949371338, + "step": 2390 + }, + { + "epoch": 0.5758157389635317, + "grad_norm": 18.861132278479225, + "learning_rate": 2.2752165308894974e-07, + "logits/chosen": -1.3739644289016724, + "logits/rejected": -1.4131231307983398, + "logps/chosen": -456.08477783203125, + "logps/rejected": -557.81396484375, + "loss": 0.4668, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.3130087852478027, + "rewards/margins": 1.1197054386138916, + "rewards/rejected": -3.432713747024536, + "step": 2400 + }, + { + "epoch": 0.5782149712092131, + "grad_norm": 21.917850383175576, + "learning_rate": 2.254371100069005e-07, + "logits/chosen": -1.339754581451416, + "logits/rejected": -1.4103128910064697, + "logps/chosen": -459.272216796875, + "logps/rejected": -565.7703247070312, + "loss": 0.4812, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.9371192455291748, + "rewards/margins": 0.8717780113220215, + "rewards/rejected": -2.8088974952697754, + "step": 2410 + }, + { + "epoch": 0.5806142034548945, + "grad_norm": 30.184228172853466, + "learning_rate": 2.2335428991399725e-07, + "logits/chosen": -1.313047170639038, + "logits/rejected": -1.3599885702133179, + "logps/chosen": -541.0878295898438, + "logps/rejected": -837.1044921875, + "loss": 0.4859, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.005167007446289, + "rewards/margins": 2.97575044631958, + "rewards/rejected": -5.980917930603027, + "step": 2420 + }, + { + "epoch": 0.5830134357005758, + "grad_norm": 25.655852488610243, + "learning_rate": 2.2127333891179458e-07, + "logits/chosen": -1.3415940999984741, + "logits/rejected": -1.3854671716690063, + "logps/chosen": -486.8603515625, + "logps/rejected": -664.0523071289062, + "loss": 0.5168, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4606051445007324, + "rewards/margins": 1.6411139965057373, + "rewards/rejected": -4.101718902587891, + "step": 2430 + }, + { + "epoch": 0.5854126679462572, + "grad_norm": 41.50790514482655, + "learning_rate": 2.1919440297073782e-07, + "logits/chosen": -1.386488437652588, + "logits/rejected": -1.4022929668426514, + "logps/chosen": -512.3683471679688, + "logps/rejected": -648.8067016601562, + "loss": 0.5337, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5902962684631348, + "rewards/margins": 1.4189550876617432, + "rewards/rejected": -4.009251594543457, + "step": 2440 + }, + { + "epoch": 0.5878119001919386, + "grad_norm": 21.130485040294285, + "learning_rate": 2.1711762791992368e-07, + "logits/chosen": -1.3641738891601562, + "logits/rejected": -1.3558547496795654, + "logps/chosen": -568.2584228515625, + "logps/rejected": -626.7496337890625, + "loss": 0.5454, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.454071283340454, + "rewards/margins": 0.9526869654655457, + "rewards/rejected": -3.4067580699920654, + "step": 2450 + }, + { + "epoch": 0.5902111324376199, + "grad_norm": 24.01882096545757, + "learning_rate": 2.1504315943687114e-07, + "logits/chosen": -1.509023666381836, + "logits/rejected": -1.5874083042144775, + "logps/chosen": -485.9769592285156, + "logps/rejected": -670.6282958984375, + "loss": 0.5058, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.266322612762451, + "rewards/margins": 1.3989723920822144, + "rewards/rejected": -3.665295362472534, + "step": 2460 + }, + { + "epoch": 0.5926103646833013, + "grad_norm": 27.995370893182674, + "learning_rate": 2.1297114303730248e-07, + "logits/chosen": -1.2944996356964111, + "logits/rejected": -1.3878734111785889, + "logps/chosen": -469.72113037109375, + "logps/rejected": -671.64892578125, + "loss": 0.5471, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2581284046173096, + "rewards/margins": 1.4777872562408447, + "rewards/rejected": -3.735915422439575, + "step": 2470 + }, + { + "epoch": 0.5950095969289827, + "grad_norm": 19.354866892382386, + "learning_rate": 2.1090172406493616e-07, + "logits/chosen": -1.3561007976531982, + "logits/rejected": -1.4289817810058594, + "logps/chosen": -438.2345275878906, + "logps/rejected": -533.8566284179688, + "loss": 0.478, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7584648132324219, + "rewards/margins": 0.9048868417739868, + "rewards/rejected": -2.6633517742156982, + "step": 2480 + }, + { + "epoch": 0.5974088291746641, + "grad_norm": 22.83947705015176, + "learning_rate": 2.0883504768129146e-07, + "logits/chosen": -1.4529814720153809, + "logits/rejected": -1.5016182661056519, + "logps/chosen": -516.8214111328125, + "logps/rejected": -624.1456298828125, + "loss": 0.4952, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.161440134048462, + "rewards/margins": 1.1859114170074463, + "rewards/rejected": -3.347351551055908, + "step": 2490 + }, + { + "epoch": 0.5998080614203455, + "grad_norm": 28.376289946299075, + "learning_rate": 2.0677125885550571e-07, + "logits/chosen": -1.3103406429290771, + "logits/rejected": -1.2844992876052856, + "logps/chosen": -479.07470703125, + "logps/rejected": -542.7996826171875, + "loss": 0.5029, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.149184465408325, + "rewards/margins": 0.9856400489807129, + "rewards/rejected": -3.134824514389038, + "step": 2500 + }, + { + "epoch": 0.6022072936660269, + "grad_norm": 23.5766057765825, + "learning_rate": 2.0471050235416587e-07, + "logits/chosen": -1.5208079814910889, + "logits/rejected": -1.504916787147522, + "logps/chosen": -538.6173095703125, + "logps/rejected": -603.2637939453125, + "loss": 0.4817, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.401134490966797, + "rewards/margins": 1.1455899477005005, + "rewards/rejected": -3.546724319458008, + "step": 2510 + }, + { + "epoch": 0.6046065259117083, + "grad_norm": 30.516196446160524, + "learning_rate": 2.026529227311532e-07, + "logits/chosen": -1.4248836040496826, + "logits/rejected": -1.4135302305221558, + "logps/chosen": -511.12847900390625, + "logps/rejected": -612.2459716796875, + "loss": 0.5103, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6355061531066895, + "rewards/margins": 0.9499729871749878, + "rewards/rejected": -3.585479259490967, + "step": 2520 + }, + { + "epoch": 0.6070057581573897, + "grad_norm": 26.661200847179316, + "learning_rate": 2.005986643175036e-07, + "logits/chosen": -1.3937880992889404, + "logits/rejected": -1.484312653541565, + "logps/chosen": -519.8787231445312, + "logps/rejected": -694.2974853515625, + "loss": 0.4423, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2589077949523926, + "rewards/margins": 1.906867265701294, + "rewards/rejected": -4.165774822235107, + "step": 2530 + }, + { + "epoch": 0.6094049904030711, + "grad_norm": 31.059127635780456, + "learning_rate": 1.9854787121128328e-07, + "logits/chosen": -1.3416228294372559, + "logits/rejected": -1.3227035999298096, + "logps/chosen": -458.44171142578125, + "logps/rejected": -524.4501953125, + "loss": 0.5127, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0606110095977783, + "rewards/margins": 1.1025084257125854, + "rewards/rejected": -3.163119077682495, + "step": 2540 + }, + { + "epoch": 0.6118042226487524, + "grad_norm": 20.526689955276712, + "learning_rate": 1.9650068726748106e-07, + "logits/chosen": -1.3136961460113525, + "logits/rejected": -1.3136708736419678, + "logps/chosen": -543.5650024414062, + "logps/rejected": -695.6158447265625, + "loss": 0.5028, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.336003303527832, + "rewards/margins": 1.6004486083984375, + "rewards/rejected": -3.9364514350891113, + "step": 2550 + }, + { + "epoch": 0.6142034548944337, + "grad_norm": 24.40904534011045, + "learning_rate": 1.9445725608791718e-07, + "logits/chosen": -1.3698501586914062, + "logits/rejected": -1.4069857597351074, + "logps/chosen": -556.4116821289062, + "logps/rejected": -849.5657348632812, + "loss": 0.4844, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.667902708053589, + "rewards/margins": 2.7795913219451904, + "rewards/rejected": -5.447493553161621, + "step": 2560 + }, + { + "epoch": 0.6166026871401151, + "grad_norm": 23.641844354412267, + "learning_rate": 1.924177210111705e-07, + "logits/chosen": -1.431658387184143, + "logits/rejected": -1.4328802824020386, + "logps/chosen": -483.56500244140625, + "logps/rejected": -688.3074951171875, + "loss": 0.5046, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1448919773101807, + "rewards/margins": 1.980931043624878, + "rewards/rejected": -4.125823020935059, + "step": 2570 + }, + { + "epoch": 0.6190019193857965, + "grad_norm": 29.335322659226975, + "learning_rate": 1.9038222510252364e-07, + "logits/chosen": -1.439528465270996, + "logits/rejected": -1.4357424974441528, + "logps/chosen": -494.8080139160156, + "logps/rejected": -594.5472412109375, + "loss": 0.4848, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1832897663116455, + "rewards/margins": 1.1307239532470703, + "rewards/rejected": -3.314013957977295, + "step": 2580 + }, + { + "epoch": 0.6214011516314779, + "grad_norm": 33.55681043289516, + "learning_rate": 1.883509111439277e-07, + "logits/chosen": -1.3298367261886597, + "logits/rejected": -1.413700819015503, + "logps/chosen": -499.87847900390625, + "logps/rejected": -780.3280029296875, + "loss": 0.5169, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4794504642486572, + "rewards/margins": 2.091299057006836, + "rewards/rejected": -4.570749759674072, + "step": 2590 + }, + { + "epoch": 0.6238003838771593, + "grad_norm": 26.058458420522904, + "learning_rate": 1.8632392162398665e-07, + "logits/chosen": -1.5030263662338257, + "logits/rejected": -1.5112556219100952, + "logps/chosen": -551.242919921875, + "logps/rejected": -719.3308715820312, + "loss": 0.4793, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2437145709991455, + "rewards/margins": 1.7527650594711304, + "rewards/rejected": -3.9964795112609863, + "step": 2600 + }, + { + "epoch": 0.6261996161228407, + "grad_norm": 21.39567262878607, + "learning_rate": 1.84301398727962e-07, + "logits/chosen": -1.2877718210220337, + "logits/rejected": -1.3708404302597046, + "logps/chosen": -450.72137451171875, + "logps/rejected": -725.76123046875, + "loss": 0.5007, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.406883716583252, + "rewards/margins": 2.3896679878234863, + "rewards/rejected": -4.796551704406738, + "step": 2610 + }, + { + "epoch": 0.6285988483685221, + "grad_norm": 42.376777430234114, + "learning_rate": 1.8228348432779966e-07, + "logits/chosen": -1.4927796125411987, + "logits/rejected": -1.506494164466858, + "logps/chosen": -500.10894775390625, + "logps/rejected": -638.76708984375, + "loss": 0.5215, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.392141819000244, + "rewards/margins": 1.4150078296661377, + "rewards/rejected": -3.807149887084961, + "step": 2620 + }, + { + "epoch": 0.6309980806142035, + "grad_norm": 20.967801071152763, + "learning_rate": 1.8027031997217773e-07, + "logits/chosen": -1.3696873188018799, + "logits/rejected": -1.432411551475525, + "logps/chosen": -527.5897827148438, + "logps/rejected": -717.5432739257812, + "loss": 0.4753, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6743311882019043, + "rewards/margins": 1.8335367441177368, + "rewards/rejected": -4.50786828994751, + "step": 2630 + }, + { + "epoch": 0.6333973128598849, + "grad_norm": 18.11218975665868, + "learning_rate": 1.7826204687657758e-07, + "logits/chosen": -1.4255046844482422, + "logits/rejected": -1.401328444480896, + "logps/chosen": -528.8450317382812, + "logps/rejected": -565.4735107421875, + "loss": 0.4868, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2101588249206543, + "rewards/margins": 0.7658090591430664, + "rewards/rejected": -2.9759678840637207, + "step": 2640 + }, + { + "epoch": 0.6357965451055663, + "grad_norm": 24.39292990785771, + "learning_rate": 1.762588059133781e-07, + "logits/chosen": -1.3965257406234741, + "logits/rejected": -1.3291213512420654, + "logps/chosen": -578.5978393554688, + "logps/rejected": -677.1031494140625, + "loss": 0.4739, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.458543300628662, + "rewards/margins": 1.312782645225525, + "rewards/rejected": -3.7713265419006348, + "step": 2650 + }, + { + "epoch": 0.6381957773512476, + "grad_norm": 23.276242624168656, + "learning_rate": 1.7426073760197406e-07, + "logits/chosen": -1.561706781387329, + "logits/rejected": -1.6332374811172485, + "logps/chosen": -543.4059448242188, + "logps/rejected": -777.9879150390625, + "loss": 0.5186, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.588839054107666, + "rewards/margins": 2.0521843433380127, + "rewards/rejected": -4.641024112701416, + "step": 2660 + }, + { + "epoch": 0.6405950095969289, + "grad_norm": 18.17519447551886, + "learning_rate": 1.7226798209891935e-07, + "logits/chosen": -1.510333776473999, + "logits/rejected": -1.461432695388794, + "logps/chosen": -558.4779052734375, + "logps/rejected": -643.7349853515625, + "loss": 0.4886, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.666550397872925, + "rewards/margins": 1.4416770935058594, + "rewards/rejected": -4.108227729797363, + "step": 2670 + }, + { + "epoch": 0.6429942418426103, + "grad_norm": 28.226456153534905, + "learning_rate": 1.7028067918809535e-07, + "logits/chosen": -1.4245549440383911, + "logits/rejected": -1.4454561471939087, + "logps/chosen": -479.9833068847656, + "logps/rejected": -708.9285278320312, + "loss": 0.4872, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.249075412750244, + "rewards/margins": 1.916194200515747, + "rewards/rejected": -4.1652703285217285, + "step": 2680 + }, + { + "epoch": 0.6453934740882917, + "grad_norm": 26.52570526793631, + "learning_rate": 1.6829896827090584e-07, + "logits/chosen": -1.5254919528961182, + "logits/rejected": -1.5353405475616455, + "logps/chosen": -532.4969482421875, + "logps/rejected": -566.6170654296875, + "loss": 0.5212, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.344062089920044, + "rewards/margins": 0.6551955938339233, + "rewards/rejected": -2.9992575645446777, + "step": 2690 + }, + { + "epoch": 0.6477927063339731, + "grad_norm": 22.151705547840702, + "learning_rate": 1.6632298835649844e-07, + "logits/chosen": -1.4246346950531006, + "logits/rejected": -1.513645052909851, + "logps/chosen": -554.13330078125, + "logps/rejected": -756.4811401367188, + "loss": 0.4413, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.500253677368164, + "rewards/margins": 1.803654670715332, + "rewards/rejected": -4.303908348083496, + "step": 2700 + }, + { + "epoch": 0.6501919385796545, + "grad_norm": 47.00479632538662, + "learning_rate": 1.6435287805201364e-07, + "logits/chosen": -1.3270941972732544, + "logits/rejected": -1.362379550933838, + "logps/chosen": -562.240478515625, + "logps/rejected": -660.1361083984375, + "loss": 0.52, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.685246706008911, + "rewards/margins": 0.9671622514724731, + "rewards/rejected": -3.652409076690674, + "step": 2710 + }, + { + "epoch": 0.6525911708253359, + "grad_norm": 31.77823919940292, + "learning_rate": 1.6238877555286207e-07, + "logits/chosen": -1.4155287742614746, + "logits/rejected": -1.4508187770843506, + "logps/chosen": -569.0968017578125, + "logps/rejected": -711.2254638671875, + "loss": 0.4521, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.6851706504821777, + "rewards/margins": 1.4043285846710205, + "rewards/rejected": -4.089498996734619, + "step": 2720 + }, + { + "epoch": 0.6549904030710173, + "grad_norm": 28.84102262485335, + "learning_rate": 1.60430818633031e-07, + "logits/chosen": -1.586005449295044, + "logits/rejected": -1.5921038389205933, + "logps/chosen": -531.1340942382812, + "logps/rejected": -683.3360595703125, + "loss": 0.4484, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4740867614746094, + "rewards/margins": 1.6356041431427002, + "rewards/rejected": -4.1096906661987305, + "step": 2730 + }, + { + "epoch": 0.6573896353166987, + "grad_norm": 21.395562460450257, + "learning_rate": 1.5847914463541939e-07, + "logits/chosen": -1.3539658784866333, + "logits/rejected": -1.3645527362823486, + "logps/chosen": -487.83447265625, + "logps/rejected": -654.1101684570312, + "loss": 0.4858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6091575622558594, + "rewards/margins": 1.4568383693695068, + "rewards/rejected": -4.065995693206787, + "step": 2740 + }, + { + "epoch": 0.6597888675623801, + "grad_norm": 18.939658545906656, + "learning_rate": 1.5653389046220427e-07, + "logits/chosen": -1.3743458986282349, + "logits/rejected": -1.4485480785369873, + "logps/chosen": -463.6263732910156, + "logps/rejected": -595.83447265625, + "loss": 0.4706, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9906790256500244, + "rewards/margins": 1.1591793298721313, + "rewards/rejected": -3.1498584747314453, + "step": 2750 + }, + { + "epoch": 0.6621880998080614, + "grad_norm": 33.655103429760075, + "learning_rate": 1.545951925652375e-07, + "logits/chosen": -1.390751838684082, + "logits/rejected": -1.3566430807113647, + "logps/chosen": -580.6901245117188, + "logps/rejected": -652.6717529296875, + "loss": 0.482, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.3463029861450195, + "rewards/margins": 1.3685951232910156, + "rewards/rejected": -3.714897871017456, + "step": 2760 + }, + { + "epoch": 0.6645873320537428, + "grad_norm": 21.443571073001518, + "learning_rate": 1.5266318693647423e-07, + "logits/chosen": -1.3822195529937744, + "logits/rejected": -1.3711801767349243, + "logps/chosen": -502.9720764160156, + "logps/rejected": -597.5842895507812, + "loss": 0.4918, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9253711700439453, + "rewards/margins": 1.0280150175094604, + "rewards/rejected": -2.953385829925537, + "step": 2770 + }, + { + "epoch": 0.6669865642994242, + "grad_norm": 39.17548401272452, + "learning_rate": 1.5073800909843353e-07, + "logits/chosen": -1.4811722040176392, + "logits/rejected": -1.4104912281036377, + "logps/chosen": -548.9730224609375, + "logps/rejected": -603.4891357421875, + "loss": 0.4482, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4605257511138916, + "rewards/margins": 1.208837866783142, + "rewards/rejected": -3.6693637371063232, + "step": 2780 + }, + { + "epoch": 0.6693857965451055, + "grad_norm": 35.672962025913655, + "learning_rate": 1.488197940946922e-07, + "logits/chosen": -1.4754455089569092, + "logits/rejected": -1.4402107000350952, + "logps/chosen": -536.0340576171875, + "logps/rejected": -621.2711791992188, + "loss": 0.4691, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2582364082336426, + "rewards/margins": 1.444352149963379, + "rewards/rejected": -3.7025885581970215, + "step": 2790 + }, + { + "epoch": 0.6717850287907869, + "grad_norm": 26.11958716115815, + "learning_rate": 1.4690867648041167e-07, + "logits/chosen": -1.4978218078613281, + "logits/rejected": -1.4710818529129028, + "logps/chosen": -516.9527587890625, + "logps/rejected": -652.12646484375, + "loss": 0.4839, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.296104907989502, + "rewards/margins": 1.6007229089736938, + "rewards/rejected": -3.8968276977539062, + "step": 2800 + }, + { + "epoch": 0.6741842610364683, + "grad_norm": 22.888417679896538, + "learning_rate": 1.4500479031289987e-07, + "logits/chosen": -1.522962212562561, + "logits/rejected": -1.4740493297576904, + "logps/chosen": -518.3988037109375, + "logps/rejected": -626.0806884765625, + "loss": 0.5199, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.204174280166626, + "rewards/margins": 1.1408765316009521, + "rewards/rejected": -3.34505033493042, + "step": 2810 + }, + { + "epoch": 0.6765834932821497, + "grad_norm": 27.3130641249735, + "learning_rate": 1.4310826914220747e-07, + "logits/chosen": -1.5253374576568604, + "logits/rejected": -1.5296834707260132, + "logps/chosen": -559.3461303710938, + "logps/rejected": -661.4520263671875, + "loss": 0.5098, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.340601921081543, + "rewards/margins": 1.1254842281341553, + "rewards/rejected": -3.4660861492156982, + "step": 2820 + }, + { + "epoch": 0.6789827255278311, + "grad_norm": 21.746070145724726, + "learning_rate": 1.412192460017597e-07, + "logits/chosen": -1.4279567003250122, + "logits/rejected": -1.4748324155807495, + "logps/chosen": -525.6238403320312, + "logps/rejected": -661.3458251953125, + "loss": 0.5082, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.574410915374756, + "rewards/margins": 1.3353570699691772, + "rewards/rejected": -3.9097678661346436, + "step": 2830 + }, + { + "epoch": 0.6813819577735125, + "grad_norm": 38.22506872905949, + "learning_rate": 1.3933785339902504e-07, + "logits/chosen": -1.339423418045044, + "logits/rejected": -1.459438681602478, + "logps/chosen": -477.89581298828125, + "logps/rejected": -652.9949340820312, + "loss": 0.5074, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.40177583694458, + "rewards/margins": 1.3252737522125244, + "rewards/rejected": -3.7270495891571045, + "step": 2840 + }, + { + "epoch": 0.6837811900191939, + "grad_norm": 15.95735931016713, + "learning_rate": 1.374642233062197e-07, + "logits/chosen": -1.4318865537643433, + "logits/rejected": -1.4523706436157227, + "logps/chosen": -555.4390258789062, + "logps/rejected": -662.3551025390625, + "loss": 0.5082, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4492201805114746, + "rewards/margins": 1.4612877368927002, + "rewards/rejected": -3.910507917404175, + "step": 2850 + }, + { + "epoch": 0.6861804222648752, + "grad_norm": 22.040967064494847, + "learning_rate": 1.355984871510511e-07, + "logits/chosen": -1.3989685773849487, + "logits/rejected": -1.4607285261154175, + "logps/chosen": -605.5526733398438, + "logps/rejected": -740.408203125, + "loss": 0.4599, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8203961849212646, + "rewards/margins": 1.2836238145828247, + "rewards/rejected": -4.104020118713379, + "step": 2860 + }, + { + "epoch": 0.6885796545105566, + "grad_norm": 26.278035870386386, + "learning_rate": 1.3374077580749783e-07, + "logits/chosen": -1.3998380899429321, + "logits/rejected": -1.4711499214172363, + "logps/chosen": -440.7962341308594, + "logps/rejected": -611.0807495117188, + "loss": 0.4933, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.2624080181121826, + "rewards/margins": 1.5463582277297974, + "rewards/rejected": -3.8087661266326904, + "step": 2870 + }, + { + "epoch": 0.690978886756238, + "grad_norm": 31.573567498608927, + "learning_rate": 1.3189121958663024e-07, + "logits/chosen": -1.4882738590240479, + "logits/rejected": -1.4182837009429932, + "logps/chosen": -576.9618530273438, + "logps/rejected": -610.1661987304688, + "loss": 0.4838, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6651625633239746, + "rewards/margins": 0.6495493650436401, + "rewards/rejected": -3.314711809158325, + "step": 2880 + }, + { + "epoch": 0.6933781190019194, + "grad_norm": 29.70591539427168, + "learning_rate": 1.3004994822746895e-07, + "logits/chosen": -1.6041643619537354, + "logits/rejected": -1.605790376663208, + "logps/chosen": -493.31414794921875, + "logps/rejected": -618.4185791015625, + "loss": 0.5098, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1543803215026855, + "rewards/margins": 1.1474945545196533, + "rewards/rejected": -3.301875352859497, + "step": 2890 + }, + { + "epoch": 0.6957773512476008, + "grad_norm": 21.864381671116583, + "learning_rate": 1.2821709088788434e-07, + "logits/chosen": -1.3646858930587769, + "logits/rejected": -1.425444483757019, + "logps/chosen": -471.4464416503906, + "logps/rejected": -624.8846435546875, + "loss": 0.4984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3850364685058594, + "rewards/margins": 1.5373098850250244, + "rewards/rejected": -3.9223461151123047, + "step": 2900 + }, + { + "epoch": 0.6981765834932822, + "grad_norm": 33.11994692324609, + "learning_rate": 1.2639277613553736e-07, + "logits/chosen": -1.24925696849823, + "logits/rejected": -1.2926084995269775, + "logps/chosen": -455.4593811035156, + "logps/rejected": -570.4303588867188, + "loss": 0.4975, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2530951499938965, + "rewards/margins": 1.1058727502822876, + "rewards/rejected": -3.3589680194854736, + "step": 2910 + }, + { + "epoch": 0.7005758157389635, + "grad_norm": 23.50588406740299, + "learning_rate": 1.2457713193885975e-07, + "logits/chosen": -1.3312451839447021, + "logits/rejected": -1.4355002641677856, + "logps/chosen": -444.69073486328125, + "logps/rejected": -629.2984619140625, + "loss": 0.4902, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.422234296798706, + "rewards/margins": 1.4957836866378784, + "rewards/rejected": -3.918017864227295, + "step": 2920 + }, + { + "epoch": 0.7029750479846449, + "grad_norm": 26.846554855658294, + "learning_rate": 1.2277028565807838e-07, + "logits/chosen": -1.4063462018966675, + "logits/rejected": -1.4050636291503906, + "logps/chosen": -496.2431640625, + "logps/rejected": -601.5455322265625, + "loss": 0.4881, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.098576545715332, + "rewards/margins": 1.163854718208313, + "rewards/rejected": -3.2624313831329346, + "step": 2930 + }, + { + "epoch": 0.7053742802303263, + "grad_norm": 29.465126519101624, + "learning_rate": 1.209723640362815e-07, + "logits/chosen": -1.5117194652557373, + "logits/rejected": -1.5291945934295654, + "logps/chosen": -551.7310180664062, + "logps/rejected": -730.3133544921875, + "loss": 0.5362, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.538487434387207, + "rewards/margins": 1.8881938457489014, + "rewards/rejected": -4.426680564880371, + "step": 2940 + }, + { + "epoch": 0.7077735124760077, + "grad_norm": 17.56122405439231, + "learning_rate": 1.191834931905277e-07, + "logits/chosen": -1.5117073059082031, + "logits/rejected": -1.529789924621582, + "logps/chosen": -586.2432861328125, + "logps/rejected": -713.4133911132812, + "loss": 0.5034, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4982831478118896, + "rewards/margins": 1.343565821647644, + "rewards/rejected": -3.841848850250244, + "step": 2950 + }, + { + "epoch": 0.710172744721689, + "grad_norm": 19.699707945902087, + "learning_rate": 1.1740379860299988e-07, + "logits/chosen": -1.4035453796386719, + "logits/rejected": -1.4652457237243652, + "logps/chosen": -558.1643676757812, + "logps/rejected": -692.0729370117188, + "loss": 0.508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3158202171325684, + "rewards/margins": 1.147966742515564, + "rewards/rejected": -3.463787078857422, + "step": 2960 + }, + { + "epoch": 0.7125719769673704, + "grad_norm": 21.476619403343694, + "learning_rate": 1.1563340511220254e-07, + "logits/chosen": -1.4473599195480347, + "logits/rejected": -1.4237072467803955, + "logps/chosen": -578.5172119140625, + "logps/rejected": -715.98876953125, + "loss": 0.5044, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.603010654449463, + "rewards/margins": 1.5944998264312744, + "rewards/rejected": -4.197510719299316, + "step": 2970 + }, + { + "epoch": 0.7149712092130518, + "grad_norm": 16.814789130311464, + "learning_rate": 1.1387243690420556e-07, + "logits/chosen": -1.4572012424468994, + "logits/rejected": -1.4645602703094482, + "logps/chosen": -581.74169921875, + "logps/rejected": -742.5027465820312, + "loss": 0.4874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1709964275360107, + "rewards/margins": 1.7974427938461304, + "rewards/rejected": -3.9684395790100098, + "step": 2980 + }, + { + "epoch": 0.7173704414587332, + "grad_norm": 26.739362833005647, + "learning_rate": 1.1212101750393235e-07, + "logits/chosen": -1.3943935632705688, + "logits/rejected": -1.3927123546600342, + "logps/chosen": -515.6727294921875, + "logps/rejected": -664.1068115234375, + "loss": 0.4901, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.290250539779663, + "rewards/margins": 1.7147293090820312, + "rewards/rejected": -4.004979610443115, + "step": 2990 + }, + { + "epoch": 0.7197696737044146, + "grad_norm": 18.845316187699304, + "learning_rate": 1.1037926976649562e-07, + "logits/chosen": -1.5029922723770142, + "logits/rejected": -1.5045820474624634, + "logps/chosen": -568.1280517578125, + "logps/rejected": -767.7951049804688, + "loss": 0.5512, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.73738956451416, + "rewards/margins": 1.7677345275878906, + "rewards/rejected": -4.505124092102051, + "step": 3000 + }, + { + "epoch": 0.722168905950096, + "grad_norm": 34.077793236942945, + "learning_rate": 1.0864731586857936e-07, + "logits/chosen": -1.366353154182434, + "logits/rejected": -1.3472143411636353, + "logps/chosen": -548.4249877929688, + "logps/rejected": -680.166748046875, + "loss": 0.4527, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2916715145111084, + "rewards/margins": 1.6611700057983398, + "rewards/rejected": -3.952840805053711, + "step": 3010 + }, + { + "epoch": 0.7245681381957774, + "grad_norm": 28.29747133681879, + "learning_rate": 1.0692527729986839e-07, + "logits/chosen": -1.5336111783981323, + "logits/rejected": -1.550148606300354, + "logps/chosen": -537.4771728515625, + "logps/rejected": -702.8695068359375, + "loss": 0.4394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5682034492492676, + "rewards/margins": 1.8685306310653687, + "rewards/rejected": -4.436734199523926, + "step": 3020 + }, + { + "epoch": 0.7269673704414588, + "grad_norm": 28.504158716401378, + "learning_rate": 1.0521327485452692e-07, + "logits/chosen": -1.3170318603515625, + "logits/rejected": -1.3605035543441772, + "logps/chosen": -528.2706298828125, + "logps/rejected": -689.1517333984375, + "loss": 0.4711, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6031200885772705, + "rewards/margins": 1.7600791454315186, + "rewards/rejected": -4.363199234008789, + "step": 3030 + }, + { + "epoch": 0.7293666026871402, + "grad_norm": 25.228985934770044, + "learning_rate": 1.0351142862272468e-07, + "logits/chosen": -1.4619348049163818, + "logits/rejected": -1.4451758861541748, + "logps/chosen": -504.30413818359375, + "logps/rejected": -706.7340698242188, + "loss": 0.4906, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.602288007736206, + "rewards/margins": 2.2053732872009277, + "rewards/rejected": -4.807661056518555, + "step": 3040 + }, + { + "epoch": 0.7317658349328215, + "grad_norm": 21.957711187050535, + "learning_rate": 1.0181985798221343e-07, + "logits/chosen": -1.3524713516235352, + "logits/rejected": -1.4187906980514526, + "logps/chosen": -546.7843627929688, + "logps/rejected": -698.24365234375, + "loss": 0.5124, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6667990684509277, + "rewards/margins": 1.520263910293579, + "rewards/rejected": -4.187062740325928, + "step": 3050 + }, + { + "epoch": 0.7341650671785028, + "grad_norm": 29.468316114559133, + "learning_rate": 1.0013868158995329e-07, + "logits/chosen": -1.2929412126541138, + "logits/rejected": -1.3352470397949219, + "logps/chosen": -564.3472900390625, + "logps/rejected": -687.8644409179688, + "loss": 0.4995, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6236252784729004, + "rewards/margins": 1.4335825443267822, + "rewards/rejected": -4.057208061218262, + "step": 3060 + }, + { + "epoch": 0.7365642994241842, + "grad_norm": 19.109202971687726, + "learning_rate": 9.84680173737887e-08, + "logits/chosen": -1.4015681743621826, + "logits/rejected": -1.398230791091919, + "logps/chosen": -514.4508056640625, + "logps/rejected": -622.0556640625, + "loss": 0.5074, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.104480028152466, + "rewards/margins": 1.5557291507720947, + "rewards/rejected": -3.6602091789245605, + "step": 3070 + }, + { + "epoch": 0.7389635316698656, + "grad_norm": 24.454658862230826, + "learning_rate": 9.680798252417713e-08, + "logits/chosen": -1.4342126846313477, + "logits/rejected": -1.4470162391662598, + "logps/chosen": -483.39984130859375, + "logps/rejected": -622.90576171875, + "loss": 0.4639, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.4555461406707764, + "rewards/margins": 1.091421127319336, + "rewards/rejected": -3.5469672679901123, + "step": 3080 + }, + { + "epoch": 0.741362763915547, + "grad_norm": 28.789392431667117, + "learning_rate": 9.515869348596808e-08, + "logits/chosen": -1.524357795715332, + "logits/rejected": -1.526430606842041, + "logps/chosen": -556.3933715820312, + "logps/rejected": -653.7907104492188, + "loss": 0.4736, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.331310272216797, + "rewards/margins": 1.1727701425552368, + "rewards/rejected": -3.5040805339813232, + "step": 3090 + }, + { + "epoch": 0.7437619961612284, + "grad_norm": 21.641602516245626, + "learning_rate": 9.352026595023493e-08, + "logits/chosen": -1.5961936712265015, + "logits/rejected": -1.5464565753936768, + "logps/chosen": -540.9544677734375, + "logps/rejected": -592.5125732421875, + "loss": 0.4792, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.257542371749878, + "rewards/margins": 0.8826139569282532, + "rewards/rejected": -3.1401562690734863, + "step": 3100 + }, + { + "epoch": 0.7461612284069098, + "grad_norm": 26.71616870380173, + "learning_rate": 9.189281484616004e-08, + "logits/chosen": -1.4192516803741455, + "logits/rejected": -1.459497332572937, + "logps/chosen": -475.99755859375, + "logps/rejected": -625.4021606445312, + "loss": 0.5153, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.4731884002685547, + "rewards/margins": 1.093919277191162, + "rewards/rejected": -3.567107677459717, + "step": 3110 + }, + { + "epoch": 0.7485604606525912, + "grad_norm": 29.236127546541127, + "learning_rate": 9.027645433297249e-08, + "logits/chosen": -1.5000680685043335, + "logits/rejected": -1.3937382698059082, + "logps/chosen": -634.0570068359375, + "logps/rejected": -715.98681640625, + "loss": 0.5331, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.869029998779297, + "rewards/margins": 1.2422337532043457, + "rewards/rejected": -4.111264228820801, + "step": 3120 + }, + { + "epoch": 0.7509596928982726, + "grad_norm": 33.40105323928887, + "learning_rate": 8.867129779194066e-08, + "logits/chosen": -1.4319212436676025, + "logits/rejected": -1.446755051612854, + "logps/chosen": -468.393310546875, + "logps/rejected": -683.1968994140625, + "loss": 0.5019, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.272082805633545, + "rewards/margins": 2.1436591148376465, + "rewards/rejected": -4.415741443634033, + "step": 3130 + }, + { + "epoch": 0.753358925143954, + "grad_norm": 29.275686453238876, + "learning_rate": 8.707745781841866e-08, + "logits/chosen": -1.476662278175354, + "logits/rejected": -1.4816815853118896, + "logps/chosen": -541.4090576171875, + "logps/rejected": -709.6759033203125, + "loss": 0.503, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7823169231414795, + "rewards/margins": 1.8224369287490845, + "rewards/rejected": -4.6047539710998535, + "step": 3140 + }, + { + "epoch": 0.7557581573896354, + "grad_norm": 16.66120725563624, + "learning_rate": 8.549504621394831e-08, + "logits/chosen": -1.5555782318115234, + "logits/rejected": -1.5570299625396729, + "logps/chosen": -496.71942138671875, + "logps/rejected": -675.0093994140625, + "loss": 0.4092, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3446543216705322, + "rewards/margins": 1.8439651727676392, + "rewards/rejected": -4.188619613647461, + "step": 3150 + }, + { + "epoch": 0.7581573896353166, + "grad_norm": 27.50632002479432, + "learning_rate": 8.392417397841703e-08, + "logits/chosen": -1.4134910106658936, + "logits/rejected": -1.4268213510513306, + "logps/chosen": -522.3306884765625, + "logps/rejected": -660.8889770507812, + "loss": 0.4732, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2523319721221924, + "rewards/margins": 1.2802834510803223, + "rewards/rejected": -3.5326151847839355, + "step": 3160 + }, + { + "epoch": 0.760556621880998, + "grad_norm": 58.22457295866972, + "learning_rate": 8.236495130227083e-08, + "logits/chosen": -1.4272292852401733, + "logits/rejected": -1.3632147312164307, + "logps/chosen": -584.5332641601562, + "logps/rejected": -779.2487182617188, + "loss": 0.5272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5950450897216797, + "rewards/margins": 2.407891035079956, + "rewards/rejected": -5.002936363220215, + "step": 3170 + }, + { + "epoch": 0.7629558541266794, + "grad_norm": 26.44354069179861, + "learning_rate": 8.081748755878612e-08, + "logits/chosen": -1.44991135597229, + "logits/rejected": -1.4308462142944336, + "logps/chosen": -564.185546875, + "logps/rejected": -631.7692260742188, + "loss": 0.4651, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4482004642486572, + "rewards/margins": 1.3537415266036987, + "rewards/rejected": -3.8019416332244873, + "step": 3180 + }, + { + "epoch": 0.7653550863723608, + "grad_norm": 26.202509095710802, + "learning_rate": 7.928189129639632e-08, + "logits/chosen": -1.4107129573822021, + "logits/rejected": -1.450067162513733, + "logps/chosen": -500.0232849121094, + "logps/rejected": -648.1038208007812, + "loss": 0.4817, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.471116542816162, + "rewards/margins": 1.3545042276382446, + "rewards/rejected": -3.825620174407959, + "step": 3190 + }, + { + "epoch": 0.7677543186180422, + "grad_norm": 23.707545921853338, + "learning_rate": 7.775827023107834e-08, + "logits/chosen": -1.463205099105835, + "logits/rejected": -1.4656999111175537, + "logps/chosen": -510.50653076171875, + "logps/rejected": -629.3123779296875, + "loss": 0.5274, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.468822956085205, + "rewards/margins": 1.0044350624084473, + "rewards/rejected": -3.4732582569122314, + "step": 3200 + }, + { + "epoch": 0.7701535508637236, + "grad_norm": 29.496261269101748, + "learning_rate": 7.624673123879682e-08, + "logits/chosen": -1.6210750341415405, + "logits/rejected": -1.5650123357772827, + "logps/chosen": -509.60760498046875, + "logps/rejected": -581.8856201171875, + "loss": 0.5226, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3314592838287354, + "rewards/margins": 0.9805020093917847, + "rewards/rejected": -3.3119614124298096, + "step": 3210 + }, + { + "epoch": 0.772552783109405, + "grad_norm": 22.29329935500571, + "learning_rate": 7.474738034800663e-08, + "logits/chosen": -1.4924123287200928, + "logits/rejected": -1.5502279996871948, + "logps/chosen": -486.6429748535156, + "logps/rejected": -635.8275146484375, + "loss": 0.506, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4057681560516357, + "rewards/margins": 1.7794519662857056, + "rewards/rejected": -4.185219764709473, + "step": 3220 + }, + { + "epoch": 0.7749520153550864, + "grad_norm": 28.79288442348916, + "learning_rate": 7.326032273221606e-08, + "logits/chosen": -1.4728448390960693, + "logits/rejected": -1.4777170419692993, + "logps/chosen": -557.5980224609375, + "logps/rejected": -646.7634887695312, + "loss": 0.487, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.40948224067688, + "rewards/margins": 1.166581630706787, + "rewards/rejected": -3.576063871383667, + "step": 3230 + }, + { + "epoch": 0.7773512476007678, + "grad_norm": 29.631158880286737, + "learning_rate": 7.178566270260872e-08, + "logits/chosen": -1.4080318212509155, + "logits/rejected": -1.4853019714355469, + "logps/chosen": -565.4283447265625, + "logps/rejected": -737.2282104492188, + "loss": 0.5043, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7243075370788574, + "rewards/margins": 1.5720329284667969, + "rewards/rejected": -4.296339988708496, + "step": 3240 + }, + { + "epoch": 0.7797504798464492, + "grad_norm": 24.514571475809937, + "learning_rate": 7.032350370072709e-08, + "logits/chosen": -1.4933242797851562, + "logits/rejected": -1.4735162258148193, + "logps/chosen": -534.240234375, + "logps/rejected": -662.1375122070312, + "loss": 0.457, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.262051820755005, + "rewards/margins": 1.4018381834030151, + "rewards/rejected": -3.6638896465301514, + "step": 3250 + }, + { + "epoch": 0.7821497120921305, + "grad_norm": 18.653631879799228, + "learning_rate": 6.887394829121596e-08, + "logits/chosen": -1.4374290704727173, + "logits/rejected": -1.4903333187103271, + "logps/chosen": -587.66796875, + "logps/rejected": -816.6536254882812, + "loss": 0.4718, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.851017713546753, + "rewards/margins": 2.611243486404419, + "rewards/rejected": -5.462260723114014, + "step": 3260 + }, + { + "epoch": 0.7845489443378119, + "grad_norm": 20.06410371350649, + "learning_rate": 6.743709815462833e-08, + "logits/chosen": -1.5586085319519043, + "logits/rejected": -1.5152196884155273, + "logps/chosen": -574.1953125, + "logps/rejected": -687.0452880859375, + "loss": 0.4837, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.71860671043396, + "rewards/margins": 1.5317455530166626, + "rewards/rejected": -4.250351905822754, + "step": 3270 + }, + { + "epoch": 0.7869481765834933, + "grad_norm": 15.886281295613209, + "learning_rate": 6.601305408029287e-08, + "logits/chosen": -1.3025667667388916, + "logits/rejected": -1.2801477909088135, + "logps/chosen": -542.4097900390625, + "logps/rejected": -687.2131958007812, + "loss": 0.4523, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7486352920532227, + "rewards/margins": 1.49062180519104, + "rewards/rejected": -4.2392578125, + "step": 3280 + }, + { + "epoch": 0.7893474088291746, + "grad_norm": 26.250188214625645, + "learning_rate": 6.460191595924366e-08, + "logits/chosen": -1.4919764995574951, + "logits/rejected": -1.500739336013794, + "logps/chosen": -539.6181640625, + "logps/rejected": -675.89794921875, + "loss": 0.4591, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.638044834136963, + "rewards/margins": 1.3249804973602295, + "rewards/rejected": -3.9630253314971924, + "step": 3290 + }, + { + "epoch": 0.791746641074856, + "grad_norm": 32.1037561950662, + "learning_rate": 6.320378277721342e-08, + "logits/chosen": -1.4014630317687988, + "logits/rejected": -1.4263122081756592, + "logps/chosen": -572.5446166992188, + "logps/rejected": -636.336669921875, + "loss": 0.4805, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.838632345199585, + "rewards/margins": 0.804477334022522, + "rewards/rejected": -3.6431095600128174, + "step": 3300 + }, + { + "epoch": 0.7941458733205374, + "grad_norm": 23.441297652793324, + "learning_rate": 6.181875260769032e-08, + "logits/chosen": -1.4917078018188477, + "logits/rejected": -1.4382745027542114, + "logps/chosen": -563.8847045898438, + "logps/rejected": -622.117919921875, + "loss": 0.4769, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.447789192199707, + "rewards/margins": 1.294082760810852, + "rewards/rejected": -3.7418715953826904, + "step": 3310 + }, + { + "epoch": 0.7965451055662188, + "grad_norm": 24.10934322037837, + "learning_rate": 6.044692260503797e-08, + "logits/chosen": -1.412782907485962, + "logits/rejected": -1.44136643409729, + "logps/chosen": -606.3463134765625, + "logps/rejected": -755.9545288085938, + "loss": 0.4533, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.827353000640869, + "rewards/margins": 1.7829620838165283, + "rewards/rejected": -4.610314846038818, + "step": 3320 + }, + { + "epoch": 0.7989443378119002, + "grad_norm": 22.68256485211036, + "learning_rate": 5.9088388997680984e-08, + "logits/chosen": -1.532494068145752, + "logits/rejected": -1.473752737045288, + "logps/chosen": -612.1240844726562, + "logps/rejected": -702.3494873046875, + "loss": 0.4564, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4835903644561768, + "rewards/margins": 1.7741658687591553, + "rewards/rejected": -4.257756233215332, + "step": 3330 + }, + { + "epoch": 0.8013435700575816, + "grad_norm": 27.26990440572768, + "learning_rate": 5.774324708135439e-08, + "logits/chosen": -1.4643352031707764, + "logits/rejected": -1.4224929809570312, + "logps/chosen": -482.7091369628906, + "logps/rejected": -601.3060913085938, + "loss": 0.4797, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.3892059326171875, + "rewards/margins": 1.4551842212677002, + "rewards/rejected": -3.8443901538848877, + "step": 3340 + }, + { + "epoch": 0.803742802303263, + "grad_norm": 21.455371173697543, + "learning_rate": 5.641159121241953e-08, + "logits/chosen": -1.3970617055892944, + "logits/rejected": -1.4273337125778198, + "logps/chosen": -526.7587890625, + "logps/rejected": -723.48046875, + "loss": 0.463, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.604462146759033, + "rewards/margins": 1.6828781366348267, + "rewards/rejected": -4.28734016418457, + "step": 3350 + }, + { + "epoch": 0.8061420345489443, + "grad_norm": 38.47135827342629, + "learning_rate": 5.5093514801245106e-08, + "logits/chosen": -1.4368665218353271, + "logits/rejected": -1.4622339010238647, + "logps/chosen": -541.8734741210938, + "logps/rejected": -708.7005615234375, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6349403858184814, + "rewards/margins": 1.4889490604400635, + "rewards/rejected": -4.123888969421387, + "step": 3360 + }, + { + "epoch": 0.8085412667946257, + "grad_norm": 16.648065823327027, + "learning_rate": 5.378911030565453e-08, + "logits/chosen": -1.4388457536697388, + "logits/rejected": -1.464852213859558, + "logps/chosen": -587.2557373046875, + "logps/rejected": -758.8238525390625, + "loss": 0.4751, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.706027030944824, + "rewards/margins": 1.462527871131897, + "rewards/rejected": -4.168554782867432, + "step": 3370 + }, + { + "epoch": 0.8109404990403071, + "grad_norm": 21.377493595253252, + "learning_rate": 5.249846922444101e-08, + "logits/chosen": -1.3959380388259888, + "logits/rejected": -1.4403550624847412, + "logps/chosen": -540.965576171875, + "logps/rejected": -731.8458862304688, + "loss": 0.4471, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9369583129882812, + "rewards/margins": 2.1504523754119873, + "rewards/rejected": -5.087410926818848, + "step": 3380 + }, + { + "epoch": 0.8133397312859885, + "grad_norm": 27.858555805079583, + "learning_rate": 5.122168209094865e-08, + "logits/chosen": -1.3741129636764526, + "logits/rejected": -1.4110362529754639, + "logps/chosen": -495.10015869140625, + "logps/rejected": -565.1026000976562, + "loss": 0.495, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5415637493133545, + "rewards/margins": 0.6181079149246216, + "rewards/rejected": -3.159672260284424, + "step": 3390 + }, + { + "epoch": 0.8157389635316699, + "grad_norm": 18.996566767758726, + "learning_rate": 4.995883846672222e-08, + "logits/chosen": -1.6012929677963257, + "logits/rejected": -1.483344316482544, + "logps/chosen": -670.099609375, + "logps/rejected": -694.0413818359375, + "loss": 0.4884, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.594493865966797, + "rewards/margins": 1.0786322355270386, + "rewards/rejected": -3.673126220703125, + "step": 3400 + }, + { + "epoch": 0.8181381957773513, + "grad_norm": 21.288429602250755, + "learning_rate": 4.871002693522486e-08, + "logits/chosen": -1.4490859508514404, + "logits/rejected": -1.4328912496566772, + "logps/chosen": -564.3585205078125, + "logps/rejected": -638.011962890625, + "loss": 0.4825, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.586820125579834, + "rewards/margins": 1.1905717849731445, + "rewards/rejected": -3.7773919105529785, + "step": 3410 + }, + { + "epoch": 0.8205374280230326, + "grad_norm": 23.077767447616406, + "learning_rate": 4.7475335095623956e-08, + "logits/chosen": -1.4239362478256226, + "logits/rejected": -1.4627254009246826, + "logps/chosen": -599.957275390625, + "logps/rejected": -717.8880615234375, + "loss": 0.4935, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0710372924804688, + "rewards/margins": 1.3537687063217163, + "rewards/rejected": -4.424806118011475, + "step": 3420 + }, + { + "epoch": 0.822936660268714, + "grad_norm": 40.82537061925496, + "learning_rate": 4.6254849556646714e-08, + "logits/chosen": -1.4856188297271729, + "logits/rejected": -1.4363281726837158, + "logps/chosen": -623.6442260742188, + "logps/rejected": -771.4346313476562, + "loss": 0.4925, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8247361183166504, + "rewards/margins": 1.9578917026519775, + "rewards/rejected": -4.782628059387207, + "step": 3430 + }, + { + "epoch": 0.8253358925143954, + "grad_norm": 29.057320731166985, + "learning_rate": 4.504865593050483e-08, + "logits/chosen": -1.474157452583313, + "logits/rejected": -1.4843025207519531, + "logps/chosen": -568.09375, + "logps/rejected": -696.2715454101562, + "loss": 0.4873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.646822452545166, + "rewards/margins": 1.2566282749176025, + "rewards/rejected": -3.9034507274627686, + "step": 3440 + }, + { + "epoch": 0.8277351247600768, + "grad_norm": 33.85388890551276, + "learning_rate": 4.385683882688895e-08, + "logits/chosen": -1.5223079919815063, + "logits/rejected": -1.4601997137069702, + "logps/chosen": -558.9805297851562, + "logps/rejected": -572.9657592773438, + "loss": 0.5312, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.541393756866455, + "rewards/margins": 0.5804921388626099, + "rewards/rejected": -3.1218860149383545, + "step": 3450 + }, + { + "epoch": 0.8301343570057581, + "grad_norm": 29.0264832887359, + "learning_rate": 4.2679481847033985e-08, + "logits/chosen": -1.371885061264038, + "logits/rejected": -1.3797327280044556, + "logps/chosen": -572.6829833984375, + "logps/rejected": -727.1825561523438, + "loss": 0.5188, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7417216300964355, + "rewards/margins": 1.6003021001815796, + "rewards/rejected": -4.342023849487305, + "step": 3460 + }, + { + "epoch": 0.8325335892514395, + "grad_norm": 22.186979488050614, + "learning_rate": 4.151666757785435e-08, + "logits/chosen": -1.4208916425704956, + "logits/rejected": -1.4364068508148193, + "logps/chosen": -518.295654296875, + "logps/rejected": -732.78125, + "loss": 0.4691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4468417167663574, + "rewards/margins": 2.2313990592956543, + "rewards/rejected": -4.678240776062012, + "step": 3470 + }, + { + "epoch": 0.8349328214971209, + "grad_norm": 20.06399966633853, + "learning_rate": 4.036847758615136e-08, + "logits/chosen": -1.4167879819869995, + "logits/rejected": -1.4322962760925293, + "logps/chosen": -581.9016723632812, + "logps/rejected": -693.2930908203125, + "loss": 0.5338, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.096466541290283, + "rewards/margins": 1.085829734802246, + "rewards/rejected": -4.182296276092529, + "step": 3480 + }, + { + "epoch": 0.8373320537428023, + "grad_norm": 25.345706635859266, + "learning_rate": 3.923499241289113e-08, + "logits/chosen": -1.5312403440475464, + "logits/rejected": -1.4787768125534058, + "logps/chosen": -653.0443725585938, + "logps/rejected": -696.0265502929688, + "loss": 0.5449, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.1662380695343018, + "rewards/margins": 1.0970689058303833, + "rewards/rejected": -4.263307094573975, + "step": 3490 + }, + { + "epoch": 0.8397312859884837, + "grad_norm": 17.280297621907234, + "learning_rate": 3.811629156755541e-08, + "logits/chosen": -1.482695460319519, + "logits/rejected": -1.5027401447296143, + "logps/chosen": -559.4757080078125, + "logps/rejected": -685.4481811523438, + "loss": 0.4673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.3803610801696777, + "rewards/margins": 1.4251940250396729, + "rewards/rejected": -3.805555820465088, + "step": 3500 + }, + { + "epoch": 0.8421305182341651, + "grad_norm": 17.666382532614964, + "learning_rate": 3.701245352256391e-08, + "logits/chosen": -1.4617435932159424, + "logits/rejected": -1.450541377067566, + "logps/chosen": -542.1734619140625, + "logps/rejected": -605.668701171875, + "loss": 0.4688, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1690547466278076, + "rewards/margins": 0.8691450953483582, + "rewards/rejected": -3.0381999015808105, + "step": 3510 + }, + { + "epoch": 0.8445297504798465, + "grad_norm": 21.888685616511193, + "learning_rate": 3.592355570776984e-08, + "logits/chosen": -1.5304094552993774, + "logits/rejected": -1.554368257522583, + "logps/chosen": -473.56500244140625, + "logps/rejected": -634.5338134765625, + "loss": 0.479, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2745816707611084, + "rewards/margins": 1.5497357845306396, + "rewards/rejected": -3.824317216873169, + "step": 3520 + }, + { + "epoch": 0.8469289827255279, + "grad_norm": 19.57343617094564, + "learning_rate": 3.484967450502904e-08, + "logits/chosen": -1.388850212097168, + "logits/rejected": -1.447142243385315, + "logps/chosen": -477.470458984375, + "logps/rejected": -707.7676391601562, + "loss": 0.4673, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.540025472640991, + "rewards/margins": 1.873694658279419, + "rewards/rejected": -4.41372013092041, + "step": 3530 + }, + { + "epoch": 0.8493282149712092, + "grad_norm": 25.68535137895278, + "learning_rate": 3.3790885242841296e-08, + "logits/chosen": -1.520108938217163, + "logits/rejected": -1.5441076755523682, + "logps/chosen": -573.6110229492188, + "logps/rejected": -749.3212890625, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9368386268615723, + "rewards/margins": 1.8771892786026, + "rewards/rejected": -4.814027309417725, + "step": 3540 + }, + { + "epoch": 0.8517274472168906, + "grad_norm": 24.712123274237268, + "learning_rate": 3.274726219106677e-08, + "logits/chosen": -1.5202531814575195, + "logits/rejected": -1.5321277379989624, + "logps/chosen": -605.6915893554688, + "logps/rejected": -762.2913208007812, + "loss": 0.484, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9121487140655518, + "rewards/margins": 1.6904712915420532, + "rewards/rejected": -4.6026201248168945, + "step": 3550 + }, + { + "epoch": 0.8541266794625719, + "grad_norm": 22.291969059529155, + "learning_rate": 3.171887855571642e-08, + "logits/chosen": -1.5179522037506104, + "logits/rejected": -1.5329720973968506, + "logps/chosen": -489.92095947265625, + "logps/rejected": -574.1165771484375, + "loss": 0.4794, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.365051031112671, + "rewards/margins": 0.9199559092521667, + "rewards/rejected": -3.2850074768066406, + "step": 3560 + }, + { + "epoch": 0.8565259117082533, + "grad_norm": 22.785349932457247, + "learning_rate": 3.070580647381643e-08, + "logits/chosen": -1.461425542831421, + "logits/rejected": -1.4869364500045776, + "logps/chosen": -520.1343383789062, + "logps/rejected": -687.3969116210938, + "loss": 0.4958, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.5598349571228027, + "rewards/margins": 1.7437207698822021, + "rewards/rejected": -4.303555965423584, + "step": 3570 + }, + { + "epoch": 0.8589251439539347, + "grad_norm": 24.253822569693856, + "learning_rate": 2.9708117008348576e-08, + "logits/chosen": -1.391090989112854, + "logits/rejected": -1.3744146823883057, + "logps/chosen": -579.3370361328125, + "logps/rejected": -610.1915893554688, + "loss": 0.478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.303920269012451, + "rewards/margins": 0.9596284627914429, + "rewards/rejected": -3.2635490894317627, + "step": 3580 + }, + { + "epoch": 0.8613243761996161, + "grad_norm": 30.248895668513786, + "learning_rate": 2.8725880143264992e-08, + "logits/chosen": -1.4726682901382446, + "logits/rejected": -1.4755730628967285, + "logps/chosen": -543.0050659179688, + "logps/rejected": -680.9945068359375, + "loss": 0.5117, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7209887504577637, + "rewards/margins": 0.93864506483078, + "rewards/rejected": -3.6596341133117676, + "step": 3590 + }, + { + "epoch": 0.8637236084452975, + "grad_norm": 40.08138047831897, + "learning_rate": 2.775916477857948e-08, + "logits/chosen": -1.4420602321624756, + "logits/rejected": -1.4758942127227783, + "logps/chosen": -486.1978454589844, + "logps/rejected": -574.8326416015625, + "loss": 0.4715, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5521926879882812, + "rewards/margins": 0.9003151655197144, + "rewards/rejected": -3.452507734298706, + "step": 3600 + }, + { + "epoch": 0.8661228406909789, + "grad_norm": 31.531626669597852, + "learning_rate": 2.680803872553408e-08, + "logits/chosen": -1.4435484409332275, + "logits/rejected": -1.4838091135025024, + "logps/chosen": -553.7645263671875, + "logps/rejected": -764.061767578125, + "loss": 0.4846, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6419153213500977, + "rewards/margins": 2.4341719150543213, + "rewards/rejected": -5.07608699798584, + "step": 3610 + }, + { + "epoch": 0.8685220729366603, + "grad_norm": 39.221812949722406, + "learning_rate": 2.5872568701842706e-08, + "logits/chosen": -1.366398572921753, + "logits/rejected": -1.4173638820648193, + "logps/chosen": -496.0492248535156, + "logps/rejected": -628.1813354492188, + "loss": 0.5317, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.546794891357422, + "rewards/margins": 1.1080689430236816, + "rewards/rejected": -3.6548638343811035, + "step": 3620 + }, + { + "epoch": 0.8709213051823417, + "grad_norm": 32.68223784273047, + "learning_rate": 2.495282032701096e-08, + "logits/chosen": -1.5151169300079346, + "logits/rejected": -1.437577486038208, + "logps/chosen": -460.25799560546875, + "logps/rejected": -550.067138671875, + "loss": 0.4728, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3098971843719482, + "rewards/margins": 1.309601068496704, + "rewards/rejected": -3.6194984912872314, + "step": 3630 + }, + { + "epoch": 0.8733205374280231, + "grad_norm": 22.657400839131, + "learning_rate": 2.4048858117733133e-08, + "logits/chosen": -1.478313684463501, + "logits/rejected": -1.4519219398498535, + "logps/chosen": -559.4782104492188, + "logps/rejected": -737.6328735351562, + "loss": 0.4372, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.574556350708008, + "rewards/margins": 2.292501449584961, + "rewards/rejected": -4.867057800292969, + "step": 3640 + }, + { + "epoch": 0.8757197696737045, + "grad_norm": 26.289348894738573, + "learning_rate": 2.3160745483366938e-08, + "logits/chosen": -1.4442027807235718, + "logits/rejected": -1.5169882774353027, + "logps/chosen": -514.4212646484375, + "logps/rejected": -666.7352294921875, + "loss": 0.4591, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.481067180633545, + "rewards/margins": 1.1971678733825684, + "rewards/rejected": -3.678234815597534, + "step": 3650 + }, + { + "epoch": 0.8781190019193857, + "grad_norm": 40.19559237975213, + "learning_rate": 2.2288544721485197e-08, + "logits/chosen": -1.4943950176239014, + "logits/rejected": -1.574711561203003, + "logps/chosen": -460.83758544921875, + "logps/rejected": -649.78857421875, + "loss": 0.4515, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.127152442932129, + "rewards/margins": 1.7189550399780273, + "rewards/rejected": -3.8461079597473145, + "step": 3660 + }, + { + "epoch": 0.8805182341650671, + "grad_norm": 36.371263439755744, + "learning_rate": 2.1432317013506117e-08, + "logits/chosen": -1.5664393901824951, + "logits/rejected": -1.5465755462646484, + "logps/chosen": -532.2247314453125, + "logps/rejected": -576.0304565429688, + "loss": 0.53, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.37400484085083, + "rewards/margins": 0.8804348707199097, + "rewards/rejected": -3.2544398307800293, + "step": 3670 + }, + { + "epoch": 0.8829174664107485, + "grad_norm": 27.493523101953656, + "learning_rate": 2.0592122420401704e-08, + "logits/chosen": -1.4295785427093506, + "logits/rejected": -1.4089164733886719, + "logps/chosen": -526.0188598632812, + "logps/rejected": -609.1351318359375, + "loss": 0.5033, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5828731060028076, + "rewards/margins": 0.8723591566085815, + "rewards/rejected": -3.455233097076416, + "step": 3680 + }, + { + "epoch": 0.8853166986564299, + "grad_norm": 28.344169477106256, + "learning_rate": 1.976801987848459e-08, + "logits/chosen": -1.481249451637268, + "logits/rejected": -1.51192307472229, + "logps/chosen": -563.6798095703125, + "logps/rejected": -745.6798095703125, + "loss": 0.5088, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6833484172821045, + "rewards/margins": 1.6270126104354858, + "rewards/rejected": -4.310361385345459, + "step": 3690 + }, + { + "epoch": 0.8877159309021113, + "grad_norm": 28.869068672440445, + "learning_rate": 1.8960067195273987e-08, + "logits/chosen": -1.4683550596237183, + "logits/rejected": -1.4642561674118042, + "logps/chosen": -463.25360107421875, + "logps/rejected": -622.0158081054688, + "loss": 0.4932, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.211500406265259, + "rewards/margins": 1.5927655696868896, + "rewards/rejected": -3.8042654991149902, + "step": 3700 + }, + { + "epoch": 0.8901151631477927, + "grad_norm": 23.011707613336537, + "learning_rate": 1.816832104544072e-08, + "logits/chosen": -1.3950389623641968, + "logits/rejected": -1.381929636001587, + "logps/chosen": -552.4439086914062, + "logps/rejected": -658.911376953125, + "loss": 0.4707, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.4922397136688232, + "rewards/margins": 1.3074629306793213, + "rewards/rejected": -3.7997028827667236, + "step": 3710 + }, + { + "epoch": 0.8925143953934741, + "grad_norm": 23.820545081228094, + "learning_rate": 1.7392836966831553e-08, + "logits/chosen": -1.424530267715454, + "logits/rejected": -1.4343727827072144, + "logps/chosen": -566.0718994140625, + "logps/rejected": -690.0733642578125, + "loss": 0.4677, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.6404526233673096, + "rewards/margins": 1.6194394826889038, + "rewards/rejected": -4.259891986846924, + "step": 3720 + }, + { + "epoch": 0.8949136276391555, + "grad_norm": 22.592101093719403, + "learning_rate": 1.663366935657373e-08, + "logits/chosen": -1.454740285873413, + "logits/rejected": -1.3993685245513916, + "logps/chosen": -503.31121826171875, + "logps/rejected": -624.0667724609375, + "loss": 0.5261, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.421086072921753, + "rewards/margins": 1.2379990816116333, + "rewards/rejected": -3.6590850353240967, + "step": 3730 + }, + { + "epoch": 0.8973128598848369, + "grad_norm": 43.69207506422277, + "learning_rate": 1.5890871467258898e-08, + "logits/chosen": -1.5168521404266357, + "logits/rejected": -1.5024161338806152, + "logps/chosen": -622.5906982421875, + "logps/rejected": -727.0234985351562, + "loss": 0.4954, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.6247501373291016, + "rewards/margins": 1.4162267446517944, + "rewards/rejected": -4.040976524353027, + "step": 3740 + }, + { + "epoch": 0.8997120921305183, + "grad_norm": 21.27400662832972, + "learning_rate": 1.5164495403207967e-08, + "logits/chosen": -1.5252599716186523, + "logits/rejected": -1.5785152912139893, + "logps/chosen": -562.35595703125, + "logps/rejected": -795.0603637695312, + "loss": 0.4491, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.826993227005005, + "rewards/margins": 2.0489425659179688, + "rewards/rejected": -4.875936031341553, + "step": 3750 + }, + { + "epoch": 0.9021113243761996, + "grad_norm": 23.91235252378669, + "learning_rate": 1.4454592116815962e-08, + "logits/chosen": -1.4300777912139893, + "logits/rejected": -1.4802331924438477, + "logps/chosen": -515.1054077148438, + "logps/rejected": -673.2876586914062, + "loss": 0.4622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3388595581054688, + "rewards/margins": 1.491581916809082, + "rewards/rejected": -3.83044171333313, + "step": 3760 + }, + { + "epoch": 0.904510556621881, + "grad_norm": 19.730016107430547, + "learning_rate": 1.3761211404977934e-08, + "logits/chosen": -1.4573686122894287, + "logits/rejected": -1.4766387939453125, + "logps/chosen": -539.85205078125, + "logps/rejected": -698.2182006835938, + "loss": 0.4292, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8132948875427246, + "rewards/margins": 1.7728464603424072, + "rewards/rejected": -4.586141586303711, + "step": 3770 + }, + { + "epoch": 0.9069097888675623, + "grad_norm": 30.77965794013918, + "learning_rate": 1.3084401905596177e-08, + "logits/chosen": -1.6211086511611938, + "logits/rejected": -1.5908045768737793, + "logps/chosen": -572.1397705078125, + "logps/rejected": -647.9318237304688, + "loss": 0.4689, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3642795085906982, + "rewards/margins": 1.3933395147323608, + "rewards/rejected": -3.7576186656951904, + "step": 3780 + }, + { + "epoch": 0.9093090211132437, + "grad_norm": 24.436863984944267, + "learning_rate": 1.2424211094168053e-08, + "logits/chosen": -1.3611688613891602, + "logits/rejected": -1.356358289718628, + "logps/chosen": -573.2617797851562, + "logps/rejected": -722.9159545898438, + "loss": 0.4986, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.152191400527954, + "rewards/margins": 1.6659166812896729, + "rewards/rejected": -3.818108320236206, + "step": 3790 + }, + { + "epoch": 0.9117082533589251, + "grad_norm": 20.702933969749694, + "learning_rate": 1.1780685280456143e-08, + "logits/chosen": -1.4703223705291748, + "logits/rejected": -1.5236238241195679, + "logps/chosen": -612.98095703125, + "logps/rejected": -827.43505859375, + "loss": 0.4901, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.8562207221984863, + "rewards/margins": 2.1300837993621826, + "rewards/rejected": -4.986304759979248, + "step": 3800 + }, + { + "epoch": 0.9141074856046065, + "grad_norm": 33.75173274638943, + "learning_rate": 1.1153869605239564e-08, + "logits/chosen": -1.409325361251831, + "logits/rejected": -1.3448197841644287, + "logps/chosen": -512.6187133789062, + "logps/rejected": -566.1109008789062, + "loss": 0.5013, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.2221503257751465, + "rewards/margins": 0.9565967321395874, + "rewards/rejected": -3.1787469387054443, + "step": 3810 + }, + { + "epoch": 0.9165067178502879, + "grad_norm": 29.756501450398336, + "learning_rate": 1.0543808037147606e-08, + "logits/chosen": -1.5313994884490967, + "logits/rejected": -1.5997869968414307, + "logps/chosen": -556.62841796875, + "logps/rejected": -752.580078125, + "loss": 0.4596, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6709938049316406, + "rewards/margins": 1.9339478015899658, + "rewards/rejected": -4.6049418449401855, + "step": 3820 + }, + { + "epoch": 0.9189059500959693, + "grad_norm": 19.376598320783035, + "learning_rate": 9.95054336957557e-09, + "logits/chosen": -1.5325264930725098, + "logits/rejected": -1.586796522140503, + "logps/chosen": -509.6279296875, + "logps/rejected": -615.4866943359375, + "loss": 0.4503, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.3629980087280273, + "rewards/margins": 1.0205209255218506, + "rewards/rejected": -3.383518934249878, + "step": 3830 + }, + { + "epoch": 0.9213051823416507, + "grad_norm": 27.54657646316352, + "learning_rate": 9.37411721768286e-09, + "logits/chosen": -1.3524452447891235, + "logits/rejected": -1.429274320602417, + "logps/chosen": -589.4090576171875, + "logps/rejected": -806.1002197265625, + "loss": 0.4425, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7954232692718506, + "rewards/margins": 1.8575918674468994, + "rewards/rejected": -4.65301513671875, + "step": 3840 + }, + { + "epoch": 0.9237044145873321, + "grad_norm": 26.952155576095937, + "learning_rate": 8.81457001547392e-09, + "logits/chosen": -1.4353164434432983, + "logits/rejected": -1.4641566276550293, + "logps/chosen": -517.4059448242188, + "logps/rejected": -627.7111206054688, + "loss": 0.497, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4591031074523926, + "rewards/margins": 0.9821527600288391, + "rewards/rejected": -3.441256046295166, + "step": 3850 + }, + { + "epoch": 0.9261036468330134, + "grad_norm": 23.41248980155676, + "learning_rate": 8.271941012961942e-09, + "logits/chosen": -1.3492867946624756, + "logits/rejected": -1.392040491104126, + "logps/chosen": -513.1867065429688, + "logps/rejected": -821.3966064453125, + "loss": 0.4693, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.730609178543091, + "rewards/margins": 2.518721103668213, + "rewards/rejected": -5.249329566955566, + "step": 3860 + }, + { + "epoch": 0.9285028790786948, + "grad_norm": 21.286871650299304, + "learning_rate": 7.746268273415568e-09, + "logits/chosen": -1.4082581996917725, + "logits/rejected": -1.4812945127487183, + "logps/chosen": -517.9276733398438, + "logps/rejected": -616.101806640625, + "loss": 0.4727, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2392101287841797, + "rewards/margins": 0.46764475107192993, + "rewards/rejected": -2.7068543434143066, + "step": 3870 + }, + { + "epoch": 0.9309021113243762, + "grad_norm": 24.090047403181376, + "learning_rate": 7.237588670689076e-09, + "logits/chosen": -1.5967910289764404, + "logits/rejected": -1.573727011680603, + "logps/chosen": -554.2374267578125, + "logps/rejected": -693.7000732421875, + "loss": 0.4697, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5206637382507324, + "rewards/margins": 1.8829853534698486, + "rewards/rejected": -4.403648376464844, + "step": 3880 + }, + { + "epoch": 0.9333013435700576, + "grad_norm": 18.140948516541975, + "learning_rate": 6.745937886635606e-09, + "logits/chosen": -1.5123597383499146, + "logits/rejected": -1.5788369178771973, + "logps/chosen": -588.5560913085938, + "logps/rejected": -802.8067626953125, + "loss": 0.4541, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6617789268493652, + "rewards/margins": 2.1918044090270996, + "rewards/rejected": -4.853583335876465, + "step": 3890 + }, + { + "epoch": 0.935700575815739, + "grad_norm": 25.30804513265546, + "learning_rate": 6.271350408604409e-09, + "logits/chosen": -1.4090216159820557, + "logits/rejected": -1.4261658191680908, + "logps/chosen": -464.48004150390625, + "logps/rejected": -654.8014526367188, + "loss": 0.4888, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.222743272781372, + "rewards/margins": 1.7430121898651123, + "rewards/rejected": -3.965754985809326, + "step": 3900 + }, + { + "epoch": 0.9380998080614203, + "grad_norm": 31.080461353798704, + "learning_rate": 5.813859527021487e-09, + "logits/chosen": -1.3681471347808838, + "logits/rejected": -1.4285444021224976, + "logps/chosen": -594.765380859375, + "logps/rejected": -748.1500244140625, + "loss": 0.4489, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.949069023132324, + "rewards/margins": 1.8202831745147705, + "rewards/rejected": -4.769351959228516, + "step": 3910 + }, + { + "epoch": 0.9404990403071017, + "grad_norm": 21.51602482669025, + "learning_rate": 5.373497333054616e-09, + "logits/chosen": -1.4571654796600342, + "logits/rejected": -1.456939935684204, + "logps/chosen": -541.2385864257812, + "logps/rejected": -650.3056640625, + "loss": 0.51, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.4899582862854004, + "rewards/margins": 1.148026466369629, + "rewards/rejected": -3.6379847526550293, + "step": 3920 + }, + { + "epoch": 0.9428982725527831, + "grad_norm": 23.21344036860555, + "learning_rate": 4.950294716362213e-09, + "logits/chosen": -1.5060057640075684, + "logits/rejected": -1.4898654222488403, + "logps/chosen": -587.83251953125, + "logps/rejected": -737.4891967773438, + "loss": 0.501, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.6853890419006348, + "rewards/margins": 1.5380029678344727, + "rewards/rejected": -4.223391532897949, + "step": 3930 + }, + { + "epoch": 0.9452975047984645, + "grad_norm": 25.423593889522927, + "learning_rate": 4.544281362926422e-09, + "logits/chosen": -1.5261056423187256, + "logits/rejected": -1.5510914325714111, + "logps/chosen": -571.5970458984375, + "logps/rejected": -696.4876098632812, + "loss": 0.5426, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4031929969787598, + "rewards/margins": 1.3733890056610107, + "rewards/rejected": -3.7765822410583496, + "step": 3940 + }, + { + "epoch": 0.9476967370441459, + "grad_norm": 23.51458593403726, + "learning_rate": 4.15548575297095e-09, + "logits/chosen": -1.5317834615707397, + "logits/rejected": -1.5348358154296875, + "logps/chosen": -544.53076171875, + "logps/rejected": -738.2919921875, + "loss": 0.4336, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.600316047668457, + "rewards/margins": 2.1086018085479736, + "rewards/rejected": -4.70891809463501, + "step": 3950 + }, + { + "epoch": 0.9500959692898272, + "grad_norm": 20.042348595522878, + "learning_rate": 3.7839351589631366e-09, + "logits/chosen": -1.5211379528045654, + "logits/rejected": -1.5875836610794067, + "logps/chosen": -498.101806640625, + "logps/rejected": -648.3429565429688, + "loss": 0.4954, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.650871753692627, + "rewards/margins": 1.0367019176483154, + "rewards/rejected": -3.6875739097595215, + "step": 3960 + }, + { + "epoch": 0.9524952015355086, + "grad_norm": 26.55526582014527, + "learning_rate": 3.4296556437010405e-09, + "logits/chosen": -1.5182307958602905, + "logits/rejected": -1.5342469215393066, + "logps/chosen": -524.88330078125, + "logps/rejected": -646.4898681640625, + "loss": 0.4859, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.885620594024658, + "rewards/margins": 1.2972753047943115, + "rewards/rejected": -4.182896137237549, + "step": 3970 + }, + { + "epoch": 0.95489443378119, + "grad_norm": 21.39526609738123, + "learning_rate": 3.092672058485124e-09, + "logits/chosen": -1.4536815881729126, + "logits/rejected": -1.4699939489364624, + "logps/chosen": -573.7540283203125, + "logps/rejected": -816.8096313476562, + "loss": 0.4932, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1411490440368652, + "rewards/margins": 2.276757001876831, + "rewards/rejected": -5.417906284332275, + "step": 3980 + }, + { + "epoch": 0.9572936660268714, + "grad_norm": 26.355467239356273, + "learning_rate": 2.7730080413750356e-09, + "logits/chosen": -1.4212232828140259, + "logits/rejected": -1.4322057962417603, + "logps/chosen": -529.4298095703125, + "logps/rejected": -661.1932373046875, + "loss": 0.5164, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2038445472717285, + "rewards/margins": 1.3835408687591553, + "rewards/rejected": -3.587385654449463, + "step": 3990 + }, + { + "epoch": 0.9596928982725528, + "grad_norm": 23.309851091514947, + "learning_rate": 2.4706860155316033e-09, + "logits/chosen": -1.4727245569229126, + "logits/rejected": -1.4892442226409912, + "logps/chosen": -662.63916015625, + "logps/rejected": -757.2869873046875, + "loss": 0.4748, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.821819543838501, + "rewards/margins": 1.053189992904663, + "rewards/rejected": -3.8750100135803223, + "step": 4000 + }, + { + "epoch": 0.9596928982725528, + "eval_logits/chosen": -1.353100299835205, + "eval_logits/rejected": -1.3760038614273071, + "eval_logps/chosen": -552.5407104492188, + "eval_logps/rejected": -719.654296875, + "eval_loss": 0.47066760063171387, + "eval_rewards/accuracies": 0.7982142567634583, + "eval_rewards/chosen": -2.631040334701538, + "eval_rewards/margins": 1.6578660011291504, + "eval_rewards/rejected": -4.288906097412109, + "eval_runtime": 44.3555, + "eval_samples_per_second": 100.574, + "eval_steps_per_second": 1.578, + "step": 4000 + }, + { + "epoch": 0.9620921305182342, + "grad_norm": 24.27927649059906, + "learning_rate": 2.185727187643843e-09, + "logits/chosen": -1.4950577020645142, + "logits/rejected": -1.5270750522613525, + "logps/chosen": -500.038330078125, + "logps/rejected": -711.5362548828125, + "loss": 0.4795, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5300304889678955, + "rewards/margins": 2.1758103370666504, + "rewards/rejected": -4.705840587615967, + "step": 4010 + }, + { + "epoch": 0.9644913627639156, + "grad_norm": 30.64778401028549, + "learning_rate": 1.9181515464413434e-09, + "logits/chosen": -1.5454280376434326, + "logits/rejected": -1.5744469165802002, + "logps/chosen": -657.9998779296875, + "logps/rejected": -818.6771240234375, + "loss": 0.4636, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7976233959198, + "rewards/margins": 1.7087122201919556, + "rewards/rejected": -4.506335258483887, + "step": 4020 + }, + { + "epoch": 0.966890595009597, + "grad_norm": 22.563549995490632, + "learning_rate": 1.6679778612923302e-09, + "logits/chosen": -1.535348653793335, + "logits/rejected": -1.4809868335723877, + "logps/chosen": -565.5070190429688, + "logps/rejected": -673.2102661132812, + "loss": 0.4419, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4256558418273926, + "rewards/margins": 1.0641313791275024, + "rewards/rejected": -3.4897868633270264, + "step": 4030 + }, + { + "epoch": 0.9692898272552783, + "grad_norm": 35.931156804761244, + "learning_rate": 1.43522368088686e-09, + "logits/chosen": -1.4140160083770752, + "logits/rejected": -1.4323618412017822, + "logps/chosen": -591.8991088867188, + "logps/rejected": -832.6375732421875, + "loss": 0.5389, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.999159574508667, + "rewards/margins": 2.465238094329834, + "rewards/rejected": -5.464398384094238, + "step": 4040 + }, + { + "epoch": 0.9716890595009597, + "grad_norm": 43.52676965402136, + "learning_rate": 1.2199053320059993e-09, + "logits/chosen": -1.443598985671997, + "logits/rejected": -1.4974671602249146, + "logps/chosen": -565.3262939453125, + "logps/rejected": -699.5096435546875, + "loss": 0.4874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5723953247070312, + "rewards/margins": 1.235426902770996, + "rewards/rejected": -3.8078227043151855, + "step": 4050 + }, + { + "epoch": 0.974088291746641, + "grad_norm": 20.507117194872063, + "learning_rate": 1.0220379183764338e-09, + "logits/chosen": -1.538069486618042, + "logits/rejected": -1.5514476299285889, + "logps/chosen": -485.15692138671875, + "logps/rejected": -685.5218505859375, + "loss": 0.4605, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4839510917663574, + "rewards/margins": 1.9514449834823608, + "rewards/rejected": -4.435396194458008, + "step": 4060 + }, + { + "epoch": 0.9764875239923224, + "grad_norm": 23.110029513543505, + "learning_rate": 8.416353196111503e-10, + "logits/chosen": -1.2751781940460205, + "logits/rejected": -1.2906882762908936, + "logps/chosen": -544.631103515625, + "logps/rejected": -683.9821166992188, + "loss": 0.5108, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7123055458068848, + "rewards/margins": 1.52595055103302, + "rewards/rejected": -4.238255500793457, + "step": 4070 + }, + { + "epoch": 0.9788867562380038, + "grad_norm": 28.604252506251186, + "learning_rate": 6.787101902356873e-10, + "logits/chosen": -1.3590788841247559, + "logits/rejected": -1.3869798183441162, + "logps/chosen": -589.5145263671875, + "logps/rejected": -724.806640625, + "loss": 0.4546, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.954763650894165, + "rewards/margins": 1.134934425354004, + "rewards/rejected": -4.089698314666748, + "step": 4080 + }, + { + "epoch": 0.9812859884836852, + "grad_norm": 37.61625439614764, + "learning_rate": 5.332739588005953e-10, + "logits/chosen": -1.4767444133758545, + "logits/rejected": -1.5401560068130493, + "logps/chosen": -463.3538513183594, + "logps/rejected": -687.2473754882812, + "loss": 0.4681, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.326956033706665, + "rewards/margins": 2.022627115249634, + "rewards/rejected": -4.349583625793457, + "step": 4090 + }, + { + "epoch": 0.9836852207293666, + "grad_norm": 26.481775607216615, + "learning_rate": 4.053368270797164e-10, + "logits/chosen": -1.366387963294983, + "logits/rejected": -1.4285480976104736, + "logps/chosen": -553.9191284179688, + "logps/rejected": -697.4608154296875, + "loss": 0.4603, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.899038314819336, + "rewards/margins": 1.5387704372406006, + "rewards/rejected": -4.437808990478516, + "step": 4100 + }, + { + "epoch": 0.986084452975048, + "grad_norm": 24.04893133679907, + "learning_rate": 2.949077693545354e-10, + "logits/chosen": -1.35219407081604, + "logits/rejected": -1.4018199443817139, + "logps/chosen": -555.1887817382812, + "logps/rejected": -677.9933471679688, + "loss": 0.5466, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.548947811126709, + "rewards/margins": 0.9237582087516785, + "rewards/rejected": -3.4727063179016113, + "step": 4110 + }, + { + "epoch": 0.9884836852207294, + "grad_norm": 27.762054928228263, + "learning_rate": 2.0199453178471047e-10, + "logits/chosen": -1.4359867572784424, + "logits/rejected": -1.3904130458831787, + "logps/chosen": -567.3448486328125, + "logps/rejected": -611.2131958007812, + "loss": 0.4739, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.2747626304626465, + "rewards/margins": 0.8477550745010376, + "rewards/rejected": -3.1225175857543945, + "step": 4120 + }, + { + "epoch": 0.9908829174664108, + "grad_norm": 23.298235071299157, + "learning_rate": 1.266036318647301e-10, + "logits/chosen": -1.4464765787124634, + "logits/rejected": -1.4459589719772339, + "logps/chosen": -616.7385864257812, + "logps/rejected": -795.5848388671875, + "loss": 0.4904, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.873567819595337, + "rewards/margins": 2.0249698162078857, + "rewards/rejected": -4.898537635803223, + "step": 4130 + }, + { + "epoch": 0.9932821497120922, + "grad_norm": 32.26180983119736, + "learning_rate": 6.874035796672339e-11, + "logits/chosen": -1.5064882040023804, + "logits/rejected": -1.523242712020874, + "logps/chosen": -583.457275390625, + "logps/rejected": -798.6057739257812, + "loss": 0.4827, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.515347957611084, + "rewards/margins": 2.765990734100342, + "rewards/rejected": -5.281338691711426, + "step": 4140 + }, + { + "epoch": 0.9956813819577736, + "grad_norm": 28.31537059638512, + "learning_rate": 2.8408768969423458e-11, + "logits/chosen": -1.5372827053070068, + "logits/rejected": -1.571604609489441, + "logps/chosen": -562.3444213867188, + "logps/rejected": -688.1968994140625, + "loss": 0.4823, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5870022773742676, + "rewards/margins": 1.2521250247955322, + "rewards/rejected": -3.8391270637512207, + "step": 4150 + }, + { + "epoch": 0.9980806142034548, + "grad_norm": 37.07056120335297, + "learning_rate": 5.611693973617271e-12, + "logits/chosen": -1.3595718145370483, + "logits/rejected": -1.3886905908584595, + "logps/chosen": -498.3702697753906, + "logps/rejected": -663.3173828125, + "loss": 0.5019, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.395380735397339, + "rewards/margins": 1.4838107824325562, + "rewards/rejected": -3.8791911602020264, + "step": 4160 + }, + { + "epoch": 1.0, + "step": 4168, + "total_flos": 0.0, + "train_loss": 0.535525489448357, + "train_runtime": 3334.8399, + "train_samples_per_second": 39.992, + "train_steps_per_second": 1.25 + } + ], + "logging_steps": 10, + "max_steps": 4168, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}