{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009328358208955223, "grad_norm": 2.121034418795048, "learning_rate": 1.8656716417910446e-06, "loss": 0.8543, "step": 5 }, { "epoch": 0.018656716417910446, "grad_norm": 1.386398411743523, "learning_rate": 3.7313432835820893e-06, "loss": 0.8455, "step": 10 }, { "epoch": 0.027985074626865673, "grad_norm": 1.1101470317456796, "learning_rate": 5.597014925373135e-06, "loss": 0.8038, "step": 15 }, { "epoch": 0.03731343283582089, "grad_norm": 0.8013797226467336, "learning_rate": 7.4626865671641785e-06, "loss": 0.7508, "step": 20 }, { "epoch": 0.04664179104477612, "grad_norm": 0.683953332431515, "learning_rate": 9.328358208955226e-06, "loss": 0.7165, "step": 25 }, { "epoch": 0.055970149253731345, "grad_norm": 0.4119555194078616, "learning_rate": 1.119402985074627e-05, "loss": 0.7092, "step": 30 }, { "epoch": 0.06529850746268656, "grad_norm": 0.37547887353375553, "learning_rate": 1.3059701492537313e-05, "loss": 0.6887, "step": 35 }, { "epoch": 0.07462686567164178, "grad_norm": 0.3101358013472518, "learning_rate": 1.4925373134328357e-05, "loss": 0.6693, "step": 40 }, { "epoch": 0.08395522388059702, "grad_norm": 0.26511595676985605, "learning_rate": 1.6791044776119406e-05, "loss": 0.6471, "step": 45 }, { "epoch": 0.09328358208955224, "grad_norm": 0.28373382307355094, "learning_rate": 1.865671641791045e-05, "loss": 0.6462, "step": 50 }, { "epoch": 0.10261194029850747, "grad_norm": 0.2934554192201568, "learning_rate": 2.0522388059701493e-05, "loss": 0.6466, "step": 55 }, { "epoch": 0.11194029850746269, "grad_norm": 0.3062667944477996, "learning_rate": 2.238805970149254e-05, "loss": 0.63, "step": 60 }, { "epoch": 0.12126865671641791, "grad_norm": 0.2799075678940294, "learning_rate": 2.4253731343283584e-05, "loss": 0.6246, "step": 65 }, { "epoch": 0.13059701492537312, "grad_norm": 0.2512460981933614, "learning_rate": 2.6119402985074626e-05, "loss": 0.6189, "step": 70 }, { "epoch": 0.13992537313432835, "grad_norm": 0.2938159026053503, "learning_rate": 2.7985074626865672e-05, "loss": 0.6133, "step": 75 }, { "epoch": 0.14925373134328357, "grad_norm": 0.30143409828377027, "learning_rate": 2.9850746268656714e-05, "loss": 0.6115, "step": 80 }, { "epoch": 0.15858208955223882, "grad_norm": 0.34009734256140634, "learning_rate": 3.171641791044776e-05, "loss": 0.6151, "step": 85 }, { "epoch": 0.16791044776119404, "grad_norm": 0.31419492635115615, "learning_rate": 3.358208955223881e-05, "loss": 0.611, "step": 90 }, { "epoch": 0.17723880597014927, "grad_norm": 0.32728471787745855, "learning_rate": 3.5447761194029854e-05, "loss": 0.5973, "step": 95 }, { "epoch": 0.1865671641791045, "grad_norm": 0.3591067977861098, "learning_rate": 3.73134328358209e-05, "loss": 0.593, "step": 100 }, { "epoch": 0.1958955223880597, "grad_norm": 0.4386121168984034, "learning_rate": 3.9179104477611945e-05, "loss": 0.6026, "step": 105 }, { "epoch": 0.20522388059701493, "grad_norm": 0.4119425850013888, "learning_rate": 4.104477611940299e-05, "loss": 0.5938, "step": 110 }, { "epoch": 0.21455223880597016, "grad_norm": 0.4102881865294559, "learning_rate": 4.2910447761194036e-05, "loss": 0.5881, "step": 115 }, { "epoch": 0.22388059701492538, "grad_norm": 0.46627923917215447, "learning_rate": 4.477611940298508e-05, "loss": 0.5983, "step": 120 }, { "epoch": 0.2332089552238806, "grad_norm": 0.45495754351792306, "learning_rate": 4.664179104477612e-05, "loss": 0.5834, "step": 125 }, { "epoch": 0.24253731343283583, "grad_norm": 0.5567203912631291, "learning_rate": 4.850746268656717e-05, "loss": 0.587, "step": 130 }, { "epoch": 0.251865671641791, "grad_norm": 0.39377838400086723, "learning_rate": 4.9999982870865717e-05, "loss": 0.5785, "step": 135 }, { "epoch": 0.26119402985074625, "grad_norm": 0.43477936591893745, "learning_rate": 4.9999383353904156e-05, "loss": 0.5824, "step": 140 }, { "epoch": 0.27052238805970147, "grad_norm": 0.4066219850417484, "learning_rate": 4.999792740630874e-05, "loss": 0.581, "step": 145 }, { "epoch": 0.2798507462686567, "grad_norm": 0.4026101332790226, "learning_rate": 4.999561508349957e-05, "loss": 0.5738, "step": 150 }, { "epoch": 0.2891791044776119, "grad_norm": 0.3453298481509857, "learning_rate": 4.999244647349435e-05, "loss": 0.5698, "step": 155 }, { "epoch": 0.29850746268656714, "grad_norm": 0.3993570099372397, "learning_rate": 4.998842169690504e-05, "loss": 0.5821, "step": 160 }, { "epoch": 0.30783582089552236, "grad_norm": 0.3662962109066574, "learning_rate": 4.998354090693326e-05, "loss": 0.5745, "step": 165 }, { "epoch": 0.31716417910447764, "grad_norm": 0.32997283077921064, "learning_rate": 4.997780428936446e-05, "loss": 0.5664, "step": 170 }, { "epoch": 0.32649253731343286, "grad_norm": 0.39807216361492537, "learning_rate": 4.9971212062560844e-05, "loss": 0.5723, "step": 175 }, { "epoch": 0.3358208955223881, "grad_norm": 0.3696932644843243, "learning_rate": 4.996376447745307e-05, "loss": 0.5733, "step": 180 }, { "epoch": 0.3451492537313433, "grad_norm": 0.4372976817638053, "learning_rate": 4.995546181753069e-05, "loss": 0.5769, "step": 185 }, { "epoch": 0.35447761194029853, "grad_norm": 0.4583343737343749, "learning_rate": 4.9946304398831336e-05, "loss": 0.5651, "step": 190 }, { "epoch": 0.36380597014925375, "grad_norm": 0.47360557075080717, "learning_rate": 4.993629256992876e-05, "loss": 0.5649, "step": 195 }, { "epoch": 0.373134328358209, "grad_norm": 0.43539669566382294, "learning_rate": 4.992542671191948e-05, "loss": 0.5686, "step": 200 }, { "epoch": 0.3824626865671642, "grad_norm": 0.38159352517011014, "learning_rate": 4.991370723840834e-05, "loss": 0.5591, "step": 205 }, { "epoch": 0.3917910447761194, "grad_norm": 0.4068296933055857, "learning_rate": 4.990113459549271e-05, "loss": 0.5662, "step": 210 }, { "epoch": 0.40111940298507465, "grad_norm": 0.39143685300330167, "learning_rate": 4.9887709261745566e-05, "loss": 0.5704, "step": 215 }, { "epoch": 0.41044776119402987, "grad_norm": 0.3894456921916186, "learning_rate": 4.987343174819723e-05, "loss": 0.568, "step": 220 }, { "epoch": 0.4197761194029851, "grad_norm": 0.35578713017504704, "learning_rate": 4.98583025983159e-05, "loss": 0.5637, "step": 225 }, { "epoch": 0.4291044776119403, "grad_norm": 0.3494631382796153, "learning_rate": 4.984232238798707e-05, "loss": 0.5556, "step": 230 }, { "epoch": 0.43843283582089554, "grad_norm": 0.3583576786848622, "learning_rate": 4.982549172549145e-05, "loss": 0.5565, "step": 235 }, { "epoch": 0.44776119402985076, "grad_norm": 0.2897260118430239, "learning_rate": 4.980781125148194e-05, "loss": 0.5635, "step": 240 }, { "epoch": 0.457089552238806, "grad_norm": 0.35269356942644625, "learning_rate": 4.9789281638959184e-05, "loss": 0.5618, "step": 245 }, { "epoch": 0.4664179104477612, "grad_norm": 0.3009406856158159, "learning_rate": 4.976990359324597e-05, "loss": 0.551, "step": 250 }, { "epoch": 0.47574626865671643, "grad_norm": 0.31989082834293203, "learning_rate": 4.974967785196039e-05, "loss": 0.5586, "step": 255 }, { "epoch": 0.48507462686567165, "grad_norm": 0.3115624141345025, "learning_rate": 4.9728605184987724e-05, "loss": 0.5505, "step": 260 }, { "epoch": 0.4944029850746269, "grad_norm": 0.32558028056536303, "learning_rate": 4.970668639445119e-05, "loss": 0.5443, "step": 265 }, { "epoch": 0.503731343283582, "grad_norm": 0.3274278767518983, "learning_rate": 4.9683922314681374e-05, "loss": 0.5589, "step": 270 }, { "epoch": 0.5130597014925373, "grad_norm": 0.3009635526028401, "learning_rate": 4.966031381218447e-05, "loss": 0.558, "step": 275 }, { "epoch": 0.5223880597014925, "grad_norm": 0.4396827504947035, "learning_rate": 4.9635861785609333e-05, "loss": 0.5553, "step": 280 }, { "epoch": 0.5317164179104478, "grad_norm": 0.3545148936240071, "learning_rate": 4.961056716571322e-05, "loss": 0.5427, "step": 285 }, { "epoch": 0.5410447761194029, "grad_norm": 0.3933594879121108, "learning_rate": 4.95844309153264e-05, "loss": 0.5555, "step": 290 }, { "epoch": 0.5503731343283582, "grad_norm": 0.3399572004763983, "learning_rate": 4.95574540293155e-05, "loss": 0.5443, "step": 295 }, { "epoch": 0.5597014925373134, "grad_norm": 0.3232862240098154, "learning_rate": 4.952963753454563e-05, "loss": 0.5484, "step": 300 }, { "epoch": 0.5690298507462687, "grad_norm": 0.33747902495874976, "learning_rate": 4.950098248984127e-05, "loss": 0.5431, "step": 305 }, { "epoch": 0.5783582089552238, "grad_norm": 0.3290209060277867, "learning_rate": 4.947148998594601e-05, "loss": 0.5492, "step": 310 }, { "epoch": 0.5876865671641791, "grad_norm": 0.32520191068262977, "learning_rate": 4.9441161145481016e-05, "loss": 0.5463, "step": 315 }, { "epoch": 0.5970149253731343, "grad_norm": 0.29039846259923247, "learning_rate": 4.940999712290229e-05, "loss": 0.5423, "step": 320 }, { "epoch": 0.6063432835820896, "grad_norm": 0.3204817871102438, "learning_rate": 4.9377999104456704e-05, "loss": 0.5406, "step": 325 }, { "epoch": 0.6156716417910447, "grad_norm": 0.2825835467043114, "learning_rate": 4.934516830813693e-05, "loss": 0.5449, "step": 330 }, { "epoch": 0.625, "grad_norm": 0.26725114415380724, "learning_rate": 4.931150598363494e-05, "loss": 0.5413, "step": 335 }, { "epoch": 0.6343283582089553, "grad_norm": 0.3087079192612929, "learning_rate": 4.927701341229457e-05, "loss": 0.5435, "step": 340 }, { "epoch": 0.6436567164179104, "grad_norm": 0.32408427410804364, "learning_rate": 4.924169190706271e-05, "loss": 0.5369, "step": 345 }, { "epoch": 0.6529850746268657, "grad_norm": 0.3212882285201007, "learning_rate": 4.920554281243925e-05, "loss": 0.5467, "step": 350 }, { "epoch": 0.6623134328358209, "grad_norm": 0.28326390138255647, "learning_rate": 4.9168567504425994e-05, "loss": 0.5334, "step": 355 }, { "epoch": 0.6716417910447762, "grad_norm": 0.34309833318712024, "learning_rate": 4.913076739047425e-05, "loss": 0.5341, "step": 360 }, { "epoch": 0.6809701492537313, "grad_norm": 0.297600748395305, "learning_rate": 4.909214390943127e-05, "loss": 0.5373, "step": 365 }, { "epoch": 0.6902985074626866, "grad_norm": 0.30872545241758026, "learning_rate": 4.905269853148543e-05, "loss": 0.5359, "step": 370 }, { "epoch": 0.6996268656716418, "grad_norm": 0.314658149979199, "learning_rate": 4.901243275811034e-05, "loss": 0.5386, "step": 375 }, { "epoch": 0.7089552238805971, "grad_norm": 0.2793415267373151, "learning_rate": 4.897134812200763e-05, "loss": 0.5407, "step": 380 }, { "epoch": 0.7182835820895522, "grad_norm": 0.2724464978389461, "learning_rate": 4.892944618704865e-05, "loss": 0.5434, "step": 385 }, { "epoch": 0.7276119402985075, "grad_norm": 0.3315773509057709, "learning_rate": 4.8886728548214933e-05, "loss": 0.5414, "step": 390 }, { "epoch": 0.7369402985074627, "grad_norm": 0.308645520805135, "learning_rate": 4.884319683153746e-05, "loss": 0.5384, "step": 395 }, { "epoch": 0.746268656716418, "grad_norm": 0.332026287137246, "learning_rate": 4.8798852694034775e-05, "loss": 0.5338, "step": 400 }, { "epoch": 0.7555970149253731, "grad_norm": 0.30032605296770787, "learning_rate": 4.875369782364994e-05, "loss": 0.5404, "step": 405 }, { "epoch": 0.7649253731343284, "grad_norm": 0.3129449335154319, "learning_rate": 4.8707733939186254e-05, "loss": 0.5265, "step": 410 }, { "epoch": 0.7742537313432836, "grad_norm": 0.3127862898447159, "learning_rate": 4.8660962790241824e-05, "loss": 0.542, "step": 415 }, { "epoch": 0.7835820895522388, "grad_norm": 0.3140878734044955, "learning_rate": 4.861338615714299e-05, "loss": 0.5336, "step": 420 }, { "epoch": 0.792910447761194, "grad_norm": 0.3234850128387726, "learning_rate": 4.856500585087654e-05, "loss": 0.5319, "step": 425 }, { "epoch": 0.8022388059701493, "grad_norm": 0.2778667128567412, "learning_rate": 4.851582371302078e-05, "loss": 0.5321, "step": 430 }, { "epoch": 0.8115671641791045, "grad_norm": 0.30469138781369043, "learning_rate": 4.8465841615675464e-05, "loss": 0.5338, "step": 435 }, { "epoch": 0.8208955223880597, "grad_norm": 0.2906308159807314, "learning_rate": 4.8415061461390444e-05, "loss": 0.5282, "step": 440 }, { "epoch": 0.8302238805970149, "grad_norm": 0.3185179967436335, "learning_rate": 4.836348518309337e-05, "loss": 0.5382, "step": 445 }, { "epoch": 0.8395522388059702, "grad_norm": 0.32713910558428405, "learning_rate": 4.831111474401604e-05, "loss": 0.5353, "step": 450 }, { "epoch": 0.8488805970149254, "grad_norm": 0.33990883183475434, "learning_rate": 4.825795213761967e-05, "loss": 0.528, "step": 455 }, { "epoch": 0.8582089552238806, "grad_norm": 0.3006237472444059, "learning_rate": 4.8203999387519036e-05, "loss": 0.5378, "step": 460 }, { "epoch": 0.8675373134328358, "grad_norm": 0.326536753442336, "learning_rate": 4.8149258547405466e-05, "loss": 0.5426, "step": 465 }, { "epoch": 0.8768656716417911, "grad_norm": 0.29573931532782105, "learning_rate": 4.809373170096859e-05, "loss": 0.5224, "step": 470 }, { "epoch": 0.8861940298507462, "grad_norm": 0.2883989058192913, "learning_rate": 4.803742096181711e-05, "loss": 0.5259, "step": 475 }, { "epoch": 0.8955223880597015, "grad_norm": 0.2909601202872592, "learning_rate": 4.7980328473398314e-05, "loss": 0.5304, "step": 480 }, { "epoch": 0.9048507462686567, "grad_norm": 0.3200870380282512, "learning_rate": 4.7922456408916465e-05, "loss": 0.5382, "step": 485 }, { "epoch": 0.914179104477612, "grad_norm": 0.30209946891319994, "learning_rate": 4.786380697125012e-05, "loss": 0.5243, "step": 490 }, { "epoch": 0.9235074626865671, "grad_norm": 0.25886874405552174, "learning_rate": 4.780438239286824e-05, "loss": 0.526, "step": 495 }, { "epoch": 0.9328358208955224, "grad_norm": 0.3139053559393455, "learning_rate": 4.774418493574523e-05, "loss": 0.528, "step": 500 }, { "epoch": 0.9421641791044776, "grad_norm": 0.3097126952454523, "learning_rate": 4.768321689127483e-05, "loss": 0.523, "step": 505 }, { "epoch": 0.9514925373134329, "grad_norm": 0.35405358251343566, "learning_rate": 4.7621480580182925e-05, "loss": 0.5231, "step": 510 }, { "epoch": 0.960820895522388, "grad_norm": 0.3440818935184419, "learning_rate": 4.755897835243916e-05, "loss": 0.5243, "step": 515 }, { "epoch": 0.9701492537313433, "grad_norm": 0.2926842091101089, "learning_rate": 4.74957125871675e-05, "loss": 0.5284, "step": 520 }, { "epoch": 0.9794776119402985, "grad_norm": 0.34179809103321573, "learning_rate": 4.743168569255572e-05, "loss": 0.5303, "step": 525 }, { "epoch": 0.9888059701492538, "grad_norm": 0.2912165103354673, "learning_rate": 4.736690010576368e-05, "loss": 0.5286, "step": 530 }, { "epoch": 0.9981343283582089, "grad_norm": 0.26172528091797354, "learning_rate": 4.730135829283055e-05, "loss": 0.5283, "step": 535 }, { "epoch": 1.007462686567164, "grad_norm": 0.31414431065005655, "learning_rate": 4.723506274858101e-05, "loss": 0.4969, "step": 540 }, { "epoch": 1.0167910447761195, "grad_norm": 0.2778344814847648, "learning_rate": 4.7168015996530204e-05, "loss": 0.5098, "step": 545 }, { "epoch": 1.0261194029850746, "grad_norm": 0.2973869582476319, "learning_rate": 4.7100220588787755e-05, "loss": 0.4906, "step": 550 }, { "epoch": 1.0354477611940298, "grad_norm": 0.34737437112939884, "learning_rate": 4.703167910596055e-05, "loss": 0.5022, "step": 555 }, { "epoch": 1.044776119402985, "grad_norm": 0.2975882675134865, "learning_rate": 4.696239415705458e-05, "loss": 0.5006, "step": 560 }, { "epoch": 1.0541044776119404, "grad_norm": 0.2645048410351983, "learning_rate": 4.689236837937556e-05, "loss": 0.4966, "step": 565 }, { "epoch": 1.0634328358208955, "grad_norm": 0.2701727642177748, "learning_rate": 4.6821604438428594e-05, "loss": 0.5061, "step": 570 }, { "epoch": 1.0727611940298507, "grad_norm": 0.30137178795339964, "learning_rate": 4.6750105027816716e-05, "loss": 0.4872, "step": 575 }, { "epoch": 1.0820895522388059, "grad_norm": 0.306383142029591, "learning_rate": 4.6677872869138304e-05, "loss": 0.4975, "step": 580 }, { "epoch": 1.0914179104477613, "grad_norm": 0.2950552307899199, "learning_rate": 4.660491071188353e-05, "loss": 0.495, "step": 585 }, { "epoch": 1.1007462686567164, "grad_norm": 0.2889901979990583, "learning_rate": 4.6531221333329694e-05, "loss": 0.5051, "step": 590 }, { "epoch": 1.1100746268656716, "grad_norm": 0.2972033844000557, "learning_rate": 4.64568075384355e-05, "loss": 0.4994, "step": 595 }, { "epoch": 1.1194029850746268, "grad_norm": 0.30996002146552903, "learning_rate": 4.6381672159734287e-05, "loss": 0.492, "step": 600 }, { "epoch": 1.1287313432835822, "grad_norm": 0.2822626469543736, "learning_rate": 4.6305818057226226e-05, "loss": 0.494, "step": 605 }, { "epoch": 1.1380597014925373, "grad_norm": 0.35356169265897275, "learning_rate": 4.622924811826942e-05, "loss": 0.5035, "step": 610 }, { "epoch": 1.1473880597014925, "grad_norm": 0.2719585044666097, "learning_rate": 4.615196525747003e-05, "loss": 0.4924, "step": 615 }, { "epoch": 1.1567164179104479, "grad_norm": 0.28769230273582963, "learning_rate": 4.607397241657133e-05, "loss": 0.5, "step": 620 }, { "epoch": 1.166044776119403, "grad_norm": 0.32179250115775737, "learning_rate": 4.599527256434171e-05, "loss": 0.4938, "step": 625 }, { "epoch": 1.1753731343283582, "grad_norm": 0.3269358532771784, "learning_rate": 4.5915868696461685e-05, "loss": 0.4986, "step": 630 }, { "epoch": 1.1847014925373134, "grad_norm": 0.35815419781244623, "learning_rate": 4.5835763835409864e-05, "loss": 0.499, "step": 635 }, { "epoch": 1.1940298507462686, "grad_norm": 0.3014883873373339, "learning_rate": 4.57549610303479e-05, "loss": 0.4974, "step": 640 }, { "epoch": 1.203358208955224, "grad_norm": 0.32900981412756597, "learning_rate": 4.567346335700442e-05, "loss": 0.5051, "step": 645 }, { "epoch": 1.212686567164179, "grad_norm": 0.31251868369193175, "learning_rate": 4.559127391755796e-05, "loss": 0.4875, "step": 650 }, { "epoch": 1.2220149253731343, "grad_norm": 0.2717174025148126, "learning_rate": 4.5508395840518884e-05, "loss": 0.4913, "step": 655 }, { "epoch": 1.2313432835820897, "grad_norm": 0.34926058096653734, "learning_rate": 4.5424832280610245e-05, "loss": 0.4954, "step": 660 }, { "epoch": 1.2406716417910448, "grad_norm": 0.2979539728855119, "learning_rate": 4.53405864186478e-05, "loss": 0.4956, "step": 665 }, { "epoch": 1.25, "grad_norm": 0.28119456446638286, "learning_rate": 4.5255661461418854e-05, "loss": 0.501, "step": 670 }, { "epoch": 1.2593283582089552, "grad_norm": 0.2979282368203432, "learning_rate": 4.517006064156023e-05, "loss": 0.4957, "step": 675 }, { "epoch": 1.2686567164179103, "grad_norm": 0.3054998826103471, "learning_rate": 4.5083787217435175e-05, "loss": 0.4842, "step": 680 }, { "epoch": 1.2779850746268657, "grad_norm": 0.28811809748248135, "learning_rate": 4.4996844473009425e-05, "loss": 0.5043, "step": 685 }, { "epoch": 1.287313432835821, "grad_norm": 0.2741946985743944, "learning_rate": 4.4909235717726086e-05, "loss": 0.4869, "step": 690 }, { "epoch": 1.296641791044776, "grad_norm": 0.24360891675458493, "learning_rate": 4.4820964286379764e-05, "loss": 0.4993, "step": 695 }, { "epoch": 1.3059701492537314, "grad_norm": 0.27866035694054087, "learning_rate": 4.4732033538989556e-05, "loss": 0.4871, "step": 700 }, { "epoch": 1.3152985074626866, "grad_norm": 0.33993875429669684, "learning_rate": 4.4642446860671185e-05, "loss": 0.4965, "step": 705 }, { "epoch": 1.3246268656716418, "grad_norm": 0.27262054765657656, "learning_rate": 4.455220766150814e-05, "loss": 0.4962, "step": 710 }, { "epoch": 1.333955223880597, "grad_norm": 0.3381508836646884, "learning_rate": 4.4461319376421875e-05, "loss": 0.4946, "step": 715 }, { "epoch": 1.3432835820895521, "grad_norm": 0.2827312978497954, "learning_rate": 4.436978546504105e-05, "loss": 0.488, "step": 720 }, { "epoch": 1.3526119402985075, "grad_norm": 0.26609615619911087, "learning_rate": 4.427760941156986e-05, "loss": 0.4871, "step": 725 }, { "epoch": 1.3619402985074627, "grad_norm": 0.24723042047864005, "learning_rate": 4.418479472465539e-05, "loss": 0.4917, "step": 730 }, { "epoch": 1.3712686567164178, "grad_norm": 0.3164390574163236, "learning_rate": 4.409134493725409e-05, "loss": 0.4924, "step": 735 }, { "epoch": 1.3805970149253732, "grad_norm": 0.3252809591300553, "learning_rate": 4.3997263606497225e-05, "loss": 0.5038, "step": 740 }, { "epoch": 1.3899253731343284, "grad_norm": 0.289902074351053, "learning_rate": 4.390255431355557e-05, "loss": 0.4947, "step": 745 }, { "epoch": 1.3992537313432836, "grad_norm": 0.2925282606858288, "learning_rate": 4.380722066350303e-05, "loss": 0.4921, "step": 750 }, { "epoch": 1.4085820895522387, "grad_norm": 0.3392527223209102, "learning_rate": 4.3711266285179415e-05, "loss": 0.4921, "step": 755 }, { "epoch": 1.417910447761194, "grad_norm": 0.2760653408608711, "learning_rate": 4.361469483105236e-05, "loss": 0.4917, "step": 760 }, { "epoch": 1.4272388059701493, "grad_norm": 0.2777656743215194, "learning_rate": 4.351750997707824e-05, "loss": 0.4916, "step": 765 }, { "epoch": 1.4365671641791045, "grad_norm": 0.30383717084417433, "learning_rate": 4.341971542256225e-05, "loss": 0.483, "step": 770 }, { "epoch": 1.4458955223880596, "grad_norm": 0.24254861783080864, "learning_rate": 4.332131489001762e-05, "loss": 0.4917, "step": 775 }, { "epoch": 1.455223880597015, "grad_norm": 0.2545972911860277, "learning_rate": 4.322231212502394e-05, "loss": 0.4857, "step": 780 }, { "epoch": 1.4645522388059702, "grad_norm": 0.2454238842097937, "learning_rate": 4.3122710896084504e-05, "loss": 0.4908, "step": 785 }, { "epoch": 1.4738805970149254, "grad_norm": 0.22955128945953768, "learning_rate": 4.302251499448294e-05, "loss": 0.4825, "step": 790 }, { "epoch": 1.4832089552238805, "grad_norm": 0.2867180916017195, "learning_rate": 4.292172823413887e-05, "loss": 0.4917, "step": 795 }, { "epoch": 1.4925373134328357, "grad_norm": 0.2875494459689191, "learning_rate": 4.282035445146272e-05, "loss": 0.4886, "step": 800 }, { "epoch": 1.501865671641791, "grad_norm": 0.28245546775244584, "learning_rate": 4.271839750520972e-05, "loss": 0.4882, "step": 805 }, { "epoch": 1.5111940298507462, "grad_norm": 0.2801519926842846, "learning_rate": 4.261586127633297e-05, "loss": 0.4796, "step": 810 }, { "epoch": 1.5205223880597014, "grad_norm": 0.26093007090268494, "learning_rate": 4.251274966783579e-05, "loss": 0.4888, "step": 815 }, { "epoch": 1.5298507462686568, "grad_norm": 0.25743333359730924, "learning_rate": 4.2409066604623096e-05, "loss": 0.4877, "step": 820 }, { "epoch": 1.539179104477612, "grad_norm": 0.3404739464519635, "learning_rate": 4.230481603335201e-05, "loss": 0.4896, "step": 825 }, { "epoch": 1.5485074626865671, "grad_norm": 0.3061300926409675, "learning_rate": 4.220000192228161e-05, "loss": 0.496, "step": 830 }, { "epoch": 1.5578358208955225, "grad_norm": 0.2506042462513741, "learning_rate": 4.209462826112195e-05, "loss": 0.4876, "step": 835 }, { "epoch": 1.5671641791044775, "grad_norm": 0.2759104849218615, "learning_rate": 4.1988699060882144e-05, "loss": 0.4893, "step": 840 }, { "epoch": 1.5764925373134329, "grad_norm": 0.26990049980649783, "learning_rate": 4.188221835371766e-05, "loss": 0.4908, "step": 845 }, { "epoch": 1.585820895522388, "grad_norm": 0.2586843175291616, "learning_rate": 4.1775190192776905e-05, "loss": 0.4902, "step": 850 }, { "epoch": 1.5951492537313432, "grad_norm": 0.256103499758438, "learning_rate": 4.1667618652046894e-05, "loss": 0.489, "step": 855 }, { "epoch": 1.6044776119402986, "grad_norm": 0.22792047188722547, "learning_rate": 4.155950782619819e-05, "loss": 0.4913, "step": 860 }, { "epoch": 1.6138059701492538, "grad_norm": 0.25201567903758465, "learning_rate": 4.145086183042907e-05, "loss": 0.4906, "step": 865 }, { "epoch": 1.623134328358209, "grad_norm": 0.2518565917852366, "learning_rate": 4.13416848003088e-05, "loss": 0.4868, "step": 870 }, { "epoch": 1.6324626865671643, "grad_norm": 0.23748856167927077, "learning_rate": 4.123198089162033e-05, "loss": 0.4842, "step": 875 }, { "epoch": 1.6417910447761193, "grad_norm": 0.24353602399447155, "learning_rate": 4.112175428020199e-05, "loss": 0.493, "step": 880 }, { "epoch": 1.6511194029850746, "grad_norm": 1.3222397655957763, "learning_rate": 4.1011009161788655e-05, "loss": 0.4906, "step": 885 }, { "epoch": 1.6604477611940298, "grad_norm": 0.3059990977090486, "learning_rate": 4.089974975185192e-05, "loss": 0.4945, "step": 890 }, { "epoch": 1.669776119402985, "grad_norm": 0.2249262029856417, "learning_rate": 4.078798028543974e-05, "loss": 0.4804, "step": 895 }, { "epoch": 1.6791044776119404, "grad_norm": 0.23192854470162289, "learning_rate": 4.067570501701513e-05, "loss": 0.4846, "step": 900 }, { "epoch": 1.6884328358208955, "grad_norm": 0.27874866188929165, "learning_rate": 4.056292822029432e-05, "loss": 0.4921, "step": 905 }, { "epoch": 1.6977611940298507, "grad_norm": 0.3597771307359175, "learning_rate": 4.0449654188083985e-05, "loss": 0.4878, "step": 910 }, { "epoch": 1.707089552238806, "grad_norm": 0.30317779903784486, "learning_rate": 4.033588723211793e-05, "loss": 0.4915, "step": 915 }, { "epoch": 1.716417910447761, "grad_norm": 0.24871996860642873, "learning_rate": 4.022163168289287e-05, "loss": 0.489, "step": 920 }, { "epoch": 1.7257462686567164, "grad_norm": 0.2548739095394753, "learning_rate": 4.010689188950367e-05, "loss": 0.4894, "step": 925 }, { "epoch": 1.7350746268656716, "grad_norm": 0.3091745382527843, "learning_rate": 3.999167221947777e-05, "loss": 0.4918, "step": 930 }, { "epoch": 1.7444029850746268, "grad_norm": 0.2646978821890823, "learning_rate": 3.987597705860891e-05, "loss": 0.4904, "step": 935 }, { "epoch": 1.7537313432835822, "grad_norm": 0.26945797823135964, "learning_rate": 3.9759810810790236e-05, "loss": 0.4866, "step": 940 }, { "epoch": 1.7630597014925373, "grad_norm": 0.3203065088796758, "learning_rate": 3.964317789784664e-05, "loss": 0.4883, "step": 945 }, { "epoch": 1.7723880597014925, "grad_norm": 0.2655242760294149, "learning_rate": 3.952608275936644e-05, "loss": 0.4832, "step": 950 }, { "epoch": 1.7817164179104479, "grad_norm": 0.2368067566302728, "learning_rate": 3.940852985253239e-05, "loss": 0.4763, "step": 955 }, { "epoch": 1.7910447761194028, "grad_norm": 0.2730681042901973, "learning_rate": 3.9290523651952046e-05, "loss": 0.4881, "step": 960 }, { "epoch": 1.8003731343283582, "grad_norm": 0.22160712770576704, "learning_rate": 3.9172068649487405e-05, "loss": 0.483, "step": 965 }, { "epoch": 1.8097014925373134, "grad_norm": 0.28722103458528975, "learning_rate": 3.9053169354083946e-05, "loss": 0.4846, "step": 970 }, { "epoch": 1.8190298507462686, "grad_norm": 0.25494073735839834, "learning_rate": 3.893383029159899e-05, "loss": 0.4817, "step": 975 }, { "epoch": 1.828358208955224, "grad_norm": 0.24967332583867882, "learning_rate": 3.881405600462943e-05, "loss": 0.4853, "step": 980 }, { "epoch": 1.837686567164179, "grad_norm": 0.25224718872662366, "learning_rate": 3.869385105233884e-05, "loss": 0.4891, "step": 985 }, { "epoch": 1.8470149253731343, "grad_norm": 0.28145314036193597, "learning_rate": 3.857322001028385e-05, "loss": 0.4893, "step": 990 }, { "epoch": 1.8563432835820897, "grad_norm": 0.22884685179726927, "learning_rate": 3.84521674702401e-05, "loss": 0.4865, "step": 995 }, { "epoch": 1.8656716417910446, "grad_norm": 0.25614671664081545, "learning_rate": 3.8330698040027345e-05, "loss": 0.4958, "step": 1000 }, { "epoch": 1.875, "grad_norm": 0.24607306900884124, "learning_rate": 3.8208816343334156e-05, "loss": 0.4885, "step": 1005 }, { "epoch": 1.8843283582089554, "grad_norm": 0.25164065693712045, "learning_rate": 3.808652701954183e-05, "loss": 0.4831, "step": 1010 }, { "epoch": 1.8936567164179103, "grad_norm": 0.2610394568849165, "learning_rate": 3.7963834723547866e-05, "loss": 0.4817, "step": 1015 }, { "epoch": 1.9029850746268657, "grad_norm": 0.2364369781820542, "learning_rate": 3.784074412558875e-05, "loss": 0.4814, "step": 1020 }, { "epoch": 1.912313432835821, "grad_norm": 0.2656717095516705, "learning_rate": 3.771725991106214e-05, "loss": 0.4896, "step": 1025 }, { "epoch": 1.921641791044776, "grad_norm": 0.23806169860280865, "learning_rate": 3.7593386780348625e-05, "loss": 0.4877, "step": 1030 }, { "epoch": 1.9309701492537314, "grad_norm": 0.2915019196027403, "learning_rate": 3.7469129448632704e-05, "loss": 0.4899, "step": 1035 }, { "epoch": 1.9402985074626866, "grad_norm": 0.282081760789875, "learning_rate": 3.734449264572336e-05, "loss": 0.4862, "step": 1040 }, { "epoch": 1.9496268656716418, "grad_norm": 0.2694345566497008, "learning_rate": 3.721948111587399e-05, "loss": 0.4853, "step": 1045 }, { "epoch": 1.9589552238805972, "grad_norm": 0.26633357424432796, "learning_rate": 3.709409961760186e-05, "loss": 0.4806, "step": 1050 }, { "epoch": 1.9682835820895521, "grad_norm": 0.23647682944775708, "learning_rate": 3.69683529235069e-05, "loss": 0.4835, "step": 1055 }, { "epoch": 1.9776119402985075, "grad_norm": 0.222269771960961, "learning_rate": 3.684224582009014e-05, "loss": 0.4921, "step": 1060 }, { "epoch": 1.9869402985074627, "grad_norm": 0.2686661501440812, "learning_rate": 3.67157831075714e-05, "loss": 0.4831, "step": 1065 }, { "epoch": 1.9962686567164178, "grad_norm": 0.24422430992844657, "learning_rate": 3.6588969599706665e-05, "loss": 0.4739, "step": 1070 }, { "epoch": 2.0055970149253732, "grad_norm": 0.2701482202313567, "learning_rate": 3.6461810123604805e-05, "loss": 0.4598, "step": 1075 }, { "epoch": 2.014925373134328, "grad_norm": 0.2677169649930948, "learning_rate": 3.633430951954383e-05, "loss": 0.4584, "step": 1080 }, { "epoch": 2.0242537313432836, "grad_norm": 0.23909457585088278, "learning_rate": 3.6206472640786696e-05, "loss": 0.4543, "step": 1085 }, { "epoch": 2.033582089552239, "grad_norm": 0.2625554050273166, "learning_rate": 3.607830435339648e-05, "loss": 0.4469, "step": 1090 }, { "epoch": 2.042910447761194, "grad_norm": 0.24516645623371994, "learning_rate": 3.5949809536051235e-05, "loss": 0.4523, "step": 1095 }, { "epoch": 2.0522388059701493, "grad_norm": 0.31725961510930195, "learning_rate": 3.5820993079858235e-05, "loss": 0.4486, "step": 1100 }, { "epoch": 2.0615671641791047, "grad_norm": 0.2585017243397624, "learning_rate": 3.5691859888167846e-05, "loss": 0.4524, "step": 1105 }, { "epoch": 2.0708955223880596, "grad_norm": 0.21399700096282148, "learning_rate": 3.556241487638682e-05, "loss": 0.444, "step": 1110 }, { "epoch": 2.080223880597015, "grad_norm": 0.247405554141379, "learning_rate": 3.5432662971791264e-05, "loss": 0.4537, "step": 1115 }, { "epoch": 2.08955223880597, "grad_norm": 0.27050904136729376, "learning_rate": 3.5302609113339e-05, "loss": 0.4473, "step": 1120 }, { "epoch": 2.0988805970149254, "grad_norm": 0.27997752959508676, "learning_rate": 3.517225825148164e-05, "loss": 0.4475, "step": 1125 }, { "epoch": 2.1082089552238807, "grad_norm": 0.2508324570465046, "learning_rate": 3.504161534797612e-05, "loss": 0.4564, "step": 1130 }, { "epoch": 2.1175373134328357, "grad_norm": 0.2505577185625896, "learning_rate": 3.491068537569581e-05, "loss": 0.4584, "step": 1135 }, { "epoch": 2.126865671641791, "grad_norm": 0.2512329127124906, "learning_rate": 3.477947331844127e-05, "loss": 0.4506, "step": 1140 }, { "epoch": 2.1361940298507465, "grad_norm": 0.2505092710246372, "learning_rate": 3.4647984170750506e-05, "loss": 0.4466, "step": 1145 }, { "epoch": 2.1455223880597014, "grad_norm": 0.23692759571107674, "learning_rate": 3.451622293770889e-05, "loss": 0.4504, "step": 1150 }, { "epoch": 2.154850746268657, "grad_norm": 0.24882402563302974, "learning_rate": 3.438419463475857e-05, "loss": 0.4498, "step": 1155 }, { "epoch": 2.1641791044776117, "grad_norm": 0.2636377327024063, "learning_rate": 3.425190428750767e-05, "loss": 0.4549, "step": 1160 }, { "epoch": 2.173507462686567, "grad_norm": 0.263071217750677, "learning_rate": 3.4119356931538894e-05, "loss": 0.4516, "step": 1165 }, { "epoch": 2.1828358208955225, "grad_norm": 0.25512234184313887, "learning_rate": 3.3986557612217904e-05, "loss": 0.4533, "step": 1170 }, { "epoch": 2.1921641791044775, "grad_norm": 0.24589762599569973, "learning_rate": 3.3853511384501256e-05, "loss": 0.4509, "step": 1175 }, { "epoch": 2.201492537313433, "grad_norm": 0.2306387403078458, "learning_rate": 3.372022331274397e-05, "loss": 0.4471, "step": 1180 }, { "epoch": 2.2108208955223883, "grad_norm": 0.21865053002746085, "learning_rate": 3.358669847050676e-05, "loss": 0.4413, "step": 1185 }, { "epoch": 2.220149253731343, "grad_norm": 0.2339648120862053, "learning_rate": 3.3452941940362946e-05, "loss": 0.4545, "step": 1190 }, { "epoch": 2.2294776119402986, "grad_norm": 0.24800672005935778, "learning_rate": 3.331895881370495e-05, "loss": 0.451, "step": 1195 }, { "epoch": 2.2388059701492535, "grad_norm": 0.23259368716602813, "learning_rate": 3.3184754190550506e-05, "loss": 0.4544, "step": 1200 }, { "epoch": 2.248134328358209, "grad_norm": 0.25042806312770716, "learning_rate": 3.305033317934852e-05, "loss": 0.4553, "step": 1205 }, { "epoch": 2.2574626865671643, "grad_norm": 0.22493015878466893, "learning_rate": 3.2915700896784655e-05, "loss": 0.4555, "step": 1210 }, { "epoch": 2.2667910447761193, "grad_norm": 0.24531211609267742, "learning_rate": 3.2780862467586486e-05, "loss": 0.4484, "step": 1215 }, { "epoch": 2.2761194029850746, "grad_norm": 0.289099905728071, "learning_rate": 3.264582302432856e-05, "loss": 0.457, "step": 1220 }, { "epoch": 2.28544776119403, "grad_norm": 0.23882369829329309, "learning_rate": 3.251058770723688e-05, "loss": 0.4533, "step": 1225 }, { "epoch": 2.294776119402985, "grad_norm": 0.24933032718510625, "learning_rate": 3.237516166399336e-05, "loss": 0.4505, "step": 1230 }, { "epoch": 2.3041044776119404, "grad_norm": 0.3092190390189165, "learning_rate": 3.223955004953979e-05, "loss": 0.4557, "step": 1235 }, { "epoch": 2.3134328358208958, "grad_norm": 0.23732950256272162, "learning_rate": 3.21037580258817e-05, "loss": 0.4459, "step": 1240 }, { "epoch": 2.3227611940298507, "grad_norm": 0.2694830320581079, "learning_rate": 3.1967790761891826e-05, "loss": 0.4413, "step": 1245 }, { "epoch": 2.332089552238806, "grad_norm": 0.26230996679756863, "learning_rate": 3.1831653433113317e-05, "loss": 0.4404, "step": 1250 }, { "epoch": 2.341417910447761, "grad_norm": 0.2599329871144144, "learning_rate": 3.169535122156283e-05, "loss": 0.4516, "step": 1255 }, { "epoch": 2.3507462686567164, "grad_norm": 0.2515188117499561, "learning_rate": 3.155888931553319e-05, "loss": 0.4517, "step": 1260 }, { "epoch": 2.360074626865672, "grad_norm": 0.2518581477767336, "learning_rate": 3.142227290939595e-05, "loss": 0.4471, "step": 1265 }, { "epoch": 2.3694029850746268, "grad_norm": 0.2913821168948179, "learning_rate": 3.128550720340362e-05, "loss": 0.4498, "step": 1270 }, { "epoch": 2.378731343283582, "grad_norm": 0.23742628150528924, "learning_rate": 3.1148597403491816e-05, "loss": 0.4598, "step": 1275 }, { "epoch": 2.388059701492537, "grad_norm": 0.24603486688045673, "learning_rate": 3.1011548721080955e-05, "loss": 0.4504, "step": 1280 }, { "epoch": 2.3973880597014925, "grad_norm": 0.24872391674357305, "learning_rate": 3.0874366372878036e-05, "loss": 0.4579, "step": 1285 }, { "epoch": 2.406716417910448, "grad_norm": 0.23882605051839018, "learning_rate": 3.073705558067797e-05, "loss": 0.445, "step": 1290 }, { "epoch": 2.416044776119403, "grad_norm": 0.21744783644949, "learning_rate": 3.059962157116481e-05, "loss": 0.444, "step": 1295 }, { "epoch": 2.425373134328358, "grad_norm": 0.2276749735457607, "learning_rate": 3.046206957571288e-05, "loss": 0.4572, "step": 1300 }, { "epoch": 2.4347014925373136, "grad_norm": 0.22258669822560026, "learning_rate": 3.0324404830187564e-05, "loss": 0.452, "step": 1305 }, { "epoch": 2.4440298507462686, "grad_norm": 0.24648134384762546, "learning_rate": 3.0186632574746055e-05, "loss": 0.4471, "step": 1310 }, { "epoch": 2.453358208955224, "grad_norm": 0.20660053351636773, "learning_rate": 3.0048758053637844e-05, "loss": 0.4464, "step": 1315 }, { "epoch": 2.4626865671641793, "grad_norm": 0.23411770492177655, "learning_rate": 2.9910786515005146e-05, "loss": 0.4523, "step": 1320 }, { "epoch": 2.4720149253731343, "grad_norm": 0.24129308546300254, "learning_rate": 2.977272321068311e-05, "loss": 0.4507, "step": 1325 }, { "epoch": 2.4813432835820897, "grad_norm": 0.2586800557168319, "learning_rate": 2.9634573395999916e-05, "loss": 0.4538, "step": 1330 }, { "epoch": 2.4906716417910446, "grad_norm": 0.2598119310111589, "learning_rate": 2.949634232957671e-05, "loss": 0.4473, "step": 1335 }, { "epoch": 2.5, "grad_norm": 0.23154775718286533, "learning_rate": 2.9358035273127483e-05, "loss": 0.4504, "step": 1340 }, { "epoch": 2.5093283582089554, "grad_norm": 0.21988113374945198, "learning_rate": 2.921965749125873e-05, "loss": 0.4521, "step": 1345 }, { "epoch": 2.5186567164179103, "grad_norm": 0.22071852330221872, "learning_rate": 2.9081214251269095e-05, "loss": 0.4506, "step": 1350 }, { "epoch": 2.5279850746268657, "grad_norm": 0.2505255389957608, "learning_rate": 2.894271082294887e-05, "loss": 0.4472, "step": 1355 }, { "epoch": 2.5373134328358207, "grad_norm": 0.24536125625333594, "learning_rate": 2.8804152478379377e-05, "loss": 0.448, "step": 1360 }, { "epoch": 2.546641791044776, "grad_norm": 0.2228212456437418, "learning_rate": 2.8665544491732315e-05, "loss": 0.4524, "step": 1365 }, { "epoch": 2.5559701492537314, "grad_norm": 0.23576369282989343, "learning_rate": 2.852689213906899e-05, "loss": 0.4534, "step": 1370 }, { "epoch": 2.5652985074626864, "grad_norm": 0.2534692838297303, "learning_rate": 2.8388200698139484e-05, "loss": 0.4497, "step": 1375 }, { "epoch": 2.574626865671642, "grad_norm": 0.2005031157983129, "learning_rate": 2.824947544818175e-05, "loss": 0.4479, "step": 1380 }, { "epoch": 2.583955223880597, "grad_norm": 0.21217515933261696, "learning_rate": 2.8110721669720663e-05, "loss": 0.4501, "step": 1385 }, { "epoch": 2.593283582089552, "grad_norm": 0.2219814873145583, "learning_rate": 2.7971944644367066e-05, "loss": 0.4475, "step": 1390 }, { "epoch": 2.6026119402985075, "grad_norm": 0.22257208200822728, "learning_rate": 2.7833149654616637e-05, "loss": 0.4551, "step": 1395 }, { "epoch": 2.611940298507463, "grad_norm": 0.22937197746194612, "learning_rate": 2.7694341983648884e-05, "loss": 0.4447, "step": 1400 }, { "epoch": 2.621268656716418, "grad_norm": 0.2297129799553422, "learning_rate": 2.7555526915126033e-05, "loss": 0.4497, "step": 1405 }, { "epoch": 2.6305970149253732, "grad_norm": 0.24045353041136358, "learning_rate": 2.7416709732991863e-05, "loss": 0.442, "step": 1410 }, { "epoch": 2.6399253731343286, "grad_norm": 0.2098724512574441, "learning_rate": 2.727789572127064e-05, "loss": 0.4514, "step": 1415 }, { "epoch": 2.6492537313432836, "grad_norm": 0.21086783550500132, "learning_rate": 2.7139090163865932e-05, "loss": 0.4477, "step": 1420 }, { "epoch": 2.658582089552239, "grad_norm": 0.19872376649385332, "learning_rate": 2.7000298344359494e-05, "loss": 0.451, "step": 1425 }, { "epoch": 2.667910447761194, "grad_norm": 0.21413444011174176, "learning_rate": 2.686152554581016e-05, "loss": 0.4563, "step": 1430 }, { "epoch": 2.6772388059701493, "grad_norm": 0.22019113589044567, "learning_rate": 2.6722777050552737e-05, "loss": 0.4518, "step": 1435 }, { "epoch": 2.6865671641791042, "grad_norm": 0.21430351026594524, "learning_rate": 2.6584058139996942e-05, "loss": 0.4519, "step": 1440 }, { "epoch": 2.6958955223880596, "grad_norm": 0.2124880231730821, "learning_rate": 2.644537409442635e-05, "loss": 0.4443, "step": 1445 }, { "epoch": 2.705223880597015, "grad_norm": 0.21544245549648117, "learning_rate": 2.630673019279742e-05, "loss": 0.4428, "step": 1450 }, { "epoch": 2.71455223880597, "grad_norm": 0.2374942584028172, "learning_rate": 2.616813171253855e-05, "loss": 0.4526, "step": 1455 }, { "epoch": 2.7238805970149254, "grad_norm": 0.2303175335552436, "learning_rate": 2.602958392934917e-05, "loss": 0.4414, "step": 1460 }, { "epoch": 2.7332089552238807, "grad_norm": 0.21816219945655713, "learning_rate": 2.589109211699899e-05, "loss": 0.4465, "step": 1465 }, { "epoch": 2.7425373134328357, "grad_norm": 0.26627924617132775, "learning_rate": 2.575266154712715e-05, "loss": 0.449, "step": 1470 }, { "epoch": 2.751865671641791, "grad_norm": 0.23684158389618096, "learning_rate": 2.5614297489041673e-05, "loss": 0.4576, "step": 1475 }, { "epoch": 2.7611940298507465, "grad_norm": 0.23475167325562465, "learning_rate": 2.54760052095188e-05, "loss": 0.4447, "step": 1480 }, { "epoch": 2.7705223880597014, "grad_norm": 0.23168600761578412, "learning_rate": 2.5337789972602566e-05, "loss": 0.4523, "step": 1485 }, { "epoch": 2.779850746268657, "grad_norm": 0.2278106734525843, "learning_rate": 2.519965703940441e-05, "loss": 0.4469, "step": 1490 }, { "epoch": 2.789179104477612, "grad_norm": 0.23114405617108513, "learning_rate": 2.5061611667902878e-05, "loss": 0.4447, "step": 1495 }, { "epoch": 2.798507462686567, "grad_norm": 0.20310853682777202, "learning_rate": 2.4923659112743576e-05, "loss": 0.4519, "step": 1500 }, { "epoch": 2.8078358208955225, "grad_norm": 0.22554669887963202, "learning_rate": 2.4785804625039005e-05, "loss": 0.4486, "step": 1505 }, { "epoch": 2.8171641791044775, "grad_norm": 0.20802311181549868, "learning_rate": 2.4648053452168857e-05, "loss": 0.4499, "step": 1510 }, { "epoch": 2.826492537313433, "grad_norm": 0.22419415037750132, "learning_rate": 2.4510410837580106e-05, "loss": 0.4448, "step": 1515 }, { "epoch": 2.835820895522388, "grad_norm": 0.22695329816889367, "learning_rate": 2.437288202058755e-05, "loss": 0.4473, "step": 1520 }, { "epoch": 2.845149253731343, "grad_norm": 0.24586365893566128, "learning_rate": 2.423547223617429e-05, "loss": 0.4532, "step": 1525 }, { "epoch": 2.8544776119402986, "grad_norm": 0.22237142953083974, "learning_rate": 2.4098186714792504e-05, "loss": 0.4446, "step": 1530 }, { "epoch": 2.8638059701492535, "grad_norm": 0.21485855619269772, "learning_rate": 2.396103068216437e-05, "loss": 0.4524, "step": 1535 }, { "epoch": 2.873134328358209, "grad_norm": 0.20512432795452964, "learning_rate": 2.3824009359083073e-05, "loss": 0.4478, "step": 1540 }, { "epoch": 2.8824626865671643, "grad_norm": 0.24926856432584166, "learning_rate": 2.368712796121419e-05, "loss": 0.4502, "step": 1545 }, { "epoch": 2.8917910447761193, "grad_norm": 0.20287043976884245, "learning_rate": 2.355039169889704e-05, "loss": 0.4392, "step": 1550 }, { "epoch": 2.9011194029850746, "grad_norm": 0.22247232536578646, "learning_rate": 2.3413805776946453e-05, "loss": 0.4462, "step": 1555 }, { "epoch": 2.91044776119403, "grad_norm": 0.2088778450701711, "learning_rate": 2.3277375394454594e-05, "loss": 0.4521, "step": 1560 }, { "epoch": 2.919776119402985, "grad_norm": 0.2118633921818663, "learning_rate": 2.3141105744593065e-05, "loss": 0.45, "step": 1565 }, { "epoch": 2.9291044776119404, "grad_norm": 0.214382891667105, "learning_rate": 2.3005002014415274e-05, "loss": 0.4363, "step": 1570 }, { "epoch": 2.9384328358208958, "grad_norm": 0.23652051214246758, "learning_rate": 2.2869069384658908e-05, "loss": 0.4549, "step": 1575 }, { "epoch": 2.9477611940298507, "grad_norm": 0.23608345707793307, "learning_rate": 2.273331302954883e-05, "loss": 0.4487, "step": 1580 }, { "epoch": 2.957089552238806, "grad_norm": 0.2070405094021611, "learning_rate": 2.2597738116600048e-05, "loss": 0.4453, "step": 1585 }, { "epoch": 2.966417910447761, "grad_norm": 0.20040509422113545, "learning_rate": 2.2462349806421035e-05, "loss": 0.4449, "step": 1590 }, { "epoch": 2.9757462686567164, "grad_norm": 0.2086007273425691, "learning_rate": 2.2327153252517323e-05, "loss": 0.4519, "step": 1595 }, { "epoch": 2.9850746268656714, "grad_norm": 0.21271465489446673, "learning_rate": 2.2192153601095293e-05, "loss": 0.4524, "step": 1600 }, { "epoch": 2.9944029850746268, "grad_norm": 0.22841492651261236, "learning_rate": 2.2057355990866328e-05, "loss": 0.4423, "step": 1605 }, { "epoch": 3.003731343283582, "grad_norm": 0.2355488586735545, "learning_rate": 2.1922765552851155e-05, "loss": 0.4374, "step": 1610 }, { "epoch": 3.013059701492537, "grad_norm": 0.23873187367366028, "learning_rate": 2.1788387410184603e-05, "loss": 0.4188, "step": 1615 }, { "epoch": 3.0223880597014925, "grad_norm": 0.27496687301273603, "learning_rate": 2.165422667792053e-05, "loss": 0.4149, "step": 1620 }, { "epoch": 3.031716417910448, "grad_norm": 0.2626973283332266, "learning_rate": 2.1520288462837175e-05, "loss": 0.4196, "step": 1625 }, { "epoch": 3.041044776119403, "grad_norm": 0.22595479414924702, "learning_rate": 2.1386577863242708e-05, "loss": 0.4177, "step": 1630 }, { "epoch": 3.050373134328358, "grad_norm": 0.25129656641836207, "learning_rate": 2.1253099968781237e-05, "loss": 0.4161, "step": 1635 }, { "epoch": 3.0597014925373136, "grad_norm": 0.22282939815260608, "learning_rate": 2.1119859860239023e-05, "loss": 0.4177, "step": 1640 }, { "epoch": 3.0690298507462686, "grad_norm": 0.23548492358506962, "learning_rate": 2.0986862609351077e-05, "loss": 0.4099, "step": 1645 }, { "epoch": 3.078358208955224, "grad_norm": 0.22179542751400436, "learning_rate": 2.085411327860815e-05, "loss": 0.4192, "step": 1650 }, { "epoch": 3.0876865671641793, "grad_norm": 0.25304363550861025, "learning_rate": 2.072161692106399e-05, "loss": 0.4203, "step": 1655 }, { "epoch": 3.0970149253731343, "grad_norm": 0.2480140727264067, "learning_rate": 2.0589378580143016e-05, "loss": 0.4146, "step": 1660 }, { "epoch": 3.1063432835820897, "grad_norm": 0.21847935803539179, "learning_rate": 2.0457403289448353e-05, "loss": 0.4209, "step": 1665 }, { "epoch": 3.1156716417910446, "grad_norm": 0.20902992688637434, "learning_rate": 2.0325696072570195e-05, "loss": 0.4139, "step": 1670 }, { "epoch": 3.125, "grad_norm": 0.21754030361222712, "learning_rate": 2.0194261942894628e-05, "loss": 0.4181, "step": 1675 }, { "epoch": 3.1343283582089554, "grad_norm": 0.20694365349316288, "learning_rate": 2.006310590341276e-05, "loss": 0.4105, "step": 1680 }, { "epoch": 3.1436567164179103, "grad_norm": 0.2071251610508526, "learning_rate": 1.99322329465303e-05, "loss": 0.4179, "step": 1685 }, { "epoch": 3.1529850746268657, "grad_norm": 0.21759585929009814, "learning_rate": 1.9801648053877548e-05, "loss": 0.4184, "step": 1690 }, { "epoch": 3.1623134328358207, "grad_norm": 0.22050151546560828, "learning_rate": 1.96713561961197e-05, "loss": 0.4141, "step": 1695 }, { "epoch": 3.171641791044776, "grad_norm": 0.22253613578112014, "learning_rate": 1.9541362332767737e-05, "loss": 0.4228, "step": 1700 }, { "epoch": 3.1809701492537314, "grad_norm": 0.19839501492912362, "learning_rate": 1.9411671411989568e-05, "loss": 0.414, "step": 1705 }, { "epoch": 3.1902985074626864, "grad_norm": 0.2013322826804476, "learning_rate": 1.9282288370421708e-05, "loss": 0.4147, "step": 1710 }, { "epoch": 3.199626865671642, "grad_norm": 0.20196073431908243, "learning_rate": 1.9153218132981375e-05, "loss": 0.4062, "step": 1715 }, { "epoch": 3.208955223880597, "grad_norm": 0.21588820315257334, "learning_rate": 1.9024465612678993e-05, "loss": 0.4144, "step": 1720 }, { "epoch": 3.218283582089552, "grad_norm": 0.21136292352138739, "learning_rate": 1.8896035710431225e-05, "loss": 0.4162, "step": 1725 }, { "epoch": 3.2276119402985075, "grad_norm": 0.20644301060440495, "learning_rate": 1.8767933314874382e-05, "loss": 0.4225, "step": 1730 }, { "epoch": 3.236940298507463, "grad_norm": 0.20373952596490721, "learning_rate": 1.8640163302178377e-05, "loss": 0.425, "step": 1735 }, { "epoch": 3.246268656716418, "grad_norm": 0.19956376155653088, "learning_rate": 1.851273053586105e-05, "loss": 0.4215, "step": 1740 }, { "epoch": 3.2555970149253732, "grad_norm": 0.2192621385530846, "learning_rate": 1.8385639866603144e-05, "loss": 0.4235, "step": 1745 }, { "epoch": 3.264925373134328, "grad_norm": 0.22190324817635299, "learning_rate": 1.825889613206355e-05, "loss": 0.423, "step": 1750 }, { "epoch": 3.2742537313432836, "grad_norm": 0.21509283236860452, "learning_rate": 1.8132504156695245e-05, "loss": 0.4184, "step": 1755 }, { "epoch": 3.283582089552239, "grad_norm": 0.21272223946473828, "learning_rate": 1.8006468751561628e-05, "loss": 0.4135, "step": 1760 }, { "epoch": 3.292910447761194, "grad_norm": 0.2024917471517567, "learning_rate": 1.7880794714153366e-05, "loss": 0.4173, "step": 1765 }, { "epoch": 3.3022388059701493, "grad_norm": 0.19897521312461197, "learning_rate": 1.775548682820582e-05, "loss": 0.4218, "step": 1770 }, { "epoch": 3.3115671641791042, "grad_norm": 0.1948941102488988, "learning_rate": 1.7630549863516914e-05, "loss": 0.4113, "step": 1775 }, { "epoch": 3.3208955223880596, "grad_norm": 0.20961116985028122, "learning_rate": 1.750598857576561e-05, "loss": 0.4174, "step": 1780 }, { "epoch": 3.330223880597015, "grad_norm": 0.2058003884841531, "learning_rate": 1.738180770633085e-05, "loss": 0.4217, "step": 1785 }, { "epoch": 3.33955223880597, "grad_norm": 0.20266242836573772, "learning_rate": 1.7258011982111094e-05, "loss": 0.42, "step": 1790 }, { "epoch": 3.3488805970149254, "grad_norm": 0.20969589335485514, "learning_rate": 1.7134606115344427e-05, "loss": 0.421, "step": 1795 }, { "epoch": 3.3582089552238807, "grad_norm": 0.20921219271012487, "learning_rate": 1.701159480342911e-05, "loss": 0.4164, "step": 1800 }, { "epoch": 3.3675373134328357, "grad_norm": 0.20043065344216884, "learning_rate": 1.688898272874485e-05, "loss": 0.4155, "step": 1805 }, { "epoch": 3.376865671641791, "grad_norm": 0.20153421696690615, "learning_rate": 1.6766774558474523e-05, "loss": 0.4225, "step": 1810 }, { "epoch": 3.3861940298507465, "grad_norm": 0.21247259015987596, "learning_rate": 1.664497494442654e-05, "loss": 0.4185, "step": 1815 }, { "epoch": 3.3955223880597014, "grad_norm": 0.2150656010163744, "learning_rate": 1.6523588522857784e-05, "loss": 0.4192, "step": 1820 }, { "epoch": 3.404850746268657, "grad_norm": 0.2105856063960592, "learning_rate": 1.6402619914297087e-05, "loss": 0.4201, "step": 1825 }, { "epoch": 3.4141791044776117, "grad_norm": 0.21525767148826294, "learning_rate": 1.6282073723369427e-05, "loss": 0.4168, "step": 1830 }, { "epoch": 3.423507462686567, "grad_norm": 0.21408388704168538, "learning_rate": 1.616195453862057e-05, "loss": 0.4077, "step": 1835 }, { "epoch": 3.4328358208955225, "grad_norm": 0.20173835556321396, "learning_rate": 1.6042266932342498e-05, "loss": 0.4156, "step": 1840 }, { "epoch": 3.4421641791044775, "grad_norm": 0.20254785823429441, "learning_rate": 1.5923015460399277e-05, "loss": 0.4165, "step": 1845 }, { "epoch": 3.451492537313433, "grad_norm": 0.20462441361094633, "learning_rate": 1.580420466205369e-05, "loss": 0.4207, "step": 1850 }, { "epoch": 3.4608208955223883, "grad_norm": 0.21957280319743333, "learning_rate": 1.5685839059794476e-05, "loss": 0.4135, "step": 1855 }, { "epoch": 3.470149253731343, "grad_norm": 0.1985391187078084, "learning_rate": 1.5567923159164108e-05, "loss": 0.4193, "step": 1860 }, { "epoch": 3.4794776119402986, "grad_norm": 0.1883910872971496, "learning_rate": 1.545046144858738e-05, "loss": 0.4215, "step": 1865 }, { "epoch": 3.4888059701492535, "grad_norm": 0.19311038623232277, "learning_rate": 1.533345839920045e-05, "loss": 0.4129, "step": 1870 }, { "epoch": 3.498134328358209, "grad_norm": 0.2031192810163652, "learning_rate": 1.5216918464680776e-05, "loss": 0.4208, "step": 1875 }, { "epoch": 3.5074626865671643, "grad_norm": 0.18560194077318384, "learning_rate": 1.5100846081077479e-05, "loss": 0.4181, "step": 1880 }, { "epoch": 3.5167910447761193, "grad_norm": 0.187075548940992, "learning_rate": 1.498524566664253e-05, "loss": 0.4193, "step": 1885 }, { "epoch": 3.5261194029850746, "grad_norm": 0.20482243568772138, "learning_rate": 1.4870121621662594e-05, "loss": 0.4173, "step": 1890 }, { "epoch": 3.53544776119403, "grad_norm": 0.1955696714571297, "learning_rate": 1.4755478328291476e-05, "loss": 0.4159, "step": 1895 }, { "epoch": 3.544776119402985, "grad_norm": 0.195668312408888, "learning_rate": 1.4641320150383391e-05, "loss": 0.409, "step": 1900 }, { "epoch": 3.5541044776119404, "grad_norm": 0.21306378781812377, "learning_rate": 1.4527651433326786e-05, "loss": 0.4161, "step": 1905 }, { "epoch": 3.5634328358208958, "grad_norm": 0.2186608446069958, "learning_rate": 1.4414476503878968e-05, "loss": 0.4249, "step": 1910 }, { "epoch": 3.5727611940298507, "grad_norm": 0.21069100204169788, "learning_rate": 1.430179967000141e-05, "loss": 0.4184, "step": 1915 }, { "epoch": 3.582089552238806, "grad_norm": 0.21234916884336327, "learning_rate": 1.4189625220695746e-05, "loss": 0.4253, "step": 1920 }, { "epoch": 3.591417910447761, "grad_norm": 0.19838959219370442, "learning_rate": 1.4077957425840563e-05, "loss": 0.4182, "step": 1925 }, { "epoch": 3.6007462686567164, "grad_norm": 0.1967929116537206, "learning_rate": 1.3966800536028802e-05, "loss": 0.4152, "step": 1930 }, { "epoch": 3.6100746268656714, "grad_norm": 0.20336437845148642, "learning_rate": 1.3856158782406007e-05, "loss": 0.417, "step": 1935 }, { "epoch": 3.6194029850746268, "grad_norm": 0.20715263319921318, "learning_rate": 1.3746036376509252e-05, "loss": 0.4154, "step": 1940 }, { "epoch": 3.628731343283582, "grad_norm": 0.20903474915017056, "learning_rate": 1.3636437510106836e-05, "loss": 0.4193, "step": 1945 }, { "epoch": 3.638059701492537, "grad_norm": 0.21915725837328753, "learning_rate": 1.352736635503873e-05, "loss": 0.416, "step": 1950 }, { "epoch": 3.6473880597014925, "grad_norm": 0.21123954255146313, "learning_rate": 1.3418827063057754e-05, "loss": 0.4186, "step": 1955 }, { "epoch": 3.656716417910448, "grad_norm": 0.19203461617818343, "learning_rate": 1.3310823765671571e-05, "loss": 0.4164, "step": 1960 }, { "epoch": 3.666044776119403, "grad_norm": 0.20040739732403956, "learning_rate": 1.3203360573985394e-05, "loss": 0.4106, "step": 1965 }, { "epoch": 3.675373134328358, "grad_norm": 0.2035890353096807, "learning_rate": 1.3096441578545544e-05, "loss": 0.4155, "step": 1970 }, { "epoch": 3.6847014925373136, "grad_norm": 0.20055217575181392, "learning_rate": 1.2990070849183678e-05, "loss": 0.4154, "step": 1975 }, { "epoch": 3.6940298507462686, "grad_norm": 0.1937140569002323, "learning_rate": 1.2884252434861938e-05, "loss": 0.4138, "step": 1980 }, { "epoch": 3.703358208955224, "grad_norm": 0.19376581626456674, "learning_rate": 1.2778990363518785e-05, "loss": 0.416, "step": 1985 }, { "epoch": 3.7126865671641793, "grad_norm": 0.18689539326809768, "learning_rate": 1.2674288641915688e-05, "loss": 0.4211, "step": 1990 }, { "epoch": 3.7220149253731343, "grad_norm": 0.20291920044726985, "learning_rate": 1.2570151255484639e-05, "loss": 0.4215, "step": 1995 }, { "epoch": 3.7313432835820897, "grad_norm": 0.197121530249932, "learning_rate": 1.246658216817639e-05, "loss": 0.4188, "step": 2000 }, { "epoch": 3.7406716417910446, "grad_norm": 0.19484233400429338, "learning_rate": 1.2363585322309615e-05, "loss": 0.4152, "step": 2005 }, { "epoch": 3.75, "grad_norm": 0.18314388153834107, "learning_rate": 1.2261164638420832e-05, "loss": 0.4127, "step": 2010 }, { "epoch": 3.7593283582089554, "grad_norm": 0.1928578184749698, "learning_rate": 1.2159324015115148e-05, "loss": 0.4244, "step": 2015 }, { "epoch": 3.7686567164179103, "grad_norm": 0.19076095988097325, "learning_rate": 1.205806732891793e-05, "loss": 0.4207, "step": 2020 }, { "epoch": 3.7779850746268657, "grad_norm": 0.19140574749674424, "learning_rate": 1.195739843412713e-05, "loss": 0.409, "step": 2025 }, { "epoch": 3.7873134328358207, "grad_norm": 0.19497496077538204, "learning_rate": 1.1857321162666692e-05, "loss": 0.4166, "step": 2030 }, { "epoch": 3.796641791044776, "grad_norm": 0.19377090643067832, "learning_rate": 1.1757839323940616e-05, "loss": 0.4211, "step": 2035 }, { "epoch": 3.8059701492537314, "grad_norm": 0.18375232614881823, "learning_rate": 1.1658956704687974e-05, "loss": 0.4181, "step": 2040 }, { "epoch": 3.8152985074626864, "grad_norm": 0.18812015136554003, "learning_rate": 1.15606770688388e-05, "loss": 0.4206, "step": 2045 }, { "epoch": 3.824626865671642, "grad_norm": 0.20056978856825183, "learning_rate": 1.1463004157370735e-05, "loss": 0.413, "step": 2050 }, { "epoch": 3.833955223880597, "grad_norm": 0.1969184120705017, "learning_rate": 1.1365941688166747e-05, "loss": 0.4219, "step": 2055 }, { "epoch": 3.843283582089552, "grad_norm": 0.19739845902417966, "learning_rate": 1.1269493355873498e-05, "loss": 0.4201, "step": 2060 }, { "epoch": 3.8526119402985075, "grad_norm": 0.18735404838684347, "learning_rate": 1.1173662831760798e-05, "loss": 0.4142, "step": 2065 }, { "epoch": 3.861940298507463, "grad_norm": 0.18832526355624762, "learning_rate": 1.1078453763581776e-05, "loss": 0.4226, "step": 2070 }, { "epoch": 3.871268656716418, "grad_norm": 0.19825777163511238, "learning_rate": 1.0983869775434091e-05, "loss": 0.4176, "step": 2075 }, { "epoch": 3.8805970149253732, "grad_norm": 0.18471844603333287, "learning_rate": 1.0889914467621986e-05, "loss": 0.4155, "step": 2080 }, { "epoch": 3.8899253731343286, "grad_norm": 0.1818199375077745, "learning_rate": 1.0796591416519192e-05, "loss": 0.4112, "step": 2085 }, { "epoch": 3.8992537313432836, "grad_norm": 0.18968657040318393, "learning_rate": 1.0703904174432836e-05, "loss": 0.424, "step": 2090 }, { "epoch": 3.908582089552239, "grad_norm": 0.19419283521520905, "learning_rate": 1.0611856269468203e-05, "loss": 0.4308, "step": 2095 }, { "epoch": 3.917910447761194, "grad_norm": 0.18460544977682863, "learning_rate": 1.052045120539447e-05, "loss": 0.4087, "step": 2100 }, { "epoch": 3.9272388059701493, "grad_norm": 0.19066104811794077, "learning_rate": 1.0429692461511298e-05, "loss": 0.4353, "step": 2105 }, { "epoch": 3.9365671641791042, "grad_norm": 0.1858966466664562, "learning_rate": 1.033958349251641e-05, "loss": 0.4134, "step": 2110 }, { "epoch": 3.9458955223880596, "grad_norm": 0.1834690726655155, "learning_rate": 1.0250127728374098e-05, "loss": 0.4275, "step": 2115 }, { "epoch": 3.955223880597015, "grad_norm": 0.18458040399219824, "learning_rate": 1.0161328574184645e-05, "loss": 0.4197, "step": 2120 }, { "epoch": 3.96455223880597, "grad_norm": 0.1905765281858502, "learning_rate": 1.0073189410054742e-05, "loss": 0.42, "step": 2125 }, { "epoch": 3.9738805970149254, "grad_norm": 0.19075679225184886, "learning_rate": 9.98571359096878e-06, "loss": 0.4136, "step": 2130 }, { "epoch": 3.9832089552238807, "grad_norm": 0.21521244566377756, "learning_rate": 9.898904446661188e-06, "loss": 0.4184, "step": 2135 }, { "epoch": 3.9925373134328357, "grad_norm": 0.19231538469899445, "learning_rate": 9.812765281489655e-06, "loss": 0.417, "step": 2140 }, { "epoch": 4.001865671641791, "grad_norm": 0.23764579071036102, "learning_rate": 9.72729937430936e-06, "loss": 0.4137, "step": 2145 }, { "epoch": 4.0111940298507465, "grad_norm": 0.22077296852667977, "learning_rate": 9.64250997834819e-06, "loss": 0.3947, "step": 2150 }, { "epoch": 4.020522388059701, "grad_norm": 0.19995137060771145, "learning_rate": 9.558400321082863e-06, "loss": 0.3901, "step": 2155 }, { "epoch": 4.029850746268656, "grad_norm": 0.18919387173894617, "learning_rate": 9.474973604116112e-06, "loss": 0.3996, "step": 2160 }, { "epoch": 4.039179104477612, "grad_norm": 0.2029940464220184, "learning_rate": 9.39223300305479e-06, "loss": 0.3961, "step": 2165 }, { "epoch": 4.048507462686567, "grad_norm": 0.19527296735792213, "learning_rate": 9.310181667389003e-06, "loss": 0.4012, "step": 2170 }, { "epoch": 4.057835820895522, "grad_norm": 0.19118067785233614, "learning_rate": 9.22882272037225e-06, "loss": 0.3939, "step": 2175 }, { "epoch": 4.067164179104478, "grad_norm": 0.1998777727658948, "learning_rate": 9.148159258902488e-06, "loss": 0.4046, "step": 2180 }, { "epoch": 4.076492537313433, "grad_norm": 0.20126009230909833, "learning_rate": 9.068194353404288e-06, "loss": 0.4004, "step": 2185 }, { "epoch": 4.085820895522388, "grad_norm": 0.2006709146837958, "learning_rate": 8.98893104771194e-06, "loss": 0.4002, "step": 2190 }, { "epoch": 4.095149253731344, "grad_norm": 0.1836797527594963, "learning_rate": 8.910372358953614e-06, "loss": 0.3929, "step": 2195 }, { "epoch": 4.104477611940299, "grad_norm": 0.20182018931151088, "learning_rate": 8.83252127743649e-06, "loss": 0.3942, "step": 2200 }, { "epoch": 4.1138059701492535, "grad_norm": 0.18897429740821142, "learning_rate": 8.755380766532945e-06, "loss": 0.4009, "step": 2205 }, { "epoch": 4.123134328358209, "grad_norm": 0.215464777285795, "learning_rate": 8.678953762567739e-06, "loss": 0.3954, "step": 2210 }, { "epoch": 4.132462686567164, "grad_norm": 0.18926511475861385, "learning_rate": 8.60324317470627e-06, "loss": 0.3951, "step": 2215 }, { "epoch": 4.141791044776119, "grad_norm": 0.19185247411504144, "learning_rate": 8.528251884843829e-06, "loss": 0.3955, "step": 2220 }, { "epoch": 4.151119402985074, "grad_norm": 0.18931879715852823, "learning_rate": 8.453982747495881e-06, "loss": 0.3985, "step": 2225 }, { "epoch": 4.16044776119403, "grad_norm": 0.19281445708909953, "learning_rate": 8.380438589689438e-06, "loss": 0.3969, "step": 2230 }, { "epoch": 4.169776119402985, "grad_norm": 0.18620908730115623, "learning_rate": 8.307622210855425e-06, "loss": 0.3955, "step": 2235 }, { "epoch": 4.17910447761194, "grad_norm": 0.1923341221356006, "learning_rate": 8.235536382722133e-06, "loss": 0.3897, "step": 2240 }, { "epoch": 4.188432835820896, "grad_norm": 0.19404291720314143, "learning_rate": 8.164183849209741e-06, "loss": 0.3927, "step": 2245 }, { "epoch": 4.197761194029851, "grad_norm": 0.1873703856589884, "learning_rate": 8.09356732632579e-06, "loss": 0.397, "step": 2250 }, { "epoch": 4.207089552238806, "grad_norm": 0.1869764692256595, "learning_rate": 8.023689502061897e-06, "loss": 0.3857, "step": 2255 }, { "epoch": 4.2164179104477615, "grad_norm": 0.19104839845209523, "learning_rate": 7.95455303629137e-06, "loss": 0.397, "step": 2260 }, { "epoch": 4.225746268656716, "grad_norm": 0.19551160510570317, "learning_rate": 7.886160560667984e-06, "loss": 0.4005, "step": 2265 }, { "epoch": 4.235074626865671, "grad_norm": 0.19059980614409353, "learning_rate": 7.818514678525822e-06, "loss": 0.3998, "step": 2270 }, { "epoch": 4.244402985074627, "grad_norm": 0.190590143847765, "learning_rate": 7.751617964780131e-06, "loss": 0.3982, "step": 2275 }, { "epoch": 4.253731343283582, "grad_norm": 0.19121464499994476, "learning_rate": 7.68547296582938e-06, "loss": 0.4016, "step": 2280 }, { "epoch": 4.263059701492537, "grad_norm": 0.19416409852179006, "learning_rate": 7.620082199458269e-06, "loss": 0.3969, "step": 2285 }, { "epoch": 4.272388059701493, "grad_norm": 0.18504162932439458, "learning_rate": 7.5554481547419395e-06, "loss": 0.3979, "step": 2290 }, { "epoch": 4.281716417910448, "grad_norm": 0.1873887263641866, "learning_rate": 7.491573291951176e-06, "loss": 0.3914, "step": 2295 }, { "epoch": 4.291044776119403, "grad_norm": 0.19132014281853718, "learning_rate": 7.4284600424588045e-06, "loss": 0.3979, "step": 2300 }, { "epoch": 4.300373134328359, "grad_norm": 0.20737253154898544, "learning_rate": 7.366110808647128e-06, "loss": 0.394, "step": 2305 }, { "epoch": 4.309701492537314, "grad_norm": 0.18768821182896908, "learning_rate": 7.304527963816472e-06, "loss": 0.3892, "step": 2310 }, { "epoch": 4.3190298507462686, "grad_norm": 0.19110146287380622, "learning_rate": 7.243713852094848e-06, "loss": 0.3969, "step": 2315 }, { "epoch": 4.3283582089552235, "grad_norm": 0.18612036290045256, "learning_rate": 7.183670788348726e-06, "loss": 0.3932, "step": 2320 }, { "epoch": 4.337686567164179, "grad_norm": 0.19235323208325508, "learning_rate": 7.124401058094938e-06, "loss": 0.3956, "step": 2325 }, { "epoch": 4.347014925373134, "grad_norm": 0.18801329231182345, "learning_rate": 7.0659069174136544e-06, "loss": 0.3964, "step": 2330 }, { "epoch": 4.356343283582089, "grad_norm": 0.18218164276313126, "learning_rate": 7.008190592862514e-06, "loss": 0.3958, "step": 2335 }, { "epoch": 4.365671641791045, "grad_norm": 0.1893862135453063, "learning_rate": 6.951254281391881e-06, "loss": 0.3962, "step": 2340 }, { "epoch": 4.375, "grad_norm": 0.1938741886111355, "learning_rate": 6.8951001502612065e-06, "loss": 0.3933, "step": 2345 }, { "epoch": 4.384328358208955, "grad_norm": 0.18643222876475482, "learning_rate": 6.839730336956554e-06, "loss": 0.3906, "step": 2350 }, { "epoch": 4.393656716417911, "grad_norm": 0.20334210483675225, "learning_rate": 6.785146949109206e-06, "loss": 0.3989, "step": 2355 }, { "epoch": 4.402985074626866, "grad_norm": 0.1859221546806742, "learning_rate": 6.7313520644154555e-06, "loss": 0.3952, "step": 2360 }, { "epoch": 4.412313432835821, "grad_norm": 0.1903910363811893, "learning_rate": 6.6783477305575215e-06, "loss": 0.3956, "step": 2365 }, { "epoch": 4.4216417910447765, "grad_norm": 0.19107457289214702, "learning_rate": 6.626135965125597e-06, "loss": 0.3926, "step": 2370 }, { "epoch": 4.4309701492537314, "grad_norm": 0.19503859490092373, "learning_rate": 6.574718755541061e-06, "loss": 0.3963, "step": 2375 }, { "epoch": 4.440298507462686, "grad_norm": 0.1923073695499237, "learning_rate": 6.52409805898081e-06, "loss": 0.3968, "step": 2380 }, { "epoch": 4.449626865671641, "grad_norm": 0.2019733363431846, "learning_rate": 6.474275802302776e-06, "loss": 0.4007, "step": 2385 }, { "epoch": 4.458955223880597, "grad_norm": 0.18114651913306537, "learning_rate": 6.425253881972573e-06, "loss": 0.3915, "step": 2390 }, { "epoch": 4.468283582089552, "grad_norm": 0.1914389773440768, "learning_rate": 6.377034163991308e-06, "loss": 0.3894, "step": 2395 }, { "epoch": 4.477611940298507, "grad_norm": 0.1931171988673207, "learning_rate": 6.329618483824559e-06, "loss": 0.3987, "step": 2400 }, { "epoch": 4.486940298507463, "grad_norm": 0.1803270530467132, "learning_rate": 6.283008646332507e-06, "loss": 0.4025, "step": 2405 }, { "epoch": 4.496268656716418, "grad_norm": 0.1855864433138599, "learning_rate": 6.237206425701223e-06, "loss": 0.397, "step": 2410 }, { "epoch": 4.505597014925373, "grad_norm": 0.18369191563362094, "learning_rate": 6.192213565375147e-06, "loss": 0.3945, "step": 2415 }, { "epoch": 4.514925373134329, "grad_norm": 0.18504774881559224, "learning_rate": 6.1480317779907285e-06, "loss": 0.4019, "step": 2420 }, { "epoch": 4.524253731343284, "grad_norm": 0.18295433580201753, "learning_rate": 6.104662745311222e-06, "loss": 0.3944, "step": 2425 }, { "epoch": 4.5335820895522385, "grad_norm": 0.18737860175133852, "learning_rate": 6.062108118162669e-06, "loss": 0.3956, "step": 2430 }, { "epoch": 4.542910447761194, "grad_norm": 0.1849466086995583, "learning_rate": 6.020369516371085e-06, "loss": 0.3926, "step": 2435 }, { "epoch": 4.552238805970149, "grad_norm": 0.1922247936579645, "learning_rate": 5.9794485287007696e-06, "loss": 0.3997, "step": 2440 }, { "epoch": 4.561567164179104, "grad_norm": 0.1801232979605451, "learning_rate": 5.93934671279386e-06, "loss": 0.4003, "step": 2445 }, { "epoch": 4.57089552238806, "grad_norm": 0.18612855923352808, "learning_rate": 5.900065595111014e-06, "loss": 0.3956, "step": 2450 }, { "epoch": 4.580223880597015, "grad_norm": 0.18551219007013386, "learning_rate": 5.8616066708733255e-06, "loss": 0.3929, "step": 2455 }, { "epoch": 4.58955223880597, "grad_norm": 0.18558667884140806, "learning_rate": 5.8239714040053936e-06, "loss": 0.4053, "step": 2460 }, { "epoch": 4.598880597014926, "grad_norm": 0.188182582017194, "learning_rate": 5.787161227079613e-06, "loss": 0.3948, "step": 2465 }, { "epoch": 4.608208955223881, "grad_norm": 0.19034117674308992, "learning_rate": 5.7511775412616415e-06, "loss": 0.3967, "step": 2470 }, { "epoch": 4.617537313432836, "grad_norm": 0.18441627836193264, "learning_rate": 5.716021716257047e-06, "loss": 0.3934, "step": 2475 }, { "epoch": 4.6268656716417915, "grad_norm": 0.17892281233510524, "learning_rate": 5.6816950902592005e-06, "loss": 0.3936, "step": 2480 }, { "epoch": 4.6361940298507465, "grad_norm": 0.1814398586971551, "learning_rate": 5.648198969898311e-06, "loss": 0.3896, "step": 2485 }, { "epoch": 4.645522388059701, "grad_norm": 0.1803296123167427, "learning_rate": 5.615534630191708e-06, "loss": 0.3906, "step": 2490 }, { "epoch": 4.654850746268656, "grad_norm": 0.18772408978252195, "learning_rate": 5.583703314495294e-06, "loss": 0.3944, "step": 2495 }, { "epoch": 4.664179104477612, "grad_norm": 0.17423266747974542, "learning_rate": 5.55270623445622e-06, "loss": 0.3959, "step": 2500 }, { "epoch": 4.673507462686567, "grad_norm": 0.19208509541966526, "learning_rate": 5.522544569966786e-06, "loss": 0.3992, "step": 2505 }, { "epoch": 4.682835820895522, "grad_norm": 0.17816206285593256, "learning_rate": 5.4932194691194905e-06, "loss": 0.393, "step": 2510 }, { "epoch": 4.692164179104478, "grad_norm": 0.19643118615692984, "learning_rate": 5.464732048163365e-06, "loss": 0.3992, "step": 2515 }, { "epoch": 4.701492537313433, "grad_norm": 0.1869483953998433, "learning_rate": 5.437083391461452e-06, "loss": 0.4038, "step": 2520 }, { "epoch": 4.710820895522388, "grad_norm": 0.17995004309463292, "learning_rate": 5.410274551449559e-06, "loss": 0.3935, "step": 2525 }, { "epoch": 4.720149253731344, "grad_norm": 0.18401195549507557, "learning_rate": 5.384306548596178e-06, "loss": 0.4001, "step": 2530 }, { "epoch": 4.729477611940299, "grad_norm": 0.18100504867763145, "learning_rate": 5.3591803713636545e-06, "loss": 0.3971, "step": 2535 }, { "epoch": 4.7388059701492535, "grad_norm": 0.18538575657793377, "learning_rate": 5.3348969761705446e-06, "loss": 0.3987, "step": 2540 }, { "epoch": 4.7481343283582085, "grad_norm": 0.17538072571886312, "learning_rate": 5.311457287355232e-06, "loss": 0.3943, "step": 2545 }, { "epoch": 4.757462686567164, "grad_norm": 0.19437045031923975, "learning_rate": 5.288862197140726e-06, "loss": 0.3952, "step": 2550 }, { "epoch": 4.766791044776119, "grad_norm": 0.19306536989824527, "learning_rate": 5.267112565600707e-06, "loss": 0.3981, "step": 2555 }, { "epoch": 4.776119402985074, "grad_norm": 0.18890132843011978, "learning_rate": 5.2462092206267864e-06, "loss": 0.3974, "step": 2560 }, { "epoch": 4.78544776119403, "grad_norm": 0.19250364058928918, "learning_rate": 5.2261529578969905e-06, "loss": 0.3924, "step": 2565 }, { "epoch": 4.794776119402985, "grad_norm": 0.1901735297233097, "learning_rate": 5.206944540845476e-06, "loss": 0.3964, "step": 2570 }, { "epoch": 4.80410447761194, "grad_norm": 0.192259907605008, "learning_rate": 5.188584700633478e-06, "loss": 0.402, "step": 2575 }, { "epoch": 4.813432835820896, "grad_norm": 0.18853792350039372, "learning_rate": 5.171074136121461e-06, "loss": 0.3911, "step": 2580 }, { "epoch": 4.822761194029851, "grad_norm": 0.18119584267146255, "learning_rate": 5.154413513842533e-06, "loss": 0.3937, "step": 2585 }, { "epoch": 4.832089552238806, "grad_norm": 0.1854522263583517, "learning_rate": 5.138603467977062e-06, "loss": 0.3978, "step": 2590 }, { "epoch": 4.8414179104477615, "grad_norm": 0.1840258548367489, "learning_rate": 5.123644600328549e-06, "loss": 0.4001, "step": 2595 }, { "epoch": 4.850746268656716, "grad_norm": 0.18899435178300328, "learning_rate": 5.1095374803007115e-06, "loss": 0.3952, "step": 2600 }, { "epoch": 4.860074626865671, "grad_norm": 0.1798615582375137, "learning_rate": 5.096282644875807e-06, "loss": 0.3935, "step": 2605 }, { "epoch": 4.869402985074627, "grad_norm": 0.19332133518853487, "learning_rate": 5.083880598594204e-06, "loss": 0.3952, "step": 2610 }, { "epoch": 4.878731343283582, "grad_norm": 0.18162156658223105, "learning_rate": 5.072331813535166e-06, "loss": 0.3957, "step": 2615 }, { "epoch": 4.888059701492537, "grad_norm": 0.18338363322677612, "learning_rate": 5.06163672929889e-06, "loss": 0.3949, "step": 2620 }, { "epoch": 4.897388059701493, "grad_norm": 0.19425192398377006, "learning_rate": 5.051795752989764e-06, "loss": 0.394, "step": 2625 }, { "epoch": 4.906716417910448, "grad_norm": 0.19328091513985823, "learning_rate": 5.042809259200885e-06, "loss": 0.3948, "step": 2630 }, { "epoch": 4.916044776119403, "grad_norm": 0.18686569926860114, "learning_rate": 5.034677589999783e-06, "loss": 0.3976, "step": 2635 }, { "epoch": 4.925373134328359, "grad_norm": 0.18543047058499687, "learning_rate": 5.02740105491541e-06, "loss": 0.3901, "step": 2640 }, { "epoch": 4.934701492537314, "grad_norm": 0.18776399386716744, "learning_rate": 5.020979930926365e-06, "loss": 0.3961, "step": 2645 }, { "epoch": 4.9440298507462686, "grad_norm": 0.1821670432036764, "learning_rate": 5.0154144624503365e-06, "loss": 0.3975, "step": 2650 }, { "epoch": 4.9533582089552235, "grad_norm": 0.18395392862144658, "learning_rate": 5.010704861334803e-06, "loss": 0.4005, "step": 2655 }, { "epoch": 4.962686567164179, "grad_norm": 0.17694512276923657, "learning_rate": 5.0068513068489765e-06, "loss": 0.4008, "step": 2660 }, { "epoch": 4.972014925373134, "grad_norm": 0.18407085622764693, "learning_rate": 5.003853945676969e-06, "loss": 0.3932, "step": 2665 }, { "epoch": 4.981343283582089, "grad_norm": 0.19272615608419674, "learning_rate": 5.001712891912217e-06, "loss": 0.3953, "step": 2670 }, { "epoch": 4.990671641791045, "grad_norm": 0.18589434222264428, "learning_rate": 5.000428227053131e-06, "loss": 0.397, "step": 2675 }, { "epoch": 5.0, "grad_norm": 0.18106809362994408, "learning_rate": 5e-06, "loss": 0.3946, "step": 2680 }, { "epoch": 5.0, "step": 2680, "total_flos": 2443106246983680.0, "train_loss": 0.46552796181458145, "train_runtime": 31072.493, "train_samples_per_second": 5.52, "train_steps_per_second": 0.086 } ], "logging_steps": 5, "max_steps": 2680, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2443106246983680.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }