{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 5000.0, "global_step": 14355, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002089864158829676, "grad_norm": 26.04354871405688, "learning_rate": 0.0, "loss": 3.4946, "step": 1 }, { "epoch": 0.0004179728317659352, "grad_norm": 28.746915448613994, "learning_rate": 2.320185614849188e-08, "loss": 3.8611, "step": 2 }, { "epoch": 0.0006269592476489029, "grad_norm": 28.91247736122108, "learning_rate": 4.640371229698376e-08, "loss": 3.7896, "step": 3 }, { "epoch": 0.0008359456635318704, "grad_norm": 26.41019194536408, "learning_rate": 6.960556844547565e-08, "loss": 3.5496, "step": 4 }, { "epoch": 0.0010449320794148381, "grad_norm": 30.629259499489457, "learning_rate": 9.280742459396752e-08, "loss": 3.9267, "step": 5 }, { "epoch": 0.0012539184952978057, "grad_norm": 26.72842850780666, "learning_rate": 1.160092807424594e-07, "loss": 3.48, "step": 6 }, { "epoch": 0.0014629049111807733, "grad_norm": 27.43311944817558, "learning_rate": 1.392111368909513e-07, "loss": 3.5927, "step": 7 }, { "epoch": 0.001671891327063741, "grad_norm": 30.117466639902982, "learning_rate": 1.6241299303944316e-07, "loss": 3.7743, "step": 8 }, { "epoch": 0.0018808777429467085, "grad_norm": 26.362512461089906, "learning_rate": 1.8561484918793503e-07, "loss": 3.507, "step": 9 }, { "epoch": 0.0020898641588296763, "grad_norm": 25.145984694058356, "learning_rate": 2.0881670533642693e-07, "loss": 3.4493, "step": 10 }, { "epoch": 0.0022988505747126436, "grad_norm": 27.521304880067042, "learning_rate": 2.320185614849188e-07, "loss": 3.6902, "step": 11 }, { "epoch": 0.0025078369905956114, "grad_norm": 22.707865128671763, "learning_rate": 2.5522041763341066e-07, "loss": 3.0144, "step": 12 }, { "epoch": 0.002716823406478579, "grad_norm": 34.20409022650699, "learning_rate": 2.784222737819026e-07, "loss": 4.1236, "step": 13 }, { "epoch": 0.0029258098223615466, "grad_norm": 26.142510626150642, "learning_rate": 3.0162412993039446e-07, "loss": 3.4106, "step": 14 }, { "epoch": 0.003134796238244514, "grad_norm": 26.044066528829013, "learning_rate": 3.248259860788863e-07, "loss": 3.3076, "step": 15 }, { "epoch": 0.003343782654127482, "grad_norm": 27.409549424515642, "learning_rate": 3.480278422273782e-07, "loss": 3.4983, "step": 16 }, { "epoch": 0.003552769070010449, "grad_norm": 29.92799757187842, "learning_rate": 3.7122969837587006e-07, "loss": 3.9385, "step": 17 }, { "epoch": 0.003761755485893417, "grad_norm": 34.94380222738753, "learning_rate": 3.94431554524362e-07, "loss": 4.3159, "step": 18 }, { "epoch": 0.003970741901776384, "grad_norm": 26.243395358798974, "learning_rate": 4.1763341067285385e-07, "loss": 3.4002, "step": 19 }, { "epoch": 0.0041797283176593526, "grad_norm": 25.300506790292992, "learning_rate": 4.408352668213457e-07, "loss": 3.1789, "step": 20 }, { "epoch": 0.00438871473354232, "grad_norm": 31.251277681960538, "learning_rate": 4.640371229698376e-07, "loss": 3.7398, "step": 21 }, { "epoch": 0.004597701149425287, "grad_norm": 22.96395528890351, "learning_rate": 4.872389791183295e-07, "loss": 2.6681, "step": 22 }, { "epoch": 0.004806687565308255, "grad_norm": 31.229582520182504, "learning_rate": 5.104408352668213e-07, "loss": 3.6872, "step": 23 }, { "epoch": 0.005015673981191223, "grad_norm": 26.286994744748284, "learning_rate": 5.336426914153132e-07, "loss": 3.3635, "step": 24 }, { "epoch": 0.00522466039707419, "grad_norm": 26.644410207408946, "learning_rate": 5.568445475638052e-07, "loss": 3.2687, "step": 25 }, { "epoch": 0.005433646812957158, "grad_norm": 33.660163062735045, "learning_rate": 5.80046403712297e-07, "loss": 3.9042, "step": 26 }, { "epoch": 0.005642633228840125, "grad_norm": 31.107822197497125, "learning_rate": 6.032482598607889e-07, "loss": 3.0778, "step": 27 }, { "epoch": 0.005851619644723093, "grad_norm": 20.760695490326864, "learning_rate": 6.264501160092808e-07, "loss": 2.822, "step": 28 }, { "epoch": 0.006060606060606061, "grad_norm": 20.24923358140589, "learning_rate": 6.496519721577726e-07, "loss": 2.7921, "step": 29 }, { "epoch": 0.006269592476489028, "grad_norm": 19.398403732399508, "learning_rate": 6.728538283062645e-07, "loss": 2.6184, "step": 30 }, { "epoch": 0.006478578892371996, "grad_norm": 20.546457221891128, "learning_rate": 6.960556844547564e-07, "loss": 2.8997, "step": 31 }, { "epoch": 0.006687565308254964, "grad_norm": 19.791691926726585, "learning_rate": 7.192575406032483e-07, "loss": 2.8201, "step": 32 }, { "epoch": 0.006896551724137931, "grad_norm": 21.491224018891945, "learning_rate": 7.424593967517401e-07, "loss": 3.0673, "step": 33 }, { "epoch": 0.007105538140020898, "grad_norm": 17.389747455379027, "learning_rate": 7.656612529002321e-07, "loss": 2.5957, "step": 34 }, { "epoch": 0.0073145245559038665, "grad_norm": 13.531820913134112, "learning_rate": 7.88863109048724e-07, "loss": 2.2027, "step": 35 }, { "epoch": 0.007523510971786834, "grad_norm": 17.6047633125288, "learning_rate": 8.120649651972158e-07, "loss": 2.7517, "step": 36 }, { "epoch": 0.007732497387669801, "grad_norm": 15.575339652026138, "learning_rate": 8.352668213457077e-07, "loss": 2.4121, "step": 37 }, { "epoch": 0.007941483803552769, "grad_norm": 15.988771937774576, "learning_rate": 8.584686774941996e-07, "loss": 2.6718, "step": 38 }, { "epoch": 0.008150470219435737, "grad_norm": 25.104338348111252, "learning_rate": 8.816705336426914e-07, "loss": 2.4199, "step": 39 }, { "epoch": 0.008359456635318705, "grad_norm": 29.154969834598727, "learning_rate": 9.048723897911833e-07, "loss": 2.2793, "step": 40 }, { "epoch": 0.008568443051201672, "grad_norm": 27.44992259247113, "learning_rate": 9.280742459396752e-07, "loss": 1.9422, "step": 41 }, { "epoch": 0.00877742946708464, "grad_norm": 22.195601774274174, "learning_rate": 9.51276102088167e-07, "loss": 1.8738, "step": 42 }, { "epoch": 0.008986415882967606, "grad_norm": 16.829338629014043, "learning_rate": 9.74477958236659e-07, "loss": 2.0189, "step": 43 }, { "epoch": 0.009195402298850575, "grad_norm": 13.238288977804332, "learning_rate": 9.976798143851508e-07, "loss": 1.826, "step": 44 }, { "epoch": 0.009404388714733543, "grad_norm": 13.216068208360863, "learning_rate": 1.0208816705336427e-06, "loss": 1.7937, "step": 45 }, { "epoch": 0.00961337513061651, "grad_norm": 10.038246783727026, "learning_rate": 1.0440835266821345e-06, "loss": 1.4159, "step": 46 }, { "epoch": 0.009822361546499478, "grad_norm": 10.188061508697713, "learning_rate": 1.0672853828306264e-06, "loss": 1.7384, "step": 47 }, { "epoch": 0.010031347962382446, "grad_norm": 8.879124010248738, "learning_rate": 1.0904872389791185e-06, "loss": 1.5025, "step": 48 }, { "epoch": 0.010240334378265412, "grad_norm": 9.840498779872592, "learning_rate": 1.1136890951276103e-06, "loss": 1.6163, "step": 49 }, { "epoch": 0.01044932079414838, "grad_norm": 8.622729504097698, "learning_rate": 1.1368909512761022e-06, "loss": 1.4554, "step": 50 }, { "epoch": 0.010658307210031349, "grad_norm": 9.753179595418983, "learning_rate": 1.160092807424594e-06, "loss": 1.4598, "step": 51 }, { "epoch": 0.010867293625914315, "grad_norm": 10.770306848447753, "learning_rate": 1.183294663573086e-06, "loss": 1.6523, "step": 52 }, { "epoch": 0.011076280041797283, "grad_norm": 9.525135360063981, "learning_rate": 1.2064965197215778e-06, "loss": 1.5662, "step": 53 }, { "epoch": 0.01128526645768025, "grad_norm": 8.86554993810681, "learning_rate": 1.2296983758700697e-06, "loss": 1.4493, "step": 54 }, { "epoch": 0.011494252873563218, "grad_norm": 8.321837659460861, "learning_rate": 1.2529002320185616e-06, "loss": 1.3614, "step": 55 }, { "epoch": 0.011703239289446186, "grad_norm": 10.687363351547086, "learning_rate": 1.2761020881670536e-06, "loss": 1.5476, "step": 56 }, { "epoch": 0.011912225705329153, "grad_norm": 8.002461933191155, "learning_rate": 1.2993039443155453e-06, "loss": 1.3249, "step": 57 }, { "epoch": 0.012121212121212121, "grad_norm": 7.948734473246525, "learning_rate": 1.3225058004640374e-06, "loss": 1.3004, "step": 58 }, { "epoch": 0.01233019853709509, "grad_norm": 8.56891420195788, "learning_rate": 1.345707656612529e-06, "loss": 1.2143, "step": 59 }, { "epoch": 0.012539184952978056, "grad_norm": 9.187216624585485, "learning_rate": 1.3689095127610211e-06, "loss": 1.3324, "step": 60 }, { "epoch": 0.012748171368861024, "grad_norm": 8.119320059461604, "learning_rate": 1.3921113689095128e-06, "loss": 1.198, "step": 61 }, { "epoch": 0.012957157784743992, "grad_norm": 7.116636206229549, "learning_rate": 1.4153132250580049e-06, "loss": 1.0495, "step": 62 }, { "epoch": 0.013166144200626959, "grad_norm": 6.426809635903418, "learning_rate": 1.4385150812064965e-06, "loss": 1.1075, "step": 63 }, { "epoch": 0.013375130616509927, "grad_norm": 6.770557154466463, "learning_rate": 1.4617169373549886e-06, "loss": 1.1705, "step": 64 }, { "epoch": 0.013584117032392894, "grad_norm": 7.680821494367459, "learning_rate": 1.4849187935034802e-06, "loss": 1.1061, "step": 65 }, { "epoch": 0.013793103448275862, "grad_norm": 7.09187038667355, "learning_rate": 1.5081206496519723e-06, "loss": 0.9788, "step": 66 }, { "epoch": 0.01400208986415883, "grad_norm": 6.164597361888483, "learning_rate": 1.5313225058004642e-06, "loss": 1.0314, "step": 67 }, { "epoch": 0.014211076280041797, "grad_norm": 7.403119210370693, "learning_rate": 1.554524361948956e-06, "loss": 0.9639, "step": 68 }, { "epoch": 0.014420062695924765, "grad_norm": 7.323342835189932, "learning_rate": 1.577726218097448e-06, "loss": 0.9581, "step": 69 }, { "epoch": 0.014629049111807733, "grad_norm": 4.698567820716078, "learning_rate": 1.6009280742459398e-06, "loss": 0.955, "step": 70 }, { "epoch": 0.0148380355276907, "grad_norm": 4.354052379056571, "learning_rate": 1.6241299303944317e-06, "loss": 0.9413, "step": 71 }, { "epoch": 0.015047021943573668, "grad_norm": 5.646741908677169, "learning_rate": 1.6473317865429235e-06, "loss": 0.9977, "step": 72 }, { "epoch": 0.015256008359456636, "grad_norm": 5.666562566601628, "learning_rate": 1.6705336426914154e-06, "loss": 0.9194, "step": 73 }, { "epoch": 0.015464994775339603, "grad_norm": 5.846527102981343, "learning_rate": 1.6937354988399075e-06, "loss": 0.9061, "step": 74 }, { "epoch": 0.01567398119122257, "grad_norm": 2.976525793062366, "learning_rate": 1.7169373549883992e-06, "loss": 0.6922, "step": 75 }, { "epoch": 0.015882967607105537, "grad_norm": 3.785773575716978, "learning_rate": 1.7401392111368912e-06, "loss": 0.8464, "step": 76 }, { "epoch": 0.016091954022988506, "grad_norm": 4.009422558277519, "learning_rate": 1.7633410672853829e-06, "loss": 0.8762, "step": 77 }, { "epoch": 0.016300940438871474, "grad_norm": 5.306852021628828, "learning_rate": 1.786542923433875e-06, "loss": 0.7014, "step": 78 }, { "epoch": 0.016509926854754442, "grad_norm": 4.360965973711443, "learning_rate": 1.8097447795823666e-06, "loss": 0.799, "step": 79 }, { "epoch": 0.01671891327063741, "grad_norm": 6.044069715847191, "learning_rate": 1.8329466357308587e-06, "loss": 0.7212, "step": 80 }, { "epoch": 0.016927899686520375, "grad_norm": 3.9613759775601705, "learning_rate": 1.8561484918793504e-06, "loss": 0.7821, "step": 81 }, { "epoch": 0.017136886102403343, "grad_norm": 3.89319967584578, "learning_rate": 1.8793503480278424e-06, "loss": 0.7292, "step": 82 }, { "epoch": 0.01734587251828631, "grad_norm": 5.421275601914175, "learning_rate": 1.902552204176334e-06, "loss": 0.7351, "step": 83 }, { "epoch": 0.01755485893416928, "grad_norm": 6.980417223012388, "learning_rate": 1.925754060324826e-06, "loss": 0.7649, "step": 84 }, { "epoch": 0.017763845350052248, "grad_norm": 5.787383365812194, "learning_rate": 1.948955916473318e-06, "loss": 0.7516, "step": 85 }, { "epoch": 0.017972831765935213, "grad_norm": 5.471609970186245, "learning_rate": 1.97215777262181e-06, "loss": 0.7225, "step": 86 }, { "epoch": 0.01818181818181818, "grad_norm": 6.495036338007681, "learning_rate": 1.9953596287703016e-06, "loss": 0.5893, "step": 87 }, { "epoch": 0.01839080459770115, "grad_norm": 6.64177044247243, "learning_rate": 2.0185614849187937e-06, "loss": 0.5938, "step": 88 }, { "epoch": 0.018599791013584117, "grad_norm": 4.723632608219729, "learning_rate": 2.0417633410672853e-06, "loss": 0.5003, "step": 89 }, { "epoch": 0.018808777429467086, "grad_norm": 4.628317470244376, "learning_rate": 2.0649651972157774e-06, "loss": 0.4942, "step": 90 }, { "epoch": 0.019017763845350054, "grad_norm": 3.131038615062829, "learning_rate": 2.088167053364269e-06, "loss": 0.4503, "step": 91 }, { "epoch": 0.01922675026123302, "grad_norm": 4.813143506894191, "learning_rate": 2.111368909512761e-06, "loss": 0.4876, "step": 92 }, { "epoch": 0.019435736677115987, "grad_norm": 5.463843489190518, "learning_rate": 2.134570765661253e-06, "loss": 0.4706, "step": 93 }, { "epoch": 0.019644723092998955, "grad_norm": 5.022649128107101, "learning_rate": 2.157772621809745e-06, "loss": 0.4637, "step": 94 }, { "epoch": 0.019853709508881923, "grad_norm": 2.7161103853975104, "learning_rate": 2.180974477958237e-06, "loss": 0.4546, "step": 95 }, { "epoch": 0.02006269592476489, "grad_norm": 3.794051125223398, "learning_rate": 2.2041763341067286e-06, "loss": 0.429, "step": 96 }, { "epoch": 0.020271682340647856, "grad_norm": 3.727615625449969, "learning_rate": 2.2273781902552207e-06, "loss": 0.4774, "step": 97 }, { "epoch": 0.020480668756530825, "grad_norm": 3.123407031896952, "learning_rate": 2.2505800464037123e-06, "loss": 0.4369, "step": 98 }, { "epoch": 0.020689655172413793, "grad_norm": 2.8281698507377233, "learning_rate": 2.2737819025522044e-06, "loss": 0.4256, "step": 99 }, { "epoch": 0.02089864158829676, "grad_norm": 3.281752228624019, "learning_rate": 2.296983758700696e-06, "loss": 0.4553, "step": 100 }, { "epoch": 0.02110762800417973, "grad_norm": 2.724929139008782, "learning_rate": 2.320185614849188e-06, "loss": 0.4029, "step": 101 }, { "epoch": 0.021316614420062698, "grad_norm": 2.9118929198432335, "learning_rate": 2.3433874709976802e-06, "loss": 0.4149, "step": 102 }, { "epoch": 0.021525600835945662, "grad_norm": 2.284944673461464, "learning_rate": 2.366589327146172e-06, "loss": 0.4203, "step": 103 }, { "epoch": 0.02173458725182863, "grad_norm": 2.212642721259014, "learning_rate": 2.389791183294664e-06, "loss": 0.4201, "step": 104 }, { "epoch": 0.0219435736677116, "grad_norm": 2.5974308388800234, "learning_rate": 2.4129930394431556e-06, "loss": 0.4033, "step": 105 }, { "epoch": 0.022152560083594567, "grad_norm": 3.054637989896991, "learning_rate": 2.4361948955916477e-06, "loss": 0.4143, "step": 106 }, { "epoch": 0.022361546499477535, "grad_norm": 3.299069839265495, "learning_rate": 2.4593967517401394e-06, "loss": 0.4359, "step": 107 }, { "epoch": 0.0225705329153605, "grad_norm": 3.061236746991298, "learning_rate": 2.4825986078886315e-06, "loss": 0.4272, "step": 108 }, { "epoch": 0.022779519331243468, "grad_norm": 2.797798434439955, "learning_rate": 2.505800464037123e-06, "loss": 0.3972, "step": 109 }, { "epoch": 0.022988505747126436, "grad_norm": 3.4263150546999284, "learning_rate": 2.529002320185615e-06, "loss": 0.3929, "step": 110 }, { "epoch": 0.023197492163009405, "grad_norm": 5.6627735313227445, "learning_rate": 2.5522041763341073e-06, "loss": 0.4388, "step": 111 }, { "epoch": 0.023406478578892373, "grad_norm": 4.6511044739134615, "learning_rate": 2.5754060324825985e-06, "loss": 0.395, "step": 112 }, { "epoch": 0.02361546499477534, "grad_norm": 2.941618515147156, "learning_rate": 2.5986078886310906e-06, "loss": 0.4306, "step": 113 }, { "epoch": 0.023824451410658306, "grad_norm": 3.7172296265798126, "learning_rate": 2.6218097447795827e-06, "loss": 0.3946, "step": 114 }, { "epoch": 0.024033437826541274, "grad_norm": 4.694141320997486, "learning_rate": 2.6450116009280748e-06, "loss": 0.3991, "step": 115 }, { "epoch": 0.024242424242424242, "grad_norm": 4.626682856343814, "learning_rate": 2.668213457076566e-06, "loss": 0.3952, "step": 116 }, { "epoch": 0.02445141065830721, "grad_norm": 2.9317501622501156, "learning_rate": 2.691415313225058e-06, "loss": 0.3865, "step": 117 }, { "epoch": 0.02466039707419018, "grad_norm": 4.551436250537547, "learning_rate": 2.71461716937355e-06, "loss": 0.3647, "step": 118 }, { "epoch": 0.024869383490073144, "grad_norm": 5.931824646442025, "learning_rate": 2.7378190255220422e-06, "loss": 0.3852, "step": 119 }, { "epoch": 0.025078369905956112, "grad_norm": 2.446488753020665, "learning_rate": 2.7610208816705335e-06, "loss": 0.4247, "step": 120 }, { "epoch": 0.02528735632183908, "grad_norm": 3.515303521402374, "learning_rate": 2.7842227378190255e-06, "loss": 0.3754, "step": 121 }, { "epoch": 0.02549634273772205, "grad_norm": 4.006934274786598, "learning_rate": 2.8074245939675176e-06, "loss": 0.4179, "step": 122 }, { "epoch": 0.025705329153605017, "grad_norm": 4.630560525101962, "learning_rate": 2.8306264501160097e-06, "loss": 0.375, "step": 123 }, { "epoch": 0.025914315569487985, "grad_norm": 2.659890463107089, "learning_rate": 2.853828306264501e-06, "loss": 0.3865, "step": 124 }, { "epoch": 0.02612330198537095, "grad_norm": 4.394846906113497, "learning_rate": 2.877030162412993e-06, "loss": 0.3824, "step": 125 }, { "epoch": 0.026332288401253918, "grad_norm": 3.5139934655554312, "learning_rate": 2.900232018561485e-06, "loss": 0.3854, "step": 126 }, { "epoch": 0.026541274817136886, "grad_norm": 2.5048770253379606, "learning_rate": 2.923433874709977e-06, "loss": 0.4096, "step": 127 }, { "epoch": 0.026750261233019854, "grad_norm": 2.022037625220975, "learning_rate": 2.946635730858469e-06, "loss": 0.3516, "step": 128 }, { "epoch": 0.026959247648902823, "grad_norm": 4.587335051808967, "learning_rate": 2.9698375870069605e-06, "loss": 0.3563, "step": 129 }, { "epoch": 0.027168234064785787, "grad_norm": 2.0871001309341337, "learning_rate": 2.9930394431554526e-06, "loss": 0.3542, "step": 130 }, { "epoch": 0.027377220480668756, "grad_norm": 2.315704516717328, "learning_rate": 3.0162412993039447e-06, "loss": 0.3792, "step": 131 }, { "epoch": 0.027586206896551724, "grad_norm": 3.6044591652719316, "learning_rate": 3.0394431554524363e-06, "loss": 0.3712, "step": 132 }, { "epoch": 0.027795193312434692, "grad_norm": 7.485028377721707, "learning_rate": 3.0626450116009284e-06, "loss": 0.3378, "step": 133 }, { "epoch": 0.02800417972831766, "grad_norm": 1.9496036721942793, "learning_rate": 3.08584686774942e-06, "loss": 0.3744, "step": 134 }, { "epoch": 0.02821316614420063, "grad_norm": 4.918928541736408, "learning_rate": 3.109048723897912e-06, "loss": 0.3776, "step": 135 }, { "epoch": 0.028422152560083593, "grad_norm": 4.519705243890723, "learning_rate": 3.132250580046404e-06, "loss": 0.3875, "step": 136 }, { "epoch": 0.02863113897596656, "grad_norm": 2.346229709899417, "learning_rate": 3.155452436194896e-06, "loss": 0.3443, "step": 137 }, { "epoch": 0.02884012539184953, "grad_norm": 2.4795615791506487, "learning_rate": 3.1786542923433875e-06, "loss": 0.3676, "step": 138 }, { "epoch": 0.029049111807732498, "grad_norm": 3.003979528951723, "learning_rate": 3.2018561484918796e-06, "loss": 0.3782, "step": 139 }, { "epoch": 0.029258098223615466, "grad_norm": 4.884140961143894, "learning_rate": 3.2250580046403713e-06, "loss": 0.3859, "step": 140 }, { "epoch": 0.02946708463949843, "grad_norm": 2.35591427477312, "learning_rate": 3.2482598607888633e-06, "loss": 0.3586, "step": 141 }, { "epoch": 0.0296760710553814, "grad_norm": 2.6301420159745716, "learning_rate": 3.2714617169373554e-06, "loss": 0.365, "step": 142 }, { "epoch": 0.029885057471264367, "grad_norm": 3.5663531431226083, "learning_rate": 3.294663573085847e-06, "loss": 0.3563, "step": 143 }, { "epoch": 0.030094043887147336, "grad_norm": 2.0353888314507023, "learning_rate": 3.3178654292343387e-06, "loss": 0.3615, "step": 144 }, { "epoch": 0.030303030303030304, "grad_norm": 2.817903069955643, "learning_rate": 3.341067285382831e-06, "loss": 0.351, "step": 145 }, { "epoch": 0.030512016718913272, "grad_norm": 3.353483254266229, "learning_rate": 3.364269141531323e-06, "loss": 0.3387, "step": 146 }, { "epoch": 0.030721003134796237, "grad_norm": 1.8088805129795715, "learning_rate": 3.387470997679815e-06, "loss": 0.3583, "step": 147 }, { "epoch": 0.030929989550679205, "grad_norm": 2.220020251302794, "learning_rate": 3.4106728538283066e-06, "loss": 0.3451, "step": 148 }, { "epoch": 0.031138975966562173, "grad_norm": 2.9372668682128205, "learning_rate": 3.4338747099767983e-06, "loss": 0.3714, "step": 149 }, { "epoch": 0.03134796238244514, "grad_norm": 2.5226740153567095, "learning_rate": 3.4570765661252904e-06, "loss": 0.3857, "step": 150 }, { "epoch": 0.031556948798328106, "grad_norm": 2.1775299722236587, "learning_rate": 3.4802784222737825e-06, "loss": 0.3728, "step": 151 }, { "epoch": 0.031765935214211075, "grad_norm": 3.2085503969625844, "learning_rate": 3.5034802784222745e-06, "loss": 0.3545, "step": 152 }, { "epoch": 0.03197492163009404, "grad_norm": 2.363816898836288, "learning_rate": 3.5266821345707658e-06, "loss": 0.3578, "step": 153 }, { "epoch": 0.03218390804597701, "grad_norm": 2.9492575507223466, "learning_rate": 3.549883990719258e-06, "loss": 0.4096, "step": 154 }, { "epoch": 0.03239289446185998, "grad_norm": 2.4512923295816997, "learning_rate": 3.57308584686775e-06, "loss": 0.3616, "step": 155 }, { "epoch": 0.03260188087774295, "grad_norm": 2.4086674136021364, "learning_rate": 3.596287703016242e-06, "loss": 0.3794, "step": 156 }, { "epoch": 0.032810867293625916, "grad_norm": 2.779266978124135, "learning_rate": 3.6194895591647333e-06, "loss": 0.3604, "step": 157 }, { "epoch": 0.033019853709508884, "grad_norm": 2.6819199920011485, "learning_rate": 3.6426914153132253e-06, "loss": 0.3924, "step": 158 }, { "epoch": 0.03322884012539185, "grad_norm": 3.581031150904369, "learning_rate": 3.6658932714617174e-06, "loss": 0.3608, "step": 159 }, { "epoch": 0.03343782654127482, "grad_norm": 4.333699398028322, "learning_rate": 3.6890951276102095e-06, "loss": 0.3853, "step": 160 }, { "epoch": 0.03364681295715778, "grad_norm": 2.6249483202157107, "learning_rate": 3.7122969837587007e-06, "loss": 0.3435, "step": 161 }, { "epoch": 0.03385579937304075, "grad_norm": 3.294004209148803, "learning_rate": 3.735498839907193e-06, "loss": 0.3299, "step": 162 }, { "epoch": 0.03406478578892372, "grad_norm": 2.6032535200965534, "learning_rate": 3.758700696055685e-06, "loss": 0.356, "step": 163 }, { "epoch": 0.034273772204806686, "grad_norm": 1.7184798109271304, "learning_rate": 3.781902552204177e-06, "loss": 0.3166, "step": 164 }, { "epoch": 0.034482758620689655, "grad_norm": 1.614115898739819, "learning_rate": 3.805104408352668e-06, "loss": 0.3635, "step": 165 }, { "epoch": 0.03469174503657262, "grad_norm": 3.0186036131540708, "learning_rate": 3.828306264501161e-06, "loss": 0.3543, "step": 166 }, { "epoch": 0.03490073145245559, "grad_norm": 2.089025349623365, "learning_rate": 3.851508120649652e-06, "loss": 0.3091, "step": 167 }, { "epoch": 0.03510971786833856, "grad_norm": 2.9881653038048492, "learning_rate": 3.874709976798144e-06, "loss": 0.3661, "step": 168 }, { "epoch": 0.03531870428422153, "grad_norm": 2.4439956214667333, "learning_rate": 3.897911832946636e-06, "loss": 0.323, "step": 169 }, { "epoch": 0.035527690700104496, "grad_norm": 2.00102324100191, "learning_rate": 3.921113689095128e-06, "loss": 0.3633, "step": 170 }, { "epoch": 0.035736677115987464, "grad_norm": 2.055419854572706, "learning_rate": 3.94431554524362e-06, "loss": 0.3558, "step": 171 }, { "epoch": 0.035945663531870425, "grad_norm": 1.5125297686207764, "learning_rate": 3.967517401392112e-06, "loss": 0.3412, "step": 172 }, { "epoch": 0.036154649947753394, "grad_norm": 2.5237829868988118, "learning_rate": 3.990719257540603e-06, "loss": 0.3673, "step": 173 }, { "epoch": 0.03636363636363636, "grad_norm": 2.2713151993289524, "learning_rate": 4.013921113689096e-06, "loss": 0.3618, "step": 174 }, { "epoch": 0.03657262277951933, "grad_norm": 4.744315445197851, "learning_rate": 4.037122969837587e-06, "loss": 0.3429, "step": 175 }, { "epoch": 0.0367816091954023, "grad_norm": 2.1677277479190313, "learning_rate": 4.06032482598608e-06, "loss": 0.3445, "step": 176 }, { "epoch": 0.03699059561128527, "grad_norm": 1.8943062689474643, "learning_rate": 4.083526682134571e-06, "loss": 0.3757, "step": 177 }, { "epoch": 0.037199582027168235, "grad_norm": 2.470272587562674, "learning_rate": 4.106728538283063e-06, "loss": 0.3664, "step": 178 }, { "epoch": 0.0374085684430512, "grad_norm": 2.32190159999152, "learning_rate": 4.129930394431555e-06, "loss": 0.358, "step": 179 }, { "epoch": 0.03761755485893417, "grad_norm": 1.971448133637129, "learning_rate": 4.153132250580047e-06, "loss": 0.3915, "step": 180 }, { "epoch": 0.03782654127481714, "grad_norm": 1.89480507931747, "learning_rate": 4.176334106728538e-06, "loss": 0.3451, "step": 181 }, { "epoch": 0.03803552769070011, "grad_norm": 2.057079263385225, "learning_rate": 4.199535962877031e-06, "loss": 0.3844, "step": 182 }, { "epoch": 0.03824451410658307, "grad_norm": 2.9852479416430895, "learning_rate": 4.222737819025522e-06, "loss": 0.3701, "step": 183 }, { "epoch": 0.03845350052246604, "grad_norm": 2.3731410067883973, "learning_rate": 4.245939675174015e-06, "loss": 0.3346, "step": 184 }, { "epoch": 0.038662486938349006, "grad_norm": 2.2922875006612826, "learning_rate": 4.269141531322506e-06, "loss": 0.339, "step": 185 }, { "epoch": 0.038871473354231974, "grad_norm": 2.5514739299084668, "learning_rate": 4.292343387470998e-06, "loss": 0.3257, "step": 186 }, { "epoch": 0.03908045977011494, "grad_norm": 1.8354914186405957, "learning_rate": 4.31554524361949e-06, "loss": 0.3638, "step": 187 }, { "epoch": 0.03928944618599791, "grad_norm": 2.3024717689289753, "learning_rate": 4.338747099767982e-06, "loss": 0.3687, "step": 188 }, { "epoch": 0.03949843260188088, "grad_norm": 2.2188910740125247, "learning_rate": 4.361948955916474e-06, "loss": 0.3482, "step": 189 }, { "epoch": 0.03970741901776385, "grad_norm": 3.4212601487290146, "learning_rate": 4.3851508120649656e-06, "loss": 0.3605, "step": 190 }, { "epoch": 0.039916405433646815, "grad_norm": 2.822535851426536, "learning_rate": 4.408352668213457e-06, "loss": 0.3259, "step": 191 }, { "epoch": 0.04012539184952978, "grad_norm": 2.0814116238882003, "learning_rate": 4.43155452436195e-06, "loss": 0.3215, "step": 192 }, { "epoch": 0.04033437826541275, "grad_norm": 2.3606443160184813, "learning_rate": 4.454756380510441e-06, "loss": 0.3581, "step": 193 }, { "epoch": 0.04054336468129571, "grad_norm": 3.328981207263151, "learning_rate": 4.477958236658933e-06, "loss": 0.3267, "step": 194 }, { "epoch": 0.04075235109717868, "grad_norm": 2.7768809109045334, "learning_rate": 4.501160092807425e-06, "loss": 0.3949, "step": 195 }, { "epoch": 0.04096133751306165, "grad_norm": 1.869258987187942, "learning_rate": 4.524361948955917e-06, "loss": 0.3418, "step": 196 }, { "epoch": 0.04117032392894462, "grad_norm": 6.377685165598353, "learning_rate": 4.547563805104409e-06, "loss": 0.4008, "step": 197 }, { "epoch": 0.041379310344827586, "grad_norm": 2.0573741428780794, "learning_rate": 4.5707656612529005e-06, "loss": 0.3383, "step": 198 }, { "epoch": 0.041588296760710554, "grad_norm": 2.0888568381287427, "learning_rate": 4.593967517401392e-06, "loss": 0.3543, "step": 199 }, { "epoch": 0.04179728317659352, "grad_norm": 3.0916459353208423, "learning_rate": 4.617169373549885e-06, "loss": 0.3503, "step": 200 }, { "epoch": 0.04200626959247649, "grad_norm": 4.297163417681507, "learning_rate": 4.640371229698376e-06, "loss": 0.3579, "step": 201 }, { "epoch": 0.04221525600835946, "grad_norm": 2.1672291109506423, "learning_rate": 4.663573085846868e-06, "loss": 0.3498, "step": 202 }, { "epoch": 0.04242424242424243, "grad_norm": 2.778518877967765, "learning_rate": 4.6867749419953605e-06, "loss": 0.3592, "step": 203 }, { "epoch": 0.042633228840125395, "grad_norm": 2.898223496447229, "learning_rate": 4.709976798143852e-06, "loss": 0.3991, "step": 204 }, { "epoch": 0.042842215256008356, "grad_norm": 3.042531733960985, "learning_rate": 4.733178654292344e-06, "loss": 0.3717, "step": 205 }, { "epoch": 0.043051201671891325, "grad_norm": 2.1582572903145674, "learning_rate": 4.7563805104408355e-06, "loss": 0.345, "step": 206 }, { "epoch": 0.04326018808777429, "grad_norm": 2.7448817157596994, "learning_rate": 4.779582366589328e-06, "loss": 0.3503, "step": 207 }, { "epoch": 0.04346917450365726, "grad_norm": 2.234905838065791, "learning_rate": 4.80278422273782e-06, "loss": 0.3745, "step": 208 }, { "epoch": 0.04367816091954023, "grad_norm": 1.9687597002096164, "learning_rate": 4.825986078886311e-06, "loss": 0.3489, "step": 209 }, { "epoch": 0.0438871473354232, "grad_norm": 1.9637340382434523, "learning_rate": 4.849187935034803e-06, "loss": 0.3516, "step": 210 }, { "epoch": 0.044096133751306166, "grad_norm": 1.952906321520583, "learning_rate": 4.8723897911832954e-06, "loss": 0.3558, "step": 211 }, { "epoch": 0.044305120167189134, "grad_norm": 1.8482426405568724, "learning_rate": 4.895591647331787e-06, "loss": 0.3333, "step": 212 }, { "epoch": 0.0445141065830721, "grad_norm": 2.739520133493, "learning_rate": 4.918793503480279e-06, "loss": 0.3711, "step": 213 }, { "epoch": 0.04472309299895507, "grad_norm": 3.043348249315582, "learning_rate": 4.94199535962877e-06, "loss": 0.381, "step": 214 }, { "epoch": 0.04493207941483804, "grad_norm": 1.7818905461477565, "learning_rate": 4.965197215777263e-06, "loss": 0.3567, "step": 215 }, { "epoch": 0.045141065830721, "grad_norm": 3.7889497562177405, "learning_rate": 4.988399071925755e-06, "loss": 0.3486, "step": 216 }, { "epoch": 0.04535005224660397, "grad_norm": 1.8166493802757422, "learning_rate": 5.011600928074246e-06, "loss": 0.3669, "step": 217 }, { "epoch": 0.045559038662486936, "grad_norm": 3.540829610618944, "learning_rate": 5.034802784222739e-06, "loss": 0.3596, "step": 218 }, { "epoch": 0.045768025078369905, "grad_norm": 3.4251763254348586, "learning_rate": 5.05800464037123e-06, "loss": 0.3618, "step": 219 }, { "epoch": 0.04597701149425287, "grad_norm": 2.3442819116286877, "learning_rate": 5.081206496519722e-06, "loss": 0.3251, "step": 220 }, { "epoch": 0.04618599791013584, "grad_norm": 2.3694303041700135, "learning_rate": 5.1044083526682146e-06, "loss": 0.3501, "step": 221 }, { "epoch": 0.04639498432601881, "grad_norm": 4.8527468080015295, "learning_rate": 5.127610208816705e-06, "loss": 0.3364, "step": 222 }, { "epoch": 0.04660397074190178, "grad_norm": 2.3204281696352997, "learning_rate": 5.150812064965197e-06, "loss": 0.3358, "step": 223 }, { "epoch": 0.046812957157784746, "grad_norm": 1.769535360151475, "learning_rate": 5.1740139211136895e-06, "loss": 0.3703, "step": 224 }, { "epoch": 0.047021943573667714, "grad_norm": 3.561034203702249, "learning_rate": 5.197215777262181e-06, "loss": 0.3293, "step": 225 }, { "epoch": 0.04723092998955068, "grad_norm": 3.131672910576138, "learning_rate": 5.220417633410674e-06, "loss": 0.3793, "step": 226 }, { "epoch": 0.047439916405433644, "grad_norm": 3.1258253210995393, "learning_rate": 5.243619489559165e-06, "loss": 0.3741, "step": 227 }, { "epoch": 0.04764890282131661, "grad_norm": 2.498313224787288, "learning_rate": 5.266821345707657e-06, "loss": 0.3418, "step": 228 }, { "epoch": 0.04785788923719958, "grad_norm": 4.804147557199855, "learning_rate": 5.2900232018561495e-06, "loss": 0.3792, "step": 229 }, { "epoch": 0.04806687565308255, "grad_norm": 1.8644761509316983, "learning_rate": 5.31322505800464e-06, "loss": 0.333, "step": 230 }, { "epoch": 0.04827586206896552, "grad_norm": 2.9562441278247715, "learning_rate": 5.336426914153132e-06, "loss": 0.3313, "step": 231 }, { "epoch": 0.048484848484848485, "grad_norm": 3.3538667547608645, "learning_rate": 5.3596287703016245e-06, "loss": 0.3546, "step": 232 }, { "epoch": 0.04869383490073145, "grad_norm": 1.9312961452000053, "learning_rate": 5.382830626450116e-06, "loss": 0.3755, "step": 233 }, { "epoch": 0.04890282131661442, "grad_norm": 1.840694902886721, "learning_rate": 5.406032482598609e-06, "loss": 0.3312, "step": 234 }, { "epoch": 0.04911180773249739, "grad_norm": 2.761360754125448, "learning_rate": 5.4292343387471e-06, "loss": 0.3524, "step": 235 }, { "epoch": 0.04932079414838036, "grad_norm": 2.275636645401597, "learning_rate": 5.452436194895592e-06, "loss": 0.3697, "step": 236 }, { "epoch": 0.049529780564263326, "grad_norm": 2.5767851652179785, "learning_rate": 5.4756380510440845e-06, "loss": 0.3517, "step": 237 }, { "epoch": 0.04973876698014629, "grad_norm": 2.3461840241236547, "learning_rate": 5.498839907192576e-06, "loss": 0.3384, "step": 238 }, { "epoch": 0.049947753396029256, "grad_norm": 2.965321624402512, "learning_rate": 5.522041763341067e-06, "loss": 0.357, "step": 239 }, { "epoch": 0.050156739811912224, "grad_norm": 3.600922720487478, "learning_rate": 5.5452436194895594e-06, "loss": 0.3606, "step": 240 }, { "epoch": 0.05036572622779519, "grad_norm": 3.754128474581968, "learning_rate": 5.568445475638051e-06, "loss": 0.3781, "step": 241 }, { "epoch": 0.05057471264367816, "grad_norm": 3.2407900399908813, "learning_rate": 5.591647331786544e-06, "loss": 0.3611, "step": 242 }, { "epoch": 0.05078369905956113, "grad_norm": 4.264866320687726, "learning_rate": 5.614849187935035e-06, "loss": 0.3751, "step": 243 }, { "epoch": 0.0509926854754441, "grad_norm": 1.9586898313188337, "learning_rate": 5.638051044083527e-06, "loss": 0.3725, "step": 244 }, { "epoch": 0.051201671891327065, "grad_norm": 1.7014578635629476, "learning_rate": 5.661252900232019e-06, "loss": 0.3834, "step": 245 }, { "epoch": 0.05141065830721003, "grad_norm": 3.156442517699432, "learning_rate": 5.684454756380511e-06, "loss": 0.3345, "step": 246 }, { "epoch": 0.051619644723093, "grad_norm": 2.2276133718337716, "learning_rate": 5.707656612529002e-06, "loss": 0.3573, "step": 247 }, { "epoch": 0.05182863113897597, "grad_norm": 2.164316906609856, "learning_rate": 5.730858468677495e-06, "loss": 0.3104, "step": 248 }, { "epoch": 0.05203761755485893, "grad_norm": 3.1020008675884596, "learning_rate": 5.754060324825986e-06, "loss": 0.3424, "step": 249 }, { "epoch": 0.0522466039707419, "grad_norm": 2.4325216758608907, "learning_rate": 5.7772621809744785e-06, "loss": 0.3652, "step": 250 }, { "epoch": 0.05245559038662487, "grad_norm": 2.525196923487534, "learning_rate": 5.80046403712297e-06, "loss": 0.3483, "step": 251 }, { "epoch": 0.052664576802507836, "grad_norm": 1.860392096605108, "learning_rate": 5.823665893271462e-06, "loss": 0.3329, "step": 252 }, { "epoch": 0.052873563218390804, "grad_norm": 1.672815589995745, "learning_rate": 5.846867749419954e-06, "loss": 0.3327, "step": 253 }, { "epoch": 0.05308254963427377, "grad_norm": 1.9232830470017888, "learning_rate": 5.870069605568446e-06, "loss": 0.3251, "step": 254 }, { "epoch": 0.05329153605015674, "grad_norm": 2.1413542395103016, "learning_rate": 5.893271461716938e-06, "loss": 0.3633, "step": 255 }, { "epoch": 0.05350052246603971, "grad_norm": 2.055882689039347, "learning_rate": 5.91647331786543e-06, "loss": 0.3498, "step": 256 }, { "epoch": 0.05370950888192268, "grad_norm": 2.1998917322067593, "learning_rate": 5.939675174013921e-06, "loss": 0.3073, "step": 257 }, { "epoch": 0.053918495297805645, "grad_norm": 2.3349195956938416, "learning_rate": 5.962877030162414e-06, "loss": 0.3734, "step": 258 }, { "epoch": 0.05412748171368861, "grad_norm": 2.1292075609305434, "learning_rate": 5.986078886310905e-06, "loss": 0.3447, "step": 259 }, { "epoch": 0.054336468129571575, "grad_norm": 1.8514612836789928, "learning_rate": 6.009280742459397e-06, "loss": 0.3446, "step": 260 }, { "epoch": 0.05454545454545454, "grad_norm": 1.820032785803285, "learning_rate": 6.032482598607889e-06, "loss": 0.3158, "step": 261 }, { "epoch": 0.05475444096133751, "grad_norm": 1.7045954230255624, "learning_rate": 6.055684454756381e-06, "loss": 0.3395, "step": 262 }, { "epoch": 0.05496342737722048, "grad_norm": 1.9022691938968384, "learning_rate": 6.078886310904873e-06, "loss": 0.3122, "step": 263 }, { "epoch": 0.05517241379310345, "grad_norm": 2.7683590778260374, "learning_rate": 6.102088167053365e-06, "loss": 0.3435, "step": 264 }, { "epoch": 0.055381400208986416, "grad_norm": 3.0898896455312403, "learning_rate": 6.125290023201857e-06, "loss": 0.3613, "step": 265 }, { "epoch": 0.055590386624869384, "grad_norm": 2.0943497276920415, "learning_rate": 6.148491879350349e-06, "loss": 0.3488, "step": 266 }, { "epoch": 0.05579937304075235, "grad_norm": 2.8060618756815257, "learning_rate": 6.17169373549884e-06, "loss": 0.3589, "step": 267 }, { "epoch": 0.05600835945663532, "grad_norm": 2.1488002361826544, "learning_rate": 6.194895591647332e-06, "loss": 0.3304, "step": 268 }, { "epoch": 0.05621734587251829, "grad_norm": 1.988048630323592, "learning_rate": 6.218097447795824e-06, "loss": 0.3814, "step": 269 }, { "epoch": 0.05642633228840126, "grad_norm": 3.054000046649239, "learning_rate": 6.241299303944316e-06, "loss": 0.3437, "step": 270 }, { "epoch": 0.05663531870428422, "grad_norm": 3.0805069159001204, "learning_rate": 6.264501160092808e-06, "loss": 0.3663, "step": 271 }, { "epoch": 0.056844305120167186, "grad_norm": 1.6638512318856837, "learning_rate": 6.2877030162413e-06, "loss": 0.3403, "step": 272 }, { "epoch": 0.057053291536050155, "grad_norm": 1.7031662112777373, "learning_rate": 6.310904872389792e-06, "loss": 0.3401, "step": 273 }, { "epoch": 0.05726227795193312, "grad_norm": 6.09804714705569, "learning_rate": 6.334106728538284e-06, "loss": 0.3453, "step": 274 }, { "epoch": 0.05747126436781609, "grad_norm": 1.6872703031982361, "learning_rate": 6.357308584686775e-06, "loss": 0.3159, "step": 275 }, { "epoch": 0.05768025078369906, "grad_norm": 2.310487750924323, "learning_rate": 6.380510440835267e-06, "loss": 0.3631, "step": 276 }, { "epoch": 0.05788923719958203, "grad_norm": 4.1436910724780125, "learning_rate": 6.403712296983759e-06, "loss": 0.3239, "step": 277 }, { "epoch": 0.058098223615464996, "grad_norm": 1.9457484469805173, "learning_rate": 6.426914153132251e-06, "loss": 0.3547, "step": 278 }, { "epoch": 0.058307210031347964, "grad_norm": 2.2242181025527508, "learning_rate": 6.4501160092807425e-06, "loss": 0.3496, "step": 279 }, { "epoch": 0.05851619644723093, "grad_norm": 1.837143999554278, "learning_rate": 6.473317865429235e-06, "loss": 0.3347, "step": 280 }, { "epoch": 0.0587251828631139, "grad_norm": 3.56092143821822, "learning_rate": 6.496519721577727e-06, "loss": 0.377, "step": 281 }, { "epoch": 0.05893416927899686, "grad_norm": 2.3293755788413253, "learning_rate": 6.519721577726219e-06, "loss": 0.388, "step": 282 }, { "epoch": 0.05914315569487983, "grad_norm": 3.5784643873221644, "learning_rate": 6.542923433874711e-06, "loss": 0.3677, "step": 283 }, { "epoch": 0.0593521421107628, "grad_norm": 1.4764577464396371, "learning_rate": 6.566125290023202e-06, "loss": 0.3642, "step": 284 }, { "epoch": 0.05956112852664577, "grad_norm": 2.8110493992870644, "learning_rate": 6.589327146171694e-06, "loss": 0.3371, "step": 285 }, { "epoch": 0.059770114942528735, "grad_norm": 1.7417664667420563, "learning_rate": 6.612529002320186e-06, "loss": 0.3527, "step": 286 }, { "epoch": 0.0599791013584117, "grad_norm": 1.8749159388543701, "learning_rate": 6.6357308584686775e-06, "loss": 0.3504, "step": 287 }, { "epoch": 0.06018808777429467, "grad_norm": 2.4999762934283, "learning_rate": 6.65893271461717e-06, "loss": 0.3486, "step": 288 }, { "epoch": 0.06039707419017764, "grad_norm": 3.6654986253273694, "learning_rate": 6.682134570765662e-06, "loss": 0.3489, "step": 289 }, { "epoch": 0.06060606060606061, "grad_norm": 1.8557652084726426, "learning_rate": 6.705336426914154e-06, "loss": 0.3261, "step": 290 }, { "epoch": 0.060815047021943576, "grad_norm": 2.72359346765075, "learning_rate": 6.728538283062646e-06, "loss": 0.3193, "step": 291 }, { "epoch": 0.061024033437826544, "grad_norm": 3.681379120487324, "learning_rate": 6.751740139211137e-06, "loss": 0.3514, "step": 292 }, { "epoch": 0.061233019853709506, "grad_norm": 1.8028723671775142, "learning_rate": 6.77494199535963e-06, "loss": 0.2822, "step": 293 }, { "epoch": 0.061442006269592474, "grad_norm": 4.881215632384218, "learning_rate": 6.798143851508121e-06, "loss": 0.3633, "step": 294 }, { "epoch": 0.06165099268547544, "grad_norm": 2.963593449409229, "learning_rate": 6.821345707656613e-06, "loss": 0.3388, "step": 295 }, { "epoch": 0.06185997910135841, "grad_norm": 3.219557061055995, "learning_rate": 6.844547563805105e-06, "loss": 0.3396, "step": 296 }, { "epoch": 0.06206896551724138, "grad_norm": 5.831042205710031, "learning_rate": 6.867749419953597e-06, "loss": 0.3513, "step": 297 }, { "epoch": 0.06227795193312435, "grad_norm": 2.737045548588703, "learning_rate": 6.890951276102089e-06, "loss": 0.3298, "step": 298 }, { "epoch": 0.062486938349007315, "grad_norm": 5.155017909876161, "learning_rate": 6.914153132250581e-06, "loss": 0.3095, "step": 299 }, { "epoch": 0.06269592476489028, "grad_norm": 4.8163145704586166, "learning_rate": 6.937354988399072e-06, "loss": 0.3575, "step": 300 }, { "epoch": 0.06290491118077325, "grad_norm": 1.9076227070724805, "learning_rate": 6.960556844547565e-06, "loss": 0.3635, "step": 301 }, { "epoch": 0.06311389759665621, "grad_norm": 3.0872576929589064, "learning_rate": 6.983758700696056e-06, "loss": 0.3361, "step": 302 }, { "epoch": 0.06332288401253919, "grad_norm": 3.9205580410253864, "learning_rate": 7.006960556844549e-06, "loss": 0.3451, "step": 303 }, { "epoch": 0.06353187042842215, "grad_norm": 3.4316807195285954, "learning_rate": 7.03016241299304e-06, "loss": 0.364, "step": 304 }, { "epoch": 0.06374085684430512, "grad_norm": 1.7634599317657274, "learning_rate": 7.0533642691415316e-06, "loss": 0.3366, "step": 305 }, { "epoch": 0.06394984326018809, "grad_norm": 5.088512401507613, "learning_rate": 7.076566125290024e-06, "loss": 0.3339, "step": 306 }, { "epoch": 0.06415882967607106, "grad_norm": 2.624195259934994, "learning_rate": 7.099767981438516e-06, "loss": 0.3285, "step": 307 }, { "epoch": 0.06436781609195402, "grad_norm": 1.613281014343347, "learning_rate": 7.122969837587007e-06, "loss": 0.349, "step": 308 }, { "epoch": 0.064576802507837, "grad_norm": 2.2382271577692463, "learning_rate": 7.1461716937355e-06, "loss": 0.3672, "step": 309 }, { "epoch": 0.06478578892371996, "grad_norm": 2.7911796444810832, "learning_rate": 7.1693735498839915e-06, "loss": 0.3449, "step": 310 }, { "epoch": 0.06499477533960292, "grad_norm": 2.722982249014585, "learning_rate": 7.192575406032484e-06, "loss": 0.3484, "step": 311 }, { "epoch": 0.0652037617554859, "grad_norm": 1.7123244920127438, "learning_rate": 7.215777262180975e-06, "loss": 0.3666, "step": 312 }, { "epoch": 0.06541274817136886, "grad_norm": 2.2103291217723497, "learning_rate": 7.2389791183294665e-06, "loss": 0.3116, "step": 313 }, { "epoch": 0.06562173458725183, "grad_norm": 2.706778729965271, "learning_rate": 7.262180974477959e-06, "loss": 0.3771, "step": 314 }, { "epoch": 0.06583072100313479, "grad_norm": 2.63947645787052, "learning_rate": 7.285382830626451e-06, "loss": 0.3446, "step": 315 }, { "epoch": 0.06603970741901777, "grad_norm": 1.788247333785281, "learning_rate": 7.308584686774942e-06, "loss": 0.3176, "step": 316 }, { "epoch": 0.06624869383490073, "grad_norm": 2.925681523156559, "learning_rate": 7.331786542923435e-06, "loss": 0.3409, "step": 317 }, { "epoch": 0.0664576802507837, "grad_norm": 1.3859189402016745, "learning_rate": 7.3549883990719265e-06, "loss": 0.3585, "step": 318 }, { "epoch": 0.06666666666666667, "grad_norm": 1.6574629979124755, "learning_rate": 7.378190255220419e-06, "loss": 0.3545, "step": 319 }, { "epoch": 0.06687565308254964, "grad_norm": 1.5778884994311246, "learning_rate": 7.401392111368911e-06, "loss": 0.3416, "step": 320 }, { "epoch": 0.0670846394984326, "grad_norm": 2.2137521140723893, "learning_rate": 7.4245939675174015e-06, "loss": 0.3353, "step": 321 }, { "epoch": 0.06729362591431556, "grad_norm": 2.7413316952694773, "learning_rate": 7.447795823665894e-06, "loss": 0.3236, "step": 322 }, { "epoch": 0.06750261233019854, "grad_norm": 2.0417878675548384, "learning_rate": 7.470997679814386e-06, "loss": 0.3477, "step": 323 }, { "epoch": 0.0677115987460815, "grad_norm": 1.602500997732699, "learning_rate": 7.494199535962877e-06, "loss": 0.3575, "step": 324 }, { "epoch": 0.06792058516196448, "grad_norm": 2.4163493213932106, "learning_rate": 7.51740139211137e-06, "loss": 0.3557, "step": 325 }, { "epoch": 0.06812957157784744, "grad_norm": 2.7823013189072667, "learning_rate": 7.5406032482598614e-06, "loss": 0.3448, "step": 326 }, { "epoch": 0.06833855799373041, "grad_norm": 1.9922021462892363, "learning_rate": 7.563805104408354e-06, "loss": 0.3619, "step": 327 }, { "epoch": 0.06854754440961337, "grad_norm": 3.0963155809007628, "learning_rate": 7.587006960556846e-06, "loss": 0.3616, "step": 328 }, { "epoch": 0.06875653082549635, "grad_norm": 1.7852907327720084, "learning_rate": 7.610208816705336e-06, "loss": 0.3276, "step": 329 }, { "epoch": 0.06896551724137931, "grad_norm": 2.096408907154201, "learning_rate": 7.63341067285383e-06, "loss": 0.3279, "step": 330 }, { "epoch": 0.06917450365726228, "grad_norm": 2.1152803386766292, "learning_rate": 7.656612529002321e-06, "loss": 0.3071, "step": 331 }, { "epoch": 0.06938349007314525, "grad_norm": 2.367683261213582, "learning_rate": 7.679814385150813e-06, "loss": 0.3523, "step": 332 }, { "epoch": 0.06959247648902821, "grad_norm": 1.6926944666936845, "learning_rate": 7.703016241299305e-06, "loss": 0.3349, "step": 333 }, { "epoch": 0.06980146290491118, "grad_norm": 1.6766496367605264, "learning_rate": 7.726218097447796e-06, "loss": 0.3468, "step": 334 }, { "epoch": 0.07001044932079414, "grad_norm": 1.8314133058112692, "learning_rate": 7.749419953596288e-06, "loss": 0.3056, "step": 335 }, { "epoch": 0.07021943573667712, "grad_norm": 1.722311283110366, "learning_rate": 7.77262180974478e-06, "loss": 0.3496, "step": 336 }, { "epoch": 0.07042842215256008, "grad_norm": 1.9786367410170302, "learning_rate": 7.795823665893271e-06, "loss": 0.3311, "step": 337 }, { "epoch": 0.07063740856844306, "grad_norm": 2.0949793139175323, "learning_rate": 7.819025522041765e-06, "loss": 0.3189, "step": 338 }, { "epoch": 0.07084639498432602, "grad_norm": 1.8505547134596123, "learning_rate": 7.842227378190256e-06, "loss": 0.3505, "step": 339 }, { "epoch": 0.07105538140020899, "grad_norm": 2.0712688676016304, "learning_rate": 7.865429234338748e-06, "loss": 0.3307, "step": 340 }, { "epoch": 0.07126436781609195, "grad_norm": 2.8066720012158224, "learning_rate": 7.88863109048724e-06, "loss": 0.3222, "step": 341 }, { "epoch": 0.07147335423197493, "grad_norm": 3.261524273430193, "learning_rate": 7.911832946635731e-06, "loss": 0.3446, "step": 342 }, { "epoch": 0.07168234064785789, "grad_norm": 1.7677167741847528, "learning_rate": 7.935034802784225e-06, "loss": 0.3486, "step": 343 }, { "epoch": 0.07189132706374085, "grad_norm": 2.4210104153362533, "learning_rate": 7.958236658932715e-06, "loss": 0.364, "step": 344 }, { "epoch": 0.07210031347962383, "grad_norm": 1.8491821074275625, "learning_rate": 7.981438515081206e-06, "loss": 0.3265, "step": 345 }, { "epoch": 0.07230929989550679, "grad_norm": 3.426866525518921, "learning_rate": 8.0046403712297e-06, "loss": 0.349, "step": 346 }, { "epoch": 0.07251828631138976, "grad_norm": 2.3134405302832124, "learning_rate": 8.027842227378191e-06, "loss": 0.3354, "step": 347 }, { "epoch": 0.07272727272727272, "grad_norm": 1.6644350159789596, "learning_rate": 8.051044083526683e-06, "loss": 0.3241, "step": 348 }, { "epoch": 0.0729362591431557, "grad_norm": 2.687393453015018, "learning_rate": 8.074245939675175e-06, "loss": 0.3371, "step": 349 }, { "epoch": 0.07314524555903866, "grad_norm": 2.2185711961631314, "learning_rate": 8.097447795823666e-06, "loss": 0.3612, "step": 350 }, { "epoch": 0.07335423197492164, "grad_norm": 3.1432868668519673, "learning_rate": 8.12064965197216e-06, "loss": 0.3355, "step": 351 }, { "epoch": 0.0735632183908046, "grad_norm": 2.032122787498615, "learning_rate": 8.14385150812065e-06, "loss": 0.3232, "step": 352 }, { "epoch": 0.07377220480668757, "grad_norm": 2.0405120070060123, "learning_rate": 8.167053364269141e-06, "loss": 0.3476, "step": 353 }, { "epoch": 0.07398119122257053, "grad_norm": 2.009651172196454, "learning_rate": 8.190255220417635e-06, "loss": 0.3299, "step": 354 }, { "epoch": 0.0741901776384535, "grad_norm": 1.9605420558042173, "learning_rate": 8.213457076566126e-06, "loss": 0.3492, "step": 355 }, { "epoch": 0.07439916405433647, "grad_norm": 2.3563466358291523, "learning_rate": 8.236658932714618e-06, "loss": 0.3625, "step": 356 }, { "epoch": 0.07460815047021943, "grad_norm": 3.1627224512892247, "learning_rate": 8.25986078886311e-06, "loss": 0.3599, "step": 357 }, { "epoch": 0.0748171368861024, "grad_norm": 2.3080052428681683, "learning_rate": 8.283062645011601e-06, "loss": 0.3465, "step": 358 }, { "epoch": 0.07502612330198537, "grad_norm": 1.4061129017556404, "learning_rate": 8.306264501160095e-06, "loss": 0.2967, "step": 359 }, { "epoch": 0.07523510971786834, "grad_norm": 3.1673563735422494, "learning_rate": 8.329466357308586e-06, "loss": 0.3427, "step": 360 }, { "epoch": 0.0754440961337513, "grad_norm": 3.015250046375755, "learning_rate": 8.352668213457076e-06, "loss": 0.3308, "step": 361 }, { "epoch": 0.07565308254963428, "grad_norm": 2.853006057374252, "learning_rate": 8.37587006960557e-06, "loss": 0.3545, "step": 362 }, { "epoch": 0.07586206896551724, "grad_norm": 4.127233548498106, "learning_rate": 8.399071925754061e-06, "loss": 0.3594, "step": 363 }, { "epoch": 0.07607105538140022, "grad_norm": 1.8473513986846304, "learning_rate": 8.422273781902553e-06, "loss": 0.3473, "step": 364 }, { "epoch": 0.07628004179728318, "grad_norm": 2.8490253550905074, "learning_rate": 8.445475638051045e-06, "loss": 0.3566, "step": 365 }, { "epoch": 0.07648902821316614, "grad_norm": 2.7257682615340753, "learning_rate": 8.468677494199536e-06, "loss": 0.3535, "step": 366 }, { "epoch": 0.07669801462904911, "grad_norm": 2.2272327062858612, "learning_rate": 8.49187935034803e-06, "loss": 0.3604, "step": 367 }, { "epoch": 0.07690700104493207, "grad_norm": 1.6987763635007354, "learning_rate": 8.515081206496521e-06, "loss": 0.3472, "step": 368 }, { "epoch": 0.07711598746081505, "grad_norm": 6.402049312538872, "learning_rate": 8.538283062645011e-06, "loss": 0.3447, "step": 369 }, { "epoch": 0.07732497387669801, "grad_norm": 1.6941841091253882, "learning_rate": 8.561484918793505e-06, "loss": 0.322, "step": 370 }, { "epoch": 0.07753396029258099, "grad_norm": 2.429088236255099, "learning_rate": 8.584686774941996e-06, "loss": 0.3529, "step": 371 }, { "epoch": 0.07774294670846395, "grad_norm": 1.4620530597185595, "learning_rate": 8.607888631090488e-06, "loss": 0.3321, "step": 372 }, { "epoch": 0.07795193312434692, "grad_norm": 1.9460704295153142, "learning_rate": 8.63109048723898e-06, "loss": 0.3253, "step": 373 }, { "epoch": 0.07816091954022988, "grad_norm": 2.473359946246641, "learning_rate": 8.654292343387471e-06, "loss": 0.363, "step": 374 }, { "epoch": 0.07836990595611286, "grad_norm": 1.5749721462239015, "learning_rate": 8.677494199535964e-06, "loss": 0.3028, "step": 375 }, { "epoch": 0.07857889237199582, "grad_norm": 10.82914750424329, "learning_rate": 8.700696055684456e-06, "loss": 0.3574, "step": 376 }, { "epoch": 0.07878787878787878, "grad_norm": 2.626101555655272, "learning_rate": 8.723897911832948e-06, "loss": 0.3133, "step": 377 }, { "epoch": 0.07899686520376176, "grad_norm": 1.7920192572571134, "learning_rate": 8.74709976798144e-06, "loss": 0.3346, "step": 378 }, { "epoch": 0.07920585161964472, "grad_norm": 1.8502160109078707, "learning_rate": 8.770301624129931e-06, "loss": 0.3537, "step": 379 }, { "epoch": 0.0794148380355277, "grad_norm": 2.097033409871866, "learning_rate": 8.793503480278423e-06, "loss": 0.323, "step": 380 }, { "epoch": 0.07962382445141065, "grad_norm": 2.1961225554863852, "learning_rate": 8.816705336426914e-06, "loss": 0.3215, "step": 381 }, { "epoch": 0.07983281086729363, "grad_norm": 2.0661130929548532, "learning_rate": 8.839907192575406e-06, "loss": 0.2956, "step": 382 }, { "epoch": 0.08004179728317659, "grad_norm": 6.104930034810745, "learning_rate": 8.8631090487239e-06, "loss": 0.3304, "step": 383 }, { "epoch": 0.08025078369905957, "grad_norm": 1.6575738271725098, "learning_rate": 8.886310904872391e-06, "loss": 0.3381, "step": 384 }, { "epoch": 0.08045977011494253, "grad_norm": 3.5336617976346463, "learning_rate": 8.909512761020883e-06, "loss": 0.3398, "step": 385 }, { "epoch": 0.0806687565308255, "grad_norm": 3.5102151503865104, "learning_rate": 8.932714617169374e-06, "loss": 0.3496, "step": 386 }, { "epoch": 0.08087774294670846, "grad_norm": 1.696535988141718, "learning_rate": 8.955916473317866e-06, "loss": 0.3126, "step": 387 }, { "epoch": 0.08108672936259143, "grad_norm": 3.176625931983476, "learning_rate": 8.979118329466358e-06, "loss": 0.3068, "step": 388 }, { "epoch": 0.0812957157784744, "grad_norm": 3.055053087641374, "learning_rate": 9.00232018561485e-06, "loss": 0.3408, "step": 389 }, { "epoch": 0.08150470219435736, "grad_norm": 3.153367361299047, "learning_rate": 9.025522041763341e-06, "loss": 0.3355, "step": 390 }, { "epoch": 0.08171368861024034, "grad_norm": 2.19462778924146, "learning_rate": 9.048723897911834e-06, "loss": 0.3353, "step": 391 }, { "epoch": 0.0819226750261233, "grad_norm": 3.4062823950392223, "learning_rate": 9.071925754060326e-06, "loss": 0.3333, "step": 392 }, { "epoch": 0.08213166144200627, "grad_norm": 1.7158439378852406, "learning_rate": 9.095127610208818e-06, "loss": 0.3218, "step": 393 }, { "epoch": 0.08234064785788923, "grad_norm": 4.119543425905572, "learning_rate": 9.11832946635731e-06, "loss": 0.3242, "step": 394 }, { "epoch": 0.08254963427377221, "grad_norm": 3.9738223991290362, "learning_rate": 9.141531322505801e-06, "loss": 0.3443, "step": 395 }, { "epoch": 0.08275862068965517, "grad_norm": 1.9950717514647998, "learning_rate": 9.164733178654293e-06, "loss": 0.3433, "step": 396 }, { "epoch": 0.08296760710553815, "grad_norm": 2.305455806860021, "learning_rate": 9.187935034802784e-06, "loss": 0.3341, "step": 397 }, { "epoch": 0.08317659352142111, "grad_norm": 3.9291475227981363, "learning_rate": 9.211136890951276e-06, "loss": 0.3267, "step": 398 }, { "epoch": 0.08338557993730407, "grad_norm": 2.4004096987420125, "learning_rate": 9.23433874709977e-06, "loss": 0.3288, "step": 399 }, { "epoch": 0.08359456635318704, "grad_norm": 1.5248234799372082, "learning_rate": 9.257540603248261e-06, "loss": 0.3232, "step": 400 }, { "epoch": 0.08380355276907, "grad_norm": 3.2296849491592603, "learning_rate": 9.280742459396753e-06, "loss": 0.3032, "step": 401 }, { "epoch": 0.08401253918495298, "grad_norm": 3.976591707392341, "learning_rate": 9.303944315545244e-06, "loss": 0.3232, "step": 402 }, { "epoch": 0.08422152560083594, "grad_norm": 3.1369459865557903, "learning_rate": 9.327146171693736e-06, "loss": 0.3634, "step": 403 }, { "epoch": 0.08443051201671892, "grad_norm": 3.5223434337225337, "learning_rate": 9.350348027842228e-06, "loss": 0.3298, "step": 404 }, { "epoch": 0.08463949843260188, "grad_norm": 2.74273274964881, "learning_rate": 9.373549883990721e-06, "loss": 0.322, "step": 405 }, { "epoch": 0.08484848484848485, "grad_norm": 4.09496747085497, "learning_rate": 9.396751740139211e-06, "loss": 0.3286, "step": 406 }, { "epoch": 0.08505747126436781, "grad_norm": 2.291269430187248, "learning_rate": 9.419953596287704e-06, "loss": 0.3627, "step": 407 }, { "epoch": 0.08526645768025079, "grad_norm": 1.8529345222539826, "learning_rate": 9.443155452436196e-06, "loss": 0.3222, "step": 408 }, { "epoch": 0.08547544409613375, "grad_norm": 3.5539051512650044, "learning_rate": 9.466357308584688e-06, "loss": 0.3502, "step": 409 }, { "epoch": 0.08568443051201671, "grad_norm": 2.077272556135119, "learning_rate": 9.48955916473318e-06, "loss": 0.354, "step": 410 }, { "epoch": 0.08589341692789969, "grad_norm": 1.6390237862792305, "learning_rate": 9.512761020881671e-06, "loss": 0.3297, "step": 411 }, { "epoch": 0.08610240334378265, "grad_norm": 1.5021723327376775, "learning_rate": 9.535962877030163e-06, "loss": 0.34, "step": 412 }, { "epoch": 0.08631138975966562, "grad_norm": 2.878012692596608, "learning_rate": 9.559164733178656e-06, "loss": 0.3458, "step": 413 }, { "epoch": 0.08652037617554859, "grad_norm": 1.6816858738697873, "learning_rate": 9.582366589327146e-06, "loss": 0.3691, "step": 414 }, { "epoch": 0.08672936259143156, "grad_norm": 1.5205379050625771, "learning_rate": 9.60556844547564e-06, "loss": 0.3448, "step": 415 }, { "epoch": 0.08693834900731452, "grad_norm": 1.548055551536886, "learning_rate": 9.628770301624131e-06, "loss": 0.3288, "step": 416 }, { "epoch": 0.0871473354231975, "grad_norm": 2.1488444209059283, "learning_rate": 9.651972157772623e-06, "loss": 0.3332, "step": 417 }, { "epoch": 0.08735632183908046, "grad_norm": 2.246309664988387, "learning_rate": 9.675174013921114e-06, "loss": 0.3603, "step": 418 }, { "epoch": 0.08756530825496343, "grad_norm": 1.9068262921850743, "learning_rate": 9.698375870069606e-06, "loss": 0.3303, "step": 419 }, { "epoch": 0.0877742946708464, "grad_norm": 1.5661837589126137, "learning_rate": 9.721577726218098e-06, "loss": 0.3041, "step": 420 }, { "epoch": 0.08798328108672936, "grad_norm": 1.8899650450869094, "learning_rate": 9.744779582366591e-06, "loss": 0.339, "step": 421 }, { "epoch": 0.08819226750261233, "grad_norm": 1.9567370516466598, "learning_rate": 9.767981438515083e-06, "loss": 0.3379, "step": 422 }, { "epoch": 0.08840125391849529, "grad_norm": 1.7292553210684662, "learning_rate": 9.791183294663574e-06, "loss": 0.3347, "step": 423 }, { "epoch": 0.08861024033437827, "grad_norm": 1.627120748069883, "learning_rate": 9.814385150812066e-06, "loss": 0.3667, "step": 424 }, { "epoch": 0.08881922675026123, "grad_norm": 2.7560924510631293, "learning_rate": 9.837587006960558e-06, "loss": 0.3273, "step": 425 }, { "epoch": 0.0890282131661442, "grad_norm": 1.8110337792970745, "learning_rate": 9.86078886310905e-06, "loss": 0.3359, "step": 426 }, { "epoch": 0.08923719958202717, "grad_norm": 1.5719418547061934, "learning_rate": 9.88399071925754e-06, "loss": 0.3222, "step": 427 }, { "epoch": 0.08944618599791014, "grad_norm": 1.9550965915217378, "learning_rate": 9.907192575406032e-06, "loss": 0.3599, "step": 428 }, { "epoch": 0.0896551724137931, "grad_norm": 1.668529166274587, "learning_rate": 9.930394431554526e-06, "loss": 0.3638, "step": 429 }, { "epoch": 0.08986415882967608, "grad_norm": 1.5395107897831761, "learning_rate": 9.953596287703018e-06, "loss": 0.3148, "step": 430 }, { "epoch": 0.09007314524555904, "grad_norm": 1.7011871049838028, "learning_rate": 9.97679814385151e-06, "loss": 0.3445, "step": 431 }, { "epoch": 0.090282131661442, "grad_norm": 1.5037095702487024, "learning_rate": 1e-05, "loss": 0.3054, "step": 432 }, { "epoch": 0.09049111807732498, "grad_norm": 3.1300379430972427, "learning_rate": 1.0023201856148492e-05, "loss": 0.3145, "step": 433 }, { "epoch": 0.09070010449320794, "grad_norm": 1.8058457489954431, "learning_rate": 1.0046403712296984e-05, "loss": 0.3198, "step": 434 }, { "epoch": 0.09090909090909091, "grad_norm": 1.8516046186989084, "learning_rate": 1.0069605568445477e-05, "loss": 0.3376, "step": 435 }, { "epoch": 0.09111807732497387, "grad_norm": 2.0142389044630646, "learning_rate": 1.0092807424593969e-05, "loss": 0.314, "step": 436 }, { "epoch": 0.09132706374085685, "grad_norm": 1.6366624405779024, "learning_rate": 1.011600928074246e-05, "loss": 0.2989, "step": 437 }, { "epoch": 0.09153605015673981, "grad_norm": 1.9948349579855726, "learning_rate": 1.0139211136890952e-05, "loss": 0.3669, "step": 438 }, { "epoch": 0.09174503657262278, "grad_norm": 1.6836635758792577, "learning_rate": 1.0162412993039444e-05, "loss": 0.2951, "step": 439 }, { "epoch": 0.09195402298850575, "grad_norm": 1.8791060577587837, "learning_rate": 1.0185614849187937e-05, "loss": 0.3287, "step": 440 }, { "epoch": 0.09216300940438872, "grad_norm": 1.69705887955328, "learning_rate": 1.0208816705336429e-05, "loss": 0.3302, "step": 441 }, { "epoch": 0.09237199582027168, "grad_norm": 2.0275221643523023, "learning_rate": 1.0232018561484919e-05, "loss": 0.3453, "step": 442 }, { "epoch": 0.09258098223615464, "grad_norm": 2.856128320030965, "learning_rate": 1.025522041763341e-05, "loss": 0.3154, "step": 443 }, { "epoch": 0.09278996865203762, "grad_norm": 1.7396815519211042, "learning_rate": 1.0278422273781902e-05, "loss": 0.3033, "step": 444 }, { "epoch": 0.09299895506792058, "grad_norm": 3.422805495248889, "learning_rate": 1.0301624129930394e-05, "loss": 0.3487, "step": 445 }, { "epoch": 0.09320794148380356, "grad_norm": 1.8492312712088466, "learning_rate": 1.0324825986078887e-05, "loss": 0.3431, "step": 446 }, { "epoch": 0.09341692789968652, "grad_norm": 2.160249557279284, "learning_rate": 1.0348027842227379e-05, "loss": 0.3375, "step": 447 }, { "epoch": 0.09362591431556949, "grad_norm": 1.5647546432932764, "learning_rate": 1.037122969837587e-05, "loss": 0.3178, "step": 448 }, { "epoch": 0.09383490073145245, "grad_norm": 1.7415047398460568, "learning_rate": 1.0394431554524362e-05, "loss": 0.3257, "step": 449 }, { "epoch": 0.09404388714733543, "grad_norm": 2.9718239154472754, "learning_rate": 1.0417633410672854e-05, "loss": 0.3172, "step": 450 }, { "epoch": 0.09425287356321839, "grad_norm": 1.7970505244771007, "learning_rate": 1.0440835266821347e-05, "loss": 0.299, "step": 451 }, { "epoch": 0.09446185997910136, "grad_norm": 1.9548420912268545, "learning_rate": 1.0464037122969839e-05, "loss": 0.317, "step": 452 }, { "epoch": 0.09467084639498433, "grad_norm": 2.2944588715325733, "learning_rate": 1.048723897911833e-05, "loss": 0.3256, "step": 453 }, { "epoch": 0.09487983281086729, "grad_norm": 3.0735276478187346, "learning_rate": 1.0510440835266822e-05, "loss": 0.3125, "step": 454 }, { "epoch": 0.09508881922675026, "grad_norm": 1.7535818557457796, "learning_rate": 1.0533642691415314e-05, "loss": 0.3196, "step": 455 }, { "epoch": 0.09529780564263322, "grad_norm": 4.508842691928767, "learning_rate": 1.0556844547563807e-05, "loss": 0.3292, "step": 456 }, { "epoch": 0.0955067920585162, "grad_norm": 2.487773169078752, "learning_rate": 1.0580046403712299e-05, "loss": 0.3231, "step": 457 }, { "epoch": 0.09571577847439916, "grad_norm": 1.5022920603611816, "learning_rate": 1.060324825986079e-05, "loss": 0.3038, "step": 458 }, { "epoch": 0.09592476489028214, "grad_norm": 2.819083400205685, "learning_rate": 1.062645011600928e-05, "loss": 0.3396, "step": 459 }, { "epoch": 0.0961337513061651, "grad_norm": 3.607521118800649, "learning_rate": 1.0649651972157772e-05, "loss": 0.3235, "step": 460 }, { "epoch": 0.09634273772204807, "grad_norm": 1.984212976328136, "learning_rate": 1.0672853828306264e-05, "loss": 0.3409, "step": 461 }, { "epoch": 0.09655172413793103, "grad_norm": 2.274529296073636, "learning_rate": 1.0696055684454757e-05, "loss": 0.3223, "step": 462 }, { "epoch": 0.09676071055381401, "grad_norm": 2.360363306379567, "learning_rate": 1.0719257540603249e-05, "loss": 0.3595, "step": 463 }, { "epoch": 0.09696969696969697, "grad_norm": 2.3336846379990903, "learning_rate": 1.074245939675174e-05, "loss": 0.3362, "step": 464 }, { "epoch": 0.09717868338557993, "grad_norm": 1.6198915185188432, "learning_rate": 1.0765661252900232e-05, "loss": 0.3515, "step": 465 }, { "epoch": 0.0973876698014629, "grad_norm": 2.1543842811796035, "learning_rate": 1.0788863109048724e-05, "loss": 0.3347, "step": 466 }, { "epoch": 0.09759665621734587, "grad_norm": 1.8784433775788627, "learning_rate": 1.0812064965197217e-05, "loss": 0.3269, "step": 467 }, { "epoch": 0.09780564263322884, "grad_norm": 1.685490286773072, "learning_rate": 1.0835266821345709e-05, "loss": 0.2922, "step": 468 }, { "epoch": 0.0980146290491118, "grad_norm": 1.480509338883463, "learning_rate": 1.08584686774942e-05, "loss": 0.3455, "step": 469 }, { "epoch": 0.09822361546499478, "grad_norm": 1.942226499020694, "learning_rate": 1.0881670533642692e-05, "loss": 0.3467, "step": 470 }, { "epoch": 0.09843260188087774, "grad_norm": 1.7096290442250732, "learning_rate": 1.0904872389791184e-05, "loss": 0.3307, "step": 471 }, { "epoch": 0.09864158829676072, "grad_norm": 1.3177126830589636, "learning_rate": 1.0928074245939677e-05, "loss": 0.3238, "step": 472 }, { "epoch": 0.09885057471264368, "grad_norm": 2.3519620181435896, "learning_rate": 1.0951276102088169e-05, "loss": 0.354, "step": 473 }, { "epoch": 0.09905956112852665, "grad_norm": 1.6208954203617136, "learning_rate": 1.097447795823666e-05, "loss": 0.3367, "step": 474 }, { "epoch": 0.09926854754440961, "grad_norm": 2.4709810434486488, "learning_rate": 1.0997679814385152e-05, "loss": 0.3378, "step": 475 }, { "epoch": 0.09947753396029257, "grad_norm": 1.4274833035557968, "learning_rate": 1.1020881670533642e-05, "loss": 0.319, "step": 476 }, { "epoch": 0.09968652037617555, "grad_norm": 1.5149112566906768, "learning_rate": 1.1044083526682134e-05, "loss": 0.3089, "step": 477 }, { "epoch": 0.09989550679205851, "grad_norm": 2.2639154390116487, "learning_rate": 1.1067285382830629e-05, "loss": 0.3164, "step": 478 }, { "epoch": 0.10010449320794149, "grad_norm": 2.011063930006712, "learning_rate": 1.1090487238979119e-05, "loss": 0.3461, "step": 479 }, { "epoch": 0.10031347962382445, "grad_norm": 1.648946541550743, "learning_rate": 1.111368909512761e-05, "loss": 0.3278, "step": 480 }, { "epoch": 0.10052246603970742, "grad_norm": 1.7513685910210848, "learning_rate": 1.1136890951276102e-05, "loss": 0.3182, "step": 481 }, { "epoch": 0.10073145245559038, "grad_norm": 1.7929987640901357, "learning_rate": 1.1160092807424594e-05, "loss": 0.3145, "step": 482 }, { "epoch": 0.10094043887147336, "grad_norm": 1.2825156179292236, "learning_rate": 1.1183294663573087e-05, "loss": 0.3174, "step": 483 }, { "epoch": 0.10114942528735632, "grad_norm": 2.2371962907705134, "learning_rate": 1.1206496519721579e-05, "loss": 0.3112, "step": 484 }, { "epoch": 0.1013584117032393, "grad_norm": 1.7201100185414666, "learning_rate": 1.122969837587007e-05, "loss": 0.3735, "step": 485 }, { "epoch": 0.10156739811912226, "grad_norm": 1.6492375506887587, "learning_rate": 1.1252900232018562e-05, "loss": 0.3175, "step": 486 }, { "epoch": 0.10177638453500522, "grad_norm": 2.1519676620251107, "learning_rate": 1.1276102088167054e-05, "loss": 0.3526, "step": 487 }, { "epoch": 0.1019853709508882, "grad_norm": 2.6190621116795563, "learning_rate": 1.1299303944315547e-05, "loss": 0.3414, "step": 488 }, { "epoch": 0.10219435736677115, "grad_norm": 2.18781683318282, "learning_rate": 1.1322505800464039e-05, "loss": 0.3267, "step": 489 }, { "epoch": 0.10240334378265413, "grad_norm": 1.5064882183219128, "learning_rate": 1.134570765661253e-05, "loss": 0.2975, "step": 490 }, { "epoch": 0.10261233019853709, "grad_norm": 1.9833348341408568, "learning_rate": 1.1368909512761022e-05, "loss": 0.3399, "step": 491 }, { "epoch": 0.10282131661442007, "grad_norm": 3.2200217439287964, "learning_rate": 1.1392111368909514e-05, "loss": 0.3404, "step": 492 }, { "epoch": 0.10303030303030303, "grad_norm": 1.4564588215827616, "learning_rate": 1.1415313225058004e-05, "loss": 0.3365, "step": 493 }, { "epoch": 0.103239289446186, "grad_norm": 2.9541320055627, "learning_rate": 1.1438515081206499e-05, "loss": 0.3244, "step": 494 }, { "epoch": 0.10344827586206896, "grad_norm": 1.5808857933360647, "learning_rate": 1.146171693735499e-05, "loss": 0.3303, "step": 495 }, { "epoch": 0.10365726227795194, "grad_norm": 1.8910614724142234, "learning_rate": 1.148491879350348e-05, "loss": 0.3168, "step": 496 }, { "epoch": 0.1038662486938349, "grad_norm": 2.123175330305924, "learning_rate": 1.1508120649651972e-05, "loss": 0.319, "step": 497 }, { "epoch": 0.10407523510971786, "grad_norm": 2.1821968797059386, "learning_rate": 1.1531322505800464e-05, "loss": 0.3384, "step": 498 }, { "epoch": 0.10428422152560084, "grad_norm": 1.5544958032378966, "learning_rate": 1.1554524361948957e-05, "loss": 0.3546, "step": 499 }, { "epoch": 0.1044932079414838, "grad_norm": 1.7684244747941147, "learning_rate": 1.1577726218097449e-05, "loss": 0.3219, "step": 500 }, { "epoch": 0.10470219435736677, "grad_norm": 1.6033001514260776, "learning_rate": 1.160092807424594e-05, "loss": 0.3123, "step": 501 }, { "epoch": 0.10491118077324973, "grad_norm": 1.3179586690006162, "learning_rate": 1.1624129930394432e-05, "loss": 0.3337, "step": 502 }, { "epoch": 0.10512016718913271, "grad_norm": 1.7725688206463563, "learning_rate": 1.1647331786542924e-05, "loss": 0.3537, "step": 503 }, { "epoch": 0.10532915360501567, "grad_norm": 2.3147791010709904, "learning_rate": 1.1670533642691417e-05, "loss": 0.3362, "step": 504 }, { "epoch": 0.10553814002089865, "grad_norm": 2.71137471463255, "learning_rate": 1.1693735498839909e-05, "loss": 0.3587, "step": 505 }, { "epoch": 0.10574712643678161, "grad_norm": 1.6615683432052983, "learning_rate": 1.17169373549884e-05, "loss": 0.3354, "step": 506 }, { "epoch": 0.10595611285266458, "grad_norm": 2.332871191798814, "learning_rate": 1.1740139211136892e-05, "loss": 0.3113, "step": 507 }, { "epoch": 0.10616509926854754, "grad_norm": 3.107614027381329, "learning_rate": 1.1763341067285384e-05, "loss": 0.3303, "step": 508 }, { "epoch": 0.1063740856844305, "grad_norm": 1.7393939638696085, "learning_rate": 1.1786542923433875e-05, "loss": 0.3343, "step": 509 }, { "epoch": 0.10658307210031348, "grad_norm": 1.5462014742862886, "learning_rate": 1.1809744779582369e-05, "loss": 0.3649, "step": 510 }, { "epoch": 0.10679205851619644, "grad_norm": 2.163551942928915, "learning_rate": 1.183294663573086e-05, "loss": 0.3098, "step": 511 }, { "epoch": 0.10700104493207942, "grad_norm": 2.1224415759647743, "learning_rate": 1.1856148491879352e-05, "loss": 0.3307, "step": 512 }, { "epoch": 0.10721003134796238, "grad_norm": 2.0022643307007733, "learning_rate": 1.1879350348027842e-05, "loss": 0.3443, "step": 513 }, { "epoch": 0.10741901776384535, "grad_norm": 1.6999010287668321, "learning_rate": 1.1902552204176334e-05, "loss": 0.3257, "step": 514 }, { "epoch": 0.10762800417972831, "grad_norm": 2.5958199493697567, "learning_rate": 1.1925754060324829e-05, "loss": 0.355, "step": 515 }, { "epoch": 0.10783699059561129, "grad_norm": 1.8267940787867996, "learning_rate": 1.1948955916473319e-05, "loss": 0.3164, "step": 516 }, { "epoch": 0.10804597701149425, "grad_norm": 1.8381843228518222, "learning_rate": 1.197215777262181e-05, "loss": 0.3443, "step": 517 }, { "epoch": 0.10825496342737723, "grad_norm": 1.5734105075752958, "learning_rate": 1.1995359628770302e-05, "loss": 0.33, "step": 518 }, { "epoch": 0.10846394984326019, "grad_norm": 2.0890645567170045, "learning_rate": 1.2018561484918794e-05, "loss": 0.3366, "step": 519 }, { "epoch": 0.10867293625914315, "grad_norm": 2.602924753705637, "learning_rate": 1.2041763341067287e-05, "loss": 0.3181, "step": 520 }, { "epoch": 0.10888192267502612, "grad_norm": 1.8418223383392436, "learning_rate": 1.2064965197215779e-05, "loss": 0.3448, "step": 521 }, { "epoch": 0.10909090909090909, "grad_norm": 1.8541734389485072, "learning_rate": 1.208816705336427e-05, "loss": 0.3372, "step": 522 }, { "epoch": 0.10929989550679206, "grad_norm": 2.0730429002328545, "learning_rate": 1.2111368909512762e-05, "loss": 0.3341, "step": 523 }, { "epoch": 0.10950888192267502, "grad_norm": 2.2203094364254228, "learning_rate": 1.2134570765661254e-05, "loss": 0.3347, "step": 524 }, { "epoch": 0.109717868338558, "grad_norm": 1.7092846126471424, "learning_rate": 1.2157772621809745e-05, "loss": 0.3064, "step": 525 }, { "epoch": 0.10992685475444096, "grad_norm": 2.7174075726627653, "learning_rate": 1.2180974477958239e-05, "loss": 0.3411, "step": 526 }, { "epoch": 0.11013584117032393, "grad_norm": 1.507524796512577, "learning_rate": 1.220417633410673e-05, "loss": 0.2988, "step": 527 }, { "epoch": 0.1103448275862069, "grad_norm": 1.4330120056529476, "learning_rate": 1.2227378190255222e-05, "loss": 0.3505, "step": 528 }, { "epoch": 0.11055381400208987, "grad_norm": 1.8107784679623933, "learning_rate": 1.2250580046403714e-05, "loss": 0.3006, "step": 529 }, { "epoch": 0.11076280041797283, "grad_norm": 2.0805588598703855, "learning_rate": 1.2273781902552204e-05, "loss": 0.3322, "step": 530 }, { "epoch": 0.11097178683385579, "grad_norm": 1.750678601405605, "learning_rate": 1.2296983758700699e-05, "loss": 0.3436, "step": 531 }, { "epoch": 0.11118077324973877, "grad_norm": 1.5814752340831746, "learning_rate": 1.2320185614849189e-05, "loss": 0.3324, "step": 532 }, { "epoch": 0.11138975966562173, "grad_norm": 1.5491413357346, "learning_rate": 1.234338747099768e-05, "loss": 0.3582, "step": 533 }, { "epoch": 0.1115987460815047, "grad_norm": 1.6513509490287595, "learning_rate": 1.2366589327146172e-05, "loss": 0.3504, "step": 534 }, { "epoch": 0.11180773249738767, "grad_norm": 1.438005088182847, "learning_rate": 1.2389791183294664e-05, "loss": 0.3343, "step": 535 }, { "epoch": 0.11201671891327064, "grad_norm": 1.3544905287665772, "learning_rate": 1.2412993039443157e-05, "loss": 0.3286, "step": 536 }, { "epoch": 0.1122257053291536, "grad_norm": 1.9719939344873905, "learning_rate": 1.2436194895591649e-05, "loss": 0.3401, "step": 537 }, { "epoch": 0.11243469174503658, "grad_norm": 1.5453953968511882, "learning_rate": 1.245939675174014e-05, "loss": 0.3041, "step": 538 }, { "epoch": 0.11264367816091954, "grad_norm": 1.87800060283809, "learning_rate": 1.2482598607888632e-05, "loss": 0.356, "step": 539 }, { "epoch": 0.11285266457680251, "grad_norm": 1.3655661319068557, "learning_rate": 1.2505800464037124e-05, "loss": 0.3251, "step": 540 }, { "epoch": 0.11306165099268548, "grad_norm": 1.352304253299697, "learning_rate": 1.2529002320185615e-05, "loss": 0.3307, "step": 541 }, { "epoch": 0.11327063740856844, "grad_norm": 1.5219084096103546, "learning_rate": 1.2552204176334109e-05, "loss": 0.3514, "step": 542 }, { "epoch": 0.11347962382445141, "grad_norm": 1.8908635457415175, "learning_rate": 1.25754060324826e-05, "loss": 0.3364, "step": 543 }, { "epoch": 0.11368861024033437, "grad_norm": 1.6282034894192556, "learning_rate": 1.2598607888631092e-05, "loss": 0.3174, "step": 544 }, { "epoch": 0.11389759665621735, "grad_norm": 1.690502154087474, "learning_rate": 1.2621809744779583e-05, "loss": 0.3314, "step": 545 }, { "epoch": 0.11410658307210031, "grad_norm": 2.3727667524074594, "learning_rate": 1.2645011600928075e-05, "loss": 0.3245, "step": 546 }, { "epoch": 0.11431556948798328, "grad_norm": 1.598933332321659, "learning_rate": 1.2668213457076569e-05, "loss": 0.3564, "step": 547 }, { "epoch": 0.11452455590386625, "grad_norm": 2.211967829684846, "learning_rate": 1.269141531322506e-05, "loss": 0.3278, "step": 548 }, { "epoch": 0.11473354231974922, "grad_norm": 1.6886912467539683, "learning_rate": 1.271461716937355e-05, "loss": 0.3403, "step": 549 }, { "epoch": 0.11494252873563218, "grad_norm": 1.9536936214494183, "learning_rate": 1.2737819025522042e-05, "loss": 0.3169, "step": 550 }, { "epoch": 0.11515151515151516, "grad_norm": 2.094853068080641, "learning_rate": 1.2761020881670533e-05, "loss": 0.357, "step": 551 }, { "epoch": 0.11536050156739812, "grad_norm": 2.2058925652584755, "learning_rate": 1.2784222737819027e-05, "loss": 0.3097, "step": 552 }, { "epoch": 0.11556948798328108, "grad_norm": 2.5374438120418366, "learning_rate": 1.2807424593967518e-05, "loss": 0.3536, "step": 553 }, { "epoch": 0.11577847439916406, "grad_norm": 2.348999027459725, "learning_rate": 1.283062645011601e-05, "loss": 0.3478, "step": 554 }, { "epoch": 0.11598746081504702, "grad_norm": 2.622237678445867, "learning_rate": 1.2853828306264502e-05, "loss": 0.3224, "step": 555 }, { "epoch": 0.11619644723092999, "grad_norm": 1.6400281014742002, "learning_rate": 1.2877030162412993e-05, "loss": 0.3422, "step": 556 }, { "epoch": 0.11640543364681295, "grad_norm": 1.810586865728098, "learning_rate": 1.2900232018561485e-05, "loss": 0.3323, "step": 557 }, { "epoch": 0.11661442006269593, "grad_norm": 1.6243347814925833, "learning_rate": 1.2923433874709978e-05, "loss": 0.3313, "step": 558 }, { "epoch": 0.11682340647857889, "grad_norm": 2.6037839778647998, "learning_rate": 1.294663573085847e-05, "loss": 0.3197, "step": 559 }, { "epoch": 0.11703239289446186, "grad_norm": 2.800540206854006, "learning_rate": 1.2969837587006962e-05, "loss": 0.3056, "step": 560 }, { "epoch": 0.11724137931034483, "grad_norm": 1.5578861560500556, "learning_rate": 1.2993039443155453e-05, "loss": 0.3376, "step": 561 }, { "epoch": 0.1174503657262278, "grad_norm": 2.307368162448858, "learning_rate": 1.3016241299303945e-05, "loss": 0.3079, "step": 562 }, { "epoch": 0.11765935214211076, "grad_norm": 3.65942039247797, "learning_rate": 1.3039443155452438e-05, "loss": 0.3292, "step": 563 }, { "epoch": 0.11786833855799372, "grad_norm": 1.5460224370128066, "learning_rate": 1.306264501160093e-05, "loss": 0.3178, "step": 564 }, { "epoch": 0.1180773249738767, "grad_norm": 3.7163507791076316, "learning_rate": 1.3085846867749422e-05, "loss": 0.3097, "step": 565 }, { "epoch": 0.11828631138975966, "grad_norm": 1.856900354867644, "learning_rate": 1.3109048723897912e-05, "loss": 0.2783, "step": 566 }, { "epoch": 0.11849529780564264, "grad_norm": 1.7309442512578983, "learning_rate": 1.3132250580046403e-05, "loss": 0.3101, "step": 567 }, { "epoch": 0.1187042842215256, "grad_norm": 1.94404388481336, "learning_rate": 1.3155452436194898e-05, "loss": 0.3275, "step": 568 }, { "epoch": 0.11891327063740857, "grad_norm": 2.1976074947922135, "learning_rate": 1.3178654292343388e-05, "loss": 0.333, "step": 569 }, { "epoch": 0.11912225705329153, "grad_norm": 1.8576413205126494, "learning_rate": 1.320185614849188e-05, "loss": 0.2986, "step": 570 }, { "epoch": 0.11933124346917451, "grad_norm": 1.530822999911838, "learning_rate": 1.3225058004640372e-05, "loss": 0.3256, "step": 571 }, { "epoch": 0.11954022988505747, "grad_norm": 1.597556092654986, "learning_rate": 1.3248259860788863e-05, "loss": 0.3263, "step": 572 }, { "epoch": 0.11974921630094044, "grad_norm": 2.2386047687499, "learning_rate": 1.3271461716937355e-05, "loss": 0.2997, "step": 573 }, { "epoch": 0.1199582027168234, "grad_norm": 1.942239126612817, "learning_rate": 1.3294663573085848e-05, "loss": 0.3081, "step": 574 }, { "epoch": 0.12016718913270637, "grad_norm": 1.5918125256198787, "learning_rate": 1.331786542923434e-05, "loss": 0.3157, "step": 575 }, { "epoch": 0.12037617554858934, "grad_norm": 2.23020488139555, "learning_rate": 1.3341067285382832e-05, "loss": 0.3188, "step": 576 }, { "epoch": 0.1205851619644723, "grad_norm": 1.9090016270793373, "learning_rate": 1.3364269141531323e-05, "loss": 0.3131, "step": 577 }, { "epoch": 0.12079414838035528, "grad_norm": 1.5664439920545787, "learning_rate": 1.3387470997679815e-05, "loss": 0.2863, "step": 578 }, { "epoch": 0.12100313479623824, "grad_norm": 1.6725109828356888, "learning_rate": 1.3410672853828308e-05, "loss": 0.3429, "step": 579 }, { "epoch": 0.12121212121212122, "grad_norm": 2.8961302448737887, "learning_rate": 1.34338747099768e-05, "loss": 0.3282, "step": 580 }, { "epoch": 0.12142110762800418, "grad_norm": 1.50335967099704, "learning_rate": 1.3457076566125292e-05, "loss": 0.3256, "step": 581 }, { "epoch": 0.12163009404388715, "grad_norm": 1.4954415961786591, "learning_rate": 1.3480278422273783e-05, "loss": 0.3542, "step": 582 }, { "epoch": 0.12183908045977011, "grad_norm": 1.473581018912084, "learning_rate": 1.3503480278422273e-05, "loss": 0.3052, "step": 583 }, { "epoch": 0.12204806687565309, "grad_norm": 1.914199580890921, "learning_rate": 1.3526682134570768e-05, "loss": 0.3277, "step": 584 }, { "epoch": 0.12225705329153605, "grad_norm": 1.5225585474345702, "learning_rate": 1.354988399071926e-05, "loss": 0.3099, "step": 585 }, { "epoch": 0.12246603970741901, "grad_norm": 1.749095753824108, "learning_rate": 1.357308584686775e-05, "loss": 0.314, "step": 586 }, { "epoch": 0.12267502612330199, "grad_norm": 1.4926629128713071, "learning_rate": 1.3596287703016242e-05, "loss": 0.3258, "step": 587 }, { "epoch": 0.12288401253918495, "grad_norm": 1.6757755877544929, "learning_rate": 1.3619489559164733e-05, "loss": 0.3106, "step": 588 }, { "epoch": 0.12309299895506792, "grad_norm": 1.6526588452325526, "learning_rate": 1.3642691415313227e-05, "loss": 0.3389, "step": 589 }, { "epoch": 0.12330198537095088, "grad_norm": 1.5599733387791082, "learning_rate": 1.3665893271461718e-05, "loss": 0.2975, "step": 590 }, { "epoch": 0.12351097178683386, "grad_norm": 1.5090224780118475, "learning_rate": 1.368909512761021e-05, "loss": 0.3323, "step": 591 }, { "epoch": 0.12371995820271682, "grad_norm": 2.2449691009390564, "learning_rate": 1.3712296983758702e-05, "loss": 0.3576, "step": 592 }, { "epoch": 0.1239289446185998, "grad_norm": 1.6000589197253174, "learning_rate": 1.3735498839907193e-05, "loss": 0.3281, "step": 593 }, { "epoch": 0.12413793103448276, "grad_norm": 1.479253084150457, "learning_rate": 1.3758700696055685e-05, "loss": 0.3264, "step": 594 }, { "epoch": 0.12434691745036573, "grad_norm": 2.3485738288926665, "learning_rate": 1.3781902552204178e-05, "loss": 0.3286, "step": 595 }, { "epoch": 0.1245559038662487, "grad_norm": 2.660254193177636, "learning_rate": 1.380510440835267e-05, "loss": 0.3233, "step": 596 }, { "epoch": 0.12476489028213165, "grad_norm": 1.6978993810933558, "learning_rate": 1.3828306264501162e-05, "loss": 0.3098, "step": 597 }, { "epoch": 0.12497387669801463, "grad_norm": 1.865209820662916, "learning_rate": 1.3851508120649653e-05, "loss": 0.3422, "step": 598 }, { "epoch": 0.1251828631138976, "grad_norm": 2.2230613911529296, "learning_rate": 1.3874709976798145e-05, "loss": 0.3116, "step": 599 }, { "epoch": 0.12539184952978055, "grad_norm": 1.5038566776289115, "learning_rate": 1.3897911832946638e-05, "loss": 0.3538, "step": 600 }, { "epoch": 0.12560083594566354, "grad_norm": 2.302112711997953, "learning_rate": 1.392111368909513e-05, "loss": 0.3199, "step": 601 }, { "epoch": 0.1258098223615465, "grad_norm": 1.5691418887431596, "learning_rate": 1.3944315545243622e-05, "loss": 0.33, "step": 602 }, { "epoch": 0.12601880877742946, "grad_norm": 1.3761773947142673, "learning_rate": 1.3967517401392111e-05, "loss": 0.3269, "step": 603 }, { "epoch": 0.12622779519331243, "grad_norm": 2.193707572906311, "learning_rate": 1.3990719257540603e-05, "loss": 0.3094, "step": 604 }, { "epoch": 0.12643678160919541, "grad_norm": 1.9939369501691382, "learning_rate": 1.4013921113689098e-05, "loss": 0.3097, "step": 605 }, { "epoch": 0.12664576802507838, "grad_norm": 1.7367276114532284, "learning_rate": 1.4037122969837588e-05, "loss": 0.3392, "step": 606 }, { "epoch": 0.12685475444096134, "grad_norm": 1.5846688688151194, "learning_rate": 1.406032482598608e-05, "loss": 0.3474, "step": 607 }, { "epoch": 0.1270637408568443, "grad_norm": 1.5534151126316271, "learning_rate": 1.4083526682134571e-05, "loss": 0.3292, "step": 608 }, { "epoch": 0.12727272727272726, "grad_norm": 1.9434969020043271, "learning_rate": 1.4106728538283063e-05, "loss": 0.299, "step": 609 }, { "epoch": 0.12748171368861025, "grad_norm": 1.7452551849302407, "learning_rate": 1.4129930394431555e-05, "loss": 0.3186, "step": 610 }, { "epoch": 0.1276907001044932, "grad_norm": 1.747513609354159, "learning_rate": 1.4153132250580048e-05, "loss": 0.3177, "step": 611 }, { "epoch": 0.12789968652037617, "grad_norm": 1.685222308772567, "learning_rate": 1.417633410672854e-05, "loss": 0.3047, "step": 612 }, { "epoch": 0.12810867293625913, "grad_norm": 1.773247013545785, "learning_rate": 1.4199535962877031e-05, "loss": 0.3326, "step": 613 }, { "epoch": 0.12831765935214212, "grad_norm": 2.2778324343786966, "learning_rate": 1.4222737819025523e-05, "loss": 0.3123, "step": 614 }, { "epoch": 0.12852664576802508, "grad_norm": 1.8054872591081887, "learning_rate": 1.4245939675174015e-05, "loss": 0.3136, "step": 615 }, { "epoch": 0.12873563218390804, "grad_norm": 4.311370788543365, "learning_rate": 1.4269141531322508e-05, "loss": 0.3057, "step": 616 }, { "epoch": 0.128944618599791, "grad_norm": 2.3382309064653124, "learning_rate": 1.4292343387471e-05, "loss": 0.3175, "step": 617 }, { "epoch": 0.129153605015674, "grad_norm": 1.6604600720778362, "learning_rate": 1.4315545243619491e-05, "loss": 0.2952, "step": 618 }, { "epoch": 0.12936259143155696, "grad_norm": 2.0158196999052294, "learning_rate": 1.4338747099767983e-05, "loss": 0.3241, "step": 619 }, { "epoch": 0.12957157784743992, "grad_norm": 1.6787363872280672, "learning_rate": 1.4361948955916473e-05, "loss": 0.3607, "step": 620 }, { "epoch": 0.12978056426332288, "grad_norm": 1.998421637346336, "learning_rate": 1.4385150812064968e-05, "loss": 0.3038, "step": 621 }, { "epoch": 0.12998955067920584, "grad_norm": 1.4878025728868232, "learning_rate": 1.440835266821346e-05, "loss": 0.3041, "step": 622 }, { "epoch": 0.13019853709508883, "grad_norm": 1.5762012798053484, "learning_rate": 1.443155452436195e-05, "loss": 0.3283, "step": 623 }, { "epoch": 0.1304075235109718, "grad_norm": 1.4826163222558846, "learning_rate": 1.4454756380510441e-05, "loss": 0.3363, "step": 624 }, { "epoch": 0.13061650992685475, "grad_norm": 1.8063537657086277, "learning_rate": 1.4477958236658933e-05, "loss": 0.2885, "step": 625 }, { "epoch": 0.1308254963427377, "grad_norm": 1.7479305614659155, "learning_rate": 1.4501160092807425e-05, "loss": 0.3377, "step": 626 }, { "epoch": 0.1310344827586207, "grad_norm": 1.791092417323287, "learning_rate": 1.4524361948955918e-05, "loss": 0.3072, "step": 627 }, { "epoch": 0.13124346917450366, "grad_norm": 1.5645040006973017, "learning_rate": 1.454756380510441e-05, "loss": 0.2712, "step": 628 }, { "epoch": 0.13145245559038662, "grad_norm": 1.6058292402513805, "learning_rate": 1.4570765661252901e-05, "loss": 0.2971, "step": 629 }, { "epoch": 0.13166144200626959, "grad_norm": 1.7727458115300125, "learning_rate": 1.4593967517401393e-05, "loss": 0.3081, "step": 630 }, { "epoch": 0.13187042842215255, "grad_norm": 1.9313381865821955, "learning_rate": 1.4617169373549885e-05, "loss": 0.3302, "step": 631 }, { "epoch": 0.13207941483803554, "grad_norm": 2.5879485726148705, "learning_rate": 1.4640371229698378e-05, "loss": 0.324, "step": 632 }, { "epoch": 0.1322884012539185, "grad_norm": 1.6026057308761354, "learning_rate": 1.466357308584687e-05, "loss": 0.301, "step": 633 }, { "epoch": 0.13249738766980146, "grad_norm": 2.861934405717852, "learning_rate": 1.4686774941995361e-05, "loss": 0.3339, "step": 634 }, { "epoch": 0.13270637408568442, "grad_norm": 2.282580926166304, "learning_rate": 1.4709976798143853e-05, "loss": 0.3741, "step": 635 }, { "epoch": 0.1329153605015674, "grad_norm": 1.981471962492577, "learning_rate": 1.4733178654292345e-05, "loss": 0.3185, "step": 636 }, { "epoch": 0.13312434691745037, "grad_norm": 1.5339374067443807, "learning_rate": 1.4756380510440838e-05, "loss": 0.3325, "step": 637 }, { "epoch": 0.13333333333333333, "grad_norm": 3.2289309509192963, "learning_rate": 1.477958236658933e-05, "loss": 0.3201, "step": 638 }, { "epoch": 0.1335423197492163, "grad_norm": 3.0144877756791617, "learning_rate": 1.4802784222737821e-05, "loss": 0.3413, "step": 639 }, { "epoch": 0.13375130616509928, "grad_norm": 1.3886180510230655, "learning_rate": 1.4825986078886311e-05, "loss": 0.3452, "step": 640 }, { "epoch": 0.13396029258098224, "grad_norm": 2.1834933318881253, "learning_rate": 1.4849187935034803e-05, "loss": 0.333, "step": 641 }, { "epoch": 0.1341692789968652, "grad_norm": 2.7336485589588455, "learning_rate": 1.4872389791183295e-05, "loss": 0.3143, "step": 642 }, { "epoch": 0.13437826541274817, "grad_norm": 2.7601633126148957, "learning_rate": 1.4895591647331788e-05, "loss": 0.3383, "step": 643 }, { "epoch": 0.13458725182863113, "grad_norm": 1.3568045195762455, "learning_rate": 1.491879350348028e-05, "loss": 0.3107, "step": 644 }, { "epoch": 0.13479623824451412, "grad_norm": 2.173735978721389, "learning_rate": 1.4941995359628771e-05, "loss": 0.3249, "step": 645 }, { "epoch": 0.13500522466039708, "grad_norm": 2.332194435221939, "learning_rate": 1.4965197215777263e-05, "loss": 0.3177, "step": 646 }, { "epoch": 0.13521421107628004, "grad_norm": 2.0485940842305403, "learning_rate": 1.4988399071925755e-05, "loss": 0.3654, "step": 647 }, { "epoch": 0.135423197492163, "grad_norm": 1.4930951481718582, "learning_rate": 1.5011600928074248e-05, "loss": 0.3582, "step": 648 }, { "epoch": 0.135632183908046, "grad_norm": 2.000312012239388, "learning_rate": 1.503480278422274e-05, "loss": 0.3323, "step": 649 }, { "epoch": 0.13584117032392895, "grad_norm": 2.2358128462468483, "learning_rate": 1.5058004640371231e-05, "loss": 0.295, "step": 650 }, { "epoch": 0.1360501567398119, "grad_norm": 2.0915881225076216, "learning_rate": 1.5081206496519723e-05, "loss": 0.3317, "step": 651 }, { "epoch": 0.13625914315569487, "grad_norm": 2.0787249837374127, "learning_rate": 1.5104408352668215e-05, "loss": 0.301, "step": 652 }, { "epoch": 0.13646812957157783, "grad_norm": 1.5608120608104759, "learning_rate": 1.5127610208816708e-05, "loss": 0.3192, "step": 653 }, { "epoch": 0.13667711598746082, "grad_norm": 1.7922366378020576, "learning_rate": 1.51508120649652e-05, "loss": 0.3281, "step": 654 }, { "epoch": 0.13688610240334378, "grad_norm": 2.295889956832159, "learning_rate": 1.5174013921113691e-05, "loss": 0.3399, "step": 655 }, { "epoch": 0.13709508881922675, "grad_norm": 1.8277717942446914, "learning_rate": 1.5197215777262181e-05, "loss": 0.3339, "step": 656 }, { "epoch": 0.1373040752351097, "grad_norm": 1.3882013427556295, "learning_rate": 1.5220417633410673e-05, "loss": 0.2983, "step": 657 }, { "epoch": 0.1375130616509927, "grad_norm": 2.148818899192603, "learning_rate": 1.5243619489559164e-05, "loss": 0.3406, "step": 658 }, { "epoch": 0.13772204806687566, "grad_norm": 2.601729781044458, "learning_rate": 1.526682134570766e-05, "loss": 0.3072, "step": 659 }, { "epoch": 0.13793103448275862, "grad_norm": 1.5779467511681495, "learning_rate": 1.529002320185615e-05, "loss": 0.3243, "step": 660 }, { "epoch": 0.13814002089864158, "grad_norm": 1.964349820433868, "learning_rate": 1.5313225058004643e-05, "loss": 0.3026, "step": 661 }, { "epoch": 0.13834900731452457, "grad_norm": 2.6942343411279754, "learning_rate": 1.5336426914153134e-05, "loss": 0.3128, "step": 662 }, { "epoch": 0.13855799373040753, "grad_norm": 2.5503347518966937, "learning_rate": 1.5359628770301626e-05, "loss": 0.315, "step": 663 }, { "epoch": 0.1387669801462905, "grad_norm": 1.4959009723062877, "learning_rate": 1.5382830626450118e-05, "loss": 0.3491, "step": 664 }, { "epoch": 0.13897596656217345, "grad_norm": 2.983085272173498, "learning_rate": 1.540603248259861e-05, "loss": 0.3234, "step": 665 }, { "epoch": 0.13918495297805641, "grad_norm": 2.3976110453174755, "learning_rate": 1.54292343387471e-05, "loss": 0.3189, "step": 666 }, { "epoch": 0.1393939393939394, "grad_norm": 1.4235301257116666, "learning_rate": 1.5452436194895593e-05, "loss": 0.3374, "step": 667 }, { "epoch": 0.13960292580982236, "grad_norm": 1.4712732537220905, "learning_rate": 1.5475638051044084e-05, "loss": 0.3314, "step": 668 }, { "epoch": 0.13981191222570533, "grad_norm": 2.4704208921696673, "learning_rate": 1.5498839907192576e-05, "loss": 0.3078, "step": 669 }, { "epoch": 0.1400208986415883, "grad_norm": 2.5580192993506907, "learning_rate": 1.5522041763341068e-05, "loss": 0.3085, "step": 670 }, { "epoch": 0.14022988505747128, "grad_norm": 1.2697743726992348, "learning_rate": 1.554524361948956e-05, "loss": 0.3184, "step": 671 }, { "epoch": 0.14043887147335424, "grad_norm": 1.8433338372953891, "learning_rate": 1.556844547563805e-05, "loss": 0.3304, "step": 672 }, { "epoch": 0.1406478578892372, "grad_norm": 2.911507971962662, "learning_rate": 1.5591647331786543e-05, "loss": 0.3211, "step": 673 }, { "epoch": 0.14085684430512016, "grad_norm": 1.8858129746920993, "learning_rate": 1.5614849187935034e-05, "loss": 0.3172, "step": 674 }, { "epoch": 0.14106583072100312, "grad_norm": 1.2822649051647133, "learning_rate": 1.563805104408353e-05, "loss": 0.2954, "step": 675 }, { "epoch": 0.1412748171368861, "grad_norm": 2.1728584331150484, "learning_rate": 1.566125290023202e-05, "loss": 0.325, "step": 676 }, { "epoch": 0.14148380355276907, "grad_norm": 2.42343432808741, "learning_rate": 1.5684454756380513e-05, "loss": 0.3331, "step": 677 }, { "epoch": 0.14169278996865203, "grad_norm": 1.5525430196226875, "learning_rate": 1.5707656612529004e-05, "loss": 0.3079, "step": 678 }, { "epoch": 0.141901776384535, "grad_norm": 1.2773858226403583, "learning_rate": 1.5730858468677496e-05, "loss": 0.3086, "step": 679 }, { "epoch": 0.14211076280041798, "grad_norm": 2.0212084267308854, "learning_rate": 1.5754060324825988e-05, "loss": 0.2885, "step": 680 }, { "epoch": 0.14231974921630094, "grad_norm": 1.6584874389542636, "learning_rate": 1.577726218097448e-05, "loss": 0.3366, "step": 681 }, { "epoch": 0.1425287356321839, "grad_norm": 1.6805285414529554, "learning_rate": 1.580046403712297e-05, "loss": 0.3167, "step": 682 }, { "epoch": 0.14273772204806687, "grad_norm": 1.5166398699241062, "learning_rate": 1.5823665893271463e-05, "loss": 0.3137, "step": 683 }, { "epoch": 0.14294670846394986, "grad_norm": 1.8506315298647917, "learning_rate": 1.5846867749419954e-05, "loss": 0.3061, "step": 684 }, { "epoch": 0.14315569487983282, "grad_norm": 2.171331720925884, "learning_rate": 1.587006960556845e-05, "loss": 0.34, "step": 685 }, { "epoch": 0.14336468129571578, "grad_norm": 1.3585475146623491, "learning_rate": 1.5893271461716938e-05, "loss": 0.3237, "step": 686 }, { "epoch": 0.14357366771159874, "grad_norm": 2.37182428414345, "learning_rate": 1.591647331786543e-05, "loss": 0.3342, "step": 687 }, { "epoch": 0.1437826541274817, "grad_norm": 2.382208993123097, "learning_rate": 1.593967517401392e-05, "loss": 0.3389, "step": 688 }, { "epoch": 0.1439916405433647, "grad_norm": 1.6241935239750998, "learning_rate": 1.5962877030162413e-05, "loss": 0.3443, "step": 689 }, { "epoch": 0.14420062695924765, "grad_norm": 2.37425173651259, "learning_rate": 1.5986078886310904e-05, "loss": 0.3259, "step": 690 }, { "epoch": 0.1444096133751306, "grad_norm": 2.2949571769697488, "learning_rate": 1.60092807424594e-05, "loss": 0.2932, "step": 691 }, { "epoch": 0.14461859979101357, "grad_norm": 1.4508489515192349, "learning_rate": 1.603248259860789e-05, "loss": 0.3186, "step": 692 }, { "epoch": 0.14482758620689656, "grad_norm": 1.2608567135348556, "learning_rate": 1.6055684454756383e-05, "loss": 0.3189, "step": 693 }, { "epoch": 0.14503657262277952, "grad_norm": 1.9147124687125268, "learning_rate": 1.6078886310904874e-05, "loss": 0.3093, "step": 694 }, { "epoch": 0.1452455590386625, "grad_norm": 1.5440624451776752, "learning_rate": 1.6102088167053366e-05, "loss": 0.3235, "step": 695 }, { "epoch": 0.14545454545454545, "grad_norm": 1.6427333330516838, "learning_rate": 1.6125290023201858e-05, "loss": 0.3341, "step": 696 }, { "epoch": 0.1456635318704284, "grad_norm": 1.6663417883594116, "learning_rate": 1.614849187935035e-05, "loss": 0.3053, "step": 697 }, { "epoch": 0.1458725182863114, "grad_norm": 2.2668395500009755, "learning_rate": 1.617169373549884e-05, "loss": 0.3582, "step": 698 }, { "epoch": 0.14608150470219436, "grad_norm": 1.4638421333689702, "learning_rate": 1.6194895591647333e-05, "loss": 0.3107, "step": 699 }, { "epoch": 0.14629049111807732, "grad_norm": 1.34017198212224, "learning_rate": 1.6218097447795824e-05, "loss": 0.3201, "step": 700 }, { "epoch": 0.14649947753396028, "grad_norm": 1.6801819357700758, "learning_rate": 1.624129930394432e-05, "loss": 0.3107, "step": 701 }, { "epoch": 0.14670846394984327, "grad_norm": 1.9336657478747403, "learning_rate": 1.626450116009281e-05, "loss": 0.3135, "step": 702 }, { "epoch": 0.14691745036572623, "grad_norm": 1.830854499463351, "learning_rate": 1.62877030162413e-05, "loss": 0.3273, "step": 703 }, { "epoch": 0.1471264367816092, "grad_norm": 2.568264120537481, "learning_rate": 1.631090487238979e-05, "loss": 0.3252, "step": 704 }, { "epoch": 0.14733542319749215, "grad_norm": 2.534436717585079, "learning_rate": 1.6334106728538283e-05, "loss": 0.3647, "step": 705 }, { "epoch": 0.14754440961337514, "grad_norm": 1.940929776109147, "learning_rate": 1.6357308584686774e-05, "loss": 0.3267, "step": 706 }, { "epoch": 0.1477533960292581, "grad_norm": 1.341078621019632, "learning_rate": 1.638051044083527e-05, "loss": 0.3292, "step": 707 }, { "epoch": 0.14796238244514107, "grad_norm": 1.987164491708874, "learning_rate": 1.640371229698376e-05, "loss": 0.3222, "step": 708 }, { "epoch": 0.14817136886102403, "grad_norm": 1.3215572039635215, "learning_rate": 1.6426914153132253e-05, "loss": 0.3261, "step": 709 }, { "epoch": 0.148380355276907, "grad_norm": 2.069485803292393, "learning_rate": 1.6450116009280744e-05, "loss": 0.3083, "step": 710 }, { "epoch": 0.14858934169278998, "grad_norm": 1.546114765369134, "learning_rate": 1.6473317865429236e-05, "loss": 0.2983, "step": 711 }, { "epoch": 0.14879832810867294, "grad_norm": 1.8206542432105628, "learning_rate": 1.6496519721577728e-05, "loss": 0.303, "step": 712 }, { "epoch": 0.1490073145245559, "grad_norm": 1.5111502940463621, "learning_rate": 1.651972157772622e-05, "loss": 0.3012, "step": 713 }, { "epoch": 0.14921630094043886, "grad_norm": 1.3785042172379396, "learning_rate": 1.654292343387471e-05, "loss": 0.3218, "step": 714 }, { "epoch": 0.14942528735632185, "grad_norm": 1.6011513035568203, "learning_rate": 1.6566125290023202e-05, "loss": 0.3091, "step": 715 }, { "epoch": 0.1496342737722048, "grad_norm": 1.8257250003139138, "learning_rate": 1.6589327146171694e-05, "loss": 0.3361, "step": 716 }, { "epoch": 0.14984326018808777, "grad_norm": 1.6843453480814723, "learning_rate": 1.661252900232019e-05, "loss": 0.3, "step": 717 }, { "epoch": 0.15005224660397073, "grad_norm": 1.5158459694843607, "learning_rate": 1.663573085846868e-05, "loss": 0.3407, "step": 718 }, { "epoch": 0.1502612330198537, "grad_norm": 1.522025009160265, "learning_rate": 1.6658932714617173e-05, "loss": 0.3052, "step": 719 }, { "epoch": 0.15047021943573669, "grad_norm": 1.8949447113831355, "learning_rate": 1.668213457076566e-05, "loss": 0.3178, "step": 720 }, { "epoch": 0.15067920585161965, "grad_norm": 1.3589254905136587, "learning_rate": 1.6705336426914152e-05, "loss": 0.3111, "step": 721 }, { "epoch": 0.1508881922675026, "grad_norm": 1.4988672025713587, "learning_rate": 1.6728538283062647e-05, "loss": 0.3245, "step": 722 }, { "epoch": 0.15109717868338557, "grad_norm": 1.9477441605897126, "learning_rate": 1.675174013921114e-05, "loss": 0.3146, "step": 723 }, { "epoch": 0.15130616509926856, "grad_norm": 1.5544428118873572, "learning_rate": 1.677494199535963e-05, "loss": 0.3354, "step": 724 }, { "epoch": 0.15151515151515152, "grad_norm": 1.2608535868166928, "learning_rate": 1.6798143851508122e-05, "loss": 0.2795, "step": 725 }, { "epoch": 0.15172413793103448, "grad_norm": 1.359474925543152, "learning_rate": 1.6821345707656614e-05, "loss": 0.3542, "step": 726 }, { "epoch": 0.15193312434691744, "grad_norm": 1.4745161027892395, "learning_rate": 1.6844547563805106e-05, "loss": 0.319, "step": 727 }, { "epoch": 0.15214211076280043, "grad_norm": 1.658694495352829, "learning_rate": 1.6867749419953597e-05, "loss": 0.3603, "step": 728 }, { "epoch": 0.1523510971786834, "grad_norm": 2.2525619004622595, "learning_rate": 1.689095127610209e-05, "loss": 0.3127, "step": 729 }, { "epoch": 0.15256008359456635, "grad_norm": 1.7454261117917755, "learning_rate": 1.691415313225058e-05, "loss": 0.3273, "step": 730 }, { "epoch": 0.15276907001044931, "grad_norm": 1.328559602625065, "learning_rate": 1.6937354988399072e-05, "loss": 0.3048, "step": 731 }, { "epoch": 0.15297805642633228, "grad_norm": 1.3080011777224796, "learning_rate": 1.6960556844547564e-05, "loss": 0.2945, "step": 732 }, { "epoch": 0.15318704284221527, "grad_norm": 1.4663826518973193, "learning_rate": 1.698375870069606e-05, "loss": 0.3042, "step": 733 }, { "epoch": 0.15339602925809823, "grad_norm": 1.3464085520959987, "learning_rate": 1.700696055684455e-05, "loss": 0.3586, "step": 734 }, { "epoch": 0.1536050156739812, "grad_norm": 1.8510421288735655, "learning_rate": 1.7030162412993042e-05, "loss": 0.3256, "step": 735 }, { "epoch": 0.15381400208986415, "grad_norm": 1.7207330502296951, "learning_rate": 1.7053364269141534e-05, "loss": 0.3218, "step": 736 }, { "epoch": 0.15402298850574714, "grad_norm": 1.8056825928151203, "learning_rate": 1.7076566125290022e-05, "loss": 0.3246, "step": 737 }, { "epoch": 0.1542319749216301, "grad_norm": 1.4565908585495368, "learning_rate": 1.7099767981438517e-05, "loss": 0.3184, "step": 738 }, { "epoch": 0.15444096133751306, "grad_norm": 1.4547854970799563, "learning_rate": 1.712296983758701e-05, "loss": 0.3415, "step": 739 }, { "epoch": 0.15464994775339602, "grad_norm": 1.4253611559009232, "learning_rate": 1.71461716937355e-05, "loss": 0.3124, "step": 740 }, { "epoch": 0.15485893416927898, "grad_norm": 2.117783980234564, "learning_rate": 1.7169373549883992e-05, "loss": 0.329, "step": 741 }, { "epoch": 0.15506792058516197, "grad_norm": 1.6721664783575045, "learning_rate": 1.7192575406032484e-05, "loss": 0.3167, "step": 742 }, { "epoch": 0.15527690700104493, "grad_norm": 1.2420949011770837, "learning_rate": 1.7215777262180976e-05, "loss": 0.3125, "step": 743 }, { "epoch": 0.1554858934169279, "grad_norm": 1.5619049081958376, "learning_rate": 1.7238979118329467e-05, "loss": 0.32, "step": 744 }, { "epoch": 0.15569487983281086, "grad_norm": 1.57974270692075, "learning_rate": 1.726218097447796e-05, "loss": 0.3562, "step": 745 }, { "epoch": 0.15590386624869385, "grad_norm": 1.4648294045808363, "learning_rate": 1.728538283062645e-05, "loss": 0.3151, "step": 746 }, { "epoch": 0.1561128526645768, "grad_norm": 1.5516515110645133, "learning_rate": 1.7308584686774942e-05, "loss": 0.3151, "step": 747 }, { "epoch": 0.15632183908045977, "grad_norm": 1.4823365162009994, "learning_rate": 1.7331786542923434e-05, "loss": 0.3292, "step": 748 }, { "epoch": 0.15653082549634273, "grad_norm": 1.582170689124211, "learning_rate": 1.735498839907193e-05, "loss": 0.3352, "step": 749 }, { "epoch": 0.15673981191222572, "grad_norm": 1.7134692553751663, "learning_rate": 1.737819025522042e-05, "loss": 0.3298, "step": 750 }, { "epoch": 0.15694879832810868, "grad_norm": 1.4361012099495416, "learning_rate": 1.7401392111368912e-05, "loss": 0.3001, "step": 751 }, { "epoch": 0.15715778474399164, "grad_norm": 1.3530791390317574, "learning_rate": 1.7424593967517404e-05, "loss": 0.2942, "step": 752 }, { "epoch": 0.1573667711598746, "grad_norm": 1.9838727871198751, "learning_rate": 1.7447795823665896e-05, "loss": 0.2924, "step": 753 }, { "epoch": 0.15757575757575756, "grad_norm": 1.6398071249171295, "learning_rate": 1.7470997679814387e-05, "loss": 0.3158, "step": 754 }, { "epoch": 0.15778474399164055, "grad_norm": 2.0318726372729405, "learning_rate": 1.749419953596288e-05, "loss": 0.3466, "step": 755 }, { "epoch": 0.1579937304075235, "grad_norm": 1.6289447889562227, "learning_rate": 1.751740139211137e-05, "loss": 0.308, "step": 756 }, { "epoch": 0.15820271682340648, "grad_norm": 1.6630525491228054, "learning_rate": 1.7540603248259862e-05, "loss": 0.3107, "step": 757 }, { "epoch": 0.15841170323928944, "grad_norm": 3.210353457201348, "learning_rate": 1.7563805104408354e-05, "loss": 0.338, "step": 758 }, { "epoch": 0.15862068965517243, "grad_norm": 1.621126015636725, "learning_rate": 1.7587006960556846e-05, "loss": 0.3025, "step": 759 }, { "epoch": 0.1588296760710554, "grad_norm": 1.6774691762675085, "learning_rate": 1.7610208816705337e-05, "loss": 0.29, "step": 760 }, { "epoch": 0.15903866248693835, "grad_norm": 3.108643712445238, "learning_rate": 1.763341067285383e-05, "loss": 0.3591, "step": 761 }, { "epoch": 0.1592476489028213, "grad_norm": 2.090559950071537, "learning_rate": 1.765661252900232e-05, "loss": 0.3337, "step": 762 }, { "epoch": 0.15945663531870427, "grad_norm": 1.7186714296224042, "learning_rate": 1.7679814385150812e-05, "loss": 0.3344, "step": 763 }, { "epoch": 0.15966562173458726, "grad_norm": 2.146354894835013, "learning_rate": 1.7703016241299304e-05, "loss": 0.3321, "step": 764 }, { "epoch": 0.15987460815047022, "grad_norm": 1.4828071377205263, "learning_rate": 1.77262180974478e-05, "loss": 0.3335, "step": 765 }, { "epoch": 0.16008359456635318, "grad_norm": 1.6257042551938041, "learning_rate": 1.774941995359629e-05, "loss": 0.3138, "step": 766 }, { "epoch": 0.16029258098223614, "grad_norm": 1.4669788180321943, "learning_rate": 1.7772621809744782e-05, "loss": 0.2994, "step": 767 }, { "epoch": 0.16050156739811913, "grad_norm": 1.559693161941828, "learning_rate": 1.7795823665893274e-05, "loss": 0.2945, "step": 768 }, { "epoch": 0.1607105538140021, "grad_norm": 1.663968161917605, "learning_rate": 1.7819025522041766e-05, "loss": 0.3574, "step": 769 }, { "epoch": 0.16091954022988506, "grad_norm": 1.7607319850433287, "learning_rate": 1.7842227378190257e-05, "loss": 0.3324, "step": 770 }, { "epoch": 0.16112852664576802, "grad_norm": 1.5146498447182097, "learning_rate": 1.786542923433875e-05, "loss": 0.3155, "step": 771 }, { "epoch": 0.161337513061651, "grad_norm": 2.6230894262900697, "learning_rate": 1.788863109048724e-05, "loss": 0.3479, "step": 772 }, { "epoch": 0.16154649947753397, "grad_norm": 1.4408820360585688, "learning_rate": 1.7911832946635732e-05, "loss": 0.3288, "step": 773 }, { "epoch": 0.16175548589341693, "grad_norm": 1.651190403612862, "learning_rate": 1.7935034802784224e-05, "loss": 0.3481, "step": 774 }, { "epoch": 0.1619644723092999, "grad_norm": 1.4833112041256022, "learning_rate": 1.7958236658932715e-05, "loss": 0.2804, "step": 775 }, { "epoch": 0.16217345872518285, "grad_norm": 1.5054951811365078, "learning_rate": 1.7981438515081207e-05, "loss": 0.3107, "step": 776 }, { "epoch": 0.16238244514106584, "grad_norm": 1.7055524720442286, "learning_rate": 1.80046403712297e-05, "loss": 0.3084, "step": 777 }, { "epoch": 0.1625914315569488, "grad_norm": 1.9910208442595694, "learning_rate": 1.802784222737819e-05, "loss": 0.294, "step": 778 }, { "epoch": 0.16280041797283176, "grad_norm": 1.8477319768678706, "learning_rate": 1.8051044083526682e-05, "loss": 0.3309, "step": 779 }, { "epoch": 0.16300940438871472, "grad_norm": 1.7629594145335075, "learning_rate": 1.8074245939675174e-05, "loss": 0.318, "step": 780 }, { "epoch": 0.1632183908045977, "grad_norm": 2.497494590400967, "learning_rate": 1.809744779582367e-05, "loss": 0.3064, "step": 781 }, { "epoch": 0.16342737722048067, "grad_norm": 1.424395433047386, "learning_rate": 1.812064965197216e-05, "loss": 0.305, "step": 782 }, { "epoch": 0.16363636363636364, "grad_norm": 2.772016652394744, "learning_rate": 1.8143851508120652e-05, "loss": 0.3032, "step": 783 }, { "epoch": 0.1638453500522466, "grad_norm": 2.6832577438940963, "learning_rate": 1.8167053364269144e-05, "loss": 0.309, "step": 784 }, { "epoch": 0.16405433646812956, "grad_norm": 1.4294629845420825, "learning_rate": 1.8190255220417635e-05, "loss": 0.2892, "step": 785 }, { "epoch": 0.16426332288401255, "grad_norm": 1.6794040658556177, "learning_rate": 1.8213457076566127e-05, "loss": 0.3194, "step": 786 }, { "epoch": 0.1644723092998955, "grad_norm": 1.6550893722671376, "learning_rate": 1.823665893271462e-05, "loss": 0.3038, "step": 787 }, { "epoch": 0.16468129571577847, "grad_norm": 2.23948249039861, "learning_rate": 1.825986078886311e-05, "loss": 0.3095, "step": 788 }, { "epoch": 0.16489028213166143, "grad_norm": 2.137677267375945, "learning_rate": 1.8283062645011602e-05, "loss": 0.306, "step": 789 }, { "epoch": 0.16509926854754442, "grad_norm": 1.788835713006021, "learning_rate": 1.8306264501160094e-05, "loss": 0.3329, "step": 790 }, { "epoch": 0.16530825496342738, "grad_norm": 3.551732016334007, "learning_rate": 1.8329466357308585e-05, "loss": 0.3211, "step": 791 }, { "epoch": 0.16551724137931034, "grad_norm": 2.1267088561617915, "learning_rate": 1.835266821345708e-05, "loss": 0.2796, "step": 792 }, { "epoch": 0.1657262277951933, "grad_norm": 1.725559611919673, "learning_rate": 1.837587006960557e-05, "loss": 0.3149, "step": 793 }, { "epoch": 0.1659352142110763, "grad_norm": 2.159718982507921, "learning_rate": 1.839907192575406e-05, "loss": 0.3111, "step": 794 }, { "epoch": 0.16614420062695925, "grad_norm": 1.9668973463199593, "learning_rate": 1.8422273781902552e-05, "loss": 0.3448, "step": 795 }, { "epoch": 0.16635318704284222, "grad_norm": 2.23510551609259, "learning_rate": 1.8445475638051044e-05, "loss": 0.3018, "step": 796 }, { "epoch": 0.16656217345872518, "grad_norm": 2.0404919178881937, "learning_rate": 1.846867749419954e-05, "loss": 0.3137, "step": 797 }, { "epoch": 0.16677115987460814, "grad_norm": 1.9668893961583689, "learning_rate": 1.849187935034803e-05, "loss": 0.3189, "step": 798 }, { "epoch": 0.16698014629049113, "grad_norm": 2.7475798139385508, "learning_rate": 1.8515081206496522e-05, "loss": 0.325, "step": 799 }, { "epoch": 0.1671891327063741, "grad_norm": 1.6358375866226478, "learning_rate": 1.8538283062645014e-05, "loss": 0.3233, "step": 800 }, { "epoch": 0.16739811912225705, "grad_norm": 1.4871132750091558, "learning_rate": 1.8561484918793505e-05, "loss": 0.2976, "step": 801 }, { "epoch": 0.16760710553814, "grad_norm": 2.0090584833345577, "learning_rate": 1.8584686774941997e-05, "loss": 0.3368, "step": 802 }, { "epoch": 0.167816091954023, "grad_norm": 1.3489211452058292, "learning_rate": 1.860788863109049e-05, "loss": 0.3224, "step": 803 }, { "epoch": 0.16802507836990596, "grad_norm": 3.6726412311237877, "learning_rate": 1.863109048723898e-05, "loss": 0.3358, "step": 804 }, { "epoch": 0.16823406478578892, "grad_norm": 1.474008050521873, "learning_rate": 1.8654292343387472e-05, "loss": 0.3275, "step": 805 }, { "epoch": 0.16844305120167188, "grad_norm": 2.184661883140169, "learning_rate": 1.8677494199535964e-05, "loss": 0.321, "step": 806 }, { "epoch": 0.16865203761755485, "grad_norm": 1.4679072499725352, "learning_rate": 1.8700696055684455e-05, "loss": 0.3014, "step": 807 }, { "epoch": 0.16886102403343783, "grad_norm": 1.5721301937407235, "learning_rate": 1.872389791183295e-05, "loss": 0.3186, "step": 808 }, { "epoch": 0.1690700104493208, "grad_norm": 1.4870270085004464, "learning_rate": 1.8747099767981442e-05, "loss": 0.304, "step": 809 }, { "epoch": 0.16927899686520376, "grad_norm": 1.3856792779757443, "learning_rate": 1.877030162412993e-05, "loss": 0.2851, "step": 810 }, { "epoch": 0.16948798328108672, "grad_norm": 1.7090028971893814, "learning_rate": 1.8793503480278422e-05, "loss": 0.305, "step": 811 }, { "epoch": 0.1696969696969697, "grad_norm": 1.6329479055767304, "learning_rate": 1.8816705336426914e-05, "loss": 0.3423, "step": 812 }, { "epoch": 0.16990595611285267, "grad_norm": 1.5493165997736031, "learning_rate": 1.883990719257541e-05, "loss": 0.3263, "step": 813 }, { "epoch": 0.17011494252873563, "grad_norm": 1.9003301260786936, "learning_rate": 1.88631090487239e-05, "loss": 0.3069, "step": 814 }, { "epoch": 0.1703239289446186, "grad_norm": 1.6214567206573502, "learning_rate": 1.8886310904872392e-05, "loss": 0.2939, "step": 815 }, { "epoch": 0.17053291536050158, "grad_norm": 2.327788685304206, "learning_rate": 1.8909512761020884e-05, "loss": 0.2925, "step": 816 }, { "epoch": 0.17074190177638454, "grad_norm": 1.5554621844194485, "learning_rate": 1.8932714617169375e-05, "loss": 0.3405, "step": 817 }, { "epoch": 0.1709508881922675, "grad_norm": 1.738255229685502, "learning_rate": 1.8955916473317867e-05, "loss": 0.3372, "step": 818 }, { "epoch": 0.17115987460815046, "grad_norm": 1.7783420618261823, "learning_rate": 1.897911832946636e-05, "loss": 0.3438, "step": 819 }, { "epoch": 0.17136886102403343, "grad_norm": 2.6872637208906647, "learning_rate": 1.900232018561485e-05, "loss": 0.3117, "step": 820 }, { "epoch": 0.17157784743991641, "grad_norm": 1.6834924805347087, "learning_rate": 1.9025522041763342e-05, "loss": 0.3374, "step": 821 }, { "epoch": 0.17178683385579938, "grad_norm": 1.458943467461393, "learning_rate": 1.9048723897911834e-05, "loss": 0.3088, "step": 822 }, { "epoch": 0.17199582027168234, "grad_norm": 1.7651834419429238, "learning_rate": 1.9071925754060325e-05, "loss": 0.3573, "step": 823 }, { "epoch": 0.1722048066875653, "grad_norm": 1.7899291045985424, "learning_rate": 1.909512761020882e-05, "loss": 0.3402, "step": 824 }, { "epoch": 0.1724137931034483, "grad_norm": 1.7453001975653566, "learning_rate": 1.9118329466357312e-05, "loss": 0.3353, "step": 825 }, { "epoch": 0.17262277951933125, "grad_norm": 1.478745713576821, "learning_rate": 1.9141531322505804e-05, "loss": 0.3138, "step": 826 }, { "epoch": 0.1728317659352142, "grad_norm": 1.4694532809273984, "learning_rate": 1.9164733178654292e-05, "loss": 0.2879, "step": 827 }, { "epoch": 0.17304075235109717, "grad_norm": 1.2834453436341615, "learning_rate": 1.9187935034802783e-05, "loss": 0.3051, "step": 828 }, { "epoch": 0.17324973876698013, "grad_norm": 1.8130849103412572, "learning_rate": 1.921113689095128e-05, "loss": 0.3337, "step": 829 }, { "epoch": 0.17345872518286312, "grad_norm": 1.4436770324477042, "learning_rate": 1.923433874709977e-05, "loss": 0.3051, "step": 830 }, { "epoch": 0.17366771159874608, "grad_norm": 1.7991076647637572, "learning_rate": 1.9257540603248262e-05, "loss": 0.2826, "step": 831 }, { "epoch": 0.17387669801462904, "grad_norm": 1.6119927142828594, "learning_rate": 1.9280742459396753e-05, "loss": 0.3112, "step": 832 }, { "epoch": 0.174085684430512, "grad_norm": 1.4460660102924092, "learning_rate": 1.9303944315545245e-05, "loss": 0.3147, "step": 833 }, { "epoch": 0.174294670846395, "grad_norm": 1.430353250190361, "learning_rate": 1.9327146171693737e-05, "loss": 0.317, "step": 834 }, { "epoch": 0.17450365726227796, "grad_norm": 2.0500296180874873, "learning_rate": 1.935034802784223e-05, "loss": 0.3207, "step": 835 }, { "epoch": 0.17471264367816092, "grad_norm": 1.3979950418604947, "learning_rate": 1.937354988399072e-05, "loss": 0.3329, "step": 836 }, { "epoch": 0.17492163009404388, "grad_norm": 1.756453007460831, "learning_rate": 1.9396751740139212e-05, "loss": 0.3187, "step": 837 }, { "epoch": 0.17513061650992687, "grad_norm": 1.9105636228333347, "learning_rate": 1.9419953596287703e-05, "loss": 0.327, "step": 838 }, { "epoch": 0.17533960292580983, "grad_norm": 1.603976193536016, "learning_rate": 1.9443155452436195e-05, "loss": 0.3019, "step": 839 }, { "epoch": 0.1755485893416928, "grad_norm": 2.2500199598781654, "learning_rate": 1.946635730858469e-05, "loss": 0.3178, "step": 840 }, { "epoch": 0.17575757575757575, "grad_norm": 2.3731339008333117, "learning_rate": 1.9489559164733182e-05, "loss": 0.3361, "step": 841 }, { "epoch": 0.1759665621734587, "grad_norm": 1.5453099758491593, "learning_rate": 1.9512761020881673e-05, "loss": 0.3144, "step": 842 }, { "epoch": 0.1761755485893417, "grad_norm": 2.044709435647344, "learning_rate": 1.9535962877030165e-05, "loss": 0.285, "step": 843 }, { "epoch": 0.17638453500522466, "grad_norm": 1.826330478587399, "learning_rate": 1.9559164733178653e-05, "loss": 0.3172, "step": 844 }, { "epoch": 0.17659352142110762, "grad_norm": 1.6233810686540804, "learning_rate": 1.958236658932715e-05, "loss": 0.3047, "step": 845 }, { "epoch": 0.17680250783699059, "grad_norm": 2.530012356573713, "learning_rate": 1.960556844547564e-05, "loss": 0.3235, "step": 846 }, { "epoch": 0.17701149425287357, "grad_norm": 1.6119265659109292, "learning_rate": 1.9628770301624132e-05, "loss": 0.2993, "step": 847 }, { "epoch": 0.17722048066875654, "grad_norm": 1.6883209749975299, "learning_rate": 1.9651972157772623e-05, "loss": 0.317, "step": 848 }, { "epoch": 0.1774294670846395, "grad_norm": 2.037822694368563, "learning_rate": 1.9675174013921115e-05, "loss": 0.2987, "step": 849 }, { "epoch": 0.17763845350052246, "grad_norm": 1.609318389164312, "learning_rate": 1.9698375870069607e-05, "loss": 0.3275, "step": 850 }, { "epoch": 0.17784743991640542, "grad_norm": 2.8328074232623615, "learning_rate": 1.97215777262181e-05, "loss": 0.2814, "step": 851 }, { "epoch": 0.1780564263322884, "grad_norm": 1.466955179038805, "learning_rate": 1.974477958236659e-05, "loss": 0.3079, "step": 852 }, { "epoch": 0.17826541274817137, "grad_norm": 2.8152163747759698, "learning_rate": 1.976798143851508e-05, "loss": 0.2991, "step": 853 }, { "epoch": 0.17847439916405433, "grad_norm": 2.545840032669991, "learning_rate": 1.9791183294663573e-05, "loss": 0.3101, "step": 854 }, { "epoch": 0.1786833855799373, "grad_norm": 1.5460153138559505, "learning_rate": 1.9814385150812065e-05, "loss": 0.2855, "step": 855 }, { "epoch": 0.17889237199582028, "grad_norm": 1.7450318800670839, "learning_rate": 1.983758700696056e-05, "loss": 0.3029, "step": 856 }, { "epoch": 0.17910135841170324, "grad_norm": 2.11435892688038, "learning_rate": 1.986078886310905e-05, "loss": 0.299, "step": 857 }, { "epoch": 0.1793103448275862, "grad_norm": 1.2946192903116482, "learning_rate": 1.9883990719257543e-05, "loss": 0.279, "step": 858 }, { "epoch": 0.17951933124346917, "grad_norm": 1.6812214986383367, "learning_rate": 1.9907192575406035e-05, "loss": 0.3042, "step": 859 }, { "epoch": 0.17972831765935215, "grad_norm": 1.4500642103477346, "learning_rate": 1.9930394431554527e-05, "loss": 0.3006, "step": 860 }, { "epoch": 0.17993730407523512, "grad_norm": 1.5529059784672656, "learning_rate": 1.995359628770302e-05, "loss": 0.3065, "step": 861 }, { "epoch": 0.18014629049111808, "grad_norm": 1.4114287186891212, "learning_rate": 1.997679814385151e-05, "loss": 0.3213, "step": 862 }, { "epoch": 0.18035527690700104, "grad_norm": 1.4317165098382296, "learning_rate": 2e-05, "loss": 0.3373, "step": 863 }, { "epoch": 0.180564263322884, "grad_norm": 1.8655266245780784, "learning_rate": 1.99999999363671e-05, "loss": 0.3539, "step": 864 }, { "epoch": 0.180773249738767, "grad_norm": 2.0305914335876007, "learning_rate": 1.9999999745468393e-05, "loss": 0.3212, "step": 865 }, { "epoch": 0.18098223615464995, "grad_norm": 1.67455662466064, "learning_rate": 1.999999942730389e-05, "loss": 0.301, "step": 866 }, { "epoch": 0.1811912225705329, "grad_norm": 1.6619439922988088, "learning_rate": 1.999999898187359e-05, "loss": 0.3158, "step": 867 }, { "epoch": 0.18140020898641587, "grad_norm": 1.763586329178514, "learning_rate": 1.9999998409177496e-05, "loss": 0.3191, "step": 868 }, { "epoch": 0.18160919540229886, "grad_norm": 1.3997819279975954, "learning_rate": 1.999999770921562e-05, "loss": 0.3173, "step": 869 }, { "epoch": 0.18181818181818182, "grad_norm": 1.7067336377181321, "learning_rate": 1.999999688198797e-05, "loss": 0.3145, "step": 870 }, { "epoch": 0.18202716823406478, "grad_norm": 1.5236468854973848, "learning_rate": 1.9999995927494556e-05, "loss": 0.3196, "step": 871 }, { "epoch": 0.18223615464994775, "grad_norm": 1.7272859217213798, "learning_rate": 1.999999484573539e-05, "loss": 0.3087, "step": 872 }, { "epoch": 0.18244514106583073, "grad_norm": 1.4455363860028967, "learning_rate": 1.9999993636710488e-05, "loss": 0.2692, "step": 873 }, { "epoch": 0.1826541274817137, "grad_norm": 1.4726668240026595, "learning_rate": 1.999999230041986e-05, "loss": 0.3046, "step": 874 }, { "epoch": 0.18286311389759666, "grad_norm": 1.5140442336758244, "learning_rate": 1.9999990836863525e-05, "loss": 0.3463, "step": 875 }, { "epoch": 0.18307210031347962, "grad_norm": 1.5276559471434983, "learning_rate": 1.999998924604151e-05, "loss": 0.3186, "step": 876 }, { "epoch": 0.18328108672936258, "grad_norm": 1.3431883968365548, "learning_rate": 1.9999987527953823e-05, "loss": 0.3019, "step": 877 }, { "epoch": 0.18349007314524557, "grad_norm": 1.6684039386838054, "learning_rate": 1.999998568260049e-05, "loss": 0.3235, "step": 878 }, { "epoch": 0.18369905956112853, "grad_norm": 1.8118155166917824, "learning_rate": 1.9999983709981538e-05, "loss": 0.3168, "step": 879 }, { "epoch": 0.1839080459770115, "grad_norm": 2.157639296542922, "learning_rate": 1.9999981610096988e-05, "loss": 0.2822, "step": 880 }, { "epoch": 0.18411703239289445, "grad_norm": 2.285518665123755, "learning_rate": 1.999997938294687e-05, "loss": 0.3361, "step": 881 }, { "epoch": 0.18432601880877744, "grad_norm": 1.5807911606153078, "learning_rate": 1.999997702853121e-05, "loss": 0.3098, "step": 882 }, { "epoch": 0.1845350052246604, "grad_norm": 1.4077677992572202, "learning_rate": 1.9999974546850038e-05, "loss": 0.3274, "step": 883 }, { "epoch": 0.18474399164054336, "grad_norm": 1.7962692068235926, "learning_rate": 1.9999971937903386e-05, "loss": 0.2948, "step": 884 }, { "epoch": 0.18495297805642633, "grad_norm": 1.8617046090264275, "learning_rate": 1.999996920169129e-05, "loss": 0.329, "step": 885 }, { "epoch": 0.1851619644723093, "grad_norm": 1.7578172508386913, "learning_rate": 1.999996633821378e-05, "loss": 0.3044, "step": 886 }, { "epoch": 0.18537095088819228, "grad_norm": 1.5429292413627917, "learning_rate": 1.9999963347470896e-05, "loss": 0.3155, "step": 887 }, { "epoch": 0.18557993730407524, "grad_norm": 2.395763800131133, "learning_rate": 1.9999960229462674e-05, "loss": 0.3448, "step": 888 }, { "epoch": 0.1857889237199582, "grad_norm": 1.3178052188981875, "learning_rate": 1.9999956984189155e-05, "loss": 0.2958, "step": 889 }, { "epoch": 0.18599791013584116, "grad_norm": 1.4485133124949527, "learning_rate": 1.999995361165038e-05, "loss": 0.3053, "step": 890 }, { "epoch": 0.18620689655172415, "grad_norm": 2.1108369169697574, "learning_rate": 1.999995011184639e-05, "loss": 0.2945, "step": 891 }, { "epoch": 0.1864158829676071, "grad_norm": 1.4608268790917003, "learning_rate": 1.9999946484777234e-05, "loss": 0.3034, "step": 892 }, { "epoch": 0.18662486938349007, "grad_norm": 1.3993629734118924, "learning_rate": 1.9999942730442955e-05, "loss": 0.3081, "step": 893 }, { "epoch": 0.18683385579937303, "grad_norm": 1.506639362006082, "learning_rate": 1.9999938848843597e-05, "loss": 0.315, "step": 894 }, { "epoch": 0.18704284221525602, "grad_norm": 1.4650291835306006, "learning_rate": 1.9999934839979217e-05, "loss": 0.3144, "step": 895 }, { "epoch": 0.18725182863113898, "grad_norm": 1.3593936826475235, "learning_rate": 1.9999930703849864e-05, "loss": 0.3267, "step": 896 }, { "epoch": 0.18746081504702194, "grad_norm": 2.226712129013804, "learning_rate": 1.9999926440455587e-05, "loss": 0.293, "step": 897 }, { "epoch": 0.1876698014629049, "grad_norm": 1.5627126248567773, "learning_rate": 1.9999922049796442e-05, "loss": 0.2875, "step": 898 }, { "epoch": 0.18787878787878787, "grad_norm": 1.4758679258031362, "learning_rate": 1.9999917531872487e-05, "loss": 0.3158, "step": 899 }, { "epoch": 0.18808777429467086, "grad_norm": 1.7024584532952711, "learning_rate": 1.9999912886683773e-05, "loss": 0.3248, "step": 900 }, { "epoch": 0.18829676071055382, "grad_norm": 1.1317754183902962, "learning_rate": 1.9999908114230372e-05, "loss": 0.284, "step": 901 }, { "epoch": 0.18850574712643678, "grad_norm": 2.2242752106531776, "learning_rate": 1.9999903214512334e-05, "loss": 0.3082, "step": 902 }, { "epoch": 0.18871473354231974, "grad_norm": 2.0911997656013885, "learning_rate": 1.999989818752972e-05, "loss": 0.3051, "step": 903 }, { "epoch": 0.18892371995820273, "grad_norm": 1.3738390242723801, "learning_rate": 1.9999893033282603e-05, "loss": 0.3018, "step": 904 }, { "epoch": 0.1891327063740857, "grad_norm": 2.7240610026534284, "learning_rate": 1.999988775177104e-05, "loss": 0.3224, "step": 905 }, { "epoch": 0.18934169278996865, "grad_norm": 2.288058980731369, "learning_rate": 1.99998823429951e-05, "loss": 0.3199, "step": 906 }, { "epoch": 0.1895506792058516, "grad_norm": 1.359717787905974, "learning_rate": 1.999987680695486e-05, "loss": 0.3297, "step": 907 }, { "epoch": 0.18975966562173457, "grad_norm": 1.8201600716712687, "learning_rate": 1.9999871143650383e-05, "loss": 0.3279, "step": 908 }, { "epoch": 0.18996865203761756, "grad_norm": 2.5544425306423246, "learning_rate": 1.999986535308174e-05, "loss": 0.3428, "step": 909 }, { "epoch": 0.19017763845350052, "grad_norm": 2.236561575765341, "learning_rate": 1.9999859435249007e-05, "loss": 0.3053, "step": 910 }, { "epoch": 0.1903866248693835, "grad_norm": 1.3690445773849402, "learning_rate": 1.999985339015226e-05, "loss": 0.2789, "step": 911 }, { "epoch": 0.19059561128526645, "grad_norm": 3.21755956807058, "learning_rate": 1.9999847217791574e-05, "loss": 0.3268, "step": 912 }, { "epoch": 0.19080459770114944, "grad_norm": 3.023985352032459, "learning_rate": 1.999984091816703e-05, "loss": 0.305, "step": 913 }, { "epoch": 0.1910135841170324, "grad_norm": 1.4728003228231514, "learning_rate": 1.9999834491278707e-05, "loss": 0.3222, "step": 914 }, { "epoch": 0.19122257053291536, "grad_norm": 1.7639807433944705, "learning_rate": 1.9999827937126687e-05, "loss": 0.3044, "step": 915 }, { "epoch": 0.19143155694879832, "grad_norm": 2.262149399238582, "learning_rate": 1.999982125571105e-05, "loss": 0.3462, "step": 916 }, { "epoch": 0.1916405433646813, "grad_norm": 2.004088098012969, "learning_rate": 1.9999814447031886e-05, "loss": 0.3245, "step": 917 }, { "epoch": 0.19184952978056427, "grad_norm": 1.9557432666081758, "learning_rate": 1.9999807511089283e-05, "loss": 0.3185, "step": 918 }, { "epoch": 0.19205851619644723, "grad_norm": 2.2856241811364217, "learning_rate": 1.9999800447883322e-05, "loss": 0.316, "step": 919 }, { "epoch": 0.1922675026123302, "grad_norm": 3.8696274249278018, "learning_rate": 1.9999793257414097e-05, "loss": 0.3318, "step": 920 }, { "epoch": 0.19247648902821315, "grad_norm": 1.5909321446056042, "learning_rate": 1.9999785939681703e-05, "loss": 0.2913, "step": 921 }, { "epoch": 0.19268547544409614, "grad_norm": 1.5283201054786049, "learning_rate": 1.9999778494686226e-05, "loss": 0.3351, "step": 922 }, { "epoch": 0.1928944618599791, "grad_norm": 2.125565365084895, "learning_rate": 1.9999770922427766e-05, "loss": 0.2962, "step": 923 }, { "epoch": 0.19310344827586207, "grad_norm": 2.14423920997404, "learning_rate": 1.9999763222906417e-05, "loss": 0.2916, "step": 924 }, { "epoch": 0.19331243469174503, "grad_norm": 1.374571095357672, "learning_rate": 1.9999755396122278e-05, "loss": 0.336, "step": 925 }, { "epoch": 0.19352142110762802, "grad_norm": 1.4937406541679221, "learning_rate": 1.9999747442075446e-05, "loss": 0.3131, "step": 926 }, { "epoch": 0.19373040752351098, "grad_norm": 1.7084784346147108, "learning_rate": 1.9999739360766028e-05, "loss": 0.3046, "step": 927 }, { "epoch": 0.19393939393939394, "grad_norm": 2.030334437767085, "learning_rate": 1.999973115219412e-05, "loss": 0.3461, "step": 928 }, { "epoch": 0.1941483803552769, "grad_norm": 1.5895456666283263, "learning_rate": 1.9999722816359836e-05, "loss": 0.2963, "step": 929 }, { "epoch": 0.19435736677115986, "grad_norm": 1.6704247220210198, "learning_rate": 1.9999714353263272e-05, "loss": 0.3213, "step": 930 }, { "epoch": 0.19456635318704285, "grad_norm": 1.8644023280508522, "learning_rate": 1.999970576290454e-05, "loss": 0.3259, "step": 931 }, { "epoch": 0.1947753396029258, "grad_norm": 2.294276423272638, "learning_rate": 1.9999697045283746e-05, "loss": 0.3147, "step": 932 }, { "epoch": 0.19498432601880877, "grad_norm": 1.7614161918004605, "learning_rate": 1.9999688200401007e-05, "loss": 0.2931, "step": 933 }, { "epoch": 0.19519331243469173, "grad_norm": 1.5235111441438454, "learning_rate": 1.9999679228256433e-05, "loss": 0.2968, "step": 934 }, { "epoch": 0.19540229885057472, "grad_norm": 2.1496397641178104, "learning_rate": 1.9999670128850136e-05, "loss": 0.3331, "step": 935 }, { "epoch": 0.19561128526645769, "grad_norm": 2.7409184319302584, "learning_rate": 1.9999660902182234e-05, "loss": 0.3357, "step": 936 }, { "epoch": 0.19582027168234065, "grad_norm": 1.2483227472012968, "learning_rate": 1.9999651548252843e-05, "loss": 0.3203, "step": 937 }, { "epoch": 0.1960292580982236, "grad_norm": 1.33639515113986, "learning_rate": 1.9999642067062088e-05, "loss": 0.3185, "step": 938 }, { "epoch": 0.1962382445141066, "grad_norm": 1.6775622465729416, "learning_rate": 1.9999632458610077e-05, "loss": 0.3344, "step": 939 }, { "epoch": 0.19644723092998956, "grad_norm": 1.8259242755698646, "learning_rate": 1.9999622722896943e-05, "loss": 0.2958, "step": 940 }, { "epoch": 0.19665621734587252, "grad_norm": 1.7542243874884538, "learning_rate": 1.9999612859922807e-05, "loss": 0.3066, "step": 941 }, { "epoch": 0.19686520376175548, "grad_norm": 1.5237433996719412, "learning_rate": 1.9999602869687795e-05, "loss": 0.3227, "step": 942 }, { "epoch": 0.19707419017763844, "grad_norm": 1.682802427152903, "learning_rate": 1.9999592752192032e-05, "loss": 0.3044, "step": 943 }, { "epoch": 0.19728317659352143, "grad_norm": 1.8911471593047444, "learning_rate": 1.999958250743565e-05, "loss": 0.3, "step": 944 }, { "epoch": 0.1974921630094044, "grad_norm": 1.6612125819782189, "learning_rate": 1.9999572135418777e-05, "loss": 0.299, "step": 945 }, { "epoch": 0.19770114942528735, "grad_norm": 1.549033255451652, "learning_rate": 1.9999561636141544e-05, "loss": 0.2993, "step": 946 }, { "epoch": 0.19791013584117031, "grad_norm": 1.4186068477287301, "learning_rate": 1.9999551009604087e-05, "loss": 0.3201, "step": 947 }, { "epoch": 0.1981191222570533, "grad_norm": 1.4191609566440628, "learning_rate": 1.9999540255806542e-05, "loss": 0.3056, "step": 948 }, { "epoch": 0.19832810867293627, "grad_norm": 1.7006655251891452, "learning_rate": 1.9999529374749043e-05, "loss": 0.2872, "step": 949 }, { "epoch": 0.19853709508881923, "grad_norm": 3.6040879031575765, "learning_rate": 1.9999518366431728e-05, "loss": 0.3186, "step": 950 }, { "epoch": 0.1987460815047022, "grad_norm": 1.8225981007407301, "learning_rate": 1.9999507230854743e-05, "loss": 0.2775, "step": 951 }, { "epoch": 0.19895506792058515, "grad_norm": 1.4328692418376996, "learning_rate": 1.9999495968018224e-05, "loss": 0.2953, "step": 952 }, { "epoch": 0.19916405433646814, "grad_norm": 1.4296315921842397, "learning_rate": 1.9999484577922313e-05, "loss": 0.3205, "step": 953 }, { "epoch": 0.1993730407523511, "grad_norm": 1.744315088933335, "learning_rate": 1.9999473060567162e-05, "loss": 0.3081, "step": 954 }, { "epoch": 0.19958202716823406, "grad_norm": 1.2989709960633122, "learning_rate": 1.9999461415952914e-05, "loss": 0.3142, "step": 955 }, { "epoch": 0.19979101358411702, "grad_norm": 1.4939051824714957, "learning_rate": 1.999944964407971e-05, "loss": 0.3048, "step": 956 }, { "epoch": 0.2, "grad_norm": 1.4288872400431831, "learning_rate": 1.999943774494771e-05, "loss": 0.3301, "step": 957 }, { "epoch": 0.20020898641588297, "grad_norm": 1.4510099720640661, "learning_rate": 1.9999425718557065e-05, "loss": 0.3078, "step": 958 }, { "epoch": 0.20041797283176593, "grad_norm": 1.427985289158661, "learning_rate": 1.9999413564907922e-05, "loss": 0.3, "step": 959 }, { "epoch": 0.2006269592476489, "grad_norm": 1.2840540464360601, "learning_rate": 1.999940128400044e-05, "loss": 0.2838, "step": 960 }, { "epoch": 0.20083594566353188, "grad_norm": 1.4430280953906187, "learning_rate": 1.9999388875834772e-05, "loss": 0.3152, "step": 961 }, { "epoch": 0.20104493207941485, "grad_norm": 1.5779179267293721, "learning_rate": 1.999937634041108e-05, "loss": 0.2888, "step": 962 }, { "epoch": 0.2012539184952978, "grad_norm": 1.5007197959418206, "learning_rate": 1.9999363677729522e-05, "loss": 0.2758, "step": 963 }, { "epoch": 0.20146290491118077, "grad_norm": 1.4142761169809799, "learning_rate": 1.9999350887790255e-05, "loss": 0.3165, "step": 964 }, { "epoch": 0.20167189132706373, "grad_norm": 1.5977846078401534, "learning_rate": 1.999933797059345e-05, "loss": 0.3247, "step": 965 }, { "epoch": 0.20188087774294672, "grad_norm": 1.4861726469528915, "learning_rate": 1.999932492613926e-05, "loss": 0.3103, "step": 966 }, { "epoch": 0.20208986415882968, "grad_norm": 1.314414991870177, "learning_rate": 1.9999311754427865e-05, "loss": 0.3089, "step": 967 }, { "epoch": 0.20229885057471264, "grad_norm": 1.5403196354072393, "learning_rate": 1.9999298455459422e-05, "loss": 0.3273, "step": 968 }, { "epoch": 0.2025078369905956, "grad_norm": 1.7244240288749928, "learning_rate": 1.9999285029234105e-05, "loss": 0.3282, "step": 969 }, { "epoch": 0.2027168234064786, "grad_norm": 1.8510262615038453, "learning_rate": 1.999927147575208e-05, "loss": 0.3156, "step": 970 }, { "epoch": 0.20292580982236155, "grad_norm": 1.1668254561695515, "learning_rate": 1.9999257795013526e-05, "loss": 0.326, "step": 971 }, { "epoch": 0.2031347962382445, "grad_norm": 1.5488989800948942, "learning_rate": 1.9999243987018614e-05, "loss": 0.3075, "step": 972 }, { "epoch": 0.20334378265412748, "grad_norm": 1.5530789389440447, "learning_rate": 1.999923005176752e-05, "loss": 0.3032, "step": 973 }, { "epoch": 0.20355276907001044, "grad_norm": 1.8733064968746427, "learning_rate": 1.999921598926042e-05, "loss": 0.2983, "step": 974 }, { "epoch": 0.20376175548589343, "grad_norm": 1.3233468693817128, "learning_rate": 1.999920179949749e-05, "loss": 0.3136, "step": 975 }, { "epoch": 0.2039707419017764, "grad_norm": 1.9481286276351628, "learning_rate": 1.999918748247892e-05, "loss": 0.2741, "step": 976 }, { "epoch": 0.20417972831765935, "grad_norm": 1.3965771380295842, "learning_rate": 1.9999173038204885e-05, "loss": 0.3101, "step": 977 }, { "epoch": 0.2043887147335423, "grad_norm": 1.5145268049954044, "learning_rate": 1.9999158466675572e-05, "loss": 0.3147, "step": 978 }, { "epoch": 0.2045977011494253, "grad_norm": 1.4784929455793123, "learning_rate": 1.9999143767891166e-05, "loss": 0.3271, "step": 979 }, { "epoch": 0.20480668756530826, "grad_norm": 1.5891684847865508, "learning_rate": 1.9999128941851855e-05, "loss": 0.3167, "step": 980 }, { "epoch": 0.20501567398119122, "grad_norm": 1.9468800403114321, "learning_rate": 1.999911398855782e-05, "loss": 0.3058, "step": 981 }, { "epoch": 0.20522466039707418, "grad_norm": 1.2049575727681987, "learning_rate": 1.9999098908009258e-05, "loss": 0.3178, "step": 982 }, { "epoch": 0.20543364681295717, "grad_norm": 1.3821787660997082, "learning_rate": 1.9999083700206362e-05, "loss": 0.3114, "step": 983 }, { "epoch": 0.20564263322884013, "grad_norm": 2.693727277655418, "learning_rate": 1.9999068365149326e-05, "loss": 0.2897, "step": 984 }, { "epoch": 0.2058516196447231, "grad_norm": 1.3639262060705792, "learning_rate": 1.999905290283834e-05, "loss": 0.3106, "step": 985 }, { "epoch": 0.20606060606060606, "grad_norm": 2.1747195816547604, "learning_rate": 1.9999037313273605e-05, "loss": 0.3069, "step": 986 }, { "epoch": 0.20626959247648902, "grad_norm": 1.7019303432148067, "learning_rate": 1.9999021596455316e-05, "loss": 0.3029, "step": 987 }, { "epoch": 0.206478578892372, "grad_norm": 1.5230841735001108, "learning_rate": 1.9999005752383676e-05, "loss": 0.3288, "step": 988 }, { "epoch": 0.20668756530825497, "grad_norm": 1.2673354922576345, "learning_rate": 1.999898978105889e-05, "loss": 0.278, "step": 989 }, { "epoch": 0.20689655172413793, "grad_norm": 1.8062823044292868, "learning_rate": 1.999897368248115e-05, "loss": 0.3331, "step": 990 }, { "epoch": 0.2071055381400209, "grad_norm": 1.4677118187718494, "learning_rate": 1.9998957456650673e-05, "loss": 0.3487, "step": 991 }, { "epoch": 0.20731452455590388, "grad_norm": 1.6407509979022665, "learning_rate": 1.999894110356766e-05, "loss": 0.3129, "step": 992 }, { "epoch": 0.20752351097178684, "grad_norm": 1.6028748348441801, "learning_rate": 1.9998924623232316e-05, "loss": 0.3476, "step": 993 }, { "epoch": 0.2077324973876698, "grad_norm": 1.5962173861281153, "learning_rate": 1.9998908015644856e-05, "loss": 0.2934, "step": 994 }, { "epoch": 0.20794148380355276, "grad_norm": 1.3986918143264508, "learning_rate": 1.999889128080549e-05, "loss": 0.3039, "step": 995 }, { "epoch": 0.20815047021943572, "grad_norm": 1.3999887437139358, "learning_rate": 1.999887441871443e-05, "loss": 0.2857, "step": 996 }, { "epoch": 0.2083594566353187, "grad_norm": 1.387597987521337, "learning_rate": 1.999885742937189e-05, "loss": 0.3012, "step": 997 }, { "epoch": 0.20856844305120167, "grad_norm": 1.7787218405448206, "learning_rate": 1.9998840312778088e-05, "loss": 0.2981, "step": 998 }, { "epoch": 0.20877742946708464, "grad_norm": 1.517204273152513, "learning_rate": 1.999882306893324e-05, "loss": 0.2911, "step": 999 }, { "epoch": 0.2089864158829676, "grad_norm": 1.4693277320370144, "learning_rate": 1.999880569783757e-05, "loss": 0.3157, "step": 1000 }, { "epoch": 0.20919540229885059, "grad_norm": 1.4664342840571674, "learning_rate": 1.9998788199491295e-05, "loss": 0.2976, "step": 1001 }, { "epoch": 0.20940438871473355, "grad_norm": 1.5123784592349105, "learning_rate": 1.9998770573894638e-05, "loss": 0.3107, "step": 1002 }, { "epoch": 0.2096133751306165, "grad_norm": 1.384884390267839, "learning_rate": 1.999875282104782e-05, "loss": 0.3215, "step": 1003 }, { "epoch": 0.20982236154649947, "grad_norm": 1.411593031952331, "learning_rate": 1.9998734940951078e-05, "loss": 0.2993, "step": 1004 }, { "epoch": 0.21003134796238246, "grad_norm": 1.5115189820949726, "learning_rate": 1.9998716933604626e-05, "loss": 0.3066, "step": 1005 }, { "epoch": 0.21024033437826542, "grad_norm": 1.3415255666134034, "learning_rate": 1.9998698799008703e-05, "loss": 0.3207, "step": 1006 }, { "epoch": 0.21044932079414838, "grad_norm": 1.2756879761683044, "learning_rate": 1.9998680537163535e-05, "loss": 0.3173, "step": 1007 }, { "epoch": 0.21065830721003134, "grad_norm": 1.7507348583659337, "learning_rate": 1.9998662148069353e-05, "loss": 0.3209, "step": 1008 }, { "epoch": 0.2108672936259143, "grad_norm": 1.6052699464823201, "learning_rate": 1.9998643631726397e-05, "loss": 0.3086, "step": 1009 }, { "epoch": 0.2110762800417973, "grad_norm": 1.3665620123466258, "learning_rate": 1.99986249881349e-05, "loss": 0.2872, "step": 1010 }, { "epoch": 0.21128526645768025, "grad_norm": 1.3019164845738969, "learning_rate": 1.9998606217295093e-05, "loss": 0.305, "step": 1011 }, { "epoch": 0.21149425287356322, "grad_norm": 2.3011661475303824, "learning_rate": 1.9998587319207225e-05, "loss": 0.269, "step": 1012 }, { "epoch": 0.21170323928944618, "grad_norm": 1.4283812306581753, "learning_rate": 1.9998568293871533e-05, "loss": 0.3017, "step": 1013 }, { "epoch": 0.21191222570532917, "grad_norm": 1.4597469764838358, "learning_rate": 1.9998549141288252e-05, "loss": 0.3018, "step": 1014 }, { "epoch": 0.21212121212121213, "grad_norm": 1.896113414493035, "learning_rate": 1.9998529861457638e-05, "loss": 0.3002, "step": 1015 }, { "epoch": 0.2123301985370951, "grad_norm": 1.481941031707595, "learning_rate": 1.9998510454379927e-05, "loss": 0.3285, "step": 1016 }, { "epoch": 0.21253918495297805, "grad_norm": 1.4735726298044427, "learning_rate": 1.999849092005537e-05, "loss": 0.2841, "step": 1017 }, { "epoch": 0.212748171368861, "grad_norm": 1.3726237119292335, "learning_rate": 1.9998471258484215e-05, "loss": 0.321, "step": 1018 }, { "epoch": 0.212957157784744, "grad_norm": 1.3233526423277893, "learning_rate": 1.9998451469666712e-05, "loss": 0.281, "step": 1019 }, { "epoch": 0.21316614420062696, "grad_norm": 1.476162241309221, "learning_rate": 1.9998431553603112e-05, "loss": 0.3211, "step": 1020 }, { "epoch": 0.21337513061650992, "grad_norm": 1.870673940240828, "learning_rate": 1.9998411510293673e-05, "loss": 0.3091, "step": 1021 }, { "epoch": 0.21358411703239288, "grad_norm": 1.3649044175936336, "learning_rate": 1.9998391339738644e-05, "loss": 0.3178, "step": 1022 }, { "epoch": 0.21379310344827587, "grad_norm": 1.9604709033759713, "learning_rate": 1.9998371041938284e-05, "loss": 0.2996, "step": 1023 }, { "epoch": 0.21400208986415883, "grad_norm": 1.5193905311305442, "learning_rate": 1.9998350616892847e-05, "loss": 0.2676, "step": 1024 }, { "epoch": 0.2142110762800418, "grad_norm": 1.5020321104295389, "learning_rate": 1.9998330064602603e-05, "loss": 0.3011, "step": 1025 }, { "epoch": 0.21442006269592476, "grad_norm": 1.3176729156843736, "learning_rate": 1.9998309385067808e-05, "loss": 0.2854, "step": 1026 }, { "epoch": 0.21462904911180775, "grad_norm": 1.7433648775996258, "learning_rate": 1.999828857828872e-05, "loss": 0.3354, "step": 1027 }, { "epoch": 0.2148380355276907, "grad_norm": 1.4623354396647257, "learning_rate": 1.999826764426561e-05, "loss": 0.3154, "step": 1028 }, { "epoch": 0.21504702194357367, "grad_norm": 1.4590480788398974, "learning_rate": 1.9998246582998748e-05, "loss": 0.2675, "step": 1029 }, { "epoch": 0.21525600835945663, "grad_norm": 1.6285058018692942, "learning_rate": 1.9998225394488396e-05, "loss": 0.3292, "step": 1030 }, { "epoch": 0.2154649947753396, "grad_norm": 1.5900673336650009, "learning_rate": 1.999820407873482e-05, "loss": 0.336, "step": 1031 }, { "epoch": 0.21567398119122258, "grad_norm": 1.463972253017621, "learning_rate": 1.99981826357383e-05, "loss": 0.3223, "step": 1032 }, { "epoch": 0.21588296760710554, "grad_norm": 1.5882507632854015, "learning_rate": 1.9998161065499107e-05, "loss": 0.3177, "step": 1033 }, { "epoch": 0.2160919540229885, "grad_norm": 1.9644844645665518, "learning_rate": 1.999813936801751e-05, "loss": 0.3251, "step": 1034 }, { "epoch": 0.21630094043887146, "grad_norm": 1.4438902436213439, "learning_rate": 1.9998117543293794e-05, "loss": 0.3183, "step": 1035 }, { "epoch": 0.21650992685475445, "grad_norm": 1.7424600316335885, "learning_rate": 1.9998095591328226e-05, "loss": 0.296, "step": 1036 }, { "epoch": 0.21671891327063741, "grad_norm": 1.2074076721865408, "learning_rate": 1.9998073512121094e-05, "loss": 0.2842, "step": 1037 }, { "epoch": 0.21692789968652038, "grad_norm": 1.2236471696213462, "learning_rate": 1.9998051305672673e-05, "loss": 0.274, "step": 1038 }, { "epoch": 0.21713688610240334, "grad_norm": 1.4692374580183634, "learning_rate": 1.999802897198325e-05, "loss": 0.3278, "step": 1039 }, { "epoch": 0.2173458725182863, "grad_norm": 1.2332573905386572, "learning_rate": 1.999800651105311e-05, "loss": 0.3101, "step": 1040 }, { "epoch": 0.2175548589341693, "grad_norm": 1.4748733206444082, "learning_rate": 1.9997983922882533e-05, "loss": 0.3028, "step": 1041 }, { "epoch": 0.21776384535005225, "grad_norm": 1.7410140093448359, "learning_rate": 1.9997961207471814e-05, "loss": 0.3048, "step": 1042 }, { "epoch": 0.2179728317659352, "grad_norm": 1.37535945432114, "learning_rate": 1.9997938364821235e-05, "loss": 0.2936, "step": 1043 }, { "epoch": 0.21818181818181817, "grad_norm": 1.7379504661933025, "learning_rate": 1.9997915394931092e-05, "loss": 0.3026, "step": 1044 }, { "epoch": 0.21839080459770116, "grad_norm": 1.398304326294333, "learning_rate": 1.999789229780167e-05, "loss": 0.3, "step": 1045 }, { "epoch": 0.21859979101358412, "grad_norm": 1.9717804538141457, "learning_rate": 1.9997869073433276e-05, "loss": 0.304, "step": 1046 }, { "epoch": 0.21880877742946708, "grad_norm": 2.52096328067609, "learning_rate": 1.9997845721826194e-05, "loss": 0.3406, "step": 1047 }, { "epoch": 0.21901776384535004, "grad_norm": 1.341480630342445, "learning_rate": 1.9997822242980725e-05, "loss": 0.2683, "step": 1048 }, { "epoch": 0.21922675026123303, "grad_norm": 1.6671464534637663, "learning_rate": 1.999779863689717e-05, "loss": 0.3127, "step": 1049 }, { "epoch": 0.219435736677116, "grad_norm": 2.113503265854455, "learning_rate": 1.9997774903575824e-05, "loss": 0.3137, "step": 1050 }, { "epoch": 0.21964472309299896, "grad_norm": 2.063363886284404, "learning_rate": 1.9997751043016995e-05, "loss": 0.304, "step": 1051 }, { "epoch": 0.21985370950888192, "grad_norm": 2.316221475364874, "learning_rate": 1.999772705522098e-05, "loss": 0.3196, "step": 1052 }, { "epoch": 0.22006269592476488, "grad_norm": 1.401258651538545, "learning_rate": 1.999770294018809e-05, "loss": 0.3177, "step": 1053 }, { "epoch": 0.22027168234064787, "grad_norm": 1.8705395065009558, "learning_rate": 1.999767869791863e-05, "loss": 0.3031, "step": 1054 }, { "epoch": 0.22048066875653083, "grad_norm": 2.9611823191197426, "learning_rate": 1.999765432841291e-05, "loss": 0.3271, "step": 1055 }, { "epoch": 0.2206896551724138, "grad_norm": 1.8544502724466827, "learning_rate": 1.9997629831671238e-05, "loss": 0.2955, "step": 1056 }, { "epoch": 0.22089864158829675, "grad_norm": 1.6176841689222279, "learning_rate": 1.9997605207693926e-05, "loss": 0.2891, "step": 1057 }, { "epoch": 0.22110762800417974, "grad_norm": 1.5335827967063083, "learning_rate": 1.9997580456481287e-05, "loss": 0.3299, "step": 1058 }, { "epoch": 0.2213166144200627, "grad_norm": 1.569864395271127, "learning_rate": 1.9997555578033637e-05, "loss": 0.3124, "step": 1059 }, { "epoch": 0.22152560083594566, "grad_norm": 2.3112792037504812, "learning_rate": 1.9997530572351294e-05, "loss": 0.3023, "step": 1060 }, { "epoch": 0.22173458725182862, "grad_norm": 1.693414508438043, "learning_rate": 1.9997505439434574e-05, "loss": 0.2745, "step": 1061 }, { "epoch": 0.22194357366771159, "grad_norm": 1.2479234633041099, "learning_rate": 1.9997480179283798e-05, "loss": 0.2798, "step": 1062 }, { "epoch": 0.22215256008359457, "grad_norm": 1.876195488845883, "learning_rate": 1.9997454791899285e-05, "loss": 0.3245, "step": 1063 }, { "epoch": 0.22236154649947754, "grad_norm": 1.5779617468153695, "learning_rate": 1.999742927728136e-05, "loss": 0.3197, "step": 1064 }, { "epoch": 0.2225705329153605, "grad_norm": 1.6374713890262258, "learning_rate": 1.999740363543035e-05, "loss": 0.3103, "step": 1065 }, { "epoch": 0.22277951933124346, "grad_norm": 1.6970724592461692, "learning_rate": 1.9997377866346582e-05, "loss": 0.337, "step": 1066 }, { "epoch": 0.22298850574712645, "grad_norm": 1.462326651172398, "learning_rate": 1.9997351970030373e-05, "loss": 0.3315, "step": 1067 }, { "epoch": 0.2231974921630094, "grad_norm": 1.2621059811744328, "learning_rate": 1.9997325946482067e-05, "loss": 0.3249, "step": 1068 }, { "epoch": 0.22340647857889237, "grad_norm": 1.552390870294095, "learning_rate": 1.9997299795701988e-05, "loss": 0.3239, "step": 1069 }, { "epoch": 0.22361546499477533, "grad_norm": 1.8282139076223345, "learning_rate": 1.999727351769047e-05, "loss": 0.2968, "step": 1070 }, { "epoch": 0.22382445141065832, "grad_norm": 1.7951849819252652, "learning_rate": 1.9997247112447846e-05, "loss": 0.2991, "step": 1071 }, { "epoch": 0.22403343782654128, "grad_norm": 1.4961183532200033, "learning_rate": 1.9997220579974454e-05, "loss": 0.3057, "step": 1072 }, { "epoch": 0.22424242424242424, "grad_norm": 1.5037229370416212, "learning_rate": 1.9997193920270628e-05, "loss": 0.3151, "step": 1073 }, { "epoch": 0.2244514106583072, "grad_norm": 1.825797824031944, "learning_rate": 1.9997167133336713e-05, "loss": 0.3074, "step": 1074 }, { "epoch": 0.22466039707419017, "grad_norm": 1.7546345501678349, "learning_rate": 1.9997140219173046e-05, "loss": 0.3194, "step": 1075 }, { "epoch": 0.22486938349007315, "grad_norm": 1.5417714696059468, "learning_rate": 1.999711317777997e-05, "loss": 0.3124, "step": 1076 }, { "epoch": 0.22507836990595612, "grad_norm": 2.021811898721485, "learning_rate": 1.9997086009157833e-05, "loss": 0.2899, "step": 1077 }, { "epoch": 0.22528735632183908, "grad_norm": 1.7426511807809337, "learning_rate": 1.9997058713306974e-05, "loss": 0.2858, "step": 1078 }, { "epoch": 0.22549634273772204, "grad_norm": 1.5878227142813621, "learning_rate": 1.9997031290227745e-05, "loss": 0.2862, "step": 1079 }, { "epoch": 0.22570532915360503, "grad_norm": 1.7523194065939154, "learning_rate": 1.9997003739920495e-05, "loss": 0.3078, "step": 1080 }, { "epoch": 0.225914315569488, "grad_norm": 1.5383717495708826, "learning_rate": 1.9996976062385572e-05, "loss": 0.308, "step": 1081 }, { "epoch": 0.22612330198537095, "grad_norm": 1.5013933363921836, "learning_rate": 1.999694825762333e-05, "loss": 0.3166, "step": 1082 }, { "epoch": 0.2263322884012539, "grad_norm": 1.5616654839293278, "learning_rate": 1.9996920325634122e-05, "loss": 0.286, "step": 1083 }, { "epoch": 0.22654127481713687, "grad_norm": 1.6127205707445051, "learning_rate": 1.9996892266418303e-05, "loss": 0.2767, "step": 1084 }, { "epoch": 0.22675026123301986, "grad_norm": 1.9780830444850683, "learning_rate": 1.9996864079976234e-05, "loss": 0.3263, "step": 1085 }, { "epoch": 0.22695924764890282, "grad_norm": 1.5159955301343404, "learning_rate": 1.999683576630827e-05, "loss": 0.2956, "step": 1086 }, { "epoch": 0.22716823406478578, "grad_norm": 1.608157678582042, "learning_rate": 1.999680732541477e-05, "loss": 0.2835, "step": 1087 }, { "epoch": 0.22737722048066875, "grad_norm": 1.6806962725991796, "learning_rate": 1.99967787572961e-05, "loss": 0.2868, "step": 1088 }, { "epoch": 0.22758620689655173, "grad_norm": 1.7881359310601082, "learning_rate": 1.999675006195262e-05, "loss": 0.2981, "step": 1089 }, { "epoch": 0.2277951933124347, "grad_norm": 1.69899275160617, "learning_rate": 1.99967212393847e-05, "loss": 0.2847, "step": 1090 }, { "epoch": 0.22800417972831766, "grad_norm": 1.423806084211131, "learning_rate": 1.9996692289592703e-05, "loss": 0.3017, "step": 1091 }, { "epoch": 0.22821316614420062, "grad_norm": 1.555680723685035, "learning_rate": 1.9996663212576996e-05, "loss": 0.2969, "step": 1092 }, { "epoch": 0.2284221525600836, "grad_norm": 1.7289972142973236, "learning_rate": 1.9996634008337952e-05, "loss": 0.2834, "step": 1093 }, { "epoch": 0.22863113897596657, "grad_norm": 1.5675480806002302, "learning_rate": 1.9996604676875946e-05, "loss": 0.2993, "step": 1094 }, { "epoch": 0.22884012539184953, "grad_norm": 1.4502482566236055, "learning_rate": 1.9996575218191343e-05, "loss": 0.2927, "step": 1095 }, { "epoch": 0.2290491118077325, "grad_norm": 1.6416777327062044, "learning_rate": 1.9996545632284522e-05, "loss": 0.2932, "step": 1096 }, { "epoch": 0.22925809822361545, "grad_norm": 1.5970423125297049, "learning_rate": 1.999651591915586e-05, "loss": 0.3039, "step": 1097 }, { "epoch": 0.22946708463949844, "grad_norm": 1.427609450740593, "learning_rate": 1.9996486078805735e-05, "loss": 0.287, "step": 1098 }, { "epoch": 0.2296760710553814, "grad_norm": 1.3302893984663984, "learning_rate": 1.999645611123453e-05, "loss": 0.299, "step": 1099 }, { "epoch": 0.22988505747126436, "grad_norm": 1.2287580068761883, "learning_rate": 1.999642601644262e-05, "loss": 0.3108, "step": 1100 }, { "epoch": 0.23009404388714733, "grad_norm": 1.54305178207384, "learning_rate": 1.999639579443039e-05, "loss": 0.3151, "step": 1101 }, { "epoch": 0.23030303030303031, "grad_norm": 1.8031810378938304, "learning_rate": 1.9996365445198225e-05, "loss": 0.3222, "step": 1102 }, { "epoch": 0.23051201671891328, "grad_norm": 1.4688215164249985, "learning_rate": 1.999633496874651e-05, "loss": 0.3001, "step": 1103 }, { "epoch": 0.23072100313479624, "grad_norm": 1.5499501519423666, "learning_rate": 1.999630436507564e-05, "loss": 0.3237, "step": 1104 }, { "epoch": 0.2309299895506792, "grad_norm": 1.7694964046272437, "learning_rate": 1.9996273634185995e-05, "loss": 0.2804, "step": 1105 }, { "epoch": 0.23113897596656216, "grad_norm": 1.462952994802169, "learning_rate": 1.999624277607797e-05, "loss": 0.3502, "step": 1106 }, { "epoch": 0.23134796238244515, "grad_norm": 1.4718556560508855, "learning_rate": 1.9996211790751962e-05, "loss": 0.3428, "step": 1107 }, { "epoch": 0.2315569487983281, "grad_norm": 1.305835238525672, "learning_rate": 1.999618067820836e-05, "loss": 0.2788, "step": 1108 }, { "epoch": 0.23176593521421107, "grad_norm": 1.1942976282960682, "learning_rate": 1.999614943844756e-05, "loss": 0.2682, "step": 1109 }, { "epoch": 0.23197492163009403, "grad_norm": 1.2857864580447869, "learning_rate": 1.9996118071469963e-05, "loss": 0.2934, "step": 1110 }, { "epoch": 0.23218390804597702, "grad_norm": 1.6146423834839463, "learning_rate": 1.9996086577275964e-05, "loss": 0.2953, "step": 1111 }, { "epoch": 0.23239289446185998, "grad_norm": 1.3148125717487114, "learning_rate": 1.9996054955865965e-05, "loss": 0.2687, "step": 1112 }, { "epoch": 0.23260188087774294, "grad_norm": 1.6178608487346704, "learning_rate": 1.9996023207240372e-05, "loss": 0.298, "step": 1113 }, { "epoch": 0.2328108672936259, "grad_norm": 1.396811699261938, "learning_rate": 1.9995991331399582e-05, "loss": 0.3113, "step": 1114 }, { "epoch": 0.2330198537095089, "grad_norm": 1.2047388658018574, "learning_rate": 1.999595932834401e-05, "loss": 0.2831, "step": 1115 }, { "epoch": 0.23322884012539186, "grad_norm": 1.4220928908654233, "learning_rate": 1.9995927198074054e-05, "loss": 0.3187, "step": 1116 }, { "epoch": 0.23343782654127482, "grad_norm": 1.0713917759287928, "learning_rate": 1.9995894940590128e-05, "loss": 0.2803, "step": 1117 }, { "epoch": 0.23364681295715778, "grad_norm": 1.1428335263019906, "learning_rate": 1.9995862555892645e-05, "loss": 0.2913, "step": 1118 }, { "epoch": 0.23385579937304074, "grad_norm": 1.2227771439054291, "learning_rate": 1.999583004398201e-05, "loss": 0.273, "step": 1119 }, { "epoch": 0.23406478578892373, "grad_norm": 1.7093734677305081, "learning_rate": 1.9995797404858643e-05, "loss": 0.2785, "step": 1120 }, { "epoch": 0.2342737722048067, "grad_norm": 1.47265952924282, "learning_rate": 1.9995764638522957e-05, "loss": 0.2699, "step": 1121 }, { "epoch": 0.23448275862068965, "grad_norm": 1.6946778217513379, "learning_rate": 1.9995731744975368e-05, "loss": 0.2587, "step": 1122 }, { "epoch": 0.2346917450365726, "grad_norm": 1.3926218637849537, "learning_rate": 1.9995698724216294e-05, "loss": 0.27, "step": 1123 }, { "epoch": 0.2349007314524556, "grad_norm": 1.803832524648578, "learning_rate": 1.9995665576246157e-05, "loss": 0.337, "step": 1124 }, { "epoch": 0.23510971786833856, "grad_norm": 2.0685728392190907, "learning_rate": 1.9995632301065382e-05, "loss": 0.3055, "step": 1125 }, { "epoch": 0.23531870428422152, "grad_norm": 1.7310053584713394, "learning_rate": 1.9995598898674388e-05, "loss": 0.294, "step": 1126 }, { "epoch": 0.2355276907001045, "grad_norm": 1.5518227634965391, "learning_rate": 1.9995565369073603e-05, "loss": 0.2815, "step": 1127 }, { "epoch": 0.23573667711598745, "grad_norm": 2.217620278031729, "learning_rate": 1.9995531712263446e-05, "loss": 0.2653, "step": 1128 }, { "epoch": 0.23594566353187044, "grad_norm": 1.5567632073147144, "learning_rate": 1.9995497928244356e-05, "loss": 0.3064, "step": 1129 }, { "epoch": 0.2361546499477534, "grad_norm": 1.4316967877728626, "learning_rate": 1.9995464017016756e-05, "loss": 0.3043, "step": 1130 }, { "epoch": 0.23636363636363636, "grad_norm": 1.8287695546237035, "learning_rate": 1.999542997858108e-05, "loss": 0.2998, "step": 1131 }, { "epoch": 0.23657262277951932, "grad_norm": 2.2831556472638654, "learning_rate": 1.9995395812937765e-05, "loss": 0.31, "step": 1132 }, { "epoch": 0.2367816091954023, "grad_norm": 1.341848286411537, "learning_rate": 1.999536152008724e-05, "loss": 0.2909, "step": 1133 }, { "epoch": 0.23699059561128527, "grad_norm": 1.5090770745284943, "learning_rate": 1.999532710002994e-05, "loss": 0.2872, "step": 1134 }, { "epoch": 0.23719958202716823, "grad_norm": 1.5904648821845384, "learning_rate": 1.999529255276631e-05, "loss": 0.2841, "step": 1135 }, { "epoch": 0.2374085684430512, "grad_norm": 1.786629733198219, "learning_rate": 1.9995257878296788e-05, "loss": 0.3145, "step": 1136 }, { "epoch": 0.23761755485893418, "grad_norm": 1.5174837674564516, "learning_rate": 1.9995223076621812e-05, "loss": 0.2917, "step": 1137 }, { "epoch": 0.23782654127481714, "grad_norm": 1.8621082907760391, "learning_rate": 1.9995188147741823e-05, "loss": 0.3031, "step": 1138 }, { "epoch": 0.2380355276907001, "grad_norm": 1.4191088556799696, "learning_rate": 1.999515309165727e-05, "loss": 0.3262, "step": 1139 }, { "epoch": 0.23824451410658307, "grad_norm": 1.53019732439534, "learning_rate": 1.9995117908368596e-05, "loss": 0.2913, "step": 1140 }, { "epoch": 0.23845350052246603, "grad_norm": 1.734273158047837, "learning_rate": 1.9995082597876253e-05, "loss": 0.3015, "step": 1141 }, { "epoch": 0.23866248693834902, "grad_norm": 1.4115775275471876, "learning_rate": 1.999504716018069e-05, "loss": 0.2739, "step": 1142 }, { "epoch": 0.23887147335423198, "grad_norm": 1.4282254620717738, "learning_rate": 1.9995011595282352e-05, "loss": 0.3364, "step": 1143 }, { "epoch": 0.23908045977011494, "grad_norm": 1.699219395138384, "learning_rate": 1.99949759031817e-05, "loss": 0.3304, "step": 1144 }, { "epoch": 0.2392894461859979, "grad_norm": 1.4080327147237022, "learning_rate": 1.999494008387918e-05, "loss": 0.2998, "step": 1145 }, { "epoch": 0.2394984326018809, "grad_norm": 1.3954121573806388, "learning_rate": 1.9994904137375254e-05, "loss": 0.3386, "step": 1146 }, { "epoch": 0.23970741901776385, "grad_norm": 1.2517437229702328, "learning_rate": 1.9994868063670375e-05, "loss": 0.3335, "step": 1147 }, { "epoch": 0.2399164054336468, "grad_norm": 1.6408229214748578, "learning_rate": 1.9994831862765007e-05, "loss": 0.3163, "step": 1148 }, { "epoch": 0.24012539184952977, "grad_norm": 1.2410141472978304, "learning_rate": 1.9994795534659607e-05, "loss": 0.3045, "step": 1149 }, { "epoch": 0.24033437826541273, "grad_norm": 1.5782065029439294, "learning_rate": 1.999475907935464e-05, "loss": 0.2958, "step": 1150 }, { "epoch": 0.24054336468129572, "grad_norm": 1.5329712271105602, "learning_rate": 1.9994722496850564e-05, "loss": 0.3018, "step": 1151 }, { "epoch": 0.24075235109717869, "grad_norm": 1.5703654706754093, "learning_rate": 1.9994685787147853e-05, "loss": 0.2891, "step": 1152 }, { "epoch": 0.24096133751306165, "grad_norm": 1.4073424906274377, "learning_rate": 1.999464895024697e-05, "loss": 0.3101, "step": 1153 }, { "epoch": 0.2411703239289446, "grad_norm": 1.493364984237948, "learning_rate": 1.999461198614838e-05, "loss": 0.3167, "step": 1154 }, { "epoch": 0.2413793103448276, "grad_norm": 1.2713559769431504, "learning_rate": 1.999457489485256e-05, "loss": 0.286, "step": 1155 }, { "epoch": 0.24158829676071056, "grad_norm": 1.4243989152807472, "learning_rate": 1.999453767635998e-05, "loss": 0.2947, "step": 1156 }, { "epoch": 0.24179728317659352, "grad_norm": 1.3233512443529614, "learning_rate": 1.9994500330671112e-05, "loss": 0.3014, "step": 1157 }, { "epoch": 0.24200626959247648, "grad_norm": 1.3795189342681995, "learning_rate": 1.9994462857786433e-05, "loss": 0.3103, "step": 1158 }, { "epoch": 0.24221525600835947, "grad_norm": 2.226326022425212, "learning_rate": 1.9994425257706418e-05, "loss": 0.3261, "step": 1159 }, { "epoch": 0.24242424242424243, "grad_norm": 1.752704010148811, "learning_rate": 1.9994387530431548e-05, "loss": 0.2791, "step": 1160 }, { "epoch": 0.2426332288401254, "grad_norm": 1.53687389088697, "learning_rate": 1.99943496759623e-05, "loss": 0.318, "step": 1161 }, { "epoch": 0.24284221525600835, "grad_norm": 1.765176805299159, "learning_rate": 1.999431169429916e-05, "loss": 0.2808, "step": 1162 }, { "epoch": 0.24305120167189131, "grad_norm": 2.015267443310764, "learning_rate": 1.999427358544261e-05, "loss": 0.3128, "step": 1163 }, { "epoch": 0.2432601880877743, "grad_norm": 1.6873384171449366, "learning_rate": 1.999423534939313e-05, "loss": 0.2911, "step": 1164 }, { "epoch": 0.24346917450365727, "grad_norm": 1.94402412842093, "learning_rate": 1.9994196986151212e-05, "loss": 0.2941, "step": 1165 }, { "epoch": 0.24367816091954023, "grad_norm": 1.5777426429327361, "learning_rate": 1.9994158495717343e-05, "loss": 0.2921, "step": 1166 }, { "epoch": 0.2438871473354232, "grad_norm": 1.9211509578780392, "learning_rate": 1.9994119878092017e-05, "loss": 0.2885, "step": 1167 }, { "epoch": 0.24409613375130618, "grad_norm": 1.3502752013706814, "learning_rate": 1.9994081133275715e-05, "loss": 0.3206, "step": 1168 }, { "epoch": 0.24430512016718914, "grad_norm": 2.166480462116394, "learning_rate": 1.999404226126894e-05, "loss": 0.3001, "step": 1169 }, { "epoch": 0.2445141065830721, "grad_norm": 1.454624953209962, "learning_rate": 1.999400326207218e-05, "loss": 0.3043, "step": 1170 }, { "epoch": 0.24472309299895506, "grad_norm": 2.3024266928848323, "learning_rate": 1.9993964135685934e-05, "loss": 0.3318, "step": 1171 }, { "epoch": 0.24493207941483802, "grad_norm": 1.3741672174079582, "learning_rate": 1.9993924882110703e-05, "loss": 0.304, "step": 1172 }, { "epoch": 0.245141065830721, "grad_norm": 1.5334649032041026, "learning_rate": 1.9993885501346984e-05, "loss": 0.2985, "step": 1173 }, { "epoch": 0.24535005224660397, "grad_norm": 1.5735686880794189, "learning_rate": 1.9993845993395275e-05, "loss": 0.2819, "step": 1174 }, { "epoch": 0.24555903866248693, "grad_norm": 1.5791624900253722, "learning_rate": 1.9993806358256086e-05, "loss": 0.291, "step": 1175 }, { "epoch": 0.2457680250783699, "grad_norm": 1.6399554430733483, "learning_rate": 1.9993766595929913e-05, "loss": 0.3306, "step": 1176 }, { "epoch": 0.24597701149425288, "grad_norm": 1.8021991065205585, "learning_rate": 1.9993726706417268e-05, "loss": 0.2945, "step": 1177 }, { "epoch": 0.24618599791013585, "grad_norm": 1.3874002626620834, "learning_rate": 1.9993686689718657e-05, "loss": 0.3214, "step": 1178 }, { "epoch": 0.2463949843260188, "grad_norm": 1.9767000645990722, "learning_rate": 1.999364654583459e-05, "loss": 0.2935, "step": 1179 }, { "epoch": 0.24660397074190177, "grad_norm": 1.92791696840878, "learning_rate": 1.9993606274765576e-05, "loss": 0.3059, "step": 1180 }, { "epoch": 0.24681295715778476, "grad_norm": 1.3689474086136135, "learning_rate": 1.999356587651213e-05, "loss": 0.2902, "step": 1181 }, { "epoch": 0.24702194357366772, "grad_norm": 1.2502289750012074, "learning_rate": 1.9993525351074763e-05, "loss": 0.3202, "step": 1182 }, { "epoch": 0.24723092998955068, "grad_norm": 1.2688601870714336, "learning_rate": 1.9993484698453994e-05, "loss": 0.259, "step": 1183 }, { "epoch": 0.24743991640543364, "grad_norm": 1.9527817969679084, "learning_rate": 1.9993443918650337e-05, "loss": 0.3244, "step": 1184 }, { "epoch": 0.2476489028213166, "grad_norm": 1.5492336241708258, "learning_rate": 1.9993403011664313e-05, "loss": 0.2859, "step": 1185 }, { "epoch": 0.2478578892371996, "grad_norm": 1.351124983086292, "learning_rate": 1.9993361977496445e-05, "loss": 0.3041, "step": 1186 }, { "epoch": 0.24806687565308255, "grad_norm": 1.2336507123072864, "learning_rate": 1.999332081614725e-05, "loss": 0.289, "step": 1187 }, { "epoch": 0.2482758620689655, "grad_norm": 1.5144052060731805, "learning_rate": 1.9993279527617256e-05, "loss": 0.293, "step": 1188 }, { "epoch": 0.24848484848484848, "grad_norm": 1.7632605452077164, "learning_rate": 1.9993238111906985e-05, "loss": 0.3083, "step": 1189 }, { "epoch": 0.24869383490073146, "grad_norm": 1.5208885384381416, "learning_rate": 1.9993196569016967e-05, "loss": 0.3054, "step": 1190 }, { "epoch": 0.24890282131661443, "grad_norm": 1.2914866259571585, "learning_rate": 1.999315489894773e-05, "loss": 0.3081, "step": 1191 }, { "epoch": 0.2491118077324974, "grad_norm": 1.3428130365344233, "learning_rate": 1.9993113101699806e-05, "loss": 0.2807, "step": 1192 }, { "epoch": 0.24932079414838035, "grad_norm": 1.2721233610377962, "learning_rate": 1.999307117727372e-05, "loss": 0.3035, "step": 1193 }, { "epoch": 0.2495297805642633, "grad_norm": 1.3891089169479518, "learning_rate": 1.9993029125670015e-05, "loss": 0.3186, "step": 1194 }, { "epoch": 0.2497387669801463, "grad_norm": 1.702868925743051, "learning_rate": 1.9992986946889223e-05, "loss": 0.3012, "step": 1195 }, { "epoch": 0.24994775339602926, "grad_norm": 1.6731343431293142, "learning_rate": 1.9992944640931874e-05, "loss": 0.2958, "step": 1196 }, { "epoch": 0.2501567398119122, "grad_norm": 1.6002379035449945, "learning_rate": 1.9992902207798515e-05, "loss": 0.2762, "step": 1197 }, { "epoch": 0.2503657262277952, "grad_norm": 1.4518593241037658, "learning_rate": 1.9992859647489685e-05, "loss": 0.2987, "step": 1198 }, { "epoch": 0.25057471264367814, "grad_norm": 2.3268963436267063, "learning_rate": 1.9992816960005922e-05, "loss": 0.3054, "step": 1199 }, { "epoch": 0.2507836990595611, "grad_norm": 1.4137890854447002, "learning_rate": 1.999277414534777e-05, "loss": 0.2865, "step": 1200 }, { "epoch": 0.2509926854754441, "grad_norm": 1.198543698974554, "learning_rate": 1.9992731203515773e-05, "loss": 0.2944, "step": 1201 }, { "epoch": 0.2512016718913271, "grad_norm": 1.5261998926562281, "learning_rate": 1.9992688134510482e-05, "loss": 0.3262, "step": 1202 }, { "epoch": 0.25141065830721004, "grad_norm": 1.5692681405603297, "learning_rate": 1.9992644938332443e-05, "loss": 0.2938, "step": 1203 }, { "epoch": 0.251619644723093, "grad_norm": 1.5213508212802065, "learning_rate": 1.9992601614982205e-05, "loss": 0.295, "step": 1204 }, { "epoch": 0.25182863113897597, "grad_norm": 1.8722407153032339, "learning_rate": 1.9992558164460318e-05, "loss": 0.3084, "step": 1205 }, { "epoch": 0.25203761755485893, "grad_norm": 1.2762259093005086, "learning_rate": 1.9992514586767333e-05, "loss": 0.298, "step": 1206 }, { "epoch": 0.2522466039707419, "grad_norm": 1.3209006054968442, "learning_rate": 1.9992470881903813e-05, "loss": 0.2812, "step": 1207 }, { "epoch": 0.25245559038662485, "grad_norm": 1.8000062285641525, "learning_rate": 1.9992427049870307e-05, "loss": 0.2951, "step": 1208 }, { "epoch": 0.2526645768025078, "grad_norm": 2.0223122704415952, "learning_rate": 1.9992383090667378e-05, "loss": 0.2955, "step": 1209 }, { "epoch": 0.25287356321839083, "grad_norm": 1.4952145515905404, "learning_rate": 1.999233900429558e-05, "loss": 0.3109, "step": 1210 }, { "epoch": 0.2530825496342738, "grad_norm": 1.7594094523063606, "learning_rate": 1.9992294790755477e-05, "loss": 0.3184, "step": 1211 }, { "epoch": 0.25329153605015675, "grad_norm": 1.930057121450468, "learning_rate": 1.999225045004763e-05, "loss": 0.2961, "step": 1212 }, { "epoch": 0.2535005224660397, "grad_norm": 1.9639814581744062, "learning_rate": 1.9992205982172607e-05, "loss": 0.2978, "step": 1213 }, { "epoch": 0.2537095088819227, "grad_norm": 1.3200165590346211, "learning_rate": 1.9992161387130972e-05, "loss": 0.2939, "step": 1214 }, { "epoch": 0.25391849529780564, "grad_norm": 1.717328224116354, "learning_rate": 1.9992116664923288e-05, "loss": 0.2739, "step": 1215 }, { "epoch": 0.2541274817136886, "grad_norm": 1.3803613220140287, "learning_rate": 1.999207181555013e-05, "loss": 0.3086, "step": 1216 }, { "epoch": 0.25433646812957156, "grad_norm": 1.518072622865352, "learning_rate": 1.999202683901207e-05, "loss": 0.2928, "step": 1217 }, { "epoch": 0.2545454545454545, "grad_norm": 1.4878313977271134, "learning_rate": 1.9991981735309677e-05, "loss": 0.2844, "step": 1218 }, { "epoch": 0.25475444096133754, "grad_norm": 1.4659328654024946, "learning_rate": 1.9991936504443522e-05, "loss": 0.2782, "step": 1219 }, { "epoch": 0.2549634273772205, "grad_norm": 1.2286997110441025, "learning_rate": 1.9991891146414188e-05, "loss": 0.2876, "step": 1220 }, { "epoch": 0.25517241379310346, "grad_norm": 1.5364787669571005, "learning_rate": 1.9991845661222244e-05, "loss": 0.3104, "step": 1221 }, { "epoch": 0.2553814002089864, "grad_norm": 1.412265350675513, "learning_rate": 1.9991800048868276e-05, "loss": 0.3011, "step": 1222 }, { "epoch": 0.2555903866248694, "grad_norm": 1.4376371144948419, "learning_rate": 1.999175430935286e-05, "loss": 0.2991, "step": 1223 }, { "epoch": 0.25579937304075234, "grad_norm": 1.337453124520765, "learning_rate": 1.999170844267658e-05, "loss": 0.3063, "step": 1224 }, { "epoch": 0.2560083594566353, "grad_norm": 1.7264212426115877, "learning_rate": 1.999166244884002e-05, "loss": 0.2899, "step": 1225 }, { "epoch": 0.25621734587251827, "grad_norm": 1.304184165281557, "learning_rate": 1.9991616327843767e-05, "loss": 0.2754, "step": 1226 }, { "epoch": 0.2564263322884012, "grad_norm": 1.7588619951422966, "learning_rate": 1.9991570079688404e-05, "loss": 0.2895, "step": 1227 }, { "epoch": 0.25663531870428424, "grad_norm": 1.4479624729714027, "learning_rate": 1.999152370437452e-05, "loss": 0.2666, "step": 1228 }, { "epoch": 0.2568443051201672, "grad_norm": 1.5223885500096874, "learning_rate": 1.999147720190271e-05, "loss": 0.2903, "step": 1229 }, { "epoch": 0.25705329153605017, "grad_norm": 1.4083815466493579, "learning_rate": 1.9991430572273557e-05, "loss": 0.2905, "step": 1230 }, { "epoch": 0.2572622779519331, "grad_norm": 1.3872840188746591, "learning_rate": 1.9991383815487665e-05, "loss": 0.3126, "step": 1231 }, { "epoch": 0.2574712643678161, "grad_norm": 1.2571818969473567, "learning_rate": 1.999133693154562e-05, "loss": 0.2655, "step": 1232 }, { "epoch": 0.25768025078369905, "grad_norm": 1.2745507264185159, "learning_rate": 1.9991289920448026e-05, "loss": 0.286, "step": 1233 }, { "epoch": 0.257889237199582, "grad_norm": 1.4951956093531356, "learning_rate": 1.9991242782195476e-05, "loss": 0.2692, "step": 1234 }, { "epoch": 0.258098223615465, "grad_norm": 1.436888586662528, "learning_rate": 1.9991195516788573e-05, "loss": 0.279, "step": 1235 }, { "epoch": 0.258307210031348, "grad_norm": 1.4555542933918402, "learning_rate": 1.9991148124227916e-05, "loss": 0.2937, "step": 1236 }, { "epoch": 0.25851619644723095, "grad_norm": 1.6279796265015039, "learning_rate": 1.9991100604514112e-05, "loss": 0.3374, "step": 1237 }, { "epoch": 0.2587251828631139, "grad_norm": 1.8290204863281732, "learning_rate": 1.999105295764776e-05, "loss": 0.3263, "step": 1238 }, { "epoch": 0.2589341692789969, "grad_norm": 1.4531078799662855, "learning_rate": 1.9991005183629468e-05, "loss": 0.2913, "step": 1239 }, { "epoch": 0.25914315569487983, "grad_norm": 1.5237719825347154, "learning_rate": 1.999095728245985e-05, "loss": 0.31, "step": 1240 }, { "epoch": 0.2593521421107628, "grad_norm": 1.7798174284186974, "learning_rate": 1.9990909254139507e-05, "loss": 0.3327, "step": 1241 }, { "epoch": 0.25956112852664576, "grad_norm": 2.007745408200133, "learning_rate": 1.9990861098669057e-05, "loss": 0.3216, "step": 1242 }, { "epoch": 0.2597701149425287, "grad_norm": 1.3855305836281726, "learning_rate": 1.9990812816049114e-05, "loss": 0.2871, "step": 1243 }, { "epoch": 0.2599791013584117, "grad_norm": 1.5382984893742677, "learning_rate": 1.999076440628028e-05, "loss": 0.2876, "step": 1244 }, { "epoch": 0.2601880877742947, "grad_norm": 2.2852953034962327, "learning_rate": 1.9990715869363188e-05, "loss": 0.3164, "step": 1245 }, { "epoch": 0.26039707419017766, "grad_norm": 1.314786175246474, "learning_rate": 1.9990667205298443e-05, "loss": 0.285, "step": 1246 }, { "epoch": 0.2606060606060606, "grad_norm": 1.5141422824687052, "learning_rate": 1.999061841408667e-05, "loss": 0.2858, "step": 1247 }, { "epoch": 0.2608150470219436, "grad_norm": 1.6555148685419847, "learning_rate": 1.999056949572849e-05, "loss": 0.3059, "step": 1248 }, { "epoch": 0.26102403343782654, "grad_norm": 1.4256509797078465, "learning_rate": 1.9990520450224523e-05, "loss": 0.3087, "step": 1249 }, { "epoch": 0.2612330198537095, "grad_norm": 1.4161696565611963, "learning_rate": 1.9990471277575396e-05, "loss": 0.2792, "step": 1250 }, { "epoch": 0.26144200626959246, "grad_norm": 1.6364022956681188, "learning_rate": 1.9990421977781734e-05, "loss": 0.3106, "step": 1251 }, { "epoch": 0.2616509926854754, "grad_norm": 2.1793413194063205, "learning_rate": 1.9990372550844158e-05, "loss": 0.3194, "step": 1252 }, { "epoch": 0.2618599791013584, "grad_norm": 1.866513911951744, "learning_rate": 1.9990322996763308e-05, "loss": 0.2874, "step": 1253 }, { "epoch": 0.2620689655172414, "grad_norm": 1.357292722571799, "learning_rate": 1.999027331553981e-05, "loss": 0.3283, "step": 1254 }, { "epoch": 0.26227795193312436, "grad_norm": 1.4904773720246252, "learning_rate": 1.9990223507174296e-05, "loss": 0.2902, "step": 1255 }, { "epoch": 0.2624869383490073, "grad_norm": 1.4791772983594023, "learning_rate": 1.9990173571667398e-05, "loss": 0.3038, "step": 1256 }, { "epoch": 0.2626959247648903, "grad_norm": 1.388249203656998, "learning_rate": 1.9990123509019754e-05, "loss": 0.298, "step": 1257 }, { "epoch": 0.26290491118077325, "grad_norm": 1.4335401838048627, "learning_rate": 1.9990073319231996e-05, "loss": 0.3299, "step": 1258 }, { "epoch": 0.2631138975966562, "grad_norm": 1.8017115182787171, "learning_rate": 1.999002300230477e-05, "loss": 0.306, "step": 1259 }, { "epoch": 0.26332288401253917, "grad_norm": 1.3059533473339753, "learning_rate": 1.9989972558238717e-05, "loss": 0.2954, "step": 1260 }, { "epoch": 0.26353187042842213, "grad_norm": 1.2528539790919293, "learning_rate": 1.998992198703447e-05, "loss": 0.2984, "step": 1261 }, { "epoch": 0.2637408568443051, "grad_norm": 1.475498722451089, "learning_rate": 1.9989871288692678e-05, "loss": 0.2853, "step": 1262 }, { "epoch": 0.2639498432601881, "grad_norm": 1.3824121986833133, "learning_rate": 1.998982046321399e-05, "loss": 0.2793, "step": 1263 }, { "epoch": 0.26415882967607107, "grad_norm": 1.6576210116883423, "learning_rate": 1.9989769510599048e-05, "loss": 0.3024, "step": 1264 }, { "epoch": 0.26436781609195403, "grad_norm": 2.0813275918560614, "learning_rate": 1.99897184308485e-05, "loss": 0.2964, "step": 1265 }, { "epoch": 0.264576802507837, "grad_norm": 1.422646350172504, "learning_rate": 1.9989667223963e-05, "loss": 0.3375, "step": 1266 }, { "epoch": 0.26478578892371996, "grad_norm": 2.086214486812947, "learning_rate": 1.9989615889943196e-05, "loss": 0.301, "step": 1267 }, { "epoch": 0.2649947753396029, "grad_norm": 1.9558739531089813, "learning_rate": 1.9989564428789742e-05, "loss": 0.3281, "step": 1268 }, { "epoch": 0.2652037617554859, "grad_norm": 1.6377100478330577, "learning_rate": 1.9989512840503294e-05, "loss": 0.2911, "step": 1269 }, { "epoch": 0.26541274817136884, "grad_norm": 1.3623916122271535, "learning_rate": 1.998946112508451e-05, "loss": 0.3129, "step": 1270 }, { "epoch": 0.2656217345872518, "grad_norm": 2.152237586364468, "learning_rate": 1.9989409282534043e-05, "loss": 0.3221, "step": 1271 }, { "epoch": 0.2658307210031348, "grad_norm": 1.788899924657937, "learning_rate": 1.9989357312852555e-05, "loss": 0.2709, "step": 1272 }, { "epoch": 0.2660397074190178, "grad_norm": 1.7708344746168263, "learning_rate": 1.998930521604071e-05, "loss": 0.3043, "step": 1273 }, { "epoch": 0.26624869383490074, "grad_norm": 1.6583867693527954, "learning_rate": 1.9989252992099168e-05, "loss": 0.3088, "step": 1274 }, { "epoch": 0.2664576802507837, "grad_norm": 1.4543006543660262, "learning_rate": 1.9989200641028597e-05, "loss": 0.2946, "step": 1275 }, { "epoch": 0.26666666666666666, "grad_norm": 1.9232441822940014, "learning_rate": 1.9989148162829663e-05, "loss": 0.2913, "step": 1276 }, { "epoch": 0.2668756530825496, "grad_norm": 1.5478073076194043, "learning_rate": 1.998909555750303e-05, "loss": 0.2964, "step": 1277 }, { "epoch": 0.2670846394984326, "grad_norm": 1.5077998493339093, "learning_rate": 1.998904282504937e-05, "loss": 0.3149, "step": 1278 }, { "epoch": 0.26729362591431555, "grad_norm": 1.525967969912868, "learning_rate": 1.998898996546935e-05, "loss": 0.3059, "step": 1279 }, { "epoch": 0.26750261233019856, "grad_norm": 1.2694790648547665, "learning_rate": 1.998893697876365e-05, "loss": 0.2679, "step": 1280 }, { "epoch": 0.2677115987460815, "grad_norm": 1.3948009250715017, "learning_rate": 1.998888386493294e-05, "loss": 0.3184, "step": 1281 }, { "epoch": 0.2679205851619645, "grad_norm": 1.971038837696937, "learning_rate": 1.9988830623977896e-05, "loss": 0.3004, "step": 1282 }, { "epoch": 0.26812957157784745, "grad_norm": 1.384283121130928, "learning_rate": 1.99887772558992e-05, "loss": 0.2817, "step": 1283 }, { "epoch": 0.2683385579937304, "grad_norm": 1.4323416934706927, "learning_rate": 1.9988723760697525e-05, "loss": 0.3283, "step": 1284 }, { "epoch": 0.26854754440961337, "grad_norm": 1.3491346554434236, "learning_rate": 1.998867013837355e-05, "loss": 0.2752, "step": 1285 }, { "epoch": 0.26875653082549633, "grad_norm": 1.5074328297022088, "learning_rate": 1.998861638892797e-05, "loss": 0.2639, "step": 1286 }, { "epoch": 0.2689655172413793, "grad_norm": 1.4805922275290757, "learning_rate": 1.9988562512361454e-05, "loss": 0.2947, "step": 1287 }, { "epoch": 0.26917450365726225, "grad_norm": 1.3808999273289548, "learning_rate": 1.9988508508674698e-05, "loss": 0.3092, "step": 1288 }, { "epoch": 0.26938349007314527, "grad_norm": 1.398707191077963, "learning_rate": 1.9988454377868385e-05, "loss": 0.3229, "step": 1289 }, { "epoch": 0.26959247648902823, "grad_norm": 1.347990550423591, "learning_rate": 1.9988400119943203e-05, "loss": 0.3077, "step": 1290 }, { "epoch": 0.2698014629049112, "grad_norm": 1.4489762396113905, "learning_rate": 1.9988345734899845e-05, "loss": 0.3095, "step": 1291 }, { "epoch": 0.27001044932079415, "grad_norm": 1.514750966952602, "learning_rate": 1.9988291222739004e-05, "loss": 0.313, "step": 1292 }, { "epoch": 0.2702194357366771, "grad_norm": 1.3985757670123773, "learning_rate": 1.998823658346137e-05, "loss": 0.3152, "step": 1293 }, { "epoch": 0.2704284221525601, "grad_norm": 1.4741993398383948, "learning_rate": 1.9988181817067643e-05, "loss": 0.302, "step": 1294 }, { "epoch": 0.27063740856844304, "grad_norm": 1.5923045698981237, "learning_rate": 1.9988126923558515e-05, "loss": 0.2882, "step": 1295 }, { "epoch": 0.270846394984326, "grad_norm": 1.2977384880543383, "learning_rate": 1.9988071902934686e-05, "loss": 0.3092, "step": 1296 }, { "epoch": 0.27105538140020896, "grad_norm": 1.372480558298643, "learning_rate": 1.998801675519686e-05, "loss": 0.3031, "step": 1297 }, { "epoch": 0.271264367816092, "grad_norm": 1.3841576104181732, "learning_rate": 1.9987961480345736e-05, "loss": 0.2692, "step": 1298 }, { "epoch": 0.27147335423197494, "grad_norm": 1.5060015629479737, "learning_rate": 1.9987906078382015e-05, "loss": 0.2871, "step": 1299 }, { "epoch": 0.2716823406478579, "grad_norm": 1.382527719322802, "learning_rate": 1.9987850549306405e-05, "loss": 0.2647, "step": 1300 }, { "epoch": 0.27189132706374086, "grad_norm": 1.5818907501613102, "learning_rate": 1.9987794893119616e-05, "loss": 0.3103, "step": 1301 }, { "epoch": 0.2721003134796238, "grad_norm": 1.5152862585932798, "learning_rate": 1.998773910982235e-05, "loss": 0.3137, "step": 1302 }, { "epoch": 0.2723092998955068, "grad_norm": 1.3387355964114207, "learning_rate": 1.998768319941532e-05, "loss": 0.2829, "step": 1303 }, { "epoch": 0.27251828631138975, "grad_norm": 1.735278497141071, "learning_rate": 1.9987627161899233e-05, "loss": 0.286, "step": 1304 }, { "epoch": 0.2727272727272727, "grad_norm": 1.447280996104722, "learning_rate": 1.998757099727481e-05, "loss": 0.2736, "step": 1305 }, { "epoch": 0.27293625914315567, "grad_norm": 1.663673105259235, "learning_rate": 1.998751470554276e-05, "loss": 0.2887, "step": 1306 }, { "epoch": 0.2731452455590387, "grad_norm": 1.3161006307703345, "learning_rate": 1.9987458286703803e-05, "loss": 0.2992, "step": 1307 }, { "epoch": 0.27335423197492165, "grad_norm": 1.4742882964674955, "learning_rate": 1.9987401740758658e-05, "loss": 0.322, "step": 1308 }, { "epoch": 0.2735632183908046, "grad_norm": 1.3669943890922647, "learning_rate": 1.998734506770804e-05, "loss": 0.2827, "step": 1309 }, { "epoch": 0.27377220480668757, "grad_norm": 1.3983544984060337, "learning_rate": 1.998728826755267e-05, "loss": 0.2691, "step": 1310 }, { "epoch": 0.27398119122257053, "grad_norm": 1.529574465040902, "learning_rate": 1.998723134029328e-05, "loss": 0.2819, "step": 1311 }, { "epoch": 0.2741901776384535, "grad_norm": 1.3047897849487766, "learning_rate": 1.998717428593058e-05, "loss": 0.2801, "step": 1312 }, { "epoch": 0.27439916405433645, "grad_norm": 1.4323149275173486, "learning_rate": 1.998711710446531e-05, "loss": 0.3064, "step": 1313 }, { "epoch": 0.2746081504702194, "grad_norm": 1.6676194998263467, "learning_rate": 1.9987059795898187e-05, "loss": 0.3005, "step": 1314 }, { "epoch": 0.27481713688610243, "grad_norm": 2.0212408044162116, "learning_rate": 1.9987002360229945e-05, "loss": 0.3015, "step": 1315 }, { "epoch": 0.2750261233019854, "grad_norm": 1.4575979867799003, "learning_rate": 1.9986944797461313e-05, "loss": 0.3055, "step": 1316 }, { "epoch": 0.27523510971786835, "grad_norm": 1.3595019528745198, "learning_rate": 1.9986887107593033e-05, "loss": 0.2967, "step": 1317 }, { "epoch": 0.2754440961337513, "grad_norm": 1.7584798701043796, "learning_rate": 1.9986829290625826e-05, "loss": 0.2853, "step": 1318 }, { "epoch": 0.2756530825496343, "grad_norm": 1.6614276265872856, "learning_rate": 1.998677134656043e-05, "loss": 0.2599, "step": 1319 }, { "epoch": 0.27586206896551724, "grad_norm": 1.6441762042461687, "learning_rate": 1.9986713275397593e-05, "loss": 0.2854, "step": 1320 }, { "epoch": 0.2760710553814002, "grad_norm": 1.6736608903128316, "learning_rate": 1.9986655077138043e-05, "loss": 0.3167, "step": 1321 }, { "epoch": 0.27628004179728316, "grad_norm": 1.7054079207608186, "learning_rate": 1.9986596751782525e-05, "loss": 0.2942, "step": 1322 }, { "epoch": 0.2764890282131661, "grad_norm": 1.9070429840037093, "learning_rate": 1.9986538299331784e-05, "loss": 0.279, "step": 1323 }, { "epoch": 0.27669801462904914, "grad_norm": 1.995690811432803, "learning_rate": 1.9986479719786557e-05, "loss": 0.2989, "step": 1324 }, { "epoch": 0.2769070010449321, "grad_norm": 1.5470950330602828, "learning_rate": 1.9986421013147598e-05, "loss": 0.29, "step": 1325 }, { "epoch": 0.27711598746081506, "grad_norm": 2.0402045737249166, "learning_rate": 1.9986362179415644e-05, "loss": 0.3144, "step": 1326 }, { "epoch": 0.277324973876698, "grad_norm": 1.8060454756389468, "learning_rate": 1.9986303218591452e-05, "loss": 0.2929, "step": 1327 }, { "epoch": 0.277533960292581, "grad_norm": 2.253633416647902, "learning_rate": 1.9986244130675766e-05, "loss": 0.3161, "step": 1328 }, { "epoch": 0.27774294670846394, "grad_norm": 1.4696433280781258, "learning_rate": 1.9986184915669343e-05, "loss": 0.2925, "step": 1329 }, { "epoch": 0.2779519331243469, "grad_norm": 2.120767521521417, "learning_rate": 1.9986125573572938e-05, "loss": 0.3057, "step": 1330 }, { "epoch": 0.27816091954022987, "grad_norm": 1.3285867386355283, "learning_rate": 1.9986066104387302e-05, "loss": 0.313, "step": 1331 }, { "epoch": 0.27836990595611283, "grad_norm": 1.295183521936299, "learning_rate": 1.9986006508113192e-05, "loss": 0.3049, "step": 1332 }, { "epoch": 0.27857889237199585, "grad_norm": 1.7365950200159777, "learning_rate": 1.9985946784751367e-05, "loss": 0.2892, "step": 1333 }, { "epoch": 0.2787878787878788, "grad_norm": 1.2393589049831741, "learning_rate": 1.9985886934302592e-05, "loss": 0.2877, "step": 1334 }, { "epoch": 0.27899686520376177, "grad_norm": 1.263386075536994, "learning_rate": 1.998582695676762e-05, "loss": 0.2817, "step": 1335 }, { "epoch": 0.27920585161964473, "grad_norm": 1.616192461081345, "learning_rate": 1.9985766852147222e-05, "loss": 0.2923, "step": 1336 }, { "epoch": 0.2794148380355277, "grad_norm": 1.5478934634557358, "learning_rate": 1.9985706620442154e-05, "loss": 0.3021, "step": 1337 }, { "epoch": 0.27962382445141065, "grad_norm": 1.5063664069557798, "learning_rate": 1.9985646261653197e-05, "loss": 0.2971, "step": 1338 }, { "epoch": 0.2798328108672936, "grad_norm": 1.4553989534689726, "learning_rate": 1.9985585775781104e-05, "loss": 0.3122, "step": 1339 }, { "epoch": 0.2800417972831766, "grad_norm": 1.597870380287906, "learning_rate": 1.9985525162826653e-05, "loss": 0.3003, "step": 1340 }, { "epoch": 0.28025078369905954, "grad_norm": 1.3010834502820074, "learning_rate": 1.9985464422790612e-05, "loss": 0.3144, "step": 1341 }, { "epoch": 0.28045977011494255, "grad_norm": 1.7693348259043216, "learning_rate": 1.9985403555673758e-05, "loss": 0.3128, "step": 1342 }, { "epoch": 0.2806687565308255, "grad_norm": 1.370381689683052, "learning_rate": 1.998534256147686e-05, "loss": 0.31, "step": 1343 }, { "epoch": 0.2808777429467085, "grad_norm": 1.3626229994622963, "learning_rate": 1.99852814402007e-05, "loss": 0.2891, "step": 1344 }, { "epoch": 0.28108672936259144, "grad_norm": 1.3954100732552468, "learning_rate": 1.9985220191846052e-05, "loss": 0.3227, "step": 1345 }, { "epoch": 0.2812957157784744, "grad_norm": 1.4505656861765608, "learning_rate": 1.9985158816413697e-05, "loss": 0.3042, "step": 1346 }, { "epoch": 0.28150470219435736, "grad_norm": 1.8844281260968667, "learning_rate": 1.9985097313904415e-05, "loss": 0.3067, "step": 1347 }, { "epoch": 0.2817136886102403, "grad_norm": 1.4128161973644742, "learning_rate": 1.998503568431899e-05, "loss": 0.2923, "step": 1348 }, { "epoch": 0.2819226750261233, "grad_norm": 1.4551988550017647, "learning_rate": 1.9984973927658207e-05, "loss": 0.3266, "step": 1349 }, { "epoch": 0.28213166144200624, "grad_norm": 1.8982268794662063, "learning_rate": 1.998491204392285e-05, "loss": 0.2751, "step": 1350 }, { "epoch": 0.28234064785788926, "grad_norm": 1.672203350406636, "learning_rate": 1.998485003311371e-05, "loss": 0.3199, "step": 1351 }, { "epoch": 0.2825496342737722, "grad_norm": 1.3430367002474959, "learning_rate": 1.9984787895231568e-05, "loss": 0.3111, "step": 1352 }, { "epoch": 0.2827586206896552, "grad_norm": 1.8014184927347687, "learning_rate": 1.9984725630277223e-05, "loss": 0.2948, "step": 1353 }, { "epoch": 0.28296760710553814, "grad_norm": 1.5631615046535383, "learning_rate": 1.9984663238251466e-05, "loss": 0.2834, "step": 1354 }, { "epoch": 0.2831765935214211, "grad_norm": 1.5339262464743384, "learning_rate": 1.998460071915509e-05, "loss": 0.2901, "step": 1355 }, { "epoch": 0.28338557993730407, "grad_norm": 1.5472560968193911, "learning_rate": 1.998453807298889e-05, "loss": 0.2886, "step": 1356 }, { "epoch": 0.283594566353187, "grad_norm": 1.5006334052685792, "learning_rate": 1.9984475299753664e-05, "loss": 0.2644, "step": 1357 }, { "epoch": 0.28380355276907, "grad_norm": 1.6301921706895501, "learning_rate": 1.9984412399450212e-05, "loss": 0.2792, "step": 1358 }, { "epoch": 0.284012539184953, "grad_norm": 1.4320615308879834, "learning_rate": 1.998434937207933e-05, "loss": 0.2935, "step": 1359 }, { "epoch": 0.28422152560083597, "grad_norm": 1.5753277995621762, "learning_rate": 1.9984286217641825e-05, "loss": 0.3297, "step": 1360 }, { "epoch": 0.28443051201671893, "grad_norm": 1.3553767094723468, "learning_rate": 1.9984222936138497e-05, "loss": 0.3464, "step": 1361 }, { "epoch": 0.2846394984326019, "grad_norm": 1.598429006263296, "learning_rate": 1.9984159527570157e-05, "loss": 0.3203, "step": 1362 }, { "epoch": 0.28484848484848485, "grad_norm": 1.3799197824282972, "learning_rate": 1.9984095991937608e-05, "loss": 0.2646, "step": 1363 }, { "epoch": 0.2850574712643678, "grad_norm": 2.4784590568369476, "learning_rate": 1.9984032329241657e-05, "loss": 0.3094, "step": 1364 }, { "epoch": 0.2852664576802508, "grad_norm": 1.6588324976229345, "learning_rate": 1.9983968539483114e-05, "loss": 0.2956, "step": 1365 }, { "epoch": 0.28547544409613373, "grad_norm": 1.5354019993573373, "learning_rate": 1.9983904622662798e-05, "loss": 0.2846, "step": 1366 }, { "epoch": 0.2856844305120167, "grad_norm": 1.8080343763568256, "learning_rate": 1.9983840578781515e-05, "loss": 0.2756, "step": 1367 }, { "epoch": 0.2858934169278997, "grad_norm": 1.520444106139386, "learning_rate": 1.998377640784008e-05, "loss": 0.2819, "step": 1368 }, { "epoch": 0.2861024033437827, "grad_norm": 1.44235701618669, "learning_rate": 1.9983712109839315e-05, "loss": 0.2814, "step": 1369 }, { "epoch": 0.28631138975966564, "grad_norm": 1.6769859362654675, "learning_rate": 1.9983647684780037e-05, "loss": 0.2766, "step": 1370 }, { "epoch": 0.2865203761755486, "grad_norm": 2.080093215058689, "learning_rate": 1.9983583132663062e-05, "loss": 0.2837, "step": 1371 }, { "epoch": 0.28672936259143156, "grad_norm": 1.3281974820292446, "learning_rate": 1.9983518453489214e-05, "loss": 0.2725, "step": 1372 }, { "epoch": 0.2869383490073145, "grad_norm": 1.6516338465332139, "learning_rate": 1.998345364725932e-05, "loss": 0.2966, "step": 1373 }, { "epoch": 0.2871473354231975, "grad_norm": 1.7494625920028573, "learning_rate": 1.9983388713974195e-05, "loss": 0.3137, "step": 1374 }, { "epoch": 0.28735632183908044, "grad_norm": 2.044178090234393, "learning_rate": 1.9983323653634673e-05, "loss": 0.3053, "step": 1375 }, { "epoch": 0.2875653082549634, "grad_norm": 1.5054798419146636, "learning_rate": 1.9983258466241582e-05, "loss": 0.3175, "step": 1376 }, { "epoch": 0.2877742946708464, "grad_norm": 1.8188746776095703, "learning_rate": 1.9983193151795747e-05, "loss": 0.2884, "step": 1377 }, { "epoch": 0.2879832810867294, "grad_norm": 1.4939532741165464, "learning_rate": 1.9983127710298003e-05, "loss": 0.2716, "step": 1378 }, { "epoch": 0.28819226750261234, "grad_norm": 2.412149287321241, "learning_rate": 1.9983062141749183e-05, "loss": 0.2943, "step": 1379 }, { "epoch": 0.2884012539184953, "grad_norm": 1.73332751141957, "learning_rate": 1.998299644615012e-05, "loss": 0.296, "step": 1380 }, { "epoch": 0.28861024033437827, "grad_norm": 1.3908331836387535, "learning_rate": 1.998293062350165e-05, "loss": 0.2929, "step": 1381 }, { "epoch": 0.2888192267502612, "grad_norm": 1.456287955714929, "learning_rate": 1.9982864673804612e-05, "loss": 0.2811, "step": 1382 }, { "epoch": 0.2890282131661442, "grad_norm": 1.6237509292303949, "learning_rate": 1.9982798597059846e-05, "loss": 0.2986, "step": 1383 }, { "epoch": 0.28923719958202715, "grad_norm": 2.498852735031978, "learning_rate": 1.9982732393268187e-05, "loss": 0.2765, "step": 1384 }, { "epoch": 0.2894461859979101, "grad_norm": 1.8311402141315376, "learning_rate": 1.9982666062430483e-05, "loss": 0.3172, "step": 1385 }, { "epoch": 0.2896551724137931, "grad_norm": 1.4469697965017743, "learning_rate": 1.9982599604547582e-05, "loss": 0.3303, "step": 1386 }, { "epoch": 0.2898641588296761, "grad_norm": 1.5417244569170574, "learning_rate": 1.998253301962032e-05, "loss": 0.3227, "step": 1387 }, { "epoch": 0.29007314524555905, "grad_norm": 1.606814783632479, "learning_rate": 1.998246630764955e-05, "loss": 0.3004, "step": 1388 }, { "epoch": 0.290282131661442, "grad_norm": 1.9433358526182338, "learning_rate": 1.998239946863612e-05, "loss": 0.2823, "step": 1389 }, { "epoch": 0.290491118077325, "grad_norm": 1.5161483892167902, "learning_rate": 1.998233250258088e-05, "loss": 0.2794, "step": 1390 }, { "epoch": 0.29070010449320793, "grad_norm": 1.4147380941553573, "learning_rate": 1.9982265409484686e-05, "loss": 0.2934, "step": 1391 }, { "epoch": 0.2909090909090909, "grad_norm": 1.4312905321511367, "learning_rate": 1.998219818934839e-05, "loss": 0.2947, "step": 1392 }, { "epoch": 0.29111807732497386, "grad_norm": 1.4471551255454576, "learning_rate": 1.9982130842172843e-05, "loss": 0.3169, "step": 1393 }, { "epoch": 0.2913270637408568, "grad_norm": 1.4176941137281458, "learning_rate": 1.9982063367958907e-05, "loss": 0.3175, "step": 1394 }, { "epoch": 0.29153605015673983, "grad_norm": 1.4337063398693886, "learning_rate": 1.998199576670744e-05, "loss": 0.3025, "step": 1395 }, { "epoch": 0.2917450365726228, "grad_norm": 1.513114459389824, "learning_rate": 1.99819280384193e-05, "loss": 0.3009, "step": 1396 }, { "epoch": 0.29195402298850576, "grad_norm": 1.754645966211678, "learning_rate": 1.998186018309535e-05, "loss": 0.2799, "step": 1397 }, { "epoch": 0.2921630094043887, "grad_norm": 1.516246642936205, "learning_rate": 1.9981792200736455e-05, "loss": 0.3003, "step": 1398 }, { "epoch": 0.2923719958202717, "grad_norm": 1.3432674727542648, "learning_rate": 1.9981724091343482e-05, "loss": 0.2998, "step": 1399 }, { "epoch": 0.29258098223615464, "grad_norm": 1.4657409627799116, "learning_rate": 1.998165585491729e-05, "loss": 0.3012, "step": 1400 }, { "epoch": 0.2927899686520376, "grad_norm": 1.4874298741518313, "learning_rate": 1.9981587491458758e-05, "loss": 0.2924, "step": 1401 }, { "epoch": 0.29299895506792056, "grad_norm": 1.707473455378352, "learning_rate": 1.998151900096875e-05, "loss": 0.2942, "step": 1402 }, { "epoch": 0.2932079414838036, "grad_norm": 1.4941181637606835, "learning_rate": 1.9981450383448134e-05, "loss": 0.2964, "step": 1403 }, { "epoch": 0.29341692789968654, "grad_norm": 1.331352926700852, "learning_rate": 1.9981381638897792e-05, "loss": 0.2913, "step": 1404 }, { "epoch": 0.2936259143155695, "grad_norm": 1.2654074712945274, "learning_rate": 1.9981312767318593e-05, "loss": 0.2799, "step": 1405 }, { "epoch": 0.29383490073145246, "grad_norm": 1.3133503698062217, "learning_rate": 1.9981243768711414e-05, "loss": 0.2953, "step": 1406 }, { "epoch": 0.2940438871473354, "grad_norm": 1.4186409270310474, "learning_rate": 1.9981174643077134e-05, "loss": 0.2863, "step": 1407 }, { "epoch": 0.2942528735632184, "grad_norm": 1.4255387665453245, "learning_rate": 1.998110539041663e-05, "loss": 0.308, "step": 1408 }, { "epoch": 0.29446185997910135, "grad_norm": 1.2589158765061823, "learning_rate": 1.998103601073079e-05, "loss": 0.2996, "step": 1409 }, { "epoch": 0.2946708463949843, "grad_norm": 1.49551068341216, "learning_rate": 1.9980966504020495e-05, "loss": 0.2723, "step": 1410 }, { "epoch": 0.29487983281086727, "grad_norm": 1.3631943471065473, "learning_rate": 1.9980896870286624e-05, "loss": 0.3159, "step": 1411 }, { "epoch": 0.2950888192267503, "grad_norm": 1.215800762213404, "learning_rate": 1.9980827109530065e-05, "loss": 0.2777, "step": 1412 }, { "epoch": 0.29529780564263325, "grad_norm": 1.5456326214363008, "learning_rate": 1.9980757221751714e-05, "loss": 0.3177, "step": 1413 }, { "epoch": 0.2955067920585162, "grad_norm": 1.3711879037757684, "learning_rate": 1.998068720695245e-05, "loss": 0.3074, "step": 1414 }, { "epoch": 0.29571577847439917, "grad_norm": 1.3745583756537336, "learning_rate": 1.9980617065133167e-05, "loss": 0.2952, "step": 1415 }, { "epoch": 0.29592476489028213, "grad_norm": 2.0156242977708, "learning_rate": 1.998054679629476e-05, "loss": 0.3227, "step": 1416 }, { "epoch": 0.2961337513061651, "grad_norm": 1.4198163880252717, "learning_rate": 1.9980476400438123e-05, "loss": 0.2936, "step": 1417 }, { "epoch": 0.29634273772204806, "grad_norm": 1.8122718947393373, "learning_rate": 1.998040587756415e-05, "loss": 0.3012, "step": 1418 }, { "epoch": 0.296551724137931, "grad_norm": 1.5263956623103823, "learning_rate": 1.998033522767374e-05, "loss": 0.2791, "step": 1419 }, { "epoch": 0.296760710553814, "grad_norm": 1.4415153508503202, "learning_rate": 1.998026445076779e-05, "loss": 0.3005, "step": 1420 }, { "epoch": 0.296969696969697, "grad_norm": 1.5594905507509316, "learning_rate": 1.9980193546847205e-05, "loss": 0.2783, "step": 1421 }, { "epoch": 0.29717868338557996, "grad_norm": 5.121849633428019, "learning_rate": 1.9980122515912886e-05, "loss": 0.2708, "step": 1422 }, { "epoch": 0.2973876698014629, "grad_norm": 1.416777552184211, "learning_rate": 1.9980051357965734e-05, "loss": 0.2868, "step": 1423 }, { "epoch": 0.2975966562173459, "grad_norm": 1.4816599058390405, "learning_rate": 1.9979980073006654e-05, "loss": 0.2874, "step": 1424 }, { "epoch": 0.29780564263322884, "grad_norm": 1.322381113360689, "learning_rate": 1.9979908661036557e-05, "loss": 0.2928, "step": 1425 }, { "epoch": 0.2980146290491118, "grad_norm": 1.5308909954055545, "learning_rate": 1.9979837122056353e-05, "loss": 0.2941, "step": 1426 }, { "epoch": 0.29822361546499476, "grad_norm": 1.5882117535410631, "learning_rate": 1.9979765456066946e-05, "loss": 0.2989, "step": 1427 }, { "epoch": 0.2984326018808777, "grad_norm": 1.5147187952863395, "learning_rate": 1.997969366306925e-05, "loss": 0.2934, "step": 1428 }, { "epoch": 0.2986415882967607, "grad_norm": 1.72875908344386, "learning_rate": 1.9979621743064185e-05, "loss": 0.2658, "step": 1429 }, { "epoch": 0.2988505747126437, "grad_norm": 1.5001093802817884, "learning_rate": 1.9979549696052663e-05, "loss": 0.2937, "step": 1430 }, { "epoch": 0.29905956112852666, "grad_norm": 1.322072313079796, "learning_rate": 1.99794775220356e-05, "loss": 0.3185, "step": 1431 }, { "epoch": 0.2992685475444096, "grad_norm": 1.450623650840271, "learning_rate": 1.997940522101391e-05, "loss": 0.2748, "step": 1432 }, { "epoch": 0.2994775339602926, "grad_norm": 1.4753508180362442, "learning_rate": 1.997933279298852e-05, "loss": 0.2597, "step": 1433 }, { "epoch": 0.29968652037617555, "grad_norm": 1.375351952925366, "learning_rate": 1.9979260237960348e-05, "loss": 0.3407, "step": 1434 }, { "epoch": 0.2998955067920585, "grad_norm": 1.4123620199508498, "learning_rate": 1.9979187555930317e-05, "loss": 0.2817, "step": 1435 }, { "epoch": 0.30010449320794147, "grad_norm": 1.4188118604648026, "learning_rate": 1.9979114746899356e-05, "loss": 0.2818, "step": 1436 }, { "epoch": 0.30031347962382443, "grad_norm": 1.5256269577239328, "learning_rate": 1.9979041810868393e-05, "loss": 0.2759, "step": 1437 }, { "epoch": 0.3005224660397074, "grad_norm": 1.723649092929879, "learning_rate": 1.9978968747838344e-05, "loss": 0.2921, "step": 1438 }, { "epoch": 0.3007314524555904, "grad_norm": 1.6077638513634247, "learning_rate": 1.9978895557810155e-05, "loss": 0.2715, "step": 1439 }, { "epoch": 0.30094043887147337, "grad_norm": 1.708910663360611, "learning_rate": 1.9978822240784744e-05, "loss": 0.2715, "step": 1440 }, { "epoch": 0.30114942528735633, "grad_norm": 1.2558786754949745, "learning_rate": 1.9978748796763053e-05, "loss": 0.3151, "step": 1441 }, { "epoch": 0.3013584117032393, "grad_norm": 1.4983541422296742, "learning_rate": 1.997867522574601e-05, "loss": 0.2859, "step": 1442 }, { "epoch": 0.30156739811912225, "grad_norm": 1.4463305744369472, "learning_rate": 1.997860152773456e-05, "loss": 0.3187, "step": 1443 }, { "epoch": 0.3017763845350052, "grad_norm": 1.3274587285208481, "learning_rate": 1.9978527702729632e-05, "loss": 0.2623, "step": 1444 }, { "epoch": 0.3019853709508882, "grad_norm": 1.392701759645989, "learning_rate": 1.997845375073217e-05, "loss": 0.2765, "step": 1445 }, { "epoch": 0.30219435736677114, "grad_norm": 1.3002405665051355, "learning_rate": 1.9978379671743115e-05, "loss": 0.2973, "step": 1446 }, { "epoch": 0.30240334378265415, "grad_norm": 1.4334888623147408, "learning_rate": 1.997830546576341e-05, "loss": 0.2765, "step": 1447 }, { "epoch": 0.3026123301985371, "grad_norm": 1.5430338862304174, "learning_rate": 1.9978231132793996e-05, "loss": 0.2751, "step": 1448 }, { "epoch": 0.3028213166144201, "grad_norm": 1.4509569257258375, "learning_rate": 1.997815667283582e-05, "loss": 0.2807, "step": 1449 }, { "epoch": 0.30303030303030304, "grad_norm": 1.4947423317842998, "learning_rate": 1.9978082085889837e-05, "loss": 0.2955, "step": 1450 }, { "epoch": 0.303239289446186, "grad_norm": 1.7523350674230722, "learning_rate": 1.9978007371956985e-05, "loss": 0.3096, "step": 1451 }, { "epoch": 0.30344827586206896, "grad_norm": 1.2541195527217068, "learning_rate": 1.9977932531038224e-05, "loss": 0.269, "step": 1452 }, { "epoch": 0.3036572622779519, "grad_norm": 1.442365802595424, "learning_rate": 1.9977857563134503e-05, "loss": 0.2655, "step": 1453 }, { "epoch": 0.3038662486938349, "grad_norm": 1.2996050296457227, "learning_rate": 1.9977782468246774e-05, "loss": 0.3159, "step": 1454 }, { "epoch": 0.30407523510971785, "grad_norm": 1.4606687302041892, "learning_rate": 1.9977707246375995e-05, "loss": 0.2945, "step": 1455 }, { "epoch": 0.30428422152560086, "grad_norm": 1.4791609418639742, "learning_rate": 1.9977631897523122e-05, "loss": 0.2843, "step": 1456 }, { "epoch": 0.3044932079414838, "grad_norm": 1.501160012654288, "learning_rate": 1.9977556421689118e-05, "loss": 0.327, "step": 1457 }, { "epoch": 0.3047021943573668, "grad_norm": 1.6539671635980504, "learning_rate": 1.9977480818874934e-05, "loss": 0.3165, "step": 1458 }, { "epoch": 0.30491118077324975, "grad_norm": 1.6991622511355402, "learning_rate": 1.9977405089081544e-05, "loss": 0.2972, "step": 1459 }, { "epoch": 0.3051201671891327, "grad_norm": 1.5583977695825475, "learning_rate": 1.9977329232309904e-05, "loss": 0.2718, "step": 1460 }, { "epoch": 0.30532915360501567, "grad_norm": 2.3677520641890264, "learning_rate": 1.9977253248560982e-05, "loss": 0.2647, "step": 1461 }, { "epoch": 0.30553814002089863, "grad_norm": 1.4184686456396927, "learning_rate": 1.9977177137835747e-05, "loss": 0.3037, "step": 1462 }, { "epoch": 0.3057471264367816, "grad_norm": 1.401459752478283, "learning_rate": 1.9977100900135163e-05, "loss": 0.2815, "step": 1463 }, { "epoch": 0.30595611285266455, "grad_norm": 1.4382931815610163, "learning_rate": 1.9977024535460203e-05, "loss": 0.3111, "step": 1464 }, { "epoch": 0.30616509926854757, "grad_norm": 1.7134179427896024, "learning_rate": 1.9976948043811837e-05, "loss": 0.2793, "step": 1465 }, { "epoch": 0.30637408568443053, "grad_norm": 1.2953192435977539, "learning_rate": 1.9976871425191043e-05, "loss": 0.2532, "step": 1466 }, { "epoch": 0.3065830721003135, "grad_norm": 1.5577823964059194, "learning_rate": 1.997679467959879e-05, "loss": 0.2684, "step": 1467 }, { "epoch": 0.30679205851619645, "grad_norm": 1.3116960149185308, "learning_rate": 1.997671780703606e-05, "loss": 0.3111, "step": 1468 }, { "epoch": 0.3070010449320794, "grad_norm": 1.7398371232339331, "learning_rate": 1.9976640807503828e-05, "loss": 0.2743, "step": 1469 }, { "epoch": 0.3072100313479624, "grad_norm": 1.5963411132666225, "learning_rate": 1.9976563681003075e-05, "loss": 0.3013, "step": 1470 }, { "epoch": 0.30741901776384534, "grad_norm": 1.5542859859887914, "learning_rate": 1.997648642753478e-05, "loss": 0.2998, "step": 1471 }, { "epoch": 0.3076280041797283, "grad_norm": 1.4494221328196326, "learning_rate": 1.9976409047099933e-05, "loss": 0.3001, "step": 1472 }, { "epoch": 0.30783699059561126, "grad_norm": 1.54002320111422, "learning_rate": 1.9976331539699513e-05, "loss": 0.2812, "step": 1473 }, { "epoch": 0.3080459770114943, "grad_norm": 1.3523724209327683, "learning_rate": 1.9976253905334508e-05, "loss": 0.2765, "step": 1474 }, { "epoch": 0.30825496342737724, "grad_norm": 1.4578989591444191, "learning_rate": 1.9976176144005906e-05, "loss": 0.273, "step": 1475 }, { "epoch": 0.3084639498432602, "grad_norm": 1.3693973196788667, "learning_rate": 1.9976098255714694e-05, "loss": 0.2395, "step": 1476 }, { "epoch": 0.30867293625914316, "grad_norm": 1.6162788716744023, "learning_rate": 1.9976020240461867e-05, "loss": 0.2789, "step": 1477 }, { "epoch": 0.3088819226750261, "grad_norm": 1.5177657613007922, "learning_rate": 1.997594209824842e-05, "loss": 0.299, "step": 1478 }, { "epoch": 0.3090909090909091, "grad_norm": 1.3285421547166643, "learning_rate": 1.997586382907534e-05, "loss": 0.2831, "step": 1479 }, { "epoch": 0.30929989550679204, "grad_norm": 1.4193174837247897, "learning_rate": 1.9975785432943627e-05, "loss": 0.2907, "step": 1480 }, { "epoch": 0.309508881922675, "grad_norm": 1.3934242892556628, "learning_rate": 1.9975706909854283e-05, "loss": 0.2932, "step": 1481 }, { "epoch": 0.30971786833855797, "grad_norm": 1.4108694567658318, "learning_rate": 1.99756282598083e-05, "loss": 0.2893, "step": 1482 }, { "epoch": 0.309926854754441, "grad_norm": 1.3238509544345087, "learning_rate": 1.9975549482806686e-05, "loss": 0.2701, "step": 1483 }, { "epoch": 0.31013584117032394, "grad_norm": 1.526064395405856, "learning_rate": 1.9975470578850434e-05, "loss": 0.2427, "step": 1484 }, { "epoch": 0.3103448275862069, "grad_norm": 1.4265845349211064, "learning_rate": 1.9975391547940556e-05, "loss": 0.2585, "step": 1485 }, { "epoch": 0.31055381400208987, "grad_norm": 1.2512773303015503, "learning_rate": 1.997531239007806e-05, "loss": 0.2612, "step": 1486 }, { "epoch": 0.31076280041797283, "grad_norm": 1.4449193473351019, "learning_rate": 1.9975233105263947e-05, "loss": 0.2939, "step": 1487 }, { "epoch": 0.3109717868338558, "grad_norm": 1.8172682432533558, "learning_rate": 1.9975153693499226e-05, "loss": 0.2722, "step": 1488 }, { "epoch": 0.31118077324973875, "grad_norm": 1.6920492738715915, "learning_rate": 1.997507415478491e-05, "loss": 0.2708, "step": 1489 }, { "epoch": 0.3113897596656217, "grad_norm": 1.3166865345055796, "learning_rate": 1.9974994489122013e-05, "loss": 0.2911, "step": 1490 }, { "epoch": 0.31159874608150473, "grad_norm": 1.4059584576708588, "learning_rate": 1.9974914696511546e-05, "loss": 0.2985, "step": 1491 }, { "epoch": 0.3118077324973877, "grad_norm": 1.397028130879437, "learning_rate": 1.9974834776954527e-05, "loss": 0.2889, "step": 1492 }, { "epoch": 0.31201671891327065, "grad_norm": 1.7817350776931338, "learning_rate": 1.9974754730451972e-05, "loss": 0.2807, "step": 1493 }, { "epoch": 0.3122257053291536, "grad_norm": 1.6038399860951644, "learning_rate": 1.9974674557004894e-05, "loss": 0.2983, "step": 1494 }, { "epoch": 0.3124346917450366, "grad_norm": 1.9702368192606476, "learning_rate": 1.997459425661432e-05, "loss": 0.2991, "step": 1495 }, { "epoch": 0.31264367816091954, "grad_norm": 1.4258172906218713, "learning_rate": 1.9974513829281273e-05, "loss": 0.2894, "step": 1496 }, { "epoch": 0.3128526645768025, "grad_norm": 1.5360775826083324, "learning_rate": 1.9974433275006777e-05, "loss": 0.297, "step": 1497 }, { "epoch": 0.31306165099268546, "grad_norm": 2.2570413658058968, "learning_rate": 1.997435259379185e-05, "loss": 0.2821, "step": 1498 }, { "epoch": 0.3132706374085684, "grad_norm": 1.6055875033824611, "learning_rate": 1.9974271785637526e-05, "loss": 0.3074, "step": 1499 }, { "epoch": 0.31347962382445144, "grad_norm": 2.0060862535965196, "learning_rate": 1.997419085054483e-05, "loss": 0.3287, "step": 1500 }, { "epoch": 0.3136886102403344, "grad_norm": 1.9219144139600053, "learning_rate": 1.997410978851479e-05, "loss": 0.302, "step": 1501 }, { "epoch": 0.31389759665621736, "grad_norm": 1.931787373635331, "learning_rate": 1.9974028599548442e-05, "loss": 0.3418, "step": 1502 }, { "epoch": 0.3141065830721003, "grad_norm": 1.2469686730878613, "learning_rate": 1.9973947283646815e-05, "loss": 0.2949, "step": 1503 }, { "epoch": 0.3143155694879833, "grad_norm": 1.615991288439424, "learning_rate": 1.9973865840810948e-05, "loss": 0.2744, "step": 1504 }, { "epoch": 0.31452455590386624, "grad_norm": 1.8985244025024692, "learning_rate": 1.9973784271041878e-05, "loss": 0.3116, "step": 1505 }, { "epoch": 0.3147335423197492, "grad_norm": 1.714825856079347, "learning_rate": 1.9973702574340638e-05, "loss": 0.2819, "step": 1506 }, { "epoch": 0.31494252873563217, "grad_norm": 1.5184718827111796, "learning_rate": 1.9973620750708272e-05, "loss": 0.2757, "step": 1507 }, { "epoch": 0.3151515151515151, "grad_norm": 1.6451352129392656, "learning_rate": 1.9973538800145816e-05, "loss": 0.3285, "step": 1508 }, { "epoch": 0.31536050156739814, "grad_norm": 1.6217081071785056, "learning_rate": 1.9973456722654322e-05, "loss": 0.2933, "step": 1509 }, { "epoch": 0.3155694879832811, "grad_norm": 1.5372587353407832, "learning_rate": 1.9973374518234828e-05, "loss": 0.268, "step": 1510 }, { "epoch": 0.31577847439916407, "grad_norm": 1.3433081608565745, "learning_rate": 1.9973292186888383e-05, "loss": 0.2935, "step": 1511 }, { "epoch": 0.315987460815047, "grad_norm": 1.5068272478715767, "learning_rate": 1.9973209728616032e-05, "loss": 0.2948, "step": 1512 }, { "epoch": 0.31619644723093, "grad_norm": 1.7416437452117157, "learning_rate": 1.9973127143418827e-05, "loss": 0.2852, "step": 1513 }, { "epoch": 0.31640543364681295, "grad_norm": 1.8082432421576295, "learning_rate": 1.9973044431297817e-05, "loss": 0.3069, "step": 1514 }, { "epoch": 0.3166144200626959, "grad_norm": 1.3756658420897763, "learning_rate": 1.9972961592254053e-05, "loss": 0.2665, "step": 1515 }, { "epoch": 0.3168234064785789, "grad_norm": 1.3221003812985102, "learning_rate": 1.9972878626288595e-05, "loss": 0.2886, "step": 1516 }, { "epoch": 0.31703239289446183, "grad_norm": 2.3862015627222664, "learning_rate": 1.9972795533402498e-05, "loss": 0.3068, "step": 1517 }, { "epoch": 0.31724137931034485, "grad_norm": 1.419914337707446, "learning_rate": 1.997271231359681e-05, "loss": 0.2931, "step": 1518 }, { "epoch": 0.3174503657262278, "grad_norm": 3.317098507617852, "learning_rate": 1.99726289668726e-05, "loss": 0.2863, "step": 1519 }, { "epoch": 0.3176593521421108, "grad_norm": 1.6686513347666962, "learning_rate": 1.997254549323093e-05, "loss": 0.2759, "step": 1520 }, { "epoch": 0.31786833855799373, "grad_norm": 1.4258938783245938, "learning_rate": 1.9972461892672857e-05, "loss": 0.2882, "step": 1521 }, { "epoch": 0.3180773249738767, "grad_norm": 1.5505434674628742, "learning_rate": 1.9972378165199443e-05, "loss": 0.2739, "step": 1522 }, { "epoch": 0.31828631138975966, "grad_norm": 1.5508744220810209, "learning_rate": 1.9972294310811756e-05, "loss": 0.2809, "step": 1523 }, { "epoch": 0.3184952978056426, "grad_norm": 1.4509009137621678, "learning_rate": 1.997221032951087e-05, "loss": 0.2752, "step": 1524 }, { "epoch": 0.3187042842215256, "grad_norm": 1.6496263057108602, "learning_rate": 1.9972126221297846e-05, "loss": 0.3065, "step": 1525 }, { "epoch": 0.31891327063740854, "grad_norm": 2.0474563793977154, "learning_rate": 1.9972041986173752e-05, "loss": 0.3201, "step": 1526 }, { "epoch": 0.31912225705329156, "grad_norm": 1.6572477382883426, "learning_rate": 1.997195762413967e-05, "loss": 0.2937, "step": 1527 }, { "epoch": 0.3193312434691745, "grad_norm": 1.7116636528727338, "learning_rate": 1.997187313519666e-05, "loss": 0.2535, "step": 1528 }, { "epoch": 0.3195402298850575, "grad_norm": 1.4136092755310714, "learning_rate": 1.997178851934581e-05, "loss": 0.2963, "step": 1529 }, { "epoch": 0.31974921630094044, "grad_norm": 1.4797801994159487, "learning_rate": 1.9971703776588194e-05, "loss": 0.2653, "step": 1530 }, { "epoch": 0.3199582027168234, "grad_norm": 1.7078938025588672, "learning_rate": 1.9971618906924885e-05, "loss": 0.293, "step": 1531 }, { "epoch": 0.32016718913270636, "grad_norm": 1.4610301386434932, "learning_rate": 1.9971533910356968e-05, "loss": 0.2778, "step": 1532 }, { "epoch": 0.3203761755485893, "grad_norm": 1.5640805987657094, "learning_rate": 1.9971448786885522e-05, "loss": 0.3092, "step": 1533 }, { "epoch": 0.3205851619644723, "grad_norm": 1.6433078168455284, "learning_rate": 1.9971363536511633e-05, "loss": 0.2889, "step": 1534 }, { "epoch": 0.3207941483803553, "grad_norm": 1.659923388576807, "learning_rate": 1.9971278159236386e-05, "loss": 0.2962, "step": 1535 }, { "epoch": 0.32100313479623827, "grad_norm": 1.37221771373148, "learning_rate": 1.9971192655060866e-05, "loss": 0.2726, "step": 1536 }, { "epoch": 0.3212121212121212, "grad_norm": 1.5880651872867158, "learning_rate": 1.997110702398616e-05, "loss": 0.3038, "step": 1537 }, { "epoch": 0.3214211076280042, "grad_norm": 1.3562338015232371, "learning_rate": 1.997102126601336e-05, "loss": 0.2706, "step": 1538 }, { "epoch": 0.32163009404388715, "grad_norm": 1.6396661991761659, "learning_rate": 1.9970935381143553e-05, "loss": 0.2932, "step": 1539 }, { "epoch": 0.3218390804597701, "grad_norm": 1.3814171624143212, "learning_rate": 1.997084936937784e-05, "loss": 0.2908, "step": 1540 }, { "epoch": 0.32204806687565307, "grad_norm": 1.6758483633948853, "learning_rate": 1.9970763230717308e-05, "loss": 0.2793, "step": 1541 }, { "epoch": 0.32225705329153603, "grad_norm": 1.4637413093803957, "learning_rate": 1.9970676965163062e-05, "loss": 0.2886, "step": 1542 }, { "epoch": 0.322466039707419, "grad_norm": 1.7506143160253538, "learning_rate": 1.997059057271619e-05, "loss": 0.2869, "step": 1543 }, { "epoch": 0.322675026123302, "grad_norm": 1.7959871947637265, "learning_rate": 1.9970504053377793e-05, "loss": 0.2747, "step": 1544 }, { "epoch": 0.322884012539185, "grad_norm": 1.7890062346005222, "learning_rate": 1.997041740714898e-05, "loss": 0.2989, "step": 1545 }, { "epoch": 0.32309299895506793, "grad_norm": 3.25006704341384, "learning_rate": 1.9970330634030842e-05, "loss": 0.2849, "step": 1546 }, { "epoch": 0.3233019853709509, "grad_norm": 1.7587244412193321, "learning_rate": 1.9970243734024496e-05, "loss": 0.2819, "step": 1547 }, { "epoch": 0.32351097178683386, "grad_norm": 1.4914417725246356, "learning_rate": 1.997015670713104e-05, "loss": 0.3173, "step": 1548 }, { "epoch": 0.3237199582027168, "grad_norm": 2.335782728634016, "learning_rate": 1.9970069553351583e-05, "loss": 0.251, "step": 1549 }, { "epoch": 0.3239289446185998, "grad_norm": 1.5880544230685703, "learning_rate": 1.9969982272687235e-05, "loss": 0.2928, "step": 1550 }, { "epoch": 0.32413793103448274, "grad_norm": 1.3184450636241087, "learning_rate": 1.9969894865139107e-05, "loss": 0.2592, "step": 1551 }, { "epoch": 0.3243469174503657, "grad_norm": 1.6626513949035475, "learning_rate": 1.996980733070831e-05, "loss": 0.2916, "step": 1552 }, { "epoch": 0.3245559038662487, "grad_norm": 1.4033589940014448, "learning_rate": 1.996971966939596e-05, "loss": 0.2701, "step": 1553 }, { "epoch": 0.3247648902821317, "grad_norm": 1.4564911645597638, "learning_rate": 1.996963188120317e-05, "loss": 0.271, "step": 1554 }, { "epoch": 0.32497387669801464, "grad_norm": 1.5219483171107318, "learning_rate": 1.996954396613106e-05, "loss": 0.2739, "step": 1555 }, { "epoch": 0.3251828631138976, "grad_norm": 1.9051239336194623, "learning_rate": 1.9969455924180745e-05, "loss": 0.2801, "step": 1556 }, { "epoch": 0.32539184952978056, "grad_norm": 1.639877097279699, "learning_rate": 1.996936775535335e-05, "loss": 0.2871, "step": 1557 }, { "epoch": 0.3256008359456635, "grad_norm": 1.3872514822132294, "learning_rate": 1.9969279459649992e-05, "loss": 0.2725, "step": 1558 }, { "epoch": 0.3258098223615465, "grad_norm": 1.7377363248134519, "learning_rate": 1.99691910370718e-05, "loss": 0.2902, "step": 1559 }, { "epoch": 0.32601880877742945, "grad_norm": 2.1008951636513027, "learning_rate": 1.9969102487619897e-05, "loss": 0.2842, "step": 1560 }, { "epoch": 0.3262277951933124, "grad_norm": 1.5626017105398755, "learning_rate": 1.996901381129541e-05, "loss": 0.2985, "step": 1561 }, { "epoch": 0.3264367816091954, "grad_norm": 1.350979881921948, "learning_rate": 1.9968925008099468e-05, "loss": 0.2788, "step": 1562 }, { "epoch": 0.3266457680250784, "grad_norm": 1.5597147643728984, "learning_rate": 1.99688360780332e-05, "loss": 0.2488, "step": 1563 }, { "epoch": 0.32685475444096135, "grad_norm": 2.880742224044078, "learning_rate": 1.996874702109774e-05, "loss": 0.2772, "step": 1564 }, { "epoch": 0.3270637408568443, "grad_norm": 1.6064827064860911, "learning_rate": 1.996865783729422e-05, "loss": 0.2943, "step": 1565 }, { "epoch": 0.32727272727272727, "grad_norm": 1.4244695469220718, "learning_rate": 1.996856852662377e-05, "loss": 0.3073, "step": 1566 }, { "epoch": 0.32748171368861023, "grad_norm": 1.3827961950498102, "learning_rate": 1.9968479089087537e-05, "loss": 0.2626, "step": 1567 }, { "epoch": 0.3276907001044932, "grad_norm": 1.3421709268228472, "learning_rate": 1.9968389524686655e-05, "loss": 0.2547, "step": 1568 }, { "epoch": 0.32789968652037615, "grad_norm": 1.3321474660153099, "learning_rate": 1.9968299833422257e-05, "loss": 0.2629, "step": 1569 }, { "epoch": 0.3281086729362591, "grad_norm": 1.2978061234327491, "learning_rate": 1.9968210015295494e-05, "loss": 0.2627, "step": 1570 }, { "epoch": 0.32831765935214213, "grad_norm": 1.5836931690804423, "learning_rate": 1.9968120070307503e-05, "loss": 0.3144, "step": 1571 }, { "epoch": 0.3285266457680251, "grad_norm": 1.4328714678808465, "learning_rate": 1.996802999845943e-05, "loss": 0.2664, "step": 1572 }, { "epoch": 0.32873563218390806, "grad_norm": 3.5654087643964494, "learning_rate": 1.9967939799752424e-05, "loss": 0.294, "step": 1573 }, { "epoch": 0.328944618599791, "grad_norm": 1.78135254307162, "learning_rate": 1.996784947418763e-05, "loss": 0.2907, "step": 1574 }, { "epoch": 0.329153605015674, "grad_norm": 1.4306638943441066, "learning_rate": 1.9967759021766202e-05, "loss": 0.2593, "step": 1575 }, { "epoch": 0.32936259143155694, "grad_norm": 1.332865216973724, "learning_rate": 1.9967668442489284e-05, "loss": 0.277, "step": 1576 }, { "epoch": 0.3295715778474399, "grad_norm": 1.4747616198240374, "learning_rate": 1.9967577736358032e-05, "loss": 0.3146, "step": 1577 }, { "epoch": 0.32978056426332286, "grad_norm": 1.6811106691835511, "learning_rate": 1.9967486903373604e-05, "loss": 0.2814, "step": 1578 }, { "epoch": 0.3299895506792059, "grad_norm": 1.7049232513471633, "learning_rate": 1.996739594353715e-05, "loss": 0.2851, "step": 1579 }, { "epoch": 0.33019853709508884, "grad_norm": 1.3004704224324675, "learning_rate": 1.9967304856849834e-05, "loss": 0.2651, "step": 1580 }, { "epoch": 0.3304075235109718, "grad_norm": 1.4532995478447093, "learning_rate": 1.996721364331281e-05, "loss": 0.2761, "step": 1581 }, { "epoch": 0.33061650992685476, "grad_norm": 1.499725617160314, "learning_rate": 1.9967122302927236e-05, "loss": 0.2887, "step": 1582 }, { "epoch": 0.3308254963427377, "grad_norm": 2.101820547305811, "learning_rate": 1.9967030835694286e-05, "loss": 0.2899, "step": 1583 }, { "epoch": 0.3310344827586207, "grad_norm": 1.706811544859347, "learning_rate": 1.996693924161511e-05, "loss": 0.312, "step": 1584 }, { "epoch": 0.33124346917450365, "grad_norm": 1.5508245248641224, "learning_rate": 1.9966847520690887e-05, "loss": 0.335, "step": 1585 }, { "epoch": 0.3314524555903866, "grad_norm": 2.0165381499144783, "learning_rate": 1.9966755672922774e-05, "loss": 0.2774, "step": 1586 }, { "epoch": 0.33166144200626957, "grad_norm": 1.2487603348888758, "learning_rate": 1.9966663698311948e-05, "loss": 0.3151, "step": 1587 }, { "epoch": 0.3318704284221526, "grad_norm": 1.5914890196555744, "learning_rate": 1.9966571596859574e-05, "loss": 0.2971, "step": 1588 }, { "epoch": 0.33207941483803555, "grad_norm": 1.7309852112207187, "learning_rate": 1.9966479368566826e-05, "loss": 0.3413, "step": 1589 }, { "epoch": 0.3322884012539185, "grad_norm": 1.4356188618194015, "learning_rate": 1.9966387013434875e-05, "loss": 0.301, "step": 1590 }, { "epoch": 0.33249738766980147, "grad_norm": 1.4060359939482046, "learning_rate": 1.99662945314649e-05, "loss": 0.2584, "step": 1591 }, { "epoch": 0.33270637408568443, "grad_norm": 1.5328538978829485, "learning_rate": 1.9966201922658076e-05, "loss": 0.2622, "step": 1592 }, { "epoch": 0.3329153605015674, "grad_norm": 1.6926144699706223, "learning_rate": 1.996610918701558e-05, "loss": 0.2928, "step": 1593 }, { "epoch": 0.33312434691745035, "grad_norm": 1.5685481249761608, "learning_rate": 1.99660163245386e-05, "loss": 0.2918, "step": 1594 }, { "epoch": 0.3333333333333333, "grad_norm": 1.3656132741919829, "learning_rate": 1.996592333522831e-05, "loss": 0.2789, "step": 1595 }, { "epoch": 0.3335423197492163, "grad_norm": 1.3823159489196049, "learning_rate": 1.996583021908589e-05, "loss": 0.2739, "step": 1596 }, { "epoch": 0.3337513061650993, "grad_norm": 1.3470228191578533, "learning_rate": 1.9965736976112538e-05, "loss": 0.2503, "step": 1597 }, { "epoch": 0.33396029258098225, "grad_norm": 1.868701927287031, "learning_rate": 1.996564360630943e-05, "loss": 0.2824, "step": 1598 }, { "epoch": 0.3341692789968652, "grad_norm": 2.5122608543589786, "learning_rate": 1.9965550109677764e-05, "loss": 0.2975, "step": 1599 }, { "epoch": 0.3343782654127482, "grad_norm": 1.5037226752545672, "learning_rate": 1.9965456486218718e-05, "loss": 0.2681, "step": 1600 }, { "epoch": 0.33458725182863114, "grad_norm": 1.363478880565403, "learning_rate": 1.996536273593349e-05, "loss": 0.3062, "step": 1601 }, { "epoch": 0.3347962382445141, "grad_norm": 1.4292572996724222, "learning_rate": 1.996526885882327e-05, "loss": 0.2797, "step": 1602 }, { "epoch": 0.33500522466039706, "grad_norm": 1.67665989085026, "learning_rate": 1.996517485488926e-05, "loss": 0.2601, "step": 1603 }, { "epoch": 0.33521421107628, "grad_norm": 1.6032996518490055, "learning_rate": 1.996508072413265e-05, "loss": 0.2796, "step": 1604 }, { "epoch": 0.335423197492163, "grad_norm": 1.5838138512350761, "learning_rate": 1.9964986466554636e-05, "loss": 0.2766, "step": 1605 }, { "epoch": 0.335632183908046, "grad_norm": 1.5010747789856447, "learning_rate": 1.9964892082156423e-05, "loss": 0.2752, "step": 1606 }, { "epoch": 0.33584117032392896, "grad_norm": 1.5579423117254256, "learning_rate": 1.9964797570939208e-05, "loss": 0.3185, "step": 1607 }, { "epoch": 0.3360501567398119, "grad_norm": 1.5630656123769924, "learning_rate": 1.9964702932904196e-05, "loss": 0.2879, "step": 1608 }, { "epoch": 0.3362591431556949, "grad_norm": 1.632198175859279, "learning_rate": 1.9964608168052594e-05, "loss": 0.3023, "step": 1609 }, { "epoch": 0.33646812957157785, "grad_norm": 1.3181336638250563, "learning_rate": 1.9964513276385604e-05, "loss": 0.2923, "step": 1610 }, { "epoch": 0.3366771159874608, "grad_norm": 1.561843571564113, "learning_rate": 1.9964418257904437e-05, "loss": 0.261, "step": 1611 }, { "epoch": 0.33688610240334377, "grad_norm": 1.5029056776212077, "learning_rate": 1.9964323112610297e-05, "loss": 0.2822, "step": 1612 }, { "epoch": 0.33709508881922673, "grad_norm": 1.4968019126986738, "learning_rate": 1.99642278405044e-05, "loss": 0.2834, "step": 1613 }, { "epoch": 0.3373040752351097, "grad_norm": 1.4137157573614048, "learning_rate": 1.9964132441587957e-05, "loss": 0.2818, "step": 1614 }, { "epoch": 0.3375130616509927, "grad_norm": 1.3656762043110333, "learning_rate": 1.9964036915862183e-05, "loss": 0.2546, "step": 1615 }, { "epoch": 0.33772204806687567, "grad_norm": 1.3567795413584824, "learning_rate": 1.996394126332829e-05, "loss": 0.305, "step": 1616 }, { "epoch": 0.33793103448275863, "grad_norm": 1.578765599099398, "learning_rate": 1.9963845483987497e-05, "loss": 0.2945, "step": 1617 }, { "epoch": 0.3381400208986416, "grad_norm": 1.4257247143410092, "learning_rate": 1.9963749577841022e-05, "loss": 0.2613, "step": 1618 }, { "epoch": 0.33834900731452455, "grad_norm": 1.3690450044057463, "learning_rate": 1.9963653544890093e-05, "loss": 0.2807, "step": 1619 }, { "epoch": 0.3385579937304075, "grad_norm": 1.4186585408470187, "learning_rate": 1.9963557385135922e-05, "loss": 0.2756, "step": 1620 }, { "epoch": 0.3387669801462905, "grad_norm": 1.4796408868622726, "learning_rate": 1.996346109857974e-05, "loss": 0.2744, "step": 1621 }, { "epoch": 0.33897596656217344, "grad_norm": 1.7763663743607403, "learning_rate": 1.9963364685222767e-05, "loss": 0.2789, "step": 1622 }, { "epoch": 0.33918495297805645, "grad_norm": 1.7923367357140176, "learning_rate": 1.9963268145066232e-05, "loss": 0.288, "step": 1623 }, { "epoch": 0.3393939393939394, "grad_norm": 1.3120886766693747, "learning_rate": 1.996317147811137e-05, "loss": 0.2655, "step": 1624 }, { "epoch": 0.3396029258098224, "grad_norm": 1.607251250615243, "learning_rate": 1.9963074684359396e-05, "loss": 0.2852, "step": 1625 }, { "epoch": 0.33981191222570534, "grad_norm": 1.5317683278849477, "learning_rate": 1.996297776381156e-05, "loss": 0.2817, "step": 1626 }, { "epoch": 0.3400208986415883, "grad_norm": 1.7973703324184482, "learning_rate": 1.9962880716469083e-05, "loss": 0.2925, "step": 1627 }, { "epoch": 0.34022988505747126, "grad_norm": 1.5218288498920962, "learning_rate": 1.9962783542333205e-05, "loss": 0.2835, "step": 1628 }, { "epoch": 0.3404388714733542, "grad_norm": 1.430016524823951, "learning_rate": 1.9962686241405162e-05, "loss": 0.2949, "step": 1629 }, { "epoch": 0.3406478578892372, "grad_norm": 1.6021183127477223, "learning_rate": 1.9962588813686187e-05, "loss": 0.2884, "step": 1630 }, { "epoch": 0.34085684430512014, "grad_norm": 1.7434877751733402, "learning_rate": 1.996249125917753e-05, "loss": 0.2671, "step": 1631 }, { "epoch": 0.34106583072100316, "grad_norm": 1.555842442070218, "learning_rate": 1.9962393577880427e-05, "loss": 0.2729, "step": 1632 }, { "epoch": 0.3412748171368861, "grad_norm": 1.6444062558121368, "learning_rate": 1.996229576979612e-05, "loss": 0.2955, "step": 1633 }, { "epoch": 0.3414838035527691, "grad_norm": 1.2853803452366008, "learning_rate": 1.9962197834925857e-05, "loss": 0.2909, "step": 1634 }, { "epoch": 0.34169278996865204, "grad_norm": 1.3949280009216019, "learning_rate": 1.996209977327088e-05, "loss": 0.2805, "step": 1635 }, { "epoch": 0.341901776384535, "grad_norm": 1.2775329619407998, "learning_rate": 1.996200158483244e-05, "loss": 0.2725, "step": 1636 }, { "epoch": 0.34211076280041797, "grad_norm": 1.5486220838805336, "learning_rate": 1.9961903269611787e-05, "loss": 0.3101, "step": 1637 }, { "epoch": 0.34231974921630093, "grad_norm": 1.9775337796464927, "learning_rate": 1.9961804827610174e-05, "loss": 0.2793, "step": 1638 }, { "epoch": 0.3425287356321839, "grad_norm": 1.937765142178924, "learning_rate": 1.996170625882885e-05, "loss": 0.2975, "step": 1639 }, { "epoch": 0.34273772204806685, "grad_norm": 1.6080165942786075, "learning_rate": 1.9961607563269068e-05, "loss": 0.2713, "step": 1640 }, { "epoch": 0.34294670846394987, "grad_norm": 1.2968722751458888, "learning_rate": 1.9961508740932086e-05, "loss": 0.2785, "step": 1641 }, { "epoch": 0.34315569487983283, "grad_norm": 1.5312047114116227, "learning_rate": 1.9961409791819165e-05, "loss": 0.2836, "step": 1642 }, { "epoch": 0.3433646812957158, "grad_norm": 1.3206970374693554, "learning_rate": 1.9961310715931565e-05, "loss": 0.2688, "step": 1643 }, { "epoch": 0.34357366771159875, "grad_norm": 1.3195013240891402, "learning_rate": 1.996121151327054e-05, "loss": 0.2932, "step": 1644 }, { "epoch": 0.3437826541274817, "grad_norm": 1.4388786198172714, "learning_rate": 1.9961112183837355e-05, "loss": 0.2737, "step": 1645 }, { "epoch": 0.3439916405433647, "grad_norm": 1.3049721329909287, "learning_rate": 1.9961012727633276e-05, "loss": 0.273, "step": 1646 }, { "epoch": 0.34420062695924764, "grad_norm": 1.8182931082100704, "learning_rate": 1.996091314465957e-05, "loss": 0.3102, "step": 1647 }, { "epoch": 0.3444096133751306, "grad_norm": 1.403164649439659, "learning_rate": 1.99608134349175e-05, "loss": 0.2717, "step": 1648 }, { "epoch": 0.34461859979101356, "grad_norm": 1.622354959685777, "learning_rate": 1.9960713598408336e-05, "loss": 0.2995, "step": 1649 }, { "epoch": 0.3448275862068966, "grad_norm": 1.5017958158112061, "learning_rate": 1.9960613635133353e-05, "loss": 0.3116, "step": 1650 }, { "epoch": 0.34503657262277954, "grad_norm": 1.9861002611374567, "learning_rate": 1.996051354509382e-05, "loss": 0.26, "step": 1651 }, { "epoch": 0.3452455590386625, "grad_norm": 2.2589677186223525, "learning_rate": 1.996041332829101e-05, "loss": 0.3336, "step": 1652 }, { "epoch": 0.34545454545454546, "grad_norm": 1.5831932246670153, "learning_rate": 1.99603129847262e-05, "loss": 0.3052, "step": 1653 }, { "epoch": 0.3456635318704284, "grad_norm": 1.3287064258915557, "learning_rate": 1.9960212514400665e-05, "loss": 0.2909, "step": 1654 }, { "epoch": 0.3458725182863114, "grad_norm": 1.9497381307188564, "learning_rate": 1.9960111917315685e-05, "loss": 0.2875, "step": 1655 }, { "epoch": 0.34608150470219434, "grad_norm": 1.8437338930128186, "learning_rate": 1.996001119347254e-05, "loss": 0.2693, "step": 1656 }, { "epoch": 0.3462904911180773, "grad_norm": 1.597879132446277, "learning_rate": 1.9959910342872512e-05, "loss": 0.2562, "step": 1657 }, { "epoch": 0.34649947753396027, "grad_norm": 1.4630791116378066, "learning_rate": 1.9959809365516888e-05, "loss": 0.2835, "step": 1658 }, { "epoch": 0.3467084639498433, "grad_norm": 1.8312222531345668, "learning_rate": 1.9959708261406944e-05, "loss": 0.2809, "step": 1659 }, { "epoch": 0.34691745036572624, "grad_norm": 1.6896853682073476, "learning_rate": 1.9959607030543978e-05, "loss": 0.2507, "step": 1660 }, { "epoch": 0.3471264367816092, "grad_norm": 1.6543410141443484, "learning_rate": 1.9959505672929267e-05, "loss": 0.2718, "step": 1661 }, { "epoch": 0.34733542319749217, "grad_norm": 1.4307092537396142, "learning_rate": 1.995940418856411e-05, "loss": 0.293, "step": 1662 }, { "epoch": 0.3475444096133751, "grad_norm": 1.430033479450934, "learning_rate": 1.9959302577449797e-05, "loss": 0.2672, "step": 1663 }, { "epoch": 0.3477533960292581, "grad_norm": 1.4733658512294583, "learning_rate": 1.995920083958762e-05, "loss": 0.2828, "step": 1664 }, { "epoch": 0.34796238244514105, "grad_norm": 1.7265137770474495, "learning_rate": 1.9959098974978864e-05, "loss": 0.3036, "step": 1665 }, { "epoch": 0.348171368861024, "grad_norm": 1.971844028061146, "learning_rate": 1.9958996983624842e-05, "loss": 0.2606, "step": 1666 }, { "epoch": 0.348380355276907, "grad_norm": 1.3396789092427561, "learning_rate": 1.9958894865526844e-05, "loss": 0.2931, "step": 1667 }, { "epoch": 0.34858934169279, "grad_norm": 1.6049703295330513, "learning_rate": 1.995879262068617e-05, "loss": 0.28, "step": 1668 }, { "epoch": 0.34879832810867295, "grad_norm": 1.3189511124325253, "learning_rate": 1.995869024910412e-05, "loss": 0.2484, "step": 1669 }, { "epoch": 0.3490073145245559, "grad_norm": 1.9458744955420133, "learning_rate": 1.9958587750782e-05, "loss": 0.2552, "step": 1670 }, { "epoch": 0.3492163009404389, "grad_norm": 1.551051293218955, "learning_rate": 1.9958485125721114e-05, "loss": 0.3104, "step": 1671 }, { "epoch": 0.34942528735632183, "grad_norm": 1.292472706449397, "learning_rate": 1.995838237392276e-05, "loss": 0.2778, "step": 1672 }, { "epoch": 0.3496342737722048, "grad_norm": 1.261720679020123, "learning_rate": 1.995827949538826e-05, "loss": 0.2819, "step": 1673 }, { "epoch": 0.34984326018808776, "grad_norm": 1.3847347332817168, "learning_rate": 1.9958176490118914e-05, "loss": 0.2789, "step": 1674 }, { "epoch": 0.3500522466039707, "grad_norm": 1.6406870711945238, "learning_rate": 1.9958073358116034e-05, "loss": 0.2525, "step": 1675 }, { "epoch": 0.35026123301985373, "grad_norm": 1.6051560121806787, "learning_rate": 1.9957970099380928e-05, "loss": 0.2856, "step": 1676 }, { "epoch": 0.3504702194357367, "grad_norm": 1.4716177173091365, "learning_rate": 1.995786671391492e-05, "loss": 0.2685, "step": 1677 }, { "epoch": 0.35067920585161966, "grad_norm": 1.3129999805952164, "learning_rate": 1.995776320171932e-05, "loss": 0.2902, "step": 1678 }, { "epoch": 0.3508881922675026, "grad_norm": 1.5773244360680678, "learning_rate": 1.9957659562795448e-05, "loss": 0.3079, "step": 1679 }, { "epoch": 0.3510971786833856, "grad_norm": 1.5451099380129685, "learning_rate": 1.9957555797144618e-05, "loss": 0.2774, "step": 1680 }, { "epoch": 0.35130616509926854, "grad_norm": 1.3818756662534644, "learning_rate": 1.9957451904768155e-05, "loss": 0.2785, "step": 1681 }, { "epoch": 0.3515151515151515, "grad_norm": 1.5772627833925823, "learning_rate": 1.9957347885667385e-05, "loss": 0.2728, "step": 1682 }, { "epoch": 0.35172413793103446, "grad_norm": 1.625408167086707, "learning_rate": 1.9957243739843625e-05, "loss": 0.273, "step": 1683 }, { "epoch": 0.3519331243469174, "grad_norm": 1.427934068422648, "learning_rate": 1.99571394672982e-05, "loss": 0.2652, "step": 1684 }, { "epoch": 0.35214211076280044, "grad_norm": 1.401133713512093, "learning_rate": 1.9957035068032437e-05, "loss": 0.2819, "step": 1685 }, { "epoch": 0.3523510971786834, "grad_norm": 1.5747233284687239, "learning_rate": 1.995693054204767e-05, "loss": 0.2961, "step": 1686 }, { "epoch": 0.35256008359456636, "grad_norm": 1.6091440653567424, "learning_rate": 1.9956825889345225e-05, "loss": 0.2768, "step": 1687 }, { "epoch": 0.3527690700104493, "grad_norm": 1.7399804441522466, "learning_rate": 1.9956721109926437e-05, "loss": 0.2527, "step": 1688 }, { "epoch": 0.3529780564263323, "grad_norm": 1.4002745985338791, "learning_rate": 1.9956616203792636e-05, "loss": 0.2678, "step": 1689 }, { "epoch": 0.35318704284221525, "grad_norm": 1.4748767193164067, "learning_rate": 1.9956511170945158e-05, "loss": 0.2647, "step": 1690 }, { "epoch": 0.3533960292580982, "grad_norm": 1.5380482376845255, "learning_rate": 1.9956406011385343e-05, "loss": 0.2733, "step": 1691 }, { "epoch": 0.35360501567398117, "grad_norm": 1.6824036463974104, "learning_rate": 1.9956300725114522e-05, "loss": 0.2665, "step": 1692 }, { "epoch": 0.35381400208986413, "grad_norm": 2.1446414846941813, "learning_rate": 1.9956195312134047e-05, "loss": 0.2872, "step": 1693 }, { "epoch": 0.35402298850574715, "grad_norm": 1.523964685867755, "learning_rate": 1.995608977244525e-05, "loss": 0.2664, "step": 1694 }, { "epoch": 0.3542319749216301, "grad_norm": 2.3822896973420487, "learning_rate": 1.9955984106049473e-05, "loss": 0.2591, "step": 1695 }, { "epoch": 0.35444096133751307, "grad_norm": 1.545342067564373, "learning_rate": 1.9955878312948065e-05, "loss": 0.2576, "step": 1696 }, { "epoch": 0.35464994775339603, "grad_norm": 1.8052673619170472, "learning_rate": 1.9955772393142373e-05, "loss": 0.2727, "step": 1697 }, { "epoch": 0.354858934169279, "grad_norm": 2.297247206287597, "learning_rate": 1.9955666346633742e-05, "loss": 0.2564, "step": 1698 }, { "epoch": 0.35506792058516196, "grad_norm": 1.2054491167162964, "learning_rate": 1.9955560173423524e-05, "loss": 0.266, "step": 1699 }, { "epoch": 0.3552769070010449, "grad_norm": 1.7043756427254393, "learning_rate": 1.9955453873513068e-05, "loss": 0.2743, "step": 1700 }, { "epoch": 0.3554858934169279, "grad_norm": 1.4655054123956002, "learning_rate": 1.995534744690373e-05, "loss": 0.2944, "step": 1701 }, { "epoch": 0.35569487983281084, "grad_norm": 1.5355935488146226, "learning_rate": 1.9955240893596863e-05, "loss": 0.2645, "step": 1702 }, { "epoch": 0.35590386624869386, "grad_norm": 1.9263891893460923, "learning_rate": 1.995513421359382e-05, "loss": 0.2548, "step": 1703 }, { "epoch": 0.3561128526645768, "grad_norm": 1.8698808192738723, "learning_rate": 1.9955027406895964e-05, "loss": 0.292, "step": 1704 }, { "epoch": 0.3563218390804598, "grad_norm": 1.597144954773998, "learning_rate": 1.995492047350465e-05, "loss": 0.273, "step": 1705 }, { "epoch": 0.35653082549634274, "grad_norm": 1.4703280358160455, "learning_rate": 1.9954813413421243e-05, "loss": 0.2692, "step": 1706 }, { "epoch": 0.3567398119122257, "grad_norm": 1.9373441948519639, "learning_rate": 1.99547062266471e-05, "loss": 0.3369, "step": 1707 }, { "epoch": 0.35694879832810866, "grad_norm": 1.6254706014308924, "learning_rate": 1.9954598913183593e-05, "loss": 0.2751, "step": 1708 }, { "epoch": 0.3571577847439916, "grad_norm": 1.1928378202919585, "learning_rate": 1.9954491473032077e-05, "loss": 0.2703, "step": 1709 }, { "epoch": 0.3573667711598746, "grad_norm": 1.8457648835902487, "learning_rate": 1.995438390619393e-05, "loss": 0.2859, "step": 1710 }, { "epoch": 0.3575757575757576, "grad_norm": 1.3755927519088682, "learning_rate": 1.9954276212670512e-05, "loss": 0.2874, "step": 1711 }, { "epoch": 0.35778474399164056, "grad_norm": 1.5167673002923205, "learning_rate": 1.99541683924632e-05, "loss": 0.2645, "step": 1712 }, { "epoch": 0.3579937304075235, "grad_norm": 1.4312537991233036, "learning_rate": 1.9954060445573364e-05, "loss": 0.2895, "step": 1713 }, { "epoch": 0.3582027168234065, "grad_norm": 1.384900236211294, "learning_rate": 1.9953952372002377e-05, "loss": 0.2885, "step": 1714 }, { "epoch": 0.35841170323928945, "grad_norm": 1.6408459517838494, "learning_rate": 1.9953844171751616e-05, "loss": 0.2802, "step": 1715 }, { "epoch": 0.3586206896551724, "grad_norm": 1.350804687244716, "learning_rate": 1.9953735844822456e-05, "loss": 0.2714, "step": 1716 }, { "epoch": 0.35882967607105537, "grad_norm": 1.4664132060591106, "learning_rate": 1.9953627391216277e-05, "loss": 0.291, "step": 1717 }, { "epoch": 0.35903866248693833, "grad_norm": 1.6831682214064818, "learning_rate": 1.9953518810934464e-05, "loss": 0.2548, "step": 1718 }, { "epoch": 0.3592476489028213, "grad_norm": 1.3562113826433828, "learning_rate": 1.9953410103978385e-05, "loss": 0.277, "step": 1719 }, { "epoch": 0.3594566353187043, "grad_norm": 1.476522289777118, "learning_rate": 1.9953301270349437e-05, "loss": 0.2711, "step": 1720 }, { "epoch": 0.35966562173458727, "grad_norm": 1.5091178175108257, "learning_rate": 1.9953192310049003e-05, "loss": 0.2849, "step": 1721 }, { "epoch": 0.35987460815047023, "grad_norm": 1.4668512986854922, "learning_rate": 1.9953083223078463e-05, "loss": 0.2525, "step": 1722 }, { "epoch": 0.3600835945663532, "grad_norm": 1.4886924871498908, "learning_rate": 1.9952974009439212e-05, "loss": 0.2725, "step": 1723 }, { "epoch": 0.36029258098223615, "grad_norm": 1.4364885572509203, "learning_rate": 1.9952864669132638e-05, "loss": 0.2664, "step": 1724 }, { "epoch": 0.3605015673981191, "grad_norm": 1.4594020390827944, "learning_rate": 1.995275520216013e-05, "loss": 0.3106, "step": 1725 }, { "epoch": 0.3607105538140021, "grad_norm": 1.660186855412005, "learning_rate": 1.995264560852308e-05, "loss": 0.3293, "step": 1726 }, { "epoch": 0.36091954022988504, "grad_norm": 1.985054665958761, "learning_rate": 1.9952535888222894e-05, "loss": 0.2917, "step": 1727 }, { "epoch": 0.361128526645768, "grad_norm": 1.6125667552615082, "learning_rate": 1.9952426041260954e-05, "loss": 0.2745, "step": 1728 }, { "epoch": 0.361337513061651, "grad_norm": 1.9525885998306938, "learning_rate": 1.9952316067638663e-05, "loss": 0.2887, "step": 1729 }, { "epoch": 0.361546499477534, "grad_norm": 1.8650415573884682, "learning_rate": 1.9952205967357425e-05, "loss": 0.2717, "step": 1730 }, { "epoch": 0.36175548589341694, "grad_norm": 1.5796216128656158, "learning_rate": 1.995209574041864e-05, "loss": 0.2995, "step": 1731 }, { "epoch": 0.3619644723092999, "grad_norm": 1.5174962104245897, "learning_rate": 1.9951985386823705e-05, "loss": 0.2553, "step": 1732 }, { "epoch": 0.36217345872518286, "grad_norm": 1.310711840082085, "learning_rate": 1.995187490657403e-05, "loss": 0.2743, "step": 1733 }, { "epoch": 0.3623824451410658, "grad_norm": 1.254452066303107, "learning_rate": 1.9951764299671016e-05, "loss": 0.2841, "step": 1734 }, { "epoch": 0.3625914315569488, "grad_norm": 1.9269819803409909, "learning_rate": 1.9951653566116075e-05, "loss": 0.2507, "step": 1735 }, { "epoch": 0.36280041797283175, "grad_norm": 1.7408069371440567, "learning_rate": 1.995154270591062e-05, "loss": 0.3104, "step": 1736 }, { "epoch": 0.3630094043887147, "grad_norm": 1.2947860254096548, "learning_rate": 1.9951431719056052e-05, "loss": 0.3146, "step": 1737 }, { "epoch": 0.3632183908045977, "grad_norm": 1.996164125400004, "learning_rate": 1.995132060555379e-05, "loss": 0.2805, "step": 1738 }, { "epoch": 0.3634273772204807, "grad_norm": 1.7422705253278632, "learning_rate": 1.9951209365405244e-05, "loss": 0.2567, "step": 1739 }, { "epoch": 0.36363636363636365, "grad_norm": 1.2688260623681369, "learning_rate": 1.9951097998611836e-05, "loss": 0.2805, "step": 1740 }, { "epoch": 0.3638453500522466, "grad_norm": 1.375299847999677, "learning_rate": 1.9950986505174977e-05, "loss": 0.2736, "step": 1741 }, { "epoch": 0.36405433646812957, "grad_norm": 1.6159462721949402, "learning_rate": 1.995087488509609e-05, "loss": 0.272, "step": 1742 }, { "epoch": 0.36426332288401253, "grad_norm": 1.5997873548461141, "learning_rate": 1.9950763138376595e-05, "loss": 0.2805, "step": 1743 }, { "epoch": 0.3644723092998955, "grad_norm": 1.4367780185934185, "learning_rate": 1.995065126501791e-05, "loss": 0.2803, "step": 1744 }, { "epoch": 0.36468129571577845, "grad_norm": 1.5318371216486022, "learning_rate": 1.9950539265021466e-05, "loss": 0.2559, "step": 1745 }, { "epoch": 0.36489028213166147, "grad_norm": 1.3628889191091857, "learning_rate": 1.995042713838868e-05, "loss": 0.2981, "step": 1746 }, { "epoch": 0.36509926854754443, "grad_norm": 1.3441349393507331, "learning_rate": 1.9950314885120986e-05, "loss": 0.2635, "step": 1747 }, { "epoch": 0.3653082549634274, "grad_norm": 1.3695390319701228, "learning_rate": 1.9950202505219808e-05, "loss": 0.2489, "step": 1748 }, { "epoch": 0.36551724137931035, "grad_norm": 1.7472730002633885, "learning_rate": 1.995008999868658e-05, "loss": 0.2896, "step": 1749 }, { "epoch": 0.3657262277951933, "grad_norm": 1.3083282939460628, "learning_rate": 1.9949977365522734e-05, "loss": 0.2741, "step": 1750 }, { "epoch": 0.3659352142110763, "grad_norm": 1.4154231024675283, "learning_rate": 1.9949864605729695e-05, "loss": 0.2886, "step": 1751 }, { "epoch": 0.36614420062695924, "grad_norm": 1.5191804551936505, "learning_rate": 1.9949751719308913e-05, "loss": 0.3016, "step": 1752 }, { "epoch": 0.3663531870428422, "grad_norm": 1.3708092581500615, "learning_rate": 1.9949638706261808e-05, "loss": 0.272, "step": 1753 }, { "epoch": 0.36656217345872516, "grad_norm": 1.3692509203479295, "learning_rate": 1.994952556658983e-05, "loss": 0.2627, "step": 1754 }, { "epoch": 0.3667711598746082, "grad_norm": 1.3887661131158804, "learning_rate": 1.994941230029442e-05, "loss": 0.2717, "step": 1755 }, { "epoch": 0.36698014629049114, "grad_norm": 1.3693845251902732, "learning_rate": 1.9949298907377008e-05, "loss": 0.2642, "step": 1756 }, { "epoch": 0.3671891327063741, "grad_norm": 1.2916034847760622, "learning_rate": 1.9949185387839044e-05, "loss": 0.2531, "step": 1757 }, { "epoch": 0.36739811912225706, "grad_norm": 1.4412137920113175, "learning_rate": 1.9949071741681973e-05, "loss": 0.276, "step": 1758 }, { "epoch": 0.36760710553814, "grad_norm": 1.5784005660839664, "learning_rate": 1.9948957968907243e-05, "loss": 0.2836, "step": 1759 }, { "epoch": 0.367816091954023, "grad_norm": 1.6528381835501031, "learning_rate": 1.99488440695163e-05, "loss": 0.2695, "step": 1760 }, { "epoch": 0.36802507836990594, "grad_norm": 1.4559165068203084, "learning_rate": 1.9948730043510595e-05, "loss": 0.2798, "step": 1761 }, { "epoch": 0.3682340647857889, "grad_norm": 2.366379515134331, "learning_rate": 1.9948615890891573e-05, "loss": 0.2524, "step": 1762 }, { "epoch": 0.36844305120167187, "grad_norm": 1.467736308906806, "learning_rate": 1.9948501611660692e-05, "loss": 0.2759, "step": 1763 }, { "epoch": 0.3686520376175549, "grad_norm": 1.594216911769823, "learning_rate": 1.994838720581941e-05, "loss": 0.3054, "step": 1764 }, { "epoch": 0.36886102403343785, "grad_norm": 1.3138217957107965, "learning_rate": 1.9948272673369174e-05, "loss": 0.297, "step": 1765 }, { "epoch": 0.3690700104493208, "grad_norm": 1.8091011754797126, "learning_rate": 1.9948158014311447e-05, "loss": 0.2821, "step": 1766 }, { "epoch": 0.36927899686520377, "grad_norm": 1.3400436798778135, "learning_rate": 1.994804322864769e-05, "loss": 0.2694, "step": 1767 }, { "epoch": 0.36948798328108673, "grad_norm": 1.1209041194317602, "learning_rate": 1.9947928316379358e-05, "loss": 0.2973, "step": 1768 }, { "epoch": 0.3696969696969697, "grad_norm": 1.4144810933410608, "learning_rate": 1.994781327750792e-05, "loss": 0.283, "step": 1769 }, { "epoch": 0.36990595611285265, "grad_norm": 1.3580543901076298, "learning_rate": 1.9947698112034835e-05, "loss": 0.3157, "step": 1770 }, { "epoch": 0.3701149425287356, "grad_norm": 1.2549210146506768, "learning_rate": 1.9947582819961574e-05, "loss": 0.2622, "step": 1771 }, { "epoch": 0.3703239289446186, "grad_norm": 1.4464438630912777, "learning_rate": 1.99474674012896e-05, "loss": 0.2688, "step": 1772 }, { "epoch": 0.3705329153605016, "grad_norm": 1.4840883119434105, "learning_rate": 1.9947351856020378e-05, "loss": 0.3055, "step": 1773 }, { "epoch": 0.37074190177638455, "grad_norm": 1.6369020616239163, "learning_rate": 1.9947236184155384e-05, "loss": 0.2957, "step": 1774 }, { "epoch": 0.3709508881922675, "grad_norm": 1.3202324891332682, "learning_rate": 1.9947120385696092e-05, "loss": 0.2503, "step": 1775 }, { "epoch": 0.3711598746081505, "grad_norm": 1.381197757401377, "learning_rate": 1.994700446064397e-05, "loss": 0.2825, "step": 1776 }, { "epoch": 0.37136886102403344, "grad_norm": 1.4247692541652863, "learning_rate": 1.99468884090005e-05, "loss": 0.2849, "step": 1777 }, { "epoch": 0.3715778474399164, "grad_norm": 1.5142707439057708, "learning_rate": 1.9946772230767154e-05, "loss": 0.2763, "step": 1778 }, { "epoch": 0.37178683385579936, "grad_norm": 1.6685907016363184, "learning_rate": 1.994665592594541e-05, "loss": 0.293, "step": 1779 }, { "epoch": 0.3719958202716823, "grad_norm": 1.5209498631441225, "learning_rate": 1.994653949453675e-05, "loss": 0.2658, "step": 1780 }, { "epoch": 0.3722048066875653, "grad_norm": 1.5357361226969533, "learning_rate": 1.994642293654266e-05, "loss": 0.3168, "step": 1781 }, { "epoch": 0.3724137931034483, "grad_norm": 1.6965548198429519, "learning_rate": 1.9946306251964616e-05, "loss": 0.2929, "step": 1782 }, { "epoch": 0.37262277951933126, "grad_norm": 1.2708274678349485, "learning_rate": 1.9946189440804103e-05, "loss": 0.2633, "step": 1783 }, { "epoch": 0.3728317659352142, "grad_norm": 1.8014233514135156, "learning_rate": 1.9946072503062617e-05, "loss": 0.3474, "step": 1784 }, { "epoch": 0.3730407523510972, "grad_norm": 1.5643514177335203, "learning_rate": 1.9945955438741635e-05, "loss": 0.2848, "step": 1785 }, { "epoch": 0.37324973876698014, "grad_norm": 1.4523626085771837, "learning_rate": 1.9945838247842656e-05, "loss": 0.2887, "step": 1786 }, { "epoch": 0.3734587251828631, "grad_norm": 1.489616629933418, "learning_rate": 1.9945720930367165e-05, "loss": 0.2878, "step": 1787 }, { "epoch": 0.37366771159874607, "grad_norm": 1.7687371134708108, "learning_rate": 1.9945603486316658e-05, "loss": 0.2991, "step": 1788 }, { "epoch": 0.373876698014629, "grad_norm": 1.3722216008116879, "learning_rate": 1.9945485915692624e-05, "loss": 0.2657, "step": 1789 }, { "epoch": 0.37408568443051204, "grad_norm": 1.2944593363118007, "learning_rate": 1.994536821849657e-05, "loss": 0.2885, "step": 1790 }, { "epoch": 0.374294670846395, "grad_norm": 1.3503374284606204, "learning_rate": 1.9945250394729988e-05, "loss": 0.2615, "step": 1791 }, { "epoch": 0.37450365726227797, "grad_norm": 1.6237646877295022, "learning_rate": 1.9945132444394376e-05, "loss": 0.2679, "step": 1792 }, { "epoch": 0.37471264367816093, "grad_norm": 2.4027346104627325, "learning_rate": 1.994501436749124e-05, "loss": 0.2789, "step": 1793 }, { "epoch": 0.3749216300940439, "grad_norm": 1.300626110863693, "learning_rate": 1.9944896164022074e-05, "loss": 0.2703, "step": 1794 }, { "epoch": 0.37513061650992685, "grad_norm": 1.6099946803493819, "learning_rate": 1.994477783398839e-05, "loss": 0.2781, "step": 1795 }, { "epoch": 0.3753396029258098, "grad_norm": 1.5681942029832574, "learning_rate": 1.994465937739169e-05, "loss": 0.2612, "step": 1796 }, { "epoch": 0.3755485893416928, "grad_norm": 1.5019934013740415, "learning_rate": 1.9944540794233485e-05, "loss": 0.2891, "step": 1797 }, { "epoch": 0.37575757575757573, "grad_norm": 1.48407930821278, "learning_rate": 1.9944422084515286e-05, "loss": 0.262, "step": 1798 }, { "epoch": 0.37596656217345875, "grad_norm": 1.4410355630000418, "learning_rate": 1.9944303248238594e-05, "loss": 0.2711, "step": 1799 }, { "epoch": 0.3761755485893417, "grad_norm": 1.7201106686688523, "learning_rate": 1.994418428540493e-05, "loss": 0.3054, "step": 1800 }, { "epoch": 0.3763845350052247, "grad_norm": 1.5858137406002049, "learning_rate": 1.9944065196015806e-05, "loss": 0.2763, "step": 1801 }, { "epoch": 0.37659352142110764, "grad_norm": 1.5043289323491678, "learning_rate": 1.9943945980072733e-05, "loss": 0.2918, "step": 1802 }, { "epoch": 0.3768025078369906, "grad_norm": 1.2912241138781595, "learning_rate": 1.994382663757724e-05, "loss": 0.2462, "step": 1803 }, { "epoch": 0.37701149425287356, "grad_norm": 1.3477996243789654, "learning_rate": 1.994370716853083e-05, "loss": 0.2645, "step": 1804 }, { "epoch": 0.3772204806687565, "grad_norm": 1.4847328369158364, "learning_rate": 1.9943587572935038e-05, "loss": 0.2907, "step": 1805 }, { "epoch": 0.3774294670846395, "grad_norm": 1.4523968197948975, "learning_rate": 1.9943467850791378e-05, "loss": 0.2778, "step": 1806 }, { "epoch": 0.37763845350052244, "grad_norm": 1.3932249703057795, "learning_rate": 1.9943348002101374e-05, "loss": 0.2355, "step": 1807 }, { "epoch": 0.37784743991640546, "grad_norm": 1.3176852129921361, "learning_rate": 1.9943228026866553e-05, "loss": 0.2751, "step": 1808 }, { "epoch": 0.3780564263322884, "grad_norm": 1.4609308813911086, "learning_rate": 1.994310792508844e-05, "loss": 0.3087, "step": 1809 }, { "epoch": 0.3782654127481714, "grad_norm": 1.3774546980058169, "learning_rate": 1.9942987696768567e-05, "loss": 0.2844, "step": 1810 }, { "epoch": 0.37847439916405434, "grad_norm": 1.3358015077712966, "learning_rate": 1.994286734190846e-05, "loss": 0.2937, "step": 1811 }, { "epoch": 0.3786833855799373, "grad_norm": 1.6993757278657575, "learning_rate": 1.9942746860509655e-05, "loss": 0.2661, "step": 1812 }, { "epoch": 0.37889237199582027, "grad_norm": 1.2639163892664567, "learning_rate": 1.9942626252573684e-05, "loss": 0.2562, "step": 1813 }, { "epoch": 0.3791013584117032, "grad_norm": 1.4246367256510142, "learning_rate": 1.9942505518102076e-05, "loss": 0.2648, "step": 1814 }, { "epoch": 0.3793103448275862, "grad_norm": 1.277179738186248, "learning_rate": 1.9942384657096373e-05, "loss": 0.264, "step": 1815 }, { "epoch": 0.37951933124346915, "grad_norm": 1.442511956298814, "learning_rate": 1.9942263669558118e-05, "loss": 0.2799, "step": 1816 }, { "epoch": 0.37972831765935217, "grad_norm": 1.3937673790936007, "learning_rate": 1.994214255548884e-05, "loss": 0.2656, "step": 1817 }, { "epoch": 0.3799373040752351, "grad_norm": 1.8021775677512852, "learning_rate": 1.994202131489009e-05, "loss": 0.2856, "step": 1818 }, { "epoch": 0.3801462904911181, "grad_norm": 1.7916144439060875, "learning_rate": 1.9941899947763407e-05, "loss": 0.2845, "step": 1819 }, { "epoch": 0.38035527690700105, "grad_norm": 1.3533753191966615, "learning_rate": 1.994177845411033e-05, "loss": 0.2823, "step": 1820 }, { "epoch": 0.380564263322884, "grad_norm": 1.4383960701604226, "learning_rate": 1.9941656833932416e-05, "loss": 0.2702, "step": 1821 }, { "epoch": 0.380773249738767, "grad_norm": 1.6739418696204136, "learning_rate": 1.99415350872312e-05, "loss": 0.3201, "step": 1822 }, { "epoch": 0.38098223615464993, "grad_norm": 1.3793301040771992, "learning_rate": 1.994141321400825e-05, "loss": 0.2468, "step": 1823 }, { "epoch": 0.3811912225705329, "grad_norm": 1.542343167574477, "learning_rate": 1.9941291214265096e-05, "loss": 0.2822, "step": 1824 }, { "epoch": 0.38140020898641586, "grad_norm": 1.5734265340063103, "learning_rate": 1.9941169088003305e-05, "loss": 0.2594, "step": 1825 }, { "epoch": 0.3816091954022989, "grad_norm": 1.5594316495077254, "learning_rate": 1.9941046835224424e-05, "loss": 0.2672, "step": 1826 }, { "epoch": 0.38181818181818183, "grad_norm": 1.4336981363502617, "learning_rate": 1.9940924455930015e-05, "loss": 0.2806, "step": 1827 }, { "epoch": 0.3820271682340648, "grad_norm": 1.4999549000877948, "learning_rate": 1.9940801950121627e-05, "loss": 0.2528, "step": 1828 }, { "epoch": 0.38223615464994776, "grad_norm": 1.2828199469825314, "learning_rate": 1.994067931780083e-05, "loss": 0.2866, "step": 1829 }, { "epoch": 0.3824451410658307, "grad_norm": 1.5641851924102281, "learning_rate": 1.9940556558969175e-05, "loss": 0.2632, "step": 1830 }, { "epoch": 0.3826541274817137, "grad_norm": 1.3941590348879283, "learning_rate": 1.994043367362823e-05, "loss": 0.2518, "step": 1831 }, { "epoch": 0.38286311389759664, "grad_norm": 1.3337830320317299, "learning_rate": 1.9940310661779554e-05, "loss": 0.3064, "step": 1832 }, { "epoch": 0.3830721003134796, "grad_norm": 1.3700008231935585, "learning_rate": 1.9940187523424718e-05, "loss": 0.265, "step": 1833 }, { "epoch": 0.3832810867293626, "grad_norm": 1.3075271624466185, "learning_rate": 1.9940064258565285e-05, "loss": 0.2694, "step": 1834 }, { "epoch": 0.3834900731452456, "grad_norm": 1.7703691655709308, "learning_rate": 1.9939940867202826e-05, "loss": 0.2563, "step": 1835 }, { "epoch": 0.38369905956112854, "grad_norm": 1.8208471715579204, "learning_rate": 1.9939817349338913e-05, "loss": 0.2496, "step": 1836 }, { "epoch": 0.3839080459770115, "grad_norm": 1.4180364207222487, "learning_rate": 1.9939693704975113e-05, "loss": 0.2807, "step": 1837 }, { "epoch": 0.38411703239289446, "grad_norm": 1.539927047280936, "learning_rate": 1.9939569934113003e-05, "loss": 0.2816, "step": 1838 }, { "epoch": 0.3843260188087774, "grad_norm": 1.4345156898468205, "learning_rate": 1.993944603675416e-05, "loss": 0.2423, "step": 1839 }, { "epoch": 0.3845350052246604, "grad_norm": 2.0029437649668345, "learning_rate": 1.9939322012900153e-05, "loss": 0.2775, "step": 1840 }, { "epoch": 0.38474399164054335, "grad_norm": 1.514333827741792, "learning_rate": 1.993919786255257e-05, "loss": 0.2974, "step": 1841 }, { "epoch": 0.3849529780564263, "grad_norm": 1.337790967086217, "learning_rate": 1.9939073585712983e-05, "loss": 0.2571, "step": 1842 }, { "epoch": 0.3851619644723093, "grad_norm": 1.7737906525225497, "learning_rate": 1.9938949182382982e-05, "loss": 0.2971, "step": 1843 }, { "epoch": 0.3853709508881923, "grad_norm": 1.7121574678054676, "learning_rate": 1.993882465256414e-05, "loss": 0.2997, "step": 1844 }, { "epoch": 0.38557993730407525, "grad_norm": 1.8197634823928381, "learning_rate": 1.9938699996258052e-05, "loss": 0.2532, "step": 1845 }, { "epoch": 0.3857889237199582, "grad_norm": 1.365717665206505, "learning_rate": 1.9938575213466293e-05, "loss": 0.2552, "step": 1846 }, { "epoch": 0.38599791013584117, "grad_norm": 1.4010347314908864, "learning_rate": 1.9938450304190465e-05, "loss": 0.289, "step": 1847 }, { "epoch": 0.38620689655172413, "grad_norm": 1.618694461605581, "learning_rate": 1.9938325268432148e-05, "loss": 0.2813, "step": 1848 }, { "epoch": 0.3864158829676071, "grad_norm": 1.2239907831281034, "learning_rate": 1.9938200106192936e-05, "loss": 0.2556, "step": 1849 }, { "epoch": 0.38662486938349006, "grad_norm": 1.505477642351308, "learning_rate": 1.9938074817474418e-05, "loss": 0.2955, "step": 1850 }, { "epoch": 0.386833855799373, "grad_norm": 1.2817869199018423, "learning_rate": 1.9937949402278195e-05, "loss": 0.265, "step": 1851 }, { "epoch": 0.38704284221525603, "grad_norm": 1.3098050056828585, "learning_rate": 1.993782386060586e-05, "loss": 0.2727, "step": 1852 }, { "epoch": 0.387251828631139, "grad_norm": 1.1623130698631332, "learning_rate": 1.9937698192459013e-05, "loss": 0.2496, "step": 1853 }, { "epoch": 0.38746081504702196, "grad_norm": 1.5990368563652217, "learning_rate": 1.9937572397839253e-05, "loss": 0.2832, "step": 1854 }, { "epoch": 0.3876698014629049, "grad_norm": 1.2484757356558678, "learning_rate": 1.9937446476748174e-05, "loss": 0.2745, "step": 1855 }, { "epoch": 0.3878787878787879, "grad_norm": 2.119145423490716, "learning_rate": 1.9937320429187384e-05, "loss": 0.256, "step": 1856 }, { "epoch": 0.38808777429467084, "grad_norm": 1.3267800248192236, "learning_rate": 1.993719425515849e-05, "loss": 0.288, "step": 1857 }, { "epoch": 0.3882967607105538, "grad_norm": 1.415662542984659, "learning_rate": 1.9937067954663095e-05, "loss": 0.2613, "step": 1858 }, { "epoch": 0.38850574712643676, "grad_norm": 2.3543764603481088, "learning_rate": 1.9936941527702804e-05, "loss": 0.2472, "step": 1859 }, { "epoch": 0.3887147335423197, "grad_norm": 1.8478926351857758, "learning_rate": 1.993681497427923e-05, "loss": 0.2719, "step": 1860 }, { "epoch": 0.38892371995820274, "grad_norm": 1.374297738935344, "learning_rate": 1.993668829439398e-05, "loss": 0.2684, "step": 1861 }, { "epoch": 0.3891327063740857, "grad_norm": 1.8609922658139724, "learning_rate": 1.993656148804867e-05, "loss": 0.2905, "step": 1862 }, { "epoch": 0.38934169278996866, "grad_norm": 1.9985894422171202, "learning_rate": 1.9936434555244907e-05, "loss": 0.2748, "step": 1863 }, { "epoch": 0.3895506792058516, "grad_norm": 1.559101834712838, "learning_rate": 1.9936307495984316e-05, "loss": 0.2857, "step": 1864 }, { "epoch": 0.3897596656217346, "grad_norm": 1.7225396470002658, "learning_rate": 1.9936180310268503e-05, "loss": 0.253, "step": 1865 }, { "epoch": 0.38996865203761755, "grad_norm": 1.389121312050968, "learning_rate": 1.99360529980991e-05, "loss": 0.2775, "step": 1866 }, { "epoch": 0.3901776384535005, "grad_norm": 1.5035823813587545, "learning_rate": 1.9935925559477715e-05, "loss": 0.2868, "step": 1867 }, { "epoch": 0.39038662486938347, "grad_norm": 1.5661617391885367, "learning_rate": 1.9935797994405975e-05, "loss": 0.2591, "step": 1868 }, { "epoch": 0.39059561128526643, "grad_norm": 1.4214730106339784, "learning_rate": 1.9935670302885505e-05, "loss": 0.2588, "step": 1869 }, { "epoch": 0.39080459770114945, "grad_norm": 1.6363243566781909, "learning_rate": 1.993554248491793e-05, "loss": 0.328, "step": 1870 }, { "epoch": 0.3910135841170324, "grad_norm": 1.4226380880750609, "learning_rate": 1.9935414540504874e-05, "loss": 0.2612, "step": 1871 }, { "epoch": 0.39122257053291537, "grad_norm": 1.299456638746471, "learning_rate": 1.9935286469647965e-05, "loss": 0.2862, "step": 1872 }, { "epoch": 0.39143155694879833, "grad_norm": 1.5146022629916622, "learning_rate": 1.9935158272348835e-05, "loss": 0.2328, "step": 1873 }, { "epoch": 0.3916405433646813, "grad_norm": 1.4640233875944793, "learning_rate": 1.9935029948609115e-05, "loss": 0.2787, "step": 1874 }, { "epoch": 0.39184952978056425, "grad_norm": 1.4038623234980565, "learning_rate": 1.9934901498430436e-05, "loss": 0.2888, "step": 1875 }, { "epoch": 0.3920585161964472, "grad_norm": 1.1431805494905487, "learning_rate": 1.9934772921814435e-05, "loss": 0.2642, "step": 1876 }, { "epoch": 0.3922675026123302, "grad_norm": 1.3178733362295862, "learning_rate": 1.9934644218762753e-05, "loss": 0.2741, "step": 1877 }, { "epoch": 0.3924764890282132, "grad_norm": 1.390971016450613, "learning_rate": 1.9934515389277018e-05, "loss": 0.2742, "step": 1878 }, { "epoch": 0.39268547544409615, "grad_norm": 1.5881031175356009, "learning_rate": 1.9934386433358876e-05, "loss": 0.2602, "step": 1879 }, { "epoch": 0.3928944618599791, "grad_norm": 1.36365126900619, "learning_rate": 1.9934257351009964e-05, "loss": 0.3057, "step": 1880 }, { "epoch": 0.3931034482758621, "grad_norm": 1.4219434470087793, "learning_rate": 1.9934128142231932e-05, "loss": 0.2742, "step": 1881 }, { "epoch": 0.39331243469174504, "grad_norm": 1.8796751926889932, "learning_rate": 1.993399880702642e-05, "loss": 0.3273, "step": 1882 }, { "epoch": 0.393521421107628, "grad_norm": 1.4550672176365007, "learning_rate": 1.993386934539507e-05, "loss": 0.2978, "step": 1883 }, { "epoch": 0.39373040752351096, "grad_norm": 1.2088937808644007, "learning_rate": 1.9933739757339533e-05, "loss": 0.2641, "step": 1884 }, { "epoch": 0.3939393939393939, "grad_norm": 1.3083876171411373, "learning_rate": 1.993361004286146e-05, "loss": 0.2565, "step": 1885 }, { "epoch": 0.3941483803552769, "grad_norm": 1.6431368503380168, "learning_rate": 1.9933480201962504e-05, "loss": 0.2637, "step": 1886 }, { "epoch": 0.3943573667711599, "grad_norm": 1.2811824178003548, "learning_rate": 1.993335023464431e-05, "loss": 0.2446, "step": 1887 }, { "epoch": 0.39456635318704286, "grad_norm": 1.5529310214041743, "learning_rate": 1.9933220140908534e-05, "loss": 0.281, "step": 1888 }, { "epoch": 0.3947753396029258, "grad_norm": 1.4885915065920796, "learning_rate": 1.993308992075684e-05, "loss": 0.265, "step": 1889 }, { "epoch": 0.3949843260188088, "grad_norm": 1.4503989143009968, "learning_rate": 1.993295957419087e-05, "loss": 0.2785, "step": 1890 }, { "epoch": 0.39519331243469175, "grad_norm": 1.5808381054974106, "learning_rate": 1.9932829101212298e-05, "loss": 0.2462, "step": 1891 }, { "epoch": 0.3954022988505747, "grad_norm": 1.9382974450134407, "learning_rate": 1.9932698501822777e-05, "loss": 0.2751, "step": 1892 }, { "epoch": 0.39561128526645767, "grad_norm": 1.3697852954016285, "learning_rate": 1.993256777602397e-05, "loss": 0.2788, "step": 1893 }, { "epoch": 0.39582027168234063, "grad_norm": 1.5032653117010022, "learning_rate": 1.9932436923817545e-05, "loss": 0.2775, "step": 1894 }, { "epoch": 0.3960292580982236, "grad_norm": 1.2525570369514496, "learning_rate": 1.9932305945205156e-05, "loss": 0.2801, "step": 1895 }, { "epoch": 0.3962382445141066, "grad_norm": 1.9236494381311862, "learning_rate": 1.993217484018848e-05, "loss": 0.2487, "step": 1896 }, { "epoch": 0.39644723092998957, "grad_norm": 1.6977915940893415, "learning_rate": 1.9932043608769185e-05, "loss": 0.2678, "step": 1897 }, { "epoch": 0.39665621734587253, "grad_norm": 1.5078893927999228, "learning_rate": 1.9931912250948938e-05, "loss": 0.2715, "step": 1898 }, { "epoch": 0.3968652037617555, "grad_norm": 1.4537077877033924, "learning_rate": 1.993178076672941e-05, "loss": 0.2703, "step": 1899 }, { "epoch": 0.39707419017763845, "grad_norm": 1.6340429572186388, "learning_rate": 1.9931649156112277e-05, "loss": 0.236, "step": 1900 }, { "epoch": 0.3972831765935214, "grad_norm": 2.0338146057331836, "learning_rate": 1.993151741909921e-05, "loss": 0.2998, "step": 1901 }, { "epoch": 0.3974921630094044, "grad_norm": 1.324816312026304, "learning_rate": 1.9931385555691892e-05, "loss": 0.2665, "step": 1902 }, { "epoch": 0.39770114942528734, "grad_norm": 1.3832803963943907, "learning_rate": 1.9931253565891997e-05, "loss": 0.2724, "step": 1903 }, { "epoch": 0.3979101358411703, "grad_norm": 1.6687158823933477, "learning_rate": 1.9931121449701207e-05, "loss": 0.2684, "step": 1904 }, { "epoch": 0.3981191222570533, "grad_norm": 2.0356607544280254, "learning_rate": 1.9930989207121195e-05, "loss": 0.2829, "step": 1905 }, { "epoch": 0.3983281086729363, "grad_norm": 1.6358167606012708, "learning_rate": 1.9930856838153656e-05, "loss": 0.2707, "step": 1906 }, { "epoch": 0.39853709508881924, "grad_norm": 1.3240086360482168, "learning_rate": 1.993072434280027e-05, "loss": 0.2799, "step": 1907 }, { "epoch": 0.3987460815047022, "grad_norm": 1.449738465567333, "learning_rate": 1.993059172106272e-05, "loss": 0.271, "step": 1908 }, { "epoch": 0.39895506792058516, "grad_norm": 1.50770271253304, "learning_rate": 1.9930458972942698e-05, "loss": 0.2408, "step": 1909 }, { "epoch": 0.3991640543364681, "grad_norm": 1.4577615491165379, "learning_rate": 1.993032609844189e-05, "loss": 0.2506, "step": 1910 }, { "epoch": 0.3993730407523511, "grad_norm": 1.4827808020255662, "learning_rate": 1.993019309756199e-05, "loss": 0.2494, "step": 1911 }, { "epoch": 0.39958202716823404, "grad_norm": 1.2606784076702224, "learning_rate": 1.993005997030469e-05, "loss": 0.2775, "step": 1912 }, { "epoch": 0.399791013584117, "grad_norm": 1.599131120888543, "learning_rate": 1.9929926716671685e-05, "loss": 0.2396, "step": 1913 }, { "epoch": 0.4, "grad_norm": 1.4773771619470843, "learning_rate": 1.9929793336664667e-05, "loss": 0.2677, "step": 1914 }, { "epoch": 0.400208986415883, "grad_norm": 1.4128750815404498, "learning_rate": 1.992965983028534e-05, "loss": 0.3039, "step": 1915 }, { "epoch": 0.40041797283176594, "grad_norm": 1.720498191539039, "learning_rate": 1.9929526197535392e-05, "loss": 0.2712, "step": 1916 }, { "epoch": 0.4006269592476489, "grad_norm": 1.6044577232140587, "learning_rate": 1.9929392438416536e-05, "loss": 0.2486, "step": 1917 }, { "epoch": 0.40083594566353187, "grad_norm": 1.41452353605897, "learning_rate": 1.9929258552930468e-05, "loss": 0.2607, "step": 1918 }, { "epoch": 0.40104493207941483, "grad_norm": 1.3019250750434446, "learning_rate": 1.992912454107889e-05, "loss": 0.2646, "step": 1919 }, { "epoch": 0.4012539184952978, "grad_norm": 1.3672595318525267, "learning_rate": 1.9928990402863514e-05, "loss": 0.2929, "step": 1920 }, { "epoch": 0.40146290491118075, "grad_norm": 1.2744928660859571, "learning_rate": 1.9928856138286045e-05, "loss": 0.2437, "step": 1921 }, { "epoch": 0.40167189132706377, "grad_norm": 1.4069019996208958, "learning_rate": 1.992872174734819e-05, "loss": 0.2483, "step": 1922 }, { "epoch": 0.40188087774294673, "grad_norm": 1.4506085759279082, "learning_rate": 1.9928587230051657e-05, "loss": 0.2745, "step": 1923 }, { "epoch": 0.4020898641588297, "grad_norm": 1.538333844334536, "learning_rate": 1.9928452586398164e-05, "loss": 0.2594, "step": 1924 }, { "epoch": 0.40229885057471265, "grad_norm": 1.196851853490597, "learning_rate": 1.9928317816389416e-05, "loss": 0.2403, "step": 1925 }, { "epoch": 0.4025078369905956, "grad_norm": 1.5681989217740058, "learning_rate": 1.992818292002714e-05, "loss": 0.2729, "step": 1926 }, { "epoch": 0.4027168234064786, "grad_norm": 1.3874359157860394, "learning_rate": 1.992804789731304e-05, "loss": 0.2807, "step": 1927 }, { "epoch": 0.40292580982236154, "grad_norm": 1.377784377533469, "learning_rate": 1.9927912748248843e-05, "loss": 0.2881, "step": 1928 }, { "epoch": 0.4031347962382445, "grad_norm": 1.4676655249390946, "learning_rate": 1.992777747283627e-05, "loss": 0.2938, "step": 1929 }, { "epoch": 0.40334378265412746, "grad_norm": 2.0042218654389736, "learning_rate": 1.9927642071077032e-05, "loss": 0.2276, "step": 1930 }, { "epoch": 0.4035527690700105, "grad_norm": 1.7149020726372304, "learning_rate": 1.9927506542972868e-05, "loss": 0.2763, "step": 1931 }, { "epoch": 0.40376175548589344, "grad_norm": 1.4667144030120984, "learning_rate": 1.992737088852549e-05, "loss": 0.2903, "step": 1932 }, { "epoch": 0.4039707419017764, "grad_norm": 1.8997616548344542, "learning_rate": 1.9927235107736625e-05, "loss": 0.2934, "step": 1933 }, { "epoch": 0.40417972831765936, "grad_norm": 1.4779456640562043, "learning_rate": 1.992709920060801e-05, "loss": 0.2591, "step": 1934 }, { "epoch": 0.4043887147335423, "grad_norm": 1.389071817552176, "learning_rate": 1.9926963167141365e-05, "loss": 0.3007, "step": 1935 }, { "epoch": 0.4045977011494253, "grad_norm": 1.4272206979892172, "learning_rate": 1.992682700733843e-05, "loss": 0.2716, "step": 1936 }, { "epoch": 0.40480668756530824, "grad_norm": 1.3399021517660725, "learning_rate": 1.992669072120093e-05, "loss": 0.2589, "step": 1937 }, { "epoch": 0.4050156739811912, "grad_norm": 1.2790636486238616, "learning_rate": 1.99265543087306e-05, "loss": 0.2599, "step": 1938 }, { "epoch": 0.40522466039707417, "grad_norm": 1.5378477453939243, "learning_rate": 1.9926417769929184e-05, "loss": 0.2902, "step": 1939 }, { "epoch": 0.4054336468129572, "grad_norm": 1.8487297431155492, "learning_rate": 1.9926281104798413e-05, "loss": 0.2621, "step": 1940 }, { "epoch": 0.40564263322884014, "grad_norm": 1.226602991052105, "learning_rate": 1.9926144313340028e-05, "loss": 0.2694, "step": 1941 }, { "epoch": 0.4058516196447231, "grad_norm": 1.502094345788362, "learning_rate": 1.9926007395555768e-05, "loss": 0.252, "step": 1942 }, { "epoch": 0.40606060606060607, "grad_norm": 1.6149250771957178, "learning_rate": 1.992587035144738e-05, "loss": 0.2929, "step": 1943 }, { "epoch": 0.406269592476489, "grad_norm": 1.5618551620499832, "learning_rate": 1.9925733181016604e-05, "loss": 0.2703, "step": 1944 }, { "epoch": 0.406478578892372, "grad_norm": 1.4589279569058875, "learning_rate": 1.9925595884265186e-05, "loss": 0.2341, "step": 1945 }, { "epoch": 0.40668756530825495, "grad_norm": 1.3890964759993927, "learning_rate": 1.9925458461194874e-05, "loss": 0.2802, "step": 1946 }, { "epoch": 0.4068965517241379, "grad_norm": 1.4694121138274765, "learning_rate": 1.992532091180742e-05, "loss": 0.2495, "step": 1947 }, { "epoch": 0.4071055381400209, "grad_norm": 1.4504810175816882, "learning_rate": 1.992518323610457e-05, "loss": 0.2677, "step": 1948 }, { "epoch": 0.4073145245559039, "grad_norm": 1.652570035031907, "learning_rate": 1.9925045434088078e-05, "loss": 0.2863, "step": 1949 }, { "epoch": 0.40752351097178685, "grad_norm": 1.3560642750785632, "learning_rate": 1.9924907505759695e-05, "loss": 0.3196, "step": 1950 }, { "epoch": 0.4077324973876698, "grad_norm": 1.6050118044475834, "learning_rate": 1.992476945112118e-05, "loss": 0.2704, "step": 1951 }, { "epoch": 0.4079414838035528, "grad_norm": 1.30299013118038, "learning_rate": 1.992463127017429e-05, "loss": 0.2532, "step": 1952 }, { "epoch": 0.40815047021943573, "grad_norm": 1.6716525721239799, "learning_rate": 1.9924492962920787e-05, "loss": 0.2349, "step": 1953 }, { "epoch": 0.4083594566353187, "grad_norm": 1.3470340047164224, "learning_rate": 1.9924354529362418e-05, "loss": 0.2487, "step": 1954 }, { "epoch": 0.40856844305120166, "grad_norm": 1.4990376902211835, "learning_rate": 1.992421596950096e-05, "loss": 0.2722, "step": 1955 }, { "epoch": 0.4087774294670846, "grad_norm": 1.4584772982866452, "learning_rate": 1.9924077283338167e-05, "loss": 0.2617, "step": 1956 }, { "epoch": 0.4089864158829676, "grad_norm": 1.5220179757612253, "learning_rate": 1.9923938470875808e-05, "loss": 0.2684, "step": 1957 }, { "epoch": 0.4091954022988506, "grad_norm": 1.511109818441994, "learning_rate": 1.992379953211565e-05, "loss": 0.2665, "step": 1958 }, { "epoch": 0.40940438871473356, "grad_norm": 1.3328287499877682, "learning_rate": 1.9923660467059457e-05, "loss": 0.2795, "step": 1959 }, { "epoch": 0.4096133751306165, "grad_norm": 1.2807464276242877, "learning_rate": 1.9923521275709e-05, "loss": 0.2261, "step": 1960 }, { "epoch": 0.4098223615464995, "grad_norm": 1.4587574598332496, "learning_rate": 1.9923381958066056e-05, "loss": 0.2709, "step": 1961 }, { "epoch": 0.41003134796238244, "grad_norm": 1.5555255790326092, "learning_rate": 1.992324251413239e-05, "loss": 0.2574, "step": 1962 }, { "epoch": 0.4102403343782654, "grad_norm": 1.3756497350424073, "learning_rate": 1.9923102943909787e-05, "loss": 0.2703, "step": 1963 }, { "epoch": 0.41044932079414836, "grad_norm": 1.634852259593286, "learning_rate": 1.9922963247400012e-05, "loss": 0.2618, "step": 1964 }, { "epoch": 0.4106583072100313, "grad_norm": 1.672572155092752, "learning_rate": 1.9922823424604848e-05, "loss": 0.2546, "step": 1965 }, { "epoch": 0.41086729362591434, "grad_norm": 1.3724717341616404, "learning_rate": 1.9922683475526077e-05, "loss": 0.2604, "step": 1966 }, { "epoch": 0.4110762800417973, "grad_norm": 1.5119956721374288, "learning_rate": 1.9922543400165475e-05, "loss": 0.2895, "step": 1967 }, { "epoch": 0.41128526645768027, "grad_norm": 1.3279218010299392, "learning_rate": 1.9922403198524827e-05, "loss": 0.2526, "step": 1968 }, { "epoch": 0.4114942528735632, "grad_norm": 1.8524616971685344, "learning_rate": 1.9922262870605918e-05, "loss": 0.2415, "step": 1969 }, { "epoch": 0.4117032392894462, "grad_norm": 1.2552109979563884, "learning_rate": 1.992212241641053e-05, "loss": 0.2569, "step": 1970 }, { "epoch": 0.41191222570532915, "grad_norm": 1.699316759684141, "learning_rate": 1.992198183594046e-05, "loss": 0.2885, "step": 1971 }, { "epoch": 0.4121212121212121, "grad_norm": 1.3521943640688951, "learning_rate": 1.9921841129197487e-05, "loss": 0.2202, "step": 1972 }, { "epoch": 0.41233019853709507, "grad_norm": 1.2838475146288284, "learning_rate": 1.9921700296183403e-05, "loss": 0.2311, "step": 1973 }, { "epoch": 0.41253918495297803, "grad_norm": 1.6828555090941908, "learning_rate": 1.9921559336900006e-05, "loss": 0.2606, "step": 1974 }, { "epoch": 0.41274817136886105, "grad_norm": 1.3374387193721817, "learning_rate": 1.9921418251349087e-05, "loss": 0.2538, "step": 1975 }, { "epoch": 0.412957157784744, "grad_norm": 1.697669746440757, "learning_rate": 1.992127703953244e-05, "loss": 0.2848, "step": 1976 }, { "epoch": 0.413166144200627, "grad_norm": 1.4839485495674207, "learning_rate": 1.9921135701451864e-05, "loss": 0.2675, "step": 1977 }, { "epoch": 0.41337513061650993, "grad_norm": 1.3235806806626533, "learning_rate": 1.992099423710916e-05, "loss": 0.2553, "step": 1978 }, { "epoch": 0.4135841170323929, "grad_norm": 1.6451018690069457, "learning_rate": 1.992085264650612e-05, "loss": 0.2456, "step": 1979 }, { "epoch": 0.41379310344827586, "grad_norm": 1.6948230632351466, "learning_rate": 1.9920710929644553e-05, "loss": 0.2906, "step": 1980 }, { "epoch": 0.4140020898641588, "grad_norm": 1.269127734733721, "learning_rate": 1.992056908652626e-05, "loss": 0.265, "step": 1981 }, { "epoch": 0.4142110762800418, "grad_norm": 1.8111666330947767, "learning_rate": 1.9920427117153053e-05, "loss": 0.2714, "step": 1982 }, { "epoch": 0.41442006269592474, "grad_norm": 1.4268653330245704, "learning_rate": 1.9920285021526728e-05, "loss": 0.2598, "step": 1983 }, { "epoch": 0.41462904911180776, "grad_norm": 2.5563899132131045, "learning_rate": 1.9920142799649098e-05, "loss": 0.2772, "step": 1984 }, { "epoch": 0.4148380355276907, "grad_norm": 1.8762884754239595, "learning_rate": 1.9920000451521977e-05, "loss": 0.2594, "step": 1985 }, { "epoch": 0.4150470219435737, "grad_norm": 2.659971427461686, "learning_rate": 1.991985797714717e-05, "loss": 0.3164, "step": 1986 }, { "epoch": 0.41525600835945664, "grad_norm": 1.4137557852069564, "learning_rate": 1.9919715376526497e-05, "loss": 0.2474, "step": 1987 }, { "epoch": 0.4154649947753396, "grad_norm": 1.4931938915069711, "learning_rate": 1.9919572649661763e-05, "loss": 0.2596, "step": 1988 }, { "epoch": 0.41567398119122256, "grad_norm": 1.3771934707434539, "learning_rate": 1.9919429796554795e-05, "loss": 0.2817, "step": 1989 }, { "epoch": 0.4158829676071055, "grad_norm": 1.8417409360421098, "learning_rate": 1.9919286817207405e-05, "loss": 0.2792, "step": 1990 }, { "epoch": 0.4160919540229885, "grad_norm": 1.4482679049926195, "learning_rate": 1.9919143711621417e-05, "loss": 0.2464, "step": 1991 }, { "epoch": 0.41630094043887145, "grad_norm": 1.7921565307245237, "learning_rate": 1.9919000479798645e-05, "loss": 0.2836, "step": 1992 }, { "epoch": 0.41650992685475446, "grad_norm": 1.2562977888062778, "learning_rate": 1.9918857121740917e-05, "loss": 0.2336, "step": 1993 }, { "epoch": 0.4167189132706374, "grad_norm": 1.4429356994019058, "learning_rate": 1.991871363745006e-05, "loss": 0.2127, "step": 1994 }, { "epoch": 0.4169278996865204, "grad_norm": 1.667227937457724, "learning_rate": 1.9918570026927893e-05, "loss": 0.2926, "step": 1995 }, { "epoch": 0.41713688610240335, "grad_norm": 1.8136527717721331, "learning_rate": 1.991842629017625e-05, "loss": 0.274, "step": 1996 }, { "epoch": 0.4173458725182863, "grad_norm": 1.4936130600546365, "learning_rate": 1.9918282427196957e-05, "loss": 0.2717, "step": 1997 }, { "epoch": 0.41755485893416927, "grad_norm": 1.206363313688562, "learning_rate": 1.9918138437991842e-05, "loss": 0.2647, "step": 1998 }, { "epoch": 0.41776384535005223, "grad_norm": 1.7538664207944252, "learning_rate": 1.9917994322562745e-05, "loss": 0.283, "step": 1999 }, { "epoch": 0.4179728317659352, "grad_norm": 1.533758004677805, "learning_rate": 1.9917850080911495e-05, "loss": 0.2749, "step": 2000 }, { "epoch": 0.41818181818181815, "grad_norm": 1.5918707296369004, "learning_rate": 1.991770571303993e-05, "loss": 0.2498, "step": 2001 }, { "epoch": 0.41839080459770117, "grad_norm": 1.4678278827525313, "learning_rate": 1.9917561218949886e-05, "loss": 0.2286, "step": 2002 }, { "epoch": 0.41859979101358413, "grad_norm": 1.4239195598157997, "learning_rate": 1.99174165986432e-05, "loss": 0.2962, "step": 2003 }, { "epoch": 0.4188087774294671, "grad_norm": 1.377463636203555, "learning_rate": 1.991727185212172e-05, "loss": 0.2582, "step": 2004 }, { "epoch": 0.41901776384535006, "grad_norm": 2.3646685100774207, "learning_rate": 1.9917126979387277e-05, "loss": 0.2972, "step": 2005 }, { "epoch": 0.419226750261233, "grad_norm": 1.549876663470473, "learning_rate": 1.9916981980441725e-05, "loss": 0.275, "step": 2006 }, { "epoch": 0.419435736677116, "grad_norm": 1.2763358408806007, "learning_rate": 1.99168368552869e-05, "loss": 0.2586, "step": 2007 }, { "epoch": 0.41964472309299894, "grad_norm": 1.1745120152911195, "learning_rate": 1.9916691603924656e-05, "loss": 0.263, "step": 2008 }, { "epoch": 0.4198537095088819, "grad_norm": 1.335144484596808, "learning_rate": 1.991654622635684e-05, "loss": 0.2738, "step": 2009 }, { "epoch": 0.4200626959247649, "grad_norm": 1.6838324767855348, "learning_rate": 1.99164007225853e-05, "loss": 0.2654, "step": 2010 }, { "epoch": 0.4202716823406479, "grad_norm": 1.6178783805546575, "learning_rate": 1.991625509261189e-05, "loss": 0.2509, "step": 2011 }, { "epoch": 0.42048066875653084, "grad_norm": 1.3267616502613553, "learning_rate": 1.9916109336438463e-05, "loss": 0.2724, "step": 2012 }, { "epoch": 0.4206896551724138, "grad_norm": 1.6961571487045346, "learning_rate": 1.9915963454066874e-05, "loss": 0.2858, "step": 2013 }, { "epoch": 0.42089864158829676, "grad_norm": 1.571781419920522, "learning_rate": 1.991581744549898e-05, "loss": 0.2835, "step": 2014 }, { "epoch": 0.4211076280041797, "grad_norm": 1.7453428818968153, "learning_rate": 1.9915671310736636e-05, "loss": 0.2575, "step": 2015 }, { "epoch": 0.4213166144200627, "grad_norm": 1.8611800404909977, "learning_rate": 1.9915525049781706e-05, "loss": 0.3017, "step": 2016 }, { "epoch": 0.42152560083594565, "grad_norm": 1.2978496929759804, "learning_rate": 1.9915378662636045e-05, "loss": 0.2443, "step": 2017 }, { "epoch": 0.4217345872518286, "grad_norm": 1.3168707735075758, "learning_rate": 1.9915232149301523e-05, "loss": 0.2532, "step": 2018 }, { "epoch": 0.4219435736677116, "grad_norm": 1.2789506778383177, "learning_rate": 1.9915085509780007e-05, "loss": 0.2495, "step": 2019 }, { "epoch": 0.4221525600835946, "grad_norm": 1.377080858600754, "learning_rate": 1.9914938744073354e-05, "loss": 0.2616, "step": 2020 }, { "epoch": 0.42236154649947755, "grad_norm": 1.3790043889907049, "learning_rate": 1.9914791852183436e-05, "loss": 0.2482, "step": 2021 }, { "epoch": 0.4225705329153605, "grad_norm": 1.2873749202037428, "learning_rate": 1.9914644834112125e-05, "loss": 0.2727, "step": 2022 }, { "epoch": 0.42277951933124347, "grad_norm": 1.2114344502055654, "learning_rate": 1.991449768986129e-05, "loss": 0.2538, "step": 2023 }, { "epoch": 0.42298850574712643, "grad_norm": 1.4091625460386958, "learning_rate": 1.9914350419432803e-05, "loss": 0.2299, "step": 2024 }, { "epoch": 0.4231974921630094, "grad_norm": 1.271735877739934, "learning_rate": 1.9914203022828532e-05, "loss": 0.2897, "step": 2025 }, { "epoch": 0.42340647857889235, "grad_norm": 1.3676809675938835, "learning_rate": 1.9914055500050366e-05, "loss": 0.2666, "step": 2026 }, { "epoch": 0.4236154649947753, "grad_norm": 1.5163600104471375, "learning_rate": 1.9913907851100178e-05, "loss": 0.2987, "step": 2027 }, { "epoch": 0.42382445141065833, "grad_norm": 1.5295883294420733, "learning_rate": 1.991376007597984e-05, "loss": 0.298, "step": 2028 }, { "epoch": 0.4240334378265413, "grad_norm": 1.5634512488474965, "learning_rate": 1.9913612174691243e-05, "loss": 0.2647, "step": 2029 }, { "epoch": 0.42424242424242425, "grad_norm": 1.591262916076806, "learning_rate": 1.9913464147236257e-05, "loss": 0.2276, "step": 2030 }, { "epoch": 0.4244514106583072, "grad_norm": 1.5401662609035787, "learning_rate": 1.9913315993616775e-05, "loss": 0.2586, "step": 2031 }, { "epoch": 0.4246603970741902, "grad_norm": 1.5824479422920654, "learning_rate": 1.9913167713834683e-05, "loss": 0.2772, "step": 2032 }, { "epoch": 0.42486938349007314, "grad_norm": 1.6456690287950415, "learning_rate": 1.9913019307891864e-05, "loss": 0.2565, "step": 2033 }, { "epoch": 0.4250783699059561, "grad_norm": 1.5649203366799178, "learning_rate": 1.9912870775790207e-05, "loss": 0.2793, "step": 2034 }, { "epoch": 0.42528735632183906, "grad_norm": 1.3529060904242565, "learning_rate": 1.9912722117531605e-05, "loss": 0.2713, "step": 2035 }, { "epoch": 0.425496342737722, "grad_norm": 1.3032469704605796, "learning_rate": 1.991257333311795e-05, "loss": 0.2744, "step": 2036 }, { "epoch": 0.42570532915360504, "grad_norm": 1.401577317923454, "learning_rate": 1.991242442255113e-05, "loss": 0.2244, "step": 2037 }, { "epoch": 0.425914315569488, "grad_norm": 1.5140922416736726, "learning_rate": 1.9912275385833046e-05, "loss": 0.2947, "step": 2038 }, { "epoch": 0.42612330198537096, "grad_norm": 1.5448700939972355, "learning_rate": 1.991212622296559e-05, "loss": 0.2989, "step": 2039 }, { "epoch": 0.4263322884012539, "grad_norm": 1.5141233368057818, "learning_rate": 1.9911976933950667e-05, "loss": 0.2761, "step": 2040 }, { "epoch": 0.4265412748171369, "grad_norm": 1.7141158230255662, "learning_rate": 1.991182751879017e-05, "loss": 0.2773, "step": 2041 }, { "epoch": 0.42675026123301985, "grad_norm": 1.8182393716298242, "learning_rate": 1.9911677977486008e-05, "loss": 0.2473, "step": 2042 }, { "epoch": 0.4269592476489028, "grad_norm": 1.3007251080205242, "learning_rate": 1.9911528310040073e-05, "loss": 0.2549, "step": 2043 }, { "epoch": 0.42716823406478577, "grad_norm": 1.4863068063376372, "learning_rate": 1.9911378516454277e-05, "loss": 0.2836, "step": 2044 }, { "epoch": 0.42737722048066873, "grad_norm": 1.3561840887534502, "learning_rate": 1.991122859673053e-05, "loss": 0.2468, "step": 2045 }, { "epoch": 0.42758620689655175, "grad_norm": 1.3102134547763244, "learning_rate": 1.9911078550870736e-05, "loss": 0.2721, "step": 2046 }, { "epoch": 0.4277951933124347, "grad_norm": 1.4291570826953748, "learning_rate": 1.99109283788768e-05, "loss": 0.2819, "step": 2047 }, { "epoch": 0.42800417972831767, "grad_norm": 1.3287335560344473, "learning_rate": 1.991077808075064e-05, "loss": 0.2536, "step": 2048 }, { "epoch": 0.42821316614420063, "grad_norm": 1.7191066884345394, "learning_rate": 1.991062765649417e-05, "loss": 0.2522, "step": 2049 }, { "epoch": 0.4284221525600836, "grad_norm": 1.3892126581400475, "learning_rate": 1.9910477106109293e-05, "loss": 0.2646, "step": 2050 }, { "epoch": 0.42863113897596655, "grad_norm": 1.2688756230279303, "learning_rate": 1.9910326429597936e-05, "loss": 0.2542, "step": 2051 }, { "epoch": 0.4288401253918495, "grad_norm": 1.2494733951221906, "learning_rate": 1.9910175626962017e-05, "loss": 0.2767, "step": 2052 }, { "epoch": 0.4290491118077325, "grad_norm": 1.316204368397733, "learning_rate": 1.9910024698203448e-05, "loss": 0.2943, "step": 2053 }, { "epoch": 0.4292580982236155, "grad_norm": 1.787408607803819, "learning_rate": 1.9909873643324153e-05, "loss": 0.2512, "step": 2054 }, { "epoch": 0.42946708463949845, "grad_norm": 1.771135156679509, "learning_rate": 1.9909722462326053e-05, "loss": 0.239, "step": 2055 }, { "epoch": 0.4296760710553814, "grad_norm": 1.2785000720557447, "learning_rate": 1.9909571155211077e-05, "loss": 0.2504, "step": 2056 }, { "epoch": 0.4298850574712644, "grad_norm": 1.7026924151744998, "learning_rate": 1.9909419721981145e-05, "loss": 0.294, "step": 2057 }, { "epoch": 0.43009404388714734, "grad_norm": 1.8070344745360412, "learning_rate": 1.9909268162638188e-05, "loss": 0.2428, "step": 2058 }, { "epoch": 0.4303030303030303, "grad_norm": 1.3626120880381565, "learning_rate": 1.9909116477184135e-05, "loss": 0.2867, "step": 2059 }, { "epoch": 0.43051201671891326, "grad_norm": 1.3949419977832564, "learning_rate": 1.9908964665620913e-05, "loss": 0.2778, "step": 2060 }, { "epoch": 0.4307210031347962, "grad_norm": 1.4287031774634351, "learning_rate": 1.9908812727950453e-05, "loss": 0.2802, "step": 2061 }, { "epoch": 0.4309299895506792, "grad_norm": 1.4122255437412288, "learning_rate": 1.9908660664174694e-05, "loss": 0.2602, "step": 2062 }, { "epoch": 0.4311389759665622, "grad_norm": 1.5711456332433633, "learning_rate": 1.990850847429557e-05, "loss": 0.2895, "step": 2063 }, { "epoch": 0.43134796238244516, "grad_norm": 1.1453670052074536, "learning_rate": 1.9908356158315015e-05, "loss": 0.2447, "step": 2064 }, { "epoch": 0.4315569487983281, "grad_norm": 1.3217892813484309, "learning_rate": 1.990820371623497e-05, "loss": 0.2587, "step": 2065 }, { "epoch": 0.4317659352142111, "grad_norm": 1.4665348051329543, "learning_rate": 1.9908051148057374e-05, "loss": 0.2864, "step": 2066 }, { "epoch": 0.43197492163009404, "grad_norm": 1.5096882744719522, "learning_rate": 1.9907898453784167e-05, "loss": 0.2855, "step": 2067 }, { "epoch": 0.432183908045977, "grad_norm": 1.3613723089862617, "learning_rate": 1.9907745633417295e-05, "loss": 0.2564, "step": 2068 }, { "epoch": 0.43239289446185997, "grad_norm": 1.3892557093470324, "learning_rate": 1.9907592686958703e-05, "loss": 0.269, "step": 2069 }, { "epoch": 0.43260188087774293, "grad_norm": 1.6113038267296145, "learning_rate": 1.9907439614410333e-05, "loss": 0.2478, "step": 2070 }, { "epoch": 0.4328108672936259, "grad_norm": 1.381589191801107, "learning_rate": 1.9907286415774142e-05, "loss": 0.2529, "step": 2071 }, { "epoch": 0.4330198537095089, "grad_norm": 1.837314517076211, "learning_rate": 1.990713309105207e-05, "loss": 0.2477, "step": 2072 }, { "epoch": 0.43322884012539187, "grad_norm": 1.4002250658000681, "learning_rate": 1.9906979640246072e-05, "loss": 0.271, "step": 2073 }, { "epoch": 0.43343782654127483, "grad_norm": 1.5998620603147882, "learning_rate": 1.9906826063358107e-05, "loss": 0.2594, "step": 2074 }, { "epoch": 0.4336468129571578, "grad_norm": 1.7860547094699544, "learning_rate": 1.9906672360390117e-05, "loss": 0.2536, "step": 2075 }, { "epoch": 0.43385579937304075, "grad_norm": 1.693739778564915, "learning_rate": 1.990651853134407e-05, "loss": 0.2671, "step": 2076 }, { "epoch": 0.4340647857889237, "grad_norm": 1.326866969556206, "learning_rate": 1.9906364576221916e-05, "loss": 0.2711, "step": 2077 }, { "epoch": 0.4342737722048067, "grad_norm": 1.6931535186345927, "learning_rate": 1.9906210495025617e-05, "loss": 0.2772, "step": 2078 }, { "epoch": 0.43448275862068964, "grad_norm": 1.3742409210719508, "learning_rate": 1.9906056287757135e-05, "loss": 0.2803, "step": 2079 }, { "epoch": 0.4346917450365726, "grad_norm": 1.4937396909680973, "learning_rate": 1.9905901954418434e-05, "loss": 0.2621, "step": 2080 }, { "epoch": 0.4349007314524556, "grad_norm": 1.4472813199205135, "learning_rate": 1.9905747495011472e-05, "loss": 0.2777, "step": 2081 }, { "epoch": 0.4351097178683386, "grad_norm": 1.6298750316022432, "learning_rate": 1.9905592909538222e-05, "loss": 0.2697, "step": 2082 }, { "epoch": 0.43531870428422154, "grad_norm": 1.2967797403020678, "learning_rate": 1.9905438198000644e-05, "loss": 0.2757, "step": 2083 }, { "epoch": 0.4355276907001045, "grad_norm": 1.6298913547432996, "learning_rate": 1.9905283360400712e-05, "loss": 0.2461, "step": 2084 }, { "epoch": 0.43573667711598746, "grad_norm": 1.8731882208899766, "learning_rate": 1.9905128396740397e-05, "loss": 0.2499, "step": 2085 }, { "epoch": 0.4359456635318704, "grad_norm": 1.435816597240786, "learning_rate": 1.9904973307021667e-05, "loss": 0.2716, "step": 2086 }, { "epoch": 0.4361546499477534, "grad_norm": 1.3899363068529822, "learning_rate": 1.9904818091246498e-05, "loss": 0.2584, "step": 2087 }, { "epoch": 0.43636363636363634, "grad_norm": 1.2915340309669325, "learning_rate": 1.9904662749416864e-05, "loss": 0.2611, "step": 2088 }, { "epoch": 0.4365726227795193, "grad_norm": 1.2950333309026583, "learning_rate": 1.990450728153475e-05, "loss": 0.2692, "step": 2089 }, { "epoch": 0.4367816091954023, "grad_norm": 1.3741969355605586, "learning_rate": 1.9904351687602124e-05, "loss": 0.2707, "step": 2090 }, { "epoch": 0.4369905956112853, "grad_norm": 1.6305112417107939, "learning_rate": 1.990419596762097e-05, "loss": 0.2328, "step": 2091 }, { "epoch": 0.43719958202716824, "grad_norm": 1.3112623888511057, "learning_rate": 1.9904040121593267e-05, "loss": 0.2517, "step": 2092 }, { "epoch": 0.4374085684430512, "grad_norm": 1.5357476378076345, "learning_rate": 1.9903884149521008e-05, "loss": 0.2643, "step": 2093 }, { "epoch": 0.43761755485893417, "grad_norm": 1.2944684139692322, "learning_rate": 1.990372805140617e-05, "loss": 0.2561, "step": 2094 }, { "epoch": 0.4378265412748171, "grad_norm": 1.6195936569217806, "learning_rate": 1.9903571827250738e-05, "loss": 0.2596, "step": 2095 }, { "epoch": 0.4380355276907001, "grad_norm": 1.339260156368857, "learning_rate": 1.9903415477056702e-05, "loss": 0.2571, "step": 2096 }, { "epoch": 0.43824451410658305, "grad_norm": 1.302953161231856, "learning_rate": 1.9903259000826058e-05, "loss": 0.2801, "step": 2097 }, { "epoch": 0.43845350052246607, "grad_norm": 2.0285459743620904, "learning_rate": 1.9903102398560787e-05, "loss": 0.2515, "step": 2098 }, { "epoch": 0.438662486938349, "grad_norm": 1.277589802228013, "learning_rate": 1.9902945670262892e-05, "loss": 0.2689, "step": 2099 }, { "epoch": 0.438871473354232, "grad_norm": 1.2059700071704846, "learning_rate": 1.9902788815934362e-05, "loss": 0.2407, "step": 2100 }, { "epoch": 0.43908045977011495, "grad_norm": 1.535655904247217, "learning_rate": 1.9902631835577192e-05, "loss": 0.2936, "step": 2101 }, { "epoch": 0.4392894461859979, "grad_norm": 1.2105446331008098, "learning_rate": 1.9902474729193385e-05, "loss": 0.2609, "step": 2102 }, { "epoch": 0.4394984326018809, "grad_norm": 1.8802501883346596, "learning_rate": 1.9902317496784936e-05, "loss": 0.2549, "step": 2103 }, { "epoch": 0.43970741901776383, "grad_norm": 1.4732969024591969, "learning_rate": 1.9902160138353848e-05, "loss": 0.2738, "step": 2104 }, { "epoch": 0.4399164054336468, "grad_norm": 1.9430810586912344, "learning_rate": 1.9902002653902123e-05, "loss": 0.2496, "step": 2105 }, { "epoch": 0.44012539184952976, "grad_norm": 1.7713949362129657, "learning_rate": 1.9901845043431763e-05, "loss": 0.2893, "step": 2106 }, { "epoch": 0.4403343782654128, "grad_norm": 1.4695347383723363, "learning_rate": 1.990168730694478e-05, "loss": 0.2278, "step": 2107 }, { "epoch": 0.44054336468129573, "grad_norm": 1.4725321862708096, "learning_rate": 1.9901529444443177e-05, "loss": 0.2648, "step": 2108 }, { "epoch": 0.4407523510971787, "grad_norm": 1.1409514044165374, "learning_rate": 1.9901371455928965e-05, "loss": 0.2554, "step": 2109 }, { "epoch": 0.44096133751306166, "grad_norm": 1.4216949359031033, "learning_rate": 1.9901213341404152e-05, "loss": 0.2457, "step": 2110 }, { "epoch": 0.4411703239289446, "grad_norm": 1.2844520214818782, "learning_rate": 1.9901055100870753e-05, "loss": 0.255, "step": 2111 }, { "epoch": 0.4413793103448276, "grad_norm": 1.2845379540086335, "learning_rate": 1.9900896734330775e-05, "loss": 0.2491, "step": 2112 }, { "epoch": 0.44158829676071054, "grad_norm": 1.3323007362872858, "learning_rate": 1.9900738241786244e-05, "loss": 0.2652, "step": 2113 }, { "epoch": 0.4417972831765935, "grad_norm": 1.448015278666879, "learning_rate": 1.990057962323917e-05, "loss": 0.2953, "step": 2114 }, { "epoch": 0.44200626959247646, "grad_norm": 1.1984446626514085, "learning_rate": 1.9900420878691578e-05, "loss": 0.2563, "step": 2115 }, { "epoch": 0.4422152560083595, "grad_norm": 1.564792179092922, "learning_rate": 1.9900262008145478e-05, "loss": 0.2518, "step": 2116 }, { "epoch": 0.44242424242424244, "grad_norm": 1.336582655890725, "learning_rate": 1.99001030116029e-05, "loss": 0.2681, "step": 2117 }, { "epoch": 0.4426332288401254, "grad_norm": 1.336076374081774, "learning_rate": 1.9899943889065866e-05, "loss": 0.2269, "step": 2118 }, { "epoch": 0.44284221525600836, "grad_norm": 1.3979921389274679, "learning_rate": 1.9899784640536403e-05, "loss": 0.2958, "step": 2119 }, { "epoch": 0.4430512016718913, "grad_norm": 1.4793684422851323, "learning_rate": 1.989962526601653e-05, "loss": 0.2544, "step": 2120 }, { "epoch": 0.4432601880877743, "grad_norm": 1.4940462780309574, "learning_rate": 1.9899465765508285e-05, "loss": 0.2574, "step": 2121 }, { "epoch": 0.44346917450365725, "grad_norm": 1.3808396324706436, "learning_rate": 1.9899306139013693e-05, "loss": 0.2479, "step": 2122 }, { "epoch": 0.4436781609195402, "grad_norm": 1.4245463183432792, "learning_rate": 1.989914638653478e-05, "loss": 0.2783, "step": 2123 }, { "epoch": 0.44388714733542317, "grad_norm": 1.2633555905231892, "learning_rate": 1.9898986508073593e-05, "loss": 0.2369, "step": 2124 }, { "epoch": 0.4440961337513062, "grad_norm": 1.330131506499367, "learning_rate": 1.9898826503632154e-05, "loss": 0.2288, "step": 2125 }, { "epoch": 0.44430512016718915, "grad_norm": 1.640243928102639, "learning_rate": 1.98986663732125e-05, "loss": 0.2193, "step": 2126 }, { "epoch": 0.4445141065830721, "grad_norm": 1.4354046221905117, "learning_rate": 1.989850611681668e-05, "loss": 0.2441, "step": 2127 }, { "epoch": 0.44472309299895507, "grad_norm": 1.4084744731506291, "learning_rate": 1.9898345734446726e-05, "loss": 0.2779, "step": 2128 }, { "epoch": 0.44493207941483803, "grad_norm": 2.2258958184026914, "learning_rate": 1.989818522610468e-05, "loss": 0.2862, "step": 2129 }, { "epoch": 0.445141065830721, "grad_norm": 1.3807006161103221, "learning_rate": 1.989802459179258e-05, "loss": 0.2401, "step": 2130 }, { "epoch": 0.44535005224660396, "grad_norm": 1.5042177146707711, "learning_rate": 1.9897863831512475e-05, "loss": 0.284, "step": 2131 }, { "epoch": 0.4455590386624869, "grad_norm": 1.3341944372360854, "learning_rate": 1.9897702945266414e-05, "loss": 0.2843, "step": 2132 }, { "epoch": 0.4457680250783699, "grad_norm": 1.516560936742589, "learning_rate": 1.9897541933056437e-05, "loss": 0.2926, "step": 2133 }, { "epoch": 0.4459770114942529, "grad_norm": 1.7421427228794844, "learning_rate": 1.98973807948846e-05, "loss": 0.2262, "step": 2134 }, { "epoch": 0.44618599791013586, "grad_norm": 1.3227057980254562, "learning_rate": 1.989721953075295e-05, "loss": 0.2601, "step": 2135 }, { "epoch": 0.4463949843260188, "grad_norm": 1.3917852569179374, "learning_rate": 1.989705814066354e-05, "loss": 0.2757, "step": 2136 }, { "epoch": 0.4466039707419018, "grad_norm": 1.6962738220023998, "learning_rate": 1.9896896624618424e-05, "loss": 0.2693, "step": 2137 }, { "epoch": 0.44681295715778474, "grad_norm": 1.3172493287521425, "learning_rate": 1.9896734982619658e-05, "loss": 0.2688, "step": 2138 }, { "epoch": 0.4470219435736677, "grad_norm": 1.4225084057205004, "learning_rate": 1.98965732146693e-05, "loss": 0.2387, "step": 2139 }, { "epoch": 0.44723092998955066, "grad_norm": 1.4898256470490003, "learning_rate": 1.9896411320769407e-05, "loss": 0.2503, "step": 2140 }, { "epoch": 0.4474399164054336, "grad_norm": 1.4274111157038054, "learning_rate": 1.989624930092204e-05, "loss": 0.2571, "step": 2141 }, { "epoch": 0.44764890282131664, "grad_norm": 1.7328738616691999, "learning_rate": 1.989608715512926e-05, "loss": 0.2981, "step": 2142 }, { "epoch": 0.4478578892371996, "grad_norm": 1.376085857575213, "learning_rate": 1.989592488339313e-05, "loss": 0.2587, "step": 2143 }, { "epoch": 0.44806687565308256, "grad_norm": 1.2681334324153126, "learning_rate": 1.989576248571572e-05, "loss": 0.2285, "step": 2144 }, { "epoch": 0.4482758620689655, "grad_norm": 1.4246379049002205, "learning_rate": 1.9895599962099094e-05, "loss": 0.2317, "step": 2145 }, { "epoch": 0.4484848484848485, "grad_norm": 1.494912604065126, "learning_rate": 1.9895437312545316e-05, "loss": 0.2591, "step": 2146 }, { "epoch": 0.44869383490073145, "grad_norm": 1.3055114942556303, "learning_rate": 1.989527453705646e-05, "loss": 0.2397, "step": 2147 }, { "epoch": 0.4489028213166144, "grad_norm": 1.3167906932850488, "learning_rate": 1.98951116356346e-05, "loss": 0.2762, "step": 2148 }, { "epoch": 0.44911180773249737, "grad_norm": 1.6105017367725056, "learning_rate": 1.9894948608281806e-05, "loss": 0.2741, "step": 2149 }, { "epoch": 0.44932079414838033, "grad_norm": 1.3184806872508907, "learning_rate": 1.9894785455000153e-05, "loss": 0.2676, "step": 2150 }, { "epoch": 0.44952978056426335, "grad_norm": 1.6493619927801697, "learning_rate": 1.9894622175791718e-05, "loss": 0.2791, "step": 2151 }, { "epoch": 0.4497387669801463, "grad_norm": 1.213164857522585, "learning_rate": 1.9894458770658577e-05, "loss": 0.2811, "step": 2152 }, { "epoch": 0.44994775339602927, "grad_norm": 1.4593861181049537, "learning_rate": 1.9894295239602815e-05, "loss": 0.2821, "step": 2153 }, { "epoch": 0.45015673981191223, "grad_norm": 1.6828377115023188, "learning_rate": 1.9894131582626505e-05, "loss": 0.2271, "step": 2154 }, { "epoch": 0.4503657262277952, "grad_norm": 1.3407211948582358, "learning_rate": 1.9893967799731737e-05, "loss": 0.237, "step": 2155 }, { "epoch": 0.45057471264367815, "grad_norm": 1.2942017455468444, "learning_rate": 1.9893803890920595e-05, "loss": 0.2423, "step": 2156 }, { "epoch": 0.4507836990595611, "grad_norm": 1.3042526188426546, "learning_rate": 1.9893639856195156e-05, "loss": 0.245, "step": 2157 }, { "epoch": 0.4509926854754441, "grad_norm": 1.6577946499505472, "learning_rate": 1.9893475695557517e-05, "loss": 0.2692, "step": 2158 }, { "epoch": 0.45120167189132704, "grad_norm": 1.3640626181647062, "learning_rate": 1.989331140900977e-05, "loss": 0.2478, "step": 2159 }, { "epoch": 0.45141065830721006, "grad_norm": 1.352164638837519, "learning_rate": 1.9893146996553992e-05, "loss": 0.2783, "step": 2160 }, { "epoch": 0.451619644723093, "grad_norm": 1.3556560178678914, "learning_rate": 1.9892982458192286e-05, "loss": 0.2271, "step": 2161 }, { "epoch": 0.451828631138976, "grad_norm": 1.2599108649149626, "learning_rate": 1.9892817793926746e-05, "loss": 0.2512, "step": 2162 }, { "epoch": 0.45203761755485894, "grad_norm": 1.4738135786947035, "learning_rate": 1.9892653003759462e-05, "loss": 0.2588, "step": 2163 }, { "epoch": 0.4522466039707419, "grad_norm": 1.5776902349420328, "learning_rate": 1.989248808769254e-05, "loss": 0.2563, "step": 2164 }, { "epoch": 0.45245559038662486, "grad_norm": 1.236710867635322, "learning_rate": 1.989232304572807e-05, "loss": 0.2488, "step": 2165 }, { "epoch": 0.4526645768025078, "grad_norm": 1.4204879239761394, "learning_rate": 1.9892157877868153e-05, "loss": 0.2773, "step": 2166 }, { "epoch": 0.4528735632183908, "grad_norm": 1.284331306355368, "learning_rate": 1.98919925841149e-05, "loss": 0.2774, "step": 2167 }, { "epoch": 0.45308254963427375, "grad_norm": 1.36321536431068, "learning_rate": 1.9891827164470402e-05, "loss": 0.286, "step": 2168 }, { "epoch": 0.45329153605015676, "grad_norm": 1.34388635380889, "learning_rate": 1.9891661618936774e-05, "loss": 0.2619, "step": 2169 }, { "epoch": 0.4535005224660397, "grad_norm": 1.262945576696253, "learning_rate": 1.989149594751612e-05, "loss": 0.2424, "step": 2170 }, { "epoch": 0.4537095088819227, "grad_norm": 1.3384505205420243, "learning_rate": 1.9891330150210545e-05, "loss": 0.2509, "step": 2171 }, { "epoch": 0.45391849529780565, "grad_norm": 1.2485351538336011, "learning_rate": 1.9891164227022165e-05, "loss": 0.2643, "step": 2172 }, { "epoch": 0.4541274817136886, "grad_norm": 1.3395859466415785, "learning_rate": 1.9890998177953085e-05, "loss": 0.2672, "step": 2173 }, { "epoch": 0.45433646812957157, "grad_norm": 1.1780375132744325, "learning_rate": 1.9890832003005426e-05, "loss": 0.2623, "step": 2174 }, { "epoch": 0.45454545454545453, "grad_norm": 1.2279759687939242, "learning_rate": 1.9890665702181294e-05, "loss": 0.2553, "step": 2175 }, { "epoch": 0.4547544409613375, "grad_norm": 1.1791198700665002, "learning_rate": 1.9890499275482815e-05, "loss": 0.2613, "step": 2176 }, { "epoch": 0.4549634273772205, "grad_norm": 1.5582503628289612, "learning_rate": 1.98903327229121e-05, "loss": 0.2793, "step": 2177 }, { "epoch": 0.45517241379310347, "grad_norm": 1.4059628139794063, "learning_rate": 1.989016604447127e-05, "loss": 0.271, "step": 2178 }, { "epoch": 0.45538140020898643, "grad_norm": 1.7316081878569962, "learning_rate": 1.9889999240162448e-05, "loss": 0.2548, "step": 2179 }, { "epoch": 0.4555903866248694, "grad_norm": 1.5868753015657253, "learning_rate": 1.9889832309987755e-05, "loss": 0.2381, "step": 2180 }, { "epoch": 0.45579937304075235, "grad_norm": 1.4132214961156222, "learning_rate": 1.988966525394932e-05, "loss": 0.2876, "step": 2181 }, { "epoch": 0.4560083594566353, "grad_norm": 1.3344947544407026, "learning_rate": 1.9889498072049257e-05, "loss": 0.2509, "step": 2182 }, { "epoch": 0.4562173458725183, "grad_norm": 1.665477050126975, "learning_rate": 1.988933076428971e-05, "loss": 0.2614, "step": 2183 }, { "epoch": 0.45642633228840124, "grad_norm": 1.4014199979171174, "learning_rate": 1.9889163330672794e-05, "loss": 0.2571, "step": 2184 }, { "epoch": 0.4566353187042842, "grad_norm": 1.4409667582754164, "learning_rate": 1.988899577120065e-05, "loss": 0.2577, "step": 2185 }, { "epoch": 0.4568443051201672, "grad_norm": 1.4777241565386485, "learning_rate": 1.9888828085875407e-05, "loss": 0.2778, "step": 2186 }, { "epoch": 0.4570532915360502, "grad_norm": 1.391921176088029, "learning_rate": 1.9888660274699197e-05, "loss": 0.2823, "step": 2187 }, { "epoch": 0.45726227795193314, "grad_norm": 1.318615744498674, "learning_rate": 1.988849233767416e-05, "loss": 0.2631, "step": 2188 }, { "epoch": 0.4574712643678161, "grad_norm": 1.4276877275993924, "learning_rate": 1.9888324274802428e-05, "loss": 0.2354, "step": 2189 }, { "epoch": 0.45768025078369906, "grad_norm": 1.4989899668399453, "learning_rate": 1.988815608608614e-05, "loss": 0.2917, "step": 2190 }, { "epoch": 0.457889237199582, "grad_norm": 1.7110162180310424, "learning_rate": 1.9887987771527442e-05, "loss": 0.238, "step": 2191 }, { "epoch": 0.458098223615465, "grad_norm": 1.4782505942273623, "learning_rate": 1.9887819331128474e-05, "loss": 0.2639, "step": 2192 }, { "epoch": 0.45830721003134794, "grad_norm": 1.3900174045345128, "learning_rate": 1.9887650764891378e-05, "loss": 0.2663, "step": 2193 }, { "epoch": 0.4585161964472309, "grad_norm": 1.4521128805195471, "learning_rate": 1.9887482072818297e-05, "loss": 0.2739, "step": 2194 }, { "epoch": 0.4587251828631139, "grad_norm": 1.415720078580797, "learning_rate": 1.988731325491138e-05, "loss": 0.2749, "step": 2195 }, { "epoch": 0.4589341692789969, "grad_norm": 1.2928130453979945, "learning_rate": 1.9887144311172785e-05, "loss": 0.2413, "step": 2196 }, { "epoch": 0.45914315569487985, "grad_norm": 1.462841983822924, "learning_rate": 1.9886975241604645e-05, "loss": 0.2696, "step": 2197 }, { "epoch": 0.4593521421107628, "grad_norm": 1.3886867814768606, "learning_rate": 1.988680604620912e-05, "loss": 0.2536, "step": 2198 }, { "epoch": 0.45956112852664577, "grad_norm": 1.386147048316457, "learning_rate": 1.988663672498837e-05, "loss": 0.2616, "step": 2199 }, { "epoch": 0.45977011494252873, "grad_norm": 1.333125743040463, "learning_rate": 1.9886467277944537e-05, "loss": 0.2727, "step": 2200 }, { "epoch": 0.4599791013584117, "grad_norm": 1.425676553785674, "learning_rate": 1.9886297705079785e-05, "loss": 0.253, "step": 2201 }, { "epoch": 0.46018808777429465, "grad_norm": 1.4258175917852265, "learning_rate": 1.9886128006396274e-05, "loss": 0.2778, "step": 2202 }, { "epoch": 0.4603970741901776, "grad_norm": 1.2420847983726313, "learning_rate": 1.9885958181896155e-05, "loss": 0.2216, "step": 2203 }, { "epoch": 0.46060606060606063, "grad_norm": 1.2398097913817194, "learning_rate": 1.98857882315816e-05, "loss": 0.2254, "step": 2204 }, { "epoch": 0.4608150470219436, "grad_norm": 1.3172795270162765, "learning_rate": 1.988561815545476e-05, "loss": 0.2757, "step": 2205 }, { "epoch": 0.46102403343782655, "grad_norm": 1.3401112868066918, "learning_rate": 1.9885447953517812e-05, "loss": 0.2611, "step": 2206 }, { "epoch": 0.4612330198537095, "grad_norm": 1.4758974997302434, "learning_rate": 1.9885277625772913e-05, "loss": 0.2417, "step": 2207 }, { "epoch": 0.4614420062695925, "grad_norm": 1.3343278021669218, "learning_rate": 1.988510717222224e-05, "loss": 0.2458, "step": 2208 }, { "epoch": 0.46165099268547544, "grad_norm": 1.4195298155843474, "learning_rate": 1.9884936592867947e-05, "loss": 0.2428, "step": 2209 }, { "epoch": 0.4618599791013584, "grad_norm": 1.4147885270182863, "learning_rate": 1.9884765887712217e-05, "loss": 0.2408, "step": 2210 }, { "epoch": 0.46206896551724136, "grad_norm": 1.4055804352096668, "learning_rate": 1.988459505675722e-05, "loss": 0.2364, "step": 2211 }, { "epoch": 0.4622779519331243, "grad_norm": 1.3685459887210327, "learning_rate": 1.9884424100005133e-05, "loss": 0.2485, "step": 2212 }, { "epoch": 0.46248693834900734, "grad_norm": 1.5484626175439176, "learning_rate": 1.9884253017458125e-05, "loss": 0.2407, "step": 2213 }, { "epoch": 0.4626959247648903, "grad_norm": 1.4177430307379815, "learning_rate": 1.988408180911838e-05, "loss": 0.2933, "step": 2214 }, { "epoch": 0.46290491118077326, "grad_norm": 1.808308747000229, "learning_rate": 1.988391047498807e-05, "loss": 0.2352, "step": 2215 }, { "epoch": 0.4631138975966562, "grad_norm": 1.4868408247442089, "learning_rate": 1.9883739015069378e-05, "loss": 0.2698, "step": 2216 }, { "epoch": 0.4633228840125392, "grad_norm": 1.2687898507245317, "learning_rate": 1.988356742936449e-05, "loss": 0.2919, "step": 2217 }, { "epoch": 0.46353187042842214, "grad_norm": 1.3154195487741822, "learning_rate": 1.9883395717875584e-05, "loss": 0.2702, "step": 2218 }, { "epoch": 0.4637408568443051, "grad_norm": 1.5974490503694634, "learning_rate": 1.988322388060485e-05, "loss": 0.2567, "step": 2219 }, { "epoch": 0.46394984326018807, "grad_norm": 1.234275975560857, "learning_rate": 1.9883051917554473e-05, "loss": 0.2482, "step": 2220 }, { "epoch": 0.4641588296760711, "grad_norm": 1.3283266638469275, "learning_rate": 1.988287982872664e-05, "loss": 0.2328, "step": 2221 }, { "epoch": 0.46436781609195404, "grad_norm": 1.244603470676294, "learning_rate": 1.9882707614123547e-05, "loss": 0.2513, "step": 2222 }, { "epoch": 0.464576802507837, "grad_norm": 1.2415537761736226, "learning_rate": 1.9882535273747378e-05, "loss": 0.269, "step": 2223 }, { "epoch": 0.46478578892371997, "grad_norm": 1.3984520560223819, "learning_rate": 1.988236280760033e-05, "loss": 0.2364, "step": 2224 }, { "epoch": 0.46499477533960293, "grad_norm": 1.4529266521706738, "learning_rate": 1.9882190215684597e-05, "loss": 0.2338, "step": 2225 }, { "epoch": 0.4652037617554859, "grad_norm": 1.4494217886397476, "learning_rate": 1.9882017498002376e-05, "loss": 0.2615, "step": 2226 }, { "epoch": 0.46541274817136885, "grad_norm": 1.5343937173514384, "learning_rate": 1.9881844654555864e-05, "loss": 0.2656, "step": 2227 }, { "epoch": 0.4656217345872518, "grad_norm": 1.5594639455813588, "learning_rate": 1.9881671685347263e-05, "loss": 0.2512, "step": 2228 }, { "epoch": 0.4658307210031348, "grad_norm": 1.495819538083024, "learning_rate": 1.9881498590378773e-05, "loss": 0.2402, "step": 2229 }, { "epoch": 0.4660397074190178, "grad_norm": 1.4799156366658386, "learning_rate": 1.98813253696526e-05, "loss": 0.2664, "step": 2230 }, { "epoch": 0.46624869383490075, "grad_norm": 1.652015829766398, "learning_rate": 1.988115202317094e-05, "loss": 0.2532, "step": 2231 }, { "epoch": 0.4664576802507837, "grad_norm": 1.5752425664623306, "learning_rate": 1.9880978550936007e-05, "loss": 0.2445, "step": 2232 }, { "epoch": 0.4666666666666667, "grad_norm": 1.6412044855457322, "learning_rate": 1.9880804952950008e-05, "loss": 0.2716, "step": 2233 }, { "epoch": 0.46687565308254964, "grad_norm": 1.358066132735413, "learning_rate": 1.988063122921515e-05, "loss": 0.272, "step": 2234 }, { "epoch": 0.4670846394984326, "grad_norm": 1.5901125116642851, "learning_rate": 1.9880457379733645e-05, "loss": 0.2488, "step": 2235 }, { "epoch": 0.46729362591431556, "grad_norm": 1.5248155711613407, "learning_rate": 1.9880283404507703e-05, "loss": 0.2385, "step": 2236 }, { "epoch": 0.4675026123301985, "grad_norm": 1.4427285730527588, "learning_rate": 1.988010930353954e-05, "loss": 0.2569, "step": 2237 }, { "epoch": 0.4677115987460815, "grad_norm": 1.4699737849887349, "learning_rate": 1.9879935076831375e-05, "loss": 0.2692, "step": 2238 }, { "epoch": 0.4679205851619645, "grad_norm": 1.2129150513462001, "learning_rate": 1.9879760724385422e-05, "loss": 0.2693, "step": 2239 }, { "epoch": 0.46812957157784746, "grad_norm": 1.296424316020801, "learning_rate": 1.9879586246203896e-05, "loss": 0.2756, "step": 2240 }, { "epoch": 0.4683385579937304, "grad_norm": 1.5216819111981985, "learning_rate": 1.987941164228903e-05, "loss": 0.2596, "step": 2241 }, { "epoch": 0.4685475444096134, "grad_norm": 1.306404547884122, "learning_rate": 1.9879236912643028e-05, "loss": 0.2211, "step": 2242 }, { "epoch": 0.46875653082549634, "grad_norm": 1.4464507920154517, "learning_rate": 1.9879062057268128e-05, "loss": 0.2582, "step": 2243 }, { "epoch": 0.4689655172413793, "grad_norm": 1.4125111975673845, "learning_rate": 1.987888707616655e-05, "loss": 0.2628, "step": 2244 }, { "epoch": 0.46917450365726227, "grad_norm": 1.6250236887558784, "learning_rate": 1.9878711969340524e-05, "loss": 0.3017, "step": 2245 }, { "epoch": 0.4693834900731452, "grad_norm": 1.4904042623229408, "learning_rate": 1.9878536736792275e-05, "loss": 0.2374, "step": 2246 }, { "epoch": 0.4695924764890282, "grad_norm": 1.5272583066557646, "learning_rate": 1.987836137852403e-05, "loss": 0.2523, "step": 2247 }, { "epoch": 0.4698014629049112, "grad_norm": 1.31520046362568, "learning_rate": 1.9878185894538032e-05, "loss": 0.2671, "step": 2248 }, { "epoch": 0.47001044932079417, "grad_norm": 1.848431010978684, "learning_rate": 1.9878010284836505e-05, "loss": 0.2678, "step": 2249 }, { "epoch": 0.4702194357366771, "grad_norm": 1.4799655324478667, "learning_rate": 1.9877834549421685e-05, "loss": 0.2564, "step": 2250 }, { "epoch": 0.4704284221525601, "grad_norm": 1.3900484785009062, "learning_rate": 1.9877658688295808e-05, "loss": 0.2335, "step": 2251 }, { "epoch": 0.47063740856844305, "grad_norm": 1.6146296819952926, "learning_rate": 1.9877482701461118e-05, "loss": 0.2334, "step": 2252 }, { "epoch": 0.470846394984326, "grad_norm": 1.7204600288651166, "learning_rate": 1.9877306588919846e-05, "loss": 0.2709, "step": 2253 }, { "epoch": 0.471055381400209, "grad_norm": 1.3727427545526256, "learning_rate": 1.987713035067424e-05, "loss": 0.2266, "step": 2254 }, { "epoch": 0.47126436781609193, "grad_norm": 1.5335548254904179, "learning_rate": 1.987695398672654e-05, "loss": 0.2697, "step": 2255 }, { "epoch": 0.4714733542319749, "grad_norm": 1.2904136476254628, "learning_rate": 1.9876777497078994e-05, "loss": 0.2238, "step": 2256 }, { "epoch": 0.4716823406478579, "grad_norm": 1.592529458891243, "learning_rate": 1.9876600881733843e-05, "loss": 0.2565, "step": 2257 }, { "epoch": 0.4718913270637409, "grad_norm": 1.355573676323968, "learning_rate": 1.987642414069334e-05, "loss": 0.2646, "step": 2258 }, { "epoch": 0.47210031347962383, "grad_norm": 1.5101389517830879, "learning_rate": 1.987624727395973e-05, "loss": 0.2788, "step": 2259 }, { "epoch": 0.4723092998955068, "grad_norm": 1.6556400801749893, "learning_rate": 1.9876070281535265e-05, "loss": 0.2551, "step": 2260 }, { "epoch": 0.47251828631138976, "grad_norm": 1.5570370127819673, "learning_rate": 1.9875893163422195e-05, "loss": 0.2549, "step": 2261 }, { "epoch": 0.4727272727272727, "grad_norm": 1.636888709108826, "learning_rate": 1.987571591962278e-05, "loss": 0.2304, "step": 2262 }, { "epoch": 0.4729362591431557, "grad_norm": 1.2578696266253262, "learning_rate": 1.9875538550139273e-05, "loss": 0.2532, "step": 2263 }, { "epoch": 0.47314524555903864, "grad_norm": 1.3644291039815348, "learning_rate": 1.987536105497393e-05, "loss": 0.2297, "step": 2264 }, { "epoch": 0.47335423197492166, "grad_norm": 1.541844366490467, "learning_rate": 1.987518343412901e-05, "loss": 0.2573, "step": 2265 }, { "epoch": 0.4735632183908046, "grad_norm": 1.4216400047939586, "learning_rate": 1.9875005687606777e-05, "loss": 0.2986, "step": 2266 }, { "epoch": 0.4737722048066876, "grad_norm": 1.5740006096221437, "learning_rate": 1.987482781540949e-05, "loss": 0.2339, "step": 2267 }, { "epoch": 0.47398119122257054, "grad_norm": 1.6777665957592531, "learning_rate": 1.987464981753941e-05, "loss": 0.2558, "step": 2268 }, { "epoch": 0.4741901776384535, "grad_norm": 1.3072734003721698, "learning_rate": 1.9874471693998808e-05, "loss": 0.2645, "step": 2269 }, { "epoch": 0.47439916405433646, "grad_norm": 1.2191191701370732, "learning_rate": 1.9874293444789948e-05, "loss": 0.2494, "step": 2270 }, { "epoch": 0.4746081504702194, "grad_norm": 1.2884131598910498, "learning_rate": 1.98741150699151e-05, "loss": 0.241, "step": 2271 }, { "epoch": 0.4748171368861024, "grad_norm": 1.329487954518807, "learning_rate": 1.987393656937653e-05, "loss": 0.2635, "step": 2272 }, { "epoch": 0.47502612330198535, "grad_norm": 1.5321801354770639, "learning_rate": 1.9873757943176515e-05, "loss": 0.2542, "step": 2273 }, { "epoch": 0.47523510971786836, "grad_norm": 1.5569368820422511, "learning_rate": 1.9873579191317324e-05, "loss": 0.273, "step": 2274 }, { "epoch": 0.4754440961337513, "grad_norm": 1.3378844870021236, "learning_rate": 1.9873400313801236e-05, "loss": 0.2335, "step": 2275 }, { "epoch": 0.4756530825496343, "grad_norm": 1.205646313855314, "learning_rate": 1.9873221310630528e-05, "loss": 0.2346, "step": 2276 }, { "epoch": 0.47586206896551725, "grad_norm": 1.587404414373974, "learning_rate": 1.9873042181807473e-05, "loss": 0.2812, "step": 2277 }, { "epoch": 0.4760710553814002, "grad_norm": 1.382507944214206, "learning_rate": 1.987286292733435e-05, "loss": 0.2599, "step": 2278 }, { "epoch": 0.47628004179728317, "grad_norm": 1.4963068070813894, "learning_rate": 1.9872683547213446e-05, "loss": 0.2347, "step": 2279 }, { "epoch": 0.47648902821316613, "grad_norm": 1.26516325078893, "learning_rate": 1.9872504041447044e-05, "loss": 0.2768, "step": 2280 }, { "epoch": 0.4766980146290491, "grad_norm": 1.391023943436843, "learning_rate": 1.9872324410037423e-05, "loss": 0.3082, "step": 2281 }, { "epoch": 0.47690700104493206, "grad_norm": 1.2976696467931192, "learning_rate": 1.987214465298687e-05, "loss": 0.2363, "step": 2282 }, { "epoch": 0.47711598746081507, "grad_norm": 1.6571910029315595, "learning_rate": 1.9871964770297678e-05, "loss": 0.2781, "step": 2283 }, { "epoch": 0.47732497387669803, "grad_norm": 1.4524899419382669, "learning_rate": 1.9871784761972135e-05, "loss": 0.2757, "step": 2284 }, { "epoch": 0.477533960292581, "grad_norm": 1.2009967509799249, "learning_rate": 1.9871604628012523e-05, "loss": 0.2244, "step": 2285 }, { "epoch": 0.47774294670846396, "grad_norm": 1.3433257289971472, "learning_rate": 1.9871424368421148e-05, "loss": 0.2683, "step": 2286 }, { "epoch": 0.4779519331243469, "grad_norm": 1.5399446730179678, "learning_rate": 1.9871243983200295e-05, "loss": 0.2643, "step": 2287 }, { "epoch": 0.4781609195402299, "grad_norm": 1.3381655575474156, "learning_rate": 1.9871063472352262e-05, "loss": 0.261, "step": 2288 }, { "epoch": 0.47836990595611284, "grad_norm": 1.4667779594038988, "learning_rate": 1.9870882835879347e-05, "loss": 0.2842, "step": 2289 }, { "epoch": 0.4785788923719958, "grad_norm": 1.5030708686798497, "learning_rate": 1.9870702073783846e-05, "loss": 0.254, "step": 2290 }, { "epoch": 0.47878787878787876, "grad_norm": 1.212026174505661, "learning_rate": 1.9870521186068063e-05, "loss": 0.2366, "step": 2291 }, { "epoch": 0.4789968652037618, "grad_norm": 1.3648157770761449, "learning_rate": 1.98703401727343e-05, "loss": 0.2491, "step": 2292 }, { "epoch": 0.47920585161964474, "grad_norm": 1.443086351019095, "learning_rate": 1.987015903378486e-05, "loss": 0.2643, "step": 2293 }, { "epoch": 0.4794148380355277, "grad_norm": 1.3336180712139645, "learning_rate": 1.9869977769222046e-05, "loss": 0.2575, "step": 2294 }, { "epoch": 0.47962382445141066, "grad_norm": 1.2778616034194015, "learning_rate": 1.9869796379048165e-05, "loss": 0.2287, "step": 2295 }, { "epoch": 0.4798328108672936, "grad_norm": 1.3400595096061911, "learning_rate": 1.9869614863265533e-05, "loss": 0.2392, "step": 2296 }, { "epoch": 0.4800417972831766, "grad_norm": 1.4023287339775345, "learning_rate": 1.986943322187645e-05, "loss": 0.2692, "step": 2297 }, { "epoch": 0.48025078369905955, "grad_norm": 1.3252746258295154, "learning_rate": 1.986925145488323e-05, "loss": 0.2485, "step": 2298 }, { "epoch": 0.4804597701149425, "grad_norm": 1.2806571303607237, "learning_rate": 1.9869069562288193e-05, "loss": 0.236, "step": 2299 }, { "epoch": 0.48066875653082547, "grad_norm": 1.7061737803395647, "learning_rate": 1.9868887544093646e-05, "loss": 0.2772, "step": 2300 }, { "epoch": 0.4808777429467085, "grad_norm": 1.3112796493644423, "learning_rate": 1.986870540030191e-05, "loss": 0.2516, "step": 2301 }, { "epoch": 0.48108672936259145, "grad_norm": 1.4233432636286394, "learning_rate": 1.98685231309153e-05, "loss": 0.2389, "step": 2302 }, { "epoch": 0.4812957157784744, "grad_norm": 1.2759085514814557, "learning_rate": 1.986834073593614e-05, "loss": 0.2768, "step": 2303 }, { "epoch": 0.48150470219435737, "grad_norm": 1.2799288179960562, "learning_rate": 1.9868158215366745e-05, "loss": 0.231, "step": 2304 }, { "epoch": 0.48171368861024033, "grad_norm": 1.616903367489074, "learning_rate": 1.9867975569209442e-05, "loss": 0.2261, "step": 2305 }, { "epoch": 0.4819226750261233, "grad_norm": 1.8368051956431484, "learning_rate": 1.986779279746656e-05, "loss": 0.2677, "step": 2306 }, { "epoch": 0.48213166144200625, "grad_norm": 1.4130712992361534, "learning_rate": 1.9867609900140413e-05, "loss": 0.2265, "step": 2307 }, { "epoch": 0.4823406478578892, "grad_norm": 1.3994052957106244, "learning_rate": 1.986742687723334e-05, "loss": 0.2562, "step": 2308 }, { "epoch": 0.48254963427377223, "grad_norm": 1.4537044894259676, "learning_rate": 1.9867243728747665e-05, "loss": 0.2691, "step": 2309 }, { "epoch": 0.4827586206896552, "grad_norm": 1.2807684514876747, "learning_rate": 1.986706045468572e-05, "loss": 0.2196, "step": 2310 }, { "epoch": 0.48296760710553815, "grad_norm": 1.5453352310913524, "learning_rate": 1.9866877055049835e-05, "loss": 0.2666, "step": 2311 }, { "epoch": 0.4831765935214211, "grad_norm": 1.3916880558446707, "learning_rate": 1.986669352984235e-05, "loss": 0.2163, "step": 2312 }, { "epoch": 0.4833855799373041, "grad_norm": 1.542843307417022, "learning_rate": 1.986650987906559e-05, "loss": 0.2663, "step": 2313 }, { "epoch": 0.48359456635318704, "grad_norm": 1.81081740959029, "learning_rate": 1.9866326102721906e-05, "loss": 0.273, "step": 2314 }, { "epoch": 0.48380355276907, "grad_norm": 1.4180485600696113, "learning_rate": 1.9866142200813623e-05, "loss": 0.2513, "step": 2315 }, { "epoch": 0.48401253918495296, "grad_norm": 1.541228180180041, "learning_rate": 1.9865958173343095e-05, "loss": 0.2712, "step": 2316 }, { "epoch": 0.4842215256008359, "grad_norm": 1.3197114745786058, "learning_rate": 1.9865774020312657e-05, "loss": 0.2666, "step": 2317 }, { "epoch": 0.48443051201671894, "grad_norm": 1.185659804723643, "learning_rate": 1.9865589741724653e-05, "loss": 0.2536, "step": 2318 }, { "epoch": 0.4846394984326019, "grad_norm": 1.4204298653160714, "learning_rate": 1.9865405337581425e-05, "loss": 0.2367, "step": 2319 }, { "epoch": 0.48484848484848486, "grad_norm": 1.7122835558150307, "learning_rate": 1.9865220807885323e-05, "loss": 0.2703, "step": 2320 }, { "epoch": 0.4850574712643678, "grad_norm": 1.371864990336601, "learning_rate": 1.9865036152638697e-05, "loss": 0.246, "step": 2321 }, { "epoch": 0.4852664576802508, "grad_norm": 1.373865302264977, "learning_rate": 1.9864851371843896e-05, "loss": 0.2385, "step": 2322 }, { "epoch": 0.48547544409613375, "grad_norm": 1.3488570019808634, "learning_rate": 1.986466646550327e-05, "loss": 0.2315, "step": 2323 }, { "epoch": 0.4856844305120167, "grad_norm": 1.6805627843456465, "learning_rate": 1.9864481433619175e-05, "loss": 0.2442, "step": 2324 }, { "epoch": 0.48589341692789967, "grad_norm": 1.2836295279824244, "learning_rate": 1.9864296276193966e-05, "loss": 0.2374, "step": 2325 }, { "epoch": 0.48610240334378263, "grad_norm": 1.2340999242498825, "learning_rate": 1.9864110993229993e-05, "loss": 0.2263, "step": 2326 }, { "epoch": 0.48631138975966565, "grad_norm": 1.4104289590039958, "learning_rate": 1.986392558472962e-05, "loss": 0.2402, "step": 2327 }, { "epoch": 0.4865203761755486, "grad_norm": 1.4602412088483354, "learning_rate": 1.9863740050695207e-05, "loss": 0.2667, "step": 2328 }, { "epoch": 0.48672936259143157, "grad_norm": 1.4837224671825067, "learning_rate": 1.9863554391129113e-05, "loss": 0.2645, "step": 2329 }, { "epoch": 0.48693834900731453, "grad_norm": 1.2936421877348265, "learning_rate": 1.9863368606033703e-05, "loss": 0.2192, "step": 2330 }, { "epoch": 0.4871473354231975, "grad_norm": 2.7625639116420504, "learning_rate": 1.9863182695411336e-05, "loss": 0.25, "step": 2331 }, { "epoch": 0.48735632183908045, "grad_norm": 1.4527127852106478, "learning_rate": 1.9862996659264383e-05, "loss": 0.2996, "step": 2332 }, { "epoch": 0.4875653082549634, "grad_norm": 1.4787067950121493, "learning_rate": 1.986281049759521e-05, "loss": 0.2479, "step": 2333 }, { "epoch": 0.4877742946708464, "grad_norm": 1.3872110387163863, "learning_rate": 1.986262421040619e-05, "loss": 0.2547, "step": 2334 }, { "epoch": 0.48798328108672934, "grad_norm": 1.40153577493741, "learning_rate": 1.9862437797699688e-05, "loss": 0.2462, "step": 2335 }, { "epoch": 0.48819226750261235, "grad_norm": 1.155878512233842, "learning_rate": 1.9862251259478075e-05, "loss": 0.2111, "step": 2336 }, { "epoch": 0.4884012539184953, "grad_norm": 1.4641327604327865, "learning_rate": 1.9862064595743736e-05, "loss": 0.2457, "step": 2337 }, { "epoch": 0.4886102403343783, "grad_norm": 1.6931055595703541, "learning_rate": 1.9861877806499033e-05, "loss": 0.2763, "step": 2338 }, { "epoch": 0.48881922675026124, "grad_norm": 1.4143524336369304, "learning_rate": 1.9861690891746352e-05, "loss": 0.2472, "step": 2339 }, { "epoch": 0.4890282131661442, "grad_norm": 1.3440545501145926, "learning_rate": 1.986150385148807e-05, "loss": 0.2457, "step": 2340 }, { "epoch": 0.48923719958202716, "grad_norm": 1.5776884104816349, "learning_rate": 1.9861316685726567e-05, "loss": 0.2623, "step": 2341 }, { "epoch": 0.4894461859979101, "grad_norm": 1.2554406300751442, "learning_rate": 1.9861129394464224e-05, "loss": 0.2393, "step": 2342 }, { "epoch": 0.4896551724137931, "grad_norm": 1.5607304603688297, "learning_rate": 1.9860941977703426e-05, "loss": 0.2778, "step": 2343 }, { "epoch": 0.48986415882967604, "grad_norm": 1.4611809058633942, "learning_rate": 1.9860754435446555e-05, "loss": 0.2327, "step": 2344 }, { "epoch": 0.49007314524555906, "grad_norm": 1.3150866497972244, "learning_rate": 1.9860566767696004e-05, "loss": 0.2648, "step": 2345 }, { "epoch": 0.490282131661442, "grad_norm": 1.235846452604579, "learning_rate": 1.9860378974454157e-05, "loss": 0.2354, "step": 2346 }, { "epoch": 0.490491118077325, "grad_norm": 1.3331063009034019, "learning_rate": 1.9860191055723403e-05, "loss": 0.257, "step": 2347 }, { "epoch": 0.49070010449320794, "grad_norm": 1.770696286913327, "learning_rate": 1.9860003011506134e-05, "loss": 0.2737, "step": 2348 }, { "epoch": 0.4909090909090909, "grad_norm": 1.6621264217305514, "learning_rate": 1.9859814841804742e-05, "loss": 0.2816, "step": 2349 }, { "epoch": 0.49111807732497387, "grad_norm": 1.555641279463097, "learning_rate": 1.985962654662163e-05, "loss": 0.2631, "step": 2350 }, { "epoch": 0.49132706374085683, "grad_norm": 1.2283258793689031, "learning_rate": 1.9859438125959185e-05, "loss": 0.2166, "step": 2351 }, { "epoch": 0.4915360501567398, "grad_norm": 1.367282441882038, "learning_rate": 1.9859249579819808e-05, "loss": 0.2446, "step": 2352 }, { "epoch": 0.4917450365726228, "grad_norm": 1.3583352397858852, "learning_rate": 1.98590609082059e-05, "loss": 0.2531, "step": 2353 }, { "epoch": 0.49195402298850577, "grad_norm": 1.180967988937996, "learning_rate": 1.985887211111986e-05, "loss": 0.2663, "step": 2354 }, { "epoch": 0.49216300940438873, "grad_norm": 1.326374797596779, "learning_rate": 1.9858683188564093e-05, "loss": 0.2538, "step": 2355 }, { "epoch": 0.4923719958202717, "grad_norm": 1.286632822904808, "learning_rate": 1.9858494140541e-05, "loss": 0.2746, "step": 2356 }, { "epoch": 0.49258098223615465, "grad_norm": 1.245268002049469, "learning_rate": 1.9858304967052993e-05, "loss": 0.2473, "step": 2357 }, { "epoch": 0.4927899686520376, "grad_norm": 1.1757394268857528, "learning_rate": 1.9858115668102475e-05, "loss": 0.2653, "step": 2358 }, { "epoch": 0.4929989550679206, "grad_norm": 1.3209889336303504, "learning_rate": 1.9857926243691853e-05, "loss": 0.2419, "step": 2359 }, { "epoch": 0.49320794148380354, "grad_norm": 1.2929743065149861, "learning_rate": 1.985773669382354e-05, "loss": 0.2478, "step": 2360 }, { "epoch": 0.4934169278996865, "grad_norm": 1.5304606006356172, "learning_rate": 1.985754701849995e-05, "loss": 0.2766, "step": 2361 }, { "epoch": 0.4936259143155695, "grad_norm": 1.4794303826559205, "learning_rate": 1.9857357217723497e-05, "loss": 0.2707, "step": 2362 }, { "epoch": 0.4938349007314525, "grad_norm": 1.3560999168459147, "learning_rate": 1.985716729149659e-05, "loss": 0.2605, "step": 2363 }, { "epoch": 0.49404388714733544, "grad_norm": 1.2880177351915107, "learning_rate": 1.9856977239821658e-05, "loss": 0.2203, "step": 2364 }, { "epoch": 0.4942528735632184, "grad_norm": 1.5644822624848604, "learning_rate": 1.985678706270111e-05, "loss": 0.2571, "step": 2365 }, { "epoch": 0.49446185997910136, "grad_norm": 1.5239526019254717, "learning_rate": 1.9856596760137367e-05, "loss": 0.2274, "step": 2366 }, { "epoch": 0.4946708463949843, "grad_norm": 1.4318938467069124, "learning_rate": 1.9856406332132855e-05, "loss": 0.2502, "step": 2367 }, { "epoch": 0.4948798328108673, "grad_norm": 1.2972071964677163, "learning_rate": 1.985621577869e-05, "loss": 0.2225, "step": 2368 }, { "epoch": 0.49508881922675024, "grad_norm": 1.5572355898491999, "learning_rate": 1.9856025099811218e-05, "loss": 0.2528, "step": 2369 }, { "epoch": 0.4952978056426332, "grad_norm": 1.2686289343445887, "learning_rate": 1.985583429549894e-05, "loss": 0.2548, "step": 2370 }, { "epoch": 0.4955067920585162, "grad_norm": 1.4542668121399691, "learning_rate": 1.9855643365755597e-05, "loss": 0.244, "step": 2371 }, { "epoch": 0.4957157784743992, "grad_norm": 1.317926391954877, "learning_rate": 1.9855452310583616e-05, "loss": 0.2349, "step": 2372 }, { "epoch": 0.49592476489028214, "grad_norm": 1.272564613447626, "learning_rate": 1.9855261129985428e-05, "loss": 0.2576, "step": 2373 }, { "epoch": 0.4961337513061651, "grad_norm": 1.502267594939876, "learning_rate": 1.985506982396347e-05, "loss": 0.2483, "step": 2374 }, { "epoch": 0.49634273772204807, "grad_norm": 1.3938479429841124, "learning_rate": 1.9854878392520172e-05, "loss": 0.2464, "step": 2375 }, { "epoch": 0.496551724137931, "grad_norm": 1.4686414504587029, "learning_rate": 1.985468683565797e-05, "loss": 0.2499, "step": 2376 }, { "epoch": 0.496760710553814, "grad_norm": 1.3412700720228783, "learning_rate": 1.9854495153379307e-05, "loss": 0.2179, "step": 2377 }, { "epoch": 0.49696969696969695, "grad_norm": 1.1604400254639897, "learning_rate": 1.985430334568662e-05, "loss": 0.2129, "step": 2378 }, { "epoch": 0.4971786833855799, "grad_norm": 1.485627610278766, "learning_rate": 1.9854111412582346e-05, "loss": 0.2447, "step": 2379 }, { "epoch": 0.49738766980146293, "grad_norm": 1.5864268376941857, "learning_rate": 1.9853919354068932e-05, "loss": 0.2344, "step": 2380 }, { "epoch": 0.4975966562173459, "grad_norm": 1.4817548078169707, "learning_rate": 1.9853727170148827e-05, "loss": 0.2656, "step": 2381 }, { "epoch": 0.49780564263322885, "grad_norm": 1.3684395897322832, "learning_rate": 1.9853534860824465e-05, "loss": 0.2607, "step": 2382 }, { "epoch": 0.4980146290491118, "grad_norm": 1.4783028249370547, "learning_rate": 1.98533424260983e-05, "loss": 0.3045, "step": 2383 }, { "epoch": 0.4982236154649948, "grad_norm": 1.5310027213270492, "learning_rate": 1.9853149865972786e-05, "loss": 0.2614, "step": 2384 }, { "epoch": 0.49843260188087773, "grad_norm": 1.4268055100831172, "learning_rate": 1.9852957180450364e-05, "loss": 0.2642, "step": 2385 }, { "epoch": 0.4986415882967607, "grad_norm": 1.425315547358462, "learning_rate": 1.985276436953349e-05, "loss": 0.2269, "step": 2386 }, { "epoch": 0.49885057471264366, "grad_norm": 1.3655477315827442, "learning_rate": 1.9852571433224617e-05, "loss": 0.2447, "step": 2387 }, { "epoch": 0.4990595611285266, "grad_norm": 1.4524412527226018, "learning_rate": 1.9852378371526208e-05, "loss": 0.2681, "step": 2388 }, { "epoch": 0.49926854754440964, "grad_norm": 1.4384401098261934, "learning_rate": 1.985218518444071e-05, "loss": 0.2441, "step": 2389 }, { "epoch": 0.4994775339602926, "grad_norm": 1.4224165662615114, "learning_rate": 1.9851991871970588e-05, "loss": 0.2586, "step": 2390 }, { "epoch": 0.49968652037617556, "grad_norm": 1.345634965004959, "learning_rate": 1.9851798434118294e-05, "loss": 0.2539, "step": 2391 }, { "epoch": 0.4998955067920585, "grad_norm": 1.3689793116423463, "learning_rate": 1.9851604870886302e-05, "loss": 0.2608, "step": 2392 }, { "epoch": 0.5001044932079415, "grad_norm": 1.3310533180749953, "learning_rate": 1.9851411182277067e-05, "loss": 0.25, "step": 2393 }, { "epoch": 0.5003134796238244, "grad_norm": 1.343259500555604, "learning_rate": 1.9851217368293053e-05, "loss": 0.2162, "step": 2394 }, { "epoch": 0.5005224660397074, "grad_norm": 1.206939012689504, "learning_rate": 1.9851023428936732e-05, "loss": 0.2483, "step": 2395 }, { "epoch": 0.5007314524555904, "grad_norm": 1.10107926471343, "learning_rate": 1.9850829364210566e-05, "loss": 0.2042, "step": 2396 }, { "epoch": 0.5009404388714733, "grad_norm": 1.47114144403134, "learning_rate": 1.9850635174117033e-05, "loss": 0.2414, "step": 2397 }, { "epoch": 0.5011494252873563, "grad_norm": 1.3408762190848416, "learning_rate": 1.9850440858658598e-05, "loss": 0.2421, "step": 2398 }, { "epoch": 0.5013584117032392, "grad_norm": 1.734779037693363, "learning_rate": 1.9850246417837735e-05, "loss": 0.2651, "step": 2399 }, { "epoch": 0.5015673981191222, "grad_norm": 1.370115956558023, "learning_rate": 1.9850051851656918e-05, "loss": 0.2459, "step": 2400 }, { "epoch": 0.5017763845350052, "grad_norm": 1.4426964776253672, "learning_rate": 1.984985716011863e-05, "loss": 0.2745, "step": 2401 }, { "epoch": 0.5019853709508882, "grad_norm": 2.5698535381689687, "learning_rate": 1.984966234322534e-05, "loss": 0.2597, "step": 2402 }, { "epoch": 0.5021943573667712, "grad_norm": 1.3780081857914135, "learning_rate": 1.984946740097953e-05, "loss": 0.2313, "step": 2403 }, { "epoch": 0.5024033437826542, "grad_norm": 1.1019378654504293, "learning_rate": 1.984927233338368e-05, "loss": 0.2547, "step": 2404 }, { "epoch": 0.5026123301985371, "grad_norm": 1.5058598602935245, "learning_rate": 1.984907714044028e-05, "loss": 0.2793, "step": 2405 }, { "epoch": 0.5028213166144201, "grad_norm": 1.5039721191196251, "learning_rate": 1.9848881822151802e-05, "loss": 0.2132, "step": 2406 }, { "epoch": 0.503030303030303, "grad_norm": 1.249090745400545, "learning_rate": 1.9848686378520742e-05, "loss": 0.2361, "step": 2407 }, { "epoch": 0.503239289446186, "grad_norm": 1.5231478488712253, "learning_rate": 1.9848490809549582e-05, "loss": 0.264, "step": 2408 }, { "epoch": 0.503448275862069, "grad_norm": 1.4066506600699742, "learning_rate": 1.984829511524081e-05, "loss": 0.2643, "step": 2409 }, { "epoch": 0.5036572622779519, "grad_norm": 1.2280737929758159, "learning_rate": 1.984809929559692e-05, "loss": 0.2198, "step": 2410 }, { "epoch": 0.5038662486938349, "grad_norm": 1.4032460950149095, "learning_rate": 1.9847903350620403e-05, "loss": 0.2448, "step": 2411 }, { "epoch": 0.5040752351097179, "grad_norm": 1.2691447055354586, "learning_rate": 1.9847707280313752e-05, "loss": 0.2604, "step": 2412 }, { "epoch": 0.5042842215256008, "grad_norm": 1.432661282533783, "learning_rate": 1.9847511084679465e-05, "loss": 0.2771, "step": 2413 }, { "epoch": 0.5044932079414838, "grad_norm": 1.2484474662590483, "learning_rate": 1.9847314763720037e-05, "loss": 0.2285, "step": 2414 }, { "epoch": 0.5047021943573667, "grad_norm": 1.8503806820402986, "learning_rate": 1.9847118317437965e-05, "loss": 0.252, "step": 2415 }, { "epoch": 0.5049111807732497, "grad_norm": 1.122529304577123, "learning_rate": 1.9846921745835748e-05, "loss": 0.2104, "step": 2416 }, { "epoch": 0.5051201671891327, "grad_norm": 1.3164864260577545, "learning_rate": 1.9846725048915894e-05, "loss": 0.2217, "step": 2417 }, { "epoch": 0.5053291536050156, "grad_norm": 1.2552466185780817, "learning_rate": 1.9846528226680898e-05, "loss": 0.2509, "step": 2418 }, { "epoch": 0.5055381400208987, "grad_norm": 1.5618912070555009, "learning_rate": 1.9846331279133272e-05, "loss": 0.2321, "step": 2419 }, { "epoch": 0.5057471264367817, "grad_norm": 1.2984308857082902, "learning_rate": 1.984613420627552e-05, "loss": 0.2343, "step": 2420 }, { "epoch": 0.5059561128526646, "grad_norm": 1.3404759099895711, "learning_rate": 1.984593700811015e-05, "loss": 0.2474, "step": 2421 }, { "epoch": 0.5061650992685476, "grad_norm": 1.487461099204324, "learning_rate": 1.984573968463967e-05, "loss": 0.2642, "step": 2422 }, { "epoch": 0.5063740856844305, "grad_norm": 1.366658822537365, "learning_rate": 1.9845542235866593e-05, "loss": 0.2623, "step": 2423 }, { "epoch": 0.5065830721003135, "grad_norm": 1.3343534513408837, "learning_rate": 1.984534466179343e-05, "loss": 0.2664, "step": 2424 }, { "epoch": 0.5067920585161965, "grad_norm": 1.35296983200502, "learning_rate": 1.98451469624227e-05, "loss": 0.2286, "step": 2425 }, { "epoch": 0.5070010449320794, "grad_norm": 1.47740687401501, "learning_rate": 1.9844949137756916e-05, "loss": 0.223, "step": 2426 }, { "epoch": 0.5072100313479624, "grad_norm": 1.2360492776110512, "learning_rate": 1.984475118779859e-05, "loss": 0.2602, "step": 2427 }, { "epoch": 0.5074190177638453, "grad_norm": 1.5060801497278267, "learning_rate": 1.9844553112550253e-05, "loss": 0.2656, "step": 2428 }, { "epoch": 0.5076280041797283, "grad_norm": 1.2523326343260157, "learning_rate": 1.9844354912014418e-05, "loss": 0.2207, "step": 2429 }, { "epoch": 0.5078369905956113, "grad_norm": 1.4891708943404849, "learning_rate": 1.984415658619361e-05, "loss": 0.2741, "step": 2430 }, { "epoch": 0.5080459770114942, "grad_norm": 1.4482065706437304, "learning_rate": 1.9843958135090347e-05, "loss": 0.2664, "step": 2431 }, { "epoch": 0.5082549634273772, "grad_norm": 1.3969950605309602, "learning_rate": 1.9843759558707166e-05, "loss": 0.2632, "step": 2432 }, { "epoch": 0.5084639498432602, "grad_norm": 1.35691994813017, "learning_rate": 1.9843560857046586e-05, "loss": 0.24, "step": 2433 }, { "epoch": 0.5086729362591431, "grad_norm": 1.527381831381052, "learning_rate": 1.9843362030111137e-05, "loss": 0.2587, "step": 2434 }, { "epoch": 0.5088819226750261, "grad_norm": 1.3183765411702073, "learning_rate": 1.984316307790335e-05, "loss": 0.2424, "step": 2435 }, { "epoch": 0.509090909090909, "grad_norm": 1.3776439652191967, "learning_rate": 1.9842964000425754e-05, "loss": 0.2438, "step": 2436 }, { "epoch": 0.5092998955067921, "grad_norm": 1.3365241927868314, "learning_rate": 1.984276479768089e-05, "loss": 0.234, "step": 2437 }, { "epoch": 0.5095088819226751, "grad_norm": 1.4005837221341866, "learning_rate": 1.9842565469671285e-05, "loss": 0.2525, "step": 2438 }, { "epoch": 0.509717868338558, "grad_norm": 1.5878078210162752, "learning_rate": 1.9842366016399485e-05, "loss": 0.2142, "step": 2439 }, { "epoch": 0.509926854754441, "grad_norm": 2.247404931975925, "learning_rate": 1.984216643786802e-05, "loss": 0.2714, "step": 2440 }, { "epoch": 0.510135841170324, "grad_norm": 1.2633537801323182, "learning_rate": 1.984196673407943e-05, "loss": 0.217, "step": 2441 }, { "epoch": 0.5103448275862069, "grad_norm": 1.4414612921863152, "learning_rate": 1.9841766905036263e-05, "loss": 0.248, "step": 2442 }, { "epoch": 0.5105538140020899, "grad_norm": 1.3944311289211884, "learning_rate": 1.9841566950741056e-05, "loss": 0.2249, "step": 2443 }, { "epoch": 0.5107628004179728, "grad_norm": 1.4575738304945038, "learning_rate": 1.9841366871196358e-05, "loss": 0.2364, "step": 2444 }, { "epoch": 0.5109717868338558, "grad_norm": 1.4565373735936618, "learning_rate": 1.9841166666404712e-05, "loss": 0.2322, "step": 2445 }, { "epoch": 0.5111807732497388, "grad_norm": 1.3806034495191977, "learning_rate": 1.984096633636867e-05, "loss": 0.277, "step": 2446 }, { "epoch": 0.5113897596656217, "grad_norm": 1.3029814772665895, "learning_rate": 1.984076588109078e-05, "loss": 0.2186, "step": 2447 }, { "epoch": 0.5115987460815047, "grad_norm": 1.4858712983446931, "learning_rate": 1.9840565300573587e-05, "loss": 0.2566, "step": 2448 }, { "epoch": 0.5118077324973876, "grad_norm": 1.2944496278978734, "learning_rate": 1.9840364594819653e-05, "loss": 0.2477, "step": 2449 }, { "epoch": 0.5120167189132706, "grad_norm": 1.2903376440962193, "learning_rate": 1.984016376383153e-05, "loss": 0.2387, "step": 2450 }, { "epoch": 0.5122257053291536, "grad_norm": 1.37817211802289, "learning_rate": 1.9839962807611765e-05, "loss": 0.2236, "step": 2451 }, { "epoch": 0.5124346917450365, "grad_norm": 1.1818014387733171, "learning_rate": 1.9839761726162927e-05, "loss": 0.2349, "step": 2452 }, { "epoch": 0.5126436781609195, "grad_norm": 1.7597507850463474, "learning_rate": 1.983956051948757e-05, "loss": 0.2397, "step": 2453 }, { "epoch": 0.5128526645768025, "grad_norm": 1.2302774082687162, "learning_rate": 1.9839359187588253e-05, "loss": 0.2467, "step": 2454 }, { "epoch": 0.5130616509926855, "grad_norm": 1.2522214622949306, "learning_rate": 1.9839157730467543e-05, "loss": 0.2317, "step": 2455 }, { "epoch": 0.5132706374085685, "grad_norm": 1.4238543483540969, "learning_rate": 1.9838956148128004e-05, "loss": 0.236, "step": 2456 }, { "epoch": 0.5134796238244514, "grad_norm": 1.392001767670686, "learning_rate": 1.9838754440572195e-05, "loss": 0.2489, "step": 2457 }, { "epoch": 0.5136886102403344, "grad_norm": 1.2319944753713883, "learning_rate": 1.9838552607802686e-05, "loss": 0.2434, "step": 2458 }, { "epoch": 0.5138975966562174, "grad_norm": 1.3087911592765769, "learning_rate": 1.983835064982205e-05, "loss": 0.2346, "step": 2459 }, { "epoch": 0.5141065830721003, "grad_norm": 1.3539270871552056, "learning_rate": 1.9838148566632855e-05, "loss": 0.2624, "step": 2460 }, { "epoch": 0.5143155694879833, "grad_norm": 1.3011490238806305, "learning_rate": 1.9837946358237668e-05, "loss": 0.2482, "step": 2461 }, { "epoch": 0.5145245559038663, "grad_norm": 1.4131832752543978, "learning_rate": 1.983774402463907e-05, "loss": 0.2528, "step": 2462 }, { "epoch": 0.5147335423197492, "grad_norm": 1.4444258926546458, "learning_rate": 1.9837541565839625e-05, "loss": 0.2397, "step": 2463 }, { "epoch": 0.5149425287356322, "grad_norm": 1.340446983187883, "learning_rate": 1.983733898184192e-05, "loss": 0.2253, "step": 2464 }, { "epoch": 0.5151515151515151, "grad_norm": 1.4215643674458502, "learning_rate": 1.983713627264853e-05, "loss": 0.2434, "step": 2465 }, { "epoch": 0.5153605015673981, "grad_norm": 1.5281374346301106, "learning_rate": 1.9836933438262037e-05, "loss": 0.2301, "step": 2466 }, { "epoch": 0.5155694879832811, "grad_norm": 1.8088153459532148, "learning_rate": 1.983673047868502e-05, "loss": 0.2352, "step": 2467 }, { "epoch": 0.515778474399164, "grad_norm": 1.3880217014895662, "learning_rate": 1.9836527393920058e-05, "loss": 0.2117, "step": 2468 }, { "epoch": 0.515987460815047, "grad_norm": 1.321886805926766, "learning_rate": 1.9836324183969743e-05, "loss": 0.2433, "step": 2469 }, { "epoch": 0.51619644723093, "grad_norm": 1.4106865541191418, "learning_rate": 1.9836120848836656e-05, "loss": 0.2461, "step": 2470 }, { "epoch": 0.5164054336468129, "grad_norm": 1.212103440674137, "learning_rate": 1.983591738852339e-05, "loss": 0.2184, "step": 2471 }, { "epoch": 0.516614420062696, "grad_norm": 1.2865016010722523, "learning_rate": 1.9835713803032526e-05, "loss": 0.2573, "step": 2472 }, { "epoch": 0.5168234064785789, "grad_norm": 1.3546065442009452, "learning_rate": 1.983551009236666e-05, "loss": 0.2744, "step": 2473 }, { "epoch": 0.5170323928944619, "grad_norm": 1.2613924911336525, "learning_rate": 1.9835306256528386e-05, "loss": 0.2335, "step": 2474 }, { "epoch": 0.5172413793103449, "grad_norm": 1.4886384380792608, "learning_rate": 1.9835102295520296e-05, "loss": 0.2552, "step": 2475 }, { "epoch": 0.5174503657262278, "grad_norm": 1.275544982653802, "learning_rate": 1.983489820934499e-05, "loss": 0.2256, "step": 2476 }, { "epoch": 0.5176593521421108, "grad_norm": 1.6528243941277125, "learning_rate": 1.9834693998005056e-05, "loss": 0.2601, "step": 2477 }, { "epoch": 0.5178683385579937, "grad_norm": 1.383894121940878, "learning_rate": 1.9834489661503104e-05, "loss": 0.2658, "step": 2478 }, { "epoch": 0.5180773249738767, "grad_norm": 1.5011449455428703, "learning_rate": 1.9834285199841723e-05, "loss": 0.2457, "step": 2479 }, { "epoch": 0.5182863113897597, "grad_norm": 1.1659060850468483, "learning_rate": 1.9834080613023523e-05, "loss": 0.2148, "step": 2480 }, { "epoch": 0.5184952978056426, "grad_norm": 1.5734088058041804, "learning_rate": 1.9833875901051106e-05, "loss": 0.2565, "step": 2481 }, { "epoch": 0.5187042842215256, "grad_norm": 1.2022302625023464, "learning_rate": 1.983367106392708e-05, "loss": 0.2358, "step": 2482 }, { "epoch": 0.5189132706374086, "grad_norm": 1.3683778148005128, "learning_rate": 1.9833466101654046e-05, "loss": 0.2639, "step": 2483 }, { "epoch": 0.5191222570532915, "grad_norm": 1.394588030021524, "learning_rate": 1.9833261014234614e-05, "loss": 0.2463, "step": 2484 }, { "epoch": 0.5193312434691745, "grad_norm": 1.1089717574739932, "learning_rate": 1.98330558016714e-05, "loss": 0.2449, "step": 2485 }, { "epoch": 0.5195402298850574, "grad_norm": 1.2879368832356806, "learning_rate": 1.9832850463967007e-05, "loss": 0.2714, "step": 2486 }, { "epoch": 0.5197492163009404, "grad_norm": 1.462125384375962, "learning_rate": 1.9832645001124054e-05, "loss": 0.2392, "step": 2487 }, { "epoch": 0.5199582027168234, "grad_norm": 1.4529770945510554, "learning_rate": 1.983243941314515e-05, "loss": 0.2328, "step": 2488 }, { "epoch": 0.5201671891327063, "grad_norm": 1.3907757673434549, "learning_rate": 1.9832233700032924e-05, "loss": 0.2498, "step": 2489 }, { "epoch": 0.5203761755485894, "grad_norm": 1.1201598556968753, "learning_rate": 1.983202786178998e-05, "loss": 0.2265, "step": 2490 }, { "epoch": 0.5205851619644724, "grad_norm": 1.4131971033368302, "learning_rate": 1.9831821898418943e-05, "loss": 0.269, "step": 2491 }, { "epoch": 0.5207941483803553, "grad_norm": 1.384376682808349, "learning_rate": 1.9831615809922438e-05, "loss": 0.2538, "step": 2492 }, { "epoch": 0.5210031347962383, "grad_norm": 1.2713135827097748, "learning_rate": 1.983140959630308e-05, "loss": 0.2498, "step": 2493 }, { "epoch": 0.5212121212121212, "grad_norm": 1.3518633081785159, "learning_rate": 1.98312032575635e-05, "loss": 0.2184, "step": 2494 }, { "epoch": 0.5214211076280042, "grad_norm": 1.4249395925043886, "learning_rate": 1.983099679370632e-05, "loss": 0.2361, "step": 2495 }, { "epoch": 0.5216300940438872, "grad_norm": 1.3893217202181734, "learning_rate": 1.9830790204734175e-05, "loss": 0.2349, "step": 2496 }, { "epoch": 0.5218390804597701, "grad_norm": 1.4973983445716939, "learning_rate": 1.9830583490649682e-05, "loss": 0.2047, "step": 2497 }, { "epoch": 0.5220480668756531, "grad_norm": 1.2700443713761687, "learning_rate": 1.983037665145548e-05, "loss": 0.267, "step": 2498 }, { "epoch": 0.522257053291536, "grad_norm": 1.5339163822287563, "learning_rate": 1.9830169687154198e-05, "loss": 0.2191, "step": 2499 }, { "epoch": 0.522466039707419, "grad_norm": 1.2988463262090257, "learning_rate": 1.9829962597748475e-05, "loss": 0.2731, "step": 2500 }, { "epoch": 0.522675026123302, "grad_norm": 1.4182807006891522, "learning_rate": 1.982975538324094e-05, "loss": 0.2464, "step": 2501 }, { "epoch": 0.5228840125391849, "grad_norm": 1.3121928045204618, "learning_rate": 1.9829548043634233e-05, "loss": 0.209, "step": 2502 }, { "epoch": 0.5230929989550679, "grad_norm": 1.255685775776615, "learning_rate": 1.9829340578930996e-05, "loss": 0.2676, "step": 2503 }, { "epoch": 0.5233019853709509, "grad_norm": 1.3661513565580194, "learning_rate": 1.9829132989133865e-05, "loss": 0.2494, "step": 2504 }, { "epoch": 0.5235109717868338, "grad_norm": 1.144358376352641, "learning_rate": 1.982892527424548e-05, "loss": 0.2413, "step": 2505 }, { "epoch": 0.5237199582027168, "grad_norm": 1.0689665481723876, "learning_rate": 1.9828717434268488e-05, "loss": 0.2423, "step": 2506 }, { "epoch": 0.5239289446185998, "grad_norm": 1.669398256776815, "learning_rate": 1.9828509469205535e-05, "loss": 0.2574, "step": 2507 }, { "epoch": 0.5241379310344828, "grad_norm": 1.4907900019976073, "learning_rate": 1.9828301379059272e-05, "loss": 0.2569, "step": 2508 }, { "epoch": 0.5243469174503658, "grad_norm": 1.298560550108581, "learning_rate": 1.9828093163832335e-05, "loss": 0.2422, "step": 2509 }, { "epoch": 0.5245559038662487, "grad_norm": 1.2574491011492854, "learning_rate": 1.982788482352738e-05, "loss": 0.2317, "step": 2510 }, { "epoch": 0.5247648902821317, "grad_norm": 1.2485906320909146, "learning_rate": 1.9827676358147063e-05, "loss": 0.2254, "step": 2511 }, { "epoch": 0.5249738766980147, "grad_norm": 1.620651717884978, "learning_rate": 1.982746776769403e-05, "loss": 0.267, "step": 2512 }, { "epoch": 0.5251828631138976, "grad_norm": 1.5262269186223911, "learning_rate": 1.982725905217094e-05, "loss": 0.2512, "step": 2513 }, { "epoch": 0.5253918495297806, "grad_norm": 1.1236891893758465, "learning_rate": 1.9827050211580446e-05, "loss": 0.2247, "step": 2514 }, { "epoch": 0.5256008359456635, "grad_norm": 1.2483286371589664, "learning_rate": 1.982684124592521e-05, "loss": 0.2455, "step": 2515 }, { "epoch": 0.5258098223615465, "grad_norm": 1.625070875059424, "learning_rate": 1.982663215520789e-05, "loss": 0.2601, "step": 2516 }, { "epoch": 0.5260188087774295, "grad_norm": 1.5747442249928, "learning_rate": 1.9826422939431144e-05, "loss": 0.2212, "step": 2517 }, { "epoch": 0.5262277951933124, "grad_norm": 1.4084939592660939, "learning_rate": 1.9826213598597637e-05, "loss": 0.2316, "step": 2518 }, { "epoch": 0.5264367816091954, "grad_norm": 1.3643866503785214, "learning_rate": 1.9826004132710033e-05, "loss": 0.2562, "step": 2519 }, { "epoch": 0.5266457680250783, "grad_norm": 1.2308466793110602, "learning_rate": 1.9825794541771e-05, "loss": 0.2464, "step": 2520 }, { "epoch": 0.5268547544409613, "grad_norm": 1.333018565727699, "learning_rate": 1.98255848257832e-05, "loss": 0.2074, "step": 2521 }, { "epoch": 0.5270637408568443, "grad_norm": 1.7552734111457866, "learning_rate": 1.9825374984749307e-05, "loss": 0.2693, "step": 2522 }, { "epoch": 0.5272727272727272, "grad_norm": 1.3895601929088206, "learning_rate": 1.9825165018671986e-05, "loss": 0.2402, "step": 2523 }, { "epoch": 0.5274817136886102, "grad_norm": 1.249200500658145, "learning_rate": 1.9824954927553917e-05, "loss": 0.2612, "step": 2524 }, { "epoch": 0.5276907001044933, "grad_norm": 1.4088248922560376, "learning_rate": 1.9824744711397765e-05, "loss": 0.2756, "step": 2525 }, { "epoch": 0.5278996865203762, "grad_norm": 1.4134698524007838, "learning_rate": 1.9824534370206215e-05, "loss": 0.2348, "step": 2526 }, { "epoch": 0.5281086729362592, "grad_norm": 1.1922343218713962, "learning_rate": 1.9824323903981936e-05, "loss": 0.2462, "step": 2527 }, { "epoch": 0.5283176593521421, "grad_norm": 1.3856927465087077, "learning_rate": 1.9824113312727608e-05, "loss": 0.2466, "step": 2528 }, { "epoch": 0.5285266457680251, "grad_norm": 1.3443718637470483, "learning_rate": 1.9823902596445918e-05, "loss": 0.2608, "step": 2529 }, { "epoch": 0.5287356321839081, "grad_norm": 1.3920597192251376, "learning_rate": 1.9823691755139535e-05, "loss": 0.2831, "step": 2530 }, { "epoch": 0.528944618599791, "grad_norm": 1.438146577295903, "learning_rate": 1.9823480788811155e-05, "loss": 0.2259, "step": 2531 }, { "epoch": 0.529153605015674, "grad_norm": 1.3482097373604134, "learning_rate": 1.9823269697463455e-05, "loss": 0.2627, "step": 2532 }, { "epoch": 0.529362591431557, "grad_norm": 1.3955671468125923, "learning_rate": 1.9823058481099124e-05, "loss": 0.2471, "step": 2533 }, { "epoch": 0.5295715778474399, "grad_norm": 1.4531205422269045, "learning_rate": 1.9822847139720847e-05, "loss": 0.2715, "step": 2534 }, { "epoch": 0.5297805642633229, "grad_norm": 1.4623803830912718, "learning_rate": 1.982263567333132e-05, "loss": 0.238, "step": 2535 }, { "epoch": 0.5299895506792058, "grad_norm": 1.61045771200182, "learning_rate": 1.982242408193323e-05, "loss": 0.2697, "step": 2536 }, { "epoch": 0.5301985370950888, "grad_norm": 1.1366829504518323, "learning_rate": 1.982221236552927e-05, "loss": 0.2278, "step": 2537 }, { "epoch": 0.5304075235109718, "grad_norm": 1.4273079080084936, "learning_rate": 1.9822000524122135e-05, "loss": 0.2574, "step": 2538 }, { "epoch": 0.5306165099268547, "grad_norm": 1.3005944446335376, "learning_rate": 1.982178855771452e-05, "loss": 0.2082, "step": 2539 }, { "epoch": 0.5308254963427377, "grad_norm": 1.3869414885073696, "learning_rate": 1.9821576466309125e-05, "loss": 0.2432, "step": 2540 }, { "epoch": 0.5310344827586206, "grad_norm": 1.3304085435661446, "learning_rate": 1.982136424990865e-05, "loss": 0.2636, "step": 2541 }, { "epoch": 0.5312434691745036, "grad_norm": 1.5385300201374201, "learning_rate": 1.982115190851579e-05, "loss": 0.2544, "step": 2542 }, { "epoch": 0.5314524555903867, "grad_norm": 1.210030773692958, "learning_rate": 1.982093944213325e-05, "loss": 0.2105, "step": 2543 }, { "epoch": 0.5316614420062696, "grad_norm": 1.3086115272020808, "learning_rate": 1.9820726850763737e-05, "loss": 0.2589, "step": 2544 }, { "epoch": 0.5318704284221526, "grad_norm": 1.5819400982288643, "learning_rate": 1.9820514134409956e-05, "loss": 0.2616, "step": 2545 }, { "epoch": 0.5320794148380356, "grad_norm": 1.2173430854246683, "learning_rate": 1.982030129307461e-05, "loss": 0.2111, "step": 2546 }, { "epoch": 0.5322884012539185, "grad_norm": 1.4457601400136235, "learning_rate": 1.9820088326760412e-05, "loss": 0.281, "step": 2547 }, { "epoch": 0.5324973876698015, "grad_norm": 1.329821569709998, "learning_rate": 1.981987523547007e-05, "loss": 0.265, "step": 2548 }, { "epoch": 0.5327063740856844, "grad_norm": 1.3110797720712999, "learning_rate": 1.9819662019206295e-05, "loss": 0.2623, "step": 2549 }, { "epoch": 0.5329153605015674, "grad_norm": 1.3410540053012856, "learning_rate": 1.9819448677971805e-05, "loss": 0.2525, "step": 2550 }, { "epoch": 0.5331243469174504, "grad_norm": 1.2560884298669026, "learning_rate": 1.981923521176931e-05, "loss": 0.1973, "step": 2551 }, { "epoch": 0.5333333333333333, "grad_norm": 1.264043526337371, "learning_rate": 1.9819021620601527e-05, "loss": 0.2181, "step": 2552 }, { "epoch": 0.5335423197492163, "grad_norm": 1.636290895749969, "learning_rate": 1.981880790447118e-05, "loss": 0.2508, "step": 2553 }, { "epoch": 0.5337513061650992, "grad_norm": 1.3182633112220508, "learning_rate": 1.9818594063380986e-05, "loss": 0.24, "step": 2554 }, { "epoch": 0.5339602925809822, "grad_norm": 1.6326880946655782, "learning_rate": 1.9818380097333666e-05, "loss": 0.2981, "step": 2555 }, { "epoch": 0.5341692789968652, "grad_norm": 1.4354831548057727, "learning_rate": 1.9818166006331937e-05, "loss": 0.2201, "step": 2556 }, { "epoch": 0.5343782654127481, "grad_norm": 1.2970837566618951, "learning_rate": 1.9817951790378534e-05, "loss": 0.2548, "step": 2557 }, { "epoch": 0.5345872518286311, "grad_norm": 1.3564901375095844, "learning_rate": 1.9817737449476176e-05, "loss": 0.2313, "step": 2558 }, { "epoch": 0.534796238244514, "grad_norm": 1.328229315613652, "learning_rate": 1.9817522983627596e-05, "loss": 0.2661, "step": 2559 }, { "epoch": 0.5350052246603971, "grad_norm": 1.4474157457336552, "learning_rate": 1.981730839283552e-05, "loss": 0.2553, "step": 2560 }, { "epoch": 0.5352142110762801, "grad_norm": 1.2658756873209673, "learning_rate": 1.9817093677102676e-05, "loss": 0.2434, "step": 2561 }, { "epoch": 0.535423197492163, "grad_norm": 1.4018473956219069, "learning_rate": 1.9816878836431805e-05, "loss": 0.2506, "step": 2562 }, { "epoch": 0.535632183908046, "grad_norm": 1.8607883640867438, "learning_rate": 1.9816663870825633e-05, "loss": 0.2409, "step": 2563 }, { "epoch": 0.535841170323929, "grad_norm": 1.39987280328801, "learning_rate": 1.98164487802869e-05, "loss": 0.2678, "step": 2564 }, { "epoch": 0.5360501567398119, "grad_norm": 1.314021694113564, "learning_rate": 1.9816233564818346e-05, "loss": 0.2437, "step": 2565 }, { "epoch": 0.5362591431556949, "grad_norm": 1.147904102321581, "learning_rate": 1.9816018224422705e-05, "loss": 0.2542, "step": 2566 }, { "epoch": 0.5364681295715779, "grad_norm": 1.2790897581424072, "learning_rate": 1.9815802759102718e-05, "loss": 0.2222, "step": 2567 }, { "epoch": 0.5366771159874608, "grad_norm": 1.472107195333547, "learning_rate": 1.981558716886113e-05, "loss": 0.2625, "step": 2568 }, { "epoch": 0.5368861024033438, "grad_norm": 1.3444734132049017, "learning_rate": 1.981537145370068e-05, "loss": 0.2281, "step": 2569 }, { "epoch": 0.5370950888192267, "grad_norm": 1.1913884411227584, "learning_rate": 1.9815155613624118e-05, "loss": 0.2464, "step": 2570 }, { "epoch": 0.5373040752351097, "grad_norm": 1.496056136597056, "learning_rate": 1.981493964863419e-05, "loss": 0.2552, "step": 2571 }, { "epoch": 0.5375130616509927, "grad_norm": 1.2728501611148189, "learning_rate": 1.9814723558733644e-05, "loss": 0.2617, "step": 2572 }, { "epoch": 0.5377220480668756, "grad_norm": 1.1622306710115147, "learning_rate": 1.981450734392523e-05, "loss": 0.2297, "step": 2573 }, { "epoch": 0.5379310344827586, "grad_norm": 1.5958666101132475, "learning_rate": 1.9814291004211695e-05, "loss": 0.2652, "step": 2574 }, { "epoch": 0.5381400208986415, "grad_norm": 1.7956020877325574, "learning_rate": 1.9814074539595803e-05, "loss": 0.2507, "step": 2575 }, { "epoch": 0.5383490073145245, "grad_norm": 1.4218298510911476, "learning_rate": 1.98138579500803e-05, "loss": 0.2583, "step": 2576 }, { "epoch": 0.5385579937304075, "grad_norm": 1.4310809583559805, "learning_rate": 1.9813641235667946e-05, "loss": 0.253, "step": 2577 }, { "epoch": 0.5387669801462905, "grad_norm": 1.77556614693569, "learning_rate": 1.9813424396361497e-05, "loss": 0.2418, "step": 2578 }, { "epoch": 0.5389759665621735, "grad_norm": 1.5419590002736194, "learning_rate": 1.9813207432163716e-05, "loss": 0.2536, "step": 2579 }, { "epoch": 0.5391849529780565, "grad_norm": 1.3108124058675783, "learning_rate": 1.9812990343077357e-05, "loss": 0.236, "step": 2580 }, { "epoch": 0.5393939393939394, "grad_norm": 1.326989873106318, "learning_rate": 1.9812773129105193e-05, "loss": 0.2249, "step": 2581 }, { "epoch": 0.5396029258098224, "grad_norm": 1.4827288958972125, "learning_rate": 1.981255579024998e-05, "loss": 0.2207, "step": 2582 }, { "epoch": 0.5398119122257053, "grad_norm": 1.420326447911585, "learning_rate": 1.9812338326514488e-05, "loss": 0.2277, "step": 2583 }, { "epoch": 0.5400208986415883, "grad_norm": 1.4366495769236352, "learning_rate": 1.9812120737901484e-05, "loss": 0.2349, "step": 2584 }, { "epoch": 0.5402298850574713, "grad_norm": 1.3945961377948743, "learning_rate": 1.9811903024413737e-05, "loss": 0.2655, "step": 2585 }, { "epoch": 0.5404388714733542, "grad_norm": 1.1553805057657947, "learning_rate": 1.9811685186054017e-05, "loss": 0.2709, "step": 2586 }, { "epoch": 0.5406478578892372, "grad_norm": 1.360823235397162, "learning_rate": 1.98114672228251e-05, "loss": 0.2313, "step": 2587 }, { "epoch": 0.5408568443051202, "grad_norm": 1.23189223509121, "learning_rate": 1.981124913472975e-05, "loss": 0.244, "step": 2588 }, { "epoch": 0.5410658307210031, "grad_norm": 1.4025917709860733, "learning_rate": 1.9811030921770756e-05, "loss": 0.2589, "step": 2589 }, { "epoch": 0.5412748171368861, "grad_norm": 1.1404261106000948, "learning_rate": 1.9810812583950883e-05, "loss": 0.2362, "step": 2590 }, { "epoch": 0.541483803552769, "grad_norm": 1.2489482124081848, "learning_rate": 1.9810594121272922e-05, "loss": 0.2196, "step": 2591 }, { "epoch": 0.541692789968652, "grad_norm": 1.3297101486239344, "learning_rate": 1.9810375533739642e-05, "loss": 0.2413, "step": 2592 }, { "epoch": 0.541901776384535, "grad_norm": 1.3584678983103329, "learning_rate": 1.9810156821353827e-05, "loss": 0.2607, "step": 2593 }, { "epoch": 0.5421107628004179, "grad_norm": 1.4787134980253074, "learning_rate": 1.9809937984118267e-05, "loss": 0.2543, "step": 2594 }, { "epoch": 0.542319749216301, "grad_norm": 2.785552641405152, "learning_rate": 1.980971902203574e-05, "loss": 0.2319, "step": 2595 }, { "epoch": 0.542528735632184, "grad_norm": 1.3973617643484169, "learning_rate": 1.9809499935109037e-05, "loss": 0.2383, "step": 2596 }, { "epoch": 0.5427377220480669, "grad_norm": 1.4403556027897086, "learning_rate": 1.980928072334095e-05, "loss": 0.2456, "step": 2597 }, { "epoch": 0.5429467084639499, "grad_norm": 1.311913380929291, "learning_rate": 1.9809061386734254e-05, "loss": 0.2144, "step": 2598 }, { "epoch": 0.5431556948798328, "grad_norm": 1.4082393679943233, "learning_rate": 1.9808841925291756e-05, "loss": 0.2373, "step": 2599 }, { "epoch": 0.5433646812957158, "grad_norm": 1.5880163518407548, "learning_rate": 1.9808622339016242e-05, "loss": 0.2668, "step": 2600 }, { "epoch": 0.5435736677115988, "grad_norm": 1.7809010125497131, "learning_rate": 1.9808402627910505e-05, "loss": 0.258, "step": 2601 }, { "epoch": 0.5437826541274817, "grad_norm": 1.2413513587482874, "learning_rate": 1.9808182791977346e-05, "loss": 0.2369, "step": 2602 }, { "epoch": 0.5439916405433647, "grad_norm": 1.5500324171577808, "learning_rate": 1.980796283121956e-05, "loss": 0.2333, "step": 2603 }, { "epoch": 0.5442006269592476, "grad_norm": 1.3637547665166363, "learning_rate": 1.9807742745639947e-05, "loss": 0.2575, "step": 2604 }, { "epoch": 0.5444096133751306, "grad_norm": 1.7340571852278228, "learning_rate": 1.9807522535241305e-05, "loss": 0.262, "step": 2605 }, { "epoch": 0.5446185997910136, "grad_norm": 1.225281041121287, "learning_rate": 1.9807302200026442e-05, "loss": 0.2124, "step": 2606 }, { "epoch": 0.5448275862068965, "grad_norm": 1.4169993281337516, "learning_rate": 1.980708173999816e-05, "loss": 0.2524, "step": 2607 }, { "epoch": 0.5450365726227795, "grad_norm": 1.7560236996633944, "learning_rate": 1.9806861155159263e-05, "loss": 0.2618, "step": 2608 }, { "epoch": 0.5452455590386625, "grad_norm": 1.2677900014526808, "learning_rate": 1.9806640445512558e-05, "loss": 0.2053, "step": 2609 }, { "epoch": 0.5454545454545454, "grad_norm": 1.761466766340318, "learning_rate": 1.980641961106086e-05, "loss": 0.2619, "step": 2610 }, { "epoch": 0.5456635318704284, "grad_norm": 1.4492319744885538, "learning_rate": 1.980619865180697e-05, "loss": 0.2576, "step": 2611 }, { "epoch": 0.5458725182863113, "grad_norm": 1.3901513763552775, "learning_rate": 1.9805977567753706e-05, "loss": 0.2004, "step": 2612 }, { "epoch": 0.5460815047021944, "grad_norm": 1.6689546907816717, "learning_rate": 1.980575635890388e-05, "loss": 0.242, "step": 2613 }, { "epoch": 0.5462904911180774, "grad_norm": 1.3661555167086263, "learning_rate": 1.9805535025260306e-05, "loss": 0.2385, "step": 2614 }, { "epoch": 0.5464994775339603, "grad_norm": 1.2018082825005947, "learning_rate": 1.9805313566825806e-05, "loss": 0.229, "step": 2615 }, { "epoch": 0.5467084639498433, "grad_norm": 1.4254931943396707, "learning_rate": 1.9805091983603196e-05, "loss": 0.2353, "step": 2616 }, { "epoch": 0.5469174503657263, "grad_norm": 1.4122777198644108, "learning_rate": 1.980487027559529e-05, "loss": 0.2878, "step": 2617 }, { "epoch": 0.5471264367816092, "grad_norm": 1.3692924939830982, "learning_rate": 1.980464844280492e-05, "loss": 0.2508, "step": 2618 }, { "epoch": 0.5473354231974922, "grad_norm": 1.5621138047839336, "learning_rate": 1.9804426485234897e-05, "loss": 0.2465, "step": 2619 }, { "epoch": 0.5475444096133751, "grad_norm": 1.3845313100619185, "learning_rate": 1.9804204402888057e-05, "loss": 0.2565, "step": 2620 }, { "epoch": 0.5477533960292581, "grad_norm": 1.271213395939802, "learning_rate": 1.980398219576722e-05, "loss": 0.2388, "step": 2621 }, { "epoch": 0.5479623824451411, "grad_norm": 1.3493082604785527, "learning_rate": 1.9803759863875214e-05, "loss": 0.2347, "step": 2622 }, { "epoch": 0.548171368861024, "grad_norm": 1.3782411881209835, "learning_rate": 1.9803537407214873e-05, "loss": 0.2294, "step": 2623 }, { "epoch": 0.548380355276907, "grad_norm": 1.4625433221759467, "learning_rate": 1.9803314825789028e-05, "loss": 0.2231, "step": 2624 }, { "epoch": 0.54858934169279, "grad_norm": 1.393841671056424, "learning_rate": 1.9803092119600505e-05, "loss": 0.2495, "step": 2625 }, { "epoch": 0.5487983281086729, "grad_norm": 1.2694851915702305, "learning_rate": 1.9802869288652144e-05, "loss": 0.2491, "step": 2626 }, { "epoch": 0.5490073145245559, "grad_norm": 1.327412755289757, "learning_rate": 1.980264633294678e-05, "loss": 0.2316, "step": 2627 }, { "epoch": 0.5492163009404388, "grad_norm": 1.5208268933618492, "learning_rate": 1.9802423252487245e-05, "loss": 0.2405, "step": 2628 }, { "epoch": 0.5494252873563218, "grad_norm": 1.454886757418215, "learning_rate": 1.9802200047276387e-05, "loss": 0.2068, "step": 2629 }, { "epoch": 0.5496342737722049, "grad_norm": 1.1680001689835768, "learning_rate": 1.9801976717317036e-05, "loss": 0.2377, "step": 2630 }, { "epoch": 0.5498432601880878, "grad_norm": 1.2182502424353037, "learning_rate": 1.9801753262612047e-05, "loss": 0.2455, "step": 2631 }, { "epoch": 0.5500522466039708, "grad_norm": 1.2014169406796593, "learning_rate": 1.980152968316426e-05, "loss": 0.2152, "step": 2632 }, { "epoch": 0.5502612330198537, "grad_norm": 1.8451775468128542, "learning_rate": 1.980130597897651e-05, "loss": 0.2809, "step": 2633 }, { "epoch": 0.5504702194357367, "grad_norm": 1.3720898667024044, "learning_rate": 1.9801082150051653e-05, "loss": 0.214, "step": 2634 }, { "epoch": 0.5506792058516197, "grad_norm": 1.3853447157984717, "learning_rate": 1.980085819639254e-05, "loss": 0.2468, "step": 2635 }, { "epoch": 0.5508881922675026, "grad_norm": 1.3820692894306694, "learning_rate": 1.9800634118002014e-05, "loss": 0.23, "step": 2636 }, { "epoch": 0.5510971786833856, "grad_norm": 1.5452031906553687, "learning_rate": 1.9800409914882938e-05, "loss": 0.2664, "step": 2637 }, { "epoch": 0.5513061650992686, "grad_norm": 1.4763906580338013, "learning_rate": 1.980018558703815e-05, "loss": 0.2116, "step": 2638 }, { "epoch": 0.5515151515151515, "grad_norm": 1.3767414595187626, "learning_rate": 1.9799961134470512e-05, "loss": 0.2131, "step": 2639 }, { "epoch": 0.5517241379310345, "grad_norm": 1.3783776986254892, "learning_rate": 1.9799736557182885e-05, "loss": 0.2023, "step": 2640 }, { "epoch": 0.5519331243469174, "grad_norm": 1.3672691347924792, "learning_rate": 1.979951185517812e-05, "loss": 0.2465, "step": 2641 }, { "epoch": 0.5521421107628004, "grad_norm": 1.3917883845664578, "learning_rate": 1.9799287028459084e-05, "loss": 0.2128, "step": 2642 }, { "epoch": 0.5523510971786834, "grad_norm": 1.249898733269028, "learning_rate": 1.979906207702863e-05, "loss": 0.226, "step": 2643 }, { "epoch": 0.5525600835945663, "grad_norm": 1.1761900902347466, "learning_rate": 1.9798837000889627e-05, "loss": 0.2451, "step": 2644 }, { "epoch": 0.5527690700104493, "grad_norm": 1.8970460994681901, "learning_rate": 1.9798611800044937e-05, "loss": 0.2407, "step": 2645 }, { "epoch": 0.5529780564263322, "grad_norm": 1.3792108964613627, "learning_rate": 1.9798386474497427e-05, "loss": 0.2519, "step": 2646 }, { "epoch": 0.5531870428422152, "grad_norm": 1.3358541408093247, "learning_rate": 1.9798161024249963e-05, "loss": 0.2212, "step": 2647 }, { "epoch": 0.5533960292580983, "grad_norm": 1.3855373268567306, "learning_rate": 1.9797935449305416e-05, "loss": 0.2101, "step": 2648 }, { "epoch": 0.5536050156739812, "grad_norm": 1.4677847325709106, "learning_rate": 1.9797709749666655e-05, "loss": 0.2345, "step": 2649 }, { "epoch": 0.5538140020898642, "grad_norm": 1.1434058980777755, "learning_rate": 1.9797483925336554e-05, "loss": 0.2308, "step": 2650 }, { "epoch": 0.5540229885057472, "grad_norm": 1.5440644401600305, "learning_rate": 1.9797257976317988e-05, "loss": 0.2361, "step": 2651 }, { "epoch": 0.5542319749216301, "grad_norm": 1.110498999715669, "learning_rate": 1.9797031902613828e-05, "loss": 0.233, "step": 2652 }, { "epoch": 0.5544409613375131, "grad_norm": 1.159933451097846, "learning_rate": 1.9796805704226957e-05, "loss": 0.225, "step": 2653 }, { "epoch": 0.554649947753396, "grad_norm": 1.4647899258809027, "learning_rate": 1.9796579381160248e-05, "loss": 0.173, "step": 2654 }, { "epoch": 0.554858934169279, "grad_norm": 1.3534970269267486, "learning_rate": 1.9796352933416587e-05, "loss": 0.2601, "step": 2655 }, { "epoch": 0.555067920585162, "grad_norm": 1.6928539031314205, "learning_rate": 1.9796126360998852e-05, "loss": 0.2278, "step": 2656 }, { "epoch": 0.5552769070010449, "grad_norm": 1.2292164252111735, "learning_rate": 1.9795899663909925e-05, "loss": 0.2682, "step": 2657 }, { "epoch": 0.5554858934169279, "grad_norm": 1.5026365266092077, "learning_rate": 1.9795672842152697e-05, "loss": 0.2152, "step": 2658 }, { "epoch": 0.5556948798328109, "grad_norm": 1.2176790799946315, "learning_rate": 1.979544589573005e-05, "loss": 0.2599, "step": 2659 }, { "epoch": 0.5559038662486938, "grad_norm": 1.3264359676447008, "learning_rate": 1.9795218824644876e-05, "loss": 0.2196, "step": 2660 }, { "epoch": 0.5561128526645768, "grad_norm": 1.3614293106308557, "learning_rate": 1.9794991628900057e-05, "loss": 0.2383, "step": 2661 }, { "epoch": 0.5563218390804597, "grad_norm": 1.368111780754973, "learning_rate": 1.9794764308498494e-05, "loss": 0.2084, "step": 2662 }, { "epoch": 0.5565308254963427, "grad_norm": 1.3068431004258385, "learning_rate": 1.9794536863443076e-05, "loss": 0.2411, "step": 2663 }, { "epoch": 0.5567398119122257, "grad_norm": 1.362969764527856, "learning_rate": 1.9794309293736697e-05, "loss": 0.266, "step": 2664 }, { "epoch": 0.5569487983281086, "grad_norm": 1.4590663047252672, "learning_rate": 1.9794081599382254e-05, "loss": 0.2389, "step": 2665 }, { "epoch": 0.5571577847439917, "grad_norm": 1.6986679742780177, "learning_rate": 1.979385378038264e-05, "loss": 0.2175, "step": 2666 }, { "epoch": 0.5573667711598747, "grad_norm": 1.3341180012633425, "learning_rate": 1.9793625836740763e-05, "loss": 0.2173, "step": 2667 }, { "epoch": 0.5575757575757576, "grad_norm": 1.4106907859396014, "learning_rate": 1.9793397768459523e-05, "loss": 0.2295, "step": 2668 }, { "epoch": 0.5577847439916406, "grad_norm": 1.4182398230306366, "learning_rate": 1.9793169575541812e-05, "loss": 0.2338, "step": 2669 }, { "epoch": 0.5579937304075235, "grad_norm": 1.7433560033243705, "learning_rate": 1.979294125799055e-05, "loss": 0.2505, "step": 2670 }, { "epoch": 0.5582027168234065, "grad_norm": 1.5947039725513235, "learning_rate": 1.9792712815808627e-05, "loss": 0.2458, "step": 2671 }, { "epoch": 0.5584117032392895, "grad_norm": 1.5477803782925637, "learning_rate": 1.9792484248998955e-05, "loss": 0.1952, "step": 2672 }, { "epoch": 0.5586206896551724, "grad_norm": 1.1241902032762048, "learning_rate": 1.979225555756445e-05, "loss": 0.2346, "step": 2673 }, { "epoch": 0.5588296760710554, "grad_norm": 1.2292400511748016, "learning_rate": 1.9792026741508018e-05, "loss": 0.2313, "step": 2674 }, { "epoch": 0.5590386624869383, "grad_norm": 1.4390489087935538, "learning_rate": 1.979179780083257e-05, "loss": 0.2147, "step": 2675 }, { "epoch": 0.5592476489028213, "grad_norm": 1.2855834385589087, "learning_rate": 1.9791568735541017e-05, "loss": 0.2474, "step": 2676 }, { "epoch": 0.5594566353187043, "grad_norm": 1.610700896500035, "learning_rate": 1.9791339545636285e-05, "loss": 0.2666, "step": 2677 }, { "epoch": 0.5596656217345872, "grad_norm": 1.6729862324697646, "learning_rate": 1.9791110231121276e-05, "loss": 0.2141, "step": 2678 }, { "epoch": 0.5598746081504702, "grad_norm": 1.468883733761969, "learning_rate": 1.9790880791998922e-05, "loss": 0.206, "step": 2679 }, { "epoch": 0.5600835945663531, "grad_norm": 1.4427032969898579, "learning_rate": 1.979065122827213e-05, "loss": 0.234, "step": 2680 }, { "epoch": 0.5602925809822361, "grad_norm": 1.6874361205405242, "learning_rate": 1.9790421539943832e-05, "loss": 0.2539, "step": 2681 }, { "epoch": 0.5605015673981191, "grad_norm": 1.4173634177765297, "learning_rate": 1.9790191727016948e-05, "loss": 0.22, "step": 2682 }, { "epoch": 0.5607105538140021, "grad_norm": 1.2514397645674409, "learning_rate": 1.97899617894944e-05, "loss": 0.2116, "step": 2683 }, { "epoch": 0.5609195402298851, "grad_norm": 1.4968248017678514, "learning_rate": 1.978973172737912e-05, "loss": 0.2435, "step": 2684 }, { "epoch": 0.5611285266457681, "grad_norm": 1.3968507678904558, "learning_rate": 1.978950154067403e-05, "loss": 0.2247, "step": 2685 }, { "epoch": 0.561337513061651, "grad_norm": 1.367557854690043, "learning_rate": 1.978927122938206e-05, "loss": 0.2552, "step": 2686 }, { "epoch": 0.561546499477534, "grad_norm": 1.4462991051081944, "learning_rate": 1.9789040793506146e-05, "loss": 0.2395, "step": 2687 }, { "epoch": 0.561755485893417, "grad_norm": 1.3762933438975513, "learning_rate": 1.9788810233049216e-05, "loss": 0.2198, "step": 2688 }, { "epoch": 0.5619644723092999, "grad_norm": 1.278189857525804, "learning_rate": 1.9788579548014208e-05, "loss": 0.215, "step": 2689 }, { "epoch": 0.5621734587251829, "grad_norm": 1.3772575304656973, "learning_rate": 1.9788348738404055e-05, "loss": 0.2363, "step": 2690 }, { "epoch": 0.5623824451410658, "grad_norm": 1.491136432525418, "learning_rate": 1.9788117804221694e-05, "loss": 0.2554, "step": 2691 }, { "epoch": 0.5625914315569488, "grad_norm": 1.4761680301210678, "learning_rate": 1.9787886745470067e-05, "loss": 0.216, "step": 2692 }, { "epoch": 0.5628004179728318, "grad_norm": 1.3125618253694376, "learning_rate": 1.9787655562152112e-05, "loss": 0.2271, "step": 2693 }, { "epoch": 0.5630094043887147, "grad_norm": 1.711946677920241, "learning_rate": 1.9787424254270768e-05, "loss": 0.2571, "step": 2694 }, { "epoch": 0.5632183908045977, "grad_norm": 1.385556387575798, "learning_rate": 1.9787192821828985e-05, "loss": 0.2123, "step": 2695 }, { "epoch": 0.5634273772204806, "grad_norm": 1.9051841562996008, "learning_rate": 1.9786961264829708e-05, "loss": 0.2863, "step": 2696 }, { "epoch": 0.5636363636363636, "grad_norm": 1.5453611518594357, "learning_rate": 1.978672958327588e-05, "loss": 0.2476, "step": 2697 }, { "epoch": 0.5638453500522466, "grad_norm": 1.5354588264336162, "learning_rate": 1.9786497777170452e-05, "loss": 0.2238, "step": 2698 }, { "epoch": 0.5640543364681295, "grad_norm": 1.2867303312529783, "learning_rate": 1.9786265846516374e-05, "loss": 0.2361, "step": 2699 }, { "epoch": 0.5642633228840125, "grad_norm": 1.4119917931671222, "learning_rate": 1.9786033791316597e-05, "loss": 0.2404, "step": 2700 }, { "epoch": 0.5644723092998956, "grad_norm": 1.3179146816025087, "learning_rate": 1.9785801611574074e-05, "loss": 0.2489, "step": 2701 }, { "epoch": 0.5646812957157785, "grad_norm": 1.2130744204582669, "learning_rate": 1.978556930729176e-05, "loss": 0.2179, "step": 2702 }, { "epoch": 0.5648902821316615, "grad_norm": 1.2285015474601433, "learning_rate": 1.978533687847261e-05, "loss": 0.2369, "step": 2703 }, { "epoch": 0.5650992685475444, "grad_norm": 1.6262941763909387, "learning_rate": 1.978510432511959e-05, "loss": 0.1987, "step": 2704 }, { "epoch": 0.5653082549634274, "grad_norm": 1.307833649603917, "learning_rate": 1.9784871647235647e-05, "loss": 0.2395, "step": 2705 }, { "epoch": 0.5655172413793104, "grad_norm": 1.1653783151531505, "learning_rate": 1.978463884482375e-05, "loss": 0.2416, "step": 2706 }, { "epoch": 0.5657262277951933, "grad_norm": 1.1359374699885496, "learning_rate": 1.9784405917886864e-05, "loss": 0.2223, "step": 2707 }, { "epoch": 0.5659352142110763, "grad_norm": 2.438348131461294, "learning_rate": 1.9784172866427948e-05, "loss": 0.202, "step": 2708 }, { "epoch": 0.5661442006269592, "grad_norm": 1.6478517096849519, "learning_rate": 1.9783939690449968e-05, "loss": 0.2374, "step": 2709 }, { "epoch": 0.5663531870428422, "grad_norm": 1.9037118633308812, "learning_rate": 1.9783706389955895e-05, "loss": 0.2333, "step": 2710 }, { "epoch": 0.5665621734587252, "grad_norm": 1.522046225046701, "learning_rate": 1.9783472964948697e-05, "loss": 0.2393, "step": 2711 }, { "epoch": 0.5667711598746081, "grad_norm": 1.6268818006289758, "learning_rate": 1.978323941543134e-05, "loss": 0.2521, "step": 2712 }, { "epoch": 0.5669801462904911, "grad_norm": 1.2531874642529595, "learning_rate": 1.9783005741406804e-05, "loss": 0.2325, "step": 2713 }, { "epoch": 0.567189132706374, "grad_norm": 1.2347607044496842, "learning_rate": 1.978277194287806e-05, "loss": 0.2474, "step": 2714 }, { "epoch": 0.567398119122257, "grad_norm": 1.4660041087652527, "learning_rate": 1.978253801984808e-05, "loss": 0.2667, "step": 2715 }, { "epoch": 0.56760710553814, "grad_norm": 1.3679230893320102, "learning_rate": 1.9782303972319844e-05, "loss": 0.2475, "step": 2716 }, { "epoch": 0.5678160919540229, "grad_norm": 1.3539341439029493, "learning_rate": 1.9782069800296332e-05, "loss": 0.2416, "step": 2717 }, { "epoch": 0.568025078369906, "grad_norm": 1.2150031004633717, "learning_rate": 1.978183550378052e-05, "loss": 0.249, "step": 2718 }, { "epoch": 0.568234064785789, "grad_norm": 1.1839113098848084, "learning_rate": 1.9781601082775393e-05, "loss": 0.2353, "step": 2719 }, { "epoch": 0.5684430512016719, "grad_norm": 1.34258724389607, "learning_rate": 1.9781366537283936e-05, "loss": 0.2229, "step": 2720 }, { "epoch": 0.5686520376175549, "grad_norm": 1.3770979072315823, "learning_rate": 1.9781131867309128e-05, "loss": 0.2459, "step": 2721 }, { "epoch": 0.5688610240334379, "grad_norm": 1.3915342466836564, "learning_rate": 1.9780897072853962e-05, "loss": 0.228, "step": 2722 }, { "epoch": 0.5690700104493208, "grad_norm": 1.3074307806573207, "learning_rate": 1.9780662153921422e-05, "loss": 0.2503, "step": 2723 }, { "epoch": 0.5692789968652038, "grad_norm": 1.2503324965260334, "learning_rate": 1.97804271105145e-05, "loss": 0.2198, "step": 2724 }, { "epoch": 0.5694879832810867, "grad_norm": 1.3046915696565824, "learning_rate": 1.9780191942636187e-05, "loss": 0.2595, "step": 2725 }, { "epoch": 0.5696969696969697, "grad_norm": 1.3117855331236663, "learning_rate": 1.9779956650289474e-05, "loss": 0.2376, "step": 2726 }, { "epoch": 0.5699059561128527, "grad_norm": 1.549298699991871, "learning_rate": 1.9779721233477355e-05, "loss": 0.245, "step": 2727 }, { "epoch": 0.5701149425287356, "grad_norm": 1.5245154553554954, "learning_rate": 1.977948569220283e-05, "loss": 0.2036, "step": 2728 }, { "epoch": 0.5703239289446186, "grad_norm": 1.4296132940188828, "learning_rate": 1.9779250026468894e-05, "loss": 0.2588, "step": 2729 }, { "epoch": 0.5705329153605015, "grad_norm": 1.4238731292932627, "learning_rate": 1.977901423627855e-05, "loss": 0.2828, "step": 2730 }, { "epoch": 0.5707419017763845, "grad_norm": 1.1820047228113102, "learning_rate": 1.977877832163479e-05, "loss": 0.2343, "step": 2731 }, { "epoch": 0.5709508881922675, "grad_norm": 1.3029478142811581, "learning_rate": 1.977854228254062e-05, "loss": 0.2354, "step": 2732 }, { "epoch": 0.5711598746081504, "grad_norm": 1.2901382360964482, "learning_rate": 1.9778306118999052e-05, "loss": 0.2336, "step": 2733 }, { "epoch": 0.5713688610240334, "grad_norm": 1.5395735936050179, "learning_rate": 1.9778069831013086e-05, "loss": 0.2615, "step": 2734 }, { "epoch": 0.5715778474399164, "grad_norm": 1.4207335933266731, "learning_rate": 1.9777833418585725e-05, "loss": 0.2516, "step": 2735 }, { "epoch": 0.5717868338557994, "grad_norm": 1.6156013830653617, "learning_rate": 1.977759688171998e-05, "loss": 0.2432, "step": 2736 }, { "epoch": 0.5719958202716824, "grad_norm": 1.618774368717708, "learning_rate": 1.9777360220418863e-05, "loss": 0.2485, "step": 2737 }, { "epoch": 0.5722048066875653, "grad_norm": 1.3365140656095962, "learning_rate": 1.977712343468539e-05, "loss": 0.2356, "step": 2738 }, { "epoch": 0.5724137931034483, "grad_norm": 1.291145216360662, "learning_rate": 1.9776886524522566e-05, "loss": 0.2403, "step": 2739 }, { "epoch": 0.5726227795193313, "grad_norm": 1.3970398621506497, "learning_rate": 1.9776649489933408e-05, "loss": 0.215, "step": 2740 }, { "epoch": 0.5728317659352142, "grad_norm": 1.2608483183898398, "learning_rate": 1.9776412330920938e-05, "loss": 0.2218, "step": 2741 }, { "epoch": 0.5730407523510972, "grad_norm": 1.2623153935628777, "learning_rate": 1.9776175047488167e-05, "loss": 0.2333, "step": 2742 }, { "epoch": 0.5732497387669802, "grad_norm": 1.651878226527152, "learning_rate": 1.9775937639638122e-05, "loss": 0.271, "step": 2743 }, { "epoch": 0.5734587251828631, "grad_norm": 1.2402490990498254, "learning_rate": 1.977570010737382e-05, "loss": 0.2382, "step": 2744 }, { "epoch": 0.5736677115987461, "grad_norm": 1.4975106374893818, "learning_rate": 1.9775462450698286e-05, "loss": 0.2497, "step": 2745 }, { "epoch": 0.573876698014629, "grad_norm": 1.2658843330087364, "learning_rate": 1.977522466961454e-05, "loss": 0.2236, "step": 2746 }, { "epoch": 0.574085684430512, "grad_norm": 1.3681876484191078, "learning_rate": 1.9774986764125618e-05, "loss": 0.2324, "step": 2747 }, { "epoch": 0.574294670846395, "grad_norm": 1.4254051842965823, "learning_rate": 1.9774748734234536e-05, "loss": 0.2444, "step": 2748 }, { "epoch": 0.5745036572622779, "grad_norm": 2.181323166708678, "learning_rate": 1.977451057994433e-05, "loss": 0.2242, "step": 2749 }, { "epoch": 0.5747126436781609, "grad_norm": 1.3576083821252776, "learning_rate": 1.977427230125803e-05, "loss": 0.2556, "step": 2750 }, { "epoch": 0.5749216300940438, "grad_norm": 1.1974287239613322, "learning_rate": 1.9774033898178668e-05, "loss": 0.2368, "step": 2751 }, { "epoch": 0.5751306165099268, "grad_norm": 1.6303697201422689, "learning_rate": 1.977379537070928e-05, "loss": 0.2761, "step": 2752 }, { "epoch": 0.5753396029258098, "grad_norm": 1.6590201039892791, "learning_rate": 1.9773556718852894e-05, "loss": 0.274, "step": 2753 }, { "epoch": 0.5755485893416928, "grad_norm": 1.4042557416204602, "learning_rate": 1.9773317942612557e-05, "loss": 0.2184, "step": 2754 }, { "epoch": 0.5757575757575758, "grad_norm": 1.0632971340692656, "learning_rate": 1.9773079041991304e-05, "loss": 0.2268, "step": 2755 }, { "epoch": 0.5759665621734588, "grad_norm": 1.3831510117067658, "learning_rate": 1.9772840016992173e-05, "loss": 0.2359, "step": 2756 }, { "epoch": 0.5761755485893417, "grad_norm": 1.4404344752840816, "learning_rate": 1.9772600867618214e-05, "loss": 0.2438, "step": 2757 }, { "epoch": 0.5763845350052247, "grad_norm": 1.68314510456164, "learning_rate": 1.977236159387246e-05, "loss": 0.2554, "step": 2758 }, { "epoch": 0.5765935214211076, "grad_norm": 1.6781305357273062, "learning_rate": 1.9772122195757957e-05, "loss": 0.2266, "step": 2759 }, { "epoch": 0.5768025078369906, "grad_norm": 1.324909171573313, "learning_rate": 1.977188267327776e-05, "loss": 0.2362, "step": 2760 }, { "epoch": 0.5770114942528736, "grad_norm": 1.9399483431108604, "learning_rate": 1.9771643026434916e-05, "loss": 0.2254, "step": 2761 }, { "epoch": 0.5772204806687565, "grad_norm": 1.3390206247969627, "learning_rate": 1.9771403255232467e-05, "loss": 0.2214, "step": 2762 }, { "epoch": 0.5774294670846395, "grad_norm": 1.2905356930380765, "learning_rate": 1.977116335967347e-05, "loss": 0.2371, "step": 2763 }, { "epoch": 0.5776384535005225, "grad_norm": 1.1879660735133086, "learning_rate": 1.9770923339760975e-05, "loss": 0.226, "step": 2764 }, { "epoch": 0.5778474399164054, "grad_norm": 1.2403600652666318, "learning_rate": 1.977068319549804e-05, "loss": 0.1962, "step": 2765 }, { "epoch": 0.5780564263322884, "grad_norm": 1.5416938762919647, "learning_rate": 1.977044292688772e-05, "loss": 0.2835, "step": 2766 }, { "epoch": 0.5782654127481713, "grad_norm": 1.3659170058839203, "learning_rate": 1.9770202533933078e-05, "loss": 0.2711, "step": 2767 }, { "epoch": 0.5784743991640543, "grad_norm": 1.3345930869237792, "learning_rate": 1.9769962016637162e-05, "loss": 0.2404, "step": 2768 }, { "epoch": 0.5786833855799373, "grad_norm": 1.5056523861121294, "learning_rate": 1.9769721375003043e-05, "loss": 0.2804, "step": 2769 }, { "epoch": 0.5788923719958202, "grad_norm": 1.945813493598407, "learning_rate": 1.976948060903378e-05, "loss": 0.2244, "step": 2770 }, { "epoch": 0.5791013584117033, "grad_norm": 1.614270886221216, "learning_rate": 1.9769239718732437e-05, "loss": 0.2965, "step": 2771 }, { "epoch": 0.5793103448275863, "grad_norm": 1.4113198073463666, "learning_rate": 1.9768998704102075e-05, "loss": 0.2601, "step": 2772 }, { "epoch": 0.5795193312434692, "grad_norm": 1.3541542372403894, "learning_rate": 1.976875756514577e-05, "loss": 0.2353, "step": 2773 }, { "epoch": 0.5797283176593522, "grad_norm": 1.7162027764195609, "learning_rate": 1.9768516301866584e-05, "loss": 0.2487, "step": 2774 }, { "epoch": 0.5799373040752351, "grad_norm": 1.2411453494031397, "learning_rate": 1.9768274914267594e-05, "loss": 0.2341, "step": 2775 }, { "epoch": 0.5801462904911181, "grad_norm": 1.4785114815463372, "learning_rate": 1.9768033402351866e-05, "loss": 0.2326, "step": 2776 }, { "epoch": 0.5803552769070011, "grad_norm": 1.414750153508492, "learning_rate": 1.9767791766122476e-05, "loss": 0.2119, "step": 2777 }, { "epoch": 0.580564263322884, "grad_norm": 1.0700721695184996, "learning_rate": 1.97675500055825e-05, "loss": 0.2344, "step": 2778 }, { "epoch": 0.580773249738767, "grad_norm": 1.3605964275597549, "learning_rate": 1.976730812073501e-05, "loss": 0.2262, "step": 2779 }, { "epoch": 0.58098223615465, "grad_norm": 1.4632810816683408, "learning_rate": 1.9767066111583092e-05, "loss": 0.2392, "step": 2780 }, { "epoch": 0.5811912225705329, "grad_norm": 1.1303574089889266, "learning_rate": 1.976682397812982e-05, "loss": 0.2329, "step": 2781 }, { "epoch": 0.5814002089864159, "grad_norm": 1.7709263128101491, "learning_rate": 1.9766581720378282e-05, "loss": 0.2453, "step": 2782 }, { "epoch": 0.5816091954022988, "grad_norm": 1.2474866657832024, "learning_rate": 1.976633933833155e-05, "loss": 0.2443, "step": 2783 }, { "epoch": 0.5818181818181818, "grad_norm": 1.4984701778577783, "learning_rate": 1.976609683199272e-05, "loss": 0.2239, "step": 2784 }, { "epoch": 0.5820271682340648, "grad_norm": 1.399736414269447, "learning_rate": 1.9765854201364876e-05, "loss": 0.2473, "step": 2785 }, { "epoch": 0.5822361546499477, "grad_norm": 1.3824685360785127, "learning_rate": 1.97656114464511e-05, "loss": 0.2608, "step": 2786 }, { "epoch": 0.5824451410658307, "grad_norm": 1.333423510056432, "learning_rate": 1.9765368567254483e-05, "loss": 0.244, "step": 2787 }, { "epoch": 0.5826541274817136, "grad_norm": 1.3510828857970827, "learning_rate": 1.9765125563778123e-05, "loss": 0.2817, "step": 2788 }, { "epoch": 0.5828631138975967, "grad_norm": 1.3967279431108144, "learning_rate": 1.9764882436025106e-05, "loss": 0.2, "step": 2789 }, { "epoch": 0.5830721003134797, "grad_norm": 1.156665018229819, "learning_rate": 1.9764639183998525e-05, "loss": 0.2473, "step": 2790 }, { "epoch": 0.5832810867293626, "grad_norm": 1.3786606551937293, "learning_rate": 1.976439580770148e-05, "loss": 0.22, "step": 2791 }, { "epoch": 0.5834900731452456, "grad_norm": 1.2119024314087734, "learning_rate": 1.9764152307137072e-05, "loss": 0.2497, "step": 2792 }, { "epoch": 0.5836990595611286, "grad_norm": 1.3840625900596804, "learning_rate": 1.976390868230839e-05, "loss": 0.2616, "step": 2793 }, { "epoch": 0.5839080459770115, "grad_norm": 1.6726036246938254, "learning_rate": 1.9763664933218543e-05, "loss": 0.2501, "step": 2794 }, { "epoch": 0.5841170323928945, "grad_norm": 1.4794047566522484, "learning_rate": 1.9763421059870625e-05, "loss": 0.2312, "step": 2795 }, { "epoch": 0.5843260188087774, "grad_norm": 1.387087726935148, "learning_rate": 1.9763177062267748e-05, "loss": 0.2387, "step": 2796 }, { "epoch": 0.5845350052246604, "grad_norm": 1.3823290720410615, "learning_rate": 1.976293294041301e-05, "loss": 0.2125, "step": 2797 }, { "epoch": 0.5847439916405434, "grad_norm": 1.1585934321525315, "learning_rate": 1.9762688694309526e-05, "loss": 0.2257, "step": 2798 }, { "epoch": 0.5849529780564263, "grad_norm": 1.25118256376952, "learning_rate": 1.9762444323960395e-05, "loss": 0.2659, "step": 2799 }, { "epoch": 0.5851619644723093, "grad_norm": 1.40584022671926, "learning_rate": 1.9762199829368735e-05, "loss": 0.2191, "step": 2800 }, { "epoch": 0.5853709508881922, "grad_norm": 1.3064208889408035, "learning_rate": 1.9761955210537654e-05, "loss": 0.2532, "step": 2801 }, { "epoch": 0.5855799373040752, "grad_norm": 1.4062452055175625, "learning_rate": 1.976171046747026e-05, "loss": 0.2222, "step": 2802 }, { "epoch": 0.5857889237199582, "grad_norm": 1.686784136145026, "learning_rate": 1.976146560016968e-05, "loss": 0.2267, "step": 2803 }, { "epoch": 0.5859979101358411, "grad_norm": 1.3043130013611337, "learning_rate": 1.9761220608639022e-05, "loss": 0.2384, "step": 2804 }, { "epoch": 0.5862068965517241, "grad_norm": 1.202803826189667, "learning_rate": 1.9760975492881405e-05, "loss": 0.2488, "step": 2805 }, { "epoch": 0.5864158829676072, "grad_norm": 1.194305815856612, "learning_rate": 1.9760730252899946e-05, "loss": 0.2257, "step": 2806 }, { "epoch": 0.5866248693834901, "grad_norm": 1.585411948928412, "learning_rate": 1.9760484888697774e-05, "loss": 0.2241, "step": 2807 }, { "epoch": 0.5868338557993731, "grad_norm": 1.4533390677673466, "learning_rate": 1.9760239400278004e-05, "loss": 0.2493, "step": 2808 }, { "epoch": 0.587042842215256, "grad_norm": 1.3688978175729907, "learning_rate": 1.9759993787643762e-05, "loss": 0.2265, "step": 2809 }, { "epoch": 0.587251828631139, "grad_norm": 1.2917812209565067, "learning_rate": 1.9759748050798176e-05, "loss": 0.2079, "step": 2810 }, { "epoch": 0.587460815047022, "grad_norm": 1.1885577975424129, "learning_rate": 1.9759502189744374e-05, "loss": 0.2536, "step": 2811 }, { "epoch": 0.5876698014629049, "grad_norm": 1.3101072022364473, "learning_rate": 1.975925620448548e-05, "loss": 0.2254, "step": 2812 }, { "epoch": 0.5878787878787879, "grad_norm": 1.1896852978370007, "learning_rate": 1.9759010095024627e-05, "loss": 0.22, "step": 2813 }, { "epoch": 0.5880877742946709, "grad_norm": 1.2580057173687993, "learning_rate": 1.975876386136495e-05, "loss": 0.2507, "step": 2814 }, { "epoch": 0.5882967607105538, "grad_norm": 1.295372838896473, "learning_rate": 1.975851750350958e-05, "loss": 0.2062, "step": 2815 }, { "epoch": 0.5885057471264368, "grad_norm": 1.4986230319756735, "learning_rate": 1.975827102146165e-05, "loss": 0.2287, "step": 2816 }, { "epoch": 0.5887147335423197, "grad_norm": 1.3446352835923985, "learning_rate": 1.9758024415224308e-05, "loss": 0.2119, "step": 2817 }, { "epoch": 0.5889237199582027, "grad_norm": 1.3754444893670605, "learning_rate": 1.9757777684800676e-05, "loss": 0.242, "step": 2818 }, { "epoch": 0.5891327063740857, "grad_norm": 1.270198915718475, "learning_rate": 1.9757530830193902e-05, "loss": 0.2336, "step": 2819 }, { "epoch": 0.5893416927899686, "grad_norm": 1.13607270254955, "learning_rate": 1.975728385140713e-05, "loss": 0.2396, "step": 2820 }, { "epoch": 0.5895506792058516, "grad_norm": 1.353817839675078, "learning_rate": 1.9757036748443502e-05, "loss": 0.2516, "step": 2821 }, { "epoch": 0.5897596656217345, "grad_norm": 1.193862504037952, "learning_rate": 1.975678952130616e-05, "loss": 0.2735, "step": 2822 }, { "epoch": 0.5899686520376175, "grad_norm": 1.3413077024532896, "learning_rate": 1.9756542169998253e-05, "loss": 0.25, "step": 2823 }, { "epoch": 0.5901776384535006, "grad_norm": 1.1834209936066686, "learning_rate": 1.975629469452293e-05, "loss": 0.2191, "step": 2824 }, { "epoch": 0.5903866248693835, "grad_norm": 1.2868465207265025, "learning_rate": 1.9756047094883334e-05, "loss": 0.2137, "step": 2825 }, { "epoch": 0.5905956112852665, "grad_norm": 1.3862528647121954, "learning_rate": 1.975579937108262e-05, "loss": 0.2056, "step": 2826 }, { "epoch": 0.5908045977011495, "grad_norm": 1.2508378423336213, "learning_rate": 1.9755551523123947e-05, "loss": 0.2634, "step": 2827 }, { "epoch": 0.5910135841170324, "grad_norm": 1.3123635760286365, "learning_rate": 1.975530355101046e-05, "loss": 0.2302, "step": 2828 }, { "epoch": 0.5912225705329154, "grad_norm": 1.429177162745429, "learning_rate": 1.9755055454745318e-05, "loss": 0.2345, "step": 2829 }, { "epoch": 0.5914315569487983, "grad_norm": 1.2302934198840765, "learning_rate": 1.975480723433168e-05, "loss": 0.2282, "step": 2830 }, { "epoch": 0.5916405433646813, "grad_norm": 1.2935709916508178, "learning_rate": 1.9754558889772702e-05, "loss": 0.2448, "step": 2831 }, { "epoch": 0.5918495297805643, "grad_norm": 2.3057488233757066, "learning_rate": 1.9754310421071546e-05, "loss": 0.2668, "step": 2832 }, { "epoch": 0.5920585161964472, "grad_norm": 1.3499207807147329, "learning_rate": 1.975406182823138e-05, "loss": 0.2684, "step": 2833 }, { "epoch": 0.5922675026123302, "grad_norm": 1.738929374960403, "learning_rate": 1.9753813111255354e-05, "loss": 0.2439, "step": 2834 }, { "epoch": 0.5924764890282131, "grad_norm": 1.4381876692524818, "learning_rate": 1.975356427014665e-05, "loss": 0.2321, "step": 2835 }, { "epoch": 0.5926854754440961, "grad_norm": 1.3900914710687777, "learning_rate": 1.975331530490842e-05, "loss": 0.2126, "step": 2836 }, { "epoch": 0.5928944618599791, "grad_norm": 1.8238974314041656, "learning_rate": 1.975306621554384e-05, "loss": 0.2722, "step": 2837 }, { "epoch": 0.593103448275862, "grad_norm": 1.2910081417823565, "learning_rate": 1.9752817002056083e-05, "loss": 0.198, "step": 2838 }, { "epoch": 0.593312434691745, "grad_norm": 1.395371745418257, "learning_rate": 1.975256766444831e-05, "loss": 0.2416, "step": 2839 }, { "epoch": 0.593521421107628, "grad_norm": 1.264105331786396, "learning_rate": 1.9752318202723705e-05, "loss": 0.2518, "step": 2840 }, { "epoch": 0.5937304075235109, "grad_norm": 1.1406548660016962, "learning_rate": 1.975206861688544e-05, "loss": 0.2307, "step": 2841 }, { "epoch": 0.593939393939394, "grad_norm": 1.233721676714357, "learning_rate": 1.9751818906936688e-05, "loss": 0.2391, "step": 2842 }, { "epoch": 0.594148380355277, "grad_norm": 1.4852465684843565, "learning_rate": 1.975156907288063e-05, "loss": 0.2439, "step": 2843 }, { "epoch": 0.5943573667711599, "grad_norm": 1.3832399733146326, "learning_rate": 1.9751319114720445e-05, "loss": 0.2249, "step": 2844 }, { "epoch": 0.5945663531870429, "grad_norm": 1.3214094757274917, "learning_rate": 1.975106903245931e-05, "loss": 0.2313, "step": 2845 }, { "epoch": 0.5947753396029258, "grad_norm": 1.701898767854386, "learning_rate": 1.9750818826100415e-05, "loss": 0.2924, "step": 2846 }, { "epoch": 0.5949843260188088, "grad_norm": 1.5057963142080468, "learning_rate": 1.975056849564694e-05, "loss": 0.2249, "step": 2847 }, { "epoch": 0.5951933124346918, "grad_norm": 1.4384757158372694, "learning_rate": 1.975031804110207e-05, "loss": 0.265, "step": 2848 }, { "epoch": 0.5954022988505747, "grad_norm": 1.4754997030725265, "learning_rate": 1.975006746246899e-05, "loss": 0.2248, "step": 2849 }, { "epoch": 0.5956112852664577, "grad_norm": 1.2925832034298117, "learning_rate": 1.97498167597509e-05, "loss": 0.2423, "step": 2850 }, { "epoch": 0.5958202716823406, "grad_norm": 1.4899340270171317, "learning_rate": 1.974956593295098e-05, "loss": 0.2512, "step": 2851 }, { "epoch": 0.5960292580982236, "grad_norm": 1.1238345413866275, "learning_rate": 1.9749314982072428e-05, "loss": 0.2245, "step": 2852 }, { "epoch": 0.5962382445141066, "grad_norm": 1.1859647578402321, "learning_rate": 1.9749063907118435e-05, "loss": 0.2517, "step": 2853 }, { "epoch": 0.5964472309299895, "grad_norm": 1.537652933350206, "learning_rate": 1.9748812708092194e-05, "loss": 0.2254, "step": 2854 }, { "epoch": 0.5966562173458725, "grad_norm": 1.4798828051623183, "learning_rate": 1.9748561384996904e-05, "loss": 0.2369, "step": 2855 }, { "epoch": 0.5968652037617554, "grad_norm": 1.1615469689633016, "learning_rate": 1.9748309937835767e-05, "loss": 0.2248, "step": 2856 }, { "epoch": 0.5970741901776384, "grad_norm": 1.3357152562382357, "learning_rate": 1.974805836661198e-05, "loss": 0.2193, "step": 2857 }, { "epoch": 0.5972831765935214, "grad_norm": 1.231883519265448, "learning_rate": 1.9747806671328743e-05, "loss": 0.233, "step": 2858 }, { "epoch": 0.5974921630094044, "grad_norm": 1.2295384975583359, "learning_rate": 1.974755485198926e-05, "loss": 0.2232, "step": 2859 }, { "epoch": 0.5977011494252874, "grad_norm": 1.1327093250582358, "learning_rate": 1.974730290859674e-05, "loss": 0.2495, "step": 2860 }, { "epoch": 0.5979101358411704, "grad_norm": 1.3141738152143512, "learning_rate": 1.9747050841154385e-05, "loss": 0.2376, "step": 2861 }, { "epoch": 0.5981191222570533, "grad_norm": 1.531159403279628, "learning_rate": 1.9746798649665404e-05, "loss": 0.2557, "step": 2862 }, { "epoch": 0.5983281086729363, "grad_norm": 1.3617676708773905, "learning_rate": 1.9746546334133007e-05, "loss": 0.2106, "step": 2863 }, { "epoch": 0.5985370950888192, "grad_norm": 1.261479380748092, "learning_rate": 1.9746293894560408e-05, "loss": 0.2532, "step": 2864 }, { "epoch": 0.5987460815047022, "grad_norm": 1.2104328468918084, "learning_rate": 1.974604133095081e-05, "loss": 0.2375, "step": 2865 }, { "epoch": 0.5989550679205852, "grad_norm": 1.427497081283222, "learning_rate": 1.9745788643307436e-05, "loss": 0.2059, "step": 2866 }, { "epoch": 0.5991640543364681, "grad_norm": 1.540787656845519, "learning_rate": 1.97455358316335e-05, "loss": 0.2103, "step": 2867 }, { "epoch": 0.5993730407523511, "grad_norm": 1.2998051603269127, "learning_rate": 1.974528289593222e-05, "loss": 0.2296, "step": 2868 }, { "epoch": 0.599582027168234, "grad_norm": 1.4171011042378594, "learning_rate": 1.9745029836206813e-05, "loss": 0.2361, "step": 2869 }, { "epoch": 0.599791013584117, "grad_norm": 1.3067281249503704, "learning_rate": 1.9744776652460503e-05, "loss": 0.2155, "step": 2870 }, { "epoch": 0.6, "grad_norm": 1.269264060743863, "learning_rate": 1.9744523344696507e-05, "loss": 0.2134, "step": 2871 }, { "epoch": 0.6002089864158829, "grad_norm": 1.4784648983122133, "learning_rate": 1.9744269912918052e-05, "loss": 0.2536, "step": 2872 }, { "epoch": 0.6004179728317659, "grad_norm": 1.1909274928543607, "learning_rate": 1.9744016357128362e-05, "loss": 0.2193, "step": 2873 }, { "epoch": 0.6006269592476489, "grad_norm": 1.2692039877095536, "learning_rate": 1.9743762677330664e-05, "loss": 0.2481, "step": 2874 }, { "epoch": 0.6008359456635318, "grad_norm": 1.28142388628729, "learning_rate": 1.9743508873528192e-05, "loss": 0.2283, "step": 2875 }, { "epoch": 0.6010449320794148, "grad_norm": 1.4068538575491711, "learning_rate": 1.974325494572417e-05, "loss": 0.2189, "step": 2876 }, { "epoch": 0.6012539184952979, "grad_norm": 1.5656686730737923, "learning_rate": 1.9743000893921825e-05, "loss": 0.1846, "step": 2877 }, { "epoch": 0.6014629049111808, "grad_norm": 1.221312372112917, "learning_rate": 1.97427467181244e-05, "loss": 0.2252, "step": 2878 }, { "epoch": 0.6016718913270638, "grad_norm": 1.3224615184052806, "learning_rate": 1.974249241833513e-05, "loss": 0.2323, "step": 2879 }, { "epoch": 0.6018808777429467, "grad_norm": 1.325643418385747, "learning_rate": 1.974223799455724e-05, "loss": 0.2329, "step": 2880 }, { "epoch": 0.6020898641588297, "grad_norm": 1.4419370331328696, "learning_rate": 1.974198344679398e-05, "loss": 0.1914, "step": 2881 }, { "epoch": 0.6022988505747127, "grad_norm": 1.264453153113223, "learning_rate": 1.974172877504858e-05, "loss": 0.2016, "step": 2882 }, { "epoch": 0.6025078369905956, "grad_norm": 1.3560320502200232, "learning_rate": 1.974147397932429e-05, "loss": 0.2242, "step": 2883 }, { "epoch": 0.6027168234064786, "grad_norm": 1.4722162632840101, "learning_rate": 1.974121905962435e-05, "loss": 0.2605, "step": 2884 }, { "epoch": 0.6029258098223615, "grad_norm": 1.2153413603344485, "learning_rate": 1.9740964015952e-05, "loss": 0.2495, "step": 2885 }, { "epoch": 0.6031347962382445, "grad_norm": 1.2788636678447127, "learning_rate": 1.9740708848310487e-05, "loss": 0.233, "step": 2886 }, { "epoch": 0.6033437826541275, "grad_norm": 1.172792136780527, "learning_rate": 1.9740453556703067e-05, "loss": 0.2144, "step": 2887 }, { "epoch": 0.6035527690700104, "grad_norm": 1.1845123300660287, "learning_rate": 1.9740198141132974e-05, "loss": 0.2444, "step": 2888 }, { "epoch": 0.6037617554858934, "grad_norm": 1.2217542100831773, "learning_rate": 1.973994260160347e-05, "loss": 0.2401, "step": 2889 }, { "epoch": 0.6039707419017764, "grad_norm": 1.374718729686416, "learning_rate": 1.9739686938117804e-05, "loss": 0.2295, "step": 2890 }, { "epoch": 0.6041797283176593, "grad_norm": 1.320978201269233, "learning_rate": 1.9739431150679227e-05, "loss": 0.244, "step": 2891 }, { "epoch": 0.6043887147335423, "grad_norm": 1.2985349115149674, "learning_rate": 1.9739175239291e-05, "loss": 0.2052, "step": 2892 }, { "epoch": 0.6045977011494252, "grad_norm": 1.1202322368667879, "learning_rate": 1.9738919203956376e-05, "loss": 0.1919, "step": 2893 }, { "epoch": 0.6048066875653083, "grad_norm": 1.7273691706984398, "learning_rate": 1.9738663044678613e-05, "loss": 0.2314, "step": 2894 }, { "epoch": 0.6050156739811913, "grad_norm": 1.4016743240865255, "learning_rate": 1.973840676146097e-05, "loss": 0.246, "step": 2895 }, { "epoch": 0.6052246603970742, "grad_norm": 1.2754352662456294, "learning_rate": 1.9738150354306716e-05, "loss": 0.2177, "step": 2896 }, { "epoch": 0.6054336468129572, "grad_norm": 2.014600875959183, "learning_rate": 1.9737893823219104e-05, "loss": 0.2091, "step": 2897 }, { "epoch": 0.6056426332288402, "grad_norm": 1.4127970980240623, "learning_rate": 1.9737637168201407e-05, "loss": 0.2226, "step": 2898 }, { "epoch": 0.6058516196447231, "grad_norm": 1.3999542311735056, "learning_rate": 1.9737380389256885e-05, "loss": 0.2259, "step": 2899 }, { "epoch": 0.6060606060606061, "grad_norm": 1.3155578158503882, "learning_rate": 1.973712348638881e-05, "loss": 0.2004, "step": 2900 }, { "epoch": 0.606269592476489, "grad_norm": 1.4422637457659049, "learning_rate": 1.973686645960045e-05, "loss": 0.2191, "step": 2901 }, { "epoch": 0.606478578892372, "grad_norm": 1.4968888296472067, "learning_rate": 1.973660930889508e-05, "loss": 0.2421, "step": 2902 }, { "epoch": 0.606687565308255, "grad_norm": 1.4095099389132622, "learning_rate": 1.9736352034275966e-05, "loss": 0.2071, "step": 2903 }, { "epoch": 0.6068965517241379, "grad_norm": 1.186883934470298, "learning_rate": 1.9736094635746385e-05, "loss": 0.2319, "step": 2904 }, { "epoch": 0.6071055381400209, "grad_norm": 1.4913839978773333, "learning_rate": 1.9735837113309616e-05, "loss": 0.2463, "step": 2905 }, { "epoch": 0.6073145245559038, "grad_norm": 1.5552490382013358, "learning_rate": 1.973557946696893e-05, "loss": 0.244, "step": 2906 }, { "epoch": 0.6075235109717868, "grad_norm": 1.603248430380749, "learning_rate": 1.973532169672761e-05, "loss": 0.2507, "step": 2907 }, { "epoch": 0.6077324973876698, "grad_norm": 1.3348005220995702, "learning_rate": 1.9735063802588938e-05, "loss": 0.2683, "step": 2908 }, { "epoch": 0.6079414838035527, "grad_norm": 1.4151083503055077, "learning_rate": 1.9734805784556193e-05, "loss": 0.2302, "step": 2909 }, { "epoch": 0.6081504702194357, "grad_norm": 1.3925123091720881, "learning_rate": 1.973454764263266e-05, "loss": 0.257, "step": 2910 }, { "epoch": 0.6083594566353187, "grad_norm": 1.3354986314241466, "learning_rate": 1.9734289376821622e-05, "loss": 0.2161, "step": 2911 }, { "epoch": 0.6085684430512017, "grad_norm": 1.484824094388893, "learning_rate": 1.9734030987126372e-05, "loss": 0.2632, "step": 2912 }, { "epoch": 0.6087774294670847, "grad_norm": 1.2147343546754403, "learning_rate": 1.973377247355019e-05, "loss": 0.2253, "step": 2913 }, { "epoch": 0.6089864158829676, "grad_norm": 1.0433326024378091, "learning_rate": 1.9733513836096375e-05, "loss": 0.2382, "step": 2914 }, { "epoch": 0.6091954022988506, "grad_norm": 1.086197147771133, "learning_rate": 1.973325507476821e-05, "loss": 0.2509, "step": 2915 }, { "epoch": 0.6094043887147336, "grad_norm": 1.1122513008775234, "learning_rate": 1.9732996189568994e-05, "loss": 0.2165, "step": 2916 }, { "epoch": 0.6096133751306165, "grad_norm": 1.576380944045221, "learning_rate": 1.973273718050202e-05, "loss": 0.2499, "step": 2917 }, { "epoch": 0.6098223615464995, "grad_norm": 1.214013925451525, "learning_rate": 1.973247804757058e-05, "loss": 0.2166, "step": 2918 }, { "epoch": 0.6100313479623825, "grad_norm": 1.307860685705648, "learning_rate": 1.9732218790777982e-05, "loss": 0.2117, "step": 2919 }, { "epoch": 0.6102403343782654, "grad_norm": 1.401345881822969, "learning_rate": 1.9731959410127517e-05, "loss": 0.2379, "step": 2920 }, { "epoch": 0.6104493207941484, "grad_norm": 1.357080318094534, "learning_rate": 1.9731699905622487e-05, "loss": 0.22, "step": 2921 }, { "epoch": 0.6106583072100313, "grad_norm": 1.218266333327204, "learning_rate": 1.9731440277266198e-05, "loss": 0.2304, "step": 2922 }, { "epoch": 0.6108672936259143, "grad_norm": 1.155714995322343, "learning_rate": 1.973118052506195e-05, "loss": 0.2186, "step": 2923 }, { "epoch": 0.6110762800417973, "grad_norm": 1.1239092439802407, "learning_rate": 1.9730920649013055e-05, "loss": 0.2161, "step": 2924 }, { "epoch": 0.6112852664576802, "grad_norm": 1.1323727731818027, "learning_rate": 1.9730660649122813e-05, "loss": 0.2243, "step": 2925 }, { "epoch": 0.6114942528735632, "grad_norm": 1.3371377859199958, "learning_rate": 1.9730400525394537e-05, "loss": 0.2582, "step": 2926 }, { "epoch": 0.6117032392894461, "grad_norm": 1.4608386903700803, "learning_rate": 1.973014027783154e-05, "loss": 0.2496, "step": 2927 }, { "epoch": 0.6119122257053291, "grad_norm": 1.2864757858257658, "learning_rate": 1.9729879906437124e-05, "loss": 0.2389, "step": 2928 }, { "epoch": 0.6121212121212121, "grad_norm": 1.4387616203284292, "learning_rate": 1.9729619411214615e-05, "loss": 0.2531, "step": 2929 }, { "epoch": 0.6123301985370951, "grad_norm": 1.373633506158841, "learning_rate": 1.972935879216732e-05, "loss": 0.1923, "step": 2930 }, { "epoch": 0.6125391849529781, "grad_norm": 1.323314198989197, "learning_rate": 1.972909804929856e-05, "loss": 0.2347, "step": 2931 }, { "epoch": 0.6127481713688611, "grad_norm": 1.284063932891476, "learning_rate": 1.9728837182611648e-05, "loss": 0.2125, "step": 2932 }, { "epoch": 0.612957157784744, "grad_norm": 1.3172961775599803, "learning_rate": 1.9728576192109912e-05, "loss": 0.229, "step": 2933 }, { "epoch": 0.613166144200627, "grad_norm": 1.4201180848367134, "learning_rate": 1.9728315077796666e-05, "loss": 0.2581, "step": 2934 }, { "epoch": 0.61337513061651, "grad_norm": 1.3576365703019706, "learning_rate": 1.972805383967524e-05, "loss": 0.2451, "step": 2935 }, { "epoch": 0.6135841170323929, "grad_norm": 1.3219619316160165, "learning_rate": 1.9727792477748953e-05, "loss": 0.214, "step": 2936 }, { "epoch": 0.6137931034482759, "grad_norm": 1.3222514291831142, "learning_rate": 1.9727530992021135e-05, "loss": 0.223, "step": 2937 }, { "epoch": 0.6140020898641588, "grad_norm": 1.1617286303199945, "learning_rate": 1.972726938249511e-05, "loss": 0.2132, "step": 2938 }, { "epoch": 0.6142110762800418, "grad_norm": 1.3422565745132495, "learning_rate": 1.9727007649174208e-05, "loss": 0.2441, "step": 2939 }, { "epoch": 0.6144200626959248, "grad_norm": 1.1667170318023443, "learning_rate": 1.9726745792061764e-05, "loss": 0.2117, "step": 2940 }, { "epoch": 0.6146290491118077, "grad_norm": 1.1043595565600663, "learning_rate": 1.9726483811161107e-05, "loss": 0.2002, "step": 2941 }, { "epoch": 0.6148380355276907, "grad_norm": 1.3963290078175816, "learning_rate": 1.972622170647557e-05, "loss": 0.224, "step": 2942 }, { "epoch": 0.6150470219435736, "grad_norm": 1.1776411050561622, "learning_rate": 1.9725959478008493e-05, "loss": 0.2268, "step": 2943 }, { "epoch": 0.6152560083594566, "grad_norm": 1.780913254652363, "learning_rate": 1.9725697125763212e-05, "loss": 0.242, "step": 2944 }, { "epoch": 0.6154649947753396, "grad_norm": 1.388841188274572, "learning_rate": 1.9725434649743065e-05, "loss": 0.2188, "step": 2945 }, { "epoch": 0.6156739811912225, "grad_norm": 1.3192806585794135, "learning_rate": 1.9725172049951392e-05, "loss": 0.2301, "step": 2946 }, { "epoch": 0.6158829676071056, "grad_norm": 1.368333509129014, "learning_rate": 1.972490932639153e-05, "loss": 0.2218, "step": 2947 }, { "epoch": 0.6160919540229886, "grad_norm": 1.337887423865067, "learning_rate": 1.9724646479066836e-05, "loss": 0.2234, "step": 2948 }, { "epoch": 0.6163009404388715, "grad_norm": 1.4166410547364745, "learning_rate": 1.9724383507980645e-05, "loss": 0.218, "step": 2949 }, { "epoch": 0.6165099268547545, "grad_norm": 1.0955622357500086, "learning_rate": 1.9724120413136303e-05, "loss": 0.2023, "step": 2950 }, { "epoch": 0.6167189132706374, "grad_norm": 1.3425860833643082, "learning_rate": 1.9723857194537164e-05, "loss": 0.2189, "step": 2951 }, { "epoch": 0.6169278996865204, "grad_norm": 1.312545784379439, "learning_rate": 1.972359385218657e-05, "loss": 0.2448, "step": 2952 }, { "epoch": 0.6171368861024034, "grad_norm": 1.4349194781503183, "learning_rate": 1.9723330386087883e-05, "loss": 0.2111, "step": 2953 }, { "epoch": 0.6173458725182863, "grad_norm": 1.448174337904629, "learning_rate": 1.9723066796244446e-05, "loss": 0.2489, "step": 2954 }, { "epoch": 0.6175548589341693, "grad_norm": 1.2752297377560073, "learning_rate": 1.9722803082659623e-05, "loss": 0.2172, "step": 2955 }, { "epoch": 0.6177638453500522, "grad_norm": 1.590310760697143, "learning_rate": 1.9722539245336764e-05, "loss": 0.2393, "step": 2956 }, { "epoch": 0.6179728317659352, "grad_norm": 1.65197116367654, "learning_rate": 1.9722275284279223e-05, "loss": 0.2179, "step": 2957 }, { "epoch": 0.6181818181818182, "grad_norm": 1.242355682009459, "learning_rate": 1.9722011199490366e-05, "loss": 0.2353, "step": 2958 }, { "epoch": 0.6183908045977011, "grad_norm": 1.196063313132466, "learning_rate": 1.9721746990973556e-05, "loss": 0.2084, "step": 2959 }, { "epoch": 0.6185997910135841, "grad_norm": 1.2473176073319887, "learning_rate": 1.972148265873215e-05, "loss": 0.239, "step": 2960 }, { "epoch": 0.618808777429467, "grad_norm": 1.1831053945527554, "learning_rate": 1.9721218202769512e-05, "loss": 0.2348, "step": 2961 }, { "epoch": 0.61901776384535, "grad_norm": 1.4555294205325648, "learning_rate": 1.972095362308901e-05, "loss": 0.239, "step": 2962 }, { "epoch": 0.619226750261233, "grad_norm": 1.461266516703152, "learning_rate": 1.9720688919694008e-05, "loss": 0.2859, "step": 2963 }, { "epoch": 0.6194357366771159, "grad_norm": 1.1923840007734414, "learning_rate": 1.972042409258788e-05, "loss": 0.2786, "step": 2964 }, { "epoch": 0.619644723092999, "grad_norm": 1.3687453308912743, "learning_rate": 1.9720159141773995e-05, "loss": 0.2189, "step": 2965 }, { "epoch": 0.619853709508882, "grad_norm": 1.4175117509423285, "learning_rate": 1.971989406725572e-05, "loss": 0.2193, "step": 2966 }, { "epoch": 0.6200626959247649, "grad_norm": 1.2534682912774595, "learning_rate": 1.9719628869036437e-05, "loss": 0.2053, "step": 2967 }, { "epoch": 0.6202716823406479, "grad_norm": 1.0626118385730843, "learning_rate": 1.971936354711951e-05, "loss": 0.2467, "step": 2968 }, { "epoch": 0.6204806687565309, "grad_norm": 1.1332400180093887, "learning_rate": 1.9719098101508327e-05, "loss": 0.2018, "step": 2969 }, { "epoch": 0.6206896551724138, "grad_norm": 1.329785907836764, "learning_rate": 1.971883253220626e-05, "loss": 0.2305, "step": 2970 }, { "epoch": 0.6208986415882968, "grad_norm": 1.3205344251560895, "learning_rate": 1.9718566839216685e-05, "loss": 0.2245, "step": 2971 }, { "epoch": 0.6211076280041797, "grad_norm": 1.2422309158950533, "learning_rate": 1.9718301022542992e-05, "loss": 0.2346, "step": 2972 }, { "epoch": 0.6213166144200627, "grad_norm": 1.3627626141686355, "learning_rate": 1.971803508218856e-05, "loss": 0.2512, "step": 2973 }, { "epoch": 0.6215256008359457, "grad_norm": 1.3029421352669268, "learning_rate": 1.9717769018156773e-05, "loss": 0.1917, "step": 2974 }, { "epoch": 0.6217345872518286, "grad_norm": 1.7917397602578677, "learning_rate": 1.9717502830451018e-05, "loss": 0.2491, "step": 2975 }, { "epoch": 0.6219435736677116, "grad_norm": 1.7848317035860595, "learning_rate": 1.9717236519074682e-05, "loss": 0.2268, "step": 2976 }, { "epoch": 0.6221525600835945, "grad_norm": 1.2460122909381477, "learning_rate": 1.9716970084031156e-05, "loss": 0.1991, "step": 2977 }, { "epoch": 0.6223615464994775, "grad_norm": 1.232792826452658, "learning_rate": 1.9716703525323825e-05, "loss": 0.2685, "step": 2978 }, { "epoch": 0.6225705329153605, "grad_norm": 1.577726096360198, "learning_rate": 1.971643684295609e-05, "loss": 0.2425, "step": 2979 }, { "epoch": 0.6227795193312434, "grad_norm": 1.3606845559642582, "learning_rate": 1.971617003693134e-05, "loss": 0.2479, "step": 2980 }, { "epoch": 0.6229885057471264, "grad_norm": 1.448890632557432, "learning_rate": 1.971590310725297e-05, "loss": 0.2202, "step": 2981 }, { "epoch": 0.6231974921630095, "grad_norm": 1.3649249590933632, "learning_rate": 1.9715636053924377e-05, "loss": 0.2145, "step": 2982 }, { "epoch": 0.6234064785788924, "grad_norm": 1.409168089334303, "learning_rate": 1.9715368876948965e-05, "loss": 0.2549, "step": 2983 }, { "epoch": 0.6236154649947754, "grad_norm": 1.3506769135316312, "learning_rate": 1.9715101576330124e-05, "loss": 0.24, "step": 2984 }, { "epoch": 0.6238244514106583, "grad_norm": 1.4037350083355498, "learning_rate": 1.9714834152071267e-05, "loss": 0.2156, "step": 2985 }, { "epoch": 0.6240334378265413, "grad_norm": 1.6363103609358878, "learning_rate": 1.971456660417579e-05, "loss": 0.2554, "step": 2986 }, { "epoch": 0.6242424242424243, "grad_norm": 1.143975150934094, "learning_rate": 1.97142989326471e-05, "loss": 0.2462, "step": 2987 }, { "epoch": 0.6244514106583072, "grad_norm": 1.2403500494970785, "learning_rate": 1.971403113748861e-05, "loss": 0.2028, "step": 2988 }, { "epoch": 0.6246603970741902, "grad_norm": 1.1895334630330172, "learning_rate": 1.9713763218703714e-05, "loss": 0.2485, "step": 2989 }, { "epoch": 0.6248693834900731, "grad_norm": 1.067454668219502, "learning_rate": 1.9713495176295834e-05, "loss": 0.2186, "step": 2990 }, { "epoch": 0.6250783699059561, "grad_norm": 1.7634099483560506, "learning_rate": 1.9713227010268374e-05, "loss": 0.192, "step": 2991 }, { "epoch": 0.6252873563218391, "grad_norm": 1.486574901251148, "learning_rate": 1.9712958720624753e-05, "loss": 0.2645, "step": 2992 }, { "epoch": 0.625496342737722, "grad_norm": 1.5164494998447442, "learning_rate": 1.971269030736838e-05, "loss": 0.2712, "step": 2993 }, { "epoch": 0.625705329153605, "grad_norm": 1.076297958327078, "learning_rate": 1.9712421770502675e-05, "loss": 0.2194, "step": 2994 }, { "epoch": 0.625914315569488, "grad_norm": 1.0845712154987015, "learning_rate": 1.9712153110031053e-05, "loss": 0.2188, "step": 2995 }, { "epoch": 0.6261233019853709, "grad_norm": 1.641757373215186, "learning_rate": 1.9711884325956933e-05, "loss": 0.2475, "step": 2996 }, { "epoch": 0.6263322884012539, "grad_norm": 1.5237376185251594, "learning_rate": 1.971161541828374e-05, "loss": 0.2149, "step": 2997 }, { "epoch": 0.6265412748171368, "grad_norm": 1.240046808630195, "learning_rate": 1.9711346387014888e-05, "loss": 0.2227, "step": 2998 }, { "epoch": 0.6267502612330198, "grad_norm": 1.2883773709729271, "learning_rate": 1.971107723215381e-05, "loss": 0.2529, "step": 2999 }, { "epoch": 0.6269592476489029, "grad_norm": 1.2871573384017754, "learning_rate": 1.9710807953703926e-05, "loss": 0.2379, "step": 3000 }, { "epoch": 0.6271682340647858, "grad_norm": 1.3493772977009255, "learning_rate": 1.9710538551668663e-05, "loss": 0.2327, "step": 3001 }, { "epoch": 0.6273772204806688, "grad_norm": 1.3150032874289466, "learning_rate": 1.971026902605145e-05, "loss": 0.2204, "step": 3002 }, { "epoch": 0.6275862068965518, "grad_norm": 1.160163577466607, "learning_rate": 1.970999937685572e-05, "loss": 0.2208, "step": 3003 }, { "epoch": 0.6277951933124347, "grad_norm": 1.1603014889442629, "learning_rate": 1.97097296040849e-05, "loss": 0.2394, "step": 3004 }, { "epoch": 0.6280041797283177, "grad_norm": 1.3309803176688642, "learning_rate": 1.9709459707742427e-05, "loss": 0.2036, "step": 3005 }, { "epoch": 0.6282131661442006, "grad_norm": 1.3047937043554263, "learning_rate": 1.9709189687831737e-05, "loss": 0.2243, "step": 3006 }, { "epoch": 0.6284221525600836, "grad_norm": 1.30067003502604, "learning_rate": 1.9708919544356263e-05, "loss": 0.229, "step": 3007 }, { "epoch": 0.6286311389759666, "grad_norm": 1.6506928981179332, "learning_rate": 1.970864927731944e-05, "loss": 0.2156, "step": 3008 }, { "epoch": 0.6288401253918495, "grad_norm": 1.343902500101587, "learning_rate": 1.9708378886724717e-05, "loss": 0.2207, "step": 3009 }, { "epoch": 0.6290491118077325, "grad_norm": 1.2573508434543081, "learning_rate": 1.970810837257553e-05, "loss": 0.2347, "step": 3010 }, { "epoch": 0.6292580982236154, "grad_norm": 1.3368865920024326, "learning_rate": 1.9707837734875318e-05, "loss": 0.2304, "step": 3011 }, { "epoch": 0.6294670846394984, "grad_norm": 1.1358772316064085, "learning_rate": 1.970756697362753e-05, "loss": 0.2368, "step": 3012 }, { "epoch": 0.6296760710553814, "grad_norm": 1.3064437595128722, "learning_rate": 1.970729608883561e-05, "loss": 0.2525, "step": 3013 }, { "epoch": 0.6298850574712643, "grad_norm": 1.3961150910101792, "learning_rate": 1.970702508050301e-05, "loss": 0.1835, "step": 3014 }, { "epoch": 0.6300940438871473, "grad_norm": 1.1689132130946782, "learning_rate": 1.9706753948633174e-05, "loss": 0.2039, "step": 3015 }, { "epoch": 0.6303030303030303, "grad_norm": 1.83147778555545, "learning_rate": 1.970648269322955e-05, "loss": 0.2442, "step": 3016 }, { "epoch": 0.6305120167189132, "grad_norm": 1.329832218229543, "learning_rate": 1.9706211314295595e-05, "loss": 0.2279, "step": 3017 }, { "epoch": 0.6307210031347963, "grad_norm": 1.2766850589227912, "learning_rate": 1.9705939811834765e-05, "loss": 0.2016, "step": 3018 }, { "epoch": 0.6309299895506792, "grad_norm": 1.2237911697517951, "learning_rate": 1.9705668185850507e-05, "loss": 0.2575, "step": 3019 }, { "epoch": 0.6311389759665622, "grad_norm": 1.227323750031801, "learning_rate": 1.970539643634629e-05, "loss": 0.2558, "step": 3020 }, { "epoch": 0.6313479623824452, "grad_norm": 1.2842186051472833, "learning_rate": 1.9705124563325558e-05, "loss": 0.2119, "step": 3021 }, { "epoch": 0.6315569487983281, "grad_norm": 1.1867296613589104, "learning_rate": 1.9704852566791782e-05, "loss": 0.2492, "step": 3022 }, { "epoch": 0.6317659352142111, "grad_norm": 1.201594800684134, "learning_rate": 1.970458044674842e-05, "loss": 0.2069, "step": 3023 }, { "epoch": 0.631974921630094, "grad_norm": 1.2930760645148105, "learning_rate": 1.9704308203198934e-05, "loss": 0.2468, "step": 3024 }, { "epoch": 0.632183908045977, "grad_norm": 1.2324089541305259, "learning_rate": 1.9704035836146787e-05, "loss": 0.2439, "step": 3025 }, { "epoch": 0.63239289446186, "grad_norm": 1.4610966957062628, "learning_rate": 1.970376334559545e-05, "loss": 0.2251, "step": 3026 }, { "epoch": 0.6326018808777429, "grad_norm": 1.560300983312234, "learning_rate": 1.9703490731548395e-05, "loss": 0.2625, "step": 3027 }, { "epoch": 0.6328108672936259, "grad_norm": 1.2304806847817602, "learning_rate": 1.9703217994009075e-05, "loss": 0.2555, "step": 3028 }, { "epoch": 0.6330198537095089, "grad_norm": 1.4656969364705235, "learning_rate": 1.9702945132980978e-05, "loss": 0.2451, "step": 3029 }, { "epoch": 0.6332288401253918, "grad_norm": 1.2135767954212875, "learning_rate": 1.970267214846757e-05, "loss": 0.2473, "step": 3030 }, { "epoch": 0.6334378265412748, "grad_norm": 1.3511099208358852, "learning_rate": 1.9702399040472322e-05, "loss": 0.2469, "step": 3031 }, { "epoch": 0.6336468129571577, "grad_norm": 1.1553784625103305, "learning_rate": 1.9702125808998716e-05, "loss": 0.2379, "step": 3032 }, { "epoch": 0.6338557993730407, "grad_norm": 1.1704268768799009, "learning_rate": 1.9701852454050226e-05, "loss": 0.231, "step": 3033 }, { "epoch": 0.6340647857889237, "grad_norm": 1.1955901937023232, "learning_rate": 1.9701578975630327e-05, "loss": 0.2008, "step": 3034 }, { "epoch": 0.6342737722048067, "grad_norm": 1.26426875362104, "learning_rate": 1.9701305373742508e-05, "loss": 0.2192, "step": 3035 }, { "epoch": 0.6344827586206897, "grad_norm": 1.3592121482730908, "learning_rate": 1.9701031648390245e-05, "loss": 0.2184, "step": 3036 }, { "epoch": 0.6346917450365727, "grad_norm": 1.3597038441630187, "learning_rate": 1.970075779957702e-05, "loss": 0.2462, "step": 3037 }, { "epoch": 0.6349007314524556, "grad_norm": 1.2591198456196993, "learning_rate": 1.9700483827306325e-05, "loss": 0.2315, "step": 3038 }, { "epoch": 0.6351097178683386, "grad_norm": 1.4695460215143425, "learning_rate": 1.9700209731581645e-05, "loss": 0.2215, "step": 3039 }, { "epoch": 0.6353187042842215, "grad_norm": 1.2028150912206748, "learning_rate": 1.969993551240646e-05, "loss": 0.237, "step": 3040 }, { "epoch": 0.6355276907001045, "grad_norm": 1.1207931889292149, "learning_rate": 1.969966116978427e-05, "loss": 0.2341, "step": 3041 }, { "epoch": 0.6357366771159875, "grad_norm": 1.5027680407102308, "learning_rate": 1.9699386703718562e-05, "loss": 0.2634, "step": 3042 }, { "epoch": 0.6359456635318704, "grad_norm": 1.380990169755125, "learning_rate": 1.969911211421283e-05, "loss": 0.2252, "step": 3043 }, { "epoch": 0.6361546499477534, "grad_norm": 1.3061566760131107, "learning_rate": 1.9698837401270566e-05, "loss": 0.2487, "step": 3044 }, { "epoch": 0.6363636363636364, "grad_norm": 1.3326238509773989, "learning_rate": 1.9698562564895272e-05, "loss": 0.2375, "step": 3045 }, { "epoch": 0.6365726227795193, "grad_norm": 1.2968635441750704, "learning_rate": 1.969828760509044e-05, "loss": 0.2201, "step": 3046 }, { "epoch": 0.6367816091954023, "grad_norm": 2.1779539021199104, "learning_rate": 1.969801252185957e-05, "loss": 0.2438, "step": 3047 }, { "epoch": 0.6369905956112852, "grad_norm": 1.3333792367239652, "learning_rate": 1.9697737315206168e-05, "loss": 0.2442, "step": 3048 }, { "epoch": 0.6371995820271682, "grad_norm": 1.346213988500547, "learning_rate": 1.969746198513373e-05, "loss": 0.2282, "step": 3049 }, { "epoch": 0.6374085684430512, "grad_norm": 1.5784219059854918, "learning_rate": 1.9697186531645762e-05, "loss": 0.2095, "step": 3050 }, { "epoch": 0.6376175548589341, "grad_norm": 1.1039527103319446, "learning_rate": 1.9696910954745775e-05, "loss": 0.2096, "step": 3051 }, { "epoch": 0.6378265412748171, "grad_norm": 1.2021619033044608, "learning_rate": 1.9696635254437265e-05, "loss": 0.2307, "step": 3052 }, { "epoch": 0.6380355276907002, "grad_norm": 1.38428145634809, "learning_rate": 1.969635943072375e-05, "loss": 0.2353, "step": 3053 }, { "epoch": 0.6382445141065831, "grad_norm": 1.295209398428485, "learning_rate": 1.9696083483608738e-05, "loss": 0.2127, "step": 3054 }, { "epoch": 0.6384535005224661, "grad_norm": 1.4790860482151484, "learning_rate": 1.969580741309574e-05, "loss": 0.2289, "step": 3055 }, { "epoch": 0.638662486938349, "grad_norm": 1.4567311928322453, "learning_rate": 1.9695531219188267e-05, "loss": 0.2129, "step": 3056 }, { "epoch": 0.638871473354232, "grad_norm": 1.2991110938275463, "learning_rate": 1.9695254901889842e-05, "loss": 0.2163, "step": 3057 }, { "epoch": 0.639080459770115, "grad_norm": 1.4306736825174162, "learning_rate": 1.969497846120397e-05, "loss": 0.2024, "step": 3058 }, { "epoch": 0.6392894461859979, "grad_norm": 1.299475405585218, "learning_rate": 1.9694701897134183e-05, "loss": 0.2826, "step": 3059 }, { "epoch": 0.6394984326018809, "grad_norm": 1.144195553359594, "learning_rate": 1.9694425209683986e-05, "loss": 0.234, "step": 3060 }, { "epoch": 0.6397074190177638, "grad_norm": 1.4133117382255163, "learning_rate": 1.9694148398856915e-05, "loss": 0.2274, "step": 3061 }, { "epoch": 0.6399164054336468, "grad_norm": 1.2244281154301242, "learning_rate": 1.969387146465648e-05, "loss": 0.236, "step": 3062 }, { "epoch": 0.6401253918495298, "grad_norm": 1.1998884833752883, "learning_rate": 1.969359440708621e-05, "loss": 0.2051, "step": 3063 }, { "epoch": 0.6403343782654127, "grad_norm": 1.3655137160665514, "learning_rate": 1.9693317226149635e-05, "loss": 0.2522, "step": 3064 }, { "epoch": 0.6405433646812957, "grad_norm": 1.205884912195111, "learning_rate": 1.9693039921850276e-05, "loss": 0.2102, "step": 3065 }, { "epoch": 0.6407523510971787, "grad_norm": 1.2356069603744397, "learning_rate": 1.9692762494191668e-05, "loss": 0.2195, "step": 3066 }, { "epoch": 0.6409613375130616, "grad_norm": 1.1673069113870678, "learning_rate": 1.9692484943177337e-05, "loss": 0.2304, "step": 3067 }, { "epoch": 0.6411703239289446, "grad_norm": 1.1523723365938554, "learning_rate": 1.969220726881082e-05, "loss": 0.2344, "step": 3068 }, { "epoch": 0.6413793103448275, "grad_norm": 1.2765288391878131, "learning_rate": 1.969192947109565e-05, "loss": 0.2035, "step": 3069 }, { "epoch": 0.6415882967607106, "grad_norm": 1.1717523569485642, "learning_rate": 1.9691651550035355e-05, "loss": 0.2559, "step": 3070 }, { "epoch": 0.6417972831765936, "grad_norm": 1.0057437528246014, "learning_rate": 1.9691373505633477e-05, "loss": 0.1899, "step": 3071 }, { "epoch": 0.6420062695924765, "grad_norm": 1.459204774791241, "learning_rate": 1.969109533789356e-05, "loss": 0.2491, "step": 3072 }, { "epoch": 0.6422152560083595, "grad_norm": 1.401773455129688, "learning_rate": 1.969081704681914e-05, "loss": 0.23, "step": 3073 }, { "epoch": 0.6424242424242425, "grad_norm": 1.2573165624211373, "learning_rate": 1.9690538632413753e-05, "loss": 0.2071, "step": 3074 }, { "epoch": 0.6426332288401254, "grad_norm": 1.2450063002737037, "learning_rate": 1.969026009468095e-05, "loss": 0.2119, "step": 3075 }, { "epoch": 0.6428422152560084, "grad_norm": 1.4607790410445107, "learning_rate": 1.9689981433624275e-05, "loss": 0.241, "step": 3076 }, { "epoch": 0.6430512016718913, "grad_norm": 1.3197159184673326, "learning_rate": 1.9689702649247272e-05, "loss": 0.2247, "step": 3077 }, { "epoch": 0.6432601880877743, "grad_norm": 1.3913871458670959, "learning_rate": 1.9689423741553487e-05, "loss": 0.2184, "step": 3078 }, { "epoch": 0.6434691745036573, "grad_norm": 1.6202417229668422, "learning_rate": 1.9689144710546474e-05, "loss": 0.2141, "step": 3079 }, { "epoch": 0.6436781609195402, "grad_norm": 1.4049784501137104, "learning_rate": 1.968886555622978e-05, "loss": 0.2289, "step": 3080 }, { "epoch": 0.6438871473354232, "grad_norm": 1.33329993919267, "learning_rate": 1.968858627860696e-05, "loss": 0.2375, "step": 3081 }, { "epoch": 0.6440961337513061, "grad_norm": 1.6232028264508371, "learning_rate": 1.968830687768157e-05, "loss": 0.2193, "step": 3082 }, { "epoch": 0.6443051201671891, "grad_norm": 1.4996856749238068, "learning_rate": 1.9688027353457165e-05, "loss": 0.211, "step": 3083 }, { "epoch": 0.6445141065830721, "grad_norm": 1.3831227503127264, "learning_rate": 1.96877477059373e-05, "loss": 0.2229, "step": 3084 }, { "epoch": 0.644723092998955, "grad_norm": 1.463845199812024, "learning_rate": 1.9687467935125536e-05, "loss": 0.194, "step": 3085 }, { "epoch": 0.644932079414838, "grad_norm": 1.487242790505516, "learning_rate": 1.9687188041025428e-05, "loss": 0.1968, "step": 3086 }, { "epoch": 0.645141065830721, "grad_norm": 1.2190830838533748, "learning_rate": 1.968690802364055e-05, "loss": 0.222, "step": 3087 }, { "epoch": 0.645350052246604, "grad_norm": 1.2591935389065527, "learning_rate": 1.9686627882974455e-05, "loss": 0.2158, "step": 3088 }, { "epoch": 0.645559038662487, "grad_norm": 1.2314687728408842, "learning_rate": 1.968634761903071e-05, "loss": 0.2173, "step": 3089 }, { "epoch": 0.64576802507837, "grad_norm": 1.3261179320693024, "learning_rate": 1.9686067231812888e-05, "loss": 0.2361, "step": 3090 }, { "epoch": 0.6459770114942529, "grad_norm": 1.3393452347267014, "learning_rate": 1.9685786721324546e-05, "loss": 0.2228, "step": 3091 }, { "epoch": 0.6461859979101359, "grad_norm": 1.2495311840662628, "learning_rate": 1.9685506087569268e-05, "loss": 0.2489, "step": 3092 }, { "epoch": 0.6463949843260188, "grad_norm": 1.1333480818093908, "learning_rate": 1.9685225330550614e-05, "loss": 0.2211, "step": 3093 }, { "epoch": 0.6466039707419018, "grad_norm": 1.3703318114889853, "learning_rate": 1.9684944450272163e-05, "loss": 0.2778, "step": 3094 }, { "epoch": 0.6468129571577848, "grad_norm": 1.318417300701472, "learning_rate": 1.968466344673749e-05, "loss": 0.2242, "step": 3095 }, { "epoch": 0.6470219435736677, "grad_norm": 1.426572599581534, "learning_rate": 1.9684382319950167e-05, "loss": 0.2212, "step": 3096 }, { "epoch": 0.6472309299895507, "grad_norm": 1.1283556770383285, "learning_rate": 1.9684101069913774e-05, "loss": 0.2456, "step": 3097 }, { "epoch": 0.6474399164054336, "grad_norm": 1.2398656275537965, "learning_rate": 1.9683819696631888e-05, "loss": 0.2222, "step": 3098 }, { "epoch": 0.6476489028213166, "grad_norm": 1.1653114181987707, "learning_rate": 1.9683538200108098e-05, "loss": 0.2247, "step": 3099 }, { "epoch": 0.6478578892371996, "grad_norm": 1.2730874907006373, "learning_rate": 1.9683256580345977e-05, "loss": 0.2406, "step": 3100 }, { "epoch": 0.6480668756530825, "grad_norm": 1.271396603172886, "learning_rate": 1.9682974837349114e-05, "loss": 0.2314, "step": 3101 }, { "epoch": 0.6482758620689655, "grad_norm": 1.2292744988967634, "learning_rate": 1.9682692971121094e-05, "loss": 0.2162, "step": 3102 }, { "epoch": 0.6484848484848484, "grad_norm": 1.377654651097071, "learning_rate": 1.9682410981665504e-05, "loss": 0.2535, "step": 3103 }, { "epoch": 0.6486938349007314, "grad_norm": 1.4245040506815814, "learning_rate": 1.9682128868985933e-05, "loss": 0.2049, "step": 3104 }, { "epoch": 0.6489028213166145, "grad_norm": 1.4762558106981774, "learning_rate": 1.9681846633085968e-05, "loss": 0.2286, "step": 3105 }, { "epoch": 0.6491118077324974, "grad_norm": 1.3745966084597883, "learning_rate": 1.9681564273969208e-05, "loss": 0.2663, "step": 3106 }, { "epoch": 0.6493207941483804, "grad_norm": 1.2843620235112938, "learning_rate": 1.9681281791639238e-05, "loss": 0.2348, "step": 3107 }, { "epoch": 0.6495297805642634, "grad_norm": 1.416045077427798, "learning_rate": 1.9680999186099658e-05, "loss": 0.2462, "step": 3108 }, { "epoch": 0.6497387669801463, "grad_norm": 1.122372155675613, "learning_rate": 1.9680716457354064e-05, "loss": 0.2043, "step": 3109 }, { "epoch": 0.6499477533960293, "grad_norm": 1.3664314998733826, "learning_rate": 1.9680433605406054e-05, "loss": 0.2095, "step": 3110 }, { "epoch": 0.6501567398119122, "grad_norm": 1.489004286201931, "learning_rate": 1.968015063025923e-05, "loss": 0.2184, "step": 3111 }, { "epoch": 0.6503657262277952, "grad_norm": 1.4729271022706978, "learning_rate": 1.967986753191719e-05, "loss": 0.2439, "step": 3112 }, { "epoch": 0.6505747126436782, "grad_norm": 1.3399926636970942, "learning_rate": 1.9679584310383534e-05, "loss": 0.2487, "step": 3113 }, { "epoch": 0.6507836990595611, "grad_norm": 1.3187235410873257, "learning_rate": 1.9679300965661876e-05, "loss": 0.2309, "step": 3114 }, { "epoch": 0.6509926854754441, "grad_norm": 1.1774686634195408, "learning_rate": 1.9679017497755814e-05, "loss": 0.2316, "step": 3115 }, { "epoch": 0.651201671891327, "grad_norm": 1.163119269620532, "learning_rate": 1.9678733906668958e-05, "loss": 0.2176, "step": 3116 }, { "epoch": 0.65141065830721, "grad_norm": 1.3867916905317255, "learning_rate": 1.9678450192404918e-05, "loss": 0.1959, "step": 3117 }, { "epoch": 0.651619644723093, "grad_norm": 1.1662576171640378, "learning_rate": 1.96781663549673e-05, "loss": 0.2677, "step": 3118 }, { "epoch": 0.6518286311389759, "grad_norm": 1.116267431348235, "learning_rate": 1.9677882394359727e-05, "loss": 0.212, "step": 3119 }, { "epoch": 0.6520376175548589, "grad_norm": 1.5718963307318714, "learning_rate": 1.96775983105858e-05, "loss": 0.2222, "step": 3120 }, { "epoch": 0.6522466039707419, "grad_norm": 1.582475331338765, "learning_rate": 1.9677314103649142e-05, "loss": 0.2326, "step": 3121 }, { "epoch": 0.6524555903866248, "grad_norm": 1.3895980284216303, "learning_rate": 1.967702977355337e-05, "loss": 0.2337, "step": 3122 }, { "epoch": 0.6526645768025079, "grad_norm": 1.2181258222380977, "learning_rate": 1.9676745320302098e-05, "loss": 0.1968, "step": 3123 }, { "epoch": 0.6528735632183909, "grad_norm": 1.1858206884274454, "learning_rate": 1.9676460743898952e-05, "loss": 0.2326, "step": 3124 }, { "epoch": 0.6530825496342738, "grad_norm": 1.3753529206263082, "learning_rate": 1.9676176044347546e-05, "loss": 0.2015, "step": 3125 }, { "epoch": 0.6532915360501568, "grad_norm": 1.2134409042576262, "learning_rate": 1.967589122165151e-05, "loss": 0.2122, "step": 3126 }, { "epoch": 0.6535005224660397, "grad_norm": 1.1157224618821788, "learning_rate": 1.9675606275814468e-05, "loss": 0.1855, "step": 3127 }, { "epoch": 0.6537095088819227, "grad_norm": 1.126115712869075, "learning_rate": 1.9675321206840047e-05, "loss": 0.2065, "step": 3128 }, { "epoch": 0.6539184952978057, "grad_norm": 1.564705504659586, "learning_rate": 1.967503601473187e-05, "loss": 0.2583, "step": 3129 }, { "epoch": 0.6541274817136886, "grad_norm": 1.3930286031527912, "learning_rate": 1.967475069949357e-05, "loss": 0.2093, "step": 3130 }, { "epoch": 0.6543364681295716, "grad_norm": 1.2695970283551894, "learning_rate": 1.967446526112878e-05, "loss": 0.2262, "step": 3131 }, { "epoch": 0.6545454545454545, "grad_norm": 1.0006066509343146, "learning_rate": 1.967417969964113e-05, "loss": 0.1915, "step": 3132 }, { "epoch": 0.6547544409613375, "grad_norm": 1.2775419996808746, "learning_rate": 1.967389401503425e-05, "loss": 0.1827, "step": 3133 }, { "epoch": 0.6549634273772205, "grad_norm": 1.2853684848502231, "learning_rate": 1.9673608207311784e-05, "loss": 0.2515, "step": 3134 }, { "epoch": 0.6551724137931034, "grad_norm": 1.533595224319186, "learning_rate": 1.967332227647737e-05, "loss": 0.222, "step": 3135 }, { "epoch": 0.6553814002089864, "grad_norm": 1.4873099841428519, "learning_rate": 1.9673036222534635e-05, "loss": 0.2371, "step": 3136 }, { "epoch": 0.6555903866248693, "grad_norm": 1.3982150489459992, "learning_rate": 1.9672750045487227e-05, "loss": 0.2104, "step": 3137 }, { "epoch": 0.6557993730407523, "grad_norm": 1.3186124593214072, "learning_rate": 1.9672463745338792e-05, "loss": 0.2117, "step": 3138 }, { "epoch": 0.6560083594566353, "grad_norm": 1.3192592071024734, "learning_rate": 1.967217732209297e-05, "loss": 0.2344, "step": 3139 }, { "epoch": 0.6562173458725182, "grad_norm": 1.4707469352863378, "learning_rate": 1.9671890775753403e-05, "loss": 0.2102, "step": 3140 }, { "epoch": 0.6564263322884013, "grad_norm": 7.217351451030675, "learning_rate": 1.9671604106323744e-05, "loss": 0.2332, "step": 3141 }, { "epoch": 0.6566353187042843, "grad_norm": 1.8104798505906354, "learning_rate": 1.9671317313807637e-05, "loss": 0.2026, "step": 3142 }, { "epoch": 0.6568443051201672, "grad_norm": 1.2544244199029955, "learning_rate": 1.9671030398208732e-05, "loss": 0.2231, "step": 3143 }, { "epoch": 0.6570532915360502, "grad_norm": 1.1778718369011396, "learning_rate": 1.9670743359530685e-05, "loss": 0.2126, "step": 3144 }, { "epoch": 0.6572622779519331, "grad_norm": 1.745465383862859, "learning_rate": 1.967045619777714e-05, "loss": 0.2753, "step": 3145 }, { "epoch": 0.6574712643678161, "grad_norm": 1.1494421222483344, "learning_rate": 1.967016891295176e-05, "loss": 0.2619, "step": 3146 }, { "epoch": 0.6576802507836991, "grad_norm": 1.3712360193372939, "learning_rate": 1.96698815050582e-05, "loss": 0.2063, "step": 3147 }, { "epoch": 0.657889237199582, "grad_norm": 1.5162448665500294, "learning_rate": 1.966959397410011e-05, "loss": 0.2205, "step": 3148 }, { "epoch": 0.658098223615465, "grad_norm": 1.240399220859486, "learning_rate": 1.966930632008116e-05, "loss": 0.2219, "step": 3149 }, { "epoch": 0.658307210031348, "grad_norm": 1.126537402317561, "learning_rate": 1.9669018543005006e-05, "loss": 0.1896, "step": 3150 }, { "epoch": 0.6585161964472309, "grad_norm": 1.2624321045152744, "learning_rate": 1.9668730642875305e-05, "loss": 0.2107, "step": 3151 }, { "epoch": 0.6587251828631139, "grad_norm": 1.3155933943117686, "learning_rate": 1.9668442619695733e-05, "loss": 0.2325, "step": 3152 }, { "epoch": 0.6589341692789968, "grad_norm": 1.2244498381145303, "learning_rate": 1.9668154473469947e-05, "loss": 0.24, "step": 3153 }, { "epoch": 0.6591431556948798, "grad_norm": 1.1686487540080708, "learning_rate": 1.9667866204201614e-05, "loss": 0.2416, "step": 3154 }, { "epoch": 0.6593521421107628, "grad_norm": 1.235784312324596, "learning_rate": 1.9667577811894407e-05, "loss": 0.176, "step": 3155 }, { "epoch": 0.6595611285266457, "grad_norm": 1.335965409685101, "learning_rate": 1.9667289296551995e-05, "loss": 0.1946, "step": 3156 }, { "epoch": 0.6597701149425287, "grad_norm": 1.3581516769074011, "learning_rate": 1.9667000658178048e-05, "loss": 0.2245, "step": 3157 }, { "epoch": 0.6599791013584118, "grad_norm": 1.1910469381702642, "learning_rate": 1.9666711896776237e-05, "loss": 0.2407, "step": 3158 }, { "epoch": 0.6601880877742947, "grad_norm": 1.2520364250780411, "learning_rate": 1.9666423012350243e-05, "loss": 0.2114, "step": 3159 }, { "epoch": 0.6603970741901777, "grad_norm": 1.3161315632969641, "learning_rate": 1.9666134004903742e-05, "loss": 0.2628, "step": 3160 }, { "epoch": 0.6606060606060606, "grad_norm": 1.213126378831744, "learning_rate": 1.9665844874440404e-05, "loss": 0.2012, "step": 3161 }, { "epoch": 0.6608150470219436, "grad_norm": 1.2846114993870077, "learning_rate": 1.966555562096392e-05, "loss": 0.2137, "step": 3162 }, { "epoch": 0.6610240334378266, "grad_norm": 1.6544575564646555, "learning_rate": 1.9665266244477964e-05, "loss": 0.2403, "step": 3163 }, { "epoch": 0.6612330198537095, "grad_norm": 1.2236023298047223, "learning_rate": 1.966497674498622e-05, "loss": 0.2306, "step": 3164 }, { "epoch": 0.6614420062695925, "grad_norm": 1.172418873917634, "learning_rate": 1.9664687122492368e-05, "loss": 0.2259, "step": 3165 }, { "epoch": 0.6616509926854754, "grad_norm": 1.2556668001836373, "learning_rate": 1.9664397377000104e-05, "loss": 0.2037, "step": 3166 }, { "epoch": 0.6618599791013584, "grad_norm": 1.103995275098986, "learning_rate": 1.966410750851311e-05, "loss": 0.2096, "step": 3167 }, { "epoch": 0.6620689655172414, "grad_norm": 1.4827437126924428, "learning_rate": 1.9663817517035074e-05, "loss": 0.2296, "step": 3168 }, { "epoch": 0.6622779519331243, "grad_norm": 1.5595605154014351, "learning_rate": 1.9663527402569687e-05, "loss": 0.2436, "step": 3169 }, { "epoch": 0.6624869383490073, "grad_norm": 1.3171758750465445, "learning_rate": 1.9663237165120643e-05, "loss": 0.2212, "step": 3170 }, { "epoch": 0.6626959247648903, "grad_norm": 1.5010192858655704, "learning_rate": 1.9662946804691634e-05, "loss": 0.2072, "step": 3171 }, { "epoch": 0.6629049111807732, "grad_norm": 1.1003824607393702, "learning_rate": 1.9662656321286356e-05, "loss": 0.2198, "step": 3172 }, { "epoch": 0.6631138975966562, "grad_norm": 1.3275258519982265, "learning_rate": 1.9662365714908507e-05, "loss": 0.2028, "step": 3173 }, { "epoch": 0.6633228840125391, "grad_norm": 1.330545886470988, "learning_rate": 1.9662074985561782e-05, "loss": 0.2258, "step": 3174 }, { "epoch": 0.6635318704284221, "grad_norm": 1.3453132488971946, "learning_rate": 1.9661784133249885e-05, "loss": 0.214, "step": 3175 }, { "epoch": 0.6637408568443052, "grad_norm": 1.308853997379559, "learning_rate": 1.9661493157976516e-05, "loss": 0.1919, "step": 3176 }, { "epoch": 0.6639498432601881, "grad_norm": 1.3533195513521763, "learning_rate": 1.9661202059745377e-05, "loss": 0.2317, "step": 3177 }, { "epoch": 0.6641588296760711, "grad_norm": 1.4279396923507528, "learning_rate": 1.9660910838560172e-05, "loss": 0.2211, "step": 3178 }, { "epoch": 0.664367816091954, "grad_norm": 1.265344192208234, "learning_rate": 1.9660619494424612e-05, "loss": 0.1986, "step": 3179 }, { "epoch": 0.664576802507837, "grad_norm": 1.5021113247201512, "learning_rate": 1.96603280273424e-05, "loss": 0.2495, "step": 3180 }, { "epoch": 0.66478578892372, "grad_norm": 1.301612616955135, "learning_rate": 1.9660036437317246e-05, "loss": 0.2424, "step": 3181 }, { "epoch": 0.6649947753396029, "grad_norm": 1.474296787956183, "learning_rate": 1.9659744724352864e-05, "loss": 0.2279, "step": 3182 }, { "epoch": 0.6652037617554859, "grad_norm": 1.322903356029651, "learning_rate": 1.9659452888452962e-05, "loss": 0.2201, "step": 3183 }, { "epoch": 0.6654127481713689, "grad_norm": 1.617525721211301, "learning_rate": 1.965916092962126e-05, "loss": 0.2545, "step": 3184 }, { "epoch": 0.6656217345872518, "grad_norm": 1.4538968227626363, "learning_rate": 1.9658868847861466e-05, "loss": 0.2296, "step": 3185 }, { "epoch": 0.6658307210031348, "grad_norm": 1.3674610895344397, "learning_rate": 1.96585766431773e-05, "loss": 0.2206, "step": 3186 }, { "epoch": 0.6660397074190177, "grad_norm": 1.3114294209596866, "learning_rate": 1.9658284315572486e-05, "loss": 0.2288, "step": 3187 }, { "epoch": 0.6662486938349007, "grad_norm": 1.4997080763726587, "learning_rate": 1.965799186505074e-05, "loss": 0.1997, "step": 3188 }, { "epoch": 0.6664576802507837, "grad_norm": 1.3643795134787062, "learning_rate": 1.9657699291615785e-05, "loss": 0.1866, "step": 3189 }, { "epoch": 0.6666666666666666, "grad_norm": 1.4105344995318483, "learning_rate": 1.9657406595271342e-05, "loss": 0.2432, "step": 3190 }, { "epoch": 0.6668756530825496, "grad_norm": 1.4200032145976962, "learning_rate": 1.9657113776021138e-05, "loss": 0.2303, "step": 3191 }, { "epoch": 0.6670846394984326, "grad_norm": 1.0850717556329357, "learning_rate": 1.9656820833868903e-05, "loss": 0.2087, "step": 3192 }, { "epoch": 0.6672936259143156, "grad_norm": 1.2771771949002069, "learning_rate": 1.965652776881836e-05, "loss": 0.2455, "step": 3193 }, { "epoch": 0.6675026123301986, "grad_norm": 1.4262436896236668, "learning_rate": 1.9656234580873236e-05, "loss": 0.2207, "step": 3194 }, { "epoch": 0.6677115987460815, "grad_norm": 1.215587903814805, "learning_rate": 1.965594127003727e-05, "loss": 0.2297, "step": 3195 }, { "epoch": 0.6679205851619645, "grad_norm": 1.195200102670048, "learning_rate": 1.965564783631419e-05, "loss": 0.2292, "step": 3196 }, { "epoch": 0.6681295715778475, "grad_norm": 1.04517270720991, "learning_rate": 1.9655354279707734e-05, "loss": 0.2213, "step": 3197 }, { "epoch": 0.6683385579937304, "grad_norm": 1.5170704017225372, "learning_rate": 1.9655060600221632e-05, "loss": 0.2306, "step": 3198 }, { "epoch": 0.6685475444096134, "grad_norm": 1.1240984615653176, "learning_rate": 1.965476679785963e-05, "loss": 0.202, "step": 3199 }, { "epoch": 0.6687565308254964, "grad_norm": 1.4565310876270698, "learning_rate": 1.965447287262546e-05, "loss": 0.2474, "step": 3200 }, { "epoch": 0.6689655172413793, "grad_norm": 1.1144201862012146, "learning_rate": 1.9654178824522866e-05, "loss": 0.2267, "step": 3201 }, { "epoch": 0.6691745036572623, "grad_norm": 1.195505829385479, "learning_rate": 1.9653884653555587e-05, "loss": 0.219, "step": 3202 }, { "epoch": 0.6693834900731452, "grad_norm": 1.1454290578490913, "learning_rate": 1.965359035972737e-05, "loss": 0.2176, "step": 3203 }, { "epoch": 0.6695924764890282, "grad_norm": 1.4836653818811765, "learning_rate": 1.9653295943041963e-05, "loss": 0.239, "step": 3204 }, { "epoch": 0.6698014629049112, "grad_norm": 1.2184612907590988, "learning_rate": 1.9653001403503107e-05, "loss": 0.2096, "step": 3205 }, { "epoch": 0.6700104493207941, "grad_norm": 1.310031566299331, "learning_rate": 1.9652706741114552e-05, "loss": 0.2154, "step": 3206 }, { "epoch": 0.6702194357366771, "grad_norm": 1.2836936641408545, "learning_rate": 1.9652411955880047e-05, "loss": 0.1914, "step": 3207 }, { "epoch": 0.67042842215256, "grad_norm": 1.2981607063206546, "learning_rate": 1.965211704780335e-05, "loss": 0.2211, "step": 3208 }, { "epoch": 0.670637408568443, "grad_norm": 1.544441064927114, "learning_rate": 1.9651822016888208e-05, "loss": 0.2671, "step": 3209 }, { "epoch": 0.670846394984326, "grad_norm": 1.396430360150106, "learning_rate": 1.965152686313838e-05, "loss": 0.2204, "step": 3210 }, { "epoch": 0.671055381400209, "grad_norm": 1.1612407656418549, "learning_rate": 1.9651231586557615e-05, "loss": 0.2177, "step": 3211 }, { "epoch": 0.671264367816092, "grad_norm": 1.7264674388292203, "learning_rate": 1.9650936187149674e-05, "loss": 0.2037, "step": 3212 }, { "epoch": 0.671473354231975, "grad_norm": 1.2751885324609296, "learning_rate": 1.965064066491832e-05, "loss": 0.1882, "step": 3213 }, { "epoch": 0.6716823406478579, "grad_norm": 0.9645633270468936, "learning_rate": 1.9650345019867317e-05, "loss": 0.2179, "step": 3214 }, { "epoch": 0.6718913270637409, "grad_norm": 1.392188323151815, "learning_rate": 1.9650049252000417e-05, "loss": 0.2178, "step": 3215 }, { "epoch": 0.6721003134796238, "grad_norm": 1.2825000358661254, "learning_rate": 1.964975336132139e-05, "loss": 0.241, "step": 3216 }, { "epoch": 0.6723092998955068, "grad_norm": 1.2697195439472375, "learning_rate": 1.9649457347834003e-05, "loss": 0.243, "step": 3217 }, { "epoch": 0.6725182863113898, "grad_norm": 1.3263949440244882, "learning_rate": 1.9649161211542025e-05, "loss": 0.203, "step": 3218 }, { "epoch": 0.6727272727272727, "grad_norm": 1.357750508484597, "learning_rate": 1.9648864952449215e-05, "loss": 0.2227, "step": 3219 }, { "epoch": 0.6729362591431557, "grad_norm": 1.1823001427323359, "learning_rate": 1.9648568570559353e-05, "loss": 0.2144, "step": 3220 }, { "epoch": 0.6731452455590387, "grad_norm": 1.1920051107624159, "learning_rate": 1.9648272065876205e-05, "loss": 0.2233, "step": 3221 }, { "epoch": 0.6733542319749216, "grad_norm": 1.3978303467624678, "learning_rate": 1.964797543840355e-05, "loss": 0.2553, "step": 3222 }, { "epoch": 0.6735632183908046, "grad_norm": 1.3973777203376434, "learning_rate": 1.9647678688145163e-05, "loss": 0.2388, "step": 3223 }, { "epoch": 0.6737722048066875, "grad_norm": 1.3450297591319265, "learning_rate": 1.9647381815104812e-05, "loss": 0.2397, "step": 3224 }, { "epoch": 0.6739811912225705, "grad_norm": 1.273634612742368, "learning_rate": 1.9647084819286282e-05, "loss": 0.2371, "step": 3225 }, { "epoch": 0.6741901776384535, "grad_norm": 1.38566196238478, "learning_rate": 1.9646787700693356e-05, "loss": 0.2181, "step": 3226 }, { "epoch": 0.6743991640543364, "grad_norm": 1.2366020737696966, "learning_rate": 1.9646490459329808e-05, "loss": 0.2043, "step": 3227 }, { "epoch": 0.6746081504702194, "grad_norm": 1.1710705406562256, "learning_rate": 1.964619309519942e-05, "loss": 0.238, "step": 3228 }, { "epoch": 0.6748171368861025, "grad_norm": 1.0919532127330254, "learning_rate": 1.964589560830599e-05, "loss": 0.1928, "step": 3229 }, { "epoch": 0.6750261233019854, "grad_norm": 1.2547276936058198, "learning_rate": 1.9645597998653284e-05, "loss": 0.1808, "step": 3230 }, { "epoch": 0.6752351097178684, "grad_norm": 1.234910268062454, "learning_rate": 1.9645300266245108e-05, "loss": 0.2302, "step": 3231 }, { "epoch": 0.6754440961337513, "grad_norm": 1.3411747538569616, "learning_rate": 1.9645002411085236e-05, "loss": 0.2289, "step": 3232 }, { "epoch": 0.6756530825496343, "grad_norm": 1.9202580418191075, "learning_rate": 1.964470443317747e-05, "loss": 0.2472, "step": 3233 }, { "epoch": 0.6758620689655173, "grad_norm": 1.225642093420928, "learning_rate": 1.9644406332525597e-05, "loss": 0.2608, "step": 3234 }, { "epoch": 0.6760710553814002, "grad_norm": 1.5793211335778907, "learning_rate": 1.964410810913341e-05, "loss": 0.1983, "step": 3235 }, { "epoch": 0.6762800417972832, "grad_norm": 1.502208231798803, "learning_rate": 1.9643809763004712e-05, "loss": 0.2222, "step": 3236 }, { "epoch": 0.6764890282131661, "grad_norm": 1.28049097194226, "learning_rate": 1.964351129414329e-05, "loss": 0.2132, "step": 3237 }, { "epoch": 0.6766980146290491, "grad_norm": 1.2858293288235476, "learning_rate": 1.9643212702552946e-05, "loss": 0.2069, "step": 3238 }, { "epoch": 0.6769070010449321, "grad_norm": 1.132542415020488, "learning_rate": 1.964291398823748e-05, "loss": 0.2494, "step": 3239 }, { "epoch": 0.677115987460815, "grad_norm": 1.437135112734316, "learning_rate": 1.9642615151200695e-05, "loss": 0.2131, "step": 3240 }, { "epoch": 0.677324973876698, "grad_norm": 1.3604485053995103, "learning_rate": 1.9642316191446394e-05, "loss": 0.2368, "step": 3241 }, { "epoch": 0.677533960292581, "grad_norm": 1.2034261728798028, "learning_rate": 1.964201710897838e-05, "loss": 0.2216, "step": 3242 }, { "epoch": 0.6777429467084639, "grad_norm": 1.2715132844338317, "learning_rate": 1.964171790380046e-05, "loss": 0.1969, "step": 3243 }, { "epoch": 0.6779519331243469, "grad_norm": 1.4541420917287886, "learning_rate": 1.964141857591644e-05, "loss": 0.198, "step": 3244 }, { "epoch": 0.6781609195402298, "grad_norm": 1.4576536502030573, "learning_rate": 1.964111912533014e-05, "loss": 0.2501, "step": 3245 }, { "epoch": 0.6783699059561129, "grad_norm": 1.336250884324273, "learning_rate": 1.9640819552045357e-05, "loss": 0.2132, "step": 3246 }, { "epoch": 0.6785788923719959, "grad_norm": 1.2453547395524163, "learning_rate": 1.9640519856065908e-05, "loss": 0.2418, "step": 3247 }, { "epoch": 0.6787878787878788, "grad_norm": 1.2560878550126948, "learning_rate": 1.964022003739561e-05, "loss": 0.2455, "step": 3248 }, { "epoch": 0.6789968652037618, "grad_norm": 1.2252679167998197, "learning_rate": 1.9639920096038276e-05, "loss": 0.2271, "step": 3249 }, { "epoch": 0.6792058516196448, "grad_norm": 1.279186512771691, "learning_rate": 1.9639620031997726e-05, "loss": 0.2009, "step": 3250 }, { "epoch": 0.6794148380355277, "grad_norm": 1.7454150782893108, "learning_rate": 1.9639319845277775e-05, "loss": 0.1944, "step": 3251 }, { "epoch": 0.6796238244514107, "grad_norm": 1.0646082507593855, "learning_rate": 1.9639019535882246e-05, "loss": 0.222, "step": 3252 }, { "epoch": 0.6798328108672936, "grad_norm": 1.2213729394353952, "learning_rate": 1.963871910381496e-05, "loss": 0.2188, "step": 3253 }, { "epoch": 0.6800417972831766, "grad_norm": 1.237886840000035, "learning_rate": 1.9638418549079744e-05, "loss": 0.21, "step": 3254 }, { "epoch": 0.6802507836990596, "grad_norm": 1.3977954471931124, "learning_rate": 1.9638117871680414e-05, "loss": 0.2485, "step": 3255 }, { "epoch": 0.6804597701149425, "grad_norm": 1.2089242645515794, "learning_rate": 1.9637817071620807e-05, "loss": 0.2387, "step": 3256 }, { "epoch": 0.6806687565308255, "grad_norm": 1.225112850165252, "learning_rate": 1.9637516148904743e-05, "loss": 0.2287, "step": 3257 }, { "epoch": 0.6808777429467084, "grad_norm": 1.1161679678726182, "learning_rate": 1.963721510353606e-05, "loss": 0.1978, "step": 3258 }, { "epoch": 0.6810867293625914, "grad_norm": 1.4182560876376502, "learning_rate": 1.9636913935518577e-05, "loss": 0.2159, "step": 3259 }, { "epoch": 0.6812957157784744, "grad_norm": 1.1892561341251355, "learning_rate": 1.9636612644856138e-05, "loss": 0.2301, "step": 3260 }, { "epoch": 0.6815047021943573, "grad_norm": 1.1622730347627195, "learning_rate": 1.9636311231552577e-05, "loss": 0.2326, "step": 3261 }, { "epoch": 0.6817136886102403, "grad_norm": 1.2141262807754591, "learning_rate": 1.9636009695611724e-05, "loss": 0.2471, "step": 3262 }, { "epoch": 0.6819226750261232, "grad_norm": 1.1454237947496204, "learning_rate": 1.9635708037037418e-05, "loss": 0.2295, "step": 3263 }, { "epoch": 0.6821316614420063, "grad_norm": 1.4082954784634638, "learning_rate": 1.96354062558335e-05, "loss": 0.2042, "step": 3264 }, { "epoch": 0.6823406478578893, "grad_norm": 1.3278286855702752, "learning_rate": 1.963510435200381e-05, "loss": 0.2303, "step": 3265 }, { "epoch": 0.6825496342737722, "grad_norm": 1.253822680601097, "learning_rate": 1.963480232555219e-05, "loss": 0.2437, "step": 3266 }, { "epoch": 0.6827586206896552, "grad_norm": 1.2629738406646438, "learning_rate": 1.9634500176482486e-05, "loss": 0.2365, "step": 3267 }, { "epoch": 0.6829676071055382, "grad_norm": 1.5213749905587246, "learning_rate": 1.963419790479854e-05, "loss": 0.2787, "step": 3268 }, { "epoch": 0.6831765935214211, "grad_norm": 1.295002409434365, "learning_rate": 1.96338955105042e-05, "loss": 0.225, "step": 3269 }, { "epoch": 0.6833855799373041, "grad_norm": 1.2339233240998317, "learning_rate": 1.963359299360331e-05, "loss": 0.2244, "step": 3270 }, { "epoch": 0.683594566353187, "grad_norm": 1.3477630982044488, "learning_rate": 1.963329035409973e-05, "loss": 0.2122, "step": 3271 }, { "epoch": 0.68380355276907, "grad_norm": 1.2134395148677726, "learning_rate": 1.9632987591997305e-05, "loss": 0.234, "step": 3272 }, { "epoch": 0.684012539184953, "grad_norm": 1.3486245455281183, "learning_rate": 1.963268470729989e-05, "loss": 0.2184, "step": 3273 }, { "epoch": 0.6842215256008359, "grad_norm": 1.0567782075287584, "learning_rate": 1.9632381700011338e-05, "loss": 0.2345, "step": 3274 }, { "epoch": 0.6844305120167189, "grad_norm": 1.2344834317826279, "learning_rate": 1.9632078570135506e-05, "loss": 0.2157, "step": 3275 }, { "epoch": 0.6846394984326019, "grad_norm": 1.2474777428911044, "learning_rate": 1.963177531767625e-05, "loss": 0.2241, "step": 3276 }, { "epoch": 0.6848484848484848, "grad_norm": 1.47220532980638, "learning_rate": 1.9631471942637435e-05, "loss": 0.1951, "step": 3277 }, { "epoch": 0.6850574712643678, "grad_norm": 1.2750214478653614, "learning_rate": 1.9631168445022916e-05, "loss": 0.2046, "step": 3278 }, { "epoch": 0.6852664576802507, "grad_norm": 1.144098105607682, "learning_rate": 1.963086482483656e-05, "loss": 0.2393, "step": 3279 }, { "epoch": 0.6854754440961337, "grad_norm": 1.1632975585745653, "learning_rate": 1.9630561082082224e-05, "loss": 0.2433, "step": 3280 }, { "epoch": 0.6856844305120168, "grad_norm": 1.3587037438745202, "learning_rate": 1.9630257216763785e-05, "loss": 0.2082, "step": 3281 }, { "epoch": 0.6858934169278997, "grad_norm": 1.3528448799248856, "learning_rate": 1.96299532288851e-05, "loss": 0.2466, "step": 3282 }, { "epoch": 0.6861024033437827, "grad_norm": 1.150337921479494, "learning_rate": 1.962964911845004e-05, "loss": 0.1865, "step": 3283 }, { "epoch": 0.6863113897596657, "grad_norm": 1.337862848815007, "learning_rate": 1.962934488546248e-05, "loss": 0.215, "step": 3284 }, { "epoch": 0.6865203761755486, "grad_norm": 1.2532342637988314, "learning_rate": 1.9629040529926286e-05, "loss": 0.2468, "step": 3285 }, { "epoch": 0.6867293625914316, "grad_norm": 1.5390882835797153, "learning_rate": 1.9628736051845334e-05, "loss": 0.2033, "step": 3286 }, { "epoch": 0.6869383490073145, "grad_norm": 1.2887371151399756, "learning_rate": 1.96284314512235e-05, "loss": 0.2102, "step": 3287 }, { "epoch": 0.6871473354231975, "grad_norm": 1.2784475007137062, "learning_rate": 1.962812672806466e-05, "loss": 0.2012, "step": 3288 }, { "epoch": 0.6873563218390805, "grad_norm": 1.1829792253587792, "learning_rate": 1.962782188237269e-05, "loss": 0.2328, "step": 3289 }, { "epoch": 0.6875653082549634, "grad_norm": 1.3529446805576848, "learning_rate": 1.962751691415147e-05, "loss": 0.2159, "step": 3290 }, { "epoch": 0.6877742946708464, "grad_norm": 1.1329448165385974, "learning_rate": 1.9627211823404883e-05, "loss": 0.2066, "step": 3291 }, { "epoch": 0.6879832810867293, "grad_norm": 1.0733924720109735, "learning_rate": 1.9626906610136812e-05, "loss": 0.2164, "step": 3292 }, { "epoch": 0.6881922675026123, "grad_norm": 1.2170189138599623, "learning_rate": 1.9626601274351138e-05, "loss": 0.1974, "step": 3293 }, { "epoch": 0.6884012539184953, "grad_norm": 1.0073287200151115, "learning_rate": 1.962629581605175e-05, "loss": 0.2107, "step": 3294 }, { "epoch": 0.6886102403343782, "grad_norm": 1.2591815950930816, "learning_rate": 1.9625990235242534e-05, "loss": 0.2496, "step": 3295 }, { "epoch": 0.6888192267502612, "grad_norm": 1.262370432173909, "learning_rate": 1.9625684531927384e-05, "loss": 0.2486, "step": 3296 }, { "epoch": 0.6890282131661442, "grad_norm": 1.1549559561002742, "learning_rate": 1.9625378706110182e-05, "loss": 0.206, "step": 3297 }, { "epoch": 0.6892371995820271, "grad_norm": 1.2534939518382295, "learning_rate": 1.9625072757794824e-05, "loss": 0.2214, "step": 3298 }, { "epoch": 0.6894461859979102, "grad_norm": 1.1246439914490052, "learning_rate": 1.9624766686985207e-05, "loss": 0.2168, "step": 3299 }, { "epoch": 0.6896551724137931, "grad_norm": 1.159885367940108, "learning_rate": 1.9624460493685224e-05, "loss": 0.2064, "step": 3300 }, { "epoch": 0.6898641588296761, "grad_norm": 1.3251058112028098, "learning_rate": 1.9624154177898767e-05, "loss": 0.1874, "step": 3301 }, { "epoch": 0.6900731452455591, "grad_norm": 1.4096824723222172, "learning_rate": 1.962384773962974e-05, "loss": 0.2727, "step": 3302 }, { "epoch": 0.690282131661442, "grad_norm": 1.2763962164737526, "learning_rate": 1.962354117888204e-05, "loss": 0.1991, "step": 3303 }, { "epoch": 0.690491118077325, "grad_norm": 1.3671886735714116, "learning_rate": 1.9623234495659573e-05, "loss": 0.2323, "step": 3304 }, { "epoch": 0.690700104493208, "grad_norm": 1.2630223863961154, "learning_rate": 1.9622927689966237e-05, "loss": 0.2186, "step": 3305 }, { "epoch": 0.6909090909090909, "grad_norm": 1.391249902288993, "learning_rate": 1.962262076180594e-05, "loss": 0.2232, "step": 3306 }, { "epoch": 0.6911180773249739, "grad_norm": 1.3147594205289597, "learning_rate": 1.9622313711182587e-05, "loss": 0.2126, "step": 3307 }, { "epoch": 0.6913270637408568, "grad_norm": 2.3259025896451426, "learning_rate": 1.962200653810008e-05, "loss": 0.2227, "step": 3308 }, { "epoch": 0.6915360501567398, "grad_norm": 1.101862853519154, "learning_rate": 1.9621699242562338e-05, "loss": 0.2219, "step": 3309 }, { "epoch": 0.6917450365726228, "grad_norm": 1.1505791531536809, "learning_rate": 1.9621391824573266e-05, "loss": 0.2198, "step": 3310 }, { "epoch": 0.6919540229885057, "grad_norm": 1.1859589149574972, "learning_rate": 1.962108428413678e-05, "loss": 0.2252, "step": 3311 }, { "epoch": 0.6921630094043887, "grad_norm": 1.2775192761933698, "learning_rate": 1.9620776621256787e-05, "loss": 0.1911, "step": 3312 }, { "epoch": 0.6923719958202716, "grad_norm": 1.3683383704056191, "learning_rate": 1.962046883593721e-05, "loss": 0.2079, "step": 3313 }, { "epoch": 0.6925809822361546, "grad_norm": 1.3977895743709008, "learning_rate": 1.9620160928181963e-05, "loss": 0.2356, "step": 3314 }, { "epoch": 0.6927899686520376, "grad_norm": 1.1445279857607658, "learning_rate": 1.9619852897994963e-05, "loss": 0.2026, "step": 3315 }, { "epoch": 0.6929989550679205, "grad_norm": 1.023917148895235, "learning_rate": 1.9619544745380135e-05, "loss": 0.2083, "step": 3316 }, { "epoch": 0.6932079414838036, "grad_norm": 1.19441731278042, "learning_rate": 1.9619236470341395e-05, "loss": 0.231, "step": 3317 }, { "epoch": 0.6934169278996866, "grad_norm": 1.155727209333543, "learning_rate": 1.961892807288267e-05, "loss": 0.2169, "step": 3318 }, { "epoch": 0.6936259143155695, "grad_norm": 1.32145650727475, "learning_rate": 1.9618619553007886e-05, "loss": 0.2192, "step": 3319 }, { "epoch": 0.6938349007314525, "grad_norm": 1.3509538054203167, "learning_rate": 1.9618310910720967e-05, "loss": 0.2286, "step": 3320 }, { "epoch": 0.6940438871473354, "grad_norm": 1.1707362100130345, "learning_rate": 1.9618002146025837e-05, "loss": 0.2307, "step": 3321 }, { "epoch": 0.6942528735632184, "grad_norm": 1.0658499258040623, "learning_rate": 1.9617693258926433e-05, "loss": 0.2134, "step": 3322 }, { "epoch": 0.6944618599791014, "grad_norm": 1.3334220106724817, "learning_rate": 1.9617384249426682e-05, "loss": 0.1915, "step": 3323 }, { "epoch": 0.6946708463949843, "grad_norm": 1.4452820222044367, "learning_rate": 1.9617075117530518e-05, "loss": 0.1959, "step": 3324 }, { "epoch": 0.6948798328108673, "grad_norm": 1.2661146781560697, "learning_rate": 1.9616765863241874e-05, "loss": 0.1992, "step": 3325 }, { "epoch": 0.6950888192267503, "grad_norm": 1.119090769852423, "learning_rate": 1.9616456486564687e-05, "loss": 0.1948, "step": 3326 }, { "epoch": 0.6952978056426332, "grad_norm": 1.3875281890671736, "learning_rate": 1.961614698750289e-05, "loss": 0.2334, "step": 3327 }, { "epoch": 0.6955067920585162, "grad_norm": 1.1805976856118925, "learning_rate": 1.961583736606043e-05, "loss": 0.2195, "step": 3328 }, { "epoch": 0.6957157784743991, "grad_norm": 1.1678057392533547, "learning_rate": 1.9615527622241244e-05, "loss": 0.2099, "step": 3329 }, { "epoch": 0.6959247648902821, "grad_norm": 1.3031313523571082, "learning_rate": 1.961521775604927e-05, "loss": 0.2237, "step": 3330 }, { "epoch": 0.6961337513061651, "grad_norm": 1.4053513424565616, "learning_rate": 1.9614907767488456e-05, "loss": 0.215, "step": 3331 }, { "epoch": 0.696342737722048, "grad_norm": 1.4471279635980503, "learning_rate": 1.9614597656562744e-05, "loss": 0.2283, "step": 3332 }, { "epoch": 0.696551724137931, "grad_norm": 1.360640180615919, "learning_rate": 1.9614287423276083e-05, "loss": 0.207, "step": 3333 }, { "epoch": 0.696760710553814, "grad_norm": 1.0203624508807518, "learning_rate": 1.9613977067632422e-05, "loss": 0.1973, "step": 3334 }, { "epoch": 0.696969696969697, "grad_norm": 1.1396450903782813, "learning_rate": 1.9613666589635706e-05, "loss": 0.1676, "step": 3335 }, { "epoch": 0.69717868338558, "grad_norm": 1.1896393223195374, "learning_rate": 1.9613355989289892e-05, "loss": 0.198, "step": 3336 }, { "epoch": 0.6973876698014629, "grad_norm": 1.337976715117241, "learning_rate": 1.9613045266598927e-05, "loss": 0.1845, "step": 3337 }, { "epoch": 0.6975966562173459, "grad_norm": 1.3052791156916144, "learning_rate": 1.9612734421566776e-05, "loss": 0.2641, "step": 3338 }, { "epoch": 0.6978056426332289, "grad_norm": 1.1812784648271648, "learning_rate": 1.961242345419738e-05, "loss": 0.221, "step": 3339 }, { "epoch": 0.6980146290491118, "grad_norm": 1.4324539233383375, "learning_rate": 1.9612112364494707e-05, "loss": 0.2359, "step": 3340 }, { "epoch": 0.6982236154649948, "grad_norm": 1.187692028354174, "learning_rate": 1.9611801152462715e-05, "loss": 0.2601, "step": 3341 }, { "epoch": 0.6984326018808777, "grad_norm": 1.1732729866910605, "learning_rate": 1.9611489818105363e-05, "loss": 0.2283, "step": 3342 }, { "epoch": 0.6986415882967607, "grad_norm": 1.172649443024235, "learning_rate": 1.9611178361426613e-05, "loss": 0.2061, "step": 3343 }, { "epoch": 0.6988505747126437, "grad_norm": 1.3124518415044721, "learning_rate": 1.9610866782430428e-05, "loss": 0.2238, "step": 3344 }, { "epoch": 0.6990595611285266, "grad_norm": 1.493247030556916, "learning_rate": 1.9610555081120774e-05, "loss": 0.2352, "step": 3345 }, { "epoch": 0.6992685475444096, "grad_norm": 1.1889673719640415, "learning_rate": 1.9610243257501623e-05, "loss": 0.2274, "step": 3346 }, { "epoch": 0.6994775339602926, "grad_norm": 1.152533769025284, "learning_rate": 1.9609931311576934e-05, "loss": 0.2509, "step": 3347 }, { "epoch": 0.6996865203761755, "grad_norm": 1.2657256027699022, "learning_rate": 1.9609619243350683e-05, "loss": 0.2145, "step": 3348 }, { "epoch": 0.6998955067920585, "grad_norm": 1.324991196283169, "learning_rate": 1.960930705282684e-05, "loss": 0.2271, "step": 3349 }, { "epoch": 0.7001044932079414, "grad_norm": 1.4073316731089156, "learning_rate": 1.960899474000938e-05, "loss": 0.2083, "step": 3350 }, { "epoch": 0.7003134796238244, "grad_norm": 1.2197725496675238, "learning_rate": 1.9608682304902274e-05, "loss": 0.2259, "step": 3351 }, { "epoch": 0.7005224660397075, "grad_norm": 1.2185929041455212, "learning_rate": 1.9608369747509502e-05, "loss": 0.2244, "step": 3352 }, { "epoch": 0.7007314524555904, "grad_norm": 1.1996945483997095, "learning_rate": 1.960805706783504e-05, "loss": 0.2386, "step": 3353 }, { "epoch": 0.7009404388714734, "grad_norm": 1.1043866434768956, "learning_rate": 1.960774426588287e-05, "loss": 0.2092, "step": 3354 }, { "epoch": 0.7011494252873564, "grad_norm": 1.3364907568491247, "learning_rate": 1.9607431341656966e-05, "loss": 0.2108, "step": 3355 }, { "epoch": 0.7013584117032393, "grad_norm": 1.2518973855198265, "learning_rate": 1.960711829516132e-05, "loss": 0.2138, "step": 3356 }, { "epoch": 0.7015673981191223, "grad_norm": 1.1155645821406992, "learning_rate": 1.9606805126399905e-05, "loss": 0.2128, "step": 3357 }, { "epoch": 0.7017763845350052, "grad_norm": 1.4303926137299279, "learning_rate": 1.9606491835376715e-05, "loss": 0.2318, "step": 3358 }, { "epoch": 0.7019853709508882, "grad_norm": 1.2190710494948716, "learning_rate": 1.9606178422095737e-05, "loss": 0.237, "step": 3359 }, { "epoch": 0.7021943573667712, "grad_norm": 1.2296785964251402, "learning_rate": 1.9605864886560956e-05, "loss": 0.2191, "step": 3360 }, { "epoch": 0.7024033437826541, "grad_norm": 1.2009897538119447, "learning_rate": 1.9605551228776362e-05, "loss": 0.2181, "step": 3361 }, { "epoch": 0.7026123301985371, "grad_norm": 1.1467268514354643, "learning_rate": 1.9605237448745947e-05, "loss": 0.2036, "step": 3362 }, { "epoch": 0.70282131661442, "grad_norm": 1.2415886875461184, "learning_rate": 1.960492354647371e-05, "loss": 0.2193, "step": 3363 }, { "epoch": 0.703030303030303, "grad_norm": 1.4694467841486354, "learning_rate": 1.960460952196364e-05, "loss": 0.2196, "step": 3364 }, { "epoch": 0.703239289446186, "grad_norm": 1.0752421111753, "learning_rate": 1.9604295375219733e-05, "loss": 0.2232, "step": 3365 }, { "epoch": 0.7034482758620689, "grad_norm": 1.3403908144793673, "learning_rate": 1.960398110624599e-05, "loss": 0.229, "step": 3366 }, { "epoch": 0.7036572622779519, "grad_norm": 1.1659391375570933, "learning_rate": 1.960366671504641e-05, "loss": 0.1843, "step": 3367 }, { "epoch": 0.7038662486938349, "grad_norm": 1.1595312074541897, "learning_rate": 1.9603352201624993e-05, "loss": 0.2147, "step": 3368 }, { "epoch": 0.7040752351097179, "grad_norm": 1.2968574143019334, "learning_rate": 1.9603037565985745e-05, "loss": 0.2366, "step": 3369 }, { "epoch": 0.7042842215256009, "grad_norm": 1.3003247078270863, "learning_rate": 1.9602722808132666e-05, "loss": 0.1946, "step": 3370 }, { "epoch": 0.7044932079414838, "grad_norm": 1.2627119013155457, "learning_rate": 1.9602407928069765e-05, "loss": 0.2191, "step": 3371 }, { "epoch": 0.7047021943573668, "grad_norm": 1.3130970659547088, "learning_rate": 1.9602092925801043e-05, "loss": 0.2217, "step": 3372 }, { "epoch": 0.7049111807732498, "grad_norm": 1.6644806330245439, "learning_rate": 1.960177780133052e-05, "loss": 0.2173, "step": 3373 }, { "epoch": 0.7051201671891327, "grad_norm": 1.2151204465922547, "learning_rate": 1.9601462554662193e-05, "loss": 0.23, "step": 3374 }, { "epoch": 0.7053291536050157, "grad_norm": 1.266856673249371, "learning_rate": 1.960114718580009e-05, "loss": 0.2605, "step": 3375 }, { "epoch": 0.7055381400208987, "grad_norm": 1.237317264028637, "learning_rate": 1.960083169474821e-05, "loss": 0.1979, "step": 3376 }, { "epoch": 0.7057471264367816, "grad_norm": 1.2914567486765138, "learning_rate": 1.9600516081510574e-05, "loss": 0.2325, "step": 3377 }, { "epoch": 0.7059561128526646, "grad_norm": 1.280466949334343, "learning_rate": 1.9600200346091197e-05, "loss": 0.2229, "step": 3378 }, { "epoch": 0.7061650992685475, "grad_norm": 1.7419914697195105, "learning_rate": 1.95998844884941e-05, "loss": 0.2457, "step": 3379 }, { "epoch": 0.7063740856844305, "grad_norm": 1.0802001992375165, "learning_rate": 1.9599568508723303e-05, "loss": 0.2262, "step": 3380 }, { "epoch": 0.7065830721003135, "grad_norm": 1.2745945145980384, "learning_rate": 1.9599252406782825e-05, "loss": 0.252, "step": 3381 }, { "epoch": 0.7067920585161964, "grad_norm": 1.3879389570737808, "learning_rate": 1.959893618267669e-05, "loss": 0.2381, "step": 3382 }, { "epoch": 0.7070010449320794, "grad_norm": 1.3488610367032066, "learning_rate": 1.9598619836408926e-05, "loss": 0.1956, "step": 3383 }, { "epoch": 0.7072100313479623, "grad_norm": 1.3453551543896896, "learning_rate": 1.959830336798355e-05, "loss": 0.2289, "step": 3384 }, { "epoch": 0.7074190177638453, "grad_norm": 1.317770035280253, "learning_rate": 1.9597986777404595e-05, "loss": 0.2277, "step": 3385 }, { "epoch": 0.7076280041797283, "grad_norm": 1.1232264174833906, "learning_rate": 1.959767006467609e-05, "loss": 0.2039, "step": 3386 }, { "epoch": 0.7078369905956113, "grad_norm": 1.2949049313073466, "learning_rate": 1.959735322980207e-05, "loss": 0.2343, "step": 3387 }, { "epoch": 0.7080459770114943, "grad_norm": 1.0812979206570394, "learning_rate": 1.9597036272786558e-05, "loss": 0.1865, "step": 3388 }, { "epoch": 0.7082549634273773, "grad_norm": 1.132384330455132, "learning_rate": 1.9596719193633595e-05, "loss": 0.2004, "step": 3389 }, { "epoch": 0.7084639498432602, "grad_norm": 1.279292704656917, "learning_rate": 1.9596401992347213e-05, "loss": 0.2231, "step": 3390 }, { "epoch": 0.7086729362591432, "grad_norm": 1.2153842242627872, "learning_rate": 1.959608466893145e-05, "loss": 0.2057, "step": 3391 }, { "epoch": 0.7088819226750261, "grad_norm": 1.150186142882685, "learning_rate": 1.9595767223390343e-05, "loss": 0.1797, "step": 3392 }, { "epoch": 0.7090909090909091, "grad_norm": 1.282969511232101, "learning_rate": 1.9595449655727937e-05, "loss": 0.2135, "step": 3393 }, { "epoch": 0.7092998955067921, "grad_norm": 1.151879405919897, "learning_rate": 1.9595131965948265e-05, "loss": 0.2334, "step": 3394 }, { "epoch": 0.709508881922675, "grad_norm": 1.3842599695537097, "learning_rate": 1.959481415405538e-05, "loss": 0.2282, "step": 3395 }, { "epoch": 0.709717868338558, "grad_norm": 1.4160042792451115, "learning_rate": 1.9594496220053318e-05, "loss": 0.2313, "step": 3396 }, { "epoch": 0.709926854754441, "grad_norm": 1.041130643698015, "learning_rate": 1.959417816394613e-05, "loss": 0.2482, "step": 3397 }, { "epoch": 0.7101358411703239, "grad_norm": 1.1946190928438656, "learning_rate": 1.9593859985737862e-05, "loss": 0.1968, "step": 3398 }, { "epoch": 0.7103448275862069, "grad_norm": 1.4586696439635818, "learning_rate": 1.9593541685432563e-05, "loss": 0.2178, "step": 3399 }, { "epoch": 0.7105538140020898, "grad_norm": 1.3608972094863214, "learning_rate": 1.9593223263034288e-05, "loss": 0.2301, "step": 3400 }, { "epoch": 0.7107628004179728, "grad_norm": 1.355619349494032, "learning_rate": 1.959290471854708e-05, "loss": 0.2206, "step": 3401 }, { "epoch": 0.7109717868338558, "grad_norm": 1.3189057409111793, "learning_rate": 1.9592586051975003e-05, "loss": 0.1966, "step": 3402 }, { "epoch": 0.7111807732497387, "grad_norm": 1.264482924917474, "learning_rate": 1.959226726332211e-05, "loss": 0.2183, "step": 3403 }, { "epoch": 0.7113897596656217, "grad_norm": 1.3114140198781832, "learning_rate": 1.9591948352592456e-05, "loss": 0.2447, "step": 3404 }, { "epoch": 0.7115987460815048, "grad_norm": 1.275334619089693, "learning_rate": 1.95916293197901e-05, "loss": 0.2378, "step": 3405 }, { "epoch": 0.7118077324973877, "grad_norm": 1.1345562640003521, "learning_rate": 1.95913101649191e-05, "loss": 0.2243, "step": 3406 }, { "epoch": 0.7120167189132707, "grad_norm": 1.1714490109088227, "learning_rate": 1.9590990887983526e-05, "loss": 0.2114, "step": 3407 }, { "epoch": 0.7122257053291536, "grad_norm": 1.341440662166064, "learning_rate": 1.959067148898743e-05, "loss": 0.2147, "step": 3408 }, { "epoch": 0.7124346917450366, "grad_norm": 1.1438714856962646, "learning_rate": 1.9590351967934887e-05, "loss": 0.1941, "step": 3409 }, { "epoch": 0.7126436781609196, "grad_norm": 1.1938443360965791, "learning_rate": 1.9590032324829957e-05, "loss": 0.2075, "step": 3410 }, { "epoch": 0.7128526645768025, "grad_norm": 0.9999771076503652, "learning_rate": 1.9589712559676707e-05, "loss": 0.1953, "step": 3411 }, { "epoch": 0.7130616509926855, "grad_norm": 1.1365727765048925, "learning_rate": 1.9589392672479214e-05, "loss": 0.2055, "step": 3412 }, { "epoch": 0.7132706374085684, "grad_norm": 1.2305150708908978, "learning_rate": 1.9589072663241545e-05, "loss": 0.1871, "step": 3413 }, { "epoch": 0.7134796238244514, "grad_norm": 1.130386575777831, "learning_rate": 1.958875253196777e-05, "loss": 0.2243, "step": 3414 }, { "epoch": 0.7136886102403344, "grad_norm": 1.180530978562692, "learning_rate": 1.9588432278661962e-05, "loss": 0.2338, "step": 3415 }, { "epoch": 0.7138975966562173, "grad_norm": 1.1341399484751455, "learning_rate": 1.9588111903328203e-05, "loss": 0.2247, "step": 3416 }, { "epoch": 0.7141065830721003, "grad_norm": 1.2102991516009085, "learning_rate": 1.958779140597057e-05, "loss": 0.1892, "step": 3417 }, { "epoch": 0.7143155694879832, "grad_norm": 1.2543436093054448, "learning_rate": 1.9587470786593138e-05, "loss": 0.1984, "step": 3418 }, { "epoch": 0.7145245559038662, "grad_norm": 1.3141520253629269, "learning_rate": 1.9587150045199986e-05, "loss": 0.2195, "step": 3419 }, { "epoch": 0.7147335423197492, "grad_norm": 1.2939409305212015, "learning_rate": 1.9586829181795198e-05, "loss": 0.2236, "step": 3420 }, { "epoch": 0.7149425287356321, "grad_norm": 1.3629296377202134, "learning_rate": 1.958650819638286e-05, "loss": 0.2205, "step": 3421 }, { "epoch": 0.7151515151515152, "grad_norm": 1.1981083616807346, "learning_rate": 1.9586187088967055e-05, "loss": 0.2166, "step": 3422 }, { "epoch": 0.7153605015673982, "grad_norm": 1.0923995814050296, "learning_rate": 1.9585865859551866e-05, "loss": 0.2231, "step": 3423 }, { "epoch": 0.7155694879832811, "grad_norm": 1.4789249504675543, "learning_rate": 1.958554450814139e-05, "loss": 0.2137, "step": 3424 }, { "epoch": 0.7157784743991641, "grad_norm": 1.248091796109629, "learning_rate": 1.9585223034739708e-05, "loss": 0.1986, "step": 3425 }, { "epoch": 0.715987460815047, "grad_norm": 3.198115705506287, "learning_rate": 1.958490143935092e-05, "loss": 0.249, "step": 3426 }, { "epoch": 0.71619644723093, "grad_norm": 1.2094404646182324, "learning_rate": 1.958457972197911e-05, "loss": 0.2103, "step": 3427 }, { "epoch": 0.716405433646813, "grad_norm": 1.4642397185330431, "learning_rate": 1.9584257882628372e-05, "loss": 0.2575, "step": 3428 }, { "epoch": 0.7166144200626959, "grad_norm": 1.1557960467115589, "learning_rate": 1.9583935921302812e-05, "loss": 0.2124, "step": 3429 }, { "epoch": 0.7168234064785789, "grad_norm": 1.360883258995014, "learning_rate": 1.9583613838006516e-05, "loss": 0.2148, "step": 3430 }, { "epoch": 0.7170323928944619, "grad_norm": 1.1496448084578699, "learning_rate": 1.958329163274359e-05, "loss": 0.2462, "step": 3431 }, { "epoch": 0.7172413793103448, "grad_norm": 1.2815955992254418, "learning_rate": 1.9582969305518134e-05, "loss": 0.2045, "step": 3432 }, { "epoch": 0.7174503657262278, "grad_norm": 1.3792516009795655, "learning_rate": 1.958264685633425e-05, "loss": 0.2301, "step": 3433 }, { "epoch": 0.7176593521421107, "grad_norm": 1.3026095160170812, "learning_rate": 1.9582324285196037e-05, "loss": 0.2106, "step": 3434 }, { "epoch": 0.7178683385579937, "grad_norm": 1.1536595937439658, "learning_rate": 1.958200159210761e-05, "loss": 0.2512, "step": 3435 }, { "epoch": 0.7180773249738767, "grad_norm": 1.0592519377775163, "learning_rate": 1.9581678777073063e-05, "loss": 0.2108, "step": 3436 }, { "epoch": 0.7182863113897596, "grad_norm": 1.2597572131060353, "learning_rate": 1.9581355840096515e-05, "loss": 0.2068, "step": 3437 }, { "epoch": 0.7184952978056426, "grad_norm": 1.1874186100417479, "learning_rate": 1.958103278118207e-05, "loss": 0.2028, "step": 3438 }, { "epoch": 0.7187042842215255, "grad_norm": 1.2057841790029096, "learning_rate": 1.9580709600333844e-05, "loss": 0.238, "step": 3439 }, { "epoch": 0.7189132706374086, "grad_norm": 1.150943139444087, "learning_rate": 1.9580386297555944e-05, "loss": 0.2037, "step": 3440 }, { "epoch": 0.7191222570532916, "grad_norm": 1.1238578223301168, "learning_rate": 1.958006287285249e-05, "loss": 0.2188, "step": 3441 }, { "epoch": 0.7193312434691745, "grad_norm": 0.9402096785991759, "learning_rate": 1.9579739326227592e-05, "loss": 0.2057, "step": 3442 }, { "epoch": 0.7195402298850575, "grad_norm": 1.1837938330574183, "learning_rate": 1.9579415657685375e-05, "loss": 0.2122, "step": 3443 }, { "epoch": 0.7197492163009405, "grad_norm": 1.2020770043332896, "learning_rate": 1.9579091867229956e-05, "loss": 0.2229, "step": 3444 }, { "epoch": 0.7199582027168234, "grad_norm": 1.109468078200061, "learning_rate": 1.9578767954865452e-05, "loss": 0.2205, "step": 3445 }, { "epoch": 0.7201671891327064, "grad_norm": 1.314254614566288, "learning_rate": 1.957844392059599e-05, "loss": 0.2221, "step": 3446 }, { "epoch": 0.7203761755485893, "grad_norm": 1.272264620268912, "learning_rate": 1.957811976442569e-05, "loss": 0.1918, "step": 3447 }, { "epoch": 0.7205851619644723, "grad_norm": 1.2139854455407804, "learning_rate": 1.957779548635868e-05, "loss": 0.2422, "step": 3448 }, { "epoch": 0.7207941483803553, "grad_norm": 1.5375041982643314, "learning_rate": 1.9577471086399083e-05, "loss": 0.2427, "step": 3449 }, { "epoch": 0.7210031347962382, "grad_norm": 1.3931047276569166, "learning_rate": 1.9577146564551036e-05, "loss": 0.2834, "step": 3450 }, { "epoch": 0.7212121212121212, "grad_norm": 1.2756214791324962, "learning_rate": 1.9576821920818658e-05, "loss": 0.2226, "step": 3451 }, { "epoch": 0.7214211076280042, "grad_norm": 1.3868609161856351, "learning_rate": 1.957649715520609e-05, "loss": 0.1963, "step": 3452 }, { "epoch": 0.7216300940438871, "grad_norm": 0.9135269528171622, "learning_rate": 1.957617226771746e-05, "loss": 0.1788, "step": 3453 }, { "epoch": 0.7218390804597701, "grad_norm": 1.195348398678767, "learning_rate": 1.9575847258356902e-05, "loss": 0.2144, "step": 3454 }, { "epoch": 0.722048066875653, "grad_norm": 1.2824159821290966, "learning_rate": 1.9575522127128555e-05, "loss": 0.1841, "step": 3455 }, { "epoch": 0.722257053291536, "grad_norm": 1.356277124744674, "learning_rate": 1.957519687403656e-05, "loss": 0.1969, "step": 3456 }, { "epoch": 0.7224660397074191, "grad_norm": 1.3320950500404323, "learning_rate": 1.957487149908505e-05, "loss": 0.1935, "step": 3457 }, { "epoch": 0.722675026123302, "grad_norm": 1.4232013774974435, "learning_rate": 1.9574546002278166e-05, "loss": 0.2397, "step": 3458 }, { "epoch": 0.722884012539185, "grad_norm": 1.119919202393801, "learning_rate": 1.9574220383620054e-05, "loss": 0.2082, "step": 3459 }, { "epoch": 0.723092998955068, "grad_norm": 1.034968844868413, "learning_rate": 1.957389464311486e-05, "loss": 0.2064, "step": 3460 }, { "epoch": 0.7233019853709509, "grad_norm": 1.3871639431179927, "learning_rate": 1.957356878076672e-05, "loss": 0.2015, "step": 3461 }, { "epoch": 0.7235109717868339, "grad_norm": 1.2434788294569898, "learning_rate": 1.9573242796579798e-05, "loss": 0.2422, "step": 3462 }, { "epoch": 0.7237199582027168, "grad_norm": 1.2274861682079468, "learning_rate": 1.9572916690558224e-05, "loss": 0.1756, "step": 3463 }, { "epoch": 0.7239289446185998, "grad_norm": 1.1720803543663307, "learning_rate": 1.957259046270616e-05, "loss": 0.2335, "step": 3464 }, { "epoch": 0.7241379310344828, "grad_norm": 1.257309954515398, "learning_rate": 1.9572264113027752e-05, "loss": 0.2315, "step": 3465 }, { "epoch": 0.7243469174503657, "grad_norm": 1.290345793974191, "learning_rate": 1.9571937641527155e-05, "loss": 0.1839, "step": 3466 }, { "epoch": 0.7245559038662487, "grad_norm": 1.0722984148995338, "learning_rate": 1.9571611048208525e-05, "loss": 0.2157, "step": 3467 }, { "epoch": 0.7247648902821316, "grad_norm": 1.3330342436339362, "learning_rate": 1.9571284333076018e-05, "loss": 0.225, "step": 3468 }, { "epoch": 0.7249738766980146, "grad_norm": 1.1108650022399915, "learning_rate": 1.9570957496133793e-05, "loss": 0.2491, "step": 3469 }, { "epoch": 0.7251828631138976, "grad_norm": 1.5089015278420421, "learning_rate": 1.9570630537386007e-05, "loss": 0.2072, "step": 3470 }, { "epoch": 0.7253918495297805, "grad_norm": 1.3569401240885595, "learning_rate": 1.957030345683682e-05, "loss": 0.2017, "step": 3471 }, { "epoch": 0.7256008359456635, "grad_norm": 1.2789133844588676, "learning_rate": 1.95699762544904e-05, "loss": 0.2165, "step": 3472 }, { "epoch": 0.7258098223615465, "grad_norm": 1.1312398236128194, "learning_rate": 1.9569648930350907e-05, "loss": 0.2542, "step": 3473 }, { "epoch": 0.7260188087774294, "grad_norm": 1.3798121961804404, "learning_rate": 1.9569321484422506e-05, "loss": 0.248, "step": 3474 }, { "epoch": 0.7262277951933125, "grad_norm": 1.2907289514715263, "learning_rate": 1.9568993916709372e-05, "loss": 0.2208, "step": 3475 }, { "epoch": 0.7264367816091954, "grad_norm": 1.2260736611941865, "learning_rate": 1.956866622721566e-05, "loss": 0.2198, "step": 3476 }, { "epoch": 0.7266457680250784, "grad_norm": 1.2447716586045765, "learning_rate": 1.9568338415945555e-05, "loss": 0.2159, "step": 3477 }, { "epoch": 0.7268547544409614, "grad_norm": 1.2510749595466304, "learning_rate": 1.9568010482903218e-05, "loss": 0.1931, "step": 3478 }, { "epoch": 0.7270637408568443, "grad_norm": 1.146809716255148, "learning_rate": 1.956768242809283e-05, "loss": 0.2322, "step": 3479 }, { "epoch": 0.7272727272727273, "grad_norm": 1.7358337039624117, "learning_rate": 1.956735425151856e-05, "loss": 0.2086, "step": 3480 }, { "epoch": 0.7274817136886103, "grad_norm": 1.2736702028266476, "learning_rate": 1.956702595318459e-05, "loss": 0.2286, "step": 3481 }, { "epoch": 0.7276907001044932, "grad_norm": 1.1453279337369375, "learning_rate": 1.9566697533095092e-05, "loss": 0.2304, "step": 3482 }, { "epoch": 0.7278996865203762, "grad_norm": 1.3773693354825165, "learning_rate": 1.956636899125425e-05, "loss": 0.2361, "step": 3483 }, { "epoch": 0.7281086729362591, "grad_norm": 1.2807244899166277, "learning_rate": 1.9566040327666244e-05, "loss": 0.2033, "step": 3484 }, { "epoch": 0.7283176593521421, "grad_norm": 1.3625447747944095, "learning_rate": 1.956571154233526e-05, "loss": 0.2582, "step": 3485 }, { "epoch": 0.7285266457680251, "grad_norm": 1.1879441441319605, "learning_rate": 1.9565382635265476e-05, "loss": 0.2117, "step": 3486 }, { "epoch": 0.728735632183908, "grad_norm": 1.3729427409947688, "learning_rate": 1.956505360646108e-05, "loss": 0.2618, "step": 3487 }, { "epoch": 0.728944618599791, "grad_norm": 1.319687131779694, "learning_rate": 1.9564724455926263e-05, "loss": 0.2443, "step": 3488 }, { "epoch": 0.729153605015674, "grad_norm": 1.2832011609942988, "learning_rate": 1.956439518366521e-05, "loss": 0.2332, "step": 3489 }, { "epoch": 0.7293625914315569, "grad_norm": 1.7558936064850466, "learning_rate": 1.9564065789682117e-05, "loss": 0.2058, "step": 3490 }, { "epoch": 0.7295715778474399, "grad_norm": 1.101344941622869, "learning_rate": 1.9563736273981168e-05, "loss": 0.2166, "step": 3491 }, { "epoch": 0.7297805642633229, "grad_norm": 1.2698152840626622, "learning_rate": 1.956340663656656e-05, "loss": 0.195, "step": 3492 }, { "epoch": 0.7299895506792059, "grad_norm": 1.252223660011174, "learning_rate": 1.956307687744249e-05, "loss": 0.2499, "step": 3493 }, { "epoch": 0.7301985370950889, "grad_norm": 1.3392811336881734, "learning_rate": 1.9562746996613156e-05, "loss": 0.2032, "step": 3494 }, { "epoch": 0.7304075235109718, "grad_norm": 1.1941718761625375, "learning_rate": 1.9562416994082754e-05, "loss": 0.1984, "step": 3495 }, { "epoch": 0.7306165099268548, "grad_norm": 1.010615889170268, "learning_rate": 1.956208686985548e-05, "loss": 0.2187, "step": 3496 }, { "epoch": 0.7308254963427377, "grad_norm": 1.2862233100180382, "learning_rate": 1.956175662393554e-05, "loss": 0.2225, "step": 3497 }, { "epoch": 0.7310344827586207, "grad_norm": 1.302071753608108, "learning_rate": 1.9561426256327135e-05, "loss": 0.2105, "step": 3498 }, { "epoch": 0.7312434691745037, "grad_norm": 1.1886158852457986, "learning_rate": 1.9561095767034474e-05, "loss": 0.216, "step": 3499 }, { "epoch": 0.7314524555903866, "grad_norm": 1.5417407454402987, "learning_rate": 1.9560765156061757e-05, "loss": 0.2412, "step": 3500 }, { "epoch": 0.7316614420062696, "grad_norm": 1.0880809168311498, "learning_rate": 1.9560434423413196e-05, "loss": 0.229, "step": 3501 }, { "epoch": 0.7318704284221526, "grad_norm": 1.271248148041888, "learning_rate": 1.9560103569092995e-05, "loss": 0.2046, "step": 3502 }, { "epoch": 0.7320794148380355, "grad_norm": 1.559759692143962, "learning_rate": 1.9559772593105367e-05, "loss": 0.2227, "step": 3503 }, { "epoch": 0.7322884012539185, "grad_norm": 1.3596352365528248, "learning_rate": 1.9559441495454528e-05, "loss": 0.2362, "step": 3504 }, { "epoch": 0.7324973876698014, "grad_norm": 1.1768761616563173, "learning_rate": 1.9559110276144685e-05, "loss": 0.2509, "step": 3505 }, { "epoch": 0.7327063740856844, "grad_norm": 1.5096076816472994, "learning_rate": 1.955877893518006e-05, "loss": 0.2393, "step": 3506 }, { "epoch": 0.7329153605015674, "grad_norm": 1.3383857586280152, "learning_rate": 1.9558447472564866e-05, "loss": 0.2074, "step": 3507 }, { "epoch": 0.7331243469174503, "grad_norm": 1.0004720283660398, "learning_rate": 1.955811588830332e-05, "loss": 0.2241, "step": 3508 }, { "epoch": 0.7333333333333333, "grad_norm": 1.1562872882459783, "learning_rate": 1.9557784182399645e-05, "loss": 0.1983, "step": 3509 }, { "epoch": 0.7335423197492164, "grad_norm": 1.1734585323220859, "learning_rate": 1.9557452354858066e-05, "loss": 0.216, "step": 3510 }, { "epoch": 0.7337513061650993, "grad_norm": 1.4917743025030483, "learning_rate": 1.9557120405682796e-05, "loss": 0.2141, "step": 3511 }, { "epoch": 0.7339602925809823, "grad_norm": 1.1834497167591431, "learning_rate": 1.9556788334878066e-05, "loss": 0.2041, "step": 3512 }, { "epoch": 0.7341692789968652, "grad_norm": 0.9833968556596024, "learning_rate": 1.95564561424481e-05, "loss": 0.2024, "step": 3513 }, { "epoch": 0.7343782654127482, "grad_norm": 1.0879728623013425, "learning_rate": 1.955612382839713e-05, "loss": 0.204, "step": 3514 }, { "epoch": 0.7345872518286312, "grad_norm": 1.0435690422702097, "learning_rate": 1.955579139272938e-05, "loss": 0.1987, "step": 3515 }, { "epoch": 0.7347962382445141, "grad_norm": 1.2517372082954707, "learning_rate": 1.9555458835449085e-05, "loss": 0.1945, "step": 3516 }, { "epoch": 0.7350052246603971, "grad_norm": 1.16645688581078, "learning_rate": 1.9555126156560474e-05, "loss": 0.2509, "step": 3517 }, { "epoch": 0.73521421107628, "grad_norm": 1.145700140484681, "learning_rate": 1.955479335606778e-05, "loss": 0.2004, "step": 3518 }, { "epoch": 0.735423197492163, "grad_norm": 1.1056946988218124, "learning_rate": 1.955446043397524e-05, "loss": 0.2325, "step": 3519 }, { "epoch": 0.735632183908046, "grad_norm": 1.2091177581093555, "learning_rate": 1.9554127390287097e-05, "loss": 0.2314, "step": 3520 }, { "epoch": 0.7358411703239289, "grad_norm": 1.282676764275846, "learning_rate": 1.955379422500758e-05, "loss": 0.2603, "step": 3521 }, { "epoch": 0.7360501567398119, "grad_norm": 1.2143847882914542, "learning_rate": 1.9553460938140935e-05, "loss": 0.2318, "step": 3522 }, { "epoch": 0.7362591431556949, "grad_norm": 1.275261052707156, "learning_rate": 1.9553127529691398e-05, "loss": 0.2094, "step": 3523 }, { "epoch": 0.7364681295715778, "grad_norm": 1.140369320147942, "learning_rate": 1.955279399966322e-05, "loss": 0.1891, "step": 3524 }, { "epoch": 0.7366771159874608, "grad_norm": 1.1322888653578107, "learning_rate": 1.9552460348060638e-05, "loss": 0.2088, "step": 3525 }, { "epoch": 0.7368861024033437, "grad_norm": 1.2781871707007038, "learning_rate": 1.9552126574887905e-05, "loss": 0.2404, "step": 3526 }, { "epoch": 0.7370950888192267, "grad_norm": 1.1112393559676308, "learning_rate": 1.9551792680149262e-05, "loss": 0.2373, "step": 3527 }, { "epoch": 0.7373040752351098, "grad_norm": 1.0546571722970368, "learning_rate": 1.9551458663848963e-05, "loss": 0.2075, "step": 3528 }, { "epoch": 0.7375130616509927, "grad_norm": 1.1277254559877883, "learning_rate": 1.9551124525991257e-05, "loss": 0.1933, "step": 3529 }, { "epoch": 0.7377220480668757, "grad_norm": 1.3023403136329934, "learning_rate": 1.95507902665804e-05, "loss": 0.2096, "step": 3530 }, { "epoch": 0.7379310344827587, "grad_norm": 1.2767472123542025, "learning_rate": 1.9550455885620643e-05, "loss": 0.2272, "step": 3531 }, { "epoch": 0.7381400208986416, "grad_norm": 1.3462968049753175, "learning_rate": 1.955012138311624e-05, "loss": 0.2163, "step": 3532 }, { "epoch": 0.7383490073145246, "grad_norm": 1.2185746350073114, "learning_rate": 1.9549786759071448e-05, "loss": 0.2118, "step": 3533 }, { "epoch": 0.7385579937304075, "grad_norm": 0.9729597223705194, "learning_rate": 1.954945201349053e-05, "loss": 0.1934, "step": 3534 }, { "epoch": 0.7387669801462905, "grad_norm": 1.0218843463817058, "learning_rate": 1.954911714637774e-05, "loss": 0.1929, "step": 3535 }, { "epoch": 0.7389759665621735, "grad_norm": 1.293445374496724, "learning_rate": 1.9548782157737348e-05, "loss": 0.2387, "step": 3536 }, { "epoch": 0.7391849529780564, "grad_norm": 1.2166617023578359, "learning_rate": 1.9548447047573608e-05, "loss": 0.2371, "step": 3537 }, { "epoch": 0.7393939393939394, "grad_norm": 1.0808000145038725, "learning_rate": 1.9548111815890792e-05, "loss": 0.2181, "step": 3538 }, { "epoch": 0.7396029258098223, "grad_norm": 1.0169353203022058, "learning_rate": 1.9547776462693164e-05, "loss": 0.2054, "step": 3539 }, { "epoch": 0.7398119122257053, "grad_norm": 0.9706425758802956, "learning_rate": 1.954744098798499e-05, "loss": 0.1908, "step": 3540 }, { "epoch": 0.7400208986415883, "grad_norm": 1.1151885611528929, "learning_rate": 1.954710539177054e-05, "loss": 0.2013, "step": 3541 }, { "epoch": 0.7402298850574712, "grad_norm": 12.683118019189202, "learning_rate": 1.954676967405409e-05, "loss": 0.2086, "step": 3542 }, { "epoch": 0.7404388714733542, "grad_norm": 1.2636059800408264, "learning_rate": 1.9546433834839905e-05, "loss": 0.2156, "step": 3543 }, { "epoch": 0.7406478578892371, "grad_norm": 1.4940008431307452, "learning_rate": 1.9546097874132263e-05, "loss": 0.2105, "step": 3544 }, { "epoch": 0.7408568443051202, "grad_norm": 1.2509113364932725, "learning_rate": 1.954576179193544e-05, "loss": 0.2134, "step": 3545 }, { "epoch": 0.7410658307210032, "grad_norm": 1.3005146670821381, "learning_rate": 1.9545425588253712e-05, "loss": 0.2182, "step": 3546 }, { "epoch": 0.7412748171368861, "grad_norm": 1.5032477401842135, "learning_rate": 1.9545089263091358e-05, "loss": 0.2131, "step": 3547 }, { "epoch": 0.7414838035527691, "grad_norm": 1.2498350676249825, "learning_rate": 1.9544752816452657e-05, "loss": 0.2472, "step": 3548 }, { "epoch": 0.7416927899686521, "grad_norm": 1.3354648461093792, "learning_rate": 1.9544416248341894e-05, "loss": 0.2384, "step": 3549 }, { "epoch": 0.741901776384535, "grad_norm": 1.504467576292723, "learning_rate": 1.9544079558763353e-05, "loss": 0.2335, "step": 3550 }, { "epoch": 0.742110762800418, "grad_norm": 1.3389078681415365, "learning_rate": 1.954374274772131e-05, "loss": 0.229, "step": 3551 }, { "epoch": 0.742319749216301, "grad_norm": 1.426965672286338, "learning_rate": 1.9543405815220064e-05, "loss": 0.2044, "step": 3552 }, { "epoch": 0.7425287356321839, "grad_norm": 1.2050658337730384, "learning_rate": 1.9543068761263894e-05, "loss": 0.2302, "step": 3553 }, { "epoch": 0.7427377220480669, "grad_norm": 1.406722483368349, "learning_rate": 1.9542731585857094e-05, "loss": 0.2482, "step": 3554 }, { "epoch": 0.7429467084639498, "grad_norm": 1.2401373126373219, "learning_rate": 1.9542394289003953e-05, "loss": 0.23, "step": 3555 }, { "epoch": 0.7431556948798328, "grad_norm": 1.241930480708191, "learning_rate": 1.954205687070876e-05, "loss": 0.2161, "step": 3556 }, { "epoch": 0.7433646812957158, "grad_norm": 1.517648315596074, "learning_rate": 1.9541719330975822e-05, "loss": 0.2523, "step": 3557 }, { "epoch": 0.7435736677115987, "grad_norm": 1.2931619849419473, "learning_rate": 1.954138166980942e-05, "loss": 0.2595, "step": 3558 }, { "epoch": 0.7437826541274817, "grad_norm": 1.295031005205454, "learning_rate": 1.9541043887213858e-05, "loss": 0.2086, "step": 3559 }, { "epoch": 0.7439916405433646, "grad_norm": 1.2765940210558215, "learning_rate": 1.9540705983193437e-05, "loss": 0.1926, "step": 3560 }, { "epoch": 0.7442006269592476, "grad_norm": 1.3241616419982896, "learning_rate": 1.9540367957752452e-05, "loss": 0.2348, "step": 3561 }, { "epoch": 0.7444096133751306, "grad_norm": 1.1743408031486113, "learning_rate": 1.9540029810895207e-05, "loss": 0.2529, "step": 3562 }, { "epoch": 0.7446185997910136, "grad_norm": 1.3229334100782304, "learning_rate": 1.9539691542626007e-05, "loss": 0.2337, "step": 3563 }, { "epoch": 0.7448275862068966, "grad_norm": 1.3457088196954132, "learning_rate": 1.9539353152949155e-05, "loss": 0.223, "step": 3564 }, { "epoch": 0.7450365726227796, "grad_norm": 1.1161352881981161, "learning_rate": 1.953901464186896e-05, "loss": 0.1948, "step": 3565 }, { "epoch": 0.7452455590386625, "grad_norm": 1.3223980330364014, "learning_rate": 1.9538676009389727e-05, "loss": 0.1922, "step": 3566 }, { "epoch": 0.7454545454545455, "grad_norm": 1.3864329104569904, "learning_rate": 1.9538337255515766e-05, "loss": 0.2069, "step": 3567 }, { "epoch": 0.7456635318704284, "grad_norm": 1.398205987326258, "learning_rate": 1.9537998380251392e-05, "loss": 0.2001, "step": 3568 }, { "epoch": 0.7458725182863114, "grad_norm": 1.2909319908951198, "learning_rate": 1.9537659383600916e-05, "loss": 0.235, "step": 3569 }, { "epoch": 0.7460815047021944, "grad_norm": 1.4032606090164024, "learning_rate": 1.953732026556865e-05, "loss": 0.2186, "step": 3570 }, { "epoch": 0.7462904911180773, "grad_norm": 1.394209526550022, "learning_rate": 1.953698102615891e-05, "loss": 0.253, "step": 3571 }, { "epoch": 0.7464994775339603, "grad_norm": 1.8712133340986896, "learning_rate": 1.9536641665376016e-05, "loss": 0.2579, "step": 3572 }, { "epoch": 0.7467084639498432, "grad_norm": 1.1400828076818077, "learning_rate": 1.9536302183224282e-05, "loss": 0.2288, "step": 3573 }, { "epoch": 0.7469174503657262, "grad_norm": 1.1860430330252434, "learning_rate": 1.9535962579708038e-05, "loss": 0.221, "step": 3574 }, { "epoch": 0.7471264367816092, "grad_norm": 1.3221583914759043, "learning_rate": 1.9535622854831597e-05, "loss": 0.1927, "step": 3575 }, { "epoch": 0.7473354231974921, "grad_norm": 1.2476327698308498, "learning_rate": 1.953528300859928e-05, "loss": 0.2139, "step": 3576 }, { "epoch": 0.7475444096133751, "grad_norm": 1.4537407773166326, "learning_rate": 1.9534943041015425e-05, "loss": 0.2249, "step": 3577 }, { "epoch": 0.747753396029258, "grad_norm": 1.1840065115735954, "learning_rate": 1.9534602952084348e-05, "loss": 0.1923, "step": 3578 }, { "epoch": 0.747962382445141, "grad_norm": 1.4435465066824722, "learning_rate": 1.953426274181038e-05, "loss": 0.2149, "step": 3579 }, { "epoch": 0.7481713688610241, "grad_norm": 1.4187979557019679, "learning_rate": 1.9533922410197853e-05, "loss": 0.2194, "step": 3580 }, { "epoch": 0.748380355276907, "grad_norm": 1.3482029655129024, "learning_rate": 1.9533581957251094e-05, "loss": 0.2297, "step": 3581 }, { "epoch": 0.74858934169279, "grad_norm": 1.18855253147495, "learning_rate": 1.9533241382974438e-05, "loss": 0.2063, "step": 3582 }, { "epoch": 0.748798328108673, "grad_norm": 1.260650689682798, "learning_rate": 1.953290068737222e-05, "loss": 0.2207, "step": 3583 }, { "epoch": 0.7490073145245559, "grad_norm": 1.069742036865263, "learning_rate": 1.9532559870448778e-05, "loss": 0.1808, "step": 3584 }, { "epoch": 0.7492163009404389, "grad_norm": 1.2719834119781512, "learning_rate": 1.953221893220844e-05, "loss": 0.2108, "step": 3585 }, { "epoch": 0.7494252873563219, "grad_norm": 1.1824256588032633, "learning_rate": 1.9531877872655557e-05, "loss": 0.1884, "step": 3586 }, { "epoch": 0.7496342737722048, "grad_norm": 1.2988678391673056, "learning_rate": 1.9531536691794467e-05, "loss": 0.2369, "step": 3587 }, { "epoch": 0.7498432601880878, "grad_norm": 1.2949119291764681, "learning_rate": 1.9531195389629504e-05, "loss": 0.2231, "step": 3588 }, { "epoch": 0.7500522466039707, "grad_norm": 1.2603708148527075, "learning_rate": 1.953085396616502e-05, "loss": 0.194, "step": 3589 }, { "epoch": 0.7502612330198537, "grad_norm": 1.153093603719755, "learning_rate": 1.953051242140535e-05, "loss": 0.2212, "step": 3590 }, { "epoch": 0.7504702194357367, "grad_norm": 1.2957581962176719, "learning_rate": 1.9530170755354857e-05, "loss": 0.2399, "step": 3591 }, { "epoch": 0.7506792058516196, "grad_norm": 1.3316483337969602, "learning_rate": 1.9529828968017876e-05, "loss": 0.2056, "step": 3592 }, { "epoch": 0.7508881922675026, "grad_norm": 1.2915842687872081, "learning_rate": 1.952948705939876e-05, "loss": 0.2333, "step": 3593 }, { "epoch": 0.7510971786833855, "grad_norm": 1.3429925426590776, "learning_rate": 1.952914502950186e-05, "loss": 0.2332, "step": 3594 }, { "epoch": 0.7513061650992685, "grad_norm": 1.2499280883526527, "learning_rate": 1.9528802878331534e-05, "loss": 0.2476, "step": 3595 }, { "epoch": 0.7515151515151515, "grad_norm": 1.3767972284491496, "learning_rate": 1.952846060589213e-05, "loss": 0.2184, "step": 3596 }, { "epoch": 0.7517241379310344, "grad_norm": 1.5118404405794397, "learning_rate": 1.9528118212188007e-05, "loss": 0.263, "step": 3597 }, { "epoch": 0.7519331243469175, "grad_norm": 1.2456072212306384, "learning_rate": 1.952777569722352e-05, "loss": 0.2089, "step": 3598 }, { "epoch": 0.7521421107628005, "grad_norm": 1.2041976941837933, "learning_rate": 1.952743306100303e-05, "loss": 0.2183, "step": 3599 }, { "epoch": 0.7523510971786834, "grad_norm": 1.1710519854317682, "learning_rate": 1.95270903035309e-05, "loss": 0.2119, "step": 3600 }, { "epoch": 0.7525600835945664, "grad_norm": 1.3480277164860262, "learning_rate": 1.9526747424811488e-05, "loss": 0.229, "step": 3601 }, { "epoch": 0.7527690700104493, "grad_norm": 1.1611393086456774, "learning_rate": 1.9526404424849162e-05, "loss": 0.2089, "step": 3602 }, { "epoch": 0.7529780564263323, "grad_norm": 1.5969500715398177, "learning_rate": 1.9526061303648277e-05, "loss": 0.2207, "step": 3603 }, { "epoch": 0.7531870428422153, "grad_norm": 1.2092478282195078, "learning_rate": 1.9525718061213215e-05, "loss": 0.2068, "step": 3604 }, { "epoch": 0.7533960292580982, "grad_norm": 1.2685419389613082, "learning_rate": 1.9525374697548332e-05, "loss": 0.2154, "step": 3605 }, { "epoch": 0.7536050156739812, "grad_norm": 1.0728255641473128, "learning_rate": 1.9525031212658004e-05, "loss": 0.2155, "step": 3606 }, { "epoch": 0.7538140020898642, "grad_norm": 1.176549236937214, "learning_rate": 1.9524687606546603e-05, "loss": 0.2311, "step": 3607 }, { "epoch": 0.7540229885057471, "grad_norm": 1.353936035206727, "learning_rate": 1.9524343879218498e-05, "loss": 0.2218, "step": 3608 }, { "epoch": 0.7542319749216301, "grad_norm": 1.2861278765866773, "learning_rate": 1.9524000030678064e-05, "loss": 0.23, "step": 3609 }, { "epoch": 0.754440961337513, "grad_norm": 1.2162215965226446, "learning_rate": 1.952365606092968e-05, "loss": 0.2176, "step": 3610 }, { "epoch": 0.754649947753396, "grad_norm": 1.2068146963806043, "learning_rate": 1.952331196997772e-05, "loss": 0.2493, "step": 3611 }, { "epoch": 0.754858934169279, "grad_norm": 1.2356324334948843, "learning_rate": 1.9522967757826567e-05, "loss": 0.2387, "step": 3612 }, { "epoch": 0.7550679205851619, "grad_norm": 1.156657151629162, "learning_rate": 1.9522623424480595e-05, "loss": 0.2032, "step": 3613 }, { "epoch": 0.7552769070010449, "grad_norm": 1.3430178052743138, "learning_rate": 1.9522278969944195e-05, "loss": 0.2433, "step": 3614 }, { "epoch": 0.7554858934169278, "grad_norm": 1.274419133118772, "learning_rate": 1.9521934394221748e-05, "loss": 0.2041, "step": 3615 }, { "epoch": 0.7556948798328109, "grad_norm": 1.2895619481735405, "learning_rate": 1.9521589697317634e-05, "loss": 0.2471, "step": 3616 }, { "epoch": 0.7559038662486939, "grad_norm": 1.3097948579859588, "learning_rate": 1.9521244879236245e-05, "loss": 0.2093, "step": 3617 }, { "epoch": 0.7561128526645768, "grad_norm": 1.3111614674636038, "learning_rate": 1.9520899939981964e-05, "loss": 0.2057, "step": 3618 }, { "epoch": 0.7563218390804598, "grad_norm": 1.2104071607988511, "learning_rate": 1.9520554879559187e-05, "loss": 0.1846, "step": 3619 }, { "epoch": 0.7565308254963428, "grad_norm": 1.1142296497546673, "learning_rate": 1.9520209697972304e-05, "loss": 0.212, "step": 3620 }, { "epoch": 0.7567398119122257, "grad_norm": 1.1951816636342854, "learning_rate": 1.9519864395225706e-05, "loss": 0.2176, "step": 3621 }, { "epoch": 0.7569487983281087, "grad_norm": 1.2840778337631373, "learning_rate": 1.9519518971323795e-05, "loss": 0.2379, "step": 3622 }, { "epoch": 0.7571577847439916, "grad_norm": 1.2602929994301462, "learning_rate": 1.951917342627095e-05, "loss": 0.2317, "step": 3623 }, { "epoch": 0.7573667711598746, "grad_norm": 1.1692112766676022, "learning_rate": 1.951882776007159e-05, "loss": 0.2295, "step": 3624 }, { "epoch": 0.7575757575757576, "grad_norm": 1.1465104478784418, "learning_rate": 1.9518481972730096e-05, "loss": 0.2728, "step": 3625 }, { "epoch": 0.7577847439916405, "grad_norm": 1.0564286455282015, "learning_rate": 1.9518136064250882e-05, "loss": 0.2047, "step": 3626 }, { "epoch": 0.7579937304075235, "grad_norm": 1.1580157631885297, "learning_rate": 1.951779003463834e-05, "loss": 0.2565, "step": 3627 }, { "epoch": 0.7582027168234065, "grad_norm": 1.3370684003380093, "learning_rate": 1.951744388389688e-05, "loss": 0.2617, "step": 3628 }, { "epoch": 0.7584117032392894, "grad_norm": 1.4602406298502222, "learning_rate": 1.95170976120309e-05, "loss": 0.2272, "step": 3629 }, { "epoch": 0.7586206896551724, "grad_norm": 1.5586583670275704, "learning_rate": 1.951675121904482e-05, "loss": 0.2132, "step": 3630 }, { "epoch": 0.7588296760710553, "grad_norm": 1.0134864707662907, "learning_rate": 1.951640470494304e-05, "loss": 0.1836, "step": 3631 }, { "epoch": 0.7590386624869383, "grad_norm": 1.1498830021409403, "learning_rate": 1.9516058069729968e-05, "loss": 0.2223, "step": 3632 }, { "epoch": 0.7592476489028214, "grad_norm": 1.173075936546138, "learning_rate": 1.9515711313410017e-05, "loss": 0.1857, "step": 3633 }, { "epoch": 0.7594566353187043, "grad_norm": 1.3564360103836557, "learning_rate": 1.9515364435987605e-05, "loss": 0.2252, "step": 3634 }, { "epoch": 0.7596656217345873, "grad_norm": 1.190635921814617, "learning_rate": 1.9515017437467142e-05, "loss": 0.2303, "step": 3635 }, { "epoch": 0.7598746081504703, "grad_norm": 1.2873982764414622, "learning_rate": 1.9514670317853043e-05, "loss": 0.2205, "step": 3636 }, { "epoch": 0.7600835945663532, "grad_norm": 1.0907555906328512, "learning_rate": 1.951432307714973e-05, "loss": 0.2238, "step": 3637 }, { "epoch": 0.7602925809822362, "grad_norm": 1.1072279066382924, "learning_rate": 1.9513975715361616e-05, "loss": 0.2264, "step": 3638 }, { "epoch": 0.7605015673981191, "grad_norm": 1.1354279293854037, "learning_rate": 1.9513628232493128e-05, "loss": 0.2169, "step": 3639 }, { "epoch": 0.7607105538140021, "grad_norm": 1.1728482727063017, "learning_rate": 1.9513280628548686e-05, "loss": 0.2245, "step": 3640 }, { "epoch": 0.7609195402298851, "grad_norm": 1.4344172858462263, "learning_rate": 1.951293290353271e-05, "loss": 0.2368, "step": 3641 }, { "epoch": 0.761128526645768, "grad_norm": 1.3123496500287901, "learning_rate": 1.9512585057449636e-05, "loss": 0.2059, "step": 3642 }, { "epoch": 0.761337513061651, "grad_norm": 1.555804419053014, "learning_rate": 1.9512237090303877e-05, "loss": 0.2406, "step": 3643 }, { "epoch": 0.761546499477534, "grad_norm": 1.1380057448221397, "learning_rate": 1.951188900209987e-05, "loss": 0.2099, "step": 3644 }, { "epoch": 0.7617554858934169, "grad_norm": 1.3711777401765992, "learning_rate": 1.9511540792842047e-05, "loss": 0.2134, "step": 3645 }, { "epoch": 0.7619644723092999, "grad_norm": 1.1276929703919556, "learning_rate": 1.9511192462534828e-05, "loss": 0.2456, "step": 3646 }, { "epoch": 0.7621734587251828, "grad_norm": 1.2090947780253438, "learning_rate": 1.951084401118266e-05, "loss": 0.2377, "step": 3647 }, { "epoch": 0.7623824451410658, "grad_norm": 1.2302131913678336, "learning_rate": 1.951049543878997e-05, "loss": 0.2132, "step": 3648 }, { "epoch": 0.7625914315569488, "grad_norm": 1.2606799432223141, "learning_rate": 1.9510146745361195e-05, "loss": 0.1901, "step": 3649 }, { "epoch": 0.7628004179728317, "grad_norm": 1.2133869042735232, "learning_rate": 1.9509797930900772e-05, "loss": 0.2343, "step": 3650 }, { "epoch": 0.7630094043887148, "grad_norm": 1.2119534846980988, "learning_rate": 1.9509448995413144e-05, "loss": 0.2366, "step": 3651 }, { "epoch": 0.7632183908045977, "grad_norm": 1.2467605009703597, "learning_rate": 1.950909993890275e-05, "loss": 0.1963, "step": 3652 }, { "epoch": 0.7634273772204807, "grad_norm": 1.3702517276710104, "learning_rate": 1.950875076137403e-05, "loss": 0.2043, "step": 3653 }, { "epoch": 0.7636363636363637, "grad_norm": 1.2160792168823522, "learning_rate": 1.9508401462831427e-05, "loss": 0.2077, "step": 3654 }, { "epoch": 0.7638453500522466, "grad_norm": 1.2704697613039517, "learning_rate": 1.9508052043279388e-05, "loss": 0.2224, "step": 3655 }, { "epoch": 0.7640543364681296, "grad_norm": 1.2479583747545913, "learning_rate": 1.9507702502722364e-05, "loss": 0.2102, "step": 3656 }, { "epoch": 0.7642633228840126, "grad_norm": 1.085106183311056, "learning_rate": 1.95073528411648e-05, "loss": 0.2032, "step": 3657 }, { "epoch": 0.7644723092998955, "grad_norm": 1.1120993654154319, "learning_rate": 1.9507003058611142e-05, "loss": 0.2409, "step": 3658 }, { "epoch": 0.7646812957157785, "grad_norm": 1.364961762401557, "learning_rate": 1.950665315506585e-05, "loss": 0.2195, "step": 3659 }, { "epoch": 0.7648902821316614, "grad_norm": 1.166558729474321, "learning_rate": 1.950630313053337e-05, "loss": 0.2321, "step": 3660 }, { "epoch": 0.7650992685475444, "grad_norm": 1.1850101122862764, "learning_rate": 1.9505952985018156e-05, "loss": 0.2337, "step": 3661 }, { "epoch": 0.7653082549634274, "grad_norm": 1.272403030714785, "learning_rate": 1.9505602718524673e-05, "loss": 0.2364, "step": 3662 }, { "epoch": 0.7655172413793103, "grad_norm": 1.1565295766392194, "learning_rate": 1.9505252331057372e-05, "loss": 0.2108, "step": 3663 }, { "epoch": 0.7657262277951933, "grad_norm": 1.0730799487163654, "learning_rate": 1.9504901822620712e-05, "loss": 0.2026, "step": 3664 }, { "epoch": 0.7659352142110762, "grad_norm": 1.1155101471604811, "learning_rate": 1.9504551193219158e-05, "loss": 0.2049, "step": 3665 }, { "epoch": 0.7661442006269592, "grad_norm": 1.257549463819933, "learning_rate": 1.9504200442857165e-05, "loss": 0.2251, "step": 3666 }, { "epoch": 0.7663531870428422, "grad_norm": 1.5535028231050232, "learning_rate": 1.9503849571539203e-05, "loss": 0.2181, "step": 3667 }, { "epoch": 0.7665621734587252, "grad_norm": 1.4378495697452858, "learning_rate": 1.9503498579269737e-05, "loss": 0.2252, "step": 3668 }, { "epoch": 0.7667711598746082, "grad_norm": 1.1846168586262291, "learning_rate": 1.9503147466053233e-05, "loss": 0.2076, "step": 3669 }, { "epoch": 0.7669801462904912, "grad_norm": 1.1371331994572123, "learning_rate": 1.9502796231894153e-05, "loss": 0.1777, "step": 3670 }, { "epoch": 0.7671891327063741, "grad_norm": 1.20073943236267, "learning_rate": 1.950244487679698e-05, "loss": 0.2332, "step": 3671 }, { "epoch": 0.7673981191222571, "grad_norm": 1.2492070015822527, "learning_rate": 1.9502093400766177e-05, "loss": 0.244, "step": 3672 }, { "epoch": 0.76760710553814, "grad_norm": 1.423506681968979, "learning_rate": 1.950174180380622e-05, "loss": 0.2023, "step": 3673 }, { "epoch": 0.767816091954023, "grad_norm": 1.3612434328389453, "learning_rate": 1.9501390085921577e-05, "loss": 0.243, "step": 3674 }, { "epoch": 0.768025078369906, "grad_norm": 1.065401783738278, "learning_rate": 1.9501038247116732e-05, "loss": 0.2213, "step": 3675 }, { "epoch": 0.7682340647857889, "grad_norm": 1.1982227760201702, "learning_rate": 1.950068628739616e-05, "loss": 0.23, "step": 3676 }, { "epoch": 0.7684430512016719, "grad_norm": 1.1779107779342062, "learning_rate": 1.950033420676434e-05, "loss": 0.2072, "step": 3677 }, { "epoch": 0.7686520376175549, "grad_norm": 1.1036336877753405, "learning_rate": 1.9499982005225757e-05, "loss": 0.2499, "step": 3678 }, { "epoch": 0.7688610240334378, "grad_norm": 1.0000991580443277, "learning_rate": 1.9499629682784888e-05, "loss": 0.2103, "step": 3679 }, { "epoch": 0.7690700104493208, "grad_norm": 1.1621947588620578, "learning_rate": 1.9499277239446214e-05, "loss": 0.231, "step": 3680 }, { "epoch": 0.7692789968652037, "grad_norm": 1.084814915780234, "learning_rate": 1.949892467521423e-05, "loss": 0.1957, "step": 3681 }, { "epoch": 0.7694879832810867, "grad_norm": 1.1642675107822653, "learning_rate": 1.9498571990093418e-05, "loss": 0.2554, "step": 3682 }, { "epoch": 0.7696969696969697, "grad_norm": 1.2620530830942884, "learning_rate": 1.9498219184088263e-05, "loss": 0.2249, "step": 3683 }, { "epoch": 0.7699059561128526, "grad_norm": 1.159625534959921, "learning_rate": 1.949786625720326e-05, "loss": 0.2013, "step": 3684 }, { "epoch": 0.7701149425287356, "grad_norm": 1.260554903298166, "learning_rate": 1.9497513209442896e-05, "loss": 0.2409, "step": 3685 }, { "epoch": 0.7703239289446187, "grad_norm": 1.2528104863424976, "learning_rate": 1.9497160040811668e-05, "loss": 0.2355, "step": 3686 }, { "epoch": 0.7705329153605016, "grad_norm": 1.3975701624714432, "learning_rate": 1.9496806751314073e-05, "loss": 0.2083, "step": 3687 }, { "epoch": 0.7707419017763846, "grad_norm": 1.2970499698102087, "learning_rate": 1.94964533409546e-05, "loss": 0.2206, "step": 3688 }, { "epoch": 0.7709508881922675, "grad_norm": 1.055617732652306, "learning_rate": 1.949609980973775e-05, "loss": 0.1965, "step": 3689 }, { "epoch": 0.7711598746081505, "grad_norm": 1.3017294605958294, "learning_rate": 1.9495746157668024e-05, "loss": 0.2223, "step": 3690 }, { "epoch": 0.7713688610240335, "grad_norm": 1.1071458400339635, "learning_rate": 1.949539238474992e-05, "loss": 0.2294, "step": 3691 }, { "epoch": 0.7715778474399164, "grad_norm": 1.0728016257664743, "learning_rate": 1.9495038490987946e-05, "loss": 0.2057, "step": 3692 }, { "epoch": 0.7717868338557994, "grad_norm": 1.6388821027570735, "learning_rate": 1.9494684476386598e-05, "loss": 0.2247, "step": 3693 }, { "epoch": 0.7719958202716823, "grad_norm": 1.0653619543471835, "learning_rate": 1.9494330340950385e-05, "loss": 0.1935, "step": 3694 }, { "epoch": 0.7722048066875653, "grad_norm": 1.2162514259505743, "learning_rate": 1.9493976084683814e-05, "loss": 0.2089, "step": 3695 }, { "epoch": 0.7724137931034483, "grad_norm": 1.1215033412488578, "learning_rate": 1.9493621707591395e-05, "loss": 0.2359, "step": 3696 }, { "epoch": 0.7726227795193312, "grad_norm": 1.0217416873307872, "learning_rate": 1.9493267209677634e-05, "loss": 0.2258, "step": 3697 }, { "epoch": 0.7728317659352142, "grad_norm": 1.092591825031202, "learning_rate": 1.9492912590947048e-05, "loss": 0.2083, "step": 3698 }, { "epoch": 0.7730407523510971, "grad_norm": 1.4041115258675851, "learning_rate": 1.9492557851404145e-05, "loss": 0.2325, "step": 3699 }, { "epoch": 0.7732497387669801, "grad_norm": 1.1909899224334897, "learning_rate": 1.9492202991053443e-05, "loss": 0.2198, "step": 3700 }, { "epoch": 0.7734587251828631, "grad_norm": 1.1848720075475352, "learning_rate": 1.9491848009899454e-05, "loss": 0.2047, "step": 3701 }, { "epoch": 0.773667711598746, "grad_norm": 1.2336363467852205, "learning_rate": 1.9491492907946698e-05, "loss": 0.2303, "step": 3702 }, { "epoch": 0.773876698014629, "grad_norm": 1.1647896376007554, "learning_rate": 1.9491137685199698e-05, "loss": 0.2208, "step": 3703 }, { "epoch": 0.7740856844305121, "grad_norm": 1.0424426103746947, "learning_rate": 1.9490782341662968e-05, "loss": 0.2075, "step": 3704 }, { "epoch": 0.774294670846395, "grad_norm": 1.188243510865459, "learning_rate": 1.9490426877341036e-05, "loss": 0.2105, "step": 3705 }, { "epoch": 0.774503657262278, "grad_norm": 1.4378326486812756, "learning_rate": 1.9490071292238424e-05, "loss": 0.2318, "step": 3706 }, { "epoch": 0.774712643678161, "grad_norm": 1.3092075747265222, "learning_rate": 1.9489715586359657e-05, "loss": 0.2319, "step": 3707 }, { "epoch": 0.7749216300940439, "grad_norm": 1.2189821023884417, "learning_rate": 1.9489359759709263e-05, "loss": 0.222, "step": 3708 }, { "epoch": 0.7751306165099269, "grad_norm": 1.0808992228547567, "learning_rate": 1.9489003812291763e-05, "loss": 0.2278, "step": 3709 }, { "epoch": 0.7753396029258098, "grad_norm": 1.500508777850122, "learning_rate": 1.94886477441117e-05, "loss": 0.1893, "step": 3710 }, { "epoch": 0.7755485893416928, "grad_norm": 1.3564995761445022, "learning_rate": 1.94882915551736e-05, "loss": 0.2255, "step": 3711 }, { "epoch": 0.7757575757575758, "grad_norm": 1.2657103023265468, "learning_rate": 1.948793524548199e-05, "loss": 0.234, "step": 3712 }, { "epoch": 0.7759665621734587, "grad_norm": 1.4337193961806352, "learning_rate": 1.948757881504141e-05, "loss": 0.2409, "step": 3713 }, { "epoch": 0.7761755485893417, "grad_norm": 1.1857058865762427, "learning_rate": 1.9487222263856396e-05, "loss": 0.2093, "step": 3714 }, { "epoch": 0.7763845350052246, "grad_norm": 1.138440665087657, "learning_rate": 1.948686559193149e-05, "loss": 0.2148, "step": 3715 }, { "epoch": 0.7765935214211076, "grad_norm": 1.4854428417011019, "learning_rate": 1.948650879927122e-05, "loss": 0.2396, "step": 3716 }, { "epoch": 0.7768025078369906, "grad_norm": 1.0541658746886324, "learning_rate": 1.948615188588014e-05, "loss": 0.1838, "step": 3717 }, { "epoch": 0.7770114942528735, "grad_norm": 1.3654366342052242, "learning_rate": 1.9485794851762782e-05, "loss": 0.2183, "step": 3718 }, { "epoch": 0.7772204806687565, "grad_norm": 1.101592857324765, "learning_rate": 1.9485437696923696e-05, "loss": 0.1946, "step": 3719 }, { "epoch": 0.7774294670846394, "grad_norm": 1.214228697438388, "learning_rate": 1.9485080421367422e-05, "loss": 0.215, "step": 3720 }, { "epoch": 0.7776384535005225, "grad_norm": 1.195803380564379, "learning_rate": 1.9484723025098507e-05, "loss": 0.1873, "step": 3721 }, { "epoch": 0.7778474399164055, "grad_norm": 1.20983490151549, "learning_rate": 1.948436550812151e-05, "loss": 0.2123, "step": 3722 }, { "epoch": 0.7780564263322884, "grad_norm": 1.1782218567210616, "learning_rate": 1.9484007870440968e-05, "loss": 0.1735, "step": 3723 }, { "epoch": 0.7782654127481714, "grad_norm": 1.5692316968875673, "learning_rate": 1.9483650112061437e-05, "loss": 0.1866, "step": 3724 }, { "epoch": 0.7784743991640544, "grad_norm": 1.1379060473519356, "learning_rate": 1.9483292232987473e-05, "loss": 0.1792, "step": 3725 }, { "epoch": 0.7786833855799373, "grad_norm": 1.0352482479171843, "learning_rate": 1.9482934233223626e-05, "loss": 0.2044, "step": 3726 }, { "epoch": 0.7788923719958203, "grad_norm": 1.2625071632502352, "learning_rate": 1.9482576112774458e-05, "loss": 0.1976, "step": 3727 }, { "epoch": 0.7791013584117032, "grad_norm": 1.3599380653962665, "learning_rate": 1.9482217871644517e-05, "loss": 0.2178, "step": 3728 }, { "epoch": 0.7793103448275862, "grad_norm": 1.443845724215312, "learning_rate": 1.9481859509838372e-05, "loss": 0.2736, "step": 3729 }, { "epoch": 0.7795193312434692, "grad_norm": 1.1653585524223131, "learning_rate": 1.948150102736058e-05, "loss": 0.1822, "step": 3730 }, { "epoch": 0.7797283176593521, "grad_norm": 1.1463839871325738, "learning_rate": 1.9481142424215703e-05, "loss": 0.1963, "step": 3731 }, { "epoch": 0.7799373040752351, "grad_norm": 1.0983681479882075, "learning_rate": 1.9480783700408307e-05, "loss": 0.2541, "step": 3732 }, { "epoch": 0.780146290491118, "grad_norm": 1.1059767910249827, "learning_rate": 1.948042485594295e-05, "loss": 0.2291, "step": 3733 }, { "epoch": 0.780355276907001, "grad_norm": 1.0701115408952078, "learning_rate": 1.948006589082421e-05, "loss": 0.2118, "step": 3734 }, { "epoch": 0.780564263322884, "grad_norm": 1.2926851023812358, "learning_rate": 1.9479706805056646e-05, "loss": 0.2388, "step": 3735 }, { "epoch": 0.7807732497387669, "grad_norm": 1.2001710128533913, "learning_rate": 1.9479347598644832e-05, "loss": 0.2277, "step": 3736 }, { "epoch": 0.7809822361546499, "grad_norm": 1.286653959427673, "learning_rate": 1.947898827159334e-05, "loss": 0.2542, "step": 3737 }, { "epoch": 0.7811912225705329, "grad_norm": 1.1079540667626737, "learning_rate": 1.9478628823906742e-05, "loss": 0.2537, "step": 3738 }, { "epoch": 0.7814002089864159, "grad_norm": 1.060102952730785, "learning_rate": 1.947826925558961e-05, "loss": 0.2333, "step": 3739 }, { "epoch": 0.7816091954022989, "grad_norm": 1.0975884227919108, "learning_rate": 1.9477909566646526e-05, "loss": 0.2623, "step": 3740 }, { "epoch": 0.7818181818181819, "grad_norm": 1.146084694646294, "learning_rate": 1.947754975708206e-05, "loss": 0.224, "step": 3741 }, { "epoch": 0.7820271682340648, "grad_norm": 0.9499517650956183, "learning_rate": 1.94771898269008e-05, "loss": 0.2184, "step": 3742 }, { "epoch": 0.7822361546499478, "grad_norm": 1.207635768714311, "learning_rate": 1.947682977610732e-05, "loss": 0.209, "step": 3743 }, { "epoch": 0.7824451410658307, "grad_norm": 0.9812768077981929, "learning_rate": 1.9476469604706204e-05, "loss": 0.2116, "step": 3744 }, { "epoch": 0.7826541274817137, "grad_norm": 1.089767284549075, "learning_rate": 1.9476109312702036e-05, "loss": 0.2222, "step": 3745 }, { "epoch": 0.7828631138975967, "grad_norm": 1.0922114597124066, "learning_rate": 1.94757489000994e-05, "loss": 0.1854, "step": 3746 }, { "epoch": 0.7830721003134796, "grad_norm": 1.3229557153002718, "learning_rate": 1.9475388366902886e-05, "loss": 0.2029, "step": 3747 }, { "epoch": 0.7832810867293626, "grad_norm": 1.168776269609252, "learning_rate": 1.9475027713117082e-05, "loss": 0.1933, "step": 3748 }, { "epoch": 0.7834900731452455, "grad_norm": 1.1735355617447942, "learning_rate": 1.9474666938746573e-05, "loss": 0.2099, "step": 3749 }, { "epoch": 0.7836990595611285, "grad_norm": 1.0619990696840729, "learning_rate": 1.9474306043795958e-05, "loss": 0.1833, "step": 3750 }, { "epoch": 0.7839080459770115, "grad_norm": 1.2151659420902041, "learning_rate": 1.947394502826982e-05, "loss": 0.21, "step": 3751 }, { "epoch": 0.7841170323928944, "grad_norm": 1.1465649085172283, "learning_rate": 1.9473583892172762e-05, "loss": 0.1889, "step": 3752 }, { "epoch": 0.7843260188087774, "grad_norm": 1.2308578747615302, "learning_rate": 1.9473222635509378e-05, "loss": 0.2121, "step": 3753 }, { "epoch": 0.7845350052246604, "grad_norm": 1.3051473312125017, "learning_rate": 1.947286125828426e-05, "loss": 0.194, "step": 3754 }, { "epoch": 0.7847439916405433, "grad_norm": 1.0289386174757542, "learning_rate": 1.947249976050202e-05, "loss": 0.1834, "step": 3755 }, { "epoch": 0.7849529780564264, "grad_norm": 1.0871665538234265, "learning_rate": 1.9472138142167247e-05, "loss": 0.204, "step": 3756 }, { "epoch": 0.7851619644723093, "grad_norm": 1.20014683869898, "learning_rate": 1.9471776403284543e-05, "loss": 0.1883, "step": 3757 }, { "epoch": 0.7853709508881923, "grad_norm": 1.0218807291763743, "learning_rate": 1.947141454385852e-05, "loss": 0.2115, "step": 3758 }, { "epoch": 0.7855799373040753, "grad_norm": 1.4276975444957434, "learning_rate": 1.9471052563893776e-05, "loss": 0.2226, "step": 3759 }, { "epoch": 0.7857889237199582, "grad_norm": 1.4141888575732218, "learning_rate": 1.9470690463394923e-05, "loss": 0.2033, "step": 3760 }, { "epoch": 0.7859979101358412, "grad_norm": 1.1005788094483426, "learning_rate": 1.9470328242366564e-05, "loss": 0.2079, "step": 3761 }, { "epoch": 0.7862068965517242, "grad_norm": 1.1426044009767897, "learning_rate": 1.946996590081331e-05, "loss": 0.2511, "step": 3762 }, { "epoch": 0.7864158829676071, "grad_norm": 1.1017179378937203, "learning_rate": 1.946960343873978e-05, "loss": 0.2214, "step": 3763 }, { "epoch": 0.7866248693834901, "grad_norm": 1.3027079324090665, "learning_rate": 1.9469240856150575e-05, "loss": 0.2195, "step": 3764 }, { "epoch": 0.786833855799373, "grad_norm": 1.0445139029245543, "learning_rate": 1.9468878153050318e-05, "loss": 0.1848, "step": 3765 }, { "epoch": 0.787042842215256, "grad_norm": 1.4133158614365346, "learning_rate": 1.9468515329443622e-05, "loss": 0.2086, "step": 3766 }, { "epoch": 0.787251828631139, "grad_norm": 1.4065581377641672, "learning_rate": 1.9468152385335105e-05, "loss": 0.2049, "step": 3767 }, { "epoch": 0.7874608150470219, "grad_norm": 0.9260026649113329, "learning_rate": 1.9467789320729388e-05, "loss": 0.2326, "step": 3768 }, { "epoch": 0.7876698014629049, "grad_norm": 1.2174169884505956, "learning_rate": 1.9467426135631083e-05, "loss": 0.1951, "step": 3769 }, { "epoch": 0.7878787878787878, "grad_norm": 1.0423325127391765, "learning_rate": 1.9467062830044822e-05, "loss": 0.1982, "step": 3770 }, { "epoch": 0.7880877742946708, "grad_norm": 1.349089644927016, "learning_rate": 1.9466699403975225e-05, "loss": 0.2387, "step": 3771 }, { "epoch": 0.7882967607105538, "grad_norm": 1.3692852481665423, "learning_rate": 1.946633585742692e-05, "loss": 0.2499, "step": 3772 }, { "epoch": 0.7885057471264367, "grad_norm": 1.0926247462472791, "learning_rate": 1.9465972190404526e-05, "loss": 0.2133, "step": 3773 }, { "epoch": 0.7887147335423198, "grad_norm": 1.274617919750491, "learning_rate": 1.946560840291268e-05, "loss": 0.1984, "step": 3774 }, { "epoch": 0.7889237199582028, "grad_norm": 1.109717917706536, "learning_rate": 1.9465244494956007e-05, "loss": 0.2214, "step": 3775 }, { "epoch": 0.7891327063740857, "grad_norm": 1.0523122629815174, "learning_rate": 1.9464880466539136e-05, "loss": 0.219, "step": 3776 }, { "epoch": 0.7893416927899687, "grad_norm": 0.9649273925665434, "learning_rate": 1.9464516317666707e-05, "loss": 0.2033, "step": 3777 }, { "epoch": 0.7895506792058516, "grad_norm": 1.2397549342101368, "learning_rate": 1.9464152048343346e-05, "loss": 0.1663, "step": 3778 }, { "epoch": 0.7897596656217346, "grad_norm": 1.2776270643630545, "learning_rate": 1.94637876585737e-05, "loss": 0.2286, "step": 3779 }, { "epoch": 0.7899686520376176, "grad_norm": 1.1581435393640964, "learning_rate": 1.9463423148362395e-05, "loss": 0.2002, "step": 3780 }, { "epoch": 0.7901776384535005, "grad_norm": 1.1596480425808604, "learning_rate": 1.9463058517714074e-05, "loss": 0.1875, "step": 3781 }, { "epoch": 0.7903866248693835, "grad_norm": 1.1828568813561093, "learning_rate": 1.946269376663338e-05, "loss": 0.2242, "step": 3782 }, { "epoch": 0.7905956112852665, "grad_norm": 1.3587280666131862, "learning_rate": 1.9462328895124955e-05, "loss": 0.2106, "step": 3783 }, { "epoch": 0.7908045977011494, "grad_norm": 1.1019365726731571, "learning_rate": 1.946196390319344e-05, "loss": 0.2167, "step": 3784 }, { "epoch": 0.7910135841170324, "grad_norm": 1.2196639198766601, "learning_rate": 1.946159879084348e-05, "loss": 0.2104, "step": 3785 }, { "epoch": 0.7912225705329153, "grad_norm": 1.2104773153340949, "learning_rate": 1.9461233558079722e-05, "loss": 0.1929, "step": 3786 }, { "epoch": 0.7914315569487983, "grad_norm": 1.1446177094571879, "learning_rate": 1.9460868204906815e-05, "loss": 0.1926, "step": 3787 }, { "epoch": 0.7916405433646813, "grad_norm": 1.5218815483307364, "learning_rate": 1.946050273132941e-05, "loss": 0.2512, "step": 3788 }, { "epoch": 0.7918495297805642, "grad_norm": 1.427508604972717, "learning_rate": 1.9460137137352154e-05, "loss": 0.2054, "step": 3789 }, { "epoch": 0.7920585161964472, "grad_norm": 1.1645449603406053, "learning_rate": 1.9459771422979706e-05, "loss": 0.196, "step": 3790 }, { "epoch": 0.7922675026123301, "grad_norm": 1.1502982693442534, "learning_rate": 1.9459405588216717e-05, "loss": 0.2029, "step": 3791 }, { "epoch": 0.7924764890282132, "grad_norm": 1.3705386421568764, "learning_rate": 1.945903963306784e-05, "loss": 0.2125, "step": 3792 }, { "epoch": 0.7926854754440962, "grad_norm": 1.1182356899861825, "learning_rate": 1.9458673557537733e-05, "loss": 0.1875, "step": 3793 }, { "epoch": 0.7928944618599791, "grad_norm": 1.3670934196542879, "learning_rate": 1.945830736163106e-05, "loss": 0.2252, "step": 3794 }, { "epoch": 0.7931034482758621, "grad_norm": 1.0300798276012404, "learning_rate": 1.9457941045352476e-05, "loss": 0.1971, "step": 3795 }, { "epoch": 0.7933124346917451, "grad_norm": 1.198002766549335, "learning_rate": 1.945757460870665e-05, "loss": 0.1788, "step": 3796 }, { "epoch": 0.793521421107628, "grad_norm": 1.0357258782006393, "learning_rate": 1.9457208051698235e-05, "loss": 0.2012, "step": 3797 }, { "epoch": 0.793730407523511, "grad_norm": 1.276559076199665, "learning_rate": 1.94568413743319e-05, "loss": 0.2091, "step": 3798 }, { "epoch": 0.793939393939394, "grad_norm": 1.2494094127358462, "learning_rate": 1.945647457661232e-05, "loss": 0.2404, "step": 3799 }, { "epoch": 0.7941483803552769, "grad_norm": 1.0670335055433433, "learning_rate": 1.9456107658544153e-05, "loss": 0.18, "step": 3800 }, { "epoch": 0.7943573667711599, "grad_norm": 1.1962895235842164, "learning_rate": 1.9455740620132072e-05, "loss": 0.2182, "step": 3801 }, { "epoch": 0.7945663531870428, "grad_norm": 1.138093927984686, "learning_rate": 1.945537346138075e-05, "loss": 0.1942, "step": 3802 }, { "epoch": 0.7947753396029258, "grad_norm": 1.1166441167374577, "learning_rate": 1.9455006182294855e-05, "loss": 0.2258, "step": 3803 }, { "epoch": 0.7949843260188088, "grad_norm": 1.354751922224533, "learning_rate": 1.9454638782879067e-05, "loss": 0.2185, "step": 3804 }, { "epoch": 0.7951933124346917, "grad_norm": 1.2070508717051598, "learning_rate": 1.9454271263138054e-05, "loss": 0.1932, "step": 3805 }, { "epoch": 0.7954022988505747, "grad_norm": 1.1867819433965072, "learning_rate": 1.9453903623076503e-05, "loss": 0.2313, "step": 3806 }, { "epoch": 0.7956112852664576, "grad_norm": 1.2875166922289232, "learning_rate": 1.9453535862699088e-05, "loss": 0.2337, "step": 3807 }, { "epoch": 0.7958202716823406, "grad_norm": 1.0584394688955567, "learning_rate": 1.9453167982010485e-05, "loss": 0.1978, "step": 3808 }, { "epoch": 0.7960292580982237, "grad_norm": 1.2649406962040117, "learning_rate": 1.945279998101538e-05, "loss": 0.2341, "step": 3809 }, { "epoch": 0.7962382445141066, "grad_norm": 1.0738363131673643, "learning_rate": 1.9452431859718457e-05, "loss": 0.1596, "step": 3810 }, { "epoch": 0.7964472309299896, "grad_norm": 1.2029244772716141, "learning_rate": 1.9452063618124402e-05, "loss": 0.2257, "step": 3811 }, { "epoch": 0.7966562173458726, "grad_norm": 1.3235836919885167, "learning_rate": 1.94516952562379e-05, "loss": 0.2163, "step": 3812 }, { "epoch": 0.7968652037617555, "grad_norm": 1.1462753189763901, "learning_rate": 1.9451326774063636e-05, "loss": 0.2326, "step": 3813 }, { "epoch": 0.7970741901776385, "grad_norm": 1.096232609676776, "learning_rate": 1.9450958171606306e-05, "loss": 0.2005, "step": 3814 }, { "epoch": 0.7972831765935214, "grad_norm": 1.149596114252076, "learning_rate": 1.9450589448870595e-05, "loss": 0.2326, "step": 3815 }, { "epoch": 0.7974921630094044, "grad_norm": 1.2782125942607527, "learning_rate": 1.9450220605861197e-05, "loss": 0.2109, "step": 3816 }, { "epoch": 0.7977011494252874, "grad_norm": 1.324104762866285, "learning_rate": 1.944985164258281e-05, "loss": 0.2147, "step": 3817 }, { "epoch": 0.7979101358411703, "grad_norm": 1.160848449997371, "learning_rate": 1.9449482559040124e-05, "loss": 0.1992, "step": 3818 }, { "epoch": 0.7981191222570533, "grad_norm": 1.076655980788652, "learning_rate": 1.9449113355237838e-05, "loss": 0.1965, "step": 3819 }, { "epoch": 0.7983281086729362, "grad_norm": 1.0712058962308302, "learning_rate": 1.9448744031180654e-05, "loss": 0.1729, "step": 3820 }, { "epoch": 0.7985370950888192, "grad_norm": 1.0525611681661098, "learning_rate": 1.9448374586873266e-05, "loss": 0.22, "step": 3821 }, { "epoch": 0.7987460815047022, "grad_norm": 1.1824894890709725, "learning_rate": 1.9448005022320383e-05, "loss": 0.1992, "step": 3822 }, { "epoch": 0.7989550679205851, "grad_norm": 1.0534794583295557, "learning_rate": 1.9447635337526704e-05, "loss": 0.2379, "step": 3823 }, { "epoch": 0.7991640543364681, "grad_norm": 1.2853785197765695, "learning_rate": 1.9447265532496936e-05, "loss": 0.2372, "step": 3824 }, { "epoch": 0.799373040752351, "grad_norm": 1.139799463204872, "learning_rate": 1.9446895607235783e-05, "loss": 0.2136, "step": 3825 }, { "epoch": 0.799582027168234, "grad_norm": 1.1970841842153976, "learning_rate": 1.944652556174795e-05, "loss": 0.232, "step": 3826 }, { "epoch": 0.7997910135841171, "grad_norm": 1.190533042758498, "learning_rate": 1.9446155396038156e-05, "loss": 0.2416, "step": 3827 }, { "epoch": 0.8, "grad_norm": 1.3073846782403682, "learning_rate": 1.9445785110111103e-05, "loss": 0.201, "step": 3828 }, { "epoch": 0.800208986415883, "grad_norm": 1.057124085674146, "learning_rate": 1.944541470397151e-05, "loss": 0.2262, "step": 3829 }, { "epoch": 0.800417972831766, "grad_norm": 1.005444171900659, "learning_rate": 1.944504417762408e-05, "loss": 0.1852, "step": 3830 }, { "epoch": 0.8006269592476489, "grad_norm": 1.336744775034922, "learning_rate": 1.9444673531073544e-05, "loss": 0.2422, "step": 3831 }, { "epoch": 0.8008359456635319, "grad_norm": 1.4668673740072133, "learning_rate": 1.9444302764324607e-05, "loss": 0.1996, "step": 3832 }, { "epoch": 0.8010449320794149, "grad_norm": 1.4236125016044103, "learning_rate": 1.9443931877381996e-05, "loss": 0.2137, "step": 3833 }, { "epoch": 0.8012539184952978, "grad_norm": 1.3113991758782795, "learning_rate": 1.944356087025042e-05, "loss": 0.2284, "step": 3834 }, { "epoch": 0.8014629049111808, "grad_norm": 1.2165840935968888, "learning_rate": 1.9443189742934614e-05, "loss": 0.1959, "step": 3835 }, { "epoch": 0.8016718913270637, "grad_norm": 1.2305898668992599, "learning_rate": 1.9442818495439294e-05, "loss": 0.2401, "step": 3836 }, { "epoch": 0.8018808777429467, "grad_norm": 1.036016018912574, "learning_rate": 1.9442447127769185e-05, "loss": 0.2446, "step": 3837 }, { "epoch": 0.8020898641588297, "grad_norm": 1.0398688581763593, "learning_rate": 1.944207563992901e-05, "loss": 0.22, "step": 3838 }, { "epoch": 0.8022988505747126, "grad_norm": 1.3004847270672615, "learning_rate": 1.9441704031923504e-05, "loss": 0.2354, "step": 3839 }, { "epoch": 0.8025078369905956, "grad_norm": 1.2443076550154333, "learning_rate": 1.9441332303757393e-05, "loss": 0.2229, "step": 3840 }, { "epoch": 0.8027168234064785, "grad_norm": 1.2694539959219695, "learning_rate": 1.9440960455435406e-05, "loss": 0.2136, "step": 3841 }, { "epoch": 0.8029258098223615, "grad_norm": 1.1253103011926826, "learning_rate": 1.944058848696228e-05, "loss": 0.209, "step": 3842 }, { "epoch": 0.8031347962382445, "grad_norm": 1.16286632068293, "learning_rate": 1.9440216398342744e-05, "loss": 0.2256, "step": 3843 }, { "epoch": 0.8033437826541275, "grad_norm": 1.0133603332471834, "learning_rate": 1.9439844189581535e-05, "loss": 0.1772, "step": 3844 }, { "epoch": 0.8035527690700105, "grad_norm": 1.2112689660143934, "learning_rate": 1.943947186068339e-05, "loss": 0.228, "step": 3845 }, { "epoch": 0.8037617554858935, "grad_norm": 1.200869941633844, "learning_rate": 1.943909941165305e-05, "loss": 0.2107, "step": 3846 }, { "epoch": 0.8039707419017764, "grad_norm": 1.0926195260869451, "learning_rate": 1.943872684249525e-05, "loss": 0.1855, "step": 3847 }, { "epoch": 0.8041797283176594, "grad_norm": 1.0340719094820892, "learning_rate": 1.9438354153214736e-05, "loss": 0.1836, "step": 3848 }, { "epoch": 0.8043887147335423, "grad_norm": 1.1309028922137212, "learning_rate": 1.9437981343816247e-05, "loss": 0.1932, "step": 3849 }, { "epoch": 0.8045977011494253, "grad_norm": 1.1182101399901039, "learning_rate": 1.9437608414304533e-05, "loss": 0.2065, "step": 3850 }, { "epoch": 0.8048066875653083, "grad_norm": 1.2019866917466326, "learning_rate": 1.9437235364684337e-05, "loss": 0.174, "step": 3851 }, { "epoch": 0.8050156739811912, "grad_norm": 1.0222999819862346, "learning_rate": 1.9436862194960404e-05, "loss": 0.2045, "step": 3852 }, { "epoch": 0.8052246603970742, "grad_norm": 0.9282565502131247, "learning_rate": 1.9436488905137486e-05, "loss": 0.2259, "step": 3853 }, { "epoch": 0.8054336468129571, "grad_norm": 1.4511371687377934, "learning_rate": 1.9436115495220336e-05, "loss": 0.2031, "step": 3854 }, { "epoch": 0.8056426332288401, "grad_norm": 1.2190276311989836, "learning_rate": 1.9435741965213704e-05, "loss": 0.2193, "step": 3855 }, { "epoch": 0.8058516196447231, "grad_norm": 1.2075693539740222, "learning_rate": 1.943536831512234e-05, "loss": 0.2415, "step": 3856 }, { "epoch": 0.806060606060606, "grad_norm": 1.2236348209826255, "learning_rate": 1.9434994544951006e-05, "loss": 0.2371, "step": 3857 }, { "epoch": 0.806269592476489, "grad_norm": 1.2283483988453952, "learning_rate": 1.9434620654704454e-05, "loss": 0.2365, "step": 3858 }, { "epoch": 0.806478578892372, "grad_norm": 1.3381002756592606, "learning_rate": 1.9434246644387443e-05, "loss": 0.215, "step": 3859 }, { "epoch": 0.8066875653082549, "grad_norm": 0.9452107789057612, "learning_rate": 1.943387251400474e-05, "loss": 0.1986, "step": 3860 }, { "epoch": 0.8068965517241379, "grad_norm": 1.0638109094094548, "learning_rate": 1.9433498263561095e-05, "loss": 0.233, "step": 3861 }, { "epoch": 0.807105538140021, "grad_norm": 1.0577922315222015, "learning_rate": 1.9433123893061275e-05, "loss": 0.2156, "step": 3862 }, { "epoch": 0.8073145245559039, "grad_norm": 1.0768700066606984, "learning_rate": 1.9432749402510048e-05, "loss": 0.1923, "step": 3863 }, { "epoch": 0.8075235109717869, "grad_norm": 1.0482116340630374, "learning_rate": 1.943237479191218e-05, "loss": 0.1916, "step": 3864 }, { "epoch": 0.8077324973876698, "grad_norm": 1.4955506853891702, "learning_rate": 1.9432000061272432e-05, "loss": 0.21, "step": 3865 }, { "epoch": 0.8079414838035528, "grad_norm": 1.292382633720618, "learning_rate": 1.943162521059558e-05, "loss": 0.2281, "step": 3866 }, { "epoch": 0.8081504702194358, "grad_norm": 1.1899421422201462, "learning_rate": 1.9431250239886386e-05, "loss": 0.247, "step": 3867 }, { "epoch": 0.8083594566353187, "grad_norm": 1.1221436102299616, "learning_rate": 1.9430875149149636e-05, "loss": 0.2106, "step": 3868 }, { "epoch": 0.8085684430512017, "grad_norm": 1.2050793795045744, "learning_rate": 1.943049993839009e-05, "loss": 0.2078, "step": 3869 }, { "epoch": 0.8087774294670846, "grad_norm": 1.3070185829911407, "learning_rate": 1.943012460761253e-05, "loss": 0.2331, "step": 3870 }, { "epoch": 0.8089864158829676, "grad_norm": 1.3052922583471973, "learning_rate": 1.9429749156821732e-05, "loss": 0.1789, "step": 3871 }, { "epoch": 0.8091954022988506, "grad_norm": 0.9689094978466462, "learning_rate": 1.9429373586022472e-05, "loss": 0.2122, "step": 3872 }, { "epoch": 0.8094043887147335, "grad_norm": 1.157664438301853, "learning_rate": 1.9428997895219535e-05, "loss": 0.2294, "step": 3873 }, { "epoch": 0.8096133751306165, "grad_norm": 1.2010464312358349, "learning_rate": 1.9428622084417698e-05, "loss": 0.2248, "step": 3874 }, { "epoch": 0.8098223615464994, "grad_norm": 1.1351747945908286, "learning_rate": 1.9428246153621738e-05, "loss": 0.1779, "step": 3875 }, { "epoch": 0.8100313479623824, "grad_norm": 1.012116728557041, "learning_rate": 1.9427870102836453e-05, "loss": 0.2036, "step": 3876 }, { "epoch": 0.8102403343782654, "grad_norm": 1.094705465243428, "learning_rate": 1.9427493932066616e-05, "loss": 0.2352, "step": 3877 }, { "epoch": 0.8104493207941483, "grad_norm": 1.1740493948364537, "learning_rate": 1.9427117641317022e-05, "loss": 0.2208, "step": 3878 }, { "epoch": 0.8106583072100313, "grad_norm": 1.4341293086194709, "learning_rate": 1.942674123059246e-05, "loss": 0.2618, "step": 3879 }, { "epoch": 0.8108672936259144, "grad_norm": 1.0768748237930899, "learning_rate": 1.9426364699897716e-05, "loss": 0.2157, "step": 3880 }, { "epoch": 0.8110762800417973, "grad_norm": 1.1107001255258107, "learning_rate": 1.9425988049237582e-05, "loss": 0.1951, "step": 3881 }, { "epoch": 0.8112852664576803, "grad_norm": 1.292446234133741, "learning_rate": 1.9425611278616857e-05, "loss": 0.2086, "step": 3882 }, { "epoch": 0.8114942528735632, "grad_norm": 1.1629057036842516, "learning_rate": 1.942523438804033e-05, "loss": 0.167, "step": 3883 }, { "epoch": 0.8117032392894462, "grad_norm": 1.0439705024233958, "learning_rate": 1.9424857377512803e-05, "loss": 0.2006, "step": 3884 }, { "epoch": 0.8119122257053292, "grad_norm": 1.1076603877032642, "learning_rate": 1.942448024703907e-05, "loss": 0.2275, "step": 3885 }, { "epoch": 0.8121212121212121, "grad_norm": 1.1266875771727214, "learning_rate": 1.942410299662393e-05, "loss": 0.2112, "step": 3886 }, { "epoch": 0.8123301985370951, "grad_norm": 1.0479387207068342, "learning_rate": 1.9423725626272184e-05, "loss": 0.1937, "step": 3887 }, { "epoch": 0.812539184952978, "grad_norm": 1.078933784058327, "learning_rate": 1.942334813598864e-05, "loss": 0.2057, "step": 3888 }, { "epoch": 0.812748171368861, "grad_norm": 1.4694952704143205, "learning_rate": 1.94229705257781e-05, "loss": 0.1886, "step": 3889 }, { "epoch": 0.812957157784744, "grad_norm": 0.9759927414373833, "learning_rate": 1.9422592795645367e-05, "loss": 0.1815, "step": 3890 }, { "epoch": 0.8131661442006269, "grad_norm": 1.0479863690004658, "learning_rate": 1.942221494559525e-05, "loss": 0.1822, "step": 3891 }, { "epoch": 0.8133751306165099, "grad_norm": 1.1972966197239006, "learning_rate": 1.942183697563255e-05, "loss": 0.2165, "step": 3892 }, { "epoch": 0.8135841170323929, "grad_norm": 1.1718149734114305, "learning_rate": 1.9421458885762094e-05, "loss": 0.1845, "step": 3893 }, { "epoch": 0.8137931034482758, "grad_norm": 1.4417570252870409, "learning_rate": 1.9421080675988682e-05, "loss": 0.1998, "step": 3894 }, { "epoch": 0.8140020898641588, "grad_norm": 1.229584225629095, "learning_rate": 1.942070234631713e-05, "loss": 0.2142, "step": 3895 }, { "epoch": 0.8142110762800417, "grad_norm": 1.3351962304413716, "learning_rate": 1.942032389675225e-05, "loss": 0.2574, "step": 3896 }, { "epoch": 0.8144200626959248, "grad_norm": 1.2307217237359724, "learning_rate": 1.941994532729886e-05, "loss": 0.2154, "step": 3897 }, { "epoch": 0.8146290491118078, "grad_norm": 1.2574794036380244, "learning_rate": 1.9419566637961783e-05, "loss": 0.1997, "step": 3898 }, { "epoch": 0.8148380355276907, "grad_norm": 1.3201918807623692, "learning_rate": 1.941918782874583e-05, "loss": 0.2256, "step": 3899 }, { "epoch": 0.8150470219435737, "grad_norm": 1.295547285571553, "learning_rate": 1.941880889965583e-05, "loss": 0.1884, "step": 3900 }, { "epoch": 0.8152560083594567, "grad_norm": 1.3348032727906016, "learning_rate": 1.9418429850696598e-05, "loss": 0.2324, "step": 3901 }, { "epoch": 0.8154649947753396, "grad_norm": 1.1161743429285484, "learning_rate": 1.9418050681872964e-05, "loss": 0.2063, "step": 3902 }, { "epoch": 0.8156739811912226, "grad_norm": 1.2249387595495012, "learning_rate": 1.941767139318975e-05, "loss": 0.2436, "step": 3903 }, { "epoch": 0.8158829676071055, "grad_norm": 1.0124156547845233, "learning_rate": 1.9417291984651783e-05, "loss": 0.1975, "step": 3904 }, { "epoch": 0.8160919540229885, "grad_norm": 1.1025664189046211, "learning_rate": 1.9416912456263895e-05, "loss": 0.2016, "step": 3905 }, { "epoch": 0.8163009404388715, "grad_norm": 1.1074996455808797, "learning_rate": 1.9416532808030912e-05, "loss": 0.1738, "step": 3906 }, { "epoch": 0.8165099268547544, "grad_norm": 1.4747508220636376, "learning_rate": 1.941615303995767e-05, "loss": 0.2382, "step": 3907 }, { "epoch": 0.8167189132706374, "grad_norm": 1.0056810489973596, "learning_rate": 1.9415773152048998e-05, "loss": 0.1905, "step": 3908 }, { "epoch": 0.8169278996865204, "grad_norm": 1.0155713841085245, "learning_rate": 1.941539314430973e-05, "loss": 0.1812, "step": 3909 }, { "epoch": 0.8171368861024033, "grad_norm": 1.06106983174227, "learning_rate": 1.9415013016744705e-05, "loss": 0.2117, "step": 3910 }, { "epoch": 0.8173458725182863, "grad_norm": 1.1680215132367222, "learning_rate": 1.9414632769358762e-05, "loss": 0.1841, "step": 3911 }, { "epoch": 0.8175548589341692, "grad_norm": 1.209507977628353, "learning_rate": 1.941425240215674e-05, "loss": 0.1854, "step": 3912 }, { "epoch": 0.8177638453500522, "grad_norm": 1.2344131639298659, "learning_rate": 1.9413871915143478e-05, "loss": 0.2297, "step": 3913 }, { "epoch": 0.8179728317659352, "grad_norm": 1.299742878312107, "learning_rate": 1.9413491308323816e-05, "loss": 0.2409, "step": 3914 }, { "epoch": 0.8181818181818182, "grad_norm": 1.2617026758153742, "learning_rate": 1.94131105817026e-05, "loss": 0.1887, "step": 3915 }, { "epoch": 0.8183908045977012, "grad_norm": 1.1592180579440132, "learning_rate": 1.9412729735284677e-05, "loss": 0.208, "step": 3916 }, { "epoch": 0.8185997910135842, "grad_norm": 0.9948187340184454, "learning_rate": 1.9412348769074893e-05, "loss": 0.2267, "step": 3917 }, { "epoch": 0.8188087774294671, "grad_norm": 1.265560381266135, "learning_rate": 1.9411967683078095e-05, "loss": 0.1951, "step": 3918 }, { "epoch": 0.8190177638453501, "grad_norm": 0.9092105097698426, "learning_rate": 1.9411586477299136e-05, "loss": 0.2115, "step": 3919 }, { "epoch": 0.819226750261233, "grad_norm": 1.0593252789849805, "learning_rate": 1.941120515174286e-05, "loss": 0.2125, "step": 3920 }, { "epoch": 0.819435736677116, "grad_norm": 1.1595135488210853, "learning_rate": 1.941082370641413e-05, "loss": 0.2549, "step": 3921 }, { "epoch": 0.819644723092999, "grad_norm": 1.035565366042705, "learning_rate": 1.9410442141317796e-05, "loss": 0.2051, "step": 3922 }, { "epoch": 0.8198537095088819, "grad_norm": 1.0270000700216193, "learning_rate": 1.9410060456458713e-05, "loss": 0.1821, "step": 3923 }, { "epoch": 0.8200626959247649, "grad_norm": 1.1191029545300177, "learning_rate": 1.9409678651841738e-05, "loss": 0.2178, "step": 3924 }, { "epoch": 0.8202716823406478, "grad_norm": 1.44745644178602, "learning_rate": 1.9409296727471732e-05, "loss": 0.224, "step": 3925 }, { "epoch": 0.8204806687565308, "grad_norm": 1.1034275465908872, "learning_rate": 1.9408914683353556e-05, "loss": 0.2257, "step": 3926 }, { "epoch": 0.8206896551724138, "grad_norm": 1.0629961122207028, "learning_rate": 1.9408532519492072e-05, "loss": 0.2178, "step": 3927 }, { "epoch": 0.8208986415882967, "grad_norm": 1.2710002689163198, "learning_rate": 1.940815023589214e-05, "loss": 0.1969, "step": 3928 }, { "epoch": 0.8211076280041797, "grad_norm": 1.1085055176801593, "learning_rate": 1.940776783255863e-05, "loss": 0.1999, "step": 3929 }, { "epoch": 0.8213166144200627, "grad_norm": 1.228069419230965, "learning_rate": 1.9407385309496405e-05, "loss": 0.2325, "step": 3930 }, { "epoch": 0.8215256008359456, "grad_norm": 1.1317827677849224, "learning_rate": 1.9407002666710334e-05, "loss": 0.2256, "step": 3931 }, { "epoch": 0.8217345872518287, "grad_norm": 1.1944173757048193, "learning_rate": 1.940661990420529e-05, "loss": 0.2558, "step": 3932 }, { "epoch": 0.8219435736677116, "grad_norm": 1.1950739506182226, "learning_rate": 1.940623702198614e-05, "loss": 0.1457, "step": 3933 }, { "epoch": 0.8221525600835946, "grad_norm": 1.0802957284029573, "learning_rate": 1.9405854020057758e-05, "loss": 0.2132, "step": 3934 }, { "epoch": 0.8223615464994776, "grad_norm": 1.0619429028094949, "learning_rate": 1.9405470898425022e-05, "loss": 0.201, "step": 3935 }, { "epoch": 0.8225705329153605, "grad_norm": 1.106967448492851, "learning_rate": 1.94050876570928e-05, "loss": 0.2337, "step": 3936 }, { "epoch": 0.8227795193312435, "grad_norm": 1.016617398331199, "learning_rate": 1.940470429606598e-05, "loss": 0.2352, "step": 3937 }, { "epoch": 0.8229885057471265, "grad_norm": 1.1739258541141127, "learning_rate": 1.9404320815349434e-05, "loss": 0.2147, "step": 3938 }, { "epoch": 0.8231974921630094, "grad_norm": 1.1787274647884622, "learning_rate": 1.9403937214948038e-05, "loss": 0.2186, "step": 3939 }, { "epoch": 0.8234064785788924, "grad_norm": 0.9812366727372814, "learning_rate": 1.9403553494866683e-05, "loss": 0.2467, "step": 3940 }, { "epoch": 0.8236154649947753, "grad_norm": 1.0727643775924711, "learning_rate": 1.9403169655110246e-05, "loss": 0.2141, "step": 3941 }, { "epoch": 0.8238244514106583, "grad_norm": 1.2749058915911677, "learning_rate": 1.9402785695683622e-05, "loss": 0.2222, "step": 3942 }, { "epoch": 0.8240334378265413, "grad_norm": 1.3040801652691607, "learning_rate": 1.9402401616591683e-05, "loss": 0.2131, "step": 3943 }, { "epoch": 0.8242424242424242, "grad_norm": 1.1903503186830513, "learning_rate": 1.9402017417839328e-05, "loss": 0.1761, "step": 3944 }, { "epoch": 0.8244514106583072, "grad_norm": 1.0123232279020138, "learning_rate": 1.9401633099431437e-05, "loss": 0.187, "step": 3945 }, { "epoch": 0.8246603970741901, "grad_norm": 1.1081323530031641, "learning_rate": 1.9401248661372915e-05, "loss": 0.1685, "step": 3946 }, { "epoch": 0.8248693834900731, "grad_norm": 1.1220957126894526, "learning_rate": 1.940086410366864e-05, "loss": 0.225, "step": 3947 }, { "epoch": 0.8250783699059561, "grad_norm": 1.0817375524154553, "learning_rate": 1.9400479426323515e-05, "loss": 0.2135, "step": 3948 }, { "epoch": 0.825287356321839, "grad_norm": 1.1065017063814704, "learning_rate": 1.9400094629342435e-05, "loss": 0.1873, "step": 3949 }, { "epoch": 0.8254963427377221, "grad_norm": 1.0496741679460369, "learning_rate": 1.939970971273029e-05, "loss": 0.22, "step": 3950 }, { "epoch": 0.8257053291536051, "grad_norm": 1.2296543974561083, "learning_rate": 1.939932467649199e-05, "loss": 0.1754, "step": 3951 }, { "epoch": 0.825914315569488, "grad_norm": 1.1199712237365298, "learning_rate": 1.939893952063242e-05, "loss": 0.2171, "step": 3952 }, { "epoch": 0.826123301985371, "grad_norm": 1.1890850544826481, "learning_rate": 1.93985542451565e-05, "loss": 0.2056, "step": 3953 }, { "epoch": 0.826332288401254, "grad_norm": 0.8267532966278596, "learning_rate": 1.9398168850069122e-05, "loss": 0.173, "step": 3954 }, { "epoch": 0.8265412748171369, "grad_norm": 1.2174720417646114, "learning_rate": 1.939778333537519e-05, "loss": 0.1934, "step": 3955 }, { "epoch": 0.8267502612330199, "grad_norm": 1.1124517392368463, "learning_rate": 1.9397397701079616e-05, "loss": 0.1934, "step": 3956 }, { "epoch": 0.8269592476489028, "grad_norm": 1.0826566473861763, "learning_rate": 1.9397011947187306e-05, "loss": 0.2231, "step": 3957 }, { "epoch": 0.8271682340647858, "grad_norm": 1.139012365534928, "learning_rate": 1.9396626073703164e-05, "loss": 0.1794, "step": 3958 }, { "epoch": 0.8273772204806688, "grad_norm": 1.1997188039786109, "learning_rate": 1.9396240080632107e-05, "loss": 0.2389, "step": 3959 }, { "epoch": 0.8275862068965517, "grad_norm": 1.027500074639148, "learning_rate": 1.9395853967979047e-05, "loss": 0.205, "step": 3960 }, { "epoch": 0.8277951933124347, "grad_norm": 0.9642907760773325, "learning_rate": 1.9395467735748895e-05, "loss": 0.2007, "step": 3961 }, { "epoch": 0.8280041797283176, "grad_norm": 1.2340263394890674, "learning_rate": 1.939508138394657e-05, "loss": 0.1593, "step": 3962 }, { "epoch": 0.8282131661442006, "grad_norm": 1.3821093355569312, "learning_rate": 1.939469491257698e-05, "loss": 0.2151, "step": 3963 }, { "epoch": 0.8284221525600836, "grad_norm": 1.174277569230016, "learning_rate": 1.9394308321645058e-05, "loss": 0.2285, "step": 3964 }, { "epoch": 0.8286311389759665, "grad_norm": 1.0280895229893317, "learning_rate": 1.939392161115571e-05, "loss": 0.2045, "step": 3965 }, { "epoch": 0.8288401253918495, "grad_norm": 0.987458934231753, "learning_rate": 1.9393534781113868e-05, "loss": 0.2073, "step": 3966 }, { "epoch": 0.8290491118077326, "grad_norm": 0.918970939080062, "learning_rate": 1.939314783152445e-05, "loss": 0.1935, "step": 3967 }, { "epoch": 0.8292580982236155, "grad_norm": 1.3282445814592827, "learning_rate": 1.9392760762392376e-05, "loss": 0.1953, "step": 3968 }, { "epoch": 0.8294670846394985, "grad_norm": 1.4226225488550017, "learning_rate": 1.9392373573722585e-05, "loss": 0.228, "step": 3969 }, { "epoch": 0.8296760710553814, "grad_norm": 1.1605892426147884, "learning_rate": 1.9391986265519992e-05, "loss": 0.2221, "step": 3970 }, { "epoch": 0.8298850574712644, "grad_norm": 1.2138378222559159, "learning_rate": 1.9391598837789535e-05, "loss": 0.1896, "step": 3971 }, { "epoch": 0.8300940438871474, "grad_norm": 1.2153036465171805, "learning_rate": 1.9391211290536135e-05, "loss": 0.2347, "step": 3972 }, { "epoch": 0.8303030303030303, "grad_norm": 1.2174592308152856, "learning_rate": 1.9390823623764732e-05, "loss": 0.1933, "step": 3973 }, { "epoch": 0.8305120167189133, "grad_norm": 0.9615254191713848, "learning_rate": 1.9390435837480258e-05, "loss": 0.2285, "step": 3974 }, { "epoch": 0.8307210031347962, "grad_norm": 1.0425171065128191, "learning_rate": 1.939004793168765e-05, "loss": 0.217, "step": 3975 }, { "epoch": 0.8309299895506792, "grad_norm": 1.044344701245129, "learning_rate": 1.938965990639184e-05, "loss": 0.1817, "step": 3976 }, { "epoch": 0.8311389759665622, "grad_norm": 0.9964076970240121, "learning_rate": 1.9389271761597767e-05, "loss": 0.1849, "step": 3977 }, { "epoch": 0.8313479623824451, "grad_norm": 1.009946533713297, "learning_rate": 1.9388883497310377e-05, "loss": 0.2112, "step": 3978 }, { "epoch": 0.8315569487983281, "grad_norm": 1.1015735420062769, "learning_rate": 1.9388495113534604e-05, "loss": 0.1684, "step": 3979 }, { "epoch": 0.831765935214211, "grad_norm": 1.0818701811118536, "learning_rate": 1.938810661027539e-05, "loss": 0.1832, "step": 3980 }, { "epoch": 0.831974921630094, "grad_norm": 1.1024702813257319, "learning_rate": 1.9387717987537686e-05, "loss": 0.2014, "step": 3981 }, { "epoch": 0.832183908045977, "grad_norm": 0.8857431901017268, "learning_rate": 1.9387329245326436e-05, "loss": 0.1596, "step": 3982 }, { "epoch": 0.8323928944618599, "grad_norm": 1.183527128203157, "learning_rate": 1.9386940383646587e-05, "loss": 0.1966, "step": 3983 }, { "epoch": 0.8326018808777429, "grad_norm": 1.353536999736196, "learning_rate": 1.938655140250308e-05, "loss": 0.2158, "step": 3984 }, { "epoch": 0.832810867293626, "grad_norm": 1.0760113119771364, "learning_rate": 1.9386162301900876e-05, "loss": 0.1844, "step": 3985 }, { "epoch": 0.8330198537095089, "grad_norm": 1.4247990776207309, "learning_rate": 1.9385773081844927e-05, "loss": 0.2008, "step": 3986 }, { "epoch": 0.8332288401253919, "grad_norm": 1.1623844450042258, "learning_rate": 1.938538374234018e-05, "loss": 0.189, "step": 3987 }, { "epoch": 0.8334378265412749, "grad_norm": 1.1148318875230154, "learning_rate": 1.9384994283391588e-05, "loss": 0.2062, "step": 3988 }, { "epoch": 0.8336468129571578, "grad_norm": 1.0187394641378293, "learning_rate": 1.9384604705004118e-05, "loss": 0.2044, "step": 3989 }, { "epoch": 0.8338557993730408, "grad_norm": 1.1839931301798807, "learning_rate": 1.938421500718272e-05, "loss": 0.2085, "step": 3990 }, { "epoch": 0.8340647857889237, "grad_norm": 1.0408652772324682, "learning_rate": 1.9383825189932353e-05, "loss": 0.1649, "step": 3991 }, { "epoch": 0.8342737722048067, "grad_norm": 1.2064929623447078, "learning_rate": 1.9383435253257983e-05, "loss": 0.2202, "step": 3992 }, { "epoch": 0.8344827586206897, "grad_norm": 1.5038905282767014, "learning_rate": 1.938304519716457e-05, "loss": 0.2455, "step": 3993 }, { "epoch": 0.8346917450365726, "grad_norm": 1.0992209830734125, "learning_rate": 1.9382655021657077e-05, "loss": 0.2326, "step": 3994 }, { "epoch": 0.8349007314524556, "grad_norm": 1.2800226557720855, "learning_rate": 1.9382264726740473e-05, "loss": 0.2251, "step": 3995 }, { "epoch": 0.8351097178683385, "grad_norm": 1.072681322464303, "learning_rate": 1.938187431241972e-05, "loss": 0.2003, "step": 3996 }, { "epoch": 0.8353187042842215, "grad_norm": 1.1377637218380112, "learning_rate": 1.9381483778699795e-05, "loss": 0.1604, "step": 3997 }, { "epoch": 0.8355276907001045, "grad_norm": 1.069080746142779, "learning_rate": 1.9381093125585656e-05, "loss": 0.2137, "step": 3998 }, { "epoch": 0.8357366771159874, "grad_norm": 1.178522900904782, "learning_rate": 1.9380702353082287e-05, "loss": 0.1928, "step": 3999 }, { "epoch": 0.8359456635318704, "grad_norm": 1.049607880169879, "learning_rate": 1.9380311461194652e-05, "loss": 0.233, "step": 4000 }, { "epoch": 0.8361546499477533, "grad_norm": 0.8504770579896225, "learning_rate": 1.9379920449927732e-05, "loss": 0.2033, "step": 4001 }, { "epoch": 0.8363636363636363, "grad_norm": 1.0054874006365364, "learning_rate": 1.93795293192865e-05, "loss": 0.2112, "step": 4002 }, { "epoch": 0.8365726227795194, "grad_norm": 1.0253898780362962, "learning_rate": 1.9379138069275934e-05, "loss": 0.2139, "step": 4003 }, { "epoch": 0.8367816091954023, "grad_norm": 1.1117552178151477, "learning_rate": 1.937874669990101e-05, "loss": 0.2237, "step": 4004 }, { "epoch": 0.8369905956112853, "grad_norm": 1.040785918304756, "learning_rate": 1.9378355211166714e-05, "loss": 0.1959, "step": 4005 }, { "epoch": 0.8371995820271683, "grad_norm": 1.0075512618025775, "learning_rate": 1.9377963603078028e-05, "loss": 0.192, "step": 4006 }, { "epoch": 0.8374085684430512, "grad_norm": 1.2829970654626364, "learning_rate": 1.9377571875639936e-05, "loss": 0.2259, "step": 4007 }, { "epoch": 0.8376175548589342, "grad_norm": 1.0110267117218321, "learning_rate": 1.937718002885742e-05, "loss": 0.2141, "step": 4008 }, { "epoch": 0.8378265412748171, "grad_norm": 1.0164752926296428, "learning_rate": 1.9376788062735473e-05, "loss": 0.2418, "step": 4009 }, { "epoch": 0.8380355276907001, "grad_norm": 0.9853273375049706, "learning_rate": 1.9376395977279072e-05, "loss": 0.2055, "step": 4010 }, { "epoch": 0.8382445141065831, "grad_norm": 1.1710409442462657, "learning_rate": 1.9376003772493218e-05, "loss": 0.2273, "step": 4011 }, { "epoch": 0.838453500522466, "grad_norm": 0.9893121091791022, "learning_rate": 1.93756114483829e-05, "loss": 0.1603, "step": 4012 }, { "epoch": 0.838662486938349, "grad_norm": 1.2666560449826934, "learning_rate": 1.9375219004953106e-05, "loss": 0.2188, "step": 4013 }, { "epoch": 0.838871473354232, "grad_norm": 1.273125448869077, "learning_rate": 1.9374826442208835e-05, "loss": 0.2168, "step": 4014 }, { "epoch": 0.8390804597701149, "grad_norm": 1.18162910848169, "learning_rate": 1.9374433760155084e-05, "loss": 0.1915, "step": 4015 }, { "epoch": 0.8392894461859979, "grad_norm": 1.1382181993294633, "learning_rate": 1.937404095879684e-05, "loss": 0.2054, "step": 4016 }, { "epoch": 0.8394984326018808, "grad_norm": 1.1194543973146143, "learning_rate": 1.937364803813912e-05, "loss": 0.2011, "step": 4017 }, { "epoch": 0.8397074190177638, "grad_norm": 1.0070219730954253, "learning_rate": 1.937325499818691e-05, "loss": 0.1908, "step": 4018 }, { "epoch": 0.8399164054336468, "grad_norm": 1.1248973948400665, "learning_rate": 1.937286183894522e-05, "loss": 0.1857, "step": 4019 }, { "epoch": 0.8401253918495298, "grad_norm": 1.2165664205163431, "learning_rate": 1.9372468560419048e-05, "loss": 0.2184, "step": 4020 }, { "epoch": 0.8403343782654128, "grad_norm": 1.2414517880546447, "learning_rate": 1.9372075162613404e-05, "loss": 0.2088, "step": 4021 }, { "epoch": 0.8405433646812958, "grad_norm": 1.1925375741177766, "learning_rate": 1.937168164553329e-05, "loss": 0.2783, "step": 4022 }, { "epoch": 0.8407523510971787, "grad_norm": 1.0438385744985363, "learning_rate": 1.9371288009183717e-05, "loss": 0.2007, "step": 4023 }, { "epoch": 0.8409613375130617, "grad_norm": 0.9686469981824993, "learning_rate": 1.9370894253569695e-05, "loss": 0.205, "step": 4024 }, { "epoch": 0.8411703239289446, "grad_norm": 1.1635477843462043, "learning_rate": 1.9370500378696234e-05, "loss": 0.2221, "step": 4025 }, { "epoch": 0.8413793103448276, "grad_norm": 1.055487579543821, "learning_rate": 1.9370106384568346e-05, "loss": 0.2069, "step": 4026 }, { "epoch": 0.8415882967607106, "grad_norm": 1.157557863843171, "learning_rate": 1.9369712271191047e-05, "loss": 0.2005, "step": 4027 }, { "epoch": 0.8417972831765935, "grad_norm": 1.0151828933048428, "learning_rate": 1.936931803856935e-05, "loss": 0.2097, "step": 4028 }, { "epoch": 0.8420062695924765, "grad_norm": 1.1550726224331582, "learning_rate": 1.9368923686708275e-05, "loss": 0.2474, "step": 4029 }, { "epoch": 0.8422152560083594, "grad_norm": 1.2651164502454466, "learning_rate": 1.9368529215612842e-05, "loss": 0.2185, "step": 4030 }, { "epoch": 0.8424242424242424, "grad_norm": 0.9873806326448362, "learning_rate": 1.9368134625288065e-05, "loss": 0.2297, "step": 4031 }, { "epoch": 0.8426332288401254, "grad_norm": 1.029456519935531, "learning_rate": 1.9367739915738972e-05, "loss": 0.1695, "step": 4032 }, { "epoch": 0.8428422152560083, "grad_norm": 1.1048390781992534, "learning_rate": 1.9367345086970585e-05, "loss": 0.2195, "step": 4033 }, { "epoch": 0.8430512016718913, "grad_norm": 1.359883267491971, "learning_rate": 1.9366950138987925e-05, "loss": 0.1941, "step": 4034 }, { "epoch": 0.8432601880877743, "grad_norm": 1.2862962085858354, "learning_rate": 1.9366555071796026e-05, "loss": 0.1911, "step": 4035 }, { "epoch": 0.8434691745036572, "grad_norm": 0.9384772642315568, "learning_rate": 1.9366159885399907e-05, "loss": 0.1824, "step": 4036 }, { "epoch": 0.8436781609195402, "grad_norm": 1.0579727966744634, "learning_rate": 1.93657645798046e-05, "loss": 0.1932, "step": 4037 }, { "epoch": 0.8438871473354232, "grad_norm": 0.9833748466534808, "learning_rate": 1.9365369155015142e-05, "loss": 0.1929, "step": 4038 }, { "epoch": 0.8440961337513062, "grad_norm": 1.1942623945132864, "learning_rate": 1.9364973611036556e-05, "loss": 0.2314, "step": 4039 }, { "epoch": 0.8443051201671892, "grad_norm": 1.1291470215293307, "learning_rate": 1.9364577947873885e-05, "loss": 0.2333, "step": 4040 }, { "epoch": 0.8445141065830721, "grad_norm": 1.0661018551496377, "learning_rate": 1.936418216553216e-05, "loss": 0.1989, "step": 4041 }, { "epoch": 0.8447230929989551, "grad_norm": 0.9110050156599563, "learning_rate": 1.9363786264016416e-05, "loss": 0.1785, "step": 4042 }, { "epoch": 0.844932079414838, "grad_norm": 1.1057286588731943, "learning_rate": 1.9363390243331696e-05, "loss": 0.195, "step": 4043 }, { "epoch": 0.845141065830721, "grad_norm": 1.038166004618083, "learning_rate": 1.9362994103483036e-05, "loss": 0.2262, "step": 4044 }, { "epoch": 0.845350052246604, "grad_norm": 1.0944143824011576, "learning_rate": 1.9362597844475475e-05, "loss": 0.2156, "step": 4045 }, { "epoch": 0.8455590386624869, "grad_norm": 0.9685195370778594, "learning_rate": 1.936220146631407e-05, "loss": 0.1743, "step": 4046 }, { "epoch": 0.8457680250783699, "grad_norm": 1.4144053297165906, "learning_rate": 1.9361804969003844e-05, "loss": 0.1633, "step": 4047 }, { "epoch": 0.8459770114942529, "grad_norm": 1.1254404714620383, "learning_rate": 1.936140835254986e-05, "loss": 0.2578, "step": 4048 }, { "epoch": 0.8461859979101358, "grad_norm": 1.0001736742000953, "learning_rate": 1.9361011616957165e-05, "loss": 0.2284, "step": 4049 }, { "epoch": 0.8463949843260188, "grad_norm": 1.1873972582997083, "learning_rate": 1.9360614762230798e-05, "loss": 0.2043, "step": 4050 }, { "epoch": 0.8466039707419017, "grad_norm": 1.1957940761522705, "learning_rate": 1.9360217788375815e-05, "loss": 0.2257, "step": 4051 }, { "epoch": 0.8468129571577847, "grad_norm": 1.0840834176331122, "learning_rate": 1.9359820695397267e-05, "loss": 0.2298, "step": 4052 }, { "epoch": 0.8470219435736677, "grad_norm": 1.0239132430998292, "learning_rate": 1.9359423483300214e-05, "loss": 0.1867, "step": 4053 }, { "epoch": 0.8472309299895506, "grad_norm": 1.0715670651268598, "learning_rate": 1.9359026152089702e-05, "loss": 0.1831, "step": 4054 }, { "epoch": 0.8474399164054337, "grad_norm": 1.2334108046816927, "learning_rate": 1.9358628701770793e-05, "loss": 0.2084, "step": 4055 }, { "epoch": 0.8476489028213167, "grad_norm": 1.056510762399386, "learning_rate": 1.935823113234854e-05, "loss": 0.2418, "step": 4056 }, { "epoch": 0.8478578892371996, "grad_norm": 1.031573493885866, "learning_rate": 1.935783344382801e-05, "loss": 0.1873, "step": 4057 }, { "epoch": 0.8480668756530826, "grad_norm": 1.131791671900898, "learning_rate": 1.9357435636214257e-05, "loss": 0.2166, "step": 4058 }, { "epoch": 0.8482758620689655, "grad_norm": 1.1665636325909061, "learning_rate": 1.9357037709512353e-05, "loss": 0.2147, "step": 4059 }, { "epoch": 0.8484848484848485, "grad_norm": 1.1442203784021932, "learning_rate": 1.9356639663727354e-05, "loss": 0.2344, "step": 4060 }, { "epoch": 0.8486938349007315, "grad_norm": 1.1980275051727547, "learning_rate": 1.9356241498864325e-05, "loss": 0.2128, "step": 4061 }, { "epoch": 0.8489028213166144, "grad_norm": 0.9751757112497172, "learning_rate": 1.9355843214928342e-05, "loss": 0.2064, "step": 4062 }, { "epoch": 0.8491118077324974, "grad_norm": 0.9695575433811511, "learning_rate": 1.9355444811924466e-05, "loss": 0.1881, "step": 4063 }, { "epoch": 0.8493207941483804, "grad_norm": 1.187093171872189, "learning_rate": 1.9355046289857767e-05, "loss": 0.1739, "step": 4064 }, { "epoch": 0.8495297805642633, "grad_norm": 1.1674281261699184, "learning_rate": 1.9354647648733323e-05, "loss": 0.1755, "step": 4065 }, { "epoch": 0.8497387669801463, "grad_norm": 1.3820830996705278, "learning_rate": 1.9354248888556203e-05, "loss": 0.1916, "step": 4066 }, { "epoch": 0.8499477533960292, "grad_norm": 1.091052789803186, "learning_rate": 1.9353850009331485e-05, "loss": 0.2063, "step": 4067 }, { "epoch": 0.8501567398119122, "grad_norm": 1.2677283827225834, "learning_rate": 1.9353451011064242e-05, "loss": 0.2023, "step": 4068 }, { "epoch": 0.8503657262277952, "grad_norm": 1.0357436390544819, "learning_rate": 1.9353051893759552e-05, "loss": 0.1922, "step": 4069 }, { "epoch": 0.8505747126436781, "grad_norm": 0.9440979362146847, "learning_rate": 1.9352652657422495e-05, "loss": 0.167, "step": 4070 }, { "epoch": 0.8507836990595611, "grad_norm": 1.1250442923131103, "learning_rate": 1.9352253302058152e-05, "loss": 0.2037, "step": 4071 }, { "epoch": 0.850992685475444, "grad_norm": 1.145988166863542, "learning_rate": 1.9351853827671605e-05, "loss": 0.1637, "step": 4072 }, { "epoch": 0.8512016718913271, "grad_norm": 1.2680351056016268, "learning_rate": 1.935145423426794e-05, "loss": 0.1876, "step": 4073 }, { "epoch": 0.8514106583072101, "grad_norm": 1.1976732910544807, "learning_rate": 1.935105452185224e-05, "loss": 0.1994, "step": 4074 }, { "epoch": 0.851619644723093, "grad_norm": 1.3213073002885871, "learning_rate": 1.9350654690429598e-05, "loss": 0.1988, "step": 4075 }, { "epoch": 0.851828631138976, "grad_norm": 1.292049713908496, "learning_rate": 1.9350254740005094e-05, "loss": 0.2169, "step": 4076 }, { "epoch": 0.852037617554859, "grad_norm": 1.139242614425271, "learning_rate": 1.934985467058382e-05, "loss": 0.2003, "step": 4077 }, { "epoch": 0.8522466039707419, "grad_norm": 1.066724377669138, "learning_rate": 1.9349454482170872e-05, "loss": 0.1833, "step": 4078 }, { "epoch": 0.8524555903866249, "grad_norm": 1.09923849039026, "learning_rate": 1.934905417477134e-05, "loss": 0.1977, "step": 4079 }, { "epoch": 0.8526645768025078, "grad_norm": 1.303511975899514, "learning_rate": 1.9348653748390317e-05, "loss": 0.2263, "step": 4080 }, { "epoch": 0.8528735632183908, "grad_norm": 0.9695612658184601, "learning_rate": 1.93482532030329e-05, "loss": 0.1946, "step": 4081 }, { "epoch": 0.8530825496342738, "grad_norm": 1.016365444611059, "learning_rate": 1.934785253870419e-05, "loss": 0.2057, "step": 4082 }, { "epoch": 0.8532915360501567, "grad_norm": 1.10032978494372, "learning_rate": 1.934745175540928e-05, "loss": 0.2035, "step": 4083 }, { "epoch": 0.8535005224660397, "grad_norm": 1.1974588811962084, "learning_rate": 1.934705085315328e-05, "loss": 0.1959, "step": 4084 }, { "epoch": 0.8537095088819227, "grad_norm": 1.0039694560288075, "learning_rate": 1.934664983194128e-05, "loss": 0.223, "step": 4085 }, { "epoch": 0.8539184952978056, "grad_norm": 1.5156551422796753, "learning_rate": 1.934624869177839e-05, "loss": 0.2115, "step": 4086 }, { "epoch": 0.8541274817136886, "grad_norm": 1.2938620484105074, "learning_rate": 1.9345847432669717e-05, "loss": 0.1978, "step": 4087 }, { "epoch": 0.8543364681295715, "grad_norm": 1.0677912023726808, "learning_rate": 1.9345446054620366e-05, "loss": 0.1623, "step": 4088 }, { "epoch": 0.8545454545454545, "grad_norm": 1.1423188360142371, "learning_rate": 1.9345044557635442e-05, "loss": 0.2043, "step": 4089 }, { "epoch": 0.8547544409613375, "grad_norm": 1.364961167117939, "learning_rate": 1.9344642941720063e-05, "loss": 0.2015, "step": 4090 }, { "epoch": 0.8549634273772205, "grad_norm": 1.0015316344287903, "learning_rate": 1.934424120687933e-05, "loss": 0.1839, "step": 4091 }, { "epoch": 0.8551724137931035, "grad_norm": 1.48478001385387, "learning_rate": 1.9343839353118362e-05, "loss": 0.2299, "step": 4092 }, { "epoch": 0.8553814002089865, "grad_norm": 1.1390172170064246, "learning_rate": 1.934343738044227e-05, "loss": 0.2079, "step": 4093 }, { "epoch": 0.8555903866248694, "grad_norm": 1.062734738256369, "learning_rate": 1.9343035288856172e-05, "loss": 0.2144, "step": 4094 }, { "epoch": 0.8557993730407524, "grad_norm": 1.1076983504617957, "learning_rate": 1.9342633078365185e-05, "loss": 0.2049, "step": 4095 }, { "epoch": 0.8560083594566353, "grad_norm": 1.137801395862023, "learning_rate": 1.9342230748974428e-05, "loss": 0.1997, "step": 4096 }, { "epoch": 0.8562173458725183, "grad_norm": 1.387970600543401, "learning_rate": 1.934182830068902e-05, "loss": 0.2093, "step": 4097 }, { "epoch": 0.8564263322884013, "grad_norm": 0.8303675959121947, "learning_rate": 1.9341425733514082e-05, "loss": 0.1749, "step": 4098 }, { "epoch": 0.8566353187042842, "grad_norm": 1.0456212876201982, "learning_rate": 1.9341023047454743e-05, "loss": 0.1886, "step": 4099 }, { "epoch": 0.8568443051201672, "grad_norm": 1.3072834805047735, "learning_rate": 1.9340620242516118e-05, "loss": 0.1863, "step": 4100 }, { "epoch": 0.8570532915360501, "grad_norm": 1.2287546155836426, "learning_rate": 1.9340217318703337e-05, "loss": 0.2077, "step": 4101 }, { "epoch": 0.8572622779519331, "grad_norm": 1.0462450662525697, "learning_rate": 1.9339814276021535e-05, "loss": 0.2066, "step": 4102 }, { "epoch": 0.8574712643678161, "grad_norm": 1.000580176972392, "learning_rate": 1.9339411114475835e-05, "loss": 0.1746, "step": 4103 }, { "epoch": 0.857680250783699, "grad_norm": 0.9651134607763758, "learning_rate": 1.933900783407137e-05, "loss": 0.1923, "step": 4104 }, { "epoch": 0.857889237199582, "grad_norm": 1.076878654783282, "learning_rate": 1.933860443481327e-05, "loss": 0.2127, "step": 4105 }, { "epoch": 0.858098223615465, "grad_norm": 1.2170060501442634, "learning_rate": 1.933820091670667e-05, "loss": 0.2148, "step": 4106 }, { "epoch": 0.8583072100313479, "grad_norm": 1.249262463340181, "learning_rate": 1.9337797279756705e-05, "loss": 0.2106, "step": 4107 }, { "epoch": 0.858516196447231, "grad_norm": 0.9290046768354328, "learning_rate": 1.933739352396851e-05, "loss": 0.1902, "step": 4108 }, { "epoch": 0.858725182863114, "grad_norm": 1.2743642801567885, "learning_rate": 1.9336989649347232e-05, "loss": 0.1965, "step": 4109 }, { "epoch": 0.8589341692789969, "grad_norm": 1.4697277707318537, "learning_rate": 1.9336585655898e-05, "loss": 0.2037, "step": 4110 }, { "epoch": 0.8591431556948799, "grad_norm": 0.9912052325204246, "learning_rate": 1.9336181543625963e-05, "loss": 0.1954, "step": 4111 }, { "epoch": 0.8593521421107628, "grad_norm": 0.9739357223119632, "learning_rate": 1.9335777312536258e-05, "loss": 0.1801, "step": 4112 }, { "epoch": 0.8595611285266458, "grad_norm": 1.0474634496781827, "learning_rate": 1.9335372962634037e-05, "loss": 0.2054, "step": 4113 }, { "epoch": 0.8597701149425288, "grad_norm": 1.2209696431366421, "learning_rate": 1.933496849392444e-05, "loss": 0.2203, "step": 4114 }, { "epoch": 0.8599791013584117, "grad_norm": 1.1325440110260687, "learning_rate": 1.9334563906412615e-05, "loss": 0.1966, "step": 4115 }, { "epoch": 0.8601880877742947, "grad_norm": 1.3098452774442706, "learning_rate": 1.9334159200103714e-05, "loss": 0.1883, "step": 4116 }, { "epoch": 0.8603970741901776, "grad_norm": 1.4796273234208606, "learning_rate": 1.9333754375002884e-05, "loss": 0.2294, "step": 4117 }, { "epoch": 0.8606060606060606, "grad_norm": 1.1532457496116513, "learning_rate": 1.933334943111528e-05, "loss": 0.2095, "step": 4118 }, { "epoch": 0.8608150470219436, "grad_norm": 1.0787315012749796, "learning_rate": 1.933294436844605e-05, "loss": 0.1756, "step": 4119 }, { "epoch": 0.8610240334378265, "grad_norm": 1.2571152812920932, "learning_rate": 1.933253918700036e-05, "loss": 0.2181, "step": 4120 }, { "epoch": 0.8612330198537095, "grad_norm": 1.3657470387759074, "learning_rate": 1.9332133886783358e-05, "loss": 0.1719, "step": 4121 }, { "epoch": 0.8614420062695924, "grad_norm": 1.1202145868120825, "learning_rate": 1.9331728467800203e-05, "loss": 0.1963, "step": 4122 }, { "epoch": 0.8616509926854754, "grad_norm": 1.18244863865147, "learning_rate": 1.9331322930056056e-05, "loss": 0.1834, "step": 4123 }, { "epoch": 0.8618599791013584, "grad_norm": 1.236382620728367, "learning_rate": 1.933091727355608e-05, "loss": 0.2189, "step": 4124 }, { "epoch": 0.8620689655172413, "grad_norm": 1.3541886920047685, "learning_rate": 1.9330511498305435e-05, "loss": 0.2074, "step": 4125 }, { "epoch": 0.8622779519331244, "grad_norm": 0.8581470992468182, "learning_rate": 1.9330105604309283e-05, "loss": 0.1832, "step": 4126 }, { "epoch": 0.8624869383490074, "grad_norm": 1.204949350322573, "learning_rate": 1.932969959157279e-05, "loss": 0.2439, "step": 4127 }, { "epoch": 0.8626959247648903, "grad_norm": 1.0326793820825657, "learning_rate": 1.932929346010113e-05, "loss": 0.1906, "step": 4128 }, { "epoch": 0.8629049111807733, "grad_norm": 1.2121625740524737, "learning_rate": 1.932888720989947e-05, "loss": 0.2012, "step": 4129 }, { "epoch": 0.8631138975966562, "grad_norm": 1.0103844417739458, "learning_rate": 1.9328480840972967e-05, "loss": 0.2162, "step": 4130 }, { "epoch": 0.8633228840125392, "grad_norm": 0.945585811094055, "learning_rate": 1.932807435332681e-05, "loss": 0.1966, "step": 4131 }, { "epoch": 0.8635318704284222, "grad_norm": 1.0689277276970222, "learning_rate": 1.9327667746966163e-05, "loss": 0.1787, "step": 4132 }, { "epoch": 0.8637408568443051, "grad_norm": 1.0061117981076477, "learning_rate": 1.9327261021896205e-05, "loss": 0.1815, "step": 4133 }, { "epoch": 0.8639498432601881, "grad_norm": 1.1238116362132846, "learning_rate": 1.9326854178122108e-05, "loss": 0.2179, "step": 4134 }, { "epoch": 0.864158829676071, "grad_norm": 1.0645721006690476, "learning_rate": 1.9326447215649053e-05, "loss": 0.1966, "step": 4135 }, { "epoch": 0.864367816091954, "grad_norm": 1.12232523117821, "learning_rate": 1.932604013448222e-05, "loss": 0.2038, "step": 4136 }, { "epoch": 0.864576802507837, "grad_norm": 1.2157849940757037, "learning_rate": 1.9325632934626784e-05, "loss": 0.1932, "step": 4137 }, { "epoch": 0.8647857889237199, "grad_norm": 1.2662783350794495, "learning_rate": 1.932522561608793e-05, "loss": 0.1832, "step": 4138 }, { "epoch": 0.8649947753396029, "grad_norm": 1.220498780736715, "learning_rate": 1.9324818178870845e-05, "loss": 0.2141, "step": 4139 }, { "epoch": 0.8652037617554859, "grad_norm": 0.9789570431284599, "learning_rate": 1.9324410622980713e-05, "loss": 0.1778, "step": 4140 }, { "epoch": 0.8654127481713688, "grad_norm": 1.1366483555994569, "learning_rate": 1.9324002948422715e-05, "loss": 0.2033, "step": 4141 }, { "epoch": 0.8656217345872518, "grad_norm": 0.936306360087049, "learning_rate": 1.932359515520205e-05, "loss": 0.1992, "step": 4142 }, { "epoch": 0.8658307210031349, "grad_norm": 1.1034140989440648, "learning_rate": 1.93231872433239e-05, "loss": 0.2288, "step": 4143 }, { "epoch": 0.8660397074190178, "grad_norm": 1.1965792750247826, "learning_rate": 1.932277921279346e-05, "loss": 0.2143, "step": 4144 }, { "epoch": 0.8662486938349008, "grad_norm": 1.1071839498178317, "learning_rate": 1.9322371063615915e-05, "loss": 0.2324, "step": 4145 }, { "epoch": 0.8664576802507837, "grad_norm": 0.950525268729798, "learning_rate": 1.9321962795796468e-05, "loss": 0.1571, "step": 4146 }, { "epoch": 0.8666666666666667, "grad_norm": 1.1963892919906205, "learning_rate": 1.9321554409340316e-05, "loss": 0.1737, "step": 4147 }, { "epoch": 0.8668756530825497, "grad_norm": 1.1135238486481207, "learning_rate": 1.9321145904252652e-05, "loss": 0.2227, "step": 4148 }, { "epoch": 0.8670846394984326, "grad_norm": 1.1705107484382733, "learning_rate": 1.932073728053867e-05, "loss": 0.1954, "step": 4149 }, { "epoch": 0.8672936259143156, "grad_norm": 1.1282979563364608, "learning_rate": 1.9320328538203585e-05, "loss": 0.217, "step": 4150 }, { "epoch": 0.8675026123301985, "grad_norm": 1.213193393735565, "learning_rate": 1.9319919677252582e-05, "loss": 0.2241, "step": 4151 }, { "epoch": 0.8677115987460815, "grad_norm": 1.052705965609268, "learning_rate": 1.9319510697690875e-05, "loss": 0.2441, "step": 4152 }, { "epoch": 0.8679205851619645, "grad_norm": 1.2785086505034542, "learning_rate": 1.9319101599523668e-05, "loss": 0.2101, "step": 4153 }, { "epoch": 0.8681295715778474, "grad_norm": 1.0253460068312068, "learning_rate": 1.931869238275616e-05, "loss": 0.1706, "step": 4154 }, { "epoch": 0.8683385579937304, "grad_norm": 1.1057091139831283, "learning_rate": 1.9318283047393568e-05, "loss": 0.2306, "step": 4155 }, { "epoch": 0.8685475444096133, "grad_norm": 1.0028790427818624, "learning_rate": 1.93178735934411e-05, "loss": 0.2282, "step": 4156 }, { "epoch": 0.8687565308254963, "grad_norm": 1.0246723215975628, "learning_rate": 1.931746402090396e-05, "loss": 0.1874, "step": 4157 }, { "epoch": 0.8689655172413793, "grad_norm": 1.0821088193758226, "learning_rate": 1.931705432978737e-05, "loss": 0.1977, "step": 4158 }, { "epoch": 0.8691745036572622, "grad_norm": 0.9984345275346028, "learning_rate": 1.931664452009654e-05, "loss": 0.2167, "step": 4159 }, { "epoch": 0.8693834900731452, "grad_norm": 0.974518296260996, "learning_rate": 1.931623459183668e-05, "loss": 0.2112, "step": 4160 }, { "epoch": 0.8695924764890283, "grad_norm": 1.0745385328256656, "learning_rate": 1.9315824545013014e-05, "loss": 0.1948, "step": 4161 }, { "epoch": 0.8698014629049112, "grad_norm": 1.173295515823046, "learning_rate": 1.9315414379630758e-05, "loss": 0.2427, "step": 4162 }, { "epoch": 0.8700104493207942, "grad_norm": 1.0322331952943966, "learning_rate": 1.9315004095695134e-05, "loss": 0.1743, "step": 4163 }, { "epoch": 0.8702194357366771, "grad_norm": 1.1724016677929971, "learning_rate": 1.9314593693211358e-05, "loss": 0.2347, "step": 4164 }, { "epoch": 0.8704284221525601, "grad_norm": 1.1806232854342134, "learning_rate": 1.9314183172184663e-05, "loss": 0.2289, "step": 4165 }, { "epoch": 0.8706374085684431, "grad_norm": 1.1976694050583576, "learning_rate": 1.931377253262026e-05, "loss": 0.168, "step": 4166 }, { "epoch": 0.870846394984326, "grad_norm": 0.956563798179846, "learning_rate": 1.9313361774523387e-05, "loss": 0.2035, "step": 4167 }, { "epoch": 0.871055381400209, "grad_norm": 1.0942117990204627, "learning_rate": 1.9312950897899264e-05, "loss": 0.2193, "step": 4168 }, { "epoch": 0.871264367816092, "grad_norm": 1.1527223516785179, "learning_rate": 1.9312539902753124e-05, "loss": 0.195, "step": 4169 }, { "epoch": 0.8714733542319749, "grad_norm": 1.0949115794181836, "learning_rate": 1.93121287890902e-05, "loss": 0.2078, "step": 4170 }, { "epoch": 0.8716823406478579, "grad_norm": 1.2348062073723403, "learning_rate": 1.9311717556915716e-05, "loss": 0.2025, "step": 4171 }, { "epoch": 0.8718913270637408, "grad_norm": 1.1417406061437243, "learning_rate": 1.931130620623491e-05, "loss": 0.1773, "step": 4172 }, { "epoch": 0.8721003134796238, "grad_norm": 1.1239211116313292, "learning_rate": 1.931089473705302e-05, "loss": 0.2197, "step": 4173 }, { "epoch": 0.8723092998955068, "grad_norm": 1.0243741118392244, "learning_rate": 1.9310483149375278e-05, "loss": 0.1809, "step": 4174 }, { "epoch": 0.8725182863113897, "grad_norm": 1.4425427697215545, "learning_rate": 1.9310071443206927e-05, "loss": 0.2163, "step": 4175 }, { "epoch": 0.8727272727272727, "grad_norm": 1.2856609075742935, "learning_rate": 1.93096596185532e-05, "loss": 0.2129, "step": 4176 }, { "epoch": 0.8729362591431556, "grad_norm": 1.2265002938558232, "learning_rate": 1.9309247675419342e-05, "loss": 0.2149, "step": 4177 }, { "epoch": 0.8731452455590386, "grad_norm": 1.36046914851503, "learning_rate": 1.9308835613810597e-05, "loss": 0.1879, "step": 4178 }, { "epoch": 0.8733542319749217, "grad_norm": 0.9692017540093525, "learning_rate": 1.9308423433732205e-05, "loss": 0.1965, "step": 4179 }, { "epoch": 0.8735632183908046, "grad_norm": 1.2517564935019025, "learning_rate": 1.930801113518942e-05, "loss": 0.2185, "step": 4180 }, { "epoch": 0.8737722048066876, "grad_norm": 0.85602369431039, "learning_rate": 1.9307598718187478e-05, "loss": 0.1993, "step": 4181 }, { "epoch": 0.8739811912225706, "grad_norm": 1.078417031417069, "learning_rate": 1.9307186182731633e-05, "loss": 0.2214, "step": 4182 }, { "epoch": 0.8741901776384535, "grad_norm": 1.1486581605959598, "learning_rate": 1.9306773528827137e-05, "loss": 0.219, "step": 4183 }, { "epoch": 0.8743991640543365, "grad_norm": 0.9739276310616187, "learning_rate": 1.9306360756479236e-05, "loss": 0.1697, "step": 4184 }, { "epoch": 0.8746081504702194, "grad_norm": 0.9919659911859048, "learning_rate": 1.930594786569319e-05, "loss": 0.2055, "step": 4185 }, { "epoch": 0.8748171368861024, "grad_norm": 1.065920476245686, "learning_rate": 1.930553485647425e-05, "loss": 0.1912, "step": 4186 }, { "epoch": 0.8750261233019854, "grad_norm": 0.9848097765150859, "learning_rate": 1.9305121728827673e-05, "loss": 0.1843, "step": 4187 }, { "epoch": 0.8752351097178683, "grad_norm": 1.2115456281085855, "learning_rate": 1.9304708482758717e-05, "loss": 0.1712, "step": 4188 }, { "epoch": 0.8754440961337513, "grad_norm": 1.1216096325222484, "learning_rate": 1.930429511827264e-05, "loss": 0.2024, "step": 4189 }, { "epoch": 0.8756530825496343, "grad_norm": 1.028694690716297, "learning_rate": 1.9303881635374702e-05, "loss": 0.1867, "step": 4190 }, { "epoch": 0.8758620689655172, "grad_norm": 1.276253783539609, "learning_rate": 1.930346803407017e-05, "loss": 0.1932, "step": 4191 }, { "epoch": 0.8760710553814002, "grad_norm": 1.0435504038280852, "learning_rate": 1.9303054314364304e-05, "loss": 0.1816, "step": 4192 }, { "epoch": 0.8762800417972831, "grad_norm": 1.1755878632519337, "learning_rate": 1.9302640476262367e-05, "loss": 0.2039, "step": 4193 }, { "epoch": 0.8764890282131661, "grad_norm": 1.283837156953071, "learning_rate": 1.930222651976963e-05, "loss": 0.2086, "step": 4194 }, { "epoch": 0.8766980146290491, "grad_norm": 1.3428361729001663, "learning_rate": 1.930181244489136e-05, "loss": 0.1715, "step": 4195 }, { "epoch": 0.8769070010449321, "grad_norm": 0.9890307598050284, "learning_rate": 1.9301398251632823e-05, "loss": 0.2192, "step": 4196 }, { "epoch": 0.8771159874608151, "grad_norm": 1.281221431147284, "learning_rate": 1.93009839399993e-05, "loss": 0.1984, "step": 4197 }, { "epoch": 0.877324973876698, "grad_norm": 0.9956902583462252, "learning_rate": 1.930056950999605e-05, "loss": 0.1662, "step": 4198 }, { "epoch": 0.877533960292581, "grad_norm": 1.256645317866745, "learning_rate": 1.930015496162836e-05, "loss": 0.2128, "step": 4199 }, { "epoch": 0.877742946708464, "grad_norm": 1.042505015867404, "learning_rate": 1.9299740294901497e-05, "loss": 0.2056, "step": 4200 }, { "epoch": 0.8779519331243469, "grad_norm": 1.2933354526364367, "learning_rate": 1.9299325509820744e-05, "loss": 0.2137, "step": 4201 }, { "epoch": 0.8781609195402299, "grad_norm": 1.27999187842236, "learning_rate": 1.9298910606391378e-05, "loss": 0.2352, "step": 4202 }, { "epoch": 0.8783699059561129, "grad_norm": 1.1207818555807356, "learning_rate": 1.9298495584618675e-05, "loss": 0.1764, "step": 4203 }, { "epoch": 0.8785788923719958, "grad_norm": 1.1751055435497917, "learning_rate": 1.9298080444507925e-05, "loss": 0.1892, "step": 4204 }, { "epoch": 0.8787878787878788, "grad_norm": 1.1699847291837158, "learning_rate": 1.9297665186064405e-05, "loss": 0.1931, "step": 4205 }, { "epoch": 0.8789968652037617, "grad_norm": 1.4288486878841449, "learning_rate": 1.9297249809293404e-05, "loss": 0.2381, "step": 4206 }, { "epoch": 0.8792058516196447, "grad_norm": 1.1166416932372767, "learning_rate": 1.92968343142002e-05, "loss": 0.1697, "step": 4207 }, { "epoch": 0.8794148380355277, "grad_norm": 1.3363070721903785, "learning_rate": 1.9296418700790093e-05, "loss": 0.2024, "step": 4208 }, { "epoch": 0.8796238244514106, "grad_norm": 1.2436107647992418, "learning_rate": 1.9296002969068367e-05, "loss": 0.2359, "step": 4209 }, { "epoch": 0.8798328108672936, "grad_norm": 1.1801239083894477, "learning_rate": 1.9295587119040306e-05, "loss": 0.1843, "step": 4210 }, { "epoch": 0.8800417972831766, "grad_norm": 1.2516344753177133, "learning_rate": 1.9295171150711212e-05, "loss": 0.2004, "step": 4211 }, { "epoch": 0.8802507836990595, "grad_norm": 1.1081184022553014, "learning_rate": 1.9294755064086378e-05, "loss": 0.1805, "step": 4212 }, { "epoch": 0.8804597701149425, "grad_norm": 1.133612791747788, "learning_rate": 1.9294338859171092e-05, "loss": 0.2122, "step": 4213 }, { "epoch": 0.8806687565308255, "grad_norm": 1.1705450510931417, "learning_rate": 1.929392253597066e-05, "loss": 0.2085, "step": 4214 }, { "epoch": 0.8808777429467085, "grad_norm": 0.8133324860502128, "learning_rate": 1.929350609449037e-05, "loss": 0.1544, "step": 4215 }, { "epoch": 0.8810867293625915, "grad_norm": 1.079260658770758, "learning_rate": 1.9293089534735535e-05, "loss": 0.1611, "step": 4216 }, { "epoch": 0.8812957157784744, "grad_norm": 0.9171497415125079, "learning_rate": 1.9292672856711446e-05, "loss": 0.1688, "step": 4217 }, { "epoch": 0.8815047021943574, "grad_norm": 1.0944115205620053, "learning_rate": 1.9292256060423408e-05, "loss": 0.2037, "step": 4218 }, { "epoch": 0.8817136886102404, "grad_norm": 1.2593109197183971, "learning_rate": 1.9291839145876726e-05, "loss": 0.2043, "step": 4219 }, { "epoch": 0.8819226750261233, "grad_norm": 1.2644243745492858, "learning_rate": 1.9291422113076706e-05, "loss": 0.1951, "step": 4220 }, { "epoch": 0.8821316614420063, "grad_norm": 1.2042358511362248, "learning_rate": 1.9291004962028657e-05, "loss": 0.1754, "step": 4221 }, { "epoch": 0.8823406478578892, "grad_norm": 1.004373983874469, "learning_rate": 1.929058769273789e-05, "loss": 0.1432, "step": 4222 }, { "epoch": 0.8825496342737722, "grad_norm": 1.0966293487470873, "learning_rate": 1.9290170305209706e-05, "loss": 0.1939, "step": 4223 }, { "epoch": 0.8827586206896552, "grad_norm": 1.0614729115709787, "learning_rate": 1.928975279944943e-05, "loss": 0.1975, "step": 4224 }, { "epoch": 0.8829676071055381, "grad_norm": 1.081065970440387, "learning_rate": 1.928933517546236e-05, "loss": 0.2091, "step": 4225 }, { "epoch": 0.8831765935214211, "grad_norm": 1.193426223635371, "learning_rate": 1.9288917433253823e-05, "loss": 0.2063, "step": 4226 }, { "epoch": 0.883385579937304, "grad_norm": 0.9579841111312131, "learning_rate": 1.9288499572829135e-05, "loss": 0.1677, "step": 4227 }, { "epoch": 0.883594566353187, "grad_norm": 1.2359861137188393, "learning_rate": 1.928808159419361e-05, "loss": 0.2154, "step": 4228 }, { "epoch": 0.88380355276907, "grad_norm": 1.0391955608489674, "learning_rate": 1.9287663497352563e-05, "loss": 0.2101, "step": 4229 }, { "epoch": 0.8840125391849529, "grad_norm": 1.115191895484636, "learning_rate": 1.9287245282311325e-05, "loss": 0.175, "step": 4230 }, { "epoch": 0.884221525600836, "grad_norm": 1.0118216100352682, "learning_rate": 1.9286826949075214e-05, "loss": 0.1974, "step": 4231 }, { "epoch": 0.884430512016719, "grad_norm": 0.8615681124828103, "learning_rate": 1.928640849764955e-05, "loss": 0.2064, "step": 4232 }, { "epoch": 0.8846394984326019, "grad_norm": 1.0228597189597042, "learning_rate": 1.9285989928039663e-05, "loss": 0.2073, "step": 4233 }, { "epoch": 0.8848484848484849, "grad_norm": 1.2631285813762132, "learning_rate": 1.9285571240250878e-05, "loss": 0.2063, "step": 4234 }, { "epoch": 0.8850574712643678, "grad_norm": 1.1052807755714447, "learning_rate": 1.9285152434288525e-05, "loss": 0.2083, "step": 4235 }, { "epoch": 0.8852664576802508, "grad_norm": 1.4225518821272063, "learning_rate": 1.9284733510157934e-05, "loss": 0.2201, "step": 4236 }, { "epoch": 0.8854754440961338, "grad_norm": 1.1125403532653266, "learning_rate": 1.9284314467864435e-05, "loss": 0.205, "step": 4237 }, { "epoch": 0.8856844305120167, "grad_norm": 1.080237232541285, "learning_rate": 1.9283895307413362e-05, "loss": 0.2226, "step": 4238 }, { "epoch": 0.8858934169278997, "grad_norm": 1.073949713618572, "learning_rate": 1.928347602881005e-05, "loss": 0.1749, "step": 4239 }, { "epoch": 0.8861024033437827, "grad_norm": 1.0071950216239347, "learning_rate": 1.9283056632059836e-05, "loss": 0.1951, "step": 4240 }, { "epoch": 0.8863113897596656, "grad_norm": 1.003200654645768, "learning_rate": 1.9282637117168054e-05, "loss": 0.1854, "step": 4241 }, { "epoch": 0.8865203761755486, "grad_norm": 1.0866369878702506, "learning_rate": 1.928221748414004e-05, "loss": 0.2337, "step": 4242 }, { "epoch": 0.8867293625914315, "grad_norm": 1.0101617606675202, "learning_rate": 1.9281797732981147e-05, "loss": 0.2169, "step": 4243 }, { "epoch": 0.8869383490073145, "grad_norm": 1.0549946002877628, "learning_rate": 1.9281377863696703e-05, "loss": 0.2205, "step": 4244 }, { "epoch": 0.8871473354231975, "grad_norm": 1.1231947542793266, "learning_rate": 1.9280957876292064e-05, "loss": 0.1904, "step": 4245 }, { "epoch": 0.8873563218390804, "grad_norm": 1.162210751352724, "learning_rate": 1.9280537770772563e-05, "loss": 0.1905, "step": 4246 }, { "epoch": 0.8875653082549634, "grad_norm": 1.0574548715965244, "learning_rate": 1.9280117547143553e-05, "loss": 0.1908, "step": 4247 }, { "epoch": 0.8877742946708463, "grad_norm": 0.9422533263636379, "learning_rate": 1.9279697205410385e-05, "loss": 0.2188, "step": 4248 }, { "epoch": 0.8879832810867294, "grad_norm": 0.9650350016234062, "learning_rate": 1.92792767455784e-05, "loss": 0.24, "step": 4249 }, { "epoch": 0.8881922675026124, "grad_norm": 1.2965495729682381, "learning_rate": 1.9278856167652956e-05, "loss": 0.2372, "step": 4250 }, { "epoch": 0.8884012539184953, "grad_norm": 1.151978719913132, "learning_rate": 1.9278435471639404e-05, "loss": 0.2059, "step": 4251 }, { "epoch": 0.8886102403343783, "grad_norm": 1.1731053778300222, "learning_rate": 1.9278014657543097e-05, "loss": 0.2226, "step": 4252 }, { "epoch": 0.8888192267502613, "grad_norm": 0.8626963185297675, "learning_rate": 1.927759372536939e-05, "loss": 0.1729, "step": 4253 }, { "epoch": 0.8890282131661442, "grad_norm": 1.093822468155967, "learning_rate": 1.9277172675123642e-05, "loss": 0.1733, "step": 4254 }, { "epoch": 0.8892371995820272, "grad_norm": 1.0842275603717084, "learning_rate": 1.9276751506811208e-05, "loss": 0.2249, "step": 4255 }, { "epoch": 0.8894461859979101, "grad_norm": 1.3585580823245853, "learning_rate": 1.9276330220437454e-05, "loss": 0.2087, "step": 4256 }, { "epoch": 0.8896551724137931, "grad_norm": 1.2040562569307844, "learning_rate": 1.9275908816007734e-05, "loss": 0.2006, "step": 4257 }, { "epoch": 0.8898641588296761, "grad_norm": 1.1797925700574357, "learning_rate": 1.9275487293527416e-05, "loss": 0.2269, "step": 4258 }, { "epoch": 0.890073145245559, "grad_norm": 0.8729726945764961, "learning_rate": 1.9275065653001865e-05, "loss": 0.1789, "step": 4259 }, { "epoch": 0.890282131661442, "grad_norm": 0.9805828392470741, "learning_rate": 1.9274643894436446e-05, "loss": 0.1962, "step": 4260 }, { "epoch": 0.890491118077325, "grad_norm": 0.9609989189712296, "learning_rate": 1.9274222017836524e-05, "loss": 0.1889, "step": 4261 }, { "epoch": 0.8907001044932079, "grad_norm": 1.2339984542566744, "learning_rate": 1.927380002320747e-05, "loss": 0.2064, "step": 4262 }, { "epoch": 0.8909090909090909, "grad_norm": 1.10896128404873, "learning_rate": 1.927337791055466e-05, "loss": 0.1752, "step": 4263 }, { "epoch": 0.8911180773249738, "grad_norm": 0.9377666518540965, "learning_rate": 1.9272955679883457e-05, "loss": 0.1948, "step": 4264 }, { "epoch": 0.8913270637408568, "grad_norm": 1.0911817196974367, "learning_rate": 1.9272533331199236e-05, "loss": 0.2188, "step": 4265 }, { "epoch": 0.8915360501567398, "grad_norm": 1.0152785142200749, "learning_rate": 1.9272110864507375e-05, "loss": 0.2299, "step": 4266 }, { "epoch": 0.8917450365726228, "grad_norm": 0.8771621912156468, "learning_rate": 1.9271688279813256e-05, "loss": 0.2039, "step": 4267 }, { "epoch": 0.8919540229885058, "grad_norm": 1.1033517143780698, "learning_rate": 1.9271265577122243e-05, "loss": 0.1871, "step": 4268 }, { "epoch": 0.8921630094043888, "grad_norm": 1.1108705933881393, "learning_rate": 1.9270842756439728e-05, "loss": 0.2227, "step": 4269 }, { "epoch": 0.8923719958202717, "grad_norm": 1.1433691802150816, "learning_rate": 1.9270419817771086e-05, "loss": 0.1888, "step": 4270 }, { "epoch": 0.8925809822361547, "grad_norm": 1.259451076857154, "learning_rate": 1.92699967611217e-05, "loss": 0.1657, "step": 4271 }, { "epoch": 0.8927899686520376, "grad_norm": 0.9813753186529967, "learning_rate": 1.9269573586496958e-05, "loss": 0.2051, "step": 4272 }, { "epoch": 0.8929989550679206, "grad_norm": 1.1933576279304006, "learning_rate": 1.926915029390224e-05, "loss": 0.2056, "step": 4273 }, { "epoch": 0.8932079414838036, "grad_norm": 1.0309030007509847, "learning_rate": 1.9268726883342937e-05, "loss": 0.19, "step": 4274 }, { "epoch": 0.8934169278996865, "grad_norm": 1.1366179262239735, "learning_rate": 1.9268303354824437e-05, "loss": 0.1785, "step": 4275 }, { "epoch": 0.8936259143155695, "grad_norm": 0.9199594926397388, "learning_rate": 1.926787970835213e-05, "loss": 0.1739, "step": 4276 }, { "epoch": 0.8938349007314524, "grad_norm": 1.0550075817209392, "learning_rate": 1.9267455943931407e-05, "loss": 0.1716, "step": 4277 }, { "epoch": 0.8940438871473354, "grad_norm": 1.0212788808527618, "learning_rate": 1.9267032061567658e-05, "loss": 0.2069, "step": 4278 }, { "epoch": 0.8942528735632184, "grad_norm": 1.2190138540864668, "learning_rate": 1.9266608061266284e-05, "loss": 0.222, "step": 4279 }, { "epoch": 0.8944618599791013, "grad_norm": 1.3368036460826787, "learning_rate": 1.926618394303268e-05, "loss": 0.1808, "step": 4280 }, { "epoch": 0.8946708463949843, "grad_norm": 1.192306909923044, "learning_rate": 1.926575970687224e-05, "loss": 0.1833, "step": 4281 }, { "epoch": 0.8948798328108672, "grad_norm": 0.8657299842947145, "learning_rate": 1.926533535279036e-05, "loss": 0.1685, "step": 4282 }, { "epoch": 0.8950888192267502, "grad_norm": 0.9878073817118622, "learning_rate": 1.9264910880792448e-05, "loss": 0.201, "step": 4283 }, { "epoch": 0.8952978056426333, "grad_norm": 1.168192585728873, "learning_rate": 1.92644862908839e-05, "loss": 0.2011, "step": 4284 }, { "epoch": 0.8955067920585162, "grad_norm": 1.2271945206445998, "learning_rate": 1.9264061583070126e-05, "loss": 0.1963, "step": 4285 }, { "epoch": 0.8957157784743992, "grad_norm": 1.2689974365778856, "learning_rate": 1.9263636757356527e-05, "loss": 0.2133, "step": 4286 }, { "epoch": 0.8959247648902822, "grad_norm": 1.1474527584566154, "learning_rate": 1.926321181374851e-05, "loss": 0.2014, "step": 4287 }, { "epoch": 0.8961337513061651, "grad_norm": 1.3233876977960286, "learning_rate": 1.9262786752251485e-05, "loss": 0.1969, "step": 4288 }, { "epoch": 0.8963427377220481, "grad_norm": 1.1048779158521995, "learning_rate": 1.926236157287086e-05, "loss": 0.2024, "step": 4289 }, { "epoch": 0.896551724137931, "grad_norm": 1.4100869553442306, "learning_rate": 1.9261936275612044e-05, "loss": 0.1886, "step": 4290 }, { "epoch": 0.896760710553814, "grad_norm": 1.237356923020123, "learning_rate": 1.926151086048045e-05, "loss": 0.2125, "step": 4291 }, { "epoch": 0.896969696969697, "grad_norm": 1.3661127101455228, "learning_rate": 1.9261085327481498e-05, "loss": 0.2232, "step": 4292 }, { "epoch": 0.8971786833855799, "grad_norm": 1.1303464335111957, "learning_rate": 1.92606596766206e-05, "loss": 0.1844, "step": 4293 }, { "epoch": 0.8973876698014629, "grad_norm": 0.8970850738957352, "learning_rate": 1.926023390790317e-05, "loss": 0.1989, "step": 4294 }, { "epoch": 0.8975966562173459, "grad_norm": 0.9255138303198974, "learning_rate": 1.925980802133463e-05, "loss": 0.1961, "step": 4295 }, { "epoch": 0.8978056426332288, "grad_norm": 0.950470793329718, "learning_rate": 1.9259382016920396e-05, "loss": 0.1806, "step": 4296 }, { "epoch": 0.8980146290491118, "grad_norm": 1.1251761391051136, "learning_rate": 1.9258955894665892e-05, "loss": 0.2113, "step": 4297 }, { "epoch": 0.8982236154649947, "grad_norm": 0.9509135233502352, "learning_rate": 1.9258529654576546e-05, "loss": 0.2059, "step": 4298 }, { "epoch": 0.8984326018808777, "grad_norm": 1.4285407891698692, "learning_rate": 1.9258103296657778e-05, "loss": 0.1881, "step": 4299 }, { "epoch": 0.8986415882967607, "grad_norm": 1.0232332693925763, "learning_rate": 1.9257676820915013e-05, "loss": 0.1814, "step": 4300 }, { "epoch": 0.8988505747126436, "grad_norm": 1.0481375947302312, "learning_rate": 1.9257250227353678e-05, "loss": 0.1977, "step": 4301 }, { "epoch": 0.8990595611285267, "grad_norm": 1.331572267332616, "learning_rate": 1.9256823515979205e-05, "loss": 0.2423, "step": 4302 }, { "epoch": 0.8992685475444097, "grad_norm": 1.2858649173492098, "learning_rate": 1.9256396686797024e-05, "loss": 0.2116, "step": 4303 }, { "epoch": 0.8994775339602926, "grad_norm": 0.9895032103668919, "learning_rate": 1.9255969739812568e-05, "loss": 0.2009, "step": 4304 }, { "epoch": 0.8996865203761756, "grad_norm": 1.1806178164619399, "learning_rate": 1.925554267503127e-05, "loss": 0.1973, "step": 4305 }, { "epoch": 0.8998955067920585, "grad_norm": 1.1378043957557624, "learning_rate": 1.925511549245856e-05, "loss": 0.2216, "step": 4306 }, { "epoch": 0.9001044932079415, "grad_norm": 1.1607301962809111, "learning_rate": 1.925468819209988e-05, "loss": 0.2065, "step": 4307 }, { "epoch": 0.9003134796238245, "grad_norm": 1.0379785426372872, "learning_rate": 1.9254260773960672e-05, "loss": 0.2181, "step": 4308 }, { "epoch": 0.9005224660397074, "grad_norm": 1.082928006430928, "learning_rate": 1.9253833238046367e-05, "loss": 0.1811, "step": 4309 }, { "epoch": 0.9007314524555904, "grad_norm": 1.2903654319352038, "learning_rate": 1.9253405584362407e-05, "loss": 0.2103, "step": 4310 }, { "epoch": 0.9009404388714733, "grad_norm": 1.1541745741946225, "learning_rate": 1.925297781291424e-05, "loss": 0.23, "step": 4311 }, { "epoch": 0.9011494252873563, "grad_norm": 1.1880659699042826, "learning_rate": 1.9252549923707308e-05, "loss": 0.2301, "step": 4312 }, { "epoch": 0.9013584117032393, "grad_norm": 1.0581109607754415, "learning_rate": 1.9252121916747054e-05, "loss": 0.2188, "step": 4313 }, { "epoch": 0.9015673981191222, "grad_norm": 1.2068139125299717, "learning_rate": 1.9251693792038928e-05, "loss": 0.2369, "step": 4314 }, { "epoch": 0.9017763845350052, "grad_norm": 0.9896163844667952, "learning_rate": 1.9251265549588377e-05, "loss": 0.1748, "step": 4315 }, { "epoch": 0.9019853709508882, "grad_norm": 3.2807868642756746, "learning_rate": 1.925083718940085e-05, "loss": 0.1778, "step": 4316 }, { "epoch": 0.9021943573667711, "grad_norm": 0.7984954694524761, "learning_rate": 1.9250408711481804e-05, "loss": 0.2013, "step": 4317 }, { "epoch": 0.9024033437826541, "grad_norm": 1.4070317474910132, "learning_rate": 1.9249980115836684e-05, "loss": 0.2346, "step": 4318 }, { "epoch": 0.9026123301985371, "grad_norm": 0.8356202338536096, "learning_rate": 1.9249551402470953e-05, "loss": 0.1798, "step": 4319 }, { "epoch": 0.9028213166144201, "grad_norm": 1.085283195150277, "learning_rate": 1.9249122571390058e-05, "loss": 0.1823, "step": 4320 }, { "epoch": 0.9030303030303031, "grad_norm": 0.9461703946711728, "learning_rate": 1.9248693622599464e-05, "loss": 0.1985, "step": 4321 }, { "epoch": 0.903239289446186, "grad_norm": 1.0033636667072614, "learning_rate": 1.9248264556104626e-05, "loss": 0.2191, "step": 4322 }, { "epoch": 0.903448275862069, "grad_norm": 1.0859612341539187, "learning_rate": 1.9247835371911006e-05, "loss": 0.2231, "step": 4323 }, { "epoch": 0.903657262277952, "grad_norm": 1.0014281679236061, "learning_rate": 1.9247406070024066e-05, "loss": 0.186, "step": 4324 }, { "epoch": 0.9038662486938349, "grad_norm": 1.0571016098684591, "learning_rate": 1.924697665044927e-05, "loss": 0.1867, "step": 4325 }, { "epoch": 0.9040752351097179, "grad_norm": 1.1809308233292284, "learning_rate": 1.9246547113192084e-05, "loss": 0.2326, "step": 4326 }, { "epoch": 0.9042842215256008, "grad_norm": 1.0222719106825997, "learning_rate": 1.924611745825797e-05, "loss": 0.2047, "step": 4327 }, { "epoch": 0.9044932079414838, "grad_norm": 1.0379982608737632, "learning_rate": 1.92456876856524e-05, "loss": 0.17, "step": 4328 }, { "epoch": 0.9047021943573668, "grad_norm": 1.1061814201581819, "learning_rate": 1.9245257795380843e-05, "loss": 0.2084, "step": 4329 }, { "epoch": 0.9049111807732497, "grad_norm": 1.2862129989578384, "learning_rate": 1.924482778744877e-05, "loss": 0.2304, "step": 4330 }, { "epoch": 0.9051201671891327, "grad_norm": 1.200099713086137, "learning_rate": 1.9244397661861655e-05, "loss": 0.2267, "step": 4331 }, { "epoch": 0.9053291536050156, "grad_norm": 0.9920215236830021, "learning_rate": 1.9243967418624967e-05, "loss": 0.2011, "step": 4332 }, { "epoch": 0.9055381400208986, "grad_norm": 0.9498426592626528, "learning_rate": 1.9243537057744185e-05, "loss": 0.1833, "step": 4333 }, { "epoch": 0.9057471264367816, "grad_norm": 1.2262188513988708, "learning_rate": 1.9243106579224787e-05, "loss": 0.1766, "step": 4334 }, { "epoch": 0.9059561128526645, "grad_norm": 1.1785304327509467, "learning_rate": 1.924267598307225e-05, "loss": 0.2019, "step": 4335 }, { "epoch": 0.9061650992685475, "grad_norm": 1.1294820050219176, "learning_rate": 1.9242245269292055e-05, "loss": 0.2173, "step": 4336 }, { "epoch": 0.9063740856844306, "grad_norm": 0.9487376247362874, "learning_rate": 1.924181443788968e-05, "loss": 0.2039, "step": 4337 }, { "epoch": 0.9065830721003135, "grad_norm": 1.1609786886142799, "learning_rate": 1.9241383488870614e-05, "loss": 0.1999, "step": 4338 }, { "epoch": 0.9067920585161965, "grad_norm": 1.2095020374153878, "learning_rate": 1.9240952422240337e-05, "loss": 0.2098, "step": 4339 }, { "epoch": 0.9070010449320794, "grad_norm": 1.1720668098387246, "learning_rate": 1.9240521238004336e-05, "loss": 0.1921, "step": 4340 }, { "epoch": 0.9072100313479624, "grad_norm": 0.975132283322672, "learning_rate": 1.92400899361681e-05, "loss": 0.1707, "step": 4341 }, { "epoch": 0.9074190177638454, "grad_norm": 1.1403804945570852, "learning_rate": 1.9239658516737115e-05, "loss": 0.207, "step": 4342 }, { "epoch": 0.9076280041797283, "grad_norm": 0.9761874418195617, "learning_rate": 1.9239226979716875e-05, "loss": 0.1752, "step": 4343 }, { "epoch": 0.9078369905956113, "grad_norm": 1.1312327508690978, "learning_rate": 1.9238795325112867e-05, "loss": 0.2118, "step": 4344 }, { "epoch": 0.9080459770114943, "grad_norm": 1.0946231854364092, "learning_rate": 1.9238363552930593e-05, "loss": 0.2173, "step": 4345 }, { "epoch": 0.9082549634273772, "grad_norm": 0.9029717088402552, "learning_rate": 1.923793166317554e-05, "loss": 0.1974, "step": 4346 }, { "epoch": 0.9084639498432602, "grad_norm": 1.0763121687610493, "learning_rate": 1.9237499655853207e-05, "loss": 0.193, "step": 4347 }, { "epoch": 0.9086729362591431, "grad_norm": 1.108178856495677, "learning_rate": 1.9237067530969094e-05, "loss": 0.1851, "step": 4348 }, { "epoch": 0.9088819226750261, "grad_norm": 1.0120751825853995, "learning_rate": 1.9236635288528696e-05, "loss": 0.2291, "step": 4349 }, { "epoch": 0.9090909090909091, "grad_norm": 1.1246202956075975, "learning_rate": 1.923620292853752e-05, "loss": 0.1935, "step": 4350 }, { "epoch": 0.909299895506792, "grad_norm": 1.2240041251695881, "learning_rate": 1.9235770451001064e-05, "loss": 0.1711, "step": 4351 }, { "epoch": 0.909508881922675, "grad_norm": 1.2718369174189876, "learning_rate": 1.923533785592483e-05, "loss": 0.1994, "step": 4352 }, { "epoch": 0.909717868338558, "grad_norm": 1.11385129958636, "learning_rate": 1.9234905143314327e-05, "loss": 0.2035, "step": 4353 }, { "epoch": 0.909926854754441, "grad_norm": 1.1623332453006512, "learning_rate": 1.9234472313175064e-05, "loss": 0.2099, "step": 4354 }, { "epoch": 0.910135841170324, "grad_norm": 1.576140032747874, "learning_rate": 1.9234039365512548e-05, "loss": 0.1977, "step": 4355 }, { "epoch": 0.9103448275862069, "grad_norm": 1.1199099268817754, "learning_rate": 1.9233606300332287e-05, "loss": 0.2134, "step": 4356 }, { "epoch": 0.9105538140020899, "grad_norm": 1.7108816675796439, "learning_rate": 1.923317311763979e-05, "loss": 0.1938, "step": 4357 }, { "epoch": 0.9107628004179729, "grad_norm": 0.9396861110041135, "learning_rate": 1.9232739817440577e-05, "loss": 0.1999, "step": 4358 }, { "epoch": 0.9109717868338558, "grad_norm": 1.149797959897003, "learning_rate": 1.923230639974016e-05, "loss": 0.1852, "step": 4359 }, { "epoch": 0.9111807732497388, "grad_norm": 1.0979485665508881, "learning_rate": 1.9231872864544054e-05, "loss": 0.1926, "step": 4360 }, { "epoch": 0.9113897596656217, "grad_norm": 0.9333421102808227, "learning_rate": 1.9231439211857774e-05, "loss": 0.1681, "step": 4361 }, { "epoch": 0.9115987460815047, "grad_norm": 0.9958627277246678, "learning_rate": 1.9231005441686838e-05, "loss": 0.1815, "step": 4362 }, { "epoch": 0.9118077324973877, "grad_norm": 1.1057116489581997, "learning_rate": 1.9230571554036772e-05, "loss": 0.2505, "step": 4363 }, { "epoch": 0.9120167189132706, "grad_norm": 1.164605775300037, "learning_rate": 1.9230137548913093e-05, "loss": 0.2082, "step": 4364 }, { "epoch": 0.9122257053291536, "grad_norm": 1.1174381795149628, "learning_rate": 1.922970342632133e-05, "loss": 0.2067, "step": 4365 }, { "epoch": 0.9124346917450366, "grad_norm": 1.0033339988202987, "learning_rate": 1.9229269186267005e-05, "loss": 0.186, "step": 4366 }, { "epoch": 0.9126436781609195, "grad_norm": 1.1572575389457622, "learning_rate": 1.922883482875564e-05, "loss": 0.206, "step": 4367 }, { "epoch": 0.9128526645768025, "grad_norm": 1.1532107592887755, "learning_rate": 1.9228400353792767e-05, "loss": 0.227, "step": 4368 }, { "epoch": 0.9130616509926854, "grad_norm": 1.1824916997027182, "learning_rate": 1.922796576138392e-05, "loss": 0.1878, "step": 4369 }, { "epoch": 0.9132706374085684, "grad_norm": 1.7907516520064115, "learning_rate": 1.9227531051534623e-05, "loss": 0.2228, "step": 4370 }, { "epoch": 0.9134796238244514, "grad_norm": 1.0222811094391768, "learning_rate": 1.9227096224250407e-05, "loss": 0.1828, "step": 4371 }, { "epoch": 0.9136886102403344, "grad_norm": 1.041161755169235, "learning_rate": 1.9226661279536817e-05, "loss": 0.2001, "step": 4372 }, { "epoch": 0.9138975966562174, "grad_norm": 1.4296412682293795, "learning_rate": 1.922622621739937e-05, "loss": 0.2068, "step": 4373 }, { "epoch": 0.9141065830721004, "grad_norm": 1.1847881319060156, "learning_rate": 1.9225791037843623e-05, "loss": 0.1963, "step": 4374 }, { "epoch": 0.9143155694879833, "grad_norm": 1.0538259321360512, "learning_rate": 1.92253557408751e-05, "loss": 0.2257, "step": 4375 }, { "epoch": 0.9145245559038663, "grad_norm": 0.92969791607436, "learning_rate": 1.922492032649935e-05, "loss": 0.1735, "step": 4376 }, { "epoch": 0.9147335423197492, "grad_norm": 1.0377570209304439, "learning_rate": 1.922448479472191e-05, "loss": 0.188, "step": 4377 }, { "epoch": 0.9149425287356322, "grad_norm": 1.0882763615044895, "learning_rate": 1.9224049145548316e-05, "loss": 0.1951, "step": 4378 }, { "epoch": 0.9151515151515152, "grad_norm": 1.0390724315613042, "learning_rate": 1.9223613378984125e-05, "loss": 0.2117, "step": 4379 }, { "epoch": 0.9153605015673981, "grad_norm": 0.9263038939224169, "learning_rate": 1.9223177495034873e-05, "loss": 0.1721, "step": 4380 }, { "epoch": 0.9155694879832811, "grad_norm": 1.0819121656761517, "learning_rate": 1.9222741493706113e-05, "loss": 0.1815, "step": 4381 }, { "epoch": 0.915778474399164, "grad_norm": 0.9141893418294309, "learning_rate": 1.9222305375003394e-05, "loss": 0.1851, "step": 4382 }, { "epoch": 0.915987460815047, "grad_norm": 1.022556109391148, "learning_rate": 1.9221869138932267e-05, "loss": 0.2359, "step": 4383 }, { "epoch": 0.91619644723093, "grad_norm": 1.0695489143486014, "learning_rate": 1.9221432785498277e-05, "loss": 0.204, "step": 4384 }, { "epoch": 0.9164054336468129, "grad_norm": 1.0740252147035476, "learning_rate": 1.922099631470698e-05, "loss": 0.1883, "step": 4385 }, { "epoch": 0.9166144200626959, "grad_norm": 1.0701442582302405, "learning_rate": 1.9220559726563935e-05, "loss": 0.1445, "step": 4386 }, { "epoch": 0.9168234064785789, "grad_norm": 1.409276038512844, "learning_rate": 1.9220123021074693e-05, "loss": 0.1715, "step": 4387 }, { "epoch": 0.9170323928944618, "grad_norm": 1.1573099357336394, "learning_rate": 1.921968619824482e-05, "loss": 0.1804, "step": 4388 }, { "epoch": 0.9172413793103448, "grad_norm": 1.1631947436327406, "learning_rate": 1.9219249258079865e-05, "loss": 0.2259, "step": 4389 }, { "epoch": 0.9174503657262278, "grad_norm": 1.097201836878603, "learning_rate": 1.9218812200585394e-05, "loss": 0.1846, "step": 4390 }, { "epoch": 0.9176593521421108, "grad_norm": 1.5348352288091633, "learning_rate": 1.921837502576697e-05, "loss": 0.2125, "step": 4391 }, { "epoch": 0.9178683385579938, "grad_norm": 1.2796207074620414, "learning_rate": 1.9217937733630156e-05, "loss": 0.2329, "step": 4392 }, { "epoch": 0.9180773249738767, "grad_norm": 1.1539814079707351, "learning_rate": 1.9217500324180516e-05, "loss": 0.185, "step": 4393 }, { "epoch": 0.9182863113897597, "grad_norm": 1.0456476441890914, "learning_rate": 1.9217062797423615e-05, "loss": 0.2047, "step": 4394 }, { "epoch": 0.9184952978056427, "grad_norm": 1.0850963961632771, "learning_rate": 1.9216625153365025e-05, "loss": 0.2429, "step": 4395 }, { "epoch": 0.9187042842215256, "grad_norm": 1.153359030201403, "learning_rate": 1.9216187392010316e-05, "loss": 0.1959, "step": 4396 }, { "epoch": 0.9189132706374086, "grad_norm": 1.1879307079431545, "learning_rate": 1.9215749513365057e-05, "loss": 0.2207, "step": 4397 }, { "epoch": 0.9191222570532915, "grad_norm": 1.0325880216309595, "learning_rate": 1.9215311517434818e-05, "loss": 0.1861, "step": 4398 }, { "epoch": 0.9193312434691745, "grad_norm": 1.3264748389838108, "learning_rate": 1.9214873404225177e-05, "loss": 0.2104, "step": 4399 }, { "epoch": 0.9195402298850575, "grad_norm": 1.0069358767310272, "learning_rate": 1.9214435173741714e-05, "loss": 0.2414, "step": 4400 }, { "epoch": 0.9197492163009404, "grad_norm": 1.2717262376361642, "learning_rate": 1.9213996825989998e-05, "loss": 0.1746, "step": 4401 }, { "epoch": 0.9199582027168234, "grad_norm": 1.0346603424215643, "learning_rate": 1.921355836097561e-05, "loss": 0.1663, "step": 4402 }, { "epoch": 0.9201671891327063, "grad_norm": 1.0283165098026488, "learning_rate": 1.921311977870413e-05, "loss": 0.2194, "step": 4403 }, { "epoch": 0.9203761755485893, "grad_norm": 1.2573826202165037, "learning_rate": 1.9212681079181143e-05, "loss": 0.2256, "step": 4404 }, { "epoch": 0.9205851619644723, "grad_norm": 0.9444933856090685, "learning_rate": 1.921224226241223e-05, "loss": 0.1895, "step": 4405 }, { "epoch": 0.9207941483803552, "grad_norm": 1.02057190063438, "learning_rate": 1.9211803328402974e-05, "loss": 0.2205, "step": 4406 }, { "epoch": 0.9210031347962383, "grad_norm": 1.1591281137528864, "learning_rate": 1.9211364277158964e-05, "loss": 0.1916, "step": 4407 }, { "epoch": 0.9212121212121213, "grad_norm": 1.0442166483204227, "learning_rate": 1.9210925108685783e-05, "loss": 0.2001, "step": 4408 }, { "epoch": 0.9214211076280042, "grad_norm": 1.1369675091509952, "learning_rate": 1.921048582298903e-05, "loss": 0.2291, "step": 4409 }, { "epoch": 0.9216300940438872, "grad_norm": 0.9055462505975399, "learning_rate": 1.9210046420074284e-05, "loss": 0.1691, "step": 4410 }, { "epoch": 0.9218390804597701, "grad_norm": 0.9972197059291232, "learning_rate": 1.9209606899947142e-05, "loss": 0.2139, "step": 4411 }, { "epoch": 0.9220480668756531, "grad_norm": 1.2162226223850983, "learning_rate": 1.92091672626132e-05, "loss": 0.2495, "step": 4412 }, { "epoch": 0.9222570532915361, "grad_norm": 1.1666079942829932, "learning_rate": 1.9208727508078048e-05, "loss": 0.1807, "step": 4413 }, { "epoch": 0.922466039707419, "grad_norm": 1.115055656850472, "learning_rate": 1.9208287636347286e-05, "loss": 0.2157, "step": 4414 }, { "epoch": 0.922675026123302, "grad_norm": 1.383645425593486, "learning_rate": 1.9207847647426514e-05, "loss": 0.1965, "step": 4415 }, { "epoch": 0.922884012539185, "grad_norm": 1.0848391519237057, "learning_rate": 1.920740754132133e-05, "loss": 0.1981, "step": 4416 }, { "epoch": 0.9230929989550679, "grad_norm": 1.1543221886473967, "learning_rate": 1.9206967318037328e-05, "loss": 0.2116, "step": 4417 }, { "epoch": 0.9233019853709509, "grad_norm": 1.047617400623094, "learning_rate": 1.920652697758012e-05, "loss": 0.1953, "step": 4418 }, { "epoch": 0.9235109717868338, "grad_norm": 0.9498710206187039, "learning_rate": 1.9206086519955305e-05, "loss": 0.1573, "step": 4419 }, { "epoch": 0.9237199582027168, "grad_norm": 1.1661113156058809, "learning_rate": 1.9205645945168488e-05, "loss": 0.1997, "step": 4420 }, { "epoch": 0.9239289446185998, "grad_norm": 1.0861156239534584, "learning_rate": 1.9205205253225284e-05, "loss": 0.1905, "step": 4421 }, { "epoch": 0.9241379310344827, "grad_norm": 1.2557341844246692, "learning_rate": 1.920476444413129e-05, "loss": 0.1947, "step": 4422 }, { "epoch": 0.9243469174503657, "grad_norm": 1.0746554541442759, "learning_rate": 1.9204323517892124e-05, "loss": 0.2421, "step": 4423 }, { "epoch": 0.9245559038662486, "grad_norm": 1.0510659965586249, "learning_rate": 1.92038824745134e-05, "loss": 0.213, "step": 4424 }, { "epoch": 0.9247648902821317, "grad_norm": 1.1744130491168376, "learning_rate": 1.920344131400072e-05, "loss": 0.1749, "step": 4425 }, { "epoch": 0.9249738766980147, "grad_norm": 1.3474097153260731, "learning_rate": 1.9203000036359705e-05, "loss": 0.2054, "step": 4426 }, { "epoch": 0.9251828631138976, "grad_norm": 0.8288820428211613, "learning_rate": 1.9202558641595968e-05, "loss": 0.1687, "step": 4427 }, { "epoch": 0.9253918495297806, "grad_norm": 1.2663847311614829, "learning_rate": 1.9202117129715134e-05, "loss": 0.2174, "step": 4428 }, { "epoch": 0.9256008359456636, "grad_norm": 1.0877848625585527, "learning_rate": 1.9201675500722813e-05, "loss": 0.2181, "step": 4429 }, { "epoch": 0.9258098223615465, "grad_norm": 0.9647345960799192, "learning_rate": 1.9201233754624633e-05, "loss": 0.2118, "step": 4430 }, { "epoch": 0.9260188087774295, "grad_norm": 1.192333849109523, "learning_rate": 1.9200791891426208e-05, "loss": 0.1792, "step": 4431 }, { "epoch": 0.9262277951933124, "grad_norm": 1.1351263084372936, "learning_rate": 1.920034991113317e-05, "loss": 0.2154, "step": 4432 }, { "epoch": 0.9264367816091954, "grad_norm": 1.1157528091343747, "learning_rate": 1.9199907813751134e-05, "loss": 0.2234, "step": 4433 }, { "epoch": 0.9266457680250784, "grad_norm": 1.0967463114697311, "learning_rate": 1.919946559928574e-05, "loss": 0.2218, "step": 4434 }, { "epoch": 0.9268547544409613, "grad_norm": 1.058136920558742, "learning_rate": 1.9199023267742598e-05, "loss": 0.2306, "step": 4435 }, { "epoch": 0.9270637408568443, "grad_norm": 1.5247506407753841, "learning_rate": 1.9198580819127353e-05, "loss": 0.2256, "step": 4436 }, { "epoch": 0.9272727272727272, "grad_norm": 1.1440939863363626, "learning_rate": 1.919813825344563e-05, "loss": 0.2144, "step": 4437 }, { "epoch": 0.9274817136886102, "grad_norm": 0.9997222446436177, "learning_rate": 1.919769557070306e-05, "loss": 0.1711, "step": 4438 }, { "epoch": 0.9276907001044932, "grad_norm": 1.0848297725130625, "learning_rate": 1.9197252770905277e-05, "loss": 0.2163, "step": 4439 }, { "epoch": 0.9278996865203761, "grad_norm": 0.9165087031637549, "learning_rate": 1.9196809854057917e-05, "loss": 0.1619, "step": 4440 }, { "epoch": 0.9281086729362591, "grad_norm": 0.9380187147803613, "learning_rate": 1.9196366820166622e-05, "loss": 0.2123, "step": 4441 }, { "epoch": 0.9283176593521422, "grad_norm": 0.9670462348242267, "learning_rate": 1.919592366923702e-05, "loss": 0.1971, "step": 4442 }, { "epoch": 0.9285266457680251, "grad_norm": 0.8915478626179147, "learning_rate": 1.919548040127476e-05, "loss": 0.1949, "step": 4443 }, { "epoch": 0.9287356321839081, "grad_norm": 1.1810861611660706, "learning_rate": 1.9195037016285475e-05, "loss": 0.2006, "step": 4444 }, { "epoch": 0.928944618599791, "grad_norm": 0.9875735572402417, "learning_rate": 1.919459351427482e-05, "loss": 0.2071, "step": 4445 }, { "epoch": 0.929153605015674, "grad_norm": 0.862286288043274, "learning_rate": 1.9194149895248425e-05, "loss": 0.2106, "step": 4446 }, { "epoch": 0.929362591431557, "grad_norm": 1.0163770546500122, "learning_rate": 1.9193706159211943e-05, "loss": 0.1994, "step": 4447 }, { "epoch": 0.9295715778474399, "grad_norm": 1.0307088538535232, "learning_rate": 1.9193262306171024e-05, "loss": 0.2099, "step": 4448 }, { "epoch": 0.9297805642633229, "grad_norm": 1.1156014369801908, "learning_rate": 1.9192818336131313e-05, "loss": 0.2035, "step": 4449 }, { "epoch": 0.9299895506792059, "grad_norm": 1.3174194153060577, "learning_rate": 1.919237424909846e-05, "loss": 0.2489, "step": 4450 }, { "epoch": 0.9301985370950888, "grad_norm": 0.9228306955252298, "learning_rate": 1.9191930045078115e-05, "loss": 0.1816, "step": 4451 }, { "epoch": 0.9304075235109718, "grad_norm": 1.033669980104344, "learning_rate": 1.9191485724075937e-05, "loss": 0.2021, "step": 4452 }, { "epoch": 0.9306165099268547, "grad_norm": 1.1747315496424733, "learning_rate": 1.9191041286097577e-05, "loss": 0.2054, "step": 4453 }, { "epoch": 0.9308254963427377, "grad_norm": 1.1497830322821947, "learning_rate": 1.919059673114869e-05, "loss": 0.1827, "step": 4454 }, { "epoch": 0.9310344827586207, "grad_norm": 1.048311776653486, "learning_rate": 1.9190152059234935e-05, "loss": 0.2048, "step": 4455 }, { "epoch": 0.9312434691745036, "grad_norm": 1.28403048737679, "learning_rate": 1.918970727036197e-05, "loss": 0.1878, "step": 4456 }, { "epoch": 0.9314524555903866, "grad_norm": 1.122278717981438, "learning_rate": 1.918926236453546e-05, "loss": 0.193, "step": 4457 }, { "epoch": 0.9316614420062695, "grad_norm": 0.954802406908389, "learning_rate": 1.9188817341761062e-05, "loss": 0.1415, "step": 4458 }, { "epoch": 0.9318704284221525, "grad_norm": 0.9813713748458995, "learning_rate": 1.9188372202044444e-05, "loss": 0.1709, "step": 4459 }, { "epoch": 0.9320794148380356, "grad_norm": 0.9932675027382051, "learning_rate": 1.9187926945391268e-05, "loss": 0.2036, "step": 4460 }, { "epoch": 0.9322884012539185, "grad_norm": 1.0481735356772597, "learning_rate": 1.9187481571807196e-05, "loss": 0.1763, "step": 4461 }, { "epoch": 0.9324973876698015, "grad_norm": 1.170063667921254, "learning_rate": 1.9187036081297907e-05, "loss": 0.2069, "step": 4462 }, { "epoch": 0.9327063740856845, "grad_norm": 1.0481682162873842, "learning_rate": 1.9186590473869065e-05, "loss": 0.2024, "step": 4463 }, { "epoch": 0.9329153605015674, "grad_norm": 0.8431729407002131, "learning_rate": 1.9186144749526336e-05, "loss": 0.1877, "step": 4464 }, { "epoch": 0.9331243469174504, "grad_norm": 1.0985390887213038, "learning_rate": 1.9185698908275404e-05, "loss": 0.2237, "step": 4465 }, { "epoch": 0.9333333333333333, "grad_norm": 1.2022077657071069, "learning_rate": 1.9185252950121932e-05, "loss": 0.2136, "step": 4466 }, { "epoch": 0.9335423197492163, "grad_norm": 1.3980342307711786, "learning_rate": 1.91848068750716e-05, "loss": 0.2022, "step": 4467 }, { "epoch": 0.9337513061650993, "grad_norm": 1.0757269720318878, "learning_rate": 1.918436068313009e-05, "loss": 0.1631, "step": 4468 }, { "epoch": 0.9339602925809822, "grad_norm": 1.1643831663950168, "learning_rate": 1.9183914374303073e-05, "loss": 0.1897, "step": 4469 }, { "epoch": 0.9341692789968652, "grad_norm": 1.245358125230303, "learning_rate": 1.918346794859623e-05, "loss": 0.2361, "step": 4470 }, { "epoch": 0.9343782654127482, "grad_norm": 1.0279563448924682, "learning_rate": 1.9183021406015245e-05, "loss": 0.2002, "step": 4471 }, { "epoch": 0.9345872518286311, "grad_norm": 1.1249740877963526, "learning_rate": 1.91825747465658e-05, "loss": 0.2107, "step": 4472 }, { "epoch": 0.9347962382445141, "grad_norm": 1.077602807694861, "learning_rate": 1.9182127970253582e-05, "loss": 0.2178, "step": 4473 }, { "epoch": 0.935005224660397, "grad_norm": 1.2767971343952713, "learning_rate": 1.918168107708427e-05, "loss": 0.2055, "step": 4474 }, { "epoch": 0.93521421107628, "grad_norm": 1.0119512654299623, "learning_rate": 1.9181234067063557e-05, "loss": 0.2206, "step": 4475 }, { "epoch": 0.935423197492163, "grad_norm": 1.0241896886363355, "learning_rate": 1.9180786940197135e-05, "loss": 0.2033, "step": 4476 }, { "epoch": 0.9356321839080459, "grad_norm": 0.9817713943313636, "learning_rate": 1.9180339696490686e-05, "loss": 0.2403, "step": 4477 }, { "epoch": 0.935841170323929, "grad_norm": 1.1526511954577676, "learning_rate": 1.917989233594991e-05, "loss": 0.1978, "step": 4478 }, { "epoch": 0.936050156739812, "grad_norm": 1.046708178263719, "learning_rate": 1.9179444858580495e-05, "loss": 0.1604, "step": 4479 }, { "epoch": 0.9362591431556949, "grad_norm": 1.0989659339847162, "learning_rate": 1.9178997264388137e-05, "loss": 0.2044, "step": 4480 }, { "epoch": 0.9364681295715779, "grad_norm": 1.3594485039761401, "learning_rate": 1.9178549553378527e-05, "loss": 0.2057, "step": 4481 }, { "epoch": 0.9366771159874608, "grad_norm": 1.2030453444983302, "learning_rate": 1.9178101725557375e-05, "loss": 0.185, "step": 4482 }, { "epoch": 0.9368861024033438, "grad_norm": 0.9891893500635279, "learning_rate": 1.9177653780930373e-05, "loss": 0.1691, "step": 4483 }, { "epoch": 0.9370950888192268, "grad_norm": 0.8922848043775338, "learning_rate": 1.917720571950322e-05, "loss": 0.2224, "step": 4484 }, { "epoch": 0.9373040752351097, "grad_norm": 1.0973414248894289, "learning_rate": 1.9176757541281624e-05, "loss": 0.2008, "step": 4485 }, { "epoch": 0.9375130616509927, "grad_norm": 1.0404262234934671, "learning_rate": 1.917630924627128e-05, "loss": 0.1839, "step": 4486 }, { "epoch": 0.9377220480668756, "grad_norm": 1.0357618305514074, "learning_rate": 1.9175860834477904e-05, "loss": 0.2148, "step": 4487 }, { "epoch": 0.9379310344827586, "grad_norm": 1.0098829491016332, "learning_rate": 1.9175412305907198e-05, "loss": 0.2361, "step": 4488 }, { "epoch": 0.9381400208986416, "grad_norm": 0.9301396819143232, "learning_rate": 1.917496366056487e-05, "loss": 0.1674, "step": 4489 }, { "epoch": 0.9383490073145245, "grad_norm": 1.0852170664071137, "learning_rate": 1.9174514898456625e-05, "loss": 0.1913, "step": 4490 }, { "epoch": 0.9385579937304075, "grad_norm": 1.0383898002404848, "learning_rate": 1.9174066019588183e-05, "loss": 0.188, "step": 4491 }, { "epoch": 0.9387669801462905, "grad_norm": 1.1722089162892826, "learning_rate": 1.917361702396525e-05, "loss": 0.1912, "step": 4492 }, { "epoch": 0.9389759665621734, "grad_norm": 1.1017869414624395, "learning_rate": 1.9173167911593545e-05, "loss": 0.1924, "step": 4493 }, { "epoch": 0.9391849529780564, "grad_norm": 0.9327928158583804, "learning_rate": 1.917271868247878e-05, "loss": 0.1685, "step": 4494 }, { "epoch": 0.9393939393939394, "grad_norm": 1.1401917568834652, "learning_rate": 1.9172269336626673e-05, "loss": 0.2028, "step": 4495 }, { "epoch": 0.9396029258098224, "grad_norm": 1.128405394541534, "learning_rate": 1.9171819874042944e-05, "loss": 0.1732, "step": 4496 }, { "epoch": 0.9398119122257054, "grad_norm": 1.0839302544583695, "learning_rate": 1.9171370294733312e-05, "loss": 0.2146, "step": 4497 }, { "epoch": 0.9400208986415883, "grad_norm": 1.2491653934528537, "learning_rate": 1.9170920598703498e-05, "loss": 0.2376, "step": 4498 }, { "epoch": 0.9402298850574713, "grad_norm": 1.103343921576797, "learning_rate": 1.917047078595923e-05, "loss": 0.2259, "step": 4499 }, { "epoch": 0.9404388714733543, "grad_norm": 1.1166428624962303, "learning_rate": 1.9170020856506224e-05, "loss": 0.199, "step": 4500 }, { "epoch": 0.9406478578892372, "grad_norm": 1.0297538211593542, "learning_rate": 1.9169570810350212e-05, "loss": 0.2122, "step": 4501 }, { "epoch": 0.9408568443051202, "grad_norm": 1.1966196028105602, "learning_rate": 1.916912064749692e-05, "loss": 0.1952, "step": 4502 }, { "epoch": 0.9410658307210031, "grad_norm": 1.0723002475472327, "learning_rate": 1.9168670367952077e-05, "loss": 0.2071, "step": 4503 }, { "epoch": 0.9412748171368861, "grad_norm": 1.076586372688852, "learning_rate": 1.9168219971721417e-05, "loss": 0.2039, "step": 4504 }, { "epoch": 0.9414838035527691, "grad_norm": 1.0988861277374329, "learning_rate": 1.9167769458810664e-05, "loss": 0.191, "step": 4505 }, { "epoch": 0.941692789968652, "grad_norm": 1.0983002230822119, "learning_rate": 1.9167318829225558e-05, "loss": 0.1896, "step": 4506 }, { "epoch": 0.941901776384535, "grad_norm": 1.1878154599846231, "learning_rate": 1.9166868082971834e-05, "loss": 0.2143, "step": 4507 }, { "epoch": 0.942110762800418, "grad_norm": 1.195484790328629, "learning_rate": 1.9166417220055226e-05, "loss": 0.2254, "step": 4508 }, { "epoch": 0.9423197492163009, "grad_norm": 1.214527935049523, "learning_rate": 1.916596624048147e-05, "loss": 0.2017, "step": 4509 }, { "epoch": 0.9425287356321839, "grad_norm": 1.00531103165247, "learning_rate": 1.9165515144256307e-05, "loss": 0.2372, "step": 4510 }, { "epoch": 0.9427377220480668, "grad_norm": 1.0294541205489618, "learning_rate": 1.9165063931385485e-05, "loss": 0.223, "step": 4511 }, { "epoch": 0.9429467084639498, "grad_norm": 1.1220462499619013, "learning_rate": 1.9164612601874733e-05, "loss": 0.2308, "step": 4512 }, { "epoch": 0.9431556948798329, "grad_norm": 0.945829102747699, "learning_rate": 1.9164161155729808e-05, "loss": 0.183, "step": 4513 }, { "epoch": 0.9433646812957158, "grad_norm": 1.024631253269755, "learning_rate": 1.916370959295645e-05, "loss": 0.1802, "step": 4514 }, { "epoch": 0.9435736677115988, "grad_norm": 1.1475104558111617, "learning_rate": 1.91632579135604e-05, "loss": 0.2061, "step": 4515 }, { "epoch": 0.9437826541274817, "grad_norm": 1.0222813679003266, "learning_rate": 1.9162806117547416e-05, "loss": 0.1933, "step": 4516 }, { "epoch": 0.9439916405433647, "grad_norm": 1.128094727154864, "learning_rate": 1.9162354204923242e-05, "loss": 0.2191, "step": 4517 }, { "epoch": 0.9442006269592477, "grad_norm": 1.0238757451950662, "learning_rate": 1.916190217569363e-05, "loss": 0.1548, "step": 4518 }, { "epoch": 0.9444096133751306, "grad_norm": 1.2660527346805137, "learning_rate": 1.9161450029864338e-05, "loss": 0.2299, "step": 4519 }, { "epoch": 0.9446185997910136, "grad_norm": 1.1146522990483647, "learning_rate": 1.916099776744111e-05, "loss": 0.217, "step": 4520 }, { "epoch": 0.9448275862068966, "grad_norm": 1.1411043510548315, "learning_rate": 1.916054538842971e-05, "loss": 0.2232, "step": 4521 }, { "epoch": 0.9450365726227795, "grad_norm": 0.9947814729176563, "learning_rate": 1.9160092892835893e-05, "loss": 0.1914, "step": 4522 }, { "epoch": 0.9452455590386625, "grad_norm": 0.9884974680119065, "learning_rate": 1.915964028066542e-05, "loss": 0.221, "step": 4523 }, { "epoch": 0.9454545454545454, "grad_norm": 1.2236832311886556, "learning_rate": 1.9159187551924045e-05, "loss": 0.164, "step": 4524 }, { "epoch": 0.9456635318704284, "grad_norm": 1.1237313167150123, "learning_rate": 1.9158734706617537e-05, "loss": 0.2355, "step": 4525 }, { "epoch": 0.9458725182863114, "grad_norm": 1.1095835765966897, "learning_rate": 1.9158281744751653e-05, "loss": 0.1842, "step": 4526 }, { "epoch": 0.9460815047021943, "grad_norm": 1.1568582098839515, "learning_rate": 1.915782866633216e-05, "loss": 0.2029, "step": 4527 }, { "epoch": 0.9462904911180773, "grad_norm": 1.0086481421171951, "learning_rate": 1.9157375471364825e-05, "loss": 0.1958, "step": 4528 }, { "epoch": 0.9464994775339602, "grad_norm": 1.1621047128425066, "learning_rate": 1.9156922159855415e-05, "loss": 0.1822, "step": 4529 }, { "epoch": 0.9467084639498433, "grad_norm": 0.8401832360423449, "learning_rate": 1.91564687318097e-05, "loss": 0.2005, "step": 4530 }, { "epoch": 0.9469174503657263, "grad_norm": 0.960884386971075, "learning_rate": 1.9156015187233453e-05, "loss": 0.1777, "step": 4531 }, { "epoch": 0.9471264367816092, "grad_norm": 1.1711159869271612, "learning_rate": 1.915556152613244e-05, "loss": 0.1788, "step": 4532 }, { "epoch": 0.9473354231974922, "grad_norm": 0.9847102078491176, "learning_rate": 1.9155107748512436e-05, "loss": 0.2027, "step": 4533 }, { "epoch": 0.9475444096133752, "grad_norm": 1.1383680324980834, "learning_rate": 1.915465385437922e-05, "loss": 0.1897, "step": 4534 }, { "epoch": 0.9477533960292581, "grad_norm": 1.108910041355659, "learning_rate": 1.9154199843738563e-05, "loss": 0.2002, "step": 4535 }, { "epoch": 0.9479623824451411, "grad_norm": 0.9580298124936131, "learning_rate": 1.915374571659625e-05, "loss": 0.1957, "step": 4536 }, { "epoch": 0.948171368861024, "grad_norm": 0.9844013984559824, "learning_rate": 1.9153291472958055e-05, "loss": 0.2194, "step": 4537 }, { "epoch": 0.948380355276907, "grad_norm": 1.0346913895625425, "learning_rate": 1.9152837112829757e-05, "loss": 0.1801, "step": 4538 }, { "epoch": 0.94858934169279, "grad_norm": 1.6412832108592474, "learning_rate": 1.9152382636217144e-05, "loss": 0.2311, "step": 4539 }, { "epoch": 0.9487983281086729, "grad_norm": 1.1870460494989077, "learning_rate": 1.9151928043125997e-05, "loss": 0.2024, "step": 4540 }, { "epoch": 0.9490073145245559, "grad_norm": 1.0302808693482077, "learning_rate": 1.9151473333562104e-05, "loss": 0.1978, "step": 4541 }, { "epoch": 0.9492163009404389, "grad_norm": 1.0825958415796282, "learning_rate": 1.915101850753125e-05, "loss": 0.1721, "step": 4542 }, { "epoch": 0.9494252873563218, "grad_norm": 0.9506807175479997, "learning_rate": 1.9150563565039224e-05, "loss": 0.1902, "step": 4543 }, { "epoch": 0.9496342737722048, "grad_norm": 1.059337419228065, "learning_rate": 1.9150108506091815e-05, "loss": 0.1946, "step": 4544 }, { "epoch": 0.9498432601880877, "grad_norm": 1.0991259797849333, "learning_rate": 1.9149653330694814e-05, "loss": 0.1722, "step": 4545 }, { "epoch": 0.9500522466039707, "grad_norm": 1.179449983340323, "learning_rate": 1.9149198038854012e-05, "loss": 0.2292, "step": 4546 }, { "epoch": 0.9502612330198537, "grad_norm": 1.2516780126652713, "learning_rate": 1.914874263057521e-05, "loss": 0.2127, "step": 4547 }, { "epoch": 0.9504702194357367, "grad_norm": 1.1757774594413757, "learning_rate": 1.91482871058642e-05, "loss": 0.1947, "step": 4548 }, { "epoch": 0.9506792058516197, "grad_norm": 1.2091890503110947, "learning_rate": 1.9147831464726776e-05, "loss": 0.1689, "step": 4549 }, { "epoch": 0.9508881922675027, "grad_norm": 1.202209324332672, "learning_rate": 1.914737570716874e-05, "loss": 0.1701, "step": 4550 }, { "epoch": 0.9510971786833856, "grad_norm": 1.138435550777234, "learning_rate": 1.9146919833195895e-05, "loss": 0.2079, "step": 4551 }, { "epoch": 0.9513061650992686, "grad_norm": 1.1263910408912257, "learning_rate": 1.9146463842814035e-05, "loss": 0.2272, "step": 4552 }, { "epoch": 0.9515151515151515, "grad_norm": 1.1983617232043444, "learning_rate": 1.914600773602897e-05, "loss": 0.1839, "step": 4553 }, { "epoch": 0.9517241379310345, "grad_norm": 1.0534065581255825, "learning_rate": 1.9145551512846504e-05, "loss": 0.2003, "step": 4554 }, { "epoch": 0.9519331243469175, "grad_norm": 1.2323605037517356, "learning_rate": 1.914509517327244e-05, "loss": 0.1929, "step": 4555 }, { "epoch": 0.9521421107628004, "grad_norm": 1.2626047212498315, "learning_rate": 1.914463871731259e-05, "loss": 0.202, "step": 4556 }, { "epoch": 0.9523510971786834, "grad_norm": 1.1028618410724045, "learning_rate": 1.9144182144972755e-05, "loss": 0.1913, "step": 4557 }, { "epoch": 0.9525600835945663, "grad_norm": 1.1987204055132523, "learning_rate": 1.914372545625876e-05, "loss": 0.2136, "step": 4558 }, { "epoch": 0.9527690700104493, "grad_norm": 1.028792800299043, "learning_rate": 1.9143268651176398e-05, "loss": 0.1851, "step": 4559 }, { "epoch": 0.9529780564263323, "grad_norm": 1.1312266403325877, "learning_rate": 1.9142811729731497e-05, "loss": 0.1852, "step": 4560 }, { "epoch": 0.9531870428422152, "grad_norm": 1.1193355489233514, "learning_rate": 1.914235469192987e-05, "loss": 0.1763, "step": 4561 }, { "epoch": 0.9533960292580982, "grad_norm": 1.1785574726177697, "learning_rate": 1.9141897537777328e-05, "loss": 0.1897, "step": 4562 }, { "epoch": 0.9536050156739811, "grad_norm": 1.1452732432210668, "learning_rate": 1.9141440267279694e-05, "loss": 0.2233, "step": 4563 }, { "epoch": 0.9538140020898641, "grad_norm": 1.0722539051061708, "learning_rate": 1.9140982880442786e-05, "loss": 0.2188, "step": 4564 }, { "epoch": 0.9540229885057471, "grad_norm": 0.9363788935854719, "learning_rate": 1.9140525377272424e-05, "loss": 0.2582, "step": 4565 }, { "epoch": 0.9542319749216301, "grad_norm": 1.1908301535678085, "learning_rate": 1.914006775777443e-05, "loss": 0.1941, "step": 4566 }, { "epoch": 0.9544409613375131, "grad_norm": 1.1526976002209708, "learning_rate": 1.913961002195463e-05, "loss": 0.2154, "step": 4567 }, { "epoch": 0.9546499477533961, "grad_norm": 1.0154656171353527, "learning_rate": 1.9139152169818847e-05, "loss": 0.1928, "step": 4568 }, { "epoch": 0.954858934169279, "grad_norm": 1.020278994809059, "learning_rate": 1.9138694201372915e-05, "loss": 0.198, "step": 4569 }, { "epoch": 0.955067920585162, "grad_norm": 0.9990619726125055, "learning_rate": 1.913823611662265e-05, "loss": 0.2236, "step": 4570 }, { "epoch": 0.955276907001045, "grad_norm": 1.2088404129029147, "learning_rate": 1.9137777915573893e-05, "loss": 0.2161, "step": 4571 }, { "epoch": 0.9554858934169279, "grad_norm": 1.1253949563878847, "learning_rate": 1.9137319598232468e-05, "loss": 0.1829, "step": 4572 }, { "epoch": 0.9556948798328109, "grad_norm": 1.0377488145265046, "learning_rate": 1.9136861164604214e-05, "loss": 0.2006, "step": 4573 }, { "epoch": 0.9559038662486938, "grad_norm": 1.4991622846382693, "learning_rate": 1.913640261469496e-05, "loss": 0.2168, "step": 4574 }, { "epoch": 0.9561128526645768, "grad_norm": 1.183350615287098, "learning_rate": 1.9135943948510546e-05, "loss": 0.2136, "step": 4575 }, { "epoch": 0.9563218390804598, "grad_norm": 0.9084422674761055, "learning_rate": 1.9135485166056803e-05, "loss": 0.2197, "step": 4576 }, { "epoch": 0.9565308254963427, "grad_norm": 1.200746163906972, "learning_rate": 1.9135026267339577e-05, "loss": 0.211, "step": 4577 }, { "epoch": 0.9567398119122257, "grad_norm": 1.0850304795160048, "learning_rate": 1.9134567252364703e-05, "loss": 0.2074, "step": 4578 }, { "epoch": 0.9569487983281086, "grad_norm": 1.0413871047580676, "learning_rate": 1.913410812113803e-05, "loss": 0.195, "step": 4579 }, { "epoch": 0.9571577847439916, "grad_norm": 1.0029983539046223, "learning_rate": 1.913364887366539e-05, "loss": 0.1974, "step": 4580 }, { "epoch": 0.9573667711598746, "grad_norm": 1.1598402071817613, "learning_rate": 1.9133189509952633e-05, "loss": 0.1664, "step": 4581 }, { "epoch": 0.9575757575757575, "grad_norm": 1.0188975289235498, "learning_rate": 1.913273003000561e-05, "loss": 0.177, "step": 4582 }, { "epoch": 0.9577847439916406, "grad_norm": 1.121946929618191, "learning_rate": 1.9132270433830165e-05, "loss": 0.2031, "step": 4583 }, { "epoch": 0.9579937304075236, "grad_norm": 1.0823587879504086, "learning_rate": 1.9131810721432143e-05, "loss": 0.208, "step": 4584 }, { "epoch": 0.9582027168234065, "grad_norm": 1.0361063497373764, "learning_rate": 1.91313508928174e-05, "loss": 0.174, "step": 4585 }, { "epoch": 0.9584117032392895, "grad_norm": 1.169553064129669, "learning_rate": 1.9130890947991783e-05, "loss": 0.1649, "step": 4586 }, { "epoch": 0.9586206896551724, "grad_norm": 1.020322668913952, "learning_rate": 1.913043088696115e-05, "loss": 0.1867, "step": 4587 }, { "epoch": 0.9588296760710554, "grad_norm": 1.1158437895105597, "learning_rate": 1.9129970709731356e-05, "loss": 0.199, "step": 4588 }, { "epoch": 0.9590386624869384, "grad_norm": 0.99820478940127, "learning_rate": 1.9129510416308255e-05, "loss": 0.2074, "step": 4589 }, { "epoch": 0.9592476489028213, "grad_norm": 1.5055515895858735, "learning_rate": 1.9129050006697706e-05, "loss": 0.1853, "step": 4590 }, { "epoch": 0.9594566353187043, "grad_norm": 0.8869180426330842, "learning_rate": 1.912858948090557e-05, "loss": 0.1895, "step": 4591 }, { "epoch": 0.9596656217345872, "grad_norm": 1.1570280737666074, "learning_rate": 1.91281288389377e-05, "loss": 0.1905, "step": 4592 }, { "epoch": 0.9598746081504702, "grad_norm": 0.949228594811875, "learning_rate": 1.9127668080799973e-05, "loss": 0.1711, "step": 4593 }, { "epoch": 0.9600835945663532, "grad_norm": 1.1021005262559034, "learning_rate": 1.9127207206498238e-05, "loss": 0.1919, "step": 4594 }, { "epoch": 0.9602925809822361, "grad_norm": 0.8806971819887034, "learning_rate": 1.9126746216038372e-05, "loss": 0.1829, "step": 4595 }, { "epoch": 0.9605015673981191, "grad_norm": 1.122888886759665, "learning_rate": 1.9126285109426234e-05, "loss": 0.204, "step": 4596 }, { "epoch": 0.960710553814002, "grad_norm": 1.2959021757875953, "learning_rate": 1.9125823886667694e-05, "loss": 0.1888, "step": 4597 }, { "epoch": 0.960919540229885, "grad_norm": 1.2445698803917862, "learning_rate": 1.9125362547768625e-05, "loss": 0.1952, "step": 4598 }, { "epoch": 0.961128526645768, "grad_norm": 0.9548926412296, "learning_rate": 1.9124901092734898e-05, "loss": 0.1908, "step": 4599 }, { "epoch": 0.9613375130616509, "grad_norm": 0.9966523585881393, "learning_rate": 1.912443952157238e-05, "loss": 0.2101, "step": 4600 }, { "epoch": 0.961546499477534, "grad_norm": 0.992524339496043, "learning_rate": 1.9123977834286954e-05, "loss": 0.2033, "step": 4601 }, { "epoch": 0.961755485893417, "grad_norm": 1.2566746922981153, "learning_rate": 1.912351603088449e-05, "loss": 0.1719, "step": 4602 }, { "epoch": 0.9619644723092999, "grad_norm": 1.267786428342495, "learning_rate": 1.912305411137086e-05, "loss": 0.2293, "step": 4603 }, { "epoch": 0.9621734587251829, "grad_norm": 1.0643208857865774, "learning_rate": 1.9122592075751952e-05, "loss": 0.1809, "step": 4604 }, { "epoch": 0.9623824451410659, "grad_norm": 1.2891188013451802, "learning_rate": 1.9122129924033643e-05, "loss": 0.2083, "step": 4605 }, { "epoch": 0.9625914315569488, "grad_norm": 1.3606678273559096, "learning_rate": 1.9121667656221815e-05, "loss": 0.1941, "step": 4606 }, { "epoch": 0.9628004179728318, "grad_norm": 1.1420615949021182, "learning_rate": 1.9121205272322352e-05, "loss": 0.1941, "step": 4607 }, { "epoch": 0.9630094043887147, "grad_norm": 1.048376972973909, "learning_rate": 1.9120742772341138e-05, "loss": 0.1853, "step": 4608 }, { "epoch": 0.9632183908045977, "grad_norm": 1.266878306715313, "learning_rate": 1.9120280156284053e-05, "loss": 0.2054, "step": 4609 }, { "epoch": 0.9634273772204807, "grad_norm": 1.0451758007897867, "learning_rate": 1.9119817424156992e-05, "loss": 0.24, "step": 4610 }, { "epoch": 0.9636363636363636, "grad_norm": 1.0661058962725114, "learning_rate": 1.911935457596584e-05, "loss": 0.2059, "step": 4611 }, { "epoch": 0.9638453500522466, "grad_norm": 0.8624644329038318, "learning_rate": 1.911889161171649e-05, "loss": 0.2077, "step": 4612 }, { "epoch": 0.9640543364681295, "grad_norm": 0.9948712160042466, "learning_rate": 1.9118428531414834e-05, "loss": 0.1917, "step": 4613 }, { "epoch": 0.9642633228840125, "grad_norm": 1.1472120011554372, "learning_rate": 1.9117965335066766e-05, "loss": 0.1809, "step": 4614 }, { "epoch": 0.9644723092998955, "grad_norm": 1.0551855862605197, "learning_rate": 1.911750202267818e-05, "loss": 0.2025, "step": 4615 }, { "epoch": 0.9646812957157784, "grad_norm": 0.9029617279653679, "learning_rate": 1.9117038594254966e-05, "loss": 0.2112, "step": 4616 }, { "epoch": 0.9648902821316614, "grad_norm": 1.1068240303916488, "learning_rate": 1.911657504980303e-05, "loss": 0.1861, "step": 4617 }, { "epoch": 0.9650992685475445, "grad_norm": 0.8348775018052411, "learning_rate": 1.911611138932827e-05, "loss": 0.1814, "step": 4618 }, { "epoch": 0.9653082549634274, "grad_norm": 1.1629915025032225, "learning_rate": 1.9115647612836586e-05, "loss": 0.2178, "step": 4619 }, { "epoch": 0.9655172413793104, "grad_norm": 1.0135345168956695, "learning_rate": 1.9115183720333877e-05, "loss": 0.2154, "step": 4620 }, { "epoch": 0.9657262277951933, "grad_norm": 1.0467282681189887, "learning_rate": 1.9114719711826055e-05, "loss": 0.1708, "step": 4621 }, { "epoch": 0.9659352142110763, "grad_norm": 1.0274807994501978, "learning_rate": 1.9114255587319016e-05, "loss": 0.1961, "step": 4622 }, { "epoch": 0.9661442006269593, "grad_norm": 1.0625006856104156, "learning_rate": 1.9113791346818672e-05, "loss": 0.1944, "step": 4623 }, { "epoch": 0.9663531870428422, "grad_norm": 0.9919993011850131, "learning_rate": 1.9113326990330932e-05, "loss": 0.1776, "step": 4624 }, { "epoch": 0.9665621734587252, "grad_norm": 0.9962108081831064, "learning_rate": 1.9112862517861704e-05, "loss": 0.1869, "step": 4625 }, { "epoch": 0.9667711598746082, "grad_norm": 1.136601719574537, "learning_rate": 1.9112397929416895e-05, "loss": 0.1994, "step": 4626 }, { "epoch": 0.9669801462904911, "grad_norm": 0.9772934867569547, "learning_rate": 1.9111933225002423e-05, "loss": 0.1695, "step": 4627 }, { "epoch": 0.9671891327063741, "grad_norm": 1.1459531041618376, "learning_rate": 1.9111468404624204e-05, "loss": 0.1941, "step": 4628 }, { "epoch": 0.967398119122257, "grad_norm": 1.0130210035228562, "learning_rate": 1.9111003468288144e-05, "loss": 0.1916, "step": 4629 }, { "epoch": 0.96760710553814, "grad_norm": 1.2359724815741604, "learning_rate": 1.9110538416000168e-05, "loss": 0.2114, "step": 4630 }, { "epoch": 0.967816091954023, "grad_norm": 1.0825918532705685, "learning_rate": 1.9110073247766195e-05, "loss": 0.2018, "step": 4631 }, { "epoch": 0.9680250783699059, "grad_norm": 1.1587201215400609, "learning_rate": 1.910960796359214e-05, "loss": 0.2012, "step": 4632 }, { "epoch": 0.9682340647857889, "grad_norm": 1.101272363309255, "learning_rate": 1.910914256348393e-05, "loss": 0.166, "step": 4633 }, { "epoch": 0.9684430512016718, "grad_norm": 1.2902047923460405, "learning_rate": 1.910867704744748e-05, "loss": 0.1946, "step": 4634 }, { "epoch": 0.9686520376175548, "grad_norm": 1.1261973281317235, "learning_rate": 1.9108211415488727e-05, "loss": 0.1923, "step": 4635 }, { "epoch": 0.9688610240334379, "grad_norm": 1.084149885114363, "learning_rate": 1.9107745667613585e-05, "loss": 0.1934, "step": 4636 }, { "epoch": 0.9690700104493208, "grad_norm": 1.10773908411802, "learning_rate": 1.9107279803827987e-05, "loss": 0.201, "step": 4637 }, { "epoch": 0.9692789968652038, "grad_norm": 1.1046298397922218, "learning_rate": 1.910681382413786e-05, "loss": 0.1948, "step": 4638 }, { "epoch": 0.9694879832810868, "grad_norm": 1.0168566097821339, "learning_rate": 1.9106347728549134e-05, "loss": 0.1775, "step": 4639 }, { "epoch": 0.9696969696969697, "grad_norm": 1.2532186739429183, "learning_rate": 1.910588151706774e-05, "loss": 0.211, "step": 4640 }, { "epoch": 0.9699059561128527, "grad_norm": 0.9726770132082275, "learning_rate": 1.9105415189699618e-05, "loss": 0.2078, "step": 4641 }, { "epoch": 0.9701149425287356, "grad_norm": 1.0515248236032717, "learning_rate": 1.9104948746450697e-05, "loss": 0.1777, "step": 4642 }, { "epoch": 0.9703239289446186, "grad_norm": 0.8983512414039863, "learning_rate": 1.910448218732691e-05, "loss": 0.1839, "step": 4643 }, { "epoch": 0.9705329153605016, "grad_norm": 1.2943335938270042, "learning_rate": 1.9104015512334204e-05, "loss": 0.2213, "step": 4644 }, { "epoch": 0.9707419017763845, "grad_norm": 0.9737482796958241, "learning_rate": 1.910354872147851e-05, "loss": 0.2053, "step": 4645 }, { "epoch": 0.9709508881922675, "grad_norm": 1.006234630658876, "learning_rate": 1.910308181476577e-05, "loss": 0.2001, "step": 4646 }, { "epoch": 0.9711598746081505, "grad_norm": 1.068422448646089, "learning_rate": 1.910261479220193e-05, "loss": 0.2277, "step": 4647 }, { "epoch": 0.9713688610240334, "grad_norm": 1.110555416013056, "learning_rate": 1.910214765379293e-05, "loss": 0.2528, "step": 4648 }, { "epoch": 0.9715778474399164, "grad_norm": 1.2267230207218194, "learning_rate": 1.910168039954472e-05, "loss": 0.2018, "step": 4649 }, { "epoch": 0.9717868338557993, "grad_norm": 1.3018319802993272, "learning_rate": 1.9101213029463236e-05, "loss": 0.1758, "step": 4650 }, { "epoch": 0.9719958202716823, "grad_norm": 1.0402936901580513, "learning_rate": 1.9100745543554437e-05, "loss": 0.1927, "step": 4651 }, { "epoch": 0.9722048066875653, "grad_norm": 1.1207987037425295, "learning_rate": 1.910027794182427e-05, "loss": 0.2343, "step": 4652 }, { "epoch": 0.9724137931034482, "grad_norm": 1.715158056045592, "learning_rate": 1.9099810224278682e-05, "loss": 0.2514, "step": 4653 }, { "epoch": 0.9726227795193313, "grad_norm": 1.0882406497299308, "learning_rate": 1.909934239092363e-05, "loss": 0.2004, "step": 4654 }, { "epoch": 0.9728317659352143, "grad_norm": 0.9064625980962456, "learning_rate": 1.9098874441765063e-05, "loss": 0.1575, "step": 4655 }, { "epoch": 0.9730407523510972, "grad_norm": 1.000980635322249, "learning_rate": 1.909840637680894e-05, "loss": 0.1879, "step": 4656 }, { "epoch": 0.9732497387669802, "grad_norm": 1.0460296159185598, "learning_rate": 1.9097938196061218e-05, "loss": 0.1916, "step": 4657 }, { "epoch": 0.9734587251828631, "grad_norm": 1.2078609587503613, "learning_rate": 1.9097469899527855e-05, "loss": 0.1878, "step": 4658 }, { "epoch": 0.9736677115987461, "grad_norm": 1.1001088027097625, "learning_rate": 1.909700148721481e-05, "loss": 0.2025, "step": 4659 }, { "epoch": 0.9738766980146291, "grad_norm": 1.225256257711064, "learning_rate": 1.909653295912804e-05, "loss": 0.2108, "step": 4660 }, { "epoch": 0.974085684430512, "grad_norm": 0.9857539978972338, "learning_rate": 1.909606431527352e-05, "loss": 0.1923, "step": 4661 }, { "epoch": 0.974294670846395, "grad_norm": 1.2053122959452671, "learning_rate": 1.90955955556572e-05, "loss": 0.1767, "step": 4662 }, { "epoch": 0.974503657262278, "grad_norm": 1.2175876703051145, "learning_rate": 1.9095126680285056e-05, "loss": 0.1873, "step": 4663 }, { "epoch": 0.9747126436781609, "grad_norm": 1.172786560780982, "learning_rate": 1.909465768916305e-05, "loss": 0.1813, "step": 4664 }, { "epoch": 0.9749216300940439, "grad_norm": 1.3014868839721951, "learning_rate": 1.909418858229715e-05, "loss": 0.1862, "step": 4665 }, { "epoch": 0.9751306165099268, "grad_norm": 1.1791378847857874, "learning_rate": 1.9093719359693332e-05, "loss": 0.1709, "step": 4666 }, { "epoch": 0.9753396029258098, "grad_norm": 1.3742105245781049, "learning_rate": 1.9093250021357562e-05, "loss": 0.1907, "step": 4667 }, { "epoch": 0.9755485893416928, "grad_norm": 0.9359189155973375, "learning_rate": 1.9092780567295813e-05, "loss": 0.1563, "step": 4668 }, { "epoch": 0.9757575757575757, "grad_norm": 1.191980375043811, "learning_rate": 1.9092310997514064e-05, "loss": 0.1937, "step": 4669 }, { "epoch": 0.9759665621734587, "grad_norm": 1.226202802723931, "learning_rate": 1.909184131201829e-05, "loss": 0.2045, "step": 4670 }, { "epoch": 0.9761755485893417, "grad_norm": 1.0807823583043, "learning_rate": 1.9091371510814463e-05, "loss": 0.188, "step": 4671 }, { "epoch": 0.9763845350052247, "grad_norm": 1.2288529467130844, "learning_rate": 1.9090901593908565e-05, "loss": 0.1798, "step": 4672 }, { "epoch": 0.9765935214211077, "grad_norm": 1.0661897908499203, "learning_rate": 1.909043156130658e-05, "loss": 0.1739, "step": 4673 }, { "epoch": 0.9768025078369906, "grad_norm": 1.0881678123345842, "learning_rate": 1.9089961413014487e-05, "loss": 0.2262, "step": 4674 }, { "epoch": 0.9770114942528736, "grad_norm": 1.2473458049075887, "learning_rate": 1.908949114903827e-05, "loss": 0.2293, "step": 4675 }, { "epoch": 0.9772204806687566, "grad_norm": 1.1717523720976073, "learning_rate": 1.9089020769383913e-05, "loss": 0.1766, "step": 4676 }, { "epoch": 0.9774294670846395, "grad_norm": 1.0949161503855835, "learning_rate": 1.90885502740574e-05, "loss": 0.1883, "step": 4677 }, { "epoch": 0.9776384535005225, "grad_norm": 1.1654466930904102, "learning_rate": 1.9088079663064723e-05, "loss": 0.205, "step": 4678 }, { "epoch": 0.9778474399164054, "grad_norm": 1.0667615784374276, "learning_rate": 1.9087608936411873e-05, "loss": 0.1816, "step": 4679 }, { "epoch": 0.9780564263322884, "grad_norm": 1.0741463659275428, "learning_rate": 1.9087138094104835e-05, "loss": 0.1885, "step": 4680 }, { "epoch": 0.9782654127481714, "grad_norm": 1.2328054172138465, "learning_rate": 1.9086667136149604e-05, "loss": 0.2116, "step": 4681 }, { "epoch": 0.9784743991640543, "grad_norm": 1.1684175556084841, "learning_rate": 1.9086196062552173e-05, "loss": 0.2018, "step": 4682 }, { "epoch": 0.9786833855799373, "grad_norm": 1.2005416287948363, "learning_rate": 1.9085724873318536e-05, "loss": 0.2058, "step": 4683 }, { "epoch": 0.9788923719958202, "grad_norm": 1.0458405766559806, "learning_rate": 1.9085253568454693e-05, "loss": 0.1652, "step": 4684 }, { "epoch": 0.9791013584117032, "grad_norm": 1.1230223201250737, "learning_rate": 1.9084782147966642e-05, "loss": 0.2261, "step": 4685 }, { "epoch": 0.9793103448275862, "grad_norm": 1.0766765090337362, "learning_rate": 1.908431061186038e-05, "loss": 0.2173, "step": 4686 }, { "epoch": 0.9795193312434691, "grad_norm": 0.9794098631634929, "learning_rate": 1.9083838960141904e-05, "loss": 0.194, "step": 4687 }, { "epoch": 0.9797283176593521, "grad_norm": 0.9132633577222335, "learning_rate": 1.9083367192817226e-05, "loss": 0.2149, "step": 4688 }, { "epoch": 0.9799373040752352, "grad_norm": 1.0865713549842007, "learning_rate": 1.9082895309892343e-05, "loss": 0.1667, "step": 4689 }, { "epoch": 0.9801462904911181, "grad_norm": 0.9519501091056062, "learning_rate": 1.9082423311373267e-05, "loss": 0.1885, "step": 4690 }, { "epoch": 0.9803552769070011, "grad_norm": 0.9088046878996814, "learning_rate": 1.9081951197266e-05, "loss": 0.15, "step": 4691 }, { "epoch": 0.980564263322884, "grad_norm": 1.0469957925555273, "learning_rate": 1.908147896757655e-05, "loss": 0.2144, "step": 4692 }, { "epoch": 0.980773249738767, "grad_norm": 0.9370721641194564, "learning_rate": 1.9081006622310924e-05, "loss": 0.201, "step": 4693 }, { "epoch": 0.98098223615465, "grad_norm": 1.0918329491897767, "learning_rate": 1.908053416147514e-05, "loss": 0.2228, "step": 4694 }, { "epoch": 0.9811912225705329, "grad_norm": 1.098823485695727, "learning_rate": 1.9080061585075215e-05, "loss": 0.178, "step": 4695 }, { "epoch": 0.9814002089864159, "grad_norm": 1.0317087396395221, "learning_rate": 1.907958889311715e-05, "loss": 0.1926, "step": 4696 }, { "epoch": 0.9816091954022989, "grad_norm": 1.0742778710397045, "learning_rate": 1.9079116085606966e-05, "loss": 0.2054, "step": 4697 }, { "epoch": 0.9818181818181818, "grad_norm": 1.2548926962566063, "learning_rate": 1.9078643162550686e-05, "loss": 0.2414, "step": 4698 }, { "epoch": 0.9820271682340648, "grad_norm": 1.1463074056526332, "learning_rate": 1.907817012395432e-05, "loss": 0.1969, "step": 4699 }, { "epoch": 0.9822361546499477, "grad_norm": 0.9781454746107708, "learning_rate": 1.9077696969823893e-05, "loss": 0.2046, "step": 4700 }, { "epoch": 0.9824451410658307, "grad_norm": 1.1560523580491484, "learning_rate": 1.9077223700165427e-05, "loss": 0.203, "step": 4701 }, { "epoch": 0.9826541274817137, "grad_norm": 1.131987616414696, "learning_rate": 1.9076750314984943e-05, "loss": 0.2042, "step": 4702 }, { "epoch": 0.9828631138975966, "grad_norm": 1.2239586774117048, "learning_rate": 1.907627681428847e-05, "loss": 0.1679, "step": 4703 }, { "epoch": 0.9830721003134796, "grad_norm": 1.3960382143273735, "learning_rate": 1.907580319808203e-05, "loss": 0.2209, "step": 4704 }, { "epoch": 0.9832810867293625, "grad_norm": 1.2373432597510925, "learning_rate": 1.9075329466371648e-05, "loss": 0.1867, "step": 4705 }, { "epoch": 0.9834900731452456, "grad_norm": 1.1288566303511767, "learning_rate": 1.9074855619163357e-05, "loss": 0.2076, "step": 4706 }, { "epoch": 0.9836990595611286, "grad_norm": 1.2748262591126942, "learning_rate": 1.9074381656463192e-05, "loss": 0.2318, "step": 4707 }, { "epoch": 0.9839080459770115, "grad_norm": 1.0338487617474887, "learning_rate": 1.9073907578277172e-05, "loss": 0.1905, "step": 4708 }, { "epoch": 0.9841170323928945, "grad_norm": 1.3044008914677687, "learning_rate": 1.907343338461134e-05, "loss": 0.2165, "step": 4709 }, { "epoch": 0.9843260188087775, "grad_norm": 0.9020335015344149, "learning_rate": 1.907295907547173e-05, "loss": 0.1839, "step": 4710 }, { "epoch": 0.9845350052246604, "grad_norm": 1.1262536004351373, "learning_rate": 1.907248465086438e-05, "loss": 0.1988, "step": 4711 }, { "epoch": 0.9847439916405434, "grad_norm": 0.9449749807558472, "learning_rate": 1.907201011079532e-05, "loss": 0.1996, "step": 4712 }, { "epoch": 0.9849529780564263, "grad_norm": 0.9984446700526238, "learning_rate": 1.9071535455270595e-05, "loss": 0.1841, "step": 4713 }, { "epoch": 0.9851619644723093, "grad_norm": 1.1253214214287208, "learning_rate": 1.9071060684296246e-05, "loss": 0.1901, "step": 4714 }, { "epoch": 0.9853709508881923, "grad_norm": 0.9891970974224923, "learning_rate": 1.9070585797878316e-05, "loss": 0.2151, "step": 4715 }, { "epoch": 0.9855799373040752, "grad_norm": 1.0645589054318088, "learning_rate": 1.9070110796022844e-05, "loss": 0.2076, "step": 4716 }, { "epoch": 0.9857889237199582, "grad_norm": 1.1988434574414597, "learning_rate": 1.906963567873588e-05, "loss": 0.2336, "step": 4717 }, { "epoch": 0.9859979101358411, "grad_norm": 1.1224313977364906, "learning_rate": 1.9069160446023468e-05, "loss": 0.2206, "step": 4718 }, { "epoch": 0.9862068965517241, "grad_norm": 0.9861656058004467, "learning_rate": 1.9068685097891657e-05, "loss": 0.1927, "step": 4719 }, { "epoch": 0.9864158829676071, "grad_norm": 1.123096212023791, "learning_rate": 1.9068209634346497e-05, "loss": 0.17, "step": 4720 }, { "epoch": 0.98662486938349, "grad_norm": 1.2691597989795862, "learning_rate": 1.9067734055394037e-05, "loss": 0.1694, "step": 4721 }, { "epoch": 0.986833855799373, "grad_norm": 0.9418265259158397, "learning_rate": 1.906725836104033e-05, "loss": 0.187, "step": 4722 }, { "epoch": 0.987042842215256, "grad_norm": 1.2767704971766156, "learning_rate": 1.906678255129143e-05, "loss": 0.2111, "step": 4723 }, { "epoch": 0.987251828631139, "grad_norm": 1.240519778244679, "learning_rate": 1.9066306626153396e-05, "loss": 0.1748, "step": 4724 }, { "epoch": 0.987460815047022, "grad_norm": 0.9703235816096915, "learning_rate": 1.906583058563228e-05, "loss": 0.2053, "step": 4725 }, { "epoch": 0.987669801462905, "grad_norm": 1.0458956730066555, "learning_rate": 1.9065354429734146e-05, "loss": 0.1933, "step": 4726 }, { "epoch": 0.9878787878787879, "grad_norm": 1.079516436574174, "learning_rate": 1.9064878158465046e-05, "loss": 0.198, "step": 4727 }, { "epoch": 0.9880877742946709, "grad_norm": 1.0764182596357392, "learning_rate": 1.9064401771831047e-05, "loss": 0.2511, "step": 4728 }, { "epoch": 0.9882967607105538, "grad_norm": 1.2464656711100435, "learning_rate": 1.906392526983821e-05, "loss": 0.1957, "step": 4729 }, { "epoch": 0.9885057471264368, "grad_norm": 1.1913977210476054, "learning_rate": 1.9063448652492602e-05, "loss": 0.2144, "step": 4730 }, { "epoch": 0.9887147335423198, "grad_norm": 1.2448051135007667, "learning_rate": 1.9062971919800285e-05, "loss": 0.1814, "step": 4731 }, { "epoch": 0.9889237199582027, "grad_norm": 1.0181422500060076, "learning_rate": 1.9062495071767325e-05, "loss": 0.191, "step": 4732 }, { "epoch": 0.9891327063740857, "grad_norm": 0.9449979900244981, "learning_rate": 1.9062018108399798e-05, "loss": 0.1928, "step": 4733 }, { "epoch": 0.9893416927899686, "grad_norm": 1.195850979899248, "learning_rate": 1.9061541029703768e-05, "loss": 0.1899, "step": 4734 }, { "epoch": 0.9895506792058516, "grad_norm": 1.3207102662470371, "learning_rate": 1.9061063835685306e-05, "loss": 0.2425, "step": 4735 }, { "epoch": 0.9897596656217346, "grad_norm": 0.9132925545640447, "learning_rate": 1.9060586526350485e-05, "loss": 0.1709, "step": 4736 }, { "epoch": 0.9899686520376175, "grad_norm": 1.0288196379908745, "learning_rate": 1.9060109101705382e-05, "loss": 0.1851, "step": 4737 }, { "epoch": 0.9901776384535005, "grad_norm": 1.4582307033935833, "learning_rate": 1.9059631561756077e-05, "loss": 0.2205, "step": 4738 }, { "epoch": 0.9903866248693834, "grad_norm": 0.9360611788518745, "learning_rate": 1.905915390650864e-05, "loss": 0.2074, "step": 4739 }, { "epoch": 0.9905956112852664, "grad_norm": 1.0641105888272095, "learning_rate": 1.9058676135969153e-05, "loss": 0.2177, "step": 4740 }, { "epoch": 0.9908045977011494, "grad_norm": 0.9775133500345183, "learning_rate": 1.9058198250143694e-05, "loss": 0.1967, "step": 4741 }, { "epoch": 0.9910135841170324, "grad_norm": 0.8344045608805788, "learning_rate": 1.905772024903835e-05, "loss": 0.1753, "step": 4742 }, { "epoch": 0.9912225705329154, "grad_norm": 1.1199678996656484, "learning_rate": 1.90572421326592e-05, "loss": 0.2263, "step": 4743 }, { "epoch": 0.9914315569487984, "grad_norm": 1.15273321963639, "learning_rate": 1.905676390101233e-05, "loss": 0.2216, "step": 4744 }, { "epoch": 0.9916405433646813, "grad_norm": 1.1764518254671967, "learning_rate": 1.905628555410383e-05, "loss": 0.2271, "step": 4745 }, { "epoch": 0.9918495297805643, "grad_norm": 0.90759986970269, "learning_rate": 1.905580709193978e-05, "loss": 0.1985, "step": 4746 }, { "epoch": 0.9920585161964472, "grad_norm": 1.001023757077447, "learning_rate": 1.905532851452627e-05, "loss": 0.2503, "step": 4747 }, { "epoch": 0.9922675026123302, "grad_norm": 0.8100094374211474, "learning_rate": 1.9054849821869397e-05, "loss": 0.2029, "step": 4748 }, { "epoch": 0.9924764890282132, "grad_norm": 0.9146124917390045, "learning_rate": 1.9054371013975253e-05, "loss": 0.2062, "step": 4749 }, { "epoch": 0.9926854754440961, "grad_norm": 1.0534992993223584, "learning_rate": 1.9053892090849927e-05, "loss": 0.2036, "step": 4750 }, { "epoch": 0.9928944618599791, "grad_norm": 0.868760228392686, "learning_rate": 1.9053413052499514e-05, "loss": 0.1601, "step": 4751 }, { "epoch": 0.993103448275862, "grad_norm": 1.137508555034338, "learning_rate": 1.9052933898930113e-05, "loss": 0.2132, "step": 4752 }, { "epoch": 0.993312434691745, "grad_norm": 0.9896100183982824, "learning_rate": 1.9052454630147822e-05, "loss": 0.2002, "step": 4753 }, { "epoch": 0.993521421107628, "grad_norm": 1.1616426559241906, "learning_rate": 1.9051975246158737e-05, "loss": 0.1863, "step": 4754 }, { "epoch": 0.9937304075235109, "grad_norm": 0.8216864620921901, "learning_rate": 1.9051495746968966e-05, "loss": 0.2052, "step": 4755 }, { "epoch": 0.9939393939393939, "grad_norm": 1.1988551989370262, "learning_rate": 1.9051016132584605e-05, "loss": 0.2268, "step": 4756 }, { "epoch": 0.9941483803552769, "grad_norm": 0.9936614577551411, "learning_rate": 1.905053640301176e-05, "loss": 0.1996, "step": 4757 }, { "epoch": 0.9943573667711598, "grad_norm": 1.0636427373198631, "learning_rate": 1.9050056558256534e-05, "loss": 0.2167, "step": 4758 }, { "epoch": 0.9945663531870429, "grad_norm": 1.3821726814272541, "learning_rate": 1.904957659832504e-05, "loss": 0.1698, "step": 4759 }, { "epoch": 0.9947753396029259, "grad_norm": 1.0106872041029054, "learning_rate": 1.9049096523223377e-05, "loss": 0.2081, "step": 4760 }, { "epoch": 0.9949843260188088, "grad_norm": 0.9443621040319953, "learning_rate": 1.9048616332957663e-05, "loss": 0.1941, "step": 4761 }, { "epoch": 0.9951933124346918, "grad_norm": 0.8599927641207402, "learning_rate": 1.9048136027534005e-05, "loss": 0.1934, "step": 4762 }, { "epoch": 0.9954022988505747, "grad_norm": 1.09395411735671, "learning_rate": 1.904765560695852e-05, "loss": 0.1846, "step": 4763 }, { "epoch": 0.9956112852664577, "grad_norm": 1.2183824491045947, "learning_rate": 1.9047175071237315e-05, "loss": 0.1869, "step": 4764 }, { "epoch": 0.9958202716823407, "grad_norm": 1.1299634914909624, "learning_rate": 1.9046694420376508e-05, "loss": 0.2117, "step": 4765 }, { "epoch": 0.9960292580982236, "grad_norm": 1.0966371276226436, "learning_rate": 1.9046213654382218e-05, "loss": 0.2215, "step": 4766 }, { "epoch": 0.9962382445141066, "grad_norm": 1.0958901108169057, "learning_rate": 1.9045732773260566e-05, "loss": 0.179, "step": 4767 }, { "epoch": 0.9964472309299895, "grad_norm": 1.0828610088596553, "learning_rate": 1.904525177701767e-05, "loss": 0.2087, "step": 4768 }, { "epoch": 0.9966562173458725, "grad_norm": 1.1381984149072244, "learning_rate": 1.9044770665659645e-05, "loss": 0.2239, "step": 4769 }, { "epoch": 0.9968652037617555, "grad_norm": 1.2868381907412576, "learning_rate": 1.9044289439192624e-05, "loss": 0.1641, "step": 4770 }, { "epoch": 0.9970741901776384, "grad_norm": 1.1125784121294853, "learning_rate": 1.9043808097622723e-05, "loss": 0.1879, "step": 4771 }, { "epoch": 0.9972831765935214, "grad_norm": 2.4177161958980147, "learning_rate": 1.904332664095607e-05, "loss": 0.1802, "step": 4772 }, { "epoch": 0.9974921630094044, "grad_norm": 1.0488072304477034, "learning_rate": 1.90428450691988e-05, "loss": 0.1723, "step": 4773 }, { "epoch": 0.9977011494252873, "grad_norm": 1.2139569851332506, "learning_rate": 1.9042363382357033e-05, "loss": 0.1858, "step": 4774 }, { "epoch": 0.9979101358411703, "grad_norm": 1.263034549105075, "learning_rate": 1.90418815804369e-05, "loss": 0.2136, "step": 4775 }, { "epoch": 0.9981191222570532, "grad_norm": 1.1188652796729357, "learning_rate": 1.9041399663444537e-05, "loss": 0.2087, "step": 4776 }, { "epoch": 0.9983281086729363, "grad_norm": 1.502890976975396, "learning_rate": 1.9040917631386073e-05, "loss": 0.1995, "step": 4777 }, { "epoch": 0.9985370950888193, "grad_norm": 1.0235283016873271, "learning_rate": 1.9040435484267642e-05, "loss": 0.19, "step": 4778 }, { "epoch": 0.9987460815047022, "grad_norm": 1.0584340028063648, "learning_rate": 1.9039953222095385e-05, "loss": 0.1996, "step": 4779 }, { "epoch": 0.9989550679205852, "grad_norm": 1.0215190905347087, "learning_rate": 1.9039470844875433e-05, "loss": 0.2083, "step": 4780 }, { "epoch": 0.9991640543364682, "grad_norm": 1.2786089754890895, "learning_rate": 1.9038988352613933e-05, "loss": 0.2112, "step": 4781 }, { "epoch": 0.9993730407523511, "grad_norm": 1.222946513601935, "learning_rate": 1.903850574531702e-05, "loss": 0.2198, "step": 4782 }, { "epoch": 0.9995820271682341, "grad_norm": 1.0358572331181193, "learning_rate": 1.903802302299084e-05, "loss": 0.1827, "step": 4783 }, { "epoch": 0.999791013584117, "grad_norm": 1.1393322639230612, "learning_rate": 1.903754018564153e-05, "loss": 0.1849, "step": 4784 }, { "epoch": 1.0, "grad_norm": 1.2078251913244784, "learning_rate": 1.903705723327524e-05, "loss": 0.1926, "step": 4785 }, { "epoch": 1.000208986415883, "grad_norm": 1.0320676737646037, "learning_rate": 1.9036574165898113e-05, "loss": 0.1625, "step": 4786 }, { "epoch": 1.000417972831766, "grad_norm": 0.991140638535696, "learning_rate": 1.90360909835163e-05, "loss": 0.1872, "step": 4787 }, { "epoch": 1.0006269592476489, "grad_norm": 1.1778353624681603, "learning_rate": 1.903560768613595e-05, "loss": 0.1803, "step": 4788 }, { "epoch": 1.0008359456635318, "grad_norm": 1.0602891632367892, "learning_rate": 1.903512427376321e-05, "loss": 0.2044, "step": 4789 }, { "epoch": 1.0010449320794148, "grad_norm": 0.8995771447025585, "learning_rate": 1.903464074640424e-05, "loss": 0.1634, "step": 4790 }, { "epoch": 1.0012539184952978, "grad_norm": 0.9936445532156462, "learning_rate": 1.903415710406518e-05, "loss": 0.1647, "step": 4791 }, { "epoch": 1.0014629049111807, "grad_norm": 0.9587679657825826, "learning_rate": 1.90336733467522e-05, "loss": 0.1487, "step": 4792 }, { "epoch": 1.0016718913270637, "grad_norm": 0.8959591298154936, "learning_rate": 1.903318947447145e-05, "loss": 0.1796, "step": 4793 }, { "epoch": 1.0018808777429467, "grad_norm": 1.2252945363483483, "learning_rate": 1.9032705487229088e-05, "loss": 0.1765, "step": 4794 }, { "epoch": 1.0020898641588296, "grad_norm": 1.0679015443897064, "learning_rate": 1.903222138503127e-05, "loss": 0.1735, "step": 4795 }, { "epoch": 1.0022988505747126, "grad_norm": 1.0667735885937355, "learning_rate": 1.903173716788416e-05, "loss": 0.1756, "step": 4796 }, { "epoch": 1.0025078369905955, "grad_norm": 0.886333147706573, "learning_rate": 1.903125283579393e-05, "loss": 0.1941, "step": 4797 }, { "epoch": 1.0027168234064785, "grad_norm": 0.983324239508371, "learning_rate": 1.9030768388766723e-05, "loss": 0.1537, "step": 4798 }, { "epoch": 1.0029258098223615, "grad_norm": 1.1426586353920998, "learning_rate": 1.9030283826808724e-05, "loss": 0.1828, "step": 4799 }, { "epoch": 1.0031347962382444, "grad_norm": 0.9148183604771768, "learning_rate": 1.902979914992609e-05, "loss": 0.1897, "step": 4800 }, { "epoch": 1.0033437826541274, "grad_norm": 1.1468683509951045, "learning_rate": 1.9029314358124994e-05, "loss": 0.2083, "step": 4801 }, { "epoch": 1.0035527690700103, "grad_norm": 1.286901135445048, "learning_rate": 1.9028829451411602e-05, "loss": 0.1792, "step": 4802 }, { "epoch": 1.0037617554858935, "grad_norm": 1.2029071510046148, "learning_rate": 1.9028344429792086e-05, "loss": 0.1733, "step": 4803 }, { "epoch": 1.0039707419017765, "grad_norm": 1.0720586291230838, "learning_rate": 1.9027859293272618e-05, "loss": 0.1698, "step": 4804 }, { "epoch": 1.0041797283176594, "grad_norm": 1.0193383960039433, "learning_rate": 1.902737404185937e-05, "loss": 0.1595, "step": 4805 }, { "epoch": 1.0043887147335424, "grad_norm": 0.9641904048340147, "learning_rate": 1.9026888675558526e-05, "loss": 0.1585, "step": 4806 }, { "epoch": 1.0045977011494254, "grad_norm": 1.0657648400247501, "learning_rate": 1.9026403194376256e-05, "loss": 0.1723, "step": 4807 }, { "epoch": 1.0048066875653083, "grad_norm": 1.2928745057386846, "learning_rate": 1.902591759831874e-05, "loss": 0.1885, "step": 4808 }, { "epoch": 1.0050156739811913, "grad_norm": 1.16965968146518, "learning_rate": 1.902543188739216e-05, "loss": 0.191, "step": 4809 }, { "epoch": 1.0052246603970743, "grad_norm": 1.0065174459042319, "learning_rate": 1.9024946061602694e-05, "loss": 0.1678, "step": 4810 }, { "epoch": 1.0054336468129572, "grad_norm": 1.2475915463960248, "learning_rate": 1.9024460120956526e-05, "loss": 0.1889, "step": 4811 }, { "epoch": 1.0056426332288402, "grad_norm": 1.2796890161002867, "learning_rate": 1.9023974065459843e-05, "loss": 0.1664, "step": 4812 }, { "epoch": 1.0058516196447231, "grad_norm": 1.1559915637154137, "learning_rate": 1.9023487895118828e-05, "loss": 0.154, "step": 4813 }, { "epoch": 1.006060606060606, "grad_norm": 1.4470000200758761, "learning_rate": 1.902300160993967e-05, "loss": 0.1902, "step": 4814 }, { "epoch": 1.006269592476489, "grad_norm": 1.2901898430175263, "learning_rate": 1.9022515209928555e-05, "loss": 0.1567, "step": 4815 }, { "epoch": 1.006478578892372, "grad_norm": 1.2520047682708273, "learning_rate": 1.9022028695091678e-05, "loss": 0.1706, "step": 4816 }, { "epoch": 1.006687565308255, "grad_norm": 0.9212968776480903, "learning_rate": 1.9021542065435222e-05, "loss": 0.1288, "step": 4817 }, { "epoch": 1.006896551724138, "grad_norm": 0.9909160620279307, "learning_rate": 1.9021055320965392e-05, "loss": 0.1898, "step": 4818 }, { "epoch": 1.007105538140021, "grad_norm": 1.0850565156774152, "learning_rate": 1.9020568461688374e-05, "loss": 0.1938, "step": 4819 }, { "epoch": 1.0073145245559039, "grad_norm": 1.2170303342629032, "learning_rate": 1.9020081487610368e-05, "loss": 0.1769, "step": 4820 }, { "epoch": 1.0075235109717868, "grad_norm": 1.1089095348116753, "learning_rate": 1.901959439873757e-05, "loss": 0.1875, "step": 4821 }, { "epoch": 1.0077324973876698, "grad_norm": 1.0168543706878628, "learning_rate": 1.9019107195076175e-05, "loss": 0.186, "step": 4822 }, { "epoch": 1.0079414838035528, "grad_norm": 1.0122467989042192, "learning_rate": 1.901861987663239e-05, "loss": 0.1658, "step": 4823 }, { "epoch": 1.0081504702194357, "grad_norm": 0.8648094266779189, "learning_rate": 1.9018132443412415e-05, "loss": 0.1639, "step": 4824 }, { "epoch": 1.0083594566353187, "grad_norm": 1.001358093802448, "learning_rate": 1.901764489542245e-05, "loss": 0.1539, "step": 4825 }, { "epoch": 1.0085684430512016, "grad_norm": 1.0033432133653877, "learning_rate": 1.901715723266871e-05, "loss": 0.1513, "step": 4826 }, { "epoch": 1.0087774294670846, "grad_norm": 1.2503273562753305, "learning_rate": 1.9016669455157385e-05, "loss": 0.1947, "step": 4827 }, { "epoch": 1.0089864158829676, "grad_norm": 1.0316529756272985, "learning_rate": 1.9016181562894695e-05, "loss": 0.1556, "step": 4828 }, { "epoch": 1.0091954022988505, "grad_norm": 0.8786979313193354, "learning_rate": 1.9015693555886848e-05, "loss": 0.1638, "step": 4829 }, { "epoch": 1.0094043887147335, "grad_norm": 1.0838732519754601, "learning_rate": 1.9015205434140046e-05, "loss": 0.1608, "step": 4830 }, { "epoch": 1.0096133751306164, "grad_norm": 1.0851051997885888, "learning_rate": 1.901471719766051e-05, "loss": 0.1715, "step": 4831 }, { "epoch": 1.0098223615464994, "grad_norm": 0.9591359066014178, "learning_rate": 1.9014228846454456e-05, "loss": 0.1576, "step": 4832 }, { "epoch": 1.0100313479623824, "grad_norm": 0.9232551334119852, "learning_rate": 1.901374038052809e-05, "loss": 0.1434, "step": 4833 }, { "epoch": 1.0102403343782653, "grad_norm": 1.1271855751975741, "learning_rate": 1.9013251799887635e-05, "loss": 0.1649, "step": 4834 }, { "epoch": 1.0104493207941483, "grad_norm": 1.1363666778087866, "learning_rate": 1.9012763104539306e-05, "loss": 0.1633, "step": 4835 }, { "epoch": 1.0106583072100312, "grad_norm": 1.1316626027618115, "learning_rate": 1.901227429448932e-05, "loss": 0.1767, "step": 4836 }, { "epoch": 1.0108672936259142, "grad_norm": 1.3652306401790333, "learning_rate": 1.9011785369743904e-05, "loss": 0.2124, "step": 4837 }, { "epoch": 1.0110762800417974, "grad_norm": 0.9388162607929809, "learning_rate": 1.9011296330309278e-05, "loss": 0.1218, "step": 4838 }, { "epoch": 1.0112852664576804, "grad_norm": 1.1228763109238782, "learning_rate": 1.9010807176191662e-05, "loss": 0.1995, "step": 4839 }, { "epoch": 1.0114942528735633, "grad_norm": 1.2067918589072535, "learning_rate": 1.9010317907397285e-05, "loss": 0.1793, "step": 4840 }, { "epoch": 1.0117032392894463, "grad_norm": 1.1461164013743124, "learning_rate": 1.9009828523932374e-05, "loss": 0.1623, "step": 4841 }, { "epoch": 1.0119122257053292, "grad_norm": 1.2150505668336051, "learning_rate": 1.900933902580316e-05, "loss": 0.1984, "step": 4842 }, { "epoch": 1.0121212121212122, "grad_norm": 1.1547267227896922, "learning_rate": 1.900884941301586e-05, "loss": 0.172, "step": 4843 }, { "epoch": 1.0123301985370952, "grad_norm": 1.0704352450066035, "learning_rate": 1.9008359685576718e-05, "loss": 0.1738, "step": 4844 }, { "epoch": 1.0125391849529781, "grad_norm": 1.2397446399653933, "learning_rate": 1.9007869843491963e-05, "loss": 0.1928, "step": 4845 }, { "epoch": 1.012748171368861, "grad_norm": 1.252517753137904, "learning_rate": 1.9007379886767828e-05, "loss": 0.1731, "step": 4846 }, { "epoch": 1.012957157784744, "grad_norm": 0.9129428965879101, "learning_rate": 1.900688981541055e-05, "loss": 0.1674, "step": 4847 }, { "epoch": 1.013166144200627, "grad_norm": 1.0658473111877869, "learning_rate": 1.9006399629426363e-05, "loss": 0.1733, "step": 4848 }, { "epoch": 1.01337513061651, "grad_norm": 0.9655288397548195, "learning_rate": 1.900590932882151e-05, "loss": 0.1781, "step": 4849 }, { "epoch": 1.013584117032393, "grad_norm": 1.199721633912074, "learning_rate": 1.9005418913602227e-05, "loss": 0.1473, "step": 4850 }, { "epoch": 1.013793103448276, "grad_norm": 1.068873428843445, "learning_rate": 1.900492838377476e-05, "loss": 0.2084, "step": 4851 }, { "epoch": 1.0140020898641589, "grad_norm": 1.1558817029343331, "learning_rate": 1.9004437739345344e-05, "loss": 0.1788, "step": 4852 }, { "epoch": 1.0142110762800418, "grad_norm": 1.1304448497481625, "learning_rate": 1.9003946980320224e-05, "loss": 0.1974, "step": 4853 }, { "epoch": 1.0144200626959248, "grad_norm": 1.1034095365885084, "learning_rate": 1.9003456106705655e-05, "loss": 0.1718, "step": 4854 }, { "epoch": 1.0146290491118077, "grad_norm": 1.3786680149556625, "learning_rate": 1.9002965118507877e-05, "loss": 0.1706, "step": 4855 }, { "epoch": 1.0148380355276907, "grad_norm": 1.0430922591094958, "learning_rate": 1.9002474015733138e-05, "loss": 0.1664, "step": 4856 }, { "epoch": 1.0150470219435737, "grad_norm": 1.0577939459834595, "learning_rate": 1.9001982798387692e-05, "loss": 0.1829, "step": 4857 }, { "epoch": 1.0152560083594566, "grad_norm": 1.2985299889028377, "learning_rate": 1.9001491466477788e-05, "loss": 0.1594, "step": 4858 }, { "epoch": 1.0154649947753396, "grad_norm": 0.9789987963272756, "learning_rate": 1.9001000020009683e-05, "loss": 0.172, "step": 4859 }, { "epoch": 1.0156739811912225, "grad_norm": 0.9117298889904238, "learning_rate": 1.9000508458989618e-05, "loss": 0.1882, "step": 4860 }, { "epoch": 1.0158829676071055, "grad_norm": 1.1316660495752124, "learning_rate": 1.9000016783423868e-05, "loss": 0.1829, "step": 4861 }, { "epoch": 1.0160919540229885, "grad_norm": 0.9758493830869054, "learning_rate": 1.899952499331868e-05, "loss": 0.165, "step": 4862 }, { "epoch": 1.0163009404388714, "grad_norm": 1.0631285808463906, "learning_rate": 1.899903308868031e-05, "loss": 0.1824, "step": 4863 }, { "epoch": 1.0165099268547544, "grad_norm": 1.0866682351455275, "learning_rate": 1.8998541069515025e-05, "loss": 0.1597, "step": 4864 }, { "epoch": 1.0167189132706373, "grad_norm": 1.0226930057366737, "learning_rate": 1.899804893582908e-05, "loss": 0.2047, "step": 4865 }, { "epoch": 1.0169278996865203, "grad_norm": 0.9883418811940818, "learning_rate": 1.8997556687628745e-05, "loss": 0.1671, "step": 4866 }, { "epoch": 1.0171368861024033, "grad_norm": 1.0214184415590815, "learning_rate": 1.8997064324920283e-05, "loss": 0.1427, "step": 4867 }, { "epoch": 1.0173458725182862, "grad_norm": 1.0495111525306526, "learning_rate": 1.8996571847709958e-05, "loss": 0.1803, "step": 4868 }, { "epoch": 1.0175548589341692, "grad_norm": 0.997987818571653, "learning_rate": 1.8996079256004035e-05, "loss": 0.1419, "step": 4869 }, { "epoch": 1.0177638453500522, "grad_norm": 1.3630748573648805, "learning_rate": 1.899558654980879e-05, "loss": 0.1655, "step": 4870 }, { "epoch": 1.0179728317659351, "grad_norm": 1.2040196964355236, "learning_rate": 1.899509372913049e-05, "loss": 0.1725, "step": 4871 }, { "epoch": 1.018181818181818, "grad_norm": 1.221777722935149, "learning_rate": 1.8994600793975406e-05, "loss": 0.1814, "step": 4872 }, { "epoch": 1.018390804597701, "grad_norm": 1.1982614692719447, "learning_rate": 1.8994107744349814e-05, "loss": 0.1884, "step": 4873 }, { "epoch": 1.0185997910135842, "grad_norm": 1.0584963747688123, "learning_rate": 1.899361458025998e-05, "loss": 0.1429, "step": 4874 }, { "epoch": 1.0188087774294672, "grad_norm": 1.0179869857425905, "learning_rate": 1.8993121301712194e-05, "loss": 0.1685, "step": 4875 }, { "epoch": 1.0190177638453501, "grad_norm": 1.1483268813426946, "learning_rate": 1.8992627908712723e-05, "loss": 0.1727, "step": 4876 }, { "epoch": 1.019226750261233, "grad_norm": 1.0747749471477581, "learning_rate": 1.8992134401267856e-05, "loss": 0.1551, "step": 4877 }, { "epoch": 1.019435736677116, "grad_norm": 1.168708628658916, "learning_rate": 1.899164077938386e-05, "loss": 0.163, "step": 4878 }, { "epoch": 1.019644723092999, "grad_norm": 1.1671242190056297, "learning_rate": 1.899114704306703e-05, "loss": 0.176, "step": 4879 }, { "epoch": 1.019853709508882, "grad_norm": 1.3572650503744716, "learning_rate": 1.8990653192323644e-05, "loss": 0.1737, "step": 4880 }, { "epoch": 1.020062695924765, "grad_norm": 1.0929484787913741, "learning_rate": 1.8990159227159987e-05, "loss": 0.2046, "step": 4881 }, { "epoch": 1.020271682340648, "grad_norm": 1.1219079010377087, "learning_rate": 1.8989665147582348e-05, "loss": 0.1545, "step": 4882 }, { "epoch": 1.0204806687565309, "grad_norm": 1.4262285499901834, "learning_rate": 1.8989170953597013e-05, "loss": 0.1698, "step": 4883 }, { "epoch": 1.0206896551724138, "grad_norm": 1.0664653111692592, "learning_rate": 1.898867664521027e-05, "loss": 0.1839, "step": 4884 }, { "epoch": 1.0208986415882968, "grad_norm": 1.1105217033981998, "learning_rate": 1.898818222242841e-05, "loss": 0.1687, "step": 4885 }, { "epoch": 1.0211076280041798, "grad_norm": 1.2536178390762458, "learning_rate": 1.8987687685257724e-05, "loss": 0.1804, "step": 4886 }, { "epoch": 1.0213166144200627, "grad_norm": 1.239693117608568, "learning_rate": 1.8987193033704513e-05, "loss": 0.1535, "step": 4887 }, { "epoch": 1.0215256008359457, "grad_norm": 1.0052273347913883, "learning_rate": 1.8986698267775067e-05, "loss": 0.1649, "step": 4888 }, { "epoch": 1.0217345872518286, "grad_norm": 1.0544363957184721, "learning_rate": 1.8986203387475685e-05, "loss": 0.1599, "step": 4889 }, { "epoch": 1.0219435736677116, "grad_norm": 1.2026168321866897, "learning_rate": 1.898570839281266e-05, "loss": 0.2, "step": 4890 }, { "epoch": 1.0221525600835946, "grad_norm": 1.2486998343930844, "learning_rate": 1.8985213283792295e-05, "loss": 0.1602, "step": 4891 }, { "epoch": 1.0223615464994775, "grad_norm": 1.123515035600721, "learning_rate": 1.8984718060420894e-05, "loss": 0.1841, "step": 4892 }, { "epoch": 1.0225705329153605, "grad_norm": 1.1451748517058868, "learning_rate": 1.8984222722704754e-05, "loss": 0.1457, "step": 4893 }, { "epoch": 1.0227795193312434, "grad_norm": 1.0694320069358607, "learning_rate": 1.8983727270650184e-05, "loss": 0.1877, "step": 4894 }, { "epoch": 1.0229885057471264, "grad_norm": 1.0282259862354648, "learning_rate": 1.8983231704263484e-05, "loss": 0.1414, "step": 4895 }, { "epoch": 1.0231974921630094, "grad_norm": 0.9827704497916844, "learning_rate": 1.8982736023550964e-05, "loss": 0.1656, "step": 4896 }, { "epoch": 1.0234064785788923, "grad_norm": 1.213444273584166, "learning_rate": 1.8982240228518934e-05, "loss": 0.1637, "step": 4897 }, { "epoch": 1.0236154649947753, "grad_norm": 1.3371983288812952, "learning_rate": 1.8981744319173702e-05, "loss": 0.1937, "step": 4898 }, { "epoch": 1.0238244514106583, "grad_norm": 1.028734624142091, "learning_rate": 1.8981248295521577e-05, "loss": 0.1628, "step": 4899 }, { "epoch": 1.0240334378265412, "grad_norm": 0.9868261912512569, "learning_rate": 1.8980752157568875e-05, "loss": 0.1797, "step": 4900 }, { "epoch": 1.0242424242424242, "grad_norm": 1.1298444542355985, "learning_rate": 1.8980255905321908e-05, "loss": 0.176, "step": 4901 }, { "epoch": 1.0244514106583071, "grad_norm": 1.1406004561410803, "learning_rate": 1.8979759538786994e-05, "loss": 0.1879, "step": 4902 }, { "epoch": 1.02466039707419, "grad_norm": 0.968741817085483, "learning_rate": 1.897926305797045e-05, "loss": 0.1563, "step": 4903 }, { "epoch": 1.024869383490073, "grad_norm": 1.1712199758055688, "learning_rate": 1.8978766462878588e-05, "loss": 0.2119, "step": 4904 }, { "epoch": 1.025078369905956, "grad_norm": 1.0578988829290237, "learning_rate": 1.8978269753517734e-05, "loss": 0.1519, "step": 4905 }, { "epoch": 1.025287356321839, "grad_norm": 1.0606877520565636, "learning_rate": 1.8977772929894213e-05, "loss": 0.1564, "step": 4906 }, { "epoch": 1.025496342737722, "grad_norm": 1.1627232386062039, "learning_rate": 1.8977275992014343e-05, "loss": 0.1692, "step": 4907 }, { "epoch": 1.0257053291536051, "grad_norm": 1.2728143630982571, "learning_rate": 1.8976778939884442e-05, "loss": 0.1993, "step": 4908 }, { "epoch": 1.025914315569488, "grad_norm": 1.1371311389803243, "learning_rate": 1.8976281773510848e-05, "loss": 0.1678, "step": 4909 }, { "epoch": 1.026123301985371, "grad_norm": 1.0402074539083241, "learning_rate": 1.897578449289988e-05, "loss": 0.1814, "step": 4910 }, { "epoch": 1.026332288401254, "grad_norm": 1.0756850078086904, "learning_rate": 1.897528709805787e-05, "loss": 0.1953, "step": 4911 }, { "epoch": 1.026541274817137, "grad_norm": 1.137244583182895, "learning_rate": 1.897478958899115e-05, "loss": 0.1362, "step": 4912 }, { "epoch": 1.02675026123302, "grad_norm": 0.9449010207447076, "learning_rate": 1.8974291965706045e-05, "loss": 0.1389, "step": 4913 }, { "epoch": 1.026959247648903, "grad_norm": 1.0422204906836825, "learning_rate": 1.8973794228208895e-05, "loss": 0.1983, "step": 4914 }, { "epoch": 1.0271682340647859, "grad_norm": 1.3193207918121965, "learning_rate": 1.897329637650603e-05, "loss": 0.2159, "step": 4915 }, { "epoch": 1.0273772204806688, "grad_norm": 1.0173513603574542, "learning_rate": 1.897279841060379e-05, "loss": 0.1709, "step": 4916 }, { "epoch": 1.0275862068965518, "grad_norm": 1.0595977996384334, "learning_rate": 1.8972300330508507e-05, "loss": 0.1456, "step": 4917 }, { "epoch": 1.0277951933124347, "grad_norm": 1.1587413795450374, "learning_rate": 1.8971802136226522e-05, "loss": 0.1604, "step": 4918 }, { "epoch": 1.0280041797283177, "grad_norm": 0.9901324263913329, "learning_rate": 1.8971303827764178e-05, "loss": 0.1366, "step": 4919 }, { "epoch": 1.0282131661442007, "grad_norm": 1.4728884617133444, "learning_rate": 1.8970805405127814e-05, "loss": 0.1772, "step": 4920 }, { "epoch": 1.0284221525600836, "grad_norm": 1.0539659816277418, "learning_rate": 1.8970306868323774e-05, "loss": 0.203, "step": 4921 }, { "epoch": 1.0286311389759666, "grad_norm": 0.9534874448111432, "learning_rate": 1.8969808217358406e-05, "loss": 0.1639, "step": 4922 }, { "epoch": 1.0288401253918495, "grad_norm": 1.1925754514703488, "learning_rate": 1.896930945223805e-05, "loss": 0.1982, "step": 4923 }, { "epoch": 1.0290491118077325, "grad_norm": 1.1888366132275006, "learning_rate": 1.8968810572969055e-05, "loss": 0.1983, "step": 4924 }, { "epoch": 1.0292580982236155, "grad_norm": 1.1296600677832522, "learning_rate": 1.8968311579557774e-05, "loss": 0.1955, "step": 4925 }, { "epoch": 1.0294670846394984, "grad_norm": 1.070053196935171, "learning_rate": 1.8967812472010554e-05, "loss": 0.1658, "step": 4926 }, { "epoch": 1.0296760710553814, "grad_norm": 0.9929765332761273, "learning_rate": 1.8967313250333746e-05, "loss": 0.1655, "step": 4927 }, { "epoch": 1.0298850574712644, "grad_norm": 1.0713862299411163, "learning_rate": 1.896681391453371e-05, "loss": 0.1757, "step": 4928 }, { "epoch": 1.0300940438871473, "grad_norm": 1.3296904526859674, "learning_rate": 1.8966314464616792e-05, "loss": 0.1748, "step": 4929 }, { "epoch": 1.0303030303030303, "grad_norm": 1.1470197359696286, "learning_rate": 1.8965814900589352e-05, "loss": 0.1696, "step": 4930 }, { "epoch": 1.0305120167189132, "grad_norm": 1.1867649883384999, "learning_rate": 1.896531522245775e-05, "loss": 0.1942, "step": 4931 }, { "epoch": 1.0307210031347962, "grad_norm": 1.060807922003531, "learning_rate": 1.8964815430228345e-05, "loss": 0.1696, "step": 4932 }, { "epoch": 1.0309299895506792, "grad_norm": 0.9628944284139158, "learning_rate": 1.896431552390749e-05, "loss": 0.1697, "step": 4933 }, { "epoch": 1.0311389759665621, "grad_norm": 1.2288750757368239, "learning_rate": 1.896381550350156e-05, "loss": 0.1724, "step": 4934 }, { "epoch": 1.031347962382445, "grad_norm": 1.2010695557563031, "learning_rate": 1.896331536901691e-05, "loss": 0.173, "step": 4935 }, { "epoch": 1.031556948798328, "grad_norm": 0.9555167279852844, "learning_rate": 1.8962815120459905e-05, "loss": 0.1448, "step": 4936 }, { "epoch": 1.031765935214211, "grad_norm": 1.081955602139831, "learning_rate": 1.896231475783691e-05, "loss": 0.1705, "step": 4937 }, { "epoch": 1.031974921630094, "grad_norm": 1.2377736984768637, "learning_rate": 1.89618142811543e-05, "loss": 0.1841, "step": 4938 }, { "epoch": 1.032183908045977, "grad_norm": 1.1093833050679305, "learning_rate": 1.896131369041844e-05, "loss": 0.1796, "step": 4939 }, { "epoch": 1.03239289446186, "grad_norm": 1.0586132813254339, "learning_rate": 1.8960812985635702e-05, "loss": 0.1666, "step": 4940 }, { "epoch": 1.0326018808777429, "grad_norm": 1.5240808519239561, "learning_rate": 1.8960312166812454e-05, "loss": 0.1594, "step": 4941 }, { "epoch": 1.0328108672936258, "grad_norm": 1.1072855214018091, "learning_rate": 1.8959811233955075e-05, "loss": 0.1711, "step": 4942 }, { "epoch": 1.0330198537095088, "grad_norm": 1.1381041811307806, "learning_rate": 1.895931018706994e-05, "loss": 0.1621, "step": 4943 }, { "epoch": 1.033228840125392, "grad_norm": 1.110809956027875, "learning_rate": 1.895880902616342e-05, "loss": 0.1953, "step": 4944 }, { "epoch": 1.033437826541275, "grad_norm": 1.0011599896477557, "learning_rate": 1.89583077512419e-05, "loss": 0.1561, "step": 4945 }, { "epoch": 1.0336468129571579, "grad_norm": 1.1425547992016505, "learning_rate": 1.8957806362311756e-05, "loss": 0.1712, "step": 4946 }, { "epoch": 1.0338557993730408, "grad_norm": 1.0755447021816056, "learning_rate": 1.8957304859379367e-05, "loss": 0.2109, "step": 4947 }, { "epoch": 1.0340647857889238, "grad_norm": 1.1487882646665317, "learning_rate": 1.8956803242451122e-05, "loss": 0.2018, "step": 4948 }, { "epoch": 1.0342737722048068, "grad_norm": 1.0393143165248966, "learning_rate": 1.8956301511533398e-05, "loss": 0.1747, "step": 4949 }, { "epoch": 1.0344827586206897, "grad_norm": 1.0467080558131, "learning_rate": 1.8955799666632585e-05, "loss": 0.158, "step": 4950 }, { "epoch": 1.0346917450365727, "grad_norm": 1.10598501963982, "learning_rate": 1.8955297707755065e-05, "loss": 0.1692, "step": 4951 }, { "epoch": 1.0349007314524556, "grad_norm": 1.3528039506951035, "learning_rate": 1.895479563490723e-05, "loss": 0.1911, "step": 4952 }, { "epoch": 1.0351097178683386, "grad_norm": 1.0560998829524901, "learning_rate": 1.895429344809547e-05, "loss": 0.1833, "step": 4953 }, { "epoch": 1.0353187042842216, "grad_norm": 1.1942571596518101, "learning_rate": 1.8953791147326172e-05, "loss": 0.1548, "step": 4954 }, { "epoch": 1.0355276907001045, "grad_norm": 1.0740846937843092, "learning_rate": 1.8953288732605736e-05, "loss": 0.193, "step": 4955 }, { "epoch": 1.0357366771159875, "grad_norm": 1.040592550006125, "learning_rate": 1.8952786203940547e-05, "loss": 0.1576, "step": 4956 }, { "epoch": 1.0359456635318705, "grad_norm": 1.0325426077401096, "learning_rate": 1.8952283561337006e-05, "loss": 0.1716, "step": 4957 }, { "epoch": 1.0361546499477534, "grad_norm": 1.0827517631998307, "learning_rate": 1.895178080480151e-05, "loss": 0.1806, "step": 4958 }, { "epoch": 1.0363636363636364, "grad_norm": 1.1826008024048755, "learning_rate": 1.8951277934340455e-05, "loss": 0.167, "step": 4959 }, { "epoch": 1.0365726227795193, "grad_norm": 1.245778104241849, "learning_rate": 1.895077494996024e-05, "loss": 0.1647, "step": 4960 }, { "epoch": 1.0367816091954023, "grad_norm": 1.105528217102194, "learning_rate": 1.8950271851667272e-05, "loss": 0.1936, "step": 4961 }, { "epoch": 1.0369905956112853, "grad_norm": 1.0204735894830403, "learning_rate": 1.894976863946795e-05, "loss": 0.1509, "step": 4962 }, { "epoch": 1.0371995820271682, "grad_norm": 1.0345886918352833, "learning_rate": 1.8949265313368678e-05, "loss": 0.1787, "step": 4963 }, { "epoch": 1.0374085684430512, "grad_norm": 1.0723552765913735, "learning_rate": 1.894876187337586e-05, "loss": 0.1859, "step": 4964 }, { "epoch": 1.0376175548589341, "grad_norm": 0.9874898572806, "learning_rate": 1.8948258319495906e-05, "loss": 0.1702, "step": 4965 }, { "epoch": 1.037826541274817, "grad_norm": 1.2095824002939384, "learning_rate": 1.8947754651735223e-05, "loss": 0.1564, "step": 4966 }, { "epoch": 1.0380355276907, "grad_norm": 1.1433999981208476, "learning_rate": 1.8947250870100216e-05, "loss": 0.1924, "step": 4967 }, { "epoch": 1.038244514106583, "grad_norm": 1.1261735519649836, "learning_rate": 1.894674697459731e-05, "loss": 0.1858, "step": 4968 }, { "epoch": 1.038453500522466, "grad_norm": 1.0696378423041963, "learning_rate": 1.8946242965232905e-05, "loss": 0.1734, "step": 4969 }, { "epoch": 1.038662486938349, "grad_norm": 1.1285178765936896, "learning_rate": 1.8945738842013424e-05, "loss": 0.1672, "step": 4970 }, { "epoch": 1.038871473354232, "grad_norm": 1.018392322804295, "learning_rate": 1.8945234604945275e-05, "loss": 0.139, "step": 4971 }, { "epoch": 1.0390804597701149, "grad_norm": 1.2976568691349348, "learning_rate": 1.8944730254034878e-05, "loss": 0.2248, "step": 4972 }, { "epoch": 1.0392894461859978, "grad_norm": 0.9090395453100795, "learning_rate": 1.8944225789288654e-05, "loss": 0.1708, "step": 4973 }, { "epoch": 1.0394984326018808, "grad_norm": 1.0514589013279398, "learning_rate": 1.8943721210713026e-05, "loss": 0.1865, "step": 4974 }, { "epoch": 1.0397074190177638, "grad_norm": 0.972022960884242, "learning_rate": 1.8943216518314408e-05, "loss": 0.1776, "step": 4975 }, { "epoch": 1.0399164054336467, "grad_norm": 1.0051678640562165, "learning_rate": 1.894271171209922e-05, "loss": 0.1944, "step": 4976 }, { "epoch": 1.0401253918495297, "grad_norm": 1.1637606769633477, "learning_rate": 1.89422067920739e-05, "loss": 0.2004, "step": 4977 }, { "epoch": 1.0403343782654129, "grad_norm": 1.2391931726735272, "learning_rate": 1.8941701758244865e-05, "loss": 0.1826, "step": 4978 }, { "epoch": 1.0405433646812958, "grad_norm": 1.0975526725262577, "learning_rate": 1.8941196610618547e-05, "loss": 0.1703, "step": 4979 }, { "epoch": 1.0407523510971788, "grad_norm": 1.0804025884982147, "learning_rate": 1.8940691349201366e-05, "loss": 0.1881, "step": 4980 }, { "epoch": 1.0409613375130617, "grad_norm": 0.9727517645495374, "learning_rate": 1.894018597399976e-05, "loss": 0.1406, "step": 4981 }, { "epoch": 1.0411703239289447, "grad_norm": 1.2041880634767823, "learning_rate": 1.893968048502016e-05, "loss": 0.1736, "step": 4982 }, { "epoch": 1.0413793103448277, "grad_norm": 1.08176189217516, "learning_rate": 1.8939174882268998e-05, "loss": 0.1546, "step": 4983 }, { "epoch": 1.0415882967607106, "grad_norm": 1.0238419797099365, "learning_rate": 1.8938669165752707e-05, "loss": 0.1581, "step": 4984 }, { "epoch": 1.0417972831765936, "grad_norm": 1.0029521040780573, "learning_rate": 1.8938163335477722e-05, "loss": 0.1796, "step": 4985 }, { "epoch": 1.0420062695924766, "grad_norm": 0.9441296456058562, "learning_rate": 1.893765739145049e-05, "loss": 0.1508, "step": 4986 }, { "epoch": 1.0422152560083595, "grad_norm": 1.0002269556734458, "learning_rate": 1.893715133367744e-05, "loss": 0.1619, "step": 4987 }, { "epoch": 1.0424242424242425, "grad_norm": 1.0599312107706516, "learning_rate": 1.8936645162165014e-05, "loss": 0.177, "step": 4988 }, { "epoch": 1.0426332288401254, "grad_norm": 1.1534647241543452, "learning_rate": 1.8936138876919655e-05, "loss": 0.1995, "step": 4989 }, { "epoch": 1.0428422152560084, "grad_norm": 1.1414846710225444, "learning_rate": 1.8935632477947807e-05, "loss": 0.1904, "step": 4990 }, { "epoch": 1.0430512016718914, "grad_norm": 1.2588868601719723, "learning_rate": 1.893512596525591e-05, "loss": 0.192, "step": 4991 }, { "epoch": 1.0432601880877743, "grad_norm": 1.0060889520350207, "learning_rate": 1.8934619338850423e-05, "loss": 0.144, "step": 4992 }, { "epoch": 1.0434691745036573, "grad_norm": 1.0047771993041, "learning_rate": 1.8934112598737777e-05, "loss": 0.1995, "step": 4993 }, { "epoch": 1.0436781609195402, "grad_norm": 1.128566567725031, "learning_rate": 1.8933605744924435e-05, "loss": 0.1868, "step": 4994 }, { "epoch": 1.0438871473354232, "grad_norm": 1.2181623481666224, "learning_rate": 1.893309877741684e-05, "loss": 0.1852, "step": 4995 }, { "epoch": 1.0440961337513062, "grad_norm": 1.2737133107853449, "learning_rate": 1.893259169622144e-05, "loss": 0.1773, "step": 4996 }, { "epoch": 1.0443051201671891, "grad_norm": 1.027433360049219, "learning_rate": 1.8932084501344697e-05, "loss": 0.163, "step": 4997 }, { "epoch": 1.044514106583072, "grad_norm": 1.029931593349492, "learning_rate": 1.8931577192793068e-05, "loss": 0.1762, "step": 4998 }, { "epoch": 1.044723092998955, "grad_norm": 1.178472817787803, "learning_rate": 1.8931069770573e-05, "loss": 0.1804, "step": 4999 }, { "epoch": 1.044932079414838, "grad_norm": 0.9618278635592769, "learning_rate": 1.8930562234690953e-05, "loss": 0.1714, "step": 5000 }, { "epoch": 1.045141065830721, "grad_norm": 0.9282910010837608, "learning_rate": 1.893005458515339e-05, "loss": 0.1679, "step": 5001 }, { "epoch": 1.045350052246604, "grad_norm": 1.1569975628790499, "learning_rate": 1.892954682196677e-05, "loss": 0.1825, "step": 5002 }, { "epoch": 1.045559038662487, "grad_norm": 1.0506278096506512, "learning_rate": 1.8929038945137552e-05, "loss": 0.1733, "step": 5003 }, { "epoch": 1.0457680250783699, "grad_norm": 1.0592222129049675, "learning_rate": 1.8928530954672206e-05, "loss": 0.158, "step": 5004 }, { "epoch": 1.0459770114942528, "grad_norm": 1.0899260151231807, "learning_rate": 1.892802285057719e-05, "loss": 0.1712, "step": 5005 }, { "epoch": 1.0461859979101358, "grad_norm": 0.973520613127191, "learning_rate": 1.8927514632858975e-05, "loss": 0.1837, "step": 5006 }, { "epoch": 1.0463949843260187, "grad_norm": 1.046483690062035, "learning_rate": 1.892700630152403e-05, "loss": 0.1557, "step": 5007 }, { "epoch": 1.0466039707419017, "grad_norm": 1.1556209819539711, "learning_rate": 1.892649785657882e-05, "loss": 0.1396, "step": 5008 }, { "epoch": 1.0468129571577847, "grad_norm": 1.2097458008280404, "learning_rate": 1.8925989298029818e-05, "loss": 0.186, "step": 5009 }, { "epoch": 1.0470219435736676, "grad_norm": 1.291031494928392, "learning_rate": 1.8925480625883496e-05, "loss": 0.1667, "step": 5010 }, { "epoch": 1.0472309299895506, "grad_norm": 0.9735037431042904, "learning_rate": 1.8924971840146325e-05, "loss": 0.1794, "step": 5011 }, { "epoch": 1.0474399164054335, "grad_norm": 1.1144834602029137, "learning_rate": 1.8924462940824784e-05, "loss": 0.165, "step": 5012 }, { "epoch": 1.0476489028213165, "grad_norm": 1.1375728127603726, "learning_rate": 1.8923953927925348e-05, "loss": 0.1783, "step": 5013 }, { "epoch": 1.0478578892371995, "grad_norm": 1.1356904555409093, "learning_rate": 1.8923444801454493e-05, "loss": 0.1649, "step": 5014 }, { "epoch": 1.0480668756530827, "grad_norm": 1.0500569323583175, "learning_rate": 1.8922935561418706e-05, "loss": 0.1803, "step": 5015 }, { "epoch": 1.0482758620689656, "grad_norm": 1.114648398992298, "learning_rate": 1.892242620782446e-05, "loss": 0.1504, "step": 5016 }, { "epoch": 1.0484848484848486, "grad_norm": 1.275575629449366, "learning_rate": 1.892191674067824e-05, "loss": 0.1698, "step": 5017 }, { "epoch": 1.0486938349007315, "grad_norm": 0.990829053510312, "learning_rate": 1.8921407159986527e-05, "loss": 0.1808, "step": 5018 }, { "epoch": 1.0489028213166145, "grad_norm": 1.0222341888804163, "learning_rate": 1.892089746575581e-05, "loss": 0.1518, "step": 5019 }, { "epoch": 1.0491118077324975, "grad_norm": 1.0692671355472663, "learning_rate": 1.8920387657992575e-05, "loss": 0.1684, "step": 5020 }, { "epoch": 1.0493207941483804, "grad_norm": 1.132964442591474, "learning_rate": 1.891987773670331e-05, "loss": 0.1747, "step": 5021 }, { "epoch": 1.0495297805642634, "grad_norm": 1.1199736602771084, "learning_rate": 1.8919367701894506e-05, "loss": 0.1451, "step": 5022 }, { "epoch": 1.0497387669801463, "grad_norm": 1.0747704775424216, "learning_rate": 1.891885755357265e-05, "loss": 0.1528, "step": 5023 }, { "epoch": 1.0499477533960293, "grad_norm": 1.079596309516256, "learning_rate": 1.8918347291744236e-05, "loss": 0.1735, "step": 5024 }, { "epoch": 1.0501567398119123, "grad_norm": 1.0598046385588153, "learning_rate": 1.891783691641576e-05, "loss": 0.152, "step": 5025 }, { "epoch": 1.0503657262277952, "grad_norm": 1.0170116730511933, "learning_rate": 1.8917326427593715e-05, "loss": 0.1453, "step": 5026 }, { "epoch": 1.0505747126436782, "grad_norm": 1.0534653487010448, "learning_rate": 1.8916815825284596e-05, "loss": 0.1793, "step": 5027 }, { "epoch": 1.0507836990595611, "grad_norm": 1.1529119658047131, "learning_rate": 1.8916305109494906e-05, "loss": 0.1758, "step": 5028 }, { "epoch": 1.050992685475444, "grad_norm": 0.9346331843544665, "learning_rate": 1.8915794280231143e-05, "loss": 0.1399, "step": 5029 }, { "epoch": 1.051201671891327, "grad_norm": 0.9931129286690872, "learning_rate": 1.891528333749981e-05, "loss": 0.1722, "step": 5030 }, { "epoch": 1.05141065830721, "grad_norm": 1.0112867902381293, "learning_rate": 1.8914772281307405e-05, "loss": 0.1383, "step": 5031 }, { "epoch": 1.051619644723093, "grad_norm": 0.9219476682425206, "learning_rate": 1.891426111166043e-05, "loss": 0.1626, "step": 5032 }, { "epoch": 1.051828631138976, "grad_norm": 1.0998649525115498, "learning_rate": 1.89137498285654e-05, "loss": 0.1668, "step": 5033 }, { "epoch": 1.052037617554859, "grad_norm": 1.0236884927532346, "learning_rate": 1.8913238432028814e-05, "loss": 0.1622, "step": 5034 }, { "epoch": 1.0522466039707419, "grad_norm": 1.0671774019725682, "learning_rate": 1.891272692205718e-05, "loss": 0.1385, "step": 5035 }, { "epoch": 1.0524555903866248, "grad_norm": 1.0429309605817487, "learning_rate": 1.8912215298657017e-05, "loss": 0.1552, "step": 5036 }, { "epoch": 1.0526645768025078, "grad_norm": 1.0339284960770485, "learning_rate": 1.8911703561834825e-05, "loss": 0.1423, "step": 5037 }, { "epoch": 1.0528735632183908, "grad_norm": 1.0000888866393305, "learning_rate": 1.891119171159712e-05, "loss": 0.1638, "step": 5038 }, { "epoch": 1.0530825496342737, "grad_norm": 1.1634065249254208, "learning_rate": 1.8910679747950422e-05, "loss": 0.1789, "step": 5039 }, { "epoch": 1.0532915360501567, "grad_norm": 1.030689496886106, "learning_rate": 1.8910167670901242e-05, "loss": 0.2062, "step": 5040 }, { "epoch": 1.0535005224660396, "grad_norm": 1.0102985355939111, "learning_rate": 1.8909655480456095e-05, "loss": 0.205, "step": 5041 }, { "epoch": 1.0537095088819226, "grad_norm": 1.1429801123157213, "learning_rate": 1.89091431766215e-05, "loss": 0.1997, "step": 5042 }, { "epoch": 1.0539184952978056, "grad_norm": 1.1706355221441944, "learning_rate": 1.890863075940398e-05, "loss": 0.1902, "step": 5043 }, { "epoch": 1.0541274817136885, "grad_norm": 1.1471668496854546, "learning_rate": 1.8908118228810056e-05, "loss": 0.1921, "step": 5044 }, { "epoch": 1.0543364681295715, "grad_norm": 1.0550317924036847, "learning_rate": 1.890760558484625e-05, "loss": 0.1732, "step": 5045 }, { "epoch": 1.0545454545454545, "grad_norm": 1.0563262431133107, "learning_rate": 1.8907092827519082e-05, "loss": 0.1537, "step": 5046 }, { "epoch": 1.0547544409613374, "grad_norm": 1.0801439229819532, "learning_rate": 1.8906579956835084e-05, "loss": 0.1465, "step": 5047 }, { "epoch": 1.0549634273772204, "grad_norm": 0.989267600251188, "learning_rate": 1.890606697280078e-05, "loss": 0.1921, "step": 5048 }, { "epoch": 1.0551724137931036, "grad_norm": 0.9698435774721728, "learning_rate": 1.89055538754227e-05, "loss": 0.1609, "step": 5049 }, { "epoch": 1.0553814002089865, "grad_norm": 1.0838236982640537, "learning_rate": 1.890504066470737e-05, "loss": 0.2081, "step": 5050 }, { "epoch": 1.0555903866248695, "grad_norm": 1.085562454281446, "learning_rate": 1.8904527340661326e-05, "loss": 0.1761, "step": 5051 }, { "epoch": 1.0557993730407524, "grad_norm": 0.987720730777898, "learning_rate": 1.89040139032911e-05, "loss": 0.1643, "step": 5052 }, { "epoch": 1.0560083594566354, "grad_norm": 1.2282811613736988, "learning_rate": 1.8903500352603224e-05, "loss": 0.1757, "step": 5053 }, { "epoch": 1.0562173458725184, "grad_norm": 1.1046778289057246, "learning_rate": 1.8902986688604237e-05, "loss": 0.1802, "step": 5054 }, { "epoch": 1.0564263322884013, "grad_norm": 1.2887104069880007, "learning_rate": 1.890247291130067e-05, "loss": 0.192, "step": 5055 }, { "epoch": 1.0566353187042843, "grad_norm": 1.0785221273113466, "learning_rate": 1.8901959020699074e-05, "loss": 0.1779, "step": 5056 }, { "epoch": 1.0568443051201672, "grad_norm": 1.1747625373464974, "learning_rate": 1.8901445016805972e-05, "loss": 0.1701, "step": 5057 }, { "epoch": 1.0570532915360502, "grad_norm": 0.8555646174610935, "learning_rate": 1.890093089962792e-05, "loss": 0.1757, "step": 5058 }, { "epoch": 1.0572622779519332, "grad_norm": 1.0137052200817147, "learning_rate": 1.8900416669171454e-05, "loss": 0.1586, "step": 5059 }, { "epoch": 1.0574712643678161, "grad_norm": 1.174212687779425, "learning_rate": 1.889990232544312e-05, "loss": 0.1807, "step": 5060 }, { "epoch": 1.057680250783699, "grad_norm": 0.8958557226190583, "learning_rate": 1.8899387868449466e-05, "loss": 0.1435, "step": 5061 }, { "epoch": 1.057889237199582, "grad_norm": 1.1846053012955666, "learning_rate": 1.8898873298197033e-05, "loss": 0.1604, "step": 5062 }, { "epoch": 1.058098223615465, "grad_norm": 1.2365457376997433, "learning_rate": 1.8898358614692376e-05, "loss": 0.1656, "step": 5063 }, { "epoch": 1.058307210031348, "grad_norm": 1.1371536141875225, "learning_rate": 1.8897843817942046e-05, "loss": 0.1747, "step": 5064 }, { "epoch": 1.058516196447231, "grad_norm": 1.0720187704607609, "learning_rate": 1.8897328907952586e-05, "loss": 0.1493, "step": 5065 }, { "epoch": 1.058725182863114, "grad_norm": 0.9649975141115661, "learning_rate": 1.889681388473056e-05, "loss": 0.151, "step": 5066 }, { "epoch": 1.0589341692789969, "grad_norm": 1.1414774470576765, "learning_rate": 1.889629874828251e-05, "loss": 0.1782, "step": 5067 }, { "epoch": 1.0591431556948798, "grad_norm": 1.140127222680162, "learning_rate": 1.8895783498615007e-05, "loss": 0.1908, "step": 5068 }, { "epoch": 1.0593521421107628, "grad_norm": 1.0981906368941143, "learning_rate": 1.8895268135734595e-05, "loss": 0.1794, "step": 5069 }, { "epoch": 1.0595611285266457, "grad_norm": 0.9567745894582972, "learning_rate": 1.889475265964784e-05, "loss": 0.1429, "step": 5070 }, { "epoch": 1.0597701149425287, "grad_norm": 1.0712011398081047, "learning_rate": 1.88942370703613e-05, "loss": 0.209, "step": 5071 }, { "epoch": 1.0599791013584117, "grad_norm": 1.233574601359692, "learning_rate": 1.8893721367881536e-05, "loss": 0.1766, "step": 5072 }, { "epoch": 1.0601880877742946, "grad_norm": 0.9190506354009201, "learning_rate": 1.8893205552215115e-05, "loss": 0.1509, "step": 5073 }, { "epoch": 1.0603970741901776, "grad_norm": 1.1420818334734812, "learning_rate": 1.8892689623368598e-05, "loss": 0.1615, "step": 5074 }, { "epoch": 1.0606060606060606, "grad_norm": 1.0451362542544096, "learning_rate": 1.8892173581348552e-05, "loss": 0.1658, "step": 5075 }, { "epoch": 1.0608150470219435, "grad_norm": 1.0990305198557047, "learning_rate": 1.8891657426161544e-05, "loss": 0.1523, "step": 5076 }, { "epoch": 1.0610240334378265, "grad_norm": 1.0804264477258267, "learning_rate": 1.8891141157814142e-05, "loss": 0.1713, "step": 5077 }, { "epoch": 1.0612330198537094, "grad_norm": 1.0882446420122882, "learning_rate": 1.8890624776312917e-05, "loss": 0.1831, "step": 5078 }, { "epoch": 1.0614420062695924, "grad_norm": 1.2302819546747232, "learning_rate": 1.8890108281664444e-05, "loss": 0.1803, "step": 5079 }, { "epoch": 1.0616509926854754, "grad_norm": 1.2604424942686498, "learning_rate": 1.888959167387529e-05, "loss": 0.1561, "step": 5080 }, { "epoch": 1.0618599791013583, "grad_norm": 1.255090238524676, "learning_rate": 1.8889074952952036e-05, "loss": 0.1514, "step": 5081 }, { "epoch": 1.0620689655172413, "grad_norm": 1.4307859327708563, "learning_rate": 1.8888558118901253e-05, "loss": 0.1896, "step": 5082 }, { "epoch": 1.0622779519331242, "grad_norm": 1.1034739372024138, "learning_rate": 1.8888041171729525e-05, "loss": 0.1739, "step": 5083 }, { "epoch": 1.0624869383490072, "grad_norm": 0.8339524705311946, "learning_rate": 1.888752411144342e-05, "loss": 0.1415, "step": 5084 }, { "epoch": 1.0626959247648902, "grad_norm": 0.9945680938558294, "learning_rate": 1.888700693804953e-05, "loss": 0.1655, "step": 5085 }, { "epoch": 1.0629049111807733, "grad_norm": 1.0024116016871913, "learning_rate": 1.8886489651554432e-05, "loss": 0.173, "step": 5086 }, { "epoch": 1.0631138975966563, "grad_norm": 1.484217569505605, "learning_rate": 1.8885972251964707e-05, "loss": 0.1932, "step": 5087 }, { "epoch": 1.0633228840125393, "grad_norm": 1.0031176014228878, "learning_rate": 1.8885454739286943e-05, "loss": 0.181, "step": 5088 }, { "epoch": 1.0635318704284222, "grad_norm": 1.3490989961507907, "learning_rate": 1.8884937113527725e-05, "loss": 0.1551, "step": 5089 }, { "epoch": 1.0637408568443052, "grad_norm": 1.3930165631123117, "learning_rate": 1.8884419374693644e-05, "loss": 0.1676, "step": 5090 }, { "epoch": 1.0639498432601882, "grad_norm": 1.1270440427374204, "learning_rate": 1.888390152279128e-05, "loss": 0.1948, "step": 5091 }, { "epoch": 1.0641588296760711, "grad_norm": 1.164236309773992, "learning_rate": 1.8883383557827232e-05, "loss": 0.2051, "step": 5092 }, { "epoch": 1.064367816091954, "grad_norm": 1.0744911672095967, "learning_rate": 1.888286547980809e-05, "loss": 0.1391, "step": 5093 }, { "epoch": 1.064576802507837, "grad_norm": 1.0070388321202528, "learning_rate": 1.8882347288740446e-05, "loss": 0.1565, "step": 5094 }, { "epoch": 1.06478578892372, "grad_norm": 1.4148038904268503, "learning_rate": 1.8881828984630894e-05, "loss": 0.2014, "step": 5095 }, { "epoch": 1.064994775339603, "grad_norm": 1.0080587542832848, "learning_rate": 1.8881310567486032e-05, "loss": 0.1378, "step": 5096 }, { "epoch": 1.065203761755486, "grad_norm": 1.060319860075158, "learning_rate": 1.888079203731246e-05, "loss": 0.1745, "step": 5097 }, { "epoch": 1.0654127481713689, "grad_norm": 1.0521551096634119, "learning_rate": 1.8880273394116772e-05, "loss": 0.1617, "step": 5098 }, { "epoch": 1.0656217345872518, "grad_norm": 0.9694247706361917, "learning_rate": 1.887975463790557e-05, "loss": 0.1683, "step": 5099 }, { "epoch": 1.0658307210031348, "grad_norm": 1.0812228006652298, "learning_rate": 1.8879235768685462e-05, "loss": 0.1309, "step": 5100 }, { "epoch": 1.0660397074190178, "grad_norm": 1.4621675501746931, "learning_rate": 1.8878716786463044e-05, "loss": 0.1665, "step": 5101 }, { "epoch": 1.0662486938349007, "grad_norm": 1.1674015706974454, "learning_rate": 1.8878197691244923e-05, "loss": 0.1636, "step": 5102 }, { "epoch": 1.0664576802507837, "grad_norm": 0.9410256700070303, "learning_rate": 1.8877678483037703e-05, "loss": 0.1359, "step": 5103 }, { "epoch": 1.0666666666666667, "grad_norm": 1.0831154184513205, "learning_rate": 1.8877159161847997e-05, "loss": 0.1907, "step": 5104 }, { "epoch": 1.0668756530825496, "grad_norm": 1.0837125158975676, "learning_rate": 1.8876639727682412e-05, "loss": 0.1934, "step": 5105 }, { "epoch": 1.0670846394984326, "grad_norm": 0.9957673147595423, "learning_rate": 1.8876120180547555e-05, "loss": 0.1632, "step": 5106 }, { "epoch": 1.0672936259143155, "grad_norm": 1.3499023967199077, "learning_rate": 1.8875600520450043e-05, "loss": 0.199, "step": 5107 }, { "epoch": 1.0675026123301985, "grad_norm": 1.1173628378388822, "learning_rate": 1.887508074739649e-05, "loss": 0.1794, "step": 5108 }, { "epoch": 1.0677115987460815, "grad_norm": 1.0975294007326952, "learning_rate": 1.8874560861393504e-05, "loss": 0.1401, "step": 5109 }, { "epoch": 1.0679205851619644, "grad_norm": 1.26374477616822, "learning_rate": 1.8874040862447705e-05, "loss": 0.1663, "step": 5110 }, { "epoch": 1.0681295715778474, "grad_norm": 1.1911863411197723, "learning_rate": 1.8873520750565716e-05, "loss": 0.1753, "step": 5111 }, { "epoch": 1.0683385579937303, "grad_norm": 1.0738292870602129, "learning_rate": 1.8873000525754154e-05, "loss": 0.1473, "step": 5112 }, { "epoch": 1.0685475444096133, "grad_norm": 1.2018803530631985, "learning_rate": 1.887248018801963e-05, "loss": 0.1917, "step": 5113 }, { "epoch": 1.0687565308254963, "grad_norm": 1.0289850131024656, "learning_rate": 1.887195973736878e-05, "loss": 0.1718, "step": 5114 }, { "epoch": 1.0689655172413792, "grad_norm": 1.2134950061681613, "learning_rate": 1.887143917380822e-05, "loss": 0.1935, "step": 5115 }, { "epoch": 1.0691745036572622, "grad_norm": 1.0530315278959208, "learning_rate": 1.887091849734458e-05, "loss": 0.161, "step": 5116 }, { "epoch": 1.0693834900731451, "grad_norm": 1.2139537391723902, "learning_rate": 1.8870397707984476e-05, "loss": 0.1987, "step": 5117 }, { "epoch": 1.069592476489028, "grad_norm": 1.194820587722683, "learning_rate": 1.8869876805734546e-05, "loss": 0.1799, "step": 5118 }, { "epoch": 1.0698014629049113, "grad_norm": 1.2785547702835391, "learning_rate": 1.8869355790601416e-05, "loss": 0.1276, "step": 5119 }, { "epoch": 1.0700104493207943, "grad_norm": 1.0451489931623852, "learning_rate": 1.8868834662591717e-05, "loss": 0.1505, "step": 5120 }, { "epoch": 1.0702194357366772, "grad_norm": 1.1927779766433406, "learning_rate": 1.8868313421712083e-05, "loss": 0.1589, "step": 5121 }, { "epoch": 1.0704284221525602, "grad_norm": 1.1287777932998526, "learning_rate": 1.8867792067969143e-05, "loss": 0.1613, "step": 5122 }, { "epoch": 1.0706374085684431, "grad_norm": 1.0128957600529036, "learning_rate": 1.8867270601369533e-05, "loss": 0.1605, "step": 5123 }, { "epoch": 1.070846394984326, "grad_norm": 1.0802414145418564, "learning_rate": 1.8866749021919892e-05, "loss": 0.1773, "step": 5124 }, { "epoch": 1.071055381400209, "grad_norm": 0.9718785137681253, "learning_rate": 1.886622732962686e-05, "loss": 0.1984, "step": 5125 }, { "epoch": 1.071264367816092, "grad_norm": 1.1958292791564058, "learning_rate": 1.886570552449707e-05, "loss": 0.1478, "step": 5126 }, { "epoch": 1.071473354231975, "grad_norm": 1.2976554430867615, "learning_rate": 1.8865183606537167e-05, "loss": 0.2006, "step": 5127 }, { "epoch": 1.071682340647858, "grad_norm": 0.9461333906204273, "learning_rate": 1.8864661575753792e-05, "loss": 0.17, "step": 5128 }, { "epoch": 1.071891327063741, "grad_norm": 1.239696216370815, "learning_rate": 1.886413943215359e-05, "loss": 0.1572, "step": 5129 }, { "epoch": 1.0721003134796239, "grad_norm": 1.0821963120182023, "learning_rate": 1.8863617175743202e-05, "loss": 0.1667, "step": 5130 }, { "epoch": 1.0723092998955068, "grad_norm": 1.251740082420551, "learning_rate": 1.886309480652928e-05, "loss": 0.2113, "step": 5131 }, { "epoch": 1.0725182863113898, "grad_norm": 1.130253213801349, "learning_rate": 1.8862572324518473e-05, "loss": 0.1926, "step": 5132 }, { "epoch": 1.0727272727272728, "grad_norm": 1.295749639922998, "learning_rate": 1.886204972971742e-05, "loss": 0.1604, "step": 5133 }, { "epoch": 1.0729362591431557, "grad_norm": 0.9178461723820475, "learning_rate": 1.8861527022132784e-05, "loss": 0.1579, "step": 5134 }, { "epoch": 1.0731452455590387, "grad_norm": 0.989025353441256, "learning_rate": 1.8861004201771212e-05, "loss": 0.1634, "step": 5135 }, { "epoch": 1.0733542319749216, "grad_norm": 1.2884914474018678, "learning_rate": 1.8860481268639357e-05, "loss": 0.1884, "step": 5136 }, { "epoch": 1.0735632183908046, "grad_norm": 1.2300526320328427, "learning_rate": 1.8859958222743872e-05, "loss": 0.1653, "step": 5137 }, { "epoch": 1.0737722048066876, "grad_norm": 1.2057070086501334, "learning_rate": 1.885943506409142e-05, "loss": 0.1742, "step": 5138 }, { "epoch": 1.0739811912225705, "grad_norm": 1.169813969360277, "learning_rate": 1.8858911792688655e-05, "loss": 0.179, "step": 5139 }, { "epoch": 1.0741901776384535, "grad_norm": 1.0022084591069398, "learning_rate": 1.885838840854224e-05, "loss": 0.1546, "step": 5140 }, { "epoch": 1.0743991640543364, "grad_norm": 1.0626397760157416, "learning_rate": 1.885786491165883e-05, "loss": 0.1668, "step": 5141 }, { "epoch": 1.0746081504702194, "grad_norm": 1.1517250684275475, "learning_rate": 1.885734130204509e-05, "loss": 0.1512, "step": 5142 }, { "epoch": 1.0748171368861024, "grad_norm": 1.0437186579143312, "learning_rate": 1.8856817579707684e-05, "loss": 0.1689, "step": 5143 }, { "epoch": 1.0750261233019853, "grad_norm": 1.150106272622765, "learning_rate": 1.885629374465328e-05, "loss": 0.1622, "step": 5144 }, { "epoch": 1.0752351097178683, "grad_norm": 1.0251252664174453, "learning_rate": 1.8855769796888538e-05, "loss": 0.1553, "step": 5145 }, { "epoch": 1.0754440961337512, "grad_norm": 1.0977491946910949, "learning_rate": 1.8855245736420133e-05, "loss": 0.1764, "step": 5146 }, { "epoch": 1.0756530825496342, "grad_norm": 1.0757335536548616, "learning_rate": 1.885472156325473e-05, "loss": 0.1886, "step": 5147 }, { "epoch": 1.0758620689655172, "grad_norm": 1.0986339631349535, "learning_rate": 1.8854197277399003e-05, "loss": 0.1641, "step": 5148 }, { "epoch": 1.0760710553814001, "grad_norm": 1.1833010339789711, "learning_rate": 1.885367287885962e-05, "loss": 0.1647, "step": 5149 }, { "epoch": 1.076280041797283, "grad_norm": 1.1702268090522558, "learning_rate": 1.885314836764326e-05, "loss": 0.1659, "step": 5150 }, { "epoch": 1.076489028213166, "grad_norm": 0.9435326865382542, "learning_rate": 1.8852623743756594e-05, "loss": 0.1547, "step": 5151 }, { "epoch": 1.076698014629049, "grad_norm": 0.9749431361515127, "learning_rate": 1.88520990072063e-05, "loss": 0.176, "step": 5152 }, { "epoch": 1.076907001044932, "grad_norm": 1.2193698168673177, "learning_rate": 1.885157415799906e-05, "loss": 0.2012, "step": 5153 }, { "epoch": 1.077115987460815, "grad_norm": 1.071985810259578, "learning_rate": 1.8851049196141548e-05, "loss": 0.1427, "step": 5154 }, { "epoch": 1.077324973876698, "grad_norm": 1.1158958616265895, "learning_rate": 1.8850524121640447e-05, "loss": 0.2099, "step": 5155 }, { "epoch": 1.077533960292581, "grad_norm": 0.9390561262886813, "learning_rate": 1.8849998934502437e-05, "loss": 0.1485, "step": 5156 }, { "epoch": 1.077742946708464, "grad_norm": 1.1291649106405655, "learning_rate": 1.8849473634734208e-05, "loss": 0.165, "step": 5157 }, { "epoch": 1.077951933124347, "grad_norm": 1.1046437979541208, "learning_rate": 1.8848948222342442e-05, "loss": 0.1759, "step": 5158 }, { "epoch": 1.07816091954023, "grad_norm": 1.0131769846311034, "learning_rate": 1.8848422697333824e-05, "loss": 0.1585, "step": 5159 }, { "epoch": 1.078369905956113, "grad_norm": 1.0973420473567854, "learning_rate": 1.884789705971504e-05, "loss": 0.1405, "step": 5160 }, { "epoch": 1.078578892371996, "grad_norm": 1.1062517602187838, "learning_rate": 1.8847371309492787e-05, "loss": 0.1796, "step": 5161 }, { "epoch": 1.0787878787878789, "grad_norm": 1.0067920991250443, "learning_rate": 1.884684544667375e-05, "loss": 0.15, "step": 5162 }, { "epoch": 1.0789968652037618, "grad_norm": 0.9457372137991474, "learning_rate": 1.8846319471264627e-05, "loss": 0.1372, "step": 5163 }, { "epoch": 1.0792058516196448, "grad_norm": 1.1502183147759395, "learning_rate": 1.8845793383272105e-05, "loss": 0.1568, "step": 5164 }, { "epoch": 1.0794148380355277, "grad_norm": 1.0874818482162862, "learning_rate": 1.8845267182702883e-05, "loss": 0.1417, "step": 5165 }, { "epoch": 1.0796238244514107, "grad_norm": 1.2091610742574346, "learning_rate": 1.8844740869563656e-05, "loss": 0.1753, "step": 5166 }, { "epoch": 1.0798328108672937, "grad_norm": 1.1715093595046984, "learning_rate": 1.8844214443861127e-05, "loss": 0.1547, "step": 5167 }, { "epoch": 1.0800417972831766, "grad_norm": 1.025201400619562, "learning_rate": 1.884368790560199e-05, "loss": 0.1642, "step": 5168 }, { "epoch": 1.0802507836990596, "grad_norm": 1.0758118163755397, "learning_rate": 1.884316125479295e-05, "loss": 0.1563, "step": 5169 }, { "epoch": 1.0804597701149425, "grad_norm": 1.4875876116326012, "learning_rate": 1.8842634491440704e-05, "loss": 0.1874, "step": 5170 }, { "epoch": 1.0806687565308255, "grad_norm": 1.1512588666729824, "learning_rate": 1.8842107615551965e-05, "loss": 0.1811, "step": 5171 }, { "epoch": 1.0808777429467085, "grad_norm": 1.0055336588384247, "learning_rate": 1.8841580627133427e-05, "loss": 0.1488, "step": 5172 }, { "epoch": 1.0810867293625914, "grad_norm": 1.148100682259445, "learning_rate": 1.8841053526191805e-05, "loss": 0.1825, "step": 5173 }, { "epoch": 1.0812957157784744, "grad_norm": 1.1411667887088568, "learning_rate": 1.8840526312733805e-05, "loss": 0.1782, "step": 5174 }, { "epoch": 1.0815047021943573, "grad_norm": 1.0759135024275193, "learning_rate": 1.8839998986766133e-05, "loss": 0.1473, "step": 5175 }, { "epoch": 1.0817136886102403, "grad_norm": 1.1791239645641995, "learning_rate": 1.883947154829551e-05, "loss": 0.1873, "step": 5176 }, { "epoch": 1.0819226750261233, "grad_norm": 1.1716956721408531, "learning_rate": 1.8838943997328638e-05, "loss": 0.1786, "step": 5177 }, { "epoch": 1.0821316614420062, "grad_norm": 1.3451092949694539, "learning_rate": 1.8838416333872235e-05, "loss": 0.1766, "step": 5178 }, { "epoch": 1.0823406478578892, "grad_norm": 1.1974870248285099, "learning_rate": 1.8837888557933018e-05, "loss": 0.1679, "step": 5179 }, { "epoch": 1.0825496342737722, "grad_norm": 1.007297924749538, "learning_rate": 1.88373606695177e-05, "loss": 0.1827, "step": 5180 }, { "epoch": 1.0827586206896551, "grad_norm": 1.034684689561431, "learning_rate": 1.8836832668633e-05, "loss": 0.1709, "step": 5181 }, { "epoch": 1.082967607105538, "grad_norm": 0.9832488527785945, "learning_rate": 1.8836304555285644e-05, "loss": 0.1668, "step": 5182 }, { "epoch": 1.083176593521421, "grad_norm": 1.0258854865475766, "learning_rate": 1.8835776329482343e-05, "loss": 0.1411, "step": 5183 }, { "epoch": 1.083385579937304, "grad_norm": 0.9960397561221496, "learning_rate": 1.8835247991229824e-05, "loss": 0.1596, "step": 5184 }, { "epoch": 1.083594566353187, "grad_norm": 1.2663358299180298, "learning_rate": 1.8834719540534814e-05, "loss": 0.1498, "step": 5185 }, { "epoch": 1.08380355276907, "grad_norm": 1.1039281224941702, "learning_rate": 1.8834190977404036e-05, "loss": 0.1586, "step": 5186 }, { "epoch": 1.0840125391849529, "grad_norm": 1.1091724948068231, "learning_rate": 1.8833662301844214e-05, "loss": 0.1675, "step": 5187 }, { "epoch": 1.0842215256008358, "grad_norm": 1.2175132235966395, "learning_rate": 1.883313351386208e-05, "loss": 0.1341, "step": 5188 }, { "epoch": 1.084430512016719, "grad_norm": 1.6566524463538879, "learning_rate": 1.8832604613464365e-05, "loss": 0.1659, "step": 5189 }, { "epoch": 1.084639498432602, "grad_norm": 1.182794542522753, "learning_rate": 1.8832075600657796e-05, "loss": 0.1793, "step": 5190 }, { "epoch": 1.084848484848485, "grad_norm": 1.0901252344556063, "learning_rate": 1.8831546475449104e-05, "loss": 0.1787, "step": 5191 }, { "epoch": 1.085057471264368, "grad_norm": 1.0845436956049523, "learning_rate": 1.8831017237845032e-05, "loss": 0.1531, "step": 5192 }, { "epoch": 1.0852664576802509, "grad_norm": 1.0934659405049179, "learning_rate": 1.8830487887852305e-05, "loss": 0.158, "step": 5193 }, { "epoch": 1.0854754440961338, "grad_norm": 1.117833413703864, "learning_rate": 1.8829958425477666e-05, "loss": 0.1636, "step": 5194 }, { "epoch": 1.0856844305120168, "grad_norm": 1.105834235837684, "learning_rate": 1.882942885072785e-05, "loss": 0.1661, "step": 5195 }, { "epoch": 1.0858934169278998, "grad_norm": 1.2276587373928758, "learning_rate": 1.8828899163609597e-05, "loss": 0.1751, "step": 5196 }, { "epoch": 1.0861024033437827, "grad_norm": 1.043025191811822, "learning_rate": 1.882836936412965e-05, "loss": 0.1757, "step": 5197 }, { "epoch": 1.0863113897596657, "grad_norm": 1.2370424283544244, "learning_rate": 1.8827839452294753e-05, "loss": 0.1701, "step": 5198 }, { "epoch": 1.0865203761755486, "grad_norm": 1.060170466697976, "learning_rate": 1.8827309428111647e-05, "loss": 0.173, "step": 5199 }, { "epoch": 1.0867293625914316, "grad_norm": 1.072634524220603, "learning_rate": 1.8826779291587077e-05, "loss": 0.1565, "step": 5200 }, { "epoch": 1.0869383490073146, "grad_norm": 1.1300655800118973, "learning_rate": 1.8826249042727795e-05, "loss": 0.1348, "step": 5201 }, { "epoch": 1.0871473354231975, "grad_norm": 1.162721611358113, "learning_rate": 1.882571868154054e-05, "loss": 0.1484, "step": 5202 }, { "epoch": 1.0873563218390805, "grad_norm": 1.0222896801306773, "learning_rate": 1.882518820803207e-05, "loss": 0.1753, "step": 5203 }, { "epoch": 1.0875653082549634, "grad_norm": 1.0405226219290047, "learning_rate": 1.882465762220913e-05, "loss": 0.1586, "step": 5204 }, { "epoch": 1.0877742946708464, "grad_norm": 2.132103678832986, "learning_rate": 1.8824126924078477e-05, "loss": 0.1703, "step": 5205 }, { "epoch": 1.0879832810867294, "grad_norm": 1.322688310525589, "learning_rate": 1.8823596113646864e-05, "loss": 0.1944, "step": 5206 }, { "epoch": 1.0881922675026123, "grad_norm": 1.3117232779579495, "learning_rate": 1.8823065190921044e-05, "loss": 0.1844, "step": 5207 }, { "epoch": 1.0884012539184953, "grad_norm": 1.2219930317647565, "learning_rate": 1.882253415590778e-05, "loss": 0.181, "step": 5208 }, { "epoch": 1.0886102403343783, "grad_norm": 1.3481783463468846, "learning_rate": 1.8822003008613822e-05, "loss": 0.1841, "step": 5209 }, { "epoch": 1.0888192267502612, "grad_norm": 1.1442231634967697, "learning_rate": 1.8821471749045933e-05, "loss": 0.1893, "step": 5210 }, { "epoch": 1.0890282131661442, "grad_norm": 0.9447329738974994, "learning_rate": 1.882094037721088e-05, "loss": 0.146, "step": 5211 }, { "epoch": 1.0892371995820271, "grad_norm": 1.3116113144861656, "learning_rate": 1.8820408893115418e-05, "loss": 0.1543, "step": 5212 }, { "epoch": 1.08944618599791, "grad_norm": 1.0152426683281004, "learning_rate": 1.8819877296766314e-05, "loss": 0.1473, "step": 5213 }, { "epoch": 1.089655172413793, "grad_norm": 1.1271225286389421, "learning_rate": 1.8819345588170332e-05, "loss": 0.1422, "step": 5214 }, { "epoch": 1.089864158829676, "grad_norm": 0.9920921801644288, "learning_rate": 1.8818813767334237e-05, "loss": 0.1798, "step": 5215 }, { "epoch": 1.090073145245559, "grad_norm": 1.0946977746064757, "learning_rate": 1.88182818342648e-05, "loss": 0.1654, "step": 5216 }, { "epoch": 1.090282131661442, "grad_norm": 1.297635813306604, "learning_rate": 1.8817749788968793e-05, "loss": 0.1563, "step": 5217 }, { "epoch": 1.090491118077325, "grad_norm": 1.2510450500905368, "learning_rate": 1.8817217631452986e-05, "loss": 0.1507, "step": 5218 }, { "epoch": 1.0907001044932079, "grad_norm": 1.3317667456654314, "learning_rate": 1.8816685361724147e-05, "loss": 0.2026, "step": 5219 }, { "epoch": 1.0909090909090908, "grad_norm": 1.0938748257241504, "learning_rate": 1.8816152979789055e-05, "loss": 0.169, "step": 5220 }, { "epoch": 1.0911180773249738, "grad_norm": 1.0221905210658369, "learning_rate": 1.8815620485654483e-05, "loss": 0.1648, "step": 5221 }, { "epoch": 1.0913270637408568, "grad_norm": 1.1055690268261509, "learning_rate": 1.8815087879327207e-05, "loss": 0.1632, "step": 5222 }, { "epoch": 1.0915360501567397, "grad_norm": 1.2472408382133868, "learning_rate": 1.881455516081401e-05, "loss": 0.1901, "step": 5223 }, { "epoch": 1.0917450365726227, "grad_norm": 1.0751594948067968, "learning_rate": 1.8814022330121665e-05, "loss": 0.1767, "step": 5224 }, { "epoch": 1.0919540229885056, "grad_norm": 1.1345009301796936, "learning_rate": 1.881348938725696e-05, "loss": 0.1864, "step": 5225 }, { "epoch": 1.0921630094043888, "grad_norm": 1.137764421741104, "learning_rate": 1.881295633222667e-05, "loss": 0.1938, "step": 5226 }, { "epoch": 1.0923719958202718, "grad_norm": 1.413196883620846, "learning_rate": 1.8812423165037586e-05, "loss": 0.1469, "step": 5227 }, { "epoch": 1.0925809822361547, "grad_norm": 1.2277331613889146, "learning_rate": 1.881188988569649e-05, "loss": 0.1928, "step": 5228 }, { "epoch": 1.0927899686520377, "grad_norm": 0.9564800768040892, "learning_rate": 1.8811356494210166e-05, "loss": 0.1806, "step": 5229 }, { "epoch": 1.0929989550679207, "grad_norm": 1.058243970311263, "learning_rate": 1.8810822990585407e-05, "loss": 0.1669, "step": 5230 }, { "epoch": 1.0932079414838036, "grad_norm": 1.1016776816743026, "learning_rate": 1.8810289374829005e-05, "loss": 0.1583, "step": 5231 }, { "epoch": 1.0934169278996866, "grad_norm": 0.9910279913442932, "learning_rate": 1.8809755646947744e-05, "loss": 0.1796, "step": 5232 }, { "epoch": 1.0936259143155695, "grad_norm": 1.1218554007053543, "learning_rate": 1.8809221806948418e-05, "loss": 0.174, "step": 5233 }, { "epoch": 1.0938349007314525, "grad_norm": 1.2294631523296407, "learning_rate": 1.8808687854837827e-05, "loss": 0.2038, "step": 5234 }, { "epoch": 1.0940438871473355, "grad_norm": 1.248154239837543, "learning_rate": 1.880815379062276e-05, "loss": 0.1986, "step": 5235 }, { "epoch": 1.0942528735632184, "grad_norm": 1.118971961194645, "learning_rate": 1.8807619614310016e-05, "loss": 0.1973, "step": 5236 }, { "epoch": 1.0944618599791014, "grad_norm": 1.0792530500512847, "learning_rate": 1.8807085325906393e-05, "loss": 0.1667, "step": 5237 }, { "epoch": 1.0946708463949844, "grad_norm": 0.9558271451675243, "learning_rate": 1.880655092541869e-05, "loss": 0.15, "step": 5238 }, { "epoch": 1.0948798328108673, "grad_norm": 1.0538494873510156, "learning_rate": 1.880601641285371e-05, "loss": 0.1742, "step": 5239 }, { "epoch": 1.0950888192267503, "grad_norm": 1.1280599628840335, "learning_rate": 1.8805481788218252e-05, "loss": 0.1557, "step": 5240 }, { "epoch": 1.0952978056426332, "grad_norm": 1.2067580561130158, "learning_rate": 1.8804947051519124e-05, "loss": 0.162, "step": 5241 }, { "epoch": 1.0955067920585162, "grad_norm": 1.1790515691117591, "learning_rate": 1.8804412202763132e-05, "loss": 0.17, "step": 5242 }, { "epoch": 1.0957157784743992, "grad_norm": 1.0874562931272695, "learning_rate": 1.8803877241957077e-05, "loss": 0.1492, "step": 5243 }, { "epoch": 1.0959247648902821, "grad_norm": 1.2452101608826454, "learning_rate": 1.880334216910777e-05, "loss": 0.2037, "step": 5244 }, { "epoch": 1.096133751306165, "grad_norm": 1.067882391375038, "learning_rate": 1.8802806984222027e-05, "loss": 0.1614, "step": 5245 }, { "epoch": 1.096342737722048, "grad_norm": 0.9925695675297546, "learning_rate": 1.880227168730665e-05, "loss": 0.1562, "step": 5246 }, { "epoch": 1.096551724137931, "grad_norm": 1.2208947269004267, "learning_rate": 1.8801736278368456e-05, "loss": 0.1933, "step": 5247 }, { "epoch": 1.096760710553814, "grad_norm": 1.2250225734892566, "learning_rate": 1.8801200757414255e-05, "loss": 0.1361, "step": 5248 }, { "epoch": 1.096969696969697, "grad_norm": 1.1126929797126328, "learning_rate": 1.8800665124450867e-05, "loss": 0.164, "step": 5249 }, { "epoch": 1.09717868338558, "grad_norm": 1.1789419153789313, "learning_rate": 1.8800129379485108e-05, "loss": 0.182, "step": 5250 }, { "epoch": 1.0973876698014629, "grad_norm": 1.010247257016816, "learning_rate": 1.8799593522523793e-05, "loss": 0.1957, "step": 5251 }, { "epoch": 1.0975966562173458, "grad_norm": 1.0174075870776813, "learning_rate": 1.8799057553573744e-05, "loss": 0.1726, "step": 5252 }, { "epoch": 1.0978056426332288, "grad_norm": 0.9329580698327009, "learning_rate": 1.8798521472641783e-05, "loss": 0.1823, "step": 5253 }, { "epoch": 1.0980146290491117, "grad_norm": 1.0879113907733615, "learning_rate": 1.8797985279734732e-05, "loss": 0.2067, "step": 5254 }, { "epoch": 1.0982236154649947, "grad_norm": 1.064369225564841, "learning_rate": 1.8797448974859416e-05, "loss": 0.1853, "step": 5255 }, { "epoch": 1.0984326018808777, "grad_norm": 0.9919991459769393, "learning_rate": 1.8796912558022655e-05, "loss": 0.1692, "step": 5256 }, { "epoch": 1.0986415882967606, "grad_norm": 1.0825798833682498, "learning_rate": 1.8796376029231283e-05, "loss": 0.1595, "step": 5257 }, { "epoch": 1.0988505747126436, "grad_norm": 1.1876544496098709, "learning_rate": 1.879583938849212e-05, "loss": 0.1691, "step": 5258 }, { "epoch": 1.0990595611285268, "grad_norm": 1.039358002980566, "learning_rate": 1.8795302635812002e-05, "loss": 0.1867, "step": 5259 }, { "epoch": 1.0992685475444097, "grad_norm": 0.9785793507741796, "learning_rate": 1.879476577119776e-05, "loss": 0.1681, "step": 5260 }, { "epoch": 1.0994775339602927, "grad_norm": 0.9664985189336502, "learning_rate": 1.879422879465622e-05, "loss": 0.1743, "step": 5261 }, { "epoch": 1.0996865203761756, "grad_norm": 1.1054127942178433, "learning_rate": 1.8793691706194226e-05, "loss": 0.187, "step": 5262 }, { "epoch": 1.0998955067920586, "grad_norm": 1.2341539180972332, "learning_rate": 1.8793154505818605e-05, "loss": 0.1803, "step": 5263 }, { "epoch": 1.1001044932079416, "grad_norm": 1.155329845464847, "learning_rate": 1.8792617193536197e-05, "loss": 0.1964, "step": 5264 }, { "epoch": 1.1003134796238245, "grad_norm": 1.2372886431736982, "learning_rate": 1.8792079769353838e-05, "loss": 0.1675, "step": 5265 }, { "epoch": 1.1005224660397075, "grad_norm": 1.0759621656199827, "learning_rate": 1.8791542233278372e-05, "loss": 0.1524, "step": 5266 }, { "epoch": 1.1007314524555905, "grad_norm": 1.119009741502515, "learning_rate": 1.8791004585316637e-05, "loss": 0.1921, "step": 5267 }, { "epoch": 1.1009404388714734, "grad_norm": 0.9686365034117294, "learning_rate": 1.8790466825475474e-05, "loss": 0.1341, "step": 5268 }, { "epoch": 1.1011494252873564, "grad_norm": 1.3622304603799769, "learning_rate": 1.878992895376173e-05, "loss": 0.2049, "step": 5269 }, { "epoch": 1.1013584117032393, "grad_norm": 1.2694541248274644, "learning_rate": 1.8789390970182247e-05, "loss": 0.211, "step": 5270 }, { "epoch": 1.1015673981191223, "grad_norm": 1.1346084315169576, "learning_rate": 1.8788852874743875e-05, "loss": 0.1705, "step": 5271 }, { "epoch": 1.1017763845350053, "grad_norm": 1.0622285095768125, "learning_rate": 1.878831466745346e-05, "loss": 0.16, "step": 5272 }, { "epoch": 1.1019853709508882, "grad_norm": 1.1373550629165998, "learning_rate": 1.878777634831785e-05, "loss": 0.2265, "step": 5273 }, { "epoch": 1.1021943573667712, "grad_norm": 1.0606723956173736, "learning_rate": 1.8787237917343902e-05, "loss": 0.1689, "step": 5274 }, { "epoch": 1.1024033437826541, "grad_norm": 0.9855835966389157, "learning_rate": 1.8786699374538462e-05, "loss": 0.1903, "step": 5275 }, { "epoch": 1.102612330198537, "grad_norm": 1.0077135629629412, "learning_rate": 1.8786160719908386e-05, "loss": 0.1763, "step": 5276 }, { "epoch": 1.10282131661442, "grad_norm": 1.200932615867401, "learning_rate": 1.878562195346053e-05, "loss": 0.1611, "step": 5277 }, { "epoch": 1.103030303030303, "grad_norm": 1.2091712373331602, "learning_rate": 1.878508307520175e-05, "loss": 0.1672, "step": 5278 }, { "epoch": 1.103239289446186, "grad_norm": 1.0396116289302513, "learning_rate": 1.8784544085138903e-05, "loss": 0.156, "step": 5279 }, { "epoch": 1.103448275862069, "grad_norm": 1.0734487246276125, "learning_rate": 1.8784004983278853e-05, "loss": 0.191, "step": 5280 }, { "epoch": 1.103657262277952, "grad_norm": 0.9545502896313571, "learning_rate": 1.8783465769628456e-05, "loss": 0.2046, "step": 5281 }, { "epoch": 1.1038662486938349, "grad_norm": 1.0714321132293851, "learning_rate": 1.8782926444194574e-05, "loss": 0.1628, "step": 5282 }, { "epoch": 1.1040752351097178, "grad_norm": 1.2604802738377272, "learning_rate": 1.8782387006984074e-05, "loss": 0.1852, "step": 5283 }, { "epoch": 1.1042842215256008, "grad_norm": 1.3197033554973898, "learning_rate": 1.8781847458003822e-05, "loss": 0.1867, "step": 5284 }, { "epoch": 1.1044932079414838, "grad_norm": 1.130150691427741, "learning_rate": 1.8781307797260683e-05, "loss": 0.1717, "step": 5285 }, { "epoch": 1.1047021943573667, "grad_norm": 1.0533707200432663, "learning_rate": 1.878076802476152e-05, "loss": 0.1538, "step": 5286 }, { "epoch": 1.1049111807732497, "grad_norm": 0.9426228351398639, "learning_rate": 1.878022814051321e-05, "loss": 0.1412, "step": 5287 }, { "epoch": 1.1051201671891326, "grad_norm": 1.0769283995619838, "learning_rate": 1.8779688144522625e-05, "loss": 0.1775, "step": 5288 }, { "epoch": 1.1053291536050156, "grad_norm": 1.1532818354591692, "learning_rate": 1.8779148036796625e-05, "loss": 0.1336, "step": 5289 }, { "epoch": 1.1055381400208986, "grad_norm": 1.028740949397006, "learning_rate": 1.8778607817342097e-05, "loss": 0.1632, "step": 5290 }, { "epoch": 1.1057471264367815, "grad_norm": 1.0351813812068564, "learning_rate": 1.877806748616591e-05, "loss": 0.1815, "step": 5291 }, { "epoch": 1.1059561128526645, "grad_norm": 1.1716989069827166, "learning_rate": 1.8777527043274942e-05, "loss": 0.1927, "step": 5292 }, { "epoch": 1.1061650992685474, "grad_norm": 1.131197311430833, "learning_rate": 1.877698648867607e-05, "loss": 0.2005, "step": 5293 }, { "epoch": 1.1063740856844304, "grad_norm": 1.0977448308470246, "learning_rate": 1.8776445822376176e-05, "loss": 0.1477, "step": 5294 }, { "epoch": 1.1065830721003134, "grad_norm": 1.1708672060922463, "learning_rate": 1.8775905044382136e-05, "loss": 0.1761, "step": 5295 }, { "epoch": 1.1067920585161963, "grad_norm": 1.1749694480264137, "learning_rate": 1.8775364154700835e-05, "loss": 0.1819, "step": 5296 }, { "epoch": 1.1070010449320795, "grad_norm": 1.2542519684246312, "learning_rate": 1.8774823153339155e-05, "loss": 0.146, "step": 5297 }, { "epoch": 1.1072100313479625, "grad_norm": 0.9057366949560578, "learning_rate": 1.877428204030399e-05, "loss": 0.114, "step": 5298 }, { "epoch": 1.1074190177638454, "grad_norm": 1.429590837781782, "learning_rate": 1.8773740815602217e-05, "loss": 0.1841, "step": 5299 }, { "epoch": 1.1076280041797284, "grad_norm": 1.1665275512408346, "learning_rate": 1.8773199479240724e-05, "loss": 0.1431, "step": 5300 }, { "epoch": 1.1078369905956114, "grad_norm": 1.1994909015336734, "learning_rate": 1.8772658031226404e-05, "loss": 0.1692, "step": 5301 }, { "epoch": 1.1080459770114943, "grad_norm": 1.2740286057925987, "learning_rate": 1.8772116471566145e-05, "loss": 0.1852, "step": 5302 }, { "epoch": 1.1082549634273773, "grad_norm": 1.1218640460344824, "learning_rate": 1.8771574800266843e-05, "loss": 0.1712, "step": 5303 }, { "epoch": 1.1084639498432602, "grad_norm": 1.2114935706766958, "learning_rate": 1.8771033017335387e-05, "loss": 0.1978, "step": 5304 }, { "epoch": 1.1086729362591432, "grad_norm": 1.0878434505321959, "learning_rate": 1.8770491122778677e-05, "loss": 0.1748, "step": 5305 }, { "epoch": 1.1088819226750262, "grad_norm": 1.286141012301547, "learning_rate": 1.8769949116603605e-05, "loss": 0.1535, "step": 5306 }, { "epoch": 1.1090909090909091, "grad_norm": 1.0719592635823616, "learning_rate": 1.876940699881707e-05, "loss": 0.1504, "step": 5307 }, { "epoch": 1.109299895506792, "grad_norm": 1.0290789413927748, "learning_rate": 1.8768864769425973e-05, "loss": 0.1823, "step": 5308 }, { "epoch": 1.109508881922675, "grad_norm": 1.0439078239670363, "learning_rate": 1.8768322428437215e-05, "loss": 0.1571, "step": 5309 }, { "epoch": 1.109717868338558, "grad_norm": 1.1485091257915994, "learning_rate": 1.8767779975857696e-05, "loss": 0.1796, "step": 5310 }, { "epoch": 1.109926854754441, "grad_norm": 1.1120239383277255, "learning_rate": 1.876723741169432e-05, "loss": 0.1694, "step": 5311 }, { "epoch": 1.110135841170324, "grad_norm": 1.1026680649611478, "learning_rate": 1.8766694735953993e-05, "loss": 0.1789, "step": 5312 }, { "epoch": 1.110344827586207, "grad_norm": 1.0460344860731186, "learning_rate": 1.8766151948643622e-05, "loss": 0.1667, "step": 5313 }, { "epoch": 1.1105538140020899, "grad_norm": 2.8195211022355493, "learning_rate": 1.876560904977011e-05, "loss": 0.2097, "step": 5314 }, { "epoch": 1.1107628004179728, "grad_norm": 0.938493240537535, "learning_rate": 1.8765066039340375e-05, "loss": 0.1717, "step": 5315 }, { "epoch": 1.1109717868338558, "grad_norm": 1.2227823639436588, "learning_rate": 1.876452291736132e-05, "loss": 0.1907, "step": 5316 }, { "epoch": 1.1111807732497387, "grad_norm": 1.1180388110384942, "learning_rate": 1.876397968383986e-05, "loss": 0.1655, "step": 5317 }, { "epoch": 1.1113897596656217, "grad_norm": 1.23373238780519, "learning_rate": 1.876343633878291e-05, "loss": 0.16, "step": 5318 }, { "epoch": 1.1115987460815047, "grad_norm": 0.9914679589012633, "learning_rate": 1.876289288219738e-05, "loss": 0.1684, "step": 5319 }, { "epoch": 1.1118077324973876, "grad_norm": 1.3889393316194603, "learning_rate": 1.876234931409019e-05, "loss": 0.1896, "step": 5320 }, { "epoch": 1.1120167189132706, "grad_norm": 1.2007693719362484, "learning_rate": 1.876180563446826e-05, "loss": 0.1804, "step": 5321 }, { "epoch": 1.1122257053291535, "grad_norm": 1.112382457514083, "learning_rate": 1.8761261843338504e-05, "loss": 0.1533, "step": 5322 }, { "epoch": 1.1124346917450365, "grad_norm": 1.093828432792073, "learning_rate": 1.8760717940707843e-05, "loss": 0.1896, "step": 5323 }, { "epoch": 1.1126436781609195, "grad_norm": 1.2050060150453101, "learning_rate": 1.876017392658321e-05, "loss": 0.1973, "step": 5324 }, { "epoch": 1.1128526645768024, "grad_norm": 1.1781988516436193, "learning_rate": 1.875962980097151e-05, "loss": 0.1761, "step": 5325 }, { "epoch": 1.1130616509926854, "grad_norm": 0.973118769233362, "learning_rate": 1.875908556387968e-05, "loss": 0.1433, "step": 5326 }, { "epoch": 1.1132706374085684, "grad_norm": 1.141506775166607, "learning_rate": 1.8758541215314646e-05, "loss": 0.2232, "step": 5327 }, { "epoch": 1.1134796238244513, "grad_norm": 1.0930643089664576, "learning_rate": 1.875799675528333e-05, "loss": 0.179, "step": 5328 }, { "epoch": 1.1136886102403343, "grad_norm": 1.2169514437591293, "learning_rate": 1.8757452183792665e-05, "loss": 0.2125, "step": 5329 }, { "epoch": 1.1138975966562175, "grad_norm": 1.4461207533081433, "learning_rate": 1.875690750084958e-05, "loss": 0.1923, "step": 5330 }, { "epoch": 1.1141065830721004, "grad_norm": 1.1916394080026493, "learning_rate": 1.8756362706461014e-05, "loss": 0.1946, "step": 5331 }, { "epoch": 1.1143155694879834, "grad_norm": 1.013613122691557, "learning_rate": 1.8755817800633886e-05, "loss": 0.1837, "step": 5332 }, { "epoch": 1.1145245559038663, "grad_norm": 1.0204802191781235, "learning_rate": 1.8755272783375145e-05, "loss": 0.2, "step": 5333 }, { "epoch": 1.1147335423197493, "grad_norm": 0.973408239254567, "learning_rate": 1.875472765469172e-05, "loss": 0.1623, "step": 5334 }, { "epoch": 1.1149425287356323, "grad_norm": 1.0979405564217524, "learning_rate": 1.875418241459055e-05, "loss": 0.1911, "step": 5335 }, { "epoch": 1.1151515151515152, "grad_norm": 1.1832917551864868, "learning_rate": 1.8753637063078575e-05, "loss": 0.1724, "step": 5336 }, { "epoch": 1.1153605015673982, "grad_norm": 0.9960182559344822, "learning_rate": 1.875309160016273e-05, "loss": 0.1953, "step": 5337 }, { "epoch": 1.1155694879832811, "grad_norm": 0.9540769241260539, "learning_rate": 1.875254602584996e-05, "loss": 0.1803, "step": 5338 }, { "epoch": 1.115778474399164, "grad_norm": 1.0142484569971844, "learning_rate": 1.8752000340147218e-05, "loss": 0.1422, "step": 5339 }, { "epoch": 1.115987460815047, "grad_norm": 1.060679514729133, "learning_rate": 1.8751454543061434e-05, "loss": 0.1689, "step": 5340 }, { "epoch": 1.11619644723093, "grad_norm": 1.0894030084580024, "learning_rate": 1.8750908634599563e-05, "loss": 0.1685, "step": 5341 }, { "epoch": 1.116405433646813, "grad_norm": 1.2812585823705431, "learning_rate": 1.8750362614768553e-05, "loss": 0.1485, "step": 5342 }, { "epoch": 1.116614420062696, "grad_norm": 1.0616188601550673, "learning_rate": 1.8749816483575343e-05, "loss": 0.1705, "step": 5343 }, { "epoch": 1.116823406478579, "grad_norm": 1.3263639130914033, "learning_rate": 1.8749270241026893e-05, "loss": 0.2046, "step": 5344 }, { "epoch": 1.1170323928944619, "grad_norm": 1.0898990652952432, "learning_rate": 1.8748723887130154e-05, "loss": 0.1821, "step": 5345 }, { "epoch": 1.1172413793103448, "grad_norm": 1.1671798146346086, "learning_rate": 1.8748177421892076e-05, "loss": 0.1969, "step": 5346 }, { "epoch": 1.1174503657262278, "grad_norm": 1.0910327242263882, "learning_rate": 1.874763084531961e-05, "loss": 0.186, "step": 5347 }, { "epoch": 1.1176593521421108, "grad_norm": 1.1571862940399904, "learning_rate": 1.8747084157419726e-05, "loss": 0.1728, "step": 5348 }, { "epoch": 1.1178683385579937, "grad_norm": 1.0972284845582199, "learning_rate": 1.874653735819937e-05, "loss": 0.1627, "step": 5349 }, { "epoch": 1.1180773249738767, "grad_norm": 1.143105967197838, "learning_rate": 1.87459904476655e-05, "loss": 0.1701, "step": 5350 }, { "epoch": 1.1182863113897596, "grad_norm": 0.9570008456082336, "learning_rate": 1.874544342582508e-05, "loss": 0.1604, "step": 5351 }, { "epoch": 1.1184952978056426, "grad_norm": 1.0898428123874657, "learning_rate": 1.8744896292685076e-05, "loss": 0.1629, "step": 5352 }, { "epoch": 1.1187042842215256, "grad_norm": 1.403812041668326, "learning_rate": 1.874434904825244e-05, "loss": 0.1701, "step": 5353 }, { "epoch": 1.1189132706374085, "grad_norm": 0.9067814521956835, "learning_rate": 1.874380169253415e-05, "loss": 0.1538, "step": 5354 }, { "epoch": 1.1191222570532915, "grad_norm": 1.0159357746988151, "learning_rate": 1.874325422553716e-05, "loss": 0.1704, "step": 5355 }, { "epoch": 1.1193312434691745, "grad_norm": 0.8775781889689859, "learning_rate": 1.8742706647268445e-05, "loss": 0.1483, "step": 5356 }, { "epoch": 1.1195402298850574, "grad_norm": 0.9460559756671871, "learning_rate": 1.874215895773497e-05, "loss": 0.1571, "step": 5357 }, { "epoch": 1.1197492163009404, "grad_norm": 1.2343658472928323, "learning_rate": 1.8741611156943706e-05, "loss": 0.1883, "step": 5358 }, { "epoch": 1.1199582027168233, "grad_norm": 1.1327233180521796, "learning_rate": 1.8741063244901628e-05, "loss": 0.1902, "step": 5359 }, { "epoch": 1.1201671891327063, "grad_norm": 1.1067509851495945, "learning_rate": 1.8740515221615704e-05, "loss": 0.164, "step": 5360 }, { "epoch": 1.1203761755485893, "grad_norm": 1.1731885751061533, "learning_rate": 1.873996708709291e-05, "loss": 0.1859, "step": 5361 }, { "epoch": 1.1205851619644722, "grad_norm": 1.1904013336720323, "learning_rate": 1.8739418841340223e-05, "loss": 0.1842, "step": 5362 }, { "epoch": 1.1207941483803552, "grad_norm": 1.5933510662267307, "learning_rate": 1.873887048436462e-05, "loss": 0.1895, "step": 5363 }, { "epoch": 1.1210031347962381, "grad_norm": 1.0590306737075827, "learning_rate": 1.8738322016173078e-05, "loss": 0.1521, "step": 5364 }, { "epoch": 1.121212121212121, "grad_norm": 1.1127891501335638, "learning_rate": 1.8737773436772578e-05, "loss": 0.1525, "step": 5365 }, { "epoch": 1.121421107628004, "grad_norm": 1.2794584179047626, "learning_rate": 1.8737224746170107e-05, "loss": 0.236, "step": 5366 }, { "epoch": 1.1216300940438872, "grad_norm": 1.0663217541551206, "learning_rate": 1.8736675944372637e-05, "loss": 0.1634, "step": 5367 }, { "epoch": 1.1218390804597702, "grad_norm": 1.0454013679614704, "learning_rate": 1.8736127031387164e-05, "loss": 0.1655, "step": 5368 }, { "epoch": 1.1220480668756532, "grad_norm": 1.1237037491691555, "learning_rate": 1.8735578007220665e-05, "loss": 0.1658, "step": 5369 }, { "epoch": 1.1222570532915361, "grad_norm": 0.9978038036274074, "learning_rate": 1.8735028871880133e-05, "loss": 0.1654, "step": 5370 }, { "epoch": 1.122466039707419, "grad_norm": 0.909752851393791, "learning_rate": 1.873447962537255e-05, "loss": 0.151, "step": 5371 }, { "epoch": 1.122675026123302, "grad_norm": 1.2502275447249136, "learning_rate": 1.8733930267704914e-05, "loss": 0.1556, "step": 5372 }, { "epoch": 1.122884012539185, "grad_norm": 1.2338498992576032, "learning_rate": 1.8733380798884208e-05, "loss": 0.1728, "step": 5373 }, { "epoch": 1.123092998955068, "grad_norm": 1.1932719598796782, "learning_rate": 1.8732831218917433e-05, "loss": 0.1841, "step": 5374 }, { "epoch": 1.123301985370951, "grad_norm": 1.1244129572052959, "learning_rate": 1.8732281527811578e-05, "loss": 0.1596, "step": 5375 }, { "epoch": 1.123510971786834, "grad_norm": 1.0907986660710296, "learning_rate": 1.873173172557364e-05, "loss": 0.1548, "step": 5376 }, { "epoch": 1.1237199582027169, "grad_norm": 1.2985818398008129, "learning_rate": 1.8731181812210622e-05, "loss": 0.1867, "step": 5377 }, { "epoch": 1.1239289446185998, "grad_norm": 1.0590949799268619, "learning_rate": 1.873063178772951e-05, "loss": 0.1671, "step": 5378 }, { "epoch": 1.1241379310344828, "grad_norm": 1.3280141641465253, "learning_rate": 1.8730081652137314e-05, "loss": 0.1835, "step": 5379 }, { "epoch": 1.1243469174503657, "grad_norm": 1.184674606164743, "learning_rate": 1.872953140544103e-05, "loss": 0.141, "step": 5380 }, { "epoch": 1.1245559038662487, "grad_norm": 1.1531134755419996, "learning_rate": 1.8728981047647664e-05, "loss": 0.165, "step": 5381 }, { "epoch": 1.1247648902821317, "grad_norm": 1.0609473253588517, "learning_rate": 1.872843057876422e-05, "loss": 0.1847, "step": 5382 }, { "epoch": 1.1249738766980146, "grad_norm": 1.3491279302053634, "learning_rate": 1.8727879998797704e-05, "loss": 0.1923, "step": 5383 }, { "epoch": 1.1251828631138976, "grad_norm": 1.0998500688063924, "learning_rate": 1.872732930775512e-05, "loss": 0.1594, "step": 5384 }, { "epoch": 1.1253918495297806, "grad_norm": 1.3260504748680948, "learning_rate": 1.872677850564348e-05, "loss": 0.1805, "step": 5385 }, { "epoch": 1.1256008359456635, "grad_norm": 1.1494883686295965, "learning_rate": 1.872622759246979e-05, "loss": 0.1703, "step": 5386 }, { "epoch": 1.1258098223615465, "grad_norm": 1.0133826835101571, "learning_rate": 1.8725676568241065e-05, "loss": 0.1449, "step": 5387 }, { "epoch": 1.1260188087774294, "grad_norm": 1.1547959320854961, "learning_rate": 1.8725125432964316e-05, "loss": 0.1952, "step": 5388 }, { "epoch": 1.1262277951933124, "grad_norm": 1.102547061690135, "learning_rate": 1.8724574186646555e-05, "loss": 0.1445, "step": 5389 }, { "epoch": 1.1264367816091954, "grad_norm": 0.910651094462865, "learning_rate": 1.87240228292948e-05, "loss": 0.1732, "step": 5390 }, { "epoch": 1.1266457680250783, "grad_norm": 1.111175147997043, "learning_rate": 1.8723471360916065e-05, "loss": 0.1801, "step": 5391 }, { "epoch": 1.1268547544409613, "grad_norm": 1.0733821007455018, "learning_rate": 1.8722919781517373e-05, "loss": 0.167, "step": 5392 }, { "epoch": 1.1270637408568442, "grad_norm": 1.2060938047760876, "learning_rate": 1.8722368091105744e-05, "loss": 0.1534, "step": 5393 }, { "epoch": 1.1272727272727272, "grad_norm": 1.1797437178519352, "learning_rate": 1.8721816289688193e-05, "loss": 0.1654, "step": 5394 }, { "epoch": 1.1274817136886102, "grad_norm": 1.2903589363803583, "learning_rate": 1.872126437727175e-05, "loss": 0.1798, "step": 5395 }, { "epoch": 1.1276907001044931, "grad_norm": 1.1587795369141285, "learning_rate": 1.872071235386343e-05, "loss": 0.1993, "step": 5396 }, { "epoch": 1.127899686520376, "grad_norm": 1.0650950556477217, "learning_rate": 1.8720160219470263e-05, "loss": 0.1773, "step": 5397 }, { "epoch": 1.128108672936259, "grad_norm": 1.0670917481010853, "learning_rate": 1.8719607974099282e-05, "loss": 0.1893, "step": 5398 }, { "epoch": 1.1283176593521422, "grad_norm": 0.9772337659021144, "learning_rate": 1.8719055617757505e-05, "loss": 0.148, "step": 5399 }, { "epoch": 1.1285266457680252, "grad_norm": 1.2535054428642964, "learning_rate": 1.8718503150451966e-05, "loss": 0.2112, "step": 5400 }, { "epoch": 1.1287356321839082, "grad_norm": 1.1100629074319848, "learning_rate": 1.8717950572189698e-05, "loss": 0.1738, "step": 5401 }, { "epoch": 1.1289446185997911, "grad_norm": 1.1763919433786358, "learning_rate": 1.871739788297773e-05, "loss": 0.1853, "step": 5402 }, { "epoch": 1.129153605015674, "grad_norm": 1.14860133725847, "learning_rate": 1.8716845082823095e-05, "loss": 0.1816, "step": 5403 }, { "epoch": 1.129362591431557, "grad_norm": 0.9221820023690352, "learning_rate": 1.8716292171732834e-05, "loss": 0.1664, "step": 5404 }, { "epoch": 1.12957157784744, "grad_norm": 1.1660169238497597, "learning_rate": 1.871573914971398e-05, "loss": 0.1974, "step": 5405 }, { "epoch": 1.129780564263323, "grad_norm": 1.3100161471290659, "learning_rate": 1.871518601677357e-05, "loss": 0.1797, "step": 5406 }, { "epoch": 1.129989550679206, "grad_norm": 1.1305261022536814, "learning_rate": 1.8714632772918645e-05, "loss": 0.1894, "step": 5407 }, { "epoch": 1.1301985370950889, "grad_norm": 1.0976295768540463, "learning_rate": 1.8714079418156246e-05, "loss": 0.155, "step": 5408 }, { "epoch": 1.1304075235109718, "grad_norm": 1.3664888613535895, "learning_rate": 1.8713525952493416e-05, "loss": 0.1938, "step": 5409 }, { "epoch": 1.1306165099268548, "grad_norm": 1.4200698242832406, "learning_rate": 1.8712972375937195e-05, "loss": 0.1756, "step": 5410 }, { "epoch": 1.1308254963427378, "grad_norm": 0.9391903032917575, "learning_rate": 1.871241868849463e-05, "loss": 0.1723, "step": 5411 }, { "epoch": 1.1310344827586207, "grad_norm": 0.9796794535345553, "learning_rate": 1.8711864890172773e-05, "loss": 0.159, "step": 5412 }, { "epoch": 1.1312434691745037, "grad_norm": 0.9705673803847014, "learning_rate": 1.8711310980978664e-05, "loss": 0.1792, "step": 5413 }, { "epoch": 1.1314524555903867, "grad_norm": 0.9631022618618532, "learning_rate": 1.8710756960919356e-05, "loss": 0.1412, "step": 5414 }, { "epoch": 1.1316614420062696, "grad_norm": 1.0298095462959027, "learning_rate": 1.87102028300019e-05, "loss": 0.1432, "step": 5415 }, { "epoch": 1.1318704284221526, "grad_norm": 1.2985032555479519, "learning_rate": 1.870964858823335e-05, "loss": 0.1727, "step": 5416 }, { "epoch": 1.1320794148380355, "grad_norm": 1.0165678486145422, "learning_rate": 1.8709094235620755e-05, "loss": 0.1995, "step": 5417 }, { "epoch": 1.1322884012539185, "grad_norm": 1.1909953086015561, "learning_rate": 1.870853977217117e-05, "loss": 0.1771, "step": 5418 }, { "epoch": 1.1324973876698015, "grad_norm": 1.5161900065722753, "learning_rate": 1.8707985197891657e-05, "loss": 0.1868, "step": 5419 }, { "epoch": 1.1327063740856844, "grad_norm": 1.2134559102988338, "learning_rate": 1.8707430512789272e-05, "loss": 0.1829, "step": 5420 }, { "epoch": 1.1329153605015674, "grad_norm": 1.006876730599232, "learning_rate": 1.870687571687107e-05, "loss": 0.1543, "step": 5421 }, { "epoch": 1.1331243469174503, "grad_norm": 1.1037017908171818, "learning_rate": 1.8706320810144116e-05, "loss": 0.1588, "step": 5422 }, { "epoch": 1.1333333333333333, "grad_norm": 1.1950991647918974, "learning_rate": 1.8705765792615465e-05, "loss": 0.1474, "step": 5423 }, { "epoch": 1.1335423197492163, "grad_norm": 0.9387191383524458, "learning_rate": 1.870521066429219e-05, "loss": 0.1695, "step": 5424 }, { "epoch": 1.1337513061650992, "grad_norm": 1.1855855810244238, "learning_rate": 1.8704655425181355e-05, "loss": 0.1906, "step": 5425 }, { "epoch": 1.1339602925809822, "grad_norm": 1.3400504682128378, "learning_rate": 1.870410007529002e-05, "loss": 0.2121, "step": 5426 }, { "epoch": 1.1341692789968651, "grad_norm": 1.2886467103750843, "learning_rate": 1.8703544614625255e-05, "loss": 0.1805, "step": 5427 }, { "epoch": 1.134378265412748, "grad_norm": 1.297248567284647, "learning_rate": 1.8702989043194134e-05, "loss": 0.2137, "step": 5428 }, { "epoch": 1.134587251828631, "grad_norm": 0.9557073680798178, "learning_rate": 1.870243336100372e-05, "loss": 0.1438, "step": 5429 }, { "epoch": 1.134796238244514, "grad_norm": 1.473891816016757, "learning_rate": 1.870187756806109e-05, "loss": 0.1689, "step": 5430 }, { "epoch": 1.135005224660397, "grad_norm": 1.1331236374872027, "learning_rate": 1.8701321664373312e-05, "loss": 0.1608, "step": 5431 }, { "epoch": 1.13521421107628, "grad_norm": 1.293189584072697, "learning_rate": 1.870076564994747e-05, "loss": 0.1999, "step": 5432 }, { "epoch": 1.135423197492163, "grad_norm": 1.141301542819048, "learning_rate": 1.8700209524790632e-05, "loss": 0.1812, "step": 5433 }, { "epoch": 1.1356321839080459, "grad_norm": 1.008444546750338, "learning_rate": 1.869965328890988e-05, "loss": 0.2045, "step": 5434 }, { "epoch": 1.1358411703239288, "grad_norm": 0.9726443787567919, "learning_rate": 1.869909694231229e-05, "loss": 0.1727, "step": 5435 }, { "epoch": 1.1360501567398118, "grad_norm": 0.9495532925922149, "learning_rate": 1.869854048500494e-05, "loss": 0.1625, "step": 5436 }, { "epoch": 1.1362591431556948, "grad_norm": 0.9665570907512385, "learning_rate": 1.869798391699492e-05, "loss": 0.1816, "step": 5437 }, { "epoch": 1.1364681295715777, "grad_norm": 1.2380603632537173, "learning_rate": 1.8697427238289306e-05, "loss": 0.1735, "step": 5438 }, { "epoch": 1.136677115987461, "grad_norm": 1.2382914986764169, "learning_rate": 1.869687044889519e-05, "loss": 0.1956, "step": 5439 }, { "epoch": 1.1368861024033439, "grad_norm": 1.0285767894256146, "learning_rate": 1.8696313548819648e-05, "loss": 0.1723, "step": 5440 }, { "epoch": 1.1370950888192268, "grad_norm": 1.254546292581956, "learning_rate": 1.8695756538069773e-05, "loss": 0.1914, "step": 5441 }, { "epoch": 1.1373040752351098, "grad_norm": 1.074778693731034, "learning_rate": 1.8695199416652658e-05, "loss": 0.1635, "step": 5442 }, { "epoch": 1.1375130616509928, "grad_norm": 0.9811570910014777, "learning_rate": 1.8694642184575388e-05, "loss": 0.196, "step": 5443 }, { "epoch": 1.1377220480668757, "grad_norm": 1.2841840501212023, "learning_rate": 1.869408484184505e-05, "loss": 0.189, "step": 5444 }, { "epoch": 1.1379310344827587, "grad_norm": 0.9861973339870157, "learning_rate": 1.8693527388468748e-05, "loss": 0.2089, "step": 5445 }, { "epoch": 1.1381400208986416, "grad_norm": 1.1721142718507434, "learning_rate": 1.8692969824453567e-05, "loss": 0.1749, "step": 5446 }, { "epoch": 1.1383490073145246, "grad_norm": 1.1047536846661359, "learning_rate": 1.869241214980661e-05, "loss": 0.1557, "step": 5447 }, { "epoch": 1.1385579937304076, "grad_norm": 1.2436057200101212, "learning_rate": 1.8691854364534968e-05, "loss": 0.1807, "step": 5448 }, { "epoch": 1.1387669801462905, "grad_norm": 0.9451746460592305, "learning_rate": 1.8691296468645746e-05, "loss": 0.1609, "step": 5449 }, { "epoch": 1.1389759665621735, "grad_norm": 1.1240701491993301, "learning_rate": 1.8690738462146042e-05, "loss": 0.1737, "step": 5450 }, { "epoch": 1.1391849529780564, "grad_norm": 1.153488893154946, "learning_rate": 1.8690180345042955e-05, "loss": 0.1605, "step": 5451 }, { "epoch": 1.1393939393939394, "grad_norm": 1.031963003129291, "learning_rate": 1.868962211734359e-05, "loss": 0.1724, "step": 5452 }, { "epoch": 1.1396029258098224, "grad_norm": 1.0634674216579494, "learning_rate": 1.868906377905505e-05, "loss": 0.1876, "step": 5453 }, { "epoch": 1.1398119122257053, "grad_norm": 1.3004531616515782, "learning_rate": 1.868850533018444e-05, "loss": 0.183, "step": 5454 }, { "epoch": 1.1400208986415883, "grad_norm": 1.1072435424754035, "learning_rate": 1.868794677073887e-05, "loss": 0.2017, "step": 5455 }, { "epoch": 1.1402298850574712, "grad_norm": 1.1114169765662087, "learning_rate": 1.868738810072545e-05, "loss": 0.1654, "step": 5456 }, { "epoch": 1.1404388714733542, "grad_norm": 0.9817620368872806, "learning_rate": 1.868682932015128e-05, "loss": 0.1545, "step": 5457 }, { "epoch": 1.1406478578892372, "grad_norm": 1.1789840850465019, "learning_rate": 1.8686270429023484e-05, "loss": 0.1657, "step": 5458 }, { "epoch": 1.1408568443051201, "grad_norm": 1.0781630596150211, "learning_rate": 1.8685711427349165e-05, "loss": 0.1533, "step": 5459 }, { "epoch": 1.141065830721003, "grad_norm": 1.4132241704146011, "learning_rate": 1.8685152315135448e-05, "loss": 0.1702, "step": 5460 }, { "epoch": 1.141274817136886, "grad_norm": 1.0225394269042274, "learning_rate": 1.868459309238944e-05, "loss": 0.1554, "step": 5461 }, { "epoch": 1.141483803552769, "grad_norm": 1.2481326465945366, "learning_rate": 1.8684033759118255e-05, "loss": 0.1492, "step": 5462 }, { "epoch": 1.141692789968652, "grad_norm": 1.177365150805451, "learning_rate": 1.868347431532902e-05, "loss": 0.1783, "step": 5463 }, { "epoch": 1.141901776384535, "grad_norm": 1.2665053021651482, "learning_rate": 1.8682914761028856e-05, "loss": 0.1634, "step": 5464 }, { "epoch": 1.142110762800418, "grad_norm": 0.8613792900220715, "learning_rate": 1.8682355096224873e-05, "loss": 0.1621, "step": 5465 }, { "epoch": 1.1423197492163009, "grad_norm": 1.1473994899536488, "learning_rate": 1.86817953209242e-05, "loss": 0.1699, "step": 5466 }, { "epoch": 1.1425287356321838, "grad_norm": 1.0356694960432018, "learning_rate": 1.8681235435133963e-05, "loss": 0.1786, "step": 5467 }, { "epoch": 1.1427377220480668, "grad_norm": 1.2776034219339285, "learning_rate": 1.8680675438861286e-05, "loss": 0.2019, "step": 5468 }, { "epoch": 1.14294670846395, "grad_norm": 1.0441324787113795, "learning_rate": 1.8680115332113296e-05, "loss": 0.1452, "step": 5469 }, { "epoch": 1.143155694879833, "grad_norm": 1.1620552597446645, "learning_rate": 1.8679555114897123e-05, "loss": 0.1683, "step": 5470 }, { "epoch": 1.143364681295716, "grad_norm": 1.0723066720375618, "learning_rate": 1.8678994787219887e-05, "loss": 0.17, "step": 5471 }, { "epoch": 1.1435736677115989, "grad_norm": 0.9848285521659822, "learning_rate": 1.8678434349088732e-05, "loss": 0.1504, "step": 5472 }, { "epoch": 1.1437826541274818, "grad_norm": 1.0329384689963121, "learning_rate": 1.8677873800510783e-05, "loss": 0.1895, "step": 5473 }, { "epoch": 1.1439916405433648, "grad_norm": 0.9832141916411631, "learning_rate": 1.8677313141493175e-05, "loss": 0.1696, "step": 5474 }, { "epoch": 1.1442006269592477, "grad_norm": 1.0031766581996238, "learning_rate": 1.8676752372043046e-05, "loss": 0.1498, "step": 5475 }, { "epoch": 1.1444096133751307, "grad_norm": 1.209350326053985, "learning_rate": 1.867619149216753e-05, "loss": 0.168, "step": 5476 }, { "epoch": 1.1446185997910137, "grad_norm": 1.1067131566841346, "learning_rate": 1.8675630501873764e-05, "loss": 0.2069, "step": 5477 }, { "epoch": 1.1448275862068966, "grad_norm": 0.9882449787221016, "learning_rate": 1.867506940116889e-05, "loss": 0.1839, "step": 5478 }, { "epoch": 1.1450365726227796, "grad_norm": 1.9167350769181926, "learning_rate": 1.867450819006005e-05, "loss": 0.1752, "step": 5479 }, { "epoch": 1.1452455590386625, "grad_norm": 1.1904543450363003, "learning_rate": 1.867394686855438e-05, "loss": 0.1547, "step": 5480 }, { "epoch": 1.1454545454545455, "grad_norm": 1.1922255274508715, "learning_rate": 1.8673385436659032e-05, "loss": 0.1839, "step": 5481 }, { "epoch": 1.1456635318704285, "grad_norm": 1.7532141011521734, "learning_rate": 1.8672823894381145e-05, "loss": 0.1421, "step": 5482 }, { "epoch": 1.1458725182863114, "grad_norm": 1.1004247793890212, "learning_rate": 1.867226224172787e-05, "loss": 0.1714, "step": 5483 }, { "epoch": 1.1460815047021944, "grad_norm": 1.0069695703669193, "learning_rate": 1.8671700478706348e-05, "loss": 0.1512, "step": 5484 }, { "epoch": 1.1462904911180773, "grad_norm": 1.0110346046808398, "learning_rate": 1.867113860532374e-05, "loss": 0.1912, "step": 5485 }, { "epoch": 1.1464994775339603, "grad_norm": 1.2048507044641728, "learning_rate": 1.8670576621587188e-05, "loss": 0.1581, "step": 5486 }, { "epoch": 1.1467084639498433, "grad_norm": 0.9951795717554105, "learning_rate": 1.8670014527503843e-05, "loss": 0.1738, "step": 5487 }, { "epoch": 1.1469174503657262, "grad_norm": 1.4195803364008057, "learning_rate": 1.866945232308086e-05, "loss": 0.191, "step": 5488 }, { "epoch": 1.1471264367816092, "grad_norm": 1.1414382685644797, "learning_rate": 1.8668890008325395e-05, "loss": 0.1536, "step": 5489 }, { "epoch": 1.1473354231974922, "grad_norm": 0.9190246160206504, "learning_rate": 1.8668327583244606e-05, "loss": 0.1647, "step": 5490 }, { "epoch": 1.1475444096133751, "grad_norm": 1.240237352337935, "learning_rate": 1.866776504784565e-05, "loss": 0.1703, "step": 5491 }, { "epoch": 1.147753396029258, "grad_norm": 1.0925865680815028, "learning_rate": 1.866720240213569e-05, "loss": 0.1885, "step": 5492 }, { "epoch": 1.147962382445141, "grad_norm": 1.3369180969729617, "learning_rate": 1.8666639646121877e-05, "loss": 0.2052, "step": 5493 }, { "epoch": 1.148171368861024, "grad_norm": 0.9955547107694047, "learning_rate": 1.8666076779811378e-05, "loss": 0.1706, "step": 5494 }, { "epoch": 1.148380355276907, "grad_norm": 1.1737307542545607, "learning_rate": 1.866551380321136e-05, "loss": 0.1813, "step": 5495 }, { "epoch": 1.14858934169279, "grad_norm": 1.056963756784364, "learning_rate": 1.866495071632898e-05, "loss": 0.183, "step": 5496 }, { "epoch": 1.1487983281086729, "grad_norm": 1.056616095791483, "learning_rate": 1.8664387519171408e-05, "loss": 0.1506, "step": 5497 }, { "epoch": 1.1490073145245558, "grad_norm": 1.0956180672347091, "learning_rate": 1.8663824211745814e-05, "loss": 0.1805, "step": 5498 }, { "epoch": 1.1492163009404388, "grad_norm": 1.120841104386574, "learning_rate": 1.8663260794059366e-05, "loss": 0.1773, "step": 5499 }, { "epoch": 1.1494252873563218, "grad_norm": 1.0448508918916042, "learning_rate": 1.866269726611923e-05, "loss": 0.1716, "step": 5500 }, { "epoch": 1.1496342737722047, "grad_norm": 1.300464486702411, "learning_rate": 1.8662133627932584e-05, "loss": 0.2137, "step": 5501 }, { "epoch": 1.1498432601880877, "grad_norm": 1.2684263628262473, "learning_rate": 1.8661569879506598e-05, "loss": 0.1733, "step": 5502 }, { "epoch": 1.1500522466039707, "grad_norm": 0.8959957779523366, "learning_rate": 1.8661006020848445e-05, "loss": 0.1594, "step": 5503 }, { "epoch": 1.1502612330198536, "grad_norm": 1.0458207672526896, "learning_rate": 1.8660442051965305e-05, "loss": 0.1597, "step": 5504 }, { "epoch": 1.1504702194357366, "grad_norm": 1.1875627015749237, "learning_rate": 1.865987797286435e-05, "loss": 0.1836, "step": 5505 }, { "epoch": 1.1506792058516195, "grad_norm": 1.1053939449310395, "learning_rate": 1.8659313783552763e-05, "loss": 0.1982, "step": 5506 }, { "epoch": 1.1508881922675025, "grad_norm": 0.8720512778928835, "learning_rate": 1.8658749484037725e-05, "loss": 0.1295, "step": 5507 }, { "epoch": 1.1510971786833855, "grad_norm": 1.2249515511529183, "learning_rate": 1.8658185074326415e-05, "loss": 0.1818, "step": 5508 }, { "epoch": 1.1513061650992686, "grad_norm": 1.0047518915256195, "learning_rate": 1.8657620554426016e-05, "loss": 0.1909, "step": 5509 }, { "epoch": 1.1515151515151516, "grad_norm": 0.9059731567027471, "learning_rate": 1.8657055924343716e-05, "loss": 0.1431, "step": 5510 }, { "epoch": 1.1517241379310346, "grad_norm": 1.1100862915893845, "learning_rate": 1.8656491184086696e-05, "loss": 0.1645, "step": 5511 }, { "epoch": 1.1519331243469175, "grad_norm": 1.0993475414449096, "learning_rate": 1.8655926333662144e-05, "loss": 0.1554, "step": 5512 }, { "epoch": 1.1521421107628005, "grad_norm": 1.1068459709423326, "learning_rate": 1.865536137307725e-05, "loss": 0.1721, "step": 5513 }, { "epoch": 1.1523510971786834, "grad_norm": 1.0693387936668168, "learning_rate": 1.8654796302339207e-05, "loss": 0.1866, "step": 5514 }, { "epoch": 1.1525600835945664, "grad_norm": 1.8224016962499276, "learning_rate": 1.86542311214552e-05, "loss": 0.1926, "step": 5515 }, { "epoch": 1.1527690700104494, "grad_norm": 0.9205195231695265, "learning_rate": 1.8653665830432428e-05, "loss": 0.1438, "step": 5516 }, { "epoch": 1.1529780564263323, "grad_norm": 1.3037556076820298, "learning_rate": 1.8653100429278083e-05, "loss": 0.1391, "step": 5517 }, { "epoch": 1.1531870428422153, "grad_norm": 1.0697166978751003, "learning_rate": 1.8652534917999357e-05, "loss": 0.185, "step": 5518 }, { "epoch": 1.1533960292580983, "grad_norm": 1.0390922095399409, "learning_rate": 1.865196929660345e-05, "loss": 0.1716, "step": 5519 }, { "epoch": 1.1536050156739812, "grad_norm": 1.193533354093418, "learning_rate": 1.865140356509756e-05, "loss": 0.1807, "step": 5520 }, { "epoch": 1.1538140020898642, "grad_norm": 0.8858346450817359, "learning_rate": 1.8650837723488888e-05, "loss": 0.1503, "step": 5521 }, { "epoch": 1.1540229885057471, "grad_norm": 0.9672020832592666, "learning_rate": 1.8650271771784635e-05, "loss": 0.1688, "step": 5522 }, { "epoch": 1.15423197492163, "grad_norm": 1.0753542830647453, "learning_rate": 1.8649705709992004e-05, "loss": 0.1543, "step": 5523 }, { "epoch": 1.154440961337513, "grad_norm": 0.9761545834570423, "learning_rate": 1.8649139538118196e-05, "loss": 0.1958, "step": 5524 }, { "epoch": 1.154649947753396, "grad_norm": 1.0587866496186684, "learning_rate": 1.864857325617042e-05, "loss": 0.1907, "step": 5525 }, { "epoch": 1.154858934169279, "grad_norm": 1.0763789694351014, "learning_rate": 1.8648006864155882e-05, "loss": 0.1719, "step": 5526 }, { "epoch": 1.155067920585162, "grad_norm": 3.018882090936099, "learning_rate": 1.8647440362081787e-05, "loss": 0.1652, "step": 5527 }, { "epoch": 1.155276907001045, "grad_norm": 1.254003833807441, "learning_rate": 1.864687374995535e-05, "loss": 0.1781, "step": 5528 }, { "epoch": 1.1554858934169279, "grad_norm": 1.0398926825588448, "learning_rate": 1.864630702778378e-05, "loss": 0.1907, "step": 5529 }, { "epoch": 1.1556948798328108, "grad_norm": 0.969614888274448, "learning_rate": 1.8645740195574286e-05, "loss": 0.1762, "step": 5530 }, { "epoch": 1.1559038662486938, "grad_norm": 1.0370017592694785, "learning_rate": 1.8645173253334085e-05, "loss": 0.1575, "step": 5531 }, { "epoch": 1.1561128526645768, "grad_norm": 1.1218174710664943, "learning_rate": 1.864460620107039e-05, "loss": 0.1401, "step": 5532 }, { "epoch": 1.1563218390804597, "grad_norm": 1.3699604448275673, "learning_rate": 1.8644039038790422e-05, "loss": 0.1682, "step": 5533 }, { "epoch": 1.1565308254963427, "grad_norm": 1.1129834401816565, "learning_rate": 1.8643471766501398e-05, "loss": 0.1632, "step": 5534 }, { "epoch": 1.1567398119122256, "grad_norm": 1.0266352221173571, "learning_rate": 1.8642904384210535e-05, "loss": 0.1467, "step": 5535 }, { "epoch": 1.1569487983281086, "grad_norm": 1.2206740562138814, "learning_rate": 1.864233689192505e-05, "loss": 0.1889, "step": 5536 }, { "epoch": 1.1571577847439916, "grad_norm": 1.0943273032471763, "learning_rate": 1.8641769289652176e-05, "loss": 0.1682, "step": 5537 }, { "epoch": 1.1573667711598745, "grad_norm": 1.142391690908667, "learning_rate": 1.8641201577399127e-05, "loss": 0.1977, "step": 5538 }, { "epoch": 1.1575757575757575, "grad_norm": 1.004659272413406, "learning_rate": 1.8640633755173135e-05, "loss": 0.1762, "step": 5539 }, { "epoch": 1.1577847439916407, "grad_norm": 1.212452122364446, "learning_rate": 1.8640065822981422e-05, "loss": 0.1892, "step": 5540 }, { "epoch": 1.1579937304075236, "grad_norm": 0.9655720639392851, "learning_rate": 1.8639497780831215e-05, "loss": 0.1466, "step": 5541 }, { "epoch": 1.1582027168234066, "grad_norm": 1.0786196829949735, "learning_rate": 1.8638929628729746e-05, "loss": 0.1728, "step": 5542 }, { "epoch": 1.1584117032392895, "grad_norm": 0.9439272309332184, "learning_rate": 1.8638361366684247e-05, "loss": 0.1609, "step": 5543 }, { "epoch": 1.1586206896551725, "grad_norm": 1.1511020119743585, "learning_rate": 1.8637792994701945e-05, "loss": 0.183, "step": 5544 }, { "epoch": 1.1588296760710555, "grad_norm": 1.17650194657301, "learning_rate": 1.8637224512790078e-05, "loss": 0.1577, "step": 5545 }, { "epoch": 1.1590386624869384, "grad_norm": 0.9789775693871018, "learning_rate": 1.863665592095588e-05, "loss": 0.1842, "step": 5546 }, { "epoch": 1.1592476489028214, "grad_norm": 1.1590891137662274, "learning_rate": 1.863608721920658e-05, "loss": 0.181, "step": 5547 }, { "epoch": 1.1594566353187044, "grad_norm": 1.0893184411703212, "learning_rate": 1.863551840754943e-05, "loss": 0.1209, "step": 5548 }, { "epoch": 1.1596656217345873, "grad_norm": 1.298521641885743, "learning_rate": 1.863494948599166e-05, "loss": 0.1668, "step": 5549 }, { "epoch": 1.1598746081504703, "grad_norm": 1.0593914189951594, "learning_rate": 1.8634380454540503e-05, "loss": 0.1568, "step": 5550 }, { "epoch": 1.1600835945663532, "grad_norm": 1.2289763119791075, "learning_rate": 1.8633811313203217e-05, "loss": 0.1569, "step": 5551 }, { "epoch": 1.1602925809822362, "grad_norm": 1.0104645454976329, "learning_rate": 1.8633242061987035e-05, "loss": 0.1644, "step": 5552 }, { "epoch": 1.1605015673981192, "grad_norm": 1.252598761213899, "learning_rate": 1.8632672700899207e-05, "loss": 0.1851, "step": 5553 }, { "epoch": 1.1607105538140021, "grad_norm": 1.1330643226268329, "learning_rate": 1.8632103229946974e-05, "loss": 0.1526, "step": 5554 }, { "epoch": 1.160919540229885, "grad_norm": 1.2538404321266137, "learning_rate": 1.8631533649137587e-05, "loss": 0.1503, "step": 5555 }, { "epoch": 1.161128526645768, "grad_norm": 1.0718419494005824, "learning_rate": 1.863096395847829e-05, "loss": 0.1937, "step": 5556 }, { "epoch": 1.161337513061651, "grad_norm": 1.1612138505201015, "learning_rate": 1.8630394157976338e-05, "loss": 0.1366, "step": 5557 }, { "epoch": 1.161546499477534, "grad_norm": 1.21048270039309, "learning_rate": 1.862982424763898e-05, "loss": 0.1853, "step": 5558 }, { "epoch": 1.161755485893417, "grad_norm": 1.1604401114950764, "learning_rate": 1.862925422747347e-05, "loss": 0.1608, "step": 5559 }, { "epoch": 1.1619644723093, "grad_norm": 0.981619341561139, "learning_rate": 1.8628684097487065e-05, "loss": 0.154, "step": 5560 }, { "epoch": 1.1621734587251829, "grad_norm": 1.1440191682935203, "learning_rate": 1.8628113857687017e-05, "loss": 0.1744, "step": 5561 }, { "epoch": 1.1623824451410658, "grad_norm": 1.1885490393824354, "learning_rate": 1.8627543508080586e-05, "loss": 0.1659, "step": 5562 }, { "epoch": 1.1625914315569488, "grad_norm": 1.0851774565460115, "learning_rate": 1.862697304867503e-05, "loss": 0.2061, "step": 5563 }, { "epoch": 1.1628004179728317, "grad_norm": 1.2847831739432862, "learning_rate": 1.8626402479477607e-05, "loss": 0.1999, "step": 5564 }, { "epoch": 1.1630094043887147, "grad_norm": 1.099427779638492, "learning_rate": 1.8625831800495577e-05, "loss": 0.1521, "step": 5565 }, { "epoch": 1.1632183908045977, "grad_norm": 1.319044557516004, "learning_rate": 1.8625261011736207e-05, "loss": 0.1918, "step": 5566 }, { "epoch": 1.1634273772204806, "grad_norm": 1.2893892127492563, "learning_rate": 1.862469011320676e-05, "loss": 0.1776, "step": 5567 }, { "epoch": 1.1636363636363636, "grad_norm": 1.142462129541462, "learning_rate": 1.8624119104914502e-05, "loss": 0.1302, "step": 5568 }, { "epoch": 1.1638453500522465, "grad_norm": 1.0007272874502753, "learning_rate": 1.8623547986866697e-05, "loss": 0.1621, "step": 5569 }, { "epoch": 1.1640543364681295, "grad_norm": 1.25662304394969, "learning_rate": 1.8622976759070617e-05, "loss": 0.1894, "step": 5570 }, { "epoch": 1.1642633228840125, "grad_norm": 1.1643777033626581, "learning_rate": 1.862240542153353e-05, "loss": 0.1852, "step": 5571 }, { "epoch": 1.1644723092998954, "grad_norm": 1.0963195630879559, "learning_rate": 1.8621833974262706e-05, "loss": 0.1767, "step": 5572 }, { "epoch": 1.1646812957157784, "grad_norm": 1.0520656604963103, "learning_rate": 1.862126241726542e-05, "loss": 0.1721, "step": 5573 }, { "epoch": 1.1648902821316613, "grad_norm": 0.8525568657512514, "learning_rate": 1.8620690750548944e-05, "loss": 0.1672, "step": 5574 }, { "epoch": 1.1650992685475443, "grad_norm": 0.9654757063375295, "learning_rate": 1.862011897412056e-05, "loss": 0.1943, "step": 5575 }, { "epoch": 1.1653082549634273, "grad_norm": 1.0894304044651664, "learning_rate": 1.861954708798753e-05, "loss": 0.2196, "step": 5576 }, { "epoch": 1.1655172413793102, "grad_norm": 1.0425351363233664, "learning_rate": 1.8618975092157148e-05, "loss": 0.1835, "step": 5577 }, { "epoch": 1.1657262277951932, "grad_norm": 1.068622713629137, "learning_rate": 1.8618402986636686e-05, "loss": 0.1861, "step": 5578 }, { "epoch": 1.1659352142110764, "grad_norm": 1.1177520715820095, "learning_rate": 1.8617830771433426e-05, "loss": 0.1731, "step": 5579 }, { "epoch": 1.1661442006269593, "grad_norm": 1.0494102497584683, "learning_rate": 1.8617258446554646e-05, "loss": 0.1894, "step": 5580 }, { "epoch": 1.1663531870428423, "grad_norm": 0.9418605154348122, "learning_rate": 1.8616686012007636e-05, "loss": 0.1861, "step": 5581 }, { "epoch": 1.1665621734587253, "grad_norm": 1.1267755173091383, "learning_rate": 1.861611346779968e-05, "loss": 0.1647, "step": 5582 }, { "epoch": 1.1667711598746082, "grad_norm": 1.1619043910840439, "learning_rate": 1.8615540813938063e-05, "loss": 0.1752, "step": 5583 }, { "epoch": 1.1669801462904912, "grad_norm": 1.3680492358898815, "learning_rate": 1.8614968050430075e-05, "loss": 0.1686, "step": 5584 }, { "epoch": 1.1671891327063741, "grad_norm": 1.2318676482125466, "learning_rate": 1.8614395177283e-05, "loss": 0.1718, "step": 5585 }, { "epoch": 1.167398119122257, "grad_norm": 1.8559077264345178, "learning_rate": 1.8613822194504133e-05, "loss": 0.1686, "step": 5586 }, { "epoch": 1.16760710553814, "grad_norm": 1.0838476889684425, "learning_rate": 1.861324910210077e-05, "loss": 0.1981, "step": 5587 }, { "epoch": 1.167816091954023, "grad_norm": 1.123672941414647, "learning_rate": 1.86126759000802e-05, "loss": 0.199, "step": 5588 }, { "epoch": 1.168025078369906, "grad_norm": 1.0692603663449203, "learning_rate": 1.8612102588449712e-05, "loss": 0.176, "step": 5589 }, { "epoch": 1.168234064785789, "grad_norm": 1.1542813184569556, "learning_rate": 1.8611529167216612e-05, "loss": 0.1876, "step": 5590 }, { "epoch": 1.168443051201672, "grad_norm": 1.2062605585917574, "learning_rate": 1.861095563638819e-05, "loss": 0.1857, "step": 5591 }, { "epoch": 1.1686520376175549, "grad_norm": 1.0756477792451735, "learning_rate": 1.8610381995971757e-05, "loss": 0.1932, "step": 5592 }, { "epoch": 1.1688610240334378, "grad_norm": 1.056341644020949, "learning_rate": 1.8609808245974598e-05, "loss": 0.1575, "step": 5593 }, { "epoch": 1.1690700104493208, "grad_norm": 1.2184298225900392, "learning_rate": 1.8609234386404024e-05, "loss": 0.1946, "step": 5594 }, { "epoch": 1.1692789968652038, "grad_norm": 1.1618071368971106, "learning_rate": 1.860866041726734e-05, "loss": 0.155, "step": 5595 }, { "epoch": 1.1694879832810867, "grad_norm": 1.1810208294991917, "learning_rate": 1.8608086338571846e-05, "loss": 0.1876, "step": 5596 }, { "epoch": 1.1696969696969697, "grad_norm": 1.1829851802148263, "learning_rate": 1.860751215032485e-05, "loss": 0.1628, "step": 5597 }, { "epoch": 1.1699059561128526, "grad_norm": 1.050368966578473, "learning_rate": 1.8606937852533655e-05, "loss": 0.1737, "step": 5598 }, { "epoch": 1.1701149425287356, "grad_norm": 1.1732541198487596, "learning_rate": 1.860636344520558e-05, "loss": 0.1605, "step": 5599 }, { "epoch": 1.1703239289446186, "grad_norm": 0.9404409993592769, "learning_rate": 1.860578892834792e-05, "loss": 0.1419, "step": 5600 }, { "epoch": 1.1705329153605015, "grad_norm": 1.1384987345498596, "learning_rate": 1.8605214301968006e-05, "loss": 0.1667, "step": 5601 }, { "epoch": 1.1707419017763845, "grad_norm": 1.1712714970006946, "learning_rate": 1.8604639566073133e-05, "loss": 0.1792, "step": 5602 }, { "epoch": 1.1709508881922674, "grad_norm": 1.0703275308070404, "learning_rate": 1.8604064720670626e-05, "loss": 0.2023, "step": 5603 }, { "epoch": 1.1711598746081504, "grad_norm": 1.0927107004951762, "learning_rate": 1.8603489765767795e-05, "loss": 0.1552, "step": 5604 }, { "epoch": 1.1713688610240334, "grad_norm": 1.056318582601599, "learning_rate": 1.8602914701371967e-05, "loss": 0.1698, "step": 5605 }, { "epoch": 1.1715778474399163, "grad_norm": 1.075076011119713, "learning_rate": 1.860233952749045e-05, "loss": 0.1941, "step": 5606 }, { "epoch": 1.1717868338557993, "grad_norm": 1.0186981933256123, "learning_rate": 1.8601764244130566e-05, "loss": 0.1676, "step": 5607 }, { "epoch": 1.1719958202716823, "grad_norm": 1.0693658806363309, "learning_rate": 1.8601188851299636e-05, "loss": 0.1587, "step": 5608 }, { "epoch": 1.1722048066875652, "grad_norm": 1.2998513712046733, "learning_rate": 1.860061334900499e-05, "loss": 0.2001, "step": 5609 }, { "epoch": 1.1724137931034484, "grad_norm": 1.0869258672490782, "learning_rate": 1.8600037737253943e-05, "loss": 0.1715, "step": 5610 }, { "epoch": 1.1726227795193314, "grad_norm": 1.082938043606969, "learning_rate": 1.8599462016053825e-05, "loss": 0.1705, "step": 5611 }, { "epoch": 1.1728317659352143, "grad_norm": 1.5906105929427248, "learning_rate": 1.8598886185411966e-05, "loss": 0.1968, "step": 5612 }, { "epoch": 1.1730407523510973, "grad_norm": 1.1641332007428724, "learning_rate": 1.859831024533569e-05, "loss": 0.1576, "step": 5613 }, { "epoch": 1.1732497387669802, "grad_norm": 1.1295850162454308, "learning_rate": 1.8597734195832323e-05, "loss": 0.1878, "step": 5614 }, { "epoch": 1.1734587251828632, "grad_norm": 1.025658701310052, "learning_rate": 1.8597158036909207e-05, "loss": 0.1788, "step": 5615 }, { "epoch": 1.1736677115987462, "grad_norm": 1.1018687578036777, "learning_rate": 1.8596581768573667e-05, "loss": 0.1872, "step": 5616 }, { "epoch": 1.1738766980146291, "grad_norm": 1.2482375985714687, "learning_rate": 1.8596005390833033e-05, "loss": 0.2053, "step": 5617 }, { "epoch": 1.174085684430512, "grad_norm": 1.2528379754022, "learning_rate": 1.8595428903694652e-05, "loss": 0.1797, "step": 5618 }, { "epoch": 1.174294670846395, "grad_norm": 1.0015694613017714, "learning_rate": 1.859485230716585e-05, "loss": 0.1722, "step": 5619 }, { "epoch": 1.174503657262278, "grad_norm": 1.0030364089274282, "learning_rate": 1.8594275601253972e-05, "loss": 0.177, "step": 5620 }, { "epoch": 1.174712643678161, "grad_norm": 1.0065259153819188, "learning_rate": 1.8593698785966353e-05, "loss": 0.1944, "step": 5621 }, { "epoch": 1.174921630094044, "grad_norm": 1.0396082957779114, "learning_rate": 1.8593121861310335e-05, "loss": 0.1799, "step": 5622 }, { "epoch": 1.175130616509927, "grad_norm": 1.0502205372676927, "learning_rate": 1.8592544827293264e-05, "loss": 0.1767, "step": 5623 }, { "epoch": 1.1753396029258099, "grad_norm": 1.0199037831194242, "learning_rate": 1.859196768392248e-05, "loss": 0.1856, "step": 5624 }, { "epoch": 1.1755485893416928, "grad_norm": 1.1639896140156085, "learning_rate": 1.8591390431205326e-05, "loss": 0.1461, "step": 5625 }, { "epoch": 1.1757575757575758, "grad_norm": 0.9127974901462858, "learning_rate": 1.8590813069149155e-05, "loss": 0.1552, "step": 5626 }, { "epoch": 1.1759665621734587, "grad_norm": 1.040936398891524, "learning_rate": 1.859023559776131e-05, "loss": 0.1587, "step": 5627 }, { "epoch": 1.1761755485893417, "grad_norm": 1.281997696931004, "learning_rate": 1.8589658017049138e-05, "loss": 0.1642, "step": 5628 }, { "epoch": 1.1763845350052247, "grad_norm": 1.1573359943747505, "learning_rate": 1.8589080327019997e-05, "loss": 0.1659, "step": 5629 }, { "epoch": 1.1765935214211076, "grad_norm": 1.317639040379735, "learning_rate": 1.8588502527681233e-05, "loss": 0.1634, "step": 5630 }, { "epoch": 1.1768025078369906, "grad_norm": 1.043707915796812, "learning_rate": 1.8587924619040204e-05, "loss": 0.1482, "step": 5631 }, { "epoch": 1.1770114942528735, "grad_norm": 1.3208961653975364, "learning_rate": 1.8587346601104256e-05, "loss": 0.1644, "step": 5632 }, { "epoch": 1.1772204806687565, "grad_norm": 0.9637562337102137, "learning_rate": 1.858676847388076e-05, "loss": 0.1671, "step": 5633 }, { "epoch": 1.1774294670846395, "grad_norm": 0.885240989693574, "learning_rate": 1.8586190237377058e-05, "loss": 0.1528, "step": 5634 }, { "epoch": 1.1776384535005224, "grad_norm": 1.12627118935762, "learning_rate": 1.8585611891600518e-05, "loss": 0.1752, "step": 5635 }, { "epoch": 1.1778474399164054, "grad_norm": 1.2263925356837866, "learning_rate": 1.85850334365585e-05, "loss": 0.1766, "step": 5636 }, { "epoch": 1.1780564263322884, "grad_norm": 0.9923384537454483, "learning_rate": 1.858445487225836e-05, "loss": 0.1722, "step": 5637 }, { "epoch": 1.1782654127481713, "grad_norm": 1.3337591053222984, "learning_rate": 1.858387619870747e-05, "loss": 0.2056, "step": 5638 }, { "epoch": 1.1784743991640543, "grad_norm": 1.2610291848419588, "learning_rate": 1.8583297415913185e-05, "loss": 0.1745, "step": 5639 }, { "epoch": 1.1786833855799372, "grad_norm": 1.1564423768531376, "learning_rate": 1.8582718523882878e-05, "loss": 0.1558, "step": 5640 }, { "epoch": 1.1788923719958202, "grad_norm": 1.0014868266642492, "learning_rate": 1.8582139522623916e-05, "loss": 0.1488, "step": 5641 }, { "epoch": 1.1791013584117032, "grad_norm": 1.1386480532745538, "learning_rate": 1.8581560412143663e-05, "loss": 0.1559, "step": 5642 }, { "epoch": 1.1793103448275861, "grad_norm": 1.4203292043080058, "learning_rate": 1.8580981192449492e-05, "loss": 0.1686, "step": 5643 }, { "epoch": 1.179519331243469, "grad_norm": 1.1733208862401705, "learning_rate": 1.8580401863548774e-05, "loss": 0.1583, "step": 5644 }, { "epoch": 1.179728317659352, "grad_norm": 1.1238719680993903, "learning_rate": 1.8579822425448882e-05, "loss": 0.1918, "step": 5645 }, { "epoch": 1.179937304075235, "grad_norm": 1.1815313967415646, "learning_rate": 1.857924287815719e-05, "loss": 0.1762, "step": 5646 }, { "epoch": 1.180146290491118, "grad_norm": 0.9519787684591222, "learning_rate": 1.8578663221681076e-05, "loss": 0.1566, "step": 5647 }, { "epoch": 1.180355276907001, "grad_norm": 1.0830267998417213, "learning_rate": 1.8578083456027913e-05, "loss": 0.1767, "step": 5648 }, { "epoch": 1.1805642633228839, "grad_norm": 1.0493322148954478, "learning_rate": 1.857750358120508e-05, "loss": 0.1747, "step": 5649 }, { "epoch": 1.180773249738767, "grad_norm": 1.0228247706481606, "learning_rate": 1.8576923597219964e-05, "loss": 0.1968, "step": 5650 }, { "epoch": 1.18098223615465, "grad_norm": 1.0098903673753248, "learning_rate": 1.8576343504079937e-05, "loss": 0.1949, "step": 5651 }, { "epoch": 1.181191222570533, "grad_norm": 0.9629340260385821, "learning_rate": 1.8575763301792386e-05, "loss": 0.1809, "step": 5652 }, { "epoch": 1.181400208986416, "grad_norm": 0.9813627279539094, "learning_rate": 1.8575182990364693e-05, "loss": 0.1694, "step": 5653 }, { "epoch": 1.181609195402299, "grad_norm": 1.0916286871180436, "learning_rate": 1.8574602569804246e-05, "loss": 0.1773, "step": 5654 }, { "epoch": 1.1818181818181819, "grad_norm": 1.167798253427481, "learning_rate": 1.8574022040118427e-05, "loss": 0.1888, "step": 5655 }, { "epoch": 1.1820271682340648, "grad_norm": 1.3180280550661747, "learning_rate": 1.8573441401314632e-05, "loss": 0.188, "step": 5656 }, { "epoch": 1.1822361546499478, "grad_norm": 1.2140765027277345, "learning_rate": 1.8572860653400243e-05, "loss": 0.1865, "step": 5657 }, { "epoch": 1.1824451410658308, "grad_norm": 0.9231803428774706, "learning_rate": 1.8572279796382655e-05, "loss": 0.1951, "step": 5658 }, { "epoch": 1.1826541274817137, "grad_norm": 1.0005645481710321, "learning_rate": 1.857169883026926e-05, "loss": 0.1708, "step": 5659 }, { "epoch": 1.1828631138975967, "grad_norm": 1.2093551904986621, "learning_rate": 1.857111775506745e-05, "loss": 0.2003, "step": 5660 }, { "epoch": 1.1830721003134796, "grad_norm": 1.1040993741367193, "learning_rate": 1.857053657078462e-05, "loss": 0.1742, "step": 5661 }, { "epoch": 1.1832810867293626, "grad_norm": 1.0994499475593444, "learning_rate": 1.856995527742817e-05, "loss": 0.2089, "step": 5662 }, { "epoch": 1.1834900731452456, "grad_norm": 0.9367787561686443, "learning_rate": 1.856937387500549e-05, "loss": 0.1587, "step": 5663 }, { "epoch": 1.1836990595611285, "grad_norm": 1.0589876040391317, "learning_rate": 1.8568792363523992e-05, "loss": 0.1891, "step": 5664 }, { "epoch": 1.1839080459770115, "grad_norm": 1.0766894351329184, "learning_rate": 1.8568210742991067e-05, "loss": 0.2077, "step": 5665 }, { "epoch": 1.1841170323928945, "grad_norm": 1.0358645421753507, "learning_rate": 1.8567629013414116e-05, "loss": 0.1932, "step": 5666 }, { "epoch": 1.1843260188087774, "grad_norm": 1.153021641239359, "learning_rate": 1.8567047174800548e-05, "loss": 0.1484, "step": 5667 }, { "epoch": 1.1845350052246604, "grad_norm": 1.0165649837861437, "learning_rate": 1.8566465227157767e-05, "loss": 0.2038, "step": 5668 }, { "epoch": 1.1847439916405433, "grad_norm": 1.1685851359980788, "learning_rate": 1.8565883170493175e-05, "loss": 0.2017, "step": 5669 }, { "epoch": 1.1849529780564263, "grad_norm": 1.1802443922812054, "learning_rate": 1.8565301004814186e-05, "loss": 0.1914, "step": 5670 }, { "epoch": 1.1851619644723093, "grad_norm": 1.2370500741112282, "learning_rate": 1.8564718730128202e-05, "loss": 0.148, "step": 5671 }, { "epoch": 1.1853709508881922, "grad_norm": 1.1037926621428142, "learning_rate": 1.8564136346442638e-05, "loss": 0.1856, "step": 5672 }, { "epoch": 1.1855799373040752, "grad_norm": 1.0871245390234106, "learning_rate": 1.8563553853764905e-05, "loss": 0.1869, "step": 5673 }, { "epoch": 1.1857889237199581, "grad_norm": 0.954318535293617, "learning_rate": 1.8562971252102415e-05, "loss": 0.1658, "step": 5674 }, { "epoch": 1.185997910135841, "grad_norm": 1.0602546349093647, "learning_rate": 1.8562388541462584e-05, "loss": 0.1514, "step": 5675 }, { "epoch": 1.186206896551724, "grad_norm": 0.9551736818668162, "learning_rate": 1.8561805721852827e-05, "loss": 0.1706, "step": 5676 }, { "epoch": 1.186415882967607, "grad_norm": 0.9916981410205983, "learning_rate": 1.856122279328056e-05, "loss": 0.1799, "step": 5677 }, { "epoch": 1.18662486938349, "grad_norm": 0.9544615408770163, "learning_rate": 1.8560639755753204e-05, "loss": 0.1839, "step": 5678 }, { "epoch": 1.186833855799373, "grad_norm": 1.1849447700497313, "learning_rate": 1.856005660927818e-05, "loss": 0.1887, "step": 5679 }, { "epoch": 1.1870428422152561, "grad_norm": 1.1115517854901023, "learning_rate": 1.8559473353862905e-05, "loss": 0.1787, "step": 5680 }, { "epoch": 1.187251828631139, "grad_norm": 1.2795157613337949, "learning_rate": 1.8558889989514805e-05, "loss": 0.1399, "step": 5681 }, { "epoch": 1.187460815047022, "grad_norm": 1.5285141956797494, "learning_rate": 1.8558306516241304e-05, "loss": 0.1673, "step": 5682 }, { "epoch": 1.187669801462905, "grad_norm": 1.0214146886376334, "learning_rate": 1.8557722934049828e-05, "loss": 0.1666, "step": 5683 }, { "epoch": 1.187878787878788, "grad_norm": 1.3426602344281229, "learning_rate": 1.85571392429478e-05, "loss": 0.1904, "step": 5684 }, { "epoch": 1.188087774294671, "grad_norm": 0.9862094929979452, "learning_rate": 1.8556555442942654e-05, "loss": 0.1662, "step": 5685 }, { "epoch": 1.188296760710554, "grad_norm": 1.0569030333436502, "learning_rate": 1.855597153404182e-05, "loss": 0.1479, "step": 5686 }, { "epoch": 1.1885057471264369, "grad_norm": 1.1516297496595933, "learning_rate": 1.8555387516252727e-05, "loss": 0.1511, "step": 5687 }, { "epoch": 1.1887147335423198, "grad_norm": 1.1823281531744565, "learning_rate": 1.85548033895828e-05, "loss": 0.1911, "step": 5688 }, { "epoch": 1.1889237199582028, "grad_norm": 1.1276521239409394, "learning_rate": 1.8554219154039485e-05, "loss": 0.2034, "step": 5689 }, { "epoch": 1.1891327063740857, "grad_norm": 1.2448008143186853, "learning_rate": 1.8553634809630213e-05, "loss": 0.2098, "step": 5690 }, { "epoch": 1.1893416927899687, "grad_norm": 1.0656011500134526, "learning_rate": 1.855305035636242e-05, "loss": 0.1491, "step": 5691 }, { "epoch": 1.1895506792058517, "grad_norm": 1.0623849024213503, "learning_rate": 1.8552465794243543e-05, "loss": 0.1905, "step": 5692 }, { "epoch": 1.1897596656217346, "grad_norm": 1.2519792265756169, "learning_rate": 1.8551881123281026e-05, "loss": 0.1728, "step": 5693 }, { "epoch": 1.1899686520376176, "grad_norm": 0.9600754137648589, "learning_rate": 1.8551296343482304e-05, "loss": 0.1626, "step": 5694 }, { "epoch": 1.1901776384535006, "grad_norm": 1.1205568786327536, "learning_rate": 1.855071145485482e-05, "loss": 0.175, "step": 5695 }, { "epoch": 1.1903866248693835, "grad_norm": 0.9523824354131591, "learning_rate": 1.8550126457406023e-05, "loss": 0.1791, "step": 5696 }, { "epoch": 1.1905956112852665, "grad_norm": 0.9820947459751418, "learning_rate": 1.854954135114335e-05, "loss": 0.1653, "step": 5697 }, { "epoch": 1.1908045977011494, "grad_norm": 1.2120879247093101, "learning_rate": 1.8548956136074255e-05, "loss": 0.1501, "step": 5698 }, { "epoch": 1.1910135841170324, "grad_norm": 1.093938381397702, "learning_rate": 1.8548370812206183e-05, "loss": 0.1519, "step": 5699 }, { "epoch": 1.1912225705329154, "grad_norm": 1.3102557895219473, "learning_rate": 1.8547785379546585e-05, "loss": 0.1838, "step": 5700 }, { "epoch": 1.1914315569487983, "grad_norm": 0.9652856918354752, "learning_rate": 1.8547199838102904e-05, "loss": 0.1703, "step": 5701 }, { "epoch": 1.1916405433646813, "grad_norm": 1.1455366087814902, "learning_rate": 1.85466141878826e-05, "loss": 0.1797, "step": 5702 }, { "epoch": 1.1918495297805642, "grad_norm": 1.1646218698211277, "learning_rate": 1.8546028428893125e-05, "loss": 0.166, "step": 5703 }, { "epoch": 1.1920585161964472, "grad_norm": 1.0719723090055115, "learning_rate": 1.854544256114193e-05, "loss": 0.1482, "step": 5704 }, { "epoch": 1.1922675026123302, "grad_norm": 0.9612254020158321, "learning_rate": 1.8544856584636474e-05, "loss": 0.1703, "step": 5705 }, { "epoch": 1.1924764890282131, "grad_norm": 1.128090601648474, "learning_rate": 1.8544270499384214e-05, "loss": 0.1776, "step": 5706 }, { "epoch": 1.192685475444096, "grad_norm": 1.1564461436388338, "learning_rate": 1.854368430539261e-05, "loss": 0.1778, "step": 5707 }, { "epoch": 1.192894461859979, "grad_norm": 1.0958993488627475, "learning_rate": 1.8543098002669123e-05, "loss": 0.1519, "step": 5708 }, { "epoch": 1.193103448275862, "grad_norm": 1.0852802799192691, "learning_rate": 1.854251159122121e-05, "loss": 0.1935, "step": 5709 }, { "epoch": 1.193312434691745, "grad_norm": 1.2236242766759287, "learning_rate": 1.8541925071056335e-05, "loss": 0.1519, "step": 5710 }, { "epoch": 1.193521421107628, "grad_norm": 0.9954314155393919, "learning_rate": 1.8541338442181965e-05, "loss": 0.1713, "step": 5711 }, { "epoch": 1.193730407523511, "grad_norm": 1.160464888051185, "learning_rate": 1.8540751704605566e-05, "loss": 0.1673, "step": 5712 }, { "epoch": 1.1939393939393939, "grad_norm": 0.9414033666797396, "learning_rate": 1.8540164858334604e-05, "loss": 0.2132, "step": 5713 }, { "epoch": 1.1941483803552768, "grad_norm": 1.0111343397801917, "learning_rate": 1.853957790337655e-05, "loss": 0.1521, "step": 5714 }, { "epoch": 1.1943573667711598, "grad_norm": 1.0311351571898204, "learning_rate": 1.8538990839738868e-05, "loss": 0.1944, "step": 5715 }, { "epoch": 1.1945663531870427, "grad_norm": 1.0542903779806463, "learning_rate": 1.8538403667429033e-05, "loss": 0.1988, "step": 5716 }, { "epoch": 1.1947753396029257, "grad_norm": 0.9869283892425702, "learning_rate": 1.8537816386454518e-05, "loss": 0.1777, "step": 5717 }, { "epoch": 1.1949843260188087, "grad_norm": 1.1663504406271605, "learning_rate": 1.8537228996822795e-05, "loss": 0.1806, "step": 5718 }, { "epoch": 1.1951933124346916, "grad_norm": 1.0368028551726058, "learning_rate": 1.8536641498541343e-05, "loss": 0.1695, "step": 5719 }, { "epoch": 1.1954022988505748, "grad_norm": 1.0336236036988793, "learning_rate": 1.853605389161764e-05, "loss": 0.1914, "step": 5720 }, { "epoch": 1.1956112852664578, "grad_norm": 1.011957536563863, "learning_rate": 1.8535466176059155e-05, "loss": 0.1777, "step": 5721 }, { "epoch": 1.1958202716823407, "grad_norm": 1.1045354564649756, "learning_rate": 1.8534878351873377e-05, "loss": 0.2001, "step": 5722 }, { "epoch": 1.1960292580982237, "grad_norm": 1.1713285399888462, "learning_rate": 1.8534290419067784e-05, "loss": 0.1814, "step": 5723 }, { "epoch": 1.1962382445141067, "grad_norm": 0.9817491436192977, "learning_rate": 1.8533702377649857e-05, "loss": 0.1926, "step": 5724 }, { "epoch": 1.1964472309299896, "grad_norm": 1.0868618904003218, "learning_rate": 1.8533114227627083e-05, "loss": 0.177, "step": 5725 }, { "epoch": 1.1966562173458726, "grad_norm": 0.8669288249592964, "learning_rate": 1.8532525969006943e-05, "loss": 0.1257, "step": 5726 }, { "epoch": 1.1968652037617555, "grad_norm": 0.9638633434043951, "learning_rate": 1.8531937601796925e-05, "loss": 0.176, "step": 5727 }, { "epoch": 1.1970741901776385, "grad_norm": 1.0501441483851173, "learning_rate": 1.8531349126004517e-05, "loss": 0.1765, "step": 5728 }, { "epoch": 1.1972831765935215, "grad_norm": 1.2237289521308279, "learning_rate": 1.853076054163721e-05, "loss": 0.1736, "step": 5729 }, { "epoch": 1.1974921630094044, "grad_norm": 0.8974512676482858, "learning_rate": 1.8530171848702495e-05, "loss": 0.1612, "step": 5730 }, { "epoch": 1.1977011494252874, "grad_norm": 1.0807596874055183, "learning_rate": 1.852958304720786e-05, "loss": 0.1813, "step": 5731 }, { "epoch": 1.1979101358411703, "grad_norm": 1.1401993444586667, "learning_rate": 1.85289941371608e-05, "loss": 0.1696, "step": 5732 }, { "epoch": 1.1981191222570533, "grad_norm": 0.9240460504564104, "learning_rate": 1.8528405118568814e-05, "loss": 0.1887, "step": 5733 }, { "epoch": 1.1983281086729363, "grad_norm": 0.958600040105763, "learning_rate": 1.8527815991439393e-05, "loss": 0.1652, "step": 5734 }, { "epoch": 1.1985370950888192, "grad_norm": 1.1356578108842132, "learning_rate": 1.8527226755780037e-05, "loss": 0.1663, "step": 5735 }, { "epoch": 1.1987460815047022, "grad_norm": 1.0651367910838905, "learning_rate": 1.8526637411598243e-05, "loss": 0.173, "step": 5736 }, { "epoch": 1.1989550679205851, "grad_norm": 0.9819813085194035, "learning_rate": 1.8526047958901514e-05, "loss": 0.1989, "step": 5737 }, { "epoch": 1.199164054336468, "grad_norm": 0.888497450875109, "learning_rate": 1.852545839769735e-05, "loss": 0.1429, "step": 5738 }, { "epoch": 1.199373040752351, "grad_norm": 1.0869207507764782, "learning_rate": 1.8524868727993256e-05, "loss": 0.1546, "step": 5739 }, { "epoch": 1.199582027168234, "grad_norm": 0.9372851020114129, "learning_rate": 1.8524278949796734e-05, "loss": 0.1348, "step": 5740 }, { "epoch": 1.199791013584117, "grad_norm": 1.28623624103447, "learning_rate": 1.8523689063115288e-05, "loss": 0.2045, "step": 5741 }, { "epoch": 1.2, "grad_norm": 1.1057304365924454, "learning_rate": 1.8523099067956433e-05, "loss": 0.2042, "step": 5742 }, { "epoch": 1.200208986415883, "grad_norm": 1.0763378755396613, "learning_rate": 1.852250896432767e-05, "loss": 0.1746, "step": 5743 }, { "epoch": 1.2004179728317659, "grad_norm": 1.0436430230202747, "learning_rate": 1.852191875223651e-05, "loss": 0.1685, "step": 5744 }, { "epoch": 1.2006269592476488, "grad_norm": 1.2114573500999726, "learning_rate": 1.852132843169047e-05, "loss": 0.1801, "step": 5745 }, { "epoch": 1.2008359456635318, "grad_norm": 1.040602607303474, "learning_rate": 1.8520738002697056e-05, "loss": 0.1644, "step": 5746 }, { "epoch": 1.2010449320794148, "grad_norm": 1.1896758718908405, "learning_rate": 1.8520147465263788e-05, "loss": 0.1379, "step": 5747 }, { "epoch": 1.2012539184952977, "grad_norm": 1.01113079696476, "learning_rate": 1.8519556819398173e-05, "loss": 0.1491, "step": 5748 }, { "epoch": 1.2014629049111807, "grad_norm": 1.0678235173987078, "learning_rate": 1.851896606510774e-05, "loss": 0.1664, "step": 5749 }, { "epoch": 1.2016718913270636, "grad_norm": 0.8966351081573781, "learning_rate": 1.8518375202399997e-05, "loss": 0.1444, "step": 5750 }, { "epoch": 1.2018808777429468, "grad_norm": 1.2697487407868453, "learning_rate": 1.851778423128247e-05, "loss": 0.18, "step": 5751 }, { "epoch": 1.2020898641588298, "grad_norm": 1.0697660798569888, "learning_rate": 1.8517193151762677e-05, "loss": 0.1477, "step": 5752 }, { "epoch": 1.2022988505747128, "grad_norm": 1.0132226243712623, "learning_rate": 1.8516601963848136e-05, "loss": 0.1464, "step": 5753 }, { "epoch": 1.2025078369905957, "grad_norm": 1.0800371042056236, "learning_rate": 1.851601066754638e-05, "loss": 0.1752, "step": 5754 }, { "epoch": 1.2027168234064787, "grad_norm": 0.8875147451197991, "learning_rate": 1.851541926286493e-05, "loss": 0.1757, "step": 5755 }, { "epoch": 1.2029258098223616, "grad_norm": 1.1069217703054814, "learning_rate": 1.851482774981131e-05, "loss": 0.1687, "step": 5756 }, { "epoch": 1.2031347962382446, "grad_norm": 1.0160126457496412, "learning_rate": 1.8514236128393052e-05, "loss": 0.157, "step": 5757 }, { "epoch": 1.2033437826541276, "grad_norm": 1.0033950400610674, "learning_rate": 1.851364439861768e-05, "loss": 0.1773, "step": 5758 }, { "epoch": 1.2035527690700105, "grad_norm": 1.3113918935650768, "learning_rate": 1.8513052560492736e-05, "loss": 0.1834, "step": 5759 }, { "epoch": 1.2037617554858935, "grad_norm": 1.1905025314377835, "learning_rate": 1.851246061402574e-05, "loss": 0.1722, "step": 5760 }, { "epoch": 1.2039707419017764, "grad_norm": 1.0573978063352423, "learning_rate": 1.851186855922423e-05, "loss": 0.1713, "step": 5761 }, { "epoch": 1.2041797283176594, "grad_norm": 0.989620473337116, "learning_rate": 1.8511276396095738e-05, "loss": 0.1917, "step": 5762 }, { "epoch": 1.2043887147335424, "grad_norm": 1.1983560292119229, "learning_rate": 1.851068412464781e-05, "loss": 0.1517, "step": 5763 }, { "epoch": 1.2045977011494253, "grad_norm": 1.0020066320837202, "learning_rate": 1.851009174488797e-05, "loss": 0.1421, "step": 5764 }, { "epoch": 1.2048066875653083, "grad_norm": 1.2281681306478796, "learning_rate": 1.8509499256823768e-05, "loss": 0.1688, "step": 5765 }, { "epoch": 1.2050156739811912, "grad_norm": 1.178542122589767, "learning_rate": 1.8508906660462734e-05, "loss": 0.2251, "step": 5766 }, { "epoch": 1.2052246603970742, "grad_norm": 1.3163365894029422, "learning_rate": 1.850831395581242e-05, "loss": 0.2126, "step": 5767 }, { "epoch": 1.2054336468129572, "grad_norm": 1.0842262792368447, "learning_rate": 1.8507721142880363e-05, "loss": 0.1756, "step": 5768 }, { "epoch": 1.2056426332288401, "grad_norm": 0.929071061041303, "learning_rate": 1.850712822167411e-05, "loss": 0.1734, "step": 5769 }, { "epoch": 1.205851619644723, "grad_norm": 1.1542260706938199, "learning_rate": 1.8506535192201207e-05, "loss": 0.1954, "step": 5770 }, { "epoch": 1.206060606060606, "grad_norm": 1.0372366150295438, "learning_rate": 1.8505942054469196e-05, "loss": 0.1563, "step": 5771 }, { "epoch": 1.206269592476489, "grad_norm": 1.221680763400278, "learning_rate": 1.8505348808485635e-05, "loss": 0.1986, "step": 5772 }, { "epoch": 1.206478578892372, "grad_norm": 1.1291270407468619, "learning_rate": 1.8504755454258067e-05, "loss": 0.176, "step": 5773 }, { "epoch": 1.206687565308255, "grad_norm": 1.0396788745961951, "learning_rate": 1.8504161991794044e-05, "loss": 0.2088, "step": 5774 }, { "epoch": 1.206896551724138, "grad_norm": 0.9063077731600533, "learning_rate": 1.850356842110112e-05, "loss": 0.1631, "step": 5775 }, { "epoch": 1.2071055381400209, "grad_norm": 1.141725664293415, "learning_rate": 1.850297474218685e-05, "loss": 0.1654, "step": 5776 }, { "epoch": 1.2073145245559038, "grad_norm": 1.2204702362814246, "learning_rate": 1.850238095505879e-05, "loss": 0.1914, "step": 5777 }, { "epoch": 1.2075235109717868, "grad_norm": 0.957421418569385, "learning_rate": 1.850178705972449e-05, "loss": 0.1923, "step": 5778 }, { "epoch": 1.2077324973876697, "grad_norm": 0.9360952060066696, "learning_rate": 1.850119305619152e-05, "loss": 0.1588, "step": 5779 }, { "epoch": 1.2079414838035527, "grad_norm": 0.799624505606744, "learning_rate": 1.8500598944467433e-05, "loss": 0.137, "step": 5780 }, { "epoch": 1.2081504702194357, "grad_norm": 0.9463838585346782, "learning_rate": 1.850000472455979e-05, "loss": 0.1276, "step": 5781 }, { "epoch": 1.2083594566353186, "grad_norm": 1.1379605527963377, "learning_rate": 1.8499410396476148e-05, "loss": 0.1389, "step": 5782 }, { "epoch": 1.2085684430512016, "grad_norm": 1.062982663818309, "learning_rate": 1.8498815960224085e-05, "loss": 0.195, "step": 5783 }, { "epoch": 1.2087774294670846, "grad_norm": 1.1132798128489987, "learning_rate": 1.849822141581115e-05, "loss": 0.1696, "step": 5784 }, { "epoch": 1.2089864158829675, "grad_norm": 0.9233405250061042, "learning_rate": 1.849762676324492e-05, "loss": 0.1543, "step": 5785 }, { "epoch": 1.2091954022988505, "grad_norm": 1.2886328272055403, "learning_rate": 1.849703200253296e-05, "loss": 0.1714, "step": 5786 }, { "epoch": 1.2094043887147334, "grad_norm": 1.3031306192993781, "learning_rate": 1.8496437133682842e-05, "loss": 0.1912, "step": 5787 }, { "epoch": 1.2096133751306164, "grad_norm": 0.8986563543089452, "learning_rate": 1.8495842156702134e-05, "loss": 0.1367, "step": 5788 }, { "epoch": 1.2098223615464994, "grad_norm": 0.9719538750033088, "learning_rate": 1.8495247071598405e-05, "loss": 0.1791, "step": 5789 }, { "epoch": 1.2100313479623825, "grad_norm": 0.9761005599546965, "learning_rate": 1.849465187837923e-05, "loss": 0.1664, "step": 5790 }, { "epoch": 1.2102403343782655, "grad_norm": 0.8459207876155612, "learning_rate": 1.849405657705219e-05, "loss": 0.1452, "step": 5791 }, { "epoch": 1.2104493207941485, "grad_norm": 0.9638720535050689, "learning_rate": 1.8493461167624854e-05, "loss": 0.1763, "step": 5792 }, { "epoch": 1.2106583072100314, "grad_norm": 1.0059699856495312, "learning_rate": 1.8492865650104802e-05, "loss": 0.1527, "step": 5793 }, { "epoch": 1.2108672936259144, "grad_norm": 1.1547591783622444, "learning_rate": 1.8492270024499612e-05, "loss": 0.202, "step": 5794 }, { "epoch": 1.2110762800417973, "grad_norm": 1.0163364943891593, "learning_rate": 1.8491674290816867e-05, "loss": 0.1596, "step": 5795 }, { "epoch": 1.2112852664576803, "grad_norm": 1.0252122967290211, "learning_rate": 1.8491078449064142e-05, "loss": 0.1875, "step": 5796 }, { "epoch": 1.2114942528735633, "grad_norm": 0.9498369930130117, "learning_rate": 1.8490482499249028e-05, "loss": 0.1624, "step": 5797 }, { "epoch": 1.2117032392894462, "grad_norm": 1.1020085524500294, "learning_rate": 1.8489886441379107e-05, "loss": 0.1557, "step": 5798 }, { "epoch": 1.2119122257053292, "grad_norm": 0.9056651052494232, "learning_rate": 1.848929027546196e-05, "loss": 0.1524, "step": 5799 }, { "epoch": 1.2121212121212122, "grad_norm": 1.0611109210486693, "learning_rate": 1.8488694001505183e-05, "loss": 0.1565, "step": 5800 }, { "epoch": 1.2123301985370951, "grad_norm": 1.1588091063045676, "learning_rate": 1.848809761951636e-05, "loss": 0.1999, "step": 5801 }, { "epoch": 1.212539184952978, "grad_norm": 1.1934349284303893, "learning_rate": 1.8487501129503076e-05, "loss": 0.1827, "step": 5802 }, { "epoch": 1.212748171368861, "grad_norm": 0.9675385648038195, "learning_rate": 1.848690453147293e-05, "loss": 0.183, "step": 5803 }, { "epoch": 1.212957157784744, "grad_norm": 1.006930398805741, "learning_rate": 1.848630782543351e-05, "loss": 0.1525, "step": 5804 }, { "epoch": 1.213166144200627, "grad_norm": 1.1206174094131465, "learning_rate": 1.8485711011392412e-05, "loss": 0.1829, "step": 5805 }, { "epoch": 1.21337513061651, "grad_norm": 1.358070450691959, "learning_rate": 1.848511408935723e-05, "loss": 0.2117, "step": 5806 }, { "epoch": 1.2135841170323929, "grad_norm": 0.904674252870923, "learning_rate": 1.848451705933556e-05, "loss": 0.1705, "step": 5807 }, { "epoch": 1.2137931034482758, "grad_norm": 1.2905142959605993, "learning_rate": 1.8483919921335006e-05, "loss": 0.1525, "step": 5808 }, { "epoch": 1.2140020898641588, "grad_norm": 1.2085489096281783, "learning_rate": 1.848332267536316e-05, "loss": 0.1805, "step": 5809 }, { "epoch": 1.2142110762800418, "grad_norm": 1.0481346052457747, "learning_rate": 1.8482725321427628e-05, "loss": 0.1719, "step": 5810 }, { "epoch": 1.2144200626959247, "grad_norm": 1.2697635889655312, "learning_rate": 1.848212785953601e-05, "loss": 0.1857, "step": 5811 }, { "epoch": 1.2146290491118077, "grad_norm": 1.1718872808111225, "learning_rate": 1.848153028969591e-05, "loss": 0.1504, "step": 5812 }, { "epoch": 1.2148380355276907, "grad_norm": 1.05758530553056, "learning_rate": 1.8480932611914933e-05, "loss": 0.1668, "step": 5813 }, { "epoch": 1.2150470219435736, "grad_norm": 0.9488734638279341, "learning_rate": 1.8480334826200686e-05, "loss": 0.1705, "step": 5814 }, { "epoch": 1.2152560083594566, "grad_norm": 1.2290691616427585, "learning_rate": 1.8479736932560778e-05, "loss": 0.1767, "step": 5815 }, { "epoch": 1.2154649947753395, "grad_norm": 0.9865921589504254, "learning_rate": 1.847913893100281e-05, "loss": 0.1697, "step": 5816 }, { "epoch": 1.2156739811912225, "grad_norm": 1.0589477016665878, "learning_rate": 1.8478540821534405e-05, "loss": 0.1924, "step": 5817 }, { "epoch": 1.2158829676071055, "grad_norm": 0.8900180640188575, "learning_rate": 1.8477942604163166e-05, "loss": 0.1455, "step": 5818 }, { "epoch": 1.2160919540229884, "grad_norm": 1.2337092394682874, "learning_rate": 1.8477344278896708e-05, "loss": 0.1877, "step": 5819 }, { "epoch": 1.2163009404388714, "grad_norm": 1.0062847330696612, "learning_rate": 1.847674584574265e-05, "loss": 0.1582, "step": 5820 }, { "epoch": 1.2165099268547546, "grad_norm": 1.0223493812115851, "learning_rate": 1.8476147304708606e-05, "loss": 0.1649, "step": 5821 }, { "epoch": 1.2167189132706375, "grad_norm": 1.0869700844810637, "learning_rate": 1.847554865580219e-05, "loss": 0.1496, "step": 5822 }, { "epoch": 1.2169278996865205, "grad_norm": 1.0510113219306854, "learning_rate": 1.847494989903102e-05, "loss": 0.157, "step": 5823 }, { "epoch": 1.2171368861024034, "grad_norm": 1.112136429967586, "learning_rate": 1.8474351034402724e-05, "loss": 0.1911, "step": 5824 }, { "epoch": 1.2173458725182864, "grad_norm": 1.0627673061646836, "learning_rate": 1.8473752061924916e-05, "loss": 0.1456, "step": 5825 }, { "epoch": 1.2175548589341694, "grad_norm": 1.0632034207176564, "learning_rate": 1.8473152981605222e-05, "loss": 0.1796, "step": 5826 }, { "epoch": 1.2177638453500523, "grad_norm": 0.8984893252861395, "learning_rate": 1.8472553793451264e-05, "loss": 0.159, "step": 5827 }, { "epoch": 1.2179728317659353, "grad_norm": 1.3590480480194727, "learning_rate": 1.847195449747067e-05, "loss": 0.1791, "step": 5828 }, { "epoch": 1.2181818181818183, "grad_norm": 1.084968872737821, "learning_rate": 1.8471355093671067e-05, "loss": 0.1893, "step": 5829 }, { "epoch": 1.2183908045977012, "grad_norm": 1.1238967649277545, "learning_rate": 1.8470755582060085e-05, "loss": 0.1914, "step": 5830 }, { "epoch": 1.2185997910135842, "grad_norm": 1.1692105628739775, "learning_rate": 1.8470155962645347e-05, "loss": 0.1562, "step": 5831 }, { "epoch": 1.2188087774294671, "grad_norm": 1.1773104253320656, "learning_rate": 1.8469556235434488e-05, "loss": 0.1906, "step": 5832 }, { "epoch": 1.21901776384535, "grad_norm": 1.3074626134992278, "learning_rate": 1.8468956400435143e-05, "loss": 0.1836, "step": 5833 }, { "epoch": 1.219226750261233, "grad_norm": 1.3515822243051532, "learning_rate": 1.8468356457654944e-05, "loss": 0.1853, "step": 5834 }, { "epoch": 1.219435736677116, "grad_norm": 1.1714411514660616, "learning_rate": 1.8467756407101527e-05, "loss": 0.1908, "step": 5835 }, { "epoch": 1.219644723092999, "grad_norm": 1.2478130339536657, "learning_rate": 1.8467156248782526e-05, "loss": 0.1397, "step": 5836 }, { "epoch": 1.219853709508882, "grad_norm": 1.264029140231447, "learning_rate": 1.846655598270558e-05, "loss": 0.1675, "step": 5837 }, { "epoch": 1.220062695924765, "grad_norm": 1.1233406595645465, "learning_rate": 1.846595560887833e-05, "loss": 0.1708, "step": 5838 }, { "epoch": 1.2202716823406479, "grad_norm": 1.2350453182877186, "learning_rate": 1.8465355127308416e-05, "loss": 0.1772, "step": 5839 }, { "epoch": 1.2204806687565308, "grad_norm": 1.5347683509210241, "learning_rate": 1.8464754538003477e-05, "loss": 0.1596, "step": 5840 }, { "epoch": 1.2206896551724138, "grad_norm": 0.9482193342940781, "learning_rate": 1.846415384097116e-05, "loss": 0.1659, "step": 5841 }, { "epoch": 1.2208986415882968, "grad_norm": 2.244237073456925, "learning_rate": 1.846355303621911e-05, "loss": 0.1475, "step": 5842 }, { "epoch": 1.2211076280041797, "grad_norm": 1.055856581236424, "learning_rate": 1.8462952123754974e-05, "loss": 0.1803, "step": 5843 }, { "epoch": 1.2213166144200627, "grad_norm": 1.2028745804333254, "learning_rate": 1.8462351103586394e-05, "loss": 0.2059, "step": 5844 }, { "epoch": 1.2215256008359456, "grad_norm": 1.1726440734969774, "learning_rate": 1.8461749975721024e-05, "loss": 0.1519, "step": 5845 }, { "epoch": 1.2217345872518286, "grad_norm": 1.1031571487407097, "learning_rate": 1.8461148740166513e-05, "loss": 0.1806, "step": 5846 }, { "epoch": 1.2219435736677116, "grad_norm": 1.0102914784864618, "learning_rate": 1.846054739693051e-05, "loss": 0.1919, "step": 5847 }, { "epoch": 1.2221525600835945, "grad_norm": 0.9634350222761928, "learning_rate": 1.8459945946020676e-05, "loss": 0.1397, "step": 5848 }, { "epoch": 1.2223615464994775, "grad_norm": 1.0612505922280595, "learning_rate": 1.8459344387444656e-05, "loss": 0.1671, "step": 5849 }, { "epoch": 1.2225705329153604, "grad_norm": 1.0089347568380067, "learning_rate": 1.845874272121011e-05, "loss": 0.2082, "step": 5850 }, { "epoch": 1.2227795193312434, "grad_norm": 1.054787622357431, "learning_rate": 1.84581409473247e-05, "loss": 0.1644, "step": 5851 }, { "epoch": 1.2229885057471264, "grad_norm": 1.3677197783709252, "learning_rate": 1.8457539065796073e-05, "loss": 0.2018, "step": 5852 }, { "epoch": 1.2231974921630093, "grad_norm": 1.163814922706389, "learning_rate": 1.8456937076631898e-05, "loss": 0.179, "step": 5853 }, { "epoch": 1.2234064785788923, "grad_norm": 1.1212337602052445, "learning_rate": 1.8456334979839834e-05, "loss": 0.1728, "step": 5854 }, { "epoch": 1.2236154649947752, "grad_norm": 0.9922655286093179, "learning_rate": 1.845573277542754e-05, "loss": 0.1466, "step": 5855 }, { "epoch": 1.2238244514106582, "grad_norm": 1.61794537832244, "learning_rate": 1.8455130463402687e-05, "loss": 0.1935, "step": 5856 }, { "epoch": 1.2240334378265412, "grad_norm": 1.009447899731593, "learning_rate": 1.8454528043772936e-05, "loss": 0.1482, "step": 5857 }, { "epoch": 1.2242424242424241, "grad_norm": 1.2089422397363176, "learning_rate": 1.8453925516545955e-05, "loss": 0.2173, "step": 5858 }, { "epoch": 1.224451410658307, "grad_norm": 1.0879901706776518, "learning_rate": 1.8453322881729407e-05, "loss": 0.1751, "step": 5859 }, { "epoch": 1.22466039707419, "grad_norm": 0.972407956381988, "learning_rate": 1.8452720139330972e-05, "loss": 0.2115, "step": 5860 }, { "epoch": 1.2248693834900732, "grad_norm": 0.924276937302792, "learning_rate": 1.8452117289358312e-05, "loss": 0.1513, "step": 5861 }, { "epoch": 1.2250783699059562, "grad_norm": 1.1226824579246828, "learning_rate": 1.8451514331819102e-05, "loss": 0.1546, "step": 5862 }, { "epoch": 1.2252873563218392, "grad_norm": 1.1624836968001653, "learning_rate": 1.8450911266721015e-05, "loss": 0.1733, "step": 5863 }, { "epoch": 1.2254963427377221, "grad_norm": 1.0068993919109577, "learning_rate": 1.845030809407173e-05, "loss": 0.1697, "step": 5864 }, { "epoch": 1.225705329153605, "grad_norm": 1.272081352314697, "learning_rate": 1.8449704813878914e-05, "loss": 0.1732, "step": 5865 }, { "epoch": 1.225914315569488, "grad_norm": 0.9642954790281156, "learning_rate": 1.8449101426150257e-05, "loss": 0.1522, "step": 5866 }, { "epoch": 1.226123301985371, "grad_norm": 1.2042645290122607, "learning_rate": 1.8448497930893427e-05, "loss": 0.1583, "step": 5867 }, { "epoch": 1.226332288401254, "grad_norm": 1.1319707358089102, "learning_rate": 1.8447894328116112e-05, "loss": 0.1818, "step": 5868 }, { "epoch": 1.226541274817137, "grad_norm": 0.9782628443352047, "learning_rate": 1.844729061782599e-05, "loss": 0.1808, "step": 5869 }, { "epoch": 1.22675026123302, "grad_norm": 1.1702325724365772, "learning_rate": 1.8446686800030743e-05, "loss": 0.1449, "step": 5870 }, { "epoch": 1.2269592476489029, "grad_norm": 1.1027870903414474, "learning_rate": 1.8446082874738057e-05, "loss": 0.1536, "step": 5871 }, { "epoch": 1.2271682340647858, "grad_norm": 1.1149320418625792, "learning_rate": 1.8445478841955623e-05, "loss": 0.1891, "step": 5872 }, { "epoch": 1.2273772204806688, "grad_norm": 1.1675336263450684, "learning_rate": 1.8444874701691122e-05, "loss": 0.1767, "step": 5873 }, { "epoch": 1.2275862068965517, "grad_norm": 1.1608621939471309, "learning_rate": 1.844427045395224e-05, "loss": 0.1581, "step": 5874 }, { "epoch": 1.2277951933124347, "grad_norm": 1.1964326128835128, "learning_rate": 1.844366609874668e-05, "loss": 0.1842, "step": 5875 }, { "epoch": 1.2280041797283177, "grad_norm": 1.0951522261619564, "learning_rate": 1.844306163608212e-05, "loss": 0.177, "step": 5876 }, { "epoch": 1.2282131661442006, "grad_norm": 0.9531340649801269, "learning_rate": 1.8442457065966256e-05, "loss": 0.2074, "step": 5877 }, { "epoch": 1.2284221525600836, "grad_norm": 1.6142994005836697, "learning_rate": 1.8441852388406788e-05, "loss": 0.1687, "step": 5878 }, { "epoch": 1.2286311389759665, "grad_norm": 1.181687227441376, "learning_rate": 1.8441247603411402e-05, "loss": 0.1961, "step": 5879 }, { "epoch": 1.2288401253918495, "grad_norm": 1.1063571958607465, "learning_rate": 1.8440642710987803e-05, "loss": 0.1743, "step": 5880 }, { "epoch": 1.2290491118077325, "grad_norm": 1.0127392837203042, "learning_rate": 1.844003771114369e-05, "loss": 0.1647, "step": 5881 }, { "epoch": 1.2292580982236154, "grad_norm": 1.185525347586032, "learning_rate": 1.8439432603886752e-05, "loss": 0.1883, "step": 5882 }, { "epoch": 1.2294670846394984, "grad_norm": 1.2167809647601966, "learning_rate": 1.8438827389224698e-05, "loss": 0.1756, "step": 5883 }, { "epoch": 1.2296760710553813, "grad_norm": 1.2158538131099665, "learning_rate": 1.843822206716523e-05, "loss": 0.1911, "step": 5884 }, { "epoch": 1.2298850574712643, "grad_norm": 1.2477240814823252, "learning_rate": 1.843761663771605e-05, "loss": 0.1904, "step": 5885 }, { "epoch": 1.2300940438871473, "grad_norm": 1.0891280054572985, "learning_rate": 1.8437011100884866e-05, "loss": 0.1628, "step": 5886 }, { "epoch": 1.2303030303030302, "grad_norm": 0.9462180258912719, "learning_rate": 1.843640545667938e-05, "loss": 0.1566, "step": 5887 }, { "epoch": 1.2305120167189132, "grad_norm": 1.0498114834728058, "learning_rate": 1.8435799705107303e-05, "loss": 0.138, "step": 5888 }, { "epoch": 1.2307210031347962, "grad_norm": 1.0695714527549103, "learning_rate": 1.8435193846176346e-05, "loss": 0.183, "step": 5889 }, { "epoch": 1.2309299895506791, "grad_norm": 1.1484225379187214, "learning_rate": 1.843458787989421e-05, "loss": 0.1756, "step": 5890 }, { "epoch": 1.231138975966562, "grad_norm": 1.1656440911770818, "learning_rate": 1.8433981806268617e-05, "loss": 0.1868, "step": 5891 }, { "epoch": 1.2313479623824453, "grad_norm": 1.0914563539009925, "learning_rate": 1.8433375625307278e-05, "loss": 0.1805, "step": 5892 }, { "epoch": 1.2315569487983282, "grad_norm": 1.0049761780316335, "learning_rate": 1.8432769337017905e-05, "loss": 0.1656, "step": 5893 }, { "epoch": 1.2317659352142112, "grad_norm": 1.1302830727835922, "learning_rate": 1.8432162941408215e-05, "loss": 0.1477, "step": 5894 }, { "epoch": 1.2319749216300941, "grad_norm": 1.240604623273155, "learning_rate": 1.8431556438485925e-05, "loss": 0.1789, "step": 5895 }, { "epoch": 1.232183908045977, "grad_norm": 1.1375174383075055, "learning_rate": 1.8430949828258757e-05, "loss": 0.1862, "step": 5896 }, { "epoch": 1.23239289446186, "grad_norm": 1.106013982342232, "learning_rate": 1.8430343110734426e-05, "loss": 0.1762, "step": 5897 }, { "epoch": 1.232601880877743, "grad_norm": 1.1286388445135094, "learning_rate": 1.842973628592066e-05, "loss": 0.1695, "step": 5898 }, { "epoch": 1.232810867293626, "grad_norm": 1.0984427703320434, "learning_rate": 1.842912935382517e-05, "loss": 0.1948, "step": 5899 }, { "epoch": 1.233019853709509, "grad_norm": 0.9527430281470071, "learning_rate": 1.842852231445569e-05, "loss": 0.1745, "step": 5900 }, { "epoch": 1.233228840125392, "grad_norm": 1.1093274211691402, "learning_rate": 1.842791516781995e-05, "loss": 0.1705, "step": 5901 }, { "epoch": 1.2334378265412749, "grad_norm": 1.2592361837478836, "learning_rate": 1.8427307913925668e-05, "loss": 0.2098, "step": 5902 }, { "epoch": 1.2336468129571578, "grad_norm": 0.9496342617045421, "learning_rate": 1.842670055278057e-05, "loss": 0.1711, "step": 5903 }, { "epoch": 1.2338557993730408, "grad_norm": 1.3205408868655342, "learning_rate": 1.842609308439239e-05, "loss": 0.1715, "step": 5904 }, { "epoch": 1.2340647857889238, "grad_norm": 0.9574380656941045, "learning_rate": 1.8425485508768863e-05, "loss": 0.1869, "step": 5905 }, { "epoch": 1.2342737722048067, "grad_norm": 0.9190958536416611, "learning_rate": 1.8424877825917718e-05, "loss": 0.1567, "step": 5906 }, { "epoch": 1.2344827586206897, "grad_norm": 1.1047969678312557, "learning_rate": 1.8424270035846685e-05, "loss": 0.1774, "step": 5907 }, { "epoch": 1.2346917450365726, "grad_norm": 0.9408608729209947, "learning_rate": 1.8423662138563508e-05, "loss": 0.1814, "step": 5908 }, { "epoch": 1.2349007314524556, "grad_norm": 0.9483794167063118, "learning_rate": 1.842305413407591e-05, "loss": 0.1814, "step": 5909 }, { "epoch": 1.2351097178683386, "grad_norm": 1.099171455857978, "learning_rate": 1.842244602239164e-05, "loss": 0.1569, "step": 5910 }, { "epoch": 1.2353187042842215, "grad_norm": 1.0493205708342654, "learning_rate": 1.8421837803518437e-05, "loss": 0.1849, "step": 5911 }, { "epoch": 1.2355276907001045, "grad_norm": 1.0388478526839342, "learning_rate": 1.8421229477464034e-05, "loss": 0.2235, "step": 5912 }, { "epoch": 1.2357366771159874, "grad_norm": 1.0085900915199288, "learning_rate": 1.842062104423618e-05, "loss": 0.1921, "step": 5913 }, { "epoch": 1.2359456635318704, "grad_norm": 1.0458036698373598, "learning_rate": 1.8420012503842613e-05, "loss": 0.1798, "step": 5914 }, { "epoch": 1.2361546499477534, "grad_norm": 0.9245338434806233, "learning_rate": 1.8419403856291082e-05, "loss": 0.1434, "step": 5915 }, { "epoch": 1.2363636363636363, "grad_norm": 1.183784981515824, "learning_rate": 1.841879510158933e-05, "loss": 0.1621, "step": 5916 }, { "epoch": 1.2365726227795193, "grad_norm": 1.7942649629355614, "learning_rate": 1.8418186239745108e-05, "loss": 0.1846, "step": 5917 }, { "epoch": 1.2367816091954023, "grad_norm": 1.03063295715735, "learning_rate": 1.8417577270766162e-05, "loss": 0.1513, "step": 5918 }, { "epoch": 1.2369905956112852, "grad_norm": 1.0781170456967883, "learning_rate": 1.841696819466024e-05, "loss": 0.1837, "step": 5919 }, { "epoch": 1.2371995820271682, "grad_norm": 0.9379816401779139, "learning_rate": 1.8416359011435097e-05, "loss": 0.1696, "step": 5920 }, { "epoch": 1.2374085684430511, "grad_norm": 1.0836779022142338, "learning_rate": 1.8415749721098485e-05, "loss": 0.1711, "step": 5921 }, { "epoch": 1.237617554858934, "grad_norm": 1.158445420437775, "learning_rate": 1.8415140323658156e-05, "loss": 0.16, "step": 5922 }, { "epoch": 1.237826541274817, "grad_norm": 0.9867110196564792, "learning_rate": 1.841453081912187e-05, "loss": 0.1611, "step": 5923 }, { "epoch": 1.2380355276907, "grad_norm": 1.1057177408209315, "learning_rate": 1.8413921207497377e-05, "loss": 0.1614, "step": 5924 }, { "epoch": 1.238244514106583, "grad_norm": 1.0290239250716693, "learning_rate": 1.8413311488792443e-05, "loss": 0.1534, "step": 5925 }, { "epoch": 1.238453500522466, "grad_norm": 1.0374753016937301, "learning_rate": 1.8412701663014826e-05, "loss": 0.1593, "step": 5926 }, { "epoch": 1.238662486938349, "grad_norm": 1.3200141394196865, "learning_rate": 1.8412091730172282e-05, "loss": 0.1703, "step": 5927 }, { "epoch": 1.2388714733542319, "grad_norm": 1.085457856184836, "learning_rate": 1.8411481690272576e-05, "loss": 0.1641, "step": 5928 }, { "epoch": 1.2390804597701148, "grad_norm": 1.0604423714497744, "learning_rate": 1.8410871543323474e-05, "loss": 0.1705, "step": 5929 }, { "epoch": 1.2392894461859978, "grad_norm": 1.2215342912162606, "learning_rate": 1.841026128933274e-05, "loss": 0.1599, "step": 5930 }, { "epoch": 1.239498432601881, "grad_norm": 1.1529600316151614, "learning_rate": 1.8409650928308138e-05, "loss": 0.1826, "step": 5931 }, { "epoch": 1.239707419017764, "grad_norm": 1.1398364705340636, "learning_rate": 1.8409040460257438e-05, "loss": 0.1623, "step": 5932 }, { "epoch": 1.239916405433647, "grad_norm": 0.990746113234542, "learning_rate": 1.840842988518841e-05, "loss": 0.1704, "step": 5933 }, { "epoch": 1.2401253918495299, "grad_norm": 1.1711054421016218, "learning_rate": 1.840781920310882e-05, "loss": 0.189, "step": 5934 }, { "epoch": 1.2403343782654128, "grad_norm": 1.2192988672758873, "learning_rate": 1.8407208414026446e-05, "loss": 0.1986, "step": 5935 }, { "epoch": 1.2405433646812958, "grad_norm": 0.9362253034038084, "learning_rate": 1.840659751794906e-05, "loss": 0.1566, "step": 5936 }, { "epoch": 1.2407523510971787, "grad_norm": 0.9749777902852693, "learning_rate": 1.840598651488443e-05, "loss": 0.1626, "step": 5937 }, { "epoch": 1.2409613375130617, "grad_norm": 0.9568570923379549, "learning_rate": 1.8405375404840346e-05, "loss": 0.1711, "step": 5938 }, { "epoch": 1.2411703239289447, "grad_norm": 1.091297321121394, "learning_rate": 1.8404764187824567e-05, "loss": 0.1604, "step": 5939 }, { "epoch": 1.2413793103448276, "grad_norm": 1.1202332136521371, "learning_rate": 1.8404152863844885e-05, "loss": 0.1642, "step": 5940 }, { "epoch": 1.2415882967607106, "grad_norm": 1.1876731803852931, "learning_rate": 1.840354143290908e-05, "loss": 0.1876, "step": 5941 }, { "epoch": 1.2417972831765935, "grad_norm": 1.21376372518608, "learning_rate": 1.8402929895024925e-05, "loss": 0.1788, "step": 5942 }, { "epoch": 1.2420062695924765, "grad_norm": 1.1228468242904013, "learning_rate": 1.8402318250200205e-05, "loss": 0.1576, "step": 5943 }, { "epoch": 1.2422152560083595, "grad_norm": 0.9491248751992811, "learning_rate": 1.840170649844271e-05, "loss": 0.1858, "step": 5944 }, { "epoch": 1.2424242424242424, "grad_norm": 1.3105820897670952, "learning_rate": 1.8401094639760224e-05, "loss": 0.1529, "step": 5945 }, { "epoch": 1.2426332288401254, "grad_norm": 1.13524491346144, "learning_rate": 1.8400482674160524e-05, "loss": 0.1843, "step": 5946 }, { "epoch": 1.2428422152560084, "grad_norm": 1.074410860419243, "learning_rate": 1.8399870601651413e-05, "loss": 0.1718, "step": 5947 }, { "epoch": 1.2430512016718913, "grad_norm": 1.1938291811849016, "learning_rate": 1.839925842224067e-05, "loss": 0.2164, "step": 5948 }, { "epoch": 1.2432601880877743, "grad_norm": 1.0619457862389177, "learning_rate": 1.8398646135936092e-05, "loss": 0.1631, "step": 5949 }, { "epoch": 1.2434691745036572, "grad_norm": 1.2853470977503683, "learning_rate": 1.839803374274547e-05, "loss": 0.1716, "step": 5950 }, { "epoch": 1.2436781609195402, "grad_norm": 1.3357916755413628, "learning_rate": 1.8397421242676595e-05, "loss": 0.1601, "step": 5951 }, { "epoch": 1.2438871473354232, "grad_norm": 1.1054302941310026, "learning_rate": 1.8396808635737263e-05, "loss": 0.16, "step": 5952 }, { "epoch": 1.2440961337513061, "grad_norm": 1.1769130789765399, "learning_rate": 1.8396195921935276e-05, "loss": 0.1618, "step": 5953 }, { "epoch": 1.244305120167189, "grad_norm": 1.4772990393253413, "learning_rate": 1.839558310127842e-05, "loss": 0.1729, "step": 5954 }, { "epoch": 1.244514106583072, "grad_norm": 1.2132740792810623, "learning_rate": 1.8394970173774505e-05, "loss": 0.1519, "step": 5955 }, { "epoch": 1.244723092998955, "grad_norm": 0.979572696302651, "learning_rate": 1.8394357139431327e-05, "loss": 0.1765, "step": 5956 }, { "epoch": 1.244932079414838, "grad_norm": 1.2031524412708952, "learning_rate": 1.8393743998256688e-05, "loss": 0.1777, "step": 5957 }, { "epoch": 1.245141065830721, "grad_norm": 1.0184222950768422, "learning_rate": 1.839313075025839e-05, "loss": 0.1741, "step": 5958 }, { "epoch": 1.245350052246604, "grad_norm": 1.1924106435489399, "learning_rate": 1.839251739544424e-05, "loss": 0.2114, "step": 5959 }, { "epoch": 1.2455590386624869, "grad_norm": 1.0510577673558714, "learning_rate": 1.839190393382204e-05, "loss": 0.1455, "step": 5960 }, { "epoch": 1.2457680250783698, "grad_norm": 1.138532963984797, "learning_rate": 1.839129036539961e-05, "loss": 0.1956, "step": 5961 }, { "epoch": 1.245977011494253, "grad_norm": 1.3272861602991455, "learning_rate": 1.839067669018474e-05, "loss": 0.1866, "step": 5962 }, { "epoch": 1.246185997910136, "grad_norm": 1.0573935274773252, "learning_rate": 1.8390062908185248e-05, "loss": 0.2001, "step": 5963 }, { "epoch": 1.246394984326019, "grad_norm": 1.0328268444670679, "learning_rate": 1.8389449019408948e-05, "loss": 0.1956, "step": 5964 }, { "epoch": 1.2466039707419019, "grad_norm": 1.1010917393900361, "learning_rate": 1.838883502386365e-05, "loss": 0.1506, "step": 5965 }, { "epoch": 1.2468129571577848, "grad_norm": 1.0713588702937573, "learning_rate": 1.8388220921557175e-05, "loss": 0.1927, "step": 5966 }, { "epoch": 1.2470219435736678, "grad_norm": 0.8836362530608693, "learning_rate": 1.8387606712497325e-05, "loss": 0.1423, "step": 5967 }, { "epoch": 1.2472309299895508, "grad_norm": 1.1498868334990635, "learning_rate": 1.8386992396691926e-05, "loss": 0.1719, "step": 5968 }, { "epoch": 1.2474399164054337, "grad_norm": 1.1654082559197125, "learning_rate": 1.8386377974148796e-05, "loss": 0.1671, "step": 5969 }, { "epoch": 1.2476489028213167, "grad_norm": 1.760867105517605, "learning_rate": 1.8385763444875753e-05, "loss": 0.2052, "step": 5970 }, { "epoch": 1.2478578892371996, "grad_norm": 1.0470253456897922, "learning_rate": 1.8385148808880617e-05, "loss": 0.1871, "step": 5971 }, { "epoch": 1.2480668756530826, "grad_norm": 1.0662718778048652, "learning_rate": 1.838453406617121e-05, "loss": 0.1882, "step": 5972 }, { "epoch": 1.2482758620689656, "grad_norm": 1.0478676759663312, "learning_rate": 1.8383919216755356e-05, "loss": 0.1771, "step": 5973 }, { "epoch": 1.2484848484848485, "grad_norm": 1.0987136812541218, "learning_rate": 1.838330426064088e-05, "loss": 0.2108, "step": 5974 }, { "epoch": 1.2486938349007315, "grad_norm": 1.1247039132346885, "learning_rate": 1.838268919783561e-05, "loss": 0.1644, "step": 5975 }, { "epoch": 1.2489028213166145, "grad_norm": 1.2741270528629032, "learning_rate": 1.838207402834737e-05, "loss": 0.1749, "step": 5976 }, { "epoch": 1.2491118077324974, "grad_norm": 1.0800562409353394, "learning_rate": 1.8381458752183995e-05, "loss": 0.1865, "step": 5977 }, { "epoch": 1.2493207941483804, "grad_norm": 1.0641252347892565, "learning_rate": 1.8380843369353307e-05, "loss": 0.1776, "step": 5978 }, { "epoch": 1.2495297805642633, "grad_norm": 1.0631224142393554, "learning_rate": 1.8380227879863146e-05, "loss": 0.1749, "step": 5979 }, { "epoch": 1.2497387669801463, "grad_norm": 0.9983680153097334, "learning_rate": 1.8379612283721338e-05, "loss": 0.1632, "step": 5980 }, { "epoch": 1.2499477533960293, "grad_norm": 0.8581454061437782, "learning_rate": 1.8378996580935725e-05, "loss": 0.1826, "step": 5981 }, { "epoch": 1.2501567398119122, "grad_norm": 1.0056147264259923, "learning_rate": 1.8378380771514134e-05, "loss": 0.1579, "step": 5982 }, { "epoch": 1.2503657262277952, "grad_norm": 1.0541063442764078, "learning_rate": 1.8377764855464412e-05, "loss": 0.1812, "step": 5983 }, { "epoch": 1.2505747126436781, "grad_norm": 1.1079259330732052, "learning_rate": 1.837714883279439e-05, "loss": 0.1931, "step": 5984 }, { "epoch": 1.250783699059561, "grad_norm": 1.054962588358777, "learning_rate": 1.837653270351191e-05, "loss": 0.19, "step": 5985 }, { "epoch": 1.250992685475444, "grad_norm": 1.1126126074037304, "learning_rate": 1.837591646762481e-05, "loss": 0.1822, "step": 5986 }, { "epoch": 1.251201671891327, "grad_norm": 1.223888671770063, "learning_rate": 1.837530012514094e-05, "loss": 0.1871, "step": 5987 }, { "epoch": 1.25141065830721, "grad_norm": 0.9379725048263863, "learning_rate": 1.8374683676068138e-05, "loss": 0.1609, "step": 5988 }, { "epoch": 1.251619644723093, "grad_norm": 1.3177964922170493, "learning_rate": 1.8374067120414255e-05, "loss": 0.1851, "step": 5989 }, { "epoch": 1.251828631138976, "grad_norm": 1.2709966918137827, "learning_rate": 1.837345045818713e-05, "loss": 0.2005, "step": 5990 }, { "epoch": 1.2520376175548589, "grad_norm": 0.976224275467784, "learning_rate": 1.8372833689394615e-05, "loss": 0.1637, "step": 5991 }, { "epoch": 1.2522466039707418, "grad_norm": 1.136716406562468, "learning_rate": 1.8372216814044562e-05, "loss": 0.1807, "step": 5992 }, { "epoch": 1.2524555903866248, "grad_norm": 1.1159947962776184, "learning_rate": 1.8371599832144814e-05, "loss": 0.1692, "step": 5993 }, { "epoch": 1.2526645768025078, "grad_norm": 0.9630803384443141, "learning_rate": 1.837098274370323e-05, "loss": 0.1607, "step": 5994 }, { "epoch": 1.2528735632183907, "grad_norm": 1.1027370862356436, "learning_rate": 1.8370365548727666e-05, "loss": 0.1717, "step": 5995 }, { "epoch": 1.2530825496342737, "grad_norm": 1.0241952733566042, "learning_rate": 1.8369748247225965e-05, "loss": 0.1618, "step": 5996 }, { "epoch": 1.2532915360501566, "grad_norm": 0.919357485471756, "learning_rate": 1.8369130839205992e-05, "loss": 0.1863, "step": 5997 }, { "epoch": 1.2535005224660396, "grad_norm": 1.138057511069789, "learning_rate": 1.8368513324675604e-05, "loss": 0.167, "step": 5998 }, { "epoch": 1.2537095088819226, "grad_norm": 1.057455889185661, "learning_rate": 1.836789570364266e-05, "loss": 0.1555, "step": 5999 }, { "epoch": 1.2539184952978055, "grad_norm": 1.0025102393324277, "learning_rate": 1.8367277976115015e-05, "loss": 0.1543, "step": 6000 }, { "epoch": 1.2541274817136885, "grad_norm": 0.9899616162571632, "learning_rate": 1.836666014210054e-05, "loss": 0.1547, "step": 6001 }, { "epoch": 1.2543364681295714, "grad_norm": 1.0882033578261496, "learning_rate": 1.8366042201607087e-05, "loss": 0.1601, "step": 6002 }, { "epoch": 1.2545454545454544, "grad_norm": 0.9407671566744861, "learning_rate": 1.8365424154642532e-05, "loss": 0.1648, "step": 6003 }, { "epoch": 1.2547544409613376, "grad_norm": 0.9625220877777275, "learning_rate": 1.8364806001214727e-05, "loss": 0.1652, "step": 6004 }, { "epoch": 1.2549634273772206, "grad_norm": 1.1475700451196535, "learning_rate": 1.836418774133155e-05, "loss": 0.1934, "step": 6005 }, { "epoch": 1.2551724137931035, "grad_norm": 1.3017442218859836, "learning_rate": 1.8363569375000866e-05, "loss": 0.1595, "step": 6006 }, { "epoch": 1.2553814002089865, "grad_norm": 1.1357712757187721, "learning_rate": 1.836295090223054e-05, "loss": 0.1882, "step": 6007 }, { "epoch": 1.2555903866248694, "grad_norm": 1.0757748447011493, "learning_rate": 1.8362332323028452e-05, "loss": 0.1873, "step": 6008 }, { "epoch": 1.2557993730407524, "grad_norm": 1.2143796379805678, "learning_rate": 1.8361713637402467e-05, "loss": 0.1808, "step": 6009 }, { "epoch": 1.2560083594566354, "grad_norm": 0.9792208558340796, "learning_rate": 1.836109484536046e-05, "loss": 0.179, "step": 6010 }, { "epoch": 1.2562173458725183, "grad_norm": 1.1163699015513748, "learning_rate": 1.8360475946910312e-05, "loss": 0.1617, "step": 6011 }, { "epoch": 1.2564263322884013, "grad_norm": 0.9900939461270937, "learning_rate": 1.8359856942059893e-05, "loss": 0.1683, "step": 6012 }, { "epoch": 1.2566353187042842, "grad_norm": 1.1750095314216968, "learning_rate": 1.8359237830817084e-05, "loss": 0.1827, "step": 6013 }, { "epoch": 1.2568443051201672, "grad_norm": 0.9258596752125321, "learning_rate": 1.835861861318976e-05, "loss": 0.1597, "step": 6014 }, { "epoch": 1.2570532915360502, "grad_norm": 0.8438924106766176, "learning_rate": 1.8357999289185803e-05, "loss": 0.1281, "step": 6015 }, { "epoch": 1.2572622779519331, "grad_norm": 1.169180363021124, "learning_rate": 1.83573798588131e-05, "loss": 0.1594, "step": 6016 }, { "epoch": 1.257471264367816, "grad_norm": 1.0482649926330423, "learning_rate": 1.835676032207953e-05, "loss": 0.1898, "step": 6017 }, { "epoch": 1.257680250783699, "grad_norm": 1.0659696622544985, "learning_rate": 1.835614067899298e-05, "loss": 0.1684, "step": 6018 }, { "epoch": 1.257889237199582, "grad_norm": 1.3906574753969205, "learning_rate": 1.835552092956133e-05, "loss": 0.1804, "step": 6019 }, { "epoch": 1.258098223615465, "grad_norm": 0.9461099396048834, "learning_rate": 1.8354901073792472e-05, "loss": 0.1736, "step": 6020 }, { "epoch": 1.258307210031348, "grad_norm": 1.0655895126351775, "learning_rate": 1.8354281111694297e-05, "loss": 0.153, "step": 6021 }, { "epoch": 1.258516196447231, "grad_norm": 1.292069407715232, "learning_rate": 1.835366104327469e-05, "loss": 0.149, "step": 6022 }, { "epoch": 1.2587251828631139, "grad_norm": 1.0936418565290893, "learning_rate": 1.8353040868541544e-05, "loss": 0.1959, "step": 6023 }, { "epoch": 1.2589341692789968, "grad_norm": 1.0247272811595762, "learning_rate": 1.8352420587502752e-05, "loss": 0.1745, "step": 6024 }, { "epoch": 1.2591431556948798, "grad_norm": 1.6012409345525787, "learning_rate": 1.835180020016621e-05, "loss": 0.205, "step": 6025 }, { "epoch": 1.2593521421107627, "grad_norm": 1.1363418220120358, "learning_rate": 1.835117970653981e-05, "loss": 0.1748, "step": 6026 }, { "epoch": 1.2595611285266457, "grad_norm": 1.1650520764997387, "learning_rate": 1.835055910663145e-05, "loss": 0.1401, "step": 6027 }, { "epoch": 1.2597701149425287, "grad_norm": 0.9900384246236981, "learning_rate": 1.834993840044903e-05, "loss": 0.1578, "step": 6028 }, { "epoch": 1.2599791013584116, "grad_norm": 1.0134351206003762, "learning_rate": 1.8349317588000447e-05, "loss": 0.1533, "step": 6029 }, { "epoch": 1.2601880877742948, "grad_norm": 1.1858897222743612, "learning_rate": 1.8348696669293602e-05, "loss": 0.1819, "step": 6030 }, { "epoch": 1.2603970741901778, "grad_norm": 1.2880698643086983, "learning_rate": 1.83480756443364e-05, "loss": 0.2, "step": 6031 }, { "epoch": 1.2606060606060607, "grad_norm": 1.0518541880044794, "learning_rate": 1.834745451313674e-05, "loss": 0.1839, "step": 6032 }, { "epoch": 1.2608150470219437, "grad_norm": 0.9334868376644004, "learning_rate": 1.8346833275702528e-05, "loss": 0.1679, "step": 6033 }, { "epoch": 1.2610240334378267, "grad_norm": 0.9910197712460372, "learning_rate": 1.8346211932041676e-05, "loss": 0.1711, "step": 6034 }, { "epoch": 1.2612330198537096, "grad_norm": 0.9918889196837039, "learning_rate": 1.8345590482162086e-05, "loss": 0.1318, "step": 6035 }, { "epoch": 1.2614420062695926, "grad_norm": 0.9310790611742633, "learning_rate": 1.8344968926071666e-05, "loss": 0.1582, "step": 6036 }, { "epoch": 1.2616509926854755, "grad_norm": 1.0676020792443137, "learning_rate": 1.834434726377833e-05, "loss": 0.1537, "step": 6037 }, { "epoch": 1.2618599791013585, "grad_norm": 0.9563401206226984, "learning_rate": 1.8343725495289987e-05, "loss": 0.146, "step": 6038 }, { "epoch": 1.2620689655172415, "grad_norm": 1.1612633111071853, "learning_rate": 1.834310362061455e-05, "loss": 0.1756, "step": 6039 }, { "epoch": 1.2622779519331244, "grad_norm": 1.4844968128771967, "learning_rate": 1.8342481639759934e-05, "loss": 0.1565, "step": 6040 }, { "epoch": 1.2624869383490074, "grad_norm": 1.010691860434675, "learning_rate": 1.834185955273406e-05, "loss": 0.1925, "step": 6041 }, { "epoch": 1.2626959247648903, "grad_norm": 1.1564857121680931, "learning_rate": 1.8341237359544833e-05, "loss": 0.1581, "step": 6042 }, { "epoch": 1.2629049111807733, "grad_norm": 0.9710991707046982, "learning_rate": 1.8340615060200183e-05, "loss": 0.1613, "step": 6043 }, { "epoch": 1.2631138975966563, "grad_norm": 1.0884058162050836, "learning_rate": 1.8339992654708023e-05, "loss": 0.176, "step": 6044 }, { "epoch": 1.2633228840125392, "grad_norm": 1.2391150839266833, "learning_rate": 1.833937014307628e-05, "loss": 0.1616, "step": 6045 }, { "epoch": 1.2635318704284222, "grad_norm": 1.0213011573638358, "learning_rate": 1.833874752531287e-05, "loss": 0.1996, "step": 6046 }, { "epoch": 1.2637408568443051, "grad_norm": 1.1239677073789383, "learning_rate": 1.8338124801425717e-05, "loss": 0.1826, "step": 6047 }, { "epoch": 1.263949843260188, "grad_norm": 0.9363228079812654, "learning_rate": 1.8337501971422752e-05, "loss": 0.1573, "step": 6048 }, { "epoch": 1.264158829676071, "grad_norm": 0.9658138704203726, "learning_rate": 1.8336879035311895e-05, "loss": 0.1855, "step": 6049 }, { "epoch": 1.264367816091954, "grad_norm": 0.9964251577472601, "learning_rate": 1.8336255993101078e-05, "loss": 0.1587, "step": 6050 }, { "epoch": 1.264576802507837, "grad_norm": 0.9018454772075443, "learning_rate": 1.8335632844798232e-05, "loss": 0.1945, "step": 6051 }, { "epoch": 1.26478578892372, "grad_norm": 0.9210537405032444, "learning_rate": 1.8335009590411283e-05, "loss": 0.1901, "step": 6052 }, { "epoch": 1.264994775339603, "grad_norm": 0.9965050729082607, "learning_rate": 1.8334386229948165e-05, "loss": 0.1838, "step": 6053 }, { "epoch": 1.2652037617554859, "grad_norm": 1.1630673417323032, "learning_rate": 1.8333762763416806e-05, "loss": 0.199, "step": 6054 }, { "epoch": 1.2654127481713688, "grad_norm": 1.3158425668125433, "learning_rate": 1.833313919082515e-05, "loss": 0.1712, "step": 6055 }, { "epoch": 1.2656217345872518, "grad_norm": 1.056074491605658, "learning_rate": 1.8332515512181127e-05, "loss": 0.1204, "step": 6056 }, { "epoch": 1.2658307210031348, "grad_norm": 1.0025351577852446, "learning_rate": 1.833189172749268e-05, "loss": 0.1525, "step": 6057 }, { "epoch": 1.2660397074190177, "grad_norm": 0.874590947454362, "learning_rate": 1.833126783676774e-05, "loss": 0.1525, "step": 6058 }, { "epoch": 1.2662486938349007, "grad_norm": 1.1527453792615079, "learning_rate": 1.8330643840014247e-05, "loss": 0.2192, "step": 6059 }, { "epoch": 1.2664576802507836, "grad_norm": 1.2483810710320558, "learning_rate": 1.833001973724015e-05, "loss": 0.2024, "step": 6060 }, { "epoch": 1.2666666666666666, "grad_norm": 0.9537674969307335, "learning_rate": 1.8329395528453387e-05, "loss": 0.1796, "step": 6061 }, { "epoch": 1.2668756530825496, "grad_norm": 1.1804639040687144, "learning_rate": 1.83287712136619e-05, "loss": 0.1632, "step": 6062 }, { "epoch": 1.2670846394984325, "grad_norm": 0.9997695333114144, "learning_rate": 1.8328146792873638e-05, "loss": 0.1483, "step": 6063 }, { "epoch": 1.2672936259143155, "grad_norm": 1.1974642374219555, "learning_rate": 1.832752226609655e-05, "loss": 0.1666, "step": 6064 }, { "epoch": 1.2675026123301985, "grad_norm": 1.1650405592619173, "learning_rate": 1.8326897633338577e-05, "loss": 0.1837, "step": 6065 }, { "epoch": 1.2677115987460814, "grad_norm": 1.1959972439243014, "learning_rate": 1.8326272894607672e-05, "loss": 0.1859, "step": 6066 }, { "epoch": 1.2679205851619644, "grad_norm": 1.2116686197038744, "learning_rate": 1.8325648049911787e-05, "loss": 0.1682, "step": 6067 }, { "epoch": 1.2681295715778473, "grad_norm": 1.1300558789232844, "learning_rate": 1.8325023099258873e-05, "loss": 0.198, "step": 6068 }, { "epoch": 1.2683385579937303, "grad_norm": 1.0438739280435005, "learning_rate": 1.8324398042656882e-05, "loss": 0.1411, "step": 6069 }, { "epoch": 1.2685475444096133, "grad_norm": 1.1432490487103393, "learning_rate": 1.832377288011377e-05, "loss": 0.1548, "step": 6070 }, { "epoch": 1.2687565308254962, "grad_norm": 1.0180312564622187, "learning_rate": 1.8323147611637496e-05, "loss": 0.1449, "step": 6071 }, { "epoch": 1.2689655172413792, "grad_norm": 0.9766998784372999, "learning_rate": 1.8322522237236012e-05, "loss": 0.1531, "step": 6072 }, { "epoch": 1.2691745036572621, "grad_norm": 1.1163795903193394, "learning_rate": 1.8321896756917282e-05, "loss": 0.148, "step": 6073 }, { "epoch": 1.2693834900731453, "grad_norm": 1.1197545602627919, "learning_rate": 1.8321271170689263e-05, "loss": 0.1474, "step": 6074 }, { "epoch": 1.2695924764890283, "grad_norm": 1.2747099122791516, "learning_rate": 1.832064547855992e-05, "loss": 0.2182, "step": 6075 }, { "epoch": 1.2698014629049112, "grad_norm": 1.1909094655120356, "learning_rate": 1.832001968053721e-05, "loss": 0.1664, "step": 6076 }, { "epoch": 1.2700104493207942, "grad_norm": 1.209250631740835, "learning_rate": 1.8319393776629104e-05, "loss": 0.174, "step": 6077 }, { "epoch": 1.2702194357366772, "grad_norm": 1.3016502784279615, "learning_rate": 1.8318767766843564e-05, "loss": 0.2198, "step": 6078 }, { "epoch": 1.2704284221525601, "grad_norm": 1.1207344831073576, "learning_rate": 1.8318141651188558e-05, "loss": 0.1666, "step": 6079 }, { "epoch": 1.270637408568443, "grad_norm": 1.3373650563447639, "learning_rate": 1.831751542967205e-05, "loss": 0.2067, "step": 6080 }, { "epoch": 1.270846394984326, "grad_norm": 0.8981286720950507, "learning_rate": 1.8316889102302018e-05, "loss": 0.1527, "step": 6081 }, { "epoch": 1.271055381400209, "grad_norm": 1.0910883698995741, "learning_rate": 1.8316262669086425e-05, "loss": 0.1907, "step": 6082 }, { "epoch": 1.271264367816092, "grad_norm": 1.0080961108931468, "learning_rate": 1.831563613003325e-05, "loss": 0.17, "step": 6083 }, { "epoch": 1.271473354231975, "grad_norm": 1.0122188475611062, "learning_rate": 1.8315009485150458e-05, "loss": 0.1738, "step": 6084 }, { "epoch": 1.271682340647858, "grad_norm": 1.486191488301403, "learning_rate": 1.8314382734446035e-05, "loss": 0.1704, "step": 6085 }, { "epoch": 1.2718913270637409, "grad_norm": 1.093703810288908, "learning_rate": 1.831375587792795e-05, "loss": 0.1763, "step": 6086 }, { "epoch": 1.2721003134796238, "grad_norm": 0.9423026288121914, "learning_rate": 1.8313128915604184e-05, "loss": 0.1542, "step": 6087 }, { "epoch": 1.2723092998955068, "grad_norm": 1.0148339402378255, "learning_rate": 1.8312501847482712e-05, "loss": 0.1782, "step": 6088 }, { "epoch": 1.2725182863113897, "grad_norm": 1.1992794418297525, "learning_rate": 1.8311874673571522e-05, "loss": 0.1793, "step": 6089 }, { "epoch": 1.2727272727272727, "grad_norm": 1.041851477651057, "learning_rate": 1.8311247393878588e-05, "loss": 0.1927, "step": 6090 }, { "epoch": 1.2729362591431557, "grad_norm": 0.9584651898336943, "learning_rate": 1.8310620008411896e-05, "loss": 0.1581, "step": 6091 }, { "epoch": 1.2731452455590386, "grad_norm": 1.0496974628513591, "learning_rate": 1.830999251717943e-05, "loss": 0.1691, "step": 6092 }, { "epoch": 1.2733542319749216, "grad_norm": 1.166529275675257, "learning_rate": 1.8309364920189173e-05, "loss": 0.207, "step": 6093 }, { "epoch": 1.2735632183908046, "grad_norm": 0.9849526303914469, "learning_rate": 1.8308737217449123e-05, "loss": 0.1761, "step": 6094 }, { "epoch": 1.2737722048066875, "grad_norm": 1.0584973919845186, "learning_rate": 1.8308109408967256e-05, "loss": 0.1739, "step": 6095 }, { "epoch": 1.2739811912225705, "grad_norm": 0.887589357236281, "learning_rate": 1.8307481494751567e-05, "loss": 0.1955, "step": 6096 }, { "epoch": 1.2741901776384534, "grad_norm": 1.1446171974991504, "learning_rate": 1.830685347481005e-05, "loss": 0.1669, "step": 6097 }, { "epoch": 1.2743991640543364, "grad_norm": 0.9654225376275641, "learning_rate": 1.8306225349150694e-05, "loss": 0.1911, "step": 6098 }, { "epoch": 1.2746081504702194, "grad_norm": 0.9667403085328605, "learning_rate": 1.830559711778149e-05, "loss": 0.1699, "step": 6099 }, { "epoch": 1.2748171368861025, "grad_norm": 0.9901425654492721, "learning_rate": 1.830496878071044e-05, "loss": 0.1602, "step": 6100 }, { "epoch": 1.2750261233019855, "grad_norm": 1.1941124951287334, "learning_rate": 1.830434033794554e-05, "loss": 0.1866, "step": 6101 }, { "epoch": 1.2752351097178685, "grad_norm": 1.1413361109043634, "learning_rate": 1.830371178949478e-05, "loss": 0.1954, "step": 6102 }, { "epoch": 1.2754440961337514, "grad_norm": 0.9430760821218896, "learning_rate": 1.8303083135366166e-05, "loss": 0.1677, "step": 6103 }, { "epoch": 1.2756530825496344, "grad_norm": 0.9999952073974795, "learning_rate": 1.8302454375567697e-05, "loss": 0.1665, "step": 6104 }, { "epoch": 1.2758620689655173, "grad_norm": 1.1087720950955335, "learning_rate": 1.8301825510107374e-05, "loss": 0.1542, "step": 6105 }, { "epoch": 1.2760710553814003, "grad_norm": 1.0071792780741324, "learning_rate": 1.8301196538993206e-05, "loss": 0.1748, "step": 6106 }, { "epoch": 1.2762800417972833, "grad_norm": 1.0902617643425832, "learning_rate": 1.8300567462233193e-05, "loss": 0.1944, "step": 6107 }, { "epoch": 1.2764890282131662, "grad_norm": 0.9786814731269748, "learning_rate": 1.8299938279835333e-05, "loss": 0.1882, "step": 6108 }, { "epoch": 1.2766980146290492, "grad_norm": 1.5972553983479716, "learning_rate": 1.829930899180765e-05, "loss": 0.2118, "step": 6109 }, { "epoch": 1.2769070010449322, "grad_norm": 0.9691392037402121, "learning_rate": 1.829867959815814e-05, "loss": 0.1703, "step": 6110 }, { "epoch": 1.2771159874608151, "grad_norm": 0.9871551067364541, "learning_rate": 1.829805009889482e-05, "loss": 0.1567, "step": 6111 }, { "epoch": 1.277324973876698, "grad_norm": 0.9749611962642607, "learning_rate": 1.82974204940257e-05, "loss": 0.1648, "step": 6112 }, { "epoch": 1.277533960292581, "grad_norm": 0.89163610325515, "learning_rate": 1.8296790783558785e-05, "loss": 0.1409, "step": 6113 }, { "epoch": 1.277742946708464, "grad_norm": 0.9426725516707806, "learning_rate": 1.82961609675021e-05, "loss": 0.1284, "step": 6114 }, { "epoch": 1.277951933124347, "grad_norm": 0.9099885892077626, "learning_rate": 1.8295531045863652e-05, "loss": 0.1501, "step": 6115 }, { "epoch": 1.27816091954023, "grad_norm": 1.0610137677263678, "learning_rate": 1.829490101865146e-05, "loss": 0.174, "step": 6116 }, { "epoch": 1.2783699059561129, "grad_norm": 1.250771999741666, "learning_rate": 1.829427088587355e-05, "loss": 0.1872, "step": 6117 }, { "epoch": 1.2785788923719958, "grad_norm": 1.2433691193722902, "learning_rate": 1.829364064753793e-05, "loss": 0.161, "step": 6118 }, { "epoch": 1.2787878787878788, "grad_norm": 1.0668671069324573, "learning_rate": 1.829301030365263e-05, "loss": 0.162, "step": 6119 }, { "epoch": 1.2789968652037618, "grad_norm": 1.1254879497586583, "learning_rate": 1.8292379854225667e-05, "loss": 0.1671, "step": 6120 }, { "epoch": 1.2792058516196447, "grad_norm": 1.3023971298179775, "learning_rate": 1.8291749299265062e-05, "loss": 0.2079, "step": 6121 }, { "epoch": 1.2794148380355277, "grad_norm": 0.9775633230365096, "learning_rate": 1.8291118638778846e-05, "loss": 0.1497, "step": 6122 }, { "epoch": 1.2796238244514107, "grad_norm": 1.0007269773317564, "learning_rate": 1.8290487872775042e-05, "loss": 0.1528, "step": 6123 }, { "epoch": 1.2798328108672936, "grad_norm": 0.9890135457228517, "learning_rate": 1.8289857001261677e-05, "loss": 0.1569, "step": 6124 }, { "epoch": 1.2800417972831766, "grad_norm": 1.0161941139618675, "learning_rate": 1.828922602424678e-05, "loss": 0.1644, "step": 6125 }, { "epoch": 1.2802507836990595, "grad_norm": 1.1024482037093277, "learning_rate": 1.8288594941738383e-05, "loss": 0.1976, "step": 6126 }, { "epoch": 1.2804597701149425, "grad_norm": 1.0718151058473333, "learning_rate": 1.8287963753744517e-05, "loss": 0.1758, "step": 6127 }, { "epoch": 1.2806687565308255, "grad_norm": 0.9968992295447283, "learning_rate": 1.8287332460273215e-05, "loss": 0.1385, "step": 6128 }, { "epoch": 1.2808777429467084, "grad_norm": 1.1601343606002787, "learning_rate": 1.828670106133251e-05, "loss": 0.1792, "step": 6129 }, { "epoch": 1.2810867293625914, "grad_norm": 1.0481814097960394, "learning_rate": 1.8286069556930434e-05, "loss": 0.1725, "step": 6130 }, { "epoch": 1.2812957157784743, "grad_norm": 0.8815006273359087, "learning_rate": 1.828543794707503e-05, "loss": 0.1678, "step": 6131 }, { "epoch": 1.2815047021943573, "grad_norm": 0.788261168691626, "learning_rate": 1.8284806231774338e-05, "loss": 0.1511, "step": 6132 }, { "epoch": 1.2817136886102403, "grad_norm": 0.9535013281020114, "learning_rate": 1.8284174411036388e-05, "loss": 0.1817, "step": 6133 }, { "epoch": 1.2819226750261232, "grad_norm": 1.054746542941509, "learning_rate": 1.828354248486923e-05, "loss": 0.1714, "step": 6134 }, { "epoch": 1.2821316614420062, "grad_norm": 1.1278459257182882, "learning_rate": 1.82829104532809e-05, "loss": 0.1782, "step": 6135 }, { "epoch": 1.2823406478578891, "grad_norm": 0.9537345035008259, "learning_rate": 1.8282278316279445e-05, "loss": 0.1641, "step": 6136 }, { "epoch": 1.282549634273772, "grad_norm": 0.8595221907795907, "learning_rate": 1.8281646073872915e-05, "loss": 0.1588, "step": 6137 }, { "epoch": 1.282758620689655, "grad_norm": 0.9887876069932109, "learning_rate": 1.8281013726069343e-05, "loss": 0.1614, "step": 6138 }, { "epoch": 1.282967607105538, "grad_norm": 1.1287965903913226, "learning_rate": 1.8280381272876787e-05, "loss": 0.2065, "step": 6139 }, { "epoch": 1.283176593521421, "grad_norm": 1.0185204321623789, "learning_rate": 1.8279748714303296e-05, "loss": 0.1536, "step": 6140 }, { "epoch": 1.283385579937304, "grad_norm": 1.1121502369428302, "learning_rate": 1.8279116050356915e-05, "loss": 0.1727, "step": 6141 }, { "epoch": 1.283594566353187, "grad_norm": 1.230549121252052, "learning_rate": 1.82784832810457e-05, "loss": 0.1945, "step": 6142 }, { "epoch": 1.2838035527690699, "grad_norm": 1.0542476255145157, "learning_rate": 1.82778504063777e-05, "loss": 0.1577, "step": 6143 }, { "epoch": 1.284012539184953, "grad_norm": 1.086312262832882, "learning_rate": 1.827721742636097e-05, "loss": 0.1519, "step": 6144 }, { "epoch": 1.284221525600836, "grad_norm": 0.995863888296004, "learning_rate": 1.8276584341003572e-05, "loss": 0.1514, "step": 6145 }, { "epoch": 1.284430512016719, "grad_norm": 1.0314411252956954, "learning_rate": 1.8275951150313554e-05, "loss": 0.1696, "step": 6146 }, { "epoch": 1.284639498432602, "grad_norm": 1.0113068761386528, "learning_rate": 1.8275317854298983e-05, "loss": 0.1564, "step": 6147 }, { "epoch": 1.284848484848485, "grad_norm": 0.9365500770620429, "learning_rate": 1.827468445296791e-05, "loss": 0.1392, "step": 6148 }, { "epoch": 1.2850574712643679, "grad_norm": 0.9540389714819478, "learning_rate": 1.8274050946328402e-05, "loss": 0.1638, "step": 6149 }, { "epoch": 1.2852664576802508, "grad_norm": 1.3394053517021067, "learning_rate": 1.827341733438852e-05, "loss": 0.2201, "step": 6150 }, { "epoch": 1.2854754440961338, "grad_norm": 0.9708518117319637, "learning_rate": 1.8272783617156326e-05, "loss": 0.181, "step": 6151 }, { "epoch": 1.2856844305120168, "grad_norm": 1.0318862247584724, "learning_rate": 1.827214979463989e-05, "loss": 0.171, "step": 6152 }, { "epoch": 1.2858934169278997, "grad_norm": 1.0424689832988505, "learning_rate": 1.8271515866847268e-05, "loss": 0.157, "step": 6153 }, { "epoch": 1.2861024033437827, "grad_norm": 1.1473733073825396, "learning_rate": 1.827088183378654e-05, "loss": 0.1529, "step": 6154 }, { "epoch": 1.2863113897596656, "grad_norm": 1.191987157909465, "learning_rate": 1.827024769546577e-05, "loss": 0.1705, "step": 6155 }, { "epoch": 1.2865203761755486, "grad_norm": 1.0065891088719707, "learning_rate": 1.8269613451893024e-05, "loss": 0.1766, "step": 6156 }, { "epoch": 1.2867293625914316, "grad_norm": 0.8165358718034699, "learning_rate": 1.8268979103076385e-05, "loss": 0.1924, "step": 6157 }, { "epoch": 1.2869383490073145, "grad_norm": 1.0233825511839616, "learning_rate": 1.826834464902391e-05, "loss": 0.1786, "step": 6158 }, { "epoch": 1.2871473354231975, "grad_norm": 1.1755597140138736, "learning_rate": 1.8267710089743688e-05, "loss": 0.178, "step": 6159 }, { "epoch": 1.2873563218390804, "grad_norm": 0.9420350637671802, "learning_rate": 1.826707542524379e-05, "loss": 0.1674, "step": 6160 }, { "epoch": 1.2875653082549634, "grad_norm": 1.1753577341196362, "learning_rate": 1.8266440655532287e-05, "loss": 0.1468, "step": 6161 }, { "epoch": 1.2877742946708464, "grad_norm": 1.1209194052501337, "learning_rate": 1.8265805780617264e-05, "loss": 0.1435, "step": 6162 }, { "epoch": 1.2879832810867293, "grad_norm": 1.1655789314542864, "learning_rate": 1.8265170800506804e-05, "loss": 0.1744, "step": 6163 }, { "epoch": 1.2881922675026123, "grad_norm": 1.0874773384920573, "learning_rate": 1.826453571520898e-05, "loss": 0.1806, "step": 6164 }, { "epoch": 1.2884012539184952, "grad_norm": 1.0904438596879396, "learning_rate": 1.8263900524731876e-05, "loss": 0.1622, "step": 6165 }, { "epoch": 1.2886102403343782, "grad_norm": 0.9709480536238378, "learning_rate": 1.8263265229083584e-05, "loss": 0.1743, "step": 6166 }, { "epoch": 1.2888192267502612, "grad_norm": 0.9156919051003884, "learning_rate": 1.8262629828272177e-05, "loss": 0.1578, "step": 6167 }, { "epoch": 1.2890282131661441, "grad_norm": 0.9286442087459829, "learning_rate": 1.826199432230575e-05, "loss": 0.1793, "step": 6168 }, { "epoch": 1.289237199582027, "grad_norm": 1.1971339984349305, "learning_rate": 1.826135871119239e-05, "loss": 0.1749, "step": 6169 }, { "epoch": 1.28944618599791, "grad_norm": 1.1233350353869616, "learning_rate": 1.826072299494018e-05, "loss": 0.1855, "step": 6170 }, { "epoch": 1.2896551724137932, "grad_norm": 1.2593673848525735, "learning_rate": 1.826008717355722e-05, "loss": 0.1978, "step": 6171 }, { "epoch": 1.2898641588296762, "grad_norm": 0.9649079309247042, "learning_rate": 1.8259451247051592e-05, "loss": 0.1934, "step": 6172 }, { "epoch": 1.2900731452455592, "grad_norm": 1.1385194119790745, "learning_rate": 1.8258815215431395e-05, "loss": 0.167, "step": 6173 }, { "epoch": 1.2902821316614421, "grad_norm": 1.1651129843576804, "learning_rate": 1.8258179078704726e-05, "loss": 0.178, "step": 6174 }, { "epoch": 1.290491118077325, "grad_norm": 0.9782421245905767, "learning_rate": 1.8257542836879673e-05, "loss": 0.1606, "step": 6175 }, { "epoch": 1.290700104493208, "grad_norm": 0.9427539982850727, "learning_rate": 1.825690648996434e-05, "loss": 0.2023, "step": 6176 }, { "epoch": 1.290909090909091, "grad_norm": 1.1244344440791587, "learning_rate": 1.825627003796682e-05, "loss": 0.148, "step": 6177 }, { "epoch": 1.291118077324974, "grad_norm": 1.247707738206194, "learning_rate": 1.825563348089522e-05, "loss": 0.1976, "step": 6178 }, { "epoch": 1.291327063740857, "grad_norm": 0.9653251570709016, "learning_rate": 1.8254996818757632e-05, "loss": 0.1726, "step": 6179 }, { "epoch": 1.29153605015674, "grad_norm": 1.0182998241338261, "learning_rate": 1.8254360051562168e-05, "loss": 0.1679, "step": 6180 }, { "epoch": 1.2917450365726229, "grad_norm": 1.0788955416750594, "learning_rate": 1.8253723179316926e-05, "loss": 0.1701, "step": 6181 }, { "epoch": 1.2919540229885058, "grad_norm": 3.972165585178119, "learning_rate": 1.8253086202030014e-05, "loss": 0.1599, "step": 6182 }, { "epoch": 1.2921630094043888, "grad_norm": 1.0120618031864046, "learning_rate": 1.8252449119709536e-05, "loss": 0.185, "step": 6183 }, { "epoch": 1.2923719958202717, "grad_norm": 1.0409127119038886, "learning_rate": 1.8251811932363602e-05, "loss": 0.1899, "step": 6184 }, { "epoch": 1.2925809822361547, "grad_norm": 1.1302568789507064, "learning_rate": 1.8251174640000315e-05, "loss": 0.1858, "step": 6185 }, { "epoch": 1.2927899686520377, "grad_norm": 0.8931970492971538, "learning_rate": 1.8250537242627796e-05, "loss": 0.1715, "step": 6186 }, { "epoch": 1.2929989550679206, "grad_norm": 0.8198625223409914, "learning_rate": 1.824989974025415e-05, "loss": 0.1677, "step": 6187 }, { "epoch": 1.2932079414838036, "grad_norm": 0.9743724644234076, "learning_rate": 1.8249262132887497e-05, "loss": 0.1853, "step": 6188 }, { "epoch": 1.2934169278996865, "grad_norm": 0.9690448469882721, "learning_rate": 1.824862442053594e-05, "loss": 0.1392, "step": 6189 }, { "epoch": 1.2936259143155695, "grad_norm": 1.0725655121678268, "learning_rate": 1.8247986603207602e-05, "loss": 0.185, "step": 6190 }, { "epoch": 1.2938349007314525, "grad_norm": 0.8902947341831007, "learning_rate": 1.82473486809106e-05, "loss": 0.1656, "step": 6191 }, { "epoch": 1.2940438871473354, "grad_norm": 1.1012784805850897, "learning_rate": 1.8246710653653055e-05, "loss": 0.177, "step": 6192 }, { "epoch": 1.2942528735632184, "grad_norm": 1.161718451327125, "learning_rate": 1.8246072521443082e-05, "loss": 0.1866, "step": 6193 }, { "epoch": 1.2944618599791013, "grad_norm": 0.9649372796006972, "learning_rate": 1.8245434284288804e-05, "loss": 0.1758, "step": 6194 }, { "epoch": 1.2946708463949843, "grad_norm": 0.9932926385066466, "learning_rate": 1.8244795942198347e-05, "loss": 0.1716, "step": 6195 }, { "epoch": 1.2948798328108673, "grad_norm": 1.004560156514156, "learning_rate": 1.8244157495179824e-05, "loss": 0.1454, "step": 6196 }, { "epoch": 1.2950888192267502, "grad_norm": 1.2019652335915991, "learning_rate": 1.8243518943241377e-05, "loss": 0.1911, "step": 6197 }, { "epoch": 1.2952978056426332, "grad_norm": 1.1350470037741778, "learning_rate": 1.824288028639112e-05, "loss": 0.1935, "step": 6198 }, { "epoch": 1.2955067920585162, "grad_norm": 1.0295436529554662, "learning_rate": 1.8242241524637186e-05, "loss": 0.1848, "step": 6199 }, { "epoch": 1.2957157784743991, "grad_norm": 1.0667899547727344, "learning_rate": 1.82416026579877e-05, "loss": 0.1651, "step": 6200 }, { "epoch": 1.295924764890282, "grad_norm": 0.9979842380253052, "learning_rate": 1.82409636864508e-05, "loss": 0.1882, "step": 6201 }, { "epoch": 1.296133751306165, "grad_norm": 0.987089493441176, "learning_rate": 1.8240324610034607e-05, "loss": 0.1474, "step": 6202 }, { "epoch": 1.296342737722048, "grad_norm": 0.9553854159234877, "learning_rate": 1.8239685428747264e-05, "loss": 0.1499, "step": 6203 }, { "epoch": 1.296551724137931, "grad_norm": 1.149338145622006, "learning_rate": 1.82390461425969e-05, "loss": 0.192, "step": 6204 }, { "epoch": 1.296760710553814, "grad_norm": 0.928147483338977, "learning_rate": 1.8238406751591658e-05, "loss": 0.1396, "step": 6205 }, { "epoch": 1.2969696969696969, "grad_norm": 1.2489708286313603, "learning_rate": 1.823776725573967e-05, "loss": 0.1981, "step": 6206 }, { "epoch": 1.2971786833855798, "grad_norm": 1.013392760926191, "learning_rate": 1.8237127655049073e-05, "loss": 0.1485, "step": 6207 }, { "epoch": 1.2973876698014628, "grad_norm": 1.0944981155426836, "learning_rate": 1.8236487949528005e-05, "loss": 0.2029, "step": 6208 }, { "epoch": 1.2975966562173458, "grad_norm": 1.1398859937341013, "learning_rate": 1.8235848139184617e-05, "loss": 0.1878, "step": 6209 }, { "epoch": 1.2978056426332287, "grad_norm": 1.1583995604613686, "learning_rate": 1.823520822402704e-05, "loss": 0.161, "step": 6210 }, { "epoch": 1.2980146290491117, "grad_norm": 0.9143693224005994, "learning_rate": 1.823456820406343e-05, "loss": 0.1576, "step": 6211 }, { "epoch": 1.2982236154649947, "grad_norm": 1.2013460771866198, "learning_rate": 1.8233928079301923e-05, "loss": 0.159, "step": 6212 }, { "epoch": 1.2984326018808776, "grad_norm": 1.0453520488907189, "learning_rate": 1.8233287849750668e-05, "loss": 0.1675, "step": 6213 }, { "epoch": 1.2986415882967606, "grad_norm": 1.143894405969072, "learning_rate": 1.8232647515417814e-05, "loss": 0.1719, "step": 6214 }, { "epoch": 1.2988505747126438, "grad_norm": 1.1395660193893216, "learning_rate": 1.8232007076311512e-05, "loss": 0.1746, "step": 6215 }, { "epoch": 1.2990595611285267, "grad_norm": 1.0217936482103676, "learning_rate": 1.8231366532439906e-05, "loss": 0.1414, "step": 6216 }, { "epoch": 1.2992685475444097, "grad_norm": 1.2216632944366568, "learning_rate": 1.8230725883811155e-05, "loss": 0.175, "step": 6217 }, { "epoch": 1.2994775339602926, "grad_norm": 1.3030818672150988, "learning_rate": 1.823008513043341e-05, "loss": 0.2182, "step": 6218 }, { "epoch": 1.2996865203761756, "grad_norm": 1.1531474399024988, "learning_rate": 1.8229444272314823e-05, "loss": 0.1732, "step": 6219 }, { "epoch": 1.2998955067920586, "grad_norm": 1.176465388338905, "learning_rate": 1.8228803309463554e-05, "loss": 0.1596, "step": 6220 }, { "epoch": 1.3001044932079415, "grad_norm": 1.14784140918579, "learning_rate": 1.8228162241887757e-05, "loss": 0.1714, "step": 6221 }, { "epoch": 1.3003134796238245, "grad_norm": 1.128983589084423, "learning_rate": 1.8227521069595593e-05, "loss": 0.18, "step": 6222 }, { "epoch": 1.3005224660397074, "grad_norm": 1.101754872891018, "learning_rate": 1.8226879792595223e-05, "loss": 0.1824, "step": 6223 }, { "epoch": 1.3007314524555904, "grad_norm": 1.211172644834221, "learning_rate": 1.82262384108948e-05, "loss": 0.1715, "step": 6224 }, { "epoch": 1.3009404388714734, "grad_norm": 1.0317090855354951, "learning_rate": 1.82255969245025e-05, "loss": 0.1563, "step": 6225 }, { "epoch": 1.3011494252873563, "grad_norm": 1.021901029223566, "learning_rate": 1.8224955333426476e-05, "loss": 0.1769, "step": 6226 }, { "epoch": 1.3013584117032393, "grad_norm": 1.044540278512297, "learning_rate": 1.8224313637674897e-05, "loss": 0.1863, "step": 6227 }, { "epoch": 1.3015673981191223, "grad_norm": 1.0181208979250094, "learning_rate": 1.8223671837255928e-05, "loss": 0.1633, "step": 6228 }, { "epoch": 1.3017763845350052, "grad_norm": 0.9947078841368987, "learning_rate": 1.822302993217774e-05, "loss": 0.175, "step": 6229 }, { "epoch": 1.3019853709508882, "grad_norm": 0.9818334337510053, "learning_rate": 1.82223879224485e-05, "loss": 0.1658, "step": 6230 }, { "epoch": 1.3021943573667711, "grad_norm": 1.3029741702059308, "learning_rate": 1.8221745808076383e-05, "loss": 0.1832, "step": 6231 }, { "epoch": 1.302403343782654, "grad_norm": 0.940053804315946, "learning_rate": 1.8221103589069553e-05, "loss": 0.1714, "step": 6232 }, { "epoch": 1.302612330198537, "grad_norm": 1.02982391485991, "learning_rate": 1.8220461265436188e-05, "loss": 0.1709, "step": 6233 }, { "epoch": 1.30282131661442, "grad_norm": 0.9789924898333663, "learning_rate": 1.8219818837184462e-05, "loss": 0.1556, "step": 6234 }, { "epoch": 1.303030303030303, "grad_norm": 0.9063272217498832, "learning_rate": 1.8219176304322554e-05, "loss": 0.1774, "step": 6235 }, { "epoch": 1.303239289446186, "grad_norm": 0.849706370119251, "learning_rate": 1.8218533666858637e-05, "loss": 0.1658, "step": 6236 }, { "epoch": 1.303448275862069, "grad_norm": 1.265443104798788, "learning_rate": 1.821789092480089e-05, "loss": 0.1445, "step": 6237 }, { "epoch": 1.3036572622779519, "grad_norm": 1.1027845050449603, "learning_rate": 1.8217248078157492e-05, "loss": 0.1869, "step": 6238 }, { "epoch": 1.3038662486938348, "grad_norm": 1.163652430250471, "learning_rate": 1.8216605126936627e-05, "loss": 0.1951, "step": 6239 }, { "epoch": 1.3040752351097178, "grad_norm": 1.2017149185586864, "learning_rate": 1.8215962071146477e-05, "loss": 0.1795, "step": 6240 }, { "epoch": 1.304284221525601, "grad_norm": 0.9268365419989808, "learning_rate": 1.8215318910795227e-05, "loss": 0.1572, "step": 6241 }, { "epoch": 1.304493207941484, "grad_norm": 1.0705755145175748, "learning_rate": 1.8214675645891054e-05, "loss": 0.1536, "step": 6242 }, { "epoch": 1.304702194357367, "grad_norm": 1.149363399247601, "learning_rate": 1.8214032276442158e-05, "loss": 0.1625, "step": 6243 }, { "epoch": 1.3049111807732499, "grad_norm": 1.071703783977314, "learning_rate": 1.8213388802456718e-05, "loss": 0.1853, "step": 6244 }, { "epoch": 1.3051201671891328, "grad_norm": 1.1213526435159313, "learning_rate": 1.8212745223942927e-05, "loss": 0.1684, "step": 6245 }, { "epoch": 1.3053291536050158, "grad_norm": 1.0362579968345138, "learning_rate": 1.8212101540908973e-05, "loss": 0.1587, "step": 6246 }, { "epoch": 1.3055381400208987, "grad_norm": 1.0456667114867384, "learning_rate": 1.8211457753363046e-05, "loss": 0.151, "step": 6247 }, { "epoch": 1.3057471264367817, "grad_norm": 1.4012047910414671, "learning_rate": 1.8210813861313344e-05, "loss": 0.1883, "step": 6248 }, { "epoch": 1.3059561128526647, "grad_norm": 1.0221593779952391, "learning_rate": 1.8210169864768058e-05, "loss": 0.1627, "step": 6249 }, { "epoch": 1.3061650992685476, "grad_norm": 1.0908293031363467, "learning_rate": 1.820952576373539e-05, "loss": 0.1521, "step": 6250 }, { "epoch": 1.3063740856844306, "grad_norm": 0.9983534684870596, "learning_rate": 1.820888155822353e-05, "loss": 0.2058, "step": 6251 }, { "epoch": 1.3065830721003135, "grad_norm": 1.0003436350688488, "learning_rate": 1.820823724824068e-05, "loss": 0.1752, "step": 6252 }, { "epoch": 1.3067920585161965, "grad_norm": 1.2992361017523406, "learning_rate": 1.820759283379504e-05, "loss": 0.1609, "step": 6253 }, { "epoch": 1.3070010449320795, "grad_norm": 1.0821356937565463, "learning_rate": 1.8206948314894806e-05, "loss": 0.1735, "step": 6254 }, { "epoch": 1.3072100313479624, "grad_norm": 1.1940908390952987, "learning_rate": 1.8206303691548185e-05, "loss": 0.1837, "step": 6255 }, { "epoch": 1.3074190177638454, "grad_norm": 1.0546310754394566, "learning_rate": 1.820565896376338e-05, "loss": 0.1713, "step": 6256 }, { "epoch": 1.3076280041797284, "grad_norm": 0.8914665122033748, "learning_rate": 1.82050141315486e-05, "loss": 0.1368, "step": 6257 }, { "epoch": 1.3078369905956113, "grad_norm": 1.2241563084159732, "learning_rate": 1.820436919491205e-05, "loss": 0.1586, "step": 6258 }, { "epoch": 1.3080459770114943, "grad_norm": 0.9907221413470956, "learning_rate": 1.820372415386193e-05, "loss": 0.1625, "step": 6259 }, { "epoch": 1.3082549634273772, "grad_norm": 1.1164713842429215, "learning_rate": 1.8203079008406458e-05, "loss": 0.1585, "step": 6260 }, { "epoch": 1.3084639498432602, "grad_norm": 1.0402341465507945, "learning_rate": 1.8202433758553845e-05, "loss": 0.1685, "step": 6261 }, { "epoch": 1.3086729362591432, "grad_norm": 0.9914480874711794, "learning_rate": 1.8201788404312295e-05, "loss": 0.1794, "step": 6262 }, { "epoch": 1.3088819226750261, "grad_norm": 1.056935685369621, "learning_rate": 1.8201142945690028e-05, "loss": 0.1974, "step": 6263 }, { "epoch": 1.309090909090909, "grad_norm": 1.0064458168758426, "learning_rate": 1.8200497382695257e-05, "loss": 0.1876, "step": 6264 }, { "epoch": 1.309299895506792, "grad_norm": 1.083853016203276, "learning_rate": 1.8199851715336194e-05, "loss": 0.1809, "step": 6265 }, { "epoch": 1.309508881922675, "grad_norm": 0.9625625599731362, "learning_rate": 1.819920594362106e-05, "loss": 0.1282, "step": 6266 }, { "epoch": 1.309717868338558, "grad_norm": 1.2162977664357608, "learning_rate": 1.8198560067558077e-05, "loss": 0.1467, "step": 6267 }, { "epoch": 1.309926854754441, "grad_norm": 0.9302721840434939, "learning_rate": 1.8197914087155454e-05, "loss": 0.1657, "step": 6268 }, { "epoch": 1.310135841170324, "grad_norm": 1.3149872450939852, "learning_rate": 1.8197268002421422e-05, "loss": 0.1541, "step": 6269 }, { "epoch": 1.3103448275862069, "grad_norm": 0.9763736955870241, "learning_rate": 1.81966218133642e-05, "loss": 0.1695, "step": 6270 }, { "epoch": 1.3105538140020898, "grad_norm": 1.0034591244378555, "learning_rate": 1.819597551999201e-05, "loss": 0.1721, "step": 6271 }, { "epoch": 1.3107628004179728, "grad_norm": 1.2472330066784283, "learning_rate": 1.819532912231308e-05, "loss": 0.1797, "step": 6272 }, { "epoch": 1.3109717868338557, "grad_norm": 1.1011205836218194, "learning_rate": 1.8194682620335636e-05, "loss": 0.1934, "step": 6273 }, { "epoch": 1.3111807732497387, "grad_norm": 0.9871132992892105, "learning_rate": 1.8194036014067907e-05, "loss": 0.1633, "step": 6274 }, { "epoch": 1.3113897596656217, "grad_norm": 0.9573508052429713, "learning_rate": 1.8193389303518113e-05, "loss": 0.2068, "step": 6275 }, { "epoch": 1.3115987460815046, "grad_norm": 0.9516910728377466, "learning_rate": 1.8192742488694498e-05, "loss": 0.1557, "step": 6276 }, { "epoch": 1.3118077324973876, "grad_norm": 0.973512937985778, "learning_rate": 1.8192095569605287e-05, "loss": 0.1614, "step": 6277 }, { "epoch": 1.3120167189132705, "grad_norm": 1.0497900510666385, "learning_rate": 1.819144854625871e-05, "loss": 0.1911, "step": 6278 }, { "epoch": 1.3122257053291535, "grad_norm": 1.0998442472502272, "learning_rate": 1.8190801418663007e-05, "loss": 0.1325, "step": 6279 }, { "epoch": 1.3124346917450365, "grad_norm": 1.131027439162655, "learning_rate": 1.8190154186826413e-05, "loss": 0.1835, "step": 6280 }, { "epoch": 1.3126436781609194, "grad_norm": 1.0327240818601924, "learning_rate": 1.818950685075716e-05, "loss": 0.1732, "step": 6281 }, { "epoch": 1.3128526645768024, "grad_norm": 1.153969040555567, "learning_rate": 1.8188859410463492e-05, "loss": 0.1632, "step": 6282 }, { "epoch": 1.3130616509926853, "grad_norm": 1.0660793398436814, "learning_rate": 1.8188211865953644e-05, "loss": 0.1923, "step": 6283 }, { "epoch": 1.3132706374085683, "grad_norm": 1.0526801568594473, "learning_rate": 1.8187564217235863e-05, "loss": 0.1449, "step": 6284 }, { "epoch": 1.3134796238244515, "grad_norm": 1.1756471008750975, "learning_rate": 1.8186916464318387e-05, "loss": 0.1697, "step": 6285 }, { "epoch": 1.3136886102403345, "grad_norm": 1.2673782859862748, "learning_rate": 1.8186268607209458e-05, "loss": 0.1805, "step": 6286 }, { "epoch": 1.3138975966562174, "grad_norm": 0.9931278966008116, "learning_rate": 1.8185620645917326e-05, "loss": 0.1538, "step": 6287 }, { "epoch": 1.3141065830721004, "grad_norm": 1.0184864823576125, "learning_rate": 1.8184972580450235e-05, "loss": 0.1598, "step": 6288 }, { "epoch": 1.3143155694879833, "grad_norm": 0.955145300710507, "learning_rate": 1.8184324410816435e-05, "loss": 0.1215, "step": 6289 }, { "epoch": 1.3145245559038663, "grad_norm": 0.8893570135246917, "learning_rate": 1.818367613702417e-05, "loss": 0.1616, "step": 6290 }, { "epoch": 1.3147335423197493, "grad_norm": 1.0679383632849855, "learning_rate": 1.818302775908169e-05, "loss": 0.2128, "step": 6291 }, { "epoch": 1.3149425287356322, "grad_norm": 0.9282596460720755, "learning_rate": 1.8182379276997256e-05, "loss": 0.1862, "step": 6292 }, { "epoch": 1.3151515151515152, "grad_norm": 1.1909596052021236, "learning_rate": 1.818173069077911e-05, "loss": 0.1614, "step": 6293 }, { "epoch": 1.3153605015673981, "grad_norm": 1.024678489468727, "learning_rate": 1.8181082000435512e-05, "loss": 0.1583, "step": 6294 }, { "epoch": 1.315569487983281, "grad_norm": 1.1704957149129818, "learning_rate": 1.8180433205974716e-05, "loss": 0.1343, "step": 6295 }, { "epoch": 1.315778474399164, "grad_norm": 1.260294656795218, "learning_rate": 1.817978430740498e-05, "loss": 0.1467, "step": 6296 }, { "epoch": 1.315987460815047, "grad_norm": 0.9069433426546387, "learning_rate": 1.8179135304734563e-05, "loss": 0.1539, "step": 6297 }, { "epoch": 1.31619644723093, "grad_norm": 1.0443693688216544, "learning_rate": 1.8178486197971722e-05, "loss": 0.1587, "step": 6298 }, { "epoch": 1.316405433646813, "grad_norm": 0.9564120909589624, "learning_rate": 1.8177836987124718e-05, "loss": 0.1609, "step": 6299 }, { "epoch": 1.316614420062696, "grad_norm": 1.0924895273053117, "learning_rate": 1.8177187672201814e-05, "loss": 0.1791, "step": 6300 }, { "epoch": 1.3168234064785789, "grad_norm": 1.009673893853273, "learning_rate": 1.8176538253211276e-05, "loss": 0.1835, "step": 6301 }, { "epoch": 1.3170323928944618, "grad_norm": 1.2248635614990435, "learning_rate": 1.8175888730161368e-05, "loss": 0.1813, "step": 6302 }, { "epoch": 1.3172413793103448, "grad_norm": 1.2984129536699076, "learning_rate": 1.817523910306035e-05, "loss": 0.1764, "step": 6303 }, { "epoch": 1.3174503657262278, "grad_norm": 1.2551072575067101, "learning_rate": 1.8174589371916497e-05, "loss": 0.1902, "step": 6304 }, { "epoch": 1.3176593521421107, "grad_norm": 1.177667463377398, "learning_rate": 1.8173939536738076e-05, "loss": 0.2112, "step": 6305 }, { "epoch": 1.3178683385579937, "grad_norm": 0.877833215979703, "learning_rate": 1.8173289597533356e-05, "loss": 0.177, "step": 6306 }, { "epoch": 1.3180773249738766, "grad_norm": 1.2407670724439364, "learning_rate": 1.817263955431061e-05, "loss": 0.1632, "step": 6307 }, { "epoch": 1.3182863113897596, "grad_norm": 0.9432799956400344, "learning_rate": 1.817198940707811e-05, "loss": 0.1763, "step": 6308 }, { "epoch": 1.3184952978056426, "grad_norm": 1.1345067341746353, "learning_rate": 1.8171339155844128e-05, "loss": 0.194, "step": 6309 }, { "epoch": 1.3187042842215255, "grad_norm": 0.9810171166214366, "learning_rate": 1.8170688800616944e-05, "loss": 0.1897, "step": 6310 }, { "epoch": 1.3189132706374085, "grad_norm": 0.9885946293219027, "learning_rate": 1.8170038341404833e-05, "loss": 0.1653, "step": 6311 }, { "epoch": 1.3191222570532917, "grad_norm": 0.9856869952812852, "learning_rate": 1.8169387778216067e-05, "loss": 0.1512, "step": 6312 }, { "epoch": 1.3193312434691746, "grad_norm": 1.071268943135358, "learning_rate": 1.8168737111058937e-05, "loss": 0.1689, "step": 6313 }, { "epoch": 1.3195402298850576, "grad_norm": 0.9901453205786287, "learning_rate": 1.8168086339941716e-05, "loss": 0.1698, "step": 6314 }, { "epoch": 1.3197492163009406, "grad_norm": 0.8937226855781157, "learning_rate": 1.816743546487269e-05, "loss": 0.1988, "step": 6315 }, { "epoch": 1.3199582027168235, "grad_norm": 1.075321515078293, "learning_rate": 1.8166784485860135e-05, "loss": 0.1548, "step": 6316 }, { "epoch": 1.3201671891327065, "grad_norm": 1.0764552689603322, "learning_rate": 1.8166133402912342e-05, "loss": 0.1634, "step": 6317 }, { "epoch": 1.3203761755485894, "grad_norm": 1.0277488219493405, "learning_rate": 1.8165482216037595e-05, "loss": 0.1531, "step": 6318 }, { "epoch": 1.3205851619644724, "grad_norm": 0.9319365596342153, "learning_rate": 1.8164830925244186e-05, "loss": 0.1917, "step": 6319 }, { "epoch": 1.3207941483803554, "grad_norm": 0.8672233335112581, "learning_rate": 1.8164179530540398e-05, "loss": 0.1482, "step": 6320 }, { "epoch": 1.3210031347962383, "grad_norm": 1.2153202302811512, "learning_rate": 1.8163528031934524e-05, "loss": 0.1643, "step": 6321 }, { "epoch": 1.3212121212121213, "grad_norm": 1.3495248488014768, "learning_rate": 1.816287642943485e-05, "loss": 0.1682, "step": 6322 }, { "epoch": 1.3214211076280042, "grad_norm": 1.2946972144127327, "learning_rate": 1.8162224723049678e-05, "loss": 0.1673, "step": 6323 }, { "epoch": 1.3216300940438872, "grad_norm": 1.1100581931434483, "learning_rate": 1.8161572912787298e-05, "loss": 0.2013, "step": 6324 }, { "epoch": 1.3218390804597702, "grad_norm": 1.0615230721035933, "learning_rate": 1.8160920998656e-05, "loss": 0.1758, "step": 6325 }, { "epoch": 1.3220480668756531, "grad_norm": 1.277604810677719, "learning_rate": 1.816026898066409e-05, "loss": 0.1592, "step": 6326 }, { "epoch": 1.322257053291536, "grad_norm": 1.1104676320289646, "learning_rate": 1.8159616858819856e-05, "loss": 0.1524, "step": 6327 }, { "epoch": 1.322466039707419, "grad_norm": 1.2104539792878868, "learning_rate": 1.8158964633131607e-05, "loss": 0.1884, "step": 6328 }, { "epoch": 1.322675026123302, "grad_norm": 1.1924639364271483, "learning_rate": 1.8158312303607638e-05, "loss": 0.1912, "step": 6329 }, { "epoch": 1.322884012539185, "grad_norm": 1.2004579769188444, "learning_rate": 1.815765987025625e-05, "loss": 0.1947, "step": 6330 }, { "epoch": 1.323092998955068, "grad_norm": 1.205988857362296, "learning_rate": 1.815700733308575e-05, "loss": 0.1465, "step": 6331 }, { "epoch": 1.323301985370951, "grad_norm": 1.1137664017466045, "learning_rate": 1.815635469210444e-05, "loss": 0.1704, "step": 6332 }, { "epoch": 1.3235109717868339, "grad_norm": 1.1803982162964826, "learning_rate": 1.8155701947320624e-05, "loss": 0.1692, "step": 6333 }, { "epoch": 1.3237199582027168, "grad_norm": 0.8878353431052036, "learning_rate": 1.815504909874261e-05, "loss": 0.1448, "step": 6334 }, { "epoch": 1.3239289446185998, "grad_norm": 1.0521774262456425, "learning_rate": 1.8154396146378714e-05, "loss": 0.1753, "step": 6335 }, { "epoch": 1.3241379310344827, "grad_norm": 1.041255419874674, "learning_rate": 1.815374309023724e-05, "loss": 0.1267, "step": 6336 }, { "epoch": 1.3243469174503657, "grad_norm": 0.9659628659057219, "learning_rate": 1.8153089930326495e-05, "loss": 0.1659, "step": 6337 }, { "epoch": 1.3245559038662487, "grad_norm": 0.975317890700321, "learning_rate": 1.81524366666548e-05, "loss": 0.1806, "step": 6338 }, { "epoch": 1.3247648902821316, "grad_norm": 0.9739053910693827, "learning_rate": 1.815178329923046e-05, "loss": 0.178, "step": 6339 }, { "epoch": 1.3249738766980146, "grad_norm": 1.302078439459824, "learning_rate": 1.8151129828061797e-05, "loss": 0.1861, "step": 6340 }, { "epoch": 1.3251828631138975, "grad_norm": 0.8448372708672361, "learning_rate": 1.8150476253157124e-05, "loss": 0.1659, "step": 6341 }, { "epoch": 1.3253918495297805, "grad_norm": 1.240547993415429, "learning_rate": 1.8149822574524763e-05, "loss": 0.2011, "step": 6342 }, { "epoch": 1.3256008359456635, "grad_norm": 1.129961867124767, "learning_rate": 1.814916879217303e-05, "loss": 0.168, "step": 6343 }, { "epoch": 1.3258098223615464, "grad_norm": 0.9912997951283218, "learning_rate": 1.8148514906110242e-05, "loss": 0.2069, "step": 6344 }, { "epoch": 1.3260188087774294, "grad_norm": 1.0016426370945508, "learning_rate": 1.8147860916344725e-05, "loss": 0.1756, "step": 6345 }, { "epoch": 1.3262277951933124, "grad_norm": 1.082931769432076, "learning_rate": 1.8147206822884805e-05, "loss": 0.1727, "step": 6346 }, { "epoch": 1.3264367816091953, "grad_norm": 0.671256990103067, "learning_rate": 1.81465526257388e-05, "loss": 0.1606, "step": 6347 }, { "epoch": 1.3266457680250783, "grad_norm": 0.8737942299454162, "learning_rate": 1.8145898324915035e-05, "loss": 0.1594, "step": 6348 }, { "epoch": 1.3268547544409612, "grad_norm": 1.0533654522426803, "learning_rate": 1.814524392042184e-05, "loss": 0.1434, "step": 6349 }, { "epoch": 1.3270637408568442, "grad_norm": 0.9787113890444031, "learning_rate": 1.814458941226755e-05, "loss": 0.1681, "step": 6350 }, { "epoch": 1.3272727272727272, "grad_norm": 0.9689610802162559, "learning_rate": 1.8143934800460484e-05, "loss": 0.1712, "step": 6351 }, { "epoch": 1.3274817136886101, "grad_norm": 0.9842110046411056, "learning_rate": 1.814328008500898e-05, "loss": 0.1899, "step": 6352 }, { "epoch": 1.327690700104493, "grad_norm": 0.9628945932352005, "learning_rate": 1.8142625265921365e-05, "loss": 0.1538, "step": 6353 }, { "epoch": 1.327899686520376, "grad_norm": 1.0501264948266298, "learning_rate": 1.8141970343205978e-05, "loss": 0.1772, "step": 6354 }, { "epoch": 1.328108672936259, "grad_norm": 1.1774057011874757, "learning_rate": 1.8141315316871146e-05, "loss": 0.173, "step": 6355 }, { "epoch": 1.3283176593521422, "grad_norm": 0.6895553190131648, "learning_rate": 1.8140660186925214e-05, "loss": 0.1445, "step": 6356 }, { "epoch": 1.3285266457680251, "grad_norm": 0.8757907528516016, "learning_rate": 1.8140004953376514e-05, "loss": 0.2057, "step": 6357 }, { "epoch": 1.328735632183908, "grad_norm": 0.8383743103286712, "learning_rate": 1.813934961623339e-05, "loss": 0.1576, "step": 6358 }, { "epoch": 1.328944618599791, "grad_norm": 0.9349327018466744, "learning_rate": 1.813869417550418e-05, "loss": 0.1428, "step": 6359 }, { "epoch": 1.329153605015674, "grad_norm": 0.9223652142019105, "learning_rate": 1.8138038631197218e-05, "loss": 0.1669, "step": 6360 }, { "epoch": 1.329362591431557, "grad_norm": 0.857292689690284, "learning_rate": 1.8137382983320858e-05, "loss": 0.1609, "step": 6361 }, { "epoch": 1.32957157784744, "grad_norm": 1.017251694799338, "learning_rate": 1.813672723188344e-05, "loss": 0.1692, "step": 6362 }, { "epoch": 1.329780564263323, "grad_norm": 0.9222869476988499, "learning_rate": 1.8136071376893305e-05, "loss": 0.1638, "step": 6363 }, { "epoch": 1.3299895506792059, "grad_norm": 1.0964292535912898, "learning_rate": 1.813541541835881e-05, "loss": 0.1915, "step": 6364 }, { "epoch": 1.3301985370950888, "grad_norm": 1.0823084929102498, "learning_rate": 1.813475935628829e-05, "loss": 0.1682, "step": 6365 }, { "epoch": 1.3304075235109718, "grad_norm": 1.1309638231576868, "learning_rate": 1.8134103190690103e-05, "loss": 0.1803, "step": 6366 }, { "epoch": 1.3306165099268548, "grad_norm": 1.004105240039822, "learning_rate": 1.8133446921572603e-05, "loss": 0.2074, "step": 6367 }, { "epoch": 1.3308254963427377, "grad_norm": 0.8243302634047318, "learning_rate": 1.8132790548944133e-05, "loss": 0.1659, "step": 6368 }, { "epoch": 1.3310344827586207, "grad_norm": 0.9269687698928643, "learning_rate": 1.813213407281305e-05, "loss": 0.1497, "step": 6369 }, { "epoch": 1.3312434691745036, "grad_norm": 0.9762782640732204, "learning_rate": 1.813147749318771e-05, "loss": 0.1721, "step": 6370 }, { "epoch": 1.3314524555903866, "grad_norm": 0.8103721824905135, "learning_rate": 1.8130820810076467e-05, "loss": 0.1277, "step": 6371 }, { "epoch": 1.3316614420062696, "grad_norm": 0.91218347314866, "learning_rate": 1.813016402348768e-05, "loss": 0.1527, "step": 6372 }, { "epoch": 1.3318704284221525, "grad_norm": 1.0508794356927509, "learning_rate": 1.8129507133429708e-05, "loss": 0.2278, "step": 6373 }, { "epoch": 1.3320794148380355, "grad_norm": 1.0412603391194286, "learning_rate": 1.8128850139910912e-05, "loss": 0.1743, "step": 6374 }, { "epoch": 1.3322884012539185, "grad_norm": 1.0536629174237373, "learning_rate": 1.812819304293965e-05, "loss": 0.1899, "step": 6375 }, { "epoch": 1.3324973876698014, "grad_norm": 0.9790560585228915, "learning_rate": 1.8127535842524284e-05, "loss": 0.1801, "step": 6376 }, { "epoch": 1.3327063740856844, "grad_norm": 1.0261114754227423, "learning_rate": 1.8126878538673182e-05, "loss": 0.1697, "step": 6377 }, { "epoch": 1.3329153605015673, "grad_norm": 1.1201147980463872, "learning_rate": 1.8126221131394705e-05, "loss": 0.1782, "step": 6378 }, { "epoch": 1.3331243469174503, "grad_norm": 1.0594514485170705, "learning_rate": 1.8125563620697223e-05, "loss": 0.1724, "step": 6379 }, { "epoch": 1.3333333333333333, "grad_norm": 0.9088019158718087, "learning_rate": 1.8124906006589106e-05, "loss": 0.1577, "step": 6380 }, { "epoch": 1.3335423197492162, "grad_norm": 0.9564142577411558, "learning_rate": 1.8124248289078714e-05, "loss": 0.1739, "step": 6381 }, { "epoch": 1.3337513061650994, "grad_norm": 1.026417925648749, "learning_rate": 1.812359046817443e-05, "loss": 0.1881, "step": 6382 }, { "epoch": 1.3339602925809824, "grad_norm": 1.0759009210118355, "learning_rate": 1.812293254388461e-05, "loss": 0.1838, "step": 6383 }, { "epoch": 1.3341692789968653, "grad_norm": 0.930474147501552, "learning_rate": 1.8122274516217643e-05, "loss": 0.1759, "step": 6384 }, { "epoch": 1.3343782654127483, "grad_norm": 0.911116375421098, "learning_rate": 1.8121616385181893e-05, "loss": 0.1693, "step": 6385 }, { "epoch": 1.3345872518286312, "grad_norm": 1.068661709727769, "learning_rate": 1.8120958150785742e-05, "loss": 0.1558, "step": 6386 }, { "epoch": 1.3347962382445142, "grad_norm": 1.098610002101447, "learning_rate": 1.8120299813037562e-05, "loss": 0.1884, "step": 6387 }, { "epoch": 1.3350052246603972, "grad_norm": 1.0012273765561777, "learning_rate": 1.8119641371945737e-05, "loss": 0.1873, "step": 6388 }, { "epoch": 1.3352142110762801, "grad_norm": 1.1743931378287995, "learning_rate": 1.811898282751864e-05, "loss": 0.1595, "step": 6389 }, { "epoch": 1.335423197492163, "grad_norm": 0.8029700073970527, "learning_rate": 1.811832417976466e-05, "loss": 0.1616, "step": 6390 }, { "epoch": 1.335632183908046, "grad_norm": 1.0099126284611606, "learning_rate": 1.8117665428692172e-05, "loss": 0.1842, "step": 6391 }, { "epoch": 1.335841170323929, "grad_norm": 1.0388417823978073, "learning_rate": 1.811700657430956e-05, "loss": 0.1998, "step": 6392 }, { "epoch": 1.336050156739812, "grad_norm": 1.023069406791788, "learning_rate": 1.8116347616625213e-05, "loss": 0.1641, "step": 6393 }, { "epoch": 1.336259143155695, "grad_norm": 1.0006184456340537, "learning_rate": 1.811568855564752e-05, "loss": 0.161, "step": 6394 }, { "epoch": 1.336468129571578, "grad_norm": 1.0420190970200125, "learning_rate": 1.811502939138486e-05, "loss": 0.1584, "step": 6395 }, { "epoch": 1.3366771159874609, "grad_norm": 0.9907513857321002, "learning_rate": 1.8114370123845628e-05, "loss": 0.1537, "step": 6396 }, { "epoch": 1.3368861024033438, "grad_norm": 1.1271677207063022, "learning_rate": 1.8113710753038212e-05, "loss": 0.1543, "step": 6397 }, { "epoch": 1.3370950888192268, "grad_norm": 1.054784804621501, "learning_rate": 1.8113051278971003e-05, "loss": 0.1792, "step": 6398 }, { "epoch": 1.3373040752351097, "grad_norm": 1.1258247233392273, "learning_rate": 1.8112391701652395e-05, "loss": 0.1753, "step": 6399 }, { "epoch": 1.3375130616509927, "grad_norm": 1.1776621150895827, "learning_rate": 1.8111732021090784e-05, "loss": 0.1743, "step": 6400 }, { "epoch": 1.3377220480668757, "grad_norm": 1.0318969516524934, "learning_rate": 1.8111072237294563e-05, "loss": 0.1925, "step": 6401 }, { "epoch": 1.3379310344827586, "grad_norm": 0.9991607283342374, "learning_rate": 1.811041235027213e-05, "loss": 0.1705, "step": 6402 }, { "epoch": 1.3381400208986416, "grad_norm": 0.9023253671626497, "learning_rate": 1.810975236003188e-05, "loss": 0.161, "step": 6403 }, { "epoch": 1.3383490073145246, "grad_norm": 0.9344883539620362, "learning_rate": 1.8109092266582215e-05, "loss": 0.1923, "step": 6404 }, { "epoch": 1.3385579937304075, "grad_norm": 0.9886298372166137, "learning_rate": 1.8108432069931537e-05, "loss": 0.1792, "step": 6405 }, { "epoch": 1.3387669801462905, "grad_norm": 1.055911513443551, "learning_rate": 1.8107771770088248e-05, "loss": 0.166, "step": 6406 }, { "epoch": 1.3389759665621734, "grad_norm": 0.9663468266415086, "learning_rate": 1.8107111367060747e-05, "loss": 0.1632, "step": 6407 }, { "epoch": 1.3391849529780564, "grad_norm": 1.149564256506889, "learning_rate": 1.810645086085744e-05, "loss": 0.1445, "step": 6408 }, { "epoch": 1.3393939393939394, "grad_norm": 1.1674851912855984, "learning_rate": 1.810579025148674e-05, "loss": 0.1866, "step": 6409 }, { "epoch": 1.3396029258098223, "grad_norm": 1.1432875038635726, "learning_rate": 1.8105129538957045e-05, "loss": 0.1857, "step": 6410 }, { "epoch": 1.3398119122257053, "grad_norm": 1.0709362649016978, "learning_rate": 1.810446872327677e-05, "loss": 0.1744, "step": 6411 }, { "epoch": 1.3400208986415882, "grad_norm": 1.106306687277929, "learning_rate": 1.8103807804454322e-05, "loss": 0.1568, "step": 6412 }, { "epoch": 1.3402298850574712, "grad_norm": 1.0442811945248305, "learning_rate": 1.8103146782498112e-05, "loss": 0.1381, "step": 6413 }, { "epoch": 1.3404388714733542, "grad_norm": 0.9095028161856953, "learning_rate": 1.8102485657416552e-05, "loss": 0.1544, "step": 6414 }, { "epoch": 1.3406478578892371, "grad_norm": 0.9721512716564041, "learning_rate": 1.810182442921806e-05, "loss": 0.184, "step": 6415 }, { "epoch": 1.34085684430512, "grad_norm": 1.108200887531513, "learning_rate": 1.8101163097911046e-05, "loss": 0.1659, "step": 6416 }, { "epoch": 1.341065830721003, "grad_norm": 1.0242829063796202, "learning_rate": 1.810050166350393e-05, "loss": 0.1649, "step": 6417 }, { "epoch": 1.341274817136886, "grad_norm": 1.001234616785525, "learning_rate": 1.8099840126005128e-05, "loss": 0.1578, "step": 6418 }, { "epoch": 1.341483803552769, "grad_norm": 0.9473284192302404, "learning_rate": 1.809917848542306e-05, "loss": 0.1606, "step": 6419 }, { "epoch": 1.341692789968652, "grad_norm": 1.322229978462373, "learning_rate": 1.8098516741766142e-05, "loss": 0.1626, "step": 6420 }, { "epoch": 1.341901776384535, "grad_norm": 1.3141294024862276, "learning_rate": 1.8097854895042804e-05, "loss": 0.1647, "step": 6421 }, { "epoch": 1.3421107628004179, "grad_norm": 0.9925235818953853, "learning_rate": 1.8097192945261466e-05, "loss": 0.1513, "step": 6422 }, { "epoch": 1.3423197492163008, "grad_norm": 1.0190295755934635, "learning_rate": 1.809653089243055e-05, "loss": 0.1596, "step": 6423 }, { "epoch": 1.3425287356321838, "grad_norm": 0.993526653794128, "learning_rate": 1.8095868736558485e-05, "loss": 0.1691, "step": 6424 }, { "epoch": 1.3427377220480667, "grad_norm": 1.159944729537825, "learning_rate": 1.8095206477653692e-05, "loss": 0.1998, "step": 6425 }, { "epoch": 1.34294670846395, "grad_norm": 1.1388671967066921, "learning_rate": 1.8094544115724604e-05, "loss": 0.1664, "step": 6426 }, { "epoch": 1.3431556948798329, "grad_norm": 1.0717607928357835, "learning_rate": 1.809388165077965e-05, "loss": 0.1864, "step": 6427 }, { "epoch": 1.3433646812957158, "grad_norm": 0.9619784455860285, "learning_rate": 1.8093219082827263e-05, "loss": 0.1512, "step": 6428 }, { "epoch": 1.3435736677115988, "grad_norm": 0.9661768053196225, "learning_rate": 1.809255641187587e-05, "loss": 0.1732, "step": 6429 }, { "epoch": 1.3437826541274818, "grad_norm": 0.9310936670630544, "learning_rate": 1.809189363793391e-05, "loss": 0.1557, "step": 6430 }, { "epoch": 1.3439916405433647, "grad_norm": 1.0991311962385941, "learning_rate": 1.8091230761009812e-05, "loss": 0.1471, "step": 6431 }, { "epoch": 1.3442006269592477, "grad_norm": 1.1282443519826668, "learning_rate": 1.8090567781112018e-05, "loss": 0.1894, "step": 6432 }, { "epoch": 1.3444096133751307, "grad_norm": 0.9629481072782597, "learning_rate": 1.8089904698248963e-05, "loss": 0.1661, "step": 6433 }, { "epoch": 1.3446185997910136, "grad_norm": 1.0243575711612054, "learning_rate": 1.8089241512429083e-05, "loss": 0.1923, "step": 6434 }, { "epoch": 1.3448275862068966, "grad_norm": 1.0006003384240731, "learning_rate": 1.8088578223660823e-05, "loss": 0.1746, "step": 6435 }, { "epoch": 1.3450365726227795, "grad_norm": 0.9428368678025606, "learning_rate": 1.8087914831952625e-05, "loss": 0.1645, "step": 6436 }, { "epoch": 1.3452455590386625, "grad_norm": 1.289664273919628, "learning_rate": 1.8087251337312922e-05, "loss": 0.1895, "step": 6437 }, { "epoch": 1.3454545454545455, "grad_norm": 1.2135096863946735, "learning_rate": 1.8086587739750168e-05, "loss": 0.1565, "step": 6438 }, { "epoch": 1.3456635318704284, "grad_norm": 1.036781493492591, "learning_rate": 1.8085924039272806e-05, "loss": 0.1809, "step": 6439 }, { "epoch": 1.3458725182863114, "grad_norm": 1.06082013410112, "learning_rate": 1.8085260235889282e-05, "loss": 0.1783, "step": 6440 }, { "epoch": 1.3460815047021943, "grad_norm": 0.9656178148299881, "learning_rate": 1.8084596329608044e-05, "loss": 0.1718, "step": 6441 }, { "epoch": 1.3462904911180773, "grad_norm": 1.1202304997424835, "learning_rate": 1.8083932320437543e-05, "loss": 0.1739, "step": 6442 }, { "epoch": 1.3464994775339603, "grad_norm": 1.0487707640387485, "learning_rate": 1.8083268208386225e-05, "loss": 0.1768, "step": 6443 }, { "epoch": 1.3467084639498432, "grad_norm": 0.9816727316671123, "learning_rate": 1.808260399346254e-05, "loss": 0.1308, "step": 6444 }, { "epoch": 1.3469174503657262, "grad_norm": 1.016006357410054, "learning_rate": 1.8081939675674954e-05, "loss": 0.1743, "step": 6445 }, { "epoch": 1.3471264367816091, "grad_norm": 0.8606048726052705, "learning_rate": 1.808127525503191e-05, "loss": 0.1374, "step": 6446 }, { "epoch": 1.347335423197492, "grad_norm": 1.0380439643728394, "learning_rate": 1.8080610731541867e-05, "loss": 0.1673, "step": 6447 }, { "epoch": 1.347544409613375, "grad_norm": 1.1808548558320016, "learning_rate": 1.807994610521328e-05, "loss": 0.1985, "step": 6448 }, { "epoch": 1.347753396029258, "grad_norm": 1.0750163750125816, "learning_rate": 1.8079281376054615e-05, "loss": 0.1602, "step": 6449 }, { "epoch": 1.347962382445141, "grad_norm": 1.0359686208660452, "learning_rate": 1.8078616544074318e-05, "loss": 0.1766, "step": 6450 }, { "epoch": 1.348171368861024, "grad_norm": 1.110811017764116, "learning_rate": 1.8077951609280866e-05, "loss": 0.1784, "step": 6451 }, { "epoch": 1.3483803552769071, "grad_norm": 0.9737637888949209, "learning_rate": 1.807728657168271e-05, "loss": 0.164, "step": 6452 }, { "epoch": 1.34858934169279, "grad_norm": 1.2424555251486316, "learning_rate": 1.8076621431288316e-05, "loss": 0.2128, "step": 6453 }, { "epoch": 1.348798328108673, "grad_norm": 1.135712321030799, "learning_rate": 1.8075956188106154e-05, "loss": 0.2074, "step": 6454 }, { "epoch": 1.349007314524556, "grad_norm": 0.8309745105273916, "learning_rate": 1.8075290842144685e-05, "loss": 0.1503, "step": 6455 }, { "epoch": 1.349216300940439, "grad_norm": 1.131429775413735, "learning_rate": 1.8074625393412377e-05, "loss": 0.1442, "step": 6456 }, { "epoch": 1.349425287356322, "grad_norm": 0.9613009815844996, "learning_rate": 1.80739598419177e-05, "loss": 0.149, "step": 6457 }, { "epoch": 1.349634273772205, "grad_norm": 1.1474215193279553, "learning_rate": 1.8073294187669127e-05, "loss": 0.167, "step": 6458 }, { "epoch": 1.3498432601880879, "grad_norm": 0.9923659362169932, "learning_rate": 1.8072628430675125e-05, "loss": 0.1774, "step": 6459 }, { "epoch": 1.3500522466039708, "grad_norm": 0.9079015353256078, "learning_rate": 1.807196257094417e-05, "loss": 0.1402, "step": 6460 }, { "epoch": 1.3502612330198538, "grad_norm": 0.940950220283892, "learning_rate": 1.807129660848473e-05, "loss": 0.1734, "step": 6461 }, { "epoch": 1.3504702194357368, "grad_norm": 0.7861172636555644, "learning_rate": 1.8070630543305287e-05, "loss": 0.1414, "step": 6462 }, { "epoch": 1.3506792058516197, "grad_norm": 1.1761770898666983, "learning_rate": 1.8069964375414316e-05, "loss": 0.1741, "step": 6463 }, { "epoch": 1.3508881922675027, "grad_norm": 1.0172202509297323, "learning_rate": 1.8069298104820298e-05, "loss": 0.1852, "step": 6464 }, { "epoch": 1.3510971786833856, "grad_norm": 1.012705294690421, "learning_rate": 1.8068631731531704e-05, "loss": 0.1655, "step": 6465 }, { "epoch": 1.3513061650992686, "grad_norm": 1.0417449901070235, "learning_rate": 1.8067965255557024e-05, "loss": 0.1615, "step": 6466 }, { "epoch": 1.3515151515151516, "grad_norm": 1.252529604993931, "learning_rate": 1.8067298676904733e-05, "loss": 0.1854, "step": 6467 }, { "epoch": 1.3517241379310345, "grad_norm": 1.0408446948205752, "learning_rate": 1.8066631995583318e-05, "loss": 0.1851, "step": 6468 }, { "epoch": 1.3519331243469175, "grad_norm": 0.9175065507103947, "learning_rate": 1.8065965211601265e-05, "loss": 0.1939, "step": 6469 }, { "epoch": 1.3521421107628004, "grad_norm": 1.089888214084939, "learning_rate": 1.8065298324967052e-05, "loss": 0.1671, "step": 6470 }, { "epoch": 1.3523510971786834, "grad_norm": 1.1906065702095272, "learning_rate": 1.8064631335689177e-05, "loss": 0.1688, "step": 6471 }, { "epoch": 1.3525600835945664, "grad_norm": 0.9232221411839585, "learning_rate": 1.8063964243776123e-05, "loss": 0.1683, "step": 6472 }, { "epoch": 1.3527690700104493, "grad_norm": 0.8929511940480942, "learning_rate": 1.8063297049236377e-05, "loss": 0.151, "step": 6473 }, { "epoch": 1.3529780564263323, "grad_norm": 1.1447713204805654, "learning_rate": 1.8062629752078434e-05, "loss": 0.194, "step": 6474 }, { "epoch": 1.3531870428422152, "grad_norm": 0.8727064642272576, "learning_rate": 1.8061962352310788e-05, "loss": 0.1298, "step": 6475 }, { "epoch": 1.3533960292580982, "grad_norm": 1.0356565717143809, "learning_rate": 1.8061294849941922e-05, "loss": 0.1448, "step": 6476 }, { "epoch": 1.3536050156739812, "grad_norm": 1.1033920566123268, "learning_rate": 1.8060627244980348e-05, "loss": 0.1894, "step": 6477 }, { "epoch": 1.3538140020898641, "grad_norm": 1.0511426276277396, "learning_rate": 1.8059959537434552e-05, "loss": 0.1635, "step": 6478 }, { "epoch": 1.354022988505747, "grad_norm": 0.9753153673885872, "learning_rate": 1.805929172731303e-05, "loss": 0.1603, "step": 6479 }, { "epoch": 1.35423197492163, "grad_norm": 1.1291322036908982, "learning_rate": 1.8058623814624287e-05, "loss": 0.1743, "step": 6480 }, { "epoch": 1.354440961337513, "grad_norm": 1.0841365285334579, "learning_rate": 1.805795579937682e-05, "loss": 0.1703, "step": 6481 }, { "epoch": 1.354649947753396, "grad_norm": 1.0033326826076738, "learning_rate": 1.805728768157913e-05, "loss": 0.1736, "step": 6482 }, { "epoch": 1.354858934169279, "grad_norm": 1.0619250848702793, "learning_rate": 1.805661946123972e-05, "loss": 0.1776, "step": 6483 }, { "epoch": 1.355067920585162, "grad_norm": 0.8906596753454181, "learning_rate": 1.8055951138367098e-05, "loss": 0.1827, "step": 6484 }, { "epoch": 1.3552769070010449, "grad_norm": 0.9290806494575351, "learning_rate": 1.8055282712969763e-05, "loss": 0.1467, "step": 6485 }, { "epoch": 1.3554858934169278, "grad_norm": 0.9791358450547245, "learning_rate": 1.8054614185056224e-05, "loss": 0.2019, "step": 6486 }, { "epoch": 1.3556948798328108, "grad_norm": 1.124003876114773, "learning_rate": 1.8053945554634995e-05, "loss": 0.1894, "step": 6487 }, { "epoch": 1.3559038662486937, "grad_norm": 0.9928490834678044, "learning_rate": 1.8053276821714577e-05, "loss": 0.1798, "step": 6488 }, { "epoch": 1.3561128526645767, "grad_norm": 0.9980611578309293, "learning_rate": 1.8052607986303487e-05, "loss": 0.1637, "step": 6489 }, { "epoch": 1.3563218390804597, "grad_norm": 0.9788269040888129, "learning_rate": 1.8051939048410233e-05, "loss": 0.1382, "step": 6490 }, { "epoch": 1.3565308254963426, "grad_norm": 1.0347392134493658, "learning_rate": 1.805127000804333e-05, "loss": 0.1867, "step": 6491 }, { "epoch": 1.3567398119122256, "grad_norm": 0.9561050435741155, "learning_rate": 1.805060086521129e-05, "loss": 0.1587, "step": 6492 }, { "epoch": 1.3569487983281086, "grad_norm": 1.074683166357436, "learning_rate": 1.8049931619922638e-05, "loss": 0.1758, "step": 6493 }, { "epoch": 1.3571577847439915, "grad_norm": 0.9778599970909404, "learning_rate": 1.8049262272185875e-05, "loss": 0.1895, "step": 6494 }, { "epoch": 1.3573667711598745, "grad_norm": 0.9393049924836175, "learning_rate": 1.8048592822009534e-05, "loss": 0.1684, "step": 6495 }, { "epoch": 1.3575757575757577, "grad_norm": 1.3383040610524741, "learning_rate": 1.804792326940213e-05, "loss": 0.1691, "step": 6496 }, { "epoch": 1.3577847439916406, "grad_norm": 1.0001307412936156, "learning_rate": 1.804725361437218e-05, "loss": 0.1503, "step": 6497 }, { "epoch": 1.3579937304075236, "grad_norm": 1.0763385665872953, "learning_rate": 1.804658385692821e-05, "loss": 0.1829, "step": 6498 }, { "epoch": 1.3582027168234065, "grad_norm": 1.0318112471984866, "learning_rate": 1.8045913997078754e-05, "loss": 0.1948, "step": 6499 }, { "epoch": 1.3584117032392895, "grad_norm": 0.9480786018724309, "learning_rate": 1.8045244034832317e-05, "loss": 0.1735, "step": 6500 }, { "epoch": 1.3586206896551725, "grad_norm": 0.9971402057991652, "learning_rate": 1.8044573970197436e-05, "loss": 0.1795, "step": 6501 }, { "epoch": 1.3588296760710554, "grad_norm": 0.9050947926397034, "learning_rate": 1.8043903803182638e-05, "loss": 0.1492, "step": 6502 }, { "epoch": 1.3590386624869384, "grad_norm": 0.996346255081761, "learning_rate": 1.8043233533796454e-05, "loss": 0.1899, "step": 6503 }, { "epoch": 1.3592476489028213, "grad_norm": 0.9689734452147762, "learning_rate": 1.8042563162047413e-05, "loss": 0.1745, "step": 6504 }, { "epoch": 1.3594566353187043, "grad_norm": 1.056472471711999, "learning_rate": 1.804189268794404e-05, "loss": 0.18, "step": 6505 }, { "epoch": 1.3596656217345873, "grad_norm": 1.1643968998240493, "learning_rate": 1.804122211149488e-05, "loss": 0.1894, "step": 6506 }, { "epoch": 1.3598746081504702, "grad_norm": 1.0660384819825681, "learning_rate": 1.804055143270846e-05, "loss": 0.1858, "step": 6507 }, { "epoch": 1.3600835945663532, "grad_norm": 1.1205852662446976, "learning_rate": 1.803988065159331e-05, "loss": 0.1664, "step": 6508 }, { "epoch": 1.3602925809822362, "grad_norm": 0.8814036831857089, "learning_rate": 1.8039209768157977e-05, "loss": 0.1527, "step": 6509 }, { "epoch": 1.3605015673981191, "grad_norm": 0.9940356566208686, "learning_rate": 1.8038538782410992e-05, "loss": 0.1946, "step": 6510 }, { "epoch": 1.360710553814002, "grad_norm": 0.8746706675808504, "learning_rate": 1.8037867694360902e-05, "loss": 0.1742, "step": 6511 }, { "epoch": 1.360919540229885, "grad_norm": 0.9974993224028831, "learning_rate": 1.803719650401624e-05, "loss": 0.147, "step": 6512 }, { "epoch": 1.361128526645768, "grad_norm": 1.1208782511963868, "learning_rate": 1.803652521138555e-05, "loss": 0.1899, "step": 6513 }, { "epoch": 1.361337513061651, "grad_norm": 0.988579178892815, "learning_rate": 1.8035853816477378e-05, "loss": 0.1599, "step": 6514 }, { "epoch": 1.361546499477534, "grad_norm": 1.289611458305456, "learning_rate": 1.8035182319300264e-05, "loss": 0.1766, "step": 6515 }, { "epoch": 1.3617554858934169, "grad_norm": 0.9632005027641045, "learning_rate": 1.803451071986276e-05, "loss": 0.1583, "step": 6516 }, { "epoch": 1.3619644723092998, "grad_norm": 0.9726721050888815, "learning_rate": 1.8033839018173407e-05, "loss": 0.166, "step": 6517 }, { "epoch": 1.3621734587251828, "grad_norm": 0.8898953710083524, "learning_rate": 1.8033167214240757e-05, "loss": 0.1348, "step": 6518 }, { "epoch": 1.3623824451410658, "grad_norm": 1.156456276113517, "learning_rate": 1.8032495308073357e-05, "loss": 0.1902, "step": 6519 }, { "epoch": 1.3625914315569487, "grad_norm": 0.9906973434108861, "learning_rate": 1.8031823299679764e-05, "loss": 0.1557, "step": 6520 }, { "epoch": 1.3628004179728317, "grad_norm": 1.1257173138857657, "learning_rate": 1.8031151189068522e-05, "loss": 0.1814, "step": 6521 }, { "epoch": 1.3630094043887147, "grad_norm": 0.8701942550834698, "learning_rate": 1.8030478976248192e-05, "loss": 0.1595, "step": 6522 }, { "epoch": 1.3632183908045978, "grad_norm": 1.0392418787815065, "learning_rate": 1.8029806661227323e-05, "loss": 0.1936, "step": 6523 }, { "epoch": 1.3634273772204808, "grad_norm": 1.1127600383703389, "learning_rate": 1.8029134244014478e-05, "loss": 0.1838, "step": 6524 }, { "epoch": 1.3636363636363638, "grad_norm": 1.0129038540604436, "learning_rate": 1.8028461724618206e-05, "loss": 0.1806, "step": 6525 }, { "epoch": 1.3638453500522467, "grad_norm": 0.9521208948148229, "learning_rate": 1.8027789103047074e-05, "loss": 0.1447, "step": 6526 }, { "epoch": 1.3640543364681297, "grad_norm": 1.034179478794935, "learning_rate": 1.8027116379309637e-05, "loss": 0.1773, "step": 6527 }, { "epoch": 1.3642633228840126, "grad_norm": 1.0944597410733694, "learning_rate": 1.802644355341446e-05, "loss": 0.1651, "step": 6528 }, { "epoch": 1.3644723092998956, "grad_norm": 0.9641332359350923, "learning_rate": 1.8025770625370104e-05, "loss": 0.1682, "step": 6529 }, { "epoch": 1.3646812957157786, "grad_norm": 1.2805734840787795, "learning_rate": 1.8025097595185132e-05, "loss": 0.1448, "step": 6530 }, { "epoch": 1.3648902821316615, "grad_norm": 1.2527994208285784, "learning_rate": 1.8024424462868112e-05, "loss": 0.2012, "step": 6531 }, { "epoch": 1.3650992685475445, "grad_norm": 1.1262989288619598, "learning_rate": 1.802375122842761e-05, "loss": 0.1559, "step": 6532 }, { "epoch": 1.3653082549634274, "grad_norm": 1.0744185747554718, "learning_rate": 1.802307789187219e-05, "loss": 0.1674, "step": 6533 }, { "epoch": 1.3655172413793104, "grad_norm": 0.9458020202913939, "learning_rate": 1.8022404453210426e-05, "loss": 0.1687, "step": 6534 }, { "epoch": 1.3657262277951934, "grad_norm": 0.9096045553520108, "learning_rate": 1.8021730912450887e-05, "loss": 0.1657, "step": 6535 }, { "epoch": 1.3659352142110763, "grad_norm": 0.8878676341936483, "learning_rate": 1.8021057269602142e-05, "loss": 0.1065, "step": 6536 }, { "epoch": 1.3661442006269593, "grad_norm": 1.0655642178639666, "learning_rate": 1.8020383524672768e-05, "loss": 0.1934, "step": 6537 }, { "epoch": 1.3663531870428423, "grad_norm": 1.232335009754426, "learning_rate": 1.8019709677671343e-05, "loss": 0.1514, "step": 6538 }, { "epoch": 1.3665621734587252, "grad_norm": 1.1222244784528321, "learning_rate": 1.8019035728606436e-05, "loss": 0.1611, "step": 6539 }, { "epoch": 1.3667711598746082, "grad_norm": 0.937721731325426, "learning_rate": 1.8018361677486625e-05, "loss": 0.1474, "step": 6540 }, { "epoch": 1.3669801462904911, "grad_norm": 0.9473922941008474, "learning_rate": 1.801768752432049e-05, "loss": 0.1556, "step": 6541 }, { "epoch": 1.367189132706374, "grad_norm": 1.1953424984097785, "learning_rate": 1.801701326911661e-05, "loss": 0.1534, "step": 6542 }, { "epoch": 1.367398119122257, "grad_norm": 1.121663256506815, "learning_rate": 1.8016338911883568e-05, "loss": 0.1337, "step": 6543 }, { "epoch": 1.36760710553814, "grad_norm": 1.315230335391973, "learning_rate": 1.8015664452629944e-05, "loss": 0.1894, "step": 6544 }, { "epoch": 1.367816091954023, "grad_norm": 0.9325842275121595, "learning_rate": 1.801498989136432e-05, "loss": 0.1852, "step": 6545 }, { "epoch": 1.368025078369906, "grad_norm": 0.9956561215808561, "learning_rate": 1.8014315228095286e-05, "loss": 0.1398, "step": 6546 }, { "epoch": 1.368234064785789, "grad_norm": 1.179488880466478, "learning_rate": 1.8013640462831422e-05, "loss": 0.1704, "step": 6547 }, { "epoch": 1.3684430512016719, "grad_norm": 0.9935800998123236, "learning_rate": 1.8012965595581323e-05, "loss": 0.1546, "step": 6548 }, { "epoch": 1.3686520376175548, "grad_norm": 1.0316440058895358, "learning_rate": 1.801229062635357e-05, "loss": 0.1416, "step": 6549 }, { "epoch": 1.3688610240334378, "grad_norm": 1.3147696631435408, "learning_rate": 1.8011615555156757e-05, "loss": 0.1922, "step": 6550 }, { "epoch": 1.3690700104493208, "grad_norm": 1.0665196014054052, "learning_rate": 1.8010940381999476e-05, "loss": 0.1553, "step": 6551 }, { "epoch": 1.3692789968652037, "grad_norm": 1.225190798600248, "learning_rate": 1.801026510689032e-05, "loss": 0.1936, "step": 6552 }, { "epoch": 1.3694879832810867, "grad_norm": 0.9505015474046363, "learning_rate": 1.8009589729837876e-05, "loss": 0.1718, "step": 6553 }, { "epoch": 1.3696969696969696, "grad_norm": 1.0968565150120668, "learning_rate": 1.800891425085075e-05, "loss": 0.1796, "step": 6554 }, { "epoch": 1.3699059561128526, "grad_norm": 1.1932961430397815, "learning_rate": 1.8008238669937527e-05, "loss": 0.151, "step": 6555 }, { "epoch": 1.3701149425287356, "grad_norm": 1.0244377464301955, "learning_rate": 1.8007562987106816e-05, "loss": 0.1498, "step": 6556 }, { "epoch": 1.3703239289446185, "grad_norm": 0.9946546149236705, "learning_rate": 1.8006887202367213e-05, "loss": 0.1749, "step": 6557 }, { "epoch": 1.3705329153605015, "grad_norm": 1.245521405004684, "learning_rate": 1.800621131572731e-05, "loss": 0.1592, "step": 6558 }, { "epoch": 1.3707419017763844, "grad_norm": 1.0932295141335933, "learning_rate": 1.800553532719572e-05, "loss": 0.1557, "step": 6559 }, { "epoch": 1.3709508881922674, "grad_norm": 1.3559754165308875, "learning_rate": 1.800485923678104e-05, "loss": 0.168, "step": 6560 }, { "epoch": 1.3711598746081504, "grad_norm": 0.8939654980173023, "learning_rate": 1.8004183044491876e-05, "loss": 0.165, "step": 6561 }, { "epoch": 1.3713688610240333, "grad_norm": 1.1246793520515972, "learning_rate": 1.8003506750336833e-05, "loss": 0.1786, "step": 6562 }, { "epoch": 1.3715778474399163, "grad_norm": 1.0339772767217332, "learning_rate": 1.8002830354324516e-05, "loss": 0.17, "step": 6563 }, { "epoch": 1.3717868338557992, "grad_norm": 1.0694394964571357, "learning_rate": 1.8002153856463534e-05, "loss": 0.1675, "step": 6564 }, { "epoch": 1.3719958202716822, "grad_norm": 1.2525741501745196, "learning_rate": 1.8001477256762504e-05, "loss": 0.1691, "step": 6565 }, { "epoch": 1.3722048066875652, "grad_norm": 1.13657215962308, "learning_rate": 1.8000800555230028e-05, "loss": 0.1565, "step": 6566 }, { "epoch": 1.3724137931034484, "grad_norm": 1.1294876105080789, "learning_rate": 1.800012375187472e-05, "loss": 0.1862, "step": 6567 }, { "epoch": 1.3726227795193313, "grad_norm": 0.9302103122543616, "learning_rate": 1.7999446846705195e-05, "loss": 0.1607, "step": 6568 }, { "epoch": 1.3728317659352143, "grad_norm": 1.0199799407274617, "learning_rate": 1.7998769839730063e-05, "loss": 0.1871, "step": 6569 }, { "epoch": 1.3730407523510972, "grad_norm": 0.9496676858992158, "learning_rate": 1.799809273095795e-05, "loss": 0.1969, "step": 6570 }, { "epoch": 1.3732497387669802, "grad_norm": 1.0781144091779433, "learning_rate": 1.7997415520397462e-05, "loss": 0.1906, "step": 6571 }, { "epoch": 1.3734587251828632, "grad_norm": 1.2946906030077805, "learning_rate": 1.7996738208057225e-05, "loss": 0.1857, "step": 6572 }, { "epoch": 1.3736677115987461, "grad_norm": 1.193460190782283, "learning_rate": 1.7996060793945855e-05, "loss": 0.1788, "step": 6573 }, { "epoch": 1.373876698014629, "grad_norm": 1.0198721231874452, "learning_rate": 1.7995383278071976e-05, "loss": 0.1755, "step": 6574 }, { "epoch": 1.374085684430512, "grad_norm": 1.4572718572892753, "learning_rate": 1.7994705660444213e-05, "loss": 0.1651, "step": 6575 }, { "epoch": 1.374294670846395, "grad_norm": 0.9315135169124579, "learning_rate": 1.7994027941071177e-05, "loss": 0.1625, "step": 6576 }, { "epoch": 1.374503657262278, "grad_norm": 1.140758299848428, "learning_rate": 1.799335011996151e-05, "loss": 0.1422, "step": 6577 }, { "epoch": 1.374712643678161, "grad_norm": 1.102408743484557, "learning_rate": 1.7992672197123826e-05, "loss": 0.1979, "step": 6578 }, { "epoch": 1.374921630094044, "grad_norm": 0.9946501241429714, "learning_rate": 1.7991994172566758e-05, "loss": 0.199, "step": 6579 }, { "epoch": 1.3751306165099269, "grad_norm": 1.2677415043415392, "learning_rate": 1.7991316046298935e-05, "loss": 0.1996, "step": 6580 }, { "epoch": 1.3753396029258098, "grad_norm": 1.1310651292487812, "learning_rate": 1.7990637818328984e-05, "loss": 0.1608, "step": 6581 }, { "epoch": 1.3755485893416928, "grad_norm": 0.9218955005183407, "learning_rate": 1.7989959488665542e-05, "loss": 0.1881, "step": 6582 }, { "epoch": 1.3757575757575757, "grad_norm": 1.1306285726925935, "learning_rate": 1.7989281057317238e-05, "loss": 0.1918, "step": 6583 }, { "epoch": 1.3759665621734587, "grad_norm": 0.9294418359864943, "learning_rate": 1.79886025242927e-05, "loss": 0.161, "step": 6584 }, { "epoch": 1.3761755485893417, "grad_norm": 1.0237956222357698, "learning_rate": 1.7987923889600575e-05, "loss": 0.1812, "step": 6585 }, { "epoch": 1.3763845350052246, "grad_norm": 1.1024686887135715, "learning_rate": 1.7987245153249496e-05, "loss": 0.2004, "step": 6586 }, { "epoch": 1.3765935214211076, "grad_norm": 0.8743214688129815, "learning_rate": 1.7986566315248097e-05, "loss": 0.1535, "step": 6587 }, { "epoch": 1.3768025078369905, "grad_norm": 1.0618513876252353, "learning_rate": 1.7985887375605022e-05, "loss": 0.146, "step": 6588 }, { "epoch": 1.3770114942528735, "grad_norm": 0.9249200255639042, "learning_rate": 1.798520833432891e-05, "loss": 0.1512, "step": 6589 }, { "epoch": 1.3772204806687565, "grad_norm": 0.9921987923023369, "learning_rate": 1.79845291914284e-05, "loss": 0.1889, "step": 6590 }, { "epoch": 1.3774294670846394, "grad_norm": 0.8710691632188632, "learning_rate": 1.7983849946912134e-05, "loss": 0.1494, "step": 6591 }, { "epoch": 1.3776384535005224, "grad_norm": 1.1258261083544912, "learning_rate": 1.7983170600788766e-05, "loss": 0.1805, "step": 6592 }, { "epoch": 1.3778474399164056, "grad_norm": 1.1690805174515013, "learning_rate": 1.7982491153066936e-05, "loss": 0.1993, "step": 6593 }, { "epoch": 1.3780564263322885, "grad_norm": 0.8553135094196365, "learning_rate": 1.7981811603755288e-05, "loss": 0.1412, "step": 6594 }, { "epoch": 1.3782654127481715, "grad_norm": 0.7259483726467645, "learning_rate": 1.7981131952862475e-05, "loss": 0.1533, "step": 6595 }, { "epoch": 1.3784743991640545, "grad_norm": 1.0950927585791614, "learning_rate": 1.7980452200397144e-05, "loss": 0.1516, "step": 6596 }, { "epoch": 1.3786833855799374, "grad_norm": 1.050559277707642, "learning_rate": 1.7979772346367944e-05, "loss": 0.1606, "step": 6597 }, { "epoch": 1.3788923719958204, "grad_norm": 1.2514197476845341, "learning_rate": 1.7979092390783536e-05, "loss": 0.184, "step": 6598 }, { "epoch": 1.3791013584117033, "grad_norm": 1.187437618639774, "learning_rate": 1.7978412333652563e-05, "loss": 0.201, "step": 6599 }, { "epoch": 1.3793103448275863, "grad_norm": 1.0070196155189364, "learning_rate": 1.7977732174983685e-05, "loss": 0.1708, "step": 6600 }, { "epoch": 1.3795193312434693, "grad_norm": 1.1138154978920503, "learning_rate": 1.7977051914785555e-05, "loss": 0.1885, "step": 6601 }, { "epoch": 1.3797283176593522, "grad_norm": 1.0706264054104282, "learning_rate": 1.7976371553066836e-05, "loss": 0.1766, "step": 6602 }, { "epoch": 1.3799373040752352, "grad_norm": 1.0652677395809216, "learning_rate": 1.7975691089836186e-05, "loss": 0.1924, "step": 6603 }, { "epoch": 1.3801462904911181, "grad_norm": 0.9264219950712258, "learning_rate": 1.7975010525102257e-05, "loss": 0.1519, "step": 6604 }, { "epoch": 1.380355276907001, "grad_norm": 1.083762051197909, "learning_rate": 1.7974329858873717e-05, "loss": 0.1772, "step": 6605 }, { "epoch": 1.380564263322884, "grad_norm": 0.9948350023581475, "learning_rate": 1.7973649091159225e-05, "loss": 0.1512, "step": 6606 }, { "epoch": 1.380773249738767, "grad_norm": 0.9818089840308136, "learning_rate": 1.7972968221967452e-05, "loss": 0.1963, "step": 6607 }, { "epoch": 1.38098223615465, "grad_norm": 1.0225678067199404, "learning_rate": 1.7972287251307057e-05, "loss": 0.1685, "step": 6608 }, { "epoch": 1.381191222570533, "grad_norm": 1.3721232004898527, "learning_rate": 1.7971606179186705e-05, "loss": 0.2001, "step": 6609 }, { "epoch": 1.381400208986416, "grad_norm": 0.8741657312834328, "learning_rate": 1.797092500561507e-05, "loss": 0.179, "step": 6610 }, { "epoch": 1.3816091954022989, "grad_norm": 1.0359822350779764, "learning_rate": 1.7970243730600815e-05, "loss": 0.1749, "step": 6611 }, { "epoch": 1.3818181818181818, "grad_norm": 1.1150080865390197, "learning_rate": 1.796956235415261e-05, "loss": 0.2037, "step": 6612 }, { "epoch": 1.3820271682340648, "grad_norm": 0.9281257743258255, "learning_rate": 1.7968880876279137e-05, "loss": 0.1806, "step": 6613 }, { "epoch": 1.3822361546499478, "grad_norm": 0.9399620191516245, "learning_rate": 1.7968199296989053e-05, "loss": 0.1898, "step": 6614 }, { "epoch": 1.3824451410658307, "grad_norm": 0.8656144942423623, "learning_rate": 1.7967517616291042e-05, "loss": 0.1301, "step": 6615 }, { "epoch": 1.3826541274817137, "grad_norm": 0.9833309574598055, "learning_rate": 1.7966835834193782e-05, "loss": 0.1838, "step": 6616 }, { "epoch": 1.3828631138975966, "grad_norm": 0.9606515975265516, "learning_rate": 1.7966153950705944e-05, "loss": 0.174, "step": 6617 }, { "epoch": 1.3830721003134796, "grad_norm": 0.9521298116218776, "learning_rate": 1.7965471965836203e-05, "loss": 0.1975, "step": 6618 }, { "epoch": 1.3832810867293626, "grad_norm": 1.2119604796650523, "learning_rate": 1.7964789879593247e-05, "loss": 0.1885, "step": 6619 }, { "epoch": 1.3834900731452455, "grad_norm": 1.3361506829559804, "learning_rate": 1.796410769198575e-05, "loss": 0.1618, "step": 6620 }, { "epoch": 1.3836990595611285, "grad_norm": 0.8846746622560488, "learning_rate": 1.7963425403022396e-05, "loss": 0.1533, "step": 6621 }, { "epoch": 1.3839080459770114, "grad_norm": 1.0751078145859458, "learning_rate": 1.796274301271187e-05, "loss": 0.1806, "step": 6622 }, { "epoch": 1.3841170323928944, "grad_norm": 1.2904621839411603, "learning_rate": 1.7962060521062857e-05, "loss": 0.2229, "step": 6623 }, { "epoch": 1.3843260188087774, "grad_norm": 1.0706132633166532, "learning_rate": 1.796137792808404e-05, "loss": 0.1806, "step": 6624 }, { "epoch": 1.3845350052246603, "grad_norm": 1.0064704622536365, "learning_rate": 1.7960695233784108e-05, "loss": 0.1716, "step": 6625 }, { "epoch": 1.3847439916405433, "grad_norm": 0.8805196790825114, "learning_rate": 1.7960012438171747e-05, "loss": 0.1607, "step": 6626 }, { "epoch": 1.3849529780564263, "grad_norm": 0.9537912883590461, "learning_rate": 1.7959329541255644e-05, "loss": 0.1471, "step": 6627 }, { "epoch": 1.3851619644723092, "grad_norm": 0.8998287866698899, "learning_rate": 1.79586465430445e-05, "loss": 0.1528, "step": 6628 }, { "epoch": 1.3853709508881922, "grad_norm": 1.0921364871620782, "learning_rate": 1.7957963443547e-05, "loss": 0.1783, "step": 6629 }, { "epoch": 1.3855799373040751, "grad_norm": 0.9889648472258329, "learning_rate": 1.7957280242771833e-05, "loss": 0.1443, "step": 6630 }, { "epoch": 1.385788923719958, "grad_norm": 1.1339252802099837, "learning_rate": 1.7956596940727706e-05, "loss": 0.1622, "step": 6631 }, { "epoch": 1.385997910135841, "grad_norm": 1.023615237809735, "learning_rate": 1.7955913537423307e-05, "loss": 0.2178, "step": 6632 }, { "epoch": 1.386206896551724, "grad_norm": 1.050631585853595, "learning_rate": 1.7955230032867334e-05, "loss": 0.1635, "step": 6633 }, { "epoch": 1.386415882967607, "grad_norm": 0.910349091512875, "learning_rate": 1.7954546427068487e-05, "loss": 0.1444, "step": 6634 }, { "epoch": 1.38662486938349, "grad_norm": 0.8910810295465277, "learning_rate": 1.7953862720035465e-05, "loss": 0.1417, "step": 6635 }, { "epoch": 1.386833855799373, "grad_norm": 1.0059629392481857, "learning_rate": 1.7953178911776974e-05, "loss": 0.1793, "step": 6636 }, { "epoch": 1.387042842215256, "grad_norm": 0.9568887246003638, "learning_rate": 1.7952495002301708e-05, "loss": 0.1602, "step": 6637 }, { "epoch": 1.387251828631139, "grad_norm": 1.2321685894163186, "learning_rate": 1.7951810991618375e-05, "loss": 0.1879, "step": 6638 }, { "epoch": 1.387460815047022, "grad_norm": 1.027892222174851, "learning_rate": 1.7951126879735683e-05, "loss": 0.1785, "step": 6639 }, { "epoch": 1.387669801462905, "grad_norm": 0.9283562751835778, "learning_rate": 1.7950442666662335e-05, "loss": 0.1541, "step": 6640 }, { "epoch": 1.387878787878788, "grad_norm": 0.9783372422051152, "learning_rate": 1.794975835240704e-05, "loss": 0.1368, "step": 6641 }, { "epoch": 1.388087774294671, "grad_norm": 0.9714566170069726, "learning_rate": 1.7949073936978502e-05, "loss": 0.142, "step": 6642 }, { "epoch": 1.3882967607105539, "grad_norm": 1.0623782844132375, "learning_rate": 1.7948389420385442e-05, "loss": 0.1933, "step": 6643 }, { "epoch": 1.3885057471264368, "grad_norm": 1.0887467838424714, "learning_rate": 1.794770480263656e-05, "loss": 0.1937, "step": 6644 }, { "epoch": 1.3887147335423198, "grad_norm": 0.9586824907986832, "learning_rate": 1.7947020083740575e-05, "loss": 0.1777, "step": 6645 }, { "epoch": 1.3889237199582027, "grad_norm": 1.0711922345244849, "learning_rate": 1.7946335263706202e-05, "loss": 0.1882, "step": 6646 }, { "epoch": 1.3891327063740857, "grad_norm": 1.0974740099398292, "learning_rate": 1.7945650342542157e-05, "loss": 0.1794, "step": 6647 }, { "epoch": 1.3893416927899687, "grad_norm": 1.1219435619354223, "learning_rate": 1.794496532025715e-05, "loss": 0.1876, "step": 6648 }, { "epoch": 1.3895506792058516, "grad_norm": 1.036758573616496, "learning_rate": 1.7944280196859905e-05, "loss": 0.1479, "step": 6649 }, { "epoch": 1.3897596656217346, "grad_norm": 1.0150719264668462, "learning_rate": 1.7943594972359138e-05, "loss": 0.1609, "step": 6650 }, { "epoch": 1.3899686520376175, "grad_norm": 0.8895673038102139, "learning_rate": 1.7942909646763572e-05, "loss": 0.186, "step": 6651 }, { "epoch": 1.3901776384535005, "grad_norm": 1.2122921438894345, "learning_rate": 1.794222422008193e-05, "loss": 0.1559, "step": 6652 }, { "epoch": 1.3903866248693835, "grad_norm": 1.2572648505203283, "learning_rate": 1.7941538692322933e-05, "loss": 0.2003, "step": 6653 }, { "epoch": 1.3905956112852664, "grad_norm": 1.0418029473233426, "learning_rate": 1.7940853063495302e-05, "loss": 0.196, "step": 6654 }, { "epoch": 1.3908045977011494, "grad_norm": 1.0285991668993357, "learning_rate": 1.7940167333607773e-05, "loss": 0.1935, "step": 6655 }, { "epoch": 1.3910135841170324, "grad_norm": 0.9582355140610893, "learning_rate": 1.793948150266906e-05, "loss": 0.1766, "step": 6656 }, { "epoch": 1.3912225705329153, "grad_norm": 0.9877669135294689, "learning_rate": 1.79387955706879e-05, "loss": 0.1624, "step": 6657 }, { "epoch": 1.3914315569487983, "grad_norm": 0.9227468083516521, "learning_rate": 1.793810953767302e-05, "loss": 0.1473, "step": 6658 }, { "epoch": 1.3916405433646812, "grad_norm": 0.7935469400617003, "learning_rate": 1.793742340363315e-05, "loss": 0.1772, "step": 6659 }, { "epoch": 1.3918495297805642, "grad_norm": 1.113841375038955, "learning_rate": 1.7936737168577024e-05, "loss": 0.206, "step": 6660 }, { "epoch": 1.3920585161964472, "grad_norm": 0.8078878009504581, "learning_rate": 1.7936050832513375e-05, "loss": 0.1551, "step": 6661 }, { "epoch": 1.3922675026123301, "grad_norm": 1.103747840223406, "learning_rate": 1.7935364395450937e-05, "loss": 0.1581, "step": 6662 }, { "epoch": 1.3924764890282133, "grad_norm": 1.01471566997666, "learning_rate": 1.7934677857398445e-05, "loss": 0.1792, "step": 6663 }, { "epoch": 1.3926854754440963, "grad_norm": 1.065365114995772, "learning_rate": 1.793399121836464e-05, "loss": 0.163, "step": 6664 }, { "epoch": 1.3928944618599792, "grad_norm": 0.9172547895999172, "learning_rate": 1.7933304478358256e-05, "loss": 0.1672, "step": 6665 }, { "epoch": 1.3931034482758622, "grad_norm": 0.9145853777834929, "learning_rate": 1.7932617637388036e-05, "loss": 0.1627, "step": 6666 }, { "epoch": 1.3933124346917451, "grad_norm": 0.7690034060524167, "learning_rate": 1.793193069546272e-05, "loss": 0.1434, "step": 6667 }, { "epoch": 1.393521421107628, "grad_norm": 0.7668071026722868, "learning_rate": 1.7931243652591052e-05, "loss": 0.121, "step": 6668 }, { "epoch": 1.393730407523511, "grad_norm": 0.9313821983541817, "learning_rate": 1.7930556508781772e-05, "loss": 0.1625, "step": 6669 }, { "epoch": 1.393939393939394, "grad_norm": 1.0568515770098517, "learning_rate": 1.7929869264043627e-05, "loss": 0.1822, "step": 6670 }, { "epoch": 1.394148380355277, "grad_norm": 0.9347751988713652, "learning_rate": 1.7929181918385365e-05, "loss": 0.1846, "step": 6671 }, { "epoch": 1.39435736677116, "grad_norm": 0.9895050914822029, "learning_rate": 1.7928494471815732e-05, "loss": 0.1639, "step": 6672 }, { "epoch": 1.394566353187043, "grad_norm": 0.9337765570078367, "learning_rate": 1.7927806924343475e-05, "loss": 0.1302, "step": 6673 }, { "epoch": 1.3947753396029259, "grad_norm": 0.9446917667470695, "learning_rate": 1.7927119275977346e-05, "loss": 0.1915, "step": 6674 }, { "epoch": 1.3949843260188088, "grad_norm": 0.9648849748492645, "learning_rate": 1.79264315267261e-05, "loss": 0.1648, "step": 6675 }, { "epoch": 1.3951933124346918, "grad_norm": 0.976767952881257, "learning_rate": 1.7925743676598484e-05, "loss": 0.1505, "step": 6676 }, { "epoch": 1.3954022988505748, "grad_norm": 1.2018262364360441, "learning_rate": 1.7925055725603252e-05, "loss": 0.1929, "step": 6677 }, { "epoch": 1.3956112852664577, "grad_norm": 1.067184318751279, "learning_rate": 1.7924367673749167e-05, "loss": 0.1989, "step": 6678 }, { "epoch": 1.3958202716823407, "grad_norm": 0.742818638711814, "learning_rate": 1.7923679521044977e-05, "loss": 0.1639, "step": 6679 }, { "epoch": 1.3960292580982236, "grad_norm": 1.1246081465009656, "learning_rate": 1.7922991267499442e-05, "loss": 0.1458, "step": 6680 }, { "epoch": 1.3962382445141066, "grad_norm": 0.9885844849623988, "learning_rate": 1.792230291312132e-05, "loss": 0.1476, "step": 6681 }, { "epoch": 1.3964472309299896, "grad_norm": 1.0088876302869167, "learning_rate": 1.7921614457919377e-05, "loss": 0.1758, "step": 6682 }, { "epoch": 1.3966562173458725, "grad_norm": 0.9959898390421167, "learning_rate": 1.7920925901902368e-05, "loss": 0.174, "step": 6683 }, { "epoch": 1.3968652037617555, "grad_norm": 0.9844482946597125, "learning_rate": 1.7920237245079062e-05, "loss": 0.1719, "step": 6684 }, { "epoch": 1.3970741901776385, "grad_norm": 1.0675864843540872, "learning_rate": 1.791954848745822e-05, "loss": 0.1565, "step": 6685 }, { "epoch": 1.3972831765935214, "grad_norm": 1.1545763639334503, "learning_rate": 1.7918859629048605e-05, "loss": 0.1632, "step": 6686 }, { "epoch": 1.3974921630094044, "grad_norm": 1.0386177385607498, "learning_rate": 1.7918170669858986e-05, "loss": 0.1578, "step": 6687 }, { "epoch": 1.3977011494252873, "grad_norm": 1.016465517271601, "learning_rate": 1.7917481609898132e-05, "loss": 0.1761, "step": 6688 }, { "epoch": 1.3979101358411703, "grad_norm": 0.9398412628365382, "learning_rate": 1.7916792449174815e-05, "loss": 0.1322, "step": 6689 }, { "epoch": 1.3981191222570533, "grad_norm": 1.0019338953419386, "learning_rate": 1.79161031876978e-05, "loss": 0.1762, "step": 6690 }, { "epoch": 1.3983281086729362, "grad_norm": 1.1093347975501884, "learning_rate": 1.7915413825475864e-05, "loss": 0.1679, "step": 6691 }, { "epoch": 1.3985370950888192, "grad_norm": 0.9362127227159731, "learning_rate": 1.7914724362517778e-05, "loss": 0.1943, "step": 6692 }, { "epoch": 1.3987460815047021, "grad_norm": 0.9641850254283755, "learning_rate": 1.7914034798832315e-05, "loss": 0.1873, "step": 6693 }, { "epoch": 1.398955067920585, "grad_norm": 1.0269886257961096, "learning_rate": 1.791334513442825e-05, "loss": 0.1438, "step": 6694 }, { "epoch": 1.399164054336468, "grad_norm": 0.818330172043705, "learning_rate": 1.7912655369314363e-05, "loss": 0.1329, "step": 6695 }, { "epoch": 1.399373040752351, "grad_norm": 0.920782289893109, "learning_rate": 1.7911965503499435e-05, "loss": 0.1808, "step": 6696 }, { "epoch": 1.399582027168234, "grad_norm": 1.02660583805075, "learning_rate": 1.7911275536992238e-05, "loss": 0.1462, "step": 6697 }, { "epoch": 1.399791013584117, "grad_norm": 1.0027035725630915, "learning_rate": 1.791058546980156e-05, "loss": 0.1449, "step": 6698 }, { "epoch": 1.4, "grad_norm": 0.9699465365931795, "learning_rate": 1.790989530193618e-05, "loss": 0.1711, "step": 6699 }, { "epoch": 1.4002089864158829, "grad_norm": 0.8385449528266443, "learning_rate": 1.790920503340488e-05, "loss": 0.1534, "step": 6700 }, { "epoch": 1.4004179728317658, "grad_norm": 1.080523834165966, "learning_rate": 1.7908514664216445e-05, "loss": 0.1813, "step": 6701 }, { "epoch": 1.4006269592476488, "grad_norm": 0.9425124816560939, "learning_rate": 1.7907824194379662e-05, "loss": 0.1807, "step": 6702 }, { "epoch": 1.4008359456635318, "grad_norm": 1.0557113016533295, "learning_rate": 1.7907133623903324e-05, "loss": 0.1674, "step": 6703 }, { "epoch": 1.4010449320794147, "grad_norm": 1.0586142735699444, "learning_rate": 1.7906442952796212e-05, "loss": 0.1568, "step": 6704 }, { "epoch": 1.4012539184952977, "grad_norm": 1.0811819320752047, "learning_rate": 1.790575218106712e-05, "loss": 0.181, "step": 6705 }, { "epoch": 1.4014629049111806, "grad_norm": 1.08892065986028, "learning_rate": 1.7905061308724833e-05, "loss": 0.1597, "step": 6706 }, { "epoch": 1.4016718913270638, "grad_norm": 1.017742989719831, "learning_rate": 1.790437033577815e-05, "loss": 0.1847, "step": 6707 }, { "epoch": 1.4018808777429468, "grad_norm": 1.1409993198426667, "learning_rate": 1.7903679262235866e-05, "loss": 0.2016, "step": 6708 }, { "epoch": 1.4020898641588297, "grad_norm": 0.7929324000036393, "learning_rate": 1.7902988088106773e-05, "loss": 0.154, "step": 6709 }, { "epoch": 1.4022988505747127, "grad_norm": 0.8846013264468523, "learning_rate": 1.7902296813399663e-05, "loss": 0.1374, "step": 6710 }, { "epoch": 1.4025078369905957, "grad_norm": 1.0796616626741184, "learning_rate": 1.790160543812334e-05, "loss": 0.1779, "step": 6711 }, { "epoch": 1.4027168234064786, "grad_norm": 1.055875128943441, "learning_rate": 1.7900913962286598e-05, "loss": 0.1584, "step": 6712 }, { "epoch": 1.4029258098223616, "grad_norm": 1.1897932515368934, "learning_rate": 1.790022238589824e-05, "loss": 0.1575, "step": 6713 }, { "epoch": 1.4031347962382446, "grad_norm": 1.1555496514157253, "learning_rate": 1.7899530708967067e-05, "loss": 0.1562, "step": 6714 }, { "epoch": 1.4033437826541275, "grad_norm": 1.021512733279296, "learning_rate": 1.7898838931501884e-05, "loss": 0.1643, "step": 6715 }, { "epoch": 1.4035527690700105, "grad_norm": 0.9538214717860299, "learning_rate": 1.789814705351149e-05, "loss": 0.1971, "step": 6716 }, { "epoch": 1.4037617554858934, "grad_norm": 0.9723602006180266, "learning_rate": 1.7897455075004693e-05, "loss": 0.1361, "step": 6717 }, { "epoch": 1.4039707419017764, "grad_norm": 1.369105691176168, "learning_rate": 1.78967629959903e-05, "loss": 0.1497, "step": 6718 }, { "epoch": 1.4041797283176594, "grad_norm": 1.0222091543595389, "learning_rate": 1.7896070816477116e-05, "loss": 0.1735, "step": 6719 }, { "epoch": 1.4043887147335423, "grad_norm": 0.834947663426419, "learning_rate": 1.7895378536473953e-05, "loss": 0.1558, "step": 6720 }, { "epoch": 1.4045977011494253, "grad_norm": 0.9945659782604831, "learning_rate": 1.7894686155989623e-05, "loss": 0.195, "step": 6721 }, { "epoch": 1.4048066875653082, "grad_norm": 0.9229779222245519, "learning_rate": 1.7893993675032932e-05, "loss": 0.1338, "step": 6722 }, { "epoch": 1.4050156739811912, "grad_norm": 1.0393862557007774, "learning_rate": 1.78933010936127e-05, "loss": 0.173, "step": 6723 }, { "epoch": 1.4052246603970742, "grad_norm": 1.0128886781175026, "learning_rate": 1.7892608411737736e-05, "loss": 0.1628, "step": 6724 }, { "epoch": 1.4054336468129571, "grad_norm": 0.9612583139112478, "learning_rate": 1.7891915629416852e-05, "loss": 0.166, "step": 6725 }, { "epoch": 1.40564263322884, "grad_norm": 1.135563157420779, "learning_rate": 1.7891222746658875e-05, "loss": 0.1778, "step": 6726 }, { "epoch": 1.405851619644723, "grad_norm": 0.8772762346917339, "learning_rate": 1.7890529763472615e-05, "loss": 0.1565, "step": 6727 }, { "epoch": 1.406060606060606, "grad_norm": 0.980288483766933, "learning_rate": 1.7889836679866893e-05, "loss": 0.1705, "step": 6728 }, { "epoch": 1.406269592476489, "grad_norm": 1.0622945144329994, "learning_rate": 1.7889143495850535e-05, "loss": 0.1656, "step": 6729 }, { "epoch": 1.406478578892372, "grad_norm": 1.2609161361370045, "learning_rate": 1.7888450211432356e-05, "loss": 0.1824, "step": 6730 }, { "epoch": 1.406687565308255, "grad_norm": 1.0355871746444372, "learning_rate": 1.788775682662118e-05, "loss": 0.176, "step": 6731 }, { "epoch": 1.4068965517241379, "grad_norm": 1.017259852640134, "learning_rate": 1.788706334142583e-05, "loss": 0.1702, "step": 6732 }, { "epoch": 1.4071055381400208, "grad_norm": 1.0075217708980135, "learning_rate": 1.788636975585514e-05, "loss": 0.1854, "step": 6733 }, { "epoch": 1.407314524555904, "grad_norm": 1.0396297656504079, "learning_rate": 1.788567606991793e-05, "loss": 0.1714, "step": 6734 }, { "epoch": 1.407523510971787, "grad_norm": 1.0592433409425197, "learning_rate": 1.788498228362303e-05, "loss": 0.1666, "step": 6735 }, { "epoch": 1.40773249738767, "grad_norm": 1.0010561695801579, "learning_rate": 1.7884288396979265e-05, "loss": 0.1426, "step": 6736 }, { "epoch": 1.4079414838035529, "grad_norm": 0.9242330537648107, "learning_rate": 1.7883594409995476e-05, "loss": 0.1548, "step": 6737 }, { "epoch": 1.4081504702194358, "grad_norm": 1.0952868318003948, "learning_rate": 1.788290032268049e-05, "loss": 0.1197, "step": 6738 }, { "epoch": 1.4083594566353188, "grad_norm": 0.930919705811946, "learning_rate": 1.7882206135043133e-05, "loss": 0.1454, "step": 6739 }, { "epoch": 1.4085684430512018, "grad_norm": 0.9804761048743985, "learning_rate": 1.788151184709225e-05, "loss": 0.1621, "step": 6740 }, { "epoch": 1.4087774294670847, "grad_norm": 0.8980857088209971, "learning_rate": 1.788081745883667e-05, "loss": 0.1579, "step": 6741 }, { "epoch": 1.4089864158829677, "grad_norm": 0.9570473589877779, "learning_rate": 1.788012297028524e-05, "loss": 0.1588, "step": 6742 }, { "epoch": 1.4091954022988507, "grad_norm": 0.8813613336567464, "learning_rate": 1.7879428381446783e-05, "loss": 0.1893, "step": 6743 }, { "epoch": 1.4094043887147336, "grad_norm": 0.9884149682418341, "learning_rate": 1.787873369233015e-05, "loss": 0.1792, "step": 6744 }, { "epoch": 1.4096133751306166, "grad_norm": 1.3519271589213135, "learning_rate": 1.7878038902944187e-05, "loss": 0.1922, "step": 6745 }, { "epoch": 1.4098223615464995, "grad_norm": 1.0592917737225789, "learning_rate": 1.7877344013297722e-05, "loss": 0.1963, "step": 6746 }, { "epoch": 1.4100313479623825, "grad_norm": 0.936241543539578, "learning_rate": 1.7876649023399605e-05, "loss": 0.1617, "step": 6747 }, { "epoch": 1.4102403343782655, "grad_norm": 1.0549538570617347, "learning_rate": 1.7875953933258683e-05, "loss": 0.1887, "step": 6748 }, { "epoch": 1.4104493207941484, "grad_norm": 0.8879908623999986, "learning_rate": 1.7875258742883802e-05, "loss": 0.1556, "step": 6749 }, { "epoch": 1.4106583072100314, "grad_norm": 1.1755468540402496, "learning_rate": 1.7874563452283805e-05, "loss": 0.1779, "step": 6750 }, { "epoch": 1.4108672936259143, "grad_norm": 1.1307823529205674, "learning_rate": 1.7873868061467542e-05, "loss": 0.1597, "step": 6751 }, { "epoch": 1.4110762800417973, "grad_norm": 1.085276396032345, "learning_rate": 1.7873172570443866e-05, "loss": 0.16, "step": 6752 }, { "epoch": 1.4112852664576803, "grad_norm": 0.9547978626141154, "learning_rate": 1.7872476979221627e-05, "loss": 0.1709, "step": 6753 }, { "epoch": 1.4114942528735632, "grad_norm": 0.8754432726163707, "learning_rate": 1.7871781287809678e-05, "loss": 0.1642, "step": 6754 }, { "epoch": 1.4117032392894462, "grad_norm": 1.0889396075637245, "learning_rate": 1.787108549621687e-05, "loss": 0.153, "step": 6755 }, { "epoch": 1.4119122257053291, "grad_norm": 1.1345296774828648, "learning_rate": 1.787038960445206e-05, "loss": 0.1522, "step": 6756 }, { "epoch": 1.412121212121212, "grad_norm": 1.0069502401705992, "learning_rate": 1.7869693612524107e-05, "loss": 0.1835, "step": 6757 }, { "epoch": 1.412330198537095, "grad_norm": 1.1013673424169592, "learning_rate": 1.786899752044186e-05, "loss": 0.1474, "step": 6758 }, { "epoch": 1.412539184952978, "grad_norm": 0.9947712964175504, "learning_rate": 1.786830132821419e-05, "loss": 0.1603, "step": 6759 }, { "epoch": 1.412748171368861, "grad_norm": 0.9692688315946728, "learning_rate": 1.786760503584995e-05, "loss": 0.1799, "step": 6760 }, { "epoch": 1.412957157784744, "grad_norm": 1.1358525329218718, "learning_rate": 1.7866908643357995e-05, "loss": 0.1417, "step": 6761 }, { "epoch": 1.413166144200627, "grad_norm": 1.0611457310060926, "learning_rate": 1.78662121507472e-05, "loss": 0.1751, "step": 6762 }, { "epoch": 1.4133751306165099, "grad_norm": 0.8478421133034266, "learning_rate": 1.786551555802643e-05, "loss": 0.1475, "step": 6763 }, { "epoch": 1.4135841170323928, "grad_norm": 1.1288881295282047, "learning_rate": 1.7864818865204533e-05, "loss": 0.1963, "step": 6764 }, { "epoch": 1.4137931034482758, "grad_norm": 1.0192320600409859, "learning_rate": 1.7864122072290394e-05, "loss": 0.1615, "step": 6765 }, { "epoch": 1.4140020898641588, "grad_norm": 0.8220601497694008, "learning_rate": 1.786342517929287e-05, "loss": 0.1575, "step": 6766 }, { "epoch": 1.4142110762800417, "grad_norm": 1.0393461919586633, "learning_rate": 1.7862728186220835e-05, "loss": 0.174, "step": 6767 }, { "epoch": 1.4144200626959247, "grad_norm": 1.100976952263122, "learning_rate": 1.786203109308316e-05, "loss": 0.19, "step": 6768 }, { "epoch": 1.4146290491118076, "grad_norm": 1.1410605111118504, "learning_rate": 1.786133389988871e-05, "loss": 0.1925, "step": 6769 }, { "epoch": 1.4148380355276906, "grad_norm": 1.00039168907599, "learning_rate": 1.7860636606646364e-05, "loss": 0.1486, "step": 6770 }, { "epoch": 1.4150470219435736, "grad_norm": 1.1045031674483146, "learning_rate": 1.7859939213364996e-05, "loss": 0.159, "step": 6771 }, { "epoch": 1.4152560083594565, "grad_norm": 0.9914260398640586, "learning_rate": 1.7859241720053478e-05, "loss": 0.193, "step": 6772 }, { "epoch": 1.4154649947753395, "grad_norm": 0.8913672066004408, "learning_rate": 1.7858544126720688e-05, "loss": 0.1675, "step": 6773 }, { "epoch": 1.4156739811912225, "grad_norm": 1.0636899067680707, "learning_rate": 1.7857846433375506e-05, "loss": 0.1877, "step": 6774 }, { "epoch": 1.4158829676071054, "grad_norm": 1.1330806457871774, "learning_rate": 1.7857148640026813e-05, "loss": 0.1721, "step": 6775 }, { "epoch": 1.4160919540229884, "grad_norm": 1.2678238430072999, "learning_rate": 1.785645074668348e-05, "loss": 0.1572, "step": 6776 }, { "epoch": 1.4163009404388713, "grad_norm": 0.9264724939259428, "learning_rate": 1.7855752753354398e-05, "loss": 0.1445, "step": 6777 }, { "epoch": 1.4165099268547545, "grad_norm": 1.183590514452464, "learning_rate": 1.7855054660048448e-05, "loss": 0.157, "step": 6778 }, { "epoch": 1.4167189132706375, "grad_norm": 1.0217079542396417, "learning_rate": 1.7854356466774513e-05, "loss": 0.1547, "step": 6779 }, { "epoch": 1.4169278996865204, "grad_norm": 0.9202054540343921, "learning_rate": 1.785365817354148e-05, "loss": 0.1726, "step": 6780 }, { "epoch": 1.4171368861024034, "grad_norm": 0.8225986272420563, "learning_rate": 1.7852959780358235e-05, "loss": 0.1679, "step": 6781 }, { "epoch": 1.4173458725182864, "grad_norm": 0.9281427442525098, "learning_rate": 1.7852261287233665e-05, "loss": 0.1655, "step": 6782 }, { "epoch": 1.4175548589341693, "grad_norm": 0.9830767098528769, "learning_rate": 1.785156269417666e-05, "loss": 0.1905, "step": 6783 }, { "epoch": 1.4177638453500523, "grad_norm": 1.040816729087237, "learning_rate": 1.7850864001196112e-05, "loss": 0.1607, "step": 6784 }, { "epoch": 1.4179728317659352, "grad_norm": 0.8628332437175382, "learning_rate": 1.7850165208300914e-05, "loss": 0.1415, "step": 6785 }, { "epoch": 1.4181818181818182, "grad_norm": 0.8876435515358735, "learning_rate": 1.784946631549996e-05, "loss": 0.1721, "step": 6786 }, { "epoch": 1.4183908045977012, "grad_norm": 0.9787119273831932, "learning_rate": 1.7848767322802136e-05, "loss": 0.1473, "step": 6787 }, { "epoch": 1.4185997910135841, "grad_norm": 1.02205706845528, "learning_rate": 1.7848068230216347e-05, "loss": 0.169, "step": 6788 }, { "epoch": 1.418808777429467, "grad_norm": 1.159910359805427, "learning_rate": 1.7847369037751486e-05, "loss": 0.1771, "step": 6789 }, { "epoch": 1.41901776384535, "grad_norm": 0.944063182537613, "learning_rate": 1.7846669745416453e-05, "loss": 0.1539, "step": 6790 }, { "epoch": 1.419226750261233, "grad_norm": 0.8228383257069202, "learning_rate": 1.7845970353220145e-05, "loss": 0.1341, "step": 6791 }, { "epoch": 1.419435736677116, "grad_norm": 1.7458986379568964, "learning_rate": 1.7845270861171467e-05, "loss": 0.1791, "step": 6792 }, { "epoch": 1.419644723092999, "grad_norm": 1.0269125345932582, "learning_rate": 1.784457126927932e-05, "loss": 0.165, "step": 6793 }, { "epoch": 1.419853709508882, "grad_norm": 0.9558692917503306, "learning_rate": 1.7843871577552604e-05, "loss": 0.1499, "step": 6794 }, { "epoch": 1.4200626959247649, "grad_norm": 0.8904196857180057, "learning_rate": 1.7843171786000225e-05, "loss": 0.1479, "step": 6795 }, { "epoch": 1.4202716823406478, "grad_norm": 1.0026938558020166, "learning_rate": 1.784247189463109e-05, "loss": 0.1507, "step": 6796 }, { "epoch": 1.4204806687565308, "grad_norm": 0.9763755068994316, "learning_rate": 1.7841771903454106e-05, "loss": 0.1562, "step": 6797 }, { "epoch": 1.4206896551724137, "grad_norm": 0.9632056443433517, "learning_rate": 1.784107181247819e-05, "loss": 0.1553, "step": 6798 }, { "epoch": 1.4208986415882967, "grad_norm": 1.2005620480223236, "learning_rate": 1.7840371621712233e-05, "loss": 0.1835, "step": 6799 }, { "epoch": 1.4211076280041797, "grad_norm": 1.0535678164994176, "learning_rate": 1.783967133116516e-05, "loss": 0.1765, "step": 6800 }, { "epoch": 1.4213166144200626, "grad_norm": 0.980223909425754, "learning_rate": 1.7838970940845883e-05, "loss": 0.1847, "step": 6801 }, { "epoch": 1.4215256008359456, "grad_norm": 0.9556354105682427, "learning_rate": 1.7838270450763307e-05, "loss": 0.1407, "step": 6802 }, { "epoch": 1.4217345872518286, "grad_norm": 1.189543332300921, "learning_rate": 1.7837569860926356e-05, "loss": 0.2022, "step": 6803 }, { "epoch": 1.4219435736677117, "grad_norm": 1.0673894647848097, "learning_rate": 1.7836869171343937e-05, "loss": 0.1706, "step": 6804 }, { "epoch": 1.4221525600835947, "grad_norm": 1.1729542402876694, "learning_rate": 1.7836168382024978e-05, "loss": 0.1803, "step": 6805 }, { "epoch": 1.4223615464994777, "grad_norm": 1.0753100922147696, "learning_rate": 1.783546749297839e-05, "loss": 0.1494, "step": 6806 }, { "epoch": 1.4225705329153606, "grad_norm": 1.0341190576077584, "learning_rate": 1.7834766504213097e-05, "loss": 0.1802, "step": 6807 }, { "epoch": 1.4227795193312436, "grad_norm": 1.074234215167939, "learning_rate": 1.7834065415738018e-05, "loss": 0.1589, "step": 6808 }, { "epoch": 1.4229885057471265, "grad_norm": 0.9075949699257125, "learning_rate": 1.7833364227562072e-05, "loss": 0.1638, "step": 6809 }, { "epoch": 1.4231974921630095, "grad_norm": 0.9795000546958119, "learning_rate": 1.7832662939694193e-05, "loss": 0.1742, "step": 6810 }, { "epoch": 1.4234064785788925, "grad_norm": 1.0005449434193199, "learning_rate": 1.7831961552143297e-05, "loss": 0.1497, "step": 6811 }, { "epoch": 1.4236154649947754, "grad_norm": 1.4232969842720056, "learning_rate": 1.783126006491831e-05, "loss": 0.1948, "step": 6812 }, { "epoch": 1.4238244514106584, "grad_norm": 0.9042706370569422, "learning_rate": 1.7830558478028167e-05, "loss": 0.135, "step": 6813 }, { "epoch": 1.4240334378265413, "grad_norm": 0.82168670887428, "learning_rate": 1.782985679148179e-05, "loss": 0.147, "step": 6814 }, { "epoch": 1.4242424242424243, "grad_norm": 1.231856105950873, "learning_rate": 1.782915500528811e-05, "loss": 0.1816, "step": 6815 }, { "epoch": 1.4244514106583073, "grad_norm": 0.9897801109045532, "learning_rate": 1.782845311945606e-05, "loss": 0.1538, "step": 6816 }, { "epoch": 1.4246603970741902, "grad_norm": 1.029433460374056, "learning_rate": 1.7827751133994576e-05, "loss": 0.166, "step": 6817 }, { "epoch": 1.4248693834900732, "grad_norm": 1.4169003902119714, "learning_rate": 1.7827049048912583e-05, "loss": 0.218, "step": 6818 }, { "epoch": 1.4250783699059562, "grad_norm": 1.0782332046106926, "learning_rate": 1.7826346864219024e-05, "loss": 0.1938, "step": 6819 }, { "epoch": 1.4252873563218391, "grad_norm": 0.8674161304285944, "learning_rate": 1.7825644579922833e-05, "loss": 0.1782, "step": 6820 }, { "epoch": 1.425496342737722, "grad_norm": 1.0743189263791668, "learning_rate": 1.7824942196032942e-05, "loss": 0.1845, "step": 6821 }, { "epoch": 1.425705329153605, "grad_norm": 0.8849513713368349, "learning_rate": 1.7824239712558303e-05, "loss": 0.1558, "step": 6822 }, { "epoch": 1.425914315569488, "grad_norm": 1.0639266953714488, "learning_rate": 1.7823537129507845e-05, "loss": 0.1965, "step": 6823 }, { "epoch": 1.426123301985371, "grad_norm": 0.9682299433845782, "learning_rate": 1.782283444689051e-05, "loss": 0.1645, "step": 6824 }, { "epoch": 1.426332288401254, "grad_norm": 1.0179716595972743, "learning_rate": 1.782213166471525e-05, "loss": 0.1465, "step": 6825 }, { "epoch": 1.4265412748171369, "grad_norm": 1.0318991312552388, "learning_rate": 1.7821428782991e-05, "loss": 0.1632, "step": 6826 }, { "epoch": 1.4267502612330198, "grad_norm": 1.0818866740843607, "learning_rate": 1.782072580172671e-05, "loss": 0.2068, "step": 6827 }, { "epoch": 1.4269592476489028, "grad_norm": 1.0182631024309736, "learning_rate": 1.782002272093132e-05, "loss": 0.1638, "step": 6828 }, { "epoch": 1.4271682340647858, "grad_norm": 1.1824972830963612, "learning_rate": 1.781931954061378e-05, "loss": 0.1521, "step": 6829 }, { "epoch": 1.4273772204806687, "grad_norm": 1.1182619227166033, "learning_rate": 1.781861626078305e-05, "loss": 0.1865, "step": 6830 }, { "epoch": 1.4275862068965517, "grad_norm": 0.7694991130307993, "learning_rate": 1.7817912881448066e-05, "loss": 0.1532, "step": 6831 }, { "epoch": 1.4277951933124347, "grad_norm": 1.1432012497226132, "learning_rate": 1.7817209402617786e-05, "loss": 0.1726, "step": 6832 }, { "epoch": 1.4280041797283176, "grad_norm": 0.992495433879916, "learning_rate": 1.7816505824301166e-05, "loss": 0.1715, "step": 6833 }, { "epoch": 1.4282131661442006, "grad_norm": 0.8186521089829826, "learning_rate": 1.7815802146507154e-05, "loss": 0.1629, "step": 6834 }, { "epoch": 1.4284221525600835, "grad_norm": 0.9883177544508824, "learning_rate": 1.7815098369244707e-05, "loss": 0.1789, "step": 6835 }, { "epoch": 1.4286311389759665, "grad_norm": 1.1855192007554458, "learning_rate": 1.7814394492522784e-05, "loss": 0.1416, "step": 6836 }, { "epoch": 1.4288401253918495, "grad_norm": 1.0214262603541815, "learning_rate": 1.781369051635034e-05, "loss": 0.1753, "step": 6837 }, { "epoch": 1.4290491118077324, "grad_norm": 1.198771818157644, "learning_rate": 1.7812986440736337e-05, "loss": 0.1419, "step": 6838 }, { "epoch": 1.4292580982236154, "grad_norm": 1.075650672654944, "learning_rate": 1.7812282265689732e-05, "loss": 0.1811, "step": 6839 }, { "epoch": 1.4294670846394983, "grad_norm": 1.113652020067802, "learning_rate": 1.781157799121949e-05, "loss": 0.1741, "step": 6840 }, { "epoch": 1.4296760710553813, "grad_norm": 0.8918958166176699, "learning_rate": 1.7810873617334575e-05, "loss": 0.1454, "step": 6841 }, { "epoch": 1.4298850574712643, "grad_norm": 1.0454701575828231, "learning_rate": 1.7810169144043946e-05, "loss": 0.2085, "step": 6842 }, { "epoch": 1.4300940438871472, "grad_norm": 1.313784449250103, "learning_rate": 1.7809464571356575e-05, "loss": 0.1933, "step": 6843 }, { "epoch": 1.4303030303030302, "grad_norm": 1.017759393963154, "learning_rate": 1.780875989928142e-05, "loss": 0.1484, "step": 6844 }, { "epoch": 1.4305120167189131, "grad_norm": 1.124533168286731, "learning_rate": 1.7808055127827463e-05, "loss": 0.1736, "step": 6845 }, { "epoch": 1.430721003134796, "grad_norm": 1.084819409781971, "learning_rate": 1.780735025700366e-05, "loss": 0.1814, "step": 6846 }, { "epoch": 1.430929989550679, "grad_norm": 1.0924595317531014, "learning_rate": 1.7806645286818986e-05, "loss": 0.1603, "step": 6847 }, { "epoch": 1.4311389759665623, "grad_norm": 0.8292770344735244, "learning_rate": 1.7805940217282414e-05, "loss": 0.1582, "step": 6848 }, { "epoch": 1.4313479623824452, "grad_norm": 1.3438259380327668, "learning_rate": 1.7805235048402914e-05, "loss": 0.1823, "step": 6849 }, { "epoch": 1.4315569487983282, "grad_norm": 1.0908065348438378, "learning_rate": 1.780452978018947e-05, "loss": 0.1567, "step": 6850 }, { "epoch": 1.4317659352142111, "grad_norm": 1.1457744772589298, "learning_rate": 1.7803824412651043e-05, "loss": 0.193, "step": 6851 }, { "epoch": 1.431974921630094, "grad_norm": 0.8315530048654401, "learning_rate": 1.780311894579662e-05, "loss": 0.166, "step": 6852 }, { "epoch": 1.432183908045977, "grad_norm": 0.9939401772259564, "learning_rate": 1.7802413379635176e-05, "loss": 0.1824, "step": 6853 }, { "epoch": 1.43239289446186, "grad_norm": 0.9386355464060099, "learning_rate": 1.7801707714175692e-05, "loss": 0.1521, "step": 6854 }, { "epoch": 1.432601880877743, "grad_norm": 1.051123264437223, "learning_rate": 1.780100194942715e-05, "loss": 0.1735, "step": 6855 }, { "epoch": 1.432810867293626, "grad_norm": 1.0135079721004787, "learning_rate": 1.7800296085398528e-05, "loss": 0.1606, "step": 6856 }, { "epoch": 1.433019853709509, "grad_norm": 1.0986175755969045, "learning_rate": 1.779959012209881e-05, "loss": 0.1797, "step": 6857 }, { "epoch": 1.4332288401253919, "grad_norm": 1.0743725532558979, "learning_rate": 1.779888405953698e-05, "loss": 0.1427, "step": 6858 }, { "epoch": 1.4334378265412748, "grad_norm": 0.9423019603658958, "learning_rate": 1.779817789772203e-05, "loss": 0.1794, "step": 6859 }, { "epoch": 1.4336468129571578, "grad_norm": 0.8335728299804056, "learning_rate": 1.779747163666294e-05, "loss": 0.1233, "step": 6860 }, { "epoch": 1.4338557993730408, "grad_norm": 1.017628628626227, "learning_rate": 1.7796765276368697e-05, "loss": 0.17, "step": 6861 }, { "epoch": 1.4340647857889237, "grad_norm": 0.9394671936773136, "learning_rate": 1.7796058816848297e-05, "loss": 0.1698, "step": 6862 }, { "epoch": 1.4342737722048067, "grad_norm": 0.9327292002291139, "learning_rate": 1.779535225811073e-05, "loss": 0.1699, "step": 6863 }, { "epoch": 1.4344827586206896, "grad_norm": 1.094442877192161, "learning_rate": 1.7794645600164985e-05, "loss": 0.1623, "step": 6864 }, { "epoch": 1.4346917450365726, "grad_norm": 0.8275531142778587, "learning_rate": 1.7793938843020054e-05, "loss": 0.142, "step": 6865 }, { "epoch": 1.4349007314524556, "grad_norm": 1.1208279164262576, "learning_rate": 1.7793231986684935e-05, "loss": 0.2311, "step": 6866 }, { "epoch": 1.4351097178683385, "grad_norm": 0.9382341164869341, "learning_rate": 1.7792525031168625e-05, "loss": 0.1691, "step": 6867 }, { "epoch": 1.4353187042842215, "grad_norm": 1.048779328787848, "learning_rate": 1.7791817976480118e-05, "loss": 0.139, "step": 6868 }, { "epoch": 1.4355276907001044, "grad_norm": 1.2523774743416756, "learning_rate": 1.779111082262841e-05, "loss": 0.1873, "step": 6869 }, { "epoch": 1.4357366771159874, "grad_norm": 1.0015104709026332, "learning_rate": 1.7790403569622508e-05, "loss": 0.1872, "step": 6870 }, { "epoch": 1.4359456635318704, "grad_norm": 0.8602731801686134, "learning_rate": 1.7789696217471405e-05, "loss": 0.1552, "step": 6871 }, { "epoch": 1.4361546499477533, "grad_norm": 0.9544303685795574, "learning_rate": 1.7788988766184112e-05, "loss": 0.1578, "step": 6872 }, { "epoch": 1.4363636363636363, "grad_norm": 1.073321793421317, "learning_rate": 1.7788281215769624e-05, "loss": 0.173, "step": 6873 }, { "epoch": 1.4365726227795192, "grad_norm": 1.0723944044778992, "learning_rate": 1.778757356623695e-05, "loss": 0.1342, "step": 6874 }, { "epoch": 1.4367816091954024, "grad_norm": 0.9338822946782215, "learning_rate": 1.7786865817595095e-05, "loss": 0.1394, "step": 6875 }, { "epoch": 1.4369905956112854, "grad_norm": 0.9643679711240376, "learning_rate": 1.7786157969853065e-05, "loss": 0.1593, "step": 6876 }, { "epoch": 1.4371995820271684, "grad_norm": 0.9680671032643693, "learning_rate": 1.7785450023019873e-05, "loss": 0.1381, "step": 6877 }, { "epoch": 1.4374085684430513, "grad_norm": 1.3870807151680231, "learning_rate": 1.7784741977104524e-05, "loss": 0.1536, "step": 6878 }, { "epoch": 1.4376175548589343, "grad_norm": 1.1453859162609756, "learning_rate": 1.778403383211603e-05, "loss": 0.1682, "step": 6879 }, { "epoch": 1.4378265412748172, "grad_norm": 0.8761296176747484, "learning_rate": 1.7783325588063406e-05, "loss": 0.172, "step": 6880 }, { "epoch": 1.4380355276907002, "grad_norm": 0.8757653588648443, "learning_rate": 1.778261724495566e-05, "loss": 0.173, "step": 6881 }, { "epoch": 1.4382445141065832, "grad_norm": 1.1014271036724372, "learning_rate": 1.7781908802801814e-05, "loss": 0.193, "step": 6882 }, { "epoch": 1.4384535005224661, "grad_norm": 0.9064462008882321, "learning_rate": 1.7781200261610874e-05, "loss": 0.1748, "step": 6883 }, { "epoch": 1.438662486938349, "grad_norm": 0.8640293625825113, "learning_rate": 1.778049162139187e-05, "loss": 0.185, "step": 6884 }, { "epoch": 1.438871473354232, "grad_norm": 0.997881573597116, "learning_rate": 1.7779782882153813e-05, "loss": 0.2167, "step": 6885 }, { "epoch": 1.439080459770115, "grad_norm": 1.1206973371351092, "learning_rate": 1.7779074043905722e-05, "loss": 0.1365, "step": 6886 }, { "epoch": 1.439289446185998, "grad_norm": 1.192797320667016, "learning_rate": 1.777836510665662e-05, "loss": 0.1826, "step": 6887 }, { "epoch": 1.439498432601881, "grad_norm": 0.8943740220703237, "learning_rate": 1.7777656070415533e-05, "loss": 0.1763, "step": 6888 }, { "epoch": 1.439707419017764, "grad_norm": 1.092464980571854, "learning_rate": 1.7776946935191474e-05, "loss": 0.1828, "step": 6889 }, { "epoch": 1.4399164054336469, "grad_norm": 1.0217332380232804, "learning_rate": 1.777623770099348e-05, "loss": 0.1513, "step": 6890 }, { "epoch": 1.4401253918495298, "grad_norm": 0.9541348524818424, "learning_rate": 1.7775528367830572e-05, "loss": 0.1655, "step": 6891 }, { "epoch": 1.4403343782654128, "grad_norm": 0.9585030977441124, "learning_rate": 1.7774818935711775e-05, "loss": 0.153, "step": 6892 }, { "epoch": 1.4405433646812957, "grad_norm": 1.0675750372757782, "learning_rate": 1.7774109404646122e-05, "loss": 0.1769, "step": 6893 }, { "epoch": 1.4407523510971787, "grad_norm": 0.9855186414182819, "learning_rate": 1.7773399774642637e-05, "loss": 0.1463, "step": 6894 }, { "epoch": 1.4409613375130617, "grad_norm": 0.9942907182170331, "learning_rate": 1.7772690045710363e-05, "loss": 0.1829, "step": 6895 }, { "epoch": 1.4411703239289446, "grad_norm": 0.972146306190848, "learning_rate": 1.7771980217858318e-05, "loss": 0.171, "step": 6896 }, { "epoch": 1.4413793103448276, "grad_norm": 1.148564575699074, "learning_rate": 1.7771270291095542e-05, "loss": 0.1938, "step": 6897 }, { "epoch": 1.4415882967607105, "grad_norm": 1.1257295072391047, "learning_rate": 1.7770560265431075e-05, "loss": 0.1813, "step": 6898 }, { "epoch": 1.4417972831765935, "grad_norm": 1.0404383239866586, "learning_rate": 1.7769850140873943e-05, "loss": 0.1548, "step": 6899 }, { "epoch": 1.4420062695924765, "grad_norm": 1.1877484321525658, "learning_rate": 1.7769139917433192e-05, "loss": 0.1817, "step": 6900 }, { "epoch": 1.4422152560083594, "grad_norm": 1.2426863019463465, "learning_rate": 1.7768429595117854e-05, "loss": 0.213, "step": 6901 }, { "epoch": 1.4424242424242424, "grad_norm": 1.1176575971421938, "learning_rate": 1.7767719173936974e-05, "loss": 0.1901, "step": 6902 }, { "epoch": 1.4426332288401253, "grad_norm": 1.2004115372047306, "learning_rate": 1.7767008653899592e-05, "loss": 0.1654, "step": 6903 }, { "epoch": 1.4428422152560083, "grad_norm": 1.075115194482945, "learning_rate": 1.776629803501475e-05, "loss": 0.1596, "step": 6904 }, { "epoch": 1.4430512016718913, "grad_norm": 1.0988609219139844, "learning_rate": 1.7765587317291494e-05, "loss": 0.2015, "step": 6905 }, { "epoch": 1.4432601880877742, "grad_norm": 0.9239220240064144, "learning_rate": 1.7764876500738866e-05, "loss": 0.1643, "step": 6906 }, { "epoch": 1.4434691745036572, "grad_norm": 1.0544861840656934, "learning_rate": 1.7764165585365912e-05, "loss": 0.1324, "step": 6907 }, { "epoch": 1.4436781609195402, "grad_norm": 1.0908896094536564, "learning_rate": 1.776345457118168e-05, "loss": 0.1901, "step": 6908 }, { "epoch": 1.4438871473354231, "grad_norm": 0.9796522156241224, "learning_rate": 1.7762743458195222e-05, "loss": 0.1588, "step": 6909 }, { "epoch": 1.444096133751306, "grad_norm": 0.9143568353306887, "learning_rate": 1.7762032246415584e-05, "loss": 0.1702, "step": 6910 }, { "epoch": 1.444305120167189, "grad_norm": 1.2990024971853964, "learning_rate": 1.776132093585182e-05, "loss": 0.1988, "step": 6911 }, { "epoch": 1.444514106583072, "grad_norm": 0.8832229318311535, "learning_rate": 1.7760609526512978e-05, "loss": 0.1615, "step": 6912 }, { "epoch": 1.444723092998955, "grad_norm": 1.047676333283156, "learning_rate": 1.7759898018408117e-05, "loss": 0.195, "step": 6913 }, { "epoch": 1.444932079414838, "grad_norm": 0.961981130823853, "learning_rate": 1.775918641154629e-05, "loss": 0.1813, "step": 6914 }, { "epoch": 1.4451410658307209, "grad_norm": 1.136053515533746, "learning_rate": 1.7758474705936555e-05, "loss": 0.1983, "step": 6915 }, { "epoch": 1.4453500522466038, "grad_norm": 0.9208028475463286, "learning_rate": 1.775776290158797e-05, "loss": 0.1973, "step": 6916 }, { "epoch": 1.4455590386624868, "grad_norm": 0.9529770145908394, "learning_rate": 1.7757050998509587e-05, "loss": 0.1711, "step": 6917 }, { "epoch": 1.4457680250783698, "grad_norm": 0.9391536332607847, "learning_rate": 1.7756338996710474e-05, "loss": 0.18, "step": 6918 }, { "epoch": 1.445977011494253, "grad_norm": 1.4066641040338765, "learning_rate": 1.775562689619969e-05, "loss": 0.161, "step": 6919 }, { "epoch": 1.446185997910136, "grad_norm": 0.9240703937855579, "learning_rate": 1.7754914696986295e-05, "loss": 0.1361, "step": 6920 }, { "epoch": 1.4463949843260189, "grad_norm": 1.230429548365247, "learning_rate": 1.7754202399079355e-05, "loss": 0.1792, "step": 6921 }, { "epoch": 1.4466039707419018, "grad_norm": 0.9616181384865491, "learning_rate": 1.7753490002487934e-05, "loss": 0.1848, "step": 6922 }, { "epoch": 1.4468129571577848, "grad_norm": 0.8826472826617134, "learning_rate": 1.7752777507221104e-05, "loss": 0.1536, "step": 6923 }, { "epoch": 1.4470219435736678, "grad_norm": 1.1457358480677886, "learning_rate": 1.7752064913287923e-05, "loss": 0.2034, "step": 6924 }, { "epoch": 1.4472309299895507, "grad_norm": 1.0361157997510522, "learning_rate": 1.775135222069747e-05, "loss": 0.2214, "step": 6925 }, { "epoch": 1.4474399164054337, "grad_norm": 0.9079017988510856, "learning_rate": 1.7750639429458805e-05, "loss": 0.1685, "step": 6926 }, { "epoch": 1.4476489028213166, "grad_norm": 0.8700649590468269, "learning_rate": 1.7749926539581003e-05, "loss": 0.1494, "step": 6927 }, { "epoch": 1.4478578892371996, "grad_norm": 0.8045859346599875, "learning_rate": 1.7749213551073142e-05, "loss": 0.1637, "step": 6928 }, { "epoch": 1.4480668756530826, "grad_norm": 0.910609442301072, "learning_rate": 1.774850046394429e-05, "loss": 0.166, "step": 6929 }, { "epoch": 1.4482758620689655, "grad_norm": 1.0703253723776436, "learning_rate": 1.7747787278203527e-05, "loss": 0.1965, "step": 6930 }, { "epoch": 1.4484848484848485, "grad_norm": 0.9334166158058887, "learning_rate": 1.7747073993859926e-05, "loss": 0.1557, "step": 6931 }, { "epoch": 1.4486938349007314, "grad_norm": 0.9887768582192202, "learning_rate": 1.7746360610922565e-05, "loss": 0.1278, "step": 6932 }, { "epoch": 1.4489028213166144, "grad_norm": 1.205603546093584, "learning_rate": 1.7745647129400522e-05, "loss": 0.1973, "step": 6933 }, { "epoch": 1.4491118077324974, "grad_norm": 1.105381641006468, "learning_rate": 1.774493354930288e-05, "loss": 0.2033, "step": 6934 }, { "epoch": 1.4493207941483803, "grad_norm": 1.296832763631852, "learning_rate": 1.7744219870638716e-05, "loss": 0.1648, "step": 6935 }, { "epoch": 1.4495297805642633, "grad_norm": 1.0511266716037886, "learning_rate": 1.7743506093417116e-05, "loss": 0.1257, "step": 6936 }, { "epoch": 1.4497387669801463, "grad_norm": 1.137154973653541, "learning_rate": 1.7742792217647166e-05, "loss": 0.1757, "step": 6937 }, { "epoch": 1.4499477533960292, "grad_norm": 0.9990798730486198, "learning_rate": 1.7742078243337943e-05, "loss": 0.1727, "step": 6938 }, { "epoch": 1.4501567398119122, "grad_norm": 0.9861647170626143, "learning_rate": 1.7741364170498544e-05, "loss": 0.1468, "step": 6939 }, { "epoch": 1.4503657262277951, "grad_norm": 0.8726620038317878, "learning_rate": 1.774064999913805e-05, "loss": 0.1752, "step": 6940 }, { "epoch": 1.450574712643678, "grad_norm": 1.2781647810654573, "learning_rate": 1.773993572926555e-05, "loss": 0.1758, "step": 6941 }, { "epoch": 1.450783699059561, "grad_norm": 1.156719265706107, "learning_rate": 1.7739221360890143e-05, "loss": 0.1797, "step": 6942 }, { "epoch": 1.450992685475444, "grad_norm": 1.034534907171092, "learning_rate": 1.7738506894020907e-05, "loss": 0.1705, "step": 6943 }, { "epoch": 1.451201671891327, "grad_norm": 1.098114147891286, "learning_rate": 1.7737792328666944e-05, "loss": 0.1997, "step": 6944 }, { "epoch": 1.4514106583072102, "grad_norm": 0.9588803975673618, "learning_rate": 1.7737077664837343e-05, "loss": 0.1519, "step": 6945 }, { "epoch": 1.4516196447230931, "grad_norm": 1.0717369189213226, "learning_rate": 1.7736362902541203e-05, "loss": 0.1882, "step": 6946 }, { "epoch": 1.451828631138976, "grad_norm": 1.152353297534812, "learning_rate": 1.7735648041787613e-05, "loss": 0.157, "step": 6947 }, { "epoch": 1.452037617554859, "grad_norm": 1.1545921526880458, "learning_rate": 1.7734933082585684e-05, "loss": 0.1572, "step": 6948 }, { "epoch": 1.452246603970742, "grad_norm": 1.0222416247323105, "learning_rate": 1.7734218024944507e-05, "loss": 0.1724, "step": 6949 }, { "epoch": 1.452455590386625, "grad_norm": 0.9982458633940737, "learning_rate": 1.7733502868873178e-05, "loss": 0.1942, "step": 6950 }, { "epoch": 1.452664576802508, "grad_norm": 0.8702823434148338, "learning_rate": 1.7732787614380805e-05, "loss": 0.1634, "step": 6951 }, { "epoch": 1.452873563218391, "grad_norm": 0.9748112043559046, "learning_rate": 1.773207226147649e-05, "loss": 0.1567, "step": 6952 }, { "epoch": 1.4530825496342739, "grad_norm": 0.8764784789528834, "learning_rate": 1.7731356810169335e-05, "loss": 0.1567, "step": 6953 }, { "epoch": 1.4532915360501568, "grad_norm": 0.9797842732504385, "learning_rate": 1.773064126046845e-05, "loss": 0.1414, "step": 6954 }, { "epoch": 1.4535005224660398, "grad_norm": 1.1060698104112228, "learning_rate": 1.7729925612382933e-05, "loss": 0.1797, "step": 6955 }, { "epoch": 1.4537095088819227, "grad_norm": 1.1910898233710723, "learning_rate": 1.7729209865921895e-05, "loss": 0.1812, "step": 6956 }, { "epoch": 1.4539184952978057, "grad_norm": 1.0198833499694397, "learning_rate": 1.7728494021094453e-05, "loss": 0.128, "step": 6957 }, { "epoch": 1.4541274817136887, "grad_norm": 1.368151035602307, "learning_rate": 1.7727778077909704e-05, "loss": 0.1585, "step": 6958 }, { "epoch": 1.4543364681295716, "grad_norm": 1.0503683742116752, "learning_rate": 1.772706203637677e-05, "loss": 0.165, "step": 6959 }, { "epoch": 1.4545454545454546, "grad_norm": 1.102552354175227, "learning_rate": 1.772634589650476e-05, "loss": 0.1722, "step": 6960 }, { "epoch": 1.4547544409613375, "grad_norm": 1.0933500287872802, "learning_rate": 1.7725629658302786e-05, "loss": 0.1944, "step": 6961 }, { "epoch": 1.4549634273772205, "grad_norm": 1.184669487769667, "learning_rate": 1.772491332177997e-05, "loss": 0.1328, "step": 6962 }, { "epoch": 1.4551724137931035, "grad_norm": 0.9596047477133137, "learning_rate": 1.772419688694542e-05, "loss": 0.1546, "step": 6963 }, { "epoch": 1.4553814002089864, "grad_norm": 1.135893784206236, "learning_rate": 1.7723480353808257e-05, "loss": 0.1784, "step": 6964 }, { "epoch": 1.4555903866248694, "grad_norm": 0.9465514313791835, "learning_rate": 1.77227637223776e-05, "loss": 0.1731, "step": 6965 }, { "epoch": 1.4557993730407524, "grad_norm": 0.9886370141296557, "learning_rate": 1.7722046992662573e-05, "loss": 0.1756, "step": 6966 }, { "epoch": 1.4560083594566353, "grad_norm": 1.1531744120399081, "learning_rate": 1.7721330164672293e-05, "loss": 0.1388, "step": 6967 }, { "epoch": 1.4562173458725183, "grad_norm": 1.0298025233305466, "learning_rate": 1.772061323841588e-05, "loss": 0.1474, "step": 6968 }, { "epoch": 1.4564263322884012, "grad_norm": 0.8826141636544402, "learning_rate": 1.771989621390247e-05, "loss": 0.143, "step": 6969 }, { "epoch": 1.4566353187042842, "grad_norm": 0.9548953371454875, "learning_rate": 1.7719179091141173e-05, "loss": 0.1685, "step": 6970 }, { "epoch": 1.4568443051201672, "grad_norm": 1.325285476412104, "learning_rate": 1.7718461870141127e-05, "loss": 0.1845, "step": 6971 }, { "epoch": 1.4570532915360501, "grad_norm": 1.025667862452698, "learning_rate": 1.7717744550911454e-05, "loss": 0.1847, "step": 6972 }, { "epoch": 1.457262277951933, "grad_norm": 0.8066817619239632, "learning_rate": 1.7717027133461287e-05, "loss": 0.1412, "step": 6973 }, { "epoch": 1.457471264367816, "grad_norm": 0.9921899217811709, "learning_rate": 1.7716309617799753e-05, "loss": 0.1689, "step": 6974 }, { "epoch": 1.457680250783699, "grad_norm": 0.9786185025460125, "learning_rate": 1.7715592003935984e-05, "loss": 0.2006, "step": 6975 }, { "epoch": 1.457889237199582, "grad_norm": 1.0100908868080876, "learning_rate": 1.7714874291879115e-05, "loss": 0.1715, "step": 6976 }, { "epoch": 1.458098223615465, "grad_norm": 1.0431054291039177, "learning_rate": 1.7714156481638275e-05, "loss": 0.217, "step": 6977 }, { "epoch": 1.458307210031348, "grad_norm": 0.82747725639474, "learning_rate": 1.7713438573222604e-05, "loss": 0.1796, "step": 6978 }, { "epoch": 1.4585161964472309, "grad_norm": 0.990331059165826, "learning_rate": 1.771272056664124e-05, "loss": 0.1642, "step": 6979 }, { "epoch": 1.4587251828631138, "grad_norm": 0.9732421414844198, "learning_rate": 1.7712002461903314e-05, "loss": 0.188, "step": 6980 }, { "epoch": 1.4589341692789968, "grad_norm": 1.1164562359242987, "learning_rate": 1.771128425901797e-05, "loss": 0.1594, "step": 6981 }, { "epoch": 1.4591431556948797, "grad_norm": 1.1622118616422326, "learning_rate": 1.7710565957994345e-05, "loss": 0.1809, "step": 6982 }, { "epoch": 1.4593521421107627, "grad_norm": 1.1963913678237479, "learning_rate": 1.7709847558841588e-05, "loss": 0.1803, "step": 6983 }, { "epoch": 1.4595611285266457, "grad_norm": 1.046543771476697, "learning_rate": 1.770912906156883e-05, "loss": 0.1515, "step": 6984 }, { "epoch": 1.4597701149425286, "grad_norm": 1.2735695501136493, "learning_rate": 1.770841046618523e-05, "loss": 0.1652, "step": 6985 }, { "epoch": 1.4599791013584116, "grad_norm": 1.0019601955892126, "learning_rate": 1.7707691772699916e-05, "loss": 0.2017, "step": 6986 }, { "epoch": 1.4601880877742945, "grad_norm": 0.9217861394217123, "learning_rate": 1.7706972981122046e-05, "loss": 0.1676, "step": 6987 }, { "epoch": 1.4603970741901775, "grad_norm": 1.0830385779998761, "learning_rate": 1.7706254091460765e-05, "loss": 0.1591, "step": 6988 }, { "epoch": 1.4606060606060607, "grad_norm": 0.9091397719621107, "learning_rate": 1.7705535103725223e-05, "loss": 0.1586, "step": 6989 }, { "epoch": 1.4608150470219436, "grad_norm": 1.0400003091479078, "learning_rate": 1.770481601792457e-05, "loss": 0.2061, "step": 6990 }, { "epoch": 1.4610240334378266, "grad_norm": 1.115411946880189, "learning_rate": 1.7704096834067957e-05, "loss": 0.1896, "step": 6991 }, { "epoch": 1.4612330198537096, "grad_norm": 1.0876802559237582, "learning_rate": 1.7703377552164537e-05, "loss": 0.1779, "step": 6992 }, { "epoch": 1.4614420062695925, "grad_norm": 1.0025450372013673, "learning_rate": 1.770265817222346e-05, "loss": 0.1856, "step": 6993 }, { "epoch": 1.4616509926854755, "grad_norm": 1.067500172556301, "learning_rate": 1.7701938694253887e-05, "loss": 0.1389, "step": 6994 }, { "epoch": 1.4618599791013585, "grad_norm": 1.1665182781704586, "learning_rate": 1.770121911826497e-05, "loss": 0.1663, "step": 6995 }, { "epoch": 1.4620689655172414, "grad_norm": 0.9424102155111709, "learning_rate": 1.7700499444265872e-05, "loss": 0.1449, "step": 6996 }, { "epoch": 1.4622779519331244, "grad_norm": 1.0273772326015733, "learning_rate": 1.7699779672265746e-05, "loss": 0.1855, "step": 6997 }, { "epoch": 1.4624869383490073, "grad_norm": 0.9118307722225865, "learning_rate": 1.7699059802273757e-05, "loss": 0.1598, "step": 6998 }, { "epoch": 1.4626959247648903, "grad_norm": 1.0394708672593687, "learning_rate": 1.7698339834299064e-05, "loss": 0.1961, "step": 6999 }, { "epoch": 1.4629049111807733, "grad_norm": 1.0371949058144427, "learning_rate": 1.769761976835083e-05, "loss": 0.1755, "step": 7000 }, { "epoch": 1.4631138975966562, "grad_norm": 0.7964419804660198, "learning_rate": 1.769689960443822e-05, "loss": 0.1367, "step": 7001 }, { "epoch": 1.4633228840125392, "grad_norm": 0.9576009900873217, "learning_rate": 1.76961793425704e-05, "loss": 0.1641, "step": 7002 }, { "epoch": 1.4635318704284221, "grad_norm": 1.1067701793427434, "learning_rate": 1.7695458982756535e-05, "loss": 0.1432, "step": 7003 }, { "epoch": 1.463740856844305, "grad_norm": 1.0230021532765452, "learning_rate": 1.769473852500579e-05, "loss": 0.1784, "step": 7004 }, { "epoch": 1.463949843260188, "grad_norm": 1.095318894993927, "learning_rate": 1.769401796932734e-05, "loss": 0.1693, "step": 7005 }, { "epoch": 1.464158829676071, "grad_norm": 0.986262941360399, "learning_rate": 1.769329731573035e-05, "loss": 0.1712, "step": 7006 }, { "epoch": 1.464367816091954, "grad_norm": 0.9998992492876613, "learning_rate": 1.7692576564223993e-05, "loss": 0.1463, "step": 7007 }, { "epoch": 1.464576802507837, "grad_norm": 1.073793393502007, "learning_rate": 1.7691855714817443e-05, "loss": 0.1552, "step": 7008 }, { "epoch": 1.46478578892372, "grad_norm": 1.1104424946580773, "learning_rate": 1.7691134767519874e-05, "loss": 0.1771, "step": 7009 }, { "epoch": 1.4649947753396029, "grad_norm": 1.0425162056524622, "learning_rate": 1.769041372234046e-05, "loss": 0.1559, "step": 7010 }, { "epoch": 1.4652037617554858, "grad_norm": 0.9282462766584495, "learning_rate": 1.7689692579288376e-05, "loss": 0.1534, "step": 7011 }, { "epoch": 1.4654127481713688, "grad_norm": 1.051857846325487, "learning_rate": 1.76889713383728e-05, "loss": 0.1594, "step": 7012 }, { "epoch": 1.4656217345872518, "grad_norm": 1.016756788925453, "learning_rate": 1.7688249999602916e-05, "loss": 0.1581, "step": 7013 }, { "epoch": 1.4658307210031347, "grad_norm": 0.9543596320606355, "learning_rate": 1.76875285629879e-05, "loss": 0.1536, "step": 7014 }, { "epoch": 1.466039707419018, "grad_norm": 1.442079713340222, "learning_rate": 1.7686807028536934e-05, "loss": 0.1968, "step": 7015 }, { "epoch": 1.4662486938349009, "grad_norm": 1.2529988419830886, "learning_rate": 1.7686085396259196e-05, "loss": 0.178, "step": 7016 }, { "epoch": 1.4664576802507838, "grad_norm": 1.0732268906822005, "learning_rate": 1.768536366616388e-05, "loss": 0.1785, "step": 7017 }, { "epoch": 1.4666666666666668, "grad_norm": 1.0322809852594272, "learning_rate": 1.7684641838260165e-05, "loss": 0.1755, "step": 7018 }, { "epoch": 1.4668756530825497, "grad_norm": 0.9825291302188791, "learning_rate": 1.7683919912557238e-05, "loss": 0.1659, "step": 7019 }, { "epoch": 1.4670846394984327, "grad_norm": 1.058103396800712, "learning_rate": 1.7683197889064283e-05, "loss": 0.1748, "step": 7020 }, { "epoch": 1.4672936259143157, "grad_norm": 1.1377170782964006, "learning_rate": 1.7682475767790498e-05, "loss": 0.1797, "step": 7021 }, { "epoch": 1.4675026123301986, "grad_norm": 1.0643117184313922, "learning_rate": 1.7681753548745062e-05, "loss": 0.1598, "step": 7022 }, { "epoch": 1.4677115987460816, "grad_norm": 1.0689065004061502, "learning_rate": 1.7681031231937174e-05, "loss": 0.1709, "step": 7023 }, { "epoch": 1.4679205851619646, "grad_norm": 1.053637636456673, "learning_rate": 1.7680308817376026e-05, "loss": 0.1442, "step": 7024 }, { "epoch": 1.4681295715778475, "grad_norm": 1.1011061665709385, "learning_rate": 1.767958630507081e-05, "loss": 0.1452, "step": 7025 }, { "epoch": 1.4683385579937305, "grad_norm": 1.1047984260016424, "learning_rate": 1.7678863695030723e-05, "loss": 0.1666, "step": 7026 }, { "epoch": 1.4685475444096134, "grad_norm": 0.8750878434109144, "learning_rate": 1.767814098726496e-05, "loss": 0.1756, "step": 7027 }, { "epoch": 1.4687565308254964, "grad_norm": 1.0058043508049097, "learning_rate": 1.7677418181782717e-05, "loss": 0.1698, "step": 7028 }, { "epoch": 1.4689655172413794, "grad_norm": 1.2165344831623686, "learning_rate": 1.7676695278593198e-05, "loss": 0.2114, "step": 7029 }, { "epoch": 1.4691745036572623, "grad_norm": 1.4185942614401732, "learning_rate": 1.7675972277705594e-05, "loss": 0.2304, "step": 7030 }, { "epoch": 1.4693834900731453, "grad_norm": 0.9583428939221414, "learning_rate": 1.7675249179129115e-05, "loss": 0.1548, "step": 7031 }, { "epoch": 1.4695924764890282, "grad_norm": 0.8767954593711389, "learning_rate": 1.767452598287296e-05, "loss": 0.1719, "step": 7032 }, { "epoch": 1.4698014629049112, "grad_norm": 0.9279765926565253, "learning_rate": 1.7673802688946332e-05, "loss": 0.1687, "step": 7033 }, { "epoch": 1.4700104493207942, "grad_norm": 1.0625965023996096, "learning_rate": 1.767307929735844e-05, "loss": 0.1606, "step": 7034 }, { "epoch": 1.4702194357366771, "grad_norm": 0.9152165227301905, "learning_rate": 1.7672355808118486e-05, "loss": 0.1397, "step": 7035 }, { "epoch": 1.47042842215256, "grad_norm": 1.138327520849892, "learning_rate": 1.767163222123568e-05, "loss": 0.1793, "step": 7036 }, { "epoch": 1.470637408568443, "grad_norm": 1.2130400538434285, "learning_rate": 1.767090853671923e-05, "loss": 0.2092, "step": 7037 }, { "epoch": 1.470846394984326, "grad_norm": 0.9375087667600029, "learning_rate": 1.7670184754578344e-05, "loss": 0.1422, "step": 7038 }, { "epoch": 1.471055381400209, "grad_norm": 1.1793047545667377, "learning_rate": 1.7669460874822236e-05, "loss": 0.1752, "step": 7039 }, { "epoch": 1.471264367816092, "grad_norm": 1.027245313932558, "learning_rate": 1.7668736897460123e-05, "loss": 0.1901, "step": 7040 }, { "epoch": 1.471473354231975, "grad_norm": 1.075574318269837, "learning_rate": 1.7668012822501207e-05, "loss": 0.1334, "step": 7041 }, { "epoch": 1.4716823406478579, "grad_norm": 1.0763498366175601, "learning_rate": 1.7667288649954713e-05, "loss": 0.1652, "step": 7042 }, { "epoch": 1.4718913270637408, "grad_norm": 0.9187546024386246, "learning_rate": 1.7666564379829854e-05, "loss": 0.1933, "step": 7043 }, { "epoch": 1.4721003134796238, "grad_norm": 1.1075016090037224, "learning_rate": 1.7665840012135843e-05, "loss": 0.1635, "step": 7044 }, { "epoch": 1.4723092998955067, "grad_norm": 1.2914615552286943, "learning_rate": 1.766511554688191e-05, "loss": 0.162, "step": 7045 }, { "epoch": 1.4725182863113897, "grad_norm": 1.0605164089081882, "learning_rate": 1.7664390984077263e-05, "loss": 0.1921, "step": 7046 }, { "epoch": 1.4727272727272727, "grad_norm": 1.021662660640921, "learning_rate": 1.766366632373113e-05, "loss": 0.167, "step": 7047 }, { "epoch": 1.4729362591431556, "grad_norm": 1.2037966437630991, "learning_rate": 1.766294156585273e-05, "loss": 0.1705, "step": 7048 }, { "epoch": 1.4731452455590386, "grad_norm": 1.0778582800368381, "learning_rate": 1.766221671045129e-05, "loss": 0.1619, "step": 7049 }, { "epoch": 1.4733542319749215, "grad_norm": 1.0651802442386282, "learning_rate": 1.7661491757536028e-05, "loss": 0.1764, "step": 7050 }, { "epoch": 1.4735632183908045, "grad_norm": 1.0256631391383293, "learning_rate": 1.7660766707116185e-05, "loss": 0.187, "step": 7051 }, { "epoch": 1.4737722048066875, "grad_norm": 0.9483789022792005, "learning_rate": 1.766004155920097e-05, "loss": 0.187, "step": 7052 }, { "epoch": 1.4739811912225704, "grad_norm": 0.9856756922884371, "learning_rate": 1.7659316313799624e-05, "loss": 0.1632, "step": 7053 }, { "epoch": 1.4741901776384534, "grad_norm": 1.0392168526551417, "learning_rate": 1.7658590970921375e-05, "loss": 0.1448, "step": 7054 }, { "epoch": 1.4743991640543364, "grad_norm": 0.9609072774377156, "learning_rate": 1.765786553057545e-05, "loss": 0.1594, "step": 7055 }, { "epoch": 1.4746081504702193, "grad_norm": 4.186129013668475, "learning_rate": 1.7657139992771086e-05, "loss": 0.1996, "step": 7056 }, { "epoch": 1.4748171368861023, "grad_norm": 1.1002898330449522, "learning_rate": 1.7656414357517514e-05, "loss": 0.1805, "step": 7057 }, { "epoch": 1.4750261233019852, "grad_norm": 1.1028619423694614, "learning_rate": 1.765568862482397e-05, "loss": 0.2071, "step": 7058 }, { "epoch": 1.4752351097178684, "grad_norm": 1.1182069234778444, "learning_rate": 1.7654962794699683e-05, "loss": 0.1196, "step": 7059 }, { "epoch": 1.4754440961337514, "grad_norm": 1.22638144892273, "learning_rate": 1.7654236867153903e-05, "loss": 0.1267, "step": 7060 }, { "epoch": 1.4756530825496343, "grad_norm": 1.0486383087725089, "learning_rate": 1.765351084219586e-05, "loss": 0.2007, "step": 7061 }, { "epoch": 1.4758620689655173, "grad_norm": 0.9282511468282811, "learning_rate": 1.76527847198348e-05, "loss": 0.1616, "step": 7062 }, { "epoch": 1.4760710553814003, "grad_norm": 0.9440601235910155, "learning_rate": 1.765205850007996e-05, "loss": 0.1907, "step": 7063 }, { "epoch": 1.4762800417972832, "grad_norm": 0.8050536094376516, "learning_rate": 1.7651332182940576e-05, "loss": 0.1592, "step": 7064 }, { "epoch": 1.4764890282131662, "grad_norm": 1.0891149311647226, "learning_rate": 1.7650605768425905e-05, "loss": 0.166, "step": 7065 }, { "epoch": 1.4766980146290491, "grad_norm": 1.2370258426058776, "learning_rate": 1.7649879256545183e-05, "loss": 0.1542, "step": 7066 }, { "epoch": 1.476907001044932, "grad_norm": 0.8503659861059601, "learning_rate": 1.7649152647307656e-05, "loss": 0.1255, "step": 7067 }, { "epoch": 1.477115987460815, "grad_norm": 1.1214534626906674, "learning_rate": 1.7648425940722574e-05, "loss": 0.186, "step": 7068 }, { "epoch": 1.477324973876698, "grad_norm": 1.3181958363384711, "learning_rate": 1.7647699136799183e-05, "loss": 0.1674, "step": 7069 }, { "epoch": 1.477533960292581, "grad_norm": 0.9864346349096441, "learning_rate": 1.7646972235546736e-05, "loss": 0.1957, "step": 7070 }, { "epoch": 1.477742946708464, "grad_norm": 1.0339653246023577, "learning_rate": 1.7646245236974483e-05, "loss": 0.1941, "step": 7071 }, { "epoch": 1.477951933124347, "grad_norm": 1.1005214496458469, "learning_rate": 1.7645518141091676e-05, "loss": 0.205, "step": 7072 }, { "epoch": 1.4781609195402299, "grad_norm": 1.1060990469330703, "learning_rate": 1.7644790947907565e-05, "loss": 0.1569, "step": 7073 }, { "epoch": 1.4783699059561128, "grad_norm": 1.0524009683195195, "learning_rate": 1.764406365743141e-05, "loss": 0.1474, "step": 7074 }, { "epoch": 1.4785788923719958, "grad_norm": 1.7894587590811, "learning_rate": 1.7643336269672464e-05, "loss": 0.1499, "step": 7075 }, { "epoch": 1.4787878787878788, "grad_norm": 1.0356674904648344, "learning_rate": 1.7642608784639983e-05, "loss": 0.1729, "step": 7076 }, { "epoch": 1.4789968652037617, "grad_norm": 1.1931486865544483, "learning_rate": 1.764188120234323e-05, "loss": 0.1963, "step": 7077 }, { "epoch": 1.4792058516196447, "grad_norm": 0.9390733787100528, "learning_rate": 1.764115352279146e-05, "loss": 0.1727, "step": 7078 }, { "epoch": 1.4794148380355276, "grad_norm": 1.0201973552991843, "learning_rate": 1.7640425745993934e-05, "loss": 0.1449, "step": 7079 }, { "epoch": 1.4796238244514106, "grad_norm": 1.186622992243711, "learning_rate": 1.763969787195992e-05, "loss": 0.1818, "step": 7080 }, { "epoch": 1.4798328108672936, "grad_norm": 0.855167034700089, "learning_rate": 1.7638969900698675e-05, "loss": 0.18, "step": 7081 }, { "epoch": 1.4800417972831765, "grad_norm": 1.0322684651527503, "learning_rate": 1.763824183221946e-05, "loss": 0.172, "step": 7082 }, { "epoch": 1.4802507836990595, "grad_norm": 1.0361721542200666, "learning_rate": 1.7637513666531555e-05, "loss": 0.1791, "step": 7083 }, { "epoch": 1.4804597701149425, "grad_norm": 1.0518534243416209, "learning_rate": 1.7636785403644216e-05, "loss": 0.1602, "step": 7084 }, { "epoch": 1.4806687565308254, "grad_norm": 1.0679280821942134, "learning_rate": 1.7636057043566714e-05, "loss": 0.1748, "step": 7085 }, { "epoch": 1.4808777429467086, "grad_norm": 1.5503787884513558, "learning_rate": 1.7635328586308315e-05, "loss": 0.1794, "step": 7086 }, { "epoch": 1.4810867293625916, "grad_norm": 1.1374062920752848, "learning_rate": 1.7634600031878297e-05, "loss": 0.1629, "step": 7087 }, { "epoch": 1.4812957157784745, "grad_norm": 1.5746317529387428, "learning_rate": 1.7633871380285926e-05, "loss": 0.1628, "step": 7088 }, { "epoch": 1.4815047021943575, "grad_norm": 1.1417876709753025, "learning_rate": 1.7633142631540478e-05, "loss": 0.1552, "step": 7089 }, { "epoch": 1.4817136886102404, "grad_norm": 1.0440588469677177, "learning_rate": 1.7632413785651227e-05, "loss": 0.1701, "step": 7090 }, { "epoch": 1.4819226750261234, "grad_norm": 0.8806788383194623, "learning_rate": 1.763168484262745e-05, "loss": 0.1585, "step": 7091 }, { "epoch": 1.4821316614420064, "grad_norm": 0.9437986350813452, "learning_rate": 1.7630955802478423e-05, "loss": 0.1577, "step": 7092 }, { "epoch": 1.4823406478578893, "grad_norm": 0.9642727978484634, "learning_rate": 1.763022666521342e-05, "loss": 0.1733, "step": 7093 }, { "epoch": 1.4825496342737723, "grad_norm": 0.8641105449484862, "learning_rate": 1.7629497430841726e-05, "loss": 0.1672, "step": 7094 }, { "epoch": 1.4827586206896552, "grad_norm": 0.9620257703147633, "learning_rate": 1.7628768099372617e-05, "loss": 0.1895, "step": 7095 }, { "epoch": 1.4829676071055382, "grad_norm": 0.8479043538597243, "learning_rate": 1.762803867081538e-05, "loss": 0.1553, "step": 7096 }, { "epoch": 1.4831765935214212, "grad_norm": 0.9057048576303905, "learning_rate": 1.76273091451793e-05, "loss": 0.1851, "step": 7097 }, { "epoch": 1.4833855799373041, "grad_norm": 0.9735752126819746, "learning_rate": 1.762657952247365e-05, "loss": 0.2128, "step": 7098 }, { "epoch": 1.483594566353187, "grad_norm": 0.9982261852539545, "learning_rate": 1.7625849802707727e-05, "loss": 0.1709, "step": 7099 }, { "epoch": 1.48380355276907, "grad_norm": 1.0709553857896266, "learning_rate": 1.7625119985890813e-05, "loss": 0.2036, "step": 7100 }, { "epoch": 1.484012539184953, "grad_norm": 0.9388265066519667, "learning_rate": 1.7624390072032196e-05, "loss": 0.1777, "step": 7101 }, { "epoch": 1.484221525600836, "grad_norm": 0.9970166127042567, "learning_rate": 1.7623660061141165e-05, "loss": 0.1493, "step": 7102 }, { "epoch": 1.484430512016719, "grad_norm": 0.9740329701004089, "learning_rate": 1.7622929953227012e-05, "loss": 0.152, "step": 7103 }, { "epoch": 1.484639498432602, "grad_norm": 1.232741716068484, "learning_rate": 1.7622199748299032e-05, "loss": 0.1901, "step": 7104 }, { "epoch": 1.4848484848484849, "grad_norm": 1.0798286488513669, "learning_rate": 1.762146944636651e-05, "loss": 0.1768, "step": 7105 }, { "epoch": 1.4850574712643678, "grad_norm": 1.1208011376549185, "learning_rate": 1.7620739047438747e-05, "loss": 0.1673, "step": 7106 }, { "epoch": 1.4852664576802508, "grad_norm": 1.1420958365728169, "learning_rate": 1.7620008551525035e-05, "loss": 0.2198, "step": 7107 }, { "epoch": 1.4854754440961337, "grad_norm": 1.2736934532138606, "learning_rate": 1.7619277958634674e-05, "loss": 0.1843, "step": 7108 }, { "epoch": 1.4856844305120167, "grad_norm": 1.0251130438354914, "learning_rate": 1.7618547268776957e-05, "loss": 0.1578, "step": 7109 }, { "epoch": 1.4858934169278997, "grad_norm": 0.9332874302997695, "learning_rate": 1.7617816481961184e-05, "loss": 0.1559, "step": 7110 }, { "epoch": 1.4861024033437826, "grad_norm": 1.7760455730459717, "learning_rate": 1.761708559819666e-05, "loss": 0.1781, "step": 7111 }, { "epoch": 1.4863113897596656, "grad_norm": 0.9483731215968015, "learning_rate": 1.7616354617492684e-05, "loss": 0.1647, "step": 7112 }, { "epoch": 1.4865203761755486, "grad_norm": 1.050812406847346, "learning_rate": 1.761562353985856e-05, "loss": 0.1614, "step": 7113 }, { "epoch": 1.4867293625914315, "grad_norm": 1.0031857144829874, "learning_rate": 1.761489236530359e-05, "loss": 0.1683, "step": 7114 }, { "epoch": 1.4869383490073145, "grad_norm": 0.9758532878409705, "learning_rate": 1.761416109383708e-05, "loss": 0.1794, "step": 7115 }, { "epoch": 1.4871473354231974, "grad_norm": 1.6731661646574412, "learning_rate": 1.7613429725468336e-05, "loss": 0.1998, "step": 7116 }, { "epoch": 1.4873563218390804, "grad_norm": 1.078774738099218, "learning_rate": 1.7612698260206668e-05, "loss": 0.1762, "step": 7117 }, { "epoch": 1.4875653082549634, "grad_norm": 0.9156699856509998, "learning_rate": 1.7611966698061384e-05, "loss": 0.1808, "step": 7118 }, { "epoch": 1.4877742946708463, "grad_norm": 1.0373129467821607, "learning_rate": 1.7611235039041796e-05, "loss": 0.1994, "step": 7119 }, { "epoch": 1.4879832810867293, "grad_norm": 0.8368134527329704, "learning_rate": 1.7610503283157207e-05, "loss": 0.1598, "step": 7120 }, { "epoch": 1.4881922675026122, "grad_norm": 0.9716591236145794, "learning_rate": 1.7609771430416942e-05, "loss": 0.1661, "step": 7121 }, { "epoch": 1.4884012539184952, "grad_norm": 0.9807060703419972, "learning_rate": 1.7609039480830308e-05, "loss": 0.1824, "step": 7122 }, { "epoch": 1.4886102403343782, "grad_norm": 1.032011940752455, "learning_rate": 1.7608307434406618e-05, "loss": 0.1822, "step": 7123 }, { "epoch": 1.4888192267502611, "grad_norm": 0.9873501181813208, "learning_rate": 1.7607575291155198e-05, "loss": 0.1544, "step": 7124 }, { "epoch": 1.489028213166144, "grad_norm": 1.0672012957922836, "learning_rate": 1.7606843051085356e-05, "loss": 0.208, "step": 7125 }, { "epoch": 1.489237199582027, "grad_norm": 1.0377310698989173, "learning_rate": 1.7606110714206416e-05, "loss": 0.1303, "step": 7126 }, { "epoch": 1.48944618599791, "grad_norm": 1.174898673959352, "learning_rate": 1.7605378280527696e-05, "loss": 0.1685, "step": 7127 }, { "epoch": 1.489655172413793, "grad_norm": 1.0292446209481605, "learning_rate": 1.7604645750058517e-05, "loss": 0.1607, "step": 7128 }, { "epoch": 1.489864158829676, "grad_norm": 0.9816278147255345, "learning_rate": 1.7603913122808205e-05, "loss": 0.2153, "step": 7129 }, { "epoch": 1.4900731452455591, "grad_norm": 1.0544650362755326, "learning_rate": 1.760318039878608e-05, "loss": 0.1639, "step": 7130 }, { "epoch": 1.490282131661442, "grad_norm": 0.8823353930380728, "learning_rate": 1.7602447578001468e-05, "loss": 0.1519, "step": 7131 }, { "epoch": 1.490491118077325, "grad_norm": 1.3383468178512008, "learning_rate": 1.7601714660463698e-05, "loss": 0.1993, "step": 7132 }, { "epoch": 1.490700104493208, "grad_norm": 0.996620118516546, "learning_rate": 1.7600981646182097e-05, "loss": 0.1596, "step": 7133 }, { "epoch": 1.490909090909091, "grad_norm": 1.458459230657165, "learning_rate": 1.760024853516599e-05, "loss": 0.1933, "step": 7134 }, { "epoch": 1.491118077324974, "grad_norm": 0.9320289412035301, "learning_rate": 1.7599515327424707e-05, "loss": 0.1683, "step": 7135 }, { "epoch": 1.4913270637408569, "grad_norm": 1.1035696526545498, "learning_rate": 1.7598782022967582e-05, "loss": 0.1789, "step": 7136 }, { "epoch": 1.4915360501567398, "grad_norm": 1.9272532416531176, "learning_rate": 1.759804862180395e-05, "loss": 0.1818, "step": 7137 }, { "epoch": 1.4917450365726228, "grad_norm": 0.9571181883726898, "learning_rate": 1.759731512394314e-05, "loss": 0.164, "step": 7138 }, { "epoch": 1.4919540229885058, "grad_norm": 1.1239933801540203, "learning_rate": 1.759658152939449e-05, "loss": 0.1787, "step": 7139 }, { "epoch": 1.4921630094043887, "grad_norm": 0.9169109918347509, "learning_rate": 1.759584783816733e-05, "loss": 0.1506, "step": 7140 }, { "epoch": 1.4923719958202717, "grad_norm": 1.0792014973715762, "learning_rate": 1.7595114050271008e-05, "loss": 0.1422, "step": 7141 }, { "epoch": 1.4925809822361547, "grad_norm": 1.0223263682341015, "learning_rate": 1.759438016571485e-05, "loss": 0.1601, "step": 7142 }, { "epoch": 1.4927899686520376, "grad_norm": 0.9862424299207039, "learning_rate": 1.7593646184508207e-05, "loss": 0.1796, "step": 7143 }, { "epoch": 1.4929989550679206, "grad_norm": 1.0441685132765273, "learning_rate": 1.7592912106660413e-05, "loss": 0.1547, "step": 7144 }, { "epoch": 1.4932079414838035, "grad_norm": 1.287496461137004, "learning_rate": 1.7592177932180817e-05, "loss": 0.1701, "step": 7145 }, { "epoch": 1.4934169278996865, "grad_norm": 0.9932543270315172, "learning_rate": 1.7591443661078756e-05, "loss": 0.1569, "step": 7146 }, { "epoch": 1.4936259143155695, "grad_norm": 1.0612882479456314, "learning_rate": 1.759070929336358e-05, "loss": 0.1542, "step": 7147 }, { "epoch": 1.4938349007314524, "grad_norm": 1.0378843443882468, "learning_rate": 1.7589974829044625e-05, "loss": 0.1831, "step": 7148 }, { "epoch": 1.4940438871473354, "grad_norm": 1.0843790283715946, "learning_rate": 1.758924026813125e-05, "loss": 0.1953, "step": 7149 }, { "epoch": 1.4942528735632183, "grad_norm": 0.9800218998057243, "learning_rate": 1.7588505610632797e-05, "loss": 0.1923, "step": 7150 }, { "epoch": 1.4944618599791013, "grad_norm": 1.073778655492492, "learning_rate": 1.7587770856558618e-05, "loss": 0.1789, "step": 7151 }, { "epoch": 1.4946708463949843, "grad_norm": 1.032053942911761, "learning_rate": 1.7587036005918068e-05, "loss": 0.1793, "step": 7152 }, { "epoch": 1.4948798328108672, "grad_norm": 1.8398989038394387, "learning_rate": 1.758630105872049e-05, "loss": 0.1837, "step": 7153 }, { "epoch": 1.4950888192267502, "grad_norm": 0.9192986626369358, "learning_rate": 1.758556601497524e-05, "loss": 0.1694, "step": 7154 }, { "epoch": 1.4952978056426331, "grad_norm": 1.0584412157368304, "learning_rate": 1.758483087469168e-05, "loss": 0.1828, "step": 7155 }, { "epoch": 1.4955067920585163, "grad_norm": 0.9291260923158293, "learning_rate": 1.7584095637879153e-05, "loss": 0.1808, "step": 7156 }, { "epoch": 1.4957157784743993, "grad_norm": 0.9799560836226701, "learning_rate": 1.758336030454703e-05, "loss": 0.1932, "step": 7157 }, { "epoch": 1.4959247648902823, "grad_norm": 0.9588221250299588, "learning_rate": 1.7582624874704662e-05, "loss": 0.1715, "step": 7158 }, { "epoch": 1.4961337513061652, "grad_norm": 1.0692789698938896, "learning_rate": 1.7581889348361407e-05, "loss": 0.1789, "step": 7159 }, { "epoch": 1.4963427377220482, "grad_norm": 0.942901402157395, "learning_rate": 1.758115372552663e-05, "loss": 0.1998, "step": 7160 }, { "epoch": 1.4965517241379311, "grad_norm": 1.0354511744138875, "learning_rate": 1.7580418006209688e-05, "loss": 0.1714, "step": 7161 }, { "epoch": 1.496760710553814, "grad_norm": 1.0800074651403635, "learning_rate": 1.757968219041995e-05, "loss": 0.1926, "step": 7162 }, { "epoch": 1.496969696969697, "grad_norm": 1.0199230832747026, "learning_rate": 1.7578946278166773e-05, "loss": 0.1628, "step": 7163 }, { "epoch": 1.49717868338558, "grad_norm": 1.3087276572999074, "learning_rate": 1.7578210269459532e-05, "loss": 0.1774, "step": 7164 }, { "epoch": 1.497387669801463, "grad_norm": 0.9981351193600417, "learning_rate": 1.7577474164307584e-05, "loss": 0.1869, "step": 7165 }, { "epoch": 1.497596656217346, "grad_norm": 0.8641102074386258, "learning_rate": 1.7576737962720306e-05, "loss": 0.1293, "step": 7166 }, { "epoch": 1.497805642633229, "grad_norm": 1.0168829985242973, "learning_rate": 1.7576001664707064e-05, "loss": 0.1912, "step": 7167 }, { "epoch": 1.4980146290491119, "grad_norm": 1.0816538766433552, "learning_rate": 1.7575265270277224e-05, "loss": 0.1567, "step": 7168 }, { "epoch": 1.4982236154649948, "grad_norm": 1.098281760200767, "learning_rate": 1.7574528779440164e-05, "loss": 0.1772, "step": 7169 }, { "epoch": 1.4984326018808778, "grad_norm": 1.1398505732523847, "learning_rate": 1.7573792192205256e-05, "loss": 0.2048, "step": 7170 }, { "epoch": 1.4986415882967608, "grad_norm": 1.0316958267069776, "learning_rate": 1.7573055508581868e-05, "loss": 0.1758, "step": 7171 }, { "epoch": 1.4988505747126437, "grad_norm": 0.9311749982636502, "learning_rate": 1.7572318728579385e-05, "loss": 0.1975, "step": 7172 }, { "epoch": 1.4990595611285267, "grad_norm": 0.9821884270233388, "learning_rate": 1.757158185220718e-05, "loss": 0.1754, "step": 7173 }, { "epoch": 1.4992685475444096, "grad_norm": 1.2266495812516545, "learning_rate": 1.757084487947463e-05, "loss": 0.1542, "step": 7174 }, { "epoch": 1.4994775339602926, "grad_norm": 1.0241185062016234, "learning_rate": 1.757010781039111e-05, "loss": 0.1591, "step": 7175 }, { "epoch": 1.4996865203761756, "grad_norm": 1.1101712777634551, "learning_rate": 1.7569370644966007e-05, "loss": 0.1783, "step": 7176 }, { "epoch": 1.4998955067920585, "grad_norm": 1.5444495416307342, "learning_rate": 1.75686333832087e-05, "loss": 0.1913, "step": 7177 }, { "epoch": 1.5001044932079415, "grad_norm": 1.2247782411616088, "learning_rate": 1.756789602512857e-05, "loss": 0.1377, "step": 7178 }, { "epoch": 1.5003134796238244, "grad_norm": 1.292748141253059, "learning_rate": 1.7567158570735006e-05, "loss": 0.2082, "step": 7179 }, { "epoch": 1.5005224660397074, "grad_norm": 0.9055591846455574, "learning_rate": 1.756642102003739e-05, "loss": 0.1548, "step": 7180 }, { "epoch": 1.5007314524555904, "grad_norm": 1.0254443879896264, "learning_rate": 1.7565683373045108e-05, "loss": 0.1768, "step": 7181 }, { "epoch": 1.5009404388714733, "grad_norm": 1.1391511490507744, "learning_rate": 1.756494562976755e-05, "loss": 0.1857, "step": 7182 }, { "epoch": 1.5011494252873563, "grad_norm": 1.184763315454141, "learning_rate": 1.7564207790214102e-05, "loss": 0.1654, "step": 7183 }, { "epoch": 1.5013584117032392, "grad_norm": 0.9000224878001124, "learning_rate": 1.756346985439416e-05, "loss": 0.1497, "step": 7184 }, { "epoch": 1.5015673981191222, "grad_norm": 0.927565606489144, "learning_rate": 1.7562731822317104e-05, "loss": 0.1849, "step": 7185 }, { "epoch": 1.5017763845350052, "grad_norm": 1.0852851914545278, "learning_rate": 1.7561993693992335e-05, "loss": 0.208, "step": 7186 }, { "epoch": 1.5019853709508881, "grad_norm": 0.8566122747907736, "learning_rate": 1.756125546942925e-05, "loss": 0.1662, "step": 7187 }, { "epoch": 1.502194357366771, "grad_norm": 1.1685899960039932, "learning_rate": 1.7560517148637235e-05, "loss": 0.148, "step": 7188 }, { "epoch": 1.502403343782654, "grad_norm": 0.9177211522732833, "learning_rate": 1.755977873162569e-05, "loss": 0.1622, "step": 7189 }, { "epoch": 1.502612330198537, "grad_norm": 1.1323247863734835, "learning_rate": 1.755904021840402e-05, "loss": 0.177, "step": 7190 }, { "epoch": 1.50282131661442, "grad_norm": 1.0399194129612057, "learning_rate": 1.7558301608981612e-05, "loss": 0.1508, "step": 7191 }, { "epoch": 1.503030303030303, "grad_norm": 0.9531020948397224, "learning_rate": 1.755756290336787e-05, "loss": 0.1769, "step": 7192 }, { "epoch": 1.503239289446186, "grad_norm": 1.1257461037243675, "learning_rate": 1.75568241015722e-05, "loss": 0.1798, "step": 7193 }, { "epoch": 1.5034482758620689, "grad_norm": 1.1433407559163806, "learning_rate": 1.7556085203604e-05, "loss": 0.1437, "step": 7194 }, { "epoch": 1.5036572622779518, "grad_norm": 0.9446849664559351, "learning_rate": 1.7555346209472675e-05, "loss": 0.1399, "step": 7195 }, { "epoch": 1.5038662486938348, "grad_norm": 0.9837827849749177, "learning_rate": 1.7554607119187628e-05, "loss": 0.1692, "step": 7196 }, { "epoch": 1.5040752351097177, "grad_norm": 1.1155882390271499, "learning_rate": 1.7553867932758266e-05, "loss": 0.1737, "step": 7197 }, { "epoch": 1.5042842215256007, "grad_norm": 1.0506389942080767, "learning_rate": 1.7553128650194e-05, "loss": 0.196, "step": 7198 }, { "epoch": 1.5044932079414837, "grad_norm": 0.7778107598277394, "learning_rate": 1.7552389271504232e-05, "loss": 0.1322, "step": 7199 }, { "epoch": 1.5047021943573666, "grad_norm": 0.874827099662043, "learning_rate": 1.755164979669837e-05, "loss": 0.1428, "step": 7200 }, { "epoch": 1.5049111807732496, "grad_norm": 0.9953145054614104, "learning_rate": 1.7550910225785837e-05, "loss": 0.1645, "step": 7201 }, { "epoch": 1.5051201671891326, "grad_norm": 1.0037103916266796, "learning_rate": 1.7550170558776036e-05, "loss": 0.1919, "step": 7202 }, { "epoch": 1.5053291536050155, "grad_norm": 1.0339550506088124, "learning_rate": 1.7549430795678384e-05, "loss": 0.1652, "step": 7203 }, { "epoch": 1.5055381400208987, "grad_norm": 0.9380228452079747, "learning_rate": 1.7548690936502293e-05, "loss": 0.1382, "step": 7204 }, { "epoch": 1.5057471264367817, "grad_norm": 1.0645832539606688, "learning_rate": 1.754795098125718e-05, "loss": 0.1489, "step": 7205 }, { "epoch": 1.5059561128526646, "grad_norm": 0.8633445022148056, "learning_rate": 1.7547210929952463e-05, "loss": 0.162, "step": 7206 }, { "epoch": 1.5061650992685476, "grad_norm": 0.9902096676267459, "learning_rate": 1.754647078259756e-05, "loss": 0.1986, "step": 7207 }, { "epoch": 1.5063740856844305, "grad_norm": 0.8761440291845732, "learning_rate": 1.754573053920189e-05, "loss": 0.1438, "step": 7208 }, { "epoch": 1.5065830721003135, "grad_norm": 0.9112485137630576, "learning_rate": 1.754499019977487e-05, "loss": 0.1559, "step": 7209 }, { "epoch": 1.5067920585161965, "grad_norm": 0.8816097754182795, "learning_rate": 1.754424976432593e-05, "loss": 0.1178, "step": 7210 }, { "epoch": 1.5070010449320794, "grad_norm": 0.934219507104477, "learning_rate": 1.754350923286449e-05, "loss": 0.1747, "step": 7211 }, { "epoch": 1.5072100313479624, "grad_norm": 0.9006002518929224, "learning_rate": 1.754276860539997e-05, "loss": 0.1612, "step": 7212 }, { "epoch": 1.5074190177638453, "grad_norm": 1.0574774013476174, "learning_rate": 1.75420278819418e-05, "loss": 0.1873, "step": 7213 }, { "epoch": 1.5076280041797283, "grad_norm": 1.0639895128114099, "learning_rate": 1.754128706249941e-05, "loss": 0.1695, "step": 7214 }, { "epoch": 1.5078369905956113, "grad_norm": 0.8831990930474221, "learning_rate": 1.7540546147082218e-05, "loss": 0.1599, "step": 7215 }, { "epoch": 1.5080459770114942, "grad_norm": 0.9256661254690804, "learning_rate": 1.7539805135699663e-05, "loss": 0.1749, "step": 7216 }, { "epoch": 1.5082549634273772, "grad_norm": 1.0047231546546915, "learning_rate": 1.753906402836117e-05, "loss": 0.1573, "step": 7217 }, { "epoch": 1.5084639498432602, "grad_norm": 0.9444388393468384, "learning_rate": 1.7538322825076174e-05, "loss": 0.1835, "step": 7218 }, { "epoch": 1.5086729362591431, "grad_norm": 0.9181722041506131, "learning_rate": 1.753758152585411e-05, "loss": 0.132, "step": 7219 }, { "epoch": 1.508881922675026, "grad_norm": 1.0689007765118028, "learning_rate": 1.7536840130704405e-05, "loss": 0.1873, "step": 7220 }, { "epoch": 1.509090909090909, "grad_norm": 1.048401959230566, "learning_rate": 1.7536098639636497e-05, "loss": 0.1791, "step": 7221 }, { "epoch": 1.5092998955067922, "grad_norm": 0.9129175404545959, "learning_rate": 1.753535705265983e-05, "loss": 0.1826, "step": 7222 }, { "epoch": 1.5095088819226752, "grad_norm": 1.0154019605820206, "learning_rate": 1.7534615369783828e-05, "loss": 0.1492, "step": 7223 }, { "epoch": 1.5097178683385581, "grad_norm": 1.1567084122227929, "learning_rate": 1.753387359101794e-05, "loss": 0.1833, "step": 7224 }, { "epoch": 1.509926854754441, "grad_norm": 0.9942899703299294, "learning_rate": 1.753313171637161e-05, "loss": 0.1662, "step": 7225 }, { "epoch": 1.510135841170324, "grad_norm": 1.1802785542547443, "learning_rate": 1.753238974585427e-05, "loss": 0.1543, "step": 7226 }, { "epoch": 1.510344827586207, "grad_norm": 0.9130236936453249, "learning_rate": 1.7531647679475368e-05, "loss": 0.1418, "step": 7227 }, { "epoch": 1.51055381400209, "grad_norm": 0.9879004635041714, "learning_rate": 1.7530905517244344e-05, "loss": 0.1847, "step": 7228 }, { "epoch": 1.510762800417973, "grad_norm": 1.205345563035932, "learning_rate": 1.7530163259170648e-05, "loss": 0.1527, "step": 7229 }, { "epoch": 1.510971786833856, "grad_norm": 0.9013907733893696, "learning_rate": 1.752942090526372e-05, "loss": 0.1532, "step": 7230 }, { "epoch": 1.5111807732497389, "grad_norm": 0.9683470107694794, "learning_rate": 1.7528678455533015e-05, "loss": 0.1617, "step": 7231 }, { "epoch": 1.5113897596656218, "grad_norm": 1.0538157088939992, "learning_rate": 1.7527935909987978e-05, "loss": 0.1913, "step": 7232 }, { "epoch": 1.5115987460815048, "grad_norm": 0.9153071080910871, "learning_rate": 1.752719326863806e-05, "loss": 0.1512, "step": 7233 }, { "epoch": 1.5118077324973878, "grad_norm": 0.9446370208749936, "learning_rate": 1.7526450531492712e-05, "loss": 0.1523, "step": 7234 }, { "epoch": 1.5120167189132707, "grad_norm": 1.1209297325688048, "learning_rate": 1.7525707698561383e-05, "loss": 0.201, "step": 7235 }, { "epoch": 1.5122257053291537, "grad_norm": 0.8163546364087558, "learning_rate": 1.752496476985354e-05, "loss": 0.1748, "step": 7236 }, { "epoch": 1.5124346917450366, "grad_norm": 0.8664348658376223, "learning_rate": 1.7524221745378617e-05, "loss": 0.1652, "step": 7237 }, { "epoch": 1.5126436781609196, "grad_norm": 1.183802094107577, "learning_rate": 1.7523478625146086e-05, "loss": 0.1534, "step": 7238 }, { "epoch": 1.5128526645768026, "grad_norm": 1.0014951542285577, "learning_rate": 1.75227354091654e-05, "loss": 0.1661, "step": 7239 }, { "epoch": 1.5130616509926855, "grad_norm": 0.8583908216544415, "learning_rate": 1.7521992097446015e-05, "loss": 0.165, "step": 7240 }, { "epoch": 1.5132706374085685, "grad_norm": 0.9200951954296608, "learning_rate": 1.7521248689997398e-05, "loss": 0.1796, "step": 7241 }, { "epoch": 1.5134796238244514, "grad_norm": 0.9718397216842135, "learning_rate": 1.7520505186829e-05, "loss": 0.1708, "step": 7242 }, { "epoch": 1.5136886102403344, "grad_norm": 0.9706909533969308, "learning_rate": 1.751976158795029e-05, "loss": 0.1612, "step": 7243 }, { "epoch": 1.5138975966562174, "grad_norm": 0.9997254365231534, "learning_rate": 1.7519017893370728e-05, "loss": 0.1553, "step": 7244 }, { "epoch": 1.5141065830721003, "grad_norm": 1.1348954476118218, "learning_rate": 1.7518274103099787e-05, "loss": 0.1764, "step": 7245 }, { "epoch": 1.5143155694879833, "grad_norm": 0.9500112149018668, "learning_rate": 1.751753021714692e-05, "loss": 0.1536, "step": 7246 }, { "epoch": 1.5145245559038663, "grad_norm": 1.148270192453807, "learning_rate": 1.75167862355216e-05, "loss": 0.191, "step": 7247 }, { "epoch": 1.5147335423197492, "grad_norm": 0.9099872792481715, "learning_rate": 1.75160421582333e-05, "loss": 0.1497, "step": 7248 }, { "epoch": 1.5149425287356322, "grad_norm": 1.0252308336359706, "learning_rate": 1.7515297985291482e-05, "loss": 0.1751, "step": 7249 }, { "epoch": 1.5151515151515151, "grad_norm": 0.9428103102940296, "learning_rate": 1.7514553716705624e-05, "loss": 0.1511, "step": 7250 }, { "epoch": 1.515360501567398, "grad_norm": 0.8697395996404976, "learning_rate": 1.751380935248519e-05, "loss": 0.15, "step": 7251 }, { "epoch": 1.515569487983281, "grad_norm": 0.8489216086658652, "learning_rate": 1.751306489263966e-05, "loss": 0.1651, "step": 7252 }, { "epoch": 1.515778474399164, "grad_norm": 0.9965632234780333, "learning_rate": 1.7512320337178507e-05, "loss": 0.197, "step": 7253 }, { "epoch": 1.515987460815047, "grad_norm": 0.8643845401712232, "learning_rate": 1.75115756861112e-05, "loss": 0.1474, "step": 7254 }, { "epoch": 1.51619644723093, "grad_norm": 0.9762278805595886, "learning_rate": 1.7510830939447227e-05, "loss": 0.1496, "step": 7255 }, { "epoch": 1.516405433646813, "grad_norm": 1.1047431130183518, "learning_rate": 1.7510086097196057e-05, "loss": 0.2128, "step": 7256 }, { "epoch": 1.5166144200626959, "grad_norm": 1.0304753751770077, "learning_rate": 1.7509341159367176e-05, "loss": 0.1789, "step": 7257 }, { "epoch": 1.5168234064785788, "grad_norm": 0.8318169365347095, "learning_rate": 1.750859612597006e-05, "loss": 0.1802, "step": 7258 }, { "epoch": 1.5170323928944618, "grad_norm": 0.9517211658671006, "learning_rate": 1.750785099701419e-05, "loss": 0.1664, "step": 7259 }, { "epoch": 1.5172413793103448, "grad_norm": 0.939988489171989, "learning_rate": 1.750710577250905e-05, "loss": 0.1433, "step": 7260 }, { "epoch": 1.5174503657262277, "grad_norm": 0.9319661349279984, "learning_rate": 1.750636045246413e-05, "loss": 0.1646, "step": 7261 }, { "epoch": 1.5176593521421107, "grad_norm": 1.2198161939891845, "learning_rate": 1.7505615036888907e-05, "loss": 0.1692, "step": 7262 }, { "epoch": 1.5178683385579936, "grad_norm": 0.9854336654579386, "learning_rate": 1.7504869525792874e-05, "loss": 0.1449, "step": 7263 }, { "epoch": 1.5180773249738766, "grad_norm": 0.9305836196148791, "learning_rate": 1.7504123919185514e-05, "loss": 0.1586, "step": 7264 }, { "epoch": 1.5182863113897596, "grad_norm": 1.1332413793600749, "learning_rate": 1.7503378217076315e-05, "loss": 0.1788, "step": 7265 }, { "epoch": 1.5184952978056425, "grad_norm": 0.9861386710331631, "learning_rate": 1.7502632419474773e-05, "loss": 0.1585, "step": 7266 }, { "epoch": 1.5187042842215255, "grad_norm": 0.9899279191035427, "learning_rate": 1.7501886526390378e-05, "loss": 0.178, "step": 7267 }, { "epoch": 1.5189132706374084, "grad_norm": 0.8689675043878854, "learning_rate": 1.750114053783262e-05, "loss": 0.1491, "step": 7268 }, { "epoch": 1.5191222570532914, "grad_norm": 1.0040528641355717, "learning_rate": 1.7500394453810993e-05, "loss": 0.1706, "step": 7269 }, { "epoch": 1.5193312434691744, "grad_norm": 1.1395417823170355, "learning_rate": 1.7499648274334994e-05, "loss": 0.1591, "step": 7270 }, { "epoch": 1.5195402298850573, "grad_norm": 1.062487564484764, "learning_rate": 1.7498901999414116e-05, "loss": 0.1805, "step": 7271 }, { "epoch": 1.5197492163009403, "grad_norm": 0.8842170499482149, "learning_rate": 1.7498155629057863e-05, "loss": 0.1437, "step": 7272 }, { "epoch": 1.5199582027168232, "grad_norm": 0.914374585119474, "learning_rate": 1.749740916327573e-05, "loss": 0.1699, "step": 7273 }, { "epoch": 1.5201671891327062, "grad_norm": 0.8772496379391768, "learning_rate": 1.7496662602077213e-05, "loss": 0.1735, "step": 7274 }, { "epoch": 1.5203761755485894, "grad_norm": 1.23714848220984, "learning_rate": 1.749591594547182e-05, "loss": 0.2062, "step": 7275 }, { "epoch": 1.5205851619644724, "grad_norm": 0.9234176879133252, "learning_rate": 1.7495169193469047e-05, "loss": 0.1671, "step": 7276 }, { "epoch": 1.5207941483803553, "grad_norm": 0.9228365503672513, "learning_rate": 1.7494422346078404e-05, "loss": 0.1878, "step": 7277 }, { "epoch": 1.5210031347962383, "grad_norm": 0.8915889207170467, "learning_rate": 1.749367540330939e-05, "loss": 0.1544, "step": 7278 }, { "epoch": 1.5212121212121212, "grad_norm": 1.0045668912590802, "learning_rate": 1.749292836517152e-05, "loss": 0.1646, "step": 7279 }, { "epoch": 1.5214211076280042, "grad_norm": 0.9057808947527825, "learning_rate": 1.749218123167429e-05, "loss": 0.1676, "step": 7280 }, { "epoch": 1.5216300940438872, "grad_norm": 1.089410347862233, "learning_rate": 1.749143400282721e-05, "loss": 0.1765, "step": 7281 }, { "epoch": 1.5218390804597701, "grad_norm": 1.0148728630094466, "learning_rate": 1.7490686678639796e-05, "loss": 0.1931, "step": 7282 }, { "epoch": 1.522048066875653, "grad_norm": 0.8247839284162616, "learning_rate": 1.7489939259121558e-05, "loss": 0.1525, "step": 7283 }, { "epoch": 1.522257053291536, "grad_norm": 1.092563709391597, "learning_rate": 1.7489191744282003e-05, "loss": 0.1815, "step": 7284 }, { "epoch": 1.522466039707419, "grad_norm": 1.0906633988938876, "learning_rate": 1.7488444134130653e-05, "loss": 0.1761, "step": 7285 }, { "epoch": 1.522675026123302, "grad_norm": 0.910906434651185, "learning_rate": 1.7487696428677013e-05, "loss": 0.1776, "step": 7286 }, { "epoch": 1.522884012539185, "grad_norm": 1.0665848152439898, "learning_rate": 1.74869486279306e-05, "loss": 0.1509, "step": 7287 }, { "epoch": 1.523092998955068, "grad_norm": 1.0674050912971473, "learning_rate": 1.7486200731900938e-05, "loss": 0.2078, "step": 7288 }, { "epoch": 1.5233019853709509, "grad_norm": 0.7983254383389865, "learning_rate": 1.7485452740597537e-05, "loss": 0.1655, "step": 7289 }, { "epoch": 1.5235109717868338, "grad_norm": 1.1104345474771398, "learning_rate": 1.7484704654029924e-05, "loss": 0.1967, "step": 7290 }, { "epoch": 1.5237199582027168, "grad_norm": 1.0348227369909604, "learning_rate": 1.7483956472207614e-05, "loss": 0.1995, "step": 7291 }, { "epoch": 1.5239289446186, "grad_norm": 0.7431560510236771, "learning_rate": 1.748320819514013e-05, "loss": 0.1615, "step": 7292 }, { "epoch": 1.524137931034483, "grad_norm": 0.9871578444122676, "learning_rate": 1.7482459822836996e-05, "loss": 0.1618, "step": 7293 }, { "epoch": 1.5243469174503659, "grad_norm": 0.9732859462886143, "learning_rate": 1.7481711355307735e-05, "loss": 0.1706, "step": 7294 }, { "epoch": 1.5245559038662488, "grad_norm": 1.1953350385483679, "learning_rate": 1.7480962792561875e-05, "loss": 0.1772, "step": 7295 }, { "epoch": 1.5247648902821318, "grad_norm": 1.003996540107343, "learning_rate": 1.748021413460894e-05, "loss": 0.1583, "step": 7296 }, { "epoch": 1.5249738766980148, "grad_norm": 0.9169130364378786, "learning_rate": 1.7479465381458457e-05, "loss": 0.1691, "step": 7297 }, { "epoch": 1.5251828631138977, "grad_norm": 0.8510781686973131, "learning_rate": 1.7478716533119958e-05, "loss": 0.163, "step": 7298 }, { "epoch": 1.5253918495297807, "grad_norm": 1.039773240535439, "learning_rate": 1.7477967589602973e-05, "loss": 0.1794, "step": 7299 }, { "epoch": 1.5256008359456636, "grad_norm": 0.9601498221063602, "learning_rate": 1.747721855091703e-05, "loss": 0.1754, "step": 7300 }, { "epoch": 1.5258098223615466, "grad_norm": 0.9239638496073087, "learning_rate": 1.7476469417071667e-05, "loss": 0.1799, "step": 7301 }, { "epoch": 1.5260188087774296, "grad_norm": 0.9223546897559028, "learning_rate": 1.7475720188076413e-05, "loss": 0.1449, "step": 7302 }, { "epoch": 1.5262277951933125, "grad_norm": 1.0483045259173571, "learning_rate": 1.7474970863940805e-05, "loss": 0.1779, "step": 7303 }, { "epoch": 1.5264367816091955, "grad_norm": 0.9335752368883742, "learning_rate": 1.747422144467438e-05, "loss": 0.1526, "step": 7304 }, { "epoch": 1.5266457680250785, "grad_norm": 1.0819391019270341, "learning_rate": 1.747347193028668e-05, "loss": 0.1881, "step": 7305 }, { "epoch": 1.5268547544409614, "grad_norm": 0.95882013732896, "learning_rate": 1.7472722320787233e-05, "loss": 0.1537, "step": 7306 }, { "epoch": 1.5270637408568444, "grad_norm": 1.083533134589123, "learning_rate": 1.7471972616185584e-05, "loss": 0.1642, "step": 7307 }, { "epoch": 1.5272727272727273, "grad_norm": 0.994277556282052, "learning_rate": 1.7471222816491277e-05, "loss": 0.1754, "step": 7308 }, { "epoch": 1.5274817136886103, "grad_norm": 0.9439087057550706, "learning_rate": 1.747047292171385e-05, "loss": 0.1322, "step": 7309 }, { "epoch": 1.5276907001044933, "grad_norm": 1.0677887543894877, "learning_rate": 1.7469722931862856e-05, "loss": 0.156, "step": 7310 }, { "epoch": 1.5278996865203762, "grad_norm": 1.012517960078499, "learning_rate": 1.7468972846947826e-05, "loss": 0.1656, "step": 7311 }, { "epoch": 1.5281086729362592, "grad_norm": 1.1671242881891302, "learning_rate": 1.7468222666978317e-05, "loss": 0.1885, "step": 7312 }, { "epoch": 1.5283176593521421, "grad_norm": 0.8606335732260331, "learning_rate": 1.7467472391963868e-05, "loss": 0.1426, "step": 7313 }, { "epoch": 1.528526645768025, "grad_norm": 0.8659485576769301, "learning_rate": 1.7466722021914037e-05, "loss": 0.1517, "step": 7314 }, { "epoch": 1.528735632183908, "grad_norm": 0.8458290753452422, "learning_rate": 1.7465971556838364e-05, "loss": 0.1584, "step": 7315 }, { "epoch": 1.528944618599791, "grad_norm": 1.0132740147686676, "learning_rate": 1.7465220996746405e-05, "loss": 0.1914, "step": 7316 }, { "epoch": 1.529153605015674, "grad_norm": 0.8747710949641223, "learning_rate": 1.746447034164771e-05, "loss": 0.1424, "step": 7317 }, { "epoch": 1.529362591431557, "grad_norm": 1.0988267063446657, "learning_rate": 1.7463719591551834e-05, "loss": 0.1669, "step": 7318 }, { "epoch": 1.52957157784744, "grad_norm": 1.0244569380554287, "learning_rate": 1.746296874646833e-05, "loss": 0.1668, "step": 7319 }, { "epoch": 1.5297805642633229, "grad_norm": 1.120370706407219, "learning_rate": 1.7462217806406757e-05, "loss": 0.1666, "step": 7320 }, { "epoch": 1.5299895506792058, "grad_norm": 1.182057850111655, "learning_rate": 1.7461466771376668e-05, "loss": 0.1963, "step": 7321 }, { "epoch": 1.5301985370950888, "grad_norm": 1.162219018022273, "learning_rate": 1.7460715641387623e-05, "loss": 0.1804, "step": 7322 }, { "epoch": 1.5304075235109718, "grad_norm": 1.0354896695141682, "learning_rate": 1.745996441644918e-05, "loss": 0.1493, "step": 7323 }, { "epoch": 1.5306165099268547, "grad_norm": 0.9340677599701053, "learning_rate": 1.74592130965709e-05, "loss": 0.1353, "step": 7324 }, { "epoch": 1.5308254963427377, "grad_norm": 1.376353393209823, "learning_rate": 1.7458461681762344e-05, "loss": 0.1942, "step": 7325 }, { "epoch": 1.5310344827586206, "grad_norm": 1.2202159704067224, "learning_rate": 1.7457710172033077e-05, "loss": 0.1988, "step": 7326 }, { "epoch": 1.5312434691745036, "grad_norm": 0.8877535195366626, "learning_rate": 1.7456958567392664e-05, "loss": 0.145, "step": 7327 }, { "epoch": 1.5314524555903866, "grad_norm": 1.0455037225593105, "learning_rate": 1.7456206867850666e-05, "loss": 0.1833, "step": 7328 }, { "epoch": 1.5316614420062695, "grad_norm": 0.8422281909159539, "learning_rate": 1.7455455073416655e-05, "loss": 0.1756, "step": 7329 }, { "epoch": 1.5318704284221525, "grad_norm": 0.9330100167304903, "learning_rate": 1.7454703184100194e-05, "loss": 0.1393, "step": 7330 }, { "epoch": 1.5320794148380354, "grad_norm": 1.087437897160297, "learning_rate": 1.7453951199910853e-05, "loss": 0.1799, "step": 7331 }, { "epoch": 1.5322884012539184, "grad_norm": 1.1587937515989701, "learning_rate": 1.7453199120858203e-05, "loss": 0.1725, "step": 7332 }, { "epoch": 1.5324973876698014, "grad_norm": 0.9329427362130772, "learning_rate": 1.7452446946951815e-05, "loss": 0.1377, "step": 7333 }, { "epoch": 1.5327063740856843, "grad_norm": 1.1929685980422684, "learning_rate": 1.7451694678201264e-05, "loss": 0.1642, "step": 7334 }, { "epoch": 1.5329153605015673, "grad_norm": 1.1023053365337996, "learning_rate": 1.745094231461612e-05, "loss": 0.1588, "step": 7335 }, { "epoch": 1.5331243469174503, "grad_norm": 1.1366382625962705, "learning_rate": 1.7450189856205963e-05, "loss": 0.1669, "step": 7336 }, { "epoch": 1.5333333333333332, "grad_norm": 1.0987838245627142, "learning_rate": 1.7449437302980362e-05, "loss": 0.159, "step": 7337 }, { "epoch": 1.5335423197492162, "grad_norm": 1.0839540337311206, "learning_rate": 1.74486846549489e-05, "loss": 0.1968, "step": 7338 }, { "epoch": 1.5337513061650991, "grad_norm": 1.088287881627444, "learning_rate": 1.7447931912121155e-05, "loss": 0.1499, "step": 7339 }, { "epoch": 1.533960292580982, "grad_norm": 0.9518255531832797, "learning_rate": 1.7447179074506706e-05, "loss": 0.171, "step": 7340 }, { "epoch": 1.534169278996865, "grad_norm": 0.9207825388900021, "learning_rate": 1.7446426142115134e-05, "loss": 0.1865, "step": 7341 }, { "epoch": 1.534378265412748, "grad_norm": 0.9355951767856903, "learning_rate": 1.7445673114956023e-05, "loss": 0.1783, "step": 7342 }, { "epoch": 1.534587251828631, "grad_norm": 0.9406483727938504, "learning_rate": 1.744491999303895e-05, "loss": 0.195, "step": 7343 }, { "epoch": 1.534796238244514, "grad_norm": 1.0734964244224237, "learning_rate": 1.7444166776373505e-05, "loss": 0.1439, "step": 7344 }, { "epoch": 1.5350052246603971, "grad_norm": 0.8242885347804755, "learning_rate": 1.7443413464969275e-05, "loss": 0.1503, "step": 7345 }, { "epoch": 1.53521421107628, "grad_norm": 1.0319915672685338, "learning_rate": 1.7442660058835846e-05, "loss": 0.1836, "step": 7346 }, { "epoch": 1.535423197492163, "grad_norm": 0.9281713021083159, "learning_rate": 1.7441906557982808e-05, "loss": 0.1746, "step": 7347 }, { "epoch": 1.535632183908046, "grad_norm": 1.2333523732256526, "learning_rate": 1.7441152962419747e-05, "loss": 0.2014, "step": 7348 }, { "epoch": 1.535841170323929, "grad_norm": 0.9980516336500311, "learning_rate": 1.7440399272156247e-05, "loss": 0.1542, "step": 7349 }, { "epoch": 1.536050156739812, "grad_norm": 1.0658351413579623, "learning_rate": 1.7439645487201914e-05, "loss": 0.1984, "step": 7350 }, { "epoch": 1.536259143155695, "grad_norm": 0.9460314506271155, "learning_rate": 1.7438891607566337e-05, "loss": 0.1821, "step": 7351 }, { "epoch": 1.5364681295715779, "grad_norm": 0.938883578264947, "learning_rate": 1.7438137633259104e-05, "loss": 0.1841, "step": 7352 }, { "epoch": 1.5366771159874608, "grad_norm": 0.9576620410349322, "learning_rate": 1.7437383564289816e-05, "loss": 0.179, "step": 7353 }, { "epoch": 1.5368861024033438, "grad_norm": 0.9413538521719458, "learning_rate": 1.743662940066807e-05, "loss": 0.1463, "step": 7354 }, { "epoch": 1.5370950888192267, "grad_norm": 0.9700855907164523, "learning_rate": 1.743587514240346e-05, "loss": 0.1925, "step": 7355 }, { "epoch": 1.5373040752351097, "grad_norm": 0.9600971281956069, "learning_rate": 1.743512078950559e-05, "loss": 0.182, "step": 7356 }, { "epoch": 1.5375130616509927, "grad_norm": 1.0474213600320132, "learning_rate": 1.7434366341984054e-05, "loss": 0.1781, "step": 7357 }, { "epoch": 1.5377220480668756, "grad_norm": 1.113426793841519, "learning_rate": 1.743361179984846e-05, "loss": 0.1728, "step": 7358 }, { "epoch": 1.5379310344827586, "grad_norm": 1.06251383606282, "learning_rate": 1.7432857163108408e-05, "loss": 0.1613, "step": 7359 }, { "epoch": 1.5381400208986415, "grad_norm": 1.0605521889032552, "learning_rate": 1.7432102431773497e-05, "loss": 0.1858, "step": 7360 }, { "epoch": 1.5383490073145245, "grad_norm": 0.9667969040006386, "learning_rate": 1.7431347605853345e-05, "loss": 0.1775, "step": 7361 }, { "epoch": 1.5385579937304075, "grad_norm": 1.1077118025655461, "learning_rate": 1.7430592685357543e-05, "loss": 0.1829, "step": 7362 }, { "epoch": 1.5387669801462907, "grad_norm": 1.0539865826273547, "learning_rate": 1.7429837670295713e-05, "loss": 0.1398, "step": 7363 }, { "epoch": 1.5389759665621736, "grad_norm": 0.9209401940429027, "learning_rate": 1.7429082560677454e-05, "loss": 0.1495, "step": 7364 }, { "epoch": 1.5391849529780566, "grad_norm": 1.0179349897696672, "learning_rate": 1.7428327356512377e-05, "loss": 0.1519, "step": 7365 }, { "epoch": 1.5393939393939395, "grad_norm": 1.009930887475914, "learning_rate": 1.7427572057810093e-05, "loss": 0.1832, "step": 7366 }, { "epoch": 1.5396029258098225, "grad_norm": 1.27553913493884, "learning_rate": 1.7426816664580222e-05, "loss": 0.1743, "step": 7367 }, { "epoch": 1.5398119122257055, "grad_norm": 0.8742577602450144, "learning_rate": 1.742606117683237e-05, "loss": 0.1408, "step": 7368 }, { "epoch": 1.5400208986415884, "grad_norm": 1.0005546109185832, "learning_rate": 1.742530559457615e-05, "loss": 0.1281, "step": 7369 }, { "epoch": 1.5402298850574714, "grad_norm": 0.9956701509570297, "learning_rate": 1.7424549917821184e-05, "loss": 0.1682, "step": 7370 }, { "epoch": 1.5404388714733543, "grad_norm": 1.185654732142884, "learning_rate": 1.7423794146577087e-05, "loss": 0.1785, "step": 7371 }, { "epoch": 1.5406478578892373, "grad_norm": 0.8308440687702303, "learning_rate": 1.742303828085348e-05, "loss": 0.1591, "step": 7372 }, { "epoch": 1.5408568443051203, "grad_norm": 0.831374915387446, "learning_rate": 1.7422282320659977e-05, "loss": 0.1562, "step": 7373 }, { "epoch": 1.5410658307210032, "grad_norm": 1.1748737084878724, "learning_rate": 1.74215262660062e-05, "loss": 0.1846, "step": 7374 }, { "epoch": 1.5412748171368862, "grad_norm": 0.8796573770761066, "learning_rate": 1.7420770116901774e-05, "loss": 0.1539, "step": 7375 }, { "epoch": 1.5414838035527691, "grad_norm": 1.1159786609786293, "learning_rate": 1.7420013873356324e-05, "loss": 0.159, "step": 7376 }, { "epoch": 1.541692789968652, "grad_norm": 0.9854266502320506, "learning_rate": 1.7419257535379467e-05, "loss": 0.1626, "step": 7377 }, { "epoch": 1.541901776384535, "grad_norm": 1.146476693484399, "learning_rate": 1.741850110298083e-05, "loss": 0.1512, "step": 7378 }, { "epoch": 1.542110762800418, "grad_norm": 1.1148868201973703, "learning_rate": 1.7417744576170047e-05, "loss": 0.1507, "step": 7379 }, { "epoch": 1.542319749216301, "grad_norm": 0.9291604749053043, "learning_rate": 1.7416987954956743e-05, "loss": 0.1766, "step": 7380 }, { "epoch": 1.542528735632184, "grad_norm": 1.1138176089649952, "learning_rate": 1.7416231239350544e-05, "loss": 0.1907, "step": 7381 }, { "epoch": 1.542737722048067, "grad_norm": 1.0065169105859426, "learning_rate": 1.741547442936108e-05, "loss": 0.1707, "step": 7382 }, { "epoch": 1.5429467084639499, "grad_norm": 0.9311250326812276, "learning_rate": 1.7414717524997988e-05, "loss": 0.1567, "step": 7383 }, { "epoch": 1.5431556948798328, "grad_norm": 1.024489597570107, "learning_rate": 1.7413960526270894e-05, "loss": 0.1646, "step": 7384 }, { "epoch": 1.5433646812957158, "grad_norm": 1.1359208032620942, "learning_rate": 1.7413203433189437e-05, "loss": 0.1643, "step": 7385 }, { "epoch": 1.5435736677115988, "grad_norm": 0.8178303757331002, "learning_rate": 1.7412446245763254e-05, "loss": 0.1364, "step": 7386 }, { "epoch": 1.5437826541274817, "grad_norm": 1.1245798560946332, "learning_rate": 1.7411688964001975e-05, "loss": 0.1718, "step": 7387 }, { "epoch": 1.5439916405433647, "grad_norm": 1.0186630471538878, "learning_rate": 1.741093158791524e-05, "loss": 0.1726, "step": 7388 }, { "epoch": 1.5442006269592476, "grad_norm": 0.8958030519951227, "learning_rate": 1.741017411751269e-05, "loss": 0.1465, "step": 7389 }, { "epoch": 1.5444096133751306, "grad_norm": 1.0556099462688209, "learning_rate": 1.740941655280396e-05, "loss": 0.1368, "step": 7390 }, { "epoch": 1.5446185997910136, "grad_norm": 0.9962309287374065, "learning_rate": 1.74086588937987e-05, "loss": 0.1764, "step": 7391 }, { "epoch": 1.5448275862068965, "grad_norm": 0.9985512605492644, "learning_rate": 1.7407901140506546e-05, "loss": 0.1748, "step": 7392 }, { "epoch": 1.5450365726227795, "grad_norm": 1.120832542177519, "learning_rate": 1.740714329293714e-05, "loss": 0.1532, "step": 7393 }, { "epoch": 1.5452455590386625, "grad_norm": 1.170490367134838, "learning_rate": 1.740638535110013e-05, "loss": 0.2014, "step": 7394 }, { "epoch": 1.5454545454545454, "grad_norm": 1.0117651107067904, "learning_rate": 1.7405627315005165e-05, "loss": 0.1612, "step": 7395 }, { "epoch": 1.5456635318704284, "grad_norm": 0.887175909128715, "learning_rate": 1.740486918466189e-05, "loss": 0.1841, "step": 7396 }, { "epoch": 1.5458725182863113, "grad_norm": 1.0420916067455286, "learning_rate": 1.7404110960079945e-05, "loss": 0.1755, "step": 7397 }, { "epoch": 1.5460815047021943, "grad_norm": 0.9729563104097715, "learning_rate": 1.7403352641268992e-05, "loss": 0.1267, "step": 7398 }, { "epoch": 1.5462904911180773, "grad_norm": 0.9021207901961991, "learning_rate": 1.7402594228238678e-05, "loss": 0.1621, "step": 7399 }, { "epoch": 1.5464994775339602, "grad_norm": 0.9018551967032744, "learning_rate": 1.740183572099865e-05, "loss": 0.1512, "step": 7400 }, { "epoch": 1.5467084639498432, "grad_norm": 8.062125593686682, "learning_rate": 1.7401077119558564e-05, "loss": 0.1333, "step": 7401 }, { "epoch": 1.5469174503657261, "grad_norm": 0.9105045601952775, "learning_rate": 1.7400318423928077e-05, "loss": 0.1547, "step": 7402 }, { "epoch": 1.547126436781609, "grad_norm": 0.887230454430259, "learning_rate": 1.7399559634116843e-05, "loss": 0.1529, "step": 7403 }, { "epoch": 1.547335423197492, "grad_norm": 0.9738900704996983, "learning_rate": 1.739880075013452e-05, "loss": 0.1595, "step": 7404 }, { "epoch": 1.547544409613375, "grad_norm": 0.9159446182990533, "learning_rate": 1.7398041771990762e-05, "loss": 0.1393, "step": 7405 }, { "epoch": 1.547753396029258, "grad_norm": 0.9016303805436137, "learning_rate": 1.7397282699695233e-05, "loss": 0.1643, "step": 7406 }, { "epoch": 1.547962382445141, "grad_norm": 1.0771763556746248, "learning_rate": 1.739652353325759e-05, "loss": 0.1579, "step": 7407 }, { "epoch": 1.548171368861024, "grad_norm": 1.1243408076553707, "learning_rate": 1.7395764272687494e-05, "loss": 0.17, "step": 7408 }, { "epoch": 1.5483803552769069, "grad_norm": 1.2133804626680624, "learning_rate": 1.7395004917994613e-05, "loss": 0.1706, "step": 7409 }, { "epoch": 1.5485893416927898, "grad_norm": 0.933469318101345, "learning_rate": 1.7394245469188605e-05, "loss": 0.1763, "step": 7410 }, { "epoch": 1.5487983281086728, "grad_norm": 0.9833563293880587, "learning_rate": 1.7393485926279142e-05, "loss": 0.1928, "step": 7411 }, { "epoch": 1.5490073145245558, "grad_norm": 0.9893495850066566, "learning_rate": 1.739272628927588e-05, "loss": 0.1976, "step": 7412 }, { "epoch": 1.5492163009404387, "grad_norm": 1.211184922271612, "learning_rate": 1.7391966558188498e-05, "loss": 0.1513, "step": 7413 }, { "epoch": 1.5494252873563217, "grad_norm": 1.133611054272506, "learning_rate": 1.7391206733026656e-05, "loss": 0.1661, "step": 7414 }, { "epoch": 1.5496342737722049, "grad_norm": 1.0055956692822994, "learning_rate": 1.739044681380003e-05, "loss": 0.1446, "step": 7415 }, { "epoch": 1.5498432601880878, "grad_norm": 0.9287903158737701, "learning_rate": 1.7389686800518288e-05, "loss": 0.1451, "step": 7416 }, { "epoch": 1.5500522466039708, "grad_norm": 1.119463869434572, "learning_rate": 1.7388926693191103e-05, "loss": 0.1582, "step": 7417 }, { "epoch": 1.5502612330198537, "grad_norm": 1.2751138608722126, "learning_rate": 1.738816649182815e-05, "loss": 0.1904, "step": 7418 }, { "epoch": 1.5504702194357367, "grad_norm": 1.150014502457437, "learning_rate": 1.73874061964391e-05, "loss": 0.1822, "step": 7419 }, { "epoch": 1.5506792058516197, "grad_norm": 0.8717451011803184, "learning_rate": 1.7386645807033633e-05, "loss": 0.1492, "step": 7420 }, { "epoch": 1.5508881922675026, "grad_norm": 1.067492209793024, "learning_rate": 1.738588532362142e-05, "loss": 0.1521, "step": 7421 }, { "epoch": 1.5510971786833856, "grad_norm": 1.1373982701995298, "learning_rate": 1.738512474621215e-05, "loss": 0.1821, "step": 7422 }, { "epoch": 1.5513061650992686, "grad_norm": 1.1158751430359473, "learning_rate": 1.738436407481549e-05, "loss": 0.1912, "step": 7423 }, { "epoch": 1.5515151515151515, "grad_norm": 1.133231643338957, "learning_rate": 1.738360330944113e-05, "loss": 0.1775, "step": 7424 }, { "epoch": 1.5517241379310345, "grad_norm": 1.1873161338894693, "learning_rate": 1.7382842450098753e-05, "loss": 0.1841, "step": 7425 }, { "epoch": 1.5519331243469174, "grad_norm": 0.9191688188515208, "learning_rate": 1.7382081496798036e-05, "loss": 0.1743, "step": 7426 }, { "epoch": 1.5521421107628004, "grad_norm": 0.8944990642148007, "learning_rate": 1.7381320449548663e-05, "loss": 0.1553, "step": 7427 }, { "epoch": 1.5523510971786834, "grad_norm": 1.1137041969079993, "learning_rate": 1.7380559308360324e-05, "loss": 0.158, "step": 7428 }, { "epoch": 1.5525600835945663, "grad_norm": 0.985802276345922, "learning_rate": 1.7379798073242706e-05, "loss": 0.1581, "step": 7429 }, { "epoch": 1.5527690700104493, "grad_norm": 0.949013518149872, "learning_rate": 1.737903674420549e-05, "loss": 0.1733, "step": 7430 }, { "epoch": 1.5529780564263322, "grad_norm": 1.140626053434561, "learning_rate": 1.7378275321258375e-05, "loss": 0.1451, "step": 7431 }, { "epoch": 1.5531870428422152, "grad_norm": 0.9617241311905445, "learning_rate": 1.7377513804411042e-05, "loss": 0.179, "step": 7432 }, { "epoch": 1.5533960292580984, "grad_norm": 0.927468185209015, "learning_rate": 1.737675219367319e-05, "loss": 0.1662, "step": 7433 }, { "epoch": 1.5536050156739813, "grad_norm": 0.9253430079917044, "learning_rate": 1.7375990489054506e-05, "loss": 0.1824, "step": 7434 }, { "epoch": 1.5538140020898643, "grad_norm": 1.2119614993585508, "learning_rate": 1.7375228690564684e-05, "loss": 0.2031, "step": 7435 }, { "epoch": 1.5540229885057473, "grad_norm": 1.1311323259489157, "learning_rate": 1.737446679821343e-05, "loss": 0.1834, "step": 7436 }, { "epoch": 1.5542319749216302, "grad_norm": 1.0094425024112892, "learning_rate": 1.7373704812010422e-05, "loss": 0.1448, "step": 7437 }, { "epoch": 1.5544409613375132, "grad_norm": 0.8857970764353661, "learning_rate": 1.737294273196537e-05, "loss": 0.1623, "step": 7438 }, { "epoch": 1.5546499477533962, "grad_norm": 0.9954805209922752, "learning_rate": 1.737218055808797e-05, "loss": 0.1753, "step": 7439 }, { "epoch": 1.5548589341692791, "grad_norm": 0.9608878619543211, "learning_rate": 1.7371418290387923e-05, "loss": 0.1829, "step": 7440 }, { "epoch": 1.555067920585162, "grad_norm": 0.8706309526006286, "learning_rate": 1.7370655928874927e-05, "loss": 0.1819, "step": 7441 }, { "epoch": 1.555276907001045, "grad_norm": 0.8933747497992252, "learning_rate": 1.7369893473558688e-05, "loss": 0.1603, "step": 7442 }, { "epoch": 1.555485893416928, "grad_norm": 0.9635766482320077, "learning_rate": 1.7369130924448907e-05, "loss": 0.1542, "step": 7443 }, { "epoch": 1.555694879832811, "grad_norm": 0.9563167978957211, "learning_rate": 1.7368368281555287e-05, "loss": 0.1819, "step": 7444 }, { "epoch": 1.555903866248694, "grad_norm": 1.034651783900295, "learning_rate": 1.7367605544887537e-05, "loss": 0.1402, "step": 7445 }, { "epoch": 1.5561128526645769, "grad_norm": 1.1151705165378265, "learning_rate": 1.7366842714455363e-05, "loss": 0.1651, "step": 7446 }, { "epoch": 1.5563218390804598, "grad_norm": 0.969306306009698, "learning_rate": 1.7366079790268474e-05, "loss": 0.166, "step": 7447 }, { "epoch": 1.5565308254963428, "grad_norm": 1.0559307508000204, "learning_rate": 1.736531677233658e-05, "loss": 0.1602, "step": 7448 }, { "epoch": 1.5567398119122258, "grad_norm": 0.9452059309014682, "learning_rate": 1.7364553660669386e-05, "loss": 0.1672, "step": 7449 }, { "epoch": 1.5569487983281087, "grad_norm": 1.1764479583363647, "learning_rate": 1.736379045527661e-05, "loss": 0.1813, "step": 7450 }, { "epoch": 1.5571577847439917, "grad_norm": 1.005443922574384, "learning_rate": 1.7363027156167963e-05, "loss": 0.1741, "step": 7451 }, { "epoch": 1.5573667711598747, "grad_norm": 0.9108071322017859, "learning_rate": 1.736226376335316e-05, "loss": 0.1646, "step": 7452 }, { "epoch": 1.5575757575757576, "grad_norm": 1.0246340407385297, "learning_rate": 1.7361500276841914e-05, "loss": 0.1694, "step": 7453 }, { "epoch": 1.5577847439916406, "grad_norm": 1.6023704341112563, "learning_rate": 1.7360736696643945e-05, "loss": 0.1797, "step": 7454 }, { "epoch": 1.5579937304075235, "grad_norm": 0.9107573474419141, "learning_rate": 1.7359973022768967e-05, "loss": 0.1311, "step": 7455 }, { "epoch": 1.5582027168234065, "grad_norm": 1.0291852952638527, "learning_rate": 1.73592092552267e-05, "loss": 0.1664, "step": 7456 }, { "epoch": 1.5584117032392895, "grad_norm": 1.0570718496914597, "learning_rate": 1.735844539402687e-05, "loss": 0.191, "step": 7457 }, { "epoch": 1.5586206896551724, "grad_norm": 1.1479403923101354, "learning_rate": 1.735768143917919e-05, "loss": 0.1975, "step": 7458 }, { "epoch": 1.5588296760710554, "grad_norm": 0.9260241660904057, "learning_rate": 1.7356917390693384e-05, "loss": 0.1802, "step": 7459 }, { "epoch": 1.5590386624869383, "grad_norm": 1.073441068596699, "learning_rate": 1.735615324857918e-05, "loss": 0.1474, "step": 7460 }, { "epoch": 1.5592476489028213, "grad_norm": 1.1605798707051818, "learning_rate": 1.7355389012846298e-05, "loss": 0.1621, "step": 7461 }, { "epoch": 1.5594566353187043, "grad_norm": 1.0618762510507262, "learning_rate": 1.7354624683504465e-05, "loss": 0.1388, "step": 7462 }, { "epoch": 1.5596656217345872, "grad_norm": 1.0734673422147814, "learning_rate": 1.7353860260563415e-05, "loss": 0.1718, "step": 7463 }, { "epoch": 1.5598746081504702, "grad_norm": 0.8795099883592482, "learning_rate": 1.735309574403287e-05, "loss": 0.1535, "step": 7464 }, { "epoch": 1.5600835945663531, "grad_norm": 1.0280778840714557, "learning_rate": 1.735233113392256e-05, "loss": 0.1843, "step": 7465 }, { "epoch": 1.560292580982236, "grad_norm": 0.8751082083054232, "learning_rate": 1.735156643024222e-05, "loss": 0.1403, "step": 7466 }, { "epoch": 1.560501567398119, "grad_norm": 0.9548917366757765, "learning_rate": 1.7350801633001575e-05, "loss": 0.1629, "step": 7467 }, { "epoch": 1.560710553814002, "grad_norm": 1.149291651595113, "learning_rate": 1.7350036742210364e-05, "loss": 0.1852, "step": 7468 }, { "epoch": 1.560919540229885, "grad_norm": 0.920611450548216, "learning_rate": 1.734927175787832e-05, "loss": 0.1696, "step": 7469 }, { "epoch": 1.561128526645768, "grad_norm": 1.2984980006995097, "learning_rate": 1.734850668001518e-05, "loss": 0.1678, "step": 7470 }, { "epoch": 1.561337513061651, "grad_norm": 0.9846137425622525, "learning_rate": 1.7347741508630673e-05, "loss": 0.156, "step": 7471 }, { "epoch": 1.5615464994775339, "grad_norm": 1.2200732426404577, "learning_rate": 1.7346976243734547e-05, "loss": 0.1876, "step": 7472 }, { "epoch": 1.5617554858934168, "grad_norm": 1.0088907494277446, "learning_rate": 1.734621088533654e-05, "loss": 0.1725, "step": 7473 }, { "epoch": 1.5619644723092998, "grad_norm": 1.0173789421429944, "learning_rate": 1.7345445433446385e-05, "loss": 0.1441, "step": 7474 }, { "epoch": 1.5621734587251828, "grad_norm": 1.1425500316403776, "learning_rate": 1.734467988807383e-05, "loss": 0.1545, "step": 7475 }, { "epoch": 1.5623824451410657, "grad_norm": 1.0324425868572276, "learning_rate": 1.734391424922862e-05, "loss": 0.1819, "step": 7476 }, { "epoch": 1.5625914315569487, "grad_norm": 1.2031883064377262, "learning_rate": 1.734314851692049e-05, "loss": 0.1933, "step": 7477 }, { "epoch": 1.5628004179728316, "grad_norm": 1.1944637771772115, "learning_rate": 1.7342382691159192e-05, "loss": 0.1635, "step": 7478 }, { "epoch": 1.5630094043887146, "grad_norm": 1.1301988058661556, "learning_rate": 1.734161677195447e-05, "loss": 0.1839, "step": 7479 }, { "epoch": 1.5632183908045976, "grad_norm": 1.087677518477829, "learning_rate": 1.7340850759316074e-05, "loss": 0.1881, "step": 7480 }, { "epoch": 1.5634273772204805, "grad_norm": 0.9220185723190889, "learning_rate": 1.734008465325375e-05, "loss": 0.1469, "step": 7481 }, { "epoch": 1.5636363636363635, "grad_norm": 1.0464924662852808, "learning_rate": 1.733931845377725e-05, "loss": 0.137, "step": 7482 }, { "epoch": 1.5638453500522465, "grad_norm": 0.9352874046333234, "learning_rate": 1.733855216089632e-05, "loss": 0.1721, "step": 7483 }, { "epoch": 1.5640543364681294, "grad_norm": 0.8143919599252437, "learning_rate": 1.7337785774620718e-05, "loss": 0.1641, "step": 7484 }, { "epoch": 1.5642633228840124, "grad_norm": 0.9151497682473779, "learning_rate": 1.73370192949602e-05, "loss": 0.1471, "step": 7485 }, { "epoch": 1.5644723092998956, "grad_norm": 1.2635883147812885, "learning_rate": 1.7336252721924508e-05, "loss": 0.1859, "step": 7486 }, { "epoch": 1.5646812957157785, "grad_norm": 1.2368494318963454, "learning_rate": 1.733548605552341e-05, "loss": 0.1428, "step": 7487 }, { "epoch": 1.5648902821316615, "grad_norm": 1.0581858961294375, "learning_rate": 1.7334719295766664e-05, "loss": 0.1645, "step": 7488 }, { "epoch": 1.5650992685475444, "grad_norm": 0.9406524632556148, "learning_rate": 1.733395244266402e-05, "loss": 0.1824, "step": 7489 }, { "epoch": 1.5653082549634274, "grad_norm": 1.0424172346375884, "learning_rate": 1.7333185496225237e-05, "loss": 0.1753, "step": 7490 }, { "epoch": 1.5655172413793104, "grad_norm": 1.0788265534939898, "learning_rate": 1.733241845646008e-05, "loss": 0.1732, "step": 7491 }, { "epoch": 1.5657262277951933, "grad_norm": 1.1044181177054182, "learning_rate": 1.7331651323378316e-05, "loss": 0.2084, "step": 7492 }, { "epoch": 1.5659352142110763, "grad_norm": 1.1011348725584265, "learning_rate": 1.73308840969897e-05, "loss": 0.1949, "step": 7493 }, { "epoch": 1.5661442006269592, "grad_norm": 1.0489578105677446, "learning_rate": 1.7330116777303995e-05, "loss": 0.198, "step": 7494 }, { "epoch": 1.5663531870428422, "grad_norm": 0.7936831106967492, "learning_rate": 1.7329349364330972e-05, "loss": 0.1619, "step": 7495 }, { "epoch": 1.5665621734587252, "grad_norm": 1.184344511700269, "learning_rate": 1.7328581858080395e-05, "loss": 0.1596, "step": 7496 }, { "epoch": 1.5667711598746081, "grad_norm": 0.894974646187271, "learning_rate": 1.7327814258562034e-05, "loss": 0.1571, "step": 7497 }, { "epoch": 1.566980146290491, "grad_norm": 1.0477140092857289, "learning_rate": 1.7327046565785653e-05, "loss": 0.1745, "step": 7498 }, { "epoch": 1.567189132706374, "grad_norm": 1.030344301771528, "learning_rate": 1.7326278779761025e-05, "loss": 0.1653, "step": 7499 }, { "epoch": 1.567398119122257, "grad_norm": 1.0585598036991466, "learning_rate": 1.7325510900497927e-05, "loss": 0.1852, "step": 7500 }, { "epoch": 1.56760710553814, "grad_norm": 1.1865717160817306, "learning_rate": 1.7324742928006117e-05, "loss": 0.1871, "step": 7501 }, { "epoch": 1.567816091954023, "grad_norm": 1.1675316089849703, "learning_rate": 1.7323974862295384e-05, "loss": 0.1871, "step": 7502 }, { "epoch": 1.5680250783699061, "grad_norm": 0.8634980149841811, "learning_rate": 1.7323206703375492e-05, "loss": 0.1663, "step": 7503 }, { "epoch": 1.568234064785789, "grad_norm": 1.0567306487318524, "learning_rate": 1.7322438451256226e-05, "loss": 0.1872, "step": 7504 }, { "epoch": 1.568443051201672, "grad_norm": 0.9339351574990737, "learning_rate": 1.7321670105947354e-05, "loss": 0.1844, "step": 7505 }, { "epoch": 1.568652037617555, "grad_norm": 0.8236672294552789, "learning_rate": 1.732090166745866e-05, "loss": 0.1352, "step": 7506 }, { "epoch": 1.568861024033438, "grad_norm": 0.9113204433327036, "learning_rate": 1.7320133135799924e-05, "loss": 0.1526, "step": 7507 }, { "epoch": 1.569070010449321, "grad_norm": 0.9811864575595388, "learning_rate": 1.7319364510980923e-05, "loss": 0.1776, "step": 7508 }, { "epoch": 1.569278996865204, "grad_norm": 1.3449561524233187, "learning_rate": 1.7318595793011446e-05, "loss": 0.1959, "step": 7509 }, { "epoch": 1.5694879832810869, "grad_norm": 1.0435340187841455, "learning_rate": 1.731782698190127e-05, "loss": 0.1646, "step": 7510 }, { "epoch": 1.5696969696969698, "grad_norm": 0.9981530563988924, "learning_rate": 1.7317058077660178e-05, "loss": 0.1817, "step": 7511 }, { "epoch": 1.5699059561128528, "grad_norm": 1.0226578868273537, "learning_rate": 1.731628908029796e-05, "loss": 0.1442, "step": 7512 }, { "epoch": 1.5701149425287357, "grad_norm": 0.9688060676071751, "learning_rate": 1.7315519989824402e-05, "loss": 0.1631, "step": 7513 }, { "epoch": 1.5703239289446187, "grad_norm": 0.980344549369136, "learning_rate": 1.731475080624929e-05, "loss": 0.1618, "step": 7514 }, { "epoch": 1.5705329153605017, "grad_norm": 0.9849108123626045, "learning_rate": 1.7313981529582412e-05, "loss": 0.1642, "step": 7515 }, { "epoch": 1.5707419017763846, "grad_norm": 0.9146430199271733, "learning_rate": 1.731321215983356e-05, "loss": 0.1961, "step": 7516 }, { "epoch": 1.5709508881922676, "grad_norm": 0.9949850969676208, "learning_rate": 1.7312442697012528e-05, "loss": 0.1613, "step": 7517 }, { "epoch": 1.5711598746081505, "grad_norm": 1.1354385600471857, "learning_rate": 1.7311673141129108e-05, "loss": 0.1784, "step": 7518 }, { "epoch": 1.5713688610240335, "grad_norm": 0.9161611471013138, "learning_rate": 1.731090349219309e-05, "loss": 0.1442, "step": 7519 }, { "epoch": 1.5715778474399165, "grad_norm": 0.9177424141190466, "learning_rate": 1.7310133750214272e-05, "loss": 0.1603, "step": 7520 }, { "epoch": 1.5717868338557994, "grad_norm": 1.1204729559946085, "learning_rate": 1.730936391520245e-05, "loss": 0.1982, "step": 7521 }, { "epoch": 1.5719958202716824, "grad_norm": 0.9554547093722244, "learning_rate": 1.7308593987167416e-05, "loss": 0.1531, "step": 7522 }, { "epoch": 1.5722048066875653, "grad_norm": 0.9521227050484579, "learning_rate": 1.7307823966118976e-05, "loss": 0.1598, "step": 7523 }, { "epoch": 1.5724137931034483, "grad_norm": 0.9972117986184903, "learning_rate": 1.730705385206693e-05, "loss": 0.1692, "step": 7524 }, { "epoch": 1.5726227795193313, "grad_norm": 1.0609675246251689, "learning_rate": 1.7306283645021072e-05, "loss": 0.1671, "step": 7525 }, { "epoch": 1.5728317659352142, "grad_norm": 1.0776449204271144, "learning_rate": 1.730551334499121e-05, "loss": 0.169, "step": 7526 }, { "epoch": 1.5730407523510972, "grad_norm": 1.1956891811018022, "learning_rate": 1.7304742951987145e-05, "loss": 0.1807, "step": 7527 }, { "epoch": 1.5732497387669802, "grad_norm": 0.7111536447962953, "learning_rate": 1.7303972466018685e-05, "loss": 0.1301, "step": 7528 }, { "epoch": 1.5734587251828631, "grad_norm": 0.8767846076264671, "learning_rate": 1.7303201887095627e-05, "loss": 0.1503, "step": 7529 }, { "epoch": 1.573667711598746, "grad_norm": 0.884843424456665, "learning_rate": 1.7302431215227782e-05, "loss": 0.1882, "step": 7530 }, { "epoch": 1.573876698014629, "grad_norm": 0.9013809281525886, "learning_rate": 1.7301660450424967e-05, "loss": 0.1485, "step": 7531 }, { "epoch": 1.574085684430512, "grad_norm": 0.9616575584324123, "learning_rate": 1.7300889592696975e-05, "loss": 0.1589, "step": 7532 }, { "epoch": 1.574294670846395, "grad_norm": 1.0145329719668483, "learning_rate": 1.730011864205363e-05, "loss": 0.1534, "step": 7533 }, { "epoch": 1.574503657262278, "grad_norm": 0.9952827005360957, "learning_rate": 1.729934759850474e-05, "loss": 0.168, "step": 7534 }, { "epoch": 1.5747126436781609, "grad_norm": 0.9958193770411322, "learning_rate": 1.729857646206011e-05, "loss": 0.1413, "step": 7535 }, { "epoch": 1.5749216300940438, "grad_norm": 0.9238786655893652, "learning_rate": 1.7297805232729566e-05, "loss": 0.1629, "step": 7536 }, { "epoch": 1.5751306165099268, "grad_norm": 1.113723326237755, "learning_rate": 1.7297033910522913e-05, "loss": 0.1565, "step": 7537 }, { "epoch": 1.5753396029258098, "grad_norm": 1.1637095824282346, "learning_rate": 1.7296262495449973e-05, "loss": 0.1654, "step": 7538 }, { "epoch": 1.5755485893416927, "grad_norm": 0.738740543409767, "learning_rate": 1.729549098752056e-05, "loss": 0.1516, "step": 7539 }, { "epoch": 1.5757575757575757, "grad_norm": 1.0247739515040972, "learning_rate": 1.72947193867445e-05, "loss": 0.1537, "step": 7540 }, { "epoch": 1.5759665621734587, "grad_norm": 1.0818298183828898, "learning_rate": 1.7293947693131603e-05, "loss": 0.1877, "step": 7541 }, { "epoch": 1.5761755485893416, "grad_norm": 0.9841860498612639, "learning_rate": 1.7293175906691698e-05, "loss": 0.153, "step": 7542 }, { "epoch": 1.5763845350052246, "grad_norm": 0.9990804179284554, "learning_rate": 1.72924040274346e-05, "loss": 0.1741, "step": 7543 }, { "epoch": 1.5765935214211075, "grad_norm": 0.8666820458817142, "learning_rate": 1.7291632055370137e-05, "loss": 0.1235, "step": 7544 }, { "epoch": 1.5768025078369905, "grad_norm": 1.0400804384061113, "learning_rate": 1.7290859990508133e-05, "loss": 0.1551, "step": 7545 }, { "epoch": 1.5770114942528735, "grad_norm": 1.1782713251266568, "learning_rate": 1.7290087832858417e-05, "loss": 0.1682, "step": 7546 }, { "epoch": 1.5772204806687564, "grad_norm": 1.068482265463315, "learning_rate": 1.7289315582430807e-05, "loss": 0.1721, "step": 7547 }, { "epoch": 1.5774294670846394, "grad_norm": 0.9462210698577076, "learning_rate": 1.7288543239235135e-05, "loss": 0.1652, "step": 7548 }, { "epoch": 1.5776384535005223, "grad_norm": 1.1269973730174212, "learning_rate": 1.7287770803281238e-05, "loss": 0.1756, "step": 7549 }, { "epoch": 1.5778474399164053, "grad_norm": 1.0920477524678533, "learning_rate": 1.7286998274578937e-05, "loss": 0.1527, "step": 7550 }, { "epoch": 1.5780564263322883, "grad_norm": 1.052895576564863, "learning_rate": 1.728622565313807e-05, "loss": 0.1693, "step": 7551 }, { "epoch": 1.5782654127481712, "grad_norm": 1.0155358318142678, "learning_rate": 1.728545293896846e-05, "loss": 0.1591, "step": 7552 }, { "epoch": 1.5784743991640542, "grad_norm": 1.093046908375112, "learning_rate": 1.728468013207995e-05, "loss": 0.1725, "step": 7553 }, { "epoch": 1.5786833855799371, "grad_norm": 0.7431476629640297, "learning_rate": 1.7283907232482377e-05, "loss": 0.1143, "step": 7554 }, { "epoch": 1.57889237199582, "grad_norm": 1.0463796052490464, "learning_rate": 1.7283134240185567e-05, "loss": 0.1944, "step": 7555 }, { "epoch": 1.5791013584117033, "grad_norm": 1.0302067618923145, "learning_rate": 1.7282361155199366e-05, "loss": 0.1875, "step": 7556 }, { "epoch": 1.5793103448275863, "grad_norm": 0.8919014249127917, "learning_rate": 1.7281587977533614e-05, "loss": 0.1743, "step": 7557 }, { "epoch": 1.5795193312434692, "grad_norm": 0.8328576896967969, "learning_rate": 1.7280814707198142e-05, "loss": 0.1487, "step": 7558 }, { "epoch": 1.5797283176593522, "grad_norm": 1.1705064635532567, "learning_rate": 1.72800413442028e-05, "loss": 0.1826, "step": 7559 }, { "epoch": 1.5799373040752351, "grad_norm": 0.8822099823448979, "learning_rate": 1.727926788855743e-05, "loss": 0.1334, "step": 7560 }, { "epoch": 1.580146290491118, "grad_norm": 1.0115638094864687, "learning_rate": 1.727849434027187e-05, "loss": 0.1664, "step": 7561 }, { "epoch": 1.580355276907001, "grad_norm": 0.9432743674545896, "learning_rate": 1.7277720699355962e-05, "loss": 0.154, "step": 7562 }, { "epoch": 1.580564263322884, "grad_norm": 1.1164642747251023, "learning_rate": 1.727694696581956e-05, "loss": 0.1746, "step": 7563 }, { "epoch": 1.580773249738767, "grad_norm": 0.9091898509933188, "learning_rate": 1.727617313967251e-05, "loss": 0.1721, "step": 7564 }, { "epoch": 1.58098223615465, "grad_norm": 0.86081172413395, "learning_rate": 1.7275399220924652e-05, "loss": 0.1252, "step": 7565 }, { "epoch": 1.581191222570533, "grad_norm": 1.0467277838063334, "learning_rate": 1.7274625209585845e-05, "loss": 0.1614, "step": 7566 }, { "epoch": 1.5814002089864159, "grad_norm": 0.9712672386342224, "learning_rate": 1.7273851105665937e-05, "loss": 0.166, "step": 7567 }, { "epoch": 1.5816091954022988, "grad_norm": 1.6229748542922169, "learning_rate": 1.7273076909174774e-05, "loss": 0.1688, "step": 7568 }, { "epoch": 1.5818181818181818, "grad_norm": 1.1129431307772468, "learning_rate": 1.727230262012222e-05, "loss": 0.1594, "step": 7569 }, { "epoch": 1.5820271682340648, "grad_norm": 0.9449069845094461, "learning_rate": 1.7271528238518117e-05, "loss": 0.1439, "step": 7570 }, { "epoch": 1.5822361546499477, "grad_norm": 0.988310672494551, "learning_rate": 1.7270753764372322e-05, "loss": 0.1483, "step": 7571 }, { "epoch": 1.5824451410658307, "grad_norm": 1.0987091022796485, "learning_rate": 1.7269979197694703e-05, "loss": 0.1961, "step": 7572 }, { "epoch": 1.5826541274817136, "grad_norm": 0.9874885190477385, "learning_rate": 1.7269204538495103e-05, "loss": 0.184, "step": 7573 }, { "epoch": 1.5828631138975968, "grad_norm": 1.0397720186862365, "learning_rate": 1.726842978678339e-05, "loss": 0.177, "step": 7574 }, { "epoch": 1.5830721003134798, "grad_norm": 1.0066907725255219, "learning_rate": 1.7267654942569422e-05, "loss": 0.2007, "step": 7575 }, { "epoch": 1.5832810867293627, "grad_norm": 0.8049828506642064, "learning_rate": 1.726688000586306e-05, "loss": 0.1632, "step": 7576 }, { "epoch": 1.5834900731452457, "grad_norm": 1.1430433945019796, "learning_rate": 1.7266104976674164e-05, "loss": 0.1874, "step": 7577 }, { "epoch": 1.5836990595611287, "grad_norm": 0.9104854610906068, "learning_rate": 1.72653298550126e-05, "loss": 0.1396, "step": 7578 }, { "epoch": 1.5839080459770116, "grad_norm": 0.9407249593061259, "learning_rate": 1.7264554640888226e-05, "loss": 0.1743, "step": 7579 }, { "epoch": 1.5841170323928946, "grad_norm": 0.9383686674179885, "learning_rate": 1.726377933431092e-05, "loss": 0.1586, "step": 7580 }, { "epoch": 1.5843260188087775, "grad_norm": 1.1963021184179041, "learning_rate": 1.726300393529054e-05, "loss": 0.1841, "step": 7581 }, { "epoch": 1.5845350052246605, "grad_norm": 0.8073998644771886, "learning_rate": 1.726222844383696e-05, "loss": 0.1585, "step": 7582 }, { "epoch": 1.5847439916405435, "grad_norm": 1.0061351039471078, "learning_rate": 1.726145285996004e-05, "loss": 0.1888, "step": 7583 }, { "epoch": 1.5849529780564264, "grad_norm": 1.0110122877701604, "learning_rate": 1.726067718366966e-05, "loss": 0.1728, "step": 7584 }, { "epoch": 1.5851619644723094, "grad_norm": 0.8321662481233831, "learning_rate": 1.725990141497569e-05, "loss": 0.1582, "step": 7585 }, { "epoch": 1.5853709508881924, "grad_norm": 0.9972532980249232, "learning_rate": 1.7259125553888e-05, "loss": 0.1404, "step": 7586 }, { "epoch": 1.5855799373040753, "grad_norm": 1.0128767975344448, "learning_rate": 1.7258349600416467e-05, "loss": 0.1608, "step": 7587 }, { "epoch": 1.5857889237199583, "grad_norm": 0.9531857172098657, "learning_rate": 1.725757355457096e-05, "loss": 0.1846, "step": 7588 }, { "epoch": 1.5859979101358412, "grad_norm": 1.2180550123422125, "learning_rate": 1.725679741636136e-05, "loss": 0.1428, "step": 7589 }, { "epoch": 1.5862068965517242, "grad_norm": 1.0107483977203695, "learning_rate": 1.725602118579755e-05, "loss": 0.1727, "step": 7590 }, { "epoch": 1.5864158829676072, "grad_norm": 1.00225533089858, "learning_rate": 1.72552448628894e-05, "loss": 0.157, "step": 7591 }, { "epoch": 1.5866248693834901, "grad_norm": 0.8912128882702026, "learning_rate": 1.7254468447646794e-05, "loss": 0.1687, "step": 7592 }, { "epoch": 1.586833855799373, "grad_norm": 1.0992489631411537, "learning_rate": 1.725369194007961e-05, "loss": 0.1602, "step": 7593 }, { "epoch": 1.587042842215256, "grad_norm": 1.1195220686638907, "learning_rate": 1.7252915340197732e-05, "loss": 0.2061, "step": 7594 }, { "epoch": 1.587251828631139, "grad_norm": 1.2168998256251646, "learning_rate": 1.7252138648011048e-05, "loss": 0.2066, "step": 7595 }, { "epoch": 1.587460815047022, "grad_norm": 0.8962622672991405, "learning_rate": 1.7251361863529438e-05, "loss": 0.1749, "step": 7596 }, { "epoch": 1.587669801462905, "grad_norm": 1.0130827204587878, "learning_rate": 1.7250584986762785e-05, "loss": 0.1673, "step": 7597 }, { "epoch": 1.587878787878788, "grad_norm": 0.9561534670113586, "learning_rate": 1.724980801772098e-05, "loss": 0.1563, "step": 7598 }, { "epoch": 1.5880877742946709, "grad_norm": 0.9113245330715787, "learning_rate": 1.7249030956413916e-05, "loss": 0.1739, "step": 7599 }, { "epoch": 1.5882967607105538, "grad_norm": 0.9639912343458563, "learning_rate": 1.7248253802851472e-05, "loss": 0.1819, "step": 7600 }, { "epoch": 1.5885057471264368, "grad_norm": 0.985263215733031, "learning_rate": 1.7247476557043542e-05, "loss": 0.1616, "step": 7601 }, { "epoch": 1.5887147335423197, "grad_norm": 0.8744761977026366, "learning_rate": 1.7246699219000026e-05, "loss": 0.1533, "step": 7602 }, { "epoch": 1.5889237199582027, "grad_norm": 1.1485970248289428, "learning_rate": 1.7245921788730803e-05, "loss": 0.1886, "step": 7603 }, { "epoch": 1.5891327063740857, "grad_norm": 0.91243657596634, "learning_rate": 1.7245144266245778e-05, "loss": 0.1764, "step": 7604 }, { "epoch": 1.5893416927899686, "grad_norm": 0.976991302900506, "learning_rate": 1.724436665155484e-05, "loss": 0.1842, "step": 7605 }, { "epoch": 1.5895506792058516, "grad_norm": 1.0448748812552575, "learning_rate": 1.724358894466789e-05, "loss": 0.1707, "step": 7606 }, { "epoch": 1.5897596656217345, "grad_norm": 0.8432541475594671, "learning_rate": 1.724281114559482e-05, "loss": 0.1537, "step": 7607 }, { "epoch": 1.5899686520376175, "grad_norm": 1.0344258765728165, "learning_rate": 1.7242033254345534e-05, "loss": 0.2018, "step": 7608 }, { "epoch": 1.5901776384535005, "grad_norm": 1.0320929587726162, "learning_rate": 1.724125527092993e-05, "loss": 0.1616, "step": 7609 }, { "epoch": 1.5903866248693834, "grad_norm": 0.8858243772029726, "learning_rate": 1.7240477195357906e-05, "loss": 0.1889, "step": 7610 }, { "epoch": 1.5905956112852664, "grad_norm": 0.8693770473277415, "learning_rate": 1.7239699027639374e-05, "loss": 0.1772, "step": 7611 }, { "epoch": 1.5908045977011493, "grad_norm": 1.117420117860342, "learning_rate": 1.7238920767784223e-05, "loss": 0.1807, "step": 7612 }, { "epoch": 1.5910135841170323, "grad_norm": 0.8915289570001279, "learning_rate": 1.7238142415802367e-05, "loss": 0.1655, "step": 7613 }, { "epoch": 1.5912225705329153, "grad_norm": 1.0361902028774133, "learning_rate": 1.7237363971703713e-05, "loss": 0.1375, "step": 7614 }, { "epoch": 1.5914315569487982, "grad_norm": 1.0289415586936963, "learning_rate": 1.7236585435498164e-05, "loss": 0.1487, "step": 7615 }, { "epoch": 1.5916405433646812, "grad_norm": 0.9556525500393431, "learning_rate": 1.7235806807195624e-05, "loss": 0.1713, "step": 7616 }, { "epoch": 1.5918495297805642, "grad_norm": 0.8737545474890276, "learning_rate": 1.723502808680601e-05, "loss": 0.1715, "step": 7617 }, { "epoch": 1.5920585161964471, "grad_norm": 0.9991487793529558, "learning_rate": 1.7234249274339232e-05, "loss": 0.2025, "step": 7618 }, { "epoch": 1.59226750261233, "grad_norm": 0.9044964464273118, "learning_rate": 1.7233470369805198e-05, "loss": 0.1802, "step": 7619 }, { "epoch": 1.592476489028213, "grad_norm": 0.9251867189083934, "learning_rate": 1.723269137321382e-05, "loss": 0.1777, "step": 7620 }, { "epoch": 1.592685475444096, "grad_norm": 0.9809189013549717, "learning_rate": 1.7231912284575016e-05, "loss": 0.1787, "step": 7621 }, { "epoch": 1.592894461859979, "grad_norm": 1.098868219734904, "learning_rate": 1.72311331038987e-05, "loss": 0.153, "step": 7622 }, { "epoch": 1.593103448275862, "grad_norm": 1.0873656844353339, "learning_rate": 1.7230353831194785e-05, "loss": 0.1716, "step": 7623 }, { "epoch": 1.5933124346917449, "grad_norm": 0.8779614552056492, "learning_rate": 1.722957446647319e-05, "loss": 0.1647, "step": 7624 }, { "epoch": 1.5935214211076278, "grad_norm": 0.9385149669011368, "learning_rate": 1.7228795009743836e-05, "loss": 0.1766, "step": 7625 }, { "epoch": 1.5937304075235108, "grad_norm": 1.0580612608714792, "learning_rate": 1.7228015461016644e-05, "loss": 0.1941, "step": 7626 }, { "epoch": 1.593939393939394, "grad_norm": 0.8285805373793438, "learning_rate": 1.722723582030153e-05, "loss": 0.1483, "step": 7627 }, { "epoch": 1.594148380355277, "grad_norm": 0.8320249703850017, "learning_rate": 1.7226456087608416e-05, "loss": 0.1418, "step": 7628 }, { "epoch": 1.59435736677116, "grad_norm": 1.1147188814484525, "learning_rate": 1.722567626294723e-05, "loss": 0.1626, "step": 7629 }, { "epoch": 1.5945663531870429, "grad_norm": 0.9173557009889194, "learning_rate": 1.7224896346327897e-05, "loss": 0.1759, "step": 7630 }, { "epoch": 1.5947753396029258, "grad_norm": 0.8491805824788462, "learning_rate": 1.7224116337760338e-05, "loss": 0.1609, "step": 7631 }, { "epoch": 1.5949843260188088, "grad_norm": 0.9041491457420845, "learning_rate": 1.722333623725448e-05, "loss": 0.19, "step": 7632 }, { "epoch": 1.5951933124346918, "grad_norm": 1.0466810750574318, "learning_rate": 1.7222556044820257e-05, "loss": 0.1541, "step": 7633 }, { "epoch": 1.5954022988505747, "grad_norm": 0.9331209426742906, "learning_rate": 1.7221775760467586e-05, "loss": 0.1519, "step": 7634 }, { "epoch": 1.5956112852664577, "grad_norm": 1.0310073620824498, "learning_rate": 1.7220995384206413e-05, "loss": 0.1491, "step": 7635 }, { "epoch": 1.5958202716823406, "grad_norm": 0.9307074858051783, "learning_rate": 1.7220214916046657e-05, "loss": 0.1687, "step": 7636 }, { "epoch": 1.5960292580982236, "grad_norm": 1.0680656689810506, "learning_rate": 1.721943435599826e-05, "loss": 0.1834, "step": 7637 }, { "epoch": 1.5962382445141066, "grad_norm": 1.2055866531695283, "learning_rate": 1.721865370407115e-05, "loss": 0.171, "step": 7638 }, { "epoch": 1.5964472309299895, "grad_norm": 1.1142562838204948, "learning_rate": 1.7217872960275262e-05, "loss": 0.1716, "step": 7639 }, { "epoch": 1.5966562173458725, "grad_norm": 0.9039696875690526, "learning_rate": 1.7217092124620535e-05, "loss": 0.1595, "step": 7640 }, { "epoch": 1.5968652037617554, "grad_norm": 0.9818110512393805, "learning_rate": 1.721631119711691e-05, "loss": 0.1606, "step": 7641 }, { "epoch": 1.5970741901776384, "grad_norm": 1.1568894237377392, "learning_rate": 1.7215530177774313e-05, "loss": 0.1709, "step": 7642 }, { "epoch": 1.5972831765935214, "grad_norm": 1.0125648359937323, "learning_rate": 1.7214749066602692e-05, "loss": 0.1871, "step": 7643 }, { "epoch": 1.5974921630094046, "grad_norm": 0.9841126750533973, "learning_rate": 1.7213967863611992e-05, "loss": 0.1479, "step": 7644 }, { "epoch": 1.5977011494252875, "grad_norm": 0.9894910974865345, "learning_rate": 1.7213186568812146e-05, "loss": 0.1783, "step": 7645 }, { "epoch": 1.5979101358411705, "grad_norm": 0.8529521417768965, "learning_rate": 1.7212405182213103e-05, "loss": 0.1374, "step": 7646 }, { "epoch": 1.5981191222570534, "grad_norm": 0.8195673949518295, "learning_rate": 1.7211623703824805e-05, "loss": 0.1551, "step": 7647 }, { "epoch": 1.5983281086729364, "grad_norm": 0.9816352441791775, "learning_rate": 1.7210842133657197e-05, "loss": 0.1819, "step": 7648 }, { "epoch": 1.5985370950888194, "grad_norm": 0.917261378627623, "learning_rate": 1.7210060471720228e-05, "loss": 0.1555, "step": 7649 }, { "epoch": 1.5987460815047023, "grad_norm": 1.110451304605326, "learning_rate": 1.7209278718023847e-05, "loss": 0.1822, "step": 7650 }, { "epoch": 1.5989550679205853, "grad_norm": 1.4328501484154605, "learning_rate": 1.7208496872578e-05, "loss": 0.158, "step": 7651 }, { "epoch": 1.5991640543364682, "grad_norm": 1.1439028261810293, "learning_rate": 1.720771493539264e-05, "loss": 0.1527, "step": 7652 }, { "epoch": 1.5993730407523512, "grad_norm": 1.0721186462288472, "learning_rate": 1.7206932906477717e-05, "loss": 0.2251, "step": 7653 }, { "epoch": 1.5995820271682342, "grad_norm": 1.0499376859706038, "learning_rate": 1.720615078584318e-05, "loss": 0.1567, "step": 7654 }, { "epoch": 1.5997910135841171, "grad_norm": 0.8651224389116967, "learning_rate": 1.7205368573498984e-05, "loss": 0.1283, "step": 7655 }, { "epoch": 1.6, "grad_norm": 0.9718774267356239, "learning_rate": 1.720458626945509e-05, "loss": 0.1649, "step": 7656 }, { "epoch": 1.600208986415883, "grad_norm": 1.0503900826899903, "learning_rate": 1.720380387372145e-05, "loss": 0.1856, "step": 7657 }, { "epoch": 1.600417972831766, "grad_norm": 0.7896132727521356, "learning_rate": 1.7203021386308016e-05, "loss": 0.1473, "step": 7658 }, { "epoch": 1.600626959247649, "grad_norm": 0.9612645218937983, "learning_rate": 1.7202238807224757e-05, "loss": 0.1659, "step": 7659 }, { "epoch": 1.600835945663532, "grad_norm": 1.2865397708243065, "learning_rate": 1.7201456136481623e-05, "loss": 0.1389, "step": 7660 }, { "epoch": 1.601044932079415, "grad_norm": 1.0052844597100161, "learning_rate": 1.720067337408858e-05, "loss": 0.164, "step": 7661 }, { "epoch": 1.6012539184952979, "grad_norm": 0.9469165880647258, "learning_rate": 1.719989052005559e-05, "loss": 0.1708, "step": 7662 }, { "epoch": 1.6014629049111808, "grad_norm": 1.0216065172819424, "learning_rate": 1.7199107574392613e-05, "loss": 0.1722, "step": 7663 }, { "epoch": 1.6016718913270638, "grad_norm": 0.7752763028007166, "learning_rate": 1.7198324537109616e-05, "loss": 0.1455, "step": 7664 }, { "epoch": 1.6018808777429467, "grad_norm": 1.300980299485448, "learning_rate": 1.7197541408216563e-05, "loss": 0.1817, "step": 7665 }, { "epoch": 1.6020898641588297, "grad_norm": 0.940411161839573, "learning_rate": 1.7196758187723422e-05, "loss": 0.1602, "step": 7666 }, { "epoch": 1.6022988505747127, "grad_norm": 1.0904280086998397, "learning_rate": 1.719597487564016e-05, "loss": 0.1763, "step": 7667 }, { "epoch": 1.6025078369905956, "grad_norm": 1.07980072416861, "learning_rate": 1.7195191471976743e-05, "loss": 0.1591, "step": 7668 }, { "epoch": 1.6027168234064786, "grad_norm": 0.9146535146478146, "learning_rate": 1.7194407976743143e-05, "loss": 0.1771, "step": 7669 }, { "epoch": 1.6029258098223615, "grad_norm": 0.9283881257694224, "learning_rate": 1.7193624389949334e-05, "loss": 0.164, "step": 7670 }, { "epoch": 1.6031347962382445, "grad_norm": 0.9391053990276015, "learning_rate": 1.7192840711605283e-05, "loss": 0.172, "step": 7671 }, { "epoch": 1.6033437826541275, "grad_norm": 0.9780957758695811, "learning_rate": 1.719205694172097e-05, "loss": 0.1483, "step": 7672 }, { "epoch": 1.6035527690700104, "grad_norm": 0.9360472076152838, "learning_rate": 1.7191273080306366e-05, "loss": 0.1461, "step": 7673 }, { "epoch": 1.6037617554858934, "grad_norm": 0.9524476724787767, "learning_rate": 1.7190489127371445e-05, "loss": 0.1595, "step": 7674 }, { "epoch": 1.6039707419017764, "grad_norm": 0.9983797004948894, "learning_rate": 1.718970508292619e-05, "loss": 0.1373, "step": 7675 }, { "epoch": 1.6041797283176593, "grad_norm": 0.8999156359455815, "learning_rate": 1.718892094698057e-05, "loss": 0.1745, "step": 7676 }, { "epoch": 1.6043887147335423, "grad_norm": 1.0386841820312562, "learning_rate": 1.7188136719544572e-05, "loss": 0.1724, "step": 7677 }, { "epoch": 1.6045977011494252, "grad_norm": 0.9018422326531766, "learning_rate": 1.7187352400628176e-05, "loss": 0.1403, "step": 7678 }, { "epoch": 1.6048066875653082, "grad_norm": 0.9545720345690953, "learning_rate": 1.7186567990241362e-05, "loss": 0.1568, "step": 7679 }, { "epoch": 1.6050156739811912, "grad_norm": 0.9824900503920039, "learning_rate": 1.7185783488394113e-05, "loss": 0.1423, "step": 7680 }, { "epoch": 1.6052246603970741, "grad_norm": 0.9197108865380422, "learning_rate": 1.718499889509641e-05, "loss": 0.169, "step": 7681 }, { "epoch": 1.605433646812957, "grad_norm": 0.8427960912296011, "learning_rate": 1.7184214210358243e-05, "loss": 0.1541, "step": 7682 }, { "epoch": 1.60564263322884, "grad_norm": 0.8882297743607082, "learning_rate": 1.7183429434189595e-05, "loss": 0.1895, "step": 7683 }, { "epoch": 1.605851619644723, "grad_norm": 0.9060146439106861, "learning_rate": 1.7182644566600458e-05, "loss": 0.156, "step": 7684 }, { "epoch": 1.606060606060606, "grad_norm": 0.7149766103598896, "learning_rate": 1.7181859607600816e-05, "loss": 0.1128, "step": 7685 }, { "epoch": 1.606269592476489, "grad_norm": 1.004949713480868, "learning_rate": 1.718107455720066e-05, "loss": 0.155, "step": 7686 }, { "epoch": 1.606478578892372, "grad_norm": 1.2711297141109914, "learning_rate": 1.718028941540998e-05, "loss": 0.1498, "step": 7687 }, { "epoch": 1.6066875653082549, "grad_norm": 0.9729999216031154, "learning_rate": 1.7179504182238774e-05, "loss": 0.1829, "step": 7688 }, { "epoch": 1.6068965517241378, "grad_norm": 1.0640052107924767, "learning_rate": 1.717871885769703e-05, "loss": 0.1894, "step": 7689 }, { "epoch": 1.6071055381400208, "grad_norm": 1.133063238909619, "learning_rate": 1.7177933441794744e-05, "loss": 0.1803, "step": 7690 }, { "epoch": 1.6073145245559037, "grad_norm": 0.8727871634021335, "learning_rate": 1.7177147934541907e-05, "loss": 0.155, "step": 7691 }, { "epoch": 1.6075235109717867, "grad_norm": 0.9143977488111067, "learning_rate": 1.7176362335948523e-05, "loss": 0.1429, "step": 7692 }, { "epoch": 1.6077324973876697, "grad_norm": 0.9593469231685381, "learning_rate": 1.7175576646024588e-05, "loss": 0.1746, "step": 7693 }, { "epoch": 1.6079414838035526, "grad_norm": 0.9534203896696605, "learning_rate": 1.7174790864780096e-05, "loss": 0.167, "step": 7694 }, { "epoch": 1.6081504702194356, "grad_norm": 1.13688527006544, "learning_rate": 1.7174004992225056e-05, "loss": 0.1593, "step": 7695 }, { "epoch": 1.6083594566353185, "grad_norm": 0.9297229258096379, "learning_rate": 1.7173219028369463e-05, "loss": 0.1808, "step": 7696 }, { "epoch": 1.6085684430512017, "grad_norm": 0.8387803315762336, "learning_rate": 1.7172432973223322e-05, "loss": 0.1382, "step": 7697 }, { "epoch": 1.6087774294670847, "grad_norm": 1.0026331051404833, "learning_rate": 1.7171646826796635e-05, "loss": 0.1564, "step": 7698 }, { "epoch": 1.6089864158829676, "grad_norm": 0.9171091540424844, "learning_rate": 1.717086058909941e-05, "loss": 0.1602, "step": 7699 }, { "epoch": 1.6091954022988506, "grad_norm": 0.9728316536205233, "learning_rate": 1.7170074260141652e-05, "loss": 0.202, "step": 7700 }, { "epoch": 1.6094043887147336, "grad_norm": 0.9748849854831355, "learning_rate": 1.7169287839933364e-05, "loss": 0.1391, "step": 7701 }, { "epoch": 1.6096133751306165, "grad_norm": 0.9268536652321052, "learning_rate": 1.716850132848456e-05, "loss": 0.1604, "step": 7702 }, { "epoch": 1.6098223615464995, "grad_norm": 0.9154288680742315, "learning_rate": 1.716771472580525e-05, "loss": 0.1735, "step": 7703 }, { "epoch": 1.6100313479623825, "grad_norm": 0.9956588293123174, "learning_rate": 1.716692803190544e-05, "loss": 0.1739, "step": 7704 }, { "epoch": 1.6102403343782654, "grad_norm": 0.9752715698340559, "learning_rate": 1.716614124679515e-05, "loss": 0.152, "step": 7705 }, { "epoch": 1.6104493207941484, "grad_norm": 0.9265008515007566, "learning_rate": 1.716535437048438e-05, "loss": 0.1659, "step": 7706 }, { "epoch": 1.6106583072100313, "grad_norm": 1.0993876443253687, "learning_rate": 1.7164567402983153e-05, "loss": 0.1898, "step": 7707 }, { "epoch": 1.6108672936259143, "grad_norm": 0.8933373316473094, "learning_rate": 1.7163780344301483e-05, "loss": 0.1691, "step": 7708 }, { "epoch": 1.6110762800417973, "grad_norm": 0.9739391386062587, "learning_rate": 1.716299319444939e-05, "loss": 0.1825, "step": 7709 }, { "epoch": 1.6112852664576802, "grad_norm": 0.9435776964118483, "learning_rate": 1.7162205953436884e-05, "loss": 0.1507, "step": 7710 }, { "epoch": 1.6114942528735632, "grad_norm": 0.894061036358533, "learning_rate": 1.716141862127399e-05, "loss": 0.1527, "step": 7711 }, { "epoch": 1.6117032392894461, "grad_norm": 0.886803201803145, "learning_rate": 1.716063119797073e-05, "loss": 0.1544, "step": 7712 }, { "epoch": 1.611912225705329, "grad_norm": 1.0158348904276433, "learning_rate": 1.715984368353712e-05, "loss": 0.176, "step": 7713 }, { "epoch": 1.612121212121212, "grad_norm": 1.0514493288711535, "learning_rate": 1.7159056077983182e-05, "loss": 0.1668, "step": 7714 }, { "epoch": 1.6123301985370952, "grad_norm": 0.9677901005291558, "learning_rate": 1.7158268381318943e-05, "loss": 0.1847, "step": 7715 }, { "epoch": 1.6125391849529782, "grad_norm": 1.4862754735680135, "learning_rate": 1.7157480593554426e-05, "loss": 0.1768, "step": 7716 }, { "epoch": 1.6127481713688612, "grad_norm": 0.978650283556627, "learning_rate": 1.7156692714699658e-05, "loss": 0.1812, "step": 7717 }, { "epoch": 1.6129571577847441, "grad_norm": 1.106382966805849, "learning_rate": 1.7155904744764663e-05, "loss": 0.1379, "step": 7718 }, { "epoch": 1.613166144200627, "grad_norm": 1.0058365818371453, "learning_rate": 1.715511668375947e-05, "loss": 0.1233, "step": 7719 }, { "epoch": 1.61337513061651, "grad_norm": 0.8902676728731688, "learning_rate": 1.715432853169411e-05, "loss": 0.1641, "step": 7720 }, { "epoch": 1.613584117032393, "grad_norm": 0.957982432177541, "learning_rate": 1.7153540288578616e-05, "loss": 0.1849, "step": 7721 }, { "epoch": 1.613793103448276, "grad_norm": 2.2389794682166277, "learning_rate": 1.7152751954423015e-05, "loss": 0.1749, "step": 7722 }, { "epoch": 1.614002089864159, "grad_norm": 0.9822721661333197, "learning_rate": 1.7151963529237342e-05, "loss": 0.1514, "step": 7723 }, { "epoch": 1.614211076280042, "grad_norm": 1.0116746243101298, "learning_rate": 1.715117501303163e-05, "loss": 0.1818, "step": 7724 }, { "epoch": 1.6144200626959249, "grad_norm": 1.0380200197450413, "learning_rate": 1.7150386405815914e-05, "loss": 0.1789, "step": 7725 }, { "epoch": 1.6146290491118078, "grad_norm": 0.7786940415415132, "learning_rate": 1.714959770760023e-05, "loss": 0.1276, "step": 7726 }, { "epoch": 1.6148380355276908, "grad_norm": 0.9266172699687608, "learning_rate": 1.714880891839462e-05, "loss": 0.1342, "step": 7727 }, { "epoch": 1.6150470219435737, "grad_norm": 0.9773998145697754, "learning_rate": 1.7148020038209115e-05, "loss": 0.1633, "step": 7728 }, { "epoch": 1.6152560083594567, "grad_norm": 0.8832655280264405, "learning_rate": 1.7147231067053756e-05, "loss": 0.1599, "step": 7729 }, { "epoch": 1.6154649947753397, "grad_norm": 1.0313617444172922, "learning_rate": 1.714644200493859e-05, "loss": 0.1955, "step": 7730 }, { "epoch": 1.6156739811912226, "grad_norm": 1.3034020165888587, "learning_rate": 1.714565285187366e-05, "loss": 0.1855, "step": 7731 }, { "epoch": 1.6158829676071056, "grad_norm": 0.9341947994359813, "learning_rate": 1.7144863607868995e-05, "loss": 0.1572, "step": 7732 }, { "epoch": 1.6160919540229886, "grad_norm": 0.8255938959295216, "learning_rate": 1.7144074272934656e-05, "loss": 0.1567, "step": 7733 }, { "epoch": 1.6163009404388715, "grad_norm": 0.9965358829583028, "learning_rate": 1.7143284847080678e-05, "loss": 0.1878, "step": 7734 }, { "epoch": 1.6165099268547545, "grad_norm": 1.1478709020044695, "learning_rate": 1.7142495330317113e-05, "loss": 0.1662, "step": 7735 }, { "epoch": 1.6167189132706374, "grad_norm": 1.0701132973231173, "learning_rate": 1.714170572265401e-05, "loss": 0.1846, "step": 7736 }, { "epoch": 1.6169278996865204, "grad_norm": 0.7905608173871654, "learning_rate": 1.7140916024101412e-05, "loss": 0.1508, "step": 7737 }, { "epoch": 1.6171368861024034, "grad_norm": 0.741740429257705, "learning_rate": 1.7140126234669373e-05, "loss": 0.1459, "step": 7738 }, { "epoch": 1.6173458725182863, "grad_norm": 1.0992540104337918, "learning_rate": 1.713933635436794e-05, "loss": 0.1808, "step": 7739 }, { "epoch": 1.6175548589341693, "grad_norm": 0.9605663933965433, "learning_rate": 1.7138546383207173e-05, "loss": 0.1663, "step": 7740 }, { "epoch": 1.6177638453500522, "grad_norm": 0.8012252017429196, "learning_rate": 1.713775632119712e-05, "loss": 0.1535, "step": 7741 }, { "epoch": 1.6179728317659352, "grad_norm": 0.8384255973011882, "learning_rate": 1.7136966168347838e-05, "loss": 0.1678, "step": 7742 }, { "epoch": 1.6181818181818182, "grad_norm": 0.9251789533834422, "learning_rate": 1.7136175924669384e-05, "loss": 0.1654, "step": 7743 }, { "epoch": 1.6183908045977011, "grad_norm": 0.968733291208598, "learning_rate": 1.7135385590171813e-05, "loss": 0.1822, "step": 7744 }, { "epoch": 1.618599791013584, "grad_norm": 0.9355996076886367, "learning_rate": 1.7134595164865185e-05, "loss": 0.1812, "step": 7745 }, { "epoch": 1.618808777429467, "grad_norm": 0.9073823411283841, "learning_rate": 1.7133804648759556e-05, "loss": 0.1577, "step": 7746 }, { "epoch": 1.61901776384535, "grad_norm": 0.945478919922188, "learning_rate": 1.713301404186499e-05, "loss": 0.1396, "step": 7747 }, { "epoch": 1.619226750261233, "grad_norm": 1.1555288924663865, "learning_rate": 1.7132223344191548e-05, "loss": 0.1931, "step": 7748 }, { "epoch": 1.619435736677116, "grad_norm": 0.883930110386919, "learning_rate": 1.713143255574929e-05, "loss": 0.1544, "step": 7749 }, { "epoch": 1.619644723092999, "grad_norm": 0.9835366912232749, "learning_rate": 1.7130641676548285e-05, "loss": 0.1333, "step": 7750 }, { "epoch": 1.6198537095088819, "grad_norm": 0.9347117818918995, "learning_rate": 1.712985070659859e-05, "loss": 0.1678, "step": 7751 }, { "epoch": 1.6200626959247648, "grad_norm": 0.9953536227064, "learning_rate": 1.7129059645910286e-05, "loss": 0.161, "step": 7752 }, { "epoch": 1.6202716823406478, "grad_norm": 0.9444610250497396, "learning_rate": 1.7128268494493426e-05, "loss": 0.1708, "step": 7753 }, { "epoch": 1.6204806687565307, "grad_norm": 0.8862067750878447, "learning_rate": 1.7127477252358087e-05, "loss": 0.1604, "step": 7754 }, { "epoch": 1.6206896551724137, "grad_norm": 0.9517355757997117, "learning_rate": 1.712668591951433e-05, "loss": 0.1598, "step": 7755 }, { "epoch": 1.6208986415882967, "grad_norm": 0.8265466678249963, "learning_rate": 1.712589449597224e-05, "loss": 0.1776, "step": 7756 }, { "epoch": 1.6211076280041796, "grad_norm": 1.0426896637152399, "learning_rate": 1.7125102981741877e-05, "loss": 0.2091, "step": 7757 }, { "epoch": 1.6213166144200626, "grad_norm": 1.0284716784180303, "learning_rate": 1.7124311376833315e-05, "loss": 0.1854, "step": 7758 }, { "epoch": 1.6215256008359455, "grad_norm": 0.9073883356505276, "learning_rate": 1.7123519681256638e-05, "loss": 0.1068, "step": 7759 }, { "epoch": 1.6217345872518285, "grad_norm": 0.9516058050619375, "learning_rate": 1.7122727895021912e-05, "loss": 0.1599, "step": 7760 }, { "epoch": 1.6219435736677115, "grad_norm": 0.9295718213371245, "learning_rate": 1.7121936018139215e-05, "loss": 0.1763, "step": 7761 }, { "epoch": 1.6221525600835944, "grad_norm": 1.174354475880124, "learning_rate": 1.712114405061863e-05, "loss": 0.1828, "step": 7762 }, { "epoch": 1.6223615464994774, "grad_norm": 1.1145079648061393, "learning_rate": 1.712035199247023e-05, "loss": 0.1609, "step": 7763 }, { "epoch": 1.6225705329153604, "grad_norm": 0.8987447134335894, "learning_rate": 1.7119559843704103e-05, "loss": 0.1501, "step": 7764 }, { "epoch": 1.6227795193312433, "grad_norm": 0.9687157137108368, "learning_rate": 1.7118767604330323e-05, "loss": 0.1449, "step": 7765 }, { "epoch": 1.6229885057471263, "grad_norm": 0.9095975473358674, "learning_rate": 1.7117975274358975e-05, "loss": 0.1788, "step": 7766 }, { "epoch": 1.6231974921630095, "grad_norm": 0.9563327042233751, "learning_rate": 1.7117182853800144e-05, "loss": 0.1825, "step": 7767 }, { "epoch": 1.6234064785788924, "grad_norm": 0.932833775628289, "learning_rate": 1.711639034266391e-05, "loss": 0.1793, "step": 7768 }, { "epoch": 1.6236154649947754, "grad_norm": 0.8579182776051949, "learning_rate": 1.7115597740960364e-05, "loss": 0.1498, "step": 7769 }, { "epoch": 1.6238244514106583, "grad_norm": 0.8127160729437776, "learning_rate": 1.711480504869959e-05, "loss": 0.1267, "step": 7770 }, { "epoch": 1.6240334378265413, "grad_norm": 0.8189850878202303, "learning_rate": 1.711401226589168e-05, "loss": 0.1725, "step": 7771 }, { "epoch": 1.6242424242424243, "grad_norm": 0.9456021128298315, "learning_rate": 1.711321939254672e-05, "loss": 0.1714, "step": 7772 }, { "epoch": 1.6244514106583072, "grad_norm": 0.9237258955429143, "learning_rate": 1.7112426428674803e-05, "loss": 0.1401, "step": 7773 }, { "epoch": 1.6246603970741902, "grad_norm": 0.9511703287255119, "learning_rate": 1.711163337428602e-05, "loss": 0.1397, "step": 7774 }, { "epoch": 1.6248693834900731, "grad_norm": 0.7659273959618458, "learning_rate": 1.711084022939046e-05, "loss": 0.1429, "step": 7775 }, { "epoch": 1.625078369905956, "grad_norm": 0.8224772583859219, "learning_rate": 1.7110046993998225e-05, "loss": 0.17, "step": 7776 }, { "epoch": 1.625287356321839, "grad_norm": 0.9917983900569083, "learning_rate": 1.71092536681194e-05, "loss": 0.1641, "step": 7777 }, { "epoch": 1.625496342737722, "grad_norm": 0.9348023570005635, "learning_rate": 1.7108460251764092e-05, "loss": 0.1625, "step": 7778 }, { "epoch": 1.625705329153605, "grad_norm": 1.2705930737960447, "learning_rate": 1.710766674494239e-05, "loss": 0.1872, "step": 7779 }, { "epoch": 1.625914315569488, "grad_norm": 0.9828883545060889, "learning_rate": 1.71068731476644e-05, "loss": 0.1704, "step": 7780 }, { "epoch": 1.626123301985371, "grad_norm": 1.0501407893310573, "learning_rate": 1.710607945994021e-05, "loss": 0.1302, "step": 7781 }, { "epoch": 1.6263322884012539, "grad_norm": 1.3585502500041802, "learning_rate": 1.7105285681779936e-05, "loss": 0.1516, "step": 7782 }, { "epoch": 1.6265412748171368, "grad_norm": 0.8173857098238289, "learning_rate": 1.710449181319367e-05, "loss": 0.1445, "step": 7783 }, { "epoch": 1.6267502612330198, "grad_norm": 1.059233566522327, "learning_rate": 1.7103697854191516e-05, "loss": 0.1746, "step": 7784 }, { "epoch": 1.626959247648903, "grad_norm": 1.0543468907990392, "learning_rate": 1.7102903804783584e-05, "loss": 0.156, "step": 7785 }, { "epoch": 1.627168234064786, "grad_norm": 0.8611299167540081, "learning_rate": 1.7102109664979974e-05, "loss": 0.157, "step": 7786 }, { "epoch": 1.627377220480669, "grad_norm": 0.9610319397146708, "learning_rate": 1.7101315434790797e-05, "loss": 0.1837, "step": 7787 }, { "epoch": 1.6275862068965519, "grad_norm": 1.1086903745982417, "learning_rate": 1.7100521114226155e-05, "loss": 0.195, "step": 7788 }, { "epoch": 1.6277951933124348, "grad_norm": 0.9142964556756142, "learning_rate": 1.709972670329616e-05, "loss": 0.1619, "step": 7789 }, { "epoch": 1.6280041797283178, "grad_norm": 0.8801598375955225, "learning_rate": 1.7098932202010928e-05, "loss": 0.1267, "step": 7790 }, { "epoch": 1.6282131661442008, "grad_norm": 0.9847750653600505, "learning_rate": 1.7098137610380557e-05, "loss": 0.1565, "step": 7791 }, { "epoch": 1.6284221525600837, "grad_norm": 0.7645409888252523, "learning_rate": 1.7097342928415173e-05, "loss": 0.1568, "step": 7792 }, { "epoch": 1.6286311389759667, "grad_norm": 0.8761896433232205, "learning_rate": 1.7096548156124885e-05, "loss": 0.131, "step": 7793 }, { "epoch": 1.6288401253918496, "grad_norm": 1.0288450649882046, "learning_rate": 1.70957532935198e-05, "loss": 0.1798, "step": 7794 }, { "epoch": 1.6290491118077326, "grad_norm": 1.009505116174857, "learning_rate": 1.7094958340610044e-05, "loss": 0.1526, "step": 7795 }, { "epoch": 1.6292580982236156, "grad_norm": 0.9867501660087198, "learning_rate": 1.709416329740573e-05, "loss": 0.1691, "step": 7796 }, { "epoch": 1.6294670846394985, "grad_norm": 0.7908194421704704, "learning_rate": 1.709336816391698e-05, "loss": 0.1488, "step": 7797 }, { "epoch": 1.6296760710553815, "grad_norm": 0.9700111782095324, "learning_rate": 1.7092572940153906e-05, "loss": 0.1438, "step": 7798 }, { "epoch": 1.6298850574712644, "grad_norm": 0.8976929832751895, "learning_rate": 1.7091777626126634e-05, "loss": 0.1343, "step": 7799 }, { "epoch": 1.6300940438871474, "grad_norm": 0.9451559372402327, "learning_rate": 1.709098222184528e-05, "loss": 0.1464, "step": 7800 }, { "epoch": 1.6303030303030304, "grad_norm": 0.9510823378518649, "learning_rate": 1.7090186727319978e-05, "loss": 0.177, "step": 7801 }, { "epoch": 1.6305120167189133, "grad_norm": 0.9775805228379388, "learning_rate": 1.7089391142560844e-05, "loss": 0.2013, "step": 7802 }, { "epoch": 1.6307210031347963, "grad_norm": 0.970875791386719, "learning_rate": 1.7088595467578e-05, "loss": 0.1845, "step": 7803 }, { "epoch": 1.6309299895506792, "grad_norm": 0.8809240979951294, "learning_rate": 1.7087799702381577e-05, "loss": 0.1719, "step": 7804 }, { "epoch": 1.6311389759665622, "grad_norm": 0.8105136342333468, "learning_rate": 1.70870038469817e-05, "loss": 0.1816, "step": 7805 }, { "epoch": 1.6313479623824452, "grad_norm": 0.8685483415147863, "learning_rate": 1.7086207901388503e-05, "loss": 0.1953, "step": 7806 }, { "epoch": 1.6315569487983281, "grad_norm": 1.0556202104764492, "learning_rate": 1.7085411865612108e-05, "loss": 0.1687, "step": 7807 }, { "epoch": 1.631765935214211, "grad_norm": 1.0780293251960769, "learning_rate": 1.708461573966265e-05, "loss": 0.1899, "step": 7808 }, { "epoch": 1.631974921630094, "grad_norm": 1.1924125040987774, "learning_rate": 1.708381952355026e-05, "loss": 0.1785, "step": 7809 }, { "epoch": 1.632183908045977, "grad_norm": 0.9346894147878604, "learning_rate": 1.7083023217285078e-05, "loss": 0.1705, "step": 7810 }, { "epoch": 1.63239289446186, "grad_norm": 0.969610948082345, "learning_rate": 1.7082226820877225e-05, "loss": 0.1504, "step": 7811 }, { "epoch": 1.632601880877743, "grad_norm": 0.9682902358083176, "learning_rate": 1.7081430334336844e-05, "loss": 0.1793, "step": 7812 }, { "epoch": 1.632810867293626, "grad_norm": 1.1397130913614264, "learning_rate": 1.7080633757674072e-05, "loss": 0.1892, "step": 7813 }, { "epoch": 1.6330198537095089, "grad_norm": 0.8630232982140075, "learning_rate": 1.7079837090899046e-05, "loss": 0.1614, "step": 7814 }, { "epoch": 1.6332288401253918, "grad_norm": 0.9028039206475659, "learning_rate": 1.7079040334021903e-05, "loss": 0.1581, "step": 7815 }, { "epoch": 1.6334378265412748, "grad_norm": 0.8532827185794298, "learning_rate": 1.7078243487052787e-05, "loss": 0.1576, "step": 7816 }, { "epoch": 1.6336468129571577, "grad_norm": 0.878980648137543, "learning_rate": 1.707744655000183e-05, "loss": 0.1808, "step": 7817 }, { "epoch": 1.6338557993730407, "grad_norm": 1.27464053520213, "learning_rate": 1.7076649522879187e-05, "loss": 0.1735, "step": 7818 }, { "epoch": 1.6340647857889237, "grad_norm": 0.9075117108586985, "learning_rate": 1.707585240569499e-05, "loss": 0.1451, "step": 7819 }, { "epoch": 1.6342737722048066, "grad_norm": 0.8781878887150736, "learning_rate": 1.7075055198459395e-05, "loss": 0.142, "step": 7820 }, { "epoch": 1.6344827586206896, "grad_norm": 0.8655647720348393, "learning_rate": 1.707425790118254e-05, "loss": 0.1557, "step": 7821 }, { "epoch": 1.6346917450365726, "grad_norm": 0.9882625051988658, "learning_rate": 1.707346051387457e-05, "loss": 0.1409, "step": 7822 }, { "epoch": 1.6349007314524555, "grad_norm": 0.925937700181548, "learning_rate": 1.7072663036545635e-05, "loss": 0.1586, "step": 7823 }, { "epoch": 1.6351097178683385, "grad_norm": 0.88565382797508, "learning_rate": 1.7071865469205887e-05, "loss": 0.1922, "step": 7824 }, { "epoch": 1.6353187042842214, "grad_norm": 0.9451908519792415, "learning_rate": 1.7071067811865477e-05, "loss": 0.1725, "step": 7825 }, { "epoch": 1.6355276907001044, "grad_norm": 1.1264074913268698, "learning_rate": 1.7070270064534552e-05, "loss": 0.2067, "step": 7826 }, { "epoch": 1.6357366771159874, "grad_norm": 0.8988847027956438, "learning_rate": 1.706947222722327e-05, "loss": 0.1549, "step": 7827 }, { "epoch": 1.6359456635318703, "grad_norm": 0.984945918060575, "learning_rate": 1.706867429994178e-05, "loss": 0.1659, "step": 7828 }, { "epoch": 1.6361546499477533, "grad_norm": 0.996137065395247, "learning_rate": 1.7067876282700235e-05, "loss": 0.1419, "step": 7829 }, { "epoch": 1.6363636363636362, "grad_norm": 1.0509040344708436, "learning_rate": 1.70670781755088e-05, "loss": 0.1774, "step": 7830 }, { "epoch": 1.6365726227795192, "grad_norm": 1.0461103333144177, "learning_rate": 1.706627997837762e-05, "loss": 0.1899, "step": 7831 }, { "epoch": 1.6367816091954022, "grad_norm": 1.0979925105540027, "learning_rate": 1.7065481691316866e-05, "loss": 0.1799, "step": 7832 }, { "epoch": 1.6369905956112851, "grad_norm": 0.9421862267866483, "learning_rate": 1.7064683314336688e-05, "loss": 0.1672, "step": 7833 }, { "epoch": 1.637199582027168, "grad_norm": 1.0415935119927322, "learning_rate": 1.7063884847447255e-05, "loss": 0.1752, "step": 7834 }, { "epoch": 1.637408568443051, "grad_norm": 0.8647438743741428, "learning_rate": 1.7063086290658717e-05, "loss": 0.1767, "step": 7835 }, { "epoch": 1.637617554858934, "grad_norm": 0.9971360573207545, "learning_rate": 1.706228764398125e-05, "loss": 0.1595, "step": 7836 }, { "epoch": 1.637826541274817, "grad_norm": 1.1710500797070769, "learning_rate": 1.7061488907425007e-05, "loss": 0.1811, "step": 7837 }, { "epoch": 1.6380355276907002, "grad_norm": 1.252817849446988, "learning_rate": 1.7060690081000158e-05, "loss": 0.1629, "step": 7838 }, { "epoch": 1.6382445141065831, "grad_norm": 1.0740232557711977, "learning_rate": 1.7059891164716867e-05, "loss": 0.1452, "step": 7839 }, { "epoch": 1.638453500522466, "grad_norm": 1.0744835035731113, "learning_rate": 1.7059092158585312e-05, "loss": 0.1516, "step": 7840 }, { "epoch": 1.638662486938349, "grad_norm": 1.0252615101233808, "learning_rate": 1.7058293062615647e-05, "loss": 0.1158, "step": 7841 }, { "epoch": 1.638871473354232, "grad_norm": 1.1655243198306062, "learning_rate": 1.705749387681805e-05, "loss": 0.1629, "step": 7842 }, { "epoch": 1.639080459770115, "grad_norm": 0.9253452990330671, "learning_rate": 1.705669460120269e-05, "loss": 0.177, "step": 7843 }, { "epoch": 1.639289446185998, "grad_norm": 0.933798983868853, "learning_rate": 1.705589523577974e-05, "loss": 0.1722, "step": 7844 }, { "epoch": 1.6394984326018809, "grad_norm": 1.0499987386507244, "learning_rate": 1.705509578055937e-05, "loss": 0.1561, "step": 7845 }, { "epoch": 1.6397074190177638, "grad_norm": 0.8155913352199563, "learning_rate": 1.7054296235551758e-05, "loss": 0.1837, "step": 7846 }, { "epoch": 1.6399164054336468, "grad_norm": 0.9640836603973144, "learning_rate": 1.7053496600767077e-05, "loss": 0.1847, "step": 7847 }, { "epoch": 1.6401253918495298, "grad_norm": 0.9603619738272322, "learning_rate": 1.7052696876215505e-05, "loss": 0.1589, "step": 7848 }, { "epoch": 1.6403343782654127, "grad_norm": 1.0516894442516949, "learning_rate": 1.705189706190722e-05, "loss": 0.1438, "step": 7849 }, { "epoch": 1.6405433646812957, "grad_norm": 0.8745482439574287, "learning_rate": 1.7051097157852398e-05, "loss": 0.1404, "step": 7850 }, { "epoch": 1.6407523510971787, "grad_norm": 1.2470608366639502, "learning_rate": 1.7050297164061222e-05, "loss": 0.1677, "step": 7851 }, { "epoch": 1.6409613375130616, "grad_norm": 0.9485569085332815, "learning_rate": 1.7049497080543877e-05, "loss": 0.1355, "step": 7852 }, { "epoch": 1.6411703239289446, "grad_norm": 0.9360798560676834, "learning_rate": 1.7048696907310537e-05, "loss": 0.1722, "step": 7853 }, { "epoch": 1.6413793103448275, "grad_norm": 0.7826824607166512, "learning_rate": 1.7047896644371393e-05, "loss": 0.1719, "step": 7854 }, { "epoch": 1.6415882967607107, "grad_norm": 0.9467078829971949, "learning_rate": 1.704709629173662e-05, "loss": 0.1661, "step": 7855 }, { "epoch": 1.6417972831765937, "grad_norm": 1.0241709465006281, "learning_rate": 1.7046295849416413e-05, "loss": 0.1857, "step": 7856 }, { "epoch": 1.6420062695924766, "grad_norm": 0.9461247686842279, "learning_rate": 1.7045495317420956e-05, "loss": 0.167, "step": 7857 }, { "epoch": 1.6422152560083596, "grad_norm": 1.068019270237359, "learning_rate": 1.7044694695760435e-05, "loss": 0.1312, "step": 7858 }, { "epoch": 1.6424242424242426, "grad_norm": 1.2298678634011764, "learning_rate": 1.7043893984445038e-05, "loss": 0.1908, "step": 7859 }, { "epoch": 1.6426332288401255, "grad_norm": 0.8986961301935124, "learning_rate": 1.7043093183484965e-05, "loss": 0.16, "step": 7860 }, { "epoch": 1.6428422152560085, "grad_norm": 1.1462881187077816, "learning_rate": 1.7042292292890396e-05, "loss": 0.1745, "step": 7861 }, { "epoch": 1.6430512016718914, "grad_norm": 0.9891166448770409, "learning_rate": 1.704149131267153e-05, "loss": 0.1498, "step": 7862 }, { "epoch": 1.6432601880877744, "grad_norm": 0.9046951096805571, "learning_rate": 1.704069024283856e-05, "loss": 0.1835, "step": 7863 }, { "epoch": 1.6434691745036574, "grad_norm": 0.8680772956934498, "learning_rate": 1.7039889083401675e-05, "loss": 0.1375, "step": 7864 }, { "epoch": 1.6436781609195403, "grad_norm": 1.0645915862473927, "learning_rate": 1.703908783437108e-05, "loss": 0.1844, "step": 7865 }, { "epoch": 1.6438871473354233, "grad_norm": 0.773760110949866, "learning_rate": 1.7038286495756967e-05, "loss": 0.1528, "step": 7866 }, { "epoch": 1.6440961337513063, "grad_norm": 1.0452611580412408, "learning_rate": 1.7037485067569536e-05, "loss": 0.1562, "step": 7867 }, { "epoch": 1.6443051201671892, "grad_norm": 0.9132160224203599, "learning_rate": 1.7036683549818982e-05, "loss": 0.1444, "step": 7868 }, { "epoch": 1.6445141065830722, "grad_norm": 0.8635159164182885, "learning_rate": 1.7035881942515514e-05, "loss": 0.1636, "step": 7869 }, { "epoch": 1.6447230929989551, "grad_norm": 0.9387990061526066, "learning_rate": 1.7035080245669327e-05, "loss": 0.1275, "step": 7870 }, { "epoch": 1.644932079414838, "grad_norm": 0.9276714933569765, "learning_rate": 1.7034278459290626e-05, "loss": 0.1714, "step": 7871 }, { "epoch": 1.645141065830721, "grad_norm": 0.9034351817577901, "learning_rate": 1.7033476583389616e-05, "loss": 0.1407, "step": 7872 }, { "epoch": 1.645350052246604, "grad_norm": 0.8921748104995328, "learning_rate": 1.7032674617976498e-05, "loss": 0.1674, "step": 7873 }, { "epoch": 1.645559038662487, "grad_norm": 0.9244409929092229, "learning_rate": 1.703187256306148e-05, "loss": 0.1511, "step": 7874 }, { "epoch": 1.64576802507837, "grad_norm": 0.9504466446449474, "learning_rate": 1.7031070418654777e-05, "loss": 0.1661, "step": 7875 }, { "epoch": 1.645977011494253, "grad_norm": 1.2812369227056362, "learning_rate": 1.703026818476659e-05, "loss": 0.1715, "step": 7876 }, { "epoch": 1.6461859979101359, "grad_norm": 0.8491383772990572, "learning_rate": 1.7029465861407125e-05, "loss": 0.1492, "step": 7877 }, { "epoch": 1.6463949843260188, "grad_norm": 1.1522207969529061, "learning_rate": 1.7028663448586603e-05, "loss": 0.1871, "step": 7878 }, { "epoch": 1.6466039707419018, "grad_norm": 1.9191412755557506, "learning_rate": 1.7027860946315228e-05, "loss": 0.1623, "step": 7879 }, { "epoch": 1.6468129571577848, "grad_norm": 1.0530062547743313, "learning_rate": 1.7027058354603215e-05, "loss": 0.1556, "step": 7880 }, { "epoch": 1.6470219435736677, "grad_norm": 0.9552706402696469, "learning_rate": 1.7026255673460782e-05, "loss": 0.1439, "step": 7881 }, { "epoch": 1.6472309299895507, "grad_norm": 1.0824008090315314, "learning_rate": 1.7025452902898138e-05, "loss": 0.1516, "step": 7882 }, { "epoch": 1.6474399164054336, "grad_norm": 1.0205261598130173, "learning_rate": 1.7024650042925505e-05, "loss": 0.1634, "step": 7883 }, { "epoch": 1.6476489028213166, "grad_norm": 1.1652074395483756, "learning_rate": 1.70238470935531e-05, "loss": 0.1524, "step": 7884 }, { "epoch": 1.6478578892371996, "grad_norm": 1.417007061210674, "learning_rate": 1.702304405479114e-05, "loss": 0.1523, "step": 7885 }, { "epoch": 1.6480668756530825, "grad_norm": 1.0460815778301038, "learning_rate": 1.7022240926649844e-05, "loss": 0.1661, "step": 7886 }, { "epoch": 1.6482758620689655, "grad_norm": 0.8810411360022298, "learning_rate": 1.7021437709139433e-05, "loss": 0.1512, "step": 7887 }, { "epoch": 1.6484848484848484, "grad_norm": 0.9889717433715491, "learning_rate": 1.7020634402270137e-05, "loss": 0.1632, "step": 7888 }, { "epoch": 1.6486938349007314, "grad_norm": 1.0978349722028913, "learning_rate": 1.7019831006052166e-05, "loss": 0.2073, "step": 7889 }, { "epoch": 1.6489028213166144, "grad_norm": 1.020463545173347, "learning_rate": 1.7019027520495754e-05, "loss": 0.167, "step": 7890 }, { "epoch": 1.6491118077324973, "grad_norm": 0.9626321958734758, "learning_rate": 1.701822394561112e-05, "loss": 0.1473, "step": 7891 }, { "epoch": 1.6493207941483803, "grad_norm": 0.9827281847683224, "learning_rate": 1.7017420281408505e-05, "loss": 0.1837, "step": 7892 }, { "epoch": 1.6495297805642632, "grad_norm": 1.0242609590683367, "learning_rate": 1.701661652789812e-05, "loss": 0.1601, "step": 7893 }, { "epoch": 1.6497387669801462, "grad_norm": 0.9097155424329295, "learning_rate": 1.7015812685090202e-05, "loss": 0.1861, "step": 7894 }, { "epoch": 1.6499477533960292, "grad_norm": 1.0368405066891546, "learning_rate": 1.701500875299498e-05, "loss": 0.1996, "step": 7895 }, { "epoch": 1.6501567398119121, "grad_norm": 0.9603578613517562, "learning_rate": 1.7014204731622687e-05, "loss": 0.1825, "step": 7896 }, { "epoch": 1.650365726227795, "grad_norm": 1.1771955468589903, "learning_rate": 1.7013400620983552e-05, "loss": 0.1773, "step": 7897 }, { "epoch": 1.650574712643678, "grad_norm": 1.2749785997707994, "learning_rate": 1.701259642108781e-05, "loss": 0.1523, "step": 7898 }, { "epoch": 1.650783699059561, "grad_norm": 0.9593478287846022, "learning_rate": 1.7011792131945697e-05, "loss": 0.1905, "step": 7899 }, { "epoch": 1.650992685475444, "grad_norm": 0.9281214431066235, "learning_rate": 1.7010987753567447e-05, "loss": 0.1951, "step": 7900 }, { "epoch": 1.651201671891327, "grad_norm": 0.9379023633801559, "learning_rate": 1.70101832859633e-05, "loss": 0.1757, "step": 7901 }, { "epoch": 1.65141065830721, "grad_norm": 0.9552091197378695, "learning_rate": 1.700937872914349e-05, "loss": 0.1421, "step": 7902 }, { "epoch": 1.6516196447230929, "grad_norm": 1.1342169912047733, "learning_rate": 1.700857408311826e-05, "loss": 0.1588, "step": 7903 }, { "epoch": 1.6518286311389758, "grad_norm": 1.020204372572752, "learning_rate": 1.7007769347897846e-05, "loss": 0.1388, "step": 7904 }, { "epoch": 1.6520376175548588, "grad_norm": 1.1001307074068658, "learning_rate": 1.7006964523492494e-05, "loss": 0.1805, "step": 7905 }, { "epoch": 1.6522466039707417, "grad_norm": 0.8577249750194799, "learning_rate": 1.7006159609912444e-05, "loss": 0.1594, "step": 7906 }, { "epoch": 1.6524555903866247, "grad_norm": 0.9357370258558481, "learning_rate": 1.700535460716794e-05, "loss": 0.1526, "step": 7907 }, { "epoch": 1.652664576802508, "grad_norm": 0.8868624665877426, "learning_rate": 1.7004549515269227e-05, "loss": 0.1372, "step": 7908 }, { "epoch": 1.6528735632183909, "grad_norm": 1.0659957405995568, "learning_rate": 1.7003744334226558e-05, "loss": 0.1481, "step": 7909 }, { "epoch": 1.6530825496342738, "grad_norm": 0.8873812974081915, "learning_rate": 1.7002939064050168e-05, "loss": 0.1427, "step": 7910 }, { "epoch": 1.6532915360501568, "grad_norm": 0.8396731892965976, "learning_rate": 1.7002133704750316e-05, "loss": 0.1576, "step": 7911 }, { "epoch": 1.6535005224660397, "grad_norm": 0.9394342287108391, "learning_rate": 1.7001328256337238e-05, "loss": 0.1905, "step": 7912 }, { "epoch": 1.6537095088819227, "grad_norm": 0.8865010819789818, "learning_rate": 1.70005227188212e-05, "loss": 0.1692, "step": 7913 }, { "epoch": 1.6539184952978057, "grad_norm": 0.9925644676157291, "learning_rate": 1.6999717092212444e-05, "loss": 0.1929, "step": 7914 }, { "epoch": 1.6541274817136886, "grad_norm": 1.0850908406660782, "learning_rate": 1.6998911376521227e-05, "loss": 0.156, "step": 7915 }, { "epoch": 1.6543364681295716, "grad_norm": 0.832533302971601, "learning_rate": 1.6998105571757803e-05, "loss": 0.1614, "step": 7916 }, { "epoch": 1.6545454545454545, "grad_norm": 1.0172620679209083, "learning_rate": 1.6997299677932426e-05, "loss": 0.2108, "step": 7917 }, { "epoch": 1.6547544409613375, "grad_norm": 0.988038498561922, "learning_rate": 1.699649369505535e-05, "loss": 0.1448, "step": 7918 }, { "epoch": 1.6549634273772205, "grad_norm": 1.0946109221120488, "learning_rate": 1.6995687623136835e-05, "loss": 0.1966, "step": 7919 }, { "epoch": 1.6551724137931034, "grad_norm": 1.0667259168003194, "learning_rate": 1.6994881462187143e-05, "loss": 0.1711, "step": 7920 }, { "epoch": 1.6553814002089864, "grad_norm": 1.0396289682408166, "learning_rate": 1.6994075212216528e-05, "loss": 0.1809, "step": 7921 }, { "epoch": 1.6555903866248693, "grad_norm": 0.9910944652822197, "learning_rate": 1.6993268873235252e-05, "loss": 0.1472, "step": 7922 }, { "epoch": 1.6557993730407523, "grad_norm": 1.124054729626566, "learning_rate": 1.699246244525358e-05, "loss": 0.1794, "step": 7923 }, { "epoch": 1.6560083594566353, "grad_norm": 1.1013156366155612, "learning_rate": 1.6991655928281766e-05, "loss": 0.206, "step": 7924 }, { "epoch": 1.6562173458725182, "grad_norm": 0.9852720778754168, "learning_rate": 1.6990849322330088e-05, "loss": 0.1574, "step": 7925 }, { "epoch": 1.6564263322884014, "grad_norm": 0.9669314493818435, "learning_rate": 1.69900426274088e-05, "loss": 0.1867, "step": 7926 }, { "epoch": 1.6566353187042844, "grad_norm": 0.9555664334830535, "learning_rate": 1.6989235843528175e-05, "loss": 0.1777, "step": 7927 }, { "epoch": 1.6568443051201673, "grad_norm": 0.8830551352709382, "learning_rate": 1.6988428970698475e-05, "loss": 0.1666, "step": 7928 }, { "epoch": 1.6570532915360503, "grad_norm": 0.9527699164874638, "learning_rate": 1.6987622008929977e-05, "loss": 0.1859, "step": 7929 }, { "epoch": 1.6572622779519333, "grad_norm": 0.9811072711742657, "learning_rate": 1.6986814958232943e-05, "loss": 0.1623, "step": 7930 }, { "epoch": 1.6574712643678162, "grad_norm": 0.8005019617793034, "learning_rate": 1.6986007818617647e-05, "loss": 0.1672, "step": 7931 }, { "epoch": 1.6576802507836992, "grad_norm": 1.0636342474709433, "learning_rate": 1.6985200590094364e-05, "loss": 0.1556, "step": 7932 }, { "epoch": 1.6578892371995821, "grad_norm": 1.014893268533812, "learning_rate": 1.698439327267336e-05, "loss": 0.2145, "step": 7933 }, { "epoch": 1.658098223615465, "grad_norm": 0.96367220870958, "learning_rate": 1.6983585866364916e-05, "loss": 0.1543, "step": 7934 }, { "epoch": 1.658307210031348, "grad_norm": 1.2110497601568633, "learning_rate": 1.6982778371179307e-05, "loss": 0.1958, "step": 7935 }, { "epoch": 1.658516196447231, "grad_norm": 0.899091830821133, "learning_rate": 1.6981970787126805e-05, "loss": 0.156, "step": 7936 }, { "epoch": 1.658725182863114, "grad_norm": 0.8268081765496192, "learning_rate": 1.698116311421769e-05, "loss": 0.1566, "step": 7937 }, { "epoch": 1.658934169278997, "grad_norm": 0.9197971718727949, "learning_rate": 1.6980355352462242e-05, "loss": 0.1794, "step": 7938 }, { "epoch": 1.65914315569488, "grad_norm": 0.7169932556377422, "learning_rate": 1.6979547501870742e-05, "loss": 0.161, "step": 7939 }, { "epoch": 1.6593521421107629, "grad_norm": 0.9591319984361786, "learning_rate": 1.697873956245347e-05, "loss": 0.1489, "step": 7940 }, { "epoch": 1.6595611285266458, "grad_norm": 1.1297906686079993, "learning_rate": 1.6977931534220712e-05, "loss": 0.1697, "step": 7941 }, { "epoch": 1.6597701149425288, "grad_norm": 1.4805399507802237, "learning_rate": 1.697712341718274e-05, "loss": 0.1431, "step": 7942 }, { "epoch": 1.6599791013584118, "grad_norm": 0.8706124112007569, "learning_rate": 1.6976315211349848e-05, "loss": 0.1742, "step": 7943 }, { "epoch": 1.6601880877742947, "grad_norm": 0.8676551238469719, "learning_rate": 1.6975506916732324e-05, "loss": 0.1651, "step": 7944 }, { "epoch": 1.6603970741901777, "grad_norm": 1.047853293177225, "learning_rate": 1.6974698533340445e-05, "loss": 0.1665, "step": 7945 }, { "epoch": 1.6606060606060606, "grad_norm": 1.1164921328838997, "learning_rate": 1.6973890061184506e-05, "loss": 0.1623, "step": 7946 }, { "epoch": 1.6608150470219436, "grad_norm": 1.0122295752605048, "learning_rate": 1.6973081500274796e-05, "loss": 0.2, "step": 7947 }, { "epoch": 1.6610240334378266, "grad_norm": 0.9019207405134305, "learning_rate": 1.6972272850621606e-05, "loss": 0.1527, "step": 7948 }, { "epoch": 1.6612330198537095, "grad_norm": 1.0330723221034657, "learning_rate": 1.697146411223522e-05, "loss": 0.1925, "step": 7949 }, { "epoch": 1.6614420062695925, "grad_norm": 1.0442374426265228, "learning_rate": 1.697065528512594e-05, "loss": 0.1897, "step": 7950 }, { "epoch": 1.6616509926854754, "grad_norm": 1.0086227937860994, "learning_rate": 1.6969846369304055e-05, "loss": 0.1885, "step": 7951 }, { "epoch": 1.6618599791013584, "grad_norm": 0.939006575994961, "learning_rate": 1.6969037364779856e-05, "loss": 0.1679, "step": 7952 }, { "epoch": 1.6620689655172414, "grad_norm": 1.0256400357602125, "learning_rate": 1.696822827156365e-05, "loss": 0.1833, "step": 7953 }, { "epoch": 1.6622779519331243, "grad_norm": 0.9923140656773171, "learning_rate": 1.696741908966572e-05, "loss": 0.1428, "step": 7954 }, { "epoch": 1.6624869383490073, "grad_norm": 0.8399092107238159, "learning_rate": 1.6966609819096373e-05, "loss": 0.1674, "step": 7955 }, { "epoch": 1.6626959247648903, "grad_norm": 1.275261592402789, "learning_rate": 1.696580045986591e-05, "loss": 0.1827, "step": 7956 }, { "epoch": 1.6629049111807732, "grad_norm": 1.0026815652623953, "learning_rate": 1.6964991011984625e-05, "loss": 0.1521, "step": 7957 }, { "epoch": 1.6631138975966562, "grad_norm": 1.013452625624775, "learning_rate": 1.696418147546282e-05, "loss": 0.1541, "step": 7958 }, { "epoch": 1.6633228840125391, "grad_norm": 0.8252833080386967, "learning_rate": 1.69633718503108e-05, "loss": 0.1872, "step": 7959 }, { "epoch": 1.663531870428422, "grad_norm": 0.8416799175772443, "learning_rate": 1.696256213653887e-05, "loss": 0.1632, "step": 7960 }, { "epoch": 1.663740856844305, "grad_norm": 0.7775432829825979, "learning_rate": 1.6961752334157336e-05, "loss": 0.1765, "step": 7961 }, { "epoch": 1.663949843260188, "grad_norm": 0.8482274192991982, "learning_rate": 1.69609424431765e-05, "loss": 0.1533, "step": 7962 }, { "epoch": 1.664158829676071, "grad_norm": 0.9041001639667374, "learning_rate": 1.696013246360667e-05, "loss": 0.1626, "step": 7963 }, { "epoch": 1.664367816091954, "grad_norm": 0.829808862982297, "learning_rate": 1.695932239545815e-05, "loss": 0.1504, "step": 7964 }, { "epoch": 1.664576802507837, "grad_norm": 0.9216532299467117, "learning_rate": 1.695851223874126e-05, "loss": 0.1921, "step": 7965 }, { "epoch": 1.6647857889237199, "grad_norm": 0.9038636358369875, "learning_rate": 1.6957701993466304e-05, "loss": 0.1599, "step": 7966 }, { "epoch": 1.6649947753396028, "grad_norm": 0.8465226174241502, "learning_rate": 1.6956891659643593e-05, "loss": 0.1521, "step": 7967 }, { "epoch": 1.6652037617554858, "grad_norm": 0.8746916713585335, "learning_rate": 1.6956081237283442e-05, "loss": 0.1703, "step": 7968 }, { "epoch": 1.6654127481713688, "grad_norm": 0.823555188687064, "learning_rate": 1.6955270726396165e-05, "loss": 0.1852, "step": 7969 }, { "epoch": 1.6656217345872517, "grad_norm": 1.174374752786731, "learning_rate": 1.6954460126992074e-05, "loss": 0.1479, "step": 7970 }, { "epoch": 1.6658307210031347, "grad_norm": 0.8505021062034721, "learning_rate": 1.6953649439081487e-05, "loss": 0.141, "step": 7971 }, { "epoch": 1.6660397074190176, "grad_norm": 1.2513084889641308, "learning_rate": 1.6952838662674724e-05, "loss": 0.156, "step": 7972 }, { "epoch": 1.6662486938349006, "grad_norm": 1.1176930286781077, "learning_rate": 1.69520277977821e-05, "loss": 0.2066, "step": 7973 }, { "epoch": 1.6664576802507836, "grad_norm": 1.0239671796410803, "learning_rate": 1.6951216844413937e-05, "loss": 0.1769, "step": 7974 }, { "epoch": 1.6666666666666665, "grad_norm": 0.7514998834958094, "learning_rate": 1.6950405802580553e-05, "loss": 0.1543, "step": 7975 }, { "epoch": 1.6668756530825495, "grad_norm": 0.9895973407654639, "learning_rate": 1.6949594672292267e-05, "loss": 0.1711, "step": 7976 }, { "epoch": 1.6670846394984324, "grad_norm": 1.0397843638571589, "learning_rate": 1.694878345355941e-05, "loss": 0.1551, "step": 7977 }, { "epoch": 1.6672936259143156, "grad_norm": 0.9511846808908473, "learning_rate": 1.6947972146392306e-05, "loss": 0.2024, "step": 7978 }, { "epoch": 1.6675026123301986, "grad_norm": 1.0151859065825888, "learning_rate": 1.694716075080127e-05, "loss": 0.1435, "step": 7979 }, { "epoch": 1.6677115987460815, "grad_norm": 1.1846042436078277, "learning_rate": 1.6946349266796637e-05, "loss": 0.176, "step": 7980 }, { "epoch": 1.6679205851619645, "grad_norm": 0.9732843679292436, "learning_rate": 1.6945537694388727e-05, "loss": 0.1632, "step": 7981 }, { "epoch": 1.6681295715778475, "grad_norm": 0.8842416817843465, "learning_rate": 1.6944726033587877e-05, "loss": 0.1698, "step": 7982 }, { "epoch": 1.6683385579937304, "grad_norm": 0.8408127216625126, "learning_rate": 1.6943914284404414e-05, "loss": 0.1715, "step": 7983 }, { "epoch": 1.6685475444096134, "grad_norm": 0.9542525806237864, "learning_rate": 1.6943102446848666e-05, "loss": 0.1858, "step": 7984 }, { "epoch": 1.6687565308254964, "grad_norm": 1.0121365212678033, "learning_rate": 1.6942290520930966e-05, "loss": 0.2008, "step": 7985 }, { "epoch": 1.6689655172413793, "grad_norm": 0.9605081676269801, "learning_rate": 1.694147850666165e-05, "loss": 0.1916, "step": 7986 }, { "epoch": 1.6691745036572623, "grad_norm": 0.8840082831047963, "learning_rate": 1.694066640405105e-05, "loss": 0.1471, "step": 7987 }, { "epoch": 1.6693834900731452, "grad_norm": 1.192790533239669, "learning_rate": 1.6939854213109503e-05, "loss": 0.1632, "step": 7988 }, { "epoch": 1.6695924764890282, "grad_norm": 1.014558436735333, "learning_rate": 1.693904193384734e-05, "loss": 0.1708, "step": 7989 }, { "epoch": 1.6698014629049112, "grad_norm": 1.0618165004092217, "learning_rate": 1.69382295662749e-05, "loss": 0.1876, "step": 7990 }, { "epoch": 1.6700104493207941, "grad_norm": 1.0247445706748304, "learning_rate": 1.6937417110402526e-05, "loss": 0.1675, "step": 7991 }, { "epoch": 1.670219435736677, "grad_norm": 0.9033703048305088, "learning_rate": 1.693660456624056e-05, "loss": 0.152, "step": 7992 }, { "epoch": 1.67042842215256, "grad_norm": 1.112580856364344, "learning_rate": 1.6935791933799337e-05, "loss": 0.1823, "step": 7993 }, { "epoch": 1.670637408568443, "grad_norm": 1.0175502338809053, "learning_rate": 1.69349792130892e-05, "loss": 0.2208, "step": 7994 }, { "epoch": 1.670846394984326, "grad_norm": 0.9852577892547547, "learning_rate": 1.6934166404120488e-05, "loss": 0.1291, "step": 7995 }, { "epoch": 1.6710553814002091, "grad_norm": 0.905044490940039, "learning_rate": 1.6933353506903555e-05, "loss": 0.1671, "step": 7996 }, { "epoch": 1.6712643678160921, "grad_norm": 1.1914614702802926, "learning_rate": 1.6932540521448736e-05, "loss": 0.2124, "step": 7997 }, { "epoch": 1.671473354231975, "grad_norm": 1.0424554495830582, "learning_rate": 1.693172744776639e-05, "loss": 0.1432, "step": 7998 }, { "epoch": 1.671682340647858, "grad_norm": 0.9132438505748036, "learning_rate": 1.6930914285866853e-05, "loss": 0.1662, "step": 7999 }, { "epoch": 1.671891327063741, "grad_norm": 1.3592158845684599, "learning_rate": 1.693010103576048e-05, "loss": 0.149, "step": 8000 }, { "epoch": 1.672100313479624, "grad_norm": 0.9200549352421935, "learning_rate": 1.6929287697457617e-05, "loss": 0.1693, "step": 8001 }, { "epoch": 1.672309299895507, "grad_norm": 0.9345582956758138, "learning_rate": 1.692847427096862e-05, "loss": 0.1567, "step": 8002 }, { "epoch": 1.6725182863113899, "grad_norm": 1.087022717656371, "learning_rate": 1.6927660756303838e-05, "loss": 0.167, "step": 8003 }, { "epoch": 1.6727272727272728, "grad_norm": 0.9456527100427903, "learning_rate": 1.6926847153473622e-05, "loss": 0.18, "step": 8004 }, { "epoch": 1.6729362591431558, "grad_norm": 1.0442306568028075, "learning_rate": 1.692603346248833e-05, "loss": 0.1991, "step": 8005 }, { "epoch": 1.6731452455590388, "grad_norm": 0.986313094380773, "learning_rate": 1.692521968335832e-05, "loss": 0.1601, "step": 8006 }, { "epoch": 1.6733542319749217, "grad_norm": 1.134184350372461, "learning_rate": 1.692440581609394e-05, "loss": 0.1478, "step": 8007 }, { "epoch": 1.6735632183908047, "grad_norm": 0.8373497477080525, "learning_rate": 1.6923591860705557e-05, "loss": 0.1702, "step": 8008 }, { "epoch": 1.6737722048066876, "grad_norm": 0.8707027802966685, "learning_rate": 1.6922777817203524e-05, "loss": 0.1907, "step": 8009 }, { "epoch": 1.6739811912225706, "grad_norm": 0.8745425647169643, "learning_rate": 1.6921963685598205e-05, "loss": 0.1603, "step": 8010 }, { "epoch": 1.6741901776384536, "grad_norm": 1.0511301315925066, "learning_rate": 1.6921149465899956e-05, "loss": 0.181, "step": 8011 }, { "epoch": 1.6743991640543365, "grad_norm": 0.8399293574623137, "learning_rate": 1.6920335158119144e-05, "loss": 0.1631, "step": 8012 }, { "epoch": 1.6746081504702195, "grad_norm": 0.9584767907682905, "learning_rate": 1.691952076226613e-05, "loss": 0.1771, "step": 8013 }, { "epoch": 1.6748171368861025, "grad_norm": 0.9428494182874639, "learning_rate": 1.6918706278351277e-05, "loss": 0.1919, "step": 8014 }, { "epoch": 1.6750261233019854, "grad_norm": 0.9206193787902186, "learning_rate": 1.6917891706384954e-05, "loss": 0.1378, "step": 8015 }, { "epoch": 1.6752351097178684, "grad_norm": 0.9114622589502619, "learning_rate": 1.6917077046377525e-05, "loss": 0.1655, "step": 8016 }, { "epoch": 1.6754440961337513, "grad_norm": 1.0086388071046366, "learning_rate": 1.6916262298339358e-05, "loss": 0.1783, "step": 8017 }, { "epoch": 1.6756530825496343, "grad_norm": 1.175810017438227, "learning_rate": 1.6915447462280827e-05, "loss": 0.1992, "step": 8018 }, { "epoch": 1.6758620689655173, "grad_norm": 0.87537201399637, "learning_rate": 1.6914632538212297e-05, "loss": 0.1613, "step": 8019 }, { "epoch": 1.6760710553814002, "grad_norm": 0.9956371314388177, "learning_rate": 1.6913817526144138e-05, "loss": 0.1551, "step": 8020 }, { "epoch": 1.6762800417972832, "grad_norm": 0.917820598889595, "learning_rate": 1.691300242608673e-05, "loss": 0.1963, "step": 8021 }, { "epoch": 1.6764890282131661, "grad_norm": 0.9669928371184467, "learning_rate": 1.6912187238050436e-05, "loss": 0.152, "step": 8022 }, { "epoch": 1.676698014629049, "grad_norm": 0.8694510209300162, "learning_rate": 1.6911371962045635e-05, "loss": 0.1479, "step": 8023 }, { "epoch": 1.676907001044932, "grad_norm": 1.1854897710111323, "learning_rate": 1.6910556598082707e-05, "loss": 0.1729, "step": 8024 }, { "epoch": 1.677115987460815, "grad_norm": 0.8345140590795607, "learning_rate": 1.6909741146172022e-05, "loss": 0.1472, "step": 8025 }, { "epoch": 1.677324973876698, "grad_norm": 0.904503103896624, "learning_rate": 1.690892560632396e-05, "loss": 0.1062, "step": 8026 }, { "epoch": 1.677533960292581, "grad_norm": 0.9428250939762245, "learning_rate": 1.6908109978548903e-05, "loss": 0.1721, "step": 8027 }, { "epoch": 1.677742946708464, "grad_norm": 1.1061642932571918, "learning_rate": 1.690729426285723e-05, "loss": 0.1757, "step": 8028 }, { "epoch": 1.6779519331243469, "grad_norm": 0.9933145953871108, "learning_rate": 1.690647845925932e-05, "loss": 0.1747, "step": 8029 }, { "epoch": 1.6781609195402298, "grad_norm": 0.9696083124068622, "learning_rate": 1.690566256776556e-05, "loss": 0.1378, "step": 8030 }, { "epoch": 1.6783699059561128, "grad_norm": 0.8974636990437859, "learning_rate": 1.6904846588386328e-05, "loss": 0.1838, "step": 8031 }, { "epoch": 1.6785788923719958, "grad_norm": 1.2633363296651992, "learning_rate": 1.690403052113201e-05, "loss": 0.1674, "step": 8032 }, { "epoch": 1.6787878787878787, "grad_norm": 0.9810954270783305, "learning_rate": 1.6903214366012995e-05, "loss": 0.1761, "step": 8033 }, { "epoch": 1.6789968652037617, "grad_norm": 0.9551887712577445, "learning_rate": 1.6902398123039664e-05, "loss": 0.1854, "step": 8034 }, { "epoch": 1.6792058516196446, "grad_norm": 0.9666443841148479, "learning_rate": 1.690158179222241e-05, "loss": 0.1447, "step": 8035 }, { "epoch": 1.6794148380355276, "grad_norm": 1.1931536841743668, "learning_rate": 1.6900765373571623e-05, "loss": 0.1565, "step": 8036 }, { "epoch": 1.6796238244514106, "grad_norm": 0.8587918062096846, "learning_rate": 1.689994886709769e-05, "loss": 0.1704, "step": 8037 }, { "epoch": 1.6798328108672935, "grad_norm": 1.1215639666849473, "learning_rate": 1.6899132272811e-05, "loss": 0.1968, "step": 8038 }, { "epoch": 1.6800417972831765, "grad_norm": 0.9291910650736364, "learning_rate": 1.689831559072195e-05, "loss": 0.1526, "step": 8039 }, { "epoch": 1.6802507836990594, "grad_norm": 1.0617832798967928, "learning_rate": 1.689749882084094e-05, "loss": 0.1986, "step": 8040 }, { "epoch": 1.6804597701149424, "grad_norm": 0.9200039029570771, "learning_rate": 1.6896681963178347e-05, "loss": 0.1718, "step": 8041 }, { "epoch": 1.6806687565308254, "grad_norm": 1.1129525768163104, "learning_rate": 1.689586501774458e-05, "loss": 0.1382, "step": 8042 }, { "epoch": 1.6808777429467083, "grad_norm": 1.0970496841249764, "learning_rate": 1.6895047984550032e-05, "loss": 0.2034, "step": 8043 }, { "epoch": 1.6810867293625913, "grad_norm": 0.8644224480537417, "learning_rate": 1.6894230863605103e-05, "loss": 0.1708, "step": 8044 }, { "epoch": 1.6812957157784743, "grad_norm": 0.9464579432041674, "learning_rate": 1.689341365492019e-05, "loss": 0.1623, "step": 8045 }, { "epoch": 1.6815047021943572, "grad_norm": 0.863001116207197, "learning_rate": 1.6892596358505696e-05, "loss": 0.1799, "step": 8046 }, { "epoch": 1.6817136886102402, "grad_norm": 0.9536509991662258, "learning_rate": 1.6891778974372018e-05, "loss": 0.1869, "step": 8047 }, { "epoch": 1.6819226750261231, "grad_norm": 0.9064639903235406, "learning_rate": 1.689096150252956e-05, "loss": 0.1712, "step": 8048 }, { "epoch": 1.6821316614420063, "grad_norm": 0.9639305530520881, "learning_rate": 1.689014394298873e-05, "loss": 0.1417, "step": 8049 }, { "epoch": 1.6823406478578893, "grad_norm": 1.0219944979168596, "learning_rate": 1.6889326295759926e-05, "loss": 0.1534, "step": 8050 }, { "epoch": 1.6825496342737722, "grad_norm": 0.8259052906318641, "learning_rate": 1.688850856085356e-05, "loss": 0.15, "step": 8051 }, { "epoch": 1.6827586206896552, "grad_norm": 0.9090056806020291, "learning_rate": 1.6887690738280036e-05, "loss": 0.1473, "step": 8052 }, { "epoch": 1.6829676071055382, "grad_norm": 0.833490532417145, "learning_rate": 1.6886872828049763e-05, "loss": 0.1463, "step": 8053 }, { "epoch": 1.6831765935214211, "grad_norm": 0.8823101112107306, "learning_rate": 1.6886054830173148e-05, "loss": 0.1529, "step": 8054 }, { "epoch": 1.683385579937304, "grad_norm": 0.817112812156374, "learning_rate": 1.6885236744660604e-05, "loss": 0.1663, "step": 8055 }, { "epoch": 1.683594566353187, "grad_norm": 0.9828568677229915, "learning_rate": 1.6884418571522538e-05, "loss": 0.1683, "step": 8056 }, { "epoch": 1.68380355276907, "grad_norm": 0.8542625749753399, "learning_rate": 1.688360031076937e-05, "loss": 0.1734, "step": 8057 }, { "epoch": 1.684012539184953, "grad_norm": 1.0739981039340385, "learning_rate": 1.6882781962411505e-05, "loss": 0.1893, "step": 8058 }, { "epoch": 1.684221525600836, "grad_norm": 0.8391511722057835, "learning_rate": 1.6881963526459367e-05, "loss": 0.1676, "step": 8059 }, { "epoch": 1.684430512016719, "grad_norm": 0.8963886316433834, "learning_rate": 1.688114500292336e-05, "loss": 0.1596, "step": 8060 }, { "epoch": 1.6846394984326019, "grad_norm": 0.9445375203272217, "learning_rate": 1.6880326391813917e-05, "loss": 0.1726, "step": 8061 }, { "epoch": 1.6848484848484848, "grad_norm": 0.8915569731986625, "learning_rate": 1.6879507693141442e-05, "loss": 0.1661, "step": 8062 }, { "epoch": 1.6850574712643678, "grad_norm": 0.9764717027173806, "learning_rate": 1.687868890691636e-05, "loss": 0.1737, "step": 8063 }, { "epoch": 1.6852664576802507, "grad_norm": 0.9352037671671392, "learning_rate": 1.687787003314909e-05, "loss": 0.1641, "step": 8064 }, { "epoch": 1.6854754440961337, "grad_norm": 1.0133948440169718, "learning_rate": 1.687705107185006e-05, "loss": 0.1892, "step": 8065 }, { "epoch": 1.6856844305120169, "grad_norm": 0.8155482268901229, "learning_rate": 1.6876232023029678e-05, "loss": 0.1492, "step": 8066 }, { "epoch": 1.6858934169278998, "grad_norm": 0.94894326094131, "learning_rate": 1.6875412886698383e-05, "loss": 0.2, "step": 8067 }, { "epoch": 1.6861024033437828, "grad_norm": 0.9675741867882156, "learning_rate": 1.687459366286659e-05, "loss": 0.1561, "step": 8068 }, { "epoch": 1.6863113897596658, "grad_norm": 1.094935773234016, "learning_rate": 1.687377435154473e-05, "loss": 0.1637, "step": 8069 }, { "epoch": 1.6865203761755487, "grad_norm": 1.163458548156439, "learning_rate": 1.687295495274323e-05, "loss": 0.1689, "step": 8070 }, { "epoch": 1.6867293625914317, "grad_norm": 0.8554538008188531, "learning_rate": 1.687213546647251e-05, "loss": 0.1632, "step": 8071 }, { "epoch": 1.6869383490073147, "grad_norm": 0.8127994174716827, "learning_rate": 1.687131589274301e-05, "loss": 0.1695, "step": 8072 }, { "epoch": 1.6871473354231976, "grad_norm": 0.8984572231097424, "learning_rate": 1.6870496231565157e-05, "loss": 0.1774, "step": 8073 }, { "epoch": 1.6873563218390806, "grad_norm": 0.9129419132191372, "learning_rate": 1.686967648294938e-05, "loss": 0.1609, "step": 8074 }, { "epoch": 1.6875653082549635, "grad_norm": 1.0289199175970545, "learning_rate": 1.6868856646906113e-05, "loss": 0.1637, "step": 8075 }, { "epoch": 1.6877742946708465, "grad_norm": 1.046015422648367, "learning_rate": 1.6868036723445792e-05, "loss": 0.1566, "step": 8076 }, { "epoch": 1.6879832810867295, "grad_norm": 0.8935032753461585, "learning_rate": 1.6867216712578848e-05, "loss": 0.1661, "step": 8077 }, { "epoch": 1.6881922675026124, "grad_norm": 1.2908335961051882, "learning_rate": 1.6866396614315718e-05, "loss": 0.1491, "step": 8078 }, { "epoch": 1.6884012539184954, "grad_norm": 0.8407736393335048, "learning_rate": 1.686557642866684e-05, "loss": 0.1358, "step": 8079 }, { "epoch": 1.6886102403343783, "grad_norm": 0.989986393394483, "learning_rate": 1.686475615564265e-05, "loss": 0.1337, "step": 8080 }, { "epoch": 1.6888192267502613, "grad_norm": 0.9149984111137942, "learning_rate": 1.686393579525359e-05, "loss": 0.154, "step": 8081 }, { "epoch": 1.6890282131661443, "grad_norm": 0.884454995864529, "learning_rate": 1.6863115347510106e-05, "loss": 0.1459, "step": 8082 }, { "epoch": 1.6892371995820272, "grad_norm": 1.118850304486611, "learning_rate": 1.6862294812422625e-05, "loss": 0.1957, "step": 8083 }, { "epoch": 1.6894461859979102, "grad_norm": 0.8109881071830185, "learning_rate": 1.6861474190001603e-05, "loss": 0.1694, "step": 8084 }, { "epoch": 1.6896551724137931, "grad_norm": 0.8880375698222719, "learning_rate": 1.6860653480257473e-05, "loss": 0.1506, "step": 8085 }, { "epoch": 1.689864158829676, "grad_norm": 1.0819225796584118, "learning_rate": 1.6859832683200687e-05, "loss": 0.1655, "step": 8086 }, { "epoch": 1.690073145245559, "grad_norm": 1.1228245000930788, "learning_rate": 1.685901179884169e-05, "loss": 0.1518, "step": 8087 }, { "epoch": 1.690282131661442, "grad_norm": 1.322953321769367, "learning_rate": 1.685819082719093e-05, "loss": 0.1709, "step": 8088 }, { "epoch": 1.690491118077325, "grad_norm": 1.0991332120785664, "learning_rate": 1.685736976825885e-05, "loss": 0.1871, "step": 8089 }, { "epoch": 1.690700104493208, "grad_norm": 1.0077823848458367, "learning_rate": 1.6856548622055902e-05, "loss": 0.1596, "step": 8090 }, { "epoch": 1.690909090909091, "grad_norm": 0.8912410916479697, "learning_rate": 1.685572738859254e-05, "loss": 0.1532, "step": 8091 }, { "epoch": 1.6911180773249739, "grad_norm": 1.0099742647176642, "learning_rate": 1.685490606787921e-05, "loss": 0.1752, "step": 8092 }, { "epoch": 1.6913270637408568, "grad_norm": 1.137752071163448, "learning_rate": 1.685408465992637e-05, "loss": 0.17, "step": 8093 }, { "epoch": 1.6915360501567398, "grad_norm": 0.9416903148911668, "learning_rate": 1.6853263164744468e-05, "loss": 0.1818, "step": 8094 }, { "epoch": 1.6917450365726228, "grad_norm": 0.9426815603115184, "learning_rate": 1.6852441582343964e-05, "loss": 0.1561, "step": 8095 }, { "epoch": 1.6919540229885057, "grad_norm": 0.9445679522882369, "learning_rate": 1.685161991273531e-05, "loss": 0.2039, "step": 8096 }, { "epoch": 1.6921630094043887, "grad_norm": 1.2046981617257289, "learning_rate": 1.6850798155928963e-05, "loss": 0.1761, "step": 8097 }, { "epoch": 1.6923719958202716, "grad_norm": 0.9604397093964588, "learning_rate": 1.6849976311935384e-05, "loss": 0.1731, "step": 8098 }, { "epoch": 1.6925809822361546, "grad_norm": 1.0541826044788452, "learning_rate": 1.684915438076503e-05, "loss": 0.1697, "step": 8099 }, { "epoch": 1.6927899686520376, "grad_norm": 0.9207391340191822, "learning_rate": 1.6848332362428363e-05, "loss": 0.1635, "step": 8100 }, { "epoch": 1.6929989550679205, "grad_norm": 0.8790860504941832, "learning_rate": 1.6847510256935845e-05, "loss": 0.1486, "step": 8101 }, { "epoch": 1.6932079414838035, "grad_norm": 1.0725781054279637, "learning_rate": 1.6846688064297935e-05, "loss": 0.1364, "step": 8102 }, { "epoch": 1.6934169278996865, "grad_norm": 0.8352621321758535, "learning_rate": 1.6845865784525102e-05, "loss": 0.157, "step": 8103 }, { "epoch": 1.6936259143155694, "grad_norm": 0.8216661562161239, "learning_rate": 1.6845043417627805e-05, "loss": 0.1862, "step": 8104 }, { "epoch": 1.6938349007314524, "grad_norm": 1.0292583439665532, "learning_rate": 1.6844220963616514e-05, "loss": 0.1653, "step": 8105 }, { "epoch": 1.6940438871473353, "grad_norm": 0.8080322564958099, "learning_rate": 1.6843398422501698e-05, "loss": 0.144, "step": 8106 }, { "epoch": 1.6942528735632183, "grad_norm": 0.8977549094174547, "learning_rate": 1.684257579429382e-05, "loss": 0.188, "step": 8107 }, { "epoch": 1.6944618599791013, "grad_norm": 0.7720672261964547, "learning_rate": 1.684175307900335e-05, "loss": 0.16, "step": 8108 }, { "epoch": 1.6946708463949842, "grad_norm": 0.9492467260721615, "learning_rate": 1.684093027664076e-05, "loss": 0.1416, "step": 8109 }, { "epoch": 1.6948798328108672, "grad_norm": 0.8591509440892237, "learning_rate": 1.6840107387216518e-05, "loss": 0.1606, "step": 8110 }, { "epoch": 1.6950888192267501, "grad_norm": 1.010765110704328, "learning_rate": 1.6839284410741102e-05, "loss": 0.1998, "step": 8111 }, { "epoch": 1.695297805642633, "grad_norm": 1.0136957825244493, "learning_rate": 1.6838461347224987e-05, "loss": 0.1543, "step": 8112 }, { "epoch": 1.695506792058516, "grad_norm": 0.9414926078960119, "learning_rate": 1.683763819667864e-05, "loss": 0.1795, "step": 8113 }, { "epoch": 1.695715778474399, "grad_norm": 0.9041110315137746, "learning_rate": 1.6836814959112543e-05, "loss": 0.1826, "step": 8114 }, { "epoch": 1.695924764890282, "grad_norm": 0.8528503488150804, "learning_rate": 1.683599163453717e-05, "loss": 0.1754, "step": 8115 }, { "epoch": 1.696133751306165, "grad_norm": 0.9298458527829143, "learning_rate": 1.6835168222963e-05, "loss": 0.1657, "step": 8116 }, { "epoch": 1.696342737722048, "grad_norm": 0.8277912453817161, "learning_rate": 1.683434472440051e-05, "loss": 0.1736, "step": 8117 }, { "epoch": 1.6965517241379309, "grad_norm": 1.0572777290567459, "learning_rate": 1.6833521138860187e-05, "loss": 0.1603, "step": 8118 }, { "epoch": 1.696760710553814, "grad_norm": 1.042817457236154, "learning_rate": 1.6832697466352506e-05, "loss": 0.188, "step": 8119 }, { "epoch": 1.696969696969697, "grad_norm": 0.9424410625545271, "learning_rate": 1.683187370688795e-05, "loss": 0.1721, "step": 8120 }, { "epoch": 1.69717868338558, "grad_norm": 0.973681808583604, "learning_rate": 1.6831049860477006e-05, "loss": 0.163, "step": 8121 }, { "epoch": 1.697387669801463, "grad_norm": 0.8913799777774645, "learning_rate": 1.683022592713016e-05, "loss": 0.1719, "step": 8122 }, { "epoch": 1.697596656217346, "grad_norm": 0.8301103403155983, "learning_rate": 1.682940190685789e-05, "loss": 0.1714, "step": 8123 }, { "epoch": 1.6978056426332289, "grad_norm": 1.113532470821925, "learning_rate": 1.682857779967069e-05, "loss": 0.1704, "step": 8124 }, { "epoch": 1.6980146290491118, "grad_norm": 0.9062094211404267, "learning_rate": 1.6827753605579046e-05, "loss": 0.1711, "step": 8125 }, { "epoch": 1.6982236154649948, "grad_norm": 0.7978991176170238, "learning_rate": 1.6826929324593443e-05, "loss": 0.151, "step": 8126 }, { "epoch": 1.6984326018808777, "grad_norm": 0.8117073902642281, "learning_rate": 1.6826104956724377e-05, "loss": 0.1449, "step": 8127 }, { "epoch": 1.6986415882967607, "grad_norm": 0.9119205643489344, "learning_rate": 1.682528050198234e-05, "loss": 0.1348, "step": 8128 }, { "epoch": 1.6988505747126437, "grad_norm": 1.1915520415088476, "learning_rate": 1.6824455960377822e-05, "loss": 0.1918, "step": 8129 }, { "epoch": 1.6990595611285266, "grad_norm": 1.0010788445617431, "learning_rate": 1.6823631331921316e-05, "loss": 0.1692, "step": 8130 }, { "epoch": 1.6992685475444096, "grad_norm": 0.828333298774723, "learning_rate": 1.6822806616623317e-05, "loss": 0.167, "step": 8131 }, { "epoch": 1.6994775339602926, "grad_norm": 0.9693698224396675, "learning_rate": 1.6821981814494323e-05, "loss": 0.1846, "step": 8132 }, { "epoch": 1.6996865203761755, "grad_norm": 0.9463132536165353, "learning_rate": 1.6821156925544828e-05, "loss": 0.1591, "step": 8133 }, { "epoch": 1.6998955067920585, "grad_norm": 1.1616587727016894, "learning_rate": 1.6820331949785332e-05, "loss": 0.1863, "step": 8134 }, { "epoch": 1.7001044932079414, "grad_norm": 0.9383529081604385, "learning_rate": 1.681950688722633e-05, "loss": 0.1852, "step": 8135 }, { "epoch": 1.7003134796238244, "grad_norm": 0.86724107461401, "learning_rate": 1.681868173787833e-05, "loss": 0.1492, "step": 8136 }, { "epoch": 1.7005224660397076, "grad_norm": 0.958730202616787, "learning_rate": 1.6817856501751825e-05, "loss": 0.2026, "step": 8137 }, { "epoch": 1.7007314524555905, "grad_norm": 0.9149922795804173, "learning_rate": 1.6817031178857326e-05, "loss": 0.157, "step": 8138 }, { "epoch": 1.7009404388714735, "grad_norm": 0.7473676758794399, "learning_rate": 1.6816205769205328e-05, "loss": 0.1522, "step": 8139 }, { "epoch": 1.7011494252873565, "grad_norm": 0.891589077090724, "learning_rate": 1.681538027280634e-05, "loss": 0.1392, "step": 8140 }, { "epoch": 1.7013584117032394, "grad_norm": 0.9393317511088837, "learning_rate": 1.681455468967087e-05, "loss": 0.1525, "step": 8141 }, { "epoch": 1.7015673981191224, "grad_norm": 0.76948415555747, "learning_rate": 1.6813729019809418e-05, "loss": 0.1576, "step": 8142 }, { "epoch": 1.7017763845350053, "grad_norm": 1.1256176717506652, "learning_rate": 1.6812903263232495e-05, "loss": 0.177, "step": 8143 }, { "epoch": 1.7019853709508883, "grad_norm": 1.0143339015581738, "learning_rate": 1.6812077419950616e-05, "loss": 0.1799, "step": 8144 }, { "epoch": 1.7021943573667713, "grad_norm": 0.9364389419943464, "learning_rate": 1.6811251489974284e-05, "loss": 0.1867, "step": 8145 }, { "epoch": 1.7024033437826542, "grad_norm": 0.9904997470695859, "learning_rate": 1.6810425473314014e-05, "loss": 0.1734, "step": 8146 }, { "epoch": 1.7026123301985372, "grad_norm": 0.97842318004081, "learning_rate": 1.6809599369980315e-05, "loss": 0.1717, "step": 8147 }, { "epoch": 1.7028213166144202, "grad_norm": 0.8836593878715604, "learning_rate": 1.68087731799837e-05, "loss": 0.1459, "step": 8148 }, { "epoch": 1.7030303030303031, "grad_norm": 0.8304958593679537, "learning_rate": 1.680794690333469e-05, "loss": 0.1695, "step": 8149 }, { "epoch": 1.703239289446186, "grad_norm": 1.1398992976264868, "learning_rate": 1.6807120540043794e-05, "loss": 0.1874, "step": 8150 }, { "epoch": 1.703448275862069, "grad_norm": 0.8956469065327561, "learning_rate": 1.6806294090121533e-05, "loss": 0.1827, "step": 8151 }, { "epoch": 1.703657262277952, "grad_norm": 0.9458461914326445, "learning_rate": 1.6805467553578427e-05, "loss": 0.1476, "step": 8152 }, { "epoch": 1.703866248693835, "grad_norm": 0.876058137385012, "learning_rate": 1.6804640930424985e-05, "loss": 0.1646, "step": 8153 }, { "epoch": 1.704075235109718, "grad_norm": 0.7310037668125541, "learning_rate": 1.6803814220671736e-05, "loss": 0.131, "step": 8154 }, { "epoch": 1.7042842215256009, "grad_norm": 0.8425558969700462, "learning_rate": 1.6802987424329198e-05, "loss": 0.1549, "step": 8155 }, { "epoch": 1.7044932079414838, "grad_norm": 1.0318384316087443, "learning_rate": 1.6802160541407895e-05, "loss": 0.18, "step": 8156 }, { "epoch": 1.7047021943573668, "grad_norm": 1.0390932372309705, "learning_rate": 1.6801333571918345e-05, "loss": 0.1787, "step": 8157 }, { "epoch": 1.7049111807732498, "grad_norm": 0.8885032805304334, "learning_rate": 1.6800506515871084e-05, "loss": 0.1758, "step": 8158 }, { "epoch": 1.7051201671891327, "grad_norm": 0.9565128669055737, "learning_rate": 1.6799679373276622e-05, "loss": 0.1748, "step": 8159 }, { "epoch": 1.7053291536050157, "grad_norm": 0.9352857762647167, "learning_rate": 1.67988521441455e-05, "loss": 0.2018, "step": 8160 }, { "epoch": 1.7055381400208987, "grad_norm": 0.9013678463437266, "learning_rate": 1.6798024828488238e-05, "loss": 0.126, "step": 8161 }, { "epoch": 1.7057471264367816, "grad_norm": 1.0169630691358584, "learning_rate": 1.679719742631537e-05, "loss": 0.1649, "step": 8162 }, { "epoch": 1.7059561128526646, "grad_norm": 0.8624203848177526, "learning_rate": 1.6796369937637415e-05, "loss": 0.1696, "step": 8163 }, { "epoch": 1.7061650992685475, "grad_norm": 0.9560535971400936, "learning_rate": 1.6795542362464917e-05, "loss": 0.161, "step": 8164 }, { "epoch": 1.7063740856844305, "grad_norm": 1.0611644642989881, "learning_rate": 1.67947147008084e-05, "loss": 0.2337, "step": 8165 }, { "epoch": 1.7065830721003135, "grad_norm": 0.8788440132652164, "learning_rate": 1.6793886952678403e-05, "loss": 0.1697, "step": 8166 }, { "epoch": 1.7067920585161964, "grad_norm": 0.736510558463713, "learning_rate": 1.6793059118085458e-05, "loss": 0.1183, "step": 8167 }, { "epoch": 1.7070010449320794, "grad_norm": 1.042731439033547, "learning_rate": 1.6792231197040097e-05, "loss": 0.1858, "step": 8168 }, { "epoch": 1.7072100313479623, "grad_norm": 1.0363785860340489, "learning_rate": 1.6791403189552863e-05, "loss": 0.1672, "step": 8169 }, { "epoch": 1.7074190177638453, "grad_norm": 0.8958761740745614, "learning_rate": 1.6790575095634292e-05, "loss": 0.1469, "step": 8170 }, { "epoch": 1.7076280041797283, "grad_norm": 0.9650654944392959, "learning_rate": 1.6789746915294918e-05, "loss": 0.1916, "step": 8171 }, { "epoch": 1.7078369905956112, "grad_norm": 0.7523506103695927, "learning_rate": 1.6788918648545284e-05, "loss": 0.1494, "step": 8172 }, { "epoch": 1.7080459770114942, "grad_norm": 0.8879486321364135, "learning_rate": 1.6788090295395932e-05, "loss": 0.1546, "step": 8173 }, { "epoch": 1.7082549634273771, "grad_norm": 1.05270989745218, "learning_rate": 1.6787261855857405e-05, "loss": 0.1559, "step": 8174 }, { "epoch": 1.70846394984326, "grad_norm": 1.0259341414054257, "learning_rate": 1.678643332994024e-05, "loss": 0.1623, "step": 8175 }, { "epoch": 1.708672936259143, "grad_norm": 0.9114320767676535, "learning_rate": 1.6785604717654992e-05, "loss": 0.1516, "step": 8176 }, { "epoch": 1.708881922675026, "grad_norm": 1.1525018128226976, "learning_rate": 1.6784776019012197e-05, "loss": 0.1664, "step": 8177 }, { "epoch": 1.709090909090909, "grad_norm": 0.9161459129127697, "learning_rate": 1.6783947234022405e-05, "loss": 0.1978, "step": 8178 }, { "epoch": 1.709299895506792, "grad_norm": 0.9327950800352129, "learning_rate": 1.6783118362696162e-05, "loss": 0.1866, "step": 8179 }, { "epoch": 1.709508881922675, "grad_norm": 0.9767751996444983, "learning_rate": 1.678228940504402e-05, "loss": 0.1778, "step": 8180 }, { "epoch": 1.7097178683385579, "grad_norm": 0.8284040547719049, "learning_rate": 1.6781460361076524e-05, "loss": 0.1437, "step": 8181 }, { "epoch": 1.7099268547544408, "grad_norm": 0.9031769748098628, "learning_rate": 1.678063123080423e-05, "loss": 0.1774, "step": 8182 }, { "epoch": 1.7101358411703238, "grad_norm": 0.9092896414099771, "learning_rate": 1.6779802014237686e-05, "loss": 0.1657, "step": 8183 }, { "epoch": 1.7103448275862068, "grad_norm": 0.7670891502833997, "learning_rate": 1.677897271138745e-05, "loss": 0.1389, "step": 8184 }, { "epoch": 1.7105538140020897, "grad_norm": 0.944420818471291, "learning_rate": 1.677814332226407e-05, "loss": 0.162, "step": 8185 }, { "epoch": 1.7107628004179727, "grad_norm": 0.8458293820647603, "learning_rate": 1.6777313846878108e-05, "loss": 0.147, "step": 8186 }, { "epoch": 1.7109717868338556, "grad_norm": 0.9147165324324609, "learning_rate": 1.6776484285240114e-05, "loss": 0.2058, "step": 8187 }, { "epoch": 1.7111807732497386, "grad_norm": 0.9930700286896452, "learning_rate": 1.6775654637360648e-05, "loss": 0.1614, "step": 8188 }, { "epoch": 1.7113897596656216, "grad_norm": 0.7825487498083513, "learning_rate": 1.677482490325027e-05, "loss": 0.155, "step": 8189 }, { "epoch": 1.7115987460815048, "grad_norm": 0.9601043757908131, "learning_rate": 1.6773995082919537e-05, "loss": 0.1182, "step": 8190 }, { "epoch": 1.7118077324973877, "grad_norm": 0.7553215041583927, "learning_rate": 1.677316517637901e-05, "loss": 0.1592, "step": 8191 }, { "epoch": 1.7120167189132707, "grad_norm": 0.8989483546993353, "learning_rate": 1.6772335183639255e-05, "loss": 0.175, "step": 8192 }, { "epoch": 1.7122257053291536, "grad_norm": 0.9062039872429494, "learning_rate": 1.677150510471083e-05, "loss": 0.1693, "step": 8193 }, { "epoch": 1.7124346917450366, "grad_norm": 0.7816384880972248, "learning_rate": 1.67706749396043e-05, "loss": 0.1438, "step": 8194 }, { "epoch": 1.7126436781609196, "grad_norm": 1.1104613530141412, "learning_rate": 1.6769844688330232e-05, "loss": 0.186, "step": 8195 }, { "epoch": 1.7128526645768025, "grad_norm": 0.805409983753495, "learning_rate": 1.6769014350899194e-05, "loss": 0.1343, "step": 8196 }, { "epoch": 1.7130616509926855, "grad_norm": 1.0688969025044095, "learning_rate": 1.676818392732175e-05, "loss": 0.1862, "step": 8197 }, { "epoch": 1.7132706374085684, "grad_norm": 2.1154912715092378, "learning_rate": 1.6767353417608466e-05, "loss": 0.17, "step": 8198 }, { "epoch": 1.7134796238244514, "grad_norm": 0.8816513499473961, "learning_rate": 1.6766522821769918e-05, "loss": 0.1534, "step": 8199 }, { "epoch": 1.7136886102403344, "grad_norm": 0.7906909957788576, "learning_rate": 1.6765692139816675e-05, "loss": 0.178, "step": 8200 }, { "epoch": 1.7138975966562173, "grad_norm": 0.7867767953681384, "learning_rate": 1.6764861371759304e-05, "loss": 0.1614, "step": 8201 }, { "epoch": 1.7141065830721003, "grad_norm": 0.8093070116090494, "learning_rate": 1.6764030517608383e-05, "loss": 0.1376, "step": 8202 }, { "epoch": 1.7143155694879832, "grad_norm": 0.9356715597891391, "learning_rate": 1.6763199577374485e-05, "loss": 0.1619, "step": 8203 }, { "epoch": 1.7145245559038662, "grad_norm": 0.7973775990281693, "learning_rate": 1.676236855106818e-05, "loss": 0.1465, "step": 8204 }, { "epoch": 1.7147335423197492, "grad_norm": 0.785482314949297, "learning_rate": 1.676153743870005e-05, "loss": 0.1721, "step": 8205 }, { "epoch": 1.7149425287356321, "grad_norm": 0.8932440747247362, "learning_rate": 1.6760706240280672e-05, "loss": 0.1731, "step": 8206 }, { "epoch": 1.7151515151515153, "grad_norm": 0.9343376123918357, "learning_rate": 1.6759874955820622e-05, "loss": 0.1789, "step": 8207 }, { "epoch": 1.7153605015673983, "grad_norm": 1.0168394384743331, "learning_rate": 1.6759043585330478e-05, "loss": 0.1777, "step": 8208 }, { "epoch": 1.7155694879832812, "grad_norm": 0.812598851074036, "learning_rate": 1.6758212128820827e-05, "loss": 0.1439, "step": 8209 }, { "epoch": 1.7157784743991642, "grad_norm": 0.8219598619374086, "learning_rate": 1.6757380586302242e-05, "loss": 0.1408, "step": 8210 }, { "epoch": 1.7159874608150472, "grad_norm": 1.0124762871245327, "learning_rate": 1.6756548957785313e-05, "loss": 0.1544, "step": 8211 }, { "epoch": 1.7161964472309301, "grad_norm": 0.6925968281582461, "learning_rate": 1.6755717243280618e-05, "loss": 0.1562, "step": 8212 }, { "epoch": 1.716405433646813, "grad_norm": 1.0194905821004463, "learning_rate": 1.6754885442798745e-05, "loss": 0.1434, "step": 8213 }, { "epoch": 1.716614420062696, "grad_norm": 0.9521363694676479, "learning_rate": 1.6754053556350282e-05, "loss": 0.1831, "step": 8214 }, { "epoch": 1.716823406478579, "grad_norm": 0.7577807750099674, "learning_rate": 1.675322158394581e-05, "loss": 0.1527, "step": 8215 }, { "epoch": 1.717032392894462, "grad_norm": 1.0642630291604505, "learning_rate": 1.675238952559592e-05, "loss": 0.1809, "step": 8216 }, { "epoch": 1.717241379310345, "grad_norm": 0.8188570783973288, "learning_rate": 1.6751557381311207e-05, "loss": 0.176, "step": 8217 }, { "epoch": 1.717450365726228, "grad_norm": 1.1634130129888274, "learning_rate": 1.6750725151102255e-05, "loss": 0.1696, "step": 8218 }, { "epoch": 1.7176593521421109, "grad_norm": 0.8580258580576852, "learning_rate": 1.6749892834979657e-05, "loss": 0.1461, "step": 8219 }, { "epoch": 1.7178683385579938, "grad_norm": 0.9773815269294779, "learning_rate": 1.6749060432954002e-05, "loss": 0.1587, "step": 8220 }, { "epoch": 1.7180773249738768, "grad_norm": 1.0308979552711082, "learning_rate": 1.674822794503589e-05, "loss": 0.158, "step": 8221 }, { "epoch": 1.7182863113897597, "grad_norm": 0.8598827958109581, "learning_rate": 1.674739537123591e-05, "loss": 0.1764, "step": 8222 }, { "epoch": 1.7184952978056427, "grad_norm": 1.1141916970593289, "learning_rate": 1.6746562711564663e-05, "loss": 0.2098, "step": 8223 }, { "epoch": 1.7187042842215257, "grad_norm": 1.2852306126079858, "learning_rate": 1.6745729966032742e-05, "loss": 0.1534, "step": 8224 }, { "epoch": 1.7189132706374086, "grad_norm": 0.9571406672407478, "learning_rate": 1.674489713465075e-05, "loss": 0.1853, "step": 8225 }, { "epoch": 1.7191222570532916, "grad_norm": 1.0179853651348592, "learning_rate": 1.6744064217429278e-05, "loss": 0.1322, "step": 8226 }, { "epoch": 1.7193312434691745, "grad_norm": 0.9316926469589861, "learning_rate": 1.6743231214378934e-05, "loss": 0.1768, "step": 8227 }, { "epoch": 1.7195402298850575, "grad_norm": 0.9868676973031932, "learning_rate": 1.6742398125510312e-05, "loss": 0.1962, "step": 8228 }, { "epoch": 1.7197492163009405, "grad_norm": 1.0957838458162317, "learning_rate": 1.6741564950834023e-05, "loss": 0.1721, "step": 8229 }, { "epoch": 1.7199582027168234, "grad_norm": 0.9692896278243143, "learning_rate": 1.6740731690360663e-05, "loss": 0.1509, "step": 8230 }, { "epoch": 1.7201671891327064, "grad_norm": 0.8471279555139802, "learning_rate": 1.673989834410084e-05, "loss": 0.152, "step": 8231 }, { "epoch": 1.7203761755485893, "grad_norm": 0.9722667380371134, "learning_rate": 1.673906491206516e-05, "loss": 0.1631, "step": 8232 }, { "epoch": 1.7205851619644723, "grad_norm": 0.8886795362145998, "learning_rate": 1.6738231394264228e-05, "loss": 0.2012, "step": 8233 }, { "epoch": 1.7207941483803553, "grad_norm": 1.101095109318275, "learning_rate": 1.6737397790708653e-05, "loss": 0.1605, "step": 8234 }, { "epoch": 1.7210031347962382, "grad_norm": 0.9005555890094353, "learning_rate": 1.6736564101409044e-05, "loss": 0.1725, "step": 8235 }, { "epoch": 1.7212121212121212, "grad_norm": 0.7865617588197494, "learning_rate": 1.673573032637601e-05, "loss": 0.1495, "step": 8236 }, { "epoch": 1.7214211076280042, "grad_norm": 0.9441120758397274, "learning_rate": 1.673489646562016e-05, "loss": 0.1444, "step": 8237 }, { "epoch": 1.7216300940438871, "grad_norm": 0.9505070953518434, "learning_rate": 1.6734062519152113e-05, "loss": 0.1303, "step": 8238 }, { "epoch": 1.72183908045977, "grad_norm": 1.0242772772235615, "learning_rate": 1.6733228486982475e-05, "loss": 0.1679, "step": 8239 }, { "epoch": 1.722048066875653, "grad_norm": 1.0791283546682053, "learning_rate": 1.6732394369121867e-05, "loss": 0.1712, "step": 8240 }, { "epoch": 1.722257053291536, "grad_norm": 1.0204928607710877, "learning_rate": 1.67315601655809e-05, "loss": 0.1876, "step": 8241 }, { "epoch": 1.722466039707419, "grad_norm": 0.8497637496716061, "learning_rate": 1.6730725876370188e-05, "loss": 0.1488, "step": 8242 }, { "epoch": 1.722675026123302, "grad_norm": 0.895292490773851, "learning_rate": 1.6729891501500353e-05, "loss": 0.1677, "step": 8243 }, { "epoch": 1.7228840125391849, "grad_norm": 0.9986065671758775, "learning_rate": 1.6729057040982017e-05, "loss": 0.2072, "step": 8244 }, { "epoch": 1.7230929989550678, "grad_norm": 0.9411434895470778, "learning_rate": 1.6728222494825796e-05, "loss": 0.2028, "step": 8245 }, { "epoch": 1.7233019853709508, "grad_norm": 0.8518533549353625, "learning_rate": 1.6727387863042307e-05, "loss": 0.1494, "step": 8246 }, { "epoch": 1.7235109717868338, "grad_norm": 0.9648169917219853, "learning_rate": 1.6726553145642177e-05, "loss": 0.1643, "step": 8247 }, { "epoch": 1.7237199582027167, "grad_norm": 0.9050586914376726, "learning_rate": 1.6725718342636026e-05, "loss": 0.1583, "step": 8248 }, { "epoch": 1.7239289446185997, "grad_norm": 1.1004069308680742, "learning_rate": 1.672488345403448e-05, "loss": 0.1668, "step": 8249 }, { "epoch": 1.7241379310344827, "grad_norm": 0.8465725422529052, "learning_rate": 1.672404847984817e-05, "loss": 0.1553, "step": 8250 }, { "epoch": 1.7243469174503656, "grad_norm": 0.9265535654328593, "learning_rate": 1.6723213420087713e-05, "loss": 0.1552, "step": 8251 }, { "epoch": 1.7245559038662486, "grad_norm": 1.0112465974393632, "learning_rate": 1.6722378274763737e-05, "loss": 0.1477, "step": 8252 }, { "epoch": 1.7247648902821315, "grad_norm": 0.8823786202383653, "learning_rate": 1.6721543043886878e-05, "loss": 0.1688, "step": 8253 }, { "epoch": 1.7249738766980145, "grad_norm": 1.1682614500550026, "learning_rate": 1.6720707727467758e-05, "loss": 0.1741, "step": 8254 }, { "epoch": 1.7251828631138975, "grad_norm": 0.98694489896053, "learning_rate": 1.6719872325517014e-05, "loss": 0.1726, "step": 8255 }, { "epoch": 1.7253918495297804, "grad_norm": 0.9342996183503601, "learning_rate": 1.6719036838045273e-05, "loss": 0.1608, "step": 8256 }, { "epoch": 1.7256008359456634, "grad_norm": 1.0577587964712027, "learning_rate": 1.671820126506317e-05, "loss": 0.1603, "step": 8257 }, { "epoch": 1.7258098223615463, "grad_norm": 0.7680859407351484, "learning_rate": 1.671736560658134e-05, "loss": 0.1573, "step": 8258 }, { "epoch": 1.7260188087774293, "grad_norm": 1.235885680112842, "learning_rate": 1.6716529862610413e-05, "loss": 0.1873, "step": 8259 }, { "epoch": 1.7262277951933125, "grad_norm": 1.092665887549682, "learning_rate": 1.6715694033161032e-05, "loss": 0.1619, "step": 8260 }, { "epoch": 1.7264367816091954, "grad_norm": 0.8264313896318372, "learning_rate": 1.671485811824383e-05, "loss": 0.193, "step": 8261 }, { "epoch": 1.7266457680250784, "grad_norm": 0.917924086317596, "learning_rate": 1.6714022117869445e-05, "loss": 0.163, "step": 8262 }, { "epoch": 1.7268547544409614, "grad_norm": 1.0064687552044644, "learning_rate": 1.6713186032048524e-05, "loss": 0.2051, "step": 8263 }, { "epoch": 1.7270637408568443, "grad_norm": 0.8154372552467484, "learning_rate": 1.6712349860791696e-05, "loss": 0.1445, "step": 8264 }, { "epoch": 1.7272727272727273, "grad_norm": 0.8871986599990395, "learning_rate": 1.671151360410961e-05, "loss": 0.1745, "step": 8265 }, { "epoch": 1.7274817136886103, "grad_norm": 0.8355146283750112, "learning_rate": 1.6710677262012908e-05, "loss": 0.1337, "step": 8266 }, { "epoch": 1.7276907001044932, "grad_norm": 0.8559991457545075, "learning_rate": 1.6709840834512228e-05, "loss": 0.1635, "step": 8267 }, { "epoch": 1.7278996865203762, "grad_norm": 0.8621984545338184, "learning_rate": 1.6709004321618223e-05, "loss": 0.1659, "step": 8268 }, { "epoch": 1.7281086729362591, "grad_norm": 1.0102001364732662, "learning_rate": 1.6708167723341536e-05, "loss": 0.1751, "step": 8269 }, { "epoch": 1.728317659352142, "grad_norm": 0.9444505100734896, "learning_rate": 1.670733103969281e-05, "loss": 0.1947, "step": 8270 }, { "epoch": 1.728526645768025, "grad_norm": 0.9623426111190022, "learning_rate": 1.6706494270682697e-05, "loss": 0.164, "step": 8271 }, { "epoch": 1.728735632183908, "grad_norm": 0.8341862333465839, "learning_rate": 1.670565741632185e-05, "loss": 0.1306, "step": 8272 }, { "epoch": 1.728944618599791, "grad_norm": 0.9439205201520692, "learning_rate": 1.670482047662091e-05, "loss": 0.1705, "step": 8273 }, { "epoch": 1.729153605015674, "grad_norm": 0.8249113508696194, "learning_rate": 1.6703983451590535e-05, "loss": 0.2095, "step": 8274 }, { "epoch": 1.729362591431557, "grad_norm": 0.822430194562158, "learning_rate": 1.6703146341241377e-05, "loss": 0.1745, "step": 8275 }, { "epoch": 1.7295715778474399, "grad_norm": 0.8254351011488427, "learning_rate": 1.6702309145584088e-05, "loss": 0.1713, "step": 8276 }, { "epoch": 1.729780564263323, "grad_norm": 0.8448798135062895, "learning_rate": 1.670147186462932e-05, "loss": 0.1473, "step": 8277 }, { "epoch": 1.729989550679206, "grad_norm": 0.7879264650742557, "learning_rate": 1.670063449838774e-05, "loss": 0.1619, "step": 8278 }, { "epoch": 1.730198537095089, "grad_norm": 0.7898519452038142, "learning_rate": 1.6699797046869986e-05, "loss": 0.1559, "step": 8279 }, { "epoch": 1.730407523510972, "grad_norm": 0.8709498805271036, "learning_rate": 1.6698959510086732e-05, "loss": 0.1691, "step": 8280 }, { "epoch": 1.730616509926855, "grad_norm": 1.0750275895480044, "learning_rate": 1.669812188804863e-05, "loss": 0.1609, "step": 8281 }, { "epoch": 1.7308254963427379, "grad_norm": 1.0587698595251123, "learning_rate": 1.6697284180766345e-05, "loss": 0.1712, "step": 8282 }, { "epoch": 1.7310344827586208, "grad_norm": 0.9956312010928744, "learning_rate": 1.669644638825053e-05, "loss": 0.1958, "step": 8283 }, { "epoch": 1.7312434691745038, "grad_norm": 0.9306585777363093, "learning_rate": 1.6695608510511855e-05, "loss": 0.1701, "step": 8284 }, { "epoch": 1.7314524555903867, "grad_norm": 0.8912112134578904, "learning_rate": 1.6694770547560975e-05, "loss": 0.1758, "step": 8285 }, { "epoch": 1.7316614420062697, "grad_norm": 0.9860271249264954, "learning_rate": 1.6693932499408566e-05, "loss": 0.1493, "step": 8286 }, { "epoch": 1.7318704284221527, "grad_norm": 0.9387958564495695, "learning_rate": 1.6693094366065283e-05, "loss": 0.1394, "step": 8287 }, { "epoch": 1.7320794148380356, "grad_norm": 0.8253602606799716, "learning_rate": 1.66922561475418e-05, "loss": 0.1361, "step": 8288 }, { "epoch": 1.7322884012539186, "grad_norm": 0.9936017293500836, "learning_rate": 1.6691417843848778e-05, "loss": 0.1608, "step": 8289 }, { "epoch": 1.7324973876698015, "grad_norm": 0.9874421040203407, "learning_rate": 1.6690579454996892e-05, "loss": 0.1736, "step": 8290 }, { "epoch": 1.7327063740856845, "grad_norm": 0.928372108542648, "learning_rate": 1.6689740980996805e-05, "loss": 0.1416, "step": 8291 }, { "epoch": 1.7329153605015675, "grad_norm": 1.0262427465606117, "learning_rate": 1.6688902421859197e-05, "loss": 0.1628, "step": 8292 }, { "epoch": 1.7331243469174504, "grad_norm": 1.0954784690295405, "learning_rate": 1.668806377759473e-05, "loss": 0.1698, "step": 8293 }, { "epoch": 1.7333333333333334, "grad_norm": 0.8283398147344101, "learning_rate": 1.6687225048214085e-05, "loss": 0.1782, "step": 8294 }, { "epoch": 1.7335423197492164, "grad_norm": 0.9497122863554037, "learning_rate": 1.6686386233727932e-05, "loss": 0.1738, "step": 8295 }, { "epoch": 1.7337513061650993, "grad_norm": 0.9479772350946396, "learning_rate": 1.6685547334146946e-05, "loss": 0.1344, "step": 8296 }, { "epoch": 1.7339602925809823, "grad_norm": 0.9872328151158402, "learning_rate": 1.6684708349481808e-05, "loss": 0.1503, "step": 8297 }, { "epoch": 1.7341692789968652, "grad_norm": 0.9399180411396987, "learning_rate": 1.6683869279743188e-05, "loss": 0.1502, "step": 8298 }, { "epoch": 1.7343782654127482, "grad_norm": 0.9501942097274624, "learning_rate": 1.668303012494177e-05, "loss": 0.1415, "step": 8299 }, { "epoch": 1.7345872518286312, "grad_norm": 1.0130815918717928, "learning_rate": 1.668219088508823e-05, "loss": 0.1805, "step": 8300 }, { "epoch": 1.7347962382445141, "grad_norm": 0.8021934982897327, "learning_rate": 1.668135156019325e-05, "loss": 0.1535, "step": 8301 }, { "epoch": 1.735005224660397, "grad_norm": 0.9141791470647942, "learning_rate": 1.6680512150267518e-05, "loss": 0.1605, "step": 8302 }, { "epoch": 1.73521421107628, "grad_norm": 0.9761300918212193, "learning_rate": 1.6679672655321706e-05, "loss": 0.1577, "step": 8303 }, { "epoch": 1.735423197492163, "grad_norm": 0.9504791800448987, "learning_rate": 1.667883307536651e-05, "loss": 0.1858, "step": 8304 }, { "epoch": 1.735632183908046, "grad_norm": 0.8558523257223061, "learning_rate": 1.66779934104126e-05, "loss": 0.135, "step": 8305 }, { "epoch": 1.735841170323929, "grad_norm": 1.1019682240290183, "learning_rate": 1.6677153660470673e-05, "loss": 0.1639, "step": 8306 }, { "epoch": 1.736050156739812, "grad_norm": 0.7700428910955973, "learning_rate": 1.6676313825551416e-05, "loss": 0.1664, "step": 8307 }, { "epoch": 1.7362591431556949, "grad_norm": 0.9211666946646503, "learning_rate": 1.6675473905665512e-05, "loss": 0.1585, "step": 8308 }, { "epoch": 1.7364681295715778, "grad_norm": 0.8752399356019016, "learning_rate": 1.6674633900823654e-05, "loss": 0.1659, "step": 8309 }, { "epoch": 1.7366771159874608, "grad_norm": 0.9616149916263029, "learning_rate": 1.667379381103653e-05, "loss": 0.1507, "step": 8310 }, { "epoch": 1.7368861024033437, "grad_norm": 1.131950213445028, "learning_rate": 1.667295363631483e-05, "loss": 0.1729, "step": 8311 }, { "epoch": 1.7370950888192267, "grad_norm": 0.9326019587692078, "learning_rate": 1.6672113376669255e-05, "loss": 0.135, "step": 8312 }, { "epoch": 1.7373040752351097, "grad_norm": 0.8198682740105827, "learning_rate": 1.667127303211049e-05, "loss": 0.137, "step": 8313 }, { "epoch": 1.7375130616509926, "grad_norm": 0.8056627395431332, "learning_rate": 1.6670432602649234e-05, "loss": 0.1568, "step": 8314 }, { "epoch": 1.7377220480668756, "grad_norm": 0.955487678152982, "learning_rate": 1.666959208829618e-05, "loss": 0.1632, "step": 8315 }, { "epoch": 1.7379310344827585, "grad_norm": 0.9255263865297825, "learning_rate": 1.6668751489062026e-05, "loss": 0.1799, "step": 8316 }, { "epoch": 1.7381400208986415, "grad_norm": 0.9649969710632661, "learning_rate": 1.666791080495747e-05, "loss": 0.1538, "step": 8317 }, { "epoch": 1.7383490073145245, "grad_norm": 1.3865526160599537, "learning_rate": 1.666707003599321e-05, "loss": 0.1698, "step": 8318 }, { "epoch": 1.7385579937304074, "grad_norm": 0.9220626856501051, "learning_rate": 1.6666229182179954e-05, "loss": 0.1611, "step": 8319 }, { "epoch": 1.7387669801462904, "grad_norm": 0.7295459549784044, "learning_rate": 1.666538824352839e-05, "loss": 0.1562, "step": 8320 }, { "epoch": 1.7389759665621733, "grad_norm": 0.9747338349528755, "learning_rate": 1.666454722004923e-05, "loss": 0.1627, "step": 8321 }, { "epoch": 1.7391849529780563, "grad_norm": 1.0443908499370604, "learning_rate": 1.6663706111753174e-05, "loss": 0.1759, "step": 8322 }, { "epoch": 1.7393939393939393, "grad_norm": 0.9914942868861754, "learning_rate": 1.6662864918650922e-05, "loss": 0.1834, "step": 8323 }, { "epoch": 1.7396029258098222, "grad_norm": 0.7819737065533948, "learning_rate": 1.666202364075319e-05, "loss": 0.1646, "step": 8324 }, { "epoch": 1.7398119122257052, "grad_norm": 1.0058065139791499, "learning_rate": 1.6661182278070674e-05, "loss": 0.165, "step": 8325 }, { "epoch": 1.7400208986415882, "grad_norm": 0.9827175331099895, "learning_rate": 1.666034083061409e-05, "loss": 0.1753, "step": 8326 }, { "epoch": 1.7402298850574711, "grad_norm": 1.180085168983574, "learning_rate": 1.6659499298394145e-05, "loss": 0.1802, "step": 8327 }, { "epoch": 1.740438871473354, "grad_norm": 1.0737703353009453, "learning_rate": 1.6658657681421542e-05, "loss": 0.1672, "step": 8328 }, { "epoch": 1.740647857889237, "grad_norm": 1.0365549313439253, "learning_rate": 1.6657815979707e-05, "loss": 0.1754, "step": 8329 }, { "epoch": 1.7408568443051202, "grad_norm": 1.0217516444237138, "learning_rate": 1.6656974193261225e-05, "loss": 0.1666, "step": 8330 }, { "epoch": 1.7410658307210032, "grad_norm": 0.7950708767449417, "learning_rate": 1.6656132322094936e-05, "loss": 0.1569, "step": 8331 }, { "epoch": 1.7412748171368861, "grad_norm": 1.127587481252939, "learning_rate": 1.6655290366218847e-05, "loss": 0.1779, "step": 8332 }, { "epoch": 1.741483803552769, "grad_norm": 0.8469409309689486, "learning_rate": 1.6654448325643665e-05, "loss": 0.1637, "step": 8333 }, { "epoch": 1.741692789968652, "grad_norm": 0.9795186486066789, "learning_rate": 1.665360620038011e-05, "loss": 0.1906, "step": 8334 }, { "epoch": 1.741901776384535, "grad_norm": 0.7917628523328287, "learning_rate": 1.6652763990438905e-05, "loss": 0.1415, "step": 8335 }, { "epoch": 1.742110762800418, "grad_norm": 0.9025355462476388, "learning_rate": 1.6651921695830763e-05, "loss": 0.1966, "step": 8336 }, { "epoch": 1.742319749216301, "grad_norm": 0.8413419129452487, "learning_rate": 1.6651079316566404e-05, "loss": 0.1804, "step": 8337 }, { "epoch": 1.742528735632184, "grad_norm": 0.9133098797276434, "learning_rate": 1.6650236852656552e-05, "loss": 0.1727, "step": 8338 }, { "epoch": 1.7427377220480669, "grad_norm": 0.9423878770585417, "learning_rate": 1.6649394304111927e-05, "loss": 0.1663, "step": 8339 }, { "epoch": 1.7429467084639498, "grad_norm": 0.8560610645733073, "learning_rate": 1.664855167094325e-05, "loss": 0.1582, "step": 8340 }, { "epoch": 1.7431556948798328, "grad_norm": 0.9122104485474819, "learning_rate": 1.6647708953161237e-05, "loss": 0.1454, "step": 8341 }, { "epoch": 1.7433646812957158, "grad_norm": 0.8333063468186555, "learning_rate": 1.6646866150776634e-05, "loss": 0.1767, "step": 8342 }, { "epoch": 1.7435736677115987, "grad_norm": 1.0793195853205033, "learning_rate": 1.664602326380015e-05, "loss": 0.1807, "step": 8343 }, { "epoch": 1.7437826541274817, "grad_norm": 0.8652046791179481, "learning_rate": 1.6645180292242518e-05, "loss": 0.1718, "step": 8344 }, { "epoch": 1.7439916405433646, "grad_norm": 0.8392030817476372, "learning_rate": 1.6644337236114463e-05, "loss": 0.159, "step": 8345 }, { "epoch": 1.7442006269592476, "grad_norm": 0.9716023157148915, "learning_rate": 1.6643494095426716e-05, "loss": 0.1812, "step": 8346 }, { "epoch": 1.7444096133751306, "grad_norm": 0.8897651312673438, "learning_rate": 1.664265087019001e-05, "loss": 0.1604, "step": 8347 }, { "epoch": 1.7446185997910137, "grad_norm": 0.8722591940327862, "learning_rate": 1.6641807560415073e-05, "loss": 0.1687, "step": 8348 }, { "epoch": 1.7448275862068967, "grad_norm": 0.8461706095983682, "learning_rate": 1.6640964166112638e-05, "loss": 0.1723, "step": 8349 }, { "epoch": 1.7450365726227797, "grad_norm": 0.9306235740893298, "learning_rate": 1.6640120687293438e-05, "loss": 0.1663, "step": 8350 }, { "epoch": 1.7452455590386626, "grad_norm": 0.8335806092179021, "learning_rate": 1.663927712396821e-05, "loss": 0.1504, "step": 8351 }, { "epoch": 1.7454545454545456, "grad_norm": 0.9682379075811763, "learning_rate": 1.6638433476147686e-05, "loss": 0.1753, "step": 8352 }, { "epoch": 1.7456635318704286, "grad_norm": 0.9497130768725983, "learning_rate": 1.6637589743842608e-05, "loss": 0.1615, "step": 8353 }, { "epoch": 1.7458725182863115, "grad_norm": 1.0193791086390789, "learning_rate": 1.6636745927063706e-05, "loss": 0.1549, "step": 8354 }, { "epoch": 1.7460815047021945, "grad_norm": 0.9572565450595183, "learning_rate": 1.6635902025821725e-05, "loss": 0.1679, "step": 8355 }, { "epoch": 1.7462904911180774, "grad_norm": 1.0233446522321255, "learning_rate": 1.6635058040127408e-05, "loss": 0.1807, "step": 8356 }, { "epoch": 1.7464994775339604, "grad_norm": 0.7758672350916297, "learning_rate": 1.663421396999149e-05, "loss": 0.1122, "step": 8357 }, { "epoch": 1.7467084639498434, "grad_norm": 0.8120303042074356, "learning_rate": 1.663336981542471e-05, "loss": 0.1634, "step": 8358 }, { "epoch": 1.7469174503657263, "grad_norm": 1.0705604087361353, "learning_rate": 1.663252557643782e-05, "loss": 0.2138, "step": 8359 }, { "epoch": 1.7471264367816093, "grad_norm": 0.8999022813963264, "learning_rate": 1.663168125304156e-05, "loss": 0.1575, "step": 8360 }, { "epoch": 1.7473354231974922, "grad_norm": 0.6765999105716204, "learning_rate": 1.6630836845246675e-05, "loss": 0.1421, "step": 8361 }, { "epoch": 1.7475444096133752, "grad_norm": 1.2807303014678575, "learning_rate": 1.6629992353063912e-05, "loss": 0.1498, "step": 8362 }, { "epoch": 1.7477533960292582, "grad_norm": 0.9169828159401555, "learning_rate": 1.662914777650402e-05, "loss": 0.1639, "step": 8363 }, { "epoch": 1.7479623824451411, "grad_norm": 0.9540288954718197, "learning_rate": 1.662830311557774e-05, "loss": 0.154, "step": 8364 }, { "epoch": 1.748171368861024, "grad_norm": 0.8374852268199401, "learning_rate": 1.6627458370295833e-05, "loss": 0.1489, "step": 8365 }, { "epoch": 1.748380355276907, "grad_norm": 0.9153755151388815, "learning_rate": 1.6626613540669045e-05, "loss": 0.1814, "step": 8366 }, { "epoch": 1.74858934169279, "grad_norm": 0.8379108036792137, "learning_rate": 1.6625768626708123e-05, "loss": 0.1699, "step": 8367 }, { "epoch": 1.748798328108673, "grad_norm": 0.9030360556119496, "learning_rate": 1.6624923628423827e-05, "loss": 0.1245, "step": 8368 }, { "epoch": 1.749007314524556, "grad_norm": 1.0288334790189615, "learning_rate": 1.662407854582691e-05, "loss": 0.1572, "step": 8369 }, { "epoch": 1.749216300940439, "grad_norm": 0.9262191608015307, "learning_rate": 1.662323337892812e-05, "loss": 0.18, "step": 8370 }, { "epoch": 1.7494252873563219, "grad_norm": 1.3133750412179606, "learning_rate": 1.6622388127738222e-05, "loss": 0.1817, "step": 8371 }, { "epoch": 1.7496342737722048, "grad_norm": 1.053986984460057, "learning_rate": 1.662154279226797e-05, "loss": 0.1775, "step": 8372 }, { "epoch": 1.7498432601880878, "grad_norm": 1.0400808311338048, "learning_rate": 1.662069737252812e-05, "loss": 0.1779, "step": 8373 }, { "epoch": 1.7500522466039707, "grad_norm": 0.8535933749673539, "learning_rate": 1.661985186852943e-05, "loss": 0.1222, "step": 8374 }, { "epoch": 1.7502612330198537, "grad_norm": 0.8118976578933574, "learning_rate": 1.6619006280282665e-05, "loss": 0.1719, "step": 8375 }, { "epoch": 1.7504702194357367, "grad_norm": 0.8356093106659048, "learning_rate": 1.6618160607798585e-05, "loss": 0.1598, "step": 8376 }, { "epoch": 1.7506792058516196, "grad_norm": 0.9795410780099186, "learning_rate": 1.6617314851087954e-05, "loss": 0.1723, "step": 8377 }, { "epoch": 1.7508881922675026, "grad_norm": 0.9811268582932913, "learning_rate": 1.6616469010161534e-05, "loss": 0.1587, "step": 8378 }, { "epoch": 1.7510971786833855, "grad_norm": 0.8219468825618089, "learning_rate": 1.6615623085030087e-05, "loss": 0.1622, "step": 8379 }, { "epoch": 1.7513061650992685, "grad_norm": 0.9554329310873253, "learning_rate": 1.661477707570438e-05, "loss": 0.1581, "step": 8380 }, { "epoch": 1.7515151515151515, "grad_norm": 0.8464428735297737, "learning_rate": 1.6613930982195183e-05, "loss": 0.1495, "step": 8381 }, { "epoch": 1.7517241379310344, "grad_norm": 0.9280431141119583, "learning_rate": 1.6613084804513264e-05, "loss": 0.1724, "step": 8382 }, { "epoch": 1.7519331243469174, "grad_norm": 1.048375399068132, "learning_rate": 1.6612238542669385e-05, "loss": 0.187, "step": 8383 }, { "epoch": 1.7521421107628004, "grad_norm": 0.894854535369419, "learning_rate": 1.6611392196674325e-05, "loss": 0.1518, "step": 8384 }, { "epoch": 1.7523510971786833, "grad_norm": 0.8336683657637366, "learning_rate": 1.6610545766538847e-05, "loss": 0.1386, "step": 8385 }, { "epoch": 1.7525600835945663, "grad_norm": 0.8284327198316195, "learning_rate": 1.660969925227373e-05, "loss": 0.17, "step": 8386 }, { "epoch": 1.7527690700104492, "grad_norm": 1.1142934058332923, "learning_rate": 1.660885265388974e-05, "loss": 0.1697, "step": 8387 }, { "epoch": 1.7529780564263322, "grad_norm": 1.0261150088511957, "learning_rate": 1.6608005971397663e-05, "loss": 0.1958, "step": 8388 }, { "epoch": 1.7531870428422152, "grad_norm": 0.9513799163911888, "learning_rate": 1.660715920480826e-05, "loss": 0.1474, "step": 8389 }, { "epoch": 1.7533960292580981, "grad_norm": 0.9550610302187454, "learning_rate": 1.6606312354132315e-05, "loss": 0.1471, "step": 8390 }, { "epoch": 1.753605015673981, "grad_norm": 0.9672709323165363, "learning_rate": 1.660546541938061e-05, "loss": 0.1455, "step": 8391 }, { "epoch": 1.753814002089864, "grad_norm": 0.9639756830329058, "learning_rate": 1.6604618400563913e-05, "loss": 0.1612, "step": 8392 }, { "epoch": 1.754022988505747, "grad_norm": 1.0091984254616997, "learning_rate": 1.6603771297693015e-05, "loss": 0.1706, "step": 8393 }, { "epoch": 1.75423197492163, "grad_norm": 0.9352713632777935, "learning_rate": 1.6602924110778688e-05, "loss": 0.1511, "step": 8394 }, { "epoch": 1.754440961337513, "grad_norm": 0.7656458786464104, "learning_rate": 1.6602076839831713e-05, "loss": 0.1409, "step": 8395 }, { "epoch": 1.7546499477533959, "grad_norm": 1.008227439624324, "learning_rate": 1.660122948486288e-05, "loss": 0.1755, "step": 8396 }, { "epoch": 1.7548589341692789, "grad_norm": 0.8285571420155222, "learning_rate": 1.6600382045882974e-05, "loss": 0.1475, "step": 8397 }, { "epoch": 1.7550679205851618, "grad_norm": 1.0423009467281894, "learning_rate": 1.6599534522902768e-05, "loss": 0.1927, "step": 8398 }, { "epoch": 1.7552769070010448, "grad_norm": 0.844382900469842, "learning_rate": 1.659868691593306e-05, "loss": 0.1556, "step": 8399 }, { "epoch": 1.7554858934169277, "grad_norm": 0.8638734212424071, "learning_rate": 1.6597839224984628e-05, "loss": 0.1317, "step": 8400 }, { "epoch": 1.755694879832811, "grad_norm": 1.1091229499247939, "learning_rate": 1.6596991450068266e-05, "loss": 0.1598, "step": 8401 }, { "epoch": 1.7559038662486939, "grad_norm": 0.9717946965884676, "learning_rate": 1.6596143591194763e-05, "loss": 0.193, "step": 8402 }, { "epoch": 1.7561128526645768, "grad_norm": 1.265236103659761, "learning_rate": 1.659529564837491e-05, "loss": 0.1565, "step": 8403 }, { "epoch": 1.7563218390804598, "grad_norm": 1.0876932421447754, "learning_rate": 1.6594447621619496e-05, "loss": 0.1722, "step": 8404 }, { "epoch": 1.7565308254963428, "grad_norm": 0.9034119944291349, "learning_rate": 1.6593599510939317e-05, "loss": 0.1654, "step": 8405 }, { "epoch": 1.7567398119122257, "grad_norm": 0.8266373040851214, "learning_rate": 1.659275131634516e-05, "loss": 0.1383, "step": 8406 }, { "epoch": 1.7569487983281087, "grad_norm": 0.8461563588671193, "learning_rate": 1.6591903037847825e-05, "loss": 0.1358, "step": 8407 }, { "epoch": 1.7571577847439916, "grad_norm": 0.9312523324191948, "learning_rate": 1.6591054675458105e-05, "loss": 0.1709, "step": 8408 }, { "epoch": 1.7573667711598746, "grad_norm": 0.8816256443827112, "learning_rate": 1.65902062291868e-05, "loss": 0.1374, "step": 8409 }, { "epoch": 1.7575757575757576, "grad_norm": 0.79988863620373, "learning_rate": 1.6589357699044704e-05, "loss": 0.1433, "step": 8410 }, { "epoch": 1.7577847439916405, "grad_norm": 0.7350201733518895, "learning_rate": 1.658850908504262e-05, "loss": 0.141, "step": 8411 }, { "epoch": 1.7579937304075235, "grad_norm": 0.9604171363621007, "learning_rate": 1.658766038719134e-05, "loss": 0.1539, "step": 8412 }, { "epoch": 1.7582027168234065, "grad_norm": 0.9402250843421951, "learning_rate": 1.6586811605501677e-05, "loss": 0.1831, "step": 8413 }, { "epoch": 1.7584117032392894, "grad_norm": 0.9211072485488723, "learning_rate": 1.6585962739984425e-05, "loss": 0.1761, "step": 8414 }, { "epoch": 1.7586206896551724, "grad_norm": 1.1080182757264814, "learning_rate": 1.658511379065039e-05, "loss": 0.1403, "step": 8415 }, { "epoch": 1.7588296760710553, "grad_norm": 0.947372591880255, "learning_rate": 1.6584264757510372e-05, "loss": 0.146, "step": 8416 }, { "epoch": 1.7590386624869383, "grad_norm": 0.7367182639878473, "learning_rate": 1.658341564057518e-05, "loss": 0.1432, "step": 8417 }, { "epoch": 1.7592476489028215, "grad_norm": 0.9713415762304878, "learning_rate": 1.658256643985562e-05, "loss": 0.225, "step": 8418 }, { "epoch": 1.7594566353187044, "grad_norm": 1.006054272774399, "learning_rate": 1.6581717155362503e-05, "loss": 0.1695, "step": 8419 }, { "epoch": 1.7596656217345874, "grad_norm": 0.9502810341238205, "learning_rate": 1.658086778710663e-05, "loss": 0.1829, "step": 8420 }, { "epoch": 1.7598746081504704, "grad_norm": 0.8272401756308562, "learning_rate": 1.6580018335098814e-05, "loss": 0.1798, "step": 8421 }, { "epoch": 1.7600835945663533, "grad_norm": 0.8713008999306394, "learning_rate": 1.6579168799349866e-05, "loss": 0.1759, "step": 8422 }, { "epoch": 1.7602925809822363, "grad_norm": 0.8921239302920557, "learning_rate": 1.6578319179870596e-05, "loss": 0.1787, "step": 8423 }, { "epoch": 1.7605015673981192, "grad_norm": 0.9671086866968919, "learning_rate": 1.6577469476671823e-05, "loss": 0.1614, "step": 8424 }, { "epoch": 1.7607105538140022, "grad_norm": 0.9063331088526654, "learning_rate": 1.6576619689764352e-05, "loss": 0.149, "step": 8425 }, { "epoch": 1.7609195402298852, "grad_norm": 1.1060416085042666, "learning_rate": 1.6575769819159004e-05, "loss": 0.1852, "step": 8426 }, { "epoch": 1.7611285266457681, "grad_norm": 0.8298295758794314, "learning_rate": 1.657491986486659e-05, "loss": 0.1552, "step": 8427 }, { "epoch": 1.761337513061651, "grad_norm": 1.056424160127054, "learning_rate": 1.6574069826897937e-05, "loss": 0.1554, "step": 8428 }, { "epoch": 1.761546499477534, "grad_norm": 1.0664682647164843, "learning_rate": 1.6573219705263848e-05, "loss": 0.166, "step": 8429 }, { "epoch": 1.761755485893417, "grad_norm": 1.0081999406115483, "learning_rate": 1.6572369499975156e-05, "loss": 0.1807, "step": 8430 }, { "epoch": 1.7619644723093, "grad_norm": 0.9495106303383287, "learning_rate": 1.657151921104267e-05, "loss": 0.1423, "step": 8431 }, { "epoch": 1.762173458725183, "grad_norm": 0.8020032555949793, "learning_rate": 1.6570668838477223e-05, "loss": 0.1397, "step": 8432 }, { "epoch": 1.762382445141066, "grad_norm": 0.7953107759204189, "learning_rate": 1.6569818382289623e-05, "loss": 0.1436, "step": 8433 }, { "epoch": 1.7625914315569489, "grad_norm": 1.077600577490697, "learning_rate": 1.656896784249071e-05, "loss": 0.2208, "step": 8434 }, { "epoch": 1.7628004179728318, "grad_norm": 0.9684882067860763, "learning_rate": 1.6568117219091294e-05, "loss": 0.1802, "step": 8435 }, { "epoch": 1.7630094043887148, "grad_norm": 0.9455001345640637, "learning_rate": 1.656726651210221e-05, "loss": 0.1908, "step": 8436 }, { "epoch": 1.7632183908045977, "grad_norm": 0.8426624633831216, "learning_rate": 1.6566415721534275e-05, "loss": 0.1756, "step": 8437 }, { "epoch": 1.7634273772204807, "grad_norm": 0.9053383986368706, "learning_rate": 1.6565564847398328e-05, "loss": 0.175, "step": 8438 }, { "epoch": 1.7636363636363637, "grad_norm": 0.8147875630156959, "learning_rate": 1.656471388970519e-05, "loss": 0.1552, "step": 8439 }, { "epoch": 1.7638453500522466, "grad_norm": 1.0254555922827806, "learning_rate": 1.6563862848465695e-05, "loss": 0.1731, "step": 8440 }, { "epoch": 1.7640543364681296, "grad_norm": 0.9251675554745086, "learning_rate": 1.6563011723690673e-05, "loss": 0.1542, "step": 8441 }, { "epoch": 1.7642633228840126, "grad_norm": 0.9221455631914987, "learning_rate": 1.6562160515390952e-05, "loss": 0.1252, "step": 8442 }, { "epoch": 1.7644723092998955, "grad_norm": 1.1123902648629915, "learning_rate": 1.6561309223577367e-05, "loss": 0.1657, "step": 8443 }, { "epoch": 1.7646812957157785, "grad_norm": 0.8843158570544811, "learning_rate": 1.6560457848260753e-05, "loss": 0.1572, "step": 8444 }, { "epoch": 1.7648902821316614, "grad_norm": 0.8354803404149455, "learning_rate": 1.6559606389451945e-05, "loss": 0.1849, "step": 8445 }, { "epoch": 1.7650992685475444, "grad_norm": 1.1578078731789967, "learning_rate": 1.6558754847161783e-05, "loss": 0.1579, "step": 8446 }, { "epoch": 1.7653082549634274, "grad_norm": 0.9633279670263627, "learning_rate": 1.6557903221401098e-05, "loss": 0.1618, "step": 8447 }, { "epoch": 1.7655172413793103, "grad_norm": 0.8370503834782025, "learning_rate": 1.6557051512180732e-05, "loss": 0.1515, "step": 8448 }, { "epoch": 1.7657262277951933, "grad_norm": 1.2255248439315751, "learning_rate": 1.655619971951152e-05, "loss": 0.1492, "step": 8449 }, { "epoch": 1.7659352142110762, "grad_norm": 0.9171661821675434, "learning_rate": 1.6555347843404305e-05, "loss": 0.1678, "step": 8450 }, { "epoch": 1.7661442006269592, "grad_norm": 1.0657315703024035, "learning_rate": 1.655449588386993e-05, "loss": 0.1759, "step": 8451 }, { "epoch": 1.7663531870428422, "grad_norm": 0.9213323178352686, "learning_rate": 1.6553643840919237e-05, "loss": 0.1734, "step": 8452 }, { "epoch": 1.7665621734587251, "grad_norm": 1.0057606227295788, "learning_rate": 1.6552791714563066e-05, "loss": 0.1553, "step": 8453 }, { "epoch": 1.766771159874608, "grad_norm": 1.2534936245890778, "learning_rate": 1.655193950481227e-05, "loss": 0.1796, "step": 8454 }, { "epoch": 1.766980146290491, "grad_norm": 0.7340414215953356, "learning_rate": 1.6551087211677687e-05, "loss": 0.1395, "step": 8455 }, { "epoch": 1.767189132706374, "grad_norm": 0.941814433053449, "learning_rate": 1.6550234835170165e-05, "loss": 0.1542, "step": 8456 }, { "epoch": 1.767398119122257, "grad_norm": 0.9487630823053512, "learning_rate": 1.6549382375300555e-05, "loss": 0.1446, "step": 8457 }, { "epoch": 1.76760710553814, "grad_norm": 1.1728586624898523, "learning_rate": 1.6548529832079704e-05, "loss": 0.1571, "step": 8458 }, { "epoch": 1.767816091954023, "grad_norm": 0.8140080496981856, "learning_rate": 1.654767720551846e-05, "loss": 0.1803, "step": 8459 }, { "epoch": 1.7680250783699059, "grad_norm": 0.9918885014922663, "learning_rate": 1.6546824495627676e-05, "loss": 0.1919, "step": 8460 }, { "epoch": 1.7682340647857888, "grad_norm": 1.0548350415594472, "learning_rate": 1.6545971702418204e-05, "loss": 0.1651, "step": 8461 }, { "epoch": 1.7684430512016718, "grad_norm": 0.9759687437543195, "learning_rate": 1.65451188259009e-05, "loss": 0.1746, "step": 8462 }, { "epoch": 1.7686520376175547, "grad_norm": 1.186544245645733, "learning_rate": 1.654426586608661e-05, "loss": 0.2242, "step": 8463 }, { "epoch": 1.7688610240334377, "grad_norm": 0.8467042755220097, "learning_rate": 1.6543412822986198e-05, "loss": 0.1565, "step": 8464 }, { "epoch": 1.7690700104493207, "grad_norm": 0.8562532122968665, "learning_rate": 1.6542559696610515e-05, "loss": 0.1347, "step": 8465 }, { "epoch": 1.7692789968652036, "grad_norm": 1.0899525965054808, "learning_rate": 1.654170648697042e-05, "loss": 0.1545, "step": 8466 }, { "epoch": 1.7694879832810866, "grad_norm": 1.053557941602761, "learning_rate": 1.654085319407677e-05, "loss": 0.1902, "step": 8467 }, { "epoch": 1.7696969696969695, "grad_norm": 1.1314882409416946, "learning_rate": 1.6539999817940433e-05, "loss": 0.1721, "step": 8468 }, { "epoch": 1.7699059561128525, "grad_norm": 0.9333393299175534, "learning_rate": 1.6539146358572257e-05, "loss": 0.1692, "step": 8469 }, { "epoch": 1.7701149425287355, "grad_norm": 0.8043021967817312, "learning_rate": 1.6538292815983113e-05, "loss": 0.1525, "step": 8470 }, { "epoch": 1.7703239289446187, "grad_norm": 0.9165973111520177, "learning_rate": 1.6537439190183856e-05, "loss": 0.1537, "step": 8471 }, { "epoch": 1.7705329153605016, "grad_norm": 0.8721214414676024, "learning_rate": 1.6536585481185356e-05, "loss": 0.1673, "step": 8472 }, { "epoch": 1.7707419017763846, "grad_norm": 0.9375515675767249, "learning_rate": 1.6535731688998476e-05, "loss": 0.1607, "step": 8473 }, { "epoch": 1.7709508881922675, "grad_norm": 0.8855467240931384, "learning_rate": 1.653487781363408e-05, "loss": 0.1717, "step": 8474 }, { "epoch": 1.7711598746081505, "grad_norm": 1.0574334945361543, "learning_rate": 1.6534023855103038e-05, "loss": 0.1913, "step": 8475 }, { "epoch": 1.7713688610240335, "grad_norm": 0.8834453431094764, "learning_rate": 1.6533169813416213e-05, "loss": 0.1922, "step": 8476 }, { "epoch": 1.7715778474399164, "grad_norm": 1.061856331501497, "learning_rate": 1.6532315688584483e-05, "loss": 0.1826, "step": 8477 }, { "epoch": 1.7717868338557994, "grad_norm": 0.8497861802930897, "learning_rate": 1.6531461480618708e-05, "loss": 0.1741, "step": 8478 }, { "epoch": 1.7719958202716823, "grad_norm": 0.8910526280040746, "learning_rate": 1.6530607189529764e-05, "loss": 0.1302, "step": 8479 }, { "epoch": 1.7722048066875653, "grad_norm": 1.0075125595516523, "learning_rate": 1.6529752815328525e-05, "loss": 0.1669, "step": 8480 }, { "epoch": 1.7724137931034483, "grad_norm": 0.8804242057541841, "learning_rate": 1.6528898358025856e-05, "loss": 0.1589, "step": 8481 }, { "epoch": 1.7726227795193312, "grad_norm": 0.9503335507164652, "learning_rate": 1.6528043817632643e-05, "loss": 0.1665, "step": 8482 }, { "epoch": 1.7728317659352142, "grad_norm": 0.7672644773341526, "learning_rate": 1.6527189194159755e-05, "loss": 0.1221, "step": 8483 }, { "epoch": 1.7730407523510971, "grad_norm": 1.7592028163696185, "learning_rate": 1.6526334487618068e-05, "loss": 0.1673, "step": 8484 }, { "epoch": 1.77324973876698, "grad_norm": 0.8851477698060405, "learning_rate": 1.652547969801846e-05, "loss": 0.1553, "step": 8485 }, { "epoch": 1.773458725182863, "grad_norm": 1.0383708729216965, "learning_rate": 1.6524624825371814e-05, "loss": 0.1598, "step": 8486 }, { "epoch": 1.773667711598746, "grad_norm": 0.9078466991006399, "learning_rate": 1.6523769869689003e-05, "loss": 0.1279, "step": 8487 }, { "epoch": 1.773876698014629, "grad_norm": 0.9780740421293048, "learning_rate": 1.652291483098091e-05, "loss": 0.1869, "step": 8488 }, { "epoch": 1.7740856844305122, "grad_norm": 0.8637017286864813, "learning_rate": 1.6522059709258417e-05, "loss": 0.1721, "step": 8489 }, { "epoch": 1.7742946708463951, "grad_norm": 1.0595587567741238, "learning_rate": 1.6521204504532408e-05, "loss": 0.1855, "step": 8490 }, { "epoch": 1.774503657262278, "grad_norm": 1.085164521967768, "learning_rate": 1.6520349216813766e-05, "loss": 0.1536, "step": 8491 }, { "epoch": 1.774712643678161, "grad_norm": 0.9118854195316743, "learning_rate": 1.6519493846113375e-05, "loss": 0.1682, "step": 8492 }, { "epoch": 1.774921630094044, "grad_norm": 0.9829860441935899, "learning_rate": 1.6518638392442123e-05, "loss": 0.1504, "step": 8493 }, { "epoch": 1.775130616509927, "grad_norm": 0.9126052132492962, "learning_rate": 1.6517782855810892e-05, "loss": 0.1867, "step": 8494 }, { "epoch": 1.77533960292581, "grad_norm": 0.8140647309214173, "learning_rate": 1.651692723623058e-05, "loss": 0.1508, "step": 8495 }, { "epoch": 1.775548589341693, "grad_norm": 1.0559630369890787, "learning_rate": 1.6516071533712065e-05, "loss": 0.1816, "step": 8496 }, { "epoch": 1.7757575757575759, "grad_norm": 0.7757948520305125, "learning_rate": 1.651521574826624e-05, "loss": 0.1507, "step": 8497 }, { "epoch": 1.7759665621734588, "grad_norm": 1.603238489714497, "learning_rate": 1.6514359879904e-05, "loss": 0.1615, "step": 8498 }, { "epoch": 1.7761755485893418, "grad_norm": 0.8397339097014633, "learning_rate": 1.651350392863624e-05, "loss": 0.162, "step": 8499 }, { "epoch": 1.7763845350052248, "grad_norm": 0.8863008043946893, "learning_rate": 1.6512647894473843e-05, "loss": 0.1399, "step": 8500 }, { "epoch": 1.7765935214211077, "grad_norm": 1.1245962299715022, "learning_rate": 1.6511791777427712e-05, "loss": 0.1591, "step": 8501 }, { "epoch": 1.7768025078369907, "grad_norm": 1.3429095051263695, "learning_rate": 1.651093557750874e-05, "loss": 0.1898, "step": 8502 }, { "epoch": 1.7770114942528736, "grad_norm": 0.8648458801069667, "learning_rate": 1.6510079294727823e-05, "loss": 0.1531, "step": 8503 }, { "epoch": 1.7772204806687566, "grad_norm": 0.9935962359697493, "learning_rate": 1.6509222929095855e-05, "loss": 0.1254, "step": 8504 }, { "epoch": 1.7774294670846396, "grad_norm": 0.9547731396458786, "learning_rate": 1.6508366480623743e-05, "loss": 0.1548, "step": 8505 }, { "epoch": 1.7776384535005225, "grad_norm": 1.0277609141584947, "learning_rate": 1.650750994932238e-05, "loss": 0.1608, "step": 8506 }, { "epoch": 1.7778474399164055, "grad_norm": 0.863318780411131, "learning_rate": 1.6506653335202668e-05, "loss": 0.1573, "step": 8507 }, { "epoch": 1.7780564263322884, "grad_norm": 1.2638682434408761, "learning_rate": 1.6505796638275507e-05, "loss": 0.1967, "step": 8508 }, { "epoch": 1.7782654127481714, "grad_norm": 0.9009814194438344, "learning_rate": 1.6504939858551808e-05, "loss": 0.1524, "step": 8509 }, { "epoch": 1.7784743991640544, "grad_norm": 0.838370140749275, "learning_rate": 1.6504082996042466e-05, "loss": 0.1394, "step": 8510 }, { "epoch": 1.7786833855799373, "grad_norm": 1.0037342259634168, "learning_rate": 1.650322605075839e-05, "loss": 0.1699, "step": 8511 }, { "epoch": 1.7788923719958203, "grad_norm": 1.0078040991916688, "learning_rate": 1.6502369022710483e-05, "loss": 0.1622, "step": 8512 }, { "epoch": 1.7791013584117032, "grad_norm": 1.1355081120474109, "learning_rate": 1.6501511911909655e-05, "loss": 0.1717, "step": 8513 }, { "epoch": 1.7793103448275862, "grad_norm": 0.8448324326105021, "learning_rate": 1.6500654718366817e-05, "loss": 0.1655, "step": 8514 }, { "epoch": 1.7795193312434692, "grad_norm": 1.0976791317292331, "learning_rate": 1.6499797442092868e-05, "loss": 0.1584, "step": 8515 }, { "epoch": 1.7797283176593521, "grad_norm": 0.8058817939213352, "learning_rate": 1.6498940083098728e-05, "loss": 0.1448, "step": 8516 }, { "epoch": 1.779937304075235, "grad_norm": 0.7785824728579845, "learning_rate": 1.6498082641395304e-05, "loss": 0.1359, "step": 8517 }, { "epoch": 1.780146290491118, "grad_norm": 0.8206182901101884, "learning_rate": 1.6497225116993506e-05, "loss": 0.1681, "step": 8518 }, { "epoch": 1.780355276907001, "grad_norm": 0.8483046008188674, "learning_rate": 1.6496367509904256e-05, "loss": 0.1499, "step": 8519 }, { "epoch": 1.780564263322884, "grad_norm": 1.0061742817695347, "learning_rate": 1.649550982013846e-05, "loss": 0.1848, "step": 8520 }, { "epoch": 1.780773249738767, "grad_norm": 0.9063456500780425, "learning_rate": 1.649465204770704e-05, "loss": 0.1366, "step": 8521 }, { "epoch": 1.78098223615465, "grad_norm": 0.8098428759812475, "learning_rate": 1.64937941926209e-05, "loss": 0.1658, "step": 8522 }, { "epoch": 1.7811912225705329, "grad_norm": 0.822389178221335, "learning_rate": 1.6492936254890975e-05, "loss": 0.1616, "step": 8523 }, { "epoch": 1.7814002089864158, "grad_norm": 1.0717057839584923, "learning_rate": 1.649207823452817e-05, "loss": 0.1534, "step": 8524 }, { "epoch": 1.7816091954022988, "grad_norm": 1.060119433351059, "learning_rate": 1.649122013154341e-05, "loss": 0.1912, "step": 8525 }, { "epoch": 1.7818181818181817, "grad_norm": 1.1516980683707605, "learning_rate": 1.649036194594762e-05, "loss": 0.174, "step": 8526 }, { "epoch": 1.7820271682340647, "grad_norm": 1.012034508871774, "learning_rate": 1.6489503677751712e-05, "loss": 0.1959, "step": 8527 }, { "epoch": 1.7822361546499477, "grad_norm": 0.9862922285739028, "learning_rate": 1.6488645326966617e-05, "loss": 0.1846, "step": 8528 }, { "epoch": 1.7824451410658306, "grad_norm": 1.4831660588455453, "learning_rate": 1.6487786893603256e-05, "loss": 0.1871, "step": 8529 }, { "epoch": 1.7826541274817136, "grad_norm": 0.7840220078113347, "learning_rate": 1.648692837767255e-05, "loss": 0.1378, "step": 8530 }, { "epoch": 1.7828631138975966, "grad_norm": 0.9924406984069855, "learning_rate": 1.648606977918543e-05, "loss": 0.1576, "step": 8531 }, { "epoch": 1.7830721003134795, "grad_norm": 0.9816962474897042, "learning_rate": 1.648521109815282e-05, "loss": 0.1737, "step": 8532 }, { "epoch": 1.7832810867293625, "grad_norm": 0.7384450682062963, "learning_rate": 1.6484352334585654e-05, "loss": 0.1369, "step": 8533 }, { "epoch": 1.7834900731452454, "grad_norm": 1.059375956800135, "learning_rate": 1.6483493488494854e-05, "loss": 0.1862, "step": 8534 }, { "epoch": 1.7836990595611284, "grad_norm": 0.9646395239690445, "learning_rate": 1.6482634559891357e-05, "loss": 0.1569, "step": 8535 }, { "epoch": 1.7839080459770114, "grad_norm": 1.0766871701762928, "learning_rate": 1.6481775548786084e-05, "loss": 0.2104, "step": 8536 }, { "epoch": 1.7841170323928943, "grad_norm": 0.839961832667023, "learning_rate": 1.648091645518998e-05, "loss": 0.1717, "step": 8537 }, { "epoch": 1.7843260188087773, "grad_norm": 0.774873627300007, "learning_rate": 1.6480057279113965e-05, "loss": 0.1448, "step": 8538 }, { "epoch": 1.7845350052246602, "grad_norm": 0.7740583146745648, "learning_rate": 1.6479198020568986e-05, "loss": 0.1568, "step": 8539 }, { "epoch": 1.7847439916405432, "grad_norm": 1.1110583079033092, "learning_rate": 1.6478338679565967e-05, "loss": 0.1934, "step": 8540 }, { "epoch": 1.7849529780564264, "grad_norm": 0.8740330713609712, "learning_rate": 1.6477479256115857e-05, "loss": 0.1555, "step": 8541 }, { "epoch": 1.7851619644723093, "grad_norm": 1.1652985538877325, "learning_rate": 1.6476619750229583e-05, "loss": 0.2039, "step": 8542 }, { "epoch": 1.7853709508881923, "grad_norm": 0.8828298227214476, "learning_rate": 1.647576016191809e-05, "loss": 0.1479, "step": 8543 }, { "epoch": 1.7855799373040753, "grad_norm": 0.896416547793256, "learning_rate": 1.647490049119231e-05, "loss": 0.1543, "step": 8544 }, { "epoch": 1.7857889237199582, "grad_norm": 0.8258297556374251, "learning_rate": 1.6474040738063193e-05, "loss": 0.1511, "step": 8545 }, { "epoch": 1.7859979101358412, "grad_norm": 1.0457581109116652, "learning_rate": 1.647318090254167e-05, "loss": 0.1841, "step": 8546 }, { "epoch": 1.7862068965517242, "grad_norm": 0.9943219371626385, "learning_rate": 1.6472320984638697e-05, "loss": 0.1685, "step": 8547 }, { "epoch": 1.7864158829676071, "grad_norm": 0.9993360704865754, "learning_rate": 1.6471460984365205e-05, "loss": 0.1695, "step": 8548 }, { "epoch": 1.78662486938349, "grad_norm": 0.8528459481773273, "learning_rate": 1.647060090173215e-05, "loss": 0.1668, "step": 8549 }, { "epoch": 1.786833855799373, "grad_norm": 1.1008056087805929, "learning_rate": 1.6469740736750472e-05, "loss": 0.1323, "step": 8550 }, { "epoch": 1.787042842215256, "grad_norm": 0.8869650110720942, "learning_rate": 1.6468880489431117e-05, "loss": 0.1608, "step": 8551 }, { "epoch": 1.787251828631139, "grad_norm": 1.10055028945914, "learning_rate": 1.6468020159785034e-05, "loss": 0.1958, "step": 8552 }, { "epoch": 1.787460815047022, "grad_norm": 1.1848214615501984, "learning_rate": 1.6467159747823175e-05, "loss": 0.1967, "step": 8553 }, { "epoch": 1.7876698014629049, "grad_norm": 0.9237163185956568, "learning_rate": 1.6466299253556483e-05, "loss": 0.1506, "step": 8554 }, { "epoch": 1.7878787878787878, "grad_norm": 0.968210436454181, "learning_rate": 1.646543867699592e-05, "loss": 0.1386, "step": 8555 }, { "epoch": 1.7880877742946708, "grad_norm": 1.0714822891740268, "learning_rate": 1.6464578018152426e-05, "loss": 0.1477, "step": 8556 }, { "epoch": 1.7882967607105538, "grad_norm": 0.8570508092008111, "learning_rate": 1.6463717277036965e-05, "loss": 0.1725, "step": 8557 }, { "epoch": 1.7885057471264367, "grad_norm": 0.9739718192746208, "learning_rate": 1.6462856453660487e-05, "loss": 0.2037, "step": 8558 }, { "epoch": 1.78871473354232, "grad_norm": 0.8204562411504137, "learning_rate": 1.646199554803394e-05, "loss": 0.1627, "step": 8559 }, { "epoch": 1.7889237199582029, "grad_norm": 0.7419525549477628, "learning_rate": 1.6461134560168296e-05, "loss": 0.143, "step": 8560 }, { "epoch": 1.7891327063740858, "grad_norm": 1.132775093313021, "learning_rate": 1.6460273490074498e-05, "loss": 0.1502, "step": 8561 }, { "epoch": 1.7893416927899688, "grad_norm": 0.8793147989771732, "learning_rate": 1.6459412337763512e-05, "loss": 0.1478, "step": 8562 }, { "epoch": 1.7895506792058518, "grad_norm": 0.9663877595459506, "learning_rate": 1.6458551103246297e-05, "loss": 0.1842, "step": 8563 }, { "epoch": 1.7897596656217347, "grad_norm": 0.9031436194012167, "learning_rate": 1.6457689786533812e-05, "loss": 0.1639, "step": 8564 }, { "epoch": 1.7899686520376177, "grad_norm": 0.8531386145086266, "learning_rate": 1.6456828387637017e-05, "loss": 0.1566, "step": 8565 }, { "epoch": 1.7901776384535006, "grad_norm": 0.8579656999845625, "learning_rate": 1.6455966906566878e-05, "loss": 0.145, "step": 8566 }, { "epoch": 1.7903866248693836, "grad_norm": 1.0601033995363243, "learning_rate": 1.6455105343334358e-05, "loss": 0.1666, "step": 8567 }, { "epoch": 1.7905956112852666, "grad_norm": 0.8289413927284096, "learning_rate": 1.6454243697950422e-05, "loss": 0.1477, "step": 8568 }, { "epoch": 1.7908045977011495, "grad_norm": 0.919463920816502, "learning_rate": 1.645338197042603e-05, "loss": 0.1774, "step": 8569 }, { "epoch": 1.7910135841170325, "grad_norm": 0.8281739222363973, "learning_rate": 1.6452520160772156e-05, "loss": 0.1553, "step": 8570 }, { "epoch": 1.7912225705329154, "grad_norm": 0.9098519591500166, "learning_rate": 1.6451658268999767e-05, "loss": 0.1339, "step": 8571 }, { "epoch": 1.7914315569487984, "grad_norm": 0.8971435708973349, "learning_rate": 1.6450796295119832e-05, "loss": 0.17, "step": 8572 }, { "epoch": 1.7916405433646814, "grad_norm": 0.8610205592734653, "learning_rate": 1.6449934239143313e-05, "loss": 0.1635, "step": 8573 }, { "epoch": 1.7918495297805643, "grad_norm": 0.709911517620694, "learning_rate": 1.6449072101081193e-05, "loss": 0.1638, "step": 8574 }, { "epoch": 1.7920585161964473, "grad_norm": 0.8535765813291669, "learning_rate": 1.6448209880944437e-05, "loss": 0.1318, "step": 8575 }, { "epoch": 1.7922675026123303, "grad_norm": 0.859805326906259, "learning_rate": 1.6447347578744022e-05, "loss": 0.1719, "step": 8576 }, { "epoch": 1.7924764890282132, "grad_norm": 0.9195633755973203, "learning_rate": 1.6446485194490916e-05, "loss": 0.1798, "step": 8577 }, { "epoch": 1.7926854754440962, "grad_norm": 0.9450091971373268, "learning_rate": 1.6445622728196103e-05, "loss": 0.1401, "step": 8578 }, { "epoch": 1.7928944618599791, "grad_norm": 0.8697314562828478, "learning_rate": 1.644476017987055e-05, "loss": 0.1513, "step": 8579 }, { "epoch": 1.793103448275862, "grad_norm": 1.0926002858048585, "learning_rate": 1.644389754952524e-05, "loss": 0.1579, "step": 8580 }, { "epoch": 1.793312434691745, "grad_norm": 0.9097726610647838, "learning_rate": 1.6443034837171147e-05, "loss": 0.1747, "step": 8581 }, { "epoch": 1.793521421107628, "grad_norm": 0.8900317411930518, "learning_rate": 1.644217204281926e-05, "loss": 0.1475, "step": 8582 }, { "epoch": 1.793730407523511, "grad_norm": 1.0842220428081746, "learning_rate": 1.6441309166480547e-05, "loss": 0.1715, "step": 8583 }, { "epoch": 1.793939393939394, "grad_norm": 1.1408923574389944, "learning_rate": 1.6440446208166e-05, "loss": 0.1718, "step": 8584 }, { "epoch": 1.794148380355277, "grad_norm": 0.9649942703312955, "learning_rate": 1.6439583167886593e-05, "loss": 0.1756, "step": 8585 }, { "epoch": 1.7943573667711599, "grad_norm": 1.0733523198903638, "learning_rate": 1.6438720045653314e-05, "loss": 0.1511, "step": 8586 }, { "epoch": 1.7945663531870428, "grad_norm": 1.1054009306521018, "learning_rate": 1.6437856841477147e-05, "loss": 0.1756, "step": 8587 }, { "epoch": 1.7947753396029258, "grad_norm": 0.9784537151742507, "learning_rate": 1.6436993555369078e-05, "loss": 0.1715, "step": 8588 }, { "epoch": 1.7949843260188088, "grad_norm": 0.823324795751295, "learning_rate": 1.6436130187340095e-05, "loss": 0.1561, "step": 8589 }, { "epoch": 1.7951933124346917, "grad_norm": 0.8855762979552394, "learning_rate": 1.643526673740118e-05, "loss": 0.1585, "step": 8590 }, { "epoch": 1.7954022988505747, "grad_norm": 0.9076354977174673, "learning_rate": 1.6434403205563327e-05, "loss": 0.1409, "step": 8591 }, { "epoch": 1.7956112852664576, "grad_norm": 0.8574990663664807, "learning_rate": 1.6433539591837527e-05, "loss": 0.1426, "step": 8592 }, { "epoch": 1.7958202716823406, "grad_norm": 0.9214169304282028, "learning_rate": 1.6432675896234768e-05, "loss": 0.1462, "step": 8593 }, { "epoch": 1.7960292580982236, "grad_norm": 0.9141086936659448, "learning_rate": 1.6431812118766037e-05, "loss": 0.1832, "step": 8594 }, { "epoch": 1.7962382445141065, "grad_norm": 0.8681066929672283, "learning_rate": 1.6430948259442336e-05, "loss": 0.1922, "step": 8595 }, { "epoch": 1.7964472309299895, "grad_norm": 0.8734502450369805, "learning_rate": 1.6430084318274656e-05, "loss": 0.1514, "step": 8596 }, { "epoch": 1.7966562173458724, "grad_norm": 1.0359105383741685, "learning_rate": 1.6429220295273988e-05, "loss": 0.1579, "step": 8597 }, { "epoch": 1.7968652037617554, "grad_norm": 0.8726875931712291, "learning_rate": 1.6428356190451336e-05, "loss": 0.1791, "step": 8598 }, { "epoch": 1.7970741901776384, "grad_norm": 0.9364650894966753, "learning_rate": 1.642749200381769e-05, "loss": 0.1534, "step": 8599 }, { "epoch": 1.7972831765935213, "grad_norm": 0.7700517437658888, "learning_rate": 1.642662773538405e-05, "loss": 0.1547, "step": 8600 }, { "epoch": 1.7974921630094043, "grad_norm": 0.8178946472486788, "learning_rate": 1.6425763385161416e-05, "loss": 0.1692, "step": 8601 }, { "epoch": 1.7977011494252872, "grad_norm": 1.177041155002386, "learning_rate": 1.6424898953160785e-05, "loss": 0.1637, "step": 8602 }, { "epoch": 1.7979101358411702, "grad_norm": 1.1365735683644795, "learning_rate": 1.6424034439393165e-05, "loss": 0.1964, "step": 8603 }, { "epoch": 1.7981191222570532, "grad_norm": 0.7840561981829125, "learning_rate": 1.6423169843869554e-05, "loss": 0.1279, "step": 8604 }, { "epoch": 1.7983281086729361, "grad_norm": 0.8028690134416269, "learning_rate": 1.6422305166600953e-05, "loss": 0.1312, "step": 8605 }, { "epoch": 1.798537095088819, "grad_norm": 0.8663223313809666, "learning_rate": 1.642144040759837e-05, "loss": 0.1728, "step": 8606 }, { "epoch": 1.798746081504702, "grad_norm": 1.5182353000304896, "learning_rate": 1.642057556687281e-05, "loss": 0.169, "step": 8607 }, { "epoch": 1.798955067920585, "grad_norm": 0.9849933353351358, "learning_rate": 1.6419710644435282e-05, "loss": 0.1547, "step": 8608 }, { "epoch": 1.799164054336468, "grad_norm": 0.7520780507392429, "learning_rate": 1.6418845640296788e-05, "loss": 0.1398, "step": 8609 }, { "epoch": 1.799373040752351, "grad_norm": 1.1188160016304995, "learning_rate": 1.641798055446834e-05, "loss": 0.1886, "step": 8610 }, { "epoch": 1.799582027168234, "grad_norm": 0.8234013871707653, "learning_rate": 1.6417115386960944e-05, "loss": 0.1677, "step": 8611 }, { "epoch": 1.799791013584117, "grad_norm": 1.0105534145717445, "learning_rate": 1.641625013778562e-05, "loss": 0.1861, "step": 8612 }, { "epoch": 1.8, "grad_norm": 1.001575472938589, "learning_rate": 1.6415384806953366e-05, "loss": 0.1787, "step": 8613 }, { "epoch": 1.800208986415883, "grad_norm": 1.0193671519727219, "learning_rate": 1.6414519394475202e-05, "loss": 0.1668, "step": 8614 }, { "epoch": 1.800417972831766, "grad_norm": 0.9385620045829925, "learning_rate": 1.6413653900362145e-05, "loss": 0.1762, "step": 8615 }, { "epoch": 1.800626959247649, "grad_norm": 0.8323689677124186, "learning_rate": 1.6412788324625205e-05, "loss": 0.1683, "step": 8616 }, { "epoch": 1.800835945663532, "grad_norm": 0.9067147182943207, "learning_rate": 1.64119226672754e-05, "loss": 0.1832, "step": 8617 }, { "epoch": 1.8010449320794149, "grad_norm": 0.936230062715684, "learning_rate": 1.6411056928323742e-05, "loss": 0.1558, "step": 8618 }, { "epoch": 1.8012539184952978, "grad_norm": 0.9738204009606727, "learning_rate": 1.6410191107781256e-05, "loss": 0.192, "step": 8619 }, { "epoch": 1.8014629049111808, "grad_norm": 0.8993804381285766, "learning_rate": 1.6409325205658956e-05, "loss": 0.1609, "step": 8620 }, { "epoch": 1.8016718913270637, "grad_norm": 0.958250785196989, "learning_rate": 1.640845922196787e-05, "loss": 0.1469, "step": 8621 }, { "epoch": 1.8018808777429467, "grad_norm": 0.7322236645667902, "learning_rate": 1.6407593156719004e-05, "loss": 0.1562, "step": 8622 }, { "epoch": 1.8020898641588297, "grad_norm": 1.0164023256012373, "learning_rate": 1.640672700992339e-05, "loss": 0.1749, "step": 8623 }, { "epoch": 1.8022988505747126, "grad_norm": 0.9215987822630991, "learning_rate": 1.6405860781592055e-05, "loss": 0.1604, "step": 8624 }, { "epoch": 1.8025078369905956, "grad_norm": 0.7784907267652578, "learning_rate": 1.640499447173601e-05, "loss": 0.1587, "step": 8625 }, { "epoch": 1.8027168234064785, "grad_norm": 0.9407618715698008, "learning_rate": 1.6404128080366297e-05, "loss": 0.1852, "step": 8626 }, { "epoch": 1.8029258098223615, "grad_norm": 0.9183034964013547, "learning_rate": 1.6403261607493926e-05, "loss": 0.1552, "step": 8627 }, { "epoch": 1.8031347962382445, "grad_norm": 0.8078428844498453, "learning_rate": 1.6402395053129935e-05, "loss": 0.1374, "step": 8628 }, { "epoch": 1.8033437826541276, "grad_norm": 0.9643019987830473, "learning_rate": 1.640152841728535e-05, "loss": 0.155, "step": 8629 }, { "epoch": 1.8035527690700106, "grad_norm": 0.8796725517095417, "learning_rate": 1.6400661699971196e-05, "loss": 0.1473, "step": 8630 }, { "epoch": 1.8037617554858936, "grad_norm": 0.9008569550837725, "learning_rate": 1.639979490119851e-05, "loss": 0.1657, "step": 8631 }, { "epoch": 1.8039707419017765, "grad_norm": 0.8722165287062612, "learning_rate": 1.639892802097832e-05, "loss": 0.1334, "step": 8632 }, { "epoch": 1.8041797283176595, "grad_norm": 1.0830893166464528, "learning_rate": 1.6398061059321656e-05, "loss": 0.1927, "step": 8633 }, { "epoch": 1.8043887147335425, "grad_norm": 0.8512129064267256, "learning_rate": 1.6397194016239553e-05, "loss": 0.1653, "step": 8634 }, { "epoch": 1.8045977011494254, "grad_norm": 0.8189437090063826, "learning_rate": 1.6396326891743045e-05, "loss": 0.1601, "step": 8635 }, { "epoch": 1.8048066875653084, "grad_norm": 0.8994503292386477, "learning_rate": 1.6395459685843174e-05, "loss": 0.1492, "step": 8636 }, { "epoch": 1.8050156739811913, "grad_norm": 1.2766465804314882, "learning_rate": 1.6394592398550966e-05, "loss": 0.2087, "step": 8637 }, { "epoch": 1.8052246603970743, "grad_norm": 0.8765874191432061, "learning_rate": 1.6393725029877468e-05, "loss": 0.1962, "step": 8638 }, { "epoch": 1.8054336468129573, "grad_norm": 0.9632890524707668, "learning_rate": 1.6392857579833713e-05, "loss": 0.1884, "step": 8639 }, { "epoch": 1.8056426332288402, "grad_norm": 0.7974612094757091, "learning_rate": 1.6391990048430743e-05, "loss": 0.1736, "step": 8640 }, { "epoch": 1.8058516196447232, "grad_norm": 0.9017518391842246, "learning_rate": 1.6391122435679596e-05, "loss": 0.1337, "step": 8641 }, { "epoch": 1.8060606060606061, "grad_norm": 0.7859334769159375, "learning_rate": 1.6390254741591313e-05, "loss": 0.1455, "step": 8642 }, { "epoch": 1.806269592476489, "grad_norm": 0.8757597902226668, "learning_rate": 1.6389386966176948e-05, "loss": 0.1693, "step": 8643 }, { "epoch": 1.806478578892372, "grad_norm": 0.9569926244896493, "learning_rate": 1.638851910944753e-05, "loss": 0.1809, "step": 8644 }, { "epoch": 1.806687565308255, "grad_norm": 0.7931277302409738, "learning_rate": 1.6387651171414113e-05, "loss": 0.1472, "step": 8645 }, { "epoch": 1.806896551724138, "grad_norm": 0.7213965601052853, "learning_rate": 1.6386783152087734e-05, "loss": 0.1663, "step": 8646 }, { "epoch": 1.807105538140021, "grad_norm": 1.0258133435687102, "learning_rate": 1.6385915051479453e-05, "loss": 0.1658, "step": 8647 }, { "epoch": 1.807314524555904, "grad_norm": 0.8942301800028544, "learning_rate": 1.638504686960031e-05, "loss": 0.1736, "step": 8648 }, { "epoch": 1.8075235109717869, "grad_norm": 0.836460250355414, "learning_rate": 1.6384178606461355e-05, "loss": 0.18, "step": 8649 }, { "epoch": 1.8077324973876698, "grad_norm": 0.9743698288993752, "learning_rate": 1.638331026207363e-05, "loss": 0.1484, "step": 8650 }, { "epoch": 1.8079414838035528, "grad_norm": 0.9314501658378326, "learning_rate": 1.6382441836448203e-05, "loss": 0.1636, "step": 8651 }, { "epoch": 1.8081504702194358, "grad_norm": 0.8730547972012053, "learning_rate": 1.638157332959611e-05, "loss": 0.1597, "step": 8652 }, { "epoch": 1.8083594566353187, "grad_norm": 0.9220987763748715, "learning_rate": 1.638070474152842e-05, "loss": 0.1821, "step": 8653 }, { "epoch": 1.8085684430512017, "grad_norm": 1.05186522268804, "learning_rate": 1.6379836072256173e-05, "loss": 0.1617, "step": 8654 }, { "epoch": 1.8087774294670846, "grad_norm": 0.8648719291496835, "learning_rate": 1.637896732179043e-05, "loss": 0.1437, "step": 8655 }, { "epoch": 1.8089864158829676, "grad_norm": 1.1527433365141024, "learning_rate": 1.637809849014225e-05, "loss": 0.1689, "step": 8656 }, { "epoch": 1.8091954022988506, "grad_norm": 0.8690984666370364, "learning_rate": 1.637722957732268e-05, "loss": 0.1396, "step": 8657 }, { "epoch": 1.8094043887147335, "grad_norm": 1.1318141557010615, "learning_rate": 1.637636058334279e-05, "loss": 0.1655, "step": 8658 }, { "epoch": 1.8096133751306165, "grad_norm": 0.9656967392995116, "learning_rate": 1.6375491508213634e-05, "loss": 0.1696, "step": 8659 }, { "epoch": 1.8098223615464994, "grad_norm": 0.9013924391176574, "learning_rate": 1.6374622351946273e-05, "loss": 0.1622, "step": 8660 }, { "epoch": 1.8100313479623824, "grad_norm": 0.9737555716368466, "learning_rate": 1.6373753114551767e-05, "loss": 0.178, "step": 8661 }, { "epoch": 1.8102403343782654, "grad_norm": 0.9653711922200254, "learning_rate": 1.6372883796041178e-05, "loss": 0.1314, "step": 8662 }, { "epoch": 1.8104493207941483, "grad_norm": 0.9205088524080788, "learning_rate": 1.6372014396425575e-05, "loss": 0.1697, "step": 8663 }, { "epoch": 1.8106583072100313, "grad_norm": 0.9377620679938549, "learning_rate": 1.6371144915716017e-05, "loss": 0.145, "step": 8664 }, { "epoch": 1.8108672936259143, "grad_norm": 0.8040036954281453, "learning_rate": 1.6370275353923572e-05, "loss": 0.1524, "step": 8665 }, { "epoch": 1.8110762800417972, "grad_norm": 0.8724966464408838, "learning_rate": 1.6369405711059307e-05, "loss": 0.1497, "step": 8666 }, { "epoch": 1.8112852664576802, "grad_norm": 0.9227255227949329, "learning_rate": 1.6368535987134287e-05, "loss": 0.1567, "step": 8667 }, { "epoch": 1.8114942528735631, "grad_norm": 0.9028873513571869, "learning_rate": 1.6367666182159578e-05, "loss": 0.1569, "step": 8668 }, { "epoch": 1.811703239289446, "grad_norm": 0.9339711780084753, "learning_rate": 1.636679629614626e-05, "loss": 0.1431, "step": 8669 }, { "epoch": 1.811912225705329, "grad_norm": 0.9407385960800279, "learning_rate": 1.6365926329105395e-05, "loss": 0.1366, "step": 8670 }, { "epoch": 1.812121212121212, "grad_norm": 0.9341703328150842, "learning_rate": 1.6365056281048056e-05, "loss": 0.145, "step": 8671 }, { "epoch": 1.812330198537095, "grad_norm": 1.042904406711086, "learning_rate": 1.6364186151985317e-05, "loss": 0.1791, "step": 8672 }, { "epoch": 1.812539184952978, "grad_norm": 0.9518044069737294, "learning_rate": 1.636331594192825e-05, "loss": 0.1799, "step": 8673 }, { "epoch": 1.812748171368861, "grad_norm": 0.9413804374068907, "learning_rate": 1.6362445650887934e-05, "loss": 0.1443, "step": 8674 }, { "epoch": 1.8129571577847439, "grad_norm": 0.9108410164442965, "learning_rate": 1.636157527887544e-05, "loss": 0.1595, "step": 8675 }, { "epoch": 1.8131661442006268, "grad_norm": 0.9337172890271409, "learning_rate": 1.636070482590185e-05, "loss": 0.1656, "step": 8676 }, { "epoch": 1.8133751306165098, "grad_norm": 1.2241532994415287, "learning_rate": 1.6359834291978235e-05, "loss": 0.1752, "step": 8677 }, { "epoch": 1.8135841170323928, "grad_norm": 1.0105981941393811, "learning_rate": 1.635896367711568e-05, "loss": 0.1563, "step": 8678 }, { "epoch": 1.8137931034482757, "grad_norm": 1.0110026151464413, "learning_rate": 1.6358092981325264e-05, "loss": 0.1937, "step": 8679 }, { "epoch": 1.8140020898641587, "grad_norm": 0.9018857938010895, "learning_rate": 1.6357222204618064e-05, "loss": 0.1627, "step": 8680 }, { "epoch": 1.8142110762800416, "grad_norm": 0.8703594432479503, "learning_rate": 1.6356351347005167e-05, "loss": 0.1748, "step": 8681 }, { "epoch": 1.8144200626959248, "grad_norm": 0.941807739622268, "learning_rate": 1.635548040849765e-05, "loss": 0.1364, "step": 8682 }, { "epoch": 1.8146290491118078, "grad_norm": 0.870779592937883, "learning_rate": 1.6354609389106606e-05, "loss": 0.1608, "step": 8683 }, { "epoch": 1.8148380355276907, "grad_norm": 0.7100688973198317, "learning_rate": 1.635373828884311e-05, "loss": 0.1399, "step": 8684 }, { "epoch": 1.8150470219435737, "grad_norm": 0.8210968656423935, "learning_rate": 1.6352867107718258e-05, "loss": 0.1481, "step": 8685 }, { "epoch": 1.8152560083594567, "grad_norm": 1.0657550546180135, "learning_rate": 1.6351995845743128e-05, "loss": 0.1528, "step": 8686 }, { "epoch": 1.8154649947753396, "grad_norm": 1.1096643834508955, "learning_rate": 1.6351124502928813e-05, "loss": 0.1781, "step": 8687 }, { "epoch": 1.8156739811912226, "grad_norm": 0.8695306175162391, "learning_rate": 1.6350253079286404e-05, "loss": 0.1924, "step": 8688 }, { "epoch": 1.8158829676071055, "grad_norm": 0.8404255081379325, "learning_rate": 1.6349381574826985e-05, "loss": 0.163, "step": 8689 }, { "epoch": 1.8160919540229885, "grad_norm": 0.9511063653218055, "learning_rate": 1.6348509989561652e-05, "loss": 0.1722, "step": 8690 }, { "epoch": 1.8163009404388715, "grad_norm": 0.7901082264172509, "learning_rate": 1.63476383235015e-05, "loss": 0.1508, "step": 8691 }, { "epoch": 1.8165099268547544, "grad_norm": 0.8002519250496396, "learning_rate": 1.6346766576657616e-05, "loss": 0.1374, "step": 8692 }, { "epoch": 1.8167189132706374, "grad_norm": 0.7983178378642837, "learning_rate": 1.6345894749041097e-05, "loss": 0.1462, "step": 8693 }, { "epoch": 1.8169278996865204, "grad_norm": 0.8498483765899542, "learning_rate": 1.634502284066304e-05, "loss": 0.1511, "step": 8694 }, { "epoch": 1.8171368861024033, "grad_norm": 0.9615053263390313, "learning_rate": 1.634415085153454e-05, "loss": 0.15, "step": 8695 }, { "epoch": 1.8173458725182863, "grad_norm": 0.7749800243675127, "learning_rate": 1.6343278781666693e-05, "loss": 0.1709, "step": 8696 }, { "epoch": 1.8175548589341692, "grad_norm": 0.7960139857026833, "learning_rate": 1.63424066310706e-05, "loss": 0.1543, "step": 8697 }, { "epoch": 1.8177638453500522, "grad_norm": 0.8884570976244074, "learning_rate": 1.6341534399757354e-05, "loss": 0.1463, "step": 8698 }, { "epoch": 1.8179728317659352, "grad_norm": 0.8125965962926531, "learning_rate": 1.6340662087738065e-05, "loss": 0.1318, "step": 8699 }, { "epoch": 1.8181818181818183, "grad_norm": 0.9620580134621299, "learning_rate": 1.6339789695023834e-05, "loss": 0.1475, "step": 8700 }, { "epoch": 1.8183908045977013, "grad_norm": 0.7404935517133744, "learning_rate": 1.6338917221625754e-05, "loss": 0.1252, "step": 8701 }, { "epoch": 1.8185997910135843, "grad_norm": 0.9212118635043425, "learning_rate": 1.633804466755494e-05, "loss": 0.1664, "step": 8702 }, { "epoch": 1.8188087774294672, "grad_norm": 0.6723130244251483, "learning_rate": 1.6337172032822486e-05, "loss": 0.135, "step": 8703 }, { "epoch": 1.8190177638453502, "grad_norm": 0.904512678486518, "learning_rate": 1.6336299317439504e-05, "loss": 0.1629, "step": 8704 }, { "epoch": 1.8192267502612331, "grad_norm": 0.7232834099749716, "learning_rate": 1.6335426521417098e-05, "loss": 0.157, "step": 8705 }, { "epoch": 1.8194357366771161, "grad_norm": 0.7048624478669157, "learning_rate": 1.633455364476638e-05, "loss": 0.1312, "step": 8706 }, { "epoch": 1.819644723092999, "grad_norm": 1.148316678257425, "learning_rate": 1.633368068749846e-05, "loss": 0.1694, "step": 8707 }, { "epoch": 1.819853709508882, "grad_norm": 0.7996533419909059, "learning_rate": 1.6332807649624438e-05, "loss": 0.1336, "step": 8708 }, { "epoch": 1.820062695924765, "grad_norm": 1.2589622778977119, "learning_rate": 1.633193453115543e-05, "loss": 0.164, "step": 8709 }, { "epoch": 1.820271682340648, "grad_norm": 0.908696196583606, "learning_rate": 1.633106133210255e-05, "loss": 0.1387, "step": 8710 }, { "epoch": 1.820480668756531, "grad_norm": 0.918968468712551, "learning_rate": 1.6330188052476912e-05, "loss": 0.211, "step": 8711 }, { "epoch": 1.8206896551724139, "grad_norm": 1.0011516955812234, "learning_rate": 1.632931469228962e-05, "loss": 0.1517, "step": 8712 }, { "epoch": 1.8208986415882968, "grad_norm": 0.9659661275700094, "learning_rate": 1.6328441251551804e-05, "loss": 0.1758, "step": 8713 }, { "epoch": 1.8211076280041798, "grad_norm": 0.8973660006742945, "learning_rate": 1.6327567730274567e-05, "loss": 0.1404, "step": 8714 }, { "epoch": 1.8213166144200628, "grad_norm": 0.7912800480899553, "learning_rate": 1.632669412846903e-05, "loss": 0.1373, "step": 8715 }, { "epoch": 1.8215256008359457, "grad_norm": 0.9665921635878019, "learning_rate": 1.6325820446146314e-05, "loss": 0.1705, "step": 8716 }, { "epoch": 1.8217345872518287, "grad_norm": 0.871256711420181, "learning_rate": 1.6324946683317538e-05, "loss": 0.1519, "step": 8717 }, { "epoch": 1.8219435736677116, "grad_norm": 0.684026185621134, "learning_rate": 1.6324072839993817e-05, "loss": 0.1355, "step": 8718 }, { "epoch": 1.8221525600835946, "grad_norm": 0.8796848700612693, "learning_rate": 1.6323198916186273e-05, "loss": 0.1461, "step": 8719 }, { "epoch": 1.8223615464994776, "grad_norm": 0.6954493929810776, "learning_rate": 1.6322324911906032e-05, "loss": 0.1496, "step": 8720 }, { "epoch": 1.8225705329153605, "grad_norm": 0.723765614412953, "learning_rate": 1.6321450827164213e-05, "loss": 0.1726, "step": 8721 }, { "epoch": 1.8227795193312435, "grad_norm": 0.9618736703942048, "learning_rate": 1.6320576661971944e-05, "loss": 0.1564, "step": 8722 }, { "epoch": 1.8229885057471265, "grad_norm": 1.0496816242080347, "learning_rate": 1.6319702416340347e-05, "loss": 0.1659, "step": 8723 }, { "epoch": 1.8231974921630094, "grad_norm": 0.9430606172859808, "learning_rate": 1.6318828090280553e-05, "loss": 0.1332, "step": 8724 }, { "epoch": 1.8234064785788924, "grad_norm": 0.8441544478201956, "learning_rate": 1.631795368380368e-05, "loss": 0.1512, "step": 8725 }, { "epoch": 1.8236154649947753, "grad_norm": 1.0995030760719575, "learning_rate": 1.6317079196920865e-05, "loss": 0.1693, "step": 8726 }, { "epoch": 1.8238244514106583, "grad_norm": 0.8251752444892001, "learning_rate": 1.6316204629643235e-05, "loss": 0.1641, "step": 8727 }, { "epoch": 1.8240334378265413, "grad_norm": 0.8115451967306105, "learning_rate": 1.6315329981981913e-05, "loss": 0.1579, "step": 8728 }, { "epoch": 1.8242424242424242, "grad_norm": 1.7756580350181979, "learning_rate": 1.631445525394804e-05, "loss": 0.1416, "step": 8729 }, { "epoch": 1.8244514106583072, "grad_norm": 0.8886407909099827, "learning_rate": 1.631358044555275e-05, "loss": 0.2021, "step": 8730 }, { "epoch": 1.8246603970741901, "grad_norm": 0.9382443518186021, "learning_rate": 1.631270555680716e-05, "loss": 0.1561, "step": 8731 }, { "epoch": 1.824869383490073, "grad_norm": 0.9034248909177712, "learning_rate": 1.6311830587722424e-05, "loss": 0.1435, "step": 8732 }, { "epoch": 1.825078369905956, "grad_norm": 1.0849889861929776, "learning_rate": 1.631095553830967e-05, "loss": 0.1683, "step": 8733 }, { "epoch": 1.825287356321839, "grad_norm": 0.9781877789123489, "learning_rate": 1.6310080408580027e-05, "loss": 0.1733, "step": 8734 }, { "epoch": 1.825496342737722, "grad_norm": 0.7676473406945019, "learning_rate": 1.630920519854464e-05, "loss": 0.1453, "step": 8735 }, { "epoch": 1.825705329153605, "grad_norm": 1.2806541448692488, "learning_rate": 1.630832990821465e-05, "loss": 0.203, "step": 8736 }, { "epoch": 1.825914315569488, "grad_norm": 0.9782211479518451, "learning_rate": 1.6307454537601187e-05, "loss": 0.1743, "step": 8737 }, { "epoch": 1.8261233019853709, "grad_norm": 0.934645723612821, "learning_rate": 1.63065790867154e-05, "loss": 0.1637, "step": 8738 }, { "epoch": 1.8263322884012538, "grad_norm": 0.9145604062919919, "learning_rate": 1.6305703555568426e-05, "loss": 0.1467, "step": 8739 }, { "epoch": 1.8265412748171368, "grad_norm": 0.9721404861825629, "learning_rate": 1.630482794417141e-05, "loss": 0.1402, "step": 8740 }, { "epoch": 1.8267502612330198, "grad_norm": 0.7016286580009563, "learning_rate": 1.6303952252535494e-05, "loss": 0.1203, "step": 8741 }, { "epoch": 1.8269592476489027, "grad_norm": 0.7745280970619922, "learning_rate": 1.6303076480671825e-05, "loss": 0.1539, "step": 8742 }, { "epoch": 1.8271682340647857, "grad_norm": 0.9874125036119836, "learning_rate": 1.6302200628591544e-05, "loss": 0.1583, "step": 8743 }, { "epoch": 1.8273772204806686, "grad_norm": 1.001309839611293, "learning_rate": 1.63013246963058e-05, "loss": 0.188, "step": 8744 }, { "epoch": 1.8275862068965516, "grad_norm": 0.893554747681845, "learning_rate": 1.6300448683825743e-05, "loss": 0.1523, "step": 8745 }, { "epoch": 1.8277951933124346, "grad_norm": 0.8201525252849896, "learning_rate": 1.6299572591162518e-05, "loss": 0.1771, "step": 8746 }, { "epoch": 1.8280041797283175, "grad_norm": 1.0209547761806201, "learning_rate": 1.6298696418327278e-05, "loss": 0.1612, "step": 8747 }, { "epoch": 1.8282131661442005, "grad_norm": 0.8652122512620061, "learning_rate": 1.629782016533117e-05, "loss": 0.1441, "step": 8748 }, { "epoch": 1.8284221525600834, "grad_norm": 0.7860378172287578, "learning_rate": 1.6296943832185345e-05, "loss": 0.1583, "step": 8749 }, { "epoch": 1.8286311389759664, "grad_norm": 0.9018501486541667, "learning_rate": 1.6296067418900965e-05, "loss": 0.146, "step": 8750 }, { "epoch": 1.8288401253918494, "grad_norm": 0.9269555447826633, "learning_rate": 1.6295190925489173e-05, "loss": 0.1519, "step": 8751 }, { "epoch": 1.8290491118077326, "grad_norm": 0.7942293160836477, "learning_rate": 1.629431435196113e-05, "loss": 0.162, "step": 8752 }, { "epoch": 1.8292580982236155, "grad_norm": 0.8537624627309471, "learning_rate": 1.6293437698327985e-05, "loss": 0.1665, "step": 8753 }, { "epoch": 1.8294670846394985, "grad_norm": 0.9198057702222698, "learning_rate": 1.62925609646009e-05, "loss": 0.1556, "step": 8754 }, { "epoch": 1.8296760710553814, "grad_norm": 0.6966965846576868, "learning_rate": 1.6291684150791036e-05, "loss": 0.1496, "step": 8755 }, { "epoch": 1.8298850574712644, "grad_norm": 0.9767097774012302, "learning_rate": 1.6290807256909547e-05, "loss": 0.1374, "step": 8756 }, { "epoch": 1.8300940438871474, "grad_norm": 0.9787376177035784, "learning_rate": 1.6289930282967592e-05, "loss": 0.1688, "step": 8757 }, { "epoch": 1.8303030303030303, "grad_norm": 0.9478055467917393, "learning_rate": 1.6289053228976334e-05, "loss": 0.1523, "step": 8758 }, { "epoch": 1.8305120167189133, "grad_norm": 0.8363249055911436, "learning_rate": 1.6288176094946937e-05, "loss": 0.1183, "step": 8759 }, { "epoch": 1.8307210031347962, "grad_norm": 0.9405573673279098, "learning_rate": 1.628729888089056e-05, "loss": 0.1721, "step": 8760 }, { "epoch": 1.8309299895506792, "grad_norm": 0.8212963775626488, "learning_rate": 1.6286421586818366e-05, "loss": 0.1506, "step": 8761 }, { "epoch": 1.8311389759665622, "grad_norm": 0.8332741842731319, "learning_rate": 1.6285544212741526e-05, "loss": 0.1682, "step": 8762 }, { "epoch": 1.8313479623824451, "grad_norm": 0.8056042112068645, "learning_rate": 1.6284666758671202e-05, "loss": 0.1563, "step": 8763 }, { "epoch": 1.831556948798328, "grad_norm": 0.8601307744494078, "learning_rate": 1.628378922461856e-05, "loss": 0.1798, "step": 8764 }, { "epoch": 1.831765935214211, "grad_norm": 0.9007287384988828, "learning_rate": 1.628291161059477e-05, "loss": 0.1459, "step": 8765 }, { "epoch": 1.831974921630094, "grad_norm": 0.9159643225108361, "learning_rate": 1.6282033916611003e-05, "loss": 0.1681, "step": 8766 }, { "epoch": 1.832183908045977, "grad_norm": 1.1659589230111753, "learning_rate": 1.628115614267842e-05, "loss": 0.1493, "step": 8767 }, { "epoch": 1.83239289446186, "grad_norm": 1.14666120167391, "learning_rate": 1.6280278288808203e-05, "loss": 0.1737, "step": 8768 }, { "epoch": 1.832601880877743, "grad_norm": 0.9529250263055864, "learning_rate": 1.627940035501152e-05, "loss": 0.1615, "step": 8769 }, { "epoch": 1.832810867293626, "grad_norm": 0.9788667429916362, "learning_rate": 1.6278522341299543e-05, "loss": 0.1924, "step": 8770 }, { "epoch": 1.833019853709509, "grad_norm": 1.0012137749252374, "learning_rate": 1.6277644247683447e-05, "loss": 0.1853, "step": 8771 }, { "epoch": 1.833228840125392, "grad_norm": 0.9250069232605274, "learning_rate": 1.6276766074174408e-05, "loss": 0.1511, "step": 8772 }, { "epoch": 1.833437826541275, "grad_norm": 1.1289735858330785, "learning_rate": 1.62758878207836e-05, "loss": 0.1747, "step": 8773 }, { "epoch": 1.833646812957158, "grad_norm": 1.0316199584653505, "learning_rate": 1.62750094875222e-05, "loss": 0.1213, "step": 8774 }, { "epoch": 1.8338557993730409, "grad_norm": 0.8635027926276518, "learning_rate": 1.627413107440139e-05, "loss": 0.1469, "step": 8775 }, { "epoch": 1.8340647857889238, "grad_norm": 0.747224191854027, "learning_rate": 1.6273252581432344e-05, "loss": 0.1385, "step": 8776 }, { "epoch": 1.8342737722048068, "grad_norm": 0.9936252279808319, "learning_rate": 1.6272374008626247e-05, "loss": 0.1729, "step": 8777 }, { "epoch": 1.8344827586206898, "grad_norm": 0.8659575976768399, "learning_rate": 1.6271495355994275e-05, "loss": 0.1508, "step": 8778 }, { "epoch": 1.8346917450365727, "grad_norm": 1.04194477554488, "learning_rate": 1.6270616623547614e-05, "loss": 0.1402, "step": 8779 }, { "epoch": 1.8349007314524557, "grad_norm": 0.8680721128094019, "learning_rate": 1.626973781129745e-05, "loss": 0.1654, "step": 8780 }, { "epoch": 1.8351097178683387, "grad_norm": 0.847998213369859, "learning_rate": 1.6268858919254962e-05, "loss": 0.1477, "step": 8781 }, { "epoch": 1.8353187042842216, "grad_norm": 0.9005050133483489, "learning_rate": 1.6267979947431336e-05, "loss": 0.1615, "step": 8782 }, { "epoch": 1.8355276907001046, "grad_norm": 0.9161197482961952, "learning_rate": 1.626710089583776e-05, "loss": 0.1586, "step": 8783 }, { "epoch": 1.8357366771159875, "grad_norm": 0.8541290217881171, "learning_rate": 1.6266221764485424e-05, "loss": 0.1648, "step": 8784 }, { "epoch": 1.8359456635318705, "grad_norm": 0.8337545799453491, "learning_rate": 1.626534255338551e-05, "loss": 0.15, "step": 8785 }, { "epoch": 1.8361546499477535, "grad_norm": 0.8803052899133078, "learning_rate": 1.626446326254921e-05, "loss": 0.1872, "step": 8786 }, { "epoch": 1.8363636363636364, "grad_norm": 0.8200739695851819, "learning_rate": 1.6263583891987715e-05, "loss": 0.1819, "step": 8787 }, { "epoch": 1.8365726227795194, "grad_norm": 0.8321806916743261, "learning_rate": 1.626270444171222e-05, "loss": 0.1515, "step": 8788 }, { "epoch": 1.8367816091954023, "grad_norm": 0.8826347130163752, "learning_rate": 1.626182491173391e-05, "loss": 0.1844, "step": 8789 }, { "epoch": 1.8369905956112853, "grad_norm": 0.8200537558060087, "learning_rate": 1.6260945302063982e-05, "loss": 0.1275, "step": 8790 }, { "epoch": 1.8371995820271683, "grad_norm": 0.8899880180290319, "learning_rate": 1.626006561271363e-05, "loss": 0.162, "step": 8791 }, { "epoch": 1.8374085684430512, "grad_norm": 0.8973697431440594, "learning_rate": 1.6259185843694053e-05, "loss": 0.1807, "step": 8792 }, { "epoch": 1.8376175548589342, "grad_norm": 0.7181915301366328, "learning_rate": 1.625830599501644e-05, "loss": 0.1254, "step": 8793 }, { "epoch": 1.8378265412748171, "grad_norm": 0.9992496139551396, "learning_rate": 1.6257426066691995e-05, "loss": 0.1797, "step": 8794 }, { "epoch": 1.8380355276907, "grad_norm": 0.8496798425412091, "learning_rate": 1.6256546058731918e-05, "loss": 0.1205, "step": 8795 }, { "epoch": 1.838244514106583, "grad_norm": 1.006058892927905, "learning_rate": 1.62556659711474e-05, "loss": 0.1404, "step": 8796 }, { "epoch": 1.838453500522466, "grad_norm": 0.9855330793057271, "learning_rate": 1.6254785803949648e-05, "loss": 0.1528, "step": 8797 }, { "epoch": 1.838662486938349, "grad_norm": 0.8159209439871621, "learning_rate": 1.625390555714986e-05, "loss": 0.1542, "step": 8798 }, { "epoch": 1.838871473354232, "grad_norm": 1.0876681884989228, "learning_rate": 1.625302523075924e-05, "loss": 0.1725, "step": 8799 }, { "epoch": 1.839080459770115, "grad_norm": 1.0276693957335392, "learning_rate": 1.6252144824788995e-05, "loss": 0.1693, "step": 8800 }, { "epoch": 1.8392894461859979, "grad_norm": 0.8354375094673223, "learning_rate": 1.6251264339250322e-05, "loss": 0.1809, "step": 8801 }, { "epoch": 1.8394984326018808, "grad_norm": 0.8122302363735355, "learning_rate": 1.6250383774154435e-05, "loss": 0.1623, "step": 8802 }, { "epoch": 1.8397074190177638, "grad_norm": 1.015205445029037, "learning_rate": 1.6249503129512536e-05, "loss": 0.1875, "step": 8803 }, { "epoch": 1.8399164054336468, "grad_norm": 0.9058553011329826, "learning_rate": 1.6248622405335832e-05, "loss": 0.1818, "step": 8804 }, { "epoch": 1.8401253918495297, "grad_norm": 0.9558697672133919, "learning_rate": 1.6247741601635535e-05, "loss": 0.1428, "step": 8805 }, { "epoch": 1.8403343782654127, "grad_norm": 0.842589005746757, "learning_rate": 1.624686071842285e-05, "loss": 0.164, "step": 8806 }, { "epoch": 1.8405433646812956, "grad_norm": 0.7536697048549401, "learning_rate": 1.6245979755708988e-05, "loss": 0.1446, "step": 8807 }, { "epoch": 1.8407523510971786, "grad_norm": 0.8297860759907562, "learning_rate": 1.6245098713505165e-05, "loss": 0.1557, "step": 8808 }, { "epoch": 1.8409613375130616, "grad_norm": 0.8585960088978429, "learning_rate": 1.6244217591822594e-05, "loss": 0.1507, "step": 8809 }, { "epoch": 1.8411703239289445, "grad_norm": 1.050881106206582, "learning_rate": 1.6243336390672482e-05, "loss": 0.1532, "step": 8810 }, { "epoch": 1.8413793103448275, "grad_norm": 0.8867497599472609, "learning_rate": 1.6242455110066052e-05, "loss": 0.1562, "step": 8811 }, { "epoch": 1.8415882967607105, "grad_norm": 1.0499086011345755, "learning_rate": 1.6241573750014512e-05, "loss": 0.1553, "step": 8812 }, { "epoch": 1.8417972831765934, "grad_norm": 1.208559524469272, "learning_rate": 1.6240692310529082e-05, "loss": 0.1767, "step": 8813 }, { "epoch": 1.8420062695924764, "grad_norm": 0.7417905104469311, "learning_rate": 1.623981079162098e-05, "loss": 0.1555, "step": 8814 }, { "epoch": 1.8422152560083593, "grad_norm": 1.0735168502198789, "learning_rate": 1.6238929193301423e-05, "loss": 0.1903, "step": 8815 }, { "epoch": 1.8424242424242423, "grad_norm": 0.9754922725717143, "learning_rate": 1.6238047515581638e-05, "loss": 0.1338, "step": 8816 }, { "epoch": 1.8426332288401253, "grad_norm": 1.1421381335204481, "learning_rate": 1.6237165758472836e-05, "loss": 0.1702, "step": 8817 }, { "epoch": 1.8428422152560082, "grad_norm": 0.9405673583447077, "learning_rate": 1.623628392198624e-05, "loss": 0.1629, "step": 8818 }, { "epoch": 1.8430512016718912, "grad_norm": 0.9552225520414305, "learning_rate": 1.623540200613308e-05, "loss": 0.1461, "step": 8819 }, { "epoch": 1.8432601880877741, "grad_norm": 0.9070327803276327, "learning_rate": 1.6234520010924573e-05, "loss": 0.1586, "step": 8820 }, { "epoch": 1.843469174503657, "grad_norm": 1.0619920257184647, "learning_rate": 1.6233637936371946e-05, "loss": 0.1581, "step": 8821 }, { "epoch": 1.84367816091954, "grad_norm": 0.7319117718857673, "learning_rate": 1.6232755782486426e-05, "loss": 0.1678, "step": 8822 }, { "epoch": 1.8438871473354232, "grad_norm": 0.9099693267457076, "learning_rate": 1.623187354927924e-05, "loss": 0.137, "step": 8823 }, { "epoch": 1.8440961337513062, "grad_norm": 0.806982427085233, "learning_rate": 1.623099123676161e-05, "loss": 0.1519, "step": 8824 }, { "epoch": 1.8443051201671892, "grad_norm": 0.9666588228025836, "learning_rate": 1.623010884494477e-05, "loss": 0.159, "step": 8825 }, { "epoch": 1.8445141065830721, "grad_norm": 1.0893538532926157, "learning_rate": 1.622922637383995e-05, "loss": 0.1747, "step": 8826 }, { "epoch": 1.844723092998955, "grad_norm": 0.9053919423169162, "learning_rate": 1.6228343823458383e-05, "loss": 0.1828, "step": 8827 }, { "epoch": 1.844932079414838, "grad_norm": 0.9579525379561753, "learning_rate": 1.62274611938113e-05, "loss": 0.2058, "step": 8828 }, { "epoch": 1.845141065830721, "grad_norm": 0.7629752151824912, "learning_rate": 1.6226578484909924e-05, "loss": 0.1163, "step": 8829 }, { "epoch": 1.845350052246604, "grad_norm": 1.016109957217519, "learning_rate": 1.62256956967655e-05, "loss": 0.1682, "step": 8830 }, { "epoch": 1.845559038662487, "grad_norm": 0.8830729426058042, "learning_rate": 1.6224812829389258e-05, "loss": 0.1619, "step": 8831 }, { "epoch": 1.84576802507837, "grad_norm": 0.8345135311369157, "learning_rate": 1.6223929882792434e-05, "loss": 0.1579, "step": 8832 }, { "epoch": 1.8459770114942529, "grad_norm": 0.9199386367774607, "learning_rate": 1.622304685698627e-05, "loss": 0.1986, "step": 8833 }, { "epoch": 1.8461859979101358, "grad_norm": 0.8996956068689737, "learning_rate": 1.6222163751982e-05, "loss": 0.1827, "step": 8834 }, { "epoch": 1.8463949843260188, "grad_norm": 0.968619377756949, "learning_rate": 1.622128056779086e-05, "loss": 0.1703, "step": 8835 }, { "epoch": 1.8466039707419017, "grad_norm": 0.8708697141615621, "learning_rate": 1.6220397304424095e-05, "loss": 0.1386, "step": 8836 }, { "epoch": 1.8468129571577847, "grad_norm": 0.94937186303427, "learning_rate": 1.6219513961892944e-05, "loss": 0.2034, "step": 8837 }, { "epoch": 1.8470219435736677, "grad_norm": 0.816680418979559, "learning_rate": 1.6218630540208645e-05, "loss": 0.1646, "step": 8838 }, { "epoch": 1.8472309299895506, "grad_norm": 0.7863439114635792, "learning_rate": 1.621774703938245e-05, "loss": 0.1336, "step": 8839 }, { "epoch": 1.8474399164054338, "grad_norm": 0.957307397265163, "learning_rate": 1.6216863459425594e-05, "loss": 0.1626, "step": 8840 }, { "epoch": 1.8476489028213168, "grad_norm": 0.7822735040613787, "learning_rate": 1.6215979800349327e-05, "loss": 0.1808, "step": 8841 }, { "epoch": 1.8478578892371997, "grad_norm": 0.8855549977660453, "learning_rate": 1.6215096062164892e-05, "loss": 0.1453, "step": 8842 }, { "epoch": 1.8480668756530827, "grad_norm": 0.7844027196420826, "learning_rate": 1.621421224488354e-05, "loss": 0.1525, "step": 8843 }, { "epoch": 1.8482758620689657, "grad_norm": 0.9461558889228959, "learning_rate": 1.6213328348516513e-05, "loss": 0.1922, "step": 8844 }, { "epoch": 1.8484848484848486, "grad_norm": 0.8248977597767874, "learning_rate": 1.6212444373075067e-05, "loss": 0.1324, "step": 8845 }, { "epoch": 1.8486938349007316, "grad_norm": 0.9251467189689777, "learning_rate": 1.6211560318570446e-05, "loss": 0.185, "step": 8846 }, { "epoch": 1.8489028213166145, "grad_norm": 0.737207621622818, "learning_rate": 1.6210676185013908e-05, "loss": 0.1071, "step": 8847 }, { "epoch": 1.8491118077324975, "grad_norm": 0.799011826715083, "learning_rate": 1.6209791972416694e-05, "loss": 0.1736, "step": 8848 }, { "epoch": 1.8493207941483805, "grad_norm": 0.857384593453787, "learning_rate": 1.6208907680790066e-05, "loss": 0.1516, "step": 8849 }, { "epoch": 1.8495297805642634, "grad_norm": 0.8438967754904275, "learning_rate": 1.6208023310145275e-05, "loss": 0.1571, "step": 8850 }, { "epoch": 1.8497387669801464, "grad_norm": 0.9105932689306342, "learning_rate": 1.6207138860493578e-05, "loss": 0.1672, "step": 8851 }, { "epoch": 1.8499477533960293, "grad_norm": 0.7610088609317054, "learning_rate": 1.620625433184623e-05, "loss": 0.1574, "step": 8852 }, { "epoch": 1.8501567398119123, "grad_norm": 0.7708254676211811, "learning_rate": 1.6205369724214482e-05, "loss": 0.1707, "step": 8853 }, { "epoch": 1.8503657262277953, "grad_norm": 1.0204558422526224, "learning_rate": 1.6204485037609604e-05, "loss": 0.1728, "step": 8854 }, { "epoch": 1.8505747126436782, "grad_norm": 1.1483321096042292, "learning_rate": 1.6203600272042844e-05, "loss": 0.1443, "step": 8855 }, { "epoch": 1.8507836990595612, "grad_norm": 0.9780075524692224, "learning_rate": 1.6202715427525468e-05, "loss": 0.1948, "step": 8856 }, { "epoch": 1.8509926854754442, "grad_norm": 0.9873207844851063, "learning_rate": 1.6201830504068738e-05, "loss": 0.16, "step": 8857 }, { "epoch": 1.8512016718913271, "grad_norm": 0.8664485316672133, "learning_rate": 1.6200945501683912e-05, "loss": 0.1516, "step": 8858 }, { "epoch": 1.85141065830721, "grad_norm": 0.8414424607311152, "learning_rate": 1.6200060420382257e-05, "loss": 0.1758, "step": 8859 }, { "epoch": 1.851619644723093, "grad_norm": 0.8773955991697537, "learning_rate": 1.6199175260175027e-05, "loss": 0.171, "step": 8860 }, { "epoch": 1.851828631138976, "grad_norm": 0.8645137852258591, "learning_rate": 1.6198290021073505e-05, "loss": 0.1436, "step": 8861 }, { "epoch": 1.852037617554859, "grad_norm": 0.9497893344220473, "learning_rate": 1.619740470308894e-05, "loss": 0.1391, "step": 8862 }, { "epoch": 1.852246603970742, "grad_norm": 0.8134641832975437, "learning_rate": 1.6196519306232607e-05, "loss": 0.1523, "step": 8863 }, { "epoch": 1.8524555903866249, "grad_norm": 0.7605585888757322, "learning_rate": 1.6195633830515776e-05, "loss": 0.1962, "step": 8864 }, { "epoch": 1.8526645768025078, "grad_norm": 0.7902887736629634, "learning_rate": 1.619474827594971e-05, "loss": 0.1196, "step": 8865 }, { "epoch": 1.8528735632183908, "grad_norm": 0.8988420047790233, "learning_rate": 1.6193862642545686e-05, "loss": 0.1756, "step": 8866 }, { "epoch": 1.8530825496342738, "grad_norm": 0.9310102217443181, "learning_rate": 1.6192976930314968e-05, "loss": 0.1657, "step": 8867 }, { "epoch": 1.8532915360501567, "grad_norm": 0.9600669466127926, "learning_rate": 1.619209113926883e-05, "loss": 0.1754, "step": 8868 }, { "epoch": 1.8535005224660397, "grad_norm": 0.9874144318890423, "learning_rate": 1.619120526941855e-05, "loss": 0.1687, "step": 8869 }, { "epoch": 1.8537095088819227, "grad_norm": 0.9662219587269077, "learning_rate": 1.6190319320775397e-05, "loss": 0.1591, "step": 8870 }, { "epoch": 1.8539184952978056, "grad_norm": 0.8106660317505078, "learning_rate": 1.618943329335065e-05, "loss": 0.1436, "step": 8871 }, { "epoch": 1.8541274817136886, "grad_norm": 0.9346021339383273, "learning_rate": 1.6188547187155577e-05, "loss": 0.1421, "step": 8872 }, { "epoch": 1.8543364681295715, "grad_norm": 0.976272295687258, "learning_rate": 1.6187661002201463e-05, "loss": 0.1596, "step": 8873 }, { "epoch": 1.8545454545454545, "grad_norm": 0.8346823690386513, "learning_rate": 1.6186774738499588e-05, "loss": 0.1565, "step": 8874 }, { "epoch": 1.8547544409613375, "grad_norm": 0.9830342079865662, "learning_rate": 1.6185888396061223e-05, "loss": 0.1575, "step": 8875 }, { "epoch": 1.8549634273772204, "grad_norm": 0.8128215684821505, "learning_rate": 1.6185001974897654e-05, "loss": 0.1727, "step": 8876 }, { "epoch": 1.8551724137931034, "grad_norm": 0.9328302711735006, "learning_rate": 1.618411547502016e-05, "loss": 0.1441, "step": 8877 }, { "epoch": 1.8553814002089863, "grad_norm": 1.2307086120583184, "learning_rate": 1.6183228896440024e-05, "loss": 0.1705, "step": 8878 }, { "epoch": 1.8555903866248693, "grad_norm": 0.9255156251584213, "learning_rate": 1.6182342239168525e-05, "loss": 0.1745, "step": 8879 }, { "epoch": 1.8557993730407523, "grad_norm": 1.0592927090476787, "learning_rate": 1.6181455503216956e-05, "loss": 0.1994, "step": 8880 }, { "epoch": 1.8560083594566352, "grad_norm": 0.8834993524773657, "learning_rate": 1.618056868859659e-05, "loss": 0.1681, "step": 8881 }, { "epoch": 1.8562173458725182, "grad_norm": 1.0431372357029112, "learning_rate": 1.6179681795318728e-05, "loss": 0.1774, "step": 8882 }, { "epoch": 1.8564263322884011, "grad_norm": 0.7630716489165089, "learning_rate": 1.6178794823394645e-05, "loss": 0.144, "step": 8883 }, { "epoch": 1.856635318704284, "grad_norm": 1.182161747325246, "learning_rate": 1.617790777283563e-05, "loss": 0.1543, "step": 8884 }, { "epoch": 1.856844305120167, "grad_norm": 1.0223741870799692, "learning_rate": 1.617702064365298e-05, "loss": 0.1811, "step": 8885 }, { "epoch": 1.85705329153605, "grad_norm": 0.758180545483994, "learning_rate": 1.617613343585798e-05, "loss": 0.1622, "step": 8886 }, { "epoch": 1.857262277951933, "grad_norm": 0.97464688252215, "learning_rate": 1.617524614946192e-05, "loss": 0.1787, "step": 8887 }, { "epoch": 1.857471264367816, "grad_norm": 1.0869751929926517, "learning_rate": 1.6174358784476095e-05, "loss": 0.1626, "step": 8888 }, { "epoch": 1.857680250783699, "grad_norm": 0.8163157334412741, "learning_rate": 1.6173471340911795e-05, "loss": 0.153, "step": 8889 }, { "epoch": 1.8578892371995819, "grad_norm": 1.0289323001453783, "learning_rate": 1.6172583818780318e-05, "loss": 0.1731, "step": 8890 }, { "epoch": 1.8580982236154648, "grad_norm": 0.8158905361925816, "learning_rate": 1.6171696218092954e-05, "loss": 0.1751, "step": 8891 }, { "epoch": 1.8583072100313478, "grad_norm": 0.9099466177011567, "learning_rate": 1.6170808538861002e-05, "loss": 0.1876, "step": 8892 }, { "epoch": 1.858516196447231, "grad_norm": 0.7811279299356138, "learning_rate": 1.616992078109576e-05, "loss": 0.1461, "step": 8893 }, { "epoch": 1.858725182863114, "grad_norm": 0.8737471015196338, "learning_rate": 1.616903294480853e-05, "loss": 0.1548, "step": 8894 }, { "epoch": 1.858934169278997, "grad_norm": 0.935790171188451, "learning_rate": 1.61681450300106e-05, "loss": 0.1348, "step": 8895 }, { "epoch": 1.8591431556948799, "grad_norm": 0.755254017532115, "learning_rate": 1.616725703671328e-05, "loss": 0.1483, "step": 8896 }, { "epoch": 1.8593521421107628, "grad_norm": 0.799412336253698, "learning_rate": 1.6166368964927866e-05, "loss": 0.158, "step": 8897 }, { "epoch": 1.8595611285266458, "grad_norm": 0.8990427502082635, "learning_rate": 1.6165480814665665e-05, "loss": 0.1678, "step": 8898 }, { "epoch": 1.8597701149425288, "grad_norm": 0.768214259973523, "learning_rate": 1.616459258593797e-05, "loss": 0.1462, "step": 8899 }, { "epoch": 1.8599791013584117, "grad_norm": 0.7752042621201835, "learning_rate": 1.6163704278756097e-05, "loss": 0.1613, "step": 8900 }, { "epoch": 1.8601880877742947, "grad_norm": 0.9403694401998959, "learning_rate": 1.6162815893131348e-05, "loss": 0.1506, "step": 8901 }, { "epoch": 1.8603970741901776, "grad_norm": 1.3065549409782309, "learning_rate": 1.6161927429075025e-05, "loss": 0.1633, "step": 8902 }, { "epoch": 1.8606060606060606, "grad_norm": 0.9316786185786735, "learning_rate": 1.6161038886598436e-05, "loss": 0.1656, "step": 8903 }, { "epoch": 1.8608150470219436, "grad_norm": 0.8498704005350527, "learning_rate": 1.6160150265712892e-05, "loss": 0.1547, "step": 8904 }, { "epoch": 1.8610240334378265, "grad_norm": 1.0876391555758183, "learning_rate": 1.6159261566429698e-05, "loss": 0.1649, "step": 8905 }, { "epoch": 1.8612330198537095, "grad_norm": 0.9057918620427476, "learning_rate": 1.615837278876017e-05, "loss": 0.1668, "step": 8906 }, { "epoch": 1.8614420062695924, "grad_norm": 0.8745468663791224, "learning_rate": 1.6157483932715615e-05, "loss": 0.1552, "step": 8907 }, { "epoch": 1.8616509926854754, "grad_norm": 0.8985427310693963, "learning_rate": 1.6156594998307347e-05, "loss": 0.1651, "step": 8908 }, { "epoch": 1.8618599791013584, "grad_norm": 0.979685937218014, "learning_rate": 1.6155705985546675e-05, "loss": 0.1563, "step": 8909 }, { "epoch": 1.8620689655172413, "grad_norm": 0.8384765823229803, "learning_rate": 1.615481689444492e-05, "loss": 0.1967, "step": 8910 }, { "epoch": 1.8622779519331245, "grad_norm": 0.9763850385819323, "learning_rate": 1.6153927725013384e-05, "loss": 0.1819, "step": 8911 }, { "epoch": 1.8624869383490075, "grad_norm": 0.9398532462698588, "learning_rate": 1.6153038477263404e-05, "loss": 0.1816, "step": 8912 }, { "epoch": 1.8626959247648904, "grad_norm": 0.8430850887225594, "learning_rate": 1.615214915120628e-05, "loss": 0.1526, "step": 8913 }, { "epoch": 1.8629049111807734, "grad_norm": 0.9154601235788882, "learning_rate": 1.6151259746853334e-05, "loss": 0.1533, "step": 8914 }, { "epoch": 1.8631138975966564, "grad_norm": 0.888464575258893, "learning_rate": 1.6150370264215887e-05, "loss": 0.1715, "step": 8915 }, { "epoch": 1.8633228840125393, "grad_norm": 0.7875315879487199, "learning_rate": 1.614948070330526e-05, "loss": 0.1476, "step": 8916 }, { "epoch": 1.8635318704284223, "grad_norm": 0.8994047966786033, "learning_rate": 1.6148591064132773e-05, "loss": 0.1397, "step": 8917 }, { "epoch": 1.8637408568443052, "grad_norm": 0.8767724673978834, "learning_rate": 1.6147701346709746e-05, "loss": 0.1266, "step": 8918 }, { "epoch": 1.8639498432601882, "grad_norm": 0.924290462524621, "learning_rate": 1.6146811551047505e-05, "loss": 0.1429, "step": 8919 }, { "epoch": 1.8641588296760712, "grad_norm": 0.8779615401502899, "learning_rate": 1.6145921677157374e-05, "loss": 0.1608, "step": 8920 }, { "epoch": 1.8643678160919541, "grad_norm": 0.8976264761641646, "learning_rate": 1.6145031725050672e-05, "loss": 0.1308, "step": 8921 }, { "epoch": 1.864576802507837, "grad_norm": 1.070873054681272, "learning_rate": 1.6144141694738736e-05, "loss": 0.159, "step": 8922 }, { "epoch": 1.86478578892372, "grad_norm": 0.9261696843264895, "learning_rate": 1.6143251586232883e-05, "loss": 0.1601, "step": 8923 }, { "epoch": 1.864994775339603, "grad_norm": 0.7549721099325276, "learning_rate": 1.6142361399544445e-05, "loss": 0.1606, "step": 8924 }, { "epoch": 1.865203761755486, "grad_norm": 0.9681709363318095, "learning_rate": 1.6141471134684753e-05, "loss": 0.1486, "step": 8925 }, { "epoch": 1.865412748171369, "grad_norm": 1.2254279942439723, "learning_rate": 1.6140580791665136e-05, "loss": 0.1892, "step": 8926 }, { "epoch": 1.865621734587252, "grad_norm": 0.9945815699934587, "learning_rate": 1.613969037049692e-05, "loss": 0.1604, "step": 8927 }, { "epoch": 1.8658307210031349, "grad_norm": 1.0326225341109272, "learning_rate": 1.6138799871191444e-05, "loss": 0.1743, "step": 8928 }, { "epoch": 1.8660397074190178, "grad_norm": 0.8514439696357173, "learning_rate": 1.6137909293760037e-05, "loss": 0.1532, "step": 8929 }, { "epoch": 1.8662486938349008, "grad_norm": 1.0538283857187432, "learning_rate": 1.6137018638214037e-05, "loss": 0.1823, "step": 8930 }, { "epoch": 1.8664576802507837, "grad_norm": 1.0365531893835278, "learning_rate": 1.613612790456477e-05, "loss": 0.1613, "step": 8931 }, { "epoch": 1.8666666666666667, "grad_norm": 0.8517702002036761, "learning_rate": 1.613523709282358e-05, "loss": 0.1524, "step": 8932 }, { "epoch": 1.8668756530825497, "grad_norm": 1.1755322561235562, "learning_rate": 1.6134346203001805e-05, "loss": 0.2028, "step": 8933 }, { "epoch": 1.8670846394984326, "grad_norm": 0.9117701667764733, "learning_rate": 1.613345523511078e-05, "loss": 0.134, "step": 8934 }, { "epoch": 1.8672936259143156, "grad_norm": 0.8405114391990028, "learning_rate": 1.6132564189161844e-05, "loss": 0.1389, "step": 8935 }, { "epoch": 1.8675026123301985, "grad_norm": 0.9290036688878486, "learning_rate": 1.6131673065166337e-05, "loss": 0.1536, "step": 8936 }, { "epoch": 1.8677115987460815, "grad_norm": 0.915124180544712, "learning_rate": 1.6130781863135595e-05, "loss": 0.1832, "step": 8937 }, { "epoch": 1.8679205851619645, "grad_norm": 0.9161876034588061, "learning_rate": 1.612989058308097e-05, "loss": 0.1949, "step": 8938 }, { "epoch": 1.8681295715778474, "grad_norm": 0.8699392076503274, "learning_rate": 1.61289992250138e-05, "loss": 0.1326, "step": 8939 }, { "epoch": 1.8683385579937304, "grad_norm": 0.9688052106529788, "learning_rate": 1.6128107788945425e-05, "loss": 0.1999, "step": 8940 }, { "epoch": 1.8685475444096133, "grad_norm": 0.7780732363548557, "learning_rate": 1.61272162748872e-05, "loss": 0.1442, "step": 8941 }, { "epoch": 1.8687565308254963, "grad_norm": 1.096082164765407, "learning_rate": 1.6126324682850455e-05, "loss": 0.1872, "step": 8942 }, { "epoch": 1.8689655172413793, "grad_norm": 0.8530662690045581, "learning_rate": 1.6125433012846556e-05, "loss": 0.175, "step": 8943 }, { "epoch": 1.8691745036572622, "grad_norm": 0.924987244100134, "learning_rate": 1.6124541264886837e-05, "loss": 0.1487, "step": 8944 }, { "epoch": 1.8693834900731452, "grad_norm": 0.9602137178651993, "learning_rate": 1.612364943898265e-05, "loss": 0.179, "step": 8945 }, { "epoch": 1.8695924764890282, "grad_norm": 0.7137954175459714, "learning_rate": 1.6122757535145346e-05, "loss": 0.1516, "step": 8946 }, { "epoch": 1.8698014629049111, "grad_norm": 0.885287531376961, "learning_rate": 1.6121865553386282e-05, "loss": 0.1875, "step": 8947 }, { "epoch": 1.870010449320794, "grad_norm": 0.941688867026767, "learning_rate": 1.61209734937168e-05, "loss": 0.1872, "step": 8948 }, { "epoch": 1.870219435736677, "grad_norm": 0.8033605324783525, "learning_rate": 1.6120081356148257e-05, "loss": 0.1596, "step": 8949 }, { "epoch": 1.87042842215256, "grad_norm": 0.691279190992193, "learning_rate": 1.6119189140692006e-05, "loss": 0.1249, "step": 8950 }, { "epoch": 1.870637408568443, "grad_norm": 0.8615322449640246, "learning_rate": 1.6118296847359408e-05, "loss": 0.1507, "step": 8951 }, { "epoch": 1.870846394984326, "grad_norm": 1.0150317636069712, "learning_rate": 1.6117404476161808e-05, "loss": 0.1563, "step": 8952 }, { "epoch": 1.8710553814002089, "grad_norm": 1.0119237957516765, "learning_rate": 1.611651202711057e-05, "loss": 0.1767, "step": 8953 }, { "epoch": 1.8712643678160918, "grad_norm": 0.7606902603837246, "learning_rate": 1.611561950021705e-05, "loss": 0.1516, "step": 8954 }, { "epoch": 1.8714733542319748, "grad_norm": 0.9322200976873332, "learning_rate": 1.6114726895492605e-05, "loss": 0.1767, "step": 8955 }, { "epoch": 1.8716823406478578, "grad_norm": 1.0468688678526852, "learning_rate": 1.6113834212948597e-05, "loss": 0.17, "step": 8956 }, { "epoch": 1.8718913270637407, "grad_norm": 0.8847912242078595, "learning_rate": 1.611294145259639e-05, "loss": 0.1102, "step": 8957 }, { "epoch": 1.8721003134796237, "grad_norm": 0.8460511637423266, "learning_rate": 1.6112048614447338e-05, "loss": 0.1801, "step": 8958 }, { "epoch": 1.8723092998955067, "grad_norm": 0.989281407675155, "learning_rate": 1.6111155698512814e-05, "loss": 0.1897, "step": 8959 }, { "epoch": 1.8725182863113896, "grad_norm": 1.0207723083710396, "learning_rate": 1.611026270480417e-05, "loss": 0.2005, "step": 8960 }, { "epoch": 1.8727272727272726, "grad_norm": 0.9327208147417245, "learning_rate": 1.610936963333278e-05, "loss": 0.1849, "step": 8961 }, { "epoch": 1.8729362591431555, "grad_norm": 1.0609631930169416, "learning_rate": 1.6108476484110003e-05, "loss": 0.1544, "step": 8962 }, { "epoch": 1.8731452455590385, "grad_norm": 0.9174032547942453, "learning_rate": 1.6107583257147214e-05, "loss": 0.1344, "step": 8963 }, { "epoch": 1.8733542319749217, "grad_norm": 1.1706769155023111, "learning_rate": 1.6106689952455772e-05, "loss": 0.1671, "step": 8964 }, { "epoch": 1.8735632183908046, "grad_norm": 0.8535701837497198, "learning_rate": 1.610579657004705e-05, "loss": 0.1388, "step": 8965 }, { "epoch": 1.8737722048066876, "grad_norm": 1.011927825052447, "learning_rate": 1.6104903109932415e-05, "loss": 0.1573, "step": 8966 }, { "epoch": 1.8739811912225706, "grad_norm": 1.0413307160966288, "learning_rate": 1.6104009572123246e-05, "loss": 0.1919, "step": 8967 }, { "epoch": 1.8741901776384535, "grad_norm": 0.9454122538986807, "learning_rate": 1.6103115956630902e-05, "loss": 0.1379, "step": 8968 }, { "epoch": 1.8743991640543365, "grad_norm": 2.333684697002814, "learning_rate": 1.610222226346677e-05, "loss": 0.1581, "step": 8969 }, { "epoch": 1.8746081504702194, "grad_norm": 0.91252138689756, "learning_rate": 1.610132849264221e-05, "loss": 0.1669, "step": 8970 }, { "epoch": 1.8748171368861024, "grad_norm": 0.8422661792354502, "learning_rate": 1.6100434644168603e-05, "loss": 0.1441, "step": 8971 }, { "epoch": 1.8750261233019854, "grad_norm": 1.0474048075985911, "learning_rate": 1.6099540718057327e-05, "loss": 0.1521, "step": 8972 }, { "epoch": 1.8752351097178683, "grad_norm": 0.9408410258278144, "learning_rate": 1.6098646714319755e-05, "loss": 0.1482, "step": 8973 }, { "epoch": 1.8754440961337513, "grad_norm": 0.863363026107917, "learning_rate": 1.6097752632967263e-05, "loss": 0.1423, "step": 8974 }, { "epoch": 1.8756530825496343, "grad_norm": 0.9977300426395419, "learning_rate": 1.6096858474011235e-05, "loss": 0.1775, "step": 8975 }, { "epoch": 1.8758620689655172, "grad_norm": 0.9984710148829751, "learning_rate": 1.6095964237463045e-05, "loss": 0.1809, "step": 8976 }, { "epoch": 1.8760710553814002, "grad_norm": 1.0080637142411086, "learning_rate": 1.6095069923334078e-05, "loss": 0.1527, "step": 8977 }, { "epoch": 1.8762800417972831, "grad_norm": 0.8754017218046433, "learning_rate": 1.6094175531635714e-05, "loss": 0.162, "step": 8978 }, { "epoch": 1.876489028213166, "grad_norm": 0.8889576516402176, "learning_rate": 1.6093281062379337e-05, "loss": 0.1829, "step": 8979 }, { "epoch": 1.876698014629049, "grad_norm": 0.708366880845947, "learning_rate": 1.6092386515576324e-05, "loss": 0.1437, "step": 8980 }, { "epoch": 1.8769070010449322, "grad_norm": 1.0108350094945189, "learning_rate": 1.6091491891238067e-05, "loss": 0.1821, "step": 8981 }, { "epoch": 1.8771159874608152, "grad_norm": 0.8652267479806965, "learning_rate": 1.609059718937595e-05, "loss": 0.1683, "step": 8982 }, { "epoch": 1.8773249738766982, "grad_norm": 0.9021009524843455, "learning_rate": 1.6089702410001356e-05, "loss": 0.1688, "step": 8983 }, { "epoch": 1.8775339602925811, "grad_norm": 1.020377735369663, "learning_rate": 1.6088807553125677e-05, "loss": 0.1992, "step": 8984 }, { "epoch": 1.877742946708464, "grad_norm": 0.8493460313773425, "learning_rate": 1.60879126187603e-05, "loss": 0.1452, "step": 8985 }, { "epoch": 1.877951933124347, "grad_norm": 1.0280086481070436, "learning_rate": 1.608701760691661e-05, "loss": 0.1696, "step": 8986 }, { "epoch": 1.87816091954023, "grad_norm": 0.9982277392192076, "learning_rate": 1.6086122517606003e-05, "loss": 0.1752, "step": 8987 }, { "epoch": 1.878369905956113, "grad_norm": 0.8116453040943135, "learning_rate": 1.6085227350839868e-05, "loss": 0.1473, "step": 8988 }, { "epoch": 1.878578892371996, "grad_norm": 1.0216862649522709, "learning_rate": 1.6084332106629602e-05, "loss": 0.1686, "step": 8989 }, { "epoch": 1.878787878787879, "grad_norm": 0.8975761867156251, "learning_rate": 1.6083436784986592e-05, "loss": 0.155, "step": 8990 }, { "epoch": 1.8789968652037619, "grad_norm": 0.9831669973413288, "learning_rate": 1.6082541385922234e-05, "loss": 0.1738, "step": 8991 }, { "epoch": 1.8792058516196448, "grad_norm": 0.9305114216830124, "learning_rate": 1.6081645909447924e-05, "loss": 0.1923, "step": 8992 }, { "epoch": 1.8794148380355278, "grad_norm": 0.8819459531147871, "learning_rate": 1.608075035557506e-05, "loss": 0.1415, "step": 8993 }, { "epoch": 1.8796238244514107, "grad_norm": 0.8262919428745267, "learning_rate": 1.6079854724315037e-05, "loss": 0.156, "step": 8994 }, { "epoch": 1.8798328108672937, "grad_norm": 0.9233830122223211, "learning_rate": 1.6078959015679255e-05, "loss": 0.1475, "step": 8995 }, { "epoch": 1.8800417972831767, "grad_norm": 0.8668661615089902, "learning_rate": 1.6078063229679113e-05, "loss": 0.1538, "step": 8996 }, { "epoch": 1.8802507836990596, "grad_norm": 0.7024585731924999, "learning_rate": 1.607716736632601e-05, "loss": 0.1316, "step": 8997 }, { "epoch": 1.8804597701149426, "grad_norm": 1.026060451422262, "learning_rate": 1.6076271425631347e-05, "loss": 0.1617, "step": 8998 }, { "epoch": 1.8806687565308255, "grad_norm": 0.8142800999088027, "learning_rate": 1.6075375407606532e-05, "loss": 0.1541, "step": 8999 }, { "epoch": 1.8808777429467085, "grad_norm": 0.9344499968195722, "learning_rate": 1.6074479312262957e-05, "loss": 0.1369, "step": 9000 }, { "epoch": 1.8810867293625915, "grad_norm": 1.1590887044462994, "learning_rate": 1.6073583139612034e-05, "loss": 0.1511, "step": 9001 }, { "epoch": 1.8812957157784744, "grad_norm": 1.1023440906773319, "learning_rate": 1.607268688966517e-05, "loss": 0.1958, "step": 9002 }, { "epoch": 1.8815047021943574, "grad_norm": 1.211315813265789, "learning_rate": 1.6071790562433767e-05, "loss": 0.1873, "step": 9003 }, { "epoch": 1.8817136886102404, "grad_norm": 1.0544709207689642, "learning_rate": 1.6070894157929235e-05, "loss": 0.1745, "step": 9004 }, { "epoch": 1.8819226750261233, "grad_norm": 0.7754996734052046, "learning_rate": 1.606999767616298e-05, "loss": 0.1539, "step": 9005 }, { "epoch": 1.8821316614420063, "grad_norm": 1.013844358962786, "learning_rate": 1.606910111714641e-05, "loss": 0.1948, "step": 9006 }, { "epoch": 1.8823406478578892, "grad_norm": 0.8649189907822629, "learning_rate": 1.6068204480890935e-05, "loss": 0.1388, "step": 9007 }, { "epoch": 1.8825496342737722, "grad_norm": 0.9301898859474615, "learning_rate": 1.6067307767407974e-05, "loss": 0.1417, "step": 9008 }, { "epoch": 1.8827586206896552, "grad_norm": 0.92289045752852, "learning_rate": 1.6066410976708928e-05, "loss": 0.165, "step": 9009 }, { "epoch": 1.8829676071055381, "grad_norm": 0.773100800497256, "learning_rate": 1.606551410880522e-05, "loss": 0.1433, "step": 9010 }, { "epoch": 1.883176593521421, "grad_norm": 0.7943718440700422, "learning_rate": 1.6064617163708255e-05, "loss": 0.1555, "step": 9011 }, { "epoch": 1.883385579937304, "grad_norm": 0.8350112552449395, "learning_rate": 1.6063720141429452e-05, "loss": 0.1376, "step": 9012 }, { "epoch": 1.883594566353187, "grad_norm": 0.8888531121498248, "learning_rate": 1.6062823041980228e-05, "loss": 0.1828, "step": 9013 }, { "epoch": 1.88380355276907, "grad_norm": 0.9781157461587592, "learning_rate": 1.6061925865372005e-05, "loss": 0.1444, "step": 9014 }, { "epoch": 1.884012539184953, "grad_norm": 1.127476087835021, "learning_rate": 1.6061028611616187e-05, "loss": 0.2042, "step": 9015 }, { "epoch": 1.884221525600836, "grad_norm": 0.871467754593652, "learning_rate": 1.6060131280724205e-05, "loss": 0.1567, "step": 9016 }, { "epoch": 1.8844305120167189, "grad_norm": 0.7466537534024019, "learning_rate": 1.6059233872707475e-05, "loss": 0.1508, "step": 9017 }, { "epoch": 1.8846394984326018, "grad_norm": 0.9949849930525536, "learning_rate": 1.605833638757742e-05, "loss": 0.1723, "step": 9018 }, { "epoch": 1.8848484848484848, "grad_norm": 0.8619892722965742, "learning_rate": 1.6057438825345455e-05, "loss": 0.1458, "step": 9019 }, { "epoch": 1.8850574712643677, "grad_norm": 1.0240224484024898, "learning_rate": 1.6056541186023014e-05, "loss": 0.1548, "step": 9020 }, { "epoch": 1.8852664576802507, "grad_norm": 0.8872497332555658, "learning_rate": 1.605564346962151e-05, "loss": 0.1462, "step": 9021 }, { "epoch": 1.8854754440961337, "grad_norm": 1.0097785224838736, "learning_rate": 1.6054745676152375e-05, "loss": 0.1477, "step": 9022 }, { "epoch": 1.8856844305120166, "grad_norm": 0.9636076742345924, "learning_rate": 1.6053847805627033e-05, "loss": 0.1662, "step": 9023 }, { "epoch": 1.8858934169278996, "grad_norm": 0.7491797966744933, "learning_rate": 1.605294985805691e-05, "loss": 0.1501, "step": 9024 }, { "epoch": 1.8861024033437825, "grad_norm": 0.7269468271297053, "learning_rate": 1.605205183345343e-05, "loss": 0.147, "step": 9025 }, { "epoch": 1.8863113897596655, "grad_norm": 0.7964982084848116, "learning_rate": 1.605115373182803e-05, "loss": 0.1554, "step": 9026 }, { "epoch": 1.8865203761755485, "grad_norm": 0.8679247364761841, "learning_rate": 1.6050255553192137e-05, "loss": 0.1875, "step": 9027 }, { "epoch": 1.8867293625914314, "grad_norm": 0.8371207182553801, "learning_rate": 1.604935729755718e-05, "loss": 0.133, "step": 9028 }, { "epoch": 1.8869383490073144, "grad_norm": 0.8390336192631949, "learning_rate": 1.604845896493459e-05, "loss": 0.1557, "step": 9029 }, { "epoch": 1.8871473354231973, "grad_norm": 1.18945675102276, "learning_rate": 1.6047560555335802e-05, "loss": 0.2163, "step": 9030 }, { "epoch": 1.8873563218390803, "grad_norm": 0.8890502557630364, "learning_rate": 1.6046662068772244e-05, "loss": 0.1825, "step": 9031 }, { "epoch": 1.8875653082549633, "grad_norm": 0.9014204230140117, "learning_rate": 1.604576350525536e-05, "loss": 0.148, "step": 9032 }, { "epoch": 1.8877742946708462, "grad_norm": 0.8332072908356531, "learning_rate": 1.604486486479658e-05, "loss": 0.168, "step": 9033 }, { "epoch": 1.8879832810867294, "grad_norm": 0.8082802279562133, "learning_rate": 1.6043966147407343e-05, "loss": 0.1484, "step": 9034 }, { "epoch": 1.8881922675026124, "grad_norm": 0.8163914763894992, "learning_rate": 1.6043067353099083e-05, "loss": 0.1411, "step": 9035 }, { "epoch": 1.8884012539184953, "grad_norm": 1.0909190127135295, "learning_rate": 1.604216848188324e-05, "loss": 0.1911, "step": 9036 }, { "epoch": 1.8886102403343783, "grad_norm": 0.928027767392697, "learning_rate": 1.6041269533771253e-05, "loss": 0.1537, "step": 9037 }, { "epoch": 1.8888192267502613, "grad_norm": 0.8556781879222042, "learning_rate": 1.604037050877457e-05, "loss": 0.1818, "step": 9038 }, { "epoch": 1.8890282131661442, "grad_norm": 0.8857916740609797, "learning_rate": 1.6039471406904618e-05, "loss": 0.1498, "step": 9039 }, { "epoch": 1.8892371995820272, "grad_norm": 0.6709864280483068, "learning_rate": 1.6038572228172854e-05, "loss": 0.1292, "step": 9040 }, { "epoch": 1.8894461859979101, "grad_norm": 0.8998566851148466, "learning_rate": 1.6037672972590712e-05, "loss": 0.1542, "step": 9041 }, { "epoch": 1.889655172413793, "grad_norm": 0.8170123072364256, "learning_rate": 1.603677364016964e-05, "loss": 0.1656, "step": 9042 }, { "epoch": 1.889864158829676, "grad_norm": 0.816900220398141, "learning_rate": 1.6035874230921086e-05, "loss": 0.144, "step": 9043 }, { "epoch": 1.890073145245559, "grad_norm": 0.8785785919203757, "learning_rate": 1.603497474485649e-05, "loss": 0.1661, "step": 9044 }, { "epoch": 1.890282131661442, "grad_norm": 0.7803977255604346, "learning_rate": 1.6034075181987307e-05, "loss": 0.1336, "step": 9045 }, { "epoch": 1.890491118077325, "grad_norm": 0.95309083027268, "learning_rate": 1.6033175542324977e-05, "loss": 0.2056, "step": 9046 }, { "epoch": 1.890700104493208, "grad_norm": 0.8959431012216107, "learning_rate": 1.6032275825880956e-05, "loss": 0.162, "step": 9047 }, { "epoch": 1.8909090909090909, "grad_norm": 1.0388864687439612, "learning_rate": 1.603137603266669e-05, "loss": 0.1429, "step": 9048 }, { "epoch": 1.8911180773249738, "grad_norm": 0.8011622009811302, "learning_rate": 1.6030476162693633e-05, "loss": 0.1351, "step": 9049 }, { "epoch": 1.8913270637408568, "grad_norm": 0.9801413139499845, "learning_rate": 1.6029576215973236e-05, "loss": 0.1601, "step": 9050 }, { "epoch": 1.8915360501567398, "grad_norm": 1.0826947102431415, "learning_rate": 1.6028676192516954e-05, "loss": 0.1809, "step": 9051 }, { "epoch": 1.891745036572623, "grad_norm": 0.834105215420819, "learning_rate": 1.602777609233624e-05, "loss": 0.1598, "step": 9052 }, { "epoch": 1.891954022988506, "grad_norm": 1.0030599568542227, "learning_rate": 1.602687591544255e-05, "loss": 0.1354, "step": 9053 }, { "epoch": 1.8921630094043889, "grad_norm": 1.0412419866797076, "learning_rate": 1.6025975661847337e-05, "loss": 0.1688, "step": 9054 }, { "epoch": 1.8923719958202718, "grad_norm": 1.0131046434600786, "learning_rate": 1.602507533156206e-05, "loss": 0.1537, "step": 9055 }, { "epoch": 1.8925809822361548, "grad_norm": 0.8043769204731542, "learning_rate": 1.602417492459818e-05, "loss": 0.1369, "step": 9056 }, { "epoch": 1.8927899686520377, "grad_norm": 0.9976829621241764, "learning_rate": 1.602327444096715e-05, "loss": 0.188, "step": 9057 }, { "epoch": 1.8929989550679207, "grad_norm": 1.102528012310267, "learning_rate": 1.6022373880680437e-05, "loss": 0.1856, "step": 9058 }, { "epoch": 1.8932079414838037, "grad_norm": 0.9940212705045236, "learning_rate": 1.6021473243749497e-05, "loss": 0.1856, "step": 9059 }, { "epoch": 1.8934169278996866, "grad_norm": 0.9346326873204795, "learning_rate": 1.60205725301858e-05, "loss": 0.1583, "step": 9060 }, { "epoch": 1.8936259143155696, "grad_norm": 0.8414482957427816, "learning_rate": 1.6019671740000796e-05, "loss": 0.156, "step": 9061 }, { "epoch": 1.8938349007314526, "grad_norm": 0.7920380446542196, "learning_rate": 1.6018770873205958e-05, "loss": 0.1279, "step": 9062 }, { "epoch": 1.8940438871473355, "grad_norm": 1.100040926102657, "learning_rate": 1.601786992981275e-05, "loss": 0.1835, "step": 9063 }, { "epoch": 1.8942528735632185, "grad_norm": 1.2107600958432414, "learning_rate": 1.6016968909832632e-05, "loss": 0.1993, "step": 9064 }, { "epoch": 1.8944618599791014, "grad_norm": 0.861170315710337, "learning_rate": 1.601606781327708e-05, "loss": 0.16, "step": 9065 }, { "epoch": 1.8946708463949844, "grad_norm": 0.8162219481282138, "learning_rate": 1.6015166640157556e-05, "loss": 0.1449, "step": 9066 }, { "epoch": 1.8948798328108674, "grad_norm": 0.9463294929720943, "learning_rate": 1.6014265390485527e-05, "loss": 0.1487, "step": 9067 }, { "epoch": 1.8950888192267503, "grad_norm": 1.1936369120528645, "learning_rate": 1.6013364064272474e-05, "loss": 0.186, "step": 9068 }, { "epoch": 1.8952978056426333, "grad_norm": 0.9315915632957575, "learning_rate": 1.6012462661529854e-05, "loss": 0.1758, "step": 9069 }, { "epoch": 1.8955067920585162, "grad_norm": 0.8639706755658627, "learning_rate": 1.6011561182269145e-05, "loss": 0.1592, "step": 9070 }, { "epoch": 1.8957157784743992, "grad_norm": 1.1399085582835369, "learning_rate": 1.601065962650182e-05, "loss": 0.1558, "step": 9071 }, { "epoch": 1.8959247648902822, "grad_norm": 0.7675443526666084, "learning_rate": 1.6009757994239358e-05, "loss": 0.1533, "step": 9072 }, { "epoch": 1.8961337513061651, "grad_norm": 0.8998125839093208, "learning_rate": 1.600885628549322e-05, "loss": 0.1644, "step": 9073 }, { "epoch": 1.896342737722048, "grad_norm": 0.9597560122600363, "learning_rate": 1.6007954500274895e-05, "loss": 0.1501, "step": 9074 }, { "epoch": 1.896551724137931, "grad_norm": 0.8486163158779378, "learning_rate": 1.600705263859585e-05, "loss": 0.1973, "step": 9075 }, { "epoch": 1.896760710553814, "grad_norm": 1.0585228447476436, "learning_rate": 1.600615070046757e-05, "loss": 0.1692, "step": 9076 }, { "epoch": 1.896969696969697, "grad_norm": 0.9355334589535831, "learning_rate": 1.6005248685901526e-05, "loss": 0.1598, "step": 9077 }, { "epoch": 1.89717868338558, "grad_norm": 1.0257625623718707, "learning_rate": 1.6004346594909207e-05, "loss": 0.1638, "step": 9078 }, { "epoch": 1.897387669801463, "grad_norm": 0.8899574436344023, "learning_rate": 1.6003444427502082e-05, "loss": 0.1665, "step": 9079 }, { "epoch": 1.8975966562173459, "grad_norm": 0.8142083749736038, "learning_rate": 1.6002542183691647e-05, "loss": 0.1693, "step": 9080 }, { "epoch": 1.8978056426332288, "grad_norm": 0.9353627254284687, "learning_rate": 1.600163986348937e-05, "loss": 0.1611, "step": 9081 }, { "epoch": 1.8980146290491118, "grad_norm": 0.9610650119088192, "learning_rate": 1.6000737466906745e-05, "loss": 0.1644, "step": 9082 }, { "epoch": 1.8982236154649947, "grad_norm": 0.8116861008268687, "learning_rate": 1.599983499395525e-05, "loss": 0.1412, "step": 9083 }, { "epoch": 1.8984326018808777, "grad_norm": 0.9575428424175952, "learning_rate": 1.5998932444646376e-05, "loss": 0.176, "step": 9084 }, { "epoch": 1.8986415882967607, "grad_norm": 0.9240150032951525, "learning_rate": 1.5998029818991604e-05, "loss": 0.1672, "step": 9085 }, { "epoch": 1.8988505747126436, "grad_norm": 0.8502184934078466, "learning_rate": 1.5997127117002423e-05, "loss": 0.1646, "step": 9086 }, { "epoch": 1.8990595611285266, "grad_norm": 0.9859923752952741, "learning_rate": 1.5996224338690325e-05, "loss": 0.1836, "step": 9087 }, { "epoch": 1.8992685475444095, "grad_norm": 1.071443877367482, "learning_rate": 1.599532148406679e-05, "loss": 0.1599, "step": 9088 }, { "epoch": 1.8994775339602925, "grad_norm": 1.1118125142410675, "learning_rate": 1.599441855314332e-05, "loss": 0.1663, "step": 9089 }, { "epoch": 1.8996865203761755, "grad_norm": 0.766919588940978, "learning_rate": 1.5993515545931396e-05, "loss": 0.1341, "step": 9090 }, { "epoch": 1.8998955067920584, "grad_norm": 0.9015490150740113, "learning_rate": 1.599261246244252e-05, "loss": 0.1701, "step": 9091 }, { "epoch": 1.9001044932079414, "grad_norm": 0.9055348936279973, "learning_rate": 1.5991709302688175e-05, "loss": 0.1342, "step": 9092 }, { "epoch": 1.9003134796238244, "grad_norm": 0.904305118934169, "learning_rate": 1.5990806066679863e-05, "loss": 0.1846, "step": 9093 }, { "epoch": 1.9005224660397073, "grad_norm": 0.908093127490087, "learning_rate": 1.5989902754429074e-05, "loss": 0.1715, "step": 9094 }, { "epoch": 1.9007314524555903, "grad_norm": 0.7744405240320025, "learning_rate": 1.5988999365947306e-05, "loss": 0.1766, "step": 9095 }, { "epoch": 1.9009404388714732, "grad_norm": 0.8987825519030563, "learning_rate": 1.5988095901246058e-05, "loss": 0.1841, "step": 9096 }, { "epoch": 1.9011494252873562, "grad_norm": 0.7802847461830189, "learning_rate": 1.5987192360336828e-05, "loss": 0.1314, "step": 9097 }, { "epoch": 1.9013584117032392, "grad_norm": 1.0493228927893523, "learning_rate": 1.598628874323111e-05, "loss": 0.1587, "step": 9098 }, { "epoch": 1.9015673981191221, "grad_norm": 0.9903388557418774, "learning_rate": 1.598538504994041e-05, "loss": 0.1916, "step": 9099 }, { "epoch": 1.901776384535005, "grad_norm": 1.0716838209470376, "learning_rate": 1.598448128047622e-05, "loss": 0.1394, "step": 9100 }, { "epoch": 1.901985370950888, "grad_norm": 0.8472438381130327, "learning_rate": 1.5983577434850053e-05, "loss": 0.1445, "step": 9101 }, { "epoch": 1.902194357366771, "grad_norm": 0.9621371607337319, "learning_rate": 1.5982673513073408e-05, "loss": 0.1626, "step": 9102 }, { "epoch": 1.902403343782654, "grad_norm": 0.8086477551406741, "learning_rate": 1.5981769515157787e-05, "loss": 0.1728, "step": 9103 }, { "epoch": 1.9026123301985371, "grad_norm": 0.8896295156091573, "learning_rate": 1.5980865441114693e-05, "loss": 0.1754, "step": 9104 }, { "epoch": 1.90282131661442, "grad_norm": 0.913337979372374, "learning_rate": 1.5979961290955637e-05, "loss": 0.2021, "step": 9105 }, { "epoch": 1.903030303030303, "grad_norm": 0.9242636144961259, "learning_rate": 1.597905706469212e-05, "loss": 0.1439, "step": 9106 }, { "epoch": 1.903239289446186, "grad_norm": 0.8048708683850255, "learning_rate": 1.5978152762335657e-05, "loss": 0.1455, "step": 9107 }, { "epoch": 1.903448275862069, "grad_norm": 1.1037306164846075, "learning_rate": 1.597724838389775e-05, "loss": 0.1854, "step": 9108 }, { "epoch": 1.903657262277952, "grad_norm": 0.9200431664080517, "learning_rate": 1.597634392938991e-05, "loss": 0.1529, "step": 9109 }, { "epoch": 1.903866248693835, "grad_norm": 0.8417248916882789, "learning_rate": 1.597543939882365e-05, "loss": 0.1496, "step": 9110 }, { "epoch": 1.9040752351097179, "grad_norm": 0.9288796184352798, "learning_rate": 1.5974534792210482e-05, "loss": 0.1602, "step": 9111 }, { "epoch": 1.9042842215256008, "grad_norm": 0.9676283811540871, "learning_rate": 1.5973630109561914e-05, "loss": 0.1796, "step": 9112 }, { "epoch": 1.9044932079414838, "grad_norm": 0.9784641630121099, "learning_rate": 1.597272535088946e-05, "loss": 0.1389, "step": 9113 }, { "epoch": 1.9047021943573668, "grad_norm": 1.0221600348120417, "learning_rate": 1.5971820516204643e-05, "loss": 0.1645, "step": 9114 }, { "epoch": 1.9049111807732497, "grad_norm": 1.4546838355984275, "learning_rate": 1.597091560551897e-05, "loss": 0.1547, "step": 9115 }, { "epoch": 1.9051201671891327, "grad_norm": 0.9972369961069698, "learning_rate": 1.5970010618843957e-05, "loss": 0.1784, "step": 9116 }, { "epoch": 1.9053291536050156, "grad_norm": 0.7839596472448904, "learning_rate": 1.5969105556191123e-05, "loss": 0.1297, "step": 9117 }, { "epoch": 1.9055381400208986, "grad_norm": 1.0472800636730533, "learning_rate": 1.5968200417571994e-05, "loss": 0.1681, "step": 9118 }, { "epoch": 1.9057471264367816, "grad_norm": 0.7571700776863594, "learning_rate": 1.5967295202998078e-05, "loss": 0.1573, "step": 9119 }, { "epoch": 1.9059561128526645, "grad_norm": 0.7218920391377224, "learning_rate": 1.59663899124809e-05, "loss": 0.1467, "step": 9120 }, { "epoch": 1.9061650992685475, "grad_norm": 0.9488983801177984, "learning_rate": 1.596548454603198e-05, "loss": 0.1877, "step": 9121 }, { "epoch": 1.9063740856844307, "grad_norm": 0.8947488659257632, "learning_rate": 1.596457910366285e-05, "loss": 0.1367, "step": 9122 }, { "epoch": 1.9065830721003136, "grad_norm": 0.8385875826034938, "learning_rate": 1.5963673585385016e-05, "loss": 0.1889, "step": 9123 }, { "epoch": 1.9067920585161966, "grad_norm": 0.9350285762549488, "learning_rate": 1.5962767991210012e-05, "loss": 0.1598, "step": 9124 }, { "epoch": 1.9070010449320796, "grad_norm": 1.0150619425871614, "learning_rate": 1.5961862321149364e-05, "loss": 0.1512, "step": 9125 }, { "epoch": 1.9072100313479625, "grad_norm": 0.8931301964153894, "learning_rate": 1.5960956575214597e-05, "loss": 0.1652, "step": 9126 }, { "epoch": 1.9074190177638455, "grad_norm": 1.0404268593458261, "learning_rate": 1.5960050753417236e-05, "loss": 0.1611, "step": 9127 }, { "epoch": 1.9076280041797284, "grad_norm": 0.8285860285330787, "learning_rate": 1.5959144855768806e-05, "loss": 0.1794, "step": 9128 }, { "epoch": 1.9078369905956114, "grad_norm": 1.2044738383754072, "learning_rate": 1.5958238882280847e-05, "loss": 0.1498, "step": 9129 }, { "epoch": 1.9080459770114944, "grad_norm": 0.9608297319828135, "learning_rate": 1.5957332832964877e-05, "loss": 0.1816, "step": 9130 }, { "epoch": 1.9082549634273773, "grad_norm": 0.794435218628894, "learning_rate": 1.5956426707832437e-05, "loss": 0.146, "step": 9131 }, { "epoch": 1.9084639498432603, "grad_norm": 1.003845829088077, "learning_rate": 1.5955520506895047e-05, "loss": 0.1598, "step": 9132 }, { "epoch": 1.9086729362591432, "grad_norm": 0.9612283792980726, "learning_rate": 1.595461423016425e-05, "loss": 0.1718, "step": 9133 }, { "epoch": 1.9088819226750262, "grad_norm": 0.9310875022416777, "learning_rate": 1.5953707877651577e-05, "loss": 0.1496, "step": 9134 }, { "epoch": 1.9090909090909092, "grad_norm": 0.947840698256276, "learning_rate": 1.5952801449368563e-05, "loss": 0.1802, "step": 9135 }, { "epoch": 1.9092998955067921, "grad_norm": 0.8739220102649699, "learning_rate": 1.595189494532674e-05, "loss": 0.1504, "step": 9136 }, { "epoch": 1.909508881922675, "grad_norm": 1.1435032006085897, "learning_rate": 1.5950988365537652e-05, "loss": 0.1839, "step": 9137 }, { "epoch": 1.909717868338558, "grad_norm": 1.0441203421262213, "learning_rate": 1.5950081710012825e-05, "loss": 0.1546, "step": 9138 }, { "epoch": 1.909926854754441, "grad_norm": 1.0314681379468047, "learning_rate": 1.594917497876381e-05, "loss": 0.1419, "step": 9139 }, { "epoch": 1.910135841170324, "grad_norm": 0.8840275273514143, "learning_rate": 1.594826817180214e-05, "loss": 0.1619, "step": 9140 }, { "epoch": 1.910344827586207, "grad_norm": 0.6947348528017085, "learning_rate": 1.594736128913936e-05, "loss": 0.1183, "step": 9141 }, { "epoch": 1.91055381400209, "grad_norm": 0.8061567307679025, "learning_rate": 1.5946454330787008e-05, "loss": 0.1446, "step": 9142 }, { "epoch": 1.9107628004179729, "grad_norm": 0.874753494293215, "learning_rate": 1.5945547296756628e-05, "loss": 0.1715, "step": 9143 }, { "epoch": 1.9109717868338558, "grad_norm": 1.0346552392438504, "learning_rate": 1.5944640187059758e-05, "loss": 0.1316, "step": 9144 }, { "epoch": 1.9111807732497388, "grad_norm": 0.8634449996389562, "learning_rate": 1.5943733001707952e-05, "loss": 0.1607, "step": 9145 }, { "epoch": 1.9113897596656217, "grad_norm": 0.8468548775433118, "learning_rate": 1.5942825740712742e-05, "loss": 0.1549, "step": 9146 }, { "epoch": 1.9115987460815047, "grad_norm": 0.8502795102729457, "learning_rate": 1.594191840408569e-05, "loss": 0.1845, "step": 9147 }, { "epoch": 1.9118077324973877, "grad_norm": 0.96706395722431, "learning_rate": 1.5941010991838335e-05, "loss": 0.1642, "step": 9148 }, { "epoch": 1.9120167189132706, "grad_norm": 0.945838950401518, "learning_rate": 1.5940103503982223e-05, "loss": 0.1686, "step": 9149 }, { "epoch": 1.9122257053291536, "grad_norm": 0.95434858585133, "learning_rate": 1.593919594052891e-05, "loss": 0.1465, "step": 9150 }, { "epoch": 1.9124346917450366, "grad_norm": 0.8580801347155375, "learning_rate": 1.5938288301489943e-05, "loss": 0.1542, "step": 9151 }, { "epoch": 1.9126436781609195, "grad_norm": 0.7553193907659479, "learning_rate": 1.593738058687687e-05, "loss": 0.1336, "step": 9152 }, { "epoch": 1.9128526645768025, "grad_norm": 1.0618399461016188, "learning_rate": 1.5936472796701246e-05, "loss": 0.1769, "step": 9153 }, { "epoch": 1.9130616509926854, "grad_norm": 0.892802410829511, "learning_rate": 1.5935564930974622e-05, "loss": 0.1393, "step": 9154 }, { "epoch": 1.9132706374085684, "grad_norm": 0.8264956325449334, "learning_rate": 1.5934656989708557e-05, "loss": 0.1578, "step": 9155 }, { "epoch": 1.9134796238244514, "grad_norm": 0.8215152414772832, "learning_rate": 1.59337489729146e-05, "loss": 0.1548, "step": 9156 }, { "epoch": 1.9136886102403343, "grad_norm": 0.8727455556481037, "learning_rate": 1.5932840880604316e-05, "loss": 0.147, "step": 9157 }, { "epoch": 1.9138975966562173, "grad_norm": 0.8877511098785662, "learning_rate": 1.593193271278925e-05, "loss": 0.1484, "step": 9158 }, { "epoch": 1.9141065830721002, "grad_norm": 0.9712943955121373, "learning_rate": 1.593102446948097e-05, "loss": 0.1317, "step": 9159 }, { "epoch": 1.9143155694879832, "grad_norm": 0.8691305579855754, "learning_rate": 1.5930116150691026e-05, "loss": 0.1504, "step": 9160 }, { "epoch": 1.9145245559038662, "grad_norm": 0.9420294450587839, "learning_rate": 1.5929207756430982e-05, "loss": 0.1627, "step": 9161 }, { "epoch": 1.9147335423197491, "grad_norm": 1.0285853036392079, "learning_rate": 1.5928299286712404e-05, "loss": 0.1481, "step": 9162 }, { "epoch": 1.914942528735632, "grad_norm": 0.9389066039198052, "learning_rate": 1.592739074154684e-05, "loss": 0.1318, "step": 9163 }, { "epoch": 1.915151515151515, "grad_norm": 0.9365170714747942, "learning_rate": 1.592648212094587e-05, "loss": 0.1366, "step": 9164 }, { "epoch": 1.915360501567398, "grad_norm": 0.8897496900169383, "learning_rate": 1.592557342492105e-05, "loss": 0.1643, "step": 9165 }, { "epoch": 1.915569487983281, "grad_norm": 0.8378624632038256, "learning_rate": 1.5924664653483935e-05, "loss": 0.1546, "step": 9166 }, { "epoch": 1.915778474399164, "grad_norm": 1.006674646172103, "learning_rate": 1.5923755806646103e-05, "loss": 0.1932, "step": 9167 }, { "epoch": 1.915987460815047, "grad_norm": 0.9182882616473531, "learning_rate": 1.592284688441912e-05, "loss": 0.176, "step": 9168 }, { "epoch": 1.9161964472309299, "grad_norm": 0.8819732863264845, "learning_rate": 1.5921937886814546e-05, "loss": 0.1357, "step": 9169 }, { "epoch": 1.9164054336468128, "grad_norm": 1.0051847599493011, "learning_rate": 1.5921028813843956e-05, "loss": 0.1679, "step": 9170 }, { "epoch": 1.9166144200626958, "grad_norm": 1.0049911505438083, "learning_rate": 1.5920119665518915e-05, "loss": 0.1756, "step": 9171 }, { "epoch": 1.9168234064785787, "grad_norm": 0.9860145686985677, "learning_rate": 1.5919210441851e-05, "loss": 0.1239, "step": 9172 }, { "epoch": 1.9170323928944617, "grad_norm": 1.085517917807454, "learning_rate": 1.591830114285177e-05, "loss": 0.169, "step": 9173 }, { "epoch": 1.9172413793103447, "grad_norm": 0.8884905377571726, "learning_rate": 1.591739176853281e-05, "loss": 0.173, "step": 9174 }, { "epoch": 1.9174503657262278, "grad_norm": 0.823746335140751, "learning_rate": 1.5916482318905685e-05, "loss": 0.1397, "step": 9175 }, { "epoch": 1.9176593521421108, "grad_norm": 1.0517365292327887, "learning_rate": 1.5915572793981973e-05, "loss": 0.1425, "step": 9176 }, { "epoch": 1.9178683385579938, "grad_norm": 0.8290142821996342, "learning_rate": 1.591466319377325e-05, "loss": 0.1464, "step": 9177 }, { "epoch": 1.9180773249738767, "grad_norm": 1.0584616668250855, "learning_rate": 1.5913753518291093e-05, "loss": 0.1393, "step": 9178 }, { "epoch": 1.9182863113897597, "grad_norm": 0.971963122165369, "learning_rate": 1.591284376754707e-05, "loss": 0.1625, "step": 9179 }, { "epoch": 1.9184952978056427, "grad_norm": 0.8861047015661668, "learning_rate": 1.591193394155277e-05, "loss": 0.1371, "step": 9180 }, { "epoch": 1.9187042842215256, "grad_norm": 0.9663371778283765, "learning_rate": 1.5911024040319767e-05, "loss": 0.1894, "step": 9181 }, { "epoch": 1.9189132706374086, "grad_norm": 0.9003319721606738, "learning_rate": 1.591011406385964e-05, "loss": 0.1645, "step": 9182 }, { "epoch": 1.9191222570532915, "grad_norm": 0.932633912421727, "learning_rate": 1.5909204012183974e-05, "loss": 0.1462, "step": 9183 }, { "epoch": 1.9193312434691745, "grad_norm": 1.008112547934739, "learning_rate": 1.5908293885304348e-05, "loss": 0.1572, "step": 9184 }, { "epoch": 1.9195402298850575, "grad_norm": 0.8481835354117659, "learning_rate": 1.5907383683232343e-05, "loss": 0.1461, "step": 9185 }, { "epoch": 1.9197492163009404, "grad_norm": 0.8946505054960475, "learning_rate": 1.5906473405979545e-05, "loss": 0.1583, "step": 9186 }, { "epoch": 1.9199582027168234, "grad_norm": 0.9884747141077516, "learning_rate": 1.590556305355754e-05, "loss": 0.2137, "step": 9187 }, { "epoch": 1.9201671891327063, "grad_norm": 0.8323168002980571, "learning_rate": 1.5904652625977913e-05, "loss": 0.1587, "step": 9188 }, { "epoch": 1.9203761755485893, "grad_norm": 0.8137929574228161, "learning_rate": 1.5903742123252245e-05, "loss": 0.1575, "step": 9189 }, { "epoch": 1.9205851619644723, "grad_norm": 1.0431161732493048, "learning_rate": 1.5902831545392136e-05, "loss": 0.1753, "step": 9190 }, { "epoch": 1.9207941483803552, "grad_norm": 1.0273948634620889, "learning_rate": 1.590192089240916e-05, "loss": 0.179, "step": 9191 }, { "epoch": 1.9210031347962384, "grad_norm": 1.0467854658374216, "learning_rate": 1.5901010164314915e-05, "loss": 0.1783, "step": 9192 }, { "epoch": 1.9212121212121214, "grad_norm": 0.7707395184334217, "learning_rate": 1.5900099361120992e-05, "loss": 0.1406, "step": 9193 }, { "epoch": 1.9214211076280043, "grad_norm": 0.9961574104726494, "learning_rate": 1.5899188482838977e-05, "loss": 0.1709, "step": 9194 }, { "epoch": 1.9216300940438873, "grad_norm": 0.9137377172035402, "learning_rate": 1.589827752948047e-05, "loss": 0.1485, "step": 9195 }, { "epoch": 1.9218390804597703, "grad_norm": 0.9547536963089992, "learning_rate": 1.5897366501057056e-05, "loss": 0.1493, "step": 9196 }, { "epoch": 1.9220480668756532, "grad_norm": 0.7652184807601033, "learning_rate": 1.5896455397580338e-05, "loss": 0.1585, "step": 9197 }, { "epoch": 1.9222570532915362, "grad_norm": 0.7565300134503424, "learning_rate": 1.58955442190619e-05, "loss": 0.1521, "step": 9198 }, { "epoch": 1.9224660397074191, "grad_norm": 0.9382470261778793, "learning_rate": 1.589463296551335e-05, "loss": 0.1644, "step": 9199 }, { "epoch": 1.922675026123302, "grad_norm": 0.7949234080058004, "learning_rate": 1.589372163694628e-05, "loss": 0.1354, "step": 9200 }, { "epoch": 1.922884012539185, "grad_norm": 1.2876209090605817, "learning_rate": 1.5892810233372284e-05, "loss": 0.166, "step": 9201 }, { "epoch": 1.923092998955068, "grad_norm": 0.8331985247387804, "learning_rate": 1.589189875480297e-05, "loss": 0.1638, "step": 9202 }, { "epoch": 1.923301985370951, "grad_norm": 1.0308674341024473, "learning_rate": 1.5890987201249928e-05, "loss": 0.1698, "step": 9203 }, { "epoch": 1.923510971786834, "grad_norm": 0.7046774614316973, "learning_rate": 1.5890075572724767e-05, "loss": 0.1319, "step": 9204 }, { "epoch": 1.923719958202717, "grad_norm": 0.9336051409949188, "learning_rate": 1.5889163869239084e-05, "loss": 0.1408, "step": 9205 }, { "epoch": 1.9239289446185999, "grad_norm": 0.9205658387468527, "learning_rate": 1.5888252090804487e-05, "loss": 0.1909, "step": 9206 }, { "epoch": 1.9241379310344828, "grad_norm": 0.8271455453287979, "learning_rate": 1.588734023743257e-05, "loss": 0.1939, "step": 9207 }, { "epoch": 1.9243469174503658, "grad_norm": 1.0712346360522287, "learning_rate": 1.588642830913495e-05, "loss": 0.1903, "step": 9208 }, { "epoch": 1.9245559038662488, "grad_norm": 0.8358792529923595, "learning_rate": 1.5885516305923224e-05, "loss": 0.1442, "step": 9209 }, { "epoch": 1.9247648902821317, "grad_norm": 0.9160462500758407, "learning_rate": 1.5884604227809008e-05, "loss": 0.1391, "step": 9210 }, { "epoch": 1.9249738766980147, "grad_norm": 0.9517251165019596, "learning_rate": 1.5883692074803895e-05, "loss": 0.1929, "step": 9211 }, { "epoch": 1.9251828631138976, "grad_norm": 0.8203584592856814, "learning_rate": 1.5882779846919505e-05, "loss": 0.1479, "step": 9212 }, { "epoch": 1.9253918495297806, "grad_norm": 1.0133489119688865, "learning_rate": 1.5881867544167448e-05, "loss": 0.1602, "step": 9213 }, { "epoch": 1.9256008359456636, "grad_norm": 1.0512936476788306, "learning_rate": 1.5880955166559325e-05, "loss": 0.1531, "step": 9214 }, { "epoch": 1.9258098223615465, "grad_norm": 0.987945051981838, "learning_rate": 1.5880042714106754e-05, "loss": 0.1624, "step": 9215 }, { "epoch": 1.9260188087774295, "grad_norm": 0.9911570977230877, "learning_rate": 1.5879130186821354e-05, "loss": 0.1818, "step": 9216 }, { "epoch": 1.9262277951933124, "grad_norm": 0.9649983414986911, "learning_rate": 1.5878217584714724e-05, "loss": 0.1694, "step": 9217 }, { "epoch": 1.9264367816091954, "grad_norm": 0.9811336354506056, "learning_rate": 1.587730490779849e-05, "loss": 0.1196, "step": 9218 }, { "epoch": 1.9266457680250784, "grad_norm": 0.9509048448029691, "learning_rate": 1.5876392156084256e-05, "loss": 0.1784, "step": 9219 }, { "epoch": 1.9268547544409613, "grad_norm": 1.112947003482659, "learning_rate": 1.587547932958365e-05, "loss": 0.1708, "step": 9220 }, { "epoch": 1.9270637408568443, "grad_norm": 0.9824551836182213, "learning_rate": 1.5874566428308287e-05, "loss": 0.1871, "step": 9221 }, { "epoch": 1.9272727272727272, "grad_norm": 1.0913905693071368, "learning_rate": 1.5873653452269778e-05, "loss": 0.1632, "step": 9222 }, { "epoch": 1.9274817136886102, "grad_norm": 1.2554910140322264, "learning_rate": 1.5872740401479747e-05, "loss": 0.1969, "step": 9223 }, { "epoch": 1.9276907001044932, "grad_norm": 0.8033537051841908, "learning_rate": 1.5871827275949815e-05, "loss": 0.1396, "step": 9224 }, { "epoch": 1.9278996865203761, "grad_norm": 1.0392758314239703, "learning_rate": 1.58709140756916e-05, "loss": 0.1715, "step": 9225 }, { "epoch": 1.928108672936259, "grad_norm": 0.7172848128513918, "learning_rate": 1.5870000800716724e-05, "loss": 0.1513, "step": 9226 }, { "epoch": 1.928317659352142, "grad_norm": 0.8544897862835494, "learning_rate": 1.5869087451036813e-05, "loss": 0.1431, "step": 9227 }, { "epoch": 1.928526645768025, "grad_norm": 0.6637869984045898, "learning_rate": 1.5868174026663486e-05, "loss": 0.1423, "step": 9228 }, { "epoch": 1.928735632183908, "grad_norm": 0.9330382170721332, "learning_rate": 1.5867260527608373e-05, "loss": 0.1453, "step": 9229 }, { "epoch": 1.928944618599791, "grad_norm": 1.140598956139322, "learning_rate": 1.5866346953883097e-05, "loss": 0.194, "step": 9230 }, { "epoch": 1.929153605015674, "grad_norm": 0.9035726376936257, "learning_rate": 1.5865433305499288e-05, "loss": 0.1374, "step": 9231 }, { "epoch": 1.9293625914315569, "grad_norm": 0.8109509930268208, "learning_rate": 1.5864519582468565e-05, "loss": 0.1585, "step": 9232 }, { "epoch": 1.9295715778474398, "grad_norm": 0.9104783826848865, "learning_rate": 1.5863605784802567e-05, "loss": 0.136, "step": 9233 }, { "epoch": 1.9297805642633228, "grad_norm": 0.9879599326296779, "learning_rate": 1.5862691912512916e-05, "loss": 0.1704, "step": 9234 }, { "epoch": 1.9299895506792057, "grad_norm": 0.9859958622536116, "learning_rate": 1.5861777965611246e-05, "loss": 0.1867, "step": 9235 }, { "epoch": 1.9301985370950887, "grad_norm": 0.7856703670721782, "learning_rate": 1.5860863944109186e-05, "loss": 0.1224, "step": 9236 }, { "epoch": 1.9304075235109717, "grad_norm": 0.9114452660073846, "learning_rate": 1.5859949848018373e-05, "loss": 0.1555, "step": 9237 }, { "epoch": 1.9306165099268546, "grad_norm": 0.8316832540729432, "learning_rate": 1.5859035677350436e-05, "loss": 0.1521, "step": 9238 }, { "epoch": 1.9308254963427376, "grad_norm": 0.9017364552266798, "learning_rate": 1.585812143211701e-05, "loss": 0.1722, "step": 9239 }, { "epoch": 1.9310344827586206, "grad_norm": 1.0446905821606614, "learning_rate": 1.5857207112329732e-05, "loss": 0.2022, "step": 9240 }, { "epoch": 1.9312434691745035, "grad_norm": 0.7476761021547327, "learning_rate": 1.5856292718000235e-05, "loss": 0.1624, "step": 9241 }, { "epoch": 1.9314524555903865, "grad_norm": 1.074208820764905, "learning_rate": 1.585537824914016e-05, "loss": 0.1488, "step": 9242 }, { "epoch": 1.9316614420062694, "grad_norm": 1.0346385242751337, "learning_rate": 1.585446370576114e-05, "loss": 0.1692, "step": 9243 }, { "epoch": 1.9318704284221524, "grad_norm": 0.8801522875969153, "learning_rate": 1.585354908787482e-05, "loss": 0.1465, "step": 9244 }, { "epoch": 1.9320794148380356, "grad_norm": 0.8788819963806919, "learning_rate": 1.5852634395492837e-05, "loss": 0.1509, "step": 9245 }, { "epoch": 1.9322884012539185, "grad_norm": 0.782103399494298, "learning_rate": 1.585171962862683e-05, "loss": 0.1734, "step": 9246 }, { "epoch": 1.9324973876698015, "grad_norm": 0.9110903320228424, "learning_rate": 1.585080478728845e-05, "loss": 0.1541, "step": 9247 }, { "epoch": 1.9327063740856845, "grad_norm": 0.7976357490543696, "learning_rate": 1.5849889871489325e-05, "loss": 0.1455, "step": 9248 }, { "epoch": 1.9329153605015674, "grad_norm": 0.9834633869414225, "learning_rate": 1.5848974881241107e-05, "loss": 0.1338, "step": 9249 }, { "epoch": 1.9331243469174504, "grad_norm": 0.8557998141796346, "learning_rate": 1.584805981655544e-05, "loss": 0.1565, "step": 9250 }, { "epoch": 1.9333333333333333, "grad_norm": 1.1247581996974951, "learning_rate": 1.5847144677443974e-05, "loss": 0.1494, "step": 9251 }, { "epoch": 1.9335423197492163, "grad_norm": 0.8197226768777675, "learning_rate": 1.5846229463918346e-05, "loss": 0.1516, "step": 9252 }, { "epoch": 1.9337513061650993, "grad_norm": 0.8571105344513286, "learning_rate": 1.5845314175990215e-05, "loss": 0.1657, "step": 9253 }, { "epoch": 1.9339602925809822, "grad_norm": 0.929375475955579, "learning_rate": 1.5844398813671218e-05, "loss": 0.1938, "step": 9254 }, { "epoch": 1.9341692789968652, "grad_norm": 0.7722742489284611, "learning_rate": 1.584348337697301e-05, "loss": 0.1677, "step": 9255 }, { "epoch": 1.9343782654127482, "grad_norm": 0.9189663404820287, "learning_rate": 1.5842567865907246e-05, "loss": 0.1698, "step": 9256 }, { "epoch": 1.9345872518286311, "grad_norm": 0.8556519095488173, "learning_rate": 1.5841652280485565e-05, "loss": 0.1744, "step": 9257 }, { "epoch": 1.934796238244514, "grad_norm": 0.7724145576547482, "learning_rate": 1.5840736620719636e-05, "loss": 0.1429, "step": 9258 }, { "epoch": 1.935005224660397, "grad_norm": 0.8404407730560877, "learning_rate": 1.5839820886621097e-05, "loss": 0.1485, "step": 9259 }, { "epoch": 1.93521421107628, "grad_norm": 1.2649117759532509, "learning_rate": 1.583890507820161e-05, "loss": 0.1445, "step": 9260 }, { "epoch": 1.935423197492163, "grad_norm": 0.8957592628460175, "learning_rate": 1.583798919547283e-05, "loss": 0.1824, "step": 9261 }, { "epoch": 1.935632183908046, "grad_norm": 0.973584639903803, "learning_rate": 1.5837073238446406e-05, "loss": 0.1999, "step": 9262 }, { "epoch": 1.935841170323929, "grad_norm": 0.8758464156049998, "learning_rate": 1.5836157207134e-05, "loss": 0.2078, "step": 9263 }, { "epoch": 1.936050156739812, "grad_norm": 0.9237417336146504, "learning_rate": 1.5835241101547276e-05, "loss": 0.1562, "step": 9264 }, { "epoch": 1.936259143155695, "grad_norm": 0.9394643391063155, "learning_rate": 1.5834324921697886e-05, "loss": 0.1477, "step": 9265 }, { "epoch": 1.936468129571578, "grad_norm": 0.7888430491018641, "learning_rate": 1.5833408667597486e-05, "loss": 0.1613, "step": 9266 }, { "epoch": 1.936677115987461, "grad_norm": 0.827304317166118, "learning_rate": 1.583249233925775e-05, "loss": 0.1734, "step": 9267 }, { "epoch": 1.936886102403344, "grad_norm": 0.8463219295562541, "learning_rate": 1.5831575936690325e-05, "loss": 0.1311, "step": 9268 }, { "epoch": 1.9370950888192269, "grad_norm": 0.9589103335319702, "learning_rate": 1.5830659459906885e-05, "loss": 0.1471, "step": 9269 }, { "epoch": 1.9373040752351098, "grad_norm": 0.6819373744552567, "learning_rate": 1.5829742908919082e-05, "loss": 0.1193, "step": 9270 }, { "epoch": 1.9375130616509928, "grad_norm": 1.0245275748739642, "learning_rate": 1.5828826283738593e-05, "loss": 0.1778, "step": 9271 }, { "epoch": 1.9377220480668758, "grad_norm": 0.8749556134807169, "learning_rate": 1.5827909584377076e-05, "loss": 0.144, "step": 9272 }, { "epoch": 1.9379310344827587, "grad_norm": 0.8137012648816556, "learning_rate": 1.58269928108462e-05, "loss": 0.1779, "step": 9273 }, { "epoch": 1.9381400208986417, "grad_norm": 1.03572001961522, "learning_rate": 1.582607596315763e-05, "loss": 0.1613, "step": 9274 }, { "epoch": 1.9383490073145246, "grad_norm": 0.9808053702292411, "learning_rate": 1.5825159041323038e-05, "loss": 0.1459, "step": 9275 }, { "epoch": 1.9385579937304076, "grad_norm": 1.0107331976753484, "learning_rate": 1.582424204535409e-05, "loss": 0.1799, "step": 9276 }, { "epoch": 1.9387669801462906, "grad_norm": 0.8532757351674161, "learning_rate": 1.5823324975262457e-05, "loss": 0.1485, "step": 9277 }, { "epoch": 1.9389759665621735, "grad_norm": 0.8184203893086337, "learning_rate": 1.5822407831059812e-05, "loss": 0.15, "step": 9278 }, { "epoch": 1.9391849529780565, "grad_norm": 0.9182523310325775, "learning_rate": 1.5821490612757827e-05, "loss": 0.1807, "step": 9279 }, { "epoch": 1.9393939393939394, "grad_norm": 0.8845572354422199, "learning_rate": 1.582057332036817e-05, "loss": 0.1285, "step": 9280 }, { "epoch": 1.9396029258098224, "grad_norm": 0.9731027670822694, "learning_rate": 1.5819655953902526e-05, "loss": 0.1708, "step": 9281 }, { "epoch": 1.9398119122257054, "grad_norm": 0.7247376201423104, "learning_rate": 1.5818738513372553e-05, "loss": 0.1169, "step": 9282 }, { "epoch": 1.9400208986415883, "grad_norm": 0.9629057267961993, "learning_rate": 1.5817820998789943e-05, "loss": 0.1703, "step": 9283 }, { "epoch": 1.9402298850574713, "grad_norm": 0.9329268114412607, "learning_rate": 1.5816903410166363e-05, "loss": 0.1869, "step": 9284 }, { "epoch": 1.9404388714733543, "grad_norm": 0.9892070612656715, "learning_rate": 1.58159857475135e-05, "loss": 0.173, "step": 9285 }, { "epoch": 1.9406478578892372, "grad_norm": 0.7344189950086637, "learning_rate": 1.581506801084302e-05, "loss": 0.1688, "step": 9286 }, { "epoch": 1.9408568443051202, "grad_norm": 0.757627801092181, "learning_rate": 1.5814150200166616e-05, "loss": 0.1672, "step": 9287 }, { "epoch": 1.9410658307210031, "grad_norm": 0.956566777314825, "learning_rate": 1.5813232315495958e-05, "loss": 0.1982, "step": 9288 }, { "epoch": 1.941274817136886, "grad_norm": 1.1098685640333714, "learning_rate": 1.5812314356842735e-05, "loss": 0.1478, "step": 9289 }, { "epoch": 1.941483803552769, "grad_norm": 0.9341190952044963, "learning_rate": 1.5811396324218623e-05, "loss": 0.17, "step": 9290 }, { "epoch": 1.941692789968652, "grad_norm": 0.7482259114730716, "learning_rate": 1.581047821763531e-05, "loss": 0.14, "step": 9291 }, { "epoch": 1.941901776384535, "grad_norm": 1.042218403396712, "learning_rate": 1.5809560037104477e-05, "loss": 0.1438, "step": 9292 }, { "epoch": 1.942110762800418, "grad_norm": 0.8837635824538395, "learning_rate": 1.5808641782637818e-05, "loss": 0.1717, "step": 9293 }, { "epoch": 1.942319749216301, "grad_norm": 0.9975116978065797, "learning_rate": 1.5807723454247002e-05, "loss": 0.1616, "step": 9294 }, { "epoch": 1.9425287356321839, "grad_norm": 0.9998983727186926, "learning_rate": 1.5806805051943736e-05, "loss": 0.1481, "step": 9295 }, { "epoch": 1.9427377220480668, "grad_norm": 0.9727045448834755, "learning_rate": 1.5805886575739694e-05, "loss": 0.1932, "step": 9296 }, { "epoch": 1.9429467084639498, "grad_norm": 0.8007001025837519, "learning_rate": 1.5804968025646573e-05, "loss": 0.1349, "step": 9297 }, { "epoch": 1.9431556948798328, "grad_norm": 0.7588203056974061, "learning_rate": 1.5804049401676055e-05, "loss": 0.143, "step": 9298 }, { "epoch": 1.9433646812957157, "grad_norm": 0.760448470763972, "learning_rate": 1.580313070383984e-05, "loss": 0.1383, "step": 9299 }, { "epoch": 1.9435736677115987, "grad_norm": 0.762592482800573, "learning_rate": 1.5802211932149614e-05, "loss": 0.1604, "step": 9300 }, { "epoch": 1.9437826541274816, "grad_norm": 0.7812270456210899, "learning_rate": 1.5801293086617072e-05, "loss": 0.169, "step": 9301 }, { "epoch": 1.9439916405433646, "grad_norm": 0.9072030203537623, "learning_rate": 1.5800374167253907e-05, "loss": 0.1796, "step": 9302 }, { "epoch": 1.9442006269592476, "grad_norm": 0.8150996534419876, "learning_rate": 1.5799455174071817e-05, "loss": 0.1404, "step": 9303 }, { "epoch": 1.9444096133751305, "grad_norm": 0.9459810689208487, "learning_rate": 1.579853610708249e-05, "loss": 0.1738, "step": 9304 }, { "epoch": 1.9446185997910135, "grad_norm": 1.0794896988849154, "learning_rate": 1.5797616966297632e-05, "loss": 0.1862, "step": 9305 }, { "epoch": 1.9448275862068964, "grad_norm": 0.9386763775173526, "learning_rate": 1.5796697751728933e-05, "loss": 0.1647, "step": 9306 }, { "epoch": 1.9450365726227794, "grad_norm": 0.8089961944112111, "learning_rate": 1.5795778463388092e-05, "loss": 0.1449, "step": 9307 }, { "epoch": 1.9452455590386624, "grad_norm": 1.1465426981410394, "learning_rate": 1.5794859101286815e-05, "loss": 0.1707, "step": 9308 }, { "epoch": 1.9454545454545453, "grad_norm": 0.9045788204158888, "learning_rate": 1.5793939665436796e-05, "loss": 0.1553, "step": 9309 }, { "epoch": 1.9456635318704283, "grad_norm": 0.8020086854781381, "learning_rate": 1.5793020155849736e-05, "loss": 0.1538, "step": 9310 }, { "epoch": 1.9458725182863112, "grad_norm": 1.2677252355967241, "learning_rate": 1.5792100572537342e-05, "loss": 0.1539, "step": 9311 }, { "epoch": 1.9460815047021942, "grad_norm": 0.8561982355449012, "learning_rate": 1.5791180915511317e-05, "loss": 0.1642, "step": 9312 }, { "epoch": 1.9462904911180772, "grad_norm": 0.7737098034359955, "learning_rate": 1.5790261184783355e-05, "loss": 0.1603, "step": 9313 }, { "epoch": 1.9464994775339601, "grad_norm": 0.7814020769183698, "learning_rate": 1.5789341380365173e-05, "loss": 0.1424, "step": 9314 }, { "epoch": 1.9467084639498433, "grad_norm": 0.81445507173787, "learning_rate": 1.5788421502268474e-05, "loss": 0.1594, "step": 9315 }, { "epoch": 1.9469174503657263, "grad_norm": 0.7422105820827329, "learning_rate": 1.5787501550504965e-05, "loss": 0.1633, "step": 9316 }, { "epoch": 1.9471264367816092, "grad_norm": 0.908338572492469, "learning_rate": 1.5786581525086346e-05, "loss": 0.1945, "step": 9317 }, { "epoch": 1.9473354231974922, "grad_norm": 0.8820242527252786, "learning_rate": 1.578566142602434e-05, "loss": 0.164, "step": 9318 }, { "epoch": 1.9475444096133752, "grad_norm": 0.8685585824407401, "learning_rate": 1.578474125333064e-05, "loss": 0.1502, "step": 9319 }, { "epoch": 1.9477533960292581, "grad_norm": 0.874711578332324, "learning_rate": 1.5783821007016974e-05, "loss": 0.1556, "step": 9320 }, { "epoch": 1.947962382445141, "grad_norm": 0.8135001560110514, "learning_rate": 1.578290068709504e-05, "loss": 0.1444, "step": 9321 }, { "epoch": 1.948171368861024, "grad_norm": 0.7903986581650139, "learning_rate": 1.578198029357656e-05, "loss": 0.1356, "step": 9322 }, { "epoch": 1.948380355276907, "grad_norm": 0.9085476036352299, "learning_rate": 1.578105982647324e-05, "loss": 0.1558, "step": 9323 }, { "epoch": 1.94858934169279, "grad_norm": 0.8789417354036971, "learning_rate": 1.5780139285796795e-05, "loss": 0.1795, "step": 9324 }, { "epoch": 1.948798328108673, "grad_norm": 0.8328605933351689, "learning_rate": 1.5779218671558948e-05, "loss": 0.1556, "step": 9325 }, { "epoch": 1.949007314524556, "grad_norm": 1.368222505927744, "learning_rate": 1.5778297983771407e-05, "loss": 0.1467, "step": 9326 }, { "epoch": 1.9492163009404389, "grad_norm": 0.9375324858988606, "learning_rate": 1.577737722244589e-05, "loss": 0.2095, "step": 9327 }, { "epoch": 1.9494252873563218, "grad_norm": 0.7875805383124238, "learning_rate": 1.5776456387594125e-05, "loss": 0.1595, "step": 9328 }, { "epoch": 1.9496342737722048, "grad_norm": 1.0782281865821974, "learning_rate": 1.5775535479227817e-05, "loss": 0.1795, "step": 9329 }, { "epoch": 1.9498432601880877, "grad_norm": 0.7913003906218342, "learning_rate": 1.5774614497358697e-05, "loss": 0.1429, "step": 9330 }, { "epoch": 1.9500522466039707, "grad_norm": 0.926535722858762, "learning_rate": 1.5773693441998477e-05, "loss": 0.1731, "step": 9331 }, { "epoch": 1.9502612330198537, "grad_norm": 0.7649719224804529, "learning_rate": 1.5772772313158885e-05, "loss": 0.1589, "step": 9332 }, { "epoch": 1.9504702194357368, "grad_norm": 0.7908479091446277, "learning_rate": 1.5771851110851642e-05, "loss": 0.1403, "step": 9333 }, { "epoch": 1.9506792058516198, "grad_norm": 0.7868358022073424, "learning_rate": 1.5770929835088473e-05, "loss": 0.1989, "step": 9334 }, { "epoch": 1.9508881922675028, "grad_norm": 0.7648339460038046, "learning_rate": 1.5770008485881102e-05, "loss": 0.1616, "step": 9335 }, { "epoch": 1.9510971786833857, "grad_norm": 0.8707202959358459, "learning_rate": 1.5769087063241254e-05, "loss": 0.1737, "step": 9336 }, { "epoch": 1.9513061650992687, "grad_norm": 1.0349974875881396, "learning_rate": 1.5768165567180655e-05, "loss": 0.1573, "step": 9337 }, { "epoch": 1.9515151515151516, "grad_norm": 1.0732938158842993, "learning_rate": 1.5767243997711033e-05, "loss": 0.2043, "step": 9338 }, { "epoch": 1.9517241379310346, "grad_norm": 0.8717471146661873, "learning_rate": 1.576632235484412e-05, "loss": 0.1771, "step": 9339 }, { "epoch": 1.9519331243469176, "grad_norm": 0.7728810204118362, "learning_rate": 1.576540063859164e-05, "loss": 0.1182, "step": 9340 }, { "epoch": 1.9521421107628005, "grad_norm": 0.8866744771793024, "learning_rate": 1.5764478848965325e-05, "loss": 0.1566, "step": 9341 }, { "epoch": 1.9523510971786835, "grad_norm": 0.9156614737063723, "learning_rate": 1.576355698597691e-05, "loss": 0.1732, "step": 9342 }, { "epoch": 1.9525600835945665, "grad_norm": 0.9319169819869808, "learning_rate": 1.576263504963812e-05, "loss": 0.1416, "step": 9343 }, { "epoch": 1.9527690700104494, "grad_norm": 1.044766216958065, "learning_rate": 1.576171303996069e-05, "loss": 0.1811, "step": 9344 }, { "epoch": 1.9529780564263324, "grad_norm": 0.8318510886816739, "learning_rate": 1.576079095695636e-05, "loss": 0.1622, "step": 9345 }, { "epoch": 1.9531870428422153, "grad_norm": 0.903484269655459, "learning_rate": 1.575986880063686e-05, "loss": 0.1462, "step": 9346 }, { "epoch": 1.9533960292580983, "grad_norm": 0.7123653781713707, "learning_rate": 1.5758946571013924e-05, "loss": 0.157, "step": 9347 }, { "epoch": 1.9536050156739813, "grad_norm": 0.842236336269073, "learning_rate": 1.5758024268099294e-05, "loss": 0.1493, "step": 9348 }, { "epoch": 1.9538140020898642, "grad_norm": 0.9958026646054816, "learning_rate": 1.5757101891904705e-05, "loss": 0.1662, "step": 9349 }, { "epoch": 1.9540229885057472, "grad_norm": 1.0282860307471242, "learning_rate": 1.57561794424419e-05, "loss": 0.1529, "step": 9350 }, { "epoch": 1.9542319749216301, "grad_norm": 0.9897319105547976, "learning_rate": 1.575525691972261e-05, "loss": 0.1687, "step": 9351 }, { "epoch": 1.954440961337513, "grad_norm": 0.8322063259513887, "learning_rate": 1.575433432375858e-05, "loss": 0.1591, "step": 9352 }, { "epoch": 1.954649947753396, "grad_norm": 0.9184082957535687, "learning_rate": 1.5753411654561553e-05, "loss": 0.1574, "step": 9353 }, { "epoch": 1.954858934169279, "grad_norm": 0.9504004728823895, "learning_rate": 1.575248891214327e-05, "loss": 0.1524, "step": 9354 }, { "epoch": 1.955067920585162, "grad_norm": 1.127866160716476, "learning_rate": 1.5751566096515475e-05, "loss": 0.1471, "step": 9355 }, { "epoch": 1.955276907001045, "grad_norm": 1.0253514688857612, "learning_rate": 1.5750643207689913e-05, "loss": 0.1714, "step": 9356 }, { "epoch": 1.955485893416928, "grad_norm": 0.9108680121238655, "learning_rate": 1.5749720245678326e-05, "loss": 0.1876, "step": 9357 }, { "epoch": 1.9556948798328109, "grad_norm": 0.9550179186915505, "learning_rate": 1.5748797210492464e-05, "loss": 0.1548, "step": 9358 }, { "epoch": 1.9559038662486938, "grad_norm": 1.014989233894782, "learning_rate": 1.5747874102144073e-05, "loss": 0.1615, "step": 9359 }, { "epoch": 1.9561128526645768, "grad_norm": 0.7361064530465585, "learning_rate": 1.5746950920644898e-05, "loss": 0.1407, "step": 9360 }, { "epoch": 1.9563218390804598, "grad_norm": 0.9386681233014258, "learning_rate": 1.574602766600669e-05, "loss": 0.1926, "step": 9361 }, { "epoch": 1.9565308254963427, "grad_norm": 0.8070944837222424, "learning_rate": 1.5745104338241198e-05, "loss": 0.1531, "step": 9362 }, { "epoch": 1.9567398119122257, "grad_norm": 0.7412467168827175, "learning_rate": 1.5744180937360178e-05, "loss": 0.164, "step": 9363 }, { "epoch": 1.9569487983281086, "grad_norm": 0.786382441121071, "learning_rate": 1.5743257463375375e-05, "loss": 0.144, "step": 9364 }, { "epoch": 1.9571577847439916, "grad_norm": 0.7939747145987804, "learning_rate": 1.5742333916298545e-05, "loss": 0.1445, "step": 9365 }, { "epoch": 1.9573667711598746, "grad_norm": 0.8652632629032486, "learning_rate": 1.5741410296141443e-05, "loss": 0.1886, "step": 9366 }, { "epoch": 1.9575757575757575, "grad_norm": 0.8991339630894427, "learning_rate": 1.5740486602915814e-05, "loss": 0.1538, "step": 9367 }, { "epoch": 1.9577847439916405, "grad_norm": 1.103915590317655, "learning_rate": 1.573956283663343e-05, "loss": 0.1734, "step": 9368 }, { "epoch": 1.9579937304075234, "grad_norm": 0.8655849517225599, "learning_rate": 1.5738638997306034e-05, "loss": 0.1348, "step": 9369 }, { "epoch": 1.9582027168234064, "grad_norm": 1.1877660666965297, "learning_rate": 1.5737715084945385e-05, "loss": 0.1759, "step": 9370 }, { "epoch": 1.9584117032392894, "grad_norm": 0.7787143052201617, "learning_rate": 1.5736791099563248e-05, "loss": 0.142, "step": 9371 }, { "epoch": 1.9586206896551723, "grad_norm": 1.102308150473613, "learning_rate": 1.5735867041171377e-05, "loss": 0.1746, "step": 9372 }, { "epoch": 1.9588296760710553, "grad_norm": 0.8050808200195306, "learning_rate": 1.5734942909781533e-05, "loss": 0.161, "step": 9373 }, { "epoch": 1.9590386624869383, "grad_norm": 1.0171378500691048, "learning_rate": 1.573401870540548e-05, "loss": 0.1745, "step": 9374 }, { "epoch": 1.9592476489028212, "grad_norm": 0.8381829569331414, "learning_rate": 1.573309442805497e-05, "loss": 0.1391, "step": 9375 }, { "epoch": 1.9594566353187042, "grad_norm": 1.0949487962277065, "learning_rate": 1.573217007774178e-05, "loss": 0.1821, "step": 9376 }, { "epoch": 1.9596656217345871, "grad_norm": 0.9622231628356633, "learning_rate": 1.5731245654477665e-05, "loss": 0.1779, "step": 9377 }, { "epoch": 1.95987460815047, "grad_norm": 1.1031220588497994, "learning_rate": 1.573032115827439e-05, "loss": 0.1582, "step": 9378 }, { "epoch": 1.960083594566353, "grad_norm": 1.0923032126791028, "learning_rate": 1.5729396589143722e-05, "loss": 0.1904, "step": 9379 }, { "epoch": 1.960292580982236, "grad_norm": 1.0740202599503128, "learning_rate": 1.572847194709743e-05, "loss": 0.1587, "step": 9380 }, { "epoch": 1.960501567398119, "grad_norm": 0.7939642312293911, "learning_rate": 1.572754723214728e-05, "loss": 0.151, "step": 9381 }, { "epoch": 1.960710553814002, "grad_norm": 0.8912430379713666, "learning_rate": 1.5726622444305037e-05, "loss": 0.1621, "step": 9382 }, { "epoch": 1.960919540229885, "grad_norm": 0.9173159489869642, "learning_rate": 1.5725697583582474e-05, "loss": 0.197, "step": 9383 }, { "epoch": 1.9611285266457679, "grad_norm": 0.9567138640519687, "learning_rate": 1.572477264999136e-05, "loss": 0.19, "step": 9384 }, { "epoch": 1.9613375130616508, "grad_norm": 0.8818972713676853, "learning_rate": 1.572384764354347e-05, "loss": 0.1771, "step": 9385 }, { "epoch": 1.961546499477534, "grad_norm": 0.8820893795373063, "learning_rate": 1.572292256425057e-05, "loss": 0.1585, "step": 9386 }, { "epoch": 1.961755485893417, "grad_norm": 0.7602185273639951, "learning_rate": 1.5721997412124434e-05, "loss": 0.1483, "step": 9387 }, { "epoch": 1.9619644723093, "grad_norm": 0.7241182397561173, "learning_rate": 1.572107218717684e-05, "loss": 0.1394, "step": 9388 }, { "epoch": 1.962173458725183, "grad_norm": 0.9177664761164325, "learning_rate": 1.572014688941956e-05, "loss": 0.1339, "step": 9389 }, { "epoch": 1.9623824451410659, "grad_norm": 0.8583119960089117, "learning_rate": 1.5719221518864375e-05, "loss": 0.1566, "step": 9390 }, { "epoch": 1.9625914315569488, "grad_norm": 0.936026053976846, "learning_rate": 1.5718296075523057e-05, "loss": 0.1472, "step": 9391 }, { "epoch": 1.9628004179728318, "grad_norm": 1.0033271403589934, "learning_rate": 1.571737055940738e-05, "loss": 0.1597, "step": 9392 }, { "epoch": 1.9630094043887147, "grad_norm": 0.9382282818484701, "learning_rate": 1.5716444970529132e-05, "loss": 0.1364, "step": 9393 }, { "epoch": 1.9632183908045977, "grad_norm": 0.9672019063606672, "learning_rate": 1.5715519308900086e-05, "loss": 0.181, "step": 9394 }, { "epoch": 1.9634273772204807, "grad_norm": 0.8588156278150988, "learning_rate": 1.571459357453202e-05, "loss": 0.1757, "step": 9395 }, { "epoch": 1.9636363636363636, "grad_norm": 0.9571769076347908, "learning_rate": 1.5713667767436724e-05, "loss": 0.1821, "step": 9396 }, { "epoch": 1.9638453500522466, "grad_norm": 0.8481012568390119, "learning_rate": 1.5712741887625976e-05, "loss": 0.1314, "step": 9397 }, { "epoch": 1.9640543364681295, "grad_norm": 0.9165498112558214, "learning_rate": 1.5711815935111562e-05, "loss": 0.1864, "step": 9398 }, { "epoch": 1.9642633228840125, "grad_norm": 1.123841757593654, "learning_rate": 1.5710889909905257e-05, "loss": 0.1626, "step": 9399 }, { "epoch": 1.9644723092998955, "grad_norm": 0.884877045391081, "learning_rate": 1.5709963812018857e-05, "loss": 0.1721, "step": 9400 }, { "epoch": 1.9646812957157784, "grad_norm": 0.8617092047979787, "learning_rate": 1.5709037641464142e-05, "loss": 0.1653, "step": 9401 }, { "epoch": 1.9648902821316614, "grad_norm": 0.9515827506781855, "learning_rate": 1.5708111398252902e-05, "loss": 0.1628, "step": 9402 }, { "epoch": 1.9650992685475446, "grad_norm": 0.9496722882921547, "learning_rate": 1.5707185082396922e-05, "loss": 0.1479, "step": 9403 }, { "epoch": 1.9653082549634275, "grad_norm": 1.0231509851444172, "learning_rate": 1.5706258693907992e-05, "loss": 0.1464, "step": 9404 }, { "epoch": 1.9655172413793105, "grad_norm": 0.9487674567616772, "learning_rate": 1.5705332232797903e-05, "loss": 0.1378, "step": 9405 }, { "epoch": 1.9657262277951935, "grad_norm": 0.9303676096737868, "learning_rate": 1.570440569907844e-05, "loss": 0.168, "step": 9406 }, { "epoch": 1.9659352142110764, "grad_norm": 0.9175583552120902, "learning_rate": 1.570347909276141e-05, "loss": 0.1514, "step": 9407 }, { "epoch": 1.9661442006269594, "grad_norm": 1.140475116078833, "learning_rate": 1.5702552413858587e-05, "loss": 0.1816, "step": 9408 }, { "epoch": 1.9663531870428423, "grad_norm": 0.8423239881740603, "learning_rate": 1.5701625662381778e-05, "loss": 0.1701, "step": 9409 }, { "epoch": 1.9665621734587253, "grad_norm": 0.9790521883082537, "learning_rate": 1.5700698838342763e-05, "loss": 0.1637, "step": 9410 }, { "epoch": 1.9667711598746083, "grad_norm": 0.8416664575248222, "learning_rate": 1.5699771941753354e-05, "loss": 0.1354, "step": 9411 }, { "epoch": 1.9669801462904912, "grad_norm": 1.0883069145096842, "learning_rate": 1.5698844972625334e-05, "loss": 0.2093, "step": 9412 }, { "epoch": 1.9671891327063742, "grad_norm": 0.7973631260218018, "learning_rate": 1.569791793097051e-05, "loss": 0.1521, "step": 9413 }, { "epoch": 1.9673981191222571, "grad_norm": 1.0908484661347657, "learning_rate": 1.5696990816800673e-05, "loss": 0.1744, "step": 9414 }, { "epoch": 1.96760710553814, "grad_norm": 0.9242133862309209, "learning_rate": 1.5696063630127626e-05, "loss": 0.1321, "step": 9415 }, { "epoch": 1.967816091954023, "grad_norm": 0.9955921928801014, "learning_rate": 1.5695136370963168e-05, "loss": 0.1597, "step": 9416 }, { "epoch": 1.968025078369906, "grad_norm": 0.9525357527286906, "learning_rate": 1.5694209039319098e-05, "loss": 0.1907, "step": 9417 }, { "epoch": 1.968234064785789, "grad_norm": 1.0357447784725753, "learning_rate": 1.5693281635207214e-05, "loss": 0.1714, "step": 9418 }, { "epoch": 1.968443051201672, "grad_norm": 1.0199656268078865, "learning_rate": 1.5692354158639334e-05, "loss": 0.1612, "step": 9419 }, { "epoch": 1.968652037617555, "grad_norm": 1.0556463374368947, "learning_rate": 1.5691426609627245e-05, "loss": 0.1596, "step": 9420 }, { "epoch": 1.9688610240334379, "grad_norm": 0.9403506884384573, "learning_rate": 1.5690498988182758e-05, "loss": 0.164, "step": 9421 }, { "epoch": 1.9690700104493208, "grad_norm": 1.0464374140149002, "learning_rate": 1.5689571294317678e-05, "loss": 0.1798, "step": 9422 }, { "epoch": 1.9692789968652038, "grad_norm": 1.0983898625057607, "learning_rate": 1.5688643528043814e-05, "loss": 0.2094, "step": 9423 }, { "epoch": 1.9694879832810868, "grad_norm": 1.0258717626820624, "learning_rate": 1.568771568937297e-05, "loss": 0.1431, "step": 9424 }, { "epoch": 1.9696969696969697, "grad_norm": 0.7928040385592007, "learning_rate": 1.5686787778316954e-05, "loss": 0.1556, "step": 9425 }, { "epoch": 1.9699059561128527, "grad_norm": 0.9213121402344433, "learning_rate": 1.568585979488758e-05, "loss": 0.1573, "step": 9426 }, { "epoch": 1.9701149425287356, "grad_norm": 0.8412031380641296, "learning_rate": 1.568493173909665e-05, "loss": 0.1421, "step": 9427 }, { "epoch": 1.9703239289446186, "grad_norm": 0.8847277514552647, "learning_rate": 1.568400361095598e-05, "loss": 0.1513, "step": 9428 }, { "epoch": 1.9705329153605016, "grad_norm": 0.8829642880661819, "learning_rate": 1.568307541047738e-05, "loss": 0.1661, "step": 9429 }, { "epoch": 1.9707419017763845, "grad_norm": 0.7273203691323862, "learning_rate": 1.5682147137672668e-05, "loss": 0.1586, "step": 9430 }, { "epoch": 1.9709508881922675, "grad_norm": 0.923718207742986, "learning_rate": 1.5681218792553656e-05, "loss": 0.1468, "step": 9431 }, { "epoch": 1.9711598746081505, "grad_norm": 0.7968555698124037, "learning_rate": 1.568029037513215e-05, "loss": 0.1475, "step": 9432 }, { "epoch": 1.9713688610240334, "grad_norm": 0.9191337190294925, "learning_rate": 1.5679361885419973e-05, "loss": 0.1708, "step": 9433 }, { "epoch": 1.9715778474399164, "grad_norm": 1.0434637990038789, "learning_rate": 1.5678433323428942e-05, "loss": 0.142, "step": 9434 }, { "epoch": 1.9717868338557993, "grad_norm": 0.9668048010258136, "learning_rate": 1.5677504689170876e-05, "loss": 0.1513, "step": 9435 }, { "epoch": 1.9719958202716823, "grad_norm": 1.043942336293652, "learning_rate": 1.5676575982657586e-05, "loss": 0.1778, "step": 9436 }, { "epoch": 1.9722048066875653, "grad_norm": 0.928554816562784, "learning_rate": 1.5675647203900896e-05, "loss": 0.14, "step": 9437 }, { "epoch": 1.9724137931034482, "grad_norm": 0.7959974035690349, "learning_rate": 1.567471835291263e-05, "loss": 0.1203, "step": 9438 }, { "epoch": 1.9726227795193312, "grad_norm": 0.9521916759656378, "learning_rate": 1.5673789429704597e-05, "loss": 0.1482, "step": 9439 }, { "epoch": 1.9728317659352141, "grad_norm": 0.8789182693329931, "learning_rate": 1.5672860434288633e-05, "loss": 0.1833, "step": 9440 }, { "epoch": 1.973040752351097, "grad_norm": 1.0806255518650787, "learning_rate": 1.567193136667655e-05, "loss": 0.1704, "step": 9441 }, { "epoch": 1.97324973876698, "grad_norm": 0.796838890413907, "learning_rate": 1.5671002226880182e-05, "loss": 0.1491, "step": 9442 }, { "epoch": 1.973458725182863, "grad_norm": 0.8550945977461193, "learning_rate": 1.5670073014911348e-05, "loss": 0.1558, "step": 9443 }, { "epoch": 1.973667711598746, "grad_norm": 0.9420371825566183, "learning_rate": 1.566914373078187e-05, "loss": 0.1726, "step": 9444 }, { "epoch": 1.973876698014629, "grad_norm": 0.8756512074607016, "learning_rate": 1.566821437450358e-05, "loss": 0.1899, "step": 9445 }, { "epoch": 1.974085684430512, "grad_norm": 0.8174158564085772, "learning_rate": 1.5667284946088305e-05, "loss": 0.1369, "step": 9446 }, { "epoch": 1.9742946708463949, "grad_norm": 0.8565295431454263, "learning_rate": 1.5666355445547875e-05, "loss": 0.1533, "step": 9447 }, { "epoch": 1.9745036572622778, "grad_norm": 0.8697626336473914, "learning_rate": 1.5665425872894113e-05, "loss": 0.1758, "step": 9448 }, { "epoch": 1.9747126436781608, "grad_norm": 0.8560474913483591, "learning_rate": 1.5664496228138856e-05, "loss": 0.1583, "step": 9449 }, { "epoch": 1.9749216300940438, "grad_norm": 0.9560463396479135, "learning_rate": 1.566356651129393e-05, "loss": 0.1715, "step": 9450 }, { "epoch": 1.9751306165099267, "grad_norm": 0.9170650351687215, "learning_rate": 1.5662636722371175e-05, "loss": 0.1497, "step": 9451 }, { "epoch": 1.9753396029258097, "grad_norm": 0.9768862322035268, "learning_rate": 1.566170686138241e-05, "loss": 0.1901, "step": 9452 }, { "epoch": 1.9755485893416926, "grad_norm": 1.0482209841754935, "learning_rate": 1.5660776928339486e-05, "loss": 0.1524, "step": 9453 }, { "epoch": 1.9757575757575756, "grad_norm": 1.0052043577215526, "learning_rate": 1.5659846923254225e-05, "loss": 0.1735, "step": 9454 }, { "epoch": 1.9759665621734586, "grad_norm": 0.8659740667794804, "learning_rate": 1.5658916846138467e-05, "loss": 0.1621, "step": 9455 }, { "epoch": 1.9761755485893417, "grad_norm": 0.9902222204721445, "learning_rate": 1.5657986697004052e-05, "loss": 0.1406, "step": 9456 }, { "epoch": 1.9763845350052247, "grad_norm": 1.0111677879052163, "learning_rate": 1.565705647586281e-05, "loss": 0.1683, "step": 9457 }, { "epoch": 1.9765935214211077, "grad_norm": 0.9186249931285246, "learning_rate": 1.565612618272659e-05, "loss": 0.1591, "step": 9458 }, { "epoch": 1.9768025078369906, "grad_norm": 0.8555785588250787, "learning_rate": 1.565519581760722e-05, "loss": 0.1521, "step": 9459 }, { "epoch": 1.9770114942528736, "grad_norm": 0.8138462723247469, "learning_rate": 1.565426538051655e-05, "loss": 0.1769, "step": 9460 }, { "epoch": 1.9772204806687566, "grad_norm": 0.9507363140702043, "learning_rate": 1.5653334871466417e-05, "loss": 0.173, "step": 9461 }, { "epoch": 1.9774294670846395, "grad_norm": 0.7414703817273445, "learning_rate": 1.5652404290468662e-05, "loss": 0.1172, "step": 9462 }, { "epoch": 1.9776384535005225, "grad_norm": 0.9287433159195583, "learning_rate": 1.565147363753513e-05, "loss": 0.1578, "step": 9463 }, { "epoch": 1.9778474399164054, "grad_norm": 0.9786004503482446, "learning_rate": 1.5650542912677662e-05, "loss": 0.1599, "step": 9464 }, { "epoch": 1.9780564263322884, "grad_norm": 1.051275530881646, "learning_rate": 1.5649612115908107e-05, "loss": 0.1937, "step": 9465 }, { "epoch": 1.9782654127481714, "grad_norm": 1.0544630098726513, "learning_rate": 1.5648681247238312e-05, "loss": 0.187, "step": 9466 }, { "epoch": 1.9784743991640543, "grad_norm": 0.925617047791034, "learning_rate": 1.564775030668012e-05, "loss": 0.1968, "step": 9467 }, { "epoch": 1.9786833855799373, "grad_norm": 0.8482228053883548, "learning_rate": 1.5646819294245378e-05, "loss": 0.151, "step": 9468 }, { "epoch": 1.9788923719958202, "grad_norm": 1.1228173436644788, "learning_rate": 1.5645888209945938e-05, "loss": 0.1808, "step": 9469 }, { "epoch": 1.9791013584117032, "grad_norm": 0.9961635973400526, "learning_rate": 1.5644957053793645e-05, "loss": 0.1794, "step": 9470 }, { "epoch": 1.9793103448275862, "grad_norm": 0.9016914285944904, "learning_rate": 1.5644025825800355e-05, "loss": 0.1655, "step": 9471 }, { "epoch": 1.9795193312434691, "grad_norm": 0.9109129366901213, "learning_rate": 1.564309452597792e-05, "loss": 0.2009, "step": 9472 }, { "epoch": 1.979728317659352, "grad_norm": 0.9238688067662826, "learning_rate": 1.5642163154338185e-05, "loss": 0.1889, "step": 9473 }, { "epoch": 1.9799373040752353, "grad_norm": 0.7722707952595708, "learning_rate": 1.5641231710893004e-05, "loss": 0.1299, "step": 9474 }, { "epoch": 1.9801462904911182, "grad_norm": 0.9408049030171641, "learning_rate": 1.564030019565424e-05, "loss": 0.1528, "step": 9475 }, { "epoch": 1.9803552769070012, "grad_norm": 0.9643558521264182, "learning_rate": 1.563936860863374e-05, "loss": 0.2252, "step": 9476 }, { "epoch": 1.9805642633228842, "grad_norm": 0.9146840863025291, "learning_rate": 1.563843694984336e-05, "loss": 0.125, "step": 9477 }, { "epoch": 1.9807732497387671, "grad_norm": 0.7642727031250681, "learning_rate": 1.5637505219294964e-05, "loss": 0.1279, "step": 9478 }, { "epoch": 1.98098223615465, "grad_norm": 0.7500482353399254, "learning_rate": 1.56365734170004e-05, "loss": 0.1691, "step": 9479 }, { "epoch": 1.981191222570533, "grad_norm": 1.069052233504283, "learning_rate": 1.5635641542971533e-05, "loss": 0.1855, "step": 9480 }, { "epoch": 1.981400208986416, "grad_norm": 0.8875287928700132, "learning_rate": 1.563470959722022e-05, "loss": 0.1952, "step": 9481 }, { "epoch": 1.981609195402299, "grad_norm": 0.8468660725390579, "learning_rate": 1.5633777579758328e-05, "loss": 0.2101, "step": 9482 }, { "epoch": 1.981818181818182, "grad_norm": 0.8698467425297923, "learning_rate": 1.563284549059771e-05, "loss": 0.1773, "step": 9483 }, { "epoch": 1.9820271682340649, "grad_norm": 1.1260061484149426, "learning_rate": 1.563191332975023e-05, "loss": 0.1447, "step": 9484 }, { "epoch": 1.9822361546499478, "grad_norm": 0.786184652166886, "learning_rate": 1.5630981097227752e-05, "loss": 0.1531, "step": 9485 }, { "epoch": 1.9824451410658308, "grad_norm": 1.070084585941034, "learning_rate": 1.5630048793042138e-05, "loss": 0.2208, "step": 9486 }, { "epoch": 1.9826541274817138, "grad_norm": 1.1474246997853956, "learning_rate": 1.5629116417205258e-05, "loss": 0.1726, "step": 9487 }, { "epoch": 1.9828631138975967, "grad_norm": 0.9406248161460571, "learning_rate": 1.5628183969728977e-05, "loss": 0.1998, "step": 9488 }, { "epoch": 1.9830721003134797, "grad_norm": 0.7271173443750848, "learning_rate": 1.5627251450625158e-05, "loss": 0.1632, "step": 9489 }, { "epoch": 1.9832810867293627, "grad_norm": 0.901333092808184, "learning_rate": 1.5626318859905675e-05, "loss": 0.1864, "step": 9490 }, { "epoch": 1.9834900731452456, "grad_norm": 1.013844405395288, "learning_rate": 1.562538619758239e-05, "loss": 0.1624, "step": 9491 }, { "epoch": 1.9836990595611286, "grad_norm": 0.8662223169241129, "learning_rate": 1.5624453463667174e-05, "loss": 0.1837, "step": 9492 }, { "epoch": 1.9839080459770115, "grad_norm": 0.9968224350596632, "learning_rate": 1.5623520658171903e-05, "loss": 0.204, "step": 9493 }, { "epoch": 1.9841170323928945, "grad_norm": 0.836465770385932, "learning_rate": 1.5622587781108444e-05, "loss": 0.1329, "step": 9494 }, { "epoch": 1.9843260188087775, "grad_norm": 0.7456133964384516, "learning_rate": 1.5621654832488667e-05, "loss": 0.1305, "step": 9495 }, { "epoch": 1.9845350052246604, "grad_norm": 0.8251984774698538, "learning_rate": 1.562072181232445e-05, "loss": 0.1502, "step": 9496 }, { "epoch": 1.9847439916405434, "grad_norm": 1.1118250853050973, "learning_rate": 1.5619788720627662e-05, "loss": 0.1887, "step": 9497 }, { "epoch": 1.9849529780564263, "grad_norm": 1.0271842163776623, "learning_rate": 1.5618855557410184e-05, "loss": 0.1403, "step": 9498 }, { "epoch": 1.9851619644723093, "grad_norm": 1.0492221739079115, "learning_rate": 1.5617922322683888e-05, "loss": 0.154, "step": 9499 }, { "epoch": 1.9853709508881923, "grad_norm": 0.8224287256388099, "learning_rate": 1.5616989016460652e-05, "loss": 0.1783, "step": 9500 }, { "epoch": 1.9855799373040752, "grad_norm": 0.9833908320306668, "learning_rate": 1.5616055638752355e-05, "loss": 0.1845, "step": 9501 }, { "epoch": 1.9857889237199582, "grad_norm": 0.8887927820638113, "learning_rate": 1.5615122189570872e-05, "loss": 0.1874, "step": 9502 }, { "epoch": 1.9859979101358411, "grad_norm": 1.179436433080018, "learning_rate": 1.561418866892809e-05, "loss": 0.1891, "step": 9503 }, { "epoch": 1.986206896551724, "grad_norm": 0.7998572698163998, "learning_rate": 1.5613255076835882e-05, "loss": 0.1596, "step": 9504 }, { "epoch": 1.986415882967607, "grad_norm": 0.8407826111305056, "learning_rate": 1.5612321413306132e-05, "loss": 0.1886, "step": 9505 }, { "epoch": 1.98662486938349, "grad_norm": 0.8520341335971605, "learning_rate": 1.5611387678350722e-05, "loss": 0.1482, "step": 9506 }, { "epoch": 1.986833855799373, "grad_norm": 0.9151979732485869, "learning_rate": 1.5610453871981534e-05, "loss": 0.1797, "step": 9507 }, { "epoch": 1.987042842215256, "grad_norm": 0.7585984685754523, "learning_rate": 1.5609519994210457e-05, "loss": 0.1493, "step": 9508 }, { "epoch": 1.987251828631139, "grad_norm": 0.7938583164198747, "learning_rate": 1.5608586045049373e-05, "loss": 0.1426, "step": 9509 }, { "epoch": 1.9874608150470219, "grad_norm": 0.8938992914526701, "learning_rate": 1.5607652024510167e-05, "loss": 0.1813, "step": 9510 }, { "epoch": 1.9876698014629048, "grad_norm": 0.7343573138288921, "learning_rate": 1.5606717932604726e-05, "loss": 0.1156, "step": 9511 }, { "epoch": 1.9878787878787878, "grad_norm": 0.907200446478919, "learning_rate": 1.560578376934494e-05, "loss": 0.1727, "step": 9512 }, { "epoch": 1.9880877742946708, "grad_norm": 0.9516481049719671, "learning_rate": 1.5604849534742695e-05, "loss": 0.1781, "step": 9513 }, { "epoch": 1.9882967607105537, "grad_norm": 0.858400916035228, "learning_rate": 1.5603915228809883e-05, "loss": 0.1335, "step": 9514 }, { "epoch": 1.9885057471264367, "grad_norm": 0.9142742741260963, "learning_rate": 1.5602980851558395e-05, "loss": 0.202, "step": 9515 }, { "epoch": 1.9887147335423196, "grad_norm": 0.807752986653141, "learning_rate": 1.560204640300012e-05, "loss": 0.1324, "step": 9516 }, { "epoch": 1.9889237199582026, "grad_norm": 0.9614730986260847, "learning_rate": 1.5601111883146948e-05, "loss": 0.1728, "step": 9517 }, { "epoch": 1.9891327063740856, "grad_norm": 0.9491051293595727, "learning_rate": 1.5600177292010777e-05, "loss": 0.1533, "step": 9518 }, { "epoch": 1.9893416927899685, "grad_norm": 0.9000154432749851, "learning_rate": 1.5599242629603503e-05, "loss": 0.1538, "step": 9519 }, { "epoch": 1.9895506792058515, "grad_norm": 0.8177737759544176, "learning_rate": 1.5598307895937016e-05, "loss": 0.145, "step": 9520 }, { "epoch": 1.9897596656217345, "grad_norm": 0.9267212867260906, "learning_rate": 1.5597373091023208e-05, "loss": 0.1757, "step": 9521 }, { "epoch": 1.9899686520376174, "grad_norm": 0.904053229718975, "learning_rate": 1.559643821487399e-05, "loss": 0.1429, "step": 9522 }, { "epoch": 1.9901776384535004, "grad_norm": 1.0689859147751706, "learning_rate": 1.5595503267501246e-05, "loss": 0.1671, "step": 9523 }, { "epoch": 1.9903866248693833, "grad_norm": 0.6946132082378382, "learning_rate": 1.5594568248916885e-05, "loss": 0.1291, "step": 9524 }, { "epoch": 1.9905956112852663, "grad_norm": 0.9228121922075613, "learning_rate": 1.5593633159132794e-05, "loss": 0.172, "step": 9525 }, { "epoch": 1.9908045977011493, "grad_norm": 0.8884633552989051, "learning_rate": 1.5592697998160887e-05, "loss": 0.1712, "step": 9526 }, { "epoch": 1.9910135841170324, "grad_norm": 0.9677869257770474, "learning_rate": 1.5591762766013058e-05, "loss": 0.1603, "step": 9527 }, { "epoch": 1.9912225705329154, "grad_norm": 1.077636778202083, "learning_rate": 1.559082746270121e-05, "loss": 0.1791, "step": 9528 }, { "epoch": 1.9914315569487984, "grad_norm": 0.7611314866605247, "learning_rate": 1.5589892088237248e-05, "loss": 0.173, "step": 9529 }, { "epoch": 1.9916405433646813, "grad_norm": 1.001016233343858, "learning_rate": 1.5588956642633074e-05, "loss": 0.1631, "step": 9530 }, { "epoch": 1.9918495297805643, "grad_norm": 0.9363734462663738, "learning_rate": 1.5588021125900597e-05, "loss": 0.1639, "step": 9531 }, { "epoch": 1.9920585161964472, "grad_norm": 0.8809734491811987, "learning_rate": 1.5587085538051714e-05, "loss": 0.1387, "step": 9532 }, { "epoch": 1.9922675026123302, "grad_norm": 0.7076322330701118, "learning_rate": 1.5586149879098346e-05, "loss": 0.1627, "step": 9533 }, { "epoch": 1.9924764890282132, "grad_norm": 0.8471568834754635, "learning_rate": 1.5585214149052387e-05, "loss": 0.1375, "step": 9534 }, { "epoch": 1.9926854754440961, "grad_norm": 0.7571185218615617, "learning_rate": 1.558427834792575e-05, "loss": 0.1366, "step": 9535 }, { "epoch": 1.992894461859979, "grad_norm": 0.9789669514925446, "learning_rate": 1.558334247573035e-05, "loss": 0.1587, "step": 9536 }, { "epoch": 1.993103448275862, "grad_norm": 1.2106652413108419, "learning_rate": 1.5582406532478092e-05, "loss": 0.1796, "step": 9537 }, { "epoch": 1.993312434691745, "grad_norm": 0.9395918525417818, "learning_rate": 1.5581470518180887e-05, "loss": 0.1475, "step": 9538 }, { "epoch": 1.993521421107628, "grad_norm": 0.9653923868427629, "learning_rate": 1.558053443285065e-05, "loss": 0.1545, "step": 9539 }, { "epoch": 1.993730407523511, "grad_norm": 1.5156294599314248, "learning_rate": 1.5579598276499293e-05, "loss": 0.1742, "step": 9540 }, { "epoch": 1.993939393939394, "grad_norm": 1.01324635399002, "learning_rate": 1.5578662049138732e-05, "loss": 0.1325, "step": 9541 }, { "epoch": 1.9941483803552769, "grad_norm": 0.9139339174625224, "learning_rate": 1.5577725750780877e-05, "loss": 0.1625, "step": 9542 }, { "epoch": 1.9943573667711598, "grad_norm": 0.9339643090315186, "learning_rate": 1.557678938143765e-05, "loss": 0.1661, "step": 9543 }, { "epoch": 1.994566353187043, "grad_norm": 0.9224977265447406, "learning_rate": 1.557585294112096e-05, "loss": 0.1186, "step": 9544 }, { "epoch": 1.994775339602926, "grad_norm": 0.7799399081835625, "learning_rate": 1.557491642984273e-05, "loss": 0.1454, "step": 9545 }, { "epoch": 1.994984326018809, "grad_norm": 0.79136797522715, "learning_rate": 1.5573979847614886e-05, "loss": 0.1758, "step": 9546 }, { "epoch": 1.995193312434692, "grad_norm": 0.97905733143629, "learning_rate": 1.557304319444933e-05, "loss": 0.1645, "step": 9547 }, { "epoch": 1.9954022988505749, "grad_norm": 1.0696200410959231, "learning_rate": 1.5572106470357998e-05, "loss": 0.1563, "step": 9548 }, { "epoch": 1.9956112852664578, "grad_norm": 0.9392566791449252, "learning_rate": 1.5571169675352802e-05, "loss": 0.139, "step": 9549 }, { "epoch": 1.9958202716823408, "grad_norm": 0.9441116572478876, "learning_rate": 1.557023280944567e-05, "loss": 0.1413, "step": 9550 }, { "epoch": 1.9960292580982237, "grad_norm": 0.8389288286476746, "learning_rate": 1.556929587264852e-05, "loss": 0.1301, "step": 9551 }, { "epoch": 1.9962382445141067, "grad_norm": 0.7997676096210007, "learning_rate": 1.5568358864973278e-05, "loss": 0.1403, "step": 9552 }, { "epoch": 1.9964472309299897, "grad_norm": 0.8200363243637078, "learning_rate": 1.5567421786431874e-05, "loss": 0.1388, "step": 9553 }, { "epoch": 1.9966562173458726, "grad_norm": 0.978442232747012, "learning_rate": 1.5566484637036224e-05, "loss": 0.1766, "step": 9554 }, { "epoch": 1.9968652037617556, "grad_norm": 0.8069015793267835, "learning_rate": 1.5565547416798263e-05, "loss": 0.1605, "step": 9555 }, { "epoch": 1.9970741901776385, "grad_norm": 0.8951158268290618, "learning_rate": 1.5564610125729912e-05, "loss": 0.17, "step": 9556 }, { "epoch": 1.9972831765935215, "grad_norm": 0.693326060136325, "learning_rate": 1.556367276384311e-05, "loss": 0.1137, "step": 9557 }, { "epoch": 1.9974921630094045, "grad_norm": 0.8042810975456998, "learning_rate": 1.5562735331149772e-05, "loss": 0.1469, "step": 9558 }, { "epoch": 1.9977011494252874, "grad_norm": 0.8280407926605232, "learning_rate": 1.5561797827661838e-05, "loss": 0.1392, "step": 9559 }, { "epoch": 1.9979101358411704, "grad_norm": 0.8795751306384962, "learning_rate": 1.556086025339124e-05, "loss": 0.1852, "step": 9560 }, { "epoch": 1.9981191222570533, "grad_norm": 0.8674468166136312, "learning_rate": 1.5559922608349905e-05, "loss": 0.1752, "step": 9561 }, { "epoch": 1.9983281086729363, "grad_norm": 0.81413609776933, "learning_rate": 1.5558984892549764e-05, "loss": 0.1306, "step": 9562 }, { "epoch": 1.9985370950888193, "grad_norm": 0.8984291855924522, "learning_rate": 1.555804710600276e-05, "loss": 0.1304, "step": 9563 }, { "epoch": 1.9987460815047022, "grad_norm": 0.9128942879456462, "learning_rate": 1.5557109248720822e-05, "loss": 0.1643, "step": 9564 }, { "epoch": 1.9989550679205852, "grad_norm": 0.8623953638244178, "learning_rate": 1.5556171320715886e-05, "loss": 0.1502, "step": 9565 }, { "epoch": 1.9991640543364682, "grad_norm": 0.7532352977021617, "learning_rate": 1.555523332199989e-05, "loss": 0.1474, "step": 9566 }, { "epoch": 1.9993730407523511, "grad_norm": 1.0514767719152471, "learning_rate": 1.555429525258477e-05, "loss": 0.178, "step": 9567 }, { "epoch": 1.999582027168234, "grad_norm": 0.8673799015972332, "learning_rate": 1.5553357112482463e-05, "loss": 0.1562, "step": 9568 }, { "epoch": 1.999791013584117, "grad_norm": 0.9467476022669888, "learning_rate": 1.5552418901704912e-05, "loss": 0.1673, "step": 9569 }, { "epoch": 2.0, "grad_norm": 0.7085818587672046, "learning_rate": 1.5551480620264057e-05, "loss": 0.1217, "step": 9570 }, { "epoch": 2.000208986415883, "grad_norm": 1.3573290316060307, "learning_rate": 1.555054226817184e-05, "loss": 0.1071, "step": 9571 }, { "epoch": 2.000417972831766, "grad_norm": 0.837086931987871, "learning_rate": 1.5549603845440192e-05, "loss": 0.1196, "step": 9572 }, { "epoch": 2.000626959247649, "grad_norm": 0.8932399027040238, "learning_rate": 1.5548665352081072e-05, "loss": 0.137, "step": 9573 }, { "epoch": 2.000835945663532, "grad_norm": 1.5400043795896807, "learning_rate": 1.5547726788106415e-05, "loss": 0.1159, "step": 9574 }, { "epoch": 2.001044932079415, "grad_norm": 0.9414493064148507, "learning_rate": 1.554678815352817e-05, "loss": 0.1466, "step": 9575 }, { "epoch": 2.0012539184952978, "grad_norm": 0.7445751461240632, "learning_rate": 1.5545849448358273e-05, "loss": 0.0998, "step": 9576 }, { "epoch": 2.0014629049111807, "grad_norm": 0.770309522615556, "learning_rate": 1.5544910672608682e-05, "loss": 0.1234, "step": 9577 }, { "epoch": 2.0016718913270637, "grad_norm": 0.9028445879467046, "learning_rate": 1.554397182629134e-05, "loss": 0.1311, "step": 9578 }, { "epoch": 2.0018808777429467, "grad_norm": 0.9136206555984644, "learning_rate": 1.5543032909418192e-05, "loss": 0.1211, "step": 9579 }, { "epoch": 2.0020898641588296, "grad_norm": 0.9699613815264102, "learning_rate": 1.554209392200119e-05, "loss": 0.1083, "step": 9580 }, { "epoch": 2.0022988505747126, "grad_norm": 0.7635355341253399, "learning_rate": 1.554115486405229e-05, "loss": 0.1164, "step": 9581 }, { "epoch": 2.0025078369905955, "grad_norm": 0.873239245474711, "learning_rate": 1.554021573558343e-05, "loss": 0.1182, "step": 9582 }, { "epoch": 2.0027168234064785, "grad_norm": 0.9730804837366289, "learning_rate": 1.5539276536606573e-05, "loss": 0.14, "step": 9583 }, { "epoch": 2.0029258098223615, "grad_norm": 1.0622456789083805, "learning_rate": 1.553833726713367e-05, "loss": 0.1291, "step": 9584 }, { "epoch": 2.0031347962382444, "grad_norm": 0.8286474239998763, "learning_rate": 1.553739792717667e-05, "loss": 0.1056, "step": 9585 }, { "epoch": 2.0033437826541274, "grad_norm": 1.4106674273034183, "learning_rate": 1.553645851674753e-05, "loss": 0.1202, "step": 9586 }, { "epoch": 2.0035527690700103, "grad_norm": 1.2199209725552989, "learning_rate": 1.5535519035858207e-05, "loss": 0.1218, "step": 9587 }, { "epoch": 2.0037617554858933, "grad_norm": 1.1235921659866304, "learning_rate": 1.553457948452065e-05, "loss": 0.1329, "step": 9588 }, { "epoch": 2.0039707419017763, "grad_norm": 1.096944627490425, "learning_rate": 1.553363986274683e-05, "loss": 0.135, "step": 9589 }, { "epoch": 2.0041797283176592, "grad_norm": 1.0387411606429247, "learning_rate": 1.5532700170548696e-05, "loss": 0.1354, "step": 9590 }, { "epoch": 2.004388714733542, "grad_norm": 0.8733778397670083, "learning_rate": 1.553176040793821e-05, "loss": 0.0992, "step": 9591 }, { "epoch": 2.004597701149425, "grad_norm": 1.1511857402466505, "learning_rate": 1.5530820574927328e-05, "loss": 0.1392, "step": 9592 }, { "epoch": 2.004806687565308, "grad_norm": 1.3719509614532128, "learning_rate": 1.5529880671528012e-05, "loss": 0.1232, "step": 9593 }, { "epoch": 2.005015673981191, "grad_norm": 1.0387808589023493, "learning_rate": 1.552894069775223e-05, "loss": 0.1179, "step": 9594 }, { "epoch": 2.005224660397074, "grad_norm": 1.1246018780731761, "learning_rate": 1.5528000653611935e-05, "loss": 0.1399, "step": 9595 }, { "epoch": 2.005433646812957, "grad_norm": 1.0062480667658666, "learning_rate": 1.55270605391191e-05, "loss": 0.1318, "step": 9596 }, { "epoch": 2.00564263322884, "grad_norm": 1.276407689684032, "learning_rate": 1.552612035428568e-05, "loss": 0.1305, "step": 9597 }, { "epoch": 2.005851619644723, "grad_norm": 1.1731963775537728, "learning_rate": 1.5525180099123648e-05, "loss": 0.1375, "step": 9598 }, { "epoch": 2.006060606060606, "grad_norm": 0.9490292072341847, "learning_rate": 1.5524239773644965e-05, "loss": 0.1228, "step": 9599 }, { "epoch": 2.006269592476489, "grad_norm": 0.9994575428686591, "learning_rate": 1.5523299377861603e-05, "loss": 0.1233, "step": 9600 }, { "epoch": 2.006478578892372, "grad_norm": 0.8752925519321416, "learning_rate": 1.552235891178553e-05, "loss": 0.0942, "step": 9601 }, { "epoch": 2.0066875653082548, "grad_norm": 0.9403344154563456, "learning_rate": 1.552141837542871e-05, "loss": 0.1433, "step": 9602 }, { "epoch": 2.0068965517241377, "grad_norm": 0.9590970232596329, "learning_rate": 1.5520477768803114e-05, "loss": 0.1283, "step": 9603 }, { "epoch": 2.0071055381400207, "grad_norm": 0.973559070335224, "learning_rate": 1.5519537091920714e-05, "loss": 0.1321, "step": 9604 }, { "epoch": 2.0073145245559036, "grad_norm": 1.1188834457654195, "learning_rate": 1.5518596344793482e-05, "loss": 0.1206, "step": 9605 }, { "epoch": 2.007523510971787, "grad_norm": 0.805987544871154, "learning_rate": 1.5517655527433392e-05, "loss": 0.0795, "step": 9606 }, { "epoch": 2.00773249738767, "grad_norm": 0.9832027520801365, "learning_rate": 1.5516714639852414e-05, "loss": 0.1248, "step": 9607 }, { "epoch": 2.007941483803553, "grad_norm": 0.7240199789348776, "learning_rate": 1.551577368206252e-05, "loss": 0.0948, "step": 9608 }, { "epoch": 2.008150470219436, "grad_norm": 1.436483978031583, "learning_rate": 1.5514832654075694e-05, "loss": 0.1223, "step": 9609 }, { "epoch": 2.008359456635319, "grad_norm": 0.9126852709124225, "learning_rate": 1.5513891555903908e-05, "loss": 0.0856, "step": 9610 }, { "epoch": 2.008568443051202, "grad_norm": 0.8915893516655107, "learning_rate": 1.5512950387559134e-05, "loss": 0.1217, "step": 9611 }, { "epoch": 2.008777429467085, "grad_norm": 1.1662537820462298, "learning_rate": 1.5512009149053358e-05, "loss": 0.1417, "step": 9612 }, { "epoch": 2.008986415882968, "grad_norm": 1.134596353645177, "learning_rate": 1.551106784039855e-05, "loss": 0.1272, "step": 9613 }, { "epoch": 2.0091954022988507, "grad_norm": 0.8593563519537281, "learning_rate": 1.5510126461606695e-05, "loss": 0.1155, "step": 9614 }, { "epoch": 2.0094043887147337, "grad_norm": 0.765709044151074, "learning_rate": 1.550918501268978e-05, "loss": 0.105, "step": 9615 }, { "epoch": 2.0096133751306167, "grad_norm": 1.2596662037057416, "learning_rate": 1.5508243493659773e-05, "loss": 0.1446, "step": 9616 }, { "epoch": 2.0098223615464996, "grad_norm": 0.9976319264400578, "learning_rate": 1.550730190452866e-05, "loss": 0.1139, "step": 9617 }, { "epoch": 2.0100313479623826, "grad_norm": 1.0390987024398552, "learning_rate": 1.550636024530843e-05, "loss": 0.1117, "step": 9618 }, { "epoch": 2.0102403343782655, "grad_norm": 1.169554168636998, "learning_rate": 1.550541851601106e-05, "loss": 0.1214, "step": 9619 }, { "epoch": 2.0104493207941485, "grad_norm": 0.9975444689298749, "learning_rate": 1.5504476716648547e-05, "loss": 0.1188, "step": 9620 }, { "epoch": 2.0106583072100315, "grad_norm": 0.8999440290465077, "learning_rate": 1.5503534847232862e-05, "loss": 0.1058, "step": 9621 }, { "epoch": 2.0108672936259144, "grad_norm": 1.040535451410486, "learning_rate": 1.5502592907776e-05, "loss": 0.1385, "step": 9622 }, { "epoch": 2.0110762800417974, "grad_norm": 1.1903345890244896, "learning_rate": 1.5501650898289948e-05, "loss": 0.1183, "step": 9623 }, { "epoch": 2.0112852664576804, "grad_norm": 1.0964472056191912, "learning_rate": 1.5500708818786693e-05, "loss": 0.1354, "step": 9624 }, { "epoch": 2.0114942528735633, "grad_norm": 1.1563174397555906, "learning_rate": 1.5499766669278225e-05, "loss": 0.1397, "step": 9625 }, { "epoch": 2.0117032392894463, "grad_norm": 1.0652887849669268, "learning_rate": 1.549882444977654e-05, "loss": 0.1195, "step": 9626 }, { "epoch": 2.0119122257053292, "grad_norm": 1.0451230117231998, "learning_rate": 1.5497882160293613e-05, "loss": 0.1115, "step": 9627 }, { "epoch": 2.012121212121212, "grad_norm": 0.842663423864554, "learning_rate": 1.5496939800841453e-05, "loss": 0.1058, "step": 9628 }, { "epoch": 2.012330198537095, "grad_norm": 0.8263866098610585, "learning_rate": 1.5495997371432046e-05, "loss": 0.0945, "step": 9629 }, { "epoch": 2.012539184952978, "grad_norm": 1.1007484384156492, "learning_rate": 1.5495054872077385e-05, "loss": 0.1142, "step": 9630 }, { "epoch": 2.012748171368861, "grad_norm": 0.9299763091043514, "learning_rate": 1.549411230278947e-05, "loss": 0.117, "step": 9631 }, { "epoch": 2.012957157784744, "grad_norm": 0.8822043419695783, "learning_rate": 1.5493169663580293e-05, "loss": 0.1131, "step": 9632 }, { "epoch": 2.013166144200627, "grad_norm": 1.0176463203745127, "learning_rate": 1.549222695446185e-05, "loss": 0.1143, "step": 9633 }, { "epoch": 2.01337513061651, "grad_norm": 1.0036955566110943, "learning_rate": 1.5491284175446136e-05, "loss": 0.1106, "step": 9634 }, { "epoch": 2.013584117032393, "grad_norm": 1.0507415569699279, "learning_rate": 1.549034132654516e-05, "loss": 0.1307, "step": 9635 }, { "epoch": 2.013793103448276, "grad_norm": 1.0487363631308386, "learning_rate": 1.5489398407770906e-05, "loss": 0.1262, "step": 9636 }, { "epoch": 2.014002089864159, "grad_norm": 0.9951551579438024, "learning_rate": 1.5488455419135385e-05, "loss": 0.1028, "step": 9637 }, { "epoch": 2.014211076280042, "grad_norm": 1.2631574029014123, "learning_rate": 1.5487512360650596e-05, "loss": 0.1254, "step": 9638 }, { "epoch": 2.0144200626959248, "grad_norm": 0.9336731151223736, "learning_rate": 1.5486569232328537e-05, "loss": 0.1225, "step": 9639 }, { "epoch": 2.0146290491118077, "grad_norm": 0.8654430074390452, "learning_rate": 1.548562603418122e-05, "loss": 0.1005, "step": 9640 }, { "epoch": 2.0148380355276907, "grad_norm": 0.8799613137321909, "learning_rate": 1.548468276622064e-05, "loss": 0.1181, "step": 9641 }, { "epoch": 2.0150470219435737, "grad_norm": 0.957809435478872, "learning_rate": 1.5483739428458798e-05, "loss": 0.1092, "step": 9642 }, { "epoch": 2.0152560083594566, "grad_norm": 1.590343418882052, "learning_rate": 1.548279602090771e-05, "loss": 0.1077, "step": 9643 }, { "epoch": 2.0154649947753396, "grad_norm": 1.0733669053263197, "learning_rate": 1.548185254357938e-05, "loss": 0.1101, "step": 9644 }, { "epoch": 2.0156739811912225, "grad_norm": 1.0419353358793741, "learning_rate": 1.5480908996485807e-05, "loss": 0.116, "step": 9645 }, { "epoch": 2.0158829676071055, "grad_norm": 0.9543428099152063, "learning_rate": 1.547996537963901e-05, "loss": 0.1269, "step": 9646 }, { "epoch": 2.0160919540229885, "grad_norm": 0.9914943006403427, "learning_rate": 1.5479021693050993e-05, "loss": 0.1271, "step": 9647 }, { "epoch": 2.0163009404388714, "grad_norm": 1.014231432668963, "learning_rate": 1.5478077936733767e-05, "loss": 0.1272, "step": 9648 }, { "epoch": 2.0165099268547544, "grad_norm": 0.931308247893606, "learning_rate": 1.5477134110699337e-05, "loss": 0.1006, "step": 9649 }, { "epoch": 2.0167189132706373, "grad_norm": 1.1532183599841077, "learning_rate": 1.5476190214959722e-05, "loss": 0.1329, "step": 9650 }, { "epoch": 2.0169278996865203, "grad_norm": 0.9782523000834077, "learning_rate": 1.5475246249526934e-05, "loss": 0.1257, "step": 9651 }, { "epoch": 2.0171368861024033, "grad_norm": 0.8190789461396623, "learning_rate": 1.547430221441298e-05, "loss": 0.1262, "step": 9652 }, { "epoch": 2.0173458725182862, "grad_norm": 0.8322398413237091, "learning_rate": 1.5473358109629884e-05, "loss": 0.1234, "step": 9653 }, { "epoch": 2.017554858934169, "grad_norm": 1.338843187421926, "learning_rate": 1.5472413935189656e-05, "loss": 0.1117, "step": 9654 }, { "epoch": 2.017763845350052, "grad_norm": 0.9584460175724772, "learning_rate": 1.547146969110431e-05, "loss": 0.1213, "step": 9655 }, { "epoch": 2.017972831765935, "grad_norm": 1.1013475045776493, "learning_rate": 1.5470525377385865e-05, "loss": 0.1072, "step": 9656 }, { "epoch": 2.018181818181818, "grad_norm": 1.0651269304067634, "learning_rate": 1.546958099404634e-05, "loss": 0.1344, "step": 9657 }, { "epoch": 2.018390804597701, "grad_norm": 1.0980657343096862, "learning_rate": 1.5468636541097753e-05, "loss": 0.1508, "step": 9658 }, { "epoch": 2.018599791013584, "grad_norm": 0.8952486114140108, "learning_rate": 1.5467692018552125e-05, "loss": 0.1075, "step": 9659 }, { "epoch": 2.018808777429467, "grad_norm": 0.9066063205453139, "learning_rate": 1.5466747426421472e-05, "loss": 0.1118, "step": 9660 }, { "epoch": 2.01901776384535, "grad_norm": 0.9882674753672542, "learning_rate": 1.546580276471782e-05, "loss": 0.1381, "step": 9661 }, { "epoch": 2.019226750261233, "grad_norm": 1.188502898606424, "learning_rate": 1.546485803345319e-05, "loss": 0.135, "step": 9662 }, { "epoch": 2.019435736677116, "grad_norm": 0.837334643730109, "learning_rate": 1.5463913232639603e-05, "loss": 0.1168, "step": 9663 }, { "epoch": 2.019644723092999, "grad_norm": 1.1097468600126754, "learning_rate": 1.5462968362289087e-05, "loss": 0.1366, "step": 9664 }, { "epoch": 2.0198537095088818, "grad_norm": 1.2446626270263719, "learning_rate": 1.546202342241367e-05, "loss": 0.1247, "step": 9665 }, { "epoch": 2.0200626959247647, "grad_norm": 0.945108091993585, "learning_rate": 1.5461078413025367e-05, "loss": 0.0987, "step": 9666 }, { "epoch": 2.0202716823406477, "grad_norm": 0.9228633673908877, "learning_rate": 1.546013333413621e-05, "loss": 0.1245, "step": 9667 }, { "epoch": 2.0204806687565307, "grad_norm": 0.8673029279679554, "learning_rate": 1.545918818575823e-05, "loss": 0.0987, "step": 9668 }, { "epoch": 2.0206896551724136, "grad_norm": 0.8431627349192319, "learning_rate": 1.545824296790345e-05, "loss": 0.1203, "step": 9669 }, { "epoch": 2.0208986415882966, "grad_norm": 0.9683089328398247, "learning_rate": 1.5457297680583906e-05, "loss": 0.1177, "step": 9670 }, { "epoch": 2.0211076280041795, "grad_norm": 0.8743741713605702, "learning_rate": 1.5456352323811624e-05, "loss": 0.1045, "step": 9671 }, { "epoch": 2.0213166144200625, "grad_norm": 1.3481396578694431, "learning_rate": 1.5455406897598633e-05, "loss": 0.1575, "step": 9672 }, { "epoch": 2.0215256008359455, "grad_norm": 0.9146167576376926, "learning_rate": 1.5454461401956967e-05, "loss": 0.1191, "step": 9673 }, { "epoch": 2.0217345872518284, "grad_norm": 1.070196799184041, "learning_rate": 1.5453515836898664e-05, "loss": 0.1358, "step": 9674 }, { "epoch": 2.0219435736677114, "grad_norm": 0.9037723288566121, "learning_rate": 1.5452570202435748e-05, "loss": 0.1031, "step": 9675 }, { "epoch": 2.022152560083595, "grad_norm": 1.0970385117218746, "learning_rate": 1.5451624498580264e-05, "loss": 0.1252, "step": 9676 }, { "epoch": 2.0223615464994777, "grad_norm": 1.3937828405355621, "learning_rate": 1.545067872534424e-05, "loss": 0.136, "step": 9677 }, { "epoch": 2.0225705329153607, "grad_norm": 1.1227240607515792, "learning_rate": 1.5449732882739716e-05, "loss": 0.1255, "step": 9678 }, { "epoch": 2.0227795193312437, "grad_norm": 1.0766412502123033, "learning_rate": 1.544878697077873e-05, "loss": 0.1194, "step": 9679 }, { "epoch": 2.0229885057471266, "grad_norm": 1.0444237939129495, "learning_rate": 1.544784098947332e-05, "loss": 0.1257, "step": 9680 }, { "epoch": 2.0231974921630096, "grad_norm": 0.9818020038211334, "learning_rate": 1.544689493883552e-05, "loss": 0.1128, "step": 9681 }, { "epoch": 2.0234064785788926, "grad_norm": 1.038418821178723, "learning_rate": 1.544594881887737e-05, "loss": 0.1357, "step": 9682 }, { "epoch": 2.0236154649947755, "grad_norm": 1.0233411162693926, "learning_rate": 1.544500262961092e-05, "loss": 0.108, "step": 9683 }, { "epoch": 2.0238244514106585, "grad_norm": 0.9993967927683051, "learning_rate": 1.5444056371048207e-05, "loss": 0.1199, "step": 9684 }, { "epoch": 2.0240334378265414, "grad_norm": 1.1219249916524188, "learning_rate": 1.5443110043201274e-05, "loss": 0.1245, "step": 9685 }, { "epoch": 2.0242424242424244, "grad_norm": 1.0543383830291428, "learning_rate": 1.544216364608216e-05, "loss": 0.1217, "step": 9686 }, { "epoch": 2.0244514106583074, "grad_norm": 1.1553814379418212, "learning_rate": 1.5441217179702916e-05, "loss": 0.1077, "step": 9687 }, { "epoch": 2.0246603970741903, "grad_norm": 1.1650163455779994, "learning_rate": 1.544027064407558e-05, "loss": 0.1372, "step": 9688 }, { "epoch": 2.0248693834900733, "grad_norm": 1.1999122335606645, "learning_rate": 1.5439324039212207e-05, "loss": 0.1186, "step": 9689 }, { "epoch": 2.0250783699059562, "grad_norm": 1.1152514858451978, "learning_rate": 1.5438377365124836e-05, "loss": 0.1332, "step": 9690 }, { "epoch": 2.025287356321839, "grad_norm": 1.0346777224665027, "learning_rate": 1.5437430621825522e-05, "loss": 0.1069, "step": 9691 }, { "epoch": 2.025496342737722, "grad_norm": 1.164412229916442, "learning_rate": 1.5436483809326307e-05, "loss": 0.1242, "step": 9692 }, { "epoch": 2.025705329153605, "grad_norm": 1.0825294916471233, "learning_rate": 1.5435536927639245e-05, "loss": 0.14, "step": 9693 }, { "epoch": 2.025914315569488, "grad_norm": 1.1901597482610704, "learning_rate": 1.5434589976776385e-05, "loss": 0.1457, "step": 9694 }, { "epoch": 2.026123301985371, "grad_norm": 1.1296507256917403, "learning_rate": 1.543364295674978e-05, "loss": 0.0951, "step": 9695 }, { "epoch": 2.026332288401254, "grad_norm": 1.289555564530713, "learning_rate": 1.543269586757148e-05, "loss": 0.1477, "step": 9696 }, { "epoch": 2.026541274817137, "grad_norm": 0.88900235792804, "learning_rate": 1.5431748709253543e-05, "loss": 0.1234, "step": 9697 }, { "epoch": 2.02675026123302, "grad_norm": 0.8811405987851654, "learning_rate": 1.5430801481808015e-05, "loss": 0.1148, "step": 9698 }, { "epoch": 2.026959247648903, "grad_norm": 0.9289710565328995, "learning_rate": 1.5429854185246956e-05, "loss": 0.1275, "step": 9699 }, { "epoch": 2.027168234064786, "grad_norm": 0.9854973841989488, "learning_rate": 1.5428906819582422e-05, "loss": 0.1369, "step": 9700 }, { "epoch": 2.027377220480669, "grad_norm": 1.047135072263057, "learning_rate": 1.542795938482647e-05, "loss": 0.1217, "step": 9701 }, { "epoch": 2.027586206896552, "grad_norm": 0.9897896686652515, "learning_rate": 1.5427011880991155e-05, "loss": 0.1107, "step": 9702 }, { "epoch": 2.0277951933124347, "grad_norm": 1.077229276256676, "learning_rate": 1.5426064308088542e-05, "loss": 0.1266, "step": 9703 }, { "epoch": 2.0280041797283177, "grad_norm": 1.1359350279131275, "learning_rate": 1.542511666613068e-05, "loss": 0.1612, "step": 9704 }, { "epoch": 2.0282131661442007, "grad_norm": 1.0758832242682423, "learning_rate": 1.5424168955129638e-05, "loss": 0.1309, "step": 9705 }, { "epoch": 2.0284221525600836, "grad_norm": 1.0156167985685323, "learning_rate": 1.542322117509747e-05, "loss": 0.1162, "step": 9706 }, { "epoch": 2.0286311389759666, "grad_norm": 1.0222829042657482, "learning_rate": 1.5422273326046247e-05, "loss": 0.1276, "step": 9707 }, { "epoch": 2.0288401253918495, "grad_norm": 1.0078574097352377, "learning_rate": 1.5421325407988025e-05, "loss": 0.1211, "step": 9708 }, { "epoch": 2.0290491118077325, "grad_norm": 0.8854522254170667, "learning_rate": 1.5420377420934866e-05, "loss": 0.109, "step": 9709 }, { "epoch": 2.0292580982236155, "grad_norm": 1.0043187388767263, "learning_rate": 1.5419429364898842e-05, "loss": 0.1335, "step": 9710 }, { "epoch": 2.0294670846394984, "grad_norm": 1.0536964035493108, "learning_rate": 1.5418481239892012e-05, "loss": 0.1147, "step": 9711 }, { "epoch": 2.0296760710553814, "grad_norm": 0.9299962127426785, "learning_rate": 1.5417533045926445e-05, "loss": 0.1048, "step": 9712 }, { "epoch": 2.0298850574712644, "grad_norm": 1.0389180757219674, "learning_rate": 1.541658478301421e-05, "loss": 0.1053, "step": 9713 }, { "epoch": 2.0300940438871473, "grad_norm": 0.8884293893422162, "learning_rate": 1.5415636451167377e-05, "loss": 0.1181, "step": 9714 }, { "epoch": 2.0303030303030303, "grad_norm": 1.1202926911411795, "learning_rate": 1.5414688050398003e-05, "loss": 0.1283, "step": 9715 }, { "epoch": 2.0305120167189132, "grad_norm": 1.1363443152062054, "learning_rate": 1.5413739580718173e-05, "loss": 0.1383, "step": 9716 }, { "epoch": 2.030721003134796, "grad_norm": 1.2325682504669802, "learning_rate": 1.5412791042139948e-05, "loss": 0.1306, "step": 9717 }, { "epoch": 2.030929989550679, "grad_norm": 1.1202718416182613, "learning_rate": 1.5411842434675403e-05, "loss": 0.1045, "step": 9718 }, { "epoch": 2.031138975966562, "grad_norm": 1.2745206674321654, "learning_rate": 1.5410893758336613e-05, "loss": 0.1478, "step": 9719 }, { "epoch": 2.031347962382445, "grad_norm": 1.0959906503969812, "learning_rate": 1.5409945013135644e-05, "loss": 0.1252, "step": 9720 }, { "epoch": 2.031556948798328, "grad_norm": 0.9414340671719662, "learning_rate": 1.540899619908458e-05, "loss": 0.138, "step": 9721 }, { "epoch": 2.031765935214211, "grad_norm": 0.9985403118670851, "learning_rate": 1.540804731619549e-05, "loss": 0.1253, "step": 9722 }, { "epoch": 2.031974921630094, "grad_norm": 1.1262815085154314, "learning_rate": 1.540709836448045e-05, "loss": 0.0954, "step": 9723 }, { "epoch": 2.032183908045977, "grad_norm": 0.9501557469691964, "learning_rate": 1.540614934395154e-05, "loss": 0.1281, "step": 9724 }, { "epoch": 2.03239289446186, "grad_norm": 1.0521178513788478, "learning_rate": 1.5405200254620834e-05, "loss": 0.1164, "step": 9725 }, { "epoch": 2.032601880877743, "grad_norm": 1.1312482743803287, "learning_rate": 1.540425109650041e-05, "loss": 0.1055, "step": 9726 }, { "epoch": 2.032810867293626, "grad_norm": 1.029216141217667, "learning_rate": 1.5403301869602355e-05, "loss": 0.1391, "step": 9727 }, { "epoch": 2.0330198537095088, "grad_norm": 1.027706960426029, "learning_rate": 1.5402352573938743e-05, "loss": 0.1233, "step": 9728 }, { "epoch": 2.0332288401253917, "grad_norm": 1.1802795696346409, "learning_rate": 1.5401403209521656e-05, "loss": 0.1346, "step": 9729 }, { "epoch": 2.0334378265412747, "grad_norm": 1.099879198850845, "learning_rate": 1.5400453776363177e-05, "loss": 0.1206, "step": 9730 }, { "epoch": 2.0336468129571577, "grad_norm": 1.1878704925977757, "learning_rate": 1.539950427447539e-05, "loss": 0.1363, "step": 9731 }, { "epoch": 2.0338557993730406, "grad_norm": 1.0097778174688572, "learning_rate": 1.5398554703870375e-05, "loss": 0.0912, "step": 9732 }, { "epoch": 2.0340647857889236, "grad_norm": 1.2295559010930839, "learning_rate": 1.5397605064560225e-05, "loss": 0.1411, "step": 9733 }, { "epoch": 2.0342737722048065, "grad_norm": 1.2872211458419773, "learning_rate": 1.5396655356557016e-05, "loss": 0.1354, "step": 9734 }, { "epoch": 2.0344827586206895, "grad_norm": 1.058815276109369, "learning_rate": 1.539570557987284e-05, "loss": 0.127, "step": 9735 }, { "epoch": 2.0346917450365725, "grad_norm": 1.0652807809278044, "learning_rate": 1.5394755734519783e-05, "loss": 0.0986, "step": 9736 }, { "epoch": 2.0349007314524554, "grad_norm": 1.0395621222933351, "learning_rate": 1.5393805820509932e-05, "loss": 0.1438, "step": 9737 }, { "epoch": 2.0351097178683384, "grad_norm": 1.2494174116630035, "learning_rate": 1.5392855837855382e-05, "loss": 0.1351, "step": 9738 }, { "epoch": 2.0353187042842213, "grad_norm": 0.9422351924360451, "learning_rate": 1.5391905786568216e-05, "loss": 0.1132, "step": 9739 }, { "epoch": 2.0355276907001043, "grad_norm": 0.9903906075780041, "learning_rate": 1.5390955666660526e-05, "loss": 0.1193, "step": 9740 }, { "epoch": 2.0357366771159873, "grad_norm": 1.2071298339426884, "learning_rate": 1.539000547814441e-05, "loss": 0.1154, "step": 9741 }, { "epoch": 2.0359456635318702, "grad_norm": 0.8977670239030625, "learning_rate": 1.538905522103195e-05, "loss": 0.1189, "step": 9742 }, { "epoch": 2.036154649947753, "grad_norm": 1.0798550119844845, "learning_rate": 1.5388104895335252e-05, "loss": 0.1334, "step": 9743 }, { "epoch": 2.036363636363636, "grad_norm": 1.0648544560780606, "learning_rate": 1.53871545010664e-05, "loss": 0.108, "step": 9744 }, { "epoch": 2.036572622779519, "grad_norm": 0.9948233294887148, "learning_rate": 1.5386204038237496e-05, "loss": 0.132, "step": 9745 }, { "epoch": 2.036781609195402, "grad_norm": 1.072683316347435, "learning_rate": 1.5385253506860634e-05, "loss": 0.152, "step": 9746 }, { "epoch": 2.0369905956112855, "grad_norm": 0.8939057416637852, "learning_rate": 1.5384302906947907e-05, "loss": 0.1203, "step": 9747 }, { "epoch": 2.0371995820271684, "grad_norm": 1.1200813406350327, "learning_rate": 1.5383352238511418e-05, "loss": 0.1461, "step": 9748 }, { "epoch": 2.0374085684430514, "grad_norm": 0.8711938484976096, "learning_rate": 1.5382401501563265e-05, "loss": 0.1272, "step": 9749 }, { "epoch": 2.0376175548589344, "grad_norm": 1.1467979519361562, "learning_rate": 1.5381450696115544e-05, "loss": 0.1231, "step": 9750 }, { "epoch": 2.0378265412748173, "grad_norm": 1.1225485794868566, "learning_rate": 1.538049982218036e-05, "loss": 0.1256, "step": 9751 }, { "epoch": 2.0380355276907003, "grad_norm": 1.0271502046844159, "learning_rate": 1.5379548879769814e-05, "loss": 0.1288, "step": 9752 }, { "epoch": 2.0382445141065832, "grad_norm": 1.1325808580891925, "learning_rate": 1.5378597868896003e-05, "loss": 0.0975, "step": 9753 }, { "epoch": 2.038453500522466, "grad_norm": 0.9455404693311328, "learning_rate": 1.5377646789571038e-05, "loss": 0.1115, "step": 9754 }, { "epoch": 2.038662486938349, "grad_norm": 1.0924347100129637, "learning_rate": 1.5376695641807016e-05, "loss": 0.1451, "step": 9755 }, { "epoch": 2.038871473354232, "grad_norm": 1.057587410602569, "learning_rate": 1.5375744425616048e-05, "loss": 0.1263, "step": 9756 }, { "epoch": 2.039080459770115, "grad_norm": 0.9245010382748553, "learning_rate": 1.5374793141010235e-05, "loss": 0.1171, "step": 9757 }, { "epoch": 2.039289446185998, "grad_norm": 0.8794661831891614, "learning_rate": 1.5373841788001684e-05, "loss": 0.0982, "step": 9758 }, { "epoch": 2.039498432601881, "grad_norm": 1.0476142362105774, "learning_rate": 1.5372890366602503e-05, "loss": 0.1221, "step": 9759 }, { "epoch": 2.039707419017764, "grad_norm": 1.2127030988578653, "learning_rate": 1.5371938876824802e-05, "loss": 0.1119, "step": 9760 }, { "epoch": 2.039916405433647, "grad_norm": 1.0433070517083423, "learning_rate": 1.537098731868069e-05, "loss": 0.1279, "step": 9761 }, { "epoch": 2.04012539184953, "grad_norm": 1.13514148302814, "learning_rate": 1.5370035692182276e-05, "loss": 0.1435, "step": 9762 }, { "epoch": 2.040334378265413, "grad_norm": 1.1113720279465105, "learning_rate": 1.5369083997341673e-05, "loss": 0.1235, "step": 9763 }, { "epoch": 2.040543364681296, "grad_norm": 0.960001245409407, "learning_rate": 1.5368132234170986e-05, "loss": 0.1156, "step": 9764 }, { "epoch": 2.040752351097179, "grad_norm": 1.0548201250454565, "learning_rate": 1.536718040268234e-05, "loss": 0.1342, "step": 9765 }, { "epoch": 2.0409613375130617, "grad_norm": 1.0452473931001973, "learning_rate": 1.5366228502887833e-05, "loss": 0.1274, "step": 9766 }, { "epoch": 2.0411703239289447, "grad_norm": 1.1127703307509655, "learning_rate": 1.5365276534799592e-05, "loss": 0.1271, "step": 9767 }, { "epoch": 2.0413793103448277, "grad_norm": 1.4173872584254033, "learning_rate": 1.5364324498429728e-05, "loss": 0.1448, "step": 9768 }, { "epoch": 2.0415882967607106, "grad_norm": 1.1609330010196959, "learning_rate": 1.5363372393790353e-05, "loss": 0.1025, "step": 9769 }, { "epoch": 2.0417972831765936, "grad_norm": 1.033118473103153, "learning_rate": 1.5362420220893594e-05, "loss": 0.1382, "step": 9770 }, { "epoch": 2.0420062695924766, "grad_norm": 1.2908435944731715, "learning_rate": 1.5361467979751563e-05, "loss": 0.1695, "step": 9771 }, { "epoch": 2.0422152560083595, "grad_norm": 1.2601151455211255, "learning_rate": 1.5360515670376373e-05, "loss": 0.1345, "step": 9772 }, { "epoch": 2.0424242424242425, "grad_norm": 1.148999724665392, "learning_rate": 1.5359563292780157e-05, "loss": 0.1189, "step": 9773 }, { "epoch": 2.0426332288401254, "grad_norm": 1.0118579414295203, "learning_rate": 1.535861084697502e-05, "loss": 0.1052, "step": 9774 }, { "epoch": 2.0428422152560084, "grad_norm": 1.035175403723413, "learning_rate": 1.53576583329731e-05, "loss": 0.1359, "step": 9775 }, { "epoch": 2.0430512016718914, "grad_norm": 0.9100561615067517, "learning_rate": 1.5356705750786506e-05, "loss": 0.1151, "step": 9776 }, { "epoch": 2.0432601880877743, "grad_norm": 0.908945778527099, "learning_rate": 1.5355753100427365e-05, "loss": 0.0967, "step": 9777 }, { "epoch": 2.0434691745036573, "grad_norm": 1.057612614214411, "learning_rate": 1.5354800381907804e-05, "loss": 0.1278, "step": 9778 }, { "epoch": 2.0436781609195402, "grad_norm": 0.9207820191025791, "learning_rate": 1.5353847595239943e-05, "loss": 0.1131, "step": 9779 }, { "epoch": 2.043887147335423, "grad_norm": 1.1478246749465928, "learning_rate": 1.5352894740435914e-05, "loss": 0.1158, "step": 9780 }, { "epoch": 2.044096133751306, "grad_norm": 1.06528228726999, "learning_rate": 1.5351941817507835e-05, "loss": 0.1146, "step": 9781 }, { "epoch": 2.044305120167189, "grad_norm": 1.123436438611861, "learning_rate": 1.5350988826467844e-05, "loss": 0.1172, "step": 9782 }, { "epoch": 2.044514106583072, "grad_norm": 1.0547608819919199, "learning_rate": 1.535003576732806e-05, "loss": 0.1447, "step": 9783 }, { "epoch": 2.044723092998955, "grad_norm": 1.041174543558133, "learning_rate": 1.5349082640100616e-05, "loss": 0.1218, "step": 9784 }, { "epoch": 2.044932079414838, "grad_norm": 1.0227149822238981, "learning_rate": 1.534812944479764e-05, "loss": 0.1453, "step": 9785 }, { "epoch": 2.045141065830721, "grad_norm": 1.0072526201911272, "learning_rate": 1.5347176181431268e-05, "loss": 0.1003, "step": 9786 }, { "epoch": 2.045350052246604, "grad_norm": 1.075897639438433, "learning_rate": 1.5346222850013628e-05, "loss": 0.1245, "step": 9787 }, { "epoch": 2.045559038662487, "grad_norm": 0.8863040535504763, "learning_rate": 1.5345269450556854e-05, "loss": 0.0933, "step": 9788 }, { "epoch": 2.04576802507837, "grad_norm": 0.8787748302351822, "learning_rate": 1.534431598307308e-05, "loss": 0.1127, "step": 9789 }, { "epoch": 2.045977011494253, "grad_norm": 1.125276301083501, "learning_rate": 1.5343362447574434e-05, "loss": 0.1368, "step": 9790 }, { "epoch": 2.046185997910136, "grad_norm": 0.8945320539477357, "learning_rate": 1.534240884407306e-05, "loss": 0.118, "step": 9791 }, { "epoch": 2.0463949843260187, "grad_norm": 0.9962156759207259, "learning_rate": 1.5341455172581092e-05, "loss": 0.1374, "step": 9792 }, { "epoch": 2.0466039707419017, "grad_norm": 0.9011871278668699, "learning_rate": 1.534050143311066e-05, "loss": 0.1007, "step": 9793 }, { "epoch": 2.0468129571577847, "grad_norm": 1.1052118855334365, "learning_rate": 1.533954762567391e-05, "loss": 0.1146, "step": 9794 }, { "epoch": 2.0470219435736676, "grad_norm": 0.9957260324567423, "learning_rate": 1.533859375028298e-05, "loss": 0.1165, "step": 9795 }, { "epoch": 2.0472309299895506, "grad_norm": 0.9068502986268056, "learning_rate": 1.5337639806950007e-05, "loss": 0.0881, "step": 9796 }, { "epoch": 2.0474399164054335, "grad_norm": 1.1070877809762634, "learning_rate": 1.5336685795687132e-05, "loss": 0.1417, "step": 9797 }, { "epoch": 2.0476489028213165, "grad_norm": 0.8843911218754501, "learning_rate": 1.5335731716506496e-05, "loss": 0.1162, "step": 9798 }, { "epoch": 2.0478578892371995, "grad_norm": 1.2876057511462253, "learning_rate": 1.5334777569420238e-05, "loss": 0.1298, "step": 9799 }, { "epoch": 2.0480668756530824, "grad_norm": 0.8814324778250903, "learning_rate": 1.5333823354440508e-05, "loss": 0.0933, "step": 9800 }, { "epoch": 2.0482758620689654, "grad_norm": 1.116444907276727, "learning_rate": 1.533286907157945e-05, "loss": 0.1317, "step": 9801 }, { "epoch": 2.0484848484848484, "grad_norm": 0.8849814882123603, "learning_rate": 1.53319147208492e-05, "loss": 0.1229, "step": 9802 }, { "epoch": 2.0486938349007313, "grad_norm": 1.10300284657551, "learning_rate": 1.5330960302261908e-05, "loss": 0.1657, "step": 9803 }, { "epoch": 2.0489028213166143, "grad_norm": 0.9576577158351429, "learning_rate": 1.533000581582972e-05, "loss": 0.1147, "step": 9804 }, { "epoch": 2.0491118077324972, "grad_norm": 1.047207453391985, "learning_rate": 1.5329051261564792e-05, "loss": 0.1355, "step": 9805 }, { "epoch": 2.04932079414838, "grad_norm": 1.1894300792253283, "learning_rate": 1.532809663947926e-05, "loss": 0.1596, "step": 9806 }, { "epoch": 2.049529780564263, "grad_norm": 1.0819012180440648, "learning_rate": 1.532714194958528e-05, "loss": 0.0959, "step": 9807 }, { "epoch": 2.049738766980146, "grad_norm": 1.1640454612382833, "learning_rate": 1.5326187191894996e-05, "loss": 0.1241, "step": 9808 }, { "epoch": 2.049947753396029, "grad_norm": 1.0961627996848806, "learning_rate": 1.5325232366420566e-05, "loss": 0.1119, "step": 9809 }, { "epoch": 2.050156739811912, "grad_norm": 0.9163720667597791, "learning_rate": 1.5324277473174138e-05, "loss": 0.0995, "step": 9810 }, { "epoch": 2.050365726227795, "grad_norm": 0.902742838701076, "learning_rate": 1.5323322512167863e-05, "loss": 0.1172, "step": 9811 }, { "epoch": 2.050574712643678, "grad_norm": 1.0317565347411628, "learning_rate": 1.53223674834139e-05, "loss": 0.1409, "step": 9812 }, { "epoch": 2.050783699059561, "grad_norm": 0.97032126935328, "learning_rate": 1.5321412386924394e-05, "loss": 0.129, "step": 9813 }, { "epoch": 2.050992685475444, "grad_norm": 0.9607189529845475, "learning_rate": 1.532045722271151e-05, "loss": 0.131, "step": 9814 }, { "epoch": 2.051201671891327, "grad_norm": 1.1148734489048857, "learning_rate": 1.53195019907874e-05, "loss": 0.1005, "step": 9815 }, { "epoch": 2.0514106583072103, "grad_norm": 1.1721807938336972, "learning_rate": 1.531854669116422e-05, "loss": 0.1523, "step": 9816 }, { "epoch": 2.051619644723093, "grad_norm": 1.1964478833728942, "learning_rate": 1.5317591323854122e-05, "loss": 0.1336, "step": 9817 }, { "epoch": 2.051828631138976, "grad_norm": 3.247067542249928, "learning_rate": 1.5316635888869277e-05, "loss": 0.1283, "step": 9818 }, { "epoch": 2.052037617554859, "grad_norm": 1.0451822807949218, "learning_rate": 1.5315680386221835e-05, "loss": 0.1262, "step": 9819 }, { "epoch": 2.052246603970742, "grad_norm": 1.0577001623382964, "learning_rate": 1.531472481592396e-05, "loss": 0.1302, "step": 9820 }, { "epoch": 2.052455590386625, "grad_norm": 0.9779670424648149, "learning_rate": 1.5313769177987813e-05, "loss": 0.1023, "step": 9821 }, { "epoch": 2.052664576802508, "grad_norm": 0.8540430164337333, "learning_rate": 1.5312813472425558e-05, "loss": 0.1081, "step": 9822 }, { "epoch": 2.052873563218391, "grad_norm": 1.0001030931726116, "learning_rate": 1.5311857699249354e-05, "loss": 0.1221, "step": 9823 }, { "epoch": 2.053082549634274, "grad_norm": 0.903094368202298, "learning_rate": 1.5310901858471364e-05, "loss": 0.1206, "step": 9824 }, { "epoch": 2.053291536050157, "grad_norm": 1.002987746182414, "learning_rate": 1.5309945950103755e-05, "loss": 0.1139, "step": 9825 }, { "epoch": 2.05350052246604, "grad_norm": 1.2084016875408474, "learning_rate": 1.5308989974158693e-05, "loss": 0.1414, "step": 9826 }, { "epoch": 2.053709508881923, "grad_norm": 1.214426086973219, "learning_rate": 1.5308033930648343e-05, "loss": 0.1322, "step": 9827 }, { "epoch": 2.053918495297806, "grad_norm": 1.0337346174265596, "learning_rate": 1.5307077819584873e-05, "loss": 0.1176, "step": 9828 }, { "epoch": 2.0541274817136888, "grad_norm": 1.1550023290297944, "learning_rate": 1.5306121640980453e-05, "loss": 0.1269, "step": 9829 }, { "epoch": 2.0543364681295717, "grad_norm": 0.9336829335895295, "learning_rate": 1.5305165394847246e-05, "loss": 0.1233, "step": 9830 }, { "epoch": 2.0545454545454547, "grad_norm": 1.310258204959826, "learning_rate": 1.5304209081197425e-05, "loss": 0.1462, "step": 9831 }, { "epoch": 2.0547544409613376, "grad_norm": 1.1072694122210223, "learning_rate": 1.5303252700043164e-05, "loss": 0.1314, "step": 9832 }, { "epoch": 2.0549634273772206, "grad_norm": 1.042734068448299, "learning_rate": 1.530229625139663e-05, "loss": 0.1373, "step": 9833 }, { "epoch": 2.0551724137931036, "grad_norm": 1.1130026368114012, "learning_rate": 1.5301339735269998e-05, "loss": 0.1252, "step": 9834 }, { "epoch": 2.0553814002089865, "grad_norm": 0.9588569031298343, "learning_rate": 1.5300383151675436e-05, "loss": 0.1193, "step": 9835 }, { "epoch": 2.0555903866248695, "grad_norm": 1.1431617247179122, "learning_rate": 1.5299426500625124e-05, "loss": 0.1506, "step": 9836 }, { "epoch": 2.0557993730407524, "grad_norm": 1.110866752331441, "learning_rate": 1.5298469782131234e-05, "loss": 0.1307, "step": 9837 }, { "epoch": 2.0560083594566354, "grad_norm": 1.1348816744606731, "learning_rate": 1.5297512996205942e-05, "loss": 0.0925, "step": 9838 }, { "epoch": 2.0562173458725184, "grad_norm": 1.073789894207379, "learning_rate": 1.529655614286143e-05, "loss": 0.1245, "step": 9839 }, { "epoch": 2.0564263322884013, "grad_norm": 0.9824505108462758, "learning_rate": 1.5295599222109864e-05, "loss": 0.1219, "step": 9840 }, { "epoch": 2.0566353187042843, "grad_norm": 1.25745354609849, "learning_rate": 1.529464223396343e-05, "loss": 0.1352, "step": 9841 }, { "epoch": 2.0568443051201672, "grad_norm": 1.2042182355086808, "learning_rate": 1.529368517843431e-05, "loss": 0.1427, "step": 9842 }, { "epoch": 2.05705329153605, "grad_norm": 0.9583582275654171, "learning_rate": 1.529272805553468e-05, "loss": 0.1162, "step": 9843 }, { "epoch": 2.057262277951933, "grad_norm": 1.148434768784956, "learning_rate": 1.5291770865276715e-05, "loss": 0.1078, "step": 9844 }, { "epoch": 2.057471264367816, "grad_norm": 1.3381546689203911, "learning_rate": 1.529081360767261e-05, "loss": 0.138, "step": 9845 }, { "epoch": 2.057680250783699, "grad_norm": 0.9161307976054422, "learning_rate": 1.5289856282734533e-05, "loss": 0.1135, "step": 9846 }, { "epoch": 2.057889237199582, "grad_norm": 1.170504185582019, "learning_rate": 1.5288898890474682e-05, "loss": 0.1632, "step": 9847 }, { "epoch": 2.058098223615465, "grad_norm": 1.0824978294895726, "learning_rate": 1.5287941430905232e-05, "loss": 0.0917, "step": 9848 }, { "epoch": 2.058307210031348, "grad_norm": 1.0442030339738881, "learning_rate": 1.5286983904038368e-05, "loss": 0.1071, "step": 9849 }, { "epoch": 2.058516196447231, "grad_norm": 0.9392822303899055, "learning_rate": 1.528602630988628e-05, "loss": 0.113, "step": 9850 }, { "epoch": 2.058725182863114, "grad_norm": 1.0877972869227839, "learning_rate": 1.5285068648461153e-05, "loss": 0.1149, "step": 9851 }, { "epoch": 2.058934169278997, "grad_norm": 1.238771867890696, "learning_rate": 1.5284110919775173e-05, "loss": 0.1189, "step": 9852 }, { "epoch": 2.05914315569488, "grad_norm": 1.1276295842384474, "learning_rate": 1.5283153123840534e-05, "loss": 0.1355, "step": 9853 }, { "epoch": 2.059352142110763, "grad_norm": 1.0397415262772187, "learning_rate": 1.528219526066942e-05, "loss": 0.1236, "step": 9854 }, { "epoch": 2.0595611285266457, "grad_norm": 1.0993403273634796, "learning_rate": 1.528123733027403e-05, "loss": 0.1229, "step": 9855 }, { "epoch": 2.0597701149425287, "grad_norm": 1.086238257687354, "learning_rate": 1.528027933266654e-05, "loss": 0.1187, "step": 9856 }, { "epoch": 2.0599791013584117, "grad_norm": 1.0819578271403336, "learning_rate": 1.5279321267859155e-05, "loss": 0.0981, "step": 9857 }, { "epoch": 2.0601880877742946, "grad_norm": 1.2618069533970058, "learning_rate": 1.5278363135864066e-05, "loss": 0.1363, "step": 9858 }, { "epoch": 2.0603970741901776, "grad_norm": 1.0398470006188125, "learning_rate": 1.527740493669346e-05, "loss": 0.1117, "step": 9859 }, { "epoch": 2.0606060606060606, "grad_norm": 1.2659042628718562, "learning_rate": 1.5276446670359538e-05, "loss": 0.124, "step": 9860 }, { "epoch": 2.0608150470219435, "grad_norm": 1.2944601918277208, "learning_rate": 1.527548833687449e-05, "loss": 0.1375, "step": 9861 }, { "epoch": 2.0610240334378265, "grad_norm": 1.3059572369768684, "learning_rate": 1.5274529936250518e-05, "loss": 0.1507, "step": 9862 }, { "epoch": 2.0612330198537094, "grad_norm": 0.991129179872269, "learning_rate": 1.5273571468499818e-05, "loss": 0.1052, "step": 9863 }, { "epoch": 2.0614420062695924, "grad_norm": 1.1035628050684714, "learning_rate": 1.5272612933634588e-05, "loss": 0.1289, "step": 9864 }, { "epoch": 2.0616509926854754, "grad_norm": 1.0995277759989996, "learning_rate": 1.5271654331667022e-05, "loss": 0.1199, "step": 9865 }, { "epoch": 2.0618599791013583, "grad_norm": 1.0077583222196722, "learning_rate": 1.5270695662609327e-05, "loss": 0.1309, "step": 9866 }, { "epoch": 2.0620689655172413, "grad_norm": 0.8335817667150984, "learning_rate": 1.5269736926473696e-05, "loss": 0.086, "step": 9867 }, { "epoch": 2.0622779519331242, "grad_norm": 0.9841392688343837, "learning_rate": 1.5268778123272335e-05, "loss": 0.1291, "step": 9868 }, { "epoch": 2.062486938349007, "grad_norm": 1.1846105556578013, "learning_rate": 1.526781925301745e-05, "loss": 0.1451, "step": 9869 }, { "epoch": 2.06269592476489, "grad_norm": 1.1869020058110045, "learning_rate": 1.526686031572124e-05, "loss": 0.1096, "step": 9870 }, { "epoch": 2.062904911180773, "grad_norm": 1.32963479180819, "learning_rate": 1.5265901311395907e-05, "loss": 0.131, "step": 9871 }, { "epoch": 2.063113897596656, "grad_norm": 1.2449465152502108, "learning_rate": 1.5264942240053655e-05, "loss": 0.1498, "step": 9872 }, { "epoch": 2.063322884012539, "grad_norm": 1.1288309686055849, "learning_rate": 1.5263983101706693e-05, "loss": 0.1162, "step": 9873 }, { "epoch": 2.063531870428422, "grad_norm": 1.1574769655076038, "learning_rate": 1.5263023896367228e-05, "loss": 0.1106, "step": 9874 }, { "epoch": 2.063740856844305, "grad_norm": 1.0375964984165, "learning_rate": 1.5262064624047467e-05, "loss": 0.1314, "step": 9875 }, { "epoch": 2.063949843260188, "grad_norm": 1.1597663845882176, "learning_rate": 1.5261105284759618e-05, "loss": 0.1271, "step": 9876 }, { "epoch": 2.064158829676071, "grad_norm": 0.9306022109495076, "learning_rate": 1.526014587851589e-05, "loss": 0.1231, "step": 9877 }, { "epoch": 2.064367816091954, "grad_norm": 1.1945057569316844, "learning_rate": 1.525918640532849e-05, "loss": 0.1488, "step": 9878 }, { "epoch": 2.064576802507837, "grad_norm": 1.0384962388629544, "learning_rate": 1.5258226865209638e-05, "loss": 0.1217, "step": 9879 }, { "epoch": 2.06478578892372, "grad_norm": 1.0651542236234781, "learning_rate": 1.5257267258171533e-05, "loss": 0.1539, "step": 9880 }, { "epoch": 2.0649947753396027, "grad_norm": 1.0553449754344852, "learning_rate": 1.5256307584226395e-05, "loss": 0.1288, "step": 9881 }, { "epoch": 2.0652037617554857, "grad_norm": 1.059953526500344, "learning_rate": 1.525534784338644e-05, "loss": 0.1203, "step": 9882 }, { "epoch": 2.0654127481713687, "grad_norm": 1.1555511017180105, "learning_rate": 1.5254388035663871e-05, "loss": 0.149, "step": 9883 }, { "epoch": 2.0656217345872516, "grad_norm": 1.0752910230274482, "learning_rate": 1.5253428161070915e-05, "loss": 0.1135, "step": 9884 }, { "epoch": 2.0658307210031346, "grad_norm": 1.2913489029223406, "learning_rate": 1.5252468219619788e-05, "loss": 0.1268, "step": 9885 }, { "epoch": 2.0660397074190175, "grad_norm": 1.1214364697734793, "learning_rate": 1.5251508211322692e-05, "loss": 0.11, "step": 9886 }, { "epoch": 2.0662486938349005, "grad_norm": 0.9356541648239596, "learning_rate": 1.5250548136191863e-05, "loss": 0.1221, "step": 9887 }, { "epoch": 2.066457680250784, "grad_norm": 1.0405847548942926, "learning_rate": 1.5249587994239506e-05, "loss": 0.1123, "step": 9888 }, { "epoch": 2.066666666666667, "grad_norm": 0.917782756337781, "learning_rate": 1.5248627785477847e-05, "loss": 0.1172, "step": 9889 }, { "epoch": 2.06687565308255, "grad_norm": 0.9865846429266419, "learning_rate": 1.5247667509919104e-05, "loss": 0.1127, "step": 9890 }, { "epoch": 2.067084639498433, "grad_norm": 1.0135577219244662, "learning_rate": 1.5246707167575502e-05, "loss": 0.1193, "step": 9891 }, { "epoch": 2.0672936259143158, "grad_norm": 0.9014916002914618, "learning_rate": 1.5245746758459254e-05, "loss": 0.1051, "step": 9892 }, { "epoch": 2.0675026123301987, "grad_norm": 1.2004314614451563, "learning_rate": 1.5244786282582594e-05, "loss": 0.1366, "step": 9893 }, { "epoch": 2.0677115987460817, "grad_norm": 1.1003676655833146, "learning_rate": 1.5243825739957737e-05, "loss": 0.1512, "step": 9894 }, { "epoch": 2.0679205851619646, "grad_norm": 1.0644425960511774, "learning_rate": 1.5242865130596909e-05, "loss": 0.1351, "step": 9895 }, { "epoch": 2.0681295715778476, "grad_norm": 1.2108485441244288, "learning_rate": 1.5241904454512338e-05, "loss": 0.1284, "step": 9896 }, { "epoch": 2.0683385579937306, "grad_norm": 0.967877462134615, "learning_rate": 1.5240943711716252e-05, "loss": 0.1121, "step": 9897 }, { "epoch": 2.0685475444096135, "grad_norm": 1.005539657535395, "learning_rate": 1.523998290222087e-05, "loss": 0.1077, "step": 9898 }, { "epoch": 2.0687565308254965, "grad_norm": 1.108474873857037, "learning_rate": 1.5239022026038426e-05, "loss": 0.1142, "step": 9899 }, { "epoch": 2.0689655172413794, "grad_norm": 1.0528078962273706, "learning_rate": 1.5238061083181147e-05, "loss": 0.1292, "step": 9900 }, { "epoch": 2.0691745036572624, "grad_norm": 1.1622120956643243, "learning_rate": 1.5237100073661263e-05, "loss": 0.1257, "step": 9901 }, { "epoch": 2.0693834900731454, "grad_norm": 0.9327652665465807, "learning_rate": 1.5236138997491003e-05, "loss": 0.119, "step": 9902 }, { "epoch": 2.0695924764890283, "grad_norm": 1.1228973063780834, "learning_rate": 1.5235177854682602e-05, "loss": 0.1203, "step": 9903 }, { "epoch": 2.0698014629049113, "grad_norm": 1.0017173602386067, "learning_rate": 1.5234216645248286e-05, "loss": 0.1203, "step": 9904 }, { "epoch": 2.0700104493207943, "grad_norm": 1.0658826042782426, "learning_rate": 1.5233255369200293e-05, "loss": 0.1257, "step": 9905 }, { "epoch": 2.070219435736677, "grad_norm": 1.0583505264662563, "learning_rate": 1.5232294026550856e-05, "loss": 0.1173, "step": 9906 }, { "epoch": 2.07042842215256, "grad_norm": 1.3381238941670837, "learning_rate": 1.5231332617312205e-05, "loss": 0.1346, "step": 9907 }, { "epoch": 2.070637408568443, "grad_norm": 1.1443337114041299, "learning_rate": 1.5230371141496583e-05, "loss": 0.1147, "step": 9908 }, { "epoch": 2.070846394984326, "grad_norm": 1.1935999468444685, "learning_rate": 1.522940959911622e-05, "loss": 0.1381, "step": 9909 }, { "epoch": 2.071055381400209, "grad_norm": 0.9553587028182139, "learning_rate": 1.5228447990183355e-05, "loss": 0.1089, "step": 9910 }, { "epoch": 2.071264367816092, "grad_norm": 1.102588349665266, "learning_rate": 1.5227486314710227e-05, "loss": 0.1328, "step": 9911 }, { "epoch": 2.071473354231975, "grad_norm": 0.9918546111393087, "learning_rate": 1.5226524572709074e-05, "loss": 0.1307, "step": 9912 }, { "epoch": 2.071682340647858, "grad_norm": 1.3384299068526595, "learning_rate": 1.5225562764192137e-05, "loss": 0.1095, "step": 9913 }, { "epoch": 2.071891327063741, "grad_norm": 1.1287738542251935, "learning_rate": 1.5224600889171657e-05, "loss": 0.1326, "step": 9914 }, { "epoch": 2.072100313479624, "grad_norm": 1.1088920259949238, "learning_rate": 1.5223638947659872e-05, "loss": 0.1414, "step": 9915 }, { "epoch": 2.072309299895507, "grad_norm": 0.9474967595364417, "learning_rate": 1.5222676939669026e-05, "loss": 0.1005, "step": 9916 }, { "epoch": 2.07251828631139, "grad_norm": 0.9024684732948979, "learning_rate": 1.522171486521136e-05, "loss": 0.1037, "step": 9917 }, { "epoch": 2.0727272727272728, "grad_norm": 1.1871626051165534, "learning_rate": 1.5220752724299124e-05, "loss": 0.1293, "step": 9918 }, { "epoch": 2.0729362591431557, "grad_norm": 1.119454611236181, "learning_rate": 1.5219790516944556e-05, "loss": 0.1451, "step": 9919 }, { "epoch": 2.0731452455590387, "grad_norm": 1.270338811928848, "learning_rate": 1.5218828243159909e-05, "loss": 0.1382, "step": 9920 }, { "epoch": 2.0733542319749216, "grad_norm": 0.9883633698977503, "learning_rate": 1.521786590295742e-05, "loss": 0.1159, "step": 9921 }, { "epoch": 2.0735632183908046, "grad_norm": 1.0889343660136983, "learning_rate": 1.5216903496349345e-05, "loss": 0.1559, "step": 9922 }, { "epoch": 2.0737722048066876, "grad_norm": 0.8004529363231885, "learning_rate": 1.5215941023347924e-05, "loss": 0.1037, "step": 9923 }, { "epoch": 2.0739811912225705, "grad_norm": 1.0804991005866336, "learning_rate": 1.5214978483965413e-05, "loss": 0.131, "step": 9924 }, { "epoch": 2.0741901776384535, "grad_norm": 1.3511835961243068, "learning_rate": 1.521401587821406e-05, "loss": 0.1391, "step": 9925 }, { "epoch": 2.0743991640543364, "grad_norm": 0.8577718340886102, "learning_rate": 1.5213053206106115e-05, "loss": 0.1136, "step": 9926 }, { "epoch": 2.0746081504702194, "grad_norm": 0.893071486121332, "learning_rate": 1.5212090467653826e-05, "loss": 0.1058, "step": 9927 }, { "epoch": 2.0748171368861024, "grad_norm": 1.0414172088411215, "learning_rate": 1.5211127662869451e-05, "loss": 0.1184, "step": 9928 }, { "epoch": 2.0750261233019853, "grad_norm": 1.1294421182343755, "learning_rate": 1.5210164791765242e-05, "loss": 0.138, "step": 9929 }, { "epoch": 2.0752351097178683, "grad_norm": 1.128054935552457, "learning_rate": 1.5209201854353451e-05, "loss": 0.119, "step": 9930 }, { "epoch": 2.0754440961337512, "grad_norm": 0.9801453837792774, "learning_rate": 1.5208238850646332e-05, "loss": 0.1136, "step": 9931 }, { "epoch": 2.075653082549634, "grad_norm": 1.1610856285315352, "learning_rate": 1.5207275780656148e-05, "loss": 0.1355, "step": 9932 }, { "epoch": 2.075862068965517, "grad_norm": 0.8760601837211403, "learning_rate": 1.5206312644395148e-05, "loss": 0.0999, "step": 9933 }, { "epoch": 2.0760710553814, "grad_norm": 1.217916343857443, "learning_rate": 1.520534944187559e-05, "loss": 0.1303, "step": 9934 }, { "epoch": 2.076280041797283, "grad_norm": 1.1003584182275679, "learning_rate": 1.5204386173109732e-05, "loss": 0.1384, "step": 9935 }, { "epoch": 2.076489028213166, "grad_norm": 1.114136798342271, "learning_rate": 1.5203422838109837e-05, "loss": 0.1522, "step": 9936 }, { "epoch": 2.076698014629049, "grad_norm": 0.9434820025602784, "learning_rate": 1.5202459436888166e-05, "loss": 0.103, "step": 9937 }, { "epoch": 2.076907001044932, "grad_norm": 1.150386646498763, "learning_rate": 1.5201495969456974e-05, "loss": 0.1273, "step": 9938 }, { "epoch": 2.077115987460815, "grad_norm": 1.3462236330673225, "learning_rate": 1.5200532435828528e-05, "loss": 0.1384, "step": 9939 }, { "epoch": 2.077324973876698, "grad_norm": 1.0718871887338106, "learning_rate": 1.5199568836015087e-05, "loss": 0.105, "step": 9940 }, { "epoch": 2.077533960292581, "grad_norm": 1.072292391571876, "learning_rate": 1.5198605170028912e-05, "loss": 0.1439, "step": 9941 }, { "epoch": 2.077742946708464, "grad_norm": 1.5987826959059024, "learning_rate": 1.5197641437882273e-05, "loss": 0.1372, "step": 9942 }, { "epoch": 2.077951933124347, "grad_norm": 3.726117762458075, "learning_rate": 1.5196677639587431e-05, "loss": 0.1138, "step": 9943 }, { "epoch": 2.0781609195402297, "grad_norm": 1.0483202482160465, "learning_rate": 1.519571377515666e-05, "loss": 0.0964, "step": 9944 }, { "epoch": 2.0783699059561127, "grad_norm": 1.0478030039559574, "learning_rate": 1.5194749844602213e-05, "loss": 0.1465, "step": 9945 }, { "epoch": 2.0785788923719957, "grad_norm": 1.2221335404301992, "learning_rate": 1.5193785847936368e-05, "loss": 0.1352, "step": 9946 }, { "epoch": 2.0787878787878786, "grad_norm": 1.199419376399916, "learning_rate": 1.5192821785171387e-05, "loss": 0.1345, "step": 9947 }, { "epoch": 2.0789968652037616, "grad_norm": 1.0112209683814828, "learning_rate": 1.5191857656319546e-05, "loss": 0.0988, "step": 9948 }, { "epoch": 2.0792058516196446, "grad_norm": 1.1993187566175723, "learning_rate": 1.5190893461393108e-05, "loss": 0.1359, "step": 9949 }, { "epoch": 2.0794148380355275, "grad_norm": 0.9414999473056015, "learning_rate": 1.5189929200404349e-05, "loss": 0.1128, "step": 9950 }, { "epoch": 2.0796238244514105, "grad_norm": 1.1617834457290452, "learning_rate": 1.5188964873365538e-05, "loss": 0.1313, "step": 9951 }, { "epoch": 2.0798328108672934, "grad_norm": 1.0965508258551317, "learning_rate": 1.518800048028895e-05, "loss": 0.1374, "step": 9952 }, { "epoch": 2.0800417972831764, "grad_norm": 1.1730684211991378, "learning_rate": 1.5187036021186854e-05, "loss": 0.1369, "step": 9953 }, { "epoch": 2.0802507836990594, "grad_norm": 1.0561093658913547, "learning_rate": 1.5186071496071531e-05, "loss": 0.1352, "step": 9954 }, { "epoch": 2.0804597701149423, "grad_norm": 0.9682830812211102, "learning_rate": 1.5185106904955248e-05, "loss": 0.1131, "step": 9955 }, { "epoch": 2.0806687565308257, "grad_norm": 1.1929098153251176, "learning_rate": 1.518414224785029e-05, "loss": 0.1547, "step": 9956 }, { "epoch": 2.0808777429467087, "grad_norm": 1.2929242534064114, "learning_rate": 1.5183177524768926e-05, "loss": 0.1286, "step": 9957 }, { "epoch": 2.0810867293625916, "grad_norm": 0.8504656096299177, "learning_rate": 1.5182212735723436e-05, "loss": 0.1249, "step": 9958 }, { "epoch": 2.0812957157784746, "grad_norm": 0.9560304705016309, "learning_rate": 1.5181247880726102e-05, "loss": 0.1052, "step": 9959 }, { "epoch": 2.0815047021943576, "grad_norm": 1.3119463085738174, "learning_rate": 1.5180282959789199e-05, "loss": 0.1587, "step": 9960 }, { "epoch": 2.0817136886102405, "grad_norm": 1.0018758910207477, "learning_rate": 1.5179317972925007e-05, "loss": 0.1022, "step": 9961 }, { "epoch": 2.0819226750261235, "grad_norm": 1.001079015608413, "learning_rate": 1.5178352920145811e-05, "loss": 0.1256, "step": 9962 }, { "epoch": 2.0821316614420065, "grad_norm": 0.8254863715302977, "learning_rate": 1.517738780146389e-05, "loss": 0.1182, "step": 9963 }, { "epoch": 2.0823406478578894, "grad_norm": 0.9622460329935862, "learning_rate": 1.5176422616891527e-05, "loss": 0.1119, "step": 9964 }, { "epoch": 2.0825496342737724, "grad_norm": 0.9510642578353538, "learning_rate": 1.5175457366441001e-05, "loss": 0.1128, "step": 9965 }, { "epoch": 2.0827586206896553, "grad_norm": 0.9948001268318398, "learning_rate": 1.517449205012461e-05, "loss": 0.1106, "step": 9966 }, { "epoch": 2.0829676071055383, "grad_norm": 0.996323017702473, "learning_rate": 1.5173526667954623e-05, "loss": 0.1228, "step": 9967 }, { "epoch": 2.0831765935214213, "grad_norm": 0.9379040947106742, "learning_rate": 1.5172561219943336e-05, "loss": 0.0984, "step": 9968 }, { "epoch": 2.083385579937304, "grad_norm": 1.034009611716352, "learning_rate": 1.517159570610303e-05, "loss": 0.1175, "step": 9969 }, { "epoch": 2.083594566353187, "grad_norm": 1.1081123922551666, "learning_rate": 1.5170630126446e-05, "loss": 0.1085, "step": 9970 }, { "epoch": 2.08380355276907, "grad_norm": 0.9153134616598803, "learning_rate": 1.5169664480984528e-05, "loss": 0.1138, "step": 9971 }, { "epoch": 2.084012539184953, "grad_norm": 1.0379151577121937, "learning_rate": 1.5168698769730905e-05, "loss": 0.1219, "step": 9972 }, { "epoch": 2.084221525600836, "grad_norm": 0.9251686588623272, "learning_rate": 1.5167732992697421e-05, "loss": 0.1201, "step": 9973 }, { "epoch": 2.084430512016719, "grad_norm": 1.0490793365676627, "learning_rate": 1.5166767149896368e-05, "loss": 0.1282, "step": 9974 }, { "epoch": 2.084639498432602, "grad_norm": 0.9978859140544116, "learning_rate": 1.5165801241340038e-05, "loss": 0.1154, "step": 9975 }, { "epoch": 2.084848484848485, "grad_norm": 1.046156349145879, "learning_rate": 1.5164835267040725e-05, "loss": 0.1147, "step": 9976 }, { "epoch": 2.085057471264368, "grad_norm": 1.2159236120242731, "learning_rate": 1.5163869227010721e-05, "loss": 0.1376, "step": 9977 }, { "epoch": 2.085266457680251, "grad_norm": 0.9624170660113924, "learning_rate": 1.5162903121262318e-05, "loss": 0.1094, "step": 9978 }, { "epoch": 2.085475444096134, "grad_norm": 1.051021196617449, "learning_rate": 1.5161936949807813e-05, "loss": 0.1079, "step": 9979 }, { "epoch": 2.085684430512017, "grad_norm": 1.265384629299948, "learning_rate": 1.5160970712659506e-05, "loss": 0.1189, "step": 9980 }, { "epoch": 2.0858934169278998, "grad_norm": 1.103814899884049, "learning_rate": 1.516000440982969e-05, "loss": 0.1269, "step": 9981 }, { "epoch": 2.0861024033437827, "grad_norm": 1.2149777961705033, "learning_rate": 1.5159038041330658e-05, "loss": 0.1423, "step": 9982 }, { "epoch": 2.0863113897596657, "grad_norm": 1.164618984792873, "learning_rate": 1.515807160717472e-05, "loss": 0.1213, "step": 9983 }, { "epoch": 2.0865203761755486, "grad_norm": 1.1886808383017238, "learning_rate": 1.5157105107374167e-05, "loss": 0.1226, "step": 9984 }, { "epoch": 2.0867293625914316, "grad_norm": 0.9778724080151652, "learning_rate": 1.5156138541941298e-05, "loss": 0.1195, "step": 9985 }, { "epoch": 2.0869383490073146, "grad_norm": 0.9873326003007655, "learning_rate": 1.5155171910888423e-05, "loss": 0.1267, "step": 9986 }, { "epoch": 2.0871473354231975, "grad_norm": 1.2612978728450805, "learning_rate": 1.5154205214227834e-05, "loss": 0.1399, "step": 9987 }, { "epoch": 2.0873563218390805, "grad_norm": 1.2418402855167385, "learning_rate": 1.5153238451971839e-05, "loss": 0.1418, "step": 9988 }, { "epoch": 2.0875653082549634, "grad_norm": 1.1638698141372374, "learning_rate": 1.5152271624132744e-05, "loss": 0.1234, "step": 9989 }, { "epoch": 2.0877742946708464, "grad_norm": 1.0949871036299386, "learning_rate": 1.5151304730722846e-05, "loss": 0.1076, "step": 9990 }, { "epoch": 2.0879832810867294, "grad_norm": 1.0446118738258288, "learning_rate": 1.5150337771754457e-05, "loss": 0.1155, "step": 9991 }, { "epoch": 2.0881922675026123, "grad_norm": 0.8861933208893477, "learning_rate": 1.514937074723988e-05, "loss": 0.1104, "step": 9992 }, { "epoch": 2.0884012539184953, "grad_norm": 0.9653956355004222, "learning_rate": 1.5148403657191422e-05, "loss": 0.116, "step": 9993 }, { "epoch": 2.0886102403343783, "grad_norm": 1.0770925439577357, "learning_rate": 1.514743650162139e-05, "loss": 0.1456, "step": 9994 }, { "epoch": 2.088819226750261, "grad_norm": 1.1213317381939842, "learning_rate": 1.5146469280542096e-05, "loss": 0.1169, "step": 9995 }, { "epoch": 2.089028213166144, "grad_norm": 0.8859239524044232, "learning_rate": 1.5145501993965845e-05, "loss": 0.1061, "step": 9996 }, { "epoch": 2.089237199582027, "grad_norm": 1.1486144961620444, "learning_rate": 1.514453464190495e-05, "loss": 0.1177, "step": 9997 }, { "epoch": 2.08944618599791, "grad_norm": 0.9703790667755962, "learning_rate": 1.5143567224371723e-05, "loss": 0.1125, "step": 9998 }, { "epoch": 2.089655172413793, "grad_norm": 1.0018090631158187, "learning_rate": 1.5142599741378469e-05, "loss": 0.1434, "step": 9999 }, { "epoch": 2.089864158829676, "grad_norm": 1.2686372620001336, "learning_rate": 1.5141632192937512e-05, "loss": 0.1347, "step": 10000 }, { "epoch": 2.090073145245559, "grad_norm": 0.8758284972488949, "learning_rate": 1.514066457906116e-05, "loss": 0.108, "step": 10001 }, { "epoch": 2.090282131661442, "grad_norm": 1.074748738276783, "learning_rate": 1.5139696899761721e-05, "loss": 0.1236, "step": 10002 }, { "epoch": 2.090491118077325, "grad_norm": 1.0807832723741482, "learning_rate": 1.5138729155051521e-05, "loss": 0.1405, "step": 10003 }, { "epoch": 2.090700104493208, "grad_norm": 1.210819100367121, "learning_rate": 1.513776134494287e-05, "loss": 0.14, "step": 10004 }, { "epoch": 2.090909090909091, "grad_norm": 1.048296314018047, "learning_rate": 1.5136793469448088e-05, "loss": 0.1154, "step": 10005 }, { "epoch": 2.091118077324974, "grad_norm": 0.9127062040153984, "learning_rate": 1.513582552857949e-05, "loss": 0.1237, "step": 10006 }, { "epoch": 2.0913270637408568, "grad_norm": 1.025949559320112, "learning_rate": 1.5134857522349395e-05, "loss": 0.1231, "step": 10007 }, { "epoch": 2.0915360501567397, "grad_norm": 1.0897547973943884, "learning_rate": 1.5133889450770122e-05, "loss": 0.1103, "step": 10008 }, { "epoch": 2.0917450365726227, "grad_norm": 1.0236577269732687, "learning_rate": 1.5132921313853994e-05, "loss": 0.1241, "step": 10009 }, { "epoch": 2.0919540229885056, "grad_norm": 0.9642105287139411, "learning_rate": 1.5131953111613331e-05, "loss": 0.1355, "step": 10010 }, { "epoch": 2.0921630094043886, "grad_norm": 0.906574168468839, "learning_rate": 1.5130984844060454e-05, "loss": 0.1076, "step": 10011 }, { "epoch": 2.0923719958202716, "grad_norm": 1.139524542331574, "learning_rate": 1.5130016511207685e-05, "loss": 0.1503, "step": 10012 }, { "epoch": 2.0925809822361545, "grad_norm": 1.088921950755587, "learning_rate": 1.512904811306735e-05, "loss": 0.1268, "step": 10013 }, { "epoch": 2.0927899686520375, "grad_norm": 0.9304466683839202, "learning_rate": 1.5128079649651773e-05, "loss": 0.1049, "step": 10014 }, { "epoch": 2.0929989550679204, "grad_norm": 1.019070149826768, "learning_rate": 1.5127111120973274e-05, "loss": 0.1213, "step": 10015 }, { "epoch": 2.0932079414838034, "grad_norm": 0.9531988222380617, "learning_rate": 1.5126142527044185e-05, "loss": 0.119, "step": 10016 }, { "epoch": 2.0934169278996864, "grad_norm": 1.0756736083443583, "learning_rate": 1.5125173867876832e-05, "loss": 0.1427, "step": 10017 }, { "epoch": 2.0936259143155693, "grad_norm": 0.98209731817267, "learning_rate": 1.5124205143483541e-05, "loss": 0.1274, "step": 10018 }, { "epoch": 2.0938349007314523, "grad_norm": 1.0121779966939053, "learning_rate": 1.5123236353876643e-05, "loss": 0.118, "step": 10019 }, { "epoch": 2.0940438871473352, "grad_norm": 0.7971360873911992, "learning_rate": 1.5122267499068466e-05, "loss": 0.0867, "step": 10020 }, { "epoch": 2.094252873563218, "grad_norm": 1.0582263977476334, "learning_rate": 1.5121298579071337e-05, "loss": 0.1216, "step": 10021 }, { "epoch": 2.094461859979101, "grad_norm": 1.171143357537757, "learning_rate": 1.5120329593897594e-05, "loss": 0.1448, "step": 10022 }, { "epoch": 2.094670846394984, "grad_norm": 1.0410727643624689, "learning_rate": 1.5119360543559562e-05, "loss": 0.1351, "step": 10023 }, { "epoch": 2.094879832810867, "grad_norm": 1.0720225123125624, "learning_rate": 1.5118391428069577e-05, "loss": 0.1001, "step": 10024 }, { "epoch": 2.09508881922675, "grad_norm": 1.0681605539799524, "learning_rate": 1.5117422247439973e-05, "loss": 0.1513, "step": 10025 }, { "epoch": 2.095297805642633, "grad_norm": 0.9779158560213058, "learning_rate": 1.5116453001683083e-05, "loss": 0.1056, "step": 10026 }, { "epoch": 2.095506792058516, "grad_norm": 1.2249384233454284, "learning_rate": 1.5115483690811244e-05, "loss": 0.127, "step": 10027 }, { "epoch": 2.095715778474399, "grad_norm": 0.9374601892026819, "learning_rate": 1.5114514314836787e-05, "loss": 0.1126, "step": 10028 }, { "epoch": 2.0959247648902823, "grad_norm": 1.233347539334727, "learning_rate": 1.5113544873772057e-05, "loss": 0.133, "step": 10029 }, { "epoch": 2.0961337513061653, "grad_norm": 1.1594428756323076, "learning_rate": 1.5112575367629384e-05, "loss": 0.1218, "step": 10030 }, { "epoch": 2.0963427377220483, "grad_norm": 1.0295944683349059, "learning_rate": 1.5111605796421112e-05, "loss": 0.1281, "step": 10031 }, { "epoch": 2.0965517241379312, "grad_norm": 1.0814493000165084, "learning_rate": 1.5110636160159576e-05, "loss": 0.1397, "step": 10032 }, { "epoch": 2.096760710553814, "grad_norm": 1.1045324365094458, "learning_rate": 1.510966645885712e-05, "loss": 0.1317, "step": 10033 }, { "epoch": 2.096969696969697, "grad_norm": 1.2318049474617994, "learning_rate": 1.5108696692526081e-05, "loss": 0.1299, "step": 10034 }, { "epoch": 2.09717868338558, "grad_norm": 1.1366309913546209, "learning_rate": 1.5107726861178806e-05, "loss": 0.1244, "step": 10035 }, { "epoch": 2.097387669801463, "grad_norm": 1.2356347768879457, "learning_rate": 1.5106756964827631e-05, "loss": 0.1464, "step": 10036 }, { "epoch": 2.097596656217346, "grad_norm": 1.314780883285876, "learning_rate": 1.5105787003484907e-05, "loss": 0.1289, "step": 10037 }, { "epoch": 2.097805642633229, "grad_norm": 1.006618987406515, "learning_rate": 1.510481697716297e-05, "loss": 0.125, "step": 10038 }, { "epoch": 2.098014629049112, "grad_norm": 1.0180983800764833, "learning_rate": 1.5103846885874172e-05, "loss": 0.1145, "step": 10039 }, { "epoch": 2.098223615464995, "grad_norm": 0.9219064917108565, "learning_rate": 1.5102876729630857e-05, "loss": 0.1079, "step": 10040 }, { "epoch": 2.098432601880878, "grad_norm": 0.8459293972832219, "learning_rate": 1.5101906508445372e-05, "loss": 0.0779, "step": 10041 }, { "epoch": 2.098641588296761, "grad_norm": 1.065897177752815, "learning_rate": 1.510093622233006e-05, "loss": 0.1475, "step": 10042 }, { "epoch": 2.098850574712644, "grad_norm": 0.9655927835482584, "learning_rate": 1.5099965871297277e-05, "loss": 0.1149, "step": 10043 }, { "epoch": 2.0990595611285268, "grad_norm": 0.9774751234777634, "learning_rate": 1.5098995455359365e-05, "loss": 0.1189, "step": 10044 }, { "epoch": 2.0992685475444097, "grad_norm": 1.1705637492621148, "learning_rate": 1.509802497452868e-05, "loss": 0.1301, "step": 10045 }, { "epoch": 2.0994775339602927, "grad_norm": 1.0614916405887653, "learning_rate": 1.509705442881757e-05, "loss": 0.1328, "step": 10046 }, { "epoch": 2.0996865203761756, "grad_norm": 0.912287804803132, "learning_rate": 1.509608381823839e-05, "loss": 0.1028, "step": 10047 }, { "epoch": 2.0998955067920586, "grad_norm": 0.9204211563370025, "learning_rate": 1.5095113142803483e-05, "loss": 0.1114, "step": 10048 }, { "epoch": 2.1001044932079416, "grad_norm": 0.9479426396995005, "learning_rate": 1.5094142402525213e-05, "loss": 0.1149, "step": 10049 }, { "epoch": 2.1003134796238245, "grad_norm": 1.0107909965468191, "learning_rate": 1.5093171597415931e-05, "loss": 0.1258, "step": 10050 }, { "epoch": 2.1005224660397075, "grad_norm": 1.016086482450934, "learning_rate": 1.5092200727487992e-05, "loss": 0.1187, "step": 10051 }, { "epoch": 2.1007314524555905, "grad_norm": 1.0560786443843895, "learning_rate": 1.5091229792753748e-05, "loss": 0.1235, "step": 10052 }, { "epoch": 2.1009404388714734, "grad_norm": 1.0062850003643975, "learning_rate": 1.509025879322556e-05, "loss": 0.128, "step": 10053 }, { "epoch": 2.1011494252873564, "grad_norm": 1.1514030445996175, "learning_rate": 1.5089287728915784e-05, "loss": 0.1614, "step": 10054 }, { "epoch": 2.1013584117032393, "grad_norm": 0.9618828926791891, "learning_rate": 1.5088316599836783e-05, "loss": 0.1142, "step": 10055 }, { "epoch": 2.1015673981191223, "grad_norm": 1.11600051907809, "learning_rate": 1.5087345406000906e-05, "loss": 0.1268, "step": 10056 }, { "epoch": 2.1017763845350053, "grad_norm": 0.933492929336417, "learning_rate": 1.5086374147420522e-05, "loss": 0.1094, "step": 10057 }, { "epoch": 2.101985370950888, "grad_norm": 0.9479480681396043, "learning_rate": 1.5085402824107989e-05, "loss": 0.106, "step": 10058 }, { "epoch": 2.102194357366771, "grad_norm": 1.1009130047806512, "learning_rate": 1.5084431436075666e-05, "loss": 0.1036, "step": 10059 }, { "epoch": 2.102403343782654, "grad_norm": 1.3541775601923787, "learning_rate": 1.508345998333592e-05, "loss": 0.1477, "step": 10060 }, { "epoch": 2.102612330198537, "grad_norm": 0.9265334986510999, "learning_rate": 1.508248846590111e-05, "loss": 0.1208, "step": 10061 }, { "epoch": 2.10282131661442, "grad_norm": 1.049230191444008, "learning_rate": 1.5081516883783604e-05, "loss": 0.1368, "step": 10062 }, { "epoch": 2.103030303030303, "grad_norm": 0.9368195047271638, "learning_rate": 1.5080545236995764e-05, "loss": 0.0984, "step": 10063 }, { "epoch": 2.103239289446186, "grad_norm": 0.839213562311807, "learning_rate": 1.5079573525549957e-05, "loss": 0.1072, "step": 10064 }, { "epoch": 2.103448275862069, "grad_norm": 1.1534262444013814, "learning_rate": 1.5078601749458547e-05, "loss": 0.129, "step": 10065 }, { "epoch": 2.103657262277952, "grad_norm": 1.0422559505353648, "learning_rate": 1.5077629908733906e-05, "loss": 0.1102, "step": 10066 }, { "epoch": 2.103866248693835, "grad_norm": 1.110208999118002, "learning_rate": 1.50766580033884e-05, "loss": 0.1395, "step": 10067 }, { "epoch": 2.104075235109718, "grad_norm": 1.0271670813835008, "learning_rate": 1.5075686033434398e-05, "loss": 0.111, "step": 10068 }, { "epoch": 2.104284221525601, "grad_norm": 1.0300212289796158, "learning_rate": 1.507471399888427e-05, "loss": 0.13, "step": 10069 }, { "epoch": 2.1044932079414838, "grad_norm": 0.8606804372414701, "learning_rate": 1.5073741899750387e-05, "loss": 0.1168, "step": 10070 }, { "epoch": 2.1047021943573667, "grad_norm": 1.147312430196905, "learning_rate": 1.5072769736045118e-05, "loss": 0.1293, "step": 10071 }, { "epoch": 2.1049111807732497, "grad_norm": 0.990163238810933, "learning_rate": 1.5071797507780836e-05, "loss": 0.1307, "step": 10072 }, { "epoch": 2.1051201671891326, "grad_norm": 1.0756040006380814, "learning_rate": 1.507082521496992e-05, "loss": 0.1441, "step": 10073 }, { "epoch": 2.1053291536050156, "grad_norm": 1.0720530386775993, "learning_rate": 1.5069852857624737e-05, "loss": 0.1319, "step": 10074 }, { "epoch": 2.1055381400208986, "grad_norm": 0.9779406478763532, "learning_rate": 1.5068880435757663e-05, "loss": 0.1062, "step": 10075 }, { "epoch": 2.1057471264367815, "grad_norm": 1.2051671128820174, "learning_rate": 1.5067907949381076e-05, "loss": 0.1155, "step": 10076 }, { "epoch": 2.1059561128526645, "grad_norm": 1.1057369309306984, "learning_rate": 1.5066935398507352e-05, "loss": 0.1292, "step": 10077 }, { "epoch": 2.1061650992685474, "grad_norm": 0.9737446563978724, "learning_rate": 1.5065962783148864e-05, "loss": 0.1203, "step": 10078 }, { "epoch": 2.1063740856844304, "grad_norm": 0.9602292787103376, "learning_rate": 1.5064990103317997e-05, "loss": 0.1162, "step": 10079 }, { "epoch": 2.1065830721003134, "grad_norm": 0.7491577190625159, "learning_rate": 1.5064017359027125e-05, "loss": 0.1017, "step": 10080 }, { "epoch": 2.1067920585161963, "grad_norm": 1.2121999111036215, "learning_rate": 1.5063044550288629e-05, "loss": 0.1162, "step": 10081 }, { "epoch": 2.1070010449320793, "grad_norm": 0.976472590738271, "learning_rate": 1.506207167711489e-05, "loss": 0.1398, "step": 10082 }, { "epoch": 2.1072100313479623, "grad_norm": 0.9678461021241422, "learning_rate": 1.506109873951829e-05, "loss": 0.1198, "step": 10083 }, { "epoch": 2.107419017763845, "grad_norm": 1.223787171444005, "learning_rate": 1.506012573751121e-05, "loss": 0.1318, "step": 10084 }, { "epoch": 2.107628004179728, "grad_norm": 1.0764743198699345, "learning_rate": 1.505915267110603e-05, "loss": 0.1137, "step": 10085 }, { "epoch": 2.107836990595611, "grad_norm": 1.0241143288149597, "learning_rate": 1.505817954031514e-05, "loss": 0.1064, "step": 10086 }, { "epoch": 2.108045977011494, "grad_norm": 1.0833072623686755, "learning_rate": 1.5057206345150918e-05, "loss": 0.1406, "step": 10087 }, { "epoch": 2.108254963427377, "grad_norm": 1.2251245799274004, "learning_rate": 1.505623308562576e-05, "loss": 0.1352, "step": 10088 }, { "epoch": 2.10846394984326, "grad_norm": 1.1684937953420924, "learning_rate": 1.505525976175204e-05, "loss": 0.1088, "step": 10089 }, { "epoch": 2.108672936259143, "grad_norm": 1.1545808671800917, "learning_rate": 1.505428637354215e-05, "loss": 0.1463, "step": 10090 }, { "epoch": 2.108881922675026, "grad_norm": 1.2195504434674183, "learning_rate": 1.505331292100848e-05, "loss": 0.1325, "step": 10091 }, { "epoch": 2.109090909090909, "grad_norm": 0.9468161962142236, "learning_rate": 1.5052339404163419e-05, "loss": 0.1072, "step": 10092 }, { "epoch": 2.109299895506792, "grad_norm": 0.9176858951735819, "learning_rate": 1.505136582301935e-05, "loss": 0.0857, "step": 10093 }, { "epoch": 2.109508881922675, "grad_norm": 1.064341540508059, "learning_rate": 1.505039217758867e-05, "loss": 0.1223, "step": 10094 }, { "epoch": 2.109717868338558, "grad_norm": 1.1666957636811677, "learning_rate": 1.504941846788377e-05, "loss": 0.1418, "step": 10095 }, { "epoch": 2.1099268547544408, "grad_norm": 1.0759605360506033, "learning_rate": 1.5048444693917038e-05, "loss": 0.1336, "step": 10096 }, { "epoch": 2.110135841170324, "grad_norm": 1.1108463626796954, "learning_rate": 1.5047470855700868e-05, "loss": 0.1365, "step": 10097 }, { "epoch": 2.110344827586207, "grad_norm": 0.9491734426413596, "learning_rate": 1.5046496953247656e-05, "loss": 0.1211, "step": 10098 }, { "epoch": 2.11055381400209, "grad_norm": 1.012369729942158, "learning_rate": 1.5045522986569791e-05, "loss": 0.1217, "step": 10099 }, { "epoch": 2.110762800417973, "grad_norm": 1.111881648942154, "learning_rate": 1.5044548955679679e-05, "loss": 0.1354, "step": 10100 }, { "epoch": 2.110971786833856, "grad_norm": 1.1435371866821458, "learning_rate": 1.5043574860589706e-05, "loss": 0.1265, "step": 10101 }, { "epoch": 2.111180773249739, "grad_norm": 1.0007328856750408, "learning_rate": 1.5042600701312271e-05, "loss": 0.1222, "step": 10102 }, { "epoch": 2.111389759665622, "grad_norm": 0.9173453912020625, "learning_rate": 1.5041626477859774e-05, "loss": 0.1187, "step": 10103 }, { "epoch": 2.111598746081505, "grad_norm": 0.8586550747049481, "learning_rate": 1.5040652190244615e-05, "loss": 0.1041, "step": 10104 }, { "epoch": 2.111807732497388, "grad_norm": 0.9796310016432555, "learning_rate": 1.5039677838479185e-05, "loss": 0.1163, "step": 10105 }, { "epoch": 2.112016718913271, "grad_norm": 1.2235392881525582, "learning_rate": 1.5038703422575895e-05, "loss": 0.1363, "step": 10106 }, { "epoch": 2.1122257053291538, "grad_norm": 1.0620571665741962, "learning_rate": 1.503772894254714e-05, "loss": 0.1362, "step": 10107 }, { "epoch": 2.1124346917450367, "grad_norm": 1.1051065942457015, "learning_rate": 1.5036754398405322e-05, "loss": 0.1203, "step": 10108 }, { "epoch": 2.1126436781609197, "grad_norm": 1.0288170724038983, "learning_rate": 1.5035779790162845e-05, "loss": 0.0978, "step": 10109 }, { "epoch": 2.1128526645768027, "grad_norm": 1.016551415555396, "learning_rate": 1.5034805117832113e-05, "loss": 0.1051, "step": 10110 }, { "epoch": 2.1130616509926856, "grad_norm": 0.9415273207060175, "learning_rate": 1.5033830381425523e-05, "loss": 0.1088, "step": 10111 }, { "epoch": 2.1132706374085686, "grad_norm": 1.0957285017907692, "learning_rate": 1.5032855580955494e-05, "loss": 0.1211, "step": 10112 }, { "epoch": 2.1134796238244515, "grad_norm": 1.0356078804961641, "learning_rate": 1.5031880716434419e-05, "loss": 0.1113, "step": 10113 }, { "epoch": 2.1136886102403345, "grad_norm": 0.9614613955787387, "learning_rate": 1.5030905787874712e-05, "loss": 0.106, "step": 10114 }, { "epoch": 2.1138975966562175, "grad_norm": 1.2557217678931842, "learning_rate": 1.5029930795288774e-05, "loss": 0.1283, "step": 10115 }, { "epoch": 2.1141065830721004, "grad_norm": 1.1634439844537705, "learning_rate": 1.5028955738689024e-05, "loss": 0.1215, "step": 10116 }, { "epoch": 2.1143155694879834, "grad_norm": 0.917192034642532, "learning_rate": 1.5027980618087859e-05, "loss": 0.1258, "step": 10117 }, { "epoch": 2.1145245559038663, "grad_norm": 1.0285887063253056, "learning_rate": 1.5027005433497697e-05, "loss": 0.1288, "step": 10118 }, { "epoch": 2.1147335423197493, "grad_norm": 1.1309407780826677, "learning_rate": 1.5026030184930949e-05, "loss": 0.1355, "step": 10119 }, { "epoch": 2.1149425287356323, "grad_norm": 1.0259987151017802, "learning_rate": 1.502505487240002e-05, "loss": 0.1092, "step": 10120 }, { "epoch": 2.1151515151515152, "grad_norm": 0.9856250174957231, "learning_rate": 1.5024079495917328e-05, "loss": 0.13, "step": 10121 }, { "epoch": 2.115360501567398, "grad_norm": 0.9452365583425956, "learning_rate": 1.5023104055495284e-05, "loss": 0.1387, "step": 10122 }, { "epoch": 2.115569487983281, "grad_norm": 1.0999099666127286, "learning_rate": 1.5022128551146302e-05, "loss": 0.1388, "step": 10123 }, { "epoch": 2.115778474399164, "grad_norm": 0.96213142501627, "learning_rate": 1.5021152982882798e-05, "loss": 0.1364, "step": 10124 }, { "epoch": 2.115987460815047, "grad_norm": 1.0817015852874394, "learning_rate": 1.5020177350717186e-05, "loss": 0.1062, "step": 10125 }, { "epoch": 2.11619644723093, "grad_norm": 1.0692317077855236, "learning_rate": 1.5019201654661886e-05, "loss": 0.1436, "step": 10126 }, { "epoch": 2.116405433646813, "grad_norm": 0.9925246232322134, "learning_rate": 1.5018225894729313e-05, "loss": 0.1238, "step": 10127 }, { "epoch": 2.116614420062696, "grad_norm": 0.919712939117128, "learning_rate": 1.501725007093188e-05, "loss": 0.1159, "step": 10128 }, { "epoch": 2.116823406478579, "grad_norm": 1.006153787596328, "learning_rate": 1.5016274183282015e-05, "loss": 0.1207, "step": 10129 }, { "epoch": 2.117032392894462, "grad_norm": 1.0548494151925303, "learning_rate": 1.5015298231792131e-05, "loss": 0.1363, "step": 10130 }, { "epoch": 2.117241379310345, "grad_norm": 1.145142343746574, "learning_rate": 1.5014322216474653e-05, "loss": 0.1327, "step": 10131 }, { "epoch": 2.117450365726228, "grad_norm": 1.063409481565896, "learning_rate": 1.5013346137342e-05, "loss": 0.1249, "step": 10132 }, { "epoch": 2.1176593521421108, "grad_norm": 0.8796902510412695, "learning_rate": 1.5012369994406593e-05, "loss": 0.1069, "step": 10133 }, { "epoch": 2.1178683385579937, "grad_norm": 0.9742544975846871, "learning_rate": 1.5011393787680856e-05, "loss": 0.1322, "step": 10134 }, { "epoch": 2.1180773249738767, "grad_norm": 1.116872616620401, "learning_rate": 1.5010417517177216e-05, "loss": 0.1459, "step": 10135 }, { "epoch": 2.1182863113897596, "grad_norm": 1.2875626185874403, "learning_rate": 1.5009441182908093e-05, "loss": 0.1271, "step": 10136 }, { "epoch": 2.1184952978056426, "grad_norm": 1.1610571224223245, "learning_rate": 1.5008464784885917e-05, "loss": 0.1327, "step": 10137 }, { "epoch": 2.1187042842215256, "grad_norm": 1.0369008964615145, "learning_rate": 1.5007488323123107e-05, "loss": 0.1259, "step": 10138 }, { "epoch": 2.1189132706374085, "grad_norm": 1.0993685966580482, "learning_rate": 1.5006511797632098e-05, "loss": 0.1167, "step": 10139 }, { "epoch": 2.1191222570532915, "grad_norm": 0.8030119965227932, "learning_rate": 1.5005535208425313e-05, "loss": 0.1089, "step": 10140 }, { "epoch": 2.1193312434691745, "grad_norm": 0.880534846820913, "learning_rate": 1.5004558555515182e-05, "loss": 0.1114, "step": 10141 }, { "epoch": 2.1195402298850574, "grad_norm": 1.3458623188620753, "learning_rate": 1.5003581838914133e-05, "loss": 0.1286, "step": 10142 }, { "epoch": 2.1197492163009404, "grad_norm": 1.1072767404755044, "learning_rate": 1.5002605058634597e-05, "loss": 0.1169, "step": 10143 }, { "epoch": 2.1199582027168233, "grad_norm": 0.9901540311440402, "learning_rate": 1.5001628214689005e-05, "loss": 0.1171, "step": 10144 }, { "epoch": 2.1201671891327063, "grad_norm": 1.1290778093298095, "learning_rate": 1.5000651307089796e-05, "loss": 0.1167, "step": 10145 }, { "epoch": 2.1203761755485893, "grad_norm": 1.0406471589578077, "learning_rate": 1.4999674335849392e-05, "loss": 0.144, "step": 10146 }, { "epoch": 2.120585161964472, "grad_norm": 0.9818837196846694, "learning_rate": 1.499869730098023e-05, "loss": 0.1185, "step": 10147 }, { "epoch": 2.120794148380355, "grad_norm": 0.9774343363996879, "learning_rate": 1.4997720202494746e-05, "loss": 0.1076, "step": 10148 }, { "epoch": 2.121003134796238, "grad_norm": 1.0106282671938398, "learning_rate": 1.4996743040405373e-05, "loss": 0.1222, "step": 10149 }, { "epoch": 2.121212121212121, "grad_norm": 1.2259433865449785, "learning_rate": 1.499576581472455e-05, "loss": 0.1357, "step": 10150 }, { "epoch": 2.121421107628004, "grad_norm": 0.9766135871490378, "learning_rate": 1.4994788525464712e-05, "loss": 0.1254, "step": 10151 }, { "epoch": 2.121630094043887, "grad_norm": 1.2424622512793768, "learning_rate": 1.4993811172638295e-05, "loss": 0.1477, "step": 10152 }, { "epoch": 2.12183908045977, "grad_norm": 0.936908317923505, "learning_rate": 1.499283375625774e-05, "loss": 0.1016, "step": 10153 }, { "epoch": 2.122048066875653, "grad_norm": 1.2271832905024456, "learning_rate": 1.4991856276335484e-05, "loss": 0.1461, "step": 10154 }, { "epoch": 2.122257053291536, "grad_norm": 1.0056666077257355, "learning_rate": 1.4990878732883967e-05, "loss": 0.1297, "step": 10155 }, { "epoch": 2.122466039707419, "grad_norm": 1.2989123037936152, "learning_rate": 1.4989901125915632e-05, "loss": 0.1555, "step": 10156 }, { "epoch": 2.122675026123302, "grad_norm": 1.0916063135387113, "learning_rate": 1.498892345544292e-05, "loss": 0.1144, "step": 10157 }, { "epoch": 2.122884012539185, "grad_norm": 0.8713885469252811, "learning_rate": 1.4987945721478272e-05, "loss": 0.1181, "step": 10158 }, { "epoch": 2.1230929989550678, "grad_norm": 1.1655233539506258, "learning_rate": 1.4986967924034131e-05, "loss": 0.1426, "step": 10159 }, { "epoch": 2.1233019853709507, "grad_norm": 0.9551278075357185, "learning_rate": 1.4985990063122941e-05, "loss": 0.1059, "step": 10160 }, { "epoch": 2.1235109717868337, "grad_norm": 1.065680249651595, "learning_rate": 1.4985012138757152e-05, "loss": 0.1357, "step": 10161 }, { "epoch": 2.1237199582027166, "grad_norm": 1.1171365524239563, "learning_rate": 1.49840341509492e-05, "loss": 0.1401, "step": 10162 }, { "epoch": 2.1239289446185996, "grad_norm": 0.9910180684256216, "learning_rate": 1.4983056099711541e-05, "loss": 0.1337, "step": 10163 }, { "epoch": 2.1241379310344826, "grad_norm": 0.9548304119068696, "learning_rate": 1.4982077985056618e-05, "loss": 0.1346, "step": 10164 }, { "epoch": 2.1243469174503655, "grad_norm": 1.003915895736446, "learning_rate": 1.4981099806996876e-05, "loss": 0.1242, "step": 10165 }, { "epoch": 2.1245559038662485, "grad_norm": 0.9640843219223095, "learning_rate": 1.498012156554477e-05, "loss": 0.1115, "step": 10166 }, { "epoch": 2.1247648902821314, "grad_norm": 0.9010514921907024, "learning_rate": 1.4979143260712748e-05, "loss": 0.0971, "step": 10167 }, { "epoch": 2.1249738766980144, "grad_norm": 1.0305631675473705, "learning_rate": 1.4978164892513251e-05, "loss": 0.1058, "step": 10168 }, { "epoch": 2.1251828631138974, "grad_norm": 1.1169124213731711, "learning_rate": 1.4977186460958746e-05, "loss": 0.1237, "step": 10169 }, { "epoch": 2.1253918495297803, "grad_norm": 0.986866422915906, "learning_rate": 1.4976207966061678e-05, "loss": 0.1419, "step": 10170 }, { "epoch": 2.1256008359456637, "grad_norm": 0.9308982762537799, "learning_rate": 1.4975229407834495e-05, "loss": 0.1124, "step": 10171 }, { "epoch": 2.1258098223615467, "grad_norm": 1.0412227744318594, "learning_rate": 1.4974250786289656e-05, "loss": 0.1256, "step": 10172 }, { "epoch": 2.1260188087774297, "grad_norm": 1.0073913656832065, "learning_rate": 1.4973272101439618e-05, "loss": 0.1352, "step": 10173 }, { "epoch": 2.1262277951933126, "grad_norm": 0.930368128769757, "learning_rate": 1.4972293353296827e-05, "loss": 0.1052, "step": 10174 }, { "epoch": 2.1264367816091956, "grad_norm": 0.9930818538442452, "learning_rate": 1.4971314541873751e-05, "loss": 0.1287, "step": 10175 }, { "epoch": 2.1266457680250785, "grad_norm": 1.055708672794996, "learning_rate": 1.4970335667182838e-05, "loss": 0.1313, "step": 10176 }, { "epoch": 2.1268547544409615, "grad_norm": 1.1523352222211967, "learning_rate": 1.4969356729236549e-05, "loss": 0.1503, "step": 10177 }, { "epoch": 2.1270637408568445, "grad_norm": 1.0690635641712667, "learning_rate": 1.4968377728047343e-05, "loss": 0.1473, "step": 10178 }, { "epoch": 2.1272727272727274, "grad_norm": 1.0493172230095982, "learning_rate": 1.496739866362768e-05, "loss": 0.1304, "step": 10179 }, { "epoch": 2.1274817136886104, "grad_norm": 0.9944755668945253, "learning_rate": 1.4966419535990013e-05, "loss": 0.0994, "step": 10180 }, { "epoch": 2.1276907001044933, "grad_norm": 1.089619921009649, "learning_rate": 1.4965440345146815e-05, "loss": 0.1332, "step": 10181 }, { "epoch": 2.1278996865203763, "grad_norm": 1.2054312355770496, "learning_rate": 1.4964461091110539e-05, "loss": 0.1222, "step": 10182 }, { "epoch": 2.1281086729362593, "grad_norm": 0.7267452236144423, "learning_rate": 1.496348177389365e-05, "loss": 0.0899, "step": 10183 }, { "epoch": 2.1283176593521422, "grad_norm": 1.210005821077495, "learning_rate": 1.4962502393508609e-05, "loss": 0.113, "step": 10184 }, { "epoch": 2.128526645768025, "grad_norm": 0.8874425781913655, "learning_rate": 1.4961522949967887e-05, "loss": 0.1201, "step": 10185 }, { "epoch": 2.128735632183908, "grad_norm": 1.02962296096795, "learning_rate": 1.4960543443283938e-05, "loss": 0.1263, "step": 10186 }, { "epoch": 2.128944618599791, "grad_norm": 1.2114576249137814, "learning_rate": 1.495956387346924e-05, "loss": 0.1498, "step": 10187 }, { "epoch": 2.129153605015674, "grad_norm": 1.1329821619370093, "learning_rate": 1.4958584240536251e-05, "loss": 0.1251, "step": 10188 }, { "epoch": 2.129362591431557, "grad_norm": 0.9921061209399092, "learning_rate": 1.4957604544497442e-05, "loss": 0.1098, "step": 10189 }, { "epoch": 2.12957157784744, "grad_norm": 1.0395629952459482, "learning_rate": 1.4956624785365278e-05, "loss": 0.1275, "step": 10190 }, { "epoch": 2.129780564263323, "grad_norm": 1.2528255212724024, "learning_rate": 1.4955644963152237e-05, "loss": 0.1404, "step": 10191 }, { "epoch": 2.129989550679206, "grad_norm": 1.276000076324942, "learning_rate": 1.4954665077870773e-05, "loss": 0.1213, "step": 10192 }, { "epoch": 2.130198537095089, "grad_norm": 1.129448890980998, "learning_rate": 1.4953685129533373e-05, "loss": 0.1217, "step": 10193 }, { "epoch": 2.130407523510972, "grad_norm": 0.9885151669109508, "learning_rate": 1.4952705118152499e-05, "loss": 0.1317, "step": 10194 }, { "epoch": 2.130616509926855, "grad_norm": 1.0796880936866011, "learning_rate": 1.4951725043740623e-05, "loss": 0.1295, "step": 10195 }, { "epoch": 2.1308254963427378, "grad_norm": 0.9377845624319684, "learning_rate": 1.4950744906310223e-05, "loss": 0.1173, "step": 10196 }, { "epoch": 2.1310344827586207, "grad_norm": 1.0466912766292462, "learning_rate": 1.494976470587377e-05, "loss": 0.1127, "step": 10197 }, { "epoch": 2.1312434691745037, "grad_norm": 1.0745943080031561, "learning_rate": 1.4948784442443738e-05, "loss": 0.1202, "step": 10198 }, { "epoch": 2.1314524555903867, "grad_norm": 0.9259346499827168, "learning_rate": 1.4947804116032603e-05, "loss": 0.1277, "step": 10199 }, { "epoch": 2.1316614420062696, "grad_norm": 1.1672491448513418, "learning_rate": 1.4946823726652842e-05, "loss": 0.1317, "step": 10200 }, { "epoch": 2.1318704284221526, "grad_norm": 0.9923138657170297, "learning_rate": 1.4945843274316929e-05, "loss": 0.1263, "step": 10201 }, { "epoch": 2.1320794148380355, "grad_norm": 0.9052354507380155, "learning_rate": 1.4944862759037346e-05, "loss": 0.1335, "step": 10202 }, { "epoch": 2.1322884012539185, "grad_norm": 0.8274278848607359, "learning_rate": 1.494388218082657e-05, "loss": 0.0962, "step": 10203 }, { "epoch": 2.1324973876698015, "grad_norm": 0.9904442318295487, "learning_rate": 1.494290153969708e-05, "loss": 0.111, "step": 10204 }, { "epoch": 2.1327063740856844, "grad_norm": 0.8291208404548938, "learning_rate": 1.4941920835661353e-05, "loss": 0.1055, "step": 10205 }, { "epoch": 2.1329153605015674, "grad_norm": 0.8281287241770404, "learning_rate": 1.4940940068731876e-05, "loss": 0.105, "step": 10206 }, { "epoch": 2.1331243469174503, "grad_norm": 1.030825902012848, "learning_rate": 1.4939959238921127e-05, "loss": 0.1122, "step": 10207 }, { "epoch": 2.1333333333333333, "grad_norm": 1.040640022871196, "learning_rate": 1.4938978346241592e-05, "loss": 0.1284, "step": 10208 }, { "epoch": 2.1335423197492163, "grad_norm": 1.0123058653304378, "learning_rate": 1.4937997390705751e-05, "loss": 0.1362, "step": 10209 }, { "epoch": 2.1337513061650992, "grad_norm": 0.9052677198016214, "learning_rate": 1.493701637232609e-05, "loss": 0.1111, "step": 10210 }, { "epoch": 2.133960292580982, "grad_norm": 1.052074517515906, "learning_rate": 1.4936035291115091e-05, "loss": 0.1126, "step": 10211 }, { "epoch": 2.134169278996865, "grad_norm": 0.8187392235142317, "learning_rate": 1.4935054147085245e-05, "loss": 0.1064, "step": 10212 }, { "epoch": 2.134378265412748, "grad_norm": 0.9696167795608549, "learning_rate": 1.4934072940249033e-05, "loss": 0.1153, "step": 10213 }, { "epoch": 2.134587251828631, "grad_norm": 0.9461957292890051, "learning_rate": 1.4933091670618949e-05, "loss": 0.1185, "step": 10214 }, { "epoch": 2.134796238244514, "grad_norm": 0.925165357802819, "learning_rate": 1.4932110338207475e-05, "loss": 0.1161, "step": 10215 }, { "epoch": 2.135005224660397, "grad_norm": 0.9740889444107897, "learning_rate": 1.4931128943027103e-05, "loss": 0.1048, "step": 10216 }, { "epoch": 2.13521421107628, "grad_norm": 1.4156873974620676, "learning_rate": 1.493014748509032e-05, "loss": 0.1126, "step": 10217 }, { "epoch": 2.135423197492163, "grad_norm": 1.0610458464684083, "learning_rate": 1.4929165964409622e-05, "loss": 0.1193, "step": 10218 }, { "epoch": 2.135632183908046, "grad_norm": 1.0073548361386404, "learning_rate": 1.4928184380997495e-05, "loss": 0.1248, "step": 10219 }, { "epoch": 2.135841170323929, "grad_norm": 1.1336477806232776, "learning_rate": 1.4927202734866434e-05, "loss": 0.1285, "step": 10220 }, { "epoch": 2.136050156739812, "grad_norm": 1.102037242117992, "learning_rate": 1.4926221026028933e-05, "loss": 0.1111, "step": 10221 }, { "epoch": 2.1362591431556948, "grad_norm": 1.166956892205562, "learning_rate": 1.4925239254497484e-05, "loss": 0.1268, "step": 10222 }, { "epoch": 2.1364681295715777, "grad_norm": 1.1583851234897253, "learning_rate": 1.4924257420284578e-05, "loss": 0.1376, "step": 10223 }, { "epoch": 2.1366771159874607, "grad_norm": 1.2384659052757223, "learning_rate": 1.4923275523402719e-05, "loss": 0.1368, "step": 10224 }, { "epoch": 2.1368861024033436, "grad_norm": 1.204666515560785, "learning_rate": 1.4922293563864394e-05, "loss": 0.1107, "step": 10225 }, { "epoch": 2.1370950888192266, "grad_norm": 1.2464708821356427, "learning_rate": 1.4921311541682109e-05, "loss": 0.1442, "step": 10226 }, { "epoch": 2.1373040752351096, "grad_norm": 1.09667143915549, "learning_rate": 1.4920329456868355e-05, "loss": 0.1248, "step": 10227 }, { "epoch": 2.1375130616509925, "grad_norm": 1.014996390135851, "learning_rate": 1.4919347309435632e-05, "loss": 0.1177, "step": 10228 }, { "epoch": 2.1377220480668755, "grad_norm": 0.8428034605284862, "learning_rate": 1.4918365099396438e-05, "loss": 0.1042, "step": 10229 }, { "epoch": 2.1379310344827585, "grad_norm": 0.9938486188151109, "learning_rate": 1.491738282676328e-05, "loss": 0.1217, "step": 10230 }, { "epoch": 2.1381400208986414, "grad_norm": 0.793945388031688, "learning_rate": 1.491640049154865e-05, "loss": 0.0896, "step": 10231 }, { "epoch": 2.1383490073145244, "grad_norm": 0.9754896549962475, "learning_rate": 1.4915418093765059e-05, "loss": 0.1179, "step": 10232 }, { "epoch": 2.1385579937304073, "grad_norm": 0.9831999774654329, "learning_rate": 1.4914435633425002e-05, "loss": 0.1183, "step": 10233 }, { "epoch": 2.1387669801462903, "grad_norm": 1.0700765260890426, "learning_rate": 1.4913453110540983e-05, "loss": 0.1257, "step": 10234 }, { "epoch": 2.1389759665621733, "grad_norm": 1.268676324096289, "learning_rate": 1.4912470525125509e-05, "loss": 0.1166, "step": 10235 }, { "epoch": 2.139184952978056, "grad_norm": 1.1839748048193546, "learning_rate": 1.4911487877191087e-05, "loss": 0.1585, "step": 10236 }, { "epoch": 2.1393939393939396, "grad_norm": 1.0373182046551617, "learning_rate": 1.4910505166750216e-05, "loss": 0.1291, "step": 10237 }, { "epoch": 2.1396029258098226, "grad_norm": 1.0746788516843335, "learning_rate": 1.490952239381541e-05, "loss": 0.1228, "step": 10238 }, { "epoch": 2.1398119122257055, "grad_norm": 0.9930888423458104, "learning_rate": 1.490853955839917e-05, "loss": 0.1201, "step": 10239 }, { "epoch": 2.1400208986415885, "grad_norm": 1.148801571146843, "learning_rate": 1.4907556660514006e-05, "loss": 0.1269, "step": 10240 }, { "epoch": 2.1402298850574715, "grad_norm": 1.1740230458253729, "learning_rate": 1.490657370017243e-05, "loss": 0.1356, "step": 10241 }, { "epoch": 2.1404388714733544, "grad_norm": 0.9498576063516371, "learning_rate": 1.490559067738695e-05, "loss": 0.1098, "step": 10242 }, { "epoch": 2.1406478578892374, "grad_norm": 0.9916766415822195, "learning_rate": 1.4904607592170073e-05, "loss": 0.127, "step": 10243 }, { "epoch": 2.1408568443051204, "grad_norm": 0.9969200874228016, "learning_rate": 1.4903624444534317e-05, "loss": 0.1209, "step": 10244 }, { "epoch": 2.1410658307210033, "grad_norm": 1.0239727775525707, "learning_rate": 1.4902641234492189e-05, "loss": 0.1115, "step": 10245 }, { "epoch": 2.1412748171368863, "grad_norm": 1.3310740250038278, "learning_rate": 1.4901657962056202e-05, "loss": 0.1257, "step": 10246 }, { "epoch": 2.1414838035527692, "grad_norm": 1.0862403772914218, "learning_rate": 1.4900674627238872e-05, "loss": 0.1097, "step": 10247 }, { "epoch": 2.141692789968652, "grad_norm": 0.9350351523068683, "learning_rate": 1.4899691230052713e-05, "loss": 0.1112, "step": 10248 }, { "epoch": 2.141901776384535, "grad_norm": 0.9315750675543631, "learning_rate": 1.4898707770510237e-05, "loss": 0.1138, "step": 10249 }, { "epoch": 2.142110762800418, "grad_norm": 1.0593830549632264, "learning_rate": 1.4897724248623966e-05, "loss": 0.1293, "step": 10250 }, { "epoch": 2.142319749216301, "grad_norm": 1.0157241435923285, "learning_rate": 1.4896740664406413e-05, "loss": 0.1268, "step": 10251 }, { "epoch": 2.142528735632184, "grad_norm": 2.866271685863909, "learning_rate": 1.4895757017870096e-05, "loss": 0.1228, "step": 10252 }, { "epoch": 2.142737722048067, "grad_norm": 1.0747182881542152, "learning_rate": 1.4894773309027533e-05, "loss": 0.1399, "step": 10253 }, { "epoch": 2.14294670846395, "grad_norm": 1.0440058736094848, "learning_rate": 1.4893789537891247e-05, "loss": 0.114, "step": 10254 }, { "epoch": 2.143155694879833, "grad_norm": 0.915146051775931, "learning_rate": 1.4892805704473751e-05, "loss": 0.1322, "step": 10255 }, { "epoch": 2.143364681295716, "grad_norm": 1.176828977370397, "learning_rate": 1.4891821808787574e-05, "loss": 0.153, "step": 10256 }, { "epoch": 2.143573667711599, "grad_norm": 0.9508369946303773, "learning_rate": 1.4890837850845232e-05, "loss": 0.1052, "step": 10257 }, { "epoch": 2.143782654127482, "grad_norm": 0.9684435596769609, "learning_rate": 1.4889853830659249e-05, "loss": 0.127, "step": 10258 }, { "epoch": 2.1439916405433648, "grad_norm": 0.8524194848498281, "learning_rate": 1.4888869748242147e-05, "loss": 0.0962, "step": 10259 }, { "epoch": 2.1442006269592477, "grad_norm": 1.0689499258939574, "learning_rate": 1.4887885603606453e-05, "loss": 0.1306, "step": 10260 }, { "epoch": 2.1444096133751307, "grad_norm": 1.2624347354698537, "learning_rate": 1.4886901396764689e-05, "loss": 0.1541, "step": 10261 }, { "epoch": 2.1446185997910137, "grad_norm": 1.1274075293475563, "learning_rate": 1.4885917127729381e-05, "loss": 0.1329, "step": 10262 }, { "epoch": 2.1448275862068966, "grad_norm": 0.9925618440691981, "learning_rate": 1.488493279651306e-05, "loss": 0.1219, "step": 10263 }, { "epoch": 2.1450365726227796, "grad_norm": 1.2192932162047614, "learning_rate": 1.4883948403128243e-05, "loss": 0.1124, "step": 10264 }, { "epoch": 2.1452455590386625, "grad_norm": 0.9289079722894215, "learning_rate": 1.4882963947587468e-05, "loss": 0.1074, "step": 10265 }, { "epoch": 2.1454545454545455, "grad_norm": 1.3035657565700303, "learning_rate": 1.4881979429903261e-05, "loss": 0.1594, "step": 10266 }, { "epoch": 2.1456635318704285, "grad_norm": 1.1156035958030888, "learning_rate": 1.4880994850088147e-05, "loss": 0.1407, "step": 10267 }, { "epoch": 2.1458725182863114, "grad_norm": 1.1609670977053481, "learning_rate": 1.4880010208154663e-05, "loss": 0.1481, "step": 10268 }, { "epoch": 2.1460815047021944, "grad_norm": 1.010376339681192, "learning_rate": 1.4879025504115334e-05, "loss": 0.1149, "step": 10269 }, { "epoch": 2.1462904911180773, "grad_norm": 0.8515168038215617, "learning_rate": 1.4878040737982695e-05, "loss": 0.1215, "step": 10270 }, { "epoch": 2.1464994775339603, "grad_norm": 1.0931282592434248, "learning_rate": 1.4877055909769282e-05, "loss": 0.1215, "step": 10271 }, { "epoch": 2.1467084639498433, "grad_norm": 1.123400158787263, "learning_rate": 1.4876071019487623e-05, "loss": 0.1367, "step": 10272 }, { "epoch": 2.1469174503657262, "grad_norm": 1.0587552058597618, "learning_rate": 1.4875086067150252e-05, "loss": 0.1271, "step": 10273 }, { "epoch": 2.147126436781609, "grad_norm": 0.8928665158556808, "learning_rate": 1.4874101052769709e-05, "loss": 0.1165, "step": 10274 }, { "epoch": 2.147335423197492, "grad_norm": 1.048682004104735, "learning_rate": 1.4873115976358525e-05, "loss": 0.1506, "step": 10275 }, { "epoch": 2.147544409613375, "grad_norm": 1.0190013049708382, "learning_rate": 1.4872130837929242e-05, "loss": 0.1264, "step": 10276 }, { "epoch": 2.147753396029258, "grad_norm": 1.0204924771561668, "learning_rate": 1.4871145637494393e-05, "loss": 0.1201, "step": 10277 }, { "epoch": 2.147962382445141, "grad_norm": 1.1562124788584285, "learning_rate": 1.4870160375066519e-05, "loss": 0.1547, "step": 10278 }, { "epoch": 2.148171368861024, "grad_norm": 0.8340418400065456, "learning_rate": 1.4869175050658155e-05, "loss": 0.1299, "step": 10279 }, { "epoch": 2.148380355276907, "grad_norm": 0.9384364618386152, "learning_rate": 1.4868189664281844e-05, "loss": 0.1172, "step": 10280 }, { "epoch": 2.14858934169279, "grad_norm": 1.1102708015537017, "learning_rate": 1.4867204215950126e-05, "loss": 0.1534, "step": 10281 }, { "epoch": 2.148798328108673, "grad_norm": 0.870639519709329, "learning_rate": 1.4866218705675541e-05, "loss": 0.1147, "step": 10282 }, { "epoch": 2.149007314524556, "grad_norm": 0.9091816385013392, "learning_rate": 1.4865233133470635e-05, "loss": 0.1062, "step": 10283 }, { "epoch": 2.149216300940439, "grad_norm": 1.108310610855656, "learning_rate": 1.486424749934795e-05, "loss": 0.1229, "step": 10284 }, { "epoch": 2.1494252873563218, "grad_norm": 0.9927028206414469, "learning_rate": 1.4863261803320025e-05, "loss": 0.1242, "step": 10285 }, { "epoch": 2.1496342737722047, "grad_norm": 0.9755822176297556, "learning_rate": 1.4862276045399408e-05, "loss": 0.1138, "step": 10286 }, { "epoch": 2.1498432601880877, "grad_norm": 0.8972918505520774, "learning_rate": 1.4861290225598646e-05, "loss": 0.1056, "step": 10287 }, { "epoch": 2.1500522466039707, "grad_norm": 1.0850556542327814, "learning_rate": 1.4860304343930281e-05, "loss": 0.122, "step": 10288 }, { "epoch": 2.1502612330198536, "grad_norm": 0.9910623085045265, "learning_rate": 1.4859318400406866e-05, "loss": 0.1081, "step": 10289 }, { "epoch": 2.1504702194357366, "grad_norm": 1.0406681072877986, "learning_rate": 1.4858332395040943e-05, "loss": 0.1177, "step": 10290 }, { "epoch": 2.1506792058516195, "grad_norm": 1.0543578379634213, "learning_rate": 1.485734632784506e-05, "loss": 0.1051, "step": 10291 }, { "epoch": 2.1508881922675025, "grad_norm": 1.2629682312706036, "learning_rate": 1.485636019883177e-05, "loss": 0.1269, "step": 10292 }, { "epoch": 2.1510971786833855, "grad_norm": 1.166684084568201, "learning_rate": 1.4855374008013622e-05, "loss": 0.1153, "step": 10293 }, { "epoch": 2.1513061650992684, "grad_norm": 1.073254468690258, "learning_rate": 1.4854387755403165e-05, "loss": 0.1347, "step": 10294 }, { "epoch": 2.1515151515151514, "grad_norm": 0.9750967900722551, "learning_rate": 1.4853401441012956e-05, "loss": 0.1131, "step": 10295 }, { "epoch": 2.1517241379310343, "grad_norm": 1.1777850278797415, "learning_rate": 1.4852415064855539e-05, "loss": 0.1441, "step": 10296 }, { "epoch": 2.1519331243469173, "grad_norm": 1.2678380647233645, "learning_rate": 1.4851428626943474e-05, "loss": 0.1581, "step": 10297 }, { "epoch": 2.1521421107628003, "grad_norm": 1.0966514653765989, "learning_rate": 1.4850442127289314e-05, "loss": 0.1221, "step": 10298 }, { "epoch": 2.1523510971786832, "grad_norm": 0.8632832536064399, "learning_rate": 1.484945556590561e-05, "loss": 0.1145, "step": 10299 }, { "epoch": 2.152560083594566, "grad_norm": 1.0129131209468873, "learning_rate": 1.4848468942804919e-05, "loss": 0.1118, "step": 10300 }, { "epoch": 2.152769070010449, "grad_norm": 1.330470848556573, "learning_rate": 1.48474822579998e-05, "loss": 0.153, "step": 10301 }, { "epoch": 2.152978056426332, "grad_norm": 1.2704479538752398, "learning_rate": 1.484649551150281e-05, "loss": 0.1282, "step": 10302 }, { "epoch": 2.153187042842215, "grad_norm": 0.9635307567862009, "learning_rate": 1.4845508703326504e-05, "loss": 0.1382, "step": 10303 }, { "epoch": 2.153396029258098, "grad_norm": 0.9971871889578015, "learning_rate": 1.4844521833483441e-05, "loss": 0.1295, "step": 10304 }, { "epoch": 2.153605015673981, "grad_norm": 1.158004156829427, "learning_rate": 1.4843534901986182e-05, "loss": 0.1289, "step": 10305 }, { "epoch": 2.153814002089864, "grad_norm": 1.0130105509393694, "learning_rate": 1.4842547908847287e-05, "loss": 0.1017, "step": 10306 }, { "epoch": 2.154022988505747, "grad_norm": 0.9998580031574843, "learning_rate": 1.484156085407932e-05, "loss": 0.1014, "step": 10307 }, { "epoch": 2.15423197492163, "grad_norm": 1.0249197090967215, "learning_rate": 1.4840573737694835e-05, "loss": 0.1321, "step": 10308 }, { "epoch": 2.154440961337513, "grad_norm": 1.031152673075519, "learning_rate": 1.4839586559706401e-05, "loss": 0.1325, "step": 10309 }, { "epoch": 2.154649947753396, "grad_norm": 1.0147020382015464, "learning_rate": 1.4838599320126581e-05, "loss": 0.1203, "step": 10310 }, { "epoch": 2.1548589341692788, "grad_norm": 0.9576039755657431, "learning_rate": 1.4837612018967937e-05, "loss": 0.132, "step": 10311 }, { "epoch": 2.155067920585162, "grad_norm": 1.077596626333427, "learning_rate": 1.4836624656243033e-05, "loss": 0.143, "step": 10312 }, { "epoch": 2.155276907001045, "grad_norm": 0.8884011278680445, "learning_rate": 1.4835637231964438e-05, "loss": 0.1092, "step": 10313 }, { "epoch": 2.155485893416928, "grad_norm": 0.8361900995160266, "learning_rate": 1.4834649746144717e-05, "loss": 0.109, "step": 10314 }, { "epoch": 2.155694879832811, "grad_norm": 1.016805434783844, "learning_rate": 1.483366219879644e-05, "loss": 0.1217, "step": 10315 }, { "epoch": 2.155903866248694, "grad_norm": 0.917627819200025, "learning_rate": 1.483267458993217e-05, "loss": 0.1065, "step": 10316 }, { "epoch": 2.156112852664577, "grad_norm": 0.9806260041124754, "learning_rate": 1.483168691956448e-05, "loss": 0.1133, "step": 10317 }, { "epoch": 2.15632183908046, "grad_norm": 0.9352295333766507, "learning_rate": 1.4830699187705934e-05, "loss": 0.1342, "step": 10318 }, { "epoch": 2.156530825496343, "grad_norm": 0.9891881396335508, "learning_rate": 1.482971139436911e-05, "loss": 0.1214, "step": 10319 }, { "epoch": 2.156739811912226, "grad_norm": 1.0073379325162986, "learning_rate": 1.4828723539566576e-05, "loss": 0.1243, "step": 10320 }, { "epoch": 2.156948798328109, "grad_norm": 0.861117724402724, "learning_rate": 1.4827735623310903e-05, "loss": 0.1196, "step": 10321 }, { "epoch": 2.157157784743992, "grad_norm": 1.135795990834309, "learning_rate": 1.4826747645614665e-05, "loss": 0.1356, "step": 10322 }, { "epoch": 2.1573667711598747, "grad_norm": 1.104368150875285, "learning_rate": 1.4825759606490438e-05, "loss": 0.1248, "step": 10323 }, { "epoch": 2.1575757575757577, "grad_norm": 1.117217125611557, "learning_rate": 1.4824771505950785e-05, "loss": 0.1325, "step": 10324 }, { "epoch": 2.1577847439916407, "grad_norm": 0.9735571509108404, "learning_rate": 1.4823783344008298e-05, "loss": 0.1191, "step": 10325 }, { "epoch": 2.1579937304075236, "grad_norm": 0.9597508392604936, "learning_rate": 1.4822795120675542e-05, "loss": 0.1396, "step": 10326 }, { "epoch": 2.1582027168234066, "grad_norm": 0.9598767329421403, "learning_rate": 1.4821806835965095e-05, "loss": 0.1053, "step": 10327 }, { "epoch": 2.1584117032392895, "grad_norm": 0.9523568553595726, "learning_rate": 1.4820818489889536e-05, "loss": 0.1052, "step": 10328 }, { "epoch": 2.1586206896551725, "grad_norm": 1.060467572003044, "learning_rate": 1.4819830082461446e-05, "loss": 0.139, "step": 10329 }, { "epoch": 2.1588296760710555, "grad_norm": 0.9569242284902, "learning_rate": 1.4818841613693399e-05, "loss": 0.1393, "step": 10330 }, { "epoch": 2.1590386624869384, "grad_norm": 0.881551783236532, "learning_rate": 1.4817853083597977e-05, "loss": 0.1169, "step": 10331 }, { "epoch": 2.1592476489028214, "grad_norm": 1.001684320923296, "learning_rate": 1.481686449218776e-05, "loss": 0.1245, "step": 10332 }, { "epoch": 2.1594566353187044, "grad_norm": 0.9918534568307117, "learning_rate": 1.481587583947533e-05, "loss": 0.1347, "step": 10333 }, { "epoch": 2.1596656217345873, "grad_norm": 1.0493585073632332, "learning_rate": 1.481488712547327e-05, "loss": 0.1298, "step": 10334 }, { "epoch": 2.1598746081504703, "grad_norm": 0.9836347885263546, "learning_rate": 1.4813898350194163e-05, "loss": 0.0968, "step": 10335 }, { "epoch": 2.1600835945663532, "grad_norm": 0.9292582374418313, "learning_rate": 1.4812909513650586e-05, "loss": 0.1311, "step": 10336 }, { "epoch": 2.160292580982236, "grad_norm": 1.1407995336057422, "learning_rate": 1.4811920615855136e-05, "loss": 0.1287, "step": 10337 }, { "epoch": 2.160501567398119, "grad_norm": 0.9481531600324222, "learning_rate": 1.4810931656820388e-05, "loss": 0.127, "step": 10338 }, { "epoch": 2.160710553814002, "grad_norm": 0.9651801145320963, "learning_rate": 1.4809942636558935e-05, "loss": 0.1239, "step": 10339 }, { "epoch": 2.160919540229885, "grad_norm": 0.9918745581839454, "learning_rate": 1.4808953555083357e-05, "loss": 0.1199, "step": 10340 }, { "epoch": 2.161128526645768, "grad_norm": 1.1983492041173336, "learning_rate": 1.4807964412406247e-05, "loss": 0.1326, "step": 10341 }, { "epoch": 2.161337513061651, "grad_norm": 0.9354244948409337, "learning_rate": 1.4806975208540188e-05, "loss": 0.104, "step": 10342 }, { "epoch": 2.161546499477534, "grad_norm": 0.9613023746019066, "learning_rate": 1.4805985943497778e-05, "loss": 0.1411, "step": 10343 }, { "epoch": 2.161755485893417, "grad_norm": 0.9210483689089835, "learning_rate": 1.4804996617291598e-05, "loss": 0.1211, "step": 10344 }, { "epoch": 2.1619644723093, "grad_norm": 1.0429963750168505, "learning_rate": 1.4804007229934243e-05, "loss": 0.1257, "step": 10345 }, { "epoch": 2.162173458725183, "grad_norm": 4.608583921020861, "learning_rate": 1.4803017781438306e-05, "loss": 0.1183, "step": 10346 }, { "epoch": 2.162382445141066, "grad_norm": 0.9865825310725276, "learning_rate": 1.4802028271816376e-05, "loss": 0.1098, "step": 10347 }, { "epoch": 2.1625914315569488, "grad_norm": 1.1250038753057334, "learning_rate": 1.4801038701081046e-05, "loss": 0.1258, "step": 10348 }, { "epoch": 2.1628004179728317, "grad_norm": 1.0631021100849989, "learning_rate": 1.4800049069244913e-05, "loss": 0.128, "step": 10349 }, { "epoch": 2.1630094043887147, "grad_norm": 1.0077507226435145, "learning_rate": 1.4799059376320568e-05, "loss": 0.129, "step": 10350 }, { "epoch": 2.1632183908045977, "grad_norm": 0.9677289914369261, "learning_rate": 1.4798069622320607e-05, "loss": 0.1208, "step": 10351 }, { "epoch": 2.1634273772204806, "grad_norm": 1.0498605094681845, "learning_rate": 1.4797079807257632e-05, "loss": 0.1339, "step": 10352 }, { "epoch": 2.1636363636363636, "grad_norm": 1.2936073700349442, "learning_rate": 1.479608993114423e-05, "loss": 0.1396, "step": 10353 }, { "epoch": 2.1638453500522465, "grad_norm": 1.1137032845767367, "learning_rate": 1.479509999399301e-05, "loss": 0.1448, "step": 10354 }, { "epoch": 2.1640543364681295, "grad_norm": 1.0347694934960117, "learning_rate": 1.4794109995816559e-05, "loss": 0.1186, "step": 10355 }, { "epoch": 2.1642633228840125, "grad_norm": 0.8896544175118732, "learning_rate": 1.4793119936627484e-05, "loss": 0.1025, "step": 10356 }, { "epoch": 2.1644723092998954, "grad_norm": 0.9010103013366942, "learning_rate": 1.4792129816438383e-05, "loss": 0.1161, "step": 10357 }, { "epoch": 2.1646812957157784, "grad_norm": 0.9950267792106101, "learning_rate": 1.4791139635261857e-05, "loss": 0.1126, "step": 10358 }, { "epoch": 2.1648902821316613, "grad_norm": 1.1149470718563153, "learning_rate": 1.4790149393110506e-05, "loss": 0.1589, "step": 10359 }, { "epoch": 2.1650992685475443, "grad_norm": 1.0482783433533107, "learning_rate": 1.4789159089996936e-05, "loss": 0.1282, "step": 10360 }, { "epoch": 2.1653082549634273, "grad_norm": 1.126560135352089, "learning_rate": 1.4788168725933747e-05, "loss": 0.1285, "step": 10361 }, { "epoch": 2.1655172413793102, "grad_norm": 0.923373420529825, "learning_rate": 1.4787178300933543e-05, "loss": 0.1092, "step": 10362 }, { "epoch": 2.165726227795193, "grad_norm": 1.3079548249079553, "learning_rate": 1.478618781500893e-05, "loss": 0.1247, "step": 10363 }, { "epoch": 2.165935214211076, "grad_norm": 1.0360731665362142, "learning_rate": 1.4785197268172515e-05, "loss": 0.143, "step": 10364 }, { "epoch": 2.166144200626959, "grad_norm": 0.9430520841891215, "learning_rate": 1.47842066604369e-05, "loss": 0.1027, "step": 10365 }, { "epoch": 2.166353187042842, "grad_norm": 0.9878544709418258, "learning_rate": 1.4783215991814697e-05, "loss": 0.1096, "step": 10366 }, { "epoch": 2.166562173458725, "grad_norm": 0.9800275173774617, "learning_rate": 1.4782225262318508e-05, "loss": 0.1528, "step": 10367 }, { "epoch": 2.166771159874608, "grad_norm": 1.0091034355637736, "learning_rate": 1.4781234471960947e-05, "loss": 0.0968, "step": 10368 }, { "epoch": 2.166980146290491, "grad_norm": 1.0952408655399573, "learning_rate": 1.478024362075462e-05, "loss": 0.1156, "step": 10369 }, { "epoch": 2.167189132706374, "grad_norm": 1.1059385963509651, "learning_rate": 1.477925270871214e-05, "loss": 0.1086, "step": 10370 }, { "epoch": 2.167398119122257, "grad_norm": 1.1213114958514698, "learning_rate": 1.4778261735846116e-05, "loss": 0.1085, "step": 10371 }, { "epoch": 2.16760710553814, "grad_norm": 1.081762212989437, "learning_rate": 1.4777270702169157e-05, "loss": 0.1508, "step": 10372 }, { "epoch": 2.167816091954023, "grad_norm": 1.0698029046161592, "learning_rate": 1.4776279607693881e-05, "loss": 0.1346, "step": 10373 }, { "epoch": 2.1680250783699058, "grad_norm": 0.9298968699554666, "learning_rate": 1.47752884524329e-05, "loss": 0.1348, "step": 10374 }, { "epoch": 2.1682340647857887, "grad_norm": 0.8241078307337623, "learning_rate": 1.4774297236398822e-05, "loss": 0.0999, "step": 10375 }, { "epoch": 2.1684430512016717, "grad_norm": 1.0703035098645644, "learning_rate": 1.4773305959604273e-05, "loss": 0.1198, "step": 10376 }, { "epoch": 2.1686520376175547, "grad_norm": 0.9613640095183628, "learning_rate": 1.4772314622061858e-05, "loss": 0.1426, "step": 10377 }, { "epoch": 2.168861024033438, "grad_norm": 1.0440828204096364, "learning_rate": 1.4771323223784197e-05, "loss": 0.1747, "step": 10378 }, { "epoch": 2.169070010449321, "grad_norm": 1.014513690869398, "learning_rate": 1.477033176478391e-05, "loss": 0.1295, "step": 10379 }, { "epoch": 2.169278996865204, "grad_norm": 1.0547625052353826, "learning_rate": 1.4769340245073613e-05, "loss": 0.12, "step": 10380 }, { "epoch": 2.169487983281087, "grad_norm": 1.0845953438426705, "learning_rate": 1.4768348664665922e-05, "loss": 0.1149, "step": 10381 }, { "epoch": 2.16969696969697, "grad_norm": 1.2072813846905954, "learning_rate": 1.4767357023573458e-05, "loss": 0.123, "step": 10382 }, { "epoch": 2.169905956112853, "grad_norm": 0.9989304282543559, "learning_rate": 1.4766365321808842e-05, "loss": 0.1191, "step": 10383 }, { "epoch": 2.170114942528736, "grad_norm": 1.0105238666128264, "learning_rate": 1.4765373559384694e-05, "loss": 0.1054, "step": 10384 }, { "epoch": 2.170323928944619, "grad_norm": 0.9369338427671087, "learning_rate": 1.476438173631364e-05, "loss": 0.099, "step": 10385 }, { "epoch": 2.1705329153605017, "grad_norm": 1.2039545335628634, "learning_rate": 1.4763389852608296e-05, "loss": 0.1345, "step": 10386 }, { "epoch": 2.1707419017763847, "grad_norm": 1.0552496033811973, "learning_rate": 1.4762397908281288e-05, "loss": 0.1415, "step": 10387 }, { "epoch": 2.1709508881922677, "grad_norm": 0.894319004585564, "learning_rate": 1.4761405903345244e-05, "loss": 0.0969, "step": 10388 }, { "epoch": 2.1711598746081506, "grad_norm": 0.9689274820845583, "learning_rate": 1.476041383781278e-05, "loss": 0.0969, "step": 10389 }, { "epoch": 2.1713688610240336, "grad_norm": 0.9438930916043847, "learning_rate": 1.4759421711696529e-05, "loss": 0.1082, "step": 10390 }, { "epoch": 2.1715778474399166, "grad_norm": 1.0880918780936004, "learning_rate": 1.4758429525009115e-05, "loss": 0.1101, "step": 10391 }, { "epoch": 2.1717868338557995, "grad_norm": 1.0092439140091582, "learning_rate": 1.4757437277763167e-05, "loss": 0.1217, "step": 10392 }, { "epoch": 2.1719958202716825, "grad_norm": 1.080270871458307, "learning_rate": 1.4756444969971306e-05, "loss": 0.1568, "step": 10393 }, { "epoch": 2.1722048066875654, "grad_norm": 1.1717287283103877, "learning_rate": 1.4755452601646173e-05, "loss": 0.1283, "step": 10394 }, { "epoch": 2.1724137931034484, "grad_norm": 0.9642589578594714, "learning_rate": 1.4754460172800384e-05, "loss": 0.1198, "step": 10395 }, { "epoch": 2.1726227795193314, "grad_norm": 1.2383351767155601, "learning_rate": 1.4753467683446579e-05, "loss": 0.1169, "step": 10396 }, { "epoch": 2.1728317659352143, "grad_norm": 1.0533814116013491, "learning_rate": 1.4752475133597384e-05, "loss": 0.1284, "step": 10397 }, { "epoch": 2.1730407523510973, "grad_norm": 0.9978038062398722, "learning_rate": 1.4751482523265432e-05, "loss": 0.1281, "step": 10398 }, { "epoch": 2.1732497387669802, "grad_norm": 0.9863480321576057, "learning_rate": 1.4750489852463354e-05, "loss": 0.116, "step": 10399 }, { "epoch": 2.173458725182863, "grad_norm": 1.0647193966250221, "learning_rate": 1.4749497121203791e-05, "loss": 0.1313, "step": 10400 }, { "epoch": 2.173667711598746, "grad_norm": 0.8541533002210995, "learning_rate": 1.4748504329499366e-05, "loss": 0.1043, "step": 10401 }, { "epoch": 2.173876698014629, "grad_norm": 1.1506595506008865, "learning_rate": 1.4747511477362721e-05, "loss": 0.1407, "step": 10402 }, { "epoch": 2.174085684430512, "grad_norm": 0.8884021350562613, "learning_rate": 1.4746518564806488e-05, "loss": 0.119, "step": 10403 }, { "epoch": 2.174294670846395, "grad_norm": 0.9574375925579318, "learning_rate": 1.4745525591843311e-05, "loss": 0.1199, "step": 10404 }, { "epoch": 2.174503657262278, "grad_norm": 1.0654565826799312, "learning_rate": 1.4744532558485814e-05, "loss": 0.1301, "step": 10405 }, { "epoch": 2.174712643678161, "grad_norm": 0.9667797188291593, "learning_rate": 1.4743539464746646e-05, "loss": 0.1169, "step": 10406 }, { "epoch": 2.174921630094044, "grad_norm": 1.0101447193708584, "learning_rate": 1.4742546310638441e-05, "loss": 0.1222, "step": 10407 }, { "epoch": 2.175130616509927, "grad_norm": 0.992198983656233, "learning_rate": 1.4741553096173838e-05, "loss": 0.1186, "step": 10408 }, { "epoch": 2.17533960292581, "grad_norm": 1.0009440216023802, "learning_rate": 1.474055982136548e-05, "loss": 0.1163, "step": 10409 }, { "epoch": 2.175548589341693, "grad_norm": 0.8777014152196192, "learning_rate": 1.4739566486226006e-05, "loss": 0.1242, "step": 10410 }, { "epoch": 2.175757575757576, "grad_norm": 1.0044148916329634, "learning_rate": 1.4738573090768055e-05, "loss": 0.1123, "step": 10411 }, { "epoch": 2.1759665621734587, "grad_norm": 0.9897698470507784, "learning_rate": 1.4737579635004278e-05, "loss": 0.1292, "step": 10412 }, { "epoch": 2.1761755485893417, "grad_norm": 1.0411325739396446, "learning_rate": 1.4736586118947311e-05, "loss": 0.1091, "step": 10413 }, { "epoch": 2.1763845350052247, "grad_norm": 1.100400651862681, "learning_rate": 1.4735592542609797e-05, "loss": 0.1333, "step": 10414 }, { "epoch": 2.1765935214211076, "grad_norm": 1.1977735610767641, "learning_rate": 1.473459890600439e-05, "loss": 0.1113, "step": 10415 }, { "epoch": 2.1768025078369906, "grad_norm": 1.0024579978262123, "learning_rate": 1.4733605209143727e-05, "loss": 0.1215, "step": 10416 }, { "epoch": 2.1770114942528735, "grad_norm": 0.8740188431125916, "learning_rate": 1.4732611452040452e-05, "loss": 0.1052, "step": 10417 }, { "epoch": 2.1772204806687565, "grad_norm": 0.7687064842374361, "learning_rate": 1.4731617634707225e-05, "loss": 0.0934, "step": 10418 }, { "epoch": 2.1774294670846395, "grad_norm": 0.9037850828388334, "learning_rate": 1.473062375715668e-05, "loss": 0.1158, "step": 10419 }, { "epoch": 2.1776384535005224, "grad_norm": 1.140617624450767, "learning_rate": 1.4729629819401474e-05, "loss": 0.1268, "step": 10420 }, { "epoch": 2.1778474399164054, "grad_norm": 0.9700349068463419, "learning_rate": 1.4728635821454255e-05, "loss": 0.108, "step": 10421 }, { "epoch": 2.1780564263322884, "grad_norm": 1.0090995687996362, "learning_rate": 1.4727641763327669e-05, "loss": 0.1127, "step": 10422 }, { "epoch": 2.1782654127481713, "grad_norm": 1.0195849424264385, "learning_rate": 1.4726647645034374e-05, "loss": 0.1149, "step": 10423 }, { "epoch": 2.1784743991640543, "grad_norm": 1.2462506778127609, "learning_rate": 1.4725653466587013e-05, "loss": 0.1372, "step": 10424 }, { "epoch": 2.1786833855799372, "grad_norm": 0.9005810157169354, "learning_rate": 1.4724659227998248e-05, "loss": 0.1109, "step": 10425 }, { "epoch": 2.17889237199582, "grad_norm": 0.8504878544173251, "learning_rate": 1.4723664929280725e-05, "loss": 0.0906, "step": 10426 }, { "epoch": 2.179101358411703, "grad_norm": 1.2018126628343997, "learning_rate": 1.4722670570447103e-05, "loss": 0.1344, "step": 10427 }, { "epoch": 2.179310344827586, "grad_norm": 0.9879472429008002, "learning_rate": 1.4721676151510035e-05, "loss": 0.0962, "step": 10428 }, { "epoch": 2.179519331243469, "grad_norm": 1.1202102753634722, "learning_rate": 1.4720681672482172e-05, "loss": 0.1204, "step": 10429 }, { "epoch": 2.179728317659352, "grad_norm": 0.9526875466093313, "learning_rate": 1.4719687133376175e-05, "loss": 0.1091, "step": 10430 }, { "epoch": 2.179937304075235, "grad_norm": 1.0117093194911921, "learning_rate": 1.4718692534204701e-05, "loss": 0.1184, "step": 10431 }, { "epoch": 2.180146290491118, "grad_norm": 1.049917275404013, "learning_rate": 1.4717697874980408e-05, "loss": 0.1219, "step": 10432 }, { "epoch": 2.180355276907001, "grad_norm": 1.0979428437589225, "learning_rate": 1.4716703155715954e-05, "loss": 0.1254, "step": 10433 }, { "epoch": 2.180564263322884, "grad_norm": 0.8750470134037065, "learning_rate": 1.4715708376423999e-05, "loss": 0.1034, "step": 10434 }, { "epoch": 2.180773249738767, "grad_norm": 1.036328926329322, "learning_rate": 1.47147135371172e-05, "loss": 0.1363, "step": 10435 }, { "epoch": 2.18098223615465, "grad_norm": 1.0269816843458475, "learning_rate": 1.471371863780822e-05, "loss": 0.1222, "step": 10436 }, { "epoch": 2.1811912225705328, "grad_norm": 1.045623637686294, "learning_rate": 1.4712723678509725e-05, "loss": 0.127, "step": 10437 }, { "epoch": 2.1814002089864157, "grad_norm": 0.9039896499293659, "learning_rate": 1.4711728659234367e-05, "loss": 0.0945, "step": 10438 }, { "epoch": 2.1816091954022987, "grad_norm": 1.1284498510703302, "learning_rate": 1.4710733579994823e-05, "loss": 0.0978, "step": 10439 }, { "epoch": 2.1818181818181817, "grad_norm": 1.0759971562410224, "learning_rate": 1.4709738440803745e-05, "loss": 0.1226, "step": 10440 }, { "epoch": 2.1820271682340646, "grad_norm": 1.1647242550768495, "learning_rate": 1.4708743241673804e-05, "loss": 0.1298, "step": 10441 }, { "epoch": 2.1822361546499476, "grad_norm": 0.9708017102820233, "learning_rate": 1.4707747982617662e-05, "loss": 0.1122, "step": 10442 }, { "epoch": 2.1824451410658305, "grad_norm": 1.072523700833897, "learning_rate": 1.470675266364799e-05, "loss": 0.1363, "step": 10443 }, { "epoch": 2.1826541274817135, "grad_norm": 1.0890033061485351, "learning_rate": 1.4705757284777447e-05, "loss": 0.1442, "step": 10444 }, { "epoch": 2.1828631138975965, "grad_norm": 1.0847082716431866, "learning_rate": 1.4704761846018709e-05, "loss": 0.11, "step": 10445 }, { "epoch": 2.1830721003134794, "grad_norm": 1.0653448717550453, "learning_rate": 1.4703766347384441e-05, "loss": 0.1018, "step": 10446 }, { "epoch": 2.1832810867293624, "grad_norm": 0.8776605570418649, "learning_rate": 1.470277078888731e-05, "loss": 0.1099, "step": 10447 }, { "epoch": 2.1834900731452453, "grad_norm": 0.7688548531697665, "learning_rate": 1.4701775170539991e-05, "loss": 0.0894, "step": 10448 }, { "epoch": 2.1836990595611283, "grad_norm": 1.0582228232820594, "learning_rate": 1.4700779492355155e-05, "loss": 0.1378, "step": 10449 }, { "epoch": 2.1839080459770113, "grad_norm": 1.063078346765766, "learning_rate": 1.4699783754345465e-05, "loss": 0.118, "step": 10450 }, { "epoch": 2.1841170323928942, "grad_norm": 1.0192296483980428, "learning_rate": 1.4698787956523606e-05, "loss": 0.1365, "step": 10451 }, { "epoch": 2.1843260188087776, "grad_norm": 0.9509590110146167, "learning_rate": 1.469779209890224e-05, "loss": 0.0988, "step": 10452 }, { "epoch": 2.1845350052246606, "grad_norm": 1.0591772273388433, "learning_rate": 1.4696796181494046e-05, "loss": 0.1336, "step": 10453 }, { "epoch": 2.1847439916405436, "grad_norm": 1.0314882848627989, "learning_rate": 1.4695800204311698e-05, "loss": 0.1324, "step": 10454 }, { "epoch": 2.1849529780564265, "grad_norm": 0.8324480617492842, "learning_rate": 1.4694804167367874e-05, "loss": 0.1081, "step": 10455 }, { "epoch": 2.1851619644723095, "grad_norm": 1.1355281015474972, "learning_rate": 1.4693808070675242e-05, "loss": 0.1376, "step": 10456 }, { "epoch": 2.1853709508881924, "grad_norm": 0.9497361513043783, "learning_rate": 1.4692811914246488e-05, "loss": 0.1311, "step": 10457 }, { "epoch": 2.1855799373040754, "grad_norm": 0.9146272602721162, "learning_rate": 1.4691815698094285e-05, "loss": 0.1306, "step": 10458 }, { "epoch": 2.1857889237199584, "grad_norm": 1.0436521165218164, "learning_rate": 1.4690819422231313e-05, "loss": 0.1312, "step": 10459 }, { "epoch": 2.1859979101358413, "grad_norm": 1.0503803273287113, "learning_rate": 1.468982308667025e-05, "loss": 0.1227, "step": 10460 }, { "epoch": 2.1862068965517243, "grad_norm": 0.9288498435421274, "learning_rate": 1.468882669142378e-05, "loss": 0.1082, "step": 10461 }, { "epoch": 2.1864158829676072, "grad_norm": 0.9187786816453339, "learning_rate": 1.4687830236504574e-05, "loss": 0.111, "step": 10462 }, { "epoch": 2.18662486938349, "grad_norm": 1.0914721688380515, "learning_rate": 1.4686833721925324e-05, "loss": 0.1319, "step": 10463 }, { "epoch": 2.186833855799373, "grad_norm": 1.1179406916462074, "learning_rate": 1.4685837147698708e-05, "loss": 0.1328, "step": 10464 }, { "epoch": 2.187042842215256, "grad_norm": 0.8875032927415764, "learning_rate": 1.4684840513837408e-05, "loss": 0.1325, "step": 10465 }, { "epoch": 2.187251828631139, "grad_norm": 1.0202153071689315, "learning_rate": 1.4683843820354109e-05, "loss": 0.1291, "step": 10466 }, { "epoch": 2.187460815047022, "grad_norm": 0.9717714321537011, "learning_rate": 1.4682847067261493e-05, "loss": 0.128, "step": 10467 }, { "epoch": 2.187669801462905, "grad_norm": 0.9880644894358451, "learning_rate": 1.468185025457225e-05, "loss": 0.1354, "step": 10468 }, { "epoch": 2.187878787878788, "grad_norm": 0.9314979018398194, "learning_rate": 1.4680853382299063e-05, "loss": 0.1143, "step": 10469 }, { "epoch": 2.188087774294671, "grad_norm": 1.168439623392553, "learning_rate": 1.467985645045462e-05, "loss": 0.1405, "step": 10470 }, { "epoch": 2.188296760710554, "grad_norm": 1.017958054156336, "learning_rate": 1.4678859459051607e-05, "loss": 0.1432, "step": 10471 }, { "epoch": 2.188505747126437, "grad_norm": 1.001148124374282, "learning_rate": 1.4677862408102712e-05, "loss": 0.108, "step": 10472 }, { "epoch": 2.18871473354232, "grad_norm": 0.9789386006720078, "learning_rate": 1.4676865297620628e-05, "loss": 0.1293, "step": 10473 }, { "epoch": 2.188923719958203, "grad_norm": 0.972934061977262, "learning_rate": 1.4675868127618038e-05, "loss": 0.1356, "step": 10474 }, { "epoch": 2.1891327063740857, "grad_norm": 0.9189301120161113, "learning_rate": 1.467487089810764e-05, "loss": 0.1111, "step": 10475 }, { "epoch": 2.1893416927899687, "grad_norm": 0.903705792839469, "learning_rate": 1.467387360910212e-05, "loss": 0.1257, "step": 10476 }, { "epoch": 2.1895506792058517, "grad_norm": 1.0036455268044038, "learning_rate": 1.4672876260614171e-05, "loss": 0.1318, "step": 10477 }, { "epoch": 2.1897596656217346, "grad_norm": 0.747682502697724, "learning_rate": 1.467187885265649e-05, "loss": 0.0842, "step": 10478 }, { "epoch": 2.1899686520376176, "grad_norm": 0.8711262301431727, "learning_rate": 1.4670881385241764e-05, "loss": 0.1113, "step": 10479 }, { "epoch": 2.1901776384535006, "grad_norm": 1.0391216724693533, "learning_rate": 1.4669883858382689e-05, "loss": 0.1398, "step": 10480 }, { "epoch": 2.1903866248693835, "grad_norm": 1.1675273172644172, "learning_rate": 1.4668886272091966e-05, "loss": 0.1078, "step": 10481 }, { "epoch": 2.1905956112852665, "grad_norm": 1.1146782826547101, "learning_rate": 1.4667888626382287e-05, "loss": 0.1204, "step": 10482 }, { "epoch": 2.1908045977011494, "grad_norm": 1.041998217860434, "learning_rate": 1.4666890921266345e-05, "loss": 0.1134, "step": 10483 }, { "epoch": 2.1910135841170324, "grad_norm": 1.1170672686341814, "learning_rate": 1.4665893156756842e-05, "loss": 0.1508, "step": 10484 }, { "epoch": 2.1912225705329154, "grad_norm": 1.026055282818921, "learning_rate": 1.4664895332866478e-05, "loss": 0.13, "step": 10485 }, { "epoch": 2.1914315569487983, "grad_norm": 1.0996304268228738, "learning_rate": 1.4663897449607942e-05, "loss": 0.1232, "step": 10486 }, { "epoch": 2.1916405433646813, "grad_norm": 0.95639336145891, "learning_rate": 1.4662899506993946e-05, "loss": 0.1202, "step": 10487 }, { "epoch": 2.1918495297805642, "grad_norm": 0.9074018859580841, "learning_rate": 1.4661901505037181e-05, "loss": 0.1138, "step": 10488 }, { "epoch": 2.192058516196447, "grad_norm": 1.0020062626896395, "learning_rate": 1.4660903443750355e-05, "loss": 0.1214, "step": 10489 }, { "epoch": 2.19226750261233, "grad_norm": 1.2144627503525958, "learning_rate": 1.4659905323146165e-05, "loss": 0.1325, "step": 10490 }, { "epoch": 2.192476489028213, "grad_norm": 1.2173679585752784, "learning_rate": 1.4658907143237315e-05, "loss": 0.1484, "step": 10491 }, { "epoch": 2.192685475444096, "grad_norm": 1.1018718292702816, "learning_rate": 1.465790890403651e-05, "loss": 0.1009, "step": 10492 }, { "epoch": 2.192894461859979, "grad_norm": 1.1519029702382453, "learning_rate": 1.4656910605556452e-05, "loss": 0.1167, "step": 10493 }, { "epoch": 2.193103448275862, "grad_norm": 0.9022405630091026, "learning_rate": 1.4655912247809847e-05, "loss": 0.107, "step": 10494 }, { "epoch": 2.193312434691745, "grad_norm": 1.0004564476133093, "learning_rate": 1.4654913830809401e-05, "loss": 0.1247, "step": 10495 }, { "epoch": 2.193521421107628, "grad_norm": 0.9686208816335659, "learning_rate": 1.465391535456782e-05, "loss": 0.1251, "step": 10496 }, { "epoch": 2.193730407523511, "grad_norm": 1.1389752809654805, "learning_rate": 1.4652916819097811e-05, "loss": 0.1292, "step": 10497 }, { "epoch": 2.193939393939394, "grad_norm": 1.0117234615576067, "learning_rate": 1.4651918224412083e-05, "loss": 0.1291, "step": 10498 }, { "epoch": 2.194148380355277, "grad_norm": 1.022508100743751, "learning_rate": 1.465091957052334e-05, "loss": 0.1183, "step": 10499 }, { "epoch": 2.19435736677116, "grad_norm": 1.1748948368843584, "learning_rate": 1.46499208574443e-05, "loss": 0.1242, "step": 10500 }, { "epoch": 2.1945663531870427, "grad_norm": 0.8809594339378782, "learning_rate": 1.4648922085187667e-05, "loss": 0.1112, "step": 10501 }, { "epoch": 2.1947753396029257, "grad_norm": 0.8824252010745122, "learning_rate": 1.4647923253766154e-05, "loss": 0.0811, "step": 10502 }, { "epoch": 2.1949843260188087, "grad_norm": 1.3178556275210433, "learning_rate": 1.4646924363192472e-05, "loss": 0.148, "step": 10503 }, { "epoch": 2.1951933124346916, "grad_norm": 1.0117782328380514, "learning_rate": 1.4645925413479335e-05, "loss": 0.1346, "step": 10504 }, { "epoch": 2.1954022988505746, "grad_norm": 1.1170071784341065, "learning_rate": 1.464492640463945e-05, "loss": 0.1031, "step": 10505 }, { "epoch": 2.1956112852664575, "grad_norm": 0.8893904951917961, "learning_rate": 1.4643927336685542e-05, "loss": 0.1163, "step": 10506 }, { "epoch": 2.1958202716823405, "grad_norm": 1.008048556151717, "learning_rate": 1.4642928209630314e-05, "loss": 0.134, "step": 10507 }, { "epoch": 2.1960292580982235, "grad_norm": 0.9560490174311492, "learning_rate": 1.464192902348649e-05, "loss": 0.1126, "step": 10508 }, { "epoch": 2.1962382445141064, "grad_norm": 0.8788149087376481, "learning_rate": 1.464092977826678e-05, "loss": 0.0999, "step": 10509 }, { "epoch": 2.1964472309299894, "grad_norm": 0.9569550898759543, "learning_rate": 1.463993047398391e-05, "loss": 0.1202, "step": 10510 }, { "epoch": 2.1966562173458724, "grad_norm": 1.0704779233802155, "learning_rate": 1.4638931110650587e-05, "loss": 0.1171, "step": 10511 }, { "epoch": 2.1968652037617553, "grad_norm": 0.9112547086113834, "learning_rate": 1.463793168827954e-05, "loss": 0.1375, "step": 10512 }, { "epoch": 2.1970741901776383, "grad_norm": 0.9723200060636195, "learning_rate": 1.4636932206883474e-05, "loss": 0.1171, "step": 10513 }, { "epoch": 2.1972831765935212, "grad_norm": 0.9956039747354197, "learning_rate": 1.4635932666475126e-05, "loss": 0.114, "step": 10514 }, { "epoch": 2.197492163009404, "grad_norm": 1.1808174882008422, "learning_rate": 1.4634933067067203e-05, "loss": 0.1221, "step": 10515 }, { "epoch": 2.197701149425287, "grad_norm": 1.013130424322475, "learning_rate": 1.4633933408672434e-05, "loss": 0.1097, "step": 10516 }, { "epoch": 2.19791013584117, "grad_norm": 1.0530695266575578, "learning_rate": 1.4632933691303536e-05, "loss": 0.1163, "step": 10517 }, { "epoch": 2.1981191222570535, "grad_norm": 1.0375711510639398, "learning_rate": 1.463193391497324e-05, "loss": 0.1205, "step": 10518 }, { "epoch": 2.1983281086729365, "grad_norm": 0.950232272173691, "learning_rate": 1.463093407969426e-05, "loss": 0.1234, "step": 10519 }, { "epoch": 2.1985370950888194, "grad_norm": 0.9387062095591581, "learning_rate": 1.462993418547933e-05, "loss": 0.1321, "step": 10520 }, { "epoch": 2.1987460815047024, "grad_norm": 0.9339671314079057, "learning_rate": 1.4628934232341167e-05, "loss": 0.1168, "step": 10521 }, { "epoch": 2.1989550679205854, "grad_norm": 0.986027087160947, "learning_rate": 1.4627934220292501e-05, "loss": 0.1327, "step": 10522 }, { "epoch": 2.1991640543364683, "grad_norm": 0.8897549558340058, "learning_rate": 1.462693414934606e-05, "loss": 0.119, "step": 10523 }, { "epoch": 2.1993730407523513, "grad_norm": 0.9687296949904276, "learning_rate": 1.462593401951457e-05, "loss": 0.1176, "step": 10524 }, { "epoch": 2.1995820271682343, "grad_norm": 0.7964675354393648, "learning_rate": 1.4624933830810755e-05, "loss": 0.0984, "step": 10525 }, { "epoch": 2.199791013584117, "grad_norm": 1.084352293012884, "learning_rate": 1.4623933583247352e-05, "loss": 0.1157, "step": 10526 }, { "epoch": 2.2, "grad_norm": 0.9912785385169723, "learning_rate": 1.4622933276837083e-05, "loss": 0.1258, "step": 10527 }, { "epoch": 2.200208986415883, "grad_norm": 0.9924064716257808, "learning_rate": 1.4621932911592684e-05, "loss": 0.1146, "step": 10528 }, { "epoch": 2.200417972831766, "grad_norm": 1.1842387378781831, "learning_rate": 1.4620932487526887e-05, "loss": 0.1472, "step": 10529 }, { "epoch": 2.200626959247649, "grad_norm": 0.9600057158674221, "learning_rate": 1.461993200465242e-05, "loss": 0.1009, "step": 10530 }, { "epoch": 2.200835945663532, "grad_norm": 1.1395308816072125, "learning_rate": 1.4618931462982012e-05, "loss": 0.124, "step": 10531 }, { "epoch": 2.201044932079415, "grad_norm": 1.1561432257564561, "learning_rate": 1.4617930862528407e-05, "loss": 0.1344, "step": 10532 }, { "epoch": 2.201253918495298, "grad_norm": 1.0290918078280804, "learning_rate": 1.461693020330433e-05, "loss": 0.0942, "step": 10533 }, { "epoch": 2.201462904911181, "grad_norm": 1.1350625433895225, "learning_rate": 1.4615929485322524e-05, "loss": 0.1216, "step": 10534 }, { "epoch": 2.201671891327064, "grad_norm": 1.3250040854466674, "learning_rate": 1.4614928708595717e-05, "loss": 0.166, "step": 10535 }, { "epoch": 2.201880877742947, "grad_norm": 1.0307404317654547, "learning_rate": 1.4613927873136652e-05, "loss": 0.1084, "step": 10536 }, { "epoch": 2.20208986415883, "grad_norm": 1.2871520003887789, "learning_rate": 1.461292697895806e-05, "loss": 0.1246, "step": 10537 }, { "epoch": 2.2022988505747128, "grad_norm": 1.2141842589503538, "learning_rate": 1.4611926026072685e-05, "loss": 0.1225, "step": 10538 }, { "epoch": 2.2025078369905957, "grad_norm": 1.1768830176962772, "learning_rate": 1.461092501449326e-05, "loss": 0.1377, "step": 10539 }, { "epoch": 2.2027168234064787, "grad_norm": 0.9384808941697613, "learning_rate": 1.460992394423253e-05, "loss": 0.1103, "step": 10540 }, { "epoch": 2.2029258098223616, "grad_norm": 0.9617761083777857, "learning_rate": 1.4608922815303229e-05, "loss": 0.1179, "step": 10541 }, { "epoch": 2.2031347962382446, "grad_norm": 1.0146658184695363, "learning_rate": 1.4607921627718105e-05, "loss": 0.1248, "step": 10542 }, { "epoch": 2.2033437826541276, "grad_norm": 0.8850402930603006, "learning_rate": 1.4606920381489893e-05, "loss": 0.1253, "step": 10543 }, { "epoch": 2.2035527690700105, "grad_norm": 1.0061833187686942, "learning_rate": 1.460591907663134e-05, "loss": 0.1233, "step": 10544 }, { "epoch": 2.2037617554858935, "grad_norm": 0.9851735697953391, "learning_rate": 1.4604917713155187e-05, "loss": 0.1185, "step": 10545 }, { "epoch": 2.2039707419017764, "grad_norm": 0.9846187381946105, "learning_rate": 1.4603916291074178e-05, "loss": 0.0832, "step": 10546 }, { "epoch": 2.2041797283176594, "grad_norm": 0.8886580677384628, "learning_rate": 1.4602914810401061e-05, "loss": 0.1182, "step": 10547 }, { "epoch": 2.2043887147335424, "grad_norm": 1.0748178604489318, "learning_rate": 1.4601913271148577e-05, "loss": 0.1161, "step": 10548 }, { "epoch": 2.2045977011494253, "grad_norm": 0.9107906038321167, "learning_rate": 1.4600911673329474e-05, "loss": 0.1137, "step": 10549 }, { "epoch": 2.2048066875653083, "grad_norm": 1.0192977706506703, "learning_rate": 1.45999100169565e-05, "loss": 0.1356, "step": 10550 }, { "epoch": 2.2050156739811912, "grad_norm": 1.2241214432185383, "learning_rate": 1.45989083020424e-05, "loss": 0.1357, "step": 10551 }, { "epoch": 2.205224660397074, "grad_norm": 1.1456462970211214, "learning_rate": 1.4597906528599923e-05, "loss": 0.1305, "step": 10552 }, { "epoch": 2.205433646812957, "grad_norm": 1.0605838748280565, "learning_rate": 1.459690469664182e-05, "loss": 0.1215, "step": 10553 }, { "epoch": 2.20564263322884, "grad_norm": 0.9166585990248431, "learning_rate": 1.4595902806180844e-05, "loss": 0.1182, "step": 10554 }, { "epoch": 2.205851619644723, "grad_norm": 1.116913046017933, "learning_rate": 1.4594900857229735e-05, "loss": 0.1335, "step": 10555 }, { "epoch": 2.206060606060606, "grad_norm": 0.9973857051001189, "learning_rate": 1.4593898849801256e-05, "loss": 0.1186, "step": 10556 }, { "epoch": 2.206269592476489, "grad_norm": 0.9499509904794771, "learning_rate": 1.459289678390815e-05, "loss": 0.1178, "step": 10557 }, { "epoch": 2.206478578892372, "grad_norm": 1.1888025225514873, "learning_rate": 1.4591894659563175e-05, "loss": 0.1276, "step": 10558 }, { "epoch": 2.206687565308255, "grad_norm": 1.6093679189151806, "learning_rate": 1.4590892476779085e-05, "loss": 0.1344, "step": 10559 }, { "epoch": 2.206896551724138, "grad_norm": 1.0211706105604654, "learning_rate": 1.4589890235568636e-05, "loss": 0.1221, "step": 10560 }, { "epoch": 2.207105538140021, "grad_norm": 0.923235505195383, "learning_rate": 1.4588887935944574e-05, "loss": 0.1217, "step": 10561 }, { "epoch": 2.207314524555904, "grad_norm": 1.0492934936074754, "learning_rate": 1.4587885577919662e-05, "loss": 0.1187, "step": 10562 }, { "epoch": 2.207523510971787, "grad_norm": 0.9494329496304147, "learning_rate": 1.4586883161506658e-05, "loss": 0.1158, "step": 10563 }, { "epoch": 2.2077324973876697, "grad_norm": 1.0290532221733315, "learning_rate": 1.4585880686718317e-05, "loss": 0.1518, "step": 10564 }, { "epoch": 2.2079414838035527, "grad_norm": 1.0398419130393042, "learning_rate": 1.4584878153567395e-05, "loss": 0.1276, "step": 10565 }, { "epoch": 2.2081504702194357, "grad_norm": 0.9869139725938026, "learning_rate": 1.4583875562066653e-05, "loss": 0.1307, "step": 10566 }, { "epoch": 2.2083594566353186, "grad_norm": 0.9323472580013773, "learning_rate": 1.4582872912228849e-05, "loss": 0.1226, "step": 10567 }, { "epoch": 2.2085684430512016, "grad_norm": 1.144368918780617, "learning_rate": 1.4581870204066749e-05, "loss": 0.1318, "step": 10568 }, { "epoch": 2.2087774294670846, "grad_norm": 0.9786303291498979, "learning_rate": 1.4580867437593105e-05, "loss": 0.0986, "step": 10569 }, { "epoch": 2.2089864158829675, "grad_norm": 1.1187811940217494, "learning_rate": 1.4579864612820684e-05, "loss": 0.1286, "step": 10570 }, { "epoch": 2.2091954022988505, "grad_norm": 1.0378260049780295, "learning_rate": 1.4578861729762252e-05, "loss": 0.1219, "step": 10571 }, { "epoch": 2.2094043887147334, "grad_norm": 0.986858986345442, "learning_rate": 1.4577858788430563e-05, "loss": 0.0901, "step": 10572 }, { "epoch": 2.2096133751306164, "grad_norm": 0.9771169725300541, "learning_rate": 1.4576855788838388e-05, "loss": 0.1116, "step": 10573 }, { "epoch": 2.2098223615464994, "grad_norm": 0.8629893131468462, "learning_rate": 1.4575852730998493e-05, "loss": 0.1135, "step": 10574 }, { "epoch": 2.2100313479623823, "grad_norm": 1.0105663800251552, "learning_rate": 1.4574849614923638e-05, "loss": 0.1159, "step": 10575 }, { "epoch": 2.2102403343782653, "grad_norm": 1.0935289488173747, "learning_rate": 1.4573846440626588e-05, "loss": 0.1259, "step": 10576 }, { "epoch": 2.2104493207941482, "grad_norm": 0.9079336419386835, "learning_rate": 1.457284320812012e-05, "loss": 0.0947, "step": 10577 }, { "epoch": 2.210658307210031, "grad_norm": 1.13408220575597, "learning_rate": 1.4571839917416992e-05, "loss": 0.1546, "step": 10578 }, { "epoch": 2.210867293625914, "grad_norm": 0.9098474597396164, "learning_rate": 1.4570836568529977e-05, "loss": 0.1204, "step": 10579 }, { "epoch": 2.211076280041797, "grad_norm": 1.0441114911910905, "learning_rate": 1.4569833161471843e-05, "loss": 0.1333, "step": 10580 }, { "epoch": 2.21128526645768, "grad_norm": 0.9034522663089687, "learning_rate": 1.456882969625536e-05, "loss": 0.1002, "step": 10581 }, { "epoch": 2.211494252873563, "grad_norm": 0.9884201501358775, "learning_rate": 1.4567826172893296e-05, "loss": 0.1105, "step": 10582 }, { "epoch": 2.211703239289446, "grad_norm": 0.9161697856420659, "learning_rate": 1.4566822591398429e-05, "loss": 0.0968, "step": 10583 }, { "epoch": 2.211912225705329, "grad_norm": 1.0520674570240682, "learning_rate": 1.4565818951783526e-05, "loss": 0.1173, "step": 10584 }, { "epoch": 2.212121212121212, "grad_norm": 3.872911922433807, "learning_rate": 1.456481525406136e-05, "loss": 0.101, "step": 10585 }, { "epoch": 2.212330198537095, "grad_norm": 0.9484433301897804, "learning_rate": 1.4563811498244706e-05, "loss": 0.1118, "step": 10586 }, { "epoch": 2.212539184952978, "grad_norm": 1.0557572266625543, "learning_rate": 1.456280768434634e-05, "loss": 0.1231, "step": 10587 }, { "epoch": 2.212748171368861, "grad_norm": 0.8719554481428234, "learning_rate": 1.4561803812379034e-05, "loss": 0.1197, "step": 10588 }, { "epoch": 2.212957157784744, "grad_norm": 0.8754438348532716, "learning_rate": 1.4560799882355565e-05, "loss": 0.1095, "step": 10589 }, { "epoch": 2.2131661442006267, "grad_norm": 0.8791957394284939, "learning_rate": 1.455979589428871e-05, "loss": 0.1294, "step": 10590 }, { "epoch": 2.2133751306165097, "grad_norm": 0.9537380413031064, "learning_rate": 1.4558791848191247e-05, "loss": 0.1293, "step": 10591 }, { "epoch": 2.2135841170323927, "grad_norm": 1.1394965915746833, "learning_rate": 1.4557787744075951e-05, "loss": 0.1298, "step": 10592 }, { "epoch": 2.213793103448276, "grad_norm": 0.9919071004205919, "learning_rate": 1.4556783581955607e-05, "loss": 0.1203, "step": 10593 }, { "epoch": 2.214002089864159, "grad_norm": 0.9883212232220174, "learning_rate": 1.4555779361842986e-05, "loss": 0.1284, "step": 10594 }, { "epoch": 2.214211076280042, "grad_norm": 0.9754447948931081, "learning_rate": 1.4554775083750876e-05, "loss": 0.1092, "step": 10595 }, { "epoch": 2.214420062695925, "grad_norm": 1.249249119971814, "learning_rate": 1.4553770747692053e-05, "loss": 0.1446, "step": 10596 }, { "epoch": 2.214629049111808, "grad_norm": 0.9753732324778255, "learning_rate": 1.45527663536793e-05, "loss": 0.1238, "step": 10597 }, { "epoch": 2.214838035527691, "grad_norm": 0.9385089139364847, "learning_rate": 1.4551761901725402e-05, "loss": 0.1295, "step": 10598 }, { "epoch": 2.215047021943574, "grad_norm": 1.0685220883876478, "learning_rate": 1.4550757391843142e-05, "loss": 0.1139, "step": 10599 }, { "epoch": 2.215256008359457, "grad_norm": 1.036628953006341, "learning_rate": 1.4549752824045298e-05, "loss": 0.1665, "step": 10600 }, { "epoch": 2.2154649947753398, "grad_norm": 0.9483231594879405, "learning_rate": 1.454874819834466e-05, "loss": 0.1, "step": 10601 }, { "epoch": 2.2156739811912227, "grad_norm": 1.2515088074947378, "learning_rate": 1.4547743514754013e-05, "loss": 0.1413, "step": 10602 }, { "epoch": 2.2158829676071057, "grad_norm": 1.1677186477009052, "learning_rate": 1.4546738773286144e-05, "loss": 0.1203, "step": 10603 }, { "epoch": 2.2160919540229886, "grad_norm": 1.0641266684025086, "learning_rate": 1.4545733973953837e-05, "loss": 0.132, "step": 10604 }, { "epoch": 2.2163009404388716, "grad_norm": 0.9887658854312523, "learning_rate": 1.4544729116769884e-05, "loss": 0.1037, "step": 10605 }, { "epoch": 2.2165099268547546, "grad_norm": 1.0762129009460433, "learning_rate": 1.4543724201747066e-05, "loss": 0.117, "step": 10606 }, { "epoch": 2.2167189132706375, "grad_norm": 1.1437256630028856, "learning_rate": 1.4542719228898181e-05, "loss": 0.1237, "step": 10607 }, { "epoch": 2.2169278996865205, "grad_norm": 1.2054041535960844, "learning_rate": 1.4541714198236011e-05, "loss": 0.1429, "step": 10608 }, { "epoch": 2.2171368861024034, "grad_norm": 1.0694931506971563, "learning_rate": 1.4540709109773355e-05, "loss": 0.1148, "step": 10609 }, { "epoch": 2.2173458725182864, "grad_norm": 0.8771871300342206, "learning_rate": 1.4539703963522995e-05, "loss": 0.1188, "step": 10610 }, { "epoch": 2.2175548589341694, "grad_norm": 1.067589739575051, "learning_rate": 1.4538698759497731e-05, "loss": 0.1087, "step": 10611 }, { "epoch": 2.2177638453500523, "grad_norm": 1.155092774192675, "learning_rate": 1.453769349771035e-05, "loss": 0.1082, "step": 10612 }, { "epoch": 2.2179728317659353, "grad_norm": 0.9132797633577232, "learning_rate": 1.453668817817365e-05, "loss": 0.1115, "step": 10613 }, { "epoch": 2.2181818181818183, "grad_norm": 1.0145898183077, "learning_rate": 1.4535682800900422e-05, "loss": 0.1299, "step": 10614 }, { "epoch": 2.218390804597701, "grad_norm": 0.9803507763589681, "learning_rate": 1.4534677365903463e-05, "loss": 0.1066, "step": 10615 }, { "epoch": 2.218599791013584, "grad_norm": 1.049655140296668, "learning_rate": 1.4533671873195567e-05, "loss": 0.1415, "step": 10616 }, { "epoch": 2.218808777429467, "grad_norm": 1.0591785980696737, "learning_rate": 1.4532666322789534e-05, "loss": 0.1251, "step": 10617 }, { "epoch": 2.21901776384535, "grad_norm": 1.187820444389455, "learning_rate": 1.4531660714698155e-05, "loss": 0.1249, "step": 10618 }, { "epoch": 2.219226750261233, "grad_norm": 0.975338285135805, "learning_rate": 1.4530655048934235e-05, "loss": 0.1057, "step": 10619 }, { "epoch": 2.219435736677116, "grad_norm": 1.1658870504714411, "learning_rate": 1.4529649325510567e-05, "loss": 0.1378, "step": 10620 }, { "epoch": 2.219644723092999, "grad_norm": 1.1681303849642017, "learning_rate": 1.4528643544439955e-05, "loss": 0.1371, "step": 10621 }, { "epoch": 2.219853709508882, "grad_norm": 0.904376394932467, "learning_rate": 1.4527637705735194e-05, "loss": 0.1039, "step": 10622 }, { "epoch": 2.220062695924765, "grad_norm": 1.0240963143333583, "learning_rate": 1.4526631809409093e-05, "loss": 0.1197, "step": 10623 }, { "epoch": 2.220271682340648, "grad_norm": 0.8790077249254671, "learning_rate": 1.4525625855474444e-05, "loss": 0.1066, "step": 10624 }, { "epoch": 2.220480668756531, "grad_norm": 1.2740070037492526, "learning_rate": 1.4524619843944054e-05, "loss": 0.14, "step": 10625 }, { "epoch": 2.220689655172414, "grad_norm": 1.0393610246652758, "learning_rate": 1.4523613774830727e-05, "loss": 0.1357, "step": 10626 }, { "epoch": 2.2208986415882968, "grad_norm": 0.9507667343159564, "learning_rate": 1.4522607648147266e-05, "loss": 0.1373, "step": 10627 }, { "epoch": 2.2211076280041797, "grad_norm": 0.9943901553140564, "learning_rate": 1.4521601463906473e-05, "loss": 0.1126, "step": 10628 }, { "epoch": 2.2213166144200627, "grad_norm": 1.1564556715810914, "learning_rate": 1.452059522212116e-05, "loss": 0.1225, "step": 10629 }, { "epoch": 2.2215256008359456, "grad_norm": 1.0251480953310383, "learning_rate": 1.4519588922804127e-05, "loss": 0.1109, "step": 10630 }, { "epoch": 2.2217345872518286, "grad_norm": 1.1110804286488112, "learning_rate": 1.4518582565968182e-05, "loss": 0.1541, "step": 10631 }, { "epoch": 2.2219435736677116, "grad_norm": 0.8771328864764268, "learning_rate": 1.4517576151626132e-05, "loss": 0.0917, "step": 10632 }, { "epoch": 2.2221525600835945, "grad_norm": 1.0061023889812968, "learning_rate": 1.4516569679790786e-05, "loss": 0.1155, "step": 10633 }, { "epoch": 2.2223615464994775, "grad_norm": 1.0504151881422719, "learning_rate": 1.4515563150474953e-05, "loss": 0.1534, "step": 10634 }, { "epoch": 2.2225705329153604, "grad_norm": 1.080965732203924, "learning_rate": 1.4514556563691444e-05, "loss": 0.128, "step": 10635 }, { "epoch": 2.2227795193312434, "grad_norm": 1.0273927814089023, "learning_rate": 1.4513549919453065e-05, "loss": 0.1114, "step": 10636 }, { "epoch": 2.2229885057471264, "grad_norm": 1.0896673858008072, "learning_rate": 1.4512543217772635e-05, "loss": 0.1263, "step": 10637 }, { "epoch": 2.2231974921630093, "grad_norm": 0.9913693499064437, "learning_rate": 1.4511536458662959e-05, "loss": 0.1232, "step": 10638 }, { "epoch": 2.2234064785788923, "grad_norm": 1.1326379401411857, "learning_rate": 1.451052964213685e-05, "loss": 0.1174, "step": 10639 }, { "epoch": 2.2236154649947752, "grad_norm": 1.0607766537977052, "learning_rate": 1.4509522768207125e-05, "loss": 0.1277, "step": 10640 }, { "epoch": 2.223824451410658, "grad_norm": 1.0448369057829092, "learning_rate": 1.4508515836886597e-05, "loss": 0.1223, "step": 10641 }, { "epoch": 2.224033437826541, "grad_norm": 1.0964998561613435, "learning_rate": 1.4507508848188076e-05, "loss": 0.1203, "step": 10642 }, { "epoch": 2.224242424242424, "grad_norm": 1.0752711470290266, "learning_rate": 1.4506501802124384e-05, "loss": 0.1119, "step": 10643 }, { "epoch": 2.224451410658307, "grad_norm": 0.8388554540562477, "learning_rate": 1.4505494698708336e-05, "loss": 0.1124, "step": 10644 }, { "epoch": 2.22466039707419, "grad_norm": 0.8784638829912383, "learning_rate": 1.4504487537952743e-05, "loss": 0.1139, "step": 10645 }, { "epoch": 2.224869383490073, "grad_norm": 0.9484081296277118, "learning_rate": 1.4503480319870432e-05, "loss": 0.1099, "step": 10646 }, { "epoch": 2.225078369905956, "grad_norm": 1.1386340841313318, "learning_rate": 1.4502473044474215e-05, "loss": 0.1477, "step": 10647 }, { "epoch": 2.225287356321839, "grad_norm": 1.082115885190209, "learning_rate": 1.4501465711776913e-05, "loss": 0.1342, "step": 10648 }, { "epoch": 2.225496342737722, "grad_norm": 1.0716100361267298, "learning_rate": 1.4500458321791345e-05, "loss": 0.1274, "step": 10649 }, { "epoch": 2.225705329153605, "grad_norm": 0.9241030472858477, "learning_rate": 1.4499450874530334e-05, "loss": 0.1147, "step": 10650 }, { "epoch": 2.225914315569488, "grad_norm": 1.0895016017720462, "learning_rate": 1.4498443370006696e-05, "loss": 0.1185, "step": 10651 }, { "epoch": 2.226123301985371, "grad_norm": 1.0127802490087465, "learning_rate": 1.4497435808233262e-05, "loss": 0.1368, "step": 10652 }, { "epoch": 2.2263322884012537, "grad_norm": 1.0524753944108736, "learning_rate": 1.4496428189222846e-05, "loss": 0.1602, "step": 10653 }, { "epoch": 2.2265412748171367, "grad_norm": 1.1306575844594327, "learning_rate": 1.4495420512988279e-05, "loss": 0.1334, "step": 10654 }, { "epoch": 2.2267502612330197, "grad_norm": 0.8995897314317672, "learning_rate": 1.4494412779542378e-05, "loss": 0.1271, "step": 10655 }, { "epoch": 2.2269592476489026, "grad_norm": 1.074902196961184, "learning_rate": 1.4493404988897974e-05, "loss": 0.1286, "step": 10656 }, { "epoch": 2.2271682340647856, "grad_norm": 0.9193696642198425, "learning_rate": 1.4492397141067888e-05, "loss": 0.1004, "step": 10657 }, { "epoch": 2.2273772204806686, "grad_norm": 0.9060631222359763, "learning_rate": 1.4491389236064952e-05, "loss": 0.1318, "step": 10658 }, { "epoch": 2.227586206896552, "grad_norm": 0.9571405427465395, "learning_rate": 1.4490381273901987e-05, "loss": 0.1208, "step": 10659 }, { "epoch": 2.227795193312435, "grad_norm": 0.9736897126123585, "learning_rate": 1.4489373254591826e-05, "loss": 0.0947, "step": 10660 }, { "epoch": 2.228004179728318, "grad_norm": 0.898221787116648, "learning_rate": 1.4488365178147295e-05, "loss": 0.1173, "step": 10661 }, { "epoch": 2.228213166144201, "grad_norm": 1.1102810713502058, "learning_rate": 1.4487357044581223e-05, "loss": 0.1264, "step": 10662 }, { "epoch": 2.228422152560084, "grad_norm": 0.8486531140284085, "learning_rate": 1.4486348853906439e-05, "loss": 0.1046, "step": 10663 }, { "epoch": 2.2286311389759668, "grad_norm": 1.2977937897804506, "learning_rate": 1.448534060613578e-05, "loss": 0.1389, "step": 10664 }, { "epoch": 2.2288401253918497, "grad_norm": 1.151307745710643, "learning_rate": 1.448433230128207e-05, "loss": 0.1285, "step": 10665 }, { "epoch": 2.2290491118077327, "grad_norm": 1.0623472026090723, "learning_rate": 1.4483323939358145e-05, "loss": 0.1046, "step": 10666 }, { "epoch": 2.2292580982236156, "grad_norm": 1.0439008279845459, "learning_rate": 1.4482315520376839e-05, "loss": 0.1255, "step": 10667 }, { "epoch": 2.2294670846394986, "grad_norm": 1.1431306984995122, "learning_rate": 1.4481307044350984e-05, "loss": 0.1276, "step": 10668 }, { "epoch": 2.2296760710553816, "grad_norm": 1.0018230984004612, "learning_rate": 1.4480298511293414e-05, "loss": 0.139, "step": 10669 }, { "epoch": 2.2298850574712645, "grad_norm": 1.1740676772601426, "learning_rate": 1.4479289921216966e-05, "loss": 0.1553, "step": 10670 }, { "epoch": 2.2300940438871475, "grad_norm": 0.8894252184063675, "learning_rate": 1.4478281274134473e-05, "loss": 0.1044, "step": 10671 }, { "epoch": 2.2303030303030305, "grad_norm": 0.7408881638427598, "learning_rate": 1.4477272570058775e-05, "loss": 0.0949, "step": 10672 }, { "epoch": 2.2305120167189134, "grad_norm": 1.1380810139887998, "learning_rate": 1.4476263809002706e-05, "loss": 0.1597, "step": 10673 }, { "epoch": 2.2307210031347964, "grad_norm": 1.070368691979418, "learning_rate": 1.4475254990979111e-05, "loss": 0.1437, "step": 10674 }, { "epoch": 2.2309299895506793, "grad_norm": 1.0693965284754114, "learning_rate": 1.4474246116000817e-05, "loss": 0.1109, "step": 10675 }, { "epoch": 2.2311389759665623, "grad_norm": 0.9087512904854739, "learning_rate": 1.4473237184080675e-05, "loss": 0.1041, "step": 10676 }, { "epoch": 2.2313479623824453, "grad_norm": 0.9608226668291644, "learning_rate": 1.447222819523152e-05, "loss": 0.1244, "step": 10677 }, { "epoch": 2.231556948798328, "grad_norm": 1.08579283363263, "learning_rate": 1.4471219149466192e-05, "loss": 0.1068, "step": 10678 }, { "epoch": 2.231765935214211, "grad_norm": 1.110285699533278, "learning_rate": 1.4470210046797533e-05, "loss": 0.1353, "step": 10679 }, { "epoch": 2.231974921630094, "grad_norm": 1.183981375345824, "learning_rate": 1.4469200887238388e-05, "loss": 0.1507, "step": 10680 }, { "epoch": 2.232183908045977, "grad_norm": 1.059415251974694, "learning_rate": 1.4468191670801597e-05, "loss": 0.1222, "step": 10681 }, { "epoch": 2.23239289446186, "grad_norm": 0.8695398513106936, "learning_rate": 1.4467182397500011e-05, "loss": 0.1179, "step": 10682 }, { "epoch": 2.232601880877743, "grad_norm": 1.066452196323239, "learning_rate": 1.4466173067346463e-05, "loss": 0.1318, "step": 10683 }, { "epoch": 2.232810867293626, "grad_norm": 0.8451729948879001, "learning_rate": 1.4465163680353806e-05, "loss": 0.1164, "step": 10684 }, { "epoch": 2.233019853709509, "grad_norm": 0.8597208189854997, "learning_rate": 1.4464154236534886e-05, "loss": 0.1027, "step": 10685 }, { "epoch": 2.233228840125392, "grad_norm": 1.094640867374464, "learning_rate": 1.446314473590255e-05, "loss": 0.1425, "step": 10686 }, { "epoch": 2.233437826541275, "grad_norm": 0.7942511406169154, "learning_rate": 1.4462135178469638e-05, "loss": 0.0969, "step": 10687 }, { "epoch": 2.233646812957158, "grad_norm": 0.9369076410744391, "learning_rate": 1.446112556424901e-05, "loss": 0.1051, "step": 10688 }, { "epoch": 2.233855799373041, "grad_norm": 1.0270255743733834, "learning_rate": 1.4460115893253507e-05, "loss": 0.1288, "step": 10689 }, { "epoch": 2.2340647857889238, "grad_norm": 0.9499554956174947, "learning_rate": 1.4459106165495979e-05, "loss": 0.1312, "step": 10690 }, { "epoch": 2.2342737722048067, "grad_norm": 0.9403436850238038, "learning_rate": 1.4458096380989279e-05, "loss": 0.1161, "step": 10691 }, { "epoch": 2.2344827586206897, "grad_norm": 0.8728394305057825, "learning_rate": 1.445708653974626e-05, "loss": 0.1191, "step": 10692 }, { "epoch": 2.2346917450365726, "grad_norm": 0.9486298035643261, "learning_rate": 1.4456076641779766e-05, "loss": 0.111, "step": 10693 }, { "epoch": 2.2349007314524556, "grad_norm": 0.9460374652039952, "learning_rate": 1.445506668710266e-05, "loss": 0.1466, "step": 10694 }, { "epoch": 2.2351097178683386, "grad_norm": 1.0190227093271285, "learning_rate": 1.4454056675727785e-05, "loss": 0.1285, "step": 10695 }, { "epoch": 2.2353187042842215, "grad_norm": 1.058197806008755, "learning_rate": 1.4453046607668002e-05, "loss": 0.1356, "step": 10696 }, { "epoch": 2.2355276907001045, "grad_norm": 1.2819354392962024, "learning_rate": 1.4452036482936162e-05, "loss": 0.1388, "step": 10697 }, { "epoch": 2.2357366771159874, "grad_norm": 0.9936478197549083, "learning_rate": 1.4451026301545124e-05, "loss": 0.1373, "step": 10698 }, { "epoch": 2.2359456635318704, "grad_norm": 0.9151956964889294, "learning_rate": 1.445001606350774e-05, "loss": 0.1278, "step": 10699 }, { "epoch": 2.2361546499477534, "grad_norm": 0.9572555159393628, "learning_rate": 1.444900576883687e-05, "loss": 0.1267, "step": 10700 }, { "epoch": 2.2363636363636363, "grad_norm": 1.1267271892566593, "learning_rate": 1.444799541754537e-05, "loss": 0.1197, "step": 10701 }, { "epoch": 2.2365726227795193, "grad_norm": 0.975266171167899, "learning_rate": 1.4446985009646099e-05, "loss": 0.1068, "step": 10702 }, { "epoch": 2.2367816091954023, "grad_norm": 0.9132283817563631, "learning_rate": 1.4445974545151917e-05, "loss": 0.1227, "step": 10703 }, { "epoch": 2.236990595611285, "grad_norm": 0.8893599369572287, "learning_rate": 1.4444964024075683e-05, "loss": 0.1225, "step": 10704 }, { "epoch": 2.237199582027168, "grad_norm": 1.0482116072002778, "learning_rate": 1.4443953446430252e-05, "loss": 0.1274, "step": 10705 }, { "epoch": 2.237408568443051, "grad_norm": 0.9091907231401701, "learning_rate": 1.4442942812228497e-05, "loss": 0.1278, "step": 10706 }, { "epoch": 2.237617554858934, "grad_norm": 1.098543672177364, "learning_rate": 1.4441932121483268e-05, "loss": 0.1249, "step": 10707 }, { "epoch": 2.237826541274817, "grad_norm": 0.940089213018525, "learning_rate": 1.4440921374207437e-05, "loss": 0.1137, "step": 10708 }, { "epoch": 2.2380355276907, "grad_norm": 1.0171872101980413, "learning_rate": 1.443991057041386e-05, "loss": 0.1377, "step": 10709 }, { "epoch": 2.238244514106583, "grad_norm": 1.2090729535776545, "learning_rate": 1.4438899710115408e-05, "loss": 0.1368, "step": 10710 }, { "epoch": 2.238453500522466, "grad_norm": 1.1352774890572466, "learning_rate": 1.4437888793324936e-05, "loss": 0.1022, "step": 10711 }, { "epoch": 2.238662486938349, "grad_norm": 0.9253823743789179, "learning_rate": 1.4436877820055323e-05, "loss": 0.1206, "step": 10712 }, { "epoch": 2.238871473354232, "grad_norm": 1.0797789171401406, "learning_rate": 1.4435866790319423e-05, "loss": 0.1304, "step": 10713 }, { "epoch": 2.239080459770115, "grad_norm": 1.037367062776386, "learning_rate": 1.443485570413011e-05, "loss": 0.1412, "step": 10714 }, { "epoch": 2.239289446185998, "grad_norm": 1.0424961104200403, "learning_rate": 1.4433844561500249e-05, "loss": 0.0972, "step": 10715 }, { "epoch": 2.2394984326018808, "grad_norm": 0.861209262232301, "learning_rate": 1.4432833362442708e-05, "loss": 0.1194, "step": 10716 }, { "epoch": 2.2397074190177637, "grad_norm": 0.9470331213993175, "learning_rate": 1.443182210697036e-05, "loss": 0.1161, "step": 10717 }, { "epoch": 2.2399164054336467, "grad_norm": 1.001598784224331, "learning_rate": 1.4430810795096068e-05, "loss": 0.1428, "step": 10718 }, { "epoch": 2.2401253918495296, "grad_norm": 1.2351637947146152, "learning_rate": 1.4429799426832708e-05, "loss": 0.1279, "step": 10719 }, { "epoch": 2.2403343782654126, "grad_norm": 0.8376780613507584, "learning_rate": 1.4428788002193148e-05, "loss": 0.1175, "step": 10720 }, { "epoch": 2.2405433646812956, "grad_norm": 0.9922136927900524, "learning_rate": 1.4427776521190264e-05, "loss": 0.1207, "step": 10721 }, { "epoch": 2.2407523510971785, "grad_norm": 0.9881949337304395, "learning_rate": 1.4426764983836926e-05, "loss": 0.1133, "step": 10722 }, { "epoch": 2.2409613375130615, "grad_norm": 0.9871830484001258, "learning_rate": 1.4425753390146004e-05, "loss": 0.1123, "step": 10723 }, { "epoch": 2.2411703239289444, "grad_norm": 0.9675507176147419, "learning_rate": 1.4424741740130379e-05, "loss": 0.1269, "step": 10724 }, { "epoch": 2.2413793103448274, "grad_norm": 1.1053338120549798, "learning_rate": 1.4423730033802923e-05, "loss": 0.1488, "step": 10725 }, { "epoch": 2.2415882967607104, "grad_norm": 1.0000169486349273, "learning_rate": 1.4422718271176508e-05, "loss": 0.1174, "step": 10726 }, { "epoch": 2.2417972831765933, "grad_norm": 1.032302719894267, "learning_rate": 1.4421706452264017e-05, "loss": 0.1413, "step": 10727 }, { "epoch": 2.2420062695924763, "grad_norm": 0.8218599822363677, "learning_rate": 1.4420694577078322e-05, "loss": 0.1038, "step": 10728 }, { "epoch": 2.2422152560083592, "grad_norm": 1.0458783793592186, "learning_rate": 1.4419682645632303e-05, "loss": 0.1318, "step": 10729 }, { "epoch": 2.242424242424242, "grad_norm": 1.0165930425263434, "learning_rate": 1.4418670657938836e-05, "loss": 0.1268, "step": 10730 }, { "epoch": 2.242633228840125, "grad_norm": 0.9478256830859743, "learning_rate": 1.4417658614010805e-05, "loss": 0.1119, "step": 10731 }, { "epoch": 2.242842215256008, "grad_norm": 1.007700365487186, "learning_rate": 1.4416646513861082e-05, "loss": 0.1305, "step": 10732 }, { "epoch": 2.243051201671891, "grad_norm": 0.8145527654800957, "learning_rate": 1.4415634357502555e-05, "loss": 0.099, "step": 10733 }, { "epoch": 2.2432601880877745, "grad_norm": 1.2420675061182511, "learning_rate": 1.4414622144948102e-05, "loss": 0.1182, "step": 10734 }, { "epoch": 2.2434691745036575, "grad_norm": 0.9737368702405047, "learning_rate": 1.4413609876210606e-05, "loss": 0.1278, "step": 10735 }, { "epoch": 2.2436781609195404, "grad_norm": 0.9855388011565, "learning_rate": 1.4412597551302949e-05, "loss": 0.1256, "step": 10736 }, { "epoch": 2.2438871473354234, "grad_norm": 0.8393528045474207, "learning_rate": 1.4411585170238013e-05, "loss": 0.1197, "step": 10737 }, { "epoch": 2.2440961337513063, "grad_norm": 0.905903280463638, "learning_rate": 1.4410572733028685e-05, "loss": 0.127, "step": 10738 }, { "epoch": 2.2443051201671893, "grad_norm": 1.0716352855807016, "learning_rate": 1.440956023968785e-05, "loss": 0.1219, "step": 10739 }, { "epoch": 2.2445141065830723, "grad_norm": 1.0091782047943811, "learning_rate": 1.4408547690228392e-05, "loss": 0.1337, "step": 10740 }, { "epoch": 2.2447230929989552, "grad_norm": 1.0361097015925753, "learning_rate": 1.4407535084663196e-05, "loss": 0.1204, "step": 10741 }, { "epoch": 2.244932079414838, "grad_norm": 1.2253922997521312, "learning_rate": 1.4406522423005149e-05, "loss": 0.1578, "step": 10742 }, { "epoch": 2.245141065830721, "grad_norm": 1.0069232108101605, "learning_rate": 1.4405509705267144e-05, "loss": 0.1314, "step": 10743 }, { "epoch": 2.245350052246604, "grad_norm": 0.9708947910859776, "learning_rate": 1.4404496931462062e-05, "loss": 0.1263, "step": 10744 }, { "epoch": 2.245559038662487, "grad_norm": 0.9972021031870583, "learning_rate": 1.4403484101602801e-05, "loss": 0.1113, "step": 10745 }, { "epoch": 2.24576802507837, "grad_norm": 0.8542854320410643, "learning_rate": 1.4402471215702241e-05, "loss": 0.1179, "step": 10746 }, { "epoch": 2.245977011494253, "grad_norm": 1.060673127188371, "learning_rate": 1.4401458273773279e-05, "loss": 0.1347, "step": 10747 }, { "epoch": 2.246185997910136, "grad_norm": 0.9741981960913341, "learning_rate": 1.4400445275828805e-05, "loss": 0.1294, "step": 10748 }, { "epoch": 2.246394984326019, "grad_norm": 1.1299467048700311, "learning_rate": 1.4399432221881711e-05, "loss": 0.1403, "step": 10749 }, { "epoch": 2.246603970741902, "grad_norm": 0.9033761101729186, "learning_rate": 1.4398419111944885e-05, "loss": 0.1115, "step": 10750 }, { "epoch": 2.246812957157785, "grad_norm": 0.8491355076825444, "learning_rate": 1.4397405946031228e-05, "loss": 0.1045, "step": 10751 }, { "epoch": 2.247021943573668, "grad_norm": 1.130032905050901, "learning_rate": 1.4396392724153632e-05, "loss": 0.1448, "step": 10752 }, { "epoch": 2.2472309299895508, "grad_norm": 0.8659194780428272, "learning_rate": 1.439537944632499e-05, "loss": 0.1033, "step": 10753 }, { "epoch": 2.2474399164054337, "grad_norm": 0.9737369263334748, "learning_rate": 1.4394366112558196e-05, "loss": 0.1269, "step": 10754 }, { "epoch": 2.2476489028213167, "grad_norm": 1.0137448024420683, "learning_rate": 1.4393352722866153e-05, "loss": 0.1431, "step": 10755 }, { "epoch": 2.2478578892371996, "grad_norm": 1.104774650776601, "learning_rate": 1.4392339277261747e-05, "loss": 0.1248, "step": 10756 }, { "epoch": 2.2480668756530826, "grad_norm": 1.0190665627077506, "learning_rate": 1.4391325775757887e-05, "loss": 0.1257, "step": 10757 }, { "epoch": 2.2482758620689656, "grad_norm": 1.1203369719996081, "learning_rate": 1.4390312218367463e-05, "loss": 0.1266, "step": 10758 }, { "epoch": 2.2484848484848485, "grad_norm": 0.880652448918321, "learning_rate": 1.438929860510338e-05, "loss": 0.0973, "step": 10759 }, { "epoch": 2.2486938349007315, "grad_norm": 0.9710792187923649, "learning_rate": 1.4388284935978537e-05, "loss": 0.1128, "step": 10760 }, { "epoch": 2.2489028213166145, "grad_norm": 1.132016640453678, "learning_rate": 1.4387271211005834e-05, "loss": 0.1273, "step": 10761 }, { "epoch": 2.2491118077324974, "grad_norm": 0.9124348374221594, "learning_rate": 1.4386257430198165e-05, "loss": 0.1103, "step": 10762 }, { "epoch": 2.2493207941483804, "grad_norm": 1.2059009702325378, "learning_rate": 1.4385243593568445e-05, "loss": 0.1364, "step": 10763 }, { "epoch": 2.2495297805642633, "grad_norm": 1.1641696118150666, "learning_rate": 1.4384229701129567e-05, "loss": 0.1419, "step": 10764 }, { "epoch": 2.2497387669801463, "grad_norm": 1.16631617019018, "learning_rate": 1.4383215752894436e-05, "loss": 0.1262, "step": 10765 }, { "epoch": 2.2499477533960293, "grad_norm": 1.0910914089494692, "learning_rate": 1.438220174887596e-05, "loss": 0.1433, "step": 10766 }, { "epoch": 2.250156739811912, "grad_norm": 1.0014659885559127, "learning_rate": 1.4381187689087042e-05, "loss": 0.1282, "step": 10767 }, { "epoch": 2.250365726227795, "grad_norm": 0.8765133280185927, "learning_rate": 1.4380173573540584e-05, "loss": 0.126, "step": 10768 }, { "epoch": 2.250574712643678, "grad_norm": 0.8753921172537599, "learning_rate": 1.4379159402249496e-05, "loss": 0.1133, "step": 10769 }, { "epoch": 2.250783699059561, "grad_norm": 0.8780569671583982, "learning_rate": 1.4378145175226683e-05, "loss": 0.0856, "step": 10770 }, { "epoch": 2.250992685475444, "grad_norm": 0.9437646803892868, "learning_rate": 1.4377130892485056e-05, "loss": 0.1222, "step": 10771 }, { "epoch": 2.251201671891327, "grad_norm": 0.9852143199343187, "learning_rate": 1.4376116554037518e-05, "loss": 0.1187, "step": 10772 }, { "epoch": 2.25141065830721, "grad_norm": 0.8545088123238014, "learning_rate": 1.4375102159896986e-05, "loss": 0.1068, "step": 10773 }, { "epoch": 2.251619644723093, "grad_norm": 0.8874387006796702, "learning_rate": 1.4374087710076357e-05, "loss": 0.116, "step": 10774 }, { "epoch": 2.251828631138976, "grad_norm": 0.9876875475253544, "learning_rate": 1.4373073204588556e-05, "loss": 0.127, "step": 10775 }, { "epoch": 2.252037617554859, "grad_norm": 1.021853344879814, "learning_rate": 1.4372058643446485e-05, "loss": 0.1034, "step": 10776 }, { "epoch": 2.252246603970742, "grad_norm": 0.9694991919766273, "learning_rate": 1.4371044026663056e-05, "loss": 0.1124, "step": 10777 }, { "epoch": 2.252455590386625, "grad_norm": 1.009092677098872, "learning_rate": 1.4370029354251187e-05, "loss": 0.1191, "step": 10778 }, { "epoch": 2.2526645768025078, "grad_norm": 0.8612647414085048, "learning_rate": 1.436901462622379e-05, "loss": 0.117, "step": 10779 }, { "epoch": 2.2528735632183907, "grad_norm": 0.9356417188738859, "learning_rate": 1.4367999842593772e-05, "loss": 0.1148, "step": 10780 }, { "epoch": 2.2530825496342737, "grad_norm": 1.020552565073745, "learning_rate": 1.4366985003374056e-05, "loss": 0.144, "step": 10781 }, { "epoch": 2.2532915360501566, "grad_norm": 0.9292273320632144, "learning_rate": 1.4365970108577554e-05, "loss": 0.1251, "step": 10782 }, { "epoch": 2.2535005224660396, "grad_norm": 0.9864076678135453, "learning_rate": 1.4364955158217182e-05, "loss": 0.1109, "step": 10783 }, { "epoch": 2.2537095088819226, "grad_norm": 0.9548402862130398, "learning_rate": 1.4363940152305857e-05, "loss": 0.1176, "step": 10784 }, { "epoch": 2.2539184952978055, "grad_norm": 0.7604229902583013, "learning_rate": 1.43629250908565e-05, "loss": 0.1039, "step": 10785 }, { "epoch": 2.2541274817136885, "grad_norm": 1.0488825040845176, "learning_rate": 1.4361909973882021e-05, "loss": 0.1207, "step": 10786 }, { "epoch": 2.2543364681295714, "grad_norm": 0.8878303643192829, "learning_rate": 1.436089480139535e-05, "loss": 0.0922, "step": 10787 }, { "epoch": 2.2545454545454544, "grad_norm": 1.1735379730543387, "learning_rate": 1.4359879573409398e-05, "loss": 0.1405, "step": 10788 }, { "epoch": 2.2547544409613374, "grad_norm": 1.032172832375124, "learning_rate": 1.4358864289937086e-05, "loss": 0.13, "step": 10789 }, { "epoch": 2.2549634273772203, "grad_norm": 0.9434421899863004, "learning_rate": 1.4357848950991342e-05, "loss": 0.1201, "step": 10790 }, { "epoch": 2.2551724137931033, "grad_norm": 1.2716474280906305, "learning_rate": 1.4356833556585082e-05, "loss": 0.1606, "step": 10791 }, { "epoch": 2.2553814002089863, "grad_norm": 0.9826654187573574, "learning_rate": 1.4355818106731225e-05, "loss": 0.0926, "step": 10792 }, { "epoch": 2.255590386624869, "grad_norm": 1.0091344706848517, "learning_rate": 1.4354802601442704e-05, "loss": 0.1254, "step": 10793 }, { "epoch": 2.255799373040752, "grad_norm": 1.0738341812610988, "learning_rate": 1.4353787040732436e-05, "loss": 0.1105, "step": 10794 }, { "epoch": 2.256008359456635, "grad_norm": 1.084645312811349, "learning_rate": 1.4352771424613346e-05, "loss": 0.1351, "step": 10795 }, { "epoch": 2.256217345872518, "grad_norm": 0.9623920095679926, "learning_rate": 1.4351755753098363e-05, "loss": 0.1222, "step": 10796 }, { "epoch": 2.256426332288401, "grad_norm": 1.0265906163694372, "learning_rate": 1.4350740026200407e-05, "loss": 0.1193, "step": 10797 }, { "epoch": 2.2566353187042845, "grad_norm": 1.1117126509395912, "learning_rate": 1.4349724243932412e-05, "loss": 0.1213, "step": 10798 }, { "epoch": 2.2568443051201674, "grad_norm": 1.1145819359943872, "learning_rate": 1.4348708406307299e-05, "loss": 0.1028, "step": 10799 }, { "epoch": 2.2570532915360504, "grad_norm": 1.0224837921757217, "learning_rate": 1.4347692513338004e-05, "loss": 0.1126, "step": 10800 }, { "epoch": 2.2572622779519333, "grad_norm": 1.0068782730961836, "learning_rate": 1.4346676565037443e-05, "loss": 0.1242, "step": 10801 }, { "epoch": 2.2574712643678163, "grad_norm": 1.0712271987204143, "learning_rate": 1.434566056141856e-05, "loss": 0.1357, "step": 10802 }, { "epoch": 2.2576802507836993, "grad_norm": 1.1464456543234836, "learning_rate": 1.4344644502494276e-05, "loss": 0.1369, "step": 10803 }, { "epoch": 2.2578892371995822, "grad_norm": 1.0796272604716861, "learning_rate": 1.4343628388277524e-05, "loss": 0.1061, "step": 10804 }, { "epoch": 2.258098223615465, "grad_norm": 1.0536209930345466, "learning_rate": 1.4342612218781238e-05, "loss": 0.1156, "step": 10805 }, { "epoch": 2.258307210031348, "grad_norm": 1.0026913389762573, "learning_rate": 1.4341595994018349e-05, "loss": 0.1095, "step": 10806 }, { "epoch": 2.258516196447231, "grad_norm": 1.0035845120408917, "learning_rate": 1.4340579714001785e-05, "loss": 0.1331, "step": 10807 }, { "epoch": 2.258725182863114, "grad_norm": 1.1023854931400392, "learning_rate": 1.433956337874449e-05, "loss": 0.1343, "step": 10808 }, { "epoch": 2.258934169278997, "grad_norm": 0.9135869392750565, "learning_rate": 1.433854698825939e-05, "loss": 0.0938, "step": 10809 }, { "epoch": 2.25914315569488, "grad_norm": 0.9923000731805448, "learning_rate": 1.4337530542559424e-05, "loss": 0.1172, "step": 10810 }, { "epoch": 2.259352142110763, "grad_norm": 1.0028901692154537, "learning_rate": 1.4336514041657528e-05, "loss": 0.1247, "step": 10811 }, { "epoch": 2.259561128526646, "grad_norm": 0.9215128371008937, "learning_rate": 1.4335497485566637e-05, "loss": 0.0941, "step": 10812 }, { "epoch": 2.259770114942529, "grad_norm": 1.0267530608743543, "learning_rate": 1.4334480874299685e-05, "loss": 0.1075, "step": 10813 }, { "epoch": 2.259979101358412, "grad_norm": 1.010751768829375, "learning_rate": 1.4333464207869618e-05, "loss": 0.1417, "step": 10814 }, { "epoch": 2.260188087774295, "grad_norm": 0.9818222929852727, "learning_rate": 1.433244748628937e-05, "loss": 0.1141, "step": 10815 }, { "epoch": 2.2603970741901778, "grad_norm": 0.9882841163934788, "learning_rate": 1.433143070957188e-05, "loss": 0.1207, "step": 10816 }, { "epoch": 2.2606060606060607, "grad_norm": 0.9718965123749906, "learning_rate": 1.433041387773009e-05, "loss": 0.1363, "step": 10817 }, { "epoch": 2.2608150470219437, "grad_norm": 0.9133807961411825, "learning_rate": 1.4329396990776938e-05, "loss": 0.1122, "step": 10818 }, { "epoch": 2.2610240334378267, "grad_norm": 0.9363038048996326, "learning_rate": 1.4328380048725367e-05, "loss": 0.119, "step": 10819 }, { "epoch": 2.2612330198537096, "grad_norm": 0.8618193909179179, "learning_rate": 1.4327363051588323e-05, "loss": 0.1016, "step": 10820 }, { "epoch": 2.2614420062695926, "grad_norm": 0.9559317101175384, "learning_rate": 1.4326345999378743e-05, "loss": 0.1286, "step": 10821 }, { "epoch": 2.2616509926854755, "grad_norm": 1.003700019907194, "learning_rate": 1.432532889210957e-05, "loss": 0.1155, "step": 10822 }, { "epoch": 2.2618599791013585, "grad_norm": 1.0338804411173521, "learning_rate": 1.4324311729793756e-05, "loss": 0.1427, "step": 10823 }, { "epoch": 2.2620689655172415, "grad_norm": 0.8764835491258007, "learning_rate": 1.4323294512444241e-05, "loss": 0.1174, "step": 10824 }, { "epoch": 2.2622779519331244, "grad_norm": 1.218790562311369, "learning_rate": 1.4322277240073968e-05, "loss": 0.137, "step": 10825 }, { "epoch": 2.2624869383490074, "grad_norm": 0.8518126963289856, "learning_rate": 1.4321259912695889e-05, "loss": 0.1101, "step": 10826 }, { "epoch": 2.2626959247648903, "grad_norm": 0.9922985022043135, "learning_rate": 1.4320242530322944e-05, "loss": 0.1125, "step": 10827 }, { "epoch": 2.2629049111807733, "grad_norm": 1.2685113093709806, "learning_rate": 1.431922509296809e-05, "loss": 0.1365, "step": 10828 }, { "epoch": 2.2631138975966563, "grad_norm": 1.0268355819311008, "learning_rate": 1.431820760064427e-05, "loss": 0.104, "step": 10829 }, { "epoch": 2.2633228840125392, "grad_norm": 0.9721838099994086, "learning_rate": 1.4317190053364434e-05, "loss": 0.1056, "step": 10830 }, { "epoch": 2.263531870428422, "grad_norm": 1.1723387128104779, "learning_rate": 1.4316172451141529e-05, "loss": 0.1499, "step": 10831 }, { "epoch": 2.263740856844305, "grad_norm": 1.0920205303794206, "learning_rate": 1.431515479398851e-05, "loss": 0.1279, "step": 10832 }, { "epoch": 2.263949843260188, "grad_norm": 1.0137193150540496, "learning_rate": 1.4314137081918329e-05, "loss": 0.1163, "step": 10833 }, { "epoch": 2.264158829676071, "grad_norm": 0.9698549332081084, "learning_rate": 1.4313119314943933e-05, "loss": 0.1151, "step": 10834 }, { "epoch": 2.264367816091954, "grad_norm": 0.9367125161567703, "learning_rate": 1.4312101493078276e-05, "loss": 0.1271, "step": 10835 }, { "epoch": 2.264576802507837, "grad_norm": 1.0725851074717248, "learning_rate": 1.4311083616334318e-05, "loss": 0.121, "step": 10836 }, { "epoch": 2.26478578892372, "grad_norm": 1.1078176012595984, "learning_rate": 1.4310065684725003e-05, "loss": 0.1361, "step": 10837 }, { "epoch": 2.264994775339603, "grad_norm": 1.5002199277202437, "learning_rate": 1.4309047698263294e-05, "loss": 0.1638, "step": 10838 }, { "epoch": 2.265203761755486, "grad_norm": 0.9131547282200284, "learning_rate": 1.430802965696214e-05, "loss": 0.116, "step": 10839 }, { "epoch": 2.265412748171369, "grad_norm": 0.9207779551082818, "learning_rate": 1.4307011560834503e-05, "loss": 0.1121, "step": 10840 }, { "epoch": 2.265621734587252, "grad_norm": 0.8137905245154946, "learning_rate": 1.4305993409893335e-05, "loss": 0.0989, "step": 10841 }, { "epoch": 2.2658307210031348, "grad_norm": 0.9458246760220383, "learning_rate": 1.4304975204151599e-05, "loss": 0.1151, "step": 10842 }, { "epoch": 2.2660397074190177, "grad_norm": 0.8500201640307562, "learning_rate": 1.4303956943622246e-05, "loss": 0.0986, "step": 10843 }, { "epoch": 2.2662486938349007, "grad_norm": 0.9914597189555922, "learning_rate": 1.4302938628318243e-05, "loss": 0.1189, "step": 10844 }, { "epoch": 2.2664576802507836, "grad_norm": 0.807807055608768, "learning_rate": 1.4301920258252544e-05, "loss": 0.088, "step": 10845 }, { "epoch": 2.2666666666666666, "grad_norm": 0.9861698037034964, "learning_rate": 1.430090183343811e-05, "loss": 0.1187, "step": 10846 }, { "epoch": 2.2668756530825496, "grad_norm": 0.9289258155025147, "learning_rate": 1.4299883353887904e-05, "loss": 0.1227, "step": 10847 }, { "epoch": 2.2670846394984325, "grad_norm": 0.9730396108215429, "learning_rate": 1.4298864819614888e-05, "loss": 0.1187, "step": 10848 }, { "epoch": 2.2672936259143155, "grad_norm": 1.062662322666158, "learning_rate": 1.4297846230632023e-05, "loss": 0.1237, "step": 10849 }, { "epoch": 2.2675026123301985, "grad_norm": 0.8440215005369716, "learning_rate": 1.4296827586952273e-05, "loss": 0.0982, "step": 10850 }, { "epoch": 2.2677115987460814, "grad_norm": 1.0803560796441198, "learning_rate": 1.42958088885886e-05, "loss": 0.1431, "step": 10851 }, { "epoch": 2.2679205851619644, "grad_norm": 1.1380119381425957, "learning_rate": 1.4294790135553969e-05, "loss": 0.1259, "step": 10852 }, { "epoch": 2.2681295715778473, "grad_norm": 0.9379522876350233, "learning_rate": 1.4293771327861347e-05, "loss": 0.1195, "step": 10853 }, { "epoch": 2.2683385579937303, "grad_norm": 1.1934183126582263, "learning_rate": 1.4292752465523702e-05, "loss": 0.1421, "step": 10854 }, { "epoch": 2.2685475444096133, "grad_norm": 0.953288353271106, "learning_rate": 1.4291733548553994e-05, "loss": 0.1213, "step": 10855 }, { "epoch": 2.268756530825496, "grad_norm": 1.0563952833008972, "learning_rate": 1.4290714576965196e-05, "loss": 0.1087, "step": 10856 }, { "epoch": 2.268965517241379, "grad_norm": 1.0587527510418067, "learning_rate": 1.4289695550770275e-05, "loss": 0.1452, "step": 10857 }, { "epoch": 2.269174503657262, "grad_norm": 0.8990663629560106, "learning_rate": 1.4288676469982194e-05, "loss": 0.1075, "step": 10858 }, { "epoch": 2.269383490073145, "grad_norm": 1.0821402751591074, "learning_rate": 1.4287657334613931e-05, "loss": 0.1124, "step": 10859 }, { "epoch": 2.269592476489028, "grad_norm": 0.9911348852338762, "learning_rate": 1.4286638144678456e-05, "loss": 0.1254, "step": 10860 }, { "epoch": 2.269801462904911, "grad_norm": 0.9618629818512718, "learning_rate": 1.4285618900188729e-05, "loss": 0.099, "step": 10861 }, { "epoch": 2.270010449320794, "grad_norm": 1.1392921215811642, "learning_rate": 1.4284599601157734e-05, "loss": 0.1219, "step": 10862 }, { "epoch": 2.270219435736677, "grad_norm": 1.0892141995646452, "learning_rate": 1.4283580247598437e-05, "loss": 0.1459, "step": 10863 }, { "epoch": 2.27042842215256, "grad_norm": 1.4779559850338522, "learning_rate": 1.4282560839523807e-05, "loss": 0.1607, "step": 10864 }, { "epoch": 2.270637408568443, "grad_norm": 1.1407413923177565, "learning_rate": 1.4281541376946828e-05, "loss": 0.1253, "step": 10865 }, { "epoch": 2.270846394984326, "grad_norm": 1.1050084769958375, "learning_rate": 1.4280521859880466e-05, "loss": 0.1253, "step": 10866 }, { "epoch": 2.271055381400209, "grad_norm": 1.0501297281343924, "learning_rate": 1.42795022883377e-05, "loss": 0.1169, "step": 10867 }, { "epoch": 2.2712643678160918, "grad_norm": 0.9469427003462455, "learning_rate": 1.42784826623315e-05, "loss": 0.1187, "step": 10868 }, { "epoch": 2.2714733542319747, "grad_norm": 0.985972560503953, "learning_rate": 1.4277462981874852e-05, "loss": 0.1196, "step": 10869 }, { "epoch": 2.2716823406478577, "grad_norm": 0.8883452156018551, "learning_rate": 1.4276443246980722e-05, "loss": 0.1127, "step": 10870 }, { "epoch": 2.2718913270637406, "grad_norm": 0.9229893646505336, "learning_rate": 1.4275423457662098e-05, "loss": 0.1087, "step": 10871 }, { "epoch": 2.2721003134796236, "grad_norm": 1.042641445743547, "learning_rate": 1.4274403613931953e-05, "loss": 0.1216, "step": 10872 }, { "epoch": 2.2723092998955066, "grad_norm": 0.8468169196598956, "learning_rate": 1.4273383715803266e-05, "loss": 0.0973, "step": 10873 }, { "epoch": 2.2725182863113895, "grad_norm": 0.8601489107650175, "learning_rate": 1.4272363763289017e-05, "loss": 0.0848, "step": 10874 }, { "epoch": 2.2727272727272725, "grad_norm": 1.160531852810354, "learning_rate": 1.427134375640219e-05, "loss": 0.1568, "step": 10875 }, { "epoch": 2.2729362591431554, "grad_norm": 1.0977791160727524, "learning_rate": 1.427032369515576e-05, "loss": 0.1189, "step": 10876 }, { "epoch": 2.273145245559039, "grad_norm": 1.1140255426719887, "learning_rate": 1.4269303579562715e-05, "loss": 0.1324, "step": 10877 }, { "epoch": 2.273354231974922, "grad_norm": 1.116104428057654, "learning_rate": 1.4268283409636032e-05, "loss": 0.1084, "step": 10878 }, { "epoch": 2.2735632183908048, "grad_norm": 0.9306269163143246, "learning_rate": 1.42672631853887e-05, "loss": 0.1204, "step": 10879 }, { "epoch": 2.2737722048066877, "grad_norm": 1.0958867936963888, "learning_rate": 1.4266242906833698e-05, "loss": 0.1545, "step": 10880 }, { "epoch": 2.2739811912225707, "grad_norm": 0.8512038217756144, "learning_rate": 1.4265222573984015e-05, "loss": 0.0954, "step": 10881 }, { "epoch": 2.2741901776384537, "grad_norm": 1.1246773785169426, "learning_rate": 1.426420218685263e-05, "loss": 0.1539, "step": 10882 }, { "epoch": 2.2743991640543366, "grad_norm": 1.0241238352877198, "learning_rate": 1.4263181745452539e-05, "loss": 0.1173, "step": 10883 }, { "epoch": 2.2746081504702196, "grad_norm": 0.9411094685889675, "learning_rate": 1.4262161249796718e-05, "loss": 0.1203, "step": 10884 }, { "epoch": 2.2748171368861025, "grad_norm": 1.105086623569688, "learning_rate": 1.4261140699898162e-05, "loss": 0.1518, "step": 10885 }, { "epoch": 2.2750261233019855, "grad_norm": 1.0060170219389264, "learning_rate": 1.4260120095769855e-05, "loss": 0.1055, "step": 10886 }, { "epoch": 2.2752351097178685, "grad_norm": 0.8568933073773902, "learning_rate": 1.4259099437424788e-05, "loss": 0.1199, "step": 10887 }, { "epoch": 2.2754440961337514, "grad_norm": 0.9624544819734947, "learning_rate": 1.4258078724875947e-05, "loss": 0.1167, "step": 10888 }, { "epoch": 2.2756530825496344, "grad_norm": 0.948191095815819, "learning_rate": 1.4257057958136331e-05, "loss": 0.1468, "step": 10889 }, { "epoch": 2.2758620689655173, "grad_norm": 1.1211426809818612, "learning_rate": 1.425603713721892e-05, "loss": 0.1351, "step": 10890 }, { "epoch": 2.2760710553814003, "grad_norm": 1.0832831279674866, "learning_rate": 1.4255016262136711e-05, "loss": 0.1031, "step": 10891 }, { "epoch": 2.2762800417972833, "grad_norm": 0.9475938647381434, "learning_rate": 1.4253995332902697e-05, "loss": 0.1207, "step": 10892 }, { "epoch": 2.2764890282131662, "grad_norm": 1.1683341981302542, "learning_rate": 1.4252974349529871e-05, "loss": 0.1202, "step": 10893 }, { "epoch": 2.276698014629049, "grad_norm": 0.8856635244137808, "learning_rate": 1.425195331203122e-05, "loss": 0.1036, "step": 10894 }, { "epoch": 2.276907001044932, "grad_norm": 1.0175671566121953, "learning_rate": 1.4250932220419747e-05, "loss": 0.1049, "step": 10895 }, { "epoch": 2.277115987460815, "grad_norm": 0.8757476350708652, "learning_rate": 1.4249911074708442e-05, "loss": 0.1002, "step": 10896 }, { "epoch": 2.277324973876698, "grad_norm": 1.0934580451833478, "learning_rate": 1.4248889874910302e-05, "loss": 0.1164, "step": 10897 }, { "epoch": 2.277533960292581, "grad_norm": 0.9143916022412282, "learning_rate": 1.4247868621038325e-05, "loss": 0.1104, "step": 10898 }, { "epoch": 2.277742946708464, "grad_norm": 1.3205942828810744, "learning_rate": 1.4246847313105508e-05, "loss": 0.15, "step": 10899 }, { "epoch": 2.277951933124347, "grad_norm": 0.9906386901828316, "learning_rate": 1.4245825951124843e-05, "loss": 0.1313, "step": 10900 }, { "epoch": 2.27816091954023, "grad_norm": 1.0733804317423303, "learning_rate": 1.4244804535109338e-05, "loss": 0.1234, "step": 10901 }, { "epoch": 2.278369905956113, "grad_norm": 1.1152703029165916, "learning_rate": 1.4243783065071984e-05, "loss": 0.1191, "step": 10902 }, { "epoch": 2.278578892371996, "grad_norm": 1.017918285176511, "learning_rate": 1.4242761541025783e-05, "loss": 0.1308, "step": 10903 }, { "epoch": 2.278787878787879, "grad_norm": 0.8760861958375268, "learning_rate": 1.4241739962983736e-05, "loss": 0.1255, "step": 10904 }, { "epoch": 2.2789968652037618, "grad_norm": 0.9927640444337039, "learning_rate": 1.4240718330958847e-05, "loss": 0.1003, "step": 10905 }, { "epoch": 2.2792058516196447, "grad_norm": 1.0956720138293479, "learning_rate": 1.4239696644964111e-05, "loss": 0.1137, "step": 10906 }, { "epoch": 2.2794148380355277, "grad_norm": 1.0536134963819412, "learning_rate": 1.4238674905012539e-05, "loss": 0.1187, "step": 10907 }, { "epoch": 2.2796238244514107, "grad_norm": 0.987414183027001, "learning_rate": 1.4237653111117128e-05, "loss": 0.1221, "step": 10908 }, { "epoch": 2.2798328108672936, "grad_norm": 0.9275020211463786, "learning_rate": 1.4236631263290888e-05, "loss": 0.111, "step": 10909 }, { "epoch": 2.2800417972831766, "grad_norm": 0.9661007225087724, "learning_rate": 1.4235609361546816e-05, "loss": 0.1074, "step": 10910 }, { "epoch": 2.2802507836990595, "grad_norm": 0.9413996716407278, "learning_rate": 1.4234587405897924e-05, "loss": 0.1122, "step": 10911 }, { "epoch": 2.2804597701149425, "grad_norm": 1.0955314178595272, "learning_rate": 1.4233565396357209e-05, "loss": 0.1088, "step": 10912 }, { "epoch": 2.2806687565308255, "grad_norm": 1.2503636222726728, "learning_rate": 1.423254333293769e-05, "loss": 0.1523, "step": 10913 }, { "epoch": 2.2808777429467084, "grad_norm": 0.9336549356680348, "learning_rate": 1.4231521215652366e-05, "loss": 0.1023, "step": 10914 }, { "epoch": 2.2810867293625914, "grad_norm": 0.8052196115033148, "learning_rate": 1.4230499044514247e-05, "loss": 0.093, "step": 10915 }, { "epoch": 2.2812957157784743, "grad_norm": 1.1530642493688241, "learning_rate": 1.4229476819536341e-05, "loss": 0.143, "step": 10916 }, { "epoch": 2.2815047021943573, "grad_norm": 0.9820810218558798, "learning_rate": 1.422845454073166e-05, "loss": 0.1231, "step": 10917 }, { "epoch": 2.2817136886102403, "grad_norm": 0.882575485525423, "learning_rate": 1.422743220811321e-05, "loss": 0.097, "step": 10918 }, { "epoch": 2.2819226750261232, "grad_norm": 0.9683602789877905, "learning_rate": 1.4226409821694009e-05, "loss": 0.1438, "step": 10919 }, { "epoch": 2.282131661442006, "grad_norm": 1.0375630595265135, "learning_rate": 1.4225387381487058e-05, "loss": 0.1298, "step": 10920 }, { "epoch": 2.282340647857889, "grad_norm": 1.0766490888850295, "learning_rate": 1.422436488750538e-05, "loss": 0.132, "step": 10921 }, { "epoch": 2.282549634273772, "grad_norm": 0.9442193161114549, "learning_rate": 1.422334233976198e-05, "loss": 0.1434, "step": 10922 }, { "epoch": 2.282758620689655, "grad_norm": 0.7865874777264356, "learning_rate": 1.4222319738269876e-05, "loss": 0.1241, "step": 10923 }, { "epoch": 2.282967607105538, "grad_norm": 1.0048440494659534, "learning_rate": 1.4221297083042076e-05, "loss": 0.1399, "step": 10924 }, { "epoch": 2.283176593521421, "grad_norm": 0.8186004216191883, "learning_rate": 1.4220274374091605e-05, "loss": 0.1009, "step": 10925 }, { "epoch": 2.283385579937304, "grad_norm": 1.083850791258651, "learning_rate": 1.421925161143147e-05, "loss": 0.1399, "step": 10926 }, { "epoch": 2.283594566353187, "grad_norm": 0.9485109607294541, "learning_rate": 1.4218228795074691e-05, "loss": 0.1197, "step": 10927 }, { "epoch": 2.28380355276907, "grad_norm": 0.9507919449482696, "learning_rate": 1.4217205925034284e-05, "loss": 0.1066, "step": 10928 }, { "epoch": 2.284012539184953, "grad_norm": 0.9917429586448923, "learning_rate": 1.4216183001323267e-05, "loss": 0.1377, "step": 10929 }, { "epoch": 2.284221525600836, "grad_norm": 1.0732710144896485, "learning_rate": 1.4215160023954657e-05, "loss": 0.1384, "step": 10930 }, { "epoch": 2.2844305120167188, "grad_norm": 0.8572600286679921, "learning_rate": 1.421413699294148e-05, "loss": 0.1165, "step": 10931 }, { "epoch": 2.2846394984326017, "grad_norm": 0.9810190950747048, "learning_rate": 1.4213113908296743e-05, "loss": 0.1493, "step": 10932 }, { "epoch": 2.2848484848484847, "grad_norm": 0.734933891507306, "learning_rate": 1.4212090770033475e-05, "loss": 0.1076, "step": 10933 }, { "epoch": 2.2850574712643676, "grad_norm": 0.7697773487947424, "learning_rate": 1.4211067578164695e-05, "loss": 0.1007, "step": 10934 }, { "epoch": 2.2852664576802506, "grad_norm": 0.9523053210277599, "learning_rate": 1.4210044332703428e-05, "loss": 0.1104, "step": 10935 }, { "epoch": 2.2854754440961336, "grad_norm": 0.8199662642330003, "learning_rate": 1.4209021033662692e-05, "loss": 0.0907, "step": 10936 }, { "epoch": 2.2856844305120165, "grad_norm": 1.0415860653264524, "learning_rate": 1.420799768105551e-05, "loss": 0.119, "step": 10937 }, { "epoch": 2.2858934169279, "grad_norm": 0.8901715285493192, "learning_rate": 1.4206974274894909e-05, "loss": 0.106, "step": 10938 }, { "epoch": 2.286102403343783, "grad_norm": 1.0357204145976362, "learning_rate": 1.420595081519391e-05, "loss": 0.1385, "step": 10939 }, { "epoch": 2.286311389759666, "grad_norm": 1.0394603885278566, "learning_rate": 1.420492730196554e-05, "loss": 0.0997, "step": 10940 }, { "epoch": 2.286520376175549, "grad_norm": 0.960960378708857, "learning_rate": 1.4203903735222826e-05, "loss": 0.0978, "step": 10941 }, { "epoch": 2.286729362591432, "grad_norm": 0.9423517426339949, "learning_rate": 1.4202880114978794e-05, "loss": 0.12, "step": 10942 }, { "epoch": 2.2869383490073147, "grad_norm": 0.9266214065108365, "learning_rate": 1.4201856441246469e-05, "loss": 0.1113, "step": 10943 }, { "epoch": 2.2871473354231977, "grad_norm": 1.0446423858831997, "learning_rate": 1.4200832714038882e-05, "loss": 0.1293, "step": 10944 }, { "epoch": 2.2873563218390807, "grad_norm": 1.2265839777250573, "learning_rate": 1.4199808933369057e-05, "loss": 0.1046, "step": 10945 }, { "epoch": 2.2875653082549636, "grad_norm": 1.6232556664659694, "learning_rate": 1.4198785099250028e-05, "loss": 0.1294, "step": 10946 }, { "epoch": 2.2877742946708466, "grad_norm": 1.0640917713232392, "learning_rate": 1.4197761211694823e-05, "loss": 0.1241, "step": 10947 }, { "epoch": 2.2879832810867295, "grad_norm": 1.1711970886603986, "learning_rate": 1.4196737270716472e-05, "loss": 0.1399, "step": 10948 }, { "epoch": 2.2881922675026125, "grad_norm": 1.0121505451679196, "learning_rate": 1.4195713276328006e-05, "loss": 0.1311, "step": 10949 }, { "epoch": 2.2884012539184955, "grad_norm": 1.023212627476572, "learning_rate": 1.4194689228542462e-05, "loss": 0.1172, "step": 10950 }, { "epoch": 2.2886102403343784, "grad_norm": 1.0926618472077088, "learning_rate": 1.4193665127372863e-05, "loss": 0.1427, "step": 10951 }, { "epoch": 2.2888192267502614, "grad_norm": 0.8598941387950226, "learning_rate": 1.419264097283225e-05, "loss": 0.0985, "step": 10952 }, { "epoch": 2.2890282131661444, "grad_norm": 1.088283880407889, "learning_rate": 1.4191616764933656e-05, "loss": 0.1263, "step": 10953 }, { "epoch": 2.2892371995820273, "grad_norm": 0.9770042104080926, "learning_rate": 1.4190592503690113e-05, "loss": 0.1165, "step": 10954 }, { "epoch": 2.2894461859979103, "grad_norm": 1.0385808179364577, "learning_rate": 1.4189568189114655e-05, "loss": 0.1245, "step": 10955 }, { "epoch": 2.2896551724137932, "grad_norm": 0.7557843503055234, "learning_rate": 1.4188543821220327e-05, "loss": 0.081, "step": 10956 }, { "epoch": 2.289864158829676, "grad_norm": 0.9965154933002296, "learning_rate": 1.4187519400020154e-05, "loss": 0.1096, "step": 10957 }, { "epoch": 2.290073145245559, "grad_norm": 0.95247481318516, "learning_rate": 1.4186494925527181e-05, "loss": 0.1118, "step": 10958 }, { "epoch": 2.290282131661442, "grad_norm": 0.9568793807534101, "learning_rate": 1.4185470397754444e-05, "loss": 0.1307, "step": 10959 }, { "epoch": 2.290491118077325, "grad_norm": 1.195259569472282, "learning_rate": 1.418444581671498e-05, "loss": 0.1452, "step": 10960 }, { "epoch": 2.290700104493208, "grad_norm": 0.8984647747392821, "learning_rate": 1.4183421182421833e-05, "loss": 0.0992, "step": 10961 }, { "epoch": 2.290909090909091, "grad_norm": 1.0298008489440484, "learning_rate": 1.418239649488804e-05, "loss": 0.119, "step": 10962 }, { "epoch": 2.291118077324974, "grad_norm": 0.9814471860644401, "learning_rate": 1.4181371754126637e-05, "loss": 0.1429, "step": 10963 }, { "epoch": 2.291327063740857, "grad_norm": 1.0672387452512953, "learning_rate": 1.4180346960150675e-05, "loss": 0.1224, "step": 10964 }, { "epoch": 2.29153605015674, "grad_norm": 1.1092901387361704, "learning_rate": 1.4179322112973189e-05, "loss": 0.1152, "step": 10965 }, { "epoch": 2.291745036572623, "grad_norm": 0.9239471564418855, "learning_rate": 1.4178297212607226e-05, "loss": 0.0854, "step": 10966 }, { "epoch": 2.291954022988506, "grad_norm": 0.975397565592426, "learning_rate": 1.4177272259065826e-05, "loss": 0.1154, "step": 10967 }, { "epoch": 2.2921630094043888, "grad_norm": 1.0561060169602798, "learning_rate": 1.4176247252362039e-05, "loss": 0.1176, "step": 10968 }, { "epoch": 2.2923719958202717, "grad_norm": 1.1115817046749623, "learning_rate": 1.41752221925089e-05, "loss": 0.1182, "step": 10969 }, { "epoch": 2.2925809822361547, "grad_norm": 1.0638965746182876, "learning_rate": 1.4174197079519462e-05, "loss": 0.1276, "step": 10970 }, { "epoch": 2.2927899686520377, "grad_norm": 1.1493899396665033, "learning_rate": 1.4173171913406772e-05, "loss": 0.1365, "step": 10971 }, { "epoch": 2.2929989550679206, "grad_norm": 0.993823432392822, "learning_rate": 1.4172146694183872e-05, "loss": 0.1202, "step": 10972 }, { "epoch": 2.2932079414838036, "grad_norm": 1.0593099070519063, "learning_rate": 1.4171121421863812e-05, "loss": 0.1529, "step": 10973 }, { "epoch": 2.2934169278996865, "grad_norm": 2.3180852399298884, "learning_rate": 1.4170096096459642e-05, "loss": 0.1182, "step": 10974 }, { "epoch": 2.2936259143155695, "grad_norm": 1.0139695524595127, "learning_rate": 1.4169070717984404e-05, "loss": 0.1217, "step": 10975 }, { "epoch": 2.2938349007314525, "grad_norm": 1.074638429001517, "learning_rate": 1.4168045286451158e-05, "loss": 0.1454, "step": 10976 }, { "epoch": 2.2940438871473354, "grad_norm": 0.8610886234651091, "learning_rate": 1.4167019801872946e-05, "loss": 0.1158, "step": 10977 }, { "epoch": 2.2942528735632184, "grad_norm": 0.8815755369047616, "learning_rate": 1.416599426426282e-05, "loss": 0.1185, "step": 10978 }, { "epoch": 2.2944618599791013, "grad_norm": 1.2394325358871814, "learning_rate": 1.4164968673633837e-05, "loss": 0.1321, "step": 10979 }, { "epoch": 2.2946708463949843, "grad_norm": 1.0565249794937288, "learning_rate": 1.4163943029999043e-05, "loss": 0.1448, "step": 10980 }, { "epoch": 2.2948798328108673, "grad_norm": 1.2073332055061106, "learning_rate": 1.4162917333371493e-05, "loss": 0.1415, "step": 10981 }, { "epoch": 2.2950888192267502, "grad_norm": 1.1336612887334947, "learning_rate": 1.4161891583764244e-05, "loss": 0.0949, "step": 10982 }, { "epoch": 2.295297805642633, "grad_norm": 0.8306640253937482, "learning_rate": 1.4160865781190345e-05, "loss": 0.1063, "step": 10983 }, { "epoch": 2.295506792058516, "grad_norm": 1.1585771282674027, "learning_rate": 1.4159839925662853e-05, "loss": 0.1236, "step": 10984 }, { "epoch": 2.295715778474399, "grad_norm": 1.5883115906619507, "learning_rate": 1.4158814017194824e-05, "loss": 0.1372, "step": 10985 }, { "epoch": 2.295924764890282, "grad_norm": 0.8740069792023242, "learning_rate": 1.4157788055799316e-05, "loss": 0.0992, "step": 10986 }, { "epoch": 2.296133751306165, "grad_norm": 1.1124251066381199, "learning_rate": 1.415676204148938e-05, "loss": 0.1411, "step": 10987 }, { "epoch": 2.296342737722048, "grad_norm": 0.863545674699453, "learning_rate": 1.4155735974278082e-05, "loss": 0.1157, "step": 10988 }, { "epoch": 2.296551724137931, "grad_norm": 0.9042398086193015, "learning_rate": 1.4154709854178475e-05, "loss": 0.1263, "step": 10989 }, { "epoch": 2.296760710553814, "grad_norm": 1.0261276333067397, "learning_rate": 1.4153683681203618e-05, "loss": 0.1173, "step": 10990 }, { "epoch": 2.296969696969697, "grad_norm": 0.9998167413390044, "learning_rate": 1.4152657455366573e-05, "loss": 0.1261, "step": 10991 }, { "epoch": 2.29717868338558, "grad_norm": 1.0157304088612982, "learning_rate": 1.41516311766804e-05, "loss": 0.1502, "step": 10992 }, { "epoch": 2.297387669801463, "grad_norm": 1.1179017995547287, "learning_rate": 1.4150604845158156e-05, "loss": 0.1248, "step": 10993 }, { "epoch": 2.2975966562173458, "grad_norm": 0.9812060365817614, "learning_rate": 1.4149578460812909e-05, "loss": 0.1295, "step": 10994 }, { "epoch": 2.2978056426332287, "grad_norm": 1.0092647224357265, "learning_rate": 1.4148552023657716e-05, "loss": 0.123, "step": 10995 }, { "epoch": 2.2980146290491117, "grad_norm": 1.0051680595306913, "learning_rate": 1.4147525533705644e-05, "loss": 0.1454, "step": 10996 }, { "epoch": 2.2982236154649947, "grad_norm": 1.0730470173979694, "learning_rate": 1.4146498990969754e-05, "loss": 0.11, "step": 10997 }, { "epoch": 2.2984326018808776, "grad_norm": 1.068096160988076, "learning_rate": 1.4145472395463114e-05, "loss": 0.116, "step": 10998 }, { "epoch": 2.2986415882967606, "grad_norm": 0.8520309247061464, "learning_rate": 1.414444574719878e-05, "loss": 0.1101, "step": 10999 }, { "epoch": 2.2988505747126435, "grad_norm": 1.1449437818522354, "learning_rate": 1.4143419046189831e-05, "loss": 0.1165, "step": 11000 }, { "epoch": 2.2990595611285265, "grad_norm": 1.1725796863643296, "learning_rate": 1.4142392292449323e-05, "loss": 0.1241, "step": 11001 }, { "epoch": 2.2992685475444095, "grad_norm": 1.0759037869973105, "learning_rate": 1.4141365485990326e-05, "loss": 0.1348, "step": 11002 }, { "epoch": 2.2994775339602924, "grad_norm": 1.251951599851503, "learning_rate": 1.4140338626825909e-05, "loss": 0.1596, "step": 11003 }, { "epoch": 2.2996865203761754, "grad_norm": 0.8478141688296483, "learning_rate": 1.4139311714969142e-05, "loss": 0.0949, "step": 11004 }, { "epoch": 2.2998955067920583, "grad_norm": 0.9026291472228971, "learning_rate": 1.413828475043309e-05, "loss": 0.1076, "step": 11005 }, { "epoch": 2.3001044932079413, "grad_norm": 1.0072890112249993, "learning_rate": 1.4137257733230825e-05, "loss": 0.1191, "step": 11006 }, { "epoch": 2.3003134796238243, "grad_norm": 1.0157709734855624, "learning_rate": 1.4136230663375417e-05, "loss": 0.1384, "step": 11007 }, { "epoch": 2.3005224660397072, "grad_norm": 1.1696520631660805, "learning_rate": 1.4135203540879934e-05, "loss": 0.1355, "step": 11008 }, { "epoch": 2.30073145245559, "grad_norm": 0.8072711530710208, "learning_rate": 1.4134176365757455e-05, "loss": 0.1024, "step": 11009 }, { "epoch": 2.300940438871473, "grad_norm": 1.2540800569652644, "learning_rate": 1.4133149138021047e-05, "loss": 0.1158, "step": 11010 }, { "epoch": 2.301149425287356, "grad_norm": 0.9522308128350647, "learning_rate": 1.4132121857683782e-05, "loss": 0.1262, "step": 11011 }, { "epoch": 2.301358411703239, "grad_norm": 0.9750197078069859, "learning_rate": 1.413109452475874e-05, "loss": 0.0977, "step": 11012 }, { "epoch": 2.301567398119122, "grad_norm": 1.1101265850935658, "learning_rate": 1.4130067139258991e-05, "loss": 0.1363, "step": 11013 }, { "epoch": 2.301776384535005, "grad_norm": 0.9640795987330696, "learning_rate": 1.4129039701197607e-05, "loss": 0.1211, "step": 11014 }, { "epoch": 2.301985370950888, "grad_norm": 0.9171152825695411, "learning_rate": 1.4128012210587672e-05, "loss": 0.1021, "step": 11015 }, { "epoch": 2.302194357366771, "grad_norm": 0.9091954679304401, "learning_rate": 1.4126984667442254e-05, "loss": 0.1236, "step": 11016 }, { "epoch": 2.3024033437826543, "grad_norm": 1.0711525891388156, "learning_rate": 1.4125957071774434e-05, "loss": 0.1369, "step": 11017 }, { "epoch": 2.3026123301985373, "grad_norm": 1.1333932727227956, "learning_rate": 1.4124929423597291e-05, "loss": 0.1292, "step": 11018 }, { "epoch": 2.3028213166144202, "grad_norm": 1.1969029609100892, "learning_rate": 1.4123901722923904e-05, "loss": 0.1366, "step": 11019 }, { "epoch": 2.303030303030303, "grad_norm": 1.2339225446209452, "learning_rate": 1.4122873969767348e-05, "loss": 0.1625, "step": 11020 }, { "epoch": 2.303239289446186, "grad_norm": 0.9968170001853561, "learning_rate": 1.4121846164140706e-05, "loss": 0.1181, "step": 11021 }, { "epoch": 2.303448275862069, "grad_norm": 1.1196479894169569, "learning_rate": 1.4120818306057058e-05, "loss": 0.1583, "step": 11022 }, { "epoch": 2.303657262277952, "grad_norm": 1.1301200580052448, "learning_rate": 1.4119790395529483e-05, "loss": 0.1219, "step": 11023 }, { "epoch": 2.303866248693835, "grad_norm": 0.9590348008376031, "learning_rate": 1.4118762432571065e-05, "loss": 0.1217, "step": 11024 }, { "epoch": 2.304075235109718, "grad_norm": 0.9947752004186422, "learning_rate": 1.4117734417194889e-05, "loss": 0.1204, "step": 11025 }, { "epoch": 2.304284221525601, "grad_norm": 0.9650865684850414, "learning_rate": 1.411670634941403e-05, "loss": 0.1188, "step": 11026 }, { "epoch": 2.304493207941484, "grad_norm": 0.9907316379803601, "learning_rate": 1.4115678229241579e-05, "loss": 0.1192, "step": 11027 }, { "epoch": 2.304702194357367, "grad_norm": 0.9060734305293038, "learning_rate": 1.4114650056690618e-05, "loss": 0.1226, "step": 11028 }, { "epoch": 2.30491118077325, "grad_norm": 0.8971577879265913, "learning_rate": 1.4113621831774232e-05, "loss": 0.1044, "step": 11029 }, { "epoch": 2.305120167189133, "grad_norm": 0.8757073886917561, "learning_rate": 1.4112593554505508e-05, "loss": 0.1045, "step": 11030 }, { "epoch": 2.305329153605016, "grad_norm": 0.9817910887005451, "learning_rate": 1.4111565224897532e-05, "loss": 0.1102, "step": 11031 }, { "epoch": 2.3055381400208987, "grad_norm": 0.9287233953279452, "learning_rate": 1.411053684296339e-05, "loss": 0.1163, "step": 11032 }, { "epoch": 2.3057471264367817, "grad_norm": 1.017695857716407, "learning_rate": 1.410950840871617e-05, "loss": 0.0932, "step": 11033 }, { "epoch": 2.3059561128526647, "grad_norm": 0.9220850374550694, "learning_rate": 1.4108479922168962e-05, "loss": 0.1106, "step": 11034 }, { "epoch": 2.3061650992685476, "grad_norm": 0.7914395888258027, "learning_rate": 1.4107451383334853e-05, "loss": 0.1079, "step": 11035 }, { "epoch": 2.3063740856844306, "grad_norm": 0.9061491513471435, "learning_rate": 1.4106422792226935e-05, "loss": 0.1134, "step": 11036 }, { "epoch": 2.3065830721003135, "grad_norm": 1.060791705952293, "learning_rate": 1.4105394148858298e-05, "loss": 0.0975, "step": 11037 }, { "epoch": 2.3067920585161965, "grad_norm": 1.2486237316284514, "learning_rate": 1.410436545324203e-05, "loss": 0.1459, "step": 11038 }, { "epoch": 2.3070010449320795, "grad_norm": 1.1091642629611087, "learning_rate": 1.4103336705391227e-05, "loss": 0.1015, "step": 11039 }, { "epoch": 2.3072100313479624, "grad_norm": 0.9534297827417693, "learning_rate": 1.410230790531898e-05, "loss": 0.1201, "step": 11040 }, { "epoch": 2.3074190177638454, "grad_norm": 1.1588195962497099, "learning_rate": 1.4101279053038378e-05, "loss": 0.1372, "step": 11041 }, { "epoch": 2.3076280041797284, "grad_norm": 0.9413559472929081, "learning_rate": 1.4100250148562523e-05, "loss": 0.1354, "step": 11042 }, { "epoch": 2.3078369905956113, "grad_norm": 0.9016657648565118, "learning_rate": 1.4099221191904504e-05, "loss": 0.1152, "step": 11043 }, { "epoch": 2.3080459770114943, "grad_norm": 0.8802592956645842, "learning_rate": 1.4098192183077415e-05, "loss": 0.1164, "step": 11044 }, { "epoch": 2.3082549634273772, "grad_norm": 0.9517655877845468, "learning_rate": 1.4097163122094358e-05, "loss": 0.1038, "step": 11045 }, { "epoch": 2.30846394984326, "grad_norm": 0.7256735147478198, "learning_rate": 1.409613400896842e-05, "loss": 0.0922, "step": 11046 }, { "epoch": 2.308672936259143, "grad_norm": 0.9354306411806717, "learning_rate": 1.4095104843712706e-05, "loss": 0.0916, "step": 11047 }, { "epoch": 2.308881922675026, "grad_norm": 1.0308102139954656, "learning_rate": 1.4094075626340311e-05, "loss": 0.1127, "step": 11048 }, { "epoch": 2.309090909090909, "grad_norm": 0.9672326486368136, "learning_rate": 1.4093046356864335e-05, "loss": 0.1318, "step": 11049 }, { "epoch": 2.309299895506792, "grad_norm": 1.0284492993596661, "learning_rate": 1.4092017035297872e-05, "loss": 0.1498, "step": 11050 }, { "epoch": 2.309508881922675, "grad_norm": 1.0429137458325204, "learning_rate": 1.409098766165403e-05, "loss": 0.139, "step": 11051 }, { "epoch": 2.309717868338558, "grad_norm": 1.0531054568257106, "learning_rate": 1.40899582359459e-05, "loss": 0.1134, "step": 11052 }, { "epoch": 2.309926854754441, "grad_norm": 1.1048407214899443, "learning_rate": 1.408892875818659e-05, "loss": 0.1289, "step": 11053 }, { "epoch": 2.310135841170324, "grad_norm": 1.0131105504314324, "learning_rate": 1.4087899228389199e-05, "loss": 0.1242, "step": 11054 }, { "epoch": 2.310344827586207, "grad_norm": 0.9676143321703742, "learning_rate": 1.4086869646566833e-05, "loss": 0.1164, "step": 11055 }, { "epoch": 2.31055381400209, "grad_norm": 1.1020243157102243, "learning_rate": 1.4085840012732588e-05, "loss": 0.103, "step": 11056 }, { "epoch": 2.3107628004179728, "grad_norm": 1.2136516961628732, "learning_rate": 1.4084810326899574e-05, "loss": 0.1153, "step": 11057 }, { "epoch": 2.3109717868338557, "grad_norm": 1.0586621492475543, "learning_rate": 1.408378058908089e-05, "loss": 0.0977, "step": 11058 }, { "epoch": 2.3111807732497387, "grad_norm": 0.9631966124780785, "learning_rate": 1.4082750799289645e-05, "loss": 0.1344, "step": 11059 }, { "epoch": 2.3113897596656217, "grad_norm": 0.9858018500246825, "learning_rate": 1.4081720957538947e-05, "loss": 0.1008, "step": 11060 }, { "epoch": 2.3115987460815046, "grad_norm": 1.106297025978483, "learning_rate": 1.4080691063841897e-05, "loss": 0.1279, "step": 11061 }, { "epoch": 2.3118077324973876, "grad_norm": 0.8551945117164654, "learning_rate": 1.4079661118211605e-05, "loss": 0.0932, "step": 11062 }, { "epoch": 2.3120167189132705, "grad_norm": 0.9276916564372465, "learning_rate": 1.4078631120661179e-05, "loss": 0.1278, "step": 11063 }, { "epoch": 2.3122257053291535, "grad_norm": 1.118072429958454, "learning_rate": 1.4077601071203724e-05, "loss": 0.1362, "step": 11064 }, { "epoch": 2.3124346917450365, "grad_norm": 1.0564925495956121, "learning_rate": 1.4076570969852352e-05, "loss": 0.1216, "step": 11065 }, { "epoch": 2.3126436781609194, "grad_norm": 0.9732763914260699, "learning_rate": 1.4075540816620172e-05, "loss": 0.1156, "step": 11066 }, { "epoch": 2.3128526645768024, "grad_norm": 1.050098091335231, "learning_rate": 1.4074510611520296e-05, "loss": 0.1075, "step": 11067 }, { "epoch": 2.3130616509926853, "grad_norm": 1.4208359501317642, "learning_rate": 1.4073480354565829e-05, "loss": 0.1168, "step": 11068 }, { "epoch": 2.3132706374085683, "grad_norm": 1.1382077756452254, "learning_rate": 1.407245004576989e-05, "loss": 0.1012, "step": 11069 }, { "epoch": 2.3134796238244513, "grad_norm": 0.9992704444409676, "learning_rate": 1.4071419685145587e-05, "loss": 0.1373, "step": 11070 }, { "epoch": 2.3136886102403342, "grad_norm": 1.0817934791063466, "learning_rate": 1.4070389272706035e-05, "loss": 0.1285, "step": 11071 }, { "epoch": 2.313897596656217, "grad_norm": 1.0414420055065967, "learning_rate": 1.4069358808464347e-05, "loss": 0.1285, "step": 11072 }, { "epoch": 2.3141065830721, "grad_norm": 0.9031471526066471, "learning_rate": 1.4068328292433638e-05, "loss": 0.1122, "step": 11073 }, { "epoch": 2.314315569487983, "grad_norm": 0.9086940274093874, "learning_rate": 1.4067297724627018e-05, "loss": 0.1155, "step": 11074 }, { "epoch": 2.314524555903866, "grad_norm": 0.9858416355270506, "learning_rate": 1.406626710505761e-05, "loss": 0.1196, "step": 11075 }, { "epoch": 2.314733542319749, "grad_norm": 0.9184506091843287, "learning_rate": 1.4065236433738527e-05, "loss": 0.1128, "step": 11076 }, { "epoch": 2.314942528735632, "grad_norm": 0.9815791688165805, "learning_rate": 1.4064205710682886e-05, "loss": 0.1089, "step": 11077 }, { "epoch": 2.315151515151515, "grad_norm": 1.1309058787274573, "learning_rate": 1.40631749359038e-05, "loss": 0.1198, "step": 11078 }, { "epoch": 2.3153605015673984, "grad_norm": 0.9661410715550116, "learning_rate": 1.4062144109414398e-05, "loss": 0.1286, "step": 11079 }, { "epoch": 2.3155694879832813, "grad_norm": 0.9197928232623799, "learning_rate": 1.4061113231227788e-05, "loss": 0.1116, "step": 11080 }, { "epoch": 2.3157784743991643, "grad_norm": 1.0100800233775273, "learning_rate": 1.4060082301357096e-05, "loss": 0.1265, "step": 11081 }, { "epoch": 2.3159874608150472, "grad_norm": 0.9897805535023138, "learning_rate": 1.405905131981544e-05, "loss": 0.1238, "step": 11082 }, { "epoch": 2.31619644723093, "grad_norm": 0.9137145678959683, "learning_rate": 1.405802028661594e-05, "loss": 0.1203, "step": 11083 }, { "epoch": 2.316405433646813, "grad_norm": 1.0670664012400959, "learning_rate": 1.4056989201771723e-05, "loss": 0.1493, "step": 11084 }, { "epoch": 2.316614420062696, "grad_norm": 1.2828786697376309, "learning_rate": 1.4055958065295902e-05, "loss": 0.1348, "step": 11085 }, { "epoch": 2.316823406478579, "grad_norm": 0.9365430444213086, "learning_rate": 1.4054926877201607e-05, "loss": 0.1234, "step": 11086 }, { "epoch": 2.317032392894462, "grad_norm": 0.9227531469849151, "learning_rate": 1.4053895637501957e-05, "loss": 0.1161, "step": 11087 }, { "epoch": 2.317241379310345, "grad_norm": 1.045787037507133, "learning_rate": 1.4052864346210083e-05, "loss": 0.1466, "step": 11088 }, { "epoch": 2.317450365726228, "grad_norm": 0.860962100434273, "learning_rate": 1.4051833003339099e-05, "loss": 0.1288, "step": 11089 }, { "epoch": 2.317659352142111, "grad_norm": 1.1074959263212454, "learning_rate": 1.405080160890214e-05, "loss": 0.1223, "step": 11090 }, { "epoch": 2.317868338557994, "grad_norm": 0.9327086308765415, "learning_rate": 1.404977016291233e-05, "loss": 0.1059, "step": 11091 }, { "epoch": 2.318077324973877, "grad_norm": 0.9961433811852369, "learning_rate": 1.4048738665382793e-05, "loss": 0.1216, "step": 11092 }, { "epoch": 2.31828631138976, "grad_norm": 1.2208103431743982, "learning_rate": 1.4047707116326657e-05, "loss": 0.1575, "step": 11093 }, { "epoch": 2.318495297805643, "grad_norm": 0.9401499591776472, "learning_rate": 1.4046675515757053e-05, "loss": 0.1245, "step": 11094 }, { "epoch": 2.3187042842215257, "grad_norm": 0.9175196574214407, "learning_rate": 1.4045643863687107e-05, "loss": 0.1155, "step": 11095 }, { "epoch": 2.3189132706374087, "grad_norm": 1.0174768499741333, "learning_rate": 1.404461216012995e-05, "loss": 0.1245, "step": 11096 }, { "epoch": 2.3191222570532917, "grad_norm": 1.1539553812269328, "learning_rate": 1.4043580405098709e-05, "loss": 0.1384, "step": 11097 }, { "epoch": 2.3193312434691746, "grad_norm": 0.8768706675894467, "learning_rate": 1.404254859860652e-05, "loss": 0.0994, "step": 11098 }, { "epoch": 2.3195402298850576, "grad_norm": 1.3963085354912013, "learning_rate": 1.4041516740666507e-05, "loss": 0.1345, "step": 11099 }, { "epoch": 2.3197492163009406, "grad_norm": 0.9134673868296352, "learning_rate": 1.4040484831291812e-05, "loss": 0.1047, "step": 11100 }, { "epoch": 2.3199582027168235, "grad_norm": 1.1732309608242968, "learning_rate": 1.4039452870495559e-05, "loss": 0.1106, "step": 11101 }, { "epoch": 2.3201671891327065, "grad_norm": 1.0357163720616376, "learning_rate": 1.4038420858290885e-05, "loss": 0.1342, "step": 11102 }, { "epoch": 2.3203761755485894, "grad_norm": 1.0282147363142082, "learning_rate": 1.4037388794690923e-05, "loss": 0.1326, "step": 11103 }, { "epoch": 2.3205851619644724, "grad_norm": 1.1796152699407205, "learning_rate": 1.4036356679708807e-05, "loss": 0.1487, "step": 11104 }, { "epoch": 2.3207941483803554, "grad_norm": 0.9918724701324808, "learning_rate": 1.4035324513357675e-05, "loss": 0.1248, "step": 11105 }, { "epoch": 2.3210031347962383, "grad_norm": 0.9612851780565768, "learning_rate": 1.4034292295650662e-05, "loss": 0.1279, "step": 11106 }, { "epoch": 2.3212121212121213, "grad_norm": 0.9291238540104596, "learning_rate": 1.4033260026600901e-05, "loss": 0.1239, "step": 11107 }, { "epoch": 2.3214211076280042, "grad_norm": 1.0831923201930287, "learning_rate": 1.4032227706221534e-05, "loss": 0.1349, "step": 11108 }, { "epoch": 2.321630094043887, "grad_norm": 0.8851565171690824, "learning_rate": 1.4031195334525698e-05, "loss": 0.1025, "step": 11109 }, { "epoch": 2.32183908045977, "grad_norm": 0.8464953730363924, "learning_rate": 1.4030162911526528e-05, "loss": 0.106, "step": 11110 }, { "epoch": 2.322048066875653, "grad_norm": 1.0745433137609919, "learning_rate": 1.4029130437237169e-05, "loss": 0.1259, "step": 11111 }, { "epoch": 2.322257053291536, "grad_norm": 1.034418075488099, "learning_rate": 1.4028097911670755e-05, "loss": 0.1149, "step": 11112 }, { "epoch": 2.322466039707419, "grad_norm": 1.064260348737139, "learning_rate": 1.402706533484043e-05, "loss": 0.1419, "step": 11113 }, { "epoch": 2.322675026123302, "grad_norm": 0.923445985258718, "learning_rate": 1.4026032706759334e-05, "loss": 0.1167, "step": 11114 }, { "epoch": 2.322884012539185, "grad_norm": 1.0013875089751134, "learning_rate": 1.402500002744061e-05, "loss": 0.1137, "step": 11115 }, { "epoch": 2.323092998955068, "grad_norm": 0.9705831361999463, "learning_rate": 1.4023967296897399e-05, "loss": 0.1043, "step": 11116 }, { "epoch": 2.323301985370951, "grad_norm": 1.181426545488094, "learning_rate": 1.4022934515142845e-05, "loss": 0.1379, "step": 11117 }, { "epoch": 2.323510971786834, "grad_norm": 0.972705903937939, "learning_rate": 1.4021901682190093e-05, "loss": 0.1321, "step": 11118 }, { "epoch": 2.323719958202717, "grad_norm": 1.1467498506845868, "learning_rate": 1.4020868798052284e-05, "loss": 0.1476, "step": 11119 }, { "epoch": 2.3239289446186, "grad_norm": 0.9998755911172581, "learning_rate": 1.4019835862742567e-05, "loss": 0.1318, "step": 11120 }, { "epoch": 2.3241379310344827, "grad_norm": 0.7863925998286099, "learning_rate": 1.4018802876274084e-05, "loss": 0.0914, "step": 11121 }, { "epoch": 2.3243469174503657, "grad_norm": 0.9456305849114912, "learning_rate": 1.4017769838659984e-05, "loss": 0.1168, "step": 11122 }, { "epoch": 2.3245559038662487, "grad_norm": 0.970822779340068, "learning_rate": 1.4016736749913413e-05, "loss": 0.1232, "step": 11123 }, { "epoch": 2.3247648902821316, "grad_norm": 0.8958095034043373, "learning_rate": 1.401570361004752e-05, "loss": 0.0961, "step": 11124 }, { "epoch": 2.3249738766980146, "grad_norm": 0.9292156163758402, "learning_rate": 1.4014670419075449e-05, "loss": 0.1144, "step": 11125 }, { "epoch": 2.3251828631138975, "grad_norm": 1.3202523642025135, "learning_rate": 1.4013637177010355e-05, "loss": 0.1616, "step": 11126 }, { "epoch": 2.3253918495297805, "grad_norm": 1.1138997636684587, "learning_rate": 1.4012603883865386e-05, "loss": 0.137, "step": 11127 }, { "epoch": 2.3256008359456635, "grad_norm": 1.2395786402753268, "learning_rate": 1.4011570539653689e-05, "loss": 0.1133, "step": 11128 }, { "epoch": 2.3258098223615464, "grad_norm": 1.275458596389811, "learning_rate": 1.4010537144388416e-05, "loss": 0.136, "step": 11129 }, { "epoch": 2.3260188087774294, "grad_norm": 1.1524674795363463, "learning_rate": 1.4009503698082725e-05, "loss": 0.1292, "step": 11130 }, { "epoch": 2.3262277951933124, "grad_norm": 0.8899755013490435, "learning_rate": 1.4008470200749757e-05, "loss": 0.1213, "step": 11131 }, { "epoch": 2.3264367816091953, "grad_norm": 1.103907046067124, "learning_rate": 1.4007436652402675e-05, "loss": 0.1429, "step": 11132 }, { "epoch": 2.3266457680250783, "grad_norm": 0.9859133624104653, "learning_rate": 1.4006403053054627e-05, "loss": 0.1322, "step": 11133 }, { "epoch": 2.3268547544409612, "grad_norm": 0.8945766991479623, "learning_rate": 1.4005369402718767e-05, "loss": 0.1099, "step": 11134 }, { "epoch": 2.327063740856844, "grad_norm": 1.0361527685558434, "learning_rate": 1.4004335701408254e-05, "loss": 0.1147, "step": 11135 }, { "epoch": 2.327272727272727, "grad_norm": 1.1644369255717766, "learning_rate": 1.4003301949136241e-05, "loss": 0.1307, "step": 11136 }, { "epoch": 2.32748171368861, "grad_norm": 1.0307693505383915, "learning_rate": 1.4002268145915881e-05, "loss": 0.1415, "step": 11137 }, { "epoch": 2.327690700104493, "grad_norm": 1.036288746265341, "learning_rate": 1.4001234291760335e-05, "loss": 0.1248, "step": 11138 }, { "epoch": 2.327899686520376, "grad_norm": 0.9976451547454752, "learning_rate": 1.4000200386682758e-05, "loss": 0.1481, "step": 11139 }, { "epoch": 2.328108672936259, "grad_norm": 1.002427320286118, "learning_rate": 1.3999166430696311e-05, "loss": 0.1399, "step": 11140 }, { "epoch": 2.328317659352142, "grad_norm": 1.0248660957347737, "learning_rate": 1.3998132423814153e-05, "loss": 0.1642, "step": 11141 }, { "epoch": 2.328526645768025, "grad_norm": 1.0090008385196136, "learning_rate": 1.399709836604944e-05, "loss": 0.1165, "step": 11142 }, { "epoch": 2.328735632183908, "grad_norm": 0.9779557251651967, "learning_rate": 1.399606425741533e-05, "loss": 0.1336, "step": 11143 }, { "epoch": 2.328944618599791, "grad_norm": 0.9479643522596508, "learning_rate": 1.399503009792499e-05, "loss": 0.1256, "step": 11144 }, { "epoch": 2.329153605015674, "grad_norm": 0.9052711123248127, "learning_rate": 1.3993995887591578e-05, "loss": 0.1099, "step": 11145 }, { "epoch": 2.3293625914315568, "grad_norm": 0.7522014356250367, "learning_rate": 1.3992961626428256e-05, "loss": 0.1143, "step": 11146 }, { "epoch": 2.3295715778474397, "grad_norm": 0.961757988949324, "learning_rate": 1.3991927314448188e-05, "loss": 0.119, "step": 11147 }, { "epoch": 2.3297805642633227, "grad_norm": 0.7872471529590294, "learning_rate": 1.3990892951664539e-05, "loss": 0.0977, "step": 11148 }, { "epoch": 2.3299895506792057, "grad_norm": 1.0390584663184015, "learning_rate": 1.3989858538090465e-05, "loss": 0.1256, "step": 11149 }, { "epoch": 2.3301985370950886, "grad_norm": 0.7638405635887319, "learning_rate": 1.3988824073739137e-05, "loss": 0.0853, "step": 11150 }, { "epoch": 2.3304075235109716, "grad_norm": 0.8890439709262198, "learning_rate": 1.3987789558623721e-05, "loss": 0.1162, "step": 11151 }, { "epoch": 2.3306165099268545, "grad_norm": 1.1137362200921201, "learning_rate": 1.3986754992757377e-05, "loss": 0.1303, "step": 11152 }, { "epoch": 2.3308254963427375, "grad_norm": 1.0305545620873213, "learning_rate": 1.3985720376153278e-05, "loss": 0.125, "step": 11153 }, { "epoch": 2.3310344827586205, "grad_norm": 1.0946105705245253, "learning_rate": 1.3984685708824588e-05, "loss": 0.1358, "step": 11154 }, { "epoch": 2.3312434691745034, "grad_norm": 0.8749968548772633, "learning_rate": 1.3983650990784474e-05, "loss": 0.0869, "step": 11155 }, { "epoch": 2.3314524555903864, "grad_norm": 1.1682884195930912, "learning_rate": 1.3982616222046105e-05, "loss": 0.145, "step": 11156 }, { "epoch": 2.3316614420062693, "grad_norm": 1.0708802895337808, "learning_rate": 1.3981581402622652e-05, "loss": 0.1248, "step": 11157 }, { "epoch": 2.3318704284221528, "grad_norm": 1.1367135525148178, "learning_rate": 1.3980546532527283e-05, "loss": 0.135, "step": 11158 }, { "epoch": 2.3320794148380357, "grad_norm": 1.142198583128713, "learning_rate": 1.397951161177317e-05, "loss": 0.1247, "step": 11159 }, { "epoch": 2.3322884012539187, "grad_norm": 1.0653329027784142, "learning_rate": 1.3978476640373482e-05, "loss": 0.1029, "step": 11160 }, { "epoch": 2.3324973876698016, "grad_norm": 1.0971103419924257, "learning_rate": 1.397744161834139e-05, "loss": 0.1089, "step": 11161 }, { "epoch": 2.3327063740856846, "grad_norm": 0.9865529790546769, "learning_rate": 1.3976406545690068e-05, "loss": 0.1214, "step": 11162 }, { "epoch": 2.3329153605015676, "grad_norm": 0.9907580885080208, "learning_rate": 1.397537142243269e-05, "loss": 0.1148, "step": 11163 }, { "epoch": 2.3331243469174505, "grad_norm": 1.1053404091877506, "learning_rate": 1.3974336248582424e-05, "loss": 0.123, "step": 11164 }, { "epoch": 2.3333333333333335, "grad_norm": 1.039914150152378, "learning_rate": 1.3973301024152455e-05, "loss": 0.1277, "step": 11165 }, { "epoch": 2.3335423197492164, "grad_norm": 1.0110380280272964, "learning_rate": 1.3972265749155947e-05, "loss": 0.1245, "step": 11166 }, { "epoch": 2.3337513061650994, "grad_norm": 1.062059066549565, "learning_rate": 1.397123042360608e-05, "loss": 0.1178, "step": 11167 }, { "epoch": 2.3339602925809824, "grad_norm": 1.2208049516942454, "learning_rate": 1.397019504751603e-05, "loss": 0.1544, "step": 11168 }, { "epoch": 2.3341692789968653, "grad_norm": 1.0102714585567618, "learning_rate": 1.3969159620898974e-05, "loss": 0.1339, "step": 11169 }, { "epoch": 2.3343782654127483, "grad_norm": 1.077020157789806, "learning_rate": 1.3968124143768085e-05, "loss": 0.1429, "step": 11170 }, { "epoch": 2.3345872518286312, "grad_norm": 1.0247550360819189, "learning_rate": 1.3967088616136551e-05, "loss": 0.1095, "step": 11171 }, { "epoch": 2.334796238244514, "grad_norm": 1.033553914099234, "learning_rate": 1.3966053038017542e-05, "loss": 0.1226, "step": 11172 }, { "epoch": 2.335005224660397, "grad_norm": 1.075766703642608, "learning_rate": 1.3965017409424242e-05, "loss": 0.1247, "step": 11173 }, { "epoch": 2.33521421107628, "grad_norm": 1.0475564623703522, "learning_rate": 1.3963981730369827e-05, "loss": 0.1405, "step": 11174 }, { "epoch": 2.335423197492163, "grad_norm": 1.0839740041796835, "learning_rate": 1.3962946000867483e-05, "loss": 0.1342, "step": 11175 }, { "epoch": 2.335632183908046, "grad_norm": 0.818136508531222, "learning_rate": 1.3961910220930384e-05, "loss": 0.0996, "step": 11176 }, { "epoch": 2.335841170323929, "grad_norm": 0.9703408265902056, "learning_rate": 1.3960874390571721e-05, "loss": 0.1163, "step": 11177 }, { "epoch": 2.336050156739812, "grad_norm": 1.0152373990176993, "learning_rate": 1.3959838509804669e-05, "loss": 0.1348, "step": 11178 }, { "epoch": 2.336259143155695, "grad_norm": 0.934636415034658, "learning_rate": 1.3958802578642413e-05, "loss": 0.136, "step": 11179 }, { "epoch": 2.336468129571578, "grad_norm": 0.9030844714562627, "learning_rate": 1.3957766597098138e-05, "loss": 0.1146, "step": 11180 }, { "epoch": 2.336677115987461, "grad_norm": 0.8218878743423313, "learning_rate": 1.3956730565185033e-05, "loss": 0.1105, "step": 11181 }, { "epoch": 2.336886102403344, "grad_norm": 1.1779196137958894, "learning_rate": 1.3955694482916271e-05, "loss": 0.1272, "step": 11182 }, { "epoch": 2.337095088819227, "grad_norm": 0.9651250487625475, "learning_rate": 1.395465835030505e-05, "loss": 0.1141, "step": 11183 }, { "epoch": 2.3373040752351097, "grad_norm": 1.1996806630769195, "learning_rate": 1.395362216736455e-05, "loss": 0.1376, "step": 11184 }, { "epoch": 2.3375130616509927, "grad_norm": 0.9005709612885695, "learning_rate": 1.3952585934107963e-05, "loss": 0.1152, "step": 11185 }, { "epoch": 2.3377220480668757, "grad_norm": 1.1514948045759175, "learning_rate": 1.395154965054847e-05, "loss": 0.123, "step": 11186 }, { "epoch": 2.3379310344827586, "grad_norm": 1.0530033811654322, "learning_rate": 1.3950513316699266e-05, "loss": 0.1508, "step": 11187 }, { "epoch": 2.3381400208986416, "grad_norm": 1.1157578246283342, "learning_rate": 1.3949476932573531e-05, "loss": 0.1157, "step": 11188 }, { "epoch": 2.3383490073145246, "grad_norm": 1.0561353552881374, "learning_rate": 1.3948440498184467e-05, "loss": 0.115, "step": 11189 }, { "epoch": 2.3385579937304075, "grad_norm": 1.1609573365442825, "learning_rate": 1.3947404013545253e-05, "loss": 0.1304, "step": 11190 }, { "epoch": 2.3387669801462905, "grad_norm": 1.321167309037972, "learning_rate": 1.3946367478669087e-05, "loss": 0.1436, "step": 11191 }, { "epoch": 2.3389759665621734, "grad_norm": 1.0152885417235462, "learning_rate": 1.3945330893569158e-05, "loss": 0.1185, "step": 11192 }, { "epoch": 2.3391849529780564, "grad_norm": 1.09885151776141, "learning_rate": 1.394429425825866e-05, "loss": 0.1252, "step": 11193 }, { "epoch": 2.3393939393939394, "grad_norm": 0.8103287328733872, "learning_rate": 1.3943257572750778e-05, "loss": 0.0924, "step": 11194 }, { "epoch": 2.3396029258098223, "grad_norm": 1.0193101090008043, "learning_rate": 1.3942220837058717e-05, "loss": 0.1412, "step": 11195 }, { "epoch": 2.3398119122257053, "grad_norm": 0.9880580757683166, "learning_rate": 1.3941184051195665e-05, "loss": 0.1256, "step": 11196 }, { "epoch": 2.3400208986415882, "grad_norm": 1.169109229688092, "learning_rate": 1.3940147215174814e-05, "loss": 0.1571, "step": 11197 }, { "epoch": 2.340229885057471, "grad_norm": 1.04867791752593, "learning_rate": 1.3939110329009366e-05, "loss": 0.1038, "step": 11198 }, { "epoch": 2.340438871473354, "grad_norm": 1.1493527692895504, "learning_rate": 1.3938073392712515e-05, "loss": 0.1363, "step": 11199 }, { "epoch": 2.340647857889237, "grad_norm": 0.9971901391347879, "learning_rate": 1.3937036406297453e-05, "loss": 0.1116, "step": 11200 }, { "epoch": 2.34085684430512, "grad_norm": 1.1736599308082334, "learning_rate": 1.3935999369777384e-05, "loss": 0.1085, "step": 11201 }, { "epoch": 2.341065830721003, "grad_norm": 0.9689362697386092, "learning_rate": 1.39349622831655e-05, "loss": 0.1118, "step": 11202 }, { "epoch": 2.341274817136886, "grad_norm": 1.0219659814142814, "learning_rate": 1.3933925146475004e-05, "loss": 0.1402, "step": 11203 }, { "epoch": 2.341483803552769, "grad_norm": 0.8106836241040624, "learning_rate": 1.3932887959719093e-05, "loss": 0.1058, "step": 11204 }, { "epoch": 2.341692789968652, "grad_norm": 0.9223842689766442, "learning_rate": 1.393185072291097e-05, "loss": 0.1316, "step": 11205 }, { "epoch": 2.341901776384535, "grad_norm": 0.8121245660087993, "learning_rate": 1.3930813436063826e-05, "loss": 0.1185, "step": 11206 }, { "epoch": 2.342110762800418, "grad_norm": 1.1445276547411982, "learning_rate": 1.3929776099190874e-05, "loss": 0.1429, "step": 11207 }, { "epoch": 2.342319749216301, "grad_norm": 0.8825749413009087, "learning_rate": 1.392873871230531e-05, "loss": 0.1136, "step": 11208 }, { "epoch": 2.342528735632184, "grad_norm": 0.8784986280693665, "learning_rate": 1.3927701275420335e-05, "loss": 0.1227, "step": 11209 }, { "epoch": 2.3427377220480667, "grad_norm": 1.0923779066372994, "learning_rate": 1.3926663788549158e-05, "loss": 0.1349, "step": 11210 }, { "epoch": 2.3429467084639497, "grad_norm": 1.0875966932482959, "learning_rate": 1.3925626251704975e-05, "loss": 0.1425, "step": 11211 }, { "epoch": 2.3431556948798327, "grad_norm": 0.8578415451780499, "learning_rate": 1.3924588664900993e-05, "loss": 0.1046, "step": 11212 }, { "epoch": 2.3433646812957156, "grad_norm": 0.938470661704453, "learning_rate": 1.3923551028150424e-05, "loss": 0.1097, "step": 11213 }, { "epoch": 2.3435736677115986, "grad_norm": 1.1188515597928153, "learning_rate": 1.3922513341466464e-05, "loss": 0.1276, "step": 11214 }, { "epoch": 2.3437826541274815, "grad_norm": 1.0444636205182471, "learning_rate": 1.3921475604862322e-05, "loss": 0.1302, "step": 11215 }, { "epoch": 2.3439916405433645, "grad_norm": 0.9393870070502527, "learning_rate": 1.3920437818351205e-05, "loss": 0.1224, "step": 11216 }, { "epoch": 2.3442006269592475, "grad_norm": 0.9391029982002695, "learning_rate": 1.3919399981946324e-05, "loss": 0.1054, "step": 11217 }, { "epoch": 2.3444096133751304, "grad_norm": 1.0251438317841535, "learning_rate": 1.391836209566088e-05, "loss": 0.1158, "step": 11218 }, { "epoch": 2.3446185997910134, "grad_norm": 1.1804524070415219, "learning_rate": 1.391732415950809e-05, "loss": 0.147, "step": 11219 }, { "epoch": 2.344827586206897, "grad_norm": 1.0672492175255917, "learning_rate": 1.3916286173501156e-05, "loss": 0.141, "step": 11220 }, { "epoch": 2.3450365726227798, "grad_norm": 0.9951809918643898, "learning_rate": 1.3915248137653293e-05, "loss": 0.0995, "step": 11221 }, { "epoch": 2.3452455590386627, "grad_norm": 0.8372222039445626, "learning_rate": 1.391421005197771e-05, "loss": 0.0961, "step": 11222 }, { "epoch": 2.3454545454545457, "grad_norm": 1.0595150684145531, "learning_rate": 1.3913171916487618e-05, "loss": 0.1219, "step": 11223 }, { "epoch": 2.3456635318704286, "grad_norm": 0.851584467586377, "learning_rate": 1.3912133731196227e-05, "loss": 0.1104, "step": 11224 }, { "epoch": 2.3458725182863116, "grad_norm": 0.8025713952768317, "learning_rate": 1.3911095496116756e-05, "loss": 0.0966, "step": 11225 }, { "epoch": 2.3460815047021946, "grad_norm": 1.0055517424428217, "learning_rate": 1.391005721126241e-05, "loss": 0.1039, "step": 11226 }, { "epoch": 2.3462904911180775, "grad_norm": 1.019093967886108, "learning_rate": 1.390901887664641e-05, "loss": 0.1223, "step": 11227 }, { "epoch": 2.3464994775339605, "grad_norm": 0.8048797931169982, "learning_rate": 1.3907980492281962e-05, "loss": 0.0998, "step": 11228 }, { "epoch": 2.3467084639498434, "grad_norm": 0.986755262513085, "learning_rate": 1.390694205818229e-05, "loss": 0.1199, "step": 11229 }, { "epoch": 2.3469174503657264, "grad_norm": 1.1084207028666244, "learning_rate": 1.3905903574360604e-05, "loss": 0.1362, "step": 11230 }, { "epoch": 2.3471264367816094, "grad_norm": 1.010689099235858, "learning_rate": 1.3904865040830126e-05, "loss": 0.1268, "step": 11231 }, { "epoch": 2.3473354231974923, "grad_norm": 1.0200588094578333, "learning_rate": 1.3903826457604068e-05, "loss": 0.1326, "step": 11232 }, { "epoch": 2.3475444096133753, "grad_norm": 1.0803982462952906, "learning_rate": 1.3902787824695648e-05, "loss": 0.141, "step": 11233 }, { "epoch": 2.3477533960292583, "grad_norm": 1.0008946723136105, "learning_rate": 1.3901749142118083e-05, "loss": 0.11, "step": 11234 }, { "epoch": 2.347962382445141, "grad_norm": 0.970364304771364, "learning_rate": 1.3900710409884596e-05, "loss": 0.122, "step": 11235 }, { "epoch": 2.348171368861024, "grad_norm": 1.1687738895693398, "learning_rate": 1.3899671628008403e-05, "loss": 0.1231, "step": 11236 }, { "epoch": 2.348380355276907, "grad_norm": 1.029734166538608, "learning_rate": 1.389863279650273e-05, "loss": 0.1148, "step": 11237 }, { "epoch": 2.34858934169279, "grad_norm": 1.0302620279456338, "learning_rate": 1.3897593915380791e-05, "loss": 0.1243, "step": 11238 }, { "epoch": 2.348798328108673, "grad_norm": 1.054845893787822, "learning_rate": 1.3896554984655805e-05, "loss": 0.124, "step": 11239 }, { "epoch": 2.349007314524556, "grad_norm": 1.1209856029379521, "learning_rate": 1.3895516004341005e-05, "loss": 0.1222, "step": 11240 }, { "epoch": 2.349216300940439, "grad_norm": 1.2467538160559455, "learning_rate": 1.3894476974449606e-05, "loss": 0.1522, "step": 11241 }, { "epoch": 2.349425287356322, "grad_norm": 1.1057449462220954, "learning_rate": 1.389343789499483e-05, "loss": 0.1273, "step": 11242 }, { "epoch": 2.349634273772205, "grad_norm": 1.1691848050579148, "learning_rate": 1.3892398765989908e-05, "loss": 0.161, "step": 11243 }, { "epoch": 2.349843260188088, "grad_norm": 0.8596490089404834, "learning_rate": 1.3891359587448059e-05, "loss": 0.0955, "step": 11244 }, { "epoch": 2.350052246603971, "grad_norm": 0.974582667323226, "learning_rate": 1.3890320359382506e-05, "loss": 0.1323, "step": 11245 }, { "epoch": 2.350261233019854, "grad_norm": 0.8779068628086252, "learning_rate": 1.3889281081806483e-05, "loss": 0.0966, "step": 11246 }, { "epoch": 2.3504702194357368, "grad_norm": 1.1279500273564083, "learning_rate": 1.388824175473321e-05, "loss": 0.1212, "step": 11247 }, { "epoch": 2.3506792058516197, "grad_norm": 0.8742483787565842, "learning_rate": 1.3887202378175913e-05, "loss": 0.1219, "step": 11248 }, { "epoch": 2.3508881922675027, "grad_norm": 0.9688256383490541, "learning_rate": 1.3886162952147825e-05, "loss": 0.1361, "step": 11249 }, { "epoch": 2.3510971786833856, "grad_norm": 0.8306848929344418, "learning_rate": 1.3885123476662175e-05, "loss": 0.1015, "step": 11250 }, { "epoch": 2.3513061650992686, "grad_norm": 1.160703872380075, "learning_rate": 1.3884083951732184e-05, "loss": 0.1256, "step": 11251 }, { "epoch": 2.3515151515151516, "grad_norm": 0.7834961599396014, "learning_rate": 1.3883044377371089e-05, "loss": 0.0905, "step": 11252 }, { "epoch": 2.3517241379310345, "grad_norm": 0.8929797238375587, "learning_rate": 1.3882004753592116e-05, "loss": 0.1293, "step": 11253 }, { "epoch": 2.3519331243469175, "grad_norm": 0.8471840418804051, "learning_rate": 1.3880965080408498e-05, "loss": 0.1222, "step": 11254 }, { "epoch": 2.3521421107628004, "grad_norm": 0.9455280257319755, "learning_rate": 1.3879925357833465e-05, "loss": 0.1422, "step": 11255 }, { "epoch": 2.3523510971786834, "grad_norm": 0.9361941601645495, "learning_rate": 1.3878885585880253e-05, "loss": 0.1113, "step": 11256 }, { "epoch": 2.3525600835945664, "grad_norm": 1.0598003728657832, "learning_rate": 1.3877845764562088e-05, "loss": 0.1292, "step": 11257 }, { "epoch": 2.3527690700104493, "grad_norm": 1.2393447559148085, "learning_rate": 1.3876805893892212e-05, "loss": 0.1425, "step": 11258 }, { "epoch": 2.3529780564263323, "grad_norm": 0.8155930025773447, "learning_rate": 1.3875765973883852e-05, "loss": 0.1156, "step": 11259 }, { "epoch": 2.3531870428422152, "grad_norm": 1.082022186605822, "learning_rate": 1.3874726004550243e-05, "loss": 0.1481, "step": 11260 }, { "epoch": 2.353396029258098, "grad_norm": 1.0931165336827158, "learning_rate": 1.3873685985904624e-05, "loss": 0.1069, "step": 11261 }, { "epoch": 2.353605015673981, "grad_norm": 0.9431623710703535, "learning_rate": 1.387264591796023e-05, "loss": 0.1073, "step": 11262 }, { "epoch": 2.353814002089864, "grad_norm": 1.0374374130935242, "learning_rate": 1.3871605800730292e-05, "loss": 0.1388, "step": 11263 }, { "epoch": 2.354022988505747, "grad_norm": 0.8553463078214442, "learning_rate": 1.3870565634228057e-05, "loss": 0.1201, "step": 11264 }, { "epoch": 2.35423197492163, "grad_norm": 1.0136955876962346, "learning_rate": 1.3869525418466756e-05, "loss": 0.1342, "step": 11265 }, { "epoch": 2.354440961337513, "grad_norm": 0.8959729709635075, "learning_rate": 1.3868485153459627e-05, "loss": 0.0953, "step": 11266 }, { "epoch": 2.354649947753396, "grad_norm": 1.023233129158912, "learning_rate": 1.3867444839219913e-05, "loss": 0.1417, "step": 11267 }, { "epoch": 2.354858934169279, "grad_norm": 0.9143337553391999, "learning_rate": 1.386640447576085e-05, "loss": 0.1069, "step": 11268 }, { "epoch": 2.355067920585162, "grad_norm": 0.9335385043546712, "learning_rate": 1.3865364063095678e-05, "loss": 0.1085, "step": 11269 }, { "epoch": 2.355276907001045, "grad_norm": 1.0796217069186254, "learning_rate": 1.3864323601237643e-05, "loss": 0.1086, "step": 11270 }, { "epoch": 2.355485893416928, "grad_norm": 1.0110035605943735, "learning_rate": 1.3863283090199981e-05, "loss": 0.1062, "step": 11271 }, { "epoch": 2.355694879832811, "grad_norm": 1.0211456835908868, "learning_rate": 1.3862242529995938e-05, "loss": 0.1403, "step": 11272 }, { "epoch": 2.3559038662486937, "grad_norm": 0.9552273959054011, "learning_rate": 1.3861201920638751e-05, "loss": 0.1289, "step": 11273 }, { "epoch": 2.3561128526645767, "grad_norm": 1.0269721469627517, "learning_rate": 1.3860161262141674e-05, "loss": 0.145, "step": 11274 }, { "epoch": 2.3563218390804597, "grad_norm": 1.1198016156329504, "learning_rate": 1.3859120554517938e-05, "loss": 0.1109, "step": 11275 }, { "epoch": 2.3565308254963426, "grad_norm": 0.9688651462072131, "learning_rate": 1.38580797977808e-05, "loss": 0.1148, "step": 11276 }, { "epoch": 2.3567398119122256, "grad_norm": 1.2497211332947185, "learning_rate": 1.3857038991943495e-05, "loss": 0.1437, "step": 11277 }, { "epoch": 2.3569487983281086, "grad_norm": 0.8401680129671077, "learning_rate": 1.3855998137019274e-05, "loss": 0.0996, "step": 11278 }, { "epoch": 2.3571577847439915, "grad_norm": 1.1080187593646937, "learning_rate": 1.3854957233021384e-05, "loss": 0.139, "step": 11279 }, { "epoch": 2.3573667711598745, "grad_norm": 0.9678735770552546, "learning_rate": 1.3853916279963071e-05, "loss": 0.116, "step": 11280 }, { "epoch": 2.3575757575757574, "grad_norm": 1.0688365060228664, "learning_rate": 1.385287527785758e-05, "loss": 0.1218, "step": 11281 }, { "epoch": 2.3577847439916404, "grad_norm": 1.098367250478747, "learning_rate": 1.3851834226718167e-05, "loss": 0.1284, "step": 11282 }, { "epoch": 2.3579937304075234, "grad_norm": 1.0876818734673637, "learning_rate": 1.3850793126558073e-05, "loss": 0.1449, "step": 11283 }, { "epoch": 2.3582027168234063, "grad_norm": 0.8615213882494068, "learning_rate": 1.384975197739055e-05, "loss": 0.093, "step": 11284 }, { "epoch": 2.3584117032392893, "grad_norm": 0.8355281395823795, "learning_rate": 1.3848710779228852e-05, "loss": 0.0901, "step": 11285 }, { "epoch": 2.3586206896551722, "grad_norm": 0.9593265036040667, "learning_rate": 1.3847669532086226e-05, "loss": 0.127, "step": 11286 }, { "epoch": 2.358829676071055, "grad_norm": 0.9447357682337981, "learning_rate": 1.384662823597592e-05, "loss": 0.1191, "step": 11287 }, { "epoch": 2.359038662486938, "grad_norm": 0.9927591293635687, "learning_rate": 1.3845586890911195e-05, "loss": 0.1405, "step": 11288 }, { "epoch": 2.359247648902821, "grad_norm": 1.0690671574174202, "learning_rate": 1.38445454969053e-05, "loss": 0.1138, "step": 11289 }, { "epoch": 2.359456635318704, "grad_norm": 1.0296077575666325, "learning_rate": 1.3843504053971486e-05, "loss": 0.1154, "step": 11290 }, { "epoch": 2.359665621734587, "grad_norm": 1.080089859156301, "learning_rate": 1.384246256212301e-05, "loss": 0.1328, "step": 11291 }, { "epoch": 2.35987460815047, "grad_norm": 1.0650091166687572, "learning_rate": 1.3841421021373125e-05, "loss": 0.1138, "step": 11292 }, { "epoch": 2.360083594566353, "grad_norm": 1.2046499876506231, "learning_rate": 1.3840379431735085e-05, "loss": 0.1536, "step": 11293 }, { "epoch": 2.360292580982236, "grad_norm": 1.047074683396706, "learning_rate": 1.383933779322215e-05, "loss": 0.1111, "step": 11294 }, { "epoch": 2.360501567398119, "grad_norm": 1.0482827706299518, "learning_rate": 1.3838296105847573e-05, "loss": 0.1172, "step": 11295 }, { "epoch": 2.360710553814002, "grad_norm": 1.0541432867763445, "learning_rate": 1.383725436962461e-05, "loss": 0.1412, "step": 11296 }, { "epoch": 2.360919540229885, "grad_norm": 1.15225179368603, "learning_rate": 1.3836212584566523e-05, "loss": 0.137, "step": 11297 }, { "epoch": 2.3611285266457678, "grad_norm": 1.0887497772145835, "learning_rate": 1.383517075068657e-05, "loss": 0.1259, "step": 11298 }, { "epoch": 2.361337513061651, "grad_norm": 0.9654008283676717, "learning_rate": 1.3834128867998004e-05, "loss": 0.1142, "step": 11299 }, { "epoch": 2.361546499477534, "grad_norm": 1.0519683939467641, "learning_rate": 1.3833086936514094e-05, "loss": 0.1291, "step": 11300 }, { "epoch": 2.361755485893417, "grad_norm": 0.9071532779120696, "learning_rate": 1.3832044956248089e-05, "loss": 0.1202, "step": 11301 }, { "epoch": 2.3619644723093, "grad_norm": 1.3039135711981238, "learning_rate": 1.3831002927213261e-05, "loss": 0.129, "step": 11302 }, { "epoch": 2.362173458725183, "grad_norm": 1.0075472650884187, "learning_rate": 1.3829960849422863e-05, "loss": 0.1187, "step": 11303 }, { "epoch": 2.362382445141066, "grad_norm": 1.1329328224132387, "learning_rate": 1.3828918722890165e-05, "loss": 0.118, "step": 11304 }, { "epoch": 2.362591431556949, "grad_norm": 1.0024653786530615, "learning_rate": 1.3827876547628418e-05, "loss": 0.1391, "step": 11305 }, { "epoch": 2.362800417972832, "grad_norm": 0.8732731841344621, "learning_rate": 1.3826834323650899e-05, "loss": 0.0988, "step": 11306 }, { "epoch": 2.363009404388715, "grad_norm": 0.9668622158533665, "learning_rate": 1.3825792050970865e-05, "loss": 0.1221, "step": 11307 }, { "epoch": 2.363218390804598, "grad_norm": 0.8989911911003312, "learning_rate": 1.3824749729601576e-05, "loss": 0.1105, "step": 11308 }, { "epoch": 2.363427377220481, "grad_norm": 1.065522180750755, "learning_rate": 1.3823707359556306e-05, "loss": 0.1343, "step": 11309 }, { "epoch": 2.3636363636363638, "grad_norm": 0.8765808149213556, "learning_rate": 1.3822664940848318e-05, "loss": 0.0932, "step": 11310 }, { "epoch": 2.3638453500522467, "grad_norm": 1.071116233392576, "learning_rate": 1.3821622473490874e-05, "loss": 0.1355, "step": 11311 }, { "epoch": 2.3640543364681297, "grad_norm": 1.03245183960634, "learning_rate": 1.3820579957497247e-05, "loss": 0.1201, "step": 11312 }, { "epoch": 2.3642633228840126, "grad_norm": 1.0766511550237732, "learning_rate": 1.3819537392880701e-05, "loss": 0.1246, "step": 11313 }, { "epoch": 2.3644723092998956, "grad_norm": 1.0735077434033085, "learning_rate": 1.3818494779654504e-05, "loss": 0.1244, "step": 11314 }, { "epoch": 2.3646812957157786, "grad_norm": 1.0212744454256333, "learning_rate": 1.3817452117831929e-05, "loss": 0.1344, "step": 11315 }, { "epoch": 2.3648902821316615, "grad_norm": 1.0717800397763895, "learning_rate": 1.3816409407426241e-05, "loss": 0.1314, "step": 11316 }, { "epoch": 2.3650992685475445, "grad_norm": 1.2053649021368364, "learning_rate": 1.3815366648450711e-05, "loss": 0.1135, "step": 11317 }, { "epoch": 2.3653082549634274, "grad_norm": 1.0994945394825648, "learning_rate": 1.3814323840918613e-05, "loss": 0.1274, "step": 11318 }, { "epoch": 2.3655172413793104, "grad_norm": 1.1621289787396072, "learning_rate": 1.3813280984843218e-05, "loss": 0.1159, "step": 11319 }, { "epoch": 2.3657262277951934, "grad_norm": 1.0142305169011585, "learning_rate": 1.3812238080237792e-05, "loss": 0.1203, "step": 11320 }, { "epoch": 2.3659352142110763, "grad_norm": 0.9175905342601534, "learning_rate": 1.3811195127115614e-05, "loss": 0.0888, "step": 11321 }, { "epoch": 2.3661442006269593, "grad_norm": 1.1494714160506234, "learning_rate": 1.3810152125489955e-05, "loss": 0.1422, "step": 11322 }, { "epoch": 2.3663531870428423, "grad_norm": 0.9567949080224859, "learning_rate": 1.3809109075374085e-05, "loss": 0.1149, "step": 11323 }, { "epoch": 2.366562173458725, "grad_norm": 1.1059911566934049, "learning_rate": 1.3808065976781286e-05, "loss": 0.1265, "step": 11324 }, { "epoch": 2.366771159874608, "grad_norm": 0.87718777931573, "learning_rate": 1.380702282972483e-05, "loss": 0.1045, "step": 11325 }, { "epoch": 2.366980146290491, "grad_norm": 1.0574353589609786, "learning_rate": 1.3805979634217988e-05, "loss": 0.126, "step": 11326 }, { "epoch": 2.367189132706374, "grad_norm": 1.0704745219465288, "learning_rate": 1.3804936390274045e-05, "loss": 0.12, "step": 11327 }, { "epoch": 2.367398119122257, "grad_norm": 0.9908872594728535, "learning_rate": 1.3803893097906271e-05, "loss": 0.1392, "step": 11328 }, { "epoch": 2.36760710553814, "grad_norm": 0.8634113824688541, "learning_rate": 1.3802849757127945e-05, "loss": 0.1026, "step": 11329 }, { "epoch": 2.367816091954023, "grad_norm": 0.9976631615238751, "learning_rate": 1.3801806367952346e-05, "loss": 0.1302, "step": 11330 }, { "epoch": 2.368025078369906, "grad_norm": 0.7755745537429367, "learning_rate": 1.3800762930392754e-05, "loss": 0.0802, "step": 11331 }, { "epoch": 2.368234064785789, "grad_norm": 1.1126151685266252, "learning_rate": 1.3799719444462444e-05, "loss": 0.1358, "step": 11332 }, { "epoch": 2.368443051201672, "grad_norm": 1.1459157439559855, "learning_rate": 1.3798675910174705e-05, "loss": 0.1158, "step": 11333 }, { "epoch": 2.368652037617555, "grad_norm": 0.9776580053238193, "learning_rate": 1.3797632327542807e-05, "loss": 0.1355, "step": 11334 }, { "epoch": 2.368861024033438, "grad_norm": 0.959745954035464, "learning_rate": 1.3796588696580038e-05, "loss": 0.1189, "step": 11335 }, { "epoch": 2.3690700104493208, "grad_norm": 1.0507768719205794, "learning_rate": 1.3795545017299676e-05, "loss": 0.1135, "step": 11336 }, { "epoch": 2.3692789968652037, "grad_norm": 1.1572136849060994, "learning_rate": 1.379450128971501e-05, "loss": 0.1525, "step": 11337 }, { "epoch": 2.3694879832810867, "grad_norm": 1.1437848098461658, "learning_rate": 1.379345751383931e-05, "loss": 0.1041, "step": 11338 }, { "epoch": 2.3696969696969696, "grad_norm": 1.0463528765003813, "learning_rate": 1.3792413689685877e-05, "loss": 0.1256, "step": 11339 }, { "epoch": 2.3699059561128526, "grad_norm": 1.1107123771025418, "learning_rate": 1.379136981726798e-05, "loss": 0.1218, "step": 11340 }, { "epoch": 2.3701149425287356, "grad_norm": 0.8718489639973774, "learning_rate": 1.3790325896598912e-05, "loss": 0.0878, "step": 11341 }, { "epoch": 2.3703239289446185, "grad_norm": 1.1700192087442802, "learning_rate": 1.378928192769196e-05, "loss": 0.1419, "step": 11342 }, { "epoch": 2.3705329153605015, "grad_norm": 1.055796215755546, "learning_rate": 1.3788237910560406e-05, "loss": 0.1094, "step": 11343 }, { "epoch": 2.3707419017763844, "grad_norm": 0.8943851255376934, "learning_rate": 1.3787193845217532e-05, "loss": 0.1181, "step": 11344 }, { "epoch": 2.3709508881922674, "grad_norm": 0.9812828312677077, "learning_rate": 1.3786149731676637e-05, "loss": 0.1385, "step": 11345 }, { "epoch": 2.3711598746081504, "grad_norm": 0.8891958294764463, "learning_rate": 1.3785105569951003e-05, "loss": 0.1178, "step": 11346 }, { "epoch": 2.3713688610240333, "grad_norm": 1.1137677658629404, "learning_rate": 1.3784061360053915e-05, "loss": 0.1042, "step": 11347 }, { "epoch": 2.3715778474399163, "grad_norm": 0.9961194671095563, "learning_rate": 1.378301710199867e-05, "loss": 0.117, "step": 11348 }, { "epoch": 2.3717868338557992, "grad_norm": 1.062569117690025, "learning_rate": 1.3781972795798553e-05, "loss": 0.1257, "step": 11349 }, { "epoch": 2.371995820271682, "grad_norm": 1.081241109693738, "learning_rate": 1.3780928441466852e-05, "loss": 0.1357, "step": 11350 }, { "epoch": 2.372204806687565, "grad_norm": 0.8969462290691664, "learning_rate": 1.3779884039016866e-05, "loss": 0.1332, "step": 11351 }, { "epoch": 2.372413793103448, "grad_norm": 0.9457892721913735, "learning_rate": 1.3778839588461879e-05, "loss": 0.1073, "step": 11352 }, { "epoch": 2.372622779519331, "grad_norm": 1.05085631444766, "learning_rate": 1.3777795089815186e-05, "loss": 0.1399, "step": 11353 }, { "epoch": 2.372831765935214, "grad_norm": 0.8899258666029279, "learning_rate": 1.377675054309008e-05, "loss": 0.1166, "step": 11354 }, { "epoch": 2.373040752351097, "grad_norm": 1.04513160154118, "learning_rate": 1.377570594829986e-05, "loss": 0.1359, "step": 11355 }, { "epoch": 2.37324973876698, "grad_norm": 0.9697111885825312, "learning_rate": 1.3774661305457808e-05, "loss": 0.1168, "step": 11356 }, { "epoch": 2.373458725182863, "grad_norm": 0.9611129305550932, "learning_rate": 1.377361661457723e-05, "loss": 0.1302, "step": 11357 }, { "epoch": 2.373667711598746, "grad_norm": 1.0525784385665382, "learning_rate": 1.3772571875671414e-05, "loss": 0.0988, "step": 11358 }, { "epoch": 2.373876698014629, "grad_norm": 0.9346117602816132, "learning_rate": 1.377152708875366e-05, "loss": 0.1104, "step": 11359 }, { "epoch": 2.3740856844305123, "grad_norm": 1.218548012186029, "learning_rate": 1.3770482253837264e-05, "loss": 0.1477, "step": 11360 }, { "epoch": 2.3742946708463952, "grad_norm": 1.174226220523034, "learning_rate": 1.3769437370935523e-05, "loss": 0.1591, "step": 11361 }, { "epoch": 2.374503657262278, "grad_norm": 0.951631309396642, "learning_rate": 1.376839244006173e-05, "loss": 0.112, "step": 11362 }, { "epoch": 2.374712643678161, "grad_norm": 0.8530578426696895, "learning_rate": 1.3767347461229194e-05, "loss": 0.1184, "step": 11363 }, { "epoch": 2.374921630094044, "grad_norm": 0.9226285501302602, "learning_rate": 1.3766302434451204e-05, "loss": 0.1225, "step": 11364 }, { "epoch": 2.375130616509927, "grad_norm": 1.058623735733357, "learning_rate": 1.3765257359741065e-05, "loss": 0.131, "step": 11365 }, { "epoch": 2.37533960292581, "grad_norm": 1.0007259632897343, "learning_rate": 1.3764212237112073e-05, "loss": 0.1127, "step": 11366 }, { "epoch": 2.375548589341693, "grad_norm": 1.132734138338741, "learning_rate": 1.3763167066577537e-05, "loss": 0.1184, "step": 11367 }, { "epoch": 2.375757575757576, "grad_norm": 0.8495754911070151, "learning_rate": 1.3762121848150747e-05, "loss": 0.0896, "step": 11368 }, { "epoch": 2.375966562173459, "grad_norm": 1.1717243960644412, "learning_rate": 1.3761076581845014e-05, "loss": 0.1375, "step": 11369 }, { "epoch": 2.376175548589342, "grad_norm": 1.156767624090082, "learning_rate": 1.3760031267673636e-05, "loss": 0.1384, "step": 11370 }, { "epoch": 2.376384535005225, "grad_norm": 0.986803715123587, "learning_rate": 1.3758985905649918e-05, "loss": 0.1212, "step": 11371 }, { "epoch": 2.376593521421108, "grad_norm": 0.9747447157606633, "learning_rate": 1.3757940495787163e-05, "loss": 0.1285, "step": 11372 }, { "epoch": 2.3768025078369908, "grad_norm": 0.9415510456976667, "learning_rate": 1.3756895038098681e-05, "loss": 0.113, "step": 11373 }, { "epoch": 2.3770114942528737, "grad_norm": 1.1012031122124808, "learning_rate": 1.3755849532597767e-05, "loss": 0.1303, "step": 11374 }, { "epoch": 2.3772204806687567, "grad_norm": 0.87083066479713, "learning_rate": 1.3754803979297736e-05, "loss": 0.1146, "step": 11375 }, { "epoch": 2.3774294670846396, "grad_norm": 0.9990548365348937, "learning_rate": 1.3753758378211887e-05, "loss": 0.1191, "step": 11376 }, { "epoch": 2.3776384535005226, "grad_norm": 0.9462640935330707, "learning_rate": 1.3752712729353532e-05, "loss": 0.1311, "step": 11377 }, { "epoch": 2.3778474399164056, "grad_norm": 0.9830044246501014, "learning_rate": 1.3751667032735976e-05, "loss": 0.1298, "step": 11378 }, { "epoch": 2.3780564263322885, "grad_norm": 1.120733019758682, "learning_rate": 1.375062128837253e-05, "loss": 0.1255, "step": 11379 }, { "epoch": 2.3782654127481715, "grad_norm": 1.0863678760891984, "learning_rate": 1.3749575496276497e-05, "loss": 0.1205, "step": 11380 }, { "epoch": 2.3784743991640545, "grad_norm": 0.8806360839839581, "learning_rate": 1.3748529656461193e-05, "loss": 0.1206, "step": 11381 }, { "epoch": 2.3786833855799374, "grad_norm": 1.1265131980174186, "learning_rate": 1.3747483768939928e-05, "loss": 0.1404, "step": 11382 }, { "epoch": 2.3788923719958204, "grad_norm": 1.2001182018536323, "learning_rate": 1.3746437833726004e-05, "loss": 0.1297, "step": 11383 }, { "epoch": 2.3791013584117033, "grad_norm": 0.9821112805304485, "learning_rate": 1.3745391850832741e-05, "loss": 0.1188, "step": 11384 }, { "epoch": 2.3793103448275863, "grad_norm": 0.9731460175347545, "learning_rate": 1.3744345820273448e-05, "loss": 0.1255, "step": 11385 }, { "epoch": 2.3795193312434693, "grad_norm": 1.1658110110040931, "learning_rate": 1.3743299742061433e-05, "loss": 0.1373, "step": 11386 }, { "epoch": 2.379728317659352, "grad_norm": 0.8942900398269186, "learning_rate": 1.3742253616210017e-05, "loss": 0.123, "step": 11387 }, { "epoch": 2.379937304075235, "grad_norm": 0.9229524085119117, "learning_rate": 1.374120744273251e-05, "loss": 0.1198, "step": 11388 }, { "epoch": 2.380146290491118, "grad_norm": 1.1029079987266228, "learning_rate": 1.3740161221642221e-05, "loss": 0.1304, "step": 11389 }, { "epoch": 2.380355276907001, "grad_norm": 0.9664153569151527, "learning_rate": 1.3739114952952474e-05, "loss": 0.1159, "step": 11390 }, { "epoch": 2.380564263322884, "grad_norm": 0.855837589078883, "learning_rate": 1.3738068636676581e-05, "loss": 0.095, "step": 11391 }, { "epoch": 2.380773249738767, "grad_norm": 0.9070093634021869, "learning_rate": 1.3737022272827853e-05, "loss": 0.1132, "step": 11392 }, { "epoch": 2.38098223615465, "grad_norm": 1.0465956632673765, "learning_rate": 1.3735975861419612e-05, "loss": 0.1243, "step": 11393 }, { "epoch": 2.381191222570533, "grad_norm": 1.0350325254468564, "learning_rate": 1.3734929402465176e-05, "loss": 0.1127, "step": 11394 }, { "epoch": 2.381400208986416, "grad_norm": 1.1116152075243064, "learning_rate": 1.3733882895977857e-05, "loss": 0.1152, "step": 11395 }, { "epoch": 2.381609195402299, "grad_norm": 1.0526808951111661, "learning_rate": 1.373283634197098e-05, "loss": 0.1381, "step": 11396 }, { "epoch": 2.381818181818182, "grad_norm": 1.0067703916772177, "learning_rate": 1.3731789740457862e-05, "loss": 0.1403, "step": 11397 }, { "epoch": 2.382027168234065, "grad_norm": 1.1288390797063748, "learning_rate": 1.373074309145182e-05, "loss": 0.1529, "step": 11398 }, { "epoch": 2.3822361546499478, "grad_norm": 1.0848003067321121, "learning_rate": 1.3729696394966177e-05, "loss": 0.1126, "step": 11399 }, { "epoch": 2.3824451410658307, "grad_norm": 0.946806080335641, "learning_rate": 1.3728649651014255e-05, "loss": 0.119, "step": 11400 }, { "epoch": 2.3826541274817137, "grad_norm": 1.4706885395271851, "learning_rate": 1.3727602859609368e-05, "loss": 0.1191, "step": 11401 }, { "epoch": 2.3828631138975966, "grad_norm": 1.1342198383379456, "learning_rate": 1.372655602076485e-05, "loss": 0.1364, "step": 11402 }, { "epoch": 2.3830721003134796, "grad_norm": 1.0270510465675344, "learning_rate": 1.3725509134494015e-05, "loss": 0.1325, "step": 11403 }, { "epoch": 2.3832810867293626, "grad_norm": 0.8452350243906122, "learning_rate": 1.3724462200810187e-05, "loss": 0.1064, "step": 11404 }, { "epoch": 2.3834900731452455, "grad_norm": 0.8182499211809067, "learning_rate": 1.3723415219726692e-05, "loss": 0.1045, "step": 11405 }, { "epoch": 2.3836990595611285, "grad_norm": 1.0270889127817215, "learning_rate": 1.3722368191256856e-05, "loss": 0.1194, "step": 11406 }, { "epoch": 2.3839080459770114, "grad_norm": 1.0360493446924641, "learning_rate": 1.3721321115414e-05, "loss": 0.1143, "step": 11407 }, { "epoch": 2.3841170323928944, "grad_norm": 0.9019467356179524, "learning_rate": 1.3720273992211453e-05, "loss": 0.1073, "step": 11408 }, { "epoch": 2.3843260188087774, "grad_norm": 0.9945722461512647, "learning_rate": 1.3719226821662539e-05, "loss": 0.1331, "step": 11409 }, { "epoch": 2.3845350052246603, "grad_norm": 0.9509503194123646, "learning_rate": 1.3718179603780587e-05, "loss": 0.1328, "step": 11410 }, { "epoch": 2.3847439916405433, "grad_norm": 0.9438512444141754, "learning_rate": 1.3717132338578922e-05, "loss": 0.1143, "step": 11411 }, { "epoch": 2.3849529780564263, "grad_norm": 1.012293151901657, "learning_rate": 1.3716085026070876e-05, "loss": 0.1417, "step": 11412 }, { "epoch": 2.385161964472309, "grad_norm": 0.8759582348048645, "learning_rate": 1.3715037666269773e-05, "loss": 0.131, "step": 11413 }, { "epoch": 2.385370950888192, "grad_norm": 0.940914612322379, "learning_rate": 1.3713990259188945e-05, "loss": 0.1105, "step": 11414 }, { "epoch": 2.385579937304075, "grad_norm": 1.005784613896286, "learning_rate": 1.3712942804841723e-05, "loss": 0.1361, "step": 11415 }, { "epoch": 2.385788923719958, "grad_norm": 0.9860483642895049, "learning_rate": 1.3711895303241434e-05, "loss": 0.1314, "step": 11416 }, { "epoch": 2.385997910135841, "grad_norm": 0.9968262456154007, "learning_rate": 1.371084775440141e-05, "loss": 0.1196, "step": 11417 }, { "epoch": 2.386206896551724, "grad_norm": 0.9741995218401656, "learning_rate": 1.370980015833499e-05, "loss": 0.1151, "step": 11418 }, { "epoch": 2.386415882967607, "grad_norm": 0.9431462857125357, "learning_rate": 1.3708752515055492e-05, "loss": 0.1089, "step": 11419 }, { "epoch": 2.38662486938349, "grad_norm": 1.1110961219352906, "learning_rate": 1.3707704824576263e-05, "loss": 0.1316, "step": 11420 }, { "epoch": 2.386833855799373, "grad_norm": 0.89791895620865, "learning_rate": 1.3706657086910626e-05, "loss": 0.1258, "step": 11421 }, { "epoch": 2.387042842215256, "grad_norm": 0.9545926467643971, "learning_rate": 1.3705609302071923e-05, "loss": 0.1018, "step": 11422 }, { "epoch": 2.387251828631139, "grad_norm": 1.0305689495173869, "learning_rate": 1.3704561470073485e-05, "loss": 0.1358, "step": 11423 }, { "epoch": 2.387460815047022, "grad_norm": 0.8163798313857733, "learning_rate": 1.3703513590928647e-05, "loss": 0.1059, "step": 11424 }, { "epoch": 2.3876698014629048, "grad_norm": 1.0515350533685277, "learning_rate": 1.3702465664650744e-05, "loss": 0.1164, "step": 11425 }, { "epoch": 2.3878787878787877, "grad_norm": 1.118217041576994, "learning_rate": 1.3701417691253118e-05, "loss": 0.113, "step": 11426 }, { "epoch": 2.3880877742946707, "grad_norm": 1.068191232377868, "learning_rate": 1.3700369670749101e-05, "loss": 0.1143, "step": 11427 }, { "epoch": 2.3882967607105536, "grad_norm": 1.0604784152722702, "learning_rate": 1.3699321603152031e-05, "loss": 0.1356, "step": 11428 }, { "epoch": 2.3885057471264366, "grad_norm": 1.0193650521944566, "learning_rate": 1.3698273488475246e-05, "loss": 0.1138, "step": 11429 }, { "epoch": 2.3887147335423196, "grad_norm": 1.1836708674095116, "learning_rate": 1.369722532673209e-05, "loss": 0.128, "step": 11430 }, { "epoch": 2.3889237199582025, "grad_norm": 0.906836279282657, "learning_rate": 1.3696177117935894e-05, "loss": 0.1053, "step": 11431 }, { "epoch": 2.3891327063740855, "grad_norm": 1.128440296142837, "learning_rate": 1.3695128862100009e-05, "loss": 0.1154, "step": 11432 }, { "epoch": 2.3893416927899684, "grad_norm": 1.0575776748227999, "learning_rate": 1.3694080559237766e-05, "loss": 0.1218, "step": 11433 }, { "epoch": 2.3895506792058514, "grad_norm": 0.8686093756287849, "learning_rate": 1.369303220936251e-05, "loss": 0.1131, "step": 11434 }, { "epoch": 2.3897596656217344, "grad_norm": 0.9237249597794716, "learning_rate": 1.3691983812487581e-05, "loss": 0.127, "step": 11435 }, { "epoch": 2.3899686520376173, "grad_norm": 0.922259994954953, "learning_rate": 1.3690935368626325e-05, "loss": 0.123, "step": 11436 }, { "epoch": 2.3901776384535003, "grad_norm": 1.1600343375509907, "learning_rate": 1.3689886877792082e-05, "loss": 0.1701, "step": 11437 }, { "epoch": 2.3903866248693832, "grad_norm": 1.028125440587962, "learning_rate": 1.36888383399982e-05, "loss": 0.1391, "step": 11438 }, { "epoch": 2.390595611285266, "grad_norm": 1.0618323708968918, "learning_rate": 1.3687789755258018e-05, "loss": 0.1333, "step": 11439 }, { "epoch": 2.3908045977011496, "grad_norm": 1.0147627849339445, "learning_rate": 1.3686741123584886e-05, "loss": 0.1255, "step": 11440 }, { "epoch": 2.3910135841170326, "grad_norm": 1.0836557167167966, "learning_rate": 1.3685692444992145e-05, "loss": 0.1395, "step": 11441 }, { "epoch": 2.3912225705329155, "grad_norm": 1.1220574554945784, "learning_rate": 1.3684643719493145e-05, "loss": 0.1295, "step": 11442 }, { "epoch": 2.3914315569487985, "grad_norm": 0.8993699402009082, "learning_rate": 1.3683594947101226e-05, "loss": 0.1136, "step": 11443 }, { "epoch": 2.3916405433646815, "grad_norm": 0.8686495062796803, "learning_rate": 1.3682546127829744e-05, "loss": 0.0988, "step": 11444 }, { "epoch": 2.3918495297805644, "grad_norm": 0.9089782726513497, "learning_rate": 1.368149726169204e-05, "loss": 0.1113, "step": 11445 }, { "epoch": 2.3920585161964474, "grad_norm": 1.0848535804008312, "learning_rate": 1.3680448348701467e-05, "loss": 0.1302, "step": 11446 }, { "epoch": 2.3922675026123303, "grad_norm": 0.9862667535571109, "learning_rate": 1.3679399388871371e-05, "loss": 0.1256, "step": 11447 }, { "epoch": 2.3924764890282133, "grad_norm": 0.9850443405201976, "learning_rate": 1.3678350382215104e-05, "loss": 0.1308, "step": 11448 }, { "epoch": 2.3926854754440963, "grad_norm": 1.398956134456242, "learning_rate": 1.3677301328746012e-05, "loss": 0.1296, "step": 11449 }, { "epoch": 2.3928944618599792, "grad_norm": 0.9156684998321385, "learning_rate": 1.3676252228477456e-05, "loss": 0.0972, "step": 11450 }, { "epoch": 2.393103448275862, "grad_norm": 0.9659040187539838, "learning_rate": 1.3675203081422774e-05, "loss": 0.1327, "step": 11451 }, { "epoch": 2.393312434691745, "grad_norm": 1.0149221118685727, "learning_rate": 1.3674153887595327e-05, "loss": 0.1236, "step": 11452 }, { "epoch": 2.393521421107628, "grad_norm": 1.0173141012338474, "learning_rate": 1.3673104647008465e-05, "loss": 0.1436, "step": 11453 }, { "epoch": 2.393730407523511, "grad_norm": 1.238369221443898, "learning_rate": 1.3672055359675544e-05, "loss": 0.1521, "step": 11454 }, { "epoch": 2.393939393939394, "grad_norm": 1.0125632577208723, "learning_rate": 1.3671006025609914e-05, "loss": 0.1161, "step": 11455 }, { "epoch": 2.394148380355277, "grad_norm": 1.1146887121408995, "learning_rate": 1.3669956644824928e-05, "loss": 0.1259, "step": 11456 }, { "epoch": 2.39435736677116, "grad_norm": 0.8862451284302947, "learning_rate": 1.366890721733395e-05, "loss": 0.1193, "step": 11457 }, { "epoch": 2.394566353187043, "grad_norm": 0.678292686333667, "learning_rate": 1.3667857743150322e-05, "loss": 0.0934, "step": 11458 }, { "epoch": 2.394775339602926, "grad_norm": 1.1440078838386887, "learning_rate": 1.3666808222287413e-05, "loss": 0.1567, "step": 11459 }, { "epoch": 2.394984326018809, "grad_norm": 1.0100416384739057, "learning_rate": 1.3665758654758571e-05, "loss": 0.1248, "step": 11460 }, { "epoch": 2.395193312434692, "grad_norm": 0.7436545885079093, "learning_rate": 1.366470904057716e-05, "loss": 0.1028, "step": 11461 }, { "epoch": 2.3954022988505748, "grad_norm": 1.144783298709667, "learning_rate": 1.3663659379756533e-05, "loss": 0.1356, "step": 11462 }, { "epoch": 2.3956112852664577, "grad_norm": 1.1480609077805841, "learning_rate": 1.3662609672310052e-05, "loss": 0.1252, "step": 11463 }, { "epoch": 2.3958202716823407, "grad_norm": 1.0712380335646363, "learning_rate": 1.366155991825107e-05, "loss": 0.1331, "step": 11464 }, { "epoch": 2.3960292580982236, "grad_norm": 0.9587167569272205, "learning_rate": 1.3660510117592955e-05, "loss": 0.1055, "step": 11465 }, { "epoch": 2.3962382445141066, "grad_norm": 0.918537290671871, "learning_rate": 1.3659460270349065e-05, "loss": 0.142, "step": 11466 }, { "epoch": 2.3964472309299896, "grad_norm": 1.1041194127811698, "learning_rate": 1.3658410376532757e-05, "loss": 0.1294, "step": 11467 }, { "epoch": 2.3966562173458725, "grad_norm": 0.9070583390730785, "learning_rate": 1.3657360436157396e-05, "loss": 0.11, "step": 11468 }, { "epoch": 2.3968652037617555, "grad_norm": 1.3722119449130006, "learning_rate": 1.3656310449236347e-05, "loss": 0.1411, "step": 11469 }, { "epoch": 2.3970741901776385, "grad_norm": 0.9474383336665693, "learning_rate": 1.3655260415782965e-05, "loss": 0.1146, "step": 11470 }, { "epoch": 2.3972831765935214, "grad_norm": 0.8769433694674509, "learning_rate": 1.3654210335810621e-05, "loss": 0.1126, "step": 11471 }, { "epoch": 2.3974921630094044, "grad_norm": 0.9035385522718159, "learning_rate": 1.3653160209332672e-05, "loss": 0.1058, "step": 11472 }, { "epoch": 2.3977011494252873, "grad_norm": 1.1110091587718518, "learning_rate": 1.3652110036362487e-05, "loss": 0.1248, "step": 11473 }, { "epoch": 2.3979101358411703, "grad_norm": 1.0795146370437452, "learning_rate": 1.365105981691343e-05, "loss": 0.1176, "step": 11474 }, { "epoch": 2.3981191222570533, "grad_norm": 1.1203235346962874, "learning_rate": 1.3650009550998869e-05, "loss": 0.1372, "step": 11475 }, { "epoch": 2.398328108672936, "grad_norm": 1.1621646075751146, "learning_rate": 1.3648959238632165e-05, "loss": 0.1103, "step": 11476 }, { "epoch": 2.398537095088819, "grad_norm": 1.1898107375273679, "learning_rate": 1.364790887982669e-05, "loss": 0.1516, "step": 11477 }, { "epoch": 2.398746081504702, "grad_norm": 1.1134733536479298, "learning_rate": 1.364685847459581e-05, "loss": 0.1261, "step": 11478 }, { "epoch": 2.398955067920585, "grad_norm": 1.0297015393103182, "learning_rate": 1.3645808022952891e-05, "loss": 0.1308, "step": 11479 }, { "epoch": 2.399164054336468, "grad_norm": 1.0334060464403518, "learning_rate": 1.3644757524911302e-05, "loss": 0.1217, "step": 11480 }, { "epoch": 2.399373040752351, "grad_norm": 1.0900887157367947, "learning_rate": 1.3643706980484417e-05, "loss": 0.1201, "step": 11481 }, { "epoch": 2.399582027168234, "grad_norm": 1.019000531092923, "learning_rate": 1.3642656389685599e-05, "loss": 0.1201, "step": 11482 }, { "epoch": 2.399791013584117, "grad_norm": 1.125821603203324, "learning_rate": 1.3641605752528225e-05, "loss": 0.1295, "step": 11483 }, { "epoch": 2.4, "grad_norm": 1.1268900172612395, "learning_rate": 1.3640555069025658e-05, "loss": 0.1431, "step": 11484 }, { "epoch": 2.400208986415883, "grad_norm": 1.0210165467855234, "learning_rate": 1.3639504339191279e-05, "loss": 0.1258, "step": 11485 }, { "epoch": 2.400417972831766, "grad_norm": 0.9496115574887719, "learning_rate": 1.3638453563038451e-05, "loss": 0.1244, "step": 11486 }, { "epoch": 2.400626959247649, "grad_norm": 0.9825863832170937, "learning_rate": 1.3637402740580557e-05, "loss": 0.1421, "step": 11487 }, { "epoch": 2.4008359456635318, "grad_norm": 0.9345885335133776, "learning_rate": 1.363635187183096e-05, "loss": 0.1243, "step": 11488 }, { "epoch": 2.4010449320794147, "grad_norm": 1.0402936790540487, "learning_rate": 1.3635300956803042e-05, "loss": 0.1294, "step": 11489 }, { "epoch": 2.4012539184952977, "grad_norm": 1.0555634096215507, "learning_rate": 1.3634249995510172e-05, "loss": 0.1211, "step": 11490 }, { "epoch": 2.4014629049111806, "grad_norm": 0.9645050000515911, "learning_rate": 1.3633198987965728e-05, "loss": 0.1222, "step": 11491 }, { "epoch": 2.4016718913270636, "grad_norm": 0.8909081096540742, "learning_rate": 1.3632147934183084e-05, "loss": 0.1236, "step": 11492 }, { "epoch": 2.4018808777429466, "grad_norm": 0.9225147032560441, "learning_rate": 1.3631096834175619e-05, "loss": 0.1111, "step": 11493 }, { "epoch": 2.4020898641588295, "grad_norm": 1.0584923108488304, "learning_rate": 1.3630045687956705e-05, "loss": 0.156, "step": 11494 }, { "epoch": 2.4022988505747125, "grad_norm": 0.992243270604992, "learning_rate": 1.3628994495539729e-05, "loss": 0.1407, "step": 11495 }, { "epoch": 2.4025078369905954, "grad_norm": 0.982424407598906, "learning_rate": 1.3627943256938059e-05, "loss": 0.1004, "step": 11496 }, { "epoch": 2.4027168234064784, "grad_norm": 1.0690909217471707, "learning_rate": 1.3626891972165077e-05, "loss": 0.1129, "step": 11497 }, { "epoch": 2.4029258098223614, "grad_norm": 0.9464286141160214, "learning_rate": 1.3625840641234165e-05, "loss": 0.1111, "step": 11498 }, { "epoch": 2.4031347962382443, "grad_norm": 0.9946432834347118, "learning_rate": 1.36247892641587e-05, "loss": 0.1545, "step": 11499 }, { "epoch": 2.4033437826541273, "grad_norm": 1.1616972189671821, "learning_rate": 1.362373784095206e-05, "loss": 0.1433, "step": 11500 }, { "epoch": 2.4035527690700107, "grad_norm": 0.8648887409867314, "learning_rate": 1.3622686371627634e-05, "loss": 0.1449, "step": 11501 }, { "epoch": 2.4037617554858937, "grad_norm": 1.1121923019774431, "learning_rate": 1.3621634856198796e-05, "loss": 0.1249, "step": 11502 }, { "epoch": 2.4039707419017766, "grad_norm": 1.2465739129129458, "learning_rate": 1.362058329467893e-05, "loss": 0.1341, "step": 11503 }, { "epoch": 2.4041797283176596, "grad_norm": 0.9967053470581648, "learning_rate": 1.3619531687081423e-05, "loss": 0.1295, "step": 11504 }, { "epoch": 2.4043887147335425, "grad_norm": 0.9754657156529338, "learning_rate": 1.3618480033419655e-05, "loss": 0.1269, "step": 11505 }, { "epoch": 2.4045977011494255, "grad_norm": 1.220234978185739, "learning_rate": 1.3617428333707006e-05, "loss": 0.1452, "step": 11506 }, { "epoch": 2.4048066875653085, "grad_norm": 0.8838945951688415, "learning_rate": 1.3616376587956868e-05, "loss": 0.0889, "step": 11507 }, { "epoch": 2.4050156739811914, "grad_norm": 0.8787004923950702, "learning_rate": 1.361532479618262e-05, "loss": 0.1196, "step": 11508 }, { "epoch": 2.4052246603970744, "grad_norm": 0.9838630181012848, "learning_rate": 1.3614272958397652e-05, "loss": 0.1042, "step": 11509 }, { "epoch": 2.4054336468129573, "grad_norm": 1.090379826361849, "learning_rate": 1.3613221074615349e-05, "loss": 0.1119, "step": 11510 }, { "epoch": 2.4056426332288403, "grad_norm": 0.7790367345109306, "learning_rate": 1.3612169144849099e-05, "loss": 0.0945, "step": 11511 }, { "epoch": 2.4058516196447233, "grad_norm": 1.0567168811200514, "learning_rate": 1.3611117169112285e-05, "loss": 0.1154, "step": 11512 }, { "epoch": 2.4060606060606062, "grad_norm": 1.063577013261242, "learning_rate": 1.3610065147418302e-05, "loss": 0.1465, "step": 11513 }, { "epoch": 2.406269592476489, "grad_norm": 1.1516366996733234, "learning_rate": 1.3609013079780531e-05, "loss": 0.1336, "step": 11514 }, { "epoch": 2.406478578892372, "grad_norm": 0.8514676103298477, "learning_rate": 1.3607960966212366e-05, "loss": 0.0867, "step": 11515 }, { "epoch": 2.406687565308255, "grad_norm": 1.2461395797479211, "learning_rate": 1.3606908806727197e-05, "loss": 0.1274, "step": 11516 }, { "epoch": 2.406896551724138, "grad_norm": 0.861076404150075, "learning_rate": 1.3605856601338414e-05, "loss": 0.1024, "step": 11517 }, { "epoch": 2.407105538140021, "grad_norm": 1.0052052341428388, "learning_rate": 1.3604804350059401e-05, "loss": 0.1218, "step": 11518 }, { "epoch": 2.407314524555904, "grad_norm": 1.1207578396416835, "learning_rate": 1.3603752052903562e-05, "loss": 0.153, "step": 11519 }, { "epoch": 2.407523510971787, "grad_norm": 1.0779366421991496, "learning_rate": 1.3602699709884281e-05, "loss": 0.1336, "step": 11520 }, { "epoch": 2.40773249738767, "grad_norm": 1.2012226815542206, "learning_rate": 1.3601647321014952e-05, "loss": 0.1222, "step": 11521 }, { "epoch": 2.407941483803553, "grad_norm": 0.8655038499625075, "learning_rate": 1.360059488630897e-05, "loss": 0.1185, "step": 11522 }, { "epoch": 2.408150470219436, "grad_norm": 1.0754928548288618, "learning_rate": 1.3599542405779728e-05, "loss": 0.118, "step": 11523 }, { "epoch": 2.408359456635319, "grad_norm": 1.0093083096984063, "learning_rate": 1.359848987944062e-05, "loss": 0.1112, "step": 11524 }, { "epoch": 2.4085684430512018, "grad_norm": 1.013362512469799, "learning_rate": 1.3597437307305041e-05, "loss": 0.1377, "step": 11525 }, { "epoch": 2.4087774294670847, "grad_norm": 0.9774492103737144, "learning_rate": 1.3596384689386387e-05, "loss": 0.0893, "step": 11526 }, { "epoch": 2.4089864158829677, "grad_norm": 0.8828929098470641, "learning_rate": 1.3595332025698054e-05, "loss": 0.124, "step": 11527 }, { "epoch": 2.4091954022988507, "grad_norm": 1.1094844621664228, "learning_rate": 1.3594279316253441e-05, "loss": 0.1272, "step": 11528 }, { "epoch": 2.4094043887147336, "grad_norm": 1.1049366382528893, "learning_rate": 1.3593226561065942e-05, "loss": 0.1117, "step": 11529 }, { "epoch": 2.4096133751306166, "grad_norm": 1.082326699127741, "learning_rate": 1.3592173760148957e-05, "loss": 0.1298, "step": 11530 }, { "epoch": 2.4098223615464995, "grad_norm": 0.9292994675254137, "learning_rate": 1.3591120913515883e-05, "loss": 0.1077, "step": 11531 }, { "epoch": 2.4100313479623825, "grad_norm": 0.9525813114053576, "learning_rate": 1.3590068021180121e-05, "loss": 0.1299, "step": 11532 }, { "epoch": 2.4102403343782655, "grad_norm": 1.009087673562831, "learning_rate": 1.3589015083155067e-05, "loss": 0.131, "step": 11533 }, { "epoch": 2.4104493207941484, "grad_norm": 0.9704524481284911, "learning_rate": 1.3587962099454127e-05, "loss": 0.1212, "step": 11534 }, { "epoch": 2.4106583072100314, "grad_norm": 0.9664154531965132, "learning_rate": 1.3586909070090699e-05, "loss": 0.124, "step": 11535 }, { "epoch": 2.4108672936259143, "grad_norm": 1.1238443490205368, "learning_rate": 1.3585855995078182e-05, "loss": 0.1544, "step": 11536 }, { "epoch": 2.4110762800417973, "grad_norm": 0.7281947520601693, "learning_rate": 1.358480287442998e-05, "loss": 0.0953, "step": 11537 }, { "epoch": 2.4112852664576803, "grad_norm": 1.1313048204732719, "learning_rate": 1.3583749708159501e-05, "loss": 0.1314, "step": 11538 }, { "epoch": 2.4114942528735632, "grad_norm": 0.9418131772484862, "learning_rate": 1.358269649628014e-05, "loss": 0.121, "step": 11539 }, { "epoch": 2.411703239289446, "grad_norm": 0.8608394103858868, "learning_rate": 1.3581643238805305e-05, "loss": 0.1065, "step": 11540 }, { "epoch": 2.411912225705329, "grad_norm": 1.0298969292826659, "learning_rate": 1.3580589935748398e-05, "loss": 0.1054, "step": 11541 }, { "epoch": 2.412121212121212, "grad_norm": 0.9123521959141574, "learning_rate": 1.3579536587122828e-05, "loss": 0.135, "step": 11542 }, { "epoch": 2.412330198537095, "grad_norm": 0.9508228744927172, "learning_rate": 1.3578483192941995e-05, "loss": 0.107, "step": 11543 }, { "epoch": 2.412539184952978, "grad_norm": 1.0778926418575625, "learning_rate": 1.3577429753219311e-05, "loss": 0.1161, "step": 11544 }, { "epoch": 2.412748171368861, "grad_norm": 1.0961022334864512, "learning_rate": 1.3576376267968177e-05, "loss": 0.129, "step": 11545 }, { "epoch": 2.412957157784744, "grad_norm": 0.9445675489551097, "learning_rate": 1.3575322737202006e-05, "loss": 0.1184, "step": 11546 }, { "epoch": 2.413166144200627, "grad_norm": 0.944718666162867, "learning_rate": 1.3574269160934202e-05, "loss": 0.1266, "step": 11547 }, { "epoch": 2.41337513061651, "grad_norm": 1.107185371283051, "learning_rate": 1.357321553917817e-05, "loss": 0.1087, "step": 11548 }, { "epoch": 2.413584117032393, "grad_norm": 0.9196816959568518, "learning_rate": 1.3572161871947327e-05, "loss": 0.1198, "step": 11549 }, { "epoch": 2.413793103448276, "grad_norm": 1.4542484010819765, "learning_rate": 1.3571108159255082e-05, "loss": 0.1114, "step": 11550 }, { "epoch": 2.4140020898641588, "grad_norm": 1.0572730189301238, "learning_rate": 1.3570054401114834e-05, "loss": 0.138, "step": 11551 }, { "epoch": 2.4142110762800417, "grad_norm": 0.8781897770723286, "learning_rate": 1.356900059754001e-05, "loss": 0.1188, "step": 11552 }, { "epoch": 2.4144200626959247, "grad_norm": 0.8532265244902489, "learning_rate": 1.3567946748544007e-05, "loss": 0.1079, "step": 11553 }, { "epoch": 2.4146290491118076, "grad_norm": 0.9046901248293482, "learning_rate": 1.3566892854140245e-05, "loss": 0.1333, "step": 11554 }, { "epoch": 2.4148380355276906, "grad_norm": 0.982192859058262, "learning_rate": 1.3565838914342136e-05, "loss": 0.1271, "step": 11555 }, { "epoch": 2.4150470219435736, "grad_norm": 0.9131986908574788, "learning_rate": 1.356478492916309e-05, "loss": 0.1353, "step": 11556 }, { "epoch": 2.4152560083594565, "grad_norm": 1.1200132733538746, "learning_rate": 1.3563730898616521e-05, "loss": 0.1349, "step": 11557 }, { "epoch": 2.4154649947753395, "grad_norm": 0.8905928960911988, "learning_rate": 1.3562676822715848e-05, "loss": 0.1208, "step": 11558 }, { "epoch": 2.4156739811912225, "grad_norm": 1.0845614434856095, "learning_rate": 1.356162270147448e-05, "loss": 0.1199, "step": 11559 }, { "epoch": 2.4158829676071054, "grad_norm": 1.11054964860629, "learning_rate": 1.3560568534905834e-05, "loss": 0.1422, "step": 11560 }, { "epoch": 2.4160919540229884, "grad_norm": 1.0240712933316547, "learning_rate": 1.3559514323023324e-05, "loss": 0.1242, "step": 11561 }, { "epoch": 2.4163009404388713, "grad_norm": 1.1038198420042804, "learning_rate": 1.3558460065840374e-05, "loss": 0.1339, "step": 11562 }, { "epoch": 2.4165099268547543, "grad_norm": 1.1197106281345808, "learning_rate": 1.355740576337039e-05, "loss": 0.1174, "step": 11563 }, { "epoch": 2.4167189132706373, "grad_norm": 1.025215452152, "learning_rate": 1.3556351415626803e-05, "loss": 0.1215, "step": 11564 }, { "epoch": 2.41692789968652, "grad_norm": 1.0021630465927576, "learning_rate": 1.3555297022623017e-05, "loss": 0.1565, "step": 11565 }, { "epoch": 2.417136886102403, "grad_norm": 0.8927054329934483, "learning_rate": 1.3554242584372462e-05, "loss": 0.1121, "step": 11566 }, { "epoch": 2.417345872518286, "grad_norm": 0.856503779370618, "learning_rate": 1.355318810088855e-05, "loss": 0.0946, "step": 11567 }, { "epoch": 2.417554858934169, "grad_norm": 0.8411172475308313, "learning_rate": 1.3552133572184709e-05, "loss": 0.1213, "step": 11568 }, { "epoch": 2.417763845350052, "grad_norm": 0.8512772914744015, "learning_rate": 1.3551078998274347e-05, "loss": 0.1302, "step": 11569 }, { "epoch": 2.417972831765935, "grad_norm": 0.9870367299056474, "learning_rate": 1.35500243791709e-05, "loss": 0.1144, "step": 11570 }, { "epoch": 2.418181818181818, "grad_norm": 0.9466446430654885, "learning_rate": 1.3548969714887781e-05, "loss": 0.1183, "step": 11571 }, { "epoch": 2.418390804597701, "grad_norm": 0.8447102175823216, "learning_rate": 1.354791500543841e-05, "loss": 0.1071, "step": 11572 }, { "epoch": 2.418599791013584, "grad_norm": 1.0109242359228956, "learning_rate": 1.3546860250836215e-05, "loss": 0.1162, "step": 11573 }, { "epoch": 2.418808777429467, "grad_norm": 0.7927831135149827, "learning_rate": 1.3545805451094622e-05, "loss": 0.1037, "step": 11574 }, { "epoch": 2.41901776384535, "grad_norm": 0.6329388292324956, "learning_rate": 1.3544750606227046e-05, "loss": 0.0878, "step": 11575 }, { "epoch": 2.419226750261233, "grad_norm": 0.9775587762920762, "learning_rate": 1.3543695716246918e-05, "loss": 0.1385, "step": 11576 }, { "epoch": 2.4194357366771158, "grad_norm": 0.9431470571662429, "learning_rate": 1.3542640781167663e-05, "loss": 0.0986, "step": 11577 }, { "epoch": 2.4196447230929987, "grad_norm": 1.00112522742493, "learning_rate": 1.3541585801002703e-05, "loss": 0.1177, "step": 11578 }, { "epoch": 2.4198537095088817, "grad_norm": 0.9918338762061082, "learning_rate": 1.3540530775765471e-05, "loss": 0.1146, "step": 11579 }, { "epoch": 2.420062695924765, "grad_norm": 0.8504042244390803, "learning_rate": 1.3539475705469388e-05, "loss": 0.0986, "step": 11580 }, { "epoch": 2.420271682340648, "grad_norm": 0.8794023547859923, "learning_rate": 1.3538420590127878e-05, "loss": 0.1008, "step": 11581 }, { "epoch": 2.420480668756531, "grad_norm": 0.8809174272210786, "learning_rate": 1.353736542975438e-05, "loss": 0.1027, "step": 11582 }, { "epoch": 2.420689655172414, "grad_norm": 0.8688699025511407, "learning_rate": 1.3536310224362313e-05, "loss": 0.1094, "step": 11583 }, { "epoch": 2.420898641588297, "grad_norm": 1.336533135032439, "learning_rate": 1.353525497396511e-05, "loss": 0.1613, "step": 11584 }, { "epoch": 2.42110762800418, "grad_norm": 0.9357092192513486, "learning_rate": 1.35341996785762e-05, "loss": 0.1241, "step": 11585 }, { "epoch": 2.421316614420063, "grad_norm": 1.1913648289430514, "learning_rate": 1.3533144338209019e-05, "loss": 0.1304, "step": 11586 }, { "epoch": 2.421525600835946, "grad_norm": 0.9844759409302372, "learning_rate": 1.3532088952876984e-05, "loss": 0.1369, "step": 11587 }, { "epoch": 2.4217345872518288, "grad_norm": 0.7595178559280096, "learning_rate": 1.3531033522593541e-05, "loss": 0.0782, "step": 11588 }, { "epoch": 2.4219435736677117, "grad_norm": 0.9376911077530032, "learning_rate": 1.3529978047372116e-05, "loss": 0.0921, "step": 11589 }, { "epoch": 2.4221525600835947, "grad_norm": 1.0328148954034184, "learning_rate": 1.3528922527226141e-05, "loss": 0.1438, "step": 11590 }, { "epoch": 2.4223615464994777, "grad_norm": 0.939220868460627, "learning_rate": 1.352786696216905e-05, "loss": 0.1273, "step": 11591 }, { "epoch": 2.4225705329153606, "grad_norm": 0.9245171464366015, "learning_rate": 1.3526811352214278e-05, "loss": 0.1073, "step": 11592 }, { "epoch": 2.4227795193312436, "grad_norm": 0.9741955137539204, "learning_rate": 1.3525755697375252e-05, "loss": 0.1167, "step": 11593 }, { "epoch": 2.4229885057471265, "grad_norm": 1.0286630464799151, "learning_rate": 1.3524699997665417e-05, "loss": 0.1261, "step": 11594 }, { "epoch": 2.4231974921630095, "grad_norm": 1.0839734696581451, "learning_rate": 1.3523644253098204e-05, "loss": 0.1193, "step": 11595 }, { "epoch": 2.4234064785788925, "grad_norm": 0.9595512704855043, "learning_rate": 1.3522588463687049e-05, "loss": 0.1174, "step": 11596 }, { "epoch": 2.4236154649947754, "grad_norm": 1.0422450565318853, "learning_rate": 1.352153262944539e-05, "loss": 0.1226, "step": 11597 }, { "epoch": 2.4238244514106584, "grad_norm": 0.9808860958573443, "learning_rate": 1.3520476750386664e-05, "loss": 0.1214, "step": 11598 }, { "epoch": 2.4240334378265413, "grad_norm": 0.9508595299702537, "learning_rate": 1.3519420826524304e-05, "loss": 0.121, "step": 11599 }, { "epoch": 2.4242424242424243, "grad_norm": 0.9166248665253132, "learning_rate": 1.3518364857871756e-05, "loss": 0.1287, "step": 11600 }, { "epoch": 2.4244514106583073, "grad_norm": 0.9313181361463215, "learning_rate": 1.351730884444245e-05, "loss": 0.1114, "step": 11601 }, { "epoch": 2.4246603970741902, "grad_norm": 1.0455178488328543, "learning_rate": 1.3516252786249832e-05, "loss": 0.1199, "step": 11602 }, { "epoch": 2.424869383490073, "grad_norm": 0.9032558663543259, "learning_rate": 1.3515196683307343e-05, "loss": 0.1185, "step": 11603 }, { "epoch": 2.425078369905956, "grad_norm": 0.9402317968817989, "learning_rate": 1.3514140535628419e-05, "loss": 0.1094, "step": 11604 }, { "epoch": 2.425287356321839, "grad_norm": 0.9671904518009374, "learning_rate": 1.3513084343226504e-05, "loss": 0.1294, "step": 11605 }, { "epoch": 2.425496342737722, "grad_norm": 0.9410961848078147, "learning_rate": 1.3512028106115036e-05, "loss": 0.1296, "step": 11606 }, { "epoch": 2.425705329153605, "grad_norm": 0.8218394393230982, "learning_rate": 1.3510971824307464e-05, "loss": 0.1076, "step": 11607 }, { "epoch": 2.425914315569488, "grad_norm": 1.0217903299987723, "learning_rate": 1.3509915497817223e-05, "loss": 0.1262, "step": 11608 }, { "epoch": 2.426123301985371, "grad_norm": 0.8530673374932239, "learning_rate": 1.3508859126657763e-05, "loss": 0.1013, "step": 11609 }, { "epoch": 2.426332288401254, "grad_norm": 0.9454670951871379, "learning_rate": 1.3507802710842523e-05, "loss": 0.1183, "step": 11610 }, { "epoch": 2.426541274817137, "grad_norm": 0.9803760987169845, "learning_rate": 1.3506746250384952e-05, "loss": 0.1237, "step": 11611 }, { "epoch": 2.42675026123302, "grad_norm": 0.8768328831898149, "learning_rate": 1.3505689745298493e-05, "loss": 0.1182, "step": 11612 }, { "epoch": 2.426959247648903, "grad_norm": 0.9227192463517095, "learning_rate": 1.3504633195596592e-05, "loss": 0.113, "step": 11613 }, { "epoch": 2.4271682340647858, "grad_norm": 0.8033663207594226, "learning_rate": 1.3503576601292692e-05, "loss": 0.1034, "step": 11614 }, { "epoch": 2.4273772204806687, "grad_norm": 0.9420054769264801, "learning_rate": 1.3502519962400246e-05, "loss": 0.1358, "step": 11615 }, { "epoch": 2.4275862068965517, "grad_norm": 1.0756993666484285, "learning_rate": 1.3501463278932698e-05, "loss": 0.1209, "step": 11616 }, { "epoch": 2.4277951933124347, "grad_norm": 0.9721697087610011, "learning_rate": 1.3500406550903495e-05, "loss": 0.1253, "step": 11617 }, { "epoch": 2.4280041797283176, "grad_norm": 0.9930787576111231, "learning_rate": 1.3499349778326085e-05, "loss": 0.1332, "step": 11618 }, { "epoch": 2.4282131661442006, "grad_norm": 0.9364634053244836, "learning_rate": 1.3498292961213924e-05, "loss": 0.1387, "step": 11619 }, { "epoch": 2.4284221525600835, "grad_norm": 0.9586294866318513, "learning_rate": 1.3497236099580451e-05, "loss": 0.0985, "step": 11620 }, { "epoch": 2.4286311389759665, "grad_norm": 0.9927592510371461, "learning_rate": 1.3496179193439125e-05, "loss": 0.1301, "step": 11621 }, { "epoch": 2.4288401253918495, "grad_norm": 1.0278173782522813, "learning_rate": 1.3495122242803391e-05, "loss": 0.1285, "step": 11622 }, { "epoch": 2.4290491118077324, "grad_norm": 0.912330694307987, "learning_rate": 1.3494065247686706e-05, "loss": 0.1241, "step": 11623 }, { "epoch": 2.4292580982236154, "grad_norm": 1.0458612192528243, "learning_rate": 1.3493008208102517e-05, "loss": 0.1146, "step": 11624 }, { "epoch": 2.4294670846394983, "grad_norm": 1.0248562161130892, "learning_rate": 1.349195112406428e-05, "loss": 0.124, "step": 11625 }, { "epoch": 2.4296760710553813, "grad_norm": 0.9127131771038827, "learning_rate": 1.3490893995585444e-05, "loss": 0.0946, "step": 11626 }, { "epoch": 2.4298850574712643, "grad_norm": 0.8548073494456404, "learning_rate": 1.3489836822679467e-05, "loss": 0.1221, "step": 11627 }, { "epoch": 2.4300940438871472, "grad_norm": 1.0112585740426148, "learning_rate": 1.3488779605359798e-05, "loss": 0.1232, "step": 11628 }, { "epoch": 2.43030303030303, "grad_norm": 1.232077438454572, "learning_rate": 1.3487722343639898e-05, "loss": 0.1648, "step": 11629 }, { "epoch": 2.430512016718913, "grad_norm": 1.013652089565491, "learning_rate": 1.3486665037533218e-05, "loss": 0.1187, "step": 11630 }, { "epoch": 2.430721003134796, "grad_norm": 1.0055941916245896, "learning_rate": 1.3485607687053217e-05, "loss": 0.1192, "step": 11631 }, { "epoch": 2.430929989550679, "grad_norm": 0.9878462931936601, "learning_rate": 1.3484550292213346e-05, "loss": 0.1278, "step": 11632 }, { "epoch": 2.431138975966562, "grad_norm": 0.894546932872488, "learning_rate": 1.348349285302707e-05, "loss": 0.1052, "step": 11633 }, { "epoch": 2.431347962382445, "grad_norm": 1.1538664394459466, "learning_rate": 1.3482435369507841e-05, "loss": 0.1297, "step": 11634 }, { "epoch": 2.431556948798328, "grad_norm": 1.0457484511872301, "learning_rate": 1.3481377841669114e-05, "loss": 0.1268, "step": 11635 }, { "epoch": 2.431765935214211, "grad_norm": 1.1331178570333267, "learning_rate": 1.3480320269524356e-05, "loss": 0.1311, "step": 11636 }, { "epoch": 2.431974921630094, "grad_norm": 0.9992226168479509, "learning_rate": 1.3479262653087025e-05, "loss": 0.1274, "step": 11637 }, { "epoch": 2.432183908045977, "grad_norm": 1.1819539224900446, "learning_rate": 1.3478204992370572e-05, "loss": 0.1346, "step": 11638 }, { "epoch": 2.43239289446186, "grad_norm": 1.0245679174449227, "learning_rate": 1.347714728738847e-05, "loss": 0.1348, "step": 11639 }, { "epoch": 2.4326018808777428, "grad_norm": 1.0611981663632286, "learning_rate": 1.347608953815417e-05, "loss": 0.1325, "step": 11640 }, { "epoch": 2.4328108672936257, "grad_norm": 0.875202827390351, "learning_rate": 1.3475031744681137e-05, "loss": 0.089, "step": 11641 }, { "epoch": 2.433019853709509, "grad_norm": 0.9385324524457697, "learning_rate": 1.3473973906982834e-05, "loss": 0.1206, "step": 11642 }, { "epoch": 2.433228840125392, "grad_norm": 0.9333826845614642, "learning_rate": 1.3472916025072724e-05, "loss": 0.1117, "step": 11643 }, { "epoch": 2.433437826541275, "grad_norm": 0.884307466922124, "learning_rate": 1.3471858098964266e-05, "loss": 0.1105, "step": 11644 }, { "epoch": 2.433646812957158, "grad_norm": 0.936006187218356, "learning_rate": 1.3470800128670928e-05, "loss": 0.1233, "step": 11645 }, { "epoch": 2.433855799373041, "grad_norm": 0.9762637855610873, "learning_rate": 1.3469742114206175e-05, "loss": 0.1225, "step": 11646 }, { "epoch": 2.434064785788924, "grad_norm": 1.0568274016549792, "learning_rate": 1.3468684055583469e-05, "loss": 0.1335, "step": 11647 }, { "epoch": 2.434273772204807, "grad_norm": 1.0894107373290776, "learning_rate": 1.3467625952816276e-05, "loss": 0.1392, "step": 11648 }, { "epoch": 2.43448275862069, "grad_norm": 1.3040539544917165, "learning_rate": 1.3466567805918065e-05, "loss": 0.1359, "step": 11649 }, { "epoch": 2.434691745036573, "grad_norm": 0.9441540176118655, "learning_rate": 1.3465509614902297e-05, "loss": 0.0952, "step": 11650 }, { "epoch": 2.434900731452456, "grad_norm": 0.9849930647350504, "learning_rate": 1.3464451379782445e-05, "loss": 0.1298, "step": 11651 }, { "epoch": 2.4351097178683387, "grad_norm": 1.0487698446915106, "learning_rate": 1.3463393100571974e-05, "loss": 0.111, "step": 11652 }, { "epoch": 2.4353187042842217, "grad_norm": 0.971080853344741, "learning_rate": 1.3462334777284349e-05, "loss": 0.1251, "step": 11653 }, { "epoch": 2.4355276907001047, "grad_norm": 0.9107739696954859, "learning_rate": 1.3461276409933044e-05, "loss": 0.1063, "step": 11654 }, { "epoch": 2.4357366771159876, "grad_norm": 1.2406413648930423, "learning_rate": 1.346021799853153e-05, "loss": 0.1485, "step": 11655 }, { "epoch": 2.4359456635318706, "grad_norm": 1.183946226288747, "learning_rate": 1.3459159543093268e-05, "loss": 0.1196, "step": 11656 }, { "epoch": 2.4361546499477535, "grad_norm": 0.8508207610037337, "learning_rate": 1.345810104363174e-05, "loss": 0.0977, "step": 11657 }, { "epoch": 2.4363636363636365, "grad_norm": 0.9515171440406949, "learning_rate": 1.3457042500160407e-05, "loss": 0.1067, "step": 11658 }, { "epoch": 2.4365726227795195, "grad_norm": 0.8375441268248973, "learning_rate": 1.3455983912692746e-05, "loss": 0.1031, "step": 11659 }, { "epoch": 2.4367816091954024, "grad_norm": 1.0377003718385178, "learning_rate": 1.3454925281242225e-05, "loss": 0.1265, "step": 11660 }, { "epoch": 2.4369905956112854, "grad_norm": 1.1685648033183678, "learning_rate": 1.3453866605822325e-05, "loss": 0.1045, "step": 11661 }, { "epoch": 2.4371995820271684, "grad_norm": 1.0919381876765224, "learning_rate": 1.3452807886446509e-05, "loss": 0.1417, "step": 11662 }, { "epoch": 2.4374085684430513, "grad_norm": 1.1374789818698468, "learning_rate": 1.345174912312826e-05, "loss": 0.1551, "step": 11663 }, { "epoch": 2.4376175548589343, "grad_norm": 0.9293630915748375, "learning_rate": 1.3450690315881047e-05, "loss": 0.0958, "step": 11664 }, { "epoch": 2.4378265412748172, "grad_norm": 1.0569529749658475, "learning_rate": 1.3449631464718347e-05, "loss": 0.1189, "step": 11665 }, { "epoch": 2.4380355276907, "grad_norm": 1.0308965853640877, "learning_rate": 1.3448572569653634e-05, "loss": 0.1348, "step": 11666 }, { "epoch": 2.438244514106583, "grad_norm": 1.0246433245211952, "learning_rate": 1.3447513630700385e-05, "loss": 0.1358, "step": 11667 }, { "epoch": 2.438453500522466, "grad_norm": 1.0388300233399987, "learning_rate": 1.3446454647872074e-05, "loss": 0.1228, "step": 11668 }, { "epoch": 2.438662486938349, "grad_norm": 1.0369756464203639, "learning_rate": 1.3445395621182186e-05, "loss": 0.1261, "step": 11669 }, { "epoch": 2.438871473354232, "grad_norm": 0.9843091838823549, "learning_rate": 1.3444336550644192e-05, "loss": 0.1213, "step": 11670 }, { "epoch": 2.439080459770115, "grad_norm": 0.9512035576620982, "learning_rate": 1.3443277436271571e-05, "loss": 0.1237, "step": 11671 }, { "epoch": 2.439289446185998, "grad_norm": 1.1442111409138644, "learning_rate": 1.3442218278077804e-05, "loss": 0.1368, "step": 11672 }, { "epoch": 2.439498432601881, "grad_norm": 0.988520731068378, "learning_rate": 1.3441159076076371e-05, "loss": 0.1354, "step": 11673 }, { "epoch": 2.439707419017764, "grad_norm": 1.124340972740101, "learning_rate": 1.3440099830280744e-05, "loss": 0.1526, "step": 11674 }, { "epoch": 2.439916405433647, "grad_norm": 1.0122663459673793, "learning_rate": 1.3439040540704418e-05, "loss": 0.1387, "step": 11675 }, { "epoch": 2.44012539184953, "grad_norm": 1.0042399337754775, "learning_rate": 1.3437981207360862e-05, "loss": 0.1342, "step": 11676 }, { "epoch": 2.4403343782654128, "grad_norm": 0.8921655758970366, "learning_rate": 1.3436921830263563e-05, "loss": 0.1187, "step": 11677 }, { "epoch": 2.4405433646812957, "grad_norm": 0.9673140746560819, "learning_rate": 1.3435862409426003e-05, "loss": 0.1284, "step": 11678 }, { "epoch": 2.4407523510971787, "grad_norm": 0.9068645527702959, "learning_rate": 1.3434802944861662e-05, "loss": 0.1077, "step": 11679 }, { "epoch": 2.4409613375130617, "grad_norm": 1.079296241115122, "learning_rate": 1.3433743436584026e-05, "loss": 0.119, "step": 11680 }, { "epoch": 2.4411703239289446, "grad_norm": 1.0265975936243525, "learning_rate": 1.3432683884606576e-05, "loss": 0.1145, "step": 11681 }, { "epoch": 2.4413793103448276, "grad_norm": 1.0081959013694144, "learning_rate": 1.3431624288942804e-05, "loss": 0.1178, "step": 11682 }, { "epoch": 2.4415882967607105, "grad_norm": 0.9368461473145706, "learning_rate": 1.3430564649606184e-05, "loss": 0.1199, "step": 11683 }, { "epoch": 2.4417972831765935, "grad_norm": 2.205306263881927, "learning_rate": 1.3429504966610211e-05, "loss": 0.1215, "step": 11684 }, { "epoch": 2.4420062695924765, "grad_norm": 1.052880684000366, "learning_rate": 1.3428445239968366e-05, "loss": 0.1417, "step": 11685 }, { "epoch": 2.4422152560083594, "grad_norm": 0.9575291692664494, "learning_rate": 1.3427385469694137e-05, "loss": 0.1147, "step": 11686 }, { "epoch": 2.4424242424242424, "grad_norm": 0.8120518731418549, "learning_rate": 1.3426325655801011e-05, "loss": 0.088, "step": 11687 }, { "epoch": 2.4426332288401253, "grad_norm": 0.9819516272086607, "learning_rate": 1.3425265798302479e-05, "loss": 0.1372, "step": 11688 }, { "epoch": 2.4428422152560083, "grad_norm": 0.9125692423837299, "learning_rate": 1.342420589721202e-05, "loss": 0.1089, "step": 11689 }, { "epoch": 2.4430512016718913, "grad_norm": 1.0977474651518144, "learning_rate": 1.3423145952543137e-05, "loss": 0.1224, "step": 11690 }, { "epoch": 2.4432601880877742, "grad_norm": 0.8971301364868683, "learning_rate": 1.3422085964309308e-05, "loss": 0.1316, "step": 11691 }, { "epoch": 2.443469174503657, "grad_norm": 1.0686579359982602, "learning_rate": 1.3421025932524027e-05, "loss": 0.1324, "step": 11692 }, { "epoch": 2.44367816091954, "grad_norm": 1.0277781078494517, "learning_rate": 1.3419965857200784e-05, "loss": 0.1336, "step": 11693 }, { "epoch": 2.443887147335423, "grad_norm": 0.8847727436292341, "learning_rate": 1.3418905738353077e-05, "loss": 0.1111, "step": 11694 }, { "epoch": 2.444096133751306, "grad_norm": 1.0413471258025702, "learning_rate": 1.3417845575994381e-05, "loss": 0.1479, "step": 11695 }, { "epoch": 2.444305120167189, "grad_norm": 0.9628407255829993, "learning_rate": 1.3416785370138206e-05, "loss": 0.1217, "step": 11696 }, { "epoch": 2.444514106583072, "grad_norm": 1.0550848116862739, "learning_rate": 1.3415725120798036e-05, "loss": 0.1207, "step": 11697 }, { "epoch": 2.444723092998955, "grad_norm": 0.9963429219149165, "learning_rate": 1.3414664827987364e-05, "loss": 0.1235, "step": 11698 }, { "epoch": 2.444932079414838, "grad_norm": 0.9115819025613271, "learning_rate": 1.3413604491719688e-05, "loss": 0.1129, "step": 11699 }, { "epoch": 2.445141065830721, "grad_norm": 1.0075474729230625, "learning_rate": 1.3412544112008499e-05, "loss": 0.1382, "step": 11700 }, { "epoch": 2.445350052246604, "grad_norm": 0.9658960030269594, "learning_rate": 1.341148368886729e-05, "loss": 0.1117, "step": 11701 }, { "epoch": 2.445559038662487, "grad_norm": 1.032881012198799, "learning_rate": 1.3410423222309566e-05, "loss": 0.1549, "step": 11702 }, { "epoch": 2.4457680250783698, "grad_norm": 0.896215187275305, "learning_rate": 1.3409362712348812e-05, "loss": 0.1195, "step": 11703 }, { "epoch": 2.4459770114942527, "grad_norm": 0.8365107589549451, "learning_rate": 1.340830215899853e-05, "loss": 0.0995, "step": 11704 }, { "epoch": 2.4461859979101357, "grad_norm": 0.9746772466061405, "learning_rate": 1.3407241562272217e-05, "loss": 0.1147, "step": 11705 }, { "epoch": 2.4463949843260187, "grad_norm": 1.0773743326441363, "learning_rate": 1.3406180922183371e-05, "loss": 0.1165, "step": 11706 }, { "epoch": 2.4466039707419016, "grad_norm": 0.8599193705614251, "learning_rate": 1.3405120238745489e-05, "loss": 0.1198, "step": 11707 }, { "epoch": 2.4468129571577846, "grad_norm": 0.9269058179994081, "learning_rate": 1.3404059511972072e-05, "loss": 0.1323, "step": 11708 }, { "epoch": 2.4470219435736675, "grad_norm": 0.8833026089445277, "learning_rate": 1.3402998741876619e-05, "loss": 0.1081, "step": 11709 }, { "epoch": 2.4472309299895505, "grad_norm": 1.017996821165573, "learning_rate": 1.3401937928472626e-05, "loss": 0.1016, "step": 11710 }, { "epoch": 2.4474399164054335, "grad_norm": 0.8113058562668259, "learning_rate": 1.3400877071773597e-05, "loss": 0.1016, "step": 11711 }, { "epoch": 2.4476489028213164, "grad_norm": 0.8506359081214617, "learning_rate": 1.3399816171793034e-05, "loss": 0.1169, "step": 11712 }, { "epoch": 2.4478578892371994, "grad_norm": 0.8682305129407035, "learning_rate": 1.3398755228544436e-05, "loss": 0.1098, "step": 11713 }, { "epoch": 2.4480668756530823, "grad_norm": 0.9906857991450864, "learning_rate": 1.3397694242041309e-05, "loss": 0.1061, "step": 11714 }, { "epoch": 2.4482758620689653, "grad_norm": 1.1300376723223868, "learning_rate": 1.339663321229715e-05, "loss": 0.1266, "step": 11715 }, { "epoch": 2.4484848484848483, "grad_norm": 1.061342212642982, "learning_rate": 1.3395572139325466e-05, "loss": 0.1034, "step": 11716 }, { "epoch": 2.4486938349007312, "grad_norm": 0.9498485351892204, "learning_rate": 1.339451102313976e-05, "loss": 0.1149, "step": 11717 }, { "epoch": 2.448902821316614, "grad_norm": 1.0863141607594937, "learning_rate": 1.339344986375354e-05, "loss": 0.1104, "step": 11718 }, { "epoch": 2.449111807732497, "grad_norm": 1.1056529891004931, "learning_rate": 1.3392388661180303e-05, "loss": 0.1379, "step": 11719 }, { "epoch": 2.44932079414838, "grad_norm": 0.9425650237814158, "learning_rate": 1.3391327415433564e-05, "loss": 0.1183, "step": 11720 }, { "epoch": 2.4495297805642635, "grad_norm": 1.142975339159467, "learning_rate": 1.339026612652682e-05, "loss": 0.1351, "step": 11721 }, { "epoch": 2.4497387669801465, "grad_norm": 1.1153859982227994, "learning_rate": 1.3389204794473582e-05, "loss": 0.1409, "step": 11722 }, { "epoch": 2.4499477533960294, "grad_norm": 0.9470958383116507, "learning_rate": 1.3388143419287357e-05, "loss": 0.1086, "step": 11723 }, { "epoch": 2.4501567398119124, "grad_norm": 0.9662737785833224, "learning_rate": 1.3387082000981657e-05, "loss": 0.1259, "step": 11724 }, { "epoch": 2.4503657262277954, "grad_norm": 1.1643822960058756, "learning_rate": 1.3386020539569979e-05, "loss": 0.1461, "step": 11725 }, { "epoch": 2.4505747126436783, "grad_norm": 1.0389288293602126, "learning_rate": 1.3384959035065844e-05, "loss": 0.1312, "step": 11726 }, { "epoch": 2.4507836990595613, "grad_norm": 0.9187523947929246, "learning_rate": 1.3383897487482753e-05, "loss": 0.1267, "step": 11727 }, { "epoch": 2.4509926854754442, "grad_norm": 1.0634132227126956, "learning_rate": 1.3382835896834218e-05, "loss": 0.1147, "step": 11728 }, { "epoch": 2.451201671891327, "grad_norm": 1.0869105052276296, "learning_rate": 1.3381774263133751e-05, "loss": 0.132, "step": 11729 }, { "epoch": 2.45141065830721, "grad_norm": 0.906074715382183, "learning_rate": 1.3380712586394863e-05, "loss": 0.1269, "step": 11730 }, { "epoch": 2.451619644723093, "grad_norm": 0.9018606646320472, "learning_rate": 1.3379650866631062e-05, "loss": 0.0945, "step": 11731 }, { "epoch": 2.451828631138976, "grad_norm": 1.1353457196491592, "learning_rate": 1.3378589103855866e-05, "loss": 0.1544, "step": 11732 }, { "epoch": 2.452037617554859, "grad_norm": 0.9548862716106541, "learning_rate": 1.3377527298082782e-05, "loss": 0.1124, "step": 11733 }, { "epoch": 2.452246603970742, "grad_norm": 1.2604690136341572, "learning_rate": 1.3376465449325326e-05, "loss": 0.144, "step": 11734 }, { "epoch": 2.452455590386625, "grad_norm": 1.0687948216088043, "learning_rate": 1.337540355759701e-05, "loss": 0.1348, "step": 11735 }, { "epoch": 2.452664576802508, "grad_norm": 0.7367773800017857, "learning_rate": 1.3374341622911354e-05, "loss": 0.094, "step": 11736 }, { "epoch": 2.452873563218391, "grad_norm": 0.8571656639796117, "learning_rate": 1.337327964528186e-05, "loss": 0.1355, "step": 11737 }, { "epoch": 2.453082549634274, "grad_norm": 1.014021706409738, "learning_rate": 1.337221762472206e-05, "loss": 0.132, "step": 11738 }, { "epoch": 2.453291536050157, "grad_norm": 0.8479416315860901, "learning_rate": 1.3371155561245454e-05, "loss": 0.1146, "step": 11739 }, { "epoch": 2.45350052246604, "grad_norm": 0.8592927840686008, "learning_rate": 1.3370093454865571e-05, "loss": 0.1142, "step": 11740 }, { "epoch": 2.4537095088819227, "grad_norm": 1.077721902668532, "learning_rate": 1.3369031305595921e-05, "loss": 0.1107, "step": 11741 }, { "epoch": 2.4539184952978057, "grad_norm": 1.031318524325911, "learning_rate": 1.3367969113450026e-05, "loss": 0.127, "step": 11742 }, { "epoch": 2.4541274817136887, "grad_norm": 0.8506554421989247, "learning_rate": 1.3366906878441395e-05, "loss": 0.1203, "step": 11743 }, { "epoch": 2.4543364681295716, "grad_norm": 0.8755189581799746, "learning_rate": 1.336584460058356e-05, "loss": 0.0973, "step": 11744 }, { "epoch": 2.4545454545454546, "grad_norm": 0.876024518267558, "learning_rate": 1.336478227989003e-05, "loss": 0.1025, "step": 11745 }, { "epoch": 2.4547544409613375, "grad_norm": 1.1922799235050616, "learning_rate": 1.3363719916374324e-05, "loss": 0.1363, "step": 11746 }, { "epoch": 2.4549634273772205, "grad_norm": 0.7353673398762552, "learning_rate": 1.3362657510049972e-05, "loss": 0.1088, "step": 11747 }, { "epoch": 2.4551724137931035, "grad_norm": 0.9086405986880391, "learning_rate": 1.3361595060930487e-05, "loss": 0.1122, "step": 11748 }, { "epoch": 2.4553814002089864, "grad_norm": 1.1054423371527897, "learning_rate": 1.3360532569029391e-05, "loss": 0.1271, "step": 11749 }, { "epoch": 2.4555903866248694, "grad_norm": 0.9325534709947984, "learning_rate": 1.3359470034360211e-05, "loss": 0.1159, "step": 11750 }, { "epoch": 2.4557993730407524, "grad_norm": 0.8417688216585271, "learning_rate": 1.3358407456936463e-05, "loss": 0.1058, "step": 11751 }, { "epoch": 2.4560083594566353, "grad_norm": 1.0168412355573986, "learning_rate": 1.3357344836771674e-05, "loss": 0.1262, "step": 11752 }, { "epoch": 2.4562173458725183, "grad_norm": 0.84235796114937, "learning_rate": 1.3356282173879364e-05, "loss": 0.0919, "step": 11753 }, { "epoch": 2.4564263322884012, "grad_norm": 1.1053236879458261, "learning_rate": 1.335521946827306e-05, "loss": 0.1163, "step": 11754 }, { "epoch": 2.456635318704284, "grad_norm": 0.9791689830333536, "learning_rate": 1.3354156719966285e-05, "loss": 0.1222, "step": 11755 }, { "epoch": 2.456844305120167, "grad_norm": 1.1691760657582564, "learning_rate": 1.3353093928972567e-05, "loss": 0.1264, "step": 11756 }, { "epoch": 2.45705329153605, "grad_norm": 0.9274660716855965, "learning_rate": 1.3352031095305428e-05, "loss": 0.1021, "step": 11757 }, { "epoch": 2.457262277951933, "grad_norm": 1.051346322552745, "learning_rate": 1.3350968218978395e-05, "loss": 0.1412, "step": 11758 }, { "epoch": 2.457471264367816, "grad_norm": 0.9161770654174525, "learning_rate": 1.3349905300005e-05, "loss": 0.1339, "step": 11759 }, { "epoch": 2.457680250783699, "grad_norm": 1.0247028184635094, "learning_rate": 1.3348842338398761e-05, "loss": 0.1145, "step": 11760 }, { "epoch": 2.457889237199582, "grad_norm": 1.024820093504524, "learning_rate": 1.3347779334173212e-05, "loss": 0.1195, "step": 11761 }, { "epoch": 2.458098223615465, "grad_norm": 1.1746729151810356, "learning_rate": 1.3346716287341881e-05, "loss": 0.134, "step": 11762 }, { "epoch": 2.458307210031348, "grad_norm": 0.9623584161519372, "learning_rate": 1.3345653197918298e-05, "loss": 0.1161, "step": 11763 }, { "epoch": 2.458516196447231, "grad_norm": 1.0919632947644013, "learning_rate": 1.3344590065915987e-05, "loss": 0.089, "step": 11764 }, { "epoch": 2.458725182863114, "grad_norm": 0.9995070007643913, "learning_rate": 1.3343526891348484e-05, "loss": 0.1011, "step": 11765 }, { "epoch": 2.4589341692789968, "grad_norm": 1.0367262190632132, "learning_rate": 1.3342463674229317e-05, "loss": 0.1404, "step": 11766 }, { "epoch": 2.4591431556948797, "grad_norm": 1.0040713561821824, "learning_rate": 1.3341400414572018e-05, "loss": 0.1161, "step": 11767 }, { "epoch": 2.4593521421107627, "grad_norm": 1.2262991817135376, "learning_rate": 1.3340337112390116e-05, "loss": 0.1424, "step": 11768 }, { "epoch": 2.4595611285266457, "grad_norm": 1.0607122895427412, "learning_rate": 1.333927376769715e-05, "loss": 0.1528, "step": 11769 }, { "epoch": 2.4597701149425286, "grad_norm": 1.189906350937594, "learning_rate": 1.3338210380506642e-05, "loss": 0.144, "step": 11770 }, { "epoch": 2.4599791013584116, "grad_norm": 0.9951794748993461, "learning_rate": 1.3337146950832136e-05, "loss": 0.1143, "step": 11771 }, { "epoch": 2.4601880877742945, "grad_norm": 0.9308927762910725, "learning_rate": 1.3336083478687158e-05, "loss": 0.1002, "step": 11772 }, { "epoch": 2.4603970741901775, "grad_norm": 1.1488152728770464, "learning_rate": 1.3335019964085247e-05, "loss": 0.1312, "step": 11773 }, { "epoch": 2.4606060606060605, "grad_norm": 0.9709361500892576, "learning_rate": 1.3333956407039934e-05, "loss": 0.1208, "step": 11774 }, { "epoch": 2.4608150470219434, "grad_norm": 0.901991879568454, "learning_rate": 1.3332892807564761e-05, "loss": 0.1132, "step": 11775 }, { "epoch": 2.4610240334378264, "grad_norm": 1.087660565246464, "learning_rate": 1.3331829165673255e-05, "loss": 0.1448, "step": 11776 }, { "epoch": 2.4612330198537093, "grad_norm": 1.0272562725697623, "learning_rate": 1.3330765481378962e-05, "loss": 0.1202, "step": 11777 }, { "epoch": 2.4614420062695923, "grad_norm": 0.8732994239748013, "learning_rate": 1.3329701754695412e-05, "loss": 0.1083, "step": 11778 }, { "epoch": 2.4616509926854753, "grad_norm": 0.8832320727366468, "learning_rate": 1.3328637985636146e-05, "loss": 0.1095, "step": 11779 }, { "epoch": 2.4618599791013582, "grad_norm": 0.912424388677352, "learning_rate": 1.33275741742147e-05, "loss": 0.1284, "step": 11780 }, { "epoch": 2.462068965517241, "grad_norm": 0.8188514476203841, "learning_rate": 1.3326510320444616e-05, "loss": 0.1129, "step": 11781 }, { "epoch": 2.462277951933124, "grad_norm": 0.821006314872558, "learning_rate": 1.332544642433943e-05, "loss": 0.0977, "step": 11782 }, { "epoch": 2.4624869383490076, "grad_norm": 1.119697088487702, "learning_rate": 1.3324382485912683e-05, "loss": 0.1292, "step": 11783 }, { "epoch": 2.4626959247648905, "grad_norm": 0.8552984034979932, "learning_rate": 1.3323318505177913e-05, "loss": 0.1109, "step": 11784 }, { "epoch": 2.4629049111807735, "grad_norm": 1.217202583757344, "learning_rate": 1.3322254482148664e-05, "loss": 0.1403, "step": 11785 }, { "epoch": 2.4631138975966564, "grad_norm": 1.0125348387955402, "learning_rate": 1.3321190416838476e-05, "loss": 0.1277, "step": 11786 }, { "epoch": 2.4633228840125394, "grad_norm": 1.0625204450432322, "learning_rate": 1.3320126309260895e-05, "loss": 0.1212, "step": 11787 }, { "epoch": 2.4635318704284224, "grad_norm": 0.911469103183236, "learning_rate": 1.3319062159429455e-05, "loss": 0.1128, "step": 11788 }, { "epoch": 2.4637408568443053, "grad_norm": 1.0022129113087055, "learning_rate": 1.3317997967357708e-05, "loss": 0.1234, "step": 11789 }, { "epoch": 2.4639498432601883, "grad_norm": 2.1317940381382394, "learning_rate": 1.331693373305919e-05, "loss": 0.1255, "step": 11790 }, { "epoch": 2.4641588296760712, "grad_norm": 0.8402516999351796, "learning_rate": 1.3315869456547448e-05, "loss": 0.1106, "step": 11791 }, { "epoch": 2.464367816091954, "grad_norm": 0.7981360450835432, "learning_rate": 1.3314805137836029e-05, "loss": 0.1083, "step": 11792 }, { "epoch": 2.464576802507837, "grad_norm": 0.895462200036838, "learning_rate": 1.3313740776938476e-05, "loss": 0.0913, "step": 11793 }, { "epoch": 2.46478578892372, "grad_norm": 1.044884926612539, "learning_rate": 1.3312676373868332e-05, "loss": 0.1317, "step": 11794 }, { "epoch": 2.464994775339603, "grad_norm": 0.979155597714178, "learning_rate": 1.3311611928639147e-05, "loss": 0.1285, "step": 11795 }, { "epoch": 2.465203761755486, "grad_norm": 0.8875786535630987, "learning_rate": 1.3310547441264468e-05, "loss": 0.1015, "step": 11796 }, { "epoch": 2.465412748171369, "grad_norm": 1.085879976780801, "learning_rate": 1.3309482911757838e-05, "loss": 0.1215, "step": 11797 }, { "epoch": 2.465621734587252, "grad_norm": 1.0358194642660883, "learning_rate": 1.3308418340132811e-05, "loss": 0.1335, "step": 11798 }, { "epoch": 2.465830721003135, "grad_norm": 0.8642048590664608, "learning_rate": 1.3307353726402932e-05, "loss": 0.1166, "step": 11799 }, { "epoch": 2.466039707419018, "grad_norm": 0.9930194551270135, "learning_rate": 1.3306289070581746e-05, "loss": 0.1237, "step": 11800 }, { "epoch": 2.466248693834901, "grad_norm": 1.1147423651265511, "learning_rate": 1.3305224372682813e-05, "loss": 0.1139, "step": 11801 }, { "epoch": 2.466457680250784, "grad_norm": 1.16308924424549, "learning_rate": 1.3304159632719671e-05, "loss": 0.1257, "step": 11802 }, { "epoch": 2.466666666666667, "grad_norm": 0.9667153043580968, "learning_rate": 1.3303094850705877e-05, "loss": 0.1192, "step": 11803 }, { "epoch": 2.4668756530825497, "grad_norm": 0.9572332955731021, "learning_rate": 1.3302030026654981e-05, "loss": 0.129, "step": 11804 }, { "epoch": 2.4670846394984327, "grad_norm": 1.1522639393359888, "learning_rate": 1.3300965160580536e-05, "loss": 0.111, "step": 11805 }, { "epoch": 2.4672936259143157, "grad_norm": 0.9549814669527644, "learning_rate": 1.329990025249609e-05, "loss": 0.1115, "step": 11806 }, { "epoch": 2.4675026123301986, "grad_norm": 1.0975036226006747, "learning_rate": 1.3298835302415202e-05, "loss": 0.1213, "step": 11807 }, { "epoch": 2.4677115987460816, "grad_norm": 1.060501243387582, "learning_rate": 1.329777031035142e-05, "loss": 0.1306, "step": 11808 }, { "epoch": 2.4679205851619646, "grad_norm": 0.9918519744409965, "learning_rate": 1.3296705276318297e-05, "loss": 0.1187, "step": 11809 }, { "epoch": 2.4681295715778475, "grad_norm": 1.0635156762801836, "learning_rate": 1.3295640200329389e-05, "loss": 0.1334, "step": 11810 }, { "epoch": 2.4683385579937305, "grad_norm": 1.1593581617346047, "learning_rate": 1.3294575082398255e-05, "loss": 0.1347, "step": 11811 }, { "epoch": 2.4685475444096134, "grad_norm": 1.100978896972991, "learning_rate": 1.3293509922538444e-05, "loss": 0.1179, "step": 11812 }, { "epoch": 2.4687565308254964, "grad_norm": 1.2378200626927784, "learning_rate": 1.3292444720763515e-05, "loss": 0.1622, "step": 11813 }, { "epoch": 2.4689655172413794, "grad_norm": 1.0675225762379765, "learning_rate": 1.3291379477087024e-05, "loss": 0.1428, "step": 11814 }, { "epoch": 2.4691745036572623, "grad_norm": 0.8630154478458413, "learning_rate": 1.3290314191522527e-05, "loss": 0.1034, "step": 11815 }, { "epoch": 2.4693834900731453, "grad_norm": 0.9918094637610162, "learning_rate": 1.328924886408358e-05, "loss": 0.1375, "step": 11816 }, { "epoch": 2.4695924764890282, "grad_norm": 1.0231188841032923, "learning_rate": 1.3288183494783747e-05, "loss": 0.1356, "step": 11817 }, { "epoch": 2.469801462904911, "grad_norm": 0.8496529514554437, "learning_rate": 1.3287118083636576e-05, "loss": 0.1039, "step": 11818 }, { "epoch": 2.470010449320794, "grad_norm": 0.9626111255342704, "learning_rate": 1.328605263065564e-05, "loss": 0.1217, "step": 11819 }, { "epoch": 2.470219435736677, "grad_norm": 1.1440939304470528, "learning_rate": 1.3284987135854488e-05, "loss": 0.1223, "step": 11820 }, { "epoch": 2.47042842215256, "grad_norm": 1.03844094286348, "learning_rate": 1.3283921599246683e-05, "loss": 0.1471, "step": 11821 }, { "epoch": 2.470637408568443, "grad_norm": 0.9805396154021978, "learning_rate": 1.3282856020845785e-05, "loss": 0.1317, "step": 11822 }, { "epoch": 2.470846394984326, "grad_norm": 0.9529965186161975, "learning_rate": 1.3281790400665357e-05, "loss": 0.0983, "step": 11823 }, { "epoch": 2.471055381400209, "grad_norm": 1.1389890073081346, "learning_rate": 1.3280724738718957e-05, "loss": 0.144, "step": 11824 }, { "epoch": 2.471264367816092, "grad_norm": 1.1197471422462075, "learning_rate": 1.3279659035020156e-05, "loss": 0.1054, "step": 11825 }, { "epoch": 2.471473354231975, "grad_norm": 1.1759335470642556, "learning_rate": 1.3278593289582509e-05, "loss": 0.1378, "step": 11826 }, { "epoch": 2.471682340647858, "grad_norm": 0.8106133756880058, "learning_rate": 1.3277527502419578e-05, "loss": 0.1104, "step": 11827 }, { "epoch": 2.471891327063741, "grad_norm": 1.1347427562659516, "learning_rate": 1.3276461673544932e-05, "loss": 0.1349, "step": 11828 }, { "epoch": 2.472100313479624, "grad_norm": 0.9227548332229006, "learning_rate": 1.3275395802972133e-05, "loss": 0.1335, "step": 11829 }, { "epoch": 2.4723092998955067, "grad_norm": 0.9178893857557944, "learning_rate": 1.3274329890714743e-05, "loss": 0.1137, "step": 11830 }, { "epoch": 2.4725182863113897, "grad_norm": 1.2066309015245626, "learning_rate": 1.3273263936786333e-05, "loss": 0.1314, "step": 11831 }, { "epoch": 2.4727272727272727, "grad_norm": 1.0177407123066928, "learning_rate": 1.3272197941200468e-05, "loss": 0.1264, "step": 11832 }, { "epoch": 2.4729362591431556, "grad_norm": 1.0014147026885658, "learning_rate": 1.327113190397071e-05, "loss": 0.1292, "step": 11833 }, { "epoch": 2.4731452455590386, "grad_norm": 0.9345943661760295, "learning_rate": 1.3270065825110631e-05, "loss": 0.0944, "step": 11834 }, { "epoch": 2.4733542319749215, "grad_norm": 0.9973534397245499, "learning_rate": 1.3268999704633797e-05, "loss": 0.1338, "step": 11835 }, { "epoch": 2.4735632183908045, "grad_norm": 1.0501161563842434, "learning_rate": 1.3267933542553772e-05, "loss": 0.1278, "step": 11836 }, { "epoch": 2.4737722048066875, "grad_norm": 0.9255923528905476, "learning_rate": 1.3266867338884131e-05, "loss": 0.1378, "step": 11837 }, { "epoch": 2.4739811912225704, "grad_norm": 0.9215502171154473, "learning_rate": 1.3265801093638442e-05, "loss": 0.1172, "step": 11838 }, { "epoch": 2.4741901776384534, "grad_norm": 0.9892308054181419, "learning_rate": 1.3264734806830268e-05, "loss": 0.0919, "step": 11839 }, { "epoch": 2.4743991640543364, "grad_norm": 0.9485622116283224, "learning_rate": 1.3263668478473188e-05, "loss": 0.1109, "step": 11840 }, { "epoch": 2.4746081504702193, "grad_norm": 1.0108563466390257, "learning_rate": 1.3262602108580767e-05, "loss": 0.1188, "step": 11841 }, { "epoch": 2.4748171368861023, "grad_norm": 0.8492208740140187, "learning_rate": 1.3261535697166579e-05, "loss": 0.1017, "step": 11842 }, { "epoch": 2.4750261233019852, "grad_norm": 0.8277924815068043, "learning_rate": 1.3260469244244195e-05, "loss": 0.112, "step": 11843 }, { "epoch": 2.475235109717868, "grad_norm": 0.8681219234372484, "learning_rate": 1.3259402749827186e-05, "loss": 0.1203, "step": 11844 }, { "epoch": 2.475444096133751, "grad_norm": 0.780896385930053, "learning_rate": 1.3258336213929126e-05, "loss": 0.101, "step": 11845 }, { "epoch": 2.475653082549634, "grad_norm": 1.0685419830789444, "learning_rate": 1.325726963656359e-05, "loss": 0.1208, "step": 11846 }, { "epoch": 2.475862068965517, "grad_norm": 1.047534284566885, "learning_rate": 1.3256203017744148e-05, "loss": 0.1191, "step": 11847 }, { "epoch": 2.4760710553814, "grad_norm": 1.053295139011966, "learning_rate": 1.3255136357484377e-05, "loss": 0.1174, "step": 11848 }, { "epoch": 2.476280041797283, "grad_norm": 1.0129177806908818, "learning_rate": 1.3254069655797852e-05, "loss": 0.1233, "step": 11849 }, { "epoch": 2.476489028213166, "grad_norm": 0.8816116157586239, "learning_rate": 1.325300291269815e-05, "loss": 0.1429, "step": 11850 }, { "epoch": 2.476698014629049, "grad_norm": 1.020599607662107, "learning_rate": 1.3251936128198841e-05, "loss": 0.1356, "step": 11851 }, { "epoch": 2.476907001044932, "grad_norm": 0.957960406672375, "learning_rate": 1.325086930231351e-05, "loss": 0.1066, "step": 11852 }, { "epoch": 2.477115987460815, "grad_norm": 1.2364446247144891, "learning_rate": 1.3249802435055727e-05, "loss": 0.153, "step": 11853 }, { "epoch": 2.477324973876698, "grad_norm": 0.9387131068550871, "learning_rate": 1.324873552643907e-05, "loss": 0.1104, "step": 11854 }, { "epoch": 2.4775339602925808, "grad_norm": 1.0632028488817857, "learning_rate": 1.3247668576477122e-05, "loss": 0.1207, "step": 11855 }, { "epoch": 2.4777429467084637, "grad_norm": 0.9564806496609163, "learning_rate": 1.3246601585183461e-05, "loss": 0.1194, "step": 11856 }, { "epoch": 2.4779519331243467, "grad_norm": 0.9008956289728508, "learning_rate": 1.3245534552571657e-05, "loss": 0.111, "step": 11857 }, { "epoch": 2.4781609195402297, "grad_norm": 0.8390709639226512, "learning_rate": 1.3244467478655304e-05, "loss": 0.1103, "step": 11858 }, { "epoch": 2.4783699059561126, "grad_norm": 1.0226249683241042, "learning_rate": 1.3243400363447971e-05, "loss": 0.097, "step": 11859 }, { "epoch": 2.4785788923719956, "grad_norm": 0.9110451664228066, "learning_rate": 1.3242333206963245e-05, "loss": 0.1166, "step": 11860 }, { "epoch": 2.4787878787878785, "grad_norm": 1.1162785681544782, "learning_rate": 1.3241266009214702e-05, "loss": 0.1203, "step": 11861 }, { "epoch": 2.478996865203762, "grad_norm": 1.4583195604280552, "learning_rate": 1.3240198770215931e-05, "loss": 0.1064, "step": 11862 }, { "epoch": 2.479205851619645, "grad_norm": 0.9000727501755483, "learning_rate": 1.3239131489980503e-05, "loss": 0.1259, "step": 11863 }, { "epoch": 2.479414838035528, "grad_norm": 0.9259885415861545, "learning_rate": 1.3238064168522015e-05, "loss": 0.1004, "step": 11864 }, { "epoch": 2.479623824451411, "grad_norm": 0.9772310905516454, "learning_rate": 1.3236996805854039e-05, "loss": 0.1295, "step": 11865 }, { "epoch": 2.479832810867294, "grad_norm": 0.8709956476563416, "learning_rate": 1.3235929401990164e-05, "loss": 0.1088, "step": 11866 }, { "epoch": 2.4800417972831768, "grad_norm": 1.0692839104680913, "learning_rate": 1.3234861956943972e-05, "loss": 0.1378, "step": 11867 }, { "epoch": 2.4802507836990597, "grad_norm": 0.9018896081295745, "learning_rate": 1.3233794470729054e-05, "loss": 0.1063, "step": 11868 }, { "epoch": 2.4804597701149427, "grad_norm": 1.0553364969500993, "learning_rate": 1.3232726943358985e-05, "loss": 0.1162, "step": 11869 }, { "epoch": 2.4806687565308256, "grad_norm": 0.9944073075643728, "learning_rate": 1.3231659374847362e-05, "loss": 0.1136, "step": 11870 }, { "epoch": 2.4808777429467086, "grad_norm": 0.9421529712140654, "learning_rate": 1.323059176520776e-05, "loss": 0.1159, "step": 11871 }, { "epoch": 2.4810867293625916, "grad_norm": 1.0498229365895422, "learning_rate": 1.3229524114453779e-05, "loss": 0.1467, "step": 11872 }, { "epoch": 2.4812957157784745, "grad_norm": 1.021603688955117, "learning_rate": 1.3228456422598995e-05, "loss": 0.1272, "step": 11873 }, { "epoch": 2.4815047021943575, "grad_norm": 0.9580003980986896, "learning_rate": 1.3227388689657004e-05, "loss": 0.1148, "step": 11874 }, { "epoch": 2.4817136886102404, "grad_norm": 0.929374422876598, "learning_rate": 1.3226320915641387e-05, "loss": 0.1156, "step": 11875 }, { "epoch": 2.4819226750261234, "grad_norm": 0.975375659868373, "learning_rate": 1.3225253100565742e-05, "loss": 0.1268, "step": 11876 }, { "epoch": 2.4821316614420064, "grad_norm": 0.9903761450845582, "learning_rate": 1.3224185244443653e-05, "loss": 0.1282, "step": 11877 }, { "epoch": 2.4823406478578893, "grad_norm": 0.8718339803792002, "learning_rate": 1.322311734728871e-05, "loss": 0.1117, "step": 11878 }, { "epoch": 2.4825496342737723, "grad_norm": 1.144042948845397, "learning_rate": 1.3222049409114505e-05, "loss": 0.1415, "step": 11879 }, { "epoch": 2.4827586206896552, "grad_norm": 0.9358067820907066, "learning_rate": 1.3220981429934634e-05, "loss": 0.1112, "step": 11880 }, { "epoch": 2.482967607105538, "grad_norm": 0.9434417908144046, "learning_rate": 1.3219913409762677e-05, "loss": 0.1252, "step": 11881 }, { "epoch": 2.483176593521421, "grad_norm": 1.300908889371279, "learning_rate": 1.3218845348612236e-05, "loss": 0.1194, "step": 11882 }, { "epoch": 2.483385579937304, "grad_norm": 1.0839658109048638, "learning_rate": 1.3217777246496904e-05, "loss": 0.1347, "step": 11883 }, { "epoch": 2.483594566353187, "grad_norm": 0.9140209439500807, "learning_rate": 1.3216709103430268e-05, "loss": 0.0958, "step": 11884 }, { "epoch": 2.48380355276907, "grad_norm": 1.1374535864592714, "learning_rate": 1.3215640919425925e-05, "loss": 0.1383, "step": 11885 }, { "epoch": 2.484012539184953, "grad_norm": 1.04267484629998, "learning_rate": 1.3214572694497473e-05, "loss": 0.128, "step": 11886 }, { "epoch": 2.484221525600836, "grad_norm": 1.1682992073560914, "learning_rate": 1.3213504428658496e-05, "loss": 0.1588, "step": 11887 }, { "epoch": 2.484430512016719, "grad_norm": 0.9531571242452749, "learning_rate": 1.3212436121922604e-05, "loss": 0.1156, "step": 11888 }, { "epoch": 2.484639498432602, "grad_norm": 1.0238458638769845, "learning_rate": 1.3211367774303383e-05, "loss": 0.1109, "step": 11889 }, { "epoch": 2.484848484848485, "grad_norm": 1.1058832018479885, "learning_rate": 1.321029938581443e-05, "loss": 0.136, "step": 11890 }, { "epoch": 2.485057471264368, "grad_norm": 0.9274677537526836, "learning_rate": 1.3209230956469346e-05, "loss": 0.1169, "step": 11891 }, { "epoch": 2.485266457680251, "grad_norm": 1.0468680524631158, "learning_rate": 1.3208162486281728e-05, "loss": 0.1266, "step": 11892 }, { "epoch": 2.4854754440961337, "grad_norm": 1.0215439891317761, "learning_rate": 1.3207093975265165e-05, "loss": 0.1277, "step": 11893 }, { "epoch": 2.4856844305120167, "grad_norm": 0.999088639090424, "learning_rate": 1.3206025423433273e-05, "loss": 0.1302, "step": 11894 }, { "epoch": 2.4858934169278997, "grad_norm": 1.0166372132493895, "learning_rate": 1.3204956830799636e-05, "loss": 0.1359, "step": 11895 }, { "epoch": 2.4861024033437826, "grad_norm": 0.9438418340624011, "learning_rate": 1.3203888197377857e-05, "loss": 0.1158, "step": 11896 }, { "epoch": 2.4863113897596656, "grad_norm": 1.183748462152711, "learning_rate": 1.3202819523181538e-05, "loss": 0.1368, "step": 11897 }, { "epoch": 2.4865203761755486, "grad_norm": 2.1674302637773084, "learning_rate": 1.320175080822428e-05, "loss": 0.1143, "step": 11898 }, { "epoch": 2.4867293625914315, "grad_norm": 0.9470669411096599, "learning_rate": 1.3200682052519683e-05, "loss": 0.1361, "step": 11899 }, { "epoch": 2.4869383490073145, "grad_norm": 1.300843635612558, "learning_rate": 1.3199613256081347e-05, "loss": 0.1114, "step": 11900 }, { "epoch": 2.4871473354231974, "grad_norm": 0.8917077366577328, "learning_rate": 1.3198544418922878e-05, "loss": 0.1181, "step": 11901 }, { "epoch": 2.4873563218390804, "grad_norm": 0.8590066495647403, "learning_rate": 1.3197475541057872e-05, "loss": 0.1176, "step": 11902 }, { "epoch": 2.4875653082549634, "grad_norm": 1.055855236266672, "learning_rate": 1.3196406622499942e-05, "loss": 0.1262, "step": 11903 }, { "epoch": 2.4877742946708463, "grad_norm": 0.8803083873912698, "learning_rate": 1.3195337663262683e-05, "loss": 0.1041, "step": 11904 }, { "epoch": 2.4879832810867293, "grad_norm": 0.8751596133676506, "learning_rate": 1.3194268663359703e-05, "loss": 0.1136, "step": 11905 }, { "epoch": 2.4881922675026122, "grad_norm": 0.9307112984094688, "learning_rate": 1.3193199622804607e-05, "loss": 0.1122, "step": 11906 }, { "epoch": 2.488401253918495, "grad_norm": 1.1450793915239592, "learning_rate": 1.3192130541611e-05, "loss": 0.1283, "step": 11907 }, { "epoch": 2.488610240334378, "grad_norm": 1.029125475690881, "learning_rate": 1.3191061419792483e-05, "loss": 0.1226, "step": 11908 }, { "epoch": 2.488819226750261, "grad_norm": 0.8974896836032953, "learning_rate": 1.318999225736267e-05, "loss": 0.1056, "step": 11909 }, { "epoch": 2.489028213166144, "grad_norm": 0.8890848535522274, "learning_rate": 1.3188923054335166e-05, "loss": 0.1137, "step": 11910 }, { "epoch": 2.489237199582027, "grad_norm": 1.4358858982053364, "learning_rate": 1.3187853810723571e-05, "loss": 0.1249, "step": 11911 }, { "epoch": 2.48944618599791, "grad_norm": 0.9188238087077524, "learning_rate": 1.3186784526541503e-05, "loss": 0.1378, "step": 11912 }, { "epoch": 2.489655172413793, "grad_norm": 1.0425991704852728, "learning_rate": 1.3185715201802565e-05, "loss": 0.119, "step": 11913 }, { "epoch": 2.489864158829676, "grad_norm": 1.0547083662151566, "learning_rate": 1.3184645836520363e-05, "loss": 0.1422, "step": 11914 }, { "epoch": 2.490073145245559, "grad_norm": 0.9565246119698232, "learning_rate": 1.3183576430708513e-05, "loss": 0.1179, "step": 11915 }, { "epoch": 2.490282131661442, "grad_norm": 1.0278837427660927, "learning_rate": 1.3182506984380621e-05, "loss": 0.1123, "step": 11916 }, { "epoch": 2.490491118077325, "grad_norm": 0.7901045565052155, "learning_rate": 1.3181437497550297e-05, "loss": 0.1137, "step": 11917 }, { "epoch": 2.490700104493208, "grad_norm": 0.8143145979739149, "learning_rate": 1.3180367970231153e-05, "loss": 0.101, "step": 11918 }, { "epoch": 2.4909090909090907, "grad_norm": 0.8779791933800102, "learning_rate": 1.3179298402436804e-05, "loss": 0.1036, "step": 11919 }, { "epoch": 2.4911180773249737, "grad_norm": 1.1383218739097418, "learning_rate": 1.3178228794180855e-05, "loss": 0.1216, "step": 11920 }, { "epoch": 2.4913270637408567, "grad_norm": 0.9109900707194581, "learning_rate": 1.3177159145476924e-05, "loss": 0.1086, "step": 11921 }, { "epoch": 2.4915360501567396, "grad_norm": 0.9309280927626683, "learning_rate": 1.3176089456338618e-05, "loss": 0.1252, "step": 11922 }, { "epoch": 2.491745036572623, "grad_norm": 0.6165760494324247, "learning_rate": 1.3175019726779556e-05, "loss": 0.0786, "step": 11923 }, { "epoch": 2.491954022988506, "grad_norm": 1.0434191281505332, "learning_rate": 1.3173949956813352e-05, "loss": 0.1165, "step": 11924 }, { "epoch": 2.492163009404389, "grad_norm": 0.8847337971019062, "learning_rate": 1.3172880146453619e-05, "loss": 0.1031, "step": 11925 }, { "epoch": 2.492371995820272, "grad_norm": 0.8870312759215407, "learning_rate": 1.3171810295713968e-05, "loss": 0.1212, "step": 11926 }, { "epoch": 2.492580982236155, "grad_norm": 1.0742698765760985, "learning_rate": 1.3170740404608022e-05, "loss": 0.1436, "step": 11927 }, { "epoch": 2.492789968652038, "grad_norm": 1.209529312577816, "learning_rate": 1.3169670473149392e-05, "loss": 0.1398, "step": 11928 }, { "epoch": 2.492998955067921, "grad_norm": 0.8736363959601974, "learning_rate": 1.3168600501351696e-05, "loss": 0.1017, "step": 11929 }, { "epoch": 2.4932079414838038, "grad_norm": 1.0210739350203113, "learning_rate": 1.316753048922855e-05, "loss": 0.1098, "step": 11930 }, { "epoch": 2.4934169278996867, "grad_norm": 1.1296111690398285, "learning_rate": 1.3166460436793579e-05, "loss": 0.133, "step": 11931 }, { "epoch": 2.4936259143155697, "grad_norm": 0.8629563518544597, "learning_rate": 1.3165390344060387e-05, "loss": 0.1116, "step": 11932 }, { "epoch": 2.4938349007314526, "grad_norm": 0.9265619419044054, "learning_rate": 1.3164320211042604e-05, "loss": 0.1135, "step": 11933 }, { "epoch": 2.4940438871473356, "grad_norm": 1.011612665292189, "learning_rate": 1.3163250037753846e-05, "loss": 0.1408, "step": 11934 }, { "epoch": 2.4942528735632186, "grad_norm": 0.8533895256206667, "learning_rate": 1.3162179824207729e-05, "loss": 0.0949, "step": 11935 }, { "epoch": 2.4944618599791015, "grad_norm": 0.8875603952739631, "learning_rate": 1.3161109570417878e-05, "loss": 0.1093, "step": 11936 }, { "epoch": 2.4946708463949845, "grad_norm": 0.8265074699128562, "learning_rate": 1.3160039276397914e-05, "loss": 0.0977, "step": 11937 }, { "epoch": 2.4948798328108674, "grad_norm": 0.9067557473127404, "learning_rate": 1.3158968942161452e-05, "loss": 0.1082, "step": 11938 }, { "epoch": 2.4950888192267504, "grad_norm": 1.0853522622155134, "learning_rate": 1.3157898567722121e-05, "loss": 0.1086, "step": 11939 }, { "epoch": 2.4952978056426334, "grad_norm": 0.9657538025073821, "learning_rate": 1.3156828153093539e-05, "loss": 0.1192, "step": 11940 }, { "epoch": 2.4955067920585163, "grad_norm": 1.241970965380541, "learning_rate": 1.3155757698289331e-05, "loss": 0.1456, "step": 11941 }, { "epoch": 2.4957157784743993, "grad_norm": 0.9736609448783399, "learning_rate": 1.3154687203323118e-05, "loss": 0.1019, "step": 11942 }, { "epoch": 2.4959247648902823, "grad_norm": 1.1219767128246245, "learning_rate": 1.3153616668208526e-05, "loss": 0.1184, "step": 11943 }, { "epoch": 2.496133751306165, "grad_norm": 1.038985999335083, "learning_rate": 1.3152546092959175e-05, "loss": 0.1442, "step": 11944 }, { "epoch": 2.496342737722048, "grad_norm": 1.083805833840804, "learning_rate": 1.3151475477588695e-05, "loss": 0.1249, "step": 11945 }, { "epoch": 2.496551724137931, "grad_norm": 0.8479230464072046, "learning_rate": 1.315040482211071e-05, "loss": 0.111, "step": 11946 }, { "epoch": 2.496760710553814, "grad_norm": 1.1029292481438937, "learning_rate": 1.3149334126538844e-05, "loss": 0.1405, "step": 11947 }, { "epoch": 2.496969696969697, "grad_norm": 0.9305547113246319, "learning_rate": 1.3148263390886721e-05, "loss": 0.1309, "step": 11948 }, { "epoch": 2.49717868338558, "grad_norm": 0.9773841909182447, "learning_rate": 1.3147192615167976e-05, "loss": 0.117, "step": 11949 }, { "epoch": 2.497387669801463, "grad_norm": 0.7862953431257774, "learning_rate": 1.3146121799396228e-05, "loss": 0.0928, "step": 11950 }, { "epoch": 2.497596656217346, "grad_norm": 0.874262901205967, "learning_rate": 1.3145050943585109e-05, "loss": 0.0879, "step": 11951 }, { "epoch": 2.497805642633229, "grad_norm": 1.2399084672768965, "learning_rate": 1.3143980047748249e-05, "loss": 0.1082, "step": 11952 }, { "epoch": 2.498014629049112, "grad_norm": 0.9938431194072048, "learning_rate": 1.3142909111899268e-05, "loss": 0.1067, "step": 11953 }, { "epoch": 2.498223615464995, "grad_norm": 1.1209820075138996, "learning_rate": 1.3141838136051808e-05, "loss": 0.1359, "step": 11954 }, { "epoch": 2.498432601880878, "grad_norm": 1.1195714722153904, "learning_rate": 1.314076712021949e-05, "loss": 0.1232, "step": 11955 }, { "epoch": 2.4986415882967608, "grad_norm": 0.9241212384397359, "learning_rate": 1.3139696064415946e-05, "loss": 0.1186, "step": 11956 }, { "epoch": 2.4988505747126437, "grad_norm": 0.9451331233385882, "learning_rate": 1.3138624968654809e-05, "loss": 0.1116, "step": 11957 }, { "epoch": 2.4990595611285267, "grad_norm": 1.091936149241622, "learning_rate": 1.3137553832949707e-05, "loss": 0.1184, "step": 11958 }, { "epoch": 2.4992685475444096, "grad_norm": 1.0778883990373471, "learning_rate": 1.3136482657314275e-05, "loss": 0.1368, "step": 11959 }, { "epoch": 2.4994775339602926, "grad_norm": 0.9479793561884277, "learning_rate": 1.3135411441762144e-05, "loss": 0.1188, "step": 11960 }, { "epoch": 2.4996865203761756, "grad_norm": 0.9822932349645733, "learning_rate": 1.3134340186306952e-05, "loss": 0.1146, "step": 11961 }, { "epoch": 2.4998955067920585, "grad_norm": 0.9620758187693798, "learning_rate": 1.3133268890962321e-05, "loss": 0.1021, "step": 11962 }, { "epoch": 2.5001044932079415, "grad_norm": 1.0138110355783139, "learning_rate": 1.3132197555741897e-05, "loss": 0.134, "step": 11963 }, { "epoch": 2.5003134796238244, "grad_norm": 1.1983845379408102, "learning_rate": 1.3131126180659307e-05, "loss": 0.1195, "step": 11964 }, { "epoch": 2.5005224660397074, "grad_norm": 0.982593337825928, "learning_rate": 1.3130054765728187e-05, "loss": 0.1192, "step": 11965 }, { "epoch": 2.5007314524555904, "grad_norm": 1.211517304061581, "learning_rate": 1.3128983310962174e-05, "loss": 0.135, "step": 11966 }, { "epoch": 2.5009404388714733, "grad_norm": 0.8337742544114348, "learning_rate": 1.3127911816374903e-05, "loss": 0.1178, "step": 11967 }, { "epoch": 2.5011494252873563, "grad_norm": 1.0321342012250627, "learning_rate": 1.312684028198001e-05, "loss": 0.1492, "step": 11968 }, { "epoch": 2.5013584117032392, "grad_norm": 1.0710483044499999, "learning_rate": 1.3125768707791135e-05, "loss": 0.0945, "step": 11969 }, { "epoch": 2.501567398119122, "grad_norm": 1.060383737404323, "learning_rate": 1.3124697093821913e-05, "loss": 0.1366, "step": 11970 }, { "epoch": 2.501776384535005, "grad_norm": 1.0391359906360895, "learning_rate": 1.312362544008598e-05, "loss": 0.1429, "step": 11971 }, { "epoch": 2.501985370950888, "grad_norm": 0.8604617694740339, "learning_rate": 1.312255374659698e-05, "loss": 0.111, "step": 11972 }, { "epoch": 2.502194357366771, "grad_norm": 1.0145489325466894, "learning_rate": 1.3121482013368549e-05, "loss": 0.1529, "step": 11973 }, { "epoch": 2.502403343782654, "grad_norm": 0.8489662438862569, "learning_rate": 1.3120410240414323e-05, "loss": 0.1026, "step": 11974 }, { "epoch": 2.502612330198537, "grad_norm": 0.863849469900923, "learning_rate": 1.3119338427747948e-05, "loss": 0.1313, "step": 11975 }, { "epoch": 2.50282131661442, "grad_norm": 0.988505358020295, "learning_rate": 1.3118266575383064e-05, "loss": 0.1186, "step": 11976 }, { "epoch": 2.503030303030303, "grad_norm": 0.917728225243601, "learning_rate": 1.3117194683333302e-05, "loss": 0.121, "step": 11977 }, { "epoch": 2.503239289446186, "grad_norm": 0.8742401374295674, "learning_rate": 1.3116122751612319e-05, "loss": 0.1102, "step": 11978 }, { "epoch": 2.503448275862069, "grad_norm": 1.1034161489269434, "learning_rate": 1.3115050780233744e-05, "loss": 0.119, "step": 11979 }, { "epoch": 2.503657262277952, "grad_norm": 0.9867884582571309, "learning_rate": 1.311397876921123e-05, "loss": 0.1093, "step": 11980 }, { "epoch": 2.503866248693835, "grad_norm": 0.9619115927597579, "learning_rate": 1.3112906718558411e-05, "loss": 0.12, "step": 11981 }, { "epoch": 2.5040752351097177, "grad_norm": 0.8187554898415901, "learning_rate": 1.3111834628288937e-05, "loss": 0.1047, "step": 11982 }, { "epoch": 2.5042842215256007, "grad_norm": 0.8279689847748632, "learning_rate": 1.3110762498416445e-05, "loss": 0.1041, "step": 11983 }, { "epoch": 2.5044932079414837, "grad_norm": 0.9755131610731015, "learning_rate": 1.310969032895459e-05, "loss": 0.1027, "step": 11984 }, { "epoch": 2.5047021943573666, "grad_norm": 0.9575274746231134, "learning_rate": 1.3108618119917007e-05, "loss": 0.0977, "step": 11985 }, { "epoch": 2.5049111807732496, "grad_norm": 1.19717359573545, "learning_rate": 1.3107545871317347e-05, "loss": 0.1286, "step": 11986 }, { "epoch": 2.5051201671891326, "grad_norm": 0.9698596532093586, "learning_rate": 1.3106473583169255e-05, "loss": 0.1413, "step": 11987 }, { "epoch": 2.5053291536050155, "grad_norm": 1.130355900243939, "learning_rate": 1.3105401255486378e-05, "loss": 0.1361, "step": 11988 }, { "epoch": 2.5055381400208985, "grad_norm": 1.1047531301220943, "learning_rate": 1.310432888828236e-05, "loss": 0.1322, "step": 11989 }, { "epoch": 2.5057471264367814, "grad_norm": 0.8281380115516053, "learning_rate": 1.3103256481570855e-05, "loss": 0.1079, "step": 11990 }, { "epoch": 2.5059561128526644, "grad_norm": 1.1031479730243072, "learning_rate": 1.3102184035365504e-05, "loss": 0.1453, "step": 11991 }, { "epoch": 2.5061650992685474, "grad_norm": 0.9906622521954295, "learning_rate": 1.3101111549679957e-05, "loss": 0.1036, "step": 11992 }, { "epoch": 2.5063740856844303, "grad_norm": 1.0903715307194326, "learning_rate": 1.3100039024527867e-05, "loss": 0.1484, "step": 11993 }, { "epoch": 2.5065830721003133, "grad_norm": 3.3580686145823226, "learning_rate": 1.3098966459922881e-05, "loss": 0.1503, "step": 11994 }, { "epoch": 2.5067920585161962, "grad_norm": 0.9334367196640592, "learning_rate": 1.3097893855878647e-05, "loss": 0.1182, "step": 11995 }, { "epoch": 2.507001044932079, "grad_norm": 0.9785795615076027, "learning_rate": 1.3096821212408823e-05, "loss": 0.1082, "step": 11996 }, { "epoch": 2.507210031347962, "grad_norm": 1.1161972586673017, "learning_rate": 1.309574852952705e-05, "loss": 0.1261, "step": 11997 }, { "epoch": 2.507419017763845, "grad_norm": 1.1364990565144735, "learning_rate": 1.3094675807246988e-05, "loss": 0.1196, "step": 11998 }, { "epoch": 2.507628004179728, "grad_norm": 1.230190535529179, "learning_rate": 1.3093603045582285e-05, "loss": 0.1543, "step": 11999 }, { "epoch": 2.507836990595611, "grad_norm": 0.9429402192802204, "learning_rate": 1.3092530244546593e-05, "loss": 0.1306, "step": 12000 }, { "epoch": 2.508045977011494, "grad_norm": 0.926374347556555, "learning_rate": 1.3091457404153562e-05, "loss": 0.1018, "step": 12001 }, { "epoch": 2.508254963427377, "grad_norm": 0.9968600749220229, "learning_rate": 1.3090384524416857e-05, "loss": 0.1379, "step": 12002 }, { "epoch": 2.50846394984326, "grad_norm": 0.9064243902183485, "learning_rate": 1.3089311605350124e-05, "loss": 0.1136, "step": 12003 }, { "epoch": 2.508672936259143, "grad_norm": 0.9930951280815004, "learning_rate": 1.3088238646967016e-05, "loss": 0.1109, "step": 12004 }, { "epoch": 2.508881922675026, "grad_norm": 0.8395524438968422, "learning_rate": 1.3087165649281191e-05, "loss": 0.1033, "step": 12005 }, { "epoch": 2.509090909090909, "grad_norm": 0.7903432386535043, "learning_rate": 1.3086092612306307e-05, "loss": 0.0981, "step": 12006 }, { "epoch": 2.509299895506792, "grad_norm": 1.0116574468326962, "learning_rate": 1.3085019536056013e-05, "loss": 0.1452, "step": 12007 }, { "epoch": 2.509508881922675, "grad_norm": 0.9750634123598395, "learning_rate": 1.3083946420543973e-05, "loss": 0.1254, "step": 12008 }, { "epoch": 2.509717868338558, "grad_norm": 0.9553230063030396, "learning_rate": 1.308287326578384e-05, "loss": 0.1011, "step": 12009 }, { "epoch": 2.509926854754441, "grad_norm": 1.1734389527119786, "learning_rate": 1.3081800071789272e-05, "loss": 0.1535, "step": 12010 }, { "epoch": 2.510135841170324, "grad_norm": 0.8363505621340047, "learning_rate": 1.308072683857393e-05, "loss": 0.1188, "step": 12011 }, { "epoch": 2.510344827586207, "grad_norm": 1.1459073976866265, "learning_rate": 1.3079653566151471e-05, "loss": 0.1273, "step": 12012 }, { "epoch": 2.51055381400209, "grad_norm": 0.9512316783215918, "learning_rate": 1.307858025453555e-05, "loss": 0.1172, "step": 12013 }, { "epoch": 2.510762800417973, "grad_norm": 1.0201733053587034, "learning_rate": 1.3077506903739829e-05, "loss": 0.1358, "step": 12014 }, { "epoch": 2.510971786833856, "grad_norm": 1.1320740385175325, "learning_rate": 1.3076433513777972e-05, "loss": 0.1227, "step": 12015 }, { "epoch": 2.511180773249739, "grad_norm": 0.9743279227324024, "learning_rate": 1.3075360084663636e-05, "loss": 0.1139, "step": 12016 }, { "epoch": 2.511389759665622, "grad_norm": 0.9845935665358995, "learning_rate": 1.3074286616410483e-05, "loss": 0.1545, "step": 12017 }, { "epoch": 2.511598746081505, "grad_norm": 1.0998851044819054, "learning_rate": 1.3073213109032174e-05, "loss": 0.168, "step": 12018 }, { "epoch": 2.5118077324973878, "grad_norm": 1.0293811272507105, "learning_rate": 1.307213956254237e-05, "loss": 0.1236, "step": 12019 }, { "epoch": 2.5120167189132707, "grad_norm": 0.9945347111097826, "learning_rate": 1.3071065976954737e-05, "loss": 0.0922, "step": 12020 }, { "epoch": 2.5122257053291537, "grad_norm": 1.0864514317068037, "learning_rate": 1.3069992352282936e-05, "loss": 0.127, "step": 12021 }, { "epoch": 2.5124346917450366, "grad_norm": 0.8544195876196342, "learning_rate": 1.306891868854063e-05, "loss": 0.125, "step": 12022 }, { "epoch": 2.5126436781609196, "grad_norm": 1.1012119967142553, "learning_rate": 1.3067844985741483e-05, "loss": 0.1212, "step": 12023 }, { "epoch": 2.5128526645768026, "grad_norm": 0.874430897150453, "learning_rate": 1.3066771243899164e-05, "loss": 0.1177, "step": 12024 }, { "epoch": 2.5130616509926855, "grad_norm": 0.9436388166597767, "learning_rate": 1.3065697463027328e-05, "loss": 0.1354, "step": 12025 }, { "epoch": 2.5132706374085685, "grad_norm": 0.8421809936922807, "learning_rate": 1.3064623643139652e-05, "loss": 0.1009, "step": 12026 }, { "epoch": 2.5134796238244514, "grad_norm": 0.9172201124665952, "learning_rate": 1.3063549784249793e-05, "loss": 0.1081, "step": 12027 }, { "epoch": 2.5136886102403344, "grad_norm": 0.9037354461884677, "learning_rate": 1.3062475886371425e-05, "loss": 0.1211, "step": 12028 }, { "epoch": 2.5138975966562174, "grad_norm": 0.8074412242858303, "learning_rate": 1.3061401949518209e-05, "loss": 0.1089, "step": 12029 }, { "epoch": 2.5141065830721003, "grad_norm": 1.123319181447335, "learning_rate": 1.3060327973703817e-05, "loss": 0.13, "step": 12030 }, { "epoch": 2.5143155694879833, "grad_norm": 1.1652926560424555, "learning_rate": 1.305925395894191e-05, "loss": 0.1315, "step": 12031 }, { "epoch": 2.5145245559038663, "grad_norm": 1.0583165138709811, "learning_rate": 1.3058179905246168e-05, "loss": 0.1445, "step": 12032 }, { "epoch": 2.514733542319749, "grad_norm": 0.954595346522494, "learning_rate": 1.3057105812630253e-05, "loss": 0.1242, "step": 12033 }, { "epoch": 2.514942528735632, "grad_norm": 0.925606295085198, "learning_rate": 1.3056031681107832e-05, "loss": 0.126, "step": 12034 }, { "epoch": 2.515151515151515, "grad_norm": 1.1745149869924456, "learning_rate": 1.3054957510692581e-05, "loss": 0.1428, "step": 12035 }, { "epoch": 2.515360501567398, "grad_norm": 0.7991792483106974, "learning_rate": 1.3053883301398167e-05, "loss": 0.1025, "step": 12036 }, { "epoch": 2.515569487983281, "grad_norm": 1.0962107972051258, "learning_rate": 1.305280905323826e-05, "loss": 0.1373, "step": 12037 }, { "epoch": 2.515778474399164, "grad_norm": 1.0668106582738628, "learning_rate": 1.3051734766226533e-05, "loss": 0.1428, "step": 12038 }, { "epoch": 2.515987460815047, "grad_norm": 0.9667731021947572, "learning_rate": 1.3050660440376662e-05, "loss": 0.1299, "step": 12039 }, { "epoch": 2.51619644723093, "grad_norm": 1.2125099789951532, "learning_rate": 1.3049586075702312e-05, "loss": 0.1396, "step": 12040 }, { "epoch": 2.516405433646813, "grad_norm": 0.9395442781492408, "learning_rate": 1.3048511672217162e-05, "loss": 0.0918, "step": 12041 }, { "epoch": 2.516614420062696, "grad_norm": 0.951977686608486, "learning_rate": 1.3047437229934882e-05, "loss": 0.1358, "step": 12042 }, { "epoch": 2.516823406478579, "grad_norm": 0.9060779754257855, "learning_rate": 1.3046362748869147e-05, "loss": 0.1113, "step": 12043 }, { "epoch": 2.517032392894462, "grad_norm": 0.9786224718031057, "learning_rate": 1.3045288229033632e-05, "loss": 0.1204, "step": 12044 }, { "epoch": 2.5172413793103448, "grad_norm": 0.9725427988187825, "learning_rate": 1.3044213670442013e-05, "loss": 0.1238, "step": 12045 }, { "epoch": 2.5174503657262277, "grad_norm": 0.9440626870815565, "learning_rate": 1.3043139073107961e-05, "loss": 0.1098, "step": 12046 }, { "epoch": 2.5176593521421107, "grad_norm": 0.9779739494923378, "learning_rate": 1.3042064437045158e-05, "loss": 0.1287, "step": 12047 }, { "epoch": 2.5178683385579936, "grad_norm": 1.0003047030477004, "learning_rate": 1.3040989762267278e-05, "loss": 0.1094, "step": 12048 }, { "epoch": 2.5180773249738766, "grad_norm": 1.0324142665985658, "learning_rate": 1.3039915048787995e-05, "loss": 0.1312, "step": 12049 }, { "epoch": 2.5182863113897596, "grad_norm": 0.9345476398625543, "learning_rate": 1.303884029662099e-05, "loss": 0.1052, "step": 12050 }, { "epoch": 2.5184952978056425, "grad_norm": 0.8786122949773271, "learning_rate": 1.303776550577994e-05, "loss": 0.1028, "step": 12051 }, { "epoch": 2.5187042842215255, "grad_norm": 1.026251471120554, "learning_rate": 1.3036690676278521e-05, "loss": 0.1231, "step": 12052 }, { "epoch": 2.5189132706374084, "grad_norm": 0.860290304226249, "learning_rate": 1.3035615808130415e-05, "loss": 0.1092, "step": 12053 }, { "epoch": 2.5191222570532914, "grad_norm": 1.140223063747228, "learning_rate": 1.3034540901349302e-05, "loss": 0.1364, "step": 12054 }, { "epoch": 2.5193312434691744, "grad_norm": 1.0001245036109692, "learning_rate": 1.3033465955948859e-05, "loss": 0.1139, "step": 12055 }, { "epoch": 2.5195402298850573, "grad_norm": 0.9208048763589126, "learning_rate": 1.3032390971942768e-05, "loss": 0.1215, "step": 12056 }, { "epoch": 2.5197492163009403, "grad_norm": 1.3622824108388651, "learning_rate": 1.3031315949344711e-05, "loss": 0.1252, "step": 12057 }, { "epoch": 2.5199582027168232, "grad_norm": 0.8402561754895923, "learning_rate": 1.3030240888168364e-05, "loss": 0.1007, "step": 12058 }, { "epoch": 2.520167189132706, "grad_norm": 0.8174515777925234, "learning_rate": 1.3029165788427415e-05, "loss": 0.1077, "step": 12059 }, { "epoch": 2.5203761755485896, "grad_norm": 1.1591311990816322, "learning_rate": 1.3028090650135545e-05, "loss": 0.1293, "step": 12060 }, { "epoch": 2.5205851619644726, "grad_norm": 1.0281696180436373, "learning_rate": 1.3027015473306438e-05, "loss": 0.1217, "step": 12061 }, { "epoch": 2.5207941483803555, "grad_norm": 1.0918082850208553, "learning_rate": 1.302594025795377e-05, "loss": 0.1302, "step": 12062 }, { "epoch": 2.5210031347962385, "grad_norm": 0.9635678257717443, "learning_rate": 1.3024865004091236e-05, "loss": 0.1296, "step": 12063 }, { "epoch": 2.5212121212121215, "grad_norm": 0.9392246017461954, "learning_rate": 1.302378971173251e-05, "loss": 0.1125, "step": 12064 }, { "epoch": 2.5214211076280044, "grad_norm": 0.9740234400003723, "learning_rate": 1.3022714380891284e-05, "loss": 0.0994, "step": 12065 }, { "epoch": 2.5216300940438874, "grad_norm": 0.8939045510233616, "learning_rate": 1.3021639011581237e-05, "loss": 0.0982, "step": 12066 }, { "epoch": 2.5218390804597703, "grad_norm": 0.8591255724022487, "learning_rate": 1.3020563603816058e-05, "loss": 0.1258, "step": 12067 }, { "epoch": 2.5220480668756533, "grad_norm": 0.9874751636195805, "learning_rate": 1.3019488157609437e-05, "loss": 0.1327, "step": 12068 }, { "epoch": 2.5222570532915363, "grad_norm": 0.7146815396065845, "learning_rate": 1.3018412672975058e-05, "loss": 0.1064, "step": 12069 }, { "epoch": 2.5224660397074192, "grad_norm": 0.9989191619601493, "learning_rate": 1.3017337149926603e-05, "loss": 0.1324, "step": 12070 }, { "epoch": 2.522675026123302, "grad_norm": 1.0332966754667747, "learning_rate": 1.3016261588477769e-05, "loss": 0.1157, "step": 12071 }, { "epoch": 2.522884012539185, "grad_norm": 0.9063307362267906, "learning_rate": 1.3015185988642239e-05, "loss": 0.1028, "step": 12072 }, { "epoch": 2.523092998955068, "grad_norm": 0.9370500786867675, "learning_rate": 1.30141103504337e-05, "loss": 0.1157, "step": 12073 }, { "epoch": 2.523301985370951, "grad_norm": 1.1286910451557557, "learning_rate": 1.3013034673865843e-05, "loss": 0.1257, "step": 12074 }, { "epoch": 2.523510971786834, "grad_norm": 1.0098411147946718, "learning_rate": 1.301195895895236e-05, "loss": 0.1339, "step": 12075 }, { "epoch": 2.523719958202717, "grad_norm": 0.9926704210114072, "learning_rate": 1.3010883205706936e-05, "loss": 0.1175, "step": 12076 }, { "epoch": 2.5239289446186, "grad_norm": 0.8904429394916945, "learning_rate": 1.300980741414327e-05, "loss": 0.1266, "step": 12077 }, { "epoch": 2.524137931034483, "grad_norm": 1.3169103254270556, "learning_rate": 1.3008731584275044e-05, "loss": 0.1373, "step": 12078 }, { "epoch": 2.524346917450366, "grad_norm": 1.0275610715293557, "learning_rate": 1.3007655716115954e-05, "loss": 0.1196, "step": 12079 }, { "epoch": 2.524555903866249, "grad_norm": 1.0344796948075734, "learning_rate": 1.3006579809679693e-05, "loss": 0.1039, "step": 12080 }, { "epoch": 2.524764890282132, "grad_norm": 0.8486027054857184, "learning_rate": 1.3005503864979953e-05, "loss": 0.1188, "step": 12081 }, { "epoch": 2.5249738766980148, "grad_norm": 0.9512214137606237, "learning_rate": 1.3004427882030423e-05, "loss": 0.1146, "step": 12082 }, { "epoch": 2.5251828631138977, "grad_norm": 1.0016779139280076, "learning_rate": 1.3003351860844806e-05, "loss": 0.1169, "step": 12083 }, { "epoch": 2.5253918495297807, "grad_norm": 1.1455772380090896, "learning_rate": 1.3002275801436788e-05, "loss": 0.1461, "step": 12084 }, { "epoch": 2.5256008359456636, "grad_norm": 1.0432883691247103, "learning_rate": 1.3001199703820065e-05, "loss": 0.1271, "step": 12085 }, { "epoch": 2.5258098223615466, "grad_norm": 0.68971592414422, "learning_rate": 1.3000123568008332e-05, "loss": 0.0876, "step": 12086 }, { "epoch": 2.5260188087774296, "grad_norm": 1.1804717868527388, "learning_rate": 1.2999047394015287e-05, "loss": 0.1338, "step": 12087 }, { "epoch": 2.5262277951933125, "grad_norm": 1.242194111464088, "learning_rate": 1.299797118185462e-05, "loss": 0.1273, "step": 12088 }, { "epoch": 2.5264367816091955, "grad_norm": 0.9860374502610241, "learning_rate": 1.2996894931540036e-05, "loss": 0.1086, "step": 12089 }, { "epoch": 2.5266457680250785, "grad_norm": 0.7901759423455282, "learning_rate": 1.2995818643085228e-05, "loss": 0.102, "step": 12090 }, { "epoch": 2.5268547544409614, "grad_norm": 1.1296896870858582, "learning_rate": 1.2994742316503892e-05, "loss": 0.1106, "step": 12091 }, { "epoch": 2.5270637408568444, "grad_norm": 1.0931977510979662, "learning_rate": 1.2993665951809727e-05, "loss": 0.1246, "step": 12092 }, { "epoch": 2.5272727272727273, "grad_norm": 1.0741208167321572, "learning_rate": 1.2992589549016432e-05, "loss": 0.1406, "step": 12093 }, { "epoch": 2.5274817136886103, "grad_norm": 1.028252517592801, "learning_rate": 1.2991513108137703e-05, "loss": 0.1322, "step": 12094 }, { "epoch": 2.5276907001044933, "grad_norm": 1.1808208550918622, "learning_rate": 1.2990436629187246e-05, "loss": 0.1498, "step": 12095 }, { "epoch": 2.527899686520376, "grad_norm": 0.9742715867132679, "learning_rate": 1.2989360112178755e-05, "loss": 0.1105, "step": 12096 }, { "epoch": 2.528108672936259, "grad_norm": 1.0843955633201823, "learning_rate": 1.2988283557125934e-05, "loss": 0.14, "step": 12097 }, { "epoch": 2.528317659352142, "grad_norm": 0.8801649922038275, "learning_rate": 1.298720696404248e-05, "loss": 0.12, "step": 12098 }, { "epoch": 2.528526645768025, "grad_norm": 1.128291399789167, "learning_rate": 1.29861303329421e-05, "loss": 0.1492, "step": 12099 }, { "epoch": 2.528735632183908, "grad_norm": 0.9022796519026564, "learning_rate": 1.2985053663838488e-05, "loss": 0.0959, "step": 12100 }, { "epoch": 2.528944618599791, "grad_norm": 0.9166701569922606, "learning_rate": 1.2983976956745353e-05, "loss": 0.1085, "step": 12101 }, { "epoch": 2.529153605015674, "grad_norm": 0.8516202277608604, "learning_rate": 1.2982900211676395e-05, "loss": 0.115, "step": 12102 }, { "epoch": 2.529362591431557, "grad_norm": 1.0119543445933703, "learning_rate": 1.2981823428645317e-05, "loss": 0.1198, "step": 12103 }, { "epoch": 2.52957157784744, "grad_norm": 1.1525567798447323, "learning_rate": 1.2980746607665822e-05, "loss": 0.1433, "step": 12104 }, { "epoch": 2.529780564263323, "grad_norm": 1.0386637871604794, "learning_rate": 1.297966974875162e-05, "loss": 0.1342, "step": 12105 }, { "epoch": 2.529989550679206, "grad_norm": 0.9705085491008366, "learning_rate": 1.2978592851916407e-05, "loss": 0.115, "step": 12106 }, { "epoch": 2.530198537095089, "grad_norm": 1.0210760293565384, "learning_rate": 1.2977515917173897e-05, "loss": 0.1558, "step": 12107 }, { "epoch": 2.5304075235109718, "grad_norm": 1.08779490065429, "learning_rate": 1.297643894453779e-05, "loss": 0.1206, "step": 12108 }, { "epoch": 2.5306165099268547, "grad_norm": 0.8686959729975491, "learning_rate": 1.2975361934021792e-05, "loss": 0.1103, "step": 12109 }, { "epoch": 2.5308254963427377, "grad_norm": 0.8723698931894838, "learning_rate": 1.2974284885639611e-05, "loss": 0.1134, "step": 12110 }, { "epoch": 2.5310344827586206, "grad_norm": 0.8506197687796601, "learning_rate": 1.297320779940496e-05, "loss": 0.1137, "step": 12111 }, { "epoch": 2.5312434691745036, "grad_norm": 0.8151665289197364, "learning_rate": 1.2972130675331533e-05, "loss": 0.1035, "step": 12112 }, { "epoch": 2.5314524555903866, "grad_norm": 1.0292796466469747, "learning_rate": 1.297105351343305e-05, "loss": 0.1207, "step": 12113 }, { "epoch": 2.5316614420062695, "grad_norm": 0.8843099110285942, "learning_rate": 1.2969976313723213e-05, "loss": 0.1171, "step": 12114 }, { "epoch": 2.5318704284221525, "grad_norm": 1.0252657724778629, "learning_rate": 1.2968899076215738e-05, "loss": 0.1302, "step": 12115 }, { "epoch": 2.5320794148380354, "grad_norm": 0.9693375469906941, "learning_rate": 1.2967821800924328e-05, "loss": 0.1191, "step": 12116 }, { "epoch": 2.5322884012539184, "grad_norm": 0.9603469692786853, "learning_rate": 1.2966744487862696e-05, "loss": 0.1275, "step": 12117 }, { "epoch": 2.5324973876698014, "grad_norm": 0.9464093615791086, "learning_rate": 1.2965667137044554e-05, "loss": 0.1112, "step": 12118 }, { "epoch": 2.5327063740856843, "grad_norm": 0.9561129548380363, "learning_rate": 1.2964589748483608e-05, "loss": 0.1243, "step": 12119 }, { "epoch": 2.5329153605015673, "grad_norm": 1.1050407806354616, "learning_rate": 1.2963512322193569e-05, "loss": 0.1357, "step": 12120 }, { "epoch": 2.5331243469174503, "grad_norm": 3.184547529476181, "learning_rate": 1.2962434858188157e-05, "loss": 0.1051, "step": 12121 }, { "epoch": 2.533333333333333, "grad_norm": 1.0271997962982817, "learning_rate": 1.2961357356481082e-05, "loss": 0.1431, "step": 12122 }, { "epoch": 2.533542319749216, "grad_norm": 0.9928090060298791, "learning_rate": 1.296027981708605e-05, "loss": 0.1078, "step": 12123 }, { "epoch": 2.533751306165099, "grad_norm": 0.9735771839383788, "learning_rate": 1.2959202240016779e-05, "loss": 0.1164, "step": 12124 }, { "epoch": 2.533960292580982, "grad_norm": 0.9706393827822093, "learning_rate": 1.2958124625286984e-05, "loss": 0.1235, "step": 12125 }, { "epoch": 2.534169278996865, "grad_norm": 1.1560734583720498, "learning_rate": 1.295704697291038e-05, "loss": 0.1198, "step": 12126 }, { "epoch": 2.534378265412748, "grad_norm": 1.647825527400424, "learning_rate": 1.2955969282900676e-05, "loss": 0.15, "step": 12127 }, { "epoch": 2.534587251828631, "grad_norm": 0.9982776145948568, "learning_rate": 1.2954891555271595e-05, "loss": 0.1148, "step": 12128 }, { "epoch": 2.534796238244514, "grad_norm": 0.9521692894986415, "learning_rate": 1.2953813790036849e-05, "loss": 0.12, "step": 12129 }, { "epoch": 2.535005224660397, "grad_norm": 0.9099302309696676, "learning_rate": 1.2952735987210152e-05, "loss": 0.1194, "step": 12130 }, { "epoch": 2.53521421107628, "grad_norm": 0.8326440364330749, "learning_rate": 1.2951658146805223e-05, "loss": 0.1132, "step": 12131 }, { "epoch": 2.535423197492163, "grad_norm": 1.0657313240647481, "learning_rate": 1.2950580268835784e-05, "loss": 0.1294, "step": 12132 }, { "epoch": 2.535632183908046, "grad_norm": 0.9257264218846228, "learning_rate": 1.2949502353315542e-05, "loss": 0.0987, "step": 12133 }, { "epoch": 2.5358411703239288, "grad_norm": 1.0078073018037585, "learning_rate": 1.2948424400258226e-05, "loss": 0.1031, "step": 12134 }, { "epoch": 2.5360501567398117, "grad_norm": 1.1776958739048586, "learning_rate": 1.2947346409677547e-05, "loss": 0.148, "step": 12135 }, { "epoch": 2.5362591431556947, "grad_norm": 1.0602538640605634, "learning_rate": 1.2946268381587229e-05, "loss": 0.1082, "step": 12136 }, { "epoch": 2.5364681295715776, "grad_norm": 0.8247819276880308, "learning_rate": 1.2945190316000988e-05, "loss": 0.1191, "step": 12137 }, { "epoch": 2.5366771159874606, "grad_norm": 0.8560869677125436, "learning_rate": 1.2944112212932546e-05, "loss": 0.1063, "step": 12138 }, { "epoch": 2.5368861024033436, "grad_norm": 0.9188384090952298, "learning_rate": 1.2943034072395624e-05, "loss": 0.1018, "step": 12139 }, { "epoch": 2.5370950888192265, "grad_norm": 1.1340880016018764, "learning_rate": 1.2941955894403943e-05, "loss": 0.1622, "step": 12140 }, { "epoch": 2.5373040752351095, "grad_norm": 1.3198842346260151, "learning_rate": 1.2940877678971221e-05, "loss": 0.1364, "step": 12141 }, { "epoch": 2.5375130616509924, "grad_norm": 0.9636213989654207, "learning_rate": 1.2939799426111188e-05, "loss": 0.1097, "step": 12142 }, { "epoch": 2.5377220480668754, "grad_norm": 1.1815987966506822, "learning_rate": 1.2938721135837557e-05, "loss": 0.1527, "step": 12143 }, { "epoch": 2.5379310344827584, "grad_norm": 0.9259875059315368, "learning_rate": 1.2937642808164057e-05, "loss": 0.1102, "step": 12144 }, { "epoch": 2.5381400208986413, "grad_norm": 1.0832170912952304, "learning_rate": 1.293656444310441e-05, "loss": 0.1375, "step": 12145 }, { "epoch": 2.5383490073145243, "grad_norm": 0.9955945007092715, "learning_rate": 1.2935486040672339e-05, "loss": 0.1392, "step": 12146 }, { "epoch": 2.5385579937304072, "grad_norm": 0.9889106375813872, "learning_rate": 1.293440760088157e-05, "loss": 0.1152, "step": 12147 }, { "epoch": 2.5387669801462907, "grad_norm": 1.0401463412764733, "learning_rate": 1.2933329123745823e-05, "loss": 0.117, "step": 12148 }, { "epoch": 2.5389759665621736, "grad_norm": 0.8323794372182816, "learning_rate": 1.2932250609278829e-05, "loss": 0.1131, "step": 12149 }, { "epoch": 2.5391849529780566, "grad_norm": 0.8633026956546556, "learning_rate": 1.2931172057494314e-05, "loss": 0.0884, "step": 12150 }, { "epoch": 2.5393939393939395, "grad_norm": 0.9898925993655288, "learning_rate": 1.2930093468406e-05, "loss": 0.1331, "step": 12151 }, { "epoch": 2.5396029258098225, "grad_norm": 0.8927237287177856, "learning_rate": 1.2929014842027619e-05, "loss": 0.111, "step": 12152 }, { "epoch": 2.5398119122257055, "grad_norm": 1.0299119506015115, "learning_rate": 1.2927936178372893e-05, "loss": 0.1411, "step": 12153 }, { "epoch": 2.5400208986415884, "grad_norm": 0.9104573703380254, "learning_rate": 1.2926857477455553e-05, "loss": 0.113, "step": 12154 }, { "epoch": 2.5402298850574714, "grad_norm": 1.0106205457321364, "learning_rate": 1.2925778739289325e-05, "loss": 0.1029, "step": 12155 }, { "epoch": 2.5404388714733543, "grad_norm": 0.9062800687252001, "learning_rate": 1.292469996388794e-05, "loss": 0.1232, "step": 12156 }, { "epoch": 2.5406478578892373, "grad_norm": 0.9116529570271538, "learning_rate": 1.2923621151265123e-05, "loss": 0.1006, "step": 12157 }, { "epoch": 2.5408568443051203, "grad_norm": 0.8404088087477012, "learning_rate": 1.2922542301434608e-05, "loss": 0.1031, "step": 12158 }, { "epoch": 2.5410658307210032, "grad_norm": 0.9315243027690664, "learning_rate": 1.2921463414410124e-05, "loss": 0.1161, "step": 12159 }, { "epoch": 2.541274817136886, "grad_norm": 1.091174058111058, "learning_rate": 1.2920384490205402e-05, "loss": 0.1699, "step": 12160 }, { "epoch": 2.541483803552769, "grad_norm": 1.1505098165828553, "learning_rate": 1.2919305528834169e-05, "loss": 0.1427, "step": 12161 }, { "epoch": 2.541692789968652, "grad_norm": 0.9993112140245994, "learning_rate": 1.2918226530310165e-05, "loss": 0.1237, "step": 12162 }, { "epoch": 2.541901776384535, "grad_norm": 0.9404155356670713, "learning_rate": 1.2917147494647111e-05, "loss": 0.1257, "step": 12163 }, { "epoch": 2.542110762800418, "grad_norm": 0.8984208037352297, "learning_rate": 1.2916068421858748e-05, "loss": 0.1119, "step": 12164 }, { "epoch": 2.542319749216301, "grad_norm": 1.197158451193446, "learning_rate": 1.2914989311958803e-05, "loss": 0.1287, "step": 12165 }, { "epoch": 2.542528735632184, "grad_norm": 1.165548267404781, "learning_rate": 1.2913910164961014e-05, "loss": 0.1319, "step": 12166 }, { "epoch": 2.542737722048067, "grad_norm": 0.7941824511164199, "learning_rate": 1.2912830980879115e-05, "loss": 0.1109, "step": 12167 }, { "epoch": 2.54294670846395, "grad_norm": 0.8617937337321194, "learning_rate": 1.2911751759726839e-05, "loss": 0.109, "step": 12168 }, { "epoch": 2.543155694879833, "grad_norm": 0.9448966708933646, "learning_rate": 1.2910672501517913e-05, "loss": 0.1376, "step": 12169 }, { "epoch": 2.543364681295716, "grad_norm": 1.0716627129352092, "learning_rate": 1.2909593206266089e-05, "loss": 0.1234, "step": 12170 }, { "epoch": 2.5435736677115988, "grad_norm": 0.9688371908883848, "learning_rate": 1.2908513873985087e-05, "loss": 0.1253, "step": 12171 }, { "epoch": 2.5437826541274817, "grad_norm": 1.0618900437684875, "learning_rate": 1.290743450468865e-05, "loss": 0.1244, "step": 12172 }, { "epoch": 2.5439916405433647, "grad_norm": 0.9317545771686366, "learning_rate": 1.2906355098390515e-05, "loss": 0.1103, "step": 12173 }, { "epoch": 2.5442006269592476, "grad_norm": 0.8607718206394692, "learning_rate": 1.290527565510442e-05, "loss": 0.0936, "step": 12174 }, { "epoch": 2.5444096133751306, "grad_norm": 0.8446475378930807, "learning_rate": 1.2904196174844099e-05, "loss": 0.0908, "step": 12175 }, { "epoch": 2.5446185997910136, "grad_norm": 0.901985090965099, "learning_rate": 1.2903116657623294e-05, "loss": 0.1092, "step": 12176 }, { "epoch": 2.5448275862068965, "grad_norm": 0.8272330616276286, "learning_rate": 1.2902037103455738e-05, "loss": 0.0899, "step": 12177 }, { "epoch": 2.5450365726227795, "grad_norm": 0.9819506433998572, "learning_rate": 1.2900957512355174e-05, "loss": 0.1214, "step": 12178 }, { "epoch": 2.5452455590386625, "grad_norm": 1.0526955544019438, "learning_rate": 1.2899877884335343e-05, "loss": 0.1097, "step": 12179 }, { "epoch": 2.5454545454545454, "grad_norm": 1.179703220449879, "learning_rate": 1.2898798219409986e-05, "loss": 0.1328, "step": 12180 }, { "epoch": 2.5456635318704284, "grad_norm": 1.349780701956018, "learning_rate": 1.2897718517592836e-05, "loss": 0.1637, "step": 12181 }, { "epoch": 2.5458725182863113, "grad_norm": 1.1288750073677016, "learning_rate": 1.2896638778897639e-05, "loss": 0.1234, "step": 12182 }, { "epoch": 2.5460815047021943, "grad_norm": 1.1244720999584614, "learning_rate": 1.2895559003338137e-05, "loss": 0.1433, "step": 12183 }, { "epoch": 2.5462904911180773, "grad_norm": 1.2394085102269412, "learning_rate": 1.289447919092807e-05, "loss": 0.1393, "step": 12184 }, { "epoch": 2.54649947753396, "grad_norm": 1.0454193323682686, "learning_rate": 1.2893399341681178e-05, "loss": 0.1252, "step": 12185 }, { "epoch": 2.546708463949843, "grad_norm": 0.8597770433377915, "learning_rate": 1.2892319455611213e-05, "loss": 0.1104, "step": 12186 }, { "epoch": 2.546917450365726, "grad_norm": 0.9454206652574871, "learning_rate": 1.2891239532731907e-05, "loss": 0.1083, "step": 12187 }, { "epoch": 2.547126436781609, "grad_norm": 0.9552271480559255, "learning_rate": 1.289015957305701e-05, "loss": 0.1143, "step": 12188 }, { "epoch": 2.547335423197492, "grad_norm": 0.7887737049316583, "learning_rate": 1.2889079576600265e-05, "loss": 0.1197, "step": 12189 }, { "epoch": 2.547544409613375, "grad_norm": 0.8886227425989198, "learning_rate": 1.2887999543375418e-05, "loss": 0.097, "step": 12190 }, { "epoch": 2.547753396029258, "grad_norm": 0.9201747458029348, "learning_rate": 1.2886919473396212e-05, "loss": 0.1033, "step": 12191 }, { "epoch": 2.547962382445141, "grad_norm": 0.8972320385507182, "learning_rate": 1.2885839366676393e-05, "loss": 0.1307, "step": 12192 }, { "epoch": 2.548171368861024, "grad_norm": 0.9733457339432604, "learning_rate": 1.2884759223229706e-05, "loss": 0.1211, "step": 12193 }, { "epoch": 2.548380355276907, "grad_norm": 0.9798849543795235, "learning_rate": 1.2883679043069902e-05, "loss": 0.1231, "step": 12194 }, { "epoch": 2.54858934169279, "grad_norm": 0.9952626846558325, "learning_rate": 1.2882598826210723e-05, "loss": 0.1243, "step": 12195 }, { "epoch": 2.548798328108673, "grad_norm": 0.966160464648866, "learning_rate": 1.2881518572665917e-05, "loss": 0.1223, "step": 12196 }, { "epoch": 2.5490073145245558, "grad_norm": 0.8931758499046503, "learning_rate": 1.2880438282449237e-05, "loss": 0.123, "step": 12197 }, { "epoch": 2.5492163009404387, "grad_norm": 0.9593143156847046, "learning_rate": 1.2879357955574424e-05, "loss": 0.1107, "step": 12198 }, { "epoch": 2.5494252873563217, "grad_norm": 1.0618506839396482, "learning_rate": 1.287827759205523e-05, "loss": 0.1202, "step": 12199 }, { "epoch": 2.549634273772205, "grad_norm": 1.0408795125132704, "learning_rate": 1.2877197191905409e-05, "loss": 0.1138, "step": 12200 }, { "epoch": 2.549843260188088, "grad_norm": 0.968711742397502, "learning_rate": 1.2876116755138706e-05, "loss": 0.1177, "step": 12201 }, { "epoch": 2.550052246603971, "grad_norm": 1.1253591271971808, "learning_rate": 1.2875036281768867e-05, "loss": 0.1227, "step": 12202 }, { "epoch": 2.550261233019854, "grad_norm": 1.0761236712004918, "learning_rate": 1.2873955771809655e-05, "loss": 0.1358, "step": 12203 }, { "epoch": 2.550470219435737, "grad_norm": 1.0050915567622287, "learning_rate": 1.287287522527481e-05, "loss": 0.1245, "step": 12204 }, { "epoch": 2.55067920585162, "grad_norm": 1.1157280917144534, "learning_rate": 1.2871794642178086e-05, "loss": 0.1408, "step": 12205 }, { "epoch": 2.550888192267503, "grad_norm": 1.0069803579248588, "learning_rate": 1.2870714022533237e-05, "loss": 0.0998, "step": 12206 }, { "epoch": 2.551097178683386, "grad_norm": 1.0579847086817997, "learning_rate": 1.286963336635402e-05, "loss": 0.1338, "step": 12207 }, { "epoch": 2.5513061650992688, "grad_norm": 1.2522435329067023, "learning_rate": 1.2868552673654178e-05, "loss": 0.145, "step": 12208 }, { "epoch": 2.5515151515151517, "grad_norm": 1.1291095049303812, "learning_rate": 1.2867471944447473e-05, "loss": 0.1222, "step": 12209 }, { "epoch": 2.5517241379310347, "grad_norm": 1.0249020710764674, "learning_rate": 1.2866391178747655e-05, "loss": 0.1264, "step": 12210 }, { "epoch": 2.5519331243469177, "grad_norm": 0.9216922754595047, "learning_rate": 1.286531037656848e-05, "loss": 0.1393, "step": 12211 }, { "epoch": 2.5521421107628006, "grad_norm": 1.2607569115666986, "learning_rate": 1.28642295379237e-05, "loss": 0.1451, "step": 12212 }, { "epoch": 2.5523510971786836, "grad_norm": 0.9462802607374572, "learning_rate": 1.2863148662827075e-05, "loss": 0.1161, "step": 12213 }, { "epoch": 2.5525600835945665, "grad_norm": 1.0582739034880906, "learning_rate": 1.2862067751292356e-05, "loss": 0.1366, "step": 12214 }, { "epoch": 2.5527690700104495, "grad_norm": 0.9833949615593863, "learning_rate": 1.2860986803333304e-05, "loss": 0.1304, "step": 12215 }, { "epoch": 2.5529780564263325, "grad_norm": 1.01228493738178, "learning_rate": 1.2859905818963672e-05, "loss": 0.1091, "step": 12216 }, { "epoch": 2.5531870428422154, "grad_norm": 1.4636701848480935, "learning_rate": 1.2858824798197219e-05, "loss": 0.1309, "step": 12217 }, { "epoch": 2.5533960292580984, "grad_norm": 0.9775604308315466, "learning_rate": 1.2857743741047704e-05, "loss": 0.1229, "step": 12218 }, { "epoch": 2.5536050156739813, "grad_norm": 0.9939667630698212, "learning_rate": 1.2856662647528881e-05, "loss": 0.1306, "step": 12219 }, { "epoch": 2.5538140020898643, "grad_norm": 0.8561971030929698, "learning_rate": 1.2855581517654512e-05, "loss": 0.0896, "step": 12220 }, { "epoch": 2.5540229885057473, "grad_norm": 1.1020941333838372, "learning_rate": 1.2854500351438358e-05, "loss": 0.1418, "step": 12221 }, { "epoch": 2.5542319749216302, "grad_norm": 1.041837537542818, "learning_rate": 1.2853419148894172e-05, "loss": 0.1417, "step": 12222 }, { "epoch": 2.554440961337513, "grad_norm": 0.9313085247690228, "learning_rate": 1.2852337910035719e-05, "loss": 0.1217, "step": 12223 }, { "epoch": 2.554649947753396, "grad_norm": 0.9487534043764045, "learning_rate": 1.2851256634876759e-05, "loss": 0.1158, "step": 12224 }, { "epoch": 2.554858934169279, "grad_norm": 0.9820111351003061, "learning_rate": 1.2850175323431055e-05, "loss": 0.1282, "step": 12225 }, { "epoch": 2.555067920585162, "grad_norm": 0.8553912821715187, "learning_rate": 1.284909397571236e-05, "loss": 0.1112, "step": 12226 }, { "epoch": 2.555276907001045, "grad_norm": 0.8533229740077269, "learning_rate": 1.2848012591734446e-05, "loss": 0.1162, "step": 12227 }, { "epoch": 2.555485893416928, "grad_norm": 0.9640805011948489, "learning_rate": 1.284693117151107e-05, "loss": 0.1282, "step": 12228 }, { "epoch": 2.555694879832811, "grad_norm": 0.8694173788414856, "learning_rate": 1.2845849715055994e-05, "loss": 0.0845, "step": 12229 }, { "epoch": 2.555903866248694, "grad_norm": 1.051138598445105, "learning_rate": 1.2844768222382985e-05, "loss": 0.1553, "step": 12230 }, { "epoch": 2.556112852664577, "grad_norm": 1.0237910584190009, "learning_rate": 1.2843686693505805e-05, "loss": 0.1264, "step": 12231 }, { "epoch": 2.55632183908046, "grad_norm": 0.8910118114343257, "learning_rate": 1.2842605128438215e-05, "loss": 0.1109, "step": 12232 }, { "epoch": 2.556530825496343, "grad_norm": 0.9388633360449814, "learning_rate": 1.2841523527193984e-05, "loss": 0.1118, "step": 12233 }, { "epoch": 2.5567398119122258, "grad_norm": 1.0797178375109748, "learning_rate": 1.2840441889786875e-05, "loss": 0.1444, "step": 12234 }, { "epoch": 2.5569487983281087, "grad_norm": 1.0496814225153177, "learning_rate": 1.2839360216230654e-05, "loss": 0.1333, "step": 12235 }, { "epoch": 2.5571577847439917, "grad_norm": 1.2170924264044525, "learning_rate": 1.2838278506539087e-05, "loss": 0.1627, "step": 12236 }, { "epoch": 2.5573667711598747, "grad_norm": 0.8598794158609522, "learning_rate": 1.2837196760725943e-05, "loss": 0.0965, "step": 12237 }, { "epoch": 2.5575757575757576, "grad_norm": 1.0653472128077401, "learning_rate": 1.2836114978804981e-05, "loss": 0.1379, "step": 12238 }, { "epoch": 2.5577847439916406, "grad_norm": 0.7887172088383604, "learning_rate": 1.2835033160789978e-05, "loss": 0.0978, "step": 12239 }, { "epoch": 2.5579937304075235, "grad_norm": 1.0191887979080811, "learning_rate": 1.2833951306694696e-05, "loss": 0.1258, "step": 12240 }, { "epoch": 2.5582027168234065, "grad_norm": 1.1032029574539413, "learning_rate": 1.2832869416532904e-05, "loss": 0.1347, "step": 12241 }, { "epoch": 2.5584117032392895, "grad_norm": 1.058399657182938, "learning_rate": 1.2831787490318375e-05, "loss": 0.1458, "step": 12242 }, { "epoch": 2.5586206896551724, "grad_norm": 0.9425285160733016, "learning_rate": 1.2830705528064875e-05, "loss": 0.1226, "step": 12243 }, { "epoch": 2.5588296760710554, "grad_norm": 1.0043735034029448, "learning_rate": 1.2829623529786169e-05, "loss": 0.15, "step": 12244 }, { "epoch": 2.5590386624869383, "grad_norm": 1.1566516079505507, "learning_rate": 1.2828541495496036e-05, "loss": 0.1187, "step": 12245 }, { "epoch": 2.5592476489028213, "grad_norm": 0.9696562117283497, "learning_rate": 1.2827459425208239e-05, "loss": 0.121, "step": 12246 }, { "epoch": 2.5594566353187043, "grad_norm": 0.9769165133663018, "learning_rate": 1.2826377318936557e-05, "loss": 0.1304, "step": 12247 }, { "epoch": 2.5596656217345872, "grad_norm": 0.8417600427433426, "learning_rate": 1.2825295176694753e-05, "loss": 0.1031, "step": 12248 }, { "epoch": 2.55987460815047, "grad_norm": 1.027765921274065, "learning_rate": 1.2824212998496605e-05, "loss": 0.1164, "step": 12249 }, { "epoch": 2.560083594566353, "grad_norm": 0.8437020087887508, "learning_rate": 1.2823130784355882e-05, "loss": 0.1107, "step": 12250 }, { "epoch": 2.560292580982236, "grad_norm": 1.0139460574484844, "learning_rate": 1.2822048534286358e-05, "loss": 0.1437, "step": 12251 }, { "epoch": 2.560501567398119, "grad_norm": 0.898381432797365, "learning_rate": 1.2820966248301806e-05, "loss": 0.1283, "step": 12252 }, { "epoch": 2.560710553814002, "grad_norm": 1.0792236400437716, "learning_rate": 1.2819883926416002e-05, "loss": 0.1355, "step": 12253 }, { "epoch": 2.560919540229885, "grad_norm": 0.8032982254915504, "learning_rate": 1.2818801568642716e-05, "loss": 0.1215, "step": 12254 }, { "epoch": 2.561128526645768, "grad_norm": 0.9902861825736406, "learning_rate": 1.281771917499573e-05, "loss": 0.1189, "step": 12255 }, { "epoch": 2.561337513061651, "grad_norm": 1.1398798906992076, "learning_rate": 1.281663674548881e-05, "loss": 0.1343, "step": 12256 }, { "epoch": 2.561546499477534, "grad_norm": 1.243630899777085, "learning_rate": 1.2815554280135738e-05, "loss": 0.1209, "step": 12257 }, { "epoch": 2.561755485893417, "grad_norm": 1.036476226037971, "learning_rate": 1.2814471778950289e-05, "loss": 0.1136, "step": 12258 }, { "epoch": 2.5619644723093, "grad_norm": 1.1733171152329305, "learning_rate": 1.2813389241946235e-05, "loss": 0.1355, "step": 12259 }, { "epoch": 2.5621734587251828, "grad_norm": 1.137397440906102, "learning_rate": 1.281230666913736e-05, "loss": 0.1437, "step": 12260 }, { "epoch": 2.5623824451410657, "grad_norm": 1.1741578364487886, "learning_rate": 1.2811224060537439e-05, "loss": 0.1601, "step": 12261 }, { "epoch": 2.5625914315569487, "grad_norm": 1.030068673458738, "learning_rate": 1.2810141416160247e-05, "loss": 0.1314, "step": 12262 }, { "epoch": 2.5628004179728316, "grad_norm": 1.0150777035031422, "learning_rate": 1.2809058736019563e-05, "loss": 0.1267, "step": 12263 }, { "epoch": 2.5630094043887146, "grad_norm": 0.9435954756052531, "learning_rate": 1.2807976020129172e-05, "loss": 0.113, "step": 12264 }, { "epoch": 2.5632183908045976, "grad_norm": 0.9526817564749468, "learning_rate": 1.2806893268502844e-05, "loss": 0.1367, "step": 12265 }, { "epoch": 2.5634273772204805, "grad_norm": 0.8987705641293148, "learning_rate": 1.2805810481154363e-05, "loss": 0.1148, "step": 12266 }, { "epoch": 2.5636363636363635, "grad_norm": 0.9409834742890264, "learning_rate": 1.2804727658097515e-05, "loss": 0.1172, "step": 12267 }, { "epoch": 2.5638453500522465, "grad_norm": 0.9147256030564738, "learning_rate": 1.280364479934607e-05, "loss": 0.1117, "step": 12268 }, { "epoch": 2.5640543364681294, "grad_norm": 1.0210971731304341, "learning_rate": 1.2802561904913815e-05, "loss": 0.1108, "step": 12269 }, { "epoch": 2.5642633228840124, "grad_norm": 0.7439251130072743, "learning_rate": 1.2801478974814534e-05, "loss": 0.101, "step": 12270 }, { "epoch": 2.5644723092998953, "grad_norm": 0.8909723934290285, "learning_rate": 1.2800396009062e-05, "loss": 0.1034, "step": 12271 }, { "epoch": 2.5646812957157783, "grad_norm": 1.4814770055680613, "learning_rate": 1.2799313007670007e-05, "loss": 0.1402, "step": 12272 }, { "epoch": 2.5648902821316613, "grad_norm": 1.0950938458788495, "learning_rate": 1.2798229970652329e-05, "loss": 0.1472, "step": 12273 }, { "epoch": 2.565099268547544, "grad_norm": 0.8738088382536064, "learning_rate": 1.2797146898022752e-05, "loss": 0.1026, "step": 12274 }, { "epoch": 2.565308254963427, "grad_norm": 1.0553608914622599, "learning_rate": 1.279606378979506e-05, "loss": 0.1186, "step": 12275 }, { "epoch": 2.56551724137931, "grad_norm": 1.2395837007395576, "learning_rate": 1.2794980645983039e-05, "loss": 0.1478, "step": 12276 }, { "epoch": 2.565726227795193, "grad_norm": 1.1018249597633702, "learning_rate": 1.2793897466600469e-05, "loss": 0.1079, "step": 12277 }, { "epoch": 2.565935214211076, "grad_norm": 0.9988621631017545, "learning_rate": 1.2792814251661143e-05, "loss": 0.1068, "step": 12278 }, { "epoch": 2.566144200626959, "grad_norm": 1.1889168736755933, "learning_rate": 1.2791731001178839e-05, "loss": 0.1542, "step": 12279 }, { "epoch": 2.566353187042842, "grad_norm": 0.9030681045831626, "learning_rate": 1.2790647715167345e-05, "loss": 0.1278, "step": 12280 }, { "epoch": 2.566562173458725, "grad_norm": 0.893520533029981, "learning_rate": 1.2789564393640453e-05, "loss": 0.0821, "step": 12281 }, { "epoch": 2.566771159874608, "grad_norm": 0.9491176200215932, "learning_rate": 1.2788481036611943e-05, "loss": 0.1341, "step": 12282 }, { "epoch": 2.566980146290491, "grad_norm": 1.1304401593246567, "learning_rate": 1.2787397644095602e-05, "loss": 0.1537, "step": 12283 }, { "epoch": 2.567189132706374, "grad_norm": 1.1154283909697265, "learning_rate": 1.2786314216105227e-05, "loss": 0.125, "step": 12284 }, { "epoch": 2.567398119122257, "grad_norm": 0.9131255262544385, "learning_rate": 1.2785230752654596e-05, "loss": 0.1049, "step": 12285 }, { "epoch": 2.5676071055381398, "grad_norm": 1.081214182417824, "learning_rate": 1.2784147253757502e-05, "loss": 0.1283, "step": 12286 }, { "epoch": 2.5678160919540227, "grad_norm": 0.9982738064490432, "learning_rate": 1.2783063719427735e-05, "loss": 0.129, "step": 12287 }, { "epoch": 2.568025078369906, "grad_norm": 0.9349945881461624, "learning_rate": 1.2781980149679085e-05, "loss": 0.0985, "step": 12288 }, { "epoch": 2.568234064785789, "grad_norm": 0.9608878892108188, "learning_rate": 1.278089654452534e-05, "loss": 0.1231, "step": 12289 }, { "epoch": 2.568443051201672, "grad_norm": 0.9518670529489934, "learning_rate": 1.2779812903980292e-05, "loss": 0.1186, "step": 12290 }, { "epoch": 2.568652037617555, "grad_norm": 1.11189943687451, "learning_rate": 1.2778729228057732e-05, "loss": 0.1324, "step": 12291 }, { "epoch": 2.568861024033438, "grad_norm": 1.0581772291493576, "learning_rate": 1.2777645516771448e-05, "loss": 0.1233, "step": 12292 }, { "epoch": 2.569070010449321, "grad_norm": 1.0245062081744973, "learning_rate": 1.2776561770135238e-05, "loss": 0.1152, "step": 12293 }, { "epoch": 2.569278996865204, "grad_norm": 0.9458869923540036, "learning_rate": 1.2775477988162894e-05, "loss": 0.1188, "step": 12294 }, { "epoch": 2.569487983281087, "grad_norm": 1.117418822207369, "learning_rate": 1.27743941708682e-05, "loss": 0.1225, "step": 12295 }, { "epoch": 2.56969696969697, "grad_norm": 0.9407271055938727, "learning_rate": 1.277331031826496e-05, "loss": 0.1262, "step": 12296 }, { "epoch": 2.5699059561128528, "grad_norm": 1.104543499776942, "learning_rate": 1.2772226430366963e-05, "loss": 0.1431, "step": 12297 }, { "epoch": 2.5701149425287357, "grad_norm": 1.0331198645192996, "learning_rate": 1.2771142507188001e-05, "loss": 0.1396, "step": 12298 }, { "epoch": 2.5703239289446187, "grad_norm": 0.9599704252789434, "learning_rate": 1.2770058548741875e-05, "loss": 0.1148, "step": 12299 }, { "epoch": 2.5705329153605017, "grad_norm": 0.8743866026935705, "learning_rate": 1.2768974555042376e-05, "loss": 0.1028, "step": 12300 }, { "epoch": 2.5707419017763846, "grad_norm": 0.9694526354360735, "learning_rate": 1.2767890526103295e-05, "loss": 0.1257, "step": 12301 }, { "epoch": 2.5709508881922676, "grad_norm": 0.8881582634944559, "learning_rate": 1.2766806461938436e-05, "loss": 0.1168, "step": 12302 }, { "epoch": 2.5711598746081505, "grad_norm": 0.9480230954568659, "learning_rate": 1.2765722362561595e-05, "loss": 0.1243, "step": 12303 }, { "epoch": 2.5713688610240335, "grad_norm": 1.1109871024428526, "learning_rate": 1.2764638227986562e-05, "loss": 0.1242, "step": 12304 }, { "epoch": 2.5715778474399165, "grad_norm": 0.8538097637664808, "learning_rate": 1.2763554058227139e-05, "loss": 0.1048, "step": 12305 }, { "epoch": 2.5717868338557994, "grad_norm": 1.1055344883067277, "learning_rate": 1.2762469853297126e-05, "loss": 0.1433, "step": 12306 }, { "epoch": 2.5719958202716824, "grad_norm": 1.2808696573177507, "learning_rate": 1.2761385613210314e-05, "loss": 0.1352, "step": 12307 }, { "epoch": 2.5722048066875653, "grad_norm": 0.9907201285429758, "learning_rate": 1.276030133798051e-05, "loss": 0.1133, "step": 12308 }, { "epoch": 2.5724137931034483, "grad_norm": 1.1557682146165955, "learning_rate": 1.2759217027621507e-05, "loss": 0.1461, "step": 12309 }, { "epoch": 2.5726227795193313, "grad_norm": 1.0379447855238026, "learning_rate": 1.2758132682147106e-05, "loss": 0.1287, "step": 12310 }, { "epoch": 2.5728317659352142, "grad_norm": 1.2706813777878296, "learning_rate": 1.2757048301571108e-05, "loss": 0.1326, "step": 12311 }, { "epoch": 2.573040752351097, "grad_norm": 0.9475849042529312, "learning_rate": 1.2755963885907316e-05, "loss": 0.1289, "step": 12312 }, { "epoch": 2.57324973876698, "grad_norm": 1.0708721318505068, "learning_rate": 1.2754879435169526e-05, "loss": 0.1295, "step": 12313 }, { "epoch": 2.573458725182863, "grad_norm": 0.8424212324359152, "learning_rate": 1.2753794949371542e-05, "loss": 0.118, "step": 12314 }, { "epoch": 2.573667711598746, "grad_norm": 1.0726818323886775, "learning_rate": 1.2752710428527166e-05, "loss": 0.1294, "step": 12315 }, { "epoch": 2.573876698014629, "grad_norm": 1.015321675004407, "learning_rate": 1.2751625872650197e-05, "loss": 0.1217, "step": 12316 }, { "epoch": 2.574085684430512, "grad_norm": 0.8495952938782574, "learning_rate": 1.275054128175444e-05, "loss": 0.1081, "step": 12317 }, { "epoch": 2.574294670846395, "grad_norm": 1.1168501912091342, "learning_rate": 1.2749456655853703e-05, "loss": 0.1421, "step": 12318 }, { "epoch": 2.574503657262278, "grad_norm": 1.1052507575606083, "learning_rate": 1.2748371994961782e-05, "loss": 0.1518, "step": 12319 }, { "epoch": 2.574712643678161, "grad_norm": 1.0324978885314, "learning_rate": 1.2747287299092484e-05, "loss": 0.1293, "step": 12320 }, { "epoch": 2.574921630094044, "grad_norm": 0.9266691592659417, "learning_rate": 1.274620256825961e-05, "loss": 0.133, "step": 12321 }, { "epoch": 2.575130616509927, "grad_norm": 1.0400436480438, "learning_rate": 1.2745117802476974e-05, "loss": 0.1253, "step": 12322 }, { "epoch": 2.5753396029258098, "grad_norm": 1.0997222389899557, "learning_rate": 1.2744033001758372e-05, "loss": 0.1065, "step": 12323 }, { "epoch": 2.5755485893416927, "grad_norm": 0.981776352166942, "learning_rate": 1.2742948166117617e-05, "loss": 0.1239, "step": 12324 }, { "epoch": 2.5757575757575757, "grad_norm": 0.8505534164746426, "learning_rate": 1.2741863295568508e-05, "loss": 0.0896, "step": 12325 }, { "epoch": 2.5759665621734587, "grad_norm": 1.0707324364017725, "learning_rate": 1.2740778390124858e-05, "loss": 0.1101, "step": 12326 }, { "epoch": 2.5761755485893416, "grad_norm": 0.8444031038387024, "learning_rate": 1.273969344980047e-05, "loss": 0.0935, "step": 12327 }, { "epoch": 2.5763845350052246, "grad_norm": 0.9084453529959455, "learning_rate": 1.2738608474609153e-05, "loss": 0.1074, "step": 12328 }, { "epoch": 2.5765935214211075, "grad_norm": 0.8978548427571359, "learning_rate": 1.2737523464564716e-05, "loss": 0.1238, "step": 12329 }, { "epoch": 2.5768025078369905, "grad_norm": 1.0708224567369535, "learning_rate": 1.2736438419680967e-05, "loss": 0.129, "step": 12330 }, { "epoch": 2.5770114942528735, "grad_norm": 0.934006008649895, "learning_rate": 1.2735353339971715e-05, "loss": 0.1157, "step": 12331 }, { "epoch": 2.5772204806687564, "grad_norm": 1.0629986526424755, "learning_rate": 1.2734268225450769e-05, "loss": 0.1029, "step": 12332 }, { "epoch": 2.5774294670846394, "grad_norm": 1.2358298545193358, "learning_rate": 1.2733183076131933e-05, "loss": 0.1269, "step": 12333 }, { "epoch": 2.5776384535005223, "grad_norm": 0.9575307526588867, "learning_rate": 1.2732097892029028e-05, "loss": 0.1156, "step": 12334 }, { "epoch": 2.5778474399164053, "grad_norm": 1.0571800623474703, "learning_rate": 1.2731012673155859e-05, "loss": 0.1267, "step": 12335 }, { "epoch": 2.5780564263322883, "grad_norm": 0.9515428366082447, "learning_rate": 1.272992741952624e-05, "loss": 0.1254, "step": 12336 }, { "epoch": 2.5782654127481712, "grad_norm": 0.9294502975759816, "learning_rate": 1.2728842131153978e-05, "loss": 0.1054, "step": 12337 }, { "epoch": 2.578474399164054, "grad_norm": 0.9518884131190204, "learning_rate": 1.2727756808052887e-05, "loss": 0.0957, "step": 12338 }, { "epoch": 2.578683385579937, "grad_norm": 1.1258489969340408, "learning_rate": 1.2726671450236779e-05, "loss": 0.1194, "step": 12339 }, { "epoch": 2.57889237199582, "grad_norm": 1.3154455137209287, "learning_rate": 1.2725586057719468e-05, "loss": 0.135, "step": 12340 }, { "epoch": 2.5791013584117035, "grad_norm": 0.9313687725008696, "learning_rate": 1.2724500630514766e-05, "loss": 0.1056, "step": 12341 }, { "epoch": 2.5793103448275865, "grad_norm": 0.8717297436722149, "learning_rate": 1.2723415168636493e-05, "loss": 0.1025, "step": 12342 }, { "epoch": 2.5795193312434694, "grad_norm": 0.91053617101721, "learning_rate": 1.2722329672098454e-05, "loss": 0.088, "step": 12343 }, { "epoch": 2.5797283176593524, "grad_norm": 1.0161121529480548, "learning_rate": 1.2721244140914468e-05, "loss": 0.1363, "step": 12344 }, { "epoch": 2.5799373040752354, "grad_norm": 1.0106355901471293, "learning_rate": 1.2720158575098352e-05, "loss": 0.1226, "step": 12345 }, { "epoch": 2.5801462904911183, "grad_norm": 1.3219619859538576, "learning_rate": 1.2719072974663912e-05, "loss": 0.1505, "step": 12346 }, { "epoch": 2.5803552769070013, "grad_norm": 0.9163345832978677, "learning_rate": 1.2717987339624979e-05, "loss": 0.103, "step": 12347 }, { "epoch": 2.5805642633228842, "grad_norm": 1.6948364986823188, "learning_rate": 1.2716901669995361e-05, "loss": 0.1509, "step": 12348 }, { "epoch": 2.580773249738767, "grad_norm": 1.0871715888116773, "learning_rate": 1.2715815965788871e-05, "loss": 0.1215, "step": 12349 }, { "epoch": 2.58098223615465, "grad_norm": 0.9540322623941152, "learning_rate": 1.2714730227019334e-05, "loss": 0.1091, "step": 12350 }, { "epoch": 2.581191222570533, "grad_norm": 1.064109899504349, "learning_rate": 1.2713644453700567e-05, "loss": 0.1094, "step": 12351 }, { "epoch": 2.581400208986416, "grad_norm": 0.9184432231871558, "learning_rate": 1.2712558645846377e-05, "loss": 0.1102, "step": 12352 }, { "epoch": 2.581609195402299, "grad_norm": 0.8956405270625557, "learning_rate": 1.2711472803470602e-05, "loss": 0.0963, "step": 12353 }, { "epoch": 2.581818181818182, "grad_norm": 1.240942731008347, "learning_rate": 1.2710386926587043e-05, "loss": 0.1075, "step": 12354 }, { "epoch": 2.582027168234065, "grad_norm": 1.0066148241183068, "learning_rate": 1.270930101520953e-05, "loss": 0.1137, "step": 12355 }, { "epoch": 2.582236154649948, "grad_norm": 0.9161787857340136, "learning_rate": 1.2708215069351879e-05, "loss": 0.1118, "step": 12356 }, { "epoch": 2.582445141065831, "grad_norm": 1.1004951048795413, "learning_rate": 1.2707129089027915e-05, "loss": 0.1129, "step": 12357 }, { "epoch": 2.582654127481714, "grad_norm": 0.925059676211591, "learning_rate": 1.2706043074251448e-05, "loss": 0.1111, "step": 12358 }, { "epoch": 2.582863113897597, "grad_norm": 1.2234215504362036, "learning_rate": 1.2704957025036314e-05, "loss": 0.1377, "step": 12359 }, { "epoch": 2.58307210031348, "grad_norm": 1.1420164149265943, "learning_rate": 1.270387094139632e-05, "loss": 0.1442, "step": 12360 }, { "epoch": 2.5832810867293627, "grad_norm": 0.977402890568705, "learning_rate": 1.27027848233453e-05, "loss": 0.1374, "step": 12361 }, { "epoch": 2.5834900731452457, "grad_norm": 1.197250462294599, "learning_rate": 1.270169867089707e-05, "loss": 0.1429, "step": 12362 }, { "epoch": 2.5836990595611287, "grad_norm": 0.8571772319967919, "learning_rate": 1.2700612484065458e-05, "loss": 0.0965, "step": 12363 }, { "epoch": 2.5839080459770116, "grad_norm": 0.9468739007931056, "learning_rate": 1.2699526262864277e-05, "loss": 0.1211, "step": 12364 }, { "epoch": 2.5841170323928946, "grad_norm": 0.9466182118736363, "learning_rate": 1.2698440007307365e-05, "loss": 0.1052, "step": 12365 }, { "epoch": 2.5843260188087775, "grad_norm": 1.0401546838471156, "learning_rate": 1.2697353717408535e-05, "loss": 0.1076, "step": 12366 }, { "epoch": 2.5845350052246605, "grad_norm": 1.4210434362488138, "learning_rate": 1.2696267393181615e-05, "loss": 0.1571, "step": 12367 }, { "epoch": 2.5847439916405435, "grad_norm": 0.9895899171973076, "learning_rate": 1.2695181034640435e-05, "loss": 0.1234, "step": 12368 }, { "epoch": 2.5849529780564264, "grad_norm": 0.9039431449374312, "learning_rate": 1.2694094641798817e-05, "loss": 0.1202, "step": 12369 }, { "epoch": 2.5851619644723094, "grad_norm": 0.9302666048322716, "learning_rate": 1.2693008214670586e-05, "loss": 0.1055, "step": 12370 }, { "epoch": 2.5853709508881924, "grad_norm": 0.8922036921821479, "learning_rate": 1.269192175326957e-05, "loss": 0.104, "step": 12371 }, { "epoch": 2.5855799373040753, "grad_norm": 0.9602563811265868, "learning_rate": 1.2690835257609595e-05, "loss": 0.1161, "step": 12372 }, { "epoch": 2.5857889237199583, "grad_norm": 0.9864525883573618, "learning_rate": 1.268974872770449e-05, "loss": 0.1278, "step": 12373 }, { "epoch": 2.5859979101358412, "grad_norm": 0.8987499118426426, "learning_rate": 1.2688662163568078e-05, "loss": 0.1185, "step": 12374 }, { "epoch": 2.586206896551724, "grad_norm": 1.0141673457535854, "learning_rate": 1.2687575565214196e-05, "loss": 0.1167, "step": 12375 }, { "epoch": 2.586415882967607, "grad_norm": 1.0047888941039456, "learning_rate": 1.2686488932656662e-05, "loss": 0.1311, "step": 12376 }, { "epoch": 2.58662486938349, "grad_norm": 0.9078102398021485, "learning_rate": 1.2685402265909315e-05, "loss": 0.1193, "step": 12377 }, { "epoch": 2.586833855799373, "grad_norm": 0.9993409387706907, "learning_rate": 1.2684315564985978e-05, "loss": 0.1307, "step": 12378 }, { "epoch": 2.587042842215256, "grad_norm": 0.9008795585122011, "learning_rate": 1.2683228829900485e-05, "loss": 0.093, "step": 12379 }, { "epoch": 2.587251828631139, "grad_norm": 0.8392349073844821, "learning_rate": 1.2682142060666663e-05, "loss": 0.0965, "step": 12380 }, { "epoch": 2.587460815047022, "grad_norm": 0.9453388035253469, "learning_rate": 1.2681055257298346e-05, "loss": 0.1163, "step": 12381 }, { "epoch": 2.587669801462905, "grad_norm": 1.0479785513952136, "learning_rate": 1.267996841980936e-05, "loss": 0.1422, "step": 12382 }, { "epoch": 2.587878787878788, "grad_norm": 0.957568456929633, "learning_rate": 1.2678881548213547e-05, "loss": 0.1219, "step": 12383 }, { "epoch": 2.588087774294671, "grad_norm": 1.1324058174491543, "learning_rate": 1.2677794642524727e-05, "loss": 0.1258, "step": 12384 }, { "epoch": 2.588296760710554, "grad_norm": 0.9731723878990302, "learning_rate": 1.267670770275674e-05, "loss": 0.1169, "step": 12385 }, { "epoch": 2.5885057471264368, "grad_norm": 1.1774998553184166, "learning_rate": 1.267562072892342e-05, "loss": 0.112, "step": 12386 }, { "epoch": 2.5887147335423197, "grad_norm": 0.9327771251379781, "learning_rate": 1.2674533721038596e-05, "loss": 0.1294, "step": 12387 }, { "epoch": 2.5889237199582027, "grad_norm": 1.1008378878947915, "learning_rate": 1.2673446679116103e-05, "loss": 0.1489, "step": 12388 }, { "epoch": 2.5891327063740857, "grad_norm": 1.176873451693455, "learning_rate": 1.2672359603169775e-05, "loss": 0.1409, "step": 12389 }, { "epoch": 2.5893416927899686, "grad_norm": 1.0403543466696261, "learning_rate": 1.2671272493213448e-05, "loss": 0.1288, "step": 12390 }, { "epoch": 2.5895506792058516, "grad_norm": 0.9404934394763075, "learning_rate": 1.267018534926096e-05, "loss": 0.1227, "step": 12391 }, { "epoch": 2.5897596656217345, "grad_norm": 1.135368773483615, "learning_rate": 1.2669098171326139e-05, "loss": 0.1219, "step": 12392 }, { "epoch": 2.5899686520376175, "grad_norm": 0.9308461809536525, "learning_rate": 1.2668010959422832e-05, "loss": 0.1132, "step": 12393 }, { "epoch": 2.5901776384535005, "grad_norm": 1.0943721853278199, "learning_rate": 1.2666923713564865e-05, "loss": 0.1447, "step": 12394 }, { "epoch": 2.5903866248693834, "grad_norm": 1.0532986449315191, "learning_rate": 1.2665836433766079e-05, "loss": 0.115, "step": 12395 }, { "epoch": 2.5905956112852664, "grad_norm": 1.022283825892622, "learning_rate": 1.266474912004031e-05, "loss": 0.1167, "step": 12396 }, { "epoch": 2.5908045977011493, "grad_norm": 1.0800448860509488, "learning_rate": 1.2663661772401398e-05, "loss": 0.1411, "step": 12397 }, { "epoch": 2.5910135841170323, "grad_norm": 1.049925692374267, "learning_rate": 1.2662574390863181e-05, "loss": 0.1326, "step": 12398 }, { "epoch": 2.5912225705329153, "grad_norm": 0.9597124090370478, "learning_rate": 1.2661486975439499e-05, "loss": 0.1251, "step": 12399 }, { "epoch": 2.5914315569487982, "grad_norm": 1.3293208799562422, "learning_rate": 1.2660399526144187e-05, "loss": 0.1149, "step": 12400 }, { "epoch": 2.591640543364681, "grad_norm": 0.823280056093617, "learning_rate": 1.2659312042991085e-05, "loss": 0.1163, "step": 12401 }, { "epoch": 2.591849529780564, "grad_norm": 0.9373060901221943, "learning_rate": 1.2658224525994039e-05, "loss": 0.109, "step": 12402 }, { "epoch": 2.592058516196447, "grad_norm": 0.913086663357279, "learning_rate": 1.2657136975166879e-05, "loss": 0.1098, "step": 12403 }, { "epoch": 2.59226750261233, "grad_norm": 1.0276543989614664, "learning_rate": 1.2656049390523457e-05, "loss": 0.1337, "step": 12404 }, { "epoch": 2.592476489028213, "grad_norm": 1.034286228377427, "learning_rate": 1.2654961772077607e-05, "loss": 0.1298, "step": 12405 }, { "epoch": 2.592685475444096, "grad_norm": 1.2771251522003142, "learning_rate": 1.2653874119843175e-05, "loss": 0.1276, "step": 12406 }, { "epoch": 2.592894461859979, "grad_norm": 0.9994157984142836, "learning_rate": 1.2652786433833998e-05, "loss": 0.11, "step": 12407 }, { "epoch": 2.593103448275862, "grad_norm": 0.9014598599167644, "learning_rate": 1.265169871406392e-05, "loss": 0.1244, "step": 12408 }, { "epoch": 2.593312434691745, "grad_norm": 0.8781566484257095, "learning_rate": 1.2650610960546788e-05, "loss": 0.1111, "step": 12409 }, { "epoch": 2.593521421107628, "grad_norm": 1.121679131096095, "learning_rate": 1.264952317329644e-05, "loss": 0.1434, "step": 12410 }, { "epoch": 2.593730407523511, "grad_norm": 0.8094554660648059, "learning_rate": 1.2648435352326724e-05, "loss": 0.1102, "step": 12411 }, { "epoch": 2.5939393939393938, "grad_norm": 1.2474141621635921, "learning_rate": 1.2647347497651483e-05, "loss": 0.1356, "step": 12412 }, { "epoch": 2.5941483803552767, "grad_norm": 1.052593141495553, "learning_rate": 1.264625960928456e-05, "loss": 0.1415, "step": 12413 }, { "epoch": 2.5943573667711597, "grad_norm": 0.9935423217505379, "learning_rate": 1.2645171687239803e-05, "loss": 0.1121, "step": 12414 }, { "epoch": 2.5945663531870427, "grad_norm": 1.2110622330397023, "learning_rate": 1.2644083731531053e-05, "loss": 0.1549, "step": 12415 }, { "epoch": 2.5947753396029256, "grad_norm": 1.0593822561992097, "learning_rate": 1.2642995742172164e-05, "loss": 0.1398, "step": 12416 }, { "epoch": 2.5949843260188086, "grad_norm": 0.8353570958503619, "learning_rate": 1.2641907719176975e-05, "loss": 0.0953, "step": 12417 }, { "epoch": 2.5951933124346915, "grad_norm": 1.0258482912412084, "learning_rate": 1.2640819662559334e-05, "loss": 0.1246, "step": 12418 }, { "epoch": 2.5954022988505745, "grad_norm": 0.9808478017592929, "learning_rate": 1.263973157233309e-05, "loss": 0.1136, "step": 12419 }, { "epoch": 2.5956112852664575, "grad_norm": 0.8669736318252677, "learning_rate": 1.2638643448512091e-05, "loss": 0.1211, "step": 12420 }, { "epoch": 2.5958202716823404, "grad_norm": 0.9321526701473933, "learning_rate": 1.263755529111018e-05, "loss": 0.1244, "step": 12421 }, { "epoch": 2.5960292580982234, "grad_norm": 0.7991463317831597, "learning_rate": 1.2636467100141214e-05, "loss": 0.1007, "step": 12422 }, { "epoch": 2.5962382445141063, "grad_norm": 0.930631089582894, "learning_rate": 1.2635378875619036e-05, "loss": 0.0941, "step": 12423 }, { "epoch": 2.5964472309299893, "grad_norm": 1.3609568445291187, "learning_rate": 1.2634290617557496e-05, "loss": 0.1154, "step": 12424 }, { "epoch": 2.5966562173458723, "grad_norm": 0.887692514062375, "learning_rate": 1.2633202325970447e-05, "loss": 0.1291, "step": 12425 }, { "epoch": 2.5968652037617552, "grad_norm": 1.1713414142030294, "learning_rate": 1.2632114000871738e-05, "loss": 0.1461, "step": 12426 }, { "epoch": 2.597074190177638, "grad_norm": 0.8898162080609658, "learning_rate": 1.2631025642275212e-05, "loss": 0.1414, "step": 12427 }, { "epoch": 2.597283176593521, "grad_norm": 1.0925939155167077, "learning_rate": 1.2629937250194732e-05, "loss": 0.1194, "step": 12428 }, { "epoch": 2.5974921630094046, "grad_norm": 1.0167549334302446, "learning_rate": 1.2628848824644145e-05, "loss": 0.1357, "step": 12429 }, { "epoch": 2.5977011494252875, "grad_norm": 1.0195849648634687, "learning_rate": 1.26277603656373e-05, "loss": 0.1164, "step": 12430 }, { "epoch": 2.5979101358411705, "grad_norm": 1.8204101890357356, "learning_rate": 1.2626671873188052e-05, "loss": 0.1329, "step": 12431 }, { "epoch": 2.5981191222570534, "grad_norm": 1.3423137704103274, "learning_rate": 1.2625583347310254e-05, "loss": 0.1589, "step": 12432 }, { "epoch": 2.5983281086729364, "grad_norm": 0.6499893522651946, "learning_rate": 1.2624494788017756e-05, "loss": 0.0754, "step": 12433 }, { "epoch": 2.5985370950888194, "grad_norm": 1.1104066015286622, "learning_rate": 1.2623406195324417e-05, "loss": 0.1144, "step": 12434 }, { "epoch": 2.5987460815047023, "grad_norm": 1.1395468721834716, "learning_rate": 1.2622317569244087e-05, "loss": 0.1378, "step": 12435 }, { "epoch": 2.5989550679205853, "grad_norm": 0.9885048628025557, "learning_rate": 1.2621228909790622e-05, "loss": 0.1318, "step": 12436 }, { "epoch": 2.5991640543364682, "grad_norm": 1.079701245135734, "learning_rate": 1.2620140216977877e-05, "loss": 0.1221, "step": 12437 }, { "epoch": 2.599373040752351, "grad_norm": 0.9300465798288553, "learning_rate": 1.2619051490819708e-05, "loss": 0.1157, "step": 12438 }, { "epoch": 2.599582027168234, "grad_norm": 0.9821989555798184, "learning_rate": 1.2617962731329964e-05, "loss": 0.1409, "step": 12439 }, { "epoch": 2.599791013584117, "grad_norm": 0.9071884459728944, "learning_rate": 1.2616873938522513e-05, "loss": 0.1118, "step": 12440 }, { "epoch": 2.6, "grad_norm": 0.816249367195374, "learning_rate": 1.2615785112411204e-05, "loss": 0.0935, "step": 12441 }, { "epoch": 2.600208986415883, "grad_norm": 1.4097450459456489, "learning_rate": 1.2614696253009895e-05, "loss": 0.148, "step": 12442 }, { "epoch": 2.600417972831766, "grad_norm": 0.9981125703456936, "learning_rate": 1.2613607360332445e-05, "loss": 0.1349, "step": 12443 }, { "epoch": 2.600626959247649, "grad_norm": 0.9176760220098441, "learning_rate": 1.2612518434392711e-05, "loss": 0.1069, "step": 12444 }, { "epoch": 2.600835945663532, "grad_norm": 0.8467910556200977, "learning_rate": 1.2611429475204547e-05, "loss": 0.1183, "step": 12445 }, { "epoch": 2.601044932079415, "grad_norm": 1.191098717318032, "learning_rate": 1.2610340482781822e-05, "loss": 0.1158, "step": 12446 }, { "epoch": 2.601253918495298, "grad_norm": 1.2774001263373636, "learning_rate": 1.2609251457138385e-05, "loss": 0.1186, "step": 12447 }, { "epoch": 2.601462904911181, "grad_norm": 0.9524698221191252, "learning_rate": 1.2608162398288099e-05, "loss": 0.1326, "step": 12448 }, { "epoch": 2.601671891327064, "grad_norm": 1.074308866337256, "learning_rate": 1.2607073306244828e-05, "loss": 0.1269, "step": 12449 }, { "epoch": 2.6018808777429467, "grad_norm": 1.2048448063502273, "learning_rate": 1.2605984181022429e-05, "loss": 0.1436, "step": 12450 }, { "epoch": 2.6020898641588297, "grad_norm": 1.0073687080946465, "learning_rate": 1.260489502263476e-05, "loss": 0.1054, "step": 12451 }, { "epoch": 2.6022988505747127, "grad_norm": 1.097890161181266, "learning_rate": 1.2603805831095688e-05, "loss": 0.1516, "step": 12452 }, { "epoch": 2.6025078369905956, "grad_norm": 0.9322334630207637, "learning_rate": 1.260271660641907e-05, "loss": 0.1406, "step": 12453 }, { "epoch": 2.6027168234064786, "grad_norm": 1.0053076321957224, "learning_rate": 1.260162734861877e-05, "loss": 0.1205, "step": 12454 }, { "epoch": 2.6029258098223615, "grad_norm": 0.9638213896951646, "learning_rate": 1.2600538057708649e-05, "loss": 0.1143, "step": 12455 }, { "epoch": 2.6031347962382445, "grad_norm": 0.9685027014786914, "learning_rate": 1.2599448733702575e-05, "loss": 0.1096, "step": 12456 }, { "epoch": 2.6033437826541275, "grad_norm": 0.8634820122595024, "learning_rate": 1.2598359376614407e-05, "loss": 0.1105, "step": 12457 }, { "epoch": 2.6035527690700104, "grad_norm": 1.0078263755619998, "learning_rate": 1.2597269986458007e-05, "loss": 0.13, "step": 12458 }, { "epoch": 2.6037617554858934, "grad_norm": 0.9693510854742536, "learning_rate": 1.2596180563247242e-05, "loss": 0.1305, "step": 12459 }, { "epoch": 2.6039707419017764, "grad_norm": 0.8683838916250217, "learning_rate": 1.2595091106995978e-05, "loss": 0.1214, "step": 12460 }, { "epoch": 2.6041797283176593, "grad_norm": 1.077770353733759, "learning_rate": 1.2594001617718078e-05, "loss": 0.1239, "step": 12461 }, { "epoch": 2.6043887147335423, "grad_norm": 0.9465463746989754, "learning_rate": 1.259291209542741e-05, "loss": 0.1163, "step": 12462 }, { "epoch": 2.6045977011494252, "grad_norm": 0.8527003784614636, "learning_rate": 1.2591822540137835e-05, "loss": 0.1115, "step": 12463 }, { "epoch": 2.604806687565308, "grad_norm": 0.9238963224533925, "learning_rate": 1.2590732951863222e-05, "loss": 0.111, "step": 12464 }, { "epoch": 2.605015673981191, "grad_norm": 0.8879821853891938, "learning_rate": 1.258964333061744e-05, "loss": 0.1091, "step": 12465 }, { "epoch": 2.605224660397074, "grad_norm": 0.9510667455615915, "learning_rate": 1.2588553676414352e-05, "loss": 0.1065, "step": 12466 }, { "epoch": 2.605433646812957, "grad_norm": 0.8832502113716172, "learning_rate": 1.258746398926783e-05, "loss": 0.1143, "step": 12467 }, { "epoch": 2.60564263322884, "grad_norm": 0.9960851487459786, "learning_rate": 1.2586374269191739e-05, "loss": 0.1213, "step": 12468 }, { "epoch": 2.605851619644723, "grad_norm": 0.8990490510279724, "learning_rate": 1.2585284516199948e-05, "loss": 0.114, "step": 12469 }, { "epoch": 2.606060606060606, "grad_norm": 1.1044663240678758, "learning_rate": 1.2584194730306323e-05, "loss": 0.143, "step": 12470 }, { "epoch": 2.606269592476489, "grad_norm": 1.0381945795786502, "learning_rate": 1.2583104911524737e-05, "loss": 0.1155, "step": 12471 }, { "epoch": 2.606478578892372, "grad_norm": 1.0354246843389208, "learning_rate": 1.258201505986906e-05, "loss": 0.135, "step": 12472 }, { "epoch": 2.606687565308255, "grad_norm": 0.9476098160504833, "learning_rate": 1.258092517535316e-05, "loss": 0.1099, "step": 12473 }, { "epoch": 2.606896551724138, "grad_norm": 0.9216137183180831, "learning_rate": 1.2579835257990911e-05, "loss": 0.119, "step": 12474 }, { "epoch": 2.6071055381400208, "grad_norm": 1.0048311593259902, "learning_rate": 1.257874530779618e-05, "loss": 0.0974, "step": 12475 }, { "epoch": 2.6073145245559037, "grad_norm": 1.318229313757062, "learning_rate": 1.2577655324782839e-05, "loss": 0.1392, "step": 12476 }, { "epoch": 2.6075235109717867, "grad_norm": 0.853706669558029, "learning_rate": 1.257656530896476e-05, "loss": 0.1108, "step": 12477 }, { "epoch": 2.6077324973876697, "grad_norm": 1.0875766651362113, "learning_rate": 1.2575475260355814e-05, "loss": 0.1418, "step": 12478 }, { "epoch": 2.6079414838035526, "grad_norm": 1.0446250092837173, "learning_rate": 1.2574385178969876e-05, "loss": 0.1225, "step": 12479 }, { "epoch": 2.6081504702194356, "grad_norm": 1.0810655187502745, "learning_rate": 1.2573295064820821e-05, "loss": 0.1459, "step": 12480 }, { "epoch": 2.6083594566353185, "grad_norm": 1.471437769776728, "learning_rate": 1.2572204917922518e-05, "loss": 0.1429, "step": 12481 }, { "epoch": 2.608568443051202, "grad_norm": 0.9728306362368035, "learning_rate": 1.257111473828884e-05, "loss": 0.1153, "step": 12482 }, { "epoch": 2.608777429467085, "grad_norm": 1.0171414332258666, "learning_rate": 1.2570024525933666e-05, "loss": 0.1308, "step": 12483 }, { "epoch": 2.608986415882968, "grad_norm": 1.1004966163331003, "learning_rate": 1.2568934280870868e-05, "loss": 0.1371, "step": 12484 }, { "epoch": 2.609195402298851, "grad_norm": 0.903875587618299, "learning_rate": 1.2567844003114322e-05, "loss": 0.1048, "step": 12485 }, { "epoch": 2.609404388714734, "grad_norm": 0.8854576601320209, "learning_rate": 1.2566753692677902e-05, "loss": 0.0934, "step": 12486 }, { "epoch": 2.6096133751306168, "grad_norm": 1.0352366848524177, "learning_rate": 1.2565663349575486e-05, "loss": 0.126, "step": 12487 }, { "epoch": 2.6098223615464997, "grad_norm": 1.0352386655508938, "learning_rate": 1.2564572973820947e-05, "loss": 0.1408, "step": 12488 }, { "epoch": 2.6100313479623827, "grad_norm": 1.1823149545338454, "learning_rate": 1.2563482565428167e-05, "loss": 0.1253, "step": 12489 }, { "epoch": 2.6102403343782656, "grad_norm": 0.9217087042289516, "learning_rate": 1.2562392124411015e-05, "loss": 0.1108, "step": 12490 }, { "epoch": 2.6104493207941486, "grad_norm": 0.9108687255906348, "learning_rate": 1.256130165078338e-05, "loss": 0.1045, "step": 12491 }, { "epoch": 2.6106583072100316, "grad_norm": 0.9773133365611918, "learning_rate": 1.2560211144559131e-05, "loss": 0.1098, "step": 12492 }, { "epoch": 2.6108672936259145, "grad_norm": 0.9431990096489137, "learning_rate": 1.2559120605752149e-05, "loss": 0.1236, "step": 12493 }, { "epoch": 2.6110762800417975, "grad_norm": 0.9592509510307655, "learning_rate": 1.2558030034376312e-05, "loss": 0.1108, "step": 12494 }, { "epoch": 2.6112852664576804, "grad_norm": 0.9836023469236168, "learning_rate": 1.2556939430445503e-05, "loss": 0.1098, "step": 12495 }, { "epoch": 2.6114942528735634, "grad_norm": 1.1136275452978674, "learning_rate": 1.2555848793973593e-05, "loss": 0.1414, "step": 12496 }, { "epoch": 2.6117032392894464, "grad_norm": 0.9034595922158515, "learning_rate": 1.2554758124974476e-05, "loss": 0.1089, "step": 12497 }, { "epoch": 2.6119122257053293, "grad_norm": 1.0090003522291986, "learning_rate": 1.2553667423462019e-05, "loss": 0.1101, "step": 12498 }, { "epoch": 2.6121212121212123, "grad_norm": 0.9096402836600391, "learning_rate": 1.255257668945011e-05, "loss": 0.1082, "step": 12499 }, { "epoch": 2.6123301985370952, "grad_norm": 1.064446011106307, "learning_rate": 1.2551485922952626e-05, "loss": 0.128, "step": 12500 }, { "epoch": 2.612539184952978, "grad_norm": 0.7793739518368362, "learning_rate": 1.2550395123983457e-05, "loss": 0.0938, "step": 12501 }, { "epoch": 2.612748171368861, "grad_norm": 0.962580319332771, "learning_rate": 1.2549304292556472e-05, "loss": 0.1196, "step": 12502 }, { "epoch": 2.612957157784744, "grad_norm": 1.1473299558253325, "learning_rate": 1.2548213428685567e-05, "loss": 0.1445, "step": 12503 }, { "epoch": 2.613166144200627, "grad_norm": 0.9487729724101348, "learning_rate": 1.2547122532384616e-05, "loss": 0.1118, "step": 12504 }, { "epoch": 2.61337513061651, "grad_norm": 1.0091571084653916, "learning_rate": 1.2546031603667506e-05, "loss": 0.1042, "step": 12505 }, { "epoch": 2.613584117032393, "grad_norm": 1.03772441208136, "learning_rate": 1.254494064254812e-05, "loss": 0.1176, "step": 12506 }, { "epoch": 2.613793103448276, "grad_norm": 0.9850413397875326, "learning_rate": 1.2543849649040345e-05, "loss": 0.1185, "step": 12507 }, { "epoch": 2.614002089864159, "grad_norm": 1.071940346175, "learning_rate": 1.2542758623158058e-05, "loss": 0.1049, "step": 12508 }, { "epoch": 2.614211076280042, "grad_norm": 0.9276885972050026, "learning_rate": 1.2541667564915155e-05, "loss": 0.1109, "step": 12509 }, { "epoch": 2.614420062695925, "grad_norm": 0.9375226895337058, "learning_rate": 1.2540576474325512e-05, "loss": 0.1143, "step": 12510 }, { "epoch": 2.614629049111808, "grad_norm": 1.0133658046975582, "learning_rate": 1.2539485351403017e-05, "loss": 0.1094, "step": 12511 }, { "epoch": 2.614838035527691, "grad_norm": 1.064797669097153, "learning_rate": 1.2538394196161559e-05, "loss": 0.1201, "step": 12512 }, { "epoch": 2.6150470219435737, "grad_norm": 1.4422992935380696, "learning_rate": 1.2537303008615026e-05, "loss": 0.1595, "step": 12513 }, { "epoch": 2.6152560083594567, "grad_norm": 1.039123338776145, "learning_rate": 1.2536211788777298e-05, "loss": 0.1381, "step": 12514 }, { "epoch": 2.6154649947753397, "grad_norm": 1.063265157991119, "learning_rate": 1.2535120536662268e-05, "loss": 0.1133, "step": 12515 }, { "epoch": 2.6156739811912226, "grad_norm": 1.0785457077355436, "learning_rate": 1.2534029252283823e-05, "loss": 0.1377, "step": 12516 }, { "epoch": 2.6158829676071056, "grad_norm": 1.0886489593499915, "learning_rate": 1.2532937935655854e-05, "loss": 0.1149, "step": 12517 }, { "epoch": 2.6160919540229886, "grad_norm": 0.9508135427577268, "learning_rate": 1.2531846586792242e-05, "loss": 0.1291, "step": 12518 }, { "epoch": 2.6163009404388715, "grad_norm": 0.9108242326551003, "learning_rate": 1.2530755205706887e-05, "loss": 0.126, "step": 12519 }, { "epoch": 2.6165099268547545, "grad_norm": 0.8866900990007534, "learning_rate": 1.2529663792413666e-05, "loss": 0.1132, "step": 12520 }, { "epoch": 2.6167189132706374, "grad_norm": 1.264159812893524, "learning_rate": 1.252857234692648e-05, "loss": 0.0936, "step": 12521 }, { "epoch": 2.6169278996865204, "grad_norm": 0.9778288816869416, "learning_rate": 1.2527480869259212e-05, "loss": 0.1267, "step": 12522 }, { "epoch": 2.6171368861024034, "grad_norm": 1.2822802405092393, "learning_rate": 1.2526389359425757e-05, "loss": 0.1466, "step": 12523 }, { "epoch": 2.6173458725182863, "grad_norm": 0.9237532074010042, "learning_rate": 1.2525297817440003e-05, "loss": 0.1192, "step": 12524 }, { "epoch": 2.6175548589341693, "grad_norm": 0.9905806330043132, "learning_rate": 1.2524206243315846e-05, "loss": 0.1208, "step": 12525 }, { "epoch": 2.6177638453500522, "grad_norm": 0.95198857604273, "learning_rate": 1.2523114637067175e-05, "loss": 0.1263, "step": 12526 }, { "epoch": 2.617972831765935, "grad_norm": 0.8854512322749245, "learning_rate": 1.2522022998707881e-05, "loss": 0.1004, "step": 12527 }, { "epoch": 2.618181818181818, "grad_norm": 1.103955133363586, "learning_rate": 1.252093132825186e-05, "loss": 0.1351, "step": 12528 }, { "epoch": 2.618390804597701, "grad_norm": 0.7753924444832885, "learning_rate": 1.2519839625713001e-05, "loss": 0.0909, "step": 12529 }, { "epoch": 2.618599791013584, "grad_norm": 0.997761344829363, "learning_rate": 1.2518747891105204e-05, "loss": 0.1114, "step": 12530 }, { "epoch": 2.618808777429467, "grad_norm": 0.866532515219939, "learning_rate": 1.251765612444236e-05, "loss": 0.1204, "step": 12531 }, { "epoch": 2.61901776384535, "grad_norm": 0.888683988486317, "learning_rate": 1.2516564325738361e-05, "loss": 0.1098, "step": 12532 }, { "epoch": 2.619226750261233, "grad_norm": 0.8175570211001558, "learning_rate": 1.2515472495007104e-05, "loss": 0.0996, "step": 12533 }, { "epoch": 2.619435736677116, "grad_norm": 1.1191331277627794, "learning_rate": 1.2514380632262484e-05, "loss": 0.1447, "step": 12534 }, { "epoch": 2.619644723092999, "grad_norm": 1.110885921301813, "learning_rate": 1.2513288737518397e-05, "loss": 0.125, "step": 12535 }, { "epoch": 2.619853709508882, "grad_norm": 0.9984821819534562, "learning_rate": 1.2512196810788739e-05, "loss": 0.1282, "step": 12536 }, { "epoch": 2.620062695924765, "grad_norm": 0.9763355271519064, "learning_rate": 1.2511104852087408e-05, "loss": 0.1228, "step": 12537 }, { "epoch": 2.620271682340648, "grad_norm": 1.2674386428489162, "learning_rate": 1.2510012861428298e-05, "loss": 0.0987, "step": 12538 }, { "epoch": 2.6204806687565307, "grad_norm": 1.0247044825805036, "learning_rate": 1.2508920838825308e-05, "loss": 0.099, "step": 12539 }, { "epoch": 2.6206896551724137, "grad_norm": 1.089983843094093, "learning_rate": 1.2507828784292333e-05, "loss": 0.1443, "step": 12540 }, { "epoch": 2.6208986415882967, "grad_norm": 1.0166043823643598, "learning_rate": 1.2506736697843276e-05, "loss": 0.1164, "step": 12541 }, { "epoch": 2.6211076280041796, "grad_norm": 1.0438149226369702, "learning_rate": 1.250564457949203e-05, "loss": 0.1386, "step": 12542 }, { "epoch": 2.6213166144200626, "grad_norm": 0.9840898989167952, "learning_rate": 1.25045524292525e-05, "loss": 0.1437, "step": 12543 }, { "epoch": 2.6215256008359455, "grad_norm": 1.0817707570164847, "learning_rate": 1.2503460247138581e-05, "loss": 0.1263, "step": 12544 }, { "epoch": 2.6217345872518285, "grad_norm": 1.170642518176593, "learning_rate": 1.2502368033164176e-05, "loss": 0.1499, "step": 12545 }, { "epoch": 2.6219435736677115, "grad_norm": 1.041769657053102, "learning_rate": 1.2501275787343181e-05, "loss": 0.1453, "step": 12546 }, { "epoch": 2.6221525600835944, "grad_norm": 1.059194149730787, "learning_rate": 1.2500183509689499e-05, "loss": 0.1215, "step": 12547 }, { "epoch": 2.6223615464994774, "grad_norm": 0.8636503668628116, "learning_rate": 1.2499091200217029e-05, "loss": 0.122, "step": 12548 }, { "epoch": 2.6225705329153604, "grad_norm": 1.1773258034335472, "learning_rate": 1.2497998858939677e-05, "loss": 0.143, "step": 12549 }, { "epoch": 2.6227795193312433, "grad_norm": 1.071357396366165, "learning_rate": 1.2496906485871341e-05, "loss": 0.1278, "step": 12550 }, { "epoch": 2.6229885057471263, "grad_norm": 1.1338557711248183, "learning_rate": 1.2495814081025923e-05, "loss": 0.1324, "step": 12551 }, { "epoch": 2.6231974921630092, "grad_norm": 0.7530757760353344, "learning_rate": 1.2494721644417327e-05, "loss": 0.0925, "step": 12552 }, { "epoch": 2.623406478578892, "grad_norm": 0.9491386836950573, "learning_rate": 1.2493629176059453e-05, "loss": 0.1358, "step": 12553 }, { "epoch": 2.623615464994775, "grad_norm": 0.831374857968056, "learning_rate": 1.249253667596621e-05, "loss": 0.1139, "step": 12554 }, { "epoch": 2.623824451410658, "grad_norm": 0.7784530960687323, "learning_rate": 1.2491444144151498e-05, "loss": 0.1025, "step": 12555 }, { "epoch": 2.624033437826541, "grad_norm": 1.089784478256533, "learning_rate": 1.2490351580629223e-05, "loss": 0.1269, "step": 12556 }, { "epoch": 2.624242424242424, "grad_norm": 0.9069812265542156, "learning_rate": 1.2489258985413287e-05, "loss": 0.1344, "step": 12557 }, { "epoch": 2.624451410658307, "grad_norm": 0.8426593488426576, "learning_rate": 1.2488166358517597e-05, "loss": 0.1111, "step": 12558 }, { "epoch": 2.62466039707419, "grad_norm": 0.9697868054660381, "learning_rate": 1.2487073699956056e-05, "loss": 0.1291, "step": 12559 }, { "epoch": 2.624869383490073, "grad_norm": 1.0002036492889432, "learning_rate": 1.2485981009742573e-05, "loss": 0.1244, "step": 12560 }, { "epoch": 2.625078369905956, "grad_norm": 0.8142105352115112, "learning_rate": 1.2484888287891054e-05, "loss": 0.1077, "step": 12561 }, { "epoch": 2.625287356321839, "grad_norm": 1.040862850574303, "learning_rate": 1.2483795534415404e-05, "loss": 0.1259, "step": 12562 }, { "epoch": 2.625496342737722, "grad_norm": 0.7686587252712769, "learning_rate": 1.2482702749329532e-05, "loss": 0.1139, "step": 12563 }, { "epoch": 2.6257053291536048, "grad_norm": 0.871259898753194, "learning_rate": 1.248160993264734e-05, "loss": 0.1328, "step": 12564 }, { "epoch": 2.6259143155694877, "grad_norm": 0.9927242764193547, "learning_rate": 1.2480517084382742e-05, "loss": 0.1514, "step": 12565 }, { "epoch": 2.6261233019853707, "grad_norm": 0.8462530651814653, "learning_rate": 1.2479424204549646e-05, "loss": 0.1084, "step": 12566 }, { "epoch": 2.6263322884012537, "grad_norm": 0.8348685194641084, "learning_rate": 1.2478331293161956e-05, "loss": 0.1271, "step": 12567 }, { "epoch": 2.6265412748171366, "grad_norm": 0.8954867683386368, "learning_rate": 1.2477238350233582e-05, "loss": 0.123, "step": 12568 }, { "epoch": 2.6267502612330196, "grad_norm": 1.0691754975828656, "learning_rate": 1.2476145375778437e-05, "loss": 0.1477, "step": 12569 }, { "epoch": 2.626959247648903, "grad_norm": 0.9789021679025088, "learning_rate": 1.2475052369810432e-05, "loss": 0.1215, "step": 12570 }, { "epoch": 2.627168234064786, "grad_norm": 0.9442176249203662, "learning_rate": 1.2473959332343467e-05, "loss": 0.1039, "step": 12571 }, { "epoch": 2.627377220480669, "grad_norm": 0.9327441494551382, "learning_rate": 1.2472866263391467e-05, "loss": 0.1246, "step": 12572 }, { "epoch": 2.627586206896552, "grad_norm": 0.846556812833576, "learning_rate": 1.2471773162968332e-05, "loss": 0.1082, "step": 12573 }, { "epoch": 2.627795193312435, "grad_norm": 0.8860518915351056, "learning_rate": 1.2470680031087979e-05, "loss": 0.108, "step": 12574 }, { "epoch": 2.628004179728318, "grad_norm": 0.9542520301358595, "learning_rate": 1.2469586867764314e-05, "loss": 0.1152, "step": 12575 }, { "epoch": 2.6282131661442008, "grad_norm": 1.1140223375933438, "learning_rate": 1.2468493673011257e-05, "loss": 0.1442, "step": 12576 }, { "epoch": 2.6284221525600837, "grad_norm": 1.1103509343759068, "learning_rate": 1.2467400446842714e-05, "loss": 0.1323, "step": 12577 }, { "epoch": 2.6286311389759667, "grad_norm": 0.9845642942798066, "learning_rate": 1.2466307189272606e-05, "loss": 0.13, "step": 12578 }, { "epoch": 2.6288401253918496, "grad_norm": 0.9645639395641764, "learning_rate": 1.2465213900314836e-05, "loss": 0.1082, "step": 12579 }, { "epoch": 2.6290491118077326, "grad_norm": 1.1332192301369886, "learning_rate": 1.2464120579983325e-05, "loss": 0.1329, "step": 12580 }, { "epoch": 2.6292580982236156, "grad_norm": 1.0304741176562808, "learning_rate": 1.2463027228291986e-05, "loss": 0.1183, "step": 12581 }, { "epoch": 2.6294670846394985, "grad_norm": 1.0046896538942773, "learning_rate": 1.2461933845254734e-05, "loss": 0.1364, "step": 12582 }, { "epoch": 2.6296760710553815, "grad_norm": 0.9591313683177055, "learning_rate": 1.2460840430885481e-05, "loss": 0.1372, "step": 12583 }, { "epoch": 2.6298850574712644, "grad_norm": 1.0868583750531888, "learning_rate": 1.2459746985198147e-05, "loss": 0.1168, "step": 12584 }, { "epoch": 2.6300940438871474, "grad_norm": 0.9602322357511418, "learning_rate": 1.2458653508206644e-05, "loss": 0.1137, "step": 12585 }, { "epoch": 2.6303030303030304, "grad_norm": 1.1094539879272713, "learning_rate": 1.2457559999924888e-05, "loss": 0.134, "step": 12586 }, { "epoch": 2.6305120167189133, "grad_norm": 0.9378507390180711, "learning_rate": 1.2456466460366798e-05, "loss": 0.1206, "step": 12587 }, { "epoch": 2.6307210031347963, "grad_norm": 0.8604477520312823, "learning_rate": 1.2455372889546291e-05, "loss": 0.1132, "step": 12588 }, { "epoch": 2.6309299895506792, "grad_norm": 1.0510360915021473, "learning_rate": 1.2454279287477283e-05, "loss": 0.1179, "step": 12589 }, { "epoch": 2.631138975966562, "grad_norm": 0.9150560911243651, "learning_rate": 1.2453185654173691e-05, "loss": 0.1299, "step": 12590 }, { "epoch": 2.631347962382445, "grad_norm": 0.9685831872229527, "learning_rate": 1.2452091989649437e-05, "loss": 0.1243, "step": 12591 }, { "epoch": 2.631556948798328, "grad_norm": 0.972655096378204, "learning_rate": 1.2450998293918436e-05, "loss": 0.1247, "step": 12592 }, { "epoch": 2.631765935214211, "grad_norm": 0.7864123489585971, "learning_rate": 1.2449904566994609e-05, "loss": 0.1127, "step": 12593 }, { "epoch": 2.631974921630094, "grad_norm": 0.9688061855761206, "learning_rate": 1.2448810808891876e-05, "loss": 0.101, "step": 12594 }, { "epoch": 2.632183908045977, "grad_norm": 0.985633621924191, "learning_rate": 1.2447717019624152e-05, "loss": 0.1321, "step": 12595 }, { "epoch": 2.63239289446186, "grad_norm": 0.851602702853814, "learning_rate": 1.2446623199205363e-05, "loss": 0.089, "step": 12596 }, { "epoch": 2.632601880877743, "grad_norm": 0.9858325635253362, "learning_rate": 1.2445529347649426e-05, "loss": 0.1076, "step": 12597 }, { "epoch": 2.632810867293626, "grad_norm": 0.9764218716550642, "learning_rate": 1.2444435464970264e-05, "loss": 0.1258, "step": 12598 }, { "epoch": 2.633019853709509, "grad_norm": 1.1703009857500037, "learning_rate": 1.2443341551181795e-05, "loss": 0.1322, "step": 12599 }, { "epoch": 2.633228840125392, "grad_norm": 1.0837181048893672, "learning_rate": 1.2442247606297948e-05, "loss": 0.1183, "step": 12600 }, { "epoch": 2.633437826541275, "grad_norm": 1.1224030190380603, "learning_rate": 1.244115363033264e-05, "loss": 0.1581, "step": 12601 }, { "epoch": 2.6336468129571577, "grad_norm": 0.7538965431369063, "learning_rate": 1.2440059623299792e-05, "loss": 0.086, "step": 12602 }, { "epoch": 2.6338557993730407, "grad_norm": 0.7651871120236188, "learning_rate": 1.2438965585213327e-05, "loss": 0.0962, "step": 12603 }, { "epoch": 2.6340647857889237, "grad_norm": 0.9738821983362795, "learning_rate": 1.2437871516087174e-05, "loss": 0.1372, "step": 12604 }, { "epoch": 2.6342737722048066, "grad_norm": 0.9766984539861262, "learning_rate": 1.243677741593525e-05, "loss": 0.1316, "step": 12605 }, { "epoch": 2.6344827586206896, "grad_norm": 1.0373744833090397, "learning_rate": 1.2435683284771486e-05, "loss": 0.1268, "step": 12606 }, { "epoch": 2.6346917450365726, "grad_norm": 0.8473910217403962, "learning_rate": 1.2434589122609804e-05, "loss": 0.1108, "step": 12607 }, { "epoch": 2.6349007314524555, "grad_norm": 0.8953986533441313, "learning_rate": 1.2433494929464124e-05, "loss": 0.114, "step": 12608 }, { "epoch": 2.6351097178683385, "grad_norm": 0.9759820260261749, "learning_rate": 1.2432400705348375e-05, "loss": 0.1009, "step": 12609 }, { "epoch": 2.6353187042842214, "grad_norm": 0.934094957611642, "learning_rate": 1.2431306450276486e-05, "loss": 0.1156, "step": 12610 }, { "epoch": 2.6355276907001044, "grad_norm": 1.0471657683728284, "learning_rate": 1.2430212164262378e-05, "loss": 0.1229, "step": 12611 }, { "epoch": 2.6357366771159874, "grad_norm": 1.0811525107963333, "learning_rate": 1.2429117847319982e-05, "loss": 0.1203, "step": 12612 }, { "epoch": 2.6359456635318703, "grad_norm": 1.0531359229979074, "learning_rate": 1.242802349946322e-05, "loss": 0.1241, "step": 12613 }, { "epoch": 2.6361546499477533, "grad_norm": 0.7870549137952252, "learning_rate": 1.2426929120706022e-05, "loss": 0.1047, "step": 12614 }, { "epoch": 2.6363636363636362, "grad_norm": 0.9244238958334913, "learning_rate": 1.2425834711062316e-05, "loss": 0.1207, "step": 12615 }, { "epoch": 2.636572622779519, "grad_norm": 0.9553975892290745, "learning_rate": 1.242474027054603e-05, "loss": 0.1368, "step": 12616 }, { "epoch": 2.636781609195402, "grad_norm": 0.9786748356314369, "learning_rate": 1.2423645799171089e-05, "loss": 0.109, "step": 12617 }, { "epoch": 2.636990595611285, "grad_norm": 0.9803091970293503, "learning_rate": 1.2422551296951431e-05, "loss": 0.1035, "step": 12618 }, { "epoch": 2.637199582027168, "grad_norm": 1.1369831550301026, "learning_rate": 1.2421456763900974e-05, "loss": 0.1031, "step": 12619 }, { "epoch": 2.637408568443051, "grad_norm": 1.2014897352206877, "learning_rate": 1.2420362200033656e-05, "loss": 0.1051, "step": 12620 }, { "epoch": 2.637617554858934, "grad_norm": 1.0781280836017437, "learning_rate": 1.2419267605363402e-05, "loss": 0.1256, "step": 12621 }, { "epoch": 2.637826541274817, "grad_norm": 1.119782143310098, "learning_rate": 1.2418172979904144e-05, "loss": 0.1257, "step": 12622 }, { "epoch": 2.6380355276907004, "grad_norm": 0.8023817994866513, "learning_rate": 1.2417078323669814e-05, "loss": 0.0993, "step": 12623 }, { "epoch": 2.6382445141065833, "grad_norm": 1.048865520536612, "learning_rate": 1.2415983636674346e-05, "loss": 0.1282, "step": 12624 }, { "epoch": 2.6384535005224663, "grad_norm": 0.8658952050218856, "learning_rate": 1.2414888918931663e-05, "loss": 0.1022, "step": 12625 }, { "epoch": 2.6386624869383493, "grad_norm": 0.7998725032973673, "learning_rate": 1.2413794170455704e-05, "loss": 0.0937, "step": 12626 }, { "epoch": 2.6388714733542322, "grad_norm": 0.785413660248956, "learning_rate": 1.2412699391260398e-05, "loss": 0.0944, "step": 12627 }, { "epoch": 2.639080459770115, "grad_norm": 0.9390123763362747, "learning_rate": 1.2411604581359682e-05, "loss": 0.1176, "step": 12628 }, { "epoch": 2.639289446185998, "grad_norm": 1.0852926915639984, "learning_rate": 1.2410509740767484e-05, "loss": 0.1234, "step": 12629 }, { "epoch": 2.639498432601881, "grad_norm": 0.988415813895408, "learning_rate": 1.2409414869497742e-05, "loss": 0.1054, "step": 12630 }, { "epoch": 2.639707419017764, "grad_norm": 0.8870890446224358, "learning_rate": 1.2408319967564389e-05, "loss": 0.1122, "step": 12631 }, { "epoch": 2.639916405433647, "grad_norm": 0.9258836145365108, "learning_rate": 1.2407225034981357e-05, "loss": 0.1069, "step": 12632 }, { "epoch": 2.64012539184953, "grad_norm": 1.2336448450627717, "learning_rate": 1.240613007176258e-05, "loss": 0.1239, "step": 12633 }, { "epoch": 2.640334378265413, "grad_norm": 1.147774088890727, "learning_rate": 1.2405035077921996e-05, "loss": 0.1307, "step": 12634 }, { "epoch": 2.640543364681296, "grad_norm": 0.8188248861326313, "learning_rate": 1.240394005347354e-05, "loss": 0.1075, "step": 12635 }, { "epoch": 2.640752351097179, "grad_norm": 0.8906476092752146, "learning_rate": 1.2402844998431152e-05, "loss": 0.1054, "step": 12636 }, { "epoch": 2.640961337513062, "grad_norm": 0.9656667592915641, "learning_rate": 1.2401749912808758e-05, "loss": 0.1065, "step": 12637 }, { "epoch": 2.641170323928945, "grad_norm": 0.970115825926113, "learning_rate": 1.2400654796620305e-05, "loss": 0.1048, "step": 12638 }, { "epoch": 2.6413793103448278, "grad_norm": 1.2213525299011503, "learning_rate": 1.2399559649879722e-05, "loss": 0.1283, "step": 12639 }, { "epoch": 2.6415882967607107, "grad_norm": 0.9091431131454405, "learning_rate": 1.2398464472600954e-05, "loss": 0.1068, "step": 12640 }, { "epoch": 2.6417972831765937, "grad_norm": 0.8843780841577665, "learning_rate": 1.2397369264797933e-05, "loss": 0.1026, "step": 12641 }, { "epoch": 2.6420062695924766, "grad_norm": 1.050049992620006, "learning_rate": 1.23962740264846e-05, "loss": 0.1438, "step": 12642 }, { "epoch": 2.6422152560083596, "grad_norm": 1.0341739860285117, "learning_rate": 1.2395178757674892e-05, "loss": 0.1222, "step": 12643 }, { "epoch": 2.6424242424242426, "grad_norm": 1.010354206824368, "learning_rate": 1.2394083458382751e-05, "loss": 0.1253, "step": 12644 }, { "epoch": 2.6426332288401255, "grad_norm": 0.9030186146807878, "learning_rate": 1.2392988128622114e-05, "loss": 0.1221, "step": 12645 }, { "epoch": 2.6428422152560085, "grad_norm": 1.0334968594039853, "learning_rate": 1.239189276840692e-05, "loss": 0.1287, "step": 12646 }, { "epoch": 2.6430512016718914, "grad_norm": 0.8606822252435921, "learning_rate": 1.239079737775111e-05, "loss": 0.1027, "step": 12647 }, { "epoch": 2.6432601880877744, "grad_norm": 1.110031750880209, "learning_rate": 1.2389701956668627e-05, "loss": 0.1349, "step": 12648 }, { "epoch": 2.6434691745036574, "grad_norm": 0.8024755268860145, "learning_rate": 1.2388606505173411e-05, "loss": 0.0977, "step": 12649 }, { "epoch": 2.6436781609195403, "grad_norm": 0.8563220819155118, "learning_rate": 1.23875110232794e-05, "loss": 0.1105, "step": 12650 }, { "epoch": 2.6438871473354233, "grad_norm": 1.0328409561516345, "learning_rate": 1.238641551100054e-05, "loss": 0.124, "step": 12651 }, { "epoch": 2.6440961337513063, "grad_norm": 1.051271782739285, "learning_rate": 1.2385319968350766e-05, "loss": 0.1366, "step": 12652 }, { "epoch": 2.644305120167189, "grad_norm": 0.9509145518089754, "learning_rate": 1.238422439534403e-05, "loss": 0.1198, "step": 12653 }, { "epoch": 2.644514106583072, "grad_norm": 0.8448064294014394, "learning_rate": 1.2383128791994273e-05, "loss": 0.1124, "step": 12654 }, { "epoch": 2.644723092998955, "grad_norm": 0.8921606914649711, "learning_rate": 1.2382033158315431e-05, "loss": 0.1168, "step": 12655 }, { "epoch": 2.644932079414838, "grad_norm": 1.1094170737855389, "learning_rate": 1.2380937494321455e-05, "loss": 0.1351, "step": 12656 }, { "epoch": 2.645141065830721, "grad_norm": 1.062381260045799, "learning_rate": 1.2379841800026288e-05, "loss": 0.1234, "step": 12657 }, { "epoch": 2.645350052246604, "grad_norm": 0.7162479117990261, "learning_rate": 1.2378746075443867e-05, "loss": 0.0949, "step": 12658 }, { "epoch": 2.645559038662487, "grad_norm": 0.9825078184888382, "learning_rate": 1.237765032058815e-05, "loss": 0.1218, "step": 12659 }, { "epoch": 2.64576802507837, "grad_norm": 1.107335828421943, "learning_rate": 1.2376554535473074e-05, "loss": 0.1403, "step": 12660 }, { "epoch": 2.645977011494253, "grad_norm": 0.8147910642683802, "learning_rate": 1.2375458720112584e-05, "loss": 0.1322, "step": 12661 }, { "epoch": 2.646185997910136, "grad_norm": 0.9693002362084208, "learning_rate": 1.2374362874520624e-05, "loss": 0.1208, "step": 12662 }, { "epoch": 2.646394984326019, "grad_norm": 1.0108581098217733, "learning_rate": 1.2373266998711152e-05, "loss": 0.1022, "step": 12663 }, { "epoch": 2.646603970741902, "grad_norm": 0.923946768759688, "learning_rate": 1.23721710926981e-05, "loss": 0.1269, "step": 12664 }, { "epoch": 2.6468129571577848, "grad_norm": 0.9238836267718654, "learning_rate": 1.2371075156495425e-05, "loss": 0.1308, "step": 12665 }, { "epoch": 2.6470219435736677, "grad_norm": 1.1731347993119328, "learning_rate": 1.2369979190117071e-05, "loss": 0.1314, "step": 12666 }, { "epoch": 2.6472309299895507, "grad_norm": 0.9926329237076079, "learning_rate": 1.2368883193576986e-05, "loss": 0.1189, "step": 12667 }, { "epoch": 2.6474399164054336, "grad_norm": 1.0910840494612222, "learning_rate": 1.2367787166889118e-05, "loss": 0.1359, "step": 12668 }, { "epoch": 2.6476489028213166, "grad_norm": 0.9185954906743726, "learning_rate": 1.2366691110067419e-05, "loss": 0.1099, "step": 12669 }, { "epoch": 2.6478578892371996, "grad_norm": 0.9963293450324957, "learning_rate": 1.2365595023125834e-05, "loss": 0.1382, "step": 12670 }, { "epoch": 2.6480668756530825, "grad_norm": 0.8835328329242464, "learning_rate": 1.2364498906078312e-05, "loss": 0.1217, "step": 12671 }, { "epoch": 2.6482758620689655, "grad_norm": 1.2666665957522543, "learning_rate": 1.2363402758938807e-05, "loss": 0.1392, "step": 12672 }, { "epoch": 2.6484848484848484, "grad_norm": 0.9699369450612865, "learning_rate": 1.2362306581721266e-05, "loss": 0.1144, "step": 12673 }, { "epoch": 2.6486938349007314, "grad_norm": 0.786555995735982, "learning_rate": 1.236121037443964e-05, "loss": 0.0865, "step": 12674 }, { "epoch": 2.6489028213166144, "grad_norm": 1.028995283204778, "learning_rate": 1.2360114137107884e-05, "loss": 0.1357, "step": 12675 }, { "epoch": 2.6491118077324973, "grad_norm": 1.000648014519195, "learning_rate": 1.235901786973994e-05, "loss": 0.1266, "step": 12676 }, { "epoch": 2.6493207941483803, "grad_norm": 0.9015610329585325, "learning_rate": 1.235792157234977e-05, "loss": 0.1074, "step": 12677 }, { "epoch": 2.6495297805642632, "grad_norm": 1.057889016091716, "learning_rate": 1.2356825244951318e-05, "loss": 0.1463, "step": 12678 }, { "epoch": 2.649738766980146, "grad_norm": 1.084937341440436, "learning_rate": 1.2355728887558542e-05, "loss": 0.1004, "step": 12679 }, { "epoch": 2.649947753396029, "grad_norm": 0.7315077158175245, "learning_rate": 1.2354632500185391e-05, "loss": 0.1007, "step": 12680 }, { "epoch": 2.650156739811912, "grad_norm": 1.1126857293003654, "learning_rate": 1.2353536082845826e-05, "loss": 0.1492, "step": 12681 }, { "epoch": 2.650365726227795, "grad_norm": 0.9779557404914974, "learning_rate": 1.235243963555379e-05, "loss": 0.1382, "step": 12682 }, { "epoch": 2.650574712643678, "grad_norm": 0.9468922294089225, "learning_rate": 1.2351343158323242e-05, "loss": 0.1175, "step": 12683 }, { "epoch": 2.650783699059561, "grad_norm": 0.9267217134455734, "learning_rate": 1.2350246651168137e-05, "loss": 0.1188, "step": 12684 }, { "epoch": 2.650992685475444, "grad_norm": 0.858025439859828, "learning_rate": 1.2349150114102428e-05, "loss": 0.122, "step": 12685 }, { "epoch": 2.651201671891327, "grad_norm": 1.0280233266524614, "learning_rate": 1.2348053547140073e-05, "loss": 0.1499, "step": 12686 }, { "epoch": 2.65141065830721, "grad_norm": 0.8692988278718764, "learning_rate": 1.2346956950295029e-05, "loss": 0.1227, "step": 12687 }, { "epoch": 2.651619644723093, "grad_norm": 0.9716440602692248, "learning_rate": 1.2345860323581242e-05, "loss": 0.1226, "step": 12688 }, { "epoch": 2.651828631138976, "grad_norm": 1.172513680832859, "learning_rate": 1.234476366701268e-05, "loss": 0.1401, "step": 12689 }, { "epoch": 2.652037617554859, "grad_norm": 0.8513127347611061, "learning_rate": 1.2343666980603293e-05, "loss": 0.1335, "step": 12690 }, { "epoch": 2.6522466039707417, "grad_norm": 0.8815172280280761, "learning_rate": 1.2342570264367037e-05, "loss": 0.1175, "step": 12691 }, { "epoch": 2.6524555903866247, "grad_norm": 1.0360937066527505, "learning_rate": 1.2341473518317875e-05, "loss": 0.1252, "step": 12692 }, { "epoch": 2.6526645768025077, "grad_norm": 0.819025623016249, "learning_rate": 1.2340376742469765e-05, "loss": 0.1033, "step": 12693 }, { "epoch": 2.6528735632183906, "grad_norm": 1.0425793520327802, "learning_rate": 1.2339279936836659e-05, "loss": 0.1311, "step": 12694 }, { "epoch": 2.6530825496342736, "grad_norm": 1.153542265306797, "learning_rate": 1.2338183101432519e-05, "loss": 0.1104, "step": 12695 }, { "epoch": 2.6532915360501566, "grad_norm": 1.0485097892143156, "learning_rate": 1.2337086236271303e-05, "loss": 0.1236, "step": 12696 }, { "epoch": 2.6535005224660395, "grad_norm": 0.8126666233645662, "learning_rate": 1.2335989341366971e-05, "loss": 0.0967, "step": 12697 }, { "epoch": 2.6537095088819225, "grad_norm": 0.8287562551814842, "learning_rate": 1.2334892416733485e-05, "loss": 0.1078, "step": 12698 }, { "epoch": 2.6539184952978054, "grad_norm": 0.8730129226585147, "learning_rate": 1.2333795462384805e-05, "loss": 0.1115, "step": 12699 }, { "epoch": 2.6541274817136884, "grad_norm": 0.8537748116419012, "learning_rate": 1.2332698478334885e-05, "loss": 0.1201, "step": 12700 }, { "epoch": 2.6543364681295714, "grad_norm": 0.9982679011118728, "learning_rate": 1.2331601464597694e-05, "loss": 0.1376, "step": 12701 }, { "epoch": 2.6545454545454543, "grad_norm": 1.1640640018297161, "learning_rate": 1.2330504421187186e-05, "loss": 0.1288, "step": 12702 }, { "epoch": 2.6547544409613373, "grad_norm": 0.939032801425362, "learning_rate": 1.2329407348117328e-05, "loss": 0.1157, "step": 12703 }, { "epoch": 2.6549634273772202, "grad_norm": 0.904054062517017, "learning_rate": 1.232831024540208e-05, "loss": 0.1344, "step": 12704 }, { "epoch": 2.655172413793103, "grad_norm": 1.10851507614272, "learning_rate": 1.2327213113055407e-05, "loss": 0.1162, "step": 12705 }, { "epoch": 2.655381400208986, "grad_norm": 1.027502365574941, "learning_rate": 1.2326115951091267e-05, "loss": 0.0988, "step": 12706 }, { "epoch": 2.655590386624869, "grad_norm": 0.8819432332129248, "learning_rate": 1.2325018759523624e-05, "loss": 0.1018, "step": 12707 }, { "epoch": 2.655799373040752, "grad_norm": 1.0451017933598556, "learning_rate": 1.2323921538366443e-05, "loss": 0.1102, "step": 12708 }, { "epoch": 2.656008359456635, "grad_norm": 0.8795393667621694, "learning_rate": 1.232282428763369e-05, "loss": 0.1088, "step": 12709 }, { "epoch": 2.656217345872518, "grad_norm": 0.8517367881344827, "learning_rate": 1.2321727007339325e-05, "loss": 0.1204, "step": 12710 }, { "epoch": 2.6564263322884014, "grad_norm": 0.9471902030282829, "learning_rate": 1.2320629697497317e-05, "loss": 0.1163, "step": 12711 }, { "epoch": 2.6566353187042844, "grad_norm": 1.0146186378665942, "learning_rate": 1.2319532358121626e-05, "loss": 0.1461, "step": 12712 }, { "epoch": 2.6568443051201673, "grad_norm": 0.8383713448165577, "learning_rate": 1.2318434989226222e-05, "loss": 0.1024, "step": 12713 }, { "epoch": 2.6570532915360503, "grad_norm": 0.9319142672878932, "learning_rate": 1.2317337590825069e-05, "loss": 0.1175, "step": 12714 }, { "epoch": 2.6572622779519333, "grad_norm": 0.8330541145057969, "learning_rate": 1.2316240162932132e-05, "loss": 0.0849, "step": 12715 }, { "epoch": 2.657471264367816, "grad_norm": 0.9586053100012494, "learning_rate": 1.2315142705561379e-05, "loss": 0.1184, "step": 12716 }, { "epoch": 2.657680250783699, "grad_norm": 0.9757459443911375, "learning_rate": 1.2314045218726776e-05, "loss": 0.1084, "step": 12717 }, { "epoch": 2.657889237199582, "grad_norm": 1.0266741718779344, "learning_rate": 1.2312947702442291e-05, "loss": 0.1091, "step": 12718 }, { "epoch": 2.658098223615465, "grad_norm": 0.8585649148420736, "learning_rate": 1.231185015672189e-05, "loss": 0.1094, "step": 12719 }, { "epoch": 2.658307210031348, "grad_norm": 1.0179015027749199, "learning_rate": 1.2310752581579543e-05, "loss": 0.1347, "step": 12720 }, { "epoch": 2.658516196447231, "grad_norm": 1.0493757661843346, "learning_rate": 1.2309654977029216e-05, "loss": 0.1358, "step": 12721 }, { "epoch": 2.658725182863114, "grad_norm": 0.844531773301831, "learning_rate": 1.2308557343084881e-05, "loss": 0.1022, "step": 12722 }, { "epoch": 2.658934169278997, "grad_norm": 1.0728947877002708, "learning_rate": 1.2307459679760505e-05, "loss": 0.1118, "step": 12723 }, { "epoch": 2.65914315569488, "grad_norm": 0.9865575965406402, "learning_rate": 1.2306361987070056e-05, "loss": 0.1256, "step": 12724 }, { "epoch": 2.659352142110763, "grad_norm": 1.0566503951426949, "learning_rate": 1.2305264265027507e-05, "loss": 0.1166, "step": 12725 }, { "epoch": 2.659561128526646, "grad_norm": 0.8575424459606924, "learning_rate": 1.230416651364683e-05, "loss": 0.1079, "step": 12726 }, { "epoch": 2.659770114942529, "grad_norm": 1.1085281737841166, "learning_rate": 1.230306873294199e-05, "loss": 0.1096, "step": 12727 }, { "epoch": 2.6599791013584118, "grad_norm": 1.1418794861813575, "learning_rate": 1.2301970922926962e-05, "loss": 0.1469, "step": 12728 }, { "epoch": 2.6601880877742947, "grad_norm": 0.9788782736168807, "learning_rate": 1.2300873083615717e-05, "loss": 0.1324, "step": 12729 }, { "epoch": 2.6603970741901777, "grad_norm": 1.1539280374634724, "learning_rate": 1.2299775215022223e-05, "loss": 0.1382, "step": 12730 }, { "epoch": 2.6606060606060606, "grad_norm": 1.040610871375703, "learning_rate": 1.2298677317160454e-05, "loss": 0.1282, "step": 12731 }, { "epoch": 2.6608150470219436, "grad_norm": 0.8937176158998974, "learning_rate": 1.2297579390044388e-05, "loss": 0.1198, "step": 12732 }, { "epoch": 2.6610240334378266, "grad_norm": 0.8872852971352777, "learning_rate": 1.2296481433687987e-05, "loss": 0.1169, "step": 12733 }, { "epoch": 2.6612330198537095, "grad_norm": 1.1695943787862397, "learning_rate": 1.2295383448105236e-05, "loss": 0.125, "step": 12734 }, { "epoch": 2.6614420062695925, "grad_norm": 0.8452411784010521, "learning_rate": 1.22942854333101e-05, "loss": 0.1012, "step": 12735 }, { "epoch": 2.6616509926854754, "grad_norm": 1.2664291997433355, "learning_rate": 1.2293187389316558e-05, "loss": 0.1388, "step": 12736 }, { "epoch": 2.6618599791013584, "grad_norm": 1.1290496812460769, "learning_rate": 1.229208931613858e-05, "loss": 0.1482, "step": 12737 }, { "epoch": 2.6620689655172414, "grad_norm": 1.0025960635015638, "learning_rate": 1.2290991213790147e-05, "loss": 0.1215, "step": 12738 }, { "epoch": 2.6622779519331243, "grad_norm": 0.9931764726260016, "learning_rate": 1.2289893082285229e-05, "loss": 0.1289, "step": 12739 }, { "epoch": 2.6624869383490073, "grad_norm": 0.9564364940508481, "learning_rate": 1.2288794921637802e-05, "loss": 0.1028, "step": 12740 }, { "epoch": 2.6626959247648903, "grad_norm": 1.1464378039366492, "learning_rate": 1.228769673186184e-05, "loss": 0.1483, "step": 12741 }, { "epoch": 2.662904911180773, "grad_norm": 0.9367133203455923, "learning_rate": 1.2286598512971323e-05, "loss": 0.1194, "step": 12742 }, { "epoch": 2.663113897596656, "grad_norm": 0.9576381939860263, "learning_rate": 1.2285500264980229e-05, "loss": 0.1298, "step": 12743 }, { "epoch": 2.663322884012539, "grad_norm": 1.2545282550882, "learning_rate": 1.228440198790253e-05, "loss": 0.1217, "step": 12744 }, { "epoch": 2.663531870428422, "grad_norm": 0.9170550699755713, "learning_rate": 1.2283303681752206e-05, "loss": 0.1117, "step": 12745 }, { "epoch": 2.663740856844305, "grad_norm": 0.8812552707527487, "learning_rate": 1.2282205346543231e-05, "loss": 0.1155, "step": 12746 }, { "epoch": 2.663949843260188, "grad_norm": 0.8461931470708197, "learning_rate": 1.228110698228959e-05, "loss": 0.0947, "step": 12747 }, { "epoch": 2.664158829676071, "grad_norm": 0.9485085450067599, "learning_rate": 1.2280008589005257e-05, "loss": 0.134, "step": 12748 }, { "epoch": 2.664367816091954, "grad_norm": 0.8324923681871397, "learning_rate": 1.227891016670421e-05, "loss": 0.0998, "step": 12749 }, { "epoch": 2.664576802507837, "grad_norm": 1.0448284727717487, "learning_rate": 1.2277811715400434e-05, "loss": 0.1172, "step": 12750 }, { "epoch": 2.66478578892372, "grad_norm": 0.8038824501172026, "learning_rate": 1.22767132351079e-05, "loss": 0.1051, "step": 12751 }, { "epoch": 2.664994775339603, "grad_norm": 0.7326503327191127, "learning_rate": 1.2275614725840593e-05, "loss": 0.0809, "step": 12752 }, { "epoch": 2.665203761755486, "grad_norm": 1.0769437565736049, "learning_rate": 1.2274516187612492e-05, "loss": 0.1465, "step": 12753 }, { "epoch": 2.6654127481713688, "grad_norm": 0.9837750791146029, "learning_rate": 1.2273417620437578e-05, "loss": 0.1325, "step": 12754 }, { "epoch": 2.6656217345872517, "grad_norm": 0.9454946299134875, "learning_rate": 1.227231902432983e-05, "loss": 0.1093, "step": 12755 }, { "epoch": 2.6658307210031347, "grad_norm": 1.098536758748992, "learning_rate": 1.2271220399303236e-05, "loss": 0.1352, "step": 12756 }, { "epoch": 2.6660397074190176, "grad_norm": 0.8529706628943613, "learning_rate": 1.227012174537177e-05, "loss": 0.1051, "step": 12757 }, { "epoch": 2.6662486938349006, "grad_norm": 1.2710454454071884, "learning_rate": 1.226902306254942e-05, "loss": 0.1426, "step": 12758 }, { "epoch": 2.6664576802507836, "grad_norm": 0.9731830456966972, "learning_rate": 1.226792435085016e-05, "loss": 0.1253, "step": 12759 }, { "epoch": 2.6666666666666665, "grad_norm": 0.9968083690111127, "learning_rate": 1.2266825610287983e-05, "loss": 0.1286, "step": 12760 }, { "epoch": 2.6668756530825495, "grad_norm": 0.9596140387936076, "learning_rate": 1.2265726840876866e-05, "loss": 0.1228, "step": 12761 }, { "epoch": 2.6670846394984324, "grad_norm": 1.18379644848786, "learning_rate": 1.2264628042630795e-05, "loss": 0.1401, "step": 12762 }, { "epoch": 2.667293625914316, "grad_norm": 0.9719250880437247, "learning_rate": 1.2263529215563754e-05, "loss": 0.1129, "step": 12763 }, { "epoch": 2.667502612330199, "grad_norm": 1.1173692176619352, "learning_rate": 1.2262430359689726e-05, "loss": 0.1333, "step": 12764 }, { "epoch": 2.6677115987460818, "grad_norm": 0.9752914979234493, "learning_rate": 1.2261331475022696e-05, "loss": 0.1312, "step": 12765 }, { "epoch": 2.6679205851619647, "grad_norm": 1.2060436559152998, "learning_rate": 1.2260232561576646e-05, "loss": 0.1492, "step": 12766 }, { "epoch": 2.6681295715778477, "grad_norm": 0.9961209091428875, "learning_rate": 1.2259133619365568e-05, "loss": 0.1191, "step": 12767 }, { "epoch": 2.6683385579937307, "grad_norm": 0.9511318868280303, "learning_rate": 1.2258034648403445e-05, "loss": 0.1189, "step": 12768 }, { "epoch": 2.6685475444096136, "grad_norm": 0.8802399163768961, "learning_rate": 1.2256935648704262e-05, "loss": 0.1239, "step": 12769 }, { "epoch": 2.6687565308254966, "grad_norm": 1.1152338765057996, "learning_rate": 1.2255836620282006e-05, "loss": 0.136, "step": 12770 }, { "epoch": 2.6689655172413795, "grad_norm": 1.1140675744521242, "learning_rate": 1.2254737563150662e-05, "loss": 0.1555, "step": 12771 }, { "epoch": 2.6691745036572625, "grad_norm": 0.9462072418229287, "learning_rate": 1.2253638477324221e-05, "loss": 0.1121, "step": 12772 }, { "epoch": 2.6693834900731455, "grad_norm": 0.9730712285926553, "learning_rate": 1.225253936281667e-05, "loss": 0.1301, "step": 12773 }, { "epoch": 2.6695924764890284, "grad_norm": 0.8061568014726838, "learning_rate": 1.2251440219641995e-05, "loss": 0.1154, "step": 12774 }, { "epoch": 2.6698014629049114, "grad_norm": 1.1819340036161083, "learning_rate": 1.2250341047814186e-05, "loss": 0.1459, "step": 12775 }, { "epoch": 2.6700104493207943, "grad_norm": 1.1043627099195454, "learning_rate": 1.2249241847347229e-05, "loss": 0.1571, "step": 12776 }, { "epoch": 2.6702194357366773, "grad_norm": 1.1339596339156595, "learning_rate": 1.2248142618255115e-05, "loss": 0.1393, "step": 12777 }, { "epoch": 2.6704284221525603, "grad_norm": 0.9408908938336543, "learning_rate": 1.2247043360551831e-05, "loss": 0.117, "step": 12778 }, { "epoch": 2.6706374085684432, "grad_norm": 0.873802702078326, "learning_rate": 1.2245944074251372e-05, "loss": 0.1169, "step": 12779 }, { "epoch": 2.670846394984326, "grad_norm": 0.9338423949755336, "learning_rate": 1.2244844759367726e-05, "loss": 0.1113, "step": 12780 }, { "epoch": 2.671055381400209, "grad_norm": 0.8477201113940693, "learning_rate": 1.2243745415914882e-05, "loss": 0.1194, "step": 12781 }, { "epoch": 2.671264367816092, "grad_norm": 0.8249089538679156, "learning_rate": 1.2242646043906832e-05, "loss": 0.0896, "step": 12782 }, { "epoch": 2.671473354231975, "grad_norm": 0.7894471818515546, "learning_rate": 1.2241546643357565e-05, "loss": 0.1059, "step": 12783 }, { "epoch": 2.671682340647858, "grad_norm": 1.0884603593089832, "learning_rate": 1.2240447214281074e-05, "loss": 0.1407, "step": 12784 }, { "epoch": 2.671891327063741, "grad_norm": 0.9041030585526777, "learning_rate": 1.2239347756691352e-05, "loss": 0.1192, "step": 12785 }, { "epoch": 2.672100313479624, "grad_norm": 1.031269466041572, "learning_rate": 1.2238248270602395e-05, "loss": 0.1241, "step": 12786 }, { "epoch": 2.672309299895507, "grad_norm": 0.8588655296830597, "learning_rate": 1.2237148756028188e-05, "loss": 0.1262, "step": 12787 }, { "epoch": 2.67251828631139, "grad_norm": 0.8882387349809611, "learning_rate": 1.2236049212982728e-05, "loss": 0.1081, "step": 12788 }, { "epoch": 2.672727272727273, "grad_norm": 0.8988162084425935, "learning_rate": 1.2234949641480009e-05, "loss": 0.1237, "step": 12789 }, { "epoch": 2.672936259143156, "grad_norm": 0.8229242988590911, "learning_rate": 1.2233850041534021e-05, "loss": 0.12, "step": 12790 }, { "epoch": 2.6731452455590388, "grad_norm": 1.0845123397773992, "learning_rate": 1.2232750413158764e-05, "loss": 0.1318, "step": 12791 }, { "epoch": 2.6733542319749217, "grad_norm": 0.8494194747413162, "learning_rate": 1.2231650756368228e-05, "loss": 0.1089, "step": 12792 }, { "epoch": 2.6735632183908047, "grad_norm": 1.0905766422881709, "learning_rate": 1.223055107117641e-05, "loss": 0.1235, "step": 12793 }, { "epoch": 2.6737722048066876, "grad_norm": 0.9101472132473089, "learning_rate": 1.2229451357597301e-05, "loss": 0.1172, "step": 12794 }, { "epoch": 2.6739811912225706, "grad_norm": 1.0056971822811127, "learning_rate": 1.2228351615644907e-05, "loss": 0.1211, "step": 12795 }, { "epoch": 2.6741901776384536, "grad_norm": 1.4761526752033904, "learning_rate": 1.222725184533321e-05, "loss": 0.1548, "step": 12796 }, { "epoch": 2.6743991640543365, "grad_norm": 0.9724530154034553, "learning_rate": 1.2226152046676218e-05, "loss": 0.1053, "step": 12797 }, { "epoch": 2.6746081504702195, "grad_norm": 1.0273463935741618, "learning_rate": 1.222505221968792e-05, "loss": 0.1112, "step": 12798 }, { "epoch": 2.6748171368861025, "grad_norm": 0.9071657290068418, "learning_rate": 1.2223952364382319e-05, "loss": 0.101, "step": 12799 }, { "epoch": 2.6750261233019854, "grad_norm": 1.0607337879196075, "learning_rate": 1.2222852480773409e-05, "loss": 0.1138, "step": 12800 }, { "epoch": 2.6752351097178684, "grad_norm": 1.0285469820392652, "learning_rate": 1.2221752568875187e-05, "loss": 0.1218, "step": 12801 }, { "epoch": 2.6754440961337513, "grad_norm": 1.0131499417146577, "learning_rate": 1.222065262870165e-05, "loss": 0.1273, "step": 12802 }, { "epoch": 2.6756530825496343, "grad_norm": 0.8376716999716858, "learning_rate": 1.2219552660266805e-05, "loss": 0.1105, "step": 12803 }, { "epoch": 2.6758620689655173, "grad_norm": 1.1781234530511135, "learning_rate": 1.221845266358464e-05, "loss": 0.1099, "step": 12804 }, { "epoch": 2.6760710553814, "grad_norm": 1.0514192140992773, "learning_rate": 1.2217352638669159e-05, "loss": 0.1196, "step": 12805 }, { "epoch": 2.676280041797283, "grad_norm": 0.934717198292123, "learning_rate": 1.2216252585534365e-05, "loss": 0.1061, "step": 12806 }, { "epoch": 2.676489028213166, "grad_norm": 0.9491260944832985, "learning_rate": 1.2215152504194253e-05, "loss": 0.1144, "step": 12807 }, { "epoch": 2.676698014629049, "grad_norm": 0.9710331821610353, "learning_rate": 1.2214052394662822e-05, "loss": 0.1194, "step": 12808 }, { "epoch": 2.676907001044932, "grad_norm": 0.7837550512994051, "learning_rate": 1.2212952256954079e-05, "loss": 0.0911, "step": 12809 }, { "epoch": 2.677115987460815, "grad_norm": 0.92141463617973, "learning_rate": 1.2211852091082016e-05, "loss": 0.1041, "step": 12810 }, { "epoch": 2.677324973876698, "grad_norm": 1.0315244104171966, "learning_rate": 1.2210751897060644e-05, "loss": 0.1315, "step": 12811 }, { "epoch": 2.677533960292581, "grad_norm": 1.072034324547116, "learning_rate": 1.220965167490396e-05, "loss": 0.1237, "step": 12812 }, { "epoch": 2.677742946708464, "grad_norm": 0.9653261451142888, "learning_rate": 1.2208551424625964e-05, "loss": 0.1296, "step": 12813 }, { "epoch": 2.677951933124347, "grad_norm": 0.9722711623709507, "learning_rate": 1.2207451146240662e-05, "loss": 0.118, "step": 12814 }, { "epoch": 2.67816091954023, "grad_norm": 0.7452218417315232, "learning_rate": 1.2206350839762058e-05, "loss": 0.0831, "step": 12815 }, { "epoch": 2.678369905956113, "grad_norm": 0.93059810271628, "learning_rate": 1.220525050520415e-05, "loss": 0.117, "step": 12816 }, { "epoch": 2.6785788923719958, "grad_norm": 1.0889783532357014, "learning_rate": 1.2204150142580943e-05, "loss": 0.1579, "step": 12817 }, { "epoch": 2.6787878787878787, "grad_norm": 0.9458721549991923, "learning_rate": 1.2203049751906443e-05, "loss": 0.1048, "step": 12818 }, { "epoch": 2.6789968652037617, "grad_norm": 1.1527426114004489, "learning_rate": 1.2201949333194658e-05, "loss": 0.1405, "step": 12819 }, { "epoch": 2.6792058516196446, "grad_norm": 0.9635205334322708, "learning_rate": 1.2200848886459581e-05, "loss": 0.1149, "step": 12820 }, { "epoch": 2.6794148380355276, "grad_norm": 0.9586825527595879, "learning_rate": 1.2199748411715228e-05, "loss": 0.1028, "step": 12821 }, { "epoch": 2.6796238244514106, "grad_norm": 1.2048040926714294, "learning_rate": 1.2198647908975598e-05, "loss": 0.1437, "step": 12822 }, { "epoch": 2.6798328108672935, "grad_norm": 0.8703328156761237, "learning_rate": 1.2197547378254699e-05, "loss": 0.1133, "step": 12823 }, { "epoch": 2.6800417972831765, "grad_norm": 0.9739785418220095, "learning_rate": 1.2196446819566538e-05, "loss": 0.109, "step": 12824 }, { "epoch": 2.6802507836990594, "grad_norm": 0.9983427517083804, "learning_rate": 1.219534623292512e-05, "loss": 0.1289, "step": 12825 }, { "epoch": 2.6804597701149424, "grad_norm": 1.2206317383478196, "learning_rate": 1.2194245618344452e-05, "loss": 0.1435, "step": 12826 }, { "epoch": 2.6806687565308254, "grad_norm": 1.0548748577596072, "learning_rate": 1.2193144975838538e-05, "loss": 0.1369, "step": 12827 }, { "epoch": 2.6808777429467083, "grad_norm": 0.8780384652366776, "learning_rate": 1.2192044305421392e-05, "loss": 0.0954, "step": 12828 }, { "epoch": 2.6810867293625913, "grad_norm": 1.5317618265883879, "learning_rate": 1.2190943607107012e-05, "loss": 0.1505, "step": 12829 }, { "epoch": 2.6812957157784743, "grad_norm": 0.977091936538386, "learning_rate": 1.2189842880909416e-05, "loss": 0.1182, "step": 12830 }, { "epoch": 2.681504702194357, "grad_norm": 0.9634614620093616, "learning_rate": 1.2188742126842612e-05, "loss": 0.1115, "step": 12831 }, { "epoch": 2.68171368861024, "grad_norm": 1.1266585571918393, "learning_rate": 1.2187641344920602e-05, "loss": 0.1489, "step": 12832 }, { "epoch": 2.681922675026123, "grad_norm": 1.0592132418268, "learning_rate": 1.2186540535157399e-05, "loss": 0.1124, "step": 12833 }, { "epoch": 2.682131661442006, "grad_norm": 0.8616487974966986, "learning_rate": 1.218543969756701e-05, "loss": 0.1171, "step": 12834 }, { "epoch": 2.682340647857889, "grad_norm": 0.8562697412332426, "learning_rate": 1.218433883216345e-05, "loss": 0.1274, "step": 12835 }, { "epoch": 2.682549634273772, "grad_norm": 1.080817323876312, "learning_rate": 1.2183237938960723e-05, "loss": 0.1541, "step": 12836 }, { "epoch": 2.682758620689655, "grad_norm": 1.0528622773892242, "learning_rate": 1.2182137017972848e-05, "loss": 0.1178, "step": 12837 }, { "epoch": 2.682967607105538, "grad_norm": 1.219919634502423, "learning_rate": 1.2181036069213828e-05, "loss": 0.1499, "step": 12838 }, { "epoch": 2.683176593521421, "grad_norm": 0.882252788765192, "learning_rate": 1.2179935092697677e-05, "loss": 0.1048, "step": 12839 }, { "epoch": 2.683385579937304, "grad_norm": 0.9369846694702272, "learning_rate": 1.2178834088438404e-05, "loss": 0.1041, "step": 12840 }, { "epoch": 2.683594566353187, "grad_norm": 0.8991342675556669, "learning_rate": 1.2177733056450027e-05, "loss": 0.122, "step": 12841 }, { "epoch": 2.68380355276907, "grad_norm": 0.9085541720594535, "learning_rate": 1.2176631996746555e-05, "loss": 0.1216, "step": 12842 }, { "epoch": 2.6840125391849528, "grad_norm": 0.9176131578917867, "learning_rate": 1.2175530909342003e-05, "loss": 0.1028, "step": 12843 }, { "epoch": 2.6842215256008357, "grad_norm": 1.1052209451104076, "learning_rate": 1.217442979425038e-05, "loss": 0.1373, "step": 12844 }, { "epoch": 2.6844305120167187, "grad_norm": 0.95579241502252, "learning_rate": 1.21733286514857e-05, "loss": 0.1358, "step": 12845 }, { "epoch": 2.6846394984326016, "grad_norm": 0.8613066964781668, "learning_rate": 1.217222748106198e-05, "loss": 0.1011, "step": 12846 }, { "epoch": 2.6848484848484846, "grad_norm": 0.8067809856995904, "learning_rate": 1.2171126282993234e-05, "loss": 0.1073, "step": 12847 }, { "epoch": 2.6850574712643676, "grad_norm": 0.8079585034336239, "learning_rate": 1.2170025057293472e-05, "loss": 0.1012, "step": 12848 }, { "epoch": 2.6852664576802505, "grad_norm": 0.8511871715231212, "learning_rate": 1.2168923803976713e-05, "loss": 0.1022, "step": 12849 }, { "epoch": 2.6854754440961335, "grad_norm": 0.9337961317859802, "learning_rate": 1.2167822523056971e-05, "loss": 0.1168, "step": 12850 }, { "epoch": 2.685684430512017, "grad_norm": 0.9626725886048635, "learning_rate": 1.216672121454826e-05, "loss": 0.115, "step": 12851 }, { "epoch": 2.6858934169279, "grad_norm": 0.9010958819155174, "learning_rate": 1.2165619878464598e-05, "loss": 0.1233, "step": 12852 }, { "epoch": 2.686102403343783, "grad_norm": 0.8239333218552474, "learning_rate": 1.2164518514820001e-05, "loss": 0.1152, "step": 12853 }, { "epoch": 2.6863113897596658, "grad_norm": 0.9348465054738746, "learning_rate": 1.2163417123628485e-05, "loss": 0.1017, "step": 12854 }, { "epoch": 2.6865203761755487, "grad_norm": 0.8318386607733241, "learning_rate": 1.2162315704904068e-05, "loss": 0.1062, "step": 12855 }, { "epoch": 2.6867293625914317, "grad_norm": 0.9278596315345632, "learning_rate": 1.2161214258660767e-05, "loss": 0.1219, "step": 12856 }, { "epoch": 2.6869383490073147, "grad_norm": 0.9601386773797845, "learning_rate": 1.2160112784912596e-05, "loss": 0.1275, "step": 12857 }, { "epoch": 2.6871473354231976, "grad_norm": 0.8605371768449792, "learning_rate": 1.2159011283673579e-05, "loss": 0.1118, "step": 12858 }, { "epoch": 2.6873563218390806, "grad_norm": 1.1534433656043444, "learning_rate": 1.2157909754957729e-05, "loss": 0.1287, "step": 12859 }, { "epoch": 2.6875653082549635, "grad_norm": 1.1210639634210253, "learning_rate": 1.2156808198779068e-05, "loss": 0.1191, "step": 12860 }, { "epoch": 2.6877742946708465, "grad_norm": 0.9840173180097181, "learning_rate": 1.2155706615151618e-05, "loss": 0.1113, "step": 12861 }, { "epoch": 2.6879832810867295, "grad_norm": 0.8248873336214022, "learning_rate": 1.2154605004089388e-05, "loss": 0.1123, "step": 12862 }, { "epoch": 2.6881922675026124, "grad_norm": 0.853368480184792, "learning_rate": 1.2153503365606408e-05, "loss": 0.0983, "step": 12863 }, { "epoch": 2.6884012539184954, "grad_norm": 1.248863520526348, "learning_rate": 1.2152401699716694e-05, "loss": 0.1458, "step": 12864 }, { "epoch": 2.6886102403343783, "grad_norm": 0.9518513541703755, "learning_rate": 1.2151300006434264e-05, "loss": 0.1035, "step": 12865 }, { "epoch": 2.6888192267502613, "grad_norm": 1.0360516013378023, "learning_rate": 1.2150198285773147e-05, "loss": 0.109, "step": 12866 }, { "epoch": 2.6890282131661443, "grad_norm": 0.9774366840051999, "learning_rate": 1.2149096537747353e-05, "loss": 0.1279, "step": 12867 }, { "epoch": 2.6892371995820272, "grad_norm": 1.0003202650793475, "learning_rate": 1.2147994762370911e-05, "loss": 0.1226, "step": 12868 }, { "epoch": 2.68944618599791, "grad_norm": 0.8818658606818082, "learning_rate": 1.2146892959657843e-05, "loss": 0.104, "step": 12869 }, { "epoch": 2.689655172413793, "grad_norm": 0.8348021196342172, "learning_rate": 1.214579112962217e-05, "loss": 0.1046, "step": 12870 }, { "epoch": 2.689864158829676, "grad_norm": 0.8943137125461617, "learning_rate": 1.214468927227791e-05, "loss": 0.1141, "step": 12871 }, { "epoch": 2.690073145245559, "grad_norm": 0.8336200539287842, "learning_rate": 1.2143587387639092e-05, "loss": 0.0994, "step": 12872 }, { "epoch": 2.690282131661442, "grad_norm": 1.0221580557464032, "learning_rate": 1.2142485475719736e-05, "loss": 0.143, "step": 12873 }, { "epoch": 2.690491118077325, "grad_norm": 1.1995442619034737, "learning_rate": 1.2141383536533866e-05, "loss": 0.1043, "step": 12874 }, { "epoch": 2.690700104493208, "grad_norm": 0.9242533427055789, "learning_rate": 1.2140281570095508e-05, "loss": 0.1104, "step": 12875 }, { "epoch": 2.690909090909091, "grad_norm": 0.8116774788401053, "learning_rate": 1.2139179576418685e-05, "loss": 0.1107, "step": 12876 }, { "epoch": 2.691118077324974, "grad_norm": 0.8774461088771903, "learning_rate": 1.2138077555517418e-05, "loss": 0.1146, "step": 12877 }, { "epoch": 2.691327063740857, "grad_norm": 1.1440548566206599, "learning_rate": 1.2136975507405737e-05, "loss": 0.1558, "step": 12878 }, { "epoch": 2.69153605015674, "grad_norm": 0.938327315317657, "learning_rate": 1.2135873432097664e-05, "loss": 0.11, "step": 12879 }, { "epoch": 2.6917450365726228, "grad_norm": 1.032632034360042, "learning_rate": 1.2134771329607227e-05, "loss": 0.1199, "step": 12880 }, { "epoch": 2.6919540229885057, "grad_norm": 0.9375250738835691, "learning_rate": 1.2133669199948452e-05, "loss": 0.1125, "step": 12881 }, { "epoch": 2.6921630094043887, "grad_norm": 0.8815021929725616, "learning_rate": 1.2132567043135366e-05, "loss": 0.1101, "step": 12882 }, { "epoch": 2.6923719958202716, "grad_norm": 0.9139755622478121, "learning_rate": 1.2131464859181992e-05, "loss": 0.1192, "step": 12883 }, { "epoch": 2.6925809822361546, "grad_norm": 1.0083044495710698, "learning_rate": 1.2130362648102358e-05, "loss": 0.1153, "step": 12884 }, { "epoch": 2.6927899686520376, "grad_norm": 0.9446967290438473, "learning_rate": 1.2129260409910493e-05, "loss": 0.1199, "step": 12885 }, { "epoch": 2.6929989550679205, "grad_norm": 0.9435903614083141, "learning_rate": 1.2128158144620425e-05, "loss": 0.1291, "step": 12886 }, { "epoch": 2.6932079414838035, "grad_norm": 0.9155603849799008, "learning_rate": 1.212705585224618e-05, "loss": 0.1246, "step": 12887 }, { "epoch": 2.6934169278996865, "grad_norm": 1.054170874807669, "learning_rate": 1.2125953532801793e-05, "loss": 0.1431, "step": 12888 }, { "epoch": 2.6936259143155694, "grad_norm": 0.8761468091107433, "learning_rate": 1.2124851186301282e-05, "loss": 0.1213, "step": 12889 }, { "epoch": 2.6938349007314524, "grad_norm": 1.0252637514036897, "learning_rate": 1.2123748812758684e-05, "loss": 0.1078, "step": 12890 }, { "epoch": 2.6940438871473353, "grad_norm": 0.9183514952995521, "learning_rate": 1.2122646412188024e-05, "loss": 0.1131, "step": 12891 }, { "epoch": 2.6942528735632183, "grad_norm": 1.153090108297703, "learning_rate": 1.2121543984603338e-05, "loss": 0.1013, "step": 12892 }, { "epoch": 2.6944618599791013, "grad_norm": 1.0419948545401463, "learning_rate": 1.2120441530018648e-05, "loss": 0.1285, "step": 12893 }, { "epoch": 2.694670846394984, "grad_norm": 1.0739051920904594, "learning_rate": 1.2119339048447988e-05, "loss": 0.1414, "step": 12894 }, { "epoch": 2.694879832810867, "grad_norm": 0.860356747582496, "learning_rate": 1.2118236539905393e-05, "loss": 0.1215, "step": 12895 }, { "epoch": 2.69508881922675, "grad_norm": 2.099656632383198, "learning_rate": 1.2117134004404887e-05, "loss": 0.1506, "step": 12896 }, { "epoch": 2.695297805642633, "grad_norm": 0.8419404753471786, "learning_rate": 1.2116031441960505e-05, "loss": 0.0998, "step": 12897 }, { "epoch": 2.695506792058516, "grad_norm": 0.8159697620997349, "learning_rate": 1.2114928852586281e-05, "loss": 0.1013, "step": 12898 }, { "epoch": 2.695715778474399, "grad_norm": 1.2768048234962808, "learning_rate": 1.2113826236296245e-05, "loss": 0.1356, "step": 12899 }, { "epoch": 2.695924764890282, "grad_norm": 1.1980489867072737, "learning_rate": 1.211272359310443e-05, "loss": 0.1394, "step": 12900 }, { "epoch": 2.696133751306165, "grad_norm": 1.1060353828146308, "learning_rate": 1.2111620923024865e-05, "loss": 0.1202, "step": 12901 }, { "epoch": 2.696342737722048, "grad_norm": 0.99031983063124, "learning_rate": 1.2110518226071589e-05, "loss": 0.1166, "step": 12902 }, { "epoch": 2.696551724137931, "grad_norm": 0.9825084096126857, "learning_rate": 1.2109415502258632e-05, "loss": 0.1158, "step": 12903 }, { "epoch": 2.6967607105538143, "grad_norm": 1.1319666303583167, "learning_rate": 1.210831275160003e-05, "loss": 0.1574, "step": 12904 }, { "epoch": 2.6969696969696972, "grad_norm": 1.0530562246324775, "learning_rate": 1.2107209974109817e-05, "loss": 0.1447, "step": 12905 }, { "epoch": 2.69717868338558, "grad_norm": 1.0157553857217787, "learning_rate": 1.2106107169802027e-05, "loss": 0.1406, "step": 12906 }, { "epoch": 2.697387669801463, "grad_norm": 0.9547547646422383, "learning_rate": 1.2105004338690694e-05, "loss": 0.1113, "step": 12907 }, { "epoch": 2.697596656217346, "grad_norm": 1.103984891835164, "learning_rate": 1.2103901480789852e-05, "loss": 0.1211, "step": 12908 }, { "epoch": 2.697805642633229, "grad_norm": 0.8490657962441953, "learning_rate": 1.2102798596113541e-05, "loss": 0.1087, "step": 12909 }, { "epoch": 2.698014629049112, "grad_norm": 0.893101061992428, "learning_rate": 1.2101695684675794e-05, "loss": 0.1017, "step": 12910 }, { "epoch": 2.698223615464995, "grad_norm": 1.1727654742093212, "learning_rate": 1.2100592746490648e-05, "loss": 0.1332, "step": 12911 }, { "epoch": 2.698432601880878, "grad_norm": 0.985537724333575, "learning_rate": 1.2099489781572142e-05, "loss": 0.1405, "step": 12912 }, { "epoch": 2.698641588296761, "grad_norm": 1.0684637869284672, "learning_rate": 1.2098386789934308e-05, "loss": 0.1208, "step": 12913 }, { "epoch": 2.698850574712644, "grad_norm": 1.0454915698243814, "learning_rate": 1.2097283771591186e-05, "loss": 0.1462, "step": 12914 }, { "epoch": 2.699059561128527, "grad_norm": 1.0858591455946773, "learning_rate": 1.2096180726556815e-05, "loss": 0.1271, "step": 12915 }, { "epoch": 2.69926854754441, "grad_norm": 0.8328728108861506, "learning_rate": 1.2095077654845228e-05, "loss": 0.1093, "step": 12916 }, { "epoch": 2.6994775339602928, "grad_norm": 1.1225124195100282, "learning_rate": 1.2093974556470472e-05, "loss": 0.1356, "step": 12917 }, { "epoch": 2.6996865203761757, "grad_norm": 1.0867553933594816, "learning_rate": 1.2092871431446577e-05, "loss": 0.1369, "step": 12918 }, { "epoch": 2.6998955067920587, "grad_norm": 0.8649995323507188, "learning_rate": 1.2091768279787587e-05, "loss": 0.1141, "step": 12919 }, { "epoch": 2.7001044932079417, "grad_norm": 1.0816962259639233, "learning_rate": 1.209066510150754e-05, "loss": 0.1256, "step": 12920 }, { "epoch": 2.7003134796238246, "grad_norm": 0.9889082767217748, "learning_rate": 1.2089561896620474e-05, "loss": 0.1043, "step": 12921 }, { "epoch": 2.7005224660397076, "grad_norm": 1.0217509270137108, "learning_rate": 1.208845866514043e-05, "loss": 0.1234, "step": 12922 }, { "epoch": 2.7007314524555905, "grad_norm": 0.9057183259347759, "learning_rate": 1.2087355407081452e-05, "loss": 0.1232, "step": 12923 }, { "epoch": 2.7009404388714735, "grad_norm": 1.1070763888668893, "learning_rate": 1.2086252122457576e-05, "loss": 0.1295, "step": 12924 }, { "epoch": 2.7011494252873565, "grad_norm": 1.0146133867253768, "learning_rate": 1.2085148811282844e-05, "loss": 0.1322, "step": 12925 }, { "epoch": 2.7013584117032394, "grad_norm": 0.8828968687697242, "learning_rate": 1.2084045473571299e-05, "loss": 0.093, "step": 12926 }, { "epoch": 2.7015673981191224, "grad_norm": 1.0451918911057607, "learning_rate": 1.2082942109336982e-05, "loss": 0.1346, "step": 12927 }, { "epoch": 2.7017763845350053, "grad_norm": 0.8474371882794689, "learning_rate": 1.2081838718593934e-05, "loss": 0.1044, "step": 12928 }, { "epoch": 2.7019853709508883, "grad_norm": 1.0823423361103819, "learning_rate": 1.20807353013562e-05, "loss": 0.1459, "step": 12929 }, { "epoch": 2.7021943573667713, "grad_norm": 0.9718126265893526, "learning_rate": 1.2079631857637819e-05, "loss": 0.1223, "step": 12930 }, { "epoch": 2.7024033437826542, "grad_norm": 1.1235458925794832, "learning_rate": 1.2078528387452838e-05, "loss": 0.1384, "step": 12931 }, { "epoch": 2.702612330198537, "grad_norm": 0.9399353524291394, "learning_rate": 1.2077424890815297e-05, "loss": 0.1168, "step": 12932 }, { "epoch": 2.70282131661442, "grad_norm": 1.0680147630196826, "learning_rate": 1.2076321367739241e-05, "loss": 0.1124, "step": 12933 }, { "epoch": 2.703030303030303, "grad_norm": 1.2086255623477684, "learning_rate": 1.2075217818238713e-05, "loss": 0.1304, "step": 12934 }, { "epoch": 2.703239289446186, "grad_norm": 0.8130802303909903, "learning_rate": 1.207411424232776e-05, "loss": 0.0938, "step": 12935 }, { "epoch": 2.703448275862069, "grad_norm": 0.8976094806955526, "learning_rate": 1.2073010640020428e-05, "loss": 0.1096, "step": 12936 }, { "epoch": 2.703657262277952, "grad_norm": 0.979533300981937, "learning_rate": 1.2071907011330758e-05, "loss": 0.1193, "step": 12937 }, { "epoch": 2.703866248693835, "grad_norm": 1.0914329484748322, "learning_rate": 1.2070803356272792e-05, "loss": 0.1643, "step": 12938 }, { "epoch": 2.704075235109718, "grad_norm": 0.8107459775253325, "learning_rate": 1.2069699674860588e-05, "loss": 0.1047, "step": 12939 }, { "epoch": 2.704284221525601, "grad_norm": 0.920564245401841, "learning_rate": 1.2068595967108178e-05, "loss": 0.127, "step": 12940 }, { "epoch": 2.704493207941484, "grad_norm": 0.8831638790657468, "learning_rate": 1.206749223302962e-05, "loss": 0.0916, "step": 12941 }, { "epoch": 2.704702194357367, "grad_norm": 1.0432821181488743, "learning_rate": 1.2066388472638954e-05, "loss": 0.1364, "step": 12942 }, { "epoch": 2.7049111807732498, "grad_norm": 0.9489848741941613, "learning_rate": 1.2065284685950231e-05, "loss": 0.1209, "step": 12943 }, { "epoch": 2.7051201671891327, "grad_norm": 1.040952137430433, "learning_rate": 1.2064180872977493e-05, "loss": 0.1284, "step": 12944 }, { "epoch": 2.7053291536050157, "grad_norm": 0.8437877288309815, "learning_rate": 1.2063077033734797e-05, "loss": 0.1012, "step": 12945 }, { "epoch": 2.7055381400208987, "grad_norm": 0.7488584928688663, "learning_rate": 1.2061973168236182e-05, "loss": 0.0987, "step": 12946 }, { "epoch": 2.7057471264367816, "grad_norm": 0.7840126555570485, "learning_rate": 1.2060869276495696e-05, "loss": 0.1, "step": 12947 }, { "epoch": 2.7059561128526646, "grad_norm": 1.0276465074895276, "learning_rate": 1.2059765358527395e-05, "loss": 0.1142, "step": 12948 }, { "epoch": 2.7061650992685475, "grad_norm": 0.8270256617979697, "learning_rate": 1.2058661414345325e-05, "loss": 0.1064, "step": 12949 }, { "epoch": 2.7063740856844305, "grad_norm": 0.9157566645795456, "learning_rate": 1.2057557443963534e-05, "loss": 0.1114, "step": 12950 }, { "epoch": 2.7065830721003135, "grad_norm": 0.9769835275282454, "learning_rate": 1.2056453447396078e-05, "loss": 0.1163, "step": 12951 }, { "epoch": 2.7067920585161964, "grad_norm": 1.1184250631634862, "learning_rate": 1.2055349424656995e-05, "loss": 0.1425, "step": 12952 }, { "epoch": 2.7070010449320794, "grad_norm": 0.8266358460669724, "learning_rate": 1.2054245375760345e-05, "loss": 0.1125, "step": 12953 }, { "epoch": 2.7072100313479623, "grad_norm": 1.0996322598707882, "learning_rate": 1.2053141300720175e-05, "loss": 0.1366, "step": 12954 }, { "epoch": 2.7074190177638453, "grad_norm": 1.0160092108634826, "learning_rate": 1.2052037199550537e-05, "loss": 0.1201, "step": 12955 }, { "epoch": 2.7076280041797283, "grad_norm": 0.9980337337383008, "learning_rate": 1.2050933072265482e-05, "loss": 0.1304, "step": 12956 }, { "epoch": 2.7078369905956112, "grad_norm": 0.8315815964949612, "learning_rate": 1.2049828918879067e-05, "loss": 0.102, "step": 12957 }, { "epoch": 2.708045977011494, "grad_norm": 0.8103911604429621, "learning_rate": 1.2048724739405337e-05, "loss": 0.0986, "step": 12958 }, { "epoch": 2.708254963427377, "grad_norm": 1.0097084287844091, "learning_rate": 1.2047620533858347e-05, "loss": 0.123, "step": 12959 }, { "epoch": 2.70846394984326, "grad_norm": 1.060863379591515, "learning_rate": 1.204651630225215e-05, "loss": 0.1222, "step": 12960 }, { "epoch": 2.708672936259143, "grad_norm": 1.1992917582779679, "learning_rate": 1.2045412044600795e-05, "loss": 0.1197, "step": 12961 }, { "epoch": 2.708881922675026, "grad_norm": 0.8273100011095715, "learning_rate": 1.2044307760918343e-05, "loss": 0.1071, "step": 12962 }, { "epoch": 2.709090909090909, "grad_norm": 0.9706825804063665, "learning_rate": 1.2043203451218845e-05, "loss": 0.1169, "step": 12963 }, { "epoch": 2.709299895506792, "grad_norm": 1.0625815797789844, "learning_rate": 1.2042099115516353e-05, "loss": 0.1453, "step": 12964 }, { "epoch": 2.709508881922675, "grad_norm": 0.8399837466299598, "learning_rate": 1.2040994753824919e-05, "loss": 0.1152, "step": 12965 }, { "epoch": 2.709717868338558, "grad_norm": 1.0778025542790466, "learning_rate": 1.2039890366158607e-05, "loss": 0.1179, "step": 12966 }, { "epoch": 2.709926854754441, "grad_norm": 0.9579802517693858, "learning_rate": 1.2038785952531463e-05, "loss": 0.123, "step": 12967 }, { "epoch": 2.710135841170324, "grad_norm": 1.0441822258407223, "learning_rate": 1.2037681512957543e-05, "loss": 0.1272, "step": 12968 }, { "epoch": 2.7103448275862068, "grad_norm": 1.0894359203587822, "learning_rate": 1.2036577047450912e-05, "loss": 0.1276, "step": 12969 }, { "epoch": 2.7105538140020897, "grad_norm": 1.1124572975118197, "learning_rate": 1.2035472556025615e-05, "loss": 0.1318, "step": 12970 }, { "epoch": 2.7107628004179727, "grad_norm": 0.9469057002848116, "learning_rate": 1.2034368038695715e-05, "loss": 0.1416, "step": 12971 }, { "epoch": 2.7109717868338556, "grad_norm": 0.7879681485223848, "learning_rate": 1.2033263495475264e-05, "loss": 0.1007, "step": 12972 }, { "epoch": 2.7111807732497386, "grad_norm": 0.9840550000297228, "learning_rate": 1.2032158926378323e-05, "loss": 0.1275, "step": 12973 }, { "epoch": 2.7113897596656216, "grad_norm": 0.9837611936850436, "learning_rate": 1.203105433141895e-05, "loss": 0.1259, "step": 12974 }, { "epoch": 2.7115987460815045, "grad_norm": 0.9959099970524842, "learning_rate": 1.20299497106112e-05, "loss": 0.1473, "step": 12975 }, { "epoch": 2.7118077324973875, "grad_norm": 0.8515359203009875, "learning_rate": 1.202884506396913e-05, "loss": 0.0975, "step": 12976 }, { "epoch": 2.7120167189132705, "grad_norm": 1.880188729875732, "learning_rate": 1.2027740391506803e-05, "loss": 0.1557, "step": 12977 }, { "epoch": 2.7122257053291534, "grad_norm": 0.8120532633819053, "learning_rate": 1.2026635693238273e-05, "loss": 0.0932, "step": 12978 }, { "epoch": 2.7124346917450364, "grad_norm": 0.9659260616630325, "learning_rate": 1.2025530969177601e-05, "loss": 0.1359, "step": 12979 }, { "epoch": 2.7126436781609193, "grad_norm": 0.8909911371822934, "learning_rate": 1.2024426219338845e-05, "loss": 0.1224, "step": 12980 }, { "epoch": 2.7128526645768023, "grad_norm": 0.9110013593687174, "learning_rate": 1.2023321443736071e-05, "loss": 0.0936, "step": 12981 }, { "epoch": 2.7130616509926853, "grad_norm": 1.1312957092452558, "learning_rate": 1.2022216642383331e-05, "loss": 0.1332, "step": 12982 }, { "epoch": 2.713270637408568, "grad_norm": 1.1220662683757472, "learning_rate": 1.2021111815294688e-05, "loss": 0.1211, "step": 12983 }, { "epoch": 2.713479623824451, "grad_norm": 0.9027946949058866, "learning_rate": 1.2020006962484204e-05, "loss": 0.1015, "step": 12984 }, { "epoch": 2.713688610240334, "grad_norm": 0.9554895150580706, "learning_rate": 1.2018902083965935e-05, "loss": 0.1202, "step": 12985 }, { "epoch": 2.713897596656217, "grad_norm": 0.7428552996026937, "learning_rate": 1.201779717975395e-05, "loss": 0.1168, "step": 12986 }, { "epoch": 2.7141065830721, "grad_norm": 0.7511273441409291, "learning_rate": 1.201669224986231e-05, "loss": 0.101, "step": 12987 }, { "epoch": 2.714315569487983, "grad_norm": 1.0644692258762896, "learning_rate": 1.2015587294305069e-05, "loss": 0.1185, "step": 12988 }, { "epoch": 2.714524555903866, "grad_norm": 0.8572529402435793, "learning_rate": 1.2014482313096295e-05, "loss": 0.1199, "step": 12989 }, { "epoch": 2.714733542319749, "grad_norm": 0.9511715364514619, "learning_rate": 1.201337730625005e-05, "loss": 0.1085, "step": 12990 }, { "epoch": 2.714942528735632, "grad_norm": 1.0638590142702502, "learning_rate": 1.2012272273780398e-05, "loss": 0.1129, "step": 12991 }, { "epoch": 2.7151515151515153, "grad_norm": 1.0417172979683147, "learning_rate": 1.2011167215701403e-05, "loss": 0.1099, "step": 12992 }, { "epoch": 2.7153605015673983, "grad_norm": 0.9946156373365088, "learning_rate": 1.2010062132027125e-05, "loss": 0.1264, "step": 12993 }, { "epoch": 2.7155694879832812, "grad_norm": 1.0107468708490464, "learning_rate": 1.200895702277163e-05, "loss": 0.1349, "step": 12994 }, { "epoch": 2.715778474399164, "grad_norm": 0.9335493929056704, "learning_rate": 1.200785188794898e-05, "loss": 0.0924, "step": 12995 }, { "epoch": 2.715987460815047, "grad_norm": 1.0956160577809533, "learning_rate": 1.2006746727573243e-05, "loss": 0.1355, "step": 12996 }, { "epoch": 2.71619644723093, "grad_norm": 0.9211020821238185, "learning_rate": 1.2005641541658484e-05, "loss": 0.1042, "step": 12997 }, { "epoch": 2.716405433646813, "grad_norm": 0.8770282822102866, "learning_rate": 1.2004536330218763e-05, "loss": 0.1081, "step": 12998 }, { "epoch": 2.716614420062696, "grad_norm": 1.2109982833740365, "learning_rate": 1.2003431093268154e-05, "loss": 0.1384, "step": 12999 }, { "epoch": 2.716823406478579, "grad_norm": 0.8427519338345745, "learning_rate": 1.2002325830820715e-05, "loss": 0.122, "step": 13000 }, { "epoch": 2.717032392894462, "grad_norm": 1.1412839605161944, "learning_rate": 1.2001220542890517e-05, "loss": 0.1337, "step": 13001 }, { "epoch": 2.717241379310345, "grad_norm": 0.9968492638160723, "learning_rate": 1.2000115229491625e-05, "loss": 0.1256, "step": 13002 }, { "epoch": 2.717450365726228, "grad_norm": 0.957295731187896, "learning_rate": 1.1999009890638105e-05, "loss": 0.1263, "step": 13003 }, { "epoch": 2.717659352142111, "grad_norm": 1.641275671328181, "learning_rate": 1.1997904526344024e-05, "loss": 0.1312, "step": 13004 }, { "epoch": 2.717868338557994, "grad_norm": 0.8879216053059782, "learning_rate": 1.1996799136623452e-05, "loss": 0.1182, "step": 13005 }, { "epoch": 2.7180773249738768, "grad_norm": 1.1358650899415421, "learning_rate": 1.1995693721490456e-05, "loss": 0.1284, "step": 13006 }, { "epoch": 2.7182863113897597, "grad_norm": 0.8261082402144, "learning_rate": 1.1994588280959101e-05, "loss": 0.0985, "step": 13007 }, { "epoch": 2.7184952978056427, "grad_norm": 1.1545090883523716, "learning_rate": 1.1993482815043458e-05, "loss": 0.1354, "step": 13008 }, { "epoch": 2.7187042842215257, "grad_norm": 0.9717692197160558, "learning_rate": 1.1992377323757598e-05, "loss": 0.1236, "step": 13009 }, { "epoch": 2.7189132706374086, "grad_norm": 1.1770278538894847, "learning_rate": 1.1991271807115588e-05, "loss": 0.1429, "step": 13010 }, { "epoch": 2.7191222570532916, "grad_norm": 1.0454046440985976, "learning_rate": 1.1990166265131495e-05, "loss": 0.1244, "step": 13011 }, { "epoch": 2.7193312434691745, "grad_norm": 1.016486662664432, "learning_rate": 1.1989060697819392e-05, "loss": 0.1545, "step": 13012 }, { "epoch": 2.7195402298850575, "grad_norm": 1.016376283321492, "learning_rate": 1.1987955105193347e-05, "loss": 0.1346, "step": 13013 }, { "epoch": 2.7197492163009405, "grad_norm": 1.0674895912494728, "learning_rate": 1.1986849487267435e-05, "loss": 0.1345, "step": 13014 }, { "epoch": 2.7199582027168234, "grad_norm": 0.8088054080034687, "learning_rate": 1.1985743844055722e-05, "loss": 0.0966, "step": 13015 }, { "epoch": 2.7201671891327064, "grad_norm": 0.78281659883606, "learning_rate": 1.1984638175572279e-05, "loss": 0.1012, "step": 13016 }, { "epoch": 2.7203761755485893, "grad_norm": 0.987601304362726, "learning_rate": 1.1983532481831179e-05, "loss": 0.1343, "step": 13017 }, { "epoch": 2.7205851619644723, "grad_norm": 0.9286175171230112, "learning_rate": 1.1982426762846494e-05, "loss": 0.1222, "step": 13018 }, { "epoch": 2.7207941483803553, "grad_norm": 1.0595241071420947, "learning_rate": 1.1981321018632295e-05, "loss": 0.1413, "step": 13019 }, { "epoch": 2.7210031347962382, "grad_norm": 0.9910833584105893, "learning_rate": 1.1980215249202655e-05, "loss": 0.1351, "step": 13020 }, { "epoch": 2.721212121212121, "grad_norm": 1.1002987479164175, "learning_rate": 1.1979109454571649e-05, "loss": 0.1227, "step": 13021 }, { "epoch": 2.721421107628004, "grad_norm": 1.050844226602575, "learning_rate": 1.1978003634753344e-05, "loss": 0.1344, "step": 13022 }, { "epoch": 2.721630094043887, "grad_norm": 1.1171722548382135, "learning_rate": 1.1976897789761816e-05, "loss": 0.1278, "step": 13023 }, { "epoch": 2.72183908045977, "grad_norm": 0.942792598813507, "learning_rate": 1.1975791919611142e-05, "loss": 0.102, "step": 13024 }, { "epoch": 2.722048066875653, "grad_norm": 0.862559949381374, "learning_rate": 1.1974686024315391e-05, "loss": 0.1033, "step": 13025 }, { "epoch": 2.722257053291536, "grad_norm": 0.8817934401970253, "learning_rate": 1.1973580103888644e-05, "loss": 0.1147, "step": 13026 }, { "epoch": 2.722466039707419, "grad_norm": 0.949668490174558, "learning_rate": 1.197247415834497e-05, "loss": 0.1042, "step": 13027 }, { "epoch": 2.722675026123302, "grad_norm": 0.9095035201444265, "learning_rate": 1.1971368187698442e-05, "loss": 0.1377, "step": 13028 }, { "epoch": 2.722884012539185, "grad_norm": 0.9880157523723417, "learning_rate": 1.197026219196314e-05, "loss": 0.1191, "step": 13029 }, { "epoch": 2.723092998955068, "grad_norm": 0.9322464771012133, "learning_rate": 1.1969156171153138e-05, "loss": 0.1295, "step": 13030 }, { "epoch": 2.723301985370951, "grad_norm": 0.9420689416815875, "learning_rate": 1.1968050125282508e-05, "loss": 0.1364, "step": 13031 }, { "epoch": 2.7235109717868338, "grad_norm": 0.8679506481033503, "learning_rate": 1.1966944054365337e-05, "loss": 0.1192, "step": 13032 }, { "epoch": 2.7237199582027167, "grad_norm": 0.9789365594071758, "learning_rate": 1.1965837958415692e-05, "loss": 0.1078, "step": 13033 }, { "epoch": 2.7239289446185997, "grad_norm": 1.053675245270035, "learning_rate": 1.196473183744765e-05, "loss": 0.1321, "step": 13034 }, { "epoch": 2.7241379310344827, "grad_norm": 0.9318125995234471, "learning_rate": 1.196362569147529e-05, "loss": 0.1354, "step": 13035 }, { "epoch": 2.7243469174503656, "grad_norm": 0.9832950516002988, "learning_rate": 1.1962519520512693e-05, "loss": 0.1346, "step": 13036 }, { "epoch": 2.7245559038662486, "grad_norm": 0.9573752134265038, "learning_rate": 1.196141332457393e-05, "loss": 0.1185, "step": 13037 }, { "epoch": 2.7247648902821315, "grad_norm": 0.912645895975465, "learning_rate": 1.1960307103673086e-05, "loss": 0.1229, "step": 13038 }, { "epoch": 2.7249738766980145, "grad_norm": 0.8139102788942416, "learning_rate": 1.1959200857824234e-05, "loss": 0.0959, "step": 13039 }, { "epoch": 2.7251828631138975, "grad_norm": 0.8635790490250759, "learning_rate": 1.1958094587041456e-05, "loss": 0.1068, "step": 13040 }, { "epoch": 2.7253918495297804, "grad_norm": 0.9757456370491645, "learning_rate": 1.1956988291338828e-05, "loss": 0.1207, "step": 13041 }, { "epoch": 2.7256008359456634, "grad_norm": 1.0084361557136639, "learning_rate": 1.1955881970730432e-05, "loss": 0.1245, "step": 13042 }, { "epoch": 2.7258098223615463, "grad_norm": 1.0855322223238777, "learning_rate": 1.1954775625230348e-05, "loss": 0.1261, "step": 13043 }, { "epoch": 2.7260188087774293, "grad_norm": 0.851926383126976, "learning_rate": 1.1953669254852655e-05, "loss": 0.1217, "step": 13044 }, { "epoch": 2.7262277951933127, "grad_norm": 1.138539298721911, "learning_rate": 1.1952562859611435e-05, "loss": 0.1375, "step": 13045 }, { "epoch": 2.7264367816091957, "grad_norm": 1.2333895869377107, "learning_rate": 1.1951456439520763e-05, "loss": 0.1075, "step": 13046 }, { "epoch": 2.7266457680250786, "grad_norm": 1.0027034827520163, "learning_rate": 1.1950349994594723e-05, "loss": 0.1416, "step": 13047 }, { "epoch": 2.7268547544409616, "grad_norm": 0.8652769906517509, "learning_rate": 1.1949243524847401e-05, "loss": 0.1232, "step": 13048 }, { "epoch": 2.7270637408568446, "grad_norm": 1.198045345897936, "learning_rate": 1.1948137030292873e-05, "loss": 0.1259, "step": 13049 }, { "epoch": 2.7272727272727275, "grad_norm": 1.0032087004791725, "learning_rate": 1.1947030510945224e-05, "loss": 0.1085, "step": 13050 }, { "epoch": 2.7274817136886105, "grad_norm": 0.9414132333268125, "learning_rate": 1.194592396681853e-05, "loss": 0.0896, "step": 13051 }, { "epoch": 2.7276907001044934, "grad_norm": 0.9179893234874384, "learning_rate": 1.1944817397926883e-05, "loss": 0.1135, "step": 13052 }, { "epoch": 2.7278996865203764, "grad_norm": 0.8044545298088385, "learning_rate": 1.1943710804284358e-05, "loss": 0.116, "step": 13053 }, { "epoch": 2.7281086729362594, "grad_norm": 0.8442701778569381, "learning_rate": 1.1942604185905044e-05, "loss": 0.117, "step": 13054 }, { "epoch": 2.7283176593521423, "grad_norm": 1.0091286441463707, "learning_rate": 1.194149754280302e-05, "loss": 0.1493, "step": 13055 }, { "epoch": 2.7285266457680253, "grad_norm": 0.8740705039799749, "learning_rate": 1.1940390874992374e-05, "loss": 0.112, "step": 13056 }, { "epoch": 2.7287356321839082, "grad_norm": 1.0915790870843638, "learning_rate": 1.1939284182487186e-05, "loss": 0.159, "step": 13057 }, { "epoch": 2.728944618599791, "grad_norm": 0.8562674629628051, "learning_rate": 1.193817746530154e-05, "loss": 0.1173, "step": 13058 }, { "epoch": 2.729153605015674, "grad_norm": 1.0870922529273512, "learning_rate": 1.1937070723449525e-05, "loss": 0.1385, "step": 13059 }, { "epoch": 2.729362591431557, "grad_norm": 1.0725367304846343, "learning_rate": 1.1935963956945221e-05, "loss": 0.1268, "step": 13060 }, { "epoch": 2.72957157784744, "grad_norm": 1.011107309661006, "learning_rate": 1.1934857165802717e-05, "loss": 0.1204, "step": 13061 }, { "epoch": 2.729780564263323, "grad_norm": 1.0518093278934837, "learning_rate": 1.1933750350036102e-05, "loss": 0.1307, "step": 13062 }, { "epoch": 2.729989550679206, "grad_norm": 0.9817425989171042, "learning_rate": 1.1932643509659454e-05, "loss": 0.1187, "step": 13063 }, { "epoch": 2.730198537095089, "grad_norm": 0.9191505349021433, "learning_rate": 1.1931536644686864e-05, "loss": 0.1115, "step": 13064 }, { "epoch": 2.730407523510972, "grad_norm": 0.9860585933354497, "learning_rate": 1.1930429755132415e-05, "loss": 0.1224, "step": 13065 }, { "epoch": 2.730616509926855, "grad_norm": 0.999678822563183, "learning_rate": 1.19293228410102e-05, "loss": 0.1179, "step": 13066 }, { "epoch": 2.730825496342738, "grad_norm": 1.0785915216112791, "learning_rate": 1.1928215902334302e-05, "loss": 0.1375, "step": 13067 }, { "epoch": 2.731034482758621, "grad_norm": 1.1942791524106908, "learning_rate": 1.192710893911881e-05, "loss": 0.1406, "step": 13068 }, { "epoch": 2.731243469174504, "grad_norm": 0.803914250363211, "learning_rate": 1.192600195137781e-05, "loss": 0.1159, "step": 13069 }, { "epoch": 2.7314524555903867, "grad_norm": 1.0634329740229846, "learning_rate": 1.1924894939125392e-05, "loss": 0.1225, "step": 13070 }, { "epoch": 2.7316614420062697, "grad_norm": 0.9291325172993853, "learning_rate": 1.1923787902375643e-05, "loss": 0.0996, "step": 13071 }, { "epoch": 2.7318704284221527, "grad_norm": 1.197508342647104, "learning_rate": 1.1922680841142654e-05, "loss": 0.1631, "step": 13072 }, { "epoch": 2.7320794148380356, "grad_norm": 0.9816304291058019, "learning_rate": 1.192157375544051e-05, "loss": 0.1155, "step": 13073 }, { "epoch": 2.7322884012539186, "grad_norm": 1.049189869911371, "learning_rate": 1.1920466645283307e-05, "loss": 0.1397, "step": 13074 }, { "epoch": 2.7324973876698015, "grad_norm": 0.8453441215345316, "learning_rate": 1.1919359510685128e-05, "loss": 0.1235, "step": 13075 }, { "epoch": 2.7327063740856845, "grad_norm": 1.0697449558082543, "learning_rate": 1.1918252351660066e-05, "loss": 0.1586, "step": 13076 }, { "epoch": 2.7329153605015675, "grad_norm": 1.0151033112994687, "learning_rate": 1.1917145168222213e-05, "loss": 0.1587, "step": 13077 }, { "epoch": 2.7331243469174504, "grad_norm": 0.8837044554496833, "learning_rate": 1.1916037960385656e-05, "loss": 0.121, "step": 13078 }, { "epoch": 2.7333333333333334, "grad_norm": 1.0171779074013159, "learning_rate": 1.191493072816449e-05, "loss": 0.1107, "step": 13079 }, { "epoch": 2.7335423197492164, "grad_norm": 1.0952133063981682, "learning_rate": 1.1913823471572805e-05, "loss": 0.1226, "step": 13080 }, { "epoch": 2.7337513061650993, "grad_norm": 0.9650243725662725, "learning_rate": 1.191271619062469e-05, "loss": 0.1106, "step": 13081 }, { "epoch": 2.7339602925809823, "grad_norm": 1.0093252439507856, "learning_rate": 1.1911608885334239e-05, "loss": 0.1315, "step": 13082 }, { "epoch": 2.7341692789968652, "grad_norm": 0.9096169989746699, "learning_rate": 1.1910501555715542e-05, "loss": 0.1166, "step": 13083 }, { "epoch": 2.734378265412748, "grad_norm": 0.8064785014968922, "learning_rate": 1.1909394201782698e-05, "loss": 0.1023, "step": 13084 }, { "epoch": 2.734587251828631, "grad_norm": 0.9595824893614449, "learning_rate": 1.1908286823549791e-05, "loss": 0.1292, "step": 13085 }, { "epoch": 2.734796238244514, "grad_norm": 1.045889981636157, "learning_rate": 1.1907179421030919e-05, "loss": 0.1212, "step": 13086 }, { "epoch": 2.735005224660397, "grad_norm": 0.8175163281261012, "learning_rate": 1.1906071994240175e-05, "loss": 0.1015, "step": 13087 }, { "epoch": 2.73521421107628, "grad_norm": 0.9104312993315572, "learning_rate": 1.1904964543191653e-05, "loss": 0.111, "step": 13088 }, { "epoch": 2.735423197492163, "grad_norm": 1.1674312314841016, "learning_rate": 1.1903857067899449e-05, "loss": 0.1354, "step": 13089 }, { "epoch": 2.735632183908046, "grad_norm": 1.0481748641100752, "learning_rate": 1.1902749568377652e-05, "loss": 0.1039, "step": 13090 }, { "epoch": 2.735841170323929, "grad_norm": 0.8576769948270578, "learning_rate": 1.190164204464036e-05, "loss": 0.1107, "step": 13091 }, { "epoch": 2.736050156739812, "grad_norm": 0.9412643436551651, "learning_rate": 1.1900534496701667e-05, "loss": 0.1004, "step": 13092 }, { "epoch": 2.736259143155695, "grad_norm": 1.0762427730939228, "learning_rate": 1.189942692457567e-05, "loss": 0.1362, "step": 13093 }, { "epoch": 2.736468129571578, "grad_norm": 1.0825185081742588, "learning_rate": 1.1898319328276465e-05, "loss": 0.1182, "step": 13094 }, { "epoch": 2.7366771159874608, "grad_norm": 0.7634797478283897, "learning_rate": 1.1897211707818146e-05, "loss": 0.0908, "step": 13095 }, { "epoch": 2.7368861024033437, "grad_norm": 0.976870200140691, "learning_rate": 1.1896104063214808e-05, "loss": 0.1205, "step": 13096 }, { "epoch": 2.7370950888192267, "grad_norm": 1.0446027827215492, "learning_rate": 1.1894996394480551e-05, "loss": 0.1372, "step": 13097 }, { "epoch": 2.7373040752351097, "grad_norm": 0.9899881476688759, "learning_rate": 1.1893888701629468e-05, "loss": 0.1217, "step": 13098 }, { "epoch": 2.7375130616509926, "grad_norm": 1.0868102782086178, "learning_rate": 1.189278098467566e-05, "loss": 0.1269, "step": 13099 }, { "epoch": 2.7377220480668756, "grad_norm": 0.9909631107457415, "learning_rate": 1.1891673243633221e-05, "loss": 0.1204, "step": 13100 }, { "epoch": 2.7379310344827585, "grad_norm": 0.9544531924685644, "learning_rate": 1.1890565478516254e-05, "loss": 0.1231, "step": 13101 }, { "epoch": 2.7381400208986415, "grad_norm": 0.9330203812285585, "learning_rate": 1.1889457689338851e-05, "loss": 0.0891, "step": 13102 }, { "epoch": 2.7383490073145245, "grad_norm": 0.9259639742636763, "learning_rate": 1.1888349876115112e-05, "loss": 0.1172, "step": 13103 }, { "epoch": 2.7385579937304074, "grad_norm": 1.0201188337948677, "learning_rate": 1.1887242038859138e-05, "loss": 0.1155, "step": 13104 }, { "epoch": 2.7387669801462904, "grad_norm": 0.9227232235629833, "learning_rate": 1.1886134177585026e-05, "loss": 0.1221, "step": 13105 }, { "epoch": 2.7389759665621733, "grad_norm": 1.1457139312585476, "learning_rate": 1.1885026292306874e-05, "loss": 0.1209, "step": 13106 }, { "epoch": 2.7391849529780563, "grad_norm": 0.8222554715807738, "learning_rate": 1.1883918383038787e-05, "loss": 0.1024, "step": 13107 }, { "epoch": 2.7393939393939393, "grad_norm": 0.8984306929306276, "learning_rate": 1.188281044979486e-05, "loss": 0.1128, "step": 13108 }, { "epoch": 2.7396029258098222, "grad_norm": 0.900652199605538, "learning_rate": 1.1881702492589194e-05, "loss": 0.1152, "step": 13109 }, { "epoch": 2.739811912225705, "grad_norm": 0.8980716696265143, "learning_rate": 1.1880594511435888e-05, "loss": 0.0964, "step": 13110 }, { "epoch": 2.740020898641588, "grad_norm": 1.065243022957759, "learning_rate": 1.1879486506349047e-05, "loss": 0.1309, "step": 13111 }, { "epoch": 2.740229885057471, "grad_norm": 1.1843355591744251, "learning_rate": 1.1878378477342769e-05, "loss": 0.1492, "step": 13112 }, { "epoch": 2.740438871473354, "grad_norm": 1.0247965177686094, "learning_rate": 1.187727042443116e-05, "loss": 0.1126, "step": 13113 }, { "epoch": 2.740647857889237, "grad_norm": 1.1399126814568412, "learning_rate": 1.1876162347628314e-05, "loss": 0.1481, "step": 13114 }, { "epoch": 2.74085684430512, "grad_norm": 1.0800325971573657, "learning_rate": 1.187505424694834e-05, "loss": 0.1306, "step": 13115 }, { "epoch": 2.741065830721003, "grad_norm": 1.0256906059257964, "learning_rate": 1.1873946122405333e-05, "loss": 0.1286, "step": 13116 }, { "epoch": 2.741274817136886, "grad_norm": 1.364869412727248, "learning_rate": 1.1872837974013402e-05, "loss": 0.1622, "step": 13117 }, { "epoch": 2.741483803552769, "grad_norm": 0.8793545537200724, "learning_rate": 1.1871729801786649e-05, "loss": 0.1105, "step": 13118 }, { "epoch": 2.741692789968652, "grad_norm": 0.9475458794869249, "learning_rate": 1.1870621605739178e-05, "loss": 0.1285, "step": 13119 }, { "epoch": 2.741901776384535, "grad_norm": 1.7287965177934264, "learning_rate": 1.186951338588509e-05, "loss": 0.1485, "step": 13120 }, { "epoch": 2.7421107628004178, "grad_norm": 1.0127834445503232, "learning_rate": 1.1868405142238489e-05, "loss": 0.1385, "step": 13121 }, { "epoch": 2.7423197492163007, "grad_norm": 1.0667963553669904, "learning_rate": 1.1867296874813476e-05, "loss": 0.1216, "step": 13122 }, { "epoch": 2.7425287356321837, "grad_norm": 0.8979525060906085, "learning_rate": 1.1866188583624163e-05, "loss": 0.1299, "step": 13123 }, { "epoch": 2.7427377220480667, "grad_norm": 1.0818540400361292, "learning_rate": 1.186508026868465e-05, "loss": 0.1415, "step": 13124 }, { "epoch": 2.7429467084639496, "grad_norm": 0.7544136782524047, "learning_rate": 1.1863971930009046e-05, "loss": 0.1113, "step": 13125 }, { "epoch": 2.7431556948798326, "grad_norm": 0.765942533150612, "learning_rate": 1.1862863567611452e-05, "loss": 0.0985, "step": 13126 }, { "epoch": 2.7433646812957155, "grad_norm": 1.016514136934217, "learning_rate": 1.1861755181505971e-05, "loss": 0.1272, "step": 13127 }, { "epoch": 2.7435736677115985, "grad_norm": 0.9801828539609073, "learning_rate": 1.186064677170672e-05, "loss": 0.1193, "step": 13128 }, { "epoch": 2.7437826541274815, "grad_norm": 0.9817043987389082, "learning_rate": 1.1859538338227792e-05, "loss": 0.1411, "step": 13129 }, { "epoch": 2.7439916405433644, "grad_norm": 0.972612459654189, "learning_rate": 1.1858429881083302e-05, "loss": 0.1273, "step": 13130 }, { "epoch": 2.7442006269592474, "grad_norm": 0.952131919254209, "learning_rate": 1.1857321400287356e-05, "loss": 0.1361, "step": 13131 }, { "epoch": 2.7444096133751303, "grad_norm": 1.0262200532683068, "learning_rate": 1.185621289585406e-05, "loss": 0.1359, "step": 13132 }, { "epoch": 2.7446185997910137, "grad_norm": 0.9189864331838331, "learning_rate": 1.185510436779752e-05, "loss": 0.113, "step": 13133 }, { "epoch": 2.7448275862068967, "grad_norm": 1.0489063193081714, "learning_rate": 1.1853995816131845e-05, "loss": 0.1369, "step": 13134 }, { "epoch": 2.7450365726227797, "grad_norm": 0.9923710328908352, "learning_rate": 1.1852887240871145e-05, "loss": 0.1121, "step": 13135 }, { "epoch": 2.7452455590386626, "grad_norm": 0.9311765381768722, "learning_rate": 1.1851778642029522e-05, "loss": 0.0993, "step": 13136 }, { "epoch": 2.7454545454545456, "grad_norm": 0.86399435760505, "learning_rate": 1.1850670019621098e-05, "loss": 0.1024, "step": 13137 }, { "epoch": 2.7456635318704286, "grad_norm": 1.0279611533349728, "learning_rate": 1.1849561373659966e-05, "loss": 0.122, "step": 13138 }, { "epoch": 2.7458725182863115, "grad_norm": 0.9543407179243026, "learning_rate": 1.1848452704160245e-05, "loss": 0.1133, "step": 13139 }, { "epoch": 2.7460815047021945, "grad_norm": 0.9793396294842464, "learning_rate": 1.1847344011136042e-05, "loss": 0.1159, "step": 13140 }, { "epoch": 2.7462904911180774, "grad_norm": 1.190823695013448, "learning_rate": 1.1846235294601467e-05, "loss": 0.1277, "step": 13141 }, { "epoch": 2.7464994775339604, "grad_norm": 1.021421560853952, "learning_rate": 1.1845126554570626e-05, "loss": 0.1274, "step": 13142 }, { "epoch": 2.7467084639498434, "grad_norm": 0.9865831051519431, "learning_rate": 1.1844017791057643e-05, "loss": 0.1469, "step": 13143 }, { "epoch": 2.7469174503657263, "grad_norm": 1.2279345634093644, "learning_rate": 1.1842909004076613e-05, "loss": 0.1078, "step": 13144 }, { "epoch": 2.7471264367816093, "grad_norm": 0.8942851406216964, "learning_rate": 1.1841800193641653e-05, "loss": 0.126, "step": 13145 }, { "epoch": 2.7473354231974922, "grad_norm": 0.7676853216267685, "learning_rate": 1.1840691359766875e-05, "loss": 0.0917, "step": 13146 }, { "epoch": 2.747544409613375, "grad_norm": 0.9160111276907922, "learning_rate": 1.1839582502466392e-05, "loss": 0.1151, "step": 13147 }, { "epoch": 2.747753396029258, "grad_norm": 0.8144006458567393, "learning_rate": 1.1838473621754314e-05, "loss": 0.1174, "step": 13148 }, { "epoch": 2.747962382445141, "grad_norm": 0.749475689607291, "learning_rate": 1.1837364717644754e-05, "loss": 0.1106, "step": 13149 }, { "epoch": 2.748171368861024, "grad_norm": 1.0007165726712557, "learning_rate": 1.1836255790151824e-05, "loss": 0.1252, "step": 13150 }, { "epoch": 2.748380355276907, "grad_norm": 1.0384442779454803, "learning_rate": 1.1835146839289637e-05, "loss": 0.1447, "step": 13151 }, { "epoch": 2.74858934169279, "grad_norm": 0.9956041589022496, "learning_rate": 1.1834037865072308e-05, "loss": 0.1233, "step": 13152 }, { "epoch": 2.748798328108673, "grad_norm": 0.9440854627056512, "learning_rate": 1.1832928867513948e-05, "loss": 0.102, "step": 13153 }, { "epoch": 2.749007314524556, "grad_norm": 0.9742522263051647, "learning_rate": 1.1831819846628666e-05, "loss": 0.1248, "step": 13154 }, { "epoch": 2.749216300940439, "grad_norm": 1.3233204896211013, "learning_rate": 1.183071080243059e-05, "loss": 0.1191, "step": 13155 }, { "epoch": 2.749425287356322, "grad_norm": 0.8343237908687328, "learning_rate": 1.1829601734933822e-05, "loss": 0.0949, "step": 13156 }, { "epoch": 2.749634273772205, "grad_norm": 1.1128668008423381, "learning_rate": 1.182849264415248e-05, "loss": 0.1332, "step": 13157 }, { "epoch": 2.749843260188088, "grad_norm": 1.0683212637074748, "learning_rate": 1.1827383530100682e-05, "loss": 0.1278, "step": 13158 }, { "epoch": 2.7500522466039707, "grad_norm": 0.8521825437644034, "learning_rate": 1.1826274392792538e-05, "loss": 0.1054, "step": 13159 }, { "epoch": 2.7502612330198537, "grad_norm": 0.9960957391199122, "learning_rate": 1.1825165232242166e-05, "loss": 0.1268, "step": 13160 }, { "epoch": 2.7504702194357367, "grad_norm": 1.0044528208370158, "learning_rate": 1.1824056048463682e-05, "loss": 0.1217, "step": 13161 }, { "epoch": 2.7506792058516196, "grad_norm": 1.0342497199417475, "learning_rate": 1.1822946841471201e-05, "loss": 0.1257, "step": 13162 }, { "epoch": 2.7508881922675026, "grad_norm": 1.175583609314903, "learning_rate": 1.1821837611278843e-05, "loss": 0.1239, "step": 13163 }, { "epoch": 2.7510971786833855, "grad_norm": 0.9548297689155214, "learning_rate": 1.1820728357900724e-05, "loss": 0.1075, "step": 13164 }, { "epoch": 2.7513061650992685, "grad_norm": 0.9781407142366231, "learning_rate": 1.1819619081350955e-05, "loss": 0.1139, "step": 13165 }, { "epoch": 2.7515151515151515, "grad_norm": 1.033097204363485, "learning_rate": 1.1818509781643659e-05, "loss": 0.144, "step": 13166 }, { "epoch": 2.7517241379310344, "grad_norm": 1.0095407162926038, "learning_rate": 1.1817400458792954e-05, "loss": 0.1234, "step": 13167 }, { "epoch": 2.7519331243469174, "grad_norm": 1.0734524223744233, "learning_rate": 1.1816291112812952e-05, "loss": 0.1467, "step": 13168 }, { "epoch": 2.7521421107628004, "grad_norm": 0.8735416811394954, "learning_rate": 1.1815181743717779e-05, "loss": 0.0998, "step": 13169 }, { "epoch": 2.7523510971786833, "grad_norm": 1.02537839974437, "learning_rate": 1.1814072351521549e-05, "loss": 0.1407, "step": 13170 }, { "epoch": 2.7525600835945663, "grad_norm": 0.8888889633295225, "learning_rate": 1.181296293623838e-05, "loss": 0.1117, "step": 13171 }, { "epoch": 2.7527690700104492, "grad_norm": 1.013701723217646, "learning_rate": 1.1811853497882393e-05, "loss": 0.1495, "step": 13172 }, { "epoch": 2.752978056426332, "grad_norm": 0.9850158144484433, "learning_rate": 1.1810744036467707e-05, "loss": 0.1384, "step": 13173 }, { "epoch": 2.753187042842215, "grad_norm": 0.9234876933220865, "learning_rate": 1.180963455200844e-05, "loss": 0.1143, "step": 13174 }, { "epoch": 2.753396029258098, "grad_norm": 1.03986446807365, "learning_rate": 1.1808525044518716e-05, "loss": 0.1101, "step": 13175 }, { "epoch": 2.753605015673981, "grad_norm": 0.9436091666876548, "learning_rate": 1.1807415514012653e-05, "loss": 0.0955, "step": 13176 }, { "epoch": 2.753814002089864, "grad_norm": 0.9305355222088835, "learning_rate": 1.1806305960504369e-05, "loss": 0.1147, "step": 13177 }, { "epoch": 2.754022988505747, "grad_norm": 0.7594060437238392, "learning_rate": 1.1805196384007988e-05, "loss": 0.0743, "step": 13178 }, { "epoch": 2.75423197492163, "grad_norm": 1.0655777776500446, "learning_rate": 1.180408678453763e-05, "loss": 0.14, "step": 13179 }, { "epoch": 2.754440961337513, "grad_norm": 0.813617177001062, "learning_rate": 1.1802977162107414e-05, "loss": 0.1015, "step": 13180 }, { "epoch": 2.754649947753396, "grad_norm": 0.8032307718367546, "learning_rate": 1.1801867516731466e-05, "loss": 0.1035, "step": 13181 }, { "epoch": 2.754858934169279, "grad_norm": 0.8556434407590136, "learning_rate": 1.1800757848423909e-05, "loss": 0.1057, "step": 13182 }, { "epoch": 2.755067920585162, "grad_norm": 0.768999701445251, "learning_rate": 1.1799648157198862e-05, "loss": 0.0973, "step": 13183 }, { "epoch": 2.7552769070010448, "grad_norm": 1.0780447750830684, "learning_rate": 1.1798538443070446e-05, "loss": 0.1245, "step": 13184 }, { "epoch": 2.7554858934169277, "grad_norm": 0.8587008410700046, "learning_rate": 1.1797428706052785e-05, "loss": 0.1202, "step": 13185 }, { "epoch": 2.755694879832811, "grad_norm": 0.917125356227341, "learning_rate": 1.1796318946160005e-05, "loss": 0.1123, "step": 13186 }, { "epoch": 2.755903866248694, "grad_norm": 0.8887021285394083, "learning_rate": 1.1795209163406226e-05, "loss": 0.0995, "step": 13187 }, { "epoch": 2.756112852664577, "grad_norm": 0.9080345412421974, "learning_rate": 1.1794099357805577e-05, "loss": 0.1167, "step": 13188 }, { "epoch": 2.75632183908046, "grad_norm": 0.8715486708767144, "learning_rate": 1.1792989529372177e-05, "loss": 0.1151, "step": 13189 }, { "epoch": 2.756530825496343, "grad_norm": 0.756765704426552, "learning_rate": 1.1791879678120148e-05, "loss": 0.1026, "step": 13190 }, { "epoch": 2.756739811912226, "grad_norm": 0.9579540894190846, "learning_rate": 1.1790769804063622e-05, "loss": 0.1166, "step": 13191 }, { "epoch": 2.756948798328109, "grad_norm": 0.9369424464475076, "learning_rate": 1.1789659907216718e-05, "loss": 0.1331, "step": 13192 }, { "epoch": 2.757157784743992, "grad_norm": 1.1154923557372054, "learning_rate": 1.1788549987593565e-05, "loss": 0.132, "step": 13193 }, { "epoch": 2.757366771159875, "grad_norm": 2.601204904342012, "learning_rate": 1.1787440045208287e-05, "loss": 0.1158, "step": 13194 }, { "epoch": 2.757575757575758, "grad_norm": 0.7880877074333793, "learning_rate": 1.1786330080075008e-05, "loss": 0.0911, "step": 13195 }, { "epoch": 2.7577847439916408, "grad_norm": 0.8707133763095877, "learning_rate": 1.1785220092207856e-05, "loss": 0.1211, "step": 13196 }, { "epoch": 2.7579937304075237, "grad_norm": 0.876459532713026, "learning_rate": 1.1784110081620956e-05, "loss": 0.1004, "step": 13197 }, { "epoch": 2.7582027168234067, "grad_norm": 0.8338600946366022, "learning_rate": 1.178300004832844e-05, "loss": 0.1063, "step": 13198 }, { "epoch": 2.7584117032392896, "grad_norm": 0.987920169896481, "learning_rate": 1.1781889992344426e-05, "loss": 0.1413, "step": 13199 }, { "epoch": 2.7586206896551726, "grad_norm": 1.1281899698351199, "learning_rate": 1.178077991368305e-05, "loss": 0.1314, "step": 13200 }, { "epoch": 2.7588296760710556, "grad_norm": 1.0983670033249022, "learning_rate": 1.1779669812358432e-05, "loss": 0.1221, "step": 13201 }, { "epoch": 2.7590386624869385, "grad_norm": 0.9988009026927994, "learning_rate": 1.1778559688384703e-05, "loss": 0.1239, "step": 13202 }, { "epoch": 2.7592476489028215, "grad_norm": 1.0399869434123516, "learning_rate": 1.177744954177599e-05, "loss": 0.1318, "step": 13203 }, { "epoch": 2.7594566353187044, "grad_norm": 0.9755449053209614, "learning_rate": 1.1776339372546426e-05, "loss": 0.1124, "step": 13204 }, { "epoch": 2.7596656217345874, "grad_norm": 0.9058720166493619, "learning_rate": 1.1775229180710135e-05, "loss": 0.108, "step": 13205 }, { "epoch": 2.7598746081504704, "grad_norm": 0.9826008871596337, "learning_rate": 1.177411896628125e-05, "loss": 0.1265, "step": 13206 }, { "epoch": 2.7600835945663533, "grad_norm": 1.0137544397986926, "learning_rate": 1.1773008729273894e-05, "loss": 0.1429, "step": 13207 }, { "epoch": 2.7602925809822363, "grad_norm": 1.1193826357262717, "learning_rate": 1.1771898469702198e-05, "loss": 0.1151, "step": 13208 }, { "epoch": 2.7605015673981192, "grad_norm": 1.1152293672414684, "learning_rate": 1.1770788187580298e-05, "loss": 0.1564, "step": 13209 }, { "epoch": 2.760710553814002, "grad_norm": 1.0038900670384654, "learning_rate": 1.1769677882922317e-05, "loss": 0.1291, "step": 13210 }, { "epoch": 2.760919540229885, "grad_norm": 0.6658563303366785, "learning_rate": 1.1768567555742387e-05, "loss": 0.076, "step": 13211 }, { "epoch": 2.761128526645768, "grad_norm": 1.0515815241098874, "learning_rate": 1.1767457206054645e-05, "loss": 0.1203, "step": 13212 }, { "epoch": 2.761337513061651, "grad_norm": 1.0689786493042426, "learning_rate": 1.1766346833873212e-05, "loss": 0.1389, "step": 13213 }, { "epoch": 2.761546499477534, "grad_norm": 1.0086541485662615, "learning_rate": 1.1765236439212227e-05, "loss": 0.1058, "step": 13214 }, { "epoch": 2.761755485893417, "grad_norm": 1.5041831191365094, "learning_rate": 1.1764126022085817e-05, "loss": 0.1501, "step": 13215 }, { "epoch": 2.7619644723093, "grad_norm": 0.8027248676469192, "learning_rate": 1.1763015582508115e-05, "loss": 0.1111, "step": 13216 }, { "epoch": 2.762173458725183, "grad_norm": 0.958060481120822, "learning_rate": 1.1761905120493253e-05, "loss": 0.1166, "step": 13217 }, { "epoch": 2.762382445141066, "grad_norm": 0.8637802112128297, "learning_rate": 1.1760794636055368e-05, "loss": 0.0891, "step": 13218 }, { "epoch": 2.762591431556949, "grad_norm": 1.057059787087578, "learning_rate": 1.1759684129208586e-05, "loss": 0.1389, "step": 13219 }, { "epoch": 2.762800417972832, "grad_norm": 0.8823055361258477, "learning_rate": 1.1758573599967042e-05, "loss": 0.0889, "step": 13220 }, { "epoch": 2.763009404388715, "grad_norm": 1.039067901611609, "learning_rate": 1.175746304834487e-05, "loss": 0.1409, "step": 13221 }, { "epoch": 2.7632183908045977, "grad_norm": 0.9744968441321684, "learning_rate": 1.1756352474356204e-05, "loss": 0.1085, "step": 13222 }, { "epoch": 2.7634273772204807, "grad_norm": 0.8166183928368524, "learning_rate": 1.1755241878015174e-05, "loss": 0.0956, "step": 13223 }, { "epoch": 2.7636363636363637, "grad_norm": 1.1596528252504752, "learning_rate": 1.1754131259335921e-05, "loss": 0.1283, "step": 13224 }, { "epoch": 2.7638453500522466, "grad_norm": 1.1107260966190853, "learning_rate": 1.1753020618332576e-05, "loss": 0.1195, "step": 13225 }, { "epoch": 2.7640543364681296, "grad_norm": 0.8207411434690323, "learning_rate": 1.175190995501927e-05, "loss": 0.103, "step": 13226 }, { "epoch": 2.7642633228840126, "grad_norm": 0.8089322728212711, "learning_rate": 1.1750799269410142e-05, "loss": 0.1034, "step": 13227 }, { "epoch": 2.7644723092998955, "grad_norm": 0.9637130922115105, "learning_rate": 1.1749688561519328e-05, "loss": 0.1266, "step": 13228 }, { "epoch": 2.7646812957157785, "grad_norm": 0.90981956473772, "learning_rate": 1.1748577831360959e-05, "loss": 0.1158, "step": 13229 }, { "epoch": 2.7648902821316614, "grad_norm": 0.8808821450530118, "learning_rate": 1.1747467078949177e-05, "loss": 0.1026, "step": 13230 }, { "epoch": 2.7650992685475444, "grad_norm": 1.0849835315122802, "learning_rate": 1.1746356304298114e-05, "loss": 0.1404, "step": 13231 }, { "epoch": 2.7653082549634274, "grad_norm": 1.073374205410221, "learning_rate": 1.1745245507421904e-05, "loss": 0.138, "step": 13232 }, { "epoch": 2.7655172413793103, "grad_norm": 0.9363064921698583, "learning_rate": 1.1744134688334692e-05, "loss": 0.1197, "step": 13233 }, { "epoch": 2.7657262277951933, "grad_norm": 0.9068343883913389, "learning_rate": 1.1743023847050607e-05, "loss": 0.1212, "step": 13234 }, { "epoch": 2.7659352142110762, "grad_norm": 0.895131131362618, "learning_rate": 1.1741912983583789e-05, "loss": 0.1099, "step": 13235 }, { "epoch": 2.766144200626959, "grad_norm": 0.9571182921651791, "learning_rate": 1.1740802097948374e-05, "loss": 0.1198, "step": 13236 }, { "epoch": 2.766353187042842, "grad_norm": 1.0072240161921882, "learning_rate": 1.1739691190158503e-05, "loss": 0.1266, "step": 13237 }, { "epoch": 2.766562173458725, "grad_norm": 1.005802328459462, "learning_rate": 1.1738580260228311e-05, "loss": 0.1282, "step": 13238 }, { "epoch": 2.766771159874608, "grad_norm": 1.0773713726718812, "learning_rate": 1.173746930817194e-05, "loss": 0.126, "step": 13239 }, { "epoch": 2.766980146290491, "grad_norm": 0.808234493414262, "learning_rate": 1.1736358334003524e-05, "loss": 0.1065, "step": 13240 }, { "epoch": 2.767189132706374, "grad_norm": 0.9071769809003781, "learning_rate": 1.1735247337737206e-05, "loss": 0.1132, "step": 13241 }, { "epoch": 2.767398119122257, "grad_norm": 0.8484907418474241, "learning_rate": 1.1734136319387122e-05, "loss": 0.1318, "step": 13242 }, { "epoch": 2.76760710553814, "grad_norm": 1.0595533965765713, "learning_rate": 1.1733025278967412e-05, "loss": 0.141, "step": 13243 }, { "epoch": 2.767816091954023, "grad_norm": 1.058994432413238, "learning_rate": 1.1731914216492214e-05, "loss": 0.1234, "step": 13244 }, { "epoch": 2.768025078369906, "grad_norm": 0.9286276537199988, "learning_rate": 1.1730803131975676e-05, "loss": 0.1259, "step": 13245 }, { "epoch": 2.768234064785789, "grad_norm": 0.8533533915798595, "learning_rate": 1.172969202543193e-05, "loss": 0.1266, "step": 13246 }, { "epoch": 2.768443051201672, "grad_norm": 0.8339270117424291, "learning_rate": 1.172858089687512e-05, "loss": 0.1085, "step": 13247 }, { "epoch": 2.7686520376175547, "grad_norm": 0.9281798890799504, "learning_rate": 1.1727469746319386e-05, "loss": 0.098, "step": 13248 }, { "epoch": 2.7688610240334377, "grad_norm": 1.0070148144798863, "learning_rate": 1.1726358573778872e-05, "loss": 0.1289, "step": 13249 }, { "epoch": 2.7690700104493207, "grad_norm": 0.8130439791696077, "learning_rate": 1.1725247379267712e-05, "loss": 0.112, "step": 13250 }, { "epoch": 2.7692789968652036, "grad_norm": 0.817002893764793, "learning_rate": 1.1724136162800058e-05, "loss": 0.0965, "step": 13251 }, { "epoch": 2.7694879832810866, "grad_norm": 0.880920431118735, "learning_rate": 1.1723024924390042e-05, "loss": 0.1178, "step": 13252 }, { "epoch": 2.7696969696969695, "grad_norm": 0.9669520884365042, "learning_rate": 1.1721913664051814e-05, "loss": 0.1535, "step": 13253 }, { "epoch": 2.7699059561128525, "grad_norm": 1.021480107673962, "learning_rate": 1.1720802381799512e-05, "loss": 0.1113, "step": 13254 }, { "epoch": 2.7701149425287355, "grad_norm": 1.0988977777760727, "learning_rate": 1.171969107764728e-05, "loss": 0.1489, "step": 13255 }, { "epoch": 2.7703239289446184, "grad_norm": 1.005180251498947, "learning_rate": 1.171857975160926e-05, "loss": 0.1249, "step": 13256 }, { "epoch": 2.7705329153605014, "grad_norm": 1.0204583309530848, "learning_rate": 1.17174684036996e-05, "loss": 0.1149, "step": 13257 }, { "epoch": 2.7707419017763844, "grad_norm": 1.0253363336810641, "learning_rate": 1.1716357033932439e-05, "loss": 0.1445, "step": 13258 }, { "epoch": 2.7709508881922673, "grad_norm": 0.8664010495417838, "learning_rate": 1.171524564232192e-05, "loss": 0.1127, "step": 13259 }, { "epoch": 2.7711598746081503, "grad_norm": 1.1826137893152697, "learning_rate": 1.1714134228882192e-05, "loss": 0.113, "step": 13260 }, { "epoch": 2.7713688610240332, "grad_norm": 0.945378171583267, "learning_rate": 1.1713022793627396e-05, "loss": 0.1448, "step": 13261 }, { "epoch": 2.771577847439916, "grad_norm": 0.9755262103390491, "learning_rate": 1.1711911336571678e-05, "loss": 0.1248, "step": 13262 }, { "epoch": 2.771786833855799, "grad_norm": 0.7012306519041416, "learning_rate": 1.1710799857729183e-05, "loss": 0.0847, "step": 13263 }, { "epoch": 2.771995820271682, "grad_norm": 0.9263616676681466, "learning_rate": 1.1709688357114057e-05, "loss": 0.1261, "step": 13264 }, { "epoch": 2.772204806687565, "grad_norm": 1.017264935116205, "learning_rate": 1.1708576834740442e-05, "loss": 0.1241, "step": 13265 }, { "epoch": 2.772413793103448, "grad_norm": 0.8631828334004463, "learning_rate": 1.1707465290622488e-05, "loss": 0.1199, "step": 13266 }, { "epoch": 2.772622779519331, "grad_norm": 0.7669157291656363, "learning_rate": 1.170635372477434e-05, "loss": 0.0985, "step": 13267 }, { "epoch": 2.772831765935214, "grad_norm": 0.8172477292046184, "learning_rate": 1.1705242137210145e-05, "loss": 0.1007, "step": 13268 }, { "epoch": 2.773040752351097, "grad_norm": 0.7786618477126935, "learning_rate": 1.170413052794405e-05, "loss": 0.0923, "step": 13269 }, { "epoch": 2.77324973876698, "grad_norm": 0.9701410824807606, "learning_rate": 1.1703018896990197e-05, "loss": 0.1132, "step": 13270 }, { "epoch": 2.773458725182863, "grad_norm": 0.8664018856373027, "learning_rate": 1.1701907244362739e-05, "loss": 0.1084, "step": 13271 }, { "epoch": 2.773667711598746, "grad_norm": 0.8798549857949263, "learning_rate": 1.1700795570075821e-05, "loss": 0.118, "step": 13272 }, { "epoch": 2.7738766980146288, "grad_norm": 1.0044791879802955, "learning_rate": 1.1699683874143593e-05, "loss": 0.1355, "step": 13273 }, { "epoch": 2.774085684430512, "grad_norm": 0.8760020189784036, "learning_rate": 1.1698572156580198e-05, "loss": 0.1119, "step": 13274 }, { "epoch": 2.774294670846395, "grad_norm": 1.087257269731317, "learning_rate": 1.1697460417399795e-05, "loss": 0.1254, "step": 13275 }, { "epoch": 2.774503657262278, "grad_norm": 0.9388816631742248, "learning_rate": 1.169634865661652e-05, "loss": 0.1213, "step": 13276 }, { "epoch": 2.774712643678161, "grad_norm": 0.968073023076306, "learning_rate": 1.1695236874244529e-05, "loss": 0.1278, "step": 13277 }, { "epoch": 2.774921630094044, "grad_norm": 0.9880792767193802, "learning_rate": 1.1694125070297968e-05, "loss": 0.1358, "step": 13278 }, { "epoch": 2.775130616509927, "grad_norm": 1.0806358796827953, "learning_rate": 1.1693013244790989e-05, "loss": 0.1238, "step": 13279 }, { "epoch": 2.77533960292581, "grad_norm": 1.2354799846084652, "learning_rate": 1.169190139773774e-05, "loss": 0.1234, "step": 13280 }, { "epoch": 2.775548589341693, "grad_norm": 1.0899980118144135, "learning_rate": 1.1690789529152378e-05, "loss": 0.112, "step": 13281 }, { "epoch": 2.775757575757576, "grad_norm": 0.7895403197101843, "learning_rate": 1.1689677639049041e-05, "loss": 0.0938, "step": 13282 }, { "epoch": 2.775966562173459, "grad_norm": 1.008793407135476, "learning_rate": 1.1688565727441888e-05, "loss": 0.1153, "step": 13283 }, { "epoch": 2.776175548589342, "grad_norm": 1.029069240059966, "learning_rate": 1.1687453794345065e-05, "loss": 0.1175, "step": 13284 }, { "epoch": 2.7763845350052248, "grad_norm": 0.8790168010332952, "learning_rate": 1.1686341839772729e-05, "loss": 0.1234, "step": 13285 }, { "epoch": 2.7765935214211077, "grad_norm": 1.0531508098609152, "learning_rate": 1.1685229863739024e-05, "loss": 0.1533, "step": 13286 }, { "epoch": 2.7768025078369907, "grad_norm": 0.9244360548368127, "learning_rate": 1.1684117866258112e-05, "loss": 0.1156, "step": 13287 }, { "epoch": 2.7770114942528736, "grad_norm": 1.0176752167336185, "learning_rate": 1.1683005847344134e-05, "loss": 0.1117, "step": 13288 }, { "epoch": 2.7772204806687566, "grad_norm": 1.0079352423150887, "learning_rate": 1.1681893807011247e-05, "loss": 0.1301, "step": 13289 }, { "epoch": 2.7774294670846396, "grad_norm": 0.8924348131830598, "learning_rate": 1.1680781745273604e-05, "loss": 0.1127, "step": 13290 }, { "epoch": 2.7776384535005225, "grad_norm": 0.909396718653419, "learning_rate": 1.1679669662145358e-05, "loss": 0.1095, "step": 13291 }, { "epoch": 2.7778474399164055, "grad_norm": 0.923460703659935, "learning_rate": 1.1678557557640659e-05, "loss": 0.1083, "step": 13292 }, { "epoch": 2.7780564263322884, "grad_norm": 1.046786896613601, "learning_rate": 1.1677445431773663e-05, "loss": 0.1436, "step": 13293 }, { "epoch": 2.7782654127481714, "grad_norm": 0.9752660710805974, "learning_rate": 1.1676333284558525e-05, "loss": 0.1447, "step": 13294 }, { "epoch": 2.7784743991640544, "grad_norm": 0.8266496045196307, "learning_rate": 1.1675221116009394e-05, "loss": 0.1084, "step": 13295 }, { "epoch": 2.7786833855799373, "grad_norm": 0.8260776922098342, "learning_rate": 1.1674108926140428e-05, "loss": 0.1168, "step": 13296 }, { "epoch": 2.7788923719958203, "grad_norm": 0.9871486038673167, "learning_rate": 1.1672996714965783e-05, "loss": 0.1261, "step": 13297 }, { "epoch": 2.7791013584117032, "grad_norm": 0.8451315729369702, "learning_rate": 1.1671884482499605e-05, "loss": 0.0839, "step": 13298 }, { "epoch": 2.779310344827586, "grad_norm": 0.8925610795144322, "learning_rate": 1.167077222875606e-05, "loss": 0.1087, "step": 13299 }, { "epoch": 2.779519331243469, "grad_norm": 0.9861918697672367, "learning_rate": 1.1669659953749296e-05, "loss": 0.1214, "step": 13300 }, { "epoch": 2.779728317659352, "grad_norm": 0.9678214224274726, "learning_rate": 1.1668547657493468e-05, "loss": 0.1279, "step": 13301 }, { "epoch": 2.779937304075235, "grad_norm": 1.1160742327731057, "learning_rate": 1.1667435340002737e-05, "loss": 0.1293, "step": 13302 }, { "epoch": 2.780146290491118, "grad_norm": 1.0221767096008865, "learning_rate": 1.1666323001291256e-05, "loss": 0.1234, "step": 13303 }, { "epoch": 2.780355276907001, "grad_norm": 0.9674080032383674, "learning_rate": 1.1665210641373179e-05, "loss": 0.1307, "step": 13304 }, { "epoch": 2.780564263322884, "grad_norm": 0.840909480397704, "learning_rate": 1.1664098260262669e-05, "loss": 0.1089, "step": 13305 }, { "epoch": 2.780773249738767, "grad_norm": 1.1731433727208287, "learning_rate": 1.1662985857973876e-05, "loss": 0.1581, "step": 13306 }, { "epoch": 2.78098223615465, "grad_norm": 1.02267497607158, "learning_rate": 1.166187343452096e-05, "loss": 0.143, "step": 13307 }, { "epoch": 2.781191222570533, "grad_norm": 0.8603149050183364, "learning_rate": 1.166076098991808e-05, "loss": 0.1174, "step": 13308 }, { "epoch": 2.781400208986416, "grad_norm": 1.2724209857977944, "learning_rate": 1.165964852417939e-05, "loss": 0.1316, "step": 13309 }, { "epoch": 2.781609195402299, "grad_norm": 0.8626863407405506, "learning_rate": 1.1658536037319049e-05, "loss": 0.1082, "step": 13310 }, { "epoch": 2.7818181818181817, "grad_norm": 0.9212275663424653, "learning_rate": 1.1657423529351217e-05, "loss": 0.1173, "step": 13311 }, { "epoch": 2.7820271682340647, "grad_norm": 0.9729502231247384, "learning_rate": 1.165631100029005e-05, "loss": 0.1129, "step": 13312 }, { "epoch": 2.7822361546499477, "grad_norm": 0.996794342863, "learning_rate": 1.165519845014971e-05, "loss": 0.1224, "step": 13313 }, { "epoch": 2.7824451410658306, "grad_norm": 1.0423102117844032, "learning_rate": 1.1654085878944354e-05, "loss": 0.1129, "step": 13314 }, { "epoch": 2.7826541274817136, "grad_norm": 1.3012101831276721, "learning_rate": 1.1652973286688138e-05, "loss": 0.1692, "step": 13315 }, { "epoch": 2.7828631138975966, "grad_norm": 0.8785879413800214, "learning_rate": 1.1651860673395228e-05, "loss": 0.114, "step": 13316 }, { "epoch": 2.7830721003134795, "grad_norm": 0.9071081434645494, "learning_rate": 1.1650748039079777e-05, "loss": 0.124, "step": 13317 }, { "epoch": 2.7832810867293625, "grad_norm": 0.9706939055483219, "learning_rate": 1.1649635383755951e-05, "loss": 0.1265, "step": 13318 }, { "epoch": 2.7834900731452454, "grad_norm": 1.2181194505486421, "learning_rate": 1.1648522707437908e-05, "loss": 0.1387, "step": 13319 }, { "epoch": 2.7836990595611284, "grad_norm": 1.2008261293976912, "learning_rate": 1.1647410010139808e-05, "loss": 0.135, "step": 13320 }, { "epoch": 2.7839080459770114, "grad_norm": 0.9793451336101935, "learning_rate": 1.1646297291875813e-05, "loss": 0.1283, "step": 13321 }, { "epoch": 2.7841170323928943, "grad_norm": 0.8820360510075228, "learning_rate": 1.164518455266008e-05, "loss": 0.1144, "step": 13322 }, { "epoch": 2.7843260188087773, "grad_norm": 0.8432159527122688, "learning_rate": 1.1644071792506775e-05, "loss": 0.089, "step": 13323 }, { "epoch": 2.7845350052246602, "grad_norm": 1.1259104596899694, "learning_rate": 1.1642959011430059e-05, "loss": 0.1385, "step": 13324 }, { "epoch": 2.784743991640543, "grad_norm": 1.0252763998035115, "learning_rate": 1.1641846209444092e-05, "loss": 0.1268, "step": 13325 }, { "epoch": 2.7849529780564266, "grad_norm": 1.1162798377906344, "learning_rate": 1.1640733386563039e-05, "loss": 0.1354, "step": 13326 }, { "epoch": 2.7851619644723096, "grad_norm": 0.8037867696446664, "learning_rate": 1.163962054280106e-05, "loss": 0.1085, "step": 13327 }, { "epoch": 2.7853709508881925, "grad_norm": 0.9895210944766428, "learning_rate": 1.1638507678172318e-05, "loss": 0.1222, "step": 13328 }, { "epoch": 2.7855799373040755, "grad_norm": 0.8750552244140832, "learning_rate": 1.1637394792690974e-05, "loss": 0.1061, "step": 13329 }, { "epoch": 2.7857889237199585, "grad_norm": 0.8527330327096868, "learning_rate": 1.1636281886371197e-05, "loss": 0.119, "step": 13330 }, { "epoch": 2.7859979101358414, "grad_norm": 0.7205423615103151, "learning_rate": 1.1635168959227146e-05, "loss": 0.0845, "step": 13331 }, { "epoch": 2.7862068965517244, "grad_norm": 0.9467021332935918, "learning_rate": 1.1634056011272988e-05, "loss": 0.1277, "step": 13332 }, { "epoch": 2.7864158829676073, "grad_norm": 1.0522456788425403, "learning_rate": 1.1632943042522881e-05, "loss": 0.1509, "step": 13333 }, { "epoch": 2.7866248693834903, "grad_norm": 0.842375737009422, "learning_rate": 1.1631830052990996e-05, "loss": 0.1206, "step": 13334 }, { "epoch": 2.7868338557993733, "grad_norm": 0.8600781364450847, "learning_rate": 1.1630717042691492e-05, "loss": 0.1285, "step": 13335 }, { "epoch": 2.787042842215256, "grad_norm": 0.7966543396819815, "learning_rate": 1.1629604011638538e-05, "loss": 0.103, "step": 13336 }, { "epoch": 2.787251828631139, "grad_norm": 0.9495943412188966, "learning_rate": 1.1628490959846297e-05, "loss": 0.1056, "step": 13337 }, { "epoch": 2.787460815047022, "grad_norm": 0.9584408971577439, "learning_rate": 1.1627377887328937e-05, "loss": 0.1015, "step": 13338 }, { "epoch": 2.787669801462905, "grad_norm": 0.8680067120521786, "learning_rate": 1.1626264794100619e-05, "loss": 0.1064, "step": 13339 }, { "epoch": 2.787878787878788, "grad_norm": 0.9251775750911391, "learning_rate": 1.1625151680175513e-05, "loss": 0.1197, "step": 13340 }, { "epoch": 2.788087774294671, "grad_norm": 0.9519799214070551, "learning_rate": 1.1624038545567782e-05, "loss": 0.1117, "step": 13341 }, { "epoch": 2.788296760710554, "grad_norm": 1.0614152846760339, "learning_rate": 1.1622925390291595e-05, "loss": 0.1403, "step": 13342 }, { "epoch": 2.788505747126437, "grad_norm": 1.1337744921001638, "learning_rate": 1.1621812214361117e-05, "loss": 0.1427, "step": 13343 }, { "epoch": 2.78871473354232, "grad_norm": 0.946735496018272, "learning_rate": 1.1620699017790517e-05, "loss": 0.1208, "step": 13344 }, { "epoch": 2.788923719958203, "grad_norm": 0.9342390271655323, "learning_rate": 1.161958580059396e-05, "loss": 0.1296, "step": 13345 }, { "epoch": 2.789132706374086, "grad_norm": 0.9988527973542403, "learning_rate": 1.1618472562785615e-05, "loss": 0.1266, "step": 13346 }, { "epoch": 2.789341692789969, "grad_norm": 1.0728759472019158, "learning_rate": 1.1617359304379647e-05, "loss": 0.1192, "step": 13347 }, { "epoch": 2.7895506792058518, "grad_norm": 1.0482712657896767, "learning_rate": 1.1616246025390226e-05, "loss": 0.1275, "step": 13348 }, { "epoch": 2.7897596656217347, "grad_norm": 1.0462193131862965, "learning_rate": 1.1615132725831519e-05, "loss": 0.0993, "step": 13349 }, { "epoch": 2.7899686520376177, "grad_norm": 0.882575600191084, "learning_rate": 1.16140194057177e-05, "loss": 0.0925, "step": 13350 }, { "epoch": 2.7901776384535006, "grad_norm": 0.9685733343742522, "learning_rate": 1.161290606506293e-05, "loss": 0.1169, "step": 13351 }, { "epoch": 2.7903866248693836, "grad_norm": 0.9388728715254092, "learning_rate": 1.1611792703881382e-05, "loss": 0.1148, "step": 13352 }, { "epoch": 2.7905956112852666, "grad_norm": 1.004900302256687, "learning_rate": 1.1610679322187225e-05, "loss": 0.1202, "step": 13353 }, { "epoch": 2.7908045977011495, "grad_norm": 0.8983267281894382, "learning_rate": 1.1609565919994625e-05, "loss": 0.1162, "step": 13354 }, { "epoch": 2.7910135841170325, "grad_norm": 0.9495145728482413, "learning_rate": 1.1608452497317757e-05, "loss": 0.1215, "step": 13355 }, { "epoch": 2.7912225705329154, "grad_norm": 0.7247258445663151, "learning_rate": 1.1607339054170793e-05, "loss": 0.0783, "step": 13356 }, { "epoch": 2.7914315569487984, "grad_norm": 1.01115223780228, "learning_rate": 1.1606225590567894e-05, "loss": 0.1386, "step": 13357 }, { "epoch": 2.7916405433646814, "grad_norm": 1.0168244974638443, "learning_rate": 1.1605112106523237e-05, "loss": 0.1306, "step": 13358 }, { "epoch": 2.7918495297805643, "grad_norm": 0.9553778566345809, "learning_rate": 1.1603998602050994e-05, "loss": 0.1296, "step": 13359 }, { "epoch": 2.7920585161964473, "grad_norm": 0.977993095616469, "learning_rate": 1.1602885077165333e-05, "loss": 0.115, "step": 13360 }, { "epoch": 2.7922675026123303, "grad_norm": 1.0254354067370164, "learning_rate": 1.1601771531880423e-05, "loss": 0.1001, "step": 13361 }, { "epoch": 2.792476489028213, "grad_norm": 0.8037497771128455, "learning_rate": 1.1600657966210445e-05, "loss": 0.0975, "step": 13362 }, { "epoch": 2.792685475444096, "grad_norm": 0.868932743391259, "learning_rate": 1.1599544380169561e-05, "loss": 0.1017, "step": 13363 }, { "epoch": 2.792894461859979, "grad_norm": 0.8653569617243619, "learning_rate": 1.1598430773771947e-05, "loss": 0.1133, "step": 13364 }, { "epoch": 2.793103448275862, "grad_norm": 0.7565205973911218, "learning_rate": 1.1597317147031775e-05, "loss": 0.1075, "step": 13365 }, { "epoch": 2.793312434691745, "grad_norm": 0.8190277987811959, "learning_rate": 1.159620349996322e-05, "loss": 0.0947, "step": 13366 }, { "epoch": 2.793521421107628, "grad_norm": 1.1322966954596387, "learning_rate": 1.1595089832580448e-05, "loss": 0.1249, "step": 13367 }, { "epoch": 2.793730407523511, "grad_norm": 0.9162706845512075, "learning_rate": 1.1593976144897642e-05, "loss": 0.1117, "step": 13368 }, { "epoch": 2.793939393939394, "grad_norm": 1.1178929442598835, "learning_rate": 1.1592862436928969e-05, "loss": 0.1338, "step": 13369 }, { "epoch": 2.794148380355277, "grad_norm": 0.9036026064370763, "learning_rate": 1.1591748708688603e-05, "loss": 0.108, "step": 13370 }, { "epoch": 2.79435736677116, "grad_norm": 1.170364885763156, "learning_rate": 1.1590634960190722e-05, "loss": 0.1129, "step": 13371 }, { "epoch": 2.794566353187043, "grad_norm": 1.048083170943725, "learning_rate": 1.1589521191449498e-05, "loss": 0.1358, "step": 13372 }, { "epoch": 2.794775339602926, "grad_norm": 1.0488571560155164, "learning_rate": 1.1588407402479099e-05, "loss": 0.1369, "step": 13373 }, { "epoch": 2.7949843260188088, "grad_norm": 0.8244011969460492, "learning_rate": 1.1587293593293712e-05, "loss": 0.0967, "step": 13374 }, { "epoch": 2.7951933124346917, "grad_norm": 1.0066821465503295, "learning_rate": 1.15861797639075e-05, "loss": 0.1179, "step": 13375 }, { "epoch": 2.7954022988505747, "grad_norm": 0.7851203355253135, "learning_rate": 1.1585065914334646e-05, "loss": 0.0855, "step": 13376 }, { "epoch": 2.7956112852664576, "grad_norm": 0.791880216470578, "learning_rate": 1.1583952044589324e-05, "loss": 0.0893, "step": 13377 }, { "epoch": 2.7958202716823406, "grad_norm": 0.9708485064079336, "learning_rate": 1.158283815468571e-05, "loss": 0.1046, "step": 13378 }, { "epoch": 2.7960292580982236, "grad_norm": 0.8058445821067746, "learning_rate": 1.1581724244637976e-05, "loss": 0.0999, "step": 13379 }, { "epoch": 2.7962382445141065, "grad_norm": 1.0434240520203253, "learning_rate": 1.1580610314460304e-05, "loss": 0.1285, "step": 13380 }, { "epoch": 2.7964472309299895, "grad_norm": 0.9816951802888391, "learning_rate": 1.1579496364166866e-05, "loss": 0.1277, "step": 13381 }, { "epoch": 2.7966562173458724, "grad_norm": 0.9288414016398051, "learning_rate": 1.1578382393771841e-05, "loss": 0.1276, "step": 13382 }, { "epoch": 2.7968652037617554, "grad_norm": 1.253491235272471, "learning_rate": 1.1577268403289407e-05, "loss": 0.1569, "step": 13383 }, { "epoch": 2.7970741901776384, "grad_norm": 0.9319280480097879, "learning_rate": 1.1576154392733738e-05, "loss": 0.1228, "step": 13384 }, { "epoch": 2.7972831765935213, "grad_norm": 0.9136033892102434, "learning_rate": 1.1575040362119014e-05, "loss": 0.0918, "step": 13385 }, { "epoch": 2.7974921630094043, "grad_norm": 0.8596916157900898, "learning_rate": 1.1573926311459411e-05, "loss": 0.1251, "step": 13386 }, { "epoch": 2.7977011494252872, "grad_norm": 0.9961744676326604, "learning_rate": 1.1572812240769112e-05, "loss": 0.1336, "step": 13387 }, { "epoch": 2.79791013584117, "grad_norm": 1.149950400402352, "learning_rate": 1.1571698150062287e-05, "loss": 0.1321, "step": 13388 }, { "epoch": 2.798119122257053, "grad_norm": 0.9272828030386172, "learning_rate": 1.1570584039353124e-05, "loss": 0.1197, "step": 13389 }, { "epoch": 2.798328108672936, "grad_norm": 0.8862160864039119, "learning_rate": 1.1569469908655795e-05, "loss": 0.1013, "step": 13390 }, { "epoch": 2.798537095088819, "grad_norm": 0.9706957200171084, "learning_rate": 1.156835575798448e-05, "loss": 0.12, "step": 13391 }, { "epoch": 2.798746081504702, "grad_norm": 0.9472587339792352, "learning_rate": 1.156724158735336e-05, "loss": 0.0951, "step": 13392 }, { "epoch": 2.798955067920585, "grad_norm": 0.8621957618363252, "learning_rate": 1.1566127396776612e-05, "loss": 0.1136, "step": 13393 }, { "epoch": 2.799164054336468, "grad_norm": 0.9141625093493627, "learning_rate": 1.156501318626842e-05, "loss": 0.1244, "step": 13394 }, { "epoch": 2.799373040752351, "grad_norm": 0.9537945767647302, "learning_rate": 1.1563898955842965e-05, "loss": 0.1234, "step": 13395 }, { "epoch": 2.799582027168234, "grad_norm": 0.8267949705813709, "learning_rate": 1.1562784705514418e-05, "loss": 0.1123, "step": 13396 }, { "epoch": 2.799791013584117, "grad_norm": 0.9346978740359808, "learning_rate": 1.156167043529697e-05, "loss": 0.1359, "step": 13397 }, { "epoch": 2.8, "grad_norm": 1.030343697306614, "learning_rate": 1.1560556145204795e-05, "loss": 0.1184, "step": 13398 }, { "epoch": 2.800208986415883, "grad_norm": 1.09351229122666, "learning_rate": 1.1559441835252078e-05, "loss": 0.1263, "step": 13399 }, { "epoch": 2.8004179728317657, "grad_norm": 0.9318903083438181, "learning_rate": 1.1558327505453e-05, "loss": 0.1311, "step": 13400 }, { "epoch": 2.8006269592476487, "grad_norm": 1.072474851537048, "learning_rate": 1.1557213155821742e-05, "loss": 0.1516, "step": 13401 }, { "epoch": 2.8008359456635317, "grad_norm": 0.9077468820758305, "learning_rate": 1.1556098786372485e-05, "loss": 0.1225, "step": 13402 }, { "epoch": 2.8010449320794146, "grad_norm": 0.7889117598569308, "learning_rate": 1.1554984397119412e-05, "loss": 0.0914, "step": 13403 }, { "epoch": 2.8012539184952976, "grad_norm": 0.8541327270008857, "learning_rate": 1.1553869988076704e-05, "loss": 0.123, "step": 13404 }, { "epoch": 2.8014629049111806, "grad_norm": 0.8379669468067215, "learning_rate": 1.1552755559258544e-05, "loss": 0.0982, "step": 13405 }, { "epoch": 2.8016718913270635, "grad_norm": 0.9313743794326378, "learning_rate": 1.1551641110679117e-05, "loss": 0.1263, "step": 13406 }, { "epoch": 2.8018808777429465, "grad_norm": 1.1429051465130018, "learning_rate": 1.1550526642352606e-05, "loss": 0.1485, "step": 13407 }, { "epoch": 2.8020898641588294, "grad_norm": 1.0013060676779169, "learning_rate": 1.1549412154293193e-05, "loss": 0.1123, "step": 13408 }, { "epoch": 2.8022988505747124, "grad_norm": 0.9675098336761174, "learning_rate": 1.1548297646515059e-05, "loss": 0.1203, "step": 13409 }, { "epoch": 2.8025078369905954, "grad_norm": 0.9736520984173169, "learning_rate": 1.1547183119032392e-05, "loss": 0.1229, "step": 13410 }, { "epoch": 2.8027168234064783, "grad_norm": 1.0235384761774209, "learning_rate": 1.1546068571859375e-05, "loss": 0.1317, "step": 13411 }, { "epoch": 2.8029258098223613, "grad_norm": 0.9467875467287228, "learning_rate": 1.1544954005010191e-05, "loss": 0.1167, "step": 13412 }, { "epoch": 2.8031347962382442, "grad_norm": 1.0351633134798275, "learning_rate": 1.1543839418499028e-05, "loss": 0.1315, "step": 13413 }, { "epoch": 2.8033437826541276, "grad_norm": 0.8344006718343892, "learning_rate": 1.1542724812340066e-05, "loss": 0.1266, "step": 13414 }, { "epoch": 2.8035527690700106, "grad_norm": 0.9183968971128251, "learning_rate": 1.1541610186547494e-05, "loss": 0.1136, "step": 13415 }, { "epoch": 2.8037617554858936, "grad_norm": 0.9219594753013651, "learning_rate": 1.1540495541135494e-05, "loss": 0.1331, "step": 13416 }, { "epoch": 2.8039707419017765, "grad_norm": 1.0079576126712708, "learning_rate": 1.1539380876118256e-05, "loss": 0.1333, "step": 13417 }, { "epoch": 2.8041797283176595, "grad_norm": 1.096088772842362, "learning_rate": 1.1538266191509963e-05, "loss": 0.1464, "step": 13418 }, { "epoch": 2.8043887147335425, "grad_norm": 0.9174426000052691, "learning_rate": 1.1537151487324805e-05, "loss": 0.1232, "step": 13419 }, { "epoch": 2.8045977011494254, "grad_norm": 0.852802232070574, "learning_rate": 1.153603676357696e-05, "loss": 0.1135, "step": 13420 }, { "epoch": 2.8048066875653084, "grad_norm": 0.7596684666001547, "learning_rate": 1.153492202028062e-05, "loss": 0.0987, "step": 13421 }, { "epoch": 2.8050156739811913, "grad_norm": 0.8698106307915534, "learning_rate": 1.1533807257449974e-05, "loss": 0.101, "step": 13422 }, { "epoch": 2.8052246603970743, "grad_norm": 0.9025667696270675, "learning_rate": 1.1532692475099205e-05, "loss": 0.1031, "step": 13423 }, { "epoch": 2.8054336468129573, "grad_norm": 0.7364991393835462, "learning_rate": 1.1531577673242502e-05, "loss": 0.0974, "step": 13424 }, { "epoch": 2.80564263322884, "grad_norm": 1.1522761909177417, "learning_rate": 1.1530462851894058e-05, "loss": 0.1361, "step": 13425 }, { "epoch": 2.805851619644723, "grad_norm": 1.0226796588903027, "learning_rate": 1.1529348011068049e-05, "loss": 0.1389, "step": 13426 }, { "epoch": 2.806060606060606, "grad_norm": 1.0349693367199113, "learning_rate": 1.152823315077867e-05, "loss": 0.131, "step": 13427 }, { "epoch": 2.806269592476489, "grad_norm": 1.0530931756555089, "learning_rate": 1.1527118271040112e-05, "loss": 0.1464, "step": 13428 }, { "epoch": 2.806478578892372, "grad_norm": 1.020795130325141, "learning_rate": 1.152600337186656e-05, "loss": 0.1257, "step": 13429 }, { "epoch": 2.806687565308255, "grad_norm": 0.8835453138081544, "learning_rate": 1.15248884532722e-05, "loss": 0.0905, "step": 13430 }, { "epoch": 2.806896551724138, "grad_norm": 0.858893614934098, "learning_rate": 1.1523773515271229e-05, "loss": 0.1127, "step": 13431 }, { "epoch": 2.807105538140021, "grad_norm": 1.0038769370574427, "learning_rate": 1.1522658557877831e-05, "loss": 0.1117, "step": 13432 }, { "epoch": 2.807314524555904, "grad_norm": 0.889172409730133, "learning_rate": 1.1521543581106194e-05, "loss": 0.1147, "step": 13433 }, { "epoch": 2.807523510971787, "grad_norm": 0.9197948257125835, "learning_rate": 1.1520428584970512e-05, "loss": 0.117, "step": 13434 }, { "epoch": 2.80773249738767, "grad_norm": 0.9680985060616641, "learning_rate": 1.1519313569484974e-05, "loss": 0.1117, "step": 13435 }, { "epoch": 2.807941483803553, "grad_norm": 1.125571960699247, "learning_rate": 1.1518198534663766e-05, "loss": 0.1554, "step": 13436 }, { "epoch": 2.8081504702194358, "grad_norm": 0.9234962842366577, "learning_rate": 1.1517083480521087e-05, "loss": 0.1179, "step": 13437 }, { "epoch": 2.8083594566353187, "grad_norm": 0.9170813435737049, "learning_rate": 1.1515968407071123e-05, "loss": 0.1041, "step": 13438 }, { "epoch": 2.8085684430512017, "grad_norm": 1.2287829676200854, "learning_rate": 1.1514853314328062e-05, "loss": 0.1368, "step": 13439 }, { "epoch": 2.8087774294670846, "grad_norm": 0.8040167244066356, "learning_rate": 1.1513738202306101e-05, "loss": 0.0956, "step": 13440 }, { "epoch": 2.8089864158829676, "grad_norm": 0.9641305979804573, "learning_rate": 1.151262307101943e-05, "loss": 0.1214, "step": 13441 }, { "epoch": 2.8091954022988506, "grad_norm": 1.0892812828459704, "learning_rate": 1.1511507920482236e-05, "loss": 0.135, "step": 13442 }, { "epoch": 2.8094043887147335, "grad_norm": 1.1532070283375122, "learning_rate": 1.1510392750708719e-05, "loss": 0.1342, "step": 13443 }, { "epoch": 2.8096133751306165, "grad_norm": 1.009790005742838, "learning_rate": 1.1509277561713066e-05, "loss": 0.1203, "step": 13444 }, { "epoch": 2.8098223615464994, "grad_norm": 1.0009280665278637, "learning_rate": 1.1508162353509469e-05, "loss": 0.1236, "step": 13445 }, { "epoch": 2.8100313479623824, "grad_norm": 0.956668489645291, "learning_rate": 1.1507047126112124e-05, "loss": 0.1022, "step": 13446 }, { "epoch": 2.8102403343782654, "grad_norm": 1.1858693176730883, "learning_rate": 1.1505931879535224e-05, "loss": 0.143, "step": 13447 }, { "epoch": 2.8104493207941483, "grad_norm": 0.9091498382269284, "learning_rate": 1.1504816613792958e-05, "loss": 0.0942, "step": 13448 }, { "epoch": 2.8106583072100313, "grad_norm": 0.9573709045269645, "learning_rate": 1.1503701328899526e-05, "loss": 0.1058, "step": 13449 }, { "epoch": 2.8108672936259143, "grad_norm": 0.8889401244471935, "learning_rate": 1.1502586024869115e-05, "loss": 0.1114, "step": 13450 }, { "epoch": 2.811076280041797, "grad_norm": 0.7914049492219007, "learning_rate": 1.1501470701715922e-05, "loss": 0.1136, "step": 13451 }, { "epoch": 2.81128526645768, "grad_norm": 1.0812772678757925, "learning_rate": 1.1500355359454143e-05, "loss": 0.1455, "step": 13452 }, { "epoch": 2.811494252873563, "grad_norm": 0.8750455053419677, "learning_rate": 1.1499239998097973e-05, "loss": 0.1232, "step": 13453 }, { "epoch": 2.811703239289446, "grad_norm": 1.0020366760199475, "learning_rate": 1.1498124617661602e-05, "loss": 0.1191, "step": 13454 }, { "epoch": 2.811912225705329, "grad_norm": 0.9441166033740361, "learning_rate": 1.1497009218159227e-05, "loss": 0.1205, "step": 13455 }, { "epoch": 2.812121212121212, "grad_norm": 0.9044810387967022, "learning_rate": 1.1495893799605045e-05, "loss": 0.1195, "step": 13456 }, { "epoch": 2.812330198537095, "grad_norm": 0.9633428088387235, "learning_rate": 1.149477836201325e-05, "loss": 0.124, "step": 13457 }, { "epoch": 2.812539184952978, "grad_norm": 1.0706538239574699, "learning_rate": 1.149366290539804e-05, "loss": 0.1376, "step": 13458 }, { "epoch": 2.812748171368861, "grad_norm": 0.9217605949121445, "learning_rate": 1.1492547429773607e-05, "loss": 0.1094, "step": 13459 }, { "epoch": 2.812957157784744, "grad_norm": 0.8036331506613903, "learning_rate": 1.149143193515415e-05, "loss": 0.1169, "step": 13460 }, { "epoch": 2.813166144200627, "grad_norm": 1.025993935293242, "learning_rate": 1.1490316421553862e-05, "loss": 0.139, "step": 13461 }, { "epoch": 2.81337513061651, "grad_norm": 0.9941564424237369, "learning_rate": 1.1489200888986945e-05, "loss": 0.1121, "step": 13462 }, { "epoch": 2.8135841170323928, "grad_norm": 0.8116989110950343, "learning_rate": 1.148808533746759e-05, "loss": 0.1052, "step": 13463 }, { "epoch": 2.8137931034482757, "grad_norm": 0.807050013786946, "learning_rate": 1.1486969767010003e-05, "loss": 0.1029, "step": 13464 }, { "epoch": 2.8140020898641587, "grad_norm": 1.0828509039275107, "learning_rate": 1.1485854177628373e-05, "loss": 0.1191, "step": 13465 }, { "epoch": 2.8142110762800416, "grad_norm": 0.9261701413751239, "learning_rate": 1.14847385693369e-05, "loss": 0.1238, "step": 13466 }, { "epoch": 2.814420062695925, "grad_norm": 0.8779148798004646, "learning_rate": 1.1483622942149782e-05, "loss": 0.1144, "step": 13467 }, { "epoch": 2.814629049111808, "grad_norm": 1.2163482841325017, "learning_rate": 1.1482507296081219e-05, "loss": 0.1143, "step": 13468 }, { "epoch": 2.814838035527691, "grad_norm": 0.982157932214383, "learning_rate": 1.1481391631145405e-05, "loss": 0.1201, "step": 13469 }, { "epoch": 2.815047021943574, "grad_norm": 0.8819958089204305, "learning_rate": 1.1480275947356545e-05, "loss": 0.1031, "step": 13470 }, { "epoch": 2.815256008359457, "grad_norm": 0.9130560702445705, "learning_rate": 1.147916024472883e-05, "loss": 0.128, "step": 13471 }, { "epoch": 2.81546499477534, "grad_norm": 1.1929934596126006, "learning_rate": 1.1478044523276466e-05, "loss": 0.1351, "step": 13472 }, { "epoch": 2.815673981191223, "grad_norm": 1.1330933273848294, "learning_rate": 1.1476928783013648e-05, "loss": 0.1434, "step": 13473 }, { "epoch": 2.8158829676071058, "grad_norm": 0.9814235427300717, "learning_rate": 1.1475813023954578e-05, "loss": 0.1247, "step": 13474 }, { "epoch": 2.8160919540229887, "grad_norm": 0.9527528638500767, "learning_rate": 1.1474697246113455e-05, "loss": 0.112, "step": 13475 }, { "epoch": 2.8163009404388717, "grad_norm": 0.820305333850966, "learning_rate": 1.147358144950448e-05, "loss": 0.1085, "step": 13476 }, { "epoch": 2.8165099268547547, "grad_norm": 0.847098789538906, "learning_rate": 1.1472465634141851e-05, "loss": 0.1046, "step": 13477 }, { "epoch": 2.8167189132706376, "grad_norm": 0.945628746434054, "learning_rate": 1.147134980003977e-05, "loss": 0.1283, "step": 13478 }, { "epoch": 2.8169278996865206, "grad_norm": 0.8564369512262541, "learning_rate": 1.1470233947212436e-05, "loss": 0.1219, "step": 13479 }, { "epoch": 2.8171368861024035, "grad_norm": 1.012761250367544, "learning_rate": 1.1469118075674055e-05, "loss": 0.1123, "step": 13480 }, { "epoch": 2.8173458725182865, "grad_norm": 1.0188463731510318, "learning_rate": 1.1468002185438822e-05, "loss": 0.1418, "step": 13481 }, { "epoch": 2.8175548589341695, "grad_norm": 1.333838338306195, "learning_rate": 1.1466886276520943e-05, "loss": 0.1398, "step": 13482 }, { "epoch": 2.8177638453500524, "grad_norm": 1.1144472621397985, "learning_rate": 1.1465770348934617e-05, "loss": 0.1169, "step": 13483 }, { "epoch": 2.8179728317659354, "grad_norm": 0.8586339874739191, "learning_rate": 1.1464654402694047e-05, "loss": 0.0971, "step": 13484 }, { "epoch": 2.8181818181818183, "grad_norm": 0.762403649904372, "learning_rate": 1.1463538437813435e-05, "loss": 0.1069, "step": 13485 }, { "epoch": 2.8183908045977013, "grad_norm": 0.9258910786942277, "learning_rate": 1.1462422454306983e-05, "loss": 0.1271, "step": 13486 }, { "epoch": 2.8185997910135843, "grad_norm": 1.107907672723716, "learning_rate": 1.1461306452188894e-05, "loss": 0.1231, "step": 13487 }, { "epoch": 2.8188087774294672, "grad_norm": 0.9942047683741737, "learning_rate": 1.1460190431473373e-05, "loss": 0.1145, "step": 13488 }, { "epoch": 2.81901776384535, "grad_norm": 1.0057658699542997, "learning_rate": 1.1459074392174619e-05, "loss": 0.1156, "step": 13489 }, { "epoch": 2.819226750261233, "grad_norm": 1.254807990360935, "learning_rate": 1.1457958334306838e-05, "loss": 0.1189, "step": 13490 }, { "epoch": 2.819435736677116, "grad_norm": 1.0720445971684067, "learning_rate": 1.145684225788423e-05, "loss": 0.1248, "step": 13491 }, { "epoch": 2.819644723092999, "grad_norm": 0.979097112215005, "learning_rate": 1.1455726162921006e-05, "loss": 0.0911, "step": 13492 }, { "epoch": 2.819853709508882, "grad_norm": 1.1534564528789069, "learning_rate": 1.1454610049431365e-05, "loss": 0.1375, "step": 13493 }, { "epoch": 2.820062695924765, "grad_norm": 1.104155105503425, "learning_rate": 1.1453493917429514e-05, "loss": 0.1336, "step": 13494 }, { "epoch": 2.820271682340648, "grad_norm": 1.0792094587162893, "learning_rate": 1.1452377766929654e-05, "loss": 0.1392, "step": 13495 }, { "epoch": 2.820480668756531, "grad_norm": 0.9504834907069022, "learning_rate": 1.145126159794599e-05, "loss": 0.0965, "step": 13496 }, { "epoch": 2.820689655172414, "grad_norm": 1.037951897526911, "learning_rate": 1.1450145410492733e-05, "loss": 0.1251, "step": 13497 }, { "epoch": 2.820898641588297, "grad_norm": 0.9878185014922684, "learning_rate": 1.1449029204584079e-05, "loss": 0.1234, "step": 13498 }, { "epoch": 2.82110762800418, "grad_norm": 1.2456788020035234, "learning_rate": 1.1447912980234239e-05, "loss": 0.1426, "step": 13499 }, { "epoch": 2.8213166144200628, "grad_norm": 1.0422031694259484, "learning_rate": 1.1446796737457424e-05, "loss": 0.1226, "step": 13500 }, { "epoch": 2.8215256008359457, "grad_norm": 1.0716734006759738, "learning_rate": 1.1445680476267829e-05, "loss": 0.1396, "step": 13501 }, { "epoch": 2.8217345872518287, "grad_norm": 0.9708578219547693, "learning_rate": 1.1444564196679667e-05, "loss": 0.1173, "step": 13502 }, { "epoch": 2.8219435736677116, "grad_norm": 0.9717993039580238, "learning_rate": 1.1443447898707139e-05, "loss": 0.111, "step": 13503 }, { "epoch": 2.8221525600835946, "grad_norm": 0.9580223209707568, "learning_rate": 1.144233158236446e-05, "loss": 0.1107, "step": 13504 }, { "epoch": 2.8223615464994776, "grad_norm": 0.6893912441298512, "learning_rate": 1.144121524766583e-05, "loss": 0.0924, "step": 13505 }, { "epoch": 2.8225705329153605, "grad_norm": 0.8900829500454435, "learning_rate": 1.1440098894625457e-05, "loss": 0.1021, "step": 13506 }, { "epoch": 2.8227795193312435, "grad_norm": 1.089182337577674, "learning_rate": 1.1438982523257552e-05, "loss": 0.1178, "step": 13507 }, { "epoch": 2.8229885057471265, "grad_norm": 1.0141394391656036, "learning_rate": 1.1437866133576318e-05, "loss": 0.1291, "step": 13508 }, { "epoch": 2.8231974921630094, "grad_norm": 0.9293147868384164, "learning_rate": 1.1436749725595967e-05, "loss": 0.1135, "step": 13509 }, { "epoch": 2.8234064785788924, "grad_norm": 1.0389674212447089, "learning_rate": 1.1435633299330706e-05, "loss": 0.1401, "step": 13510 }, { "epoch": 2.8236154649947753, "grad_norm": 0.9304030352196038, "learning_rate": 1.143451685479474e-05, "loss": 0.1037, "step": 13511 }, { "epoch": 2.8238244514106583, "grad_norm": 1.0976889662839546, "learning_rate": 1.1433400392002282e-05, "loss": 0.1336, "step": 13512 }, { "epoch": 2.8240334378265413, "grad_norm": 1.147179089855397, "learning_rate": 1.1432283910967539e-05, "loss": 0.1597, "step": 13513 }, { "epoch": 2.824242424242424, "grad_norm": 0.9439496933073852, "learning_rate": 1.1431167411704717e-05, "loss": 0.1054, "step": 13514 }, { "epoch": 2.824451410658307, "grad_norm": 1.0607201125154098, "learning_rate": 1.1430050894228028e-05, "loss": 0.114, "step": 13515 }, { "epoch": 2.82466039707419, "grad_norm": 1.317954578799499, "learning_rate": 1.1428934358551687e-05, "loss": 0.1458, "step": 13516 }, { "epoch": 2.824869383490073, "grad_norm": 1.219253509341985, "learning_rate": 1.142781780468989e-05, "loss": 0.1498, "step": 13517 }, { "epoch": 2.825078369905956, "grad_norm": 1.3795480139404597, "learning_rate": 1.142670123265686e-05, "loss": 0.1341, "step": 13518 }, { "epoch": 2.825287356321839, "grad_norm": 0.9285830388038251, "learning_rate": 1.14255846424668e-05, "loss": 0.114, "step": 13519 }, { "epoch": 2.825496342737722, "grad_norm": 0.8098619972759243, "learning_rate": 1.1424468034133925e-05, "loss": 0.1092, "step": 13520 }, { "epoch": 2.825705329153605, "grad_norm": 0.7485720433645034, "learning_rate": 1.142335140767244e-05, "loss": 0.1008, "step": 13521 }, { "epoch": 2.825914315569488, "grad_norm": 1.0962311020274431, "learning_rate": 1.1422234763096563e-05, "loss": 0.1154, "step": 13522 }, { "epoch": 2.826123301985371, "grad_norm": 0.8457503035114822, "learning_rate": 1.1421118100420497e-05, "loss": 0.113, "step": 13523 }, { "epoch": 2.826332288401254, "grad_norm": 1.048865500213969, "learning_rate": 1.1420001419658459e-05, "loss": 0.1497, "step": 13524 }, { "epoch": 2.826541274817137, "grad_norm": 0.982405439264025, "learning_rate": 1.1418884720824658e-05, "loss": 0.1091, "step": 13525 }, { "epoch": 2.8267502612330198, "grad_norm": 1.0181950482048818, "learning_rate": 1.1417768003933308e-05, "loss": 0.1265, "step": 13526 }, { "epoch": 2.8269592476489027, "grad_norm": 0.8326343607067764, "learning_rate": 1.1416651268998617e-05, "loss": 0.0992, "step": 13527 }, { "epoch": 2.8271682340647857, "grad_norm": 0.8729878116312766, "learning_rate": 1.1415534516034803e-05, "loss": 0.127, "step": 13528 }, { "epoch": 2.8273772204806686, "grad_norm": 0.753684368605987, "learning_rate": 1.1414417745056073e-05, "loss": 0.0927, "step": 13529 }, { "epoch": 2.8275862068965516, "grad_norm": 0.9246143419637806, "learning_rate": 1.1413300956076643e-05, "loss": 0.1312, "step": 13530 }, { "epoch": 2.8277951933124346, "grad_norm": 0.787977565416397, "learning_rate": 1.1412184149110723e-05, "loss": 0.0994, "step": 13531 }, { "epoch": 2.8280041797283175, "grad_norm": 0.97367740079064, "learning_rate": 1.1411067324172529e-05, "loss": 0.1096, "step": 13532 }, { "epoch": 2.8282131661442005, "grad_norm": 0.9494557981103954, "learning_rate": 1.1409950481276274e-05, "loss": 0.1308, "step": 13533 }, { "epoch": 2.8284221525600834, "grad_norm": 0.9144395234788817, "learning_rate": 1.140883362043617e-05, "loss": 0.1282, "step": 13534 }, { "epoch": 2.8286311389759664, "grad_norm": 0.9952171399975439, "learning_rate": 1.1407716741666433e-05, "loss": 0.1159, "step": 13535 }, { "epoch": 2.8288401253918494, "grad_norm": 0.8610807961282394, "learning_rate": 1.1406599844981275e-05, "loss": 0.1125, "step": 13536 }, { "epoch": 2.8290491118077323, "grad_norm": 0.8561446612574619, "learning_rate": 1.1405482930394911e-05, "loss": 0.1061, "step": 13537 }, { "epoch": 2.8292580982236153, "grad_norm": 0.9399543819173487, "learning_rate": 1.1404365997921553e-05, "loss": 0.1163, "step": 13538 }, { "epoch": 2.8294670846394983, "grad_norm": 1.0568410824894743, "learning_rate": 1.1403249047575423e-05, "loss": 0.1215, "step": 13539 }, { "epoch": 2.829676071055381, "grad_norm": 0.8659937212416997, "learning_rate": 1.1402132079370729e-05, "loss": 0.1239, "step": 13540 }, { "epoch": 2.829885057471264, "grad_norm": 0.805551987732883, "learning_rate": 1.1401015093321688e-05, "loss": 0.1015, "step": 13541 }, { "epoch": 2.830094043887147, "grad_norm": 1.034520228244375, "learning_rate": 1.1399898089442515e-05, "loss": 0.1124, "step": 13542 }, { "epoch": 2.83030303030303, "grad_norm": 0.8594314716729495, "learning_rate": 1.1398781067747428e-05, "loss": 0.1083, "step": 13543 }, { "epoch": 2.830512016718913, "grad_norm": 0.8448208847738011, "learning_rate": 1.1397664028250642e-05, "loss": 0.0938, "step": 13544 }, { "epoch": 2.830721003134796, "grad_norm": 0.9020734215058694, "learning_rate": 1.139654697096637e-05, "loss": 0.11, "step": 13545 }, { "epoch": 2.830929989550679, "grad_norm": 0.7988476783530171, "learning_rate": 1.1395429895908834e-05, "loss": 0.1109, "step": 13546 }, { "epoch": 2.831138975966562, "grad_norm": 1.0069098359812583, "learning_rate": 1.1394312803092244e-05, "loss": 0.113, "step": 13547 }, { "epoch": 2.831347962382445, "grad_norm": 0.929592880723625, "learning_rate": 1.1393195692530823e-05, "loss": 0.1413, "step": 13548 }, { "epoch": 2.831556948798328, "grad_norm": 0.962760313415654, "learning_rate": 1.1392078564238782e-05, "loss": 0.1191, "step": 13549 }, { "epoch": 2.831765935214211, "grad_norm": 0.9336872909335605, "learning_rate": 1.1390961418230342e-05, "loss": 0.12, "step": 13550 }, { "epoch": 2.831974921630094, "grad_norm": 1.231247969863736, "learning_rate": 1.1389844254519722e-05, "loss": 0.1044, "step": 13551 }, { "epoch": 2.8321839080459768, "grad_norm": 0.9277887126578817, "learning_rate": 1.1388727073121136e-05, "loss": 0.1079, "step": 13552 }, { "epoch": 2.8323928944618597, "grad_norm": 0.9774463357129983, "learning_rate": 1.1387609874048804e-05, "loss": 0.1101, "step": 13553 }, { "epoch": 2.8326018808777427, "grad_norm": 1.1265489615008728, "learning_rate": 1.138649265731694e-05, "loss": 0.1392, "step": 13554 }, { "epoch": 2.832810867293626, "grad_norm": 0.957113312719553, "learning_rate": 1.1385375422939769e-05, "loss": 0.1289, "step": 13555 }, { "epoch": 2.833019853709509, "grad_norm": 0.8361980460561607, "learning_rate": 1.1384258170931505e-05, "loss": 0.0928, "step": 13556 }, { "epoch": 2.833228840125392, "grad_norm": 0.7245047437699091, "learning_rate": 1.138314090130637e-05, "loss": 0.0803, "step": 13557 }, { "epoch": 2.833437826541275, "grad_norm": 1.0035948046838494, "learning_rate": 1.1382023614078582e-05, "loss": 0.1174, "step": 13558 }, { "epoch": 2.833646812957158, "grad_norm": 0.9738448859787269, "learning_rate": 1.1380906309262355e-05, "loss": 0.1068, "step": 13559 }, { "epoch": 2.833855799373041, "grad_norm": 0.8853108842053644, "learning_rate": 1.1379788986871917e-05, "loss": 0.1157, "step": 13560 }, { "epoch": 2.834064785788924, "grad_norm": 1.2142467597663187, "learning_rate": 1.137867164692148e-05, "loss": 0.1446, "step": 13561 }, { "epoch": 2.834273772204807, "grad_norm": 0.9452449369719692, "learning_rate": 1.137755428942527e-05, "loss": 0.1167, "step": 13562 }, { "epoch": 2.8344827586206898, "grad_norm": 0.9753927257566588, "learning_rate": 1.1376436914397507e-05, "loss": 0.1071, "step": 13563 }, { "epoch": 2.8346917450365727, "grad_norm": 1.0080350472176745, "learning_rate": 1.1375319521852404e-05, "loss": 0.1153, "step": 13564 }, { "epoch": 2.8349007314524557, "grad_norm": 0.8808173768013631, "learning_rate": 1.1374202111804187e-05, "loss": 0.1198, "step": 13565 }, { "epoch": 2.8351097178683387, "grad_norm": 0.7723941264492722, "learning_rate": 1.1373084684267077e-05, "loss": 0.0954, "step": 13566 }, { "epoch": 2.8353187042842216, "grad_norm": 0.9041694088396443, "learning_rate": 1.1371967239255297e-05, "loss": 0.0993, "step": 13567 }, { "epoch": 2.8355276907001046, "grad_norm": 0.987265519738405, "learning_rate": 1.1370849776783062e-05, "loss": 0.1204, "step": 13568 }, { "epoch": 2.8357366771159875, "grad_norm": 0.7632589288151028, "learning_rate": 1.1369732296864602e-05, "loss": 0.1138, "step": 13569 }, { "epoch": 2.8359456635318705, "grad_norm": 0.8698353709010391, "learning_rate": 1.1368614799514131e-05, "loss": 0.086, "step": 13570 }, { "epoch": 2.8361546499477535, "grad_norm": 1.0208897407334607, "learning_rate": 1.1367497284745873e-05, "loss": 0.1301, "step": 13571 }, { "epoch": 2.8363636363636364, "grad_norm": 0.8995653145108906, "learning_rate": 1.136637975257405e-05, "loss": 0.1118, "step": 13572 }, { "epoch": 2.8365726227795194, "grad_norm": 1.0082709939663543, "learning_rate": 1.136526220301289e-05, "loss": 0.1378, "step": 13573 }, { "epoch": 2.8367816091954023, "grad_norm": 0.8639288788346158, "learning_rate": 1.1364144636076604e-05, "loss": 0.1045, "step": 13574 }, { "epoch": 2.8369905956112853, "grad_norm": 1.0297913873251074, "learning_rate": 1.1363027051779427e-05, "loss": 0.1318, "step": 13575 }, { "epoch": 2.8371995820271683, "grad_norm": 0.9161497566756234, "learning_rate": 1.1361909450135575e-05, "loss": 0.1276, "step": 13576 }, { "epoch": 2.8374085684430512, "grad_norm": 0.9787411006568741, "learning_rate": 1.1360791831159275e-05, "loss": 0.1221, "step": 13577 }, { "epoch": 2.837617554858934, "grad_norm": 0.952361779678971, "learning_rate": 1.1359674194864745e-05, "loss": 0.1197, "step": 13578 }, { "epoch": 2.837826541274817, "grad_norm": 0.8718318371370677, "learning_rate": 1.1358556541266215e-05, "loss": 0.1131, "step": 13579 }, { "epoch": 2.8380355276907, "grad_norm": 1.0592411785338267, "learning_rate": 1.1357438870377902e-05, "loss": 0.1435, "step": 13580 }, { "epoch": 2.838244514106583, "grad_norm": 1.0318043187097075, "learning_rate": 1.1356321182214038e-05, "loss": 0.1247, "step": 13581 }, { "epoch": 2.838453500522466, "grad_norm": 0.9868365438041964, "learning_rate": 1.1355203476788841e-05, "loss": 0.1126, "step": 13582 }, { "epoch": 2.838662486938349, "grad_norm": 1.0339399163570298, "learning_rate": 1.1354085754116539e-05, "loss": 0.0929, "step": 13583 }, { "epoch": 2.838871473354232, "grad_norm": 0.8958627987622104, "learning_rate": 1.1352968014211357e-05, "loss": 0.1116, "step": 13584 }, { "epoch": 2.839080459770115, "grad_norm": 1.0840511367915255, "learning_rate": 1.135185025708752e-05, "loss": 0.1281, "step": 13585 }, { "epoch": 2.839289446185998, "grad_norm": 1.0098000458000274, "learning_rate": 1.1350732482759246e-05, "loss": 0.125, "step": 13586 }, { "epoch": 2.839498432601881, "grad_norm": 0.8505330341796304, "learning_rate": 1.1349614691240771e-05, "loss": 0.1146, "step": 13587 }, { "epoch": 2.839707419017764, "grad_norm": 0.8820214540941603, "learning_rate": 1.1348496882546315e-05, "loss": 0.1073, "step": 13588 }, { "epoch": 2.8399164054336468, "grad_norm": 1.1448951280574795, "learning_rate": 1.1347379056690106e-05, "loss": 0.1404, "step": 13589 }, { "epoch": 2.8401253918495297, "grad_norm": 0.9844345384667672, "learning_rate": 1.1346261213686368e-05, "loss": 0.1221, "step": 13590 }, { "epoch": 2.8403343782654127, "grad_norm": 0.9785576340890182, "learning_rate": 1.1345143353549333e-05, "loss": 0.1084, "step": 13591 }, { "epoch": 2.8405433646812956, "grad_norm": 1.136832912137164, "learning_rate": 1.1344025476293216e-05, "loss": 0.1335, "step": 13592 }, { "epoch": 2.8407523510971786, "grad_norm": 1.1322745592640295, "learning_rate": 1.1342907581932257e-05, "loss": 0.1484, "step": 13593 }, { "epoch": 2.8409613375130616, "grad_norm": 0.9176027758903834, "learning_rate": 1.1341789670480671e-05, "loss": 0.109, "step": 13594 }, { "epoch": 2.8411703239289445, "grad_norm": 1.0280123998020185, "learning_rate": 1.1340671741952693e-05, "loss": 0.1429, "step": 13595 }, { "epoch": 2.8413793103448275, "grad_norm": 1.1044055447861991, "learning_rate": 1.133955379636255e-05, "loss": 0.1121, "step": 13596 }, { "epoch": 2.8415882967607105, "grad_norm": 1.1531166652495335, "learning_rate": 1.1338435833724468e-05, "loss": 0.1345, "step": 13597 }, { "epoch": 2.8417972831765934, "grad_norm": 1.1587613618456598, "learning_rate": 1.1337317854052672e-05, "loss": 0.1425, "step": 13598 }, { "epoch": 2.8420062695924764, "grad_norm": 0.9952252409464891, "learning_rate": 1.1336199857361397e-05, "loss": 0.0949, "step": 13599 }, { "epoch": 2.8422152560083593, "grad_norm": 0.8881766084821271, "learning_rate": 1.1335081843664864e-05, "loss": 0.1224, "step": 13600 }, { "epoch": 2.8424242424242423, "grad_norm": 1.111719002256249, "learning_rate": 1.1333963812977304e-05, "loss": 0.1193, "step": 13601 }, { "epoch": 2.8426332288401253, "grad_norm": 1.1178863331070161, "learning_rate": 1.1332845765312949e-05, "loss": 0.1284, "step": 13602 }, { "epoch": 2.842842215256008, "grad_norm": 0.8883359415989355, "learning_rate": 1.1331727700686024e-05, "loss": 0.0998, "step": 13603 }, { "epoch": 2.843051201671891, "grad_norm": 0.9464909594726657, "learning_rate": 1.133060961911076e-05, "loss": 0.1107, "step": 13604 }, { "epoch": 2.843260188087774, "grad_norm": 0.8185323896238881, "learning_rate": 1.1329491520601384e-05, "loss": 0.073, "step": 13605 }, { "epoch": 2.843469174503657, "grad_norm": 1.1963848099221048, "learning_rate": 1.1328373405172128e-05, "loss": 0.1481, "step": 13606 }, { "epoch": 2.84367816091954, "grad_norm": 0.966750222929001, "learning_rate": 1.1327255272837221e-05, "loss": 0.131, "step": 13607 }, { "epoch": 2.8438871473354235, "grad_norm": 1.1553619652194678, "learning_rate": 1.1326137123610894e-05, "loss": 0.1208, "step": 13608 }, { "epoch": 2.8440961337513064, "grad_norm": 0.8960865887832069, "learning_rate": 1.1325018957507377e-05, "loss": 0.1226, "step": 13609 }, { "epoch": 2.8443051201671894, "grad_norm": 1.0810777869786072, "learning_rate": 1.1323900774540896e-05, "loss": 0.1335, "step": 13610 }, { "epoch": 2.8445141065830724, "grad_norm": 1.0363358207995883, "learning_rate": 1.132278257472569e-05, "loss": 0.1258, "step": 13611 }, { "epoch": 2.8447230929989553, "grad_norm": 1.0964576075081676, "learning_rate": 1.1321664358075984e-05, "loss": 0.1281, "step": 13612 }, { "epoch": 2.8449320794148383, "grad_norm": 0.8109288935902702, "learning_rate": 1.1320546124606007e-05, "loss": 0.113, "step": 13613 }, { "epoch": 2.8451410658307212, "grad_norm": 1.0366377970109337, "learning_rate": 1.1319427874329998e-05, "loss": 0.0998, "step": 13614 }, { "epoch": 2.845350052246604, "grad_norm": 0.9830361101528824, "learning_rate": 1.1318309607262182e-05, "loss": 0.1378, "step": 13615 }, { "epoch": 2.845559038662487, "grad_norm": 1.0094689826998378, "learning_rate": 1.1317191323416794e-05, "loss": 0.1234, "step": 13616 }, { "epoch": 2.84576802507837, "grad_norm": 0.8564248547827131, "learning_rate": 1.1316073022808062e-05, "loss": 0.0922, "step": 13617 }, { "epoch": 2.845977011494253, "grad_norm": 0.8435607571828633, "learning_rate": 1.1314954705450223e-05, "loss": 0.0852, "step": 13618 }, { "epoch": 2.846185997910136, "grad_norm": 0.9436743698881253, "learning_rate": 1.1313836371357508e-05, "loss": 0.1081, "step": 13619 }, { "epoch": 2.846394984326019, "grad_norm": 0.826922679610496, "learning_rate": 1.1312718020544149e-05, "loss": 0.1059, "step": 13620 }, { "epoch": 2.846603970741902, "grad_norm": 0.8452728661907322, "learning_rate": 1.1311599653024376e-05, "loss": 0.1083, "step": 13621 }, { "epoch": 2.846812957157785, "grad_norm": 1.0896461868201035, "learning_rate": 1.1310481268812427e-05, "loss": 0.1444, "step": 13622 }, { "epoch": 2.847021943573668, "grad_norm": 0.9551462190870904, "learning_rate": 1.1309362867922531e-05, "loss": 0.1192, "step": 13623 }, { "epoch": 2.847230929989551, "grad_norm": 1.3396876045149095, "learning_rate": 1.1308244450368924e-05, "loss": 0.1084, "step": 13624 }, { "epoch": 2.847439916405434, "grad_norm": 0.9829617427781799, "learning_rate": 1.1307126016165839e-05, "loss": 0.1351, "step": 13625 }, { "epoch": 2.8476489028213168, "grad_norm": 0.8504324520361523, "learning_rate": 1.130600756532751e-05, "loss": 0.0979, "step": 13626 }, { "epoch": 2.8478578892371997, "grad_norm": 1.1281483990304506, "learning_rate": 1.1304889097868173e-05, "loss": 0.1399, "step": 13627 }, { "epoch": 2.8480668756530827, "grad_norm": 0.9746506344934895, "learning_rate": 1.1303770613802056e-05, "loss": 0.117, "step": 13628 }, { "epoch": 2.8482758620689657, "grad_norm": 0.9735626596222554, "learning_rate": 1.1302652113143398e-05, "loss": 0.1128, "step": 13629 }, { "epoch": 2.8484848484848486, "grad_norm": 0.6838987829742631, "learning_rate": 1.1301533595906434e-05, "loss": 0.0806, "step": 13630 }, { "epoch": 2.8486938349007316, "grad_norm": 1.1942228034961229, "learning_rate": 1.1300415062105398e-05, "loss": 0.1081, "step": 13631 }, { "epoch": 2.8489028213166145, "grad_norm": 0.8612177873208341, "learning_rate": 1.1299296511754528e-05, "loss": 0.1112, "step": 13632 }, { "epoch": 2.8491118077324975, "grad_norm": 0.7990581966806956, "learning_rate": 1.1298177944868055e-05, "loss": 0.1233, "step": 13633 }, { "epoch": 2.8493207941483805, "grad_norm": 0.9433092925225086, "learning_rate": 1.1297059361460213e-05, "loss": 0.1099, "step": 13634 }, { "epoch": 2.8495297805642634, "grad_norm": 1.0081947840353414, "learning_rate": 1.1295940761545242e-05, "loss": 0.1626, "step": 13635 }, { "epoch": 2.8497387669801464, "grad_norm": 0.8748317759428946, "learning_rate": 1.1294822145137379e-05, "loss": 0.1121, "step": 13636 }, { "epoch": 2.8499477533960293, "grad_norm": 1.1560052956493658, "learning_rate": 1.1293703512250856e-05, "loss": 0.1222, "step": 13637 }, { "epoch": 2.8501567398119123, "grad_norm": 1.108206142934419, "learning_rate": 1.1292584862899911e-05, "loss": 0.1242, "step": 13638 }, { "epoch": 2.8503657262277953, "grad_norm": 0.9109116071757906, "learning_rate": 1.1291466197098782e-05, "loss": 0.123, "step": 13639 }, { "epoch": 2.8505747126436782, "grad_norm": 0.9230768329054903, "learning_rate": 1.1290347514861704e-05, "loss": 0.1227, "step": 13640 }, { "epoch": 2.850783699059561, "grad_norm": 1.0710016534390916, "learning_rate": 1.1289228816202915e-05, "loss": 0.1208, "step": 13641 }, { "epoch": 2.850992685475444, "grad_norm": 1.0318933932829677, "learning_rate": 1.1288110101136654e-05, "loss": 0.1291, "step": 13642 }, { "epoch": 2.851201671891327, "grad_norm": 0.9779686622669767, "learning_rate": 1.1286991369677151e-05, "loss": 0.1076, "step": 13643 }, { "epoch": 2.85141065830721, "grad_norm": 1.0598351562051997, "learning_rate": 1.1285872621838654e-05, "loss": 0.1326, "step": 13644 }, { "epoch": 2.851619644723093, "grad_norm": 1.0848779108234996, "learning_rate": 1.1284753857635393e-05, "loss": 0.1383, "step": 13645 }, { "epoch": 2.851828631138976, "grad_norm": 1.0211966417010887, "learning_rate": 1.128363507708161e-05, "loss": 0.1448, "step": 13646 }, { "epoch": 2.852037617554859, "grad_norm": 0.976070040161427, "learning_rate": 1.128251628019154e-05, "loss": 0.1281, "step": 13647 }, { "epoch": 2.852246603970742, "grad_norm": 1.1893733510455764, "learning_rate": 1.1281397466979427e-05, "loss": 0.1416, "step": 13648 }, { "epoch": 2.852455590386625, "grad_norm": 0.8747379674695512, "learning_rate": 1.1280278637459502e-05, "loss": 0.1103, "step": 13649 }, { "epoch": 2.852664576802508, "grad_norm": 0.8585468606448995, "learning_rate": 1.1279159791646013e-05, "loss": 0.1105, "step": 13650 }, { "epoch": 2.852873563218391, "grad_norm": 0.8992466498881609, "learning_rate": 1.1278040929553191e-05, "loss": 0.1047, "step": 13651 }, { "epoch": 2.8530825496342738, "grad_norm": 0.8968047739154448, "learning_rate": 1.1276922051195278e-05, "loss": 0.1117, "step": 13652 }, { "epoch": 2.8532915360501567, "grad_norm": 0.8167047429474517, "learning_rate": 1.1275803156586513e-05, "loss": 0.1059, "step": 13653 }, { "epoch": 2.8535005224660397, "grad_norm": 1.519967806169871, "learning_rate": 1.1274684245741141e-05, "loss": 0.1162, "step": 13654 }, { "epoch": 2.8537095088819227, "grad_norm": 0.9299260867828011, "learning_rate": 1.1273565318673393e-05, "loss": 0.1329, "step": 13655 }, { "epoch": 2.8539184952978056, "grad_norm": 1.0833848676627256, "learning_rate": 1.1272446375397516e-05, "loss": 0.1089, "step": 13656 }, { "epoch": 2.8541274817136886, "grad_norm": 1.0289634928635412, "learning_rate": 1.1271327415927748e-05, "loss": 0.1066, "step": 13657 }, { "epoch": 2.8543364681295715, "grad_norm": 1.0538344655238576, "learning_rate": 1.1270208440278327e-05, "loss": 0.1314, "step": 13658 }, { "epoch": 2.8545454545454545, "grad_norm": 0.8400989811851177, "learning_rate": 1.12690894484635e-05, "loss": 0.1066, "step": 13659 }, { "epoch": 2.8547544409613375, "grad_norm": 0.9450314021034781, "learning_rate": 1.1267970440497501e-05, "loss": 0.1236, "step": 13660 }, { "epoch": 2.8549634273772204, "grad_norm": 0.9930838399817805, "learning_rate": 1.1266851416394573e-05, "loss": 0.1298, "step": 13661 }, { "epoch": 2.8551724137931034, "grad_norm": 0.9739942844822906, "learning_rate": 1.1265732376168965e-05, "loss": 0.1271, "step": 13662 }, { "epoch": 2.8553814002089863, "grad_norm": 0.9028997292106163, "learning_rate": 1.1264613319834907e-05, "loss": 0.1179, "step": 13663 }, { "epoch": 2.8555903866248693, "grad_norm": 0.8103662609589027, "learning_rate": 1.1263494247406648e-05, "loss": 0.1026, "step": 13664 }, { "epoch": 2.8557993730407523, "grad_norm": 1.019100109141156, "learning_rate": 1.1262375158898427e-05, "loss": 0.1436, "step": 13665 }, { "epoch": 2.8560083594566352, "grad_norm": 0.8171187376606424, "learning_rate": 1.1261256054324488e-05, "loss": 0.0963, "step": 13666 }, { "epoch": 2.856217345872518, "grad_norm": 0.6909331378426654, "learning_rate": 1.1260136933699069e-05, "loss": 0.0862, "step": 13667 }, { "epoch": 2.856426332288401, "grad_norm": 1.1003192966205138, "learning_rate": 1.125901779703642e-05, "loss": 0.124, "step": 13668 }, { "epoch": 2.856635318704284, "grad_norm": 1.0094477630146719, "learning_rate": 1.1257898644350778e-05, "loss": 0.112, "step": 13669 }, { "epoch": 2.856844305120167, "grad_norm": 0.8936046685581427, "learning_rate": 1.1256779475656388e-05, "loss": 0.118, "step": 13670 }, { "epoch": 2.85705329153605, "grad_norm": 0.9337911181078705, "learning_rate": 1.1255660290967493e-05, "loss": 0.1193, "step": 13671 }, { "epoch": 2.857262277951933, "grad_norm": 1.0089647541617743, "learning_rate": 1.125454109029834e-05, "loss": 0.112, "step": 13672 }, { "epoch": 2.857471264367816, "grad_norm": 1.1524526106172415, "learning_rate": 1.1253421873663162e-05, "loss": 0.1148, "step": 13673 }, { "epoch": 2.857680250783699, "grad_norm": 0.9520031076980675, "learning_rate": 1.1252302641076216e-05, "loss": 0.119, "step": 13674 }, { "epoch": 2.857889237199582, "grad_norm": 1.0085033916972206, "learning_rate": 1.1251183392551736e-05, "loss": 0.1112, "step": 13675 }, { "epoch": 2.858098223615465, "grad_norm": 1.1152993257375905, "learning_rate": 1.125006412810397e-05, "loss": 0.1393, "step": 13676 }, { "epoch": 2.858307210031348, "grad_norm": 0.9175775418744073, "learning_rate": 1.1248944847747167e-05, "loss": 0.1228, "step": 13677 }, { "epoch": 2.8585161964472308, "grad_norm": 0.9127182568360099, "learning_rate": 1.1247825551495564e-05, "loss": 0.1343, "step": 13678 }, { "epoch": 2.8587251828631137, "grad_norm": 1.0420253827263903, "learning_rate": 1.1246706239363408e-05, "loss": 0.1108, "step": 13679 }, { "epoch": 2.8589341692789967, "grad_norm": 0.888007702354871, "learning_rate": 1.1245586911364943e-05, "loss": 0.1009, "step": 13680 }, { "epoch": 2.8591431556948796, "grad_norm": 1.0456570177714557, "learning_rate": 1.124446756751442e-05, "loss": 0.1347, "step": 13681 }, { "epoch": 2.8593521421107626, "grad_norm": 0.9978819831617418, "learning_rate": 1.1243348207826077e-05, "loss": 0.1332, "step": 13682 }, { "epoch": 2.8595611285266456, "grad_norm": 0.9867951944144757, "learning_rate": 1.1242228832314168e-05, "loss": 0.1139, "step": 13683 }, { "epoch": 2.8597701149425285, "grad_norm": 0.871386206327284, "learning_rate": 1.124110944099293e-05, "loss": 0.1132, "step": 13684 }, { "epoch": 2.8599791013584115, "grad_norm": 1.13068974878133, "learning_rate": 1.1239990033876614e-05, "loss": 0.1204, "step": 13685 }, { "epoch": 2.8601880877742945, "grad_norm": 0.9317447599001512, "learning_rate": 1.1238870610979464e-05, "loss": 0.1008, "step": 13686 }, { "epoch": 2.8603970741901774, "grad_norm": 0.8863169725374478, "learning_rate": 1.123775117231573e-05, "loss": 0.1061, "step": 13687 }, { "epoch": 2.8606060606060604, "grad_norm": 0.9361437449528928, "learning_rate": 1.1236631717899654e-05, "loss": 0.126, "step": 13688 }, { "epoch": 2.8608150470219433, "grad_norm": 0.9359184438090099, "learning_rate": 1.1235512247745488e-05, "loss": 0.1159, "step": 13689 }, { "epoch": 2.8610240334378263, "grad_norm": 0.86069472642246, "learning_rate": 1.1234392761867473e-05, "loss": 0.1062, "step": 13690 }, { "epoch": 2.8612330198537093, "grad_norm": 0.7724161445734121, "learning_rate": 1.123327326027986e-05, "loss": 0.0971, "step": 13691 }, { "epoch": 2.861442006269592, "grad_norm": 1.165283384207512, "learning_rate": 1.1232153742996897e-05, "loss": 0.1678, "step": 13692 }, { "epoch": 2.861650992685475, "grad_norm": 0.9248445480685585, "learning_rate": 1.1231034210032827e-05, "loss": 0.1201, "step": 13693 }, { "epoch": 2.861859979101358, "grad_norm": 1.0940310433699763, "learning_rate": 1.1229914661401903e-05, "loss": 0.1299, "step": 13694 }, { "epoch": 2.862068965517241, "grad_norm": 1.2082359756304173, "learning_rate": 1.1228795097118375e-05, "loss": 0.1421, "step": 13695 }, { "epoch": 2.8622779519331245, "grad_norm": 0.9932743504875415, "learning_rate": 1.1227675517196484e-05, "loss": 0.1286, "step": 13696 }, { "epoch": 2.8624869383490075, "grad_norm": 0.9132491810694879, "learning_rate": 1.1226555921650482e-05, "loss": 0.1002, "step": 13697 }, { "epoch": 2.8626959247648904, "grad_norm": 0.9417664965638981, "learning_rate": 1.1225436310494617e-05, "loss": 0.133, "step": 13698 }, { "epoch": 2.8629049111807734, "grad_norm": 1.0096021354041593, "learning_rate": 1.122431668374314e-05, "loss": 0.1323, "step": 13699 }, { "epoch": 2.8631138975966564, "grad_norm": 0.8912921039298719, "learning_rate": 1.1223197041410297e-05, "loss": 0.0868, "step": 13700 }, { "epoch": 2.8633228840125393, "grad_norm": 0.8792499636843452, "learning_rate": 1.122207738351034e-05, "loss": 0.127, "step": 13701 }, { "epoch": 2.8635318704284223, "grad_norm": 0.9300286689062103, "learning_rate": 1.1220957710057517e-05, "loss": 0.1385, "step": 13702 }, { "epoch": 2.8637408568443052, "grad_norm": 1.0282182843606127, "learning_rate": 1.1219838021066075e-05, "loss": 0.1228, "step": 13703 }, { "epoch": 2.863949843260188, "grad_norm": 0.9368192208011381, "learning_rate": 1.1218718316550267e-05, "loss": 0.1275, "step": 13704 }, { "epoch": 2.864158829676071, "grad_norm": 0.8490360305731922, "learning_rate": 1.1217598596524341e-05, "loss": 0.1156, "step": 13705 }, { "epoch": 2.864367816091954, "grad_norm": 1.0815030075394507, "learning_rate": 1.1216478861002552e-05, "loss": 0.1396, "step": 13706 }, { "epoch": 2.864576802507837, "grad_norm": 1.0601156985189044, "learning_rate": 1.1215359109999148e-05, "loss": 0.145, "step": 13707 }, { "epoch": 2.86478578892372, "grad_norm": 1.010080656664584, "learning_rate": 1.1214239343528374e-05, "loss": 0.1215, "step": 13708 }, { "epoch": 2.864994775339603, "grad_norm": 1.166853506556489, "learning_rate": 1.1213119561604489e-05, "loss": 0.1408, "step": 13709 }, { "epoch": 2.865203761755486, "grad_norm": 0.9604641002636058, "learning_rate": 1.1211999764241739e-05, "loss": 0.1261, "step": 13710 }, { "epoch": 2.865412748171369, "grad_norm": 0.8223069512898555, "learning_rate": 1.1210879951454378e-05, "loss": 0.1123, "step": 13711 }, { "epoch": 2.865621734587252, "grad_norm": 1.0664411359535226, "learning_rate": 1.1209760123256653e-05, "loss": 0.1287, "step": 13712 }, { "epoch": 2.865830721003135, "grad_norm": 1.1047982659923508, "learning_rate": 1.1208640279662824e-05, "loss": 0.1246, "step": 13713 }, { "epoch": 2.866039707419018, "grad_norm": 0.871349706887621, "learning_rate": 1.1207520420687133e-05, "loss": 0.1132, "step": 13714 }, { "epoch": 2.8662486938349008, "grad_norm": 0.8963947614318049, "learning_rate": 1.1206400546343837e-05, "loss": 0.113, "step": 13715 }, { "epoch": 2.8664576802507837, "grad_norm": 0.9036180898790235, "learning_rate": 1.1205280656647186e-05, "loss": 0.1429, "step": 13716 }, { "epoch": 2.8666666666666667, "grad_norm": 0.9631187751511299, "learning_rate": 1.1204160751611437e-05, "loss": 0.1147, "step": 13717 }, { "epoch": 2.8668756530825497, "grad_norm": 0.7964966978693048, "learning_rate": 1.1203040831250835e-05, "loss": 0.1055, "step": 13718 }, { "epoch": 2.8670846394984326, "grad_norm": 0.8320082697665048, "learning_rate": 1.1201920895579643e-05, "loss": 0.1002, "step": 13719 }, { "epoch": 2.8672936259143156, "grad_norm": 0.9937082987460774, "learning_rate": 1.1200800944612105e-05, "loss": 0.1336, "step": 13720 }, { "epoch": 2.8675026123301985, "grad_norm": 1.0535618593510137, "learning_rate": 1.1199680978362478e-05, "loss": 0.1329, "step": 13721 }, { "epoch": 2.8677115987460815, "grad_norm": 1.0079616384127497, "learning_rate": 1.1198560996845012e-05, "loss": 0.1202, "step": 13722 }, { "epoch": 2.8679205851619645, "grad_norm": 0.9185150954896942, "learning_rate": 1.1197441000073967e-05, "loss": 0.1182, "step": 13723 }, { "epoch": 2.8681295715778474, "grad_norm": 1.1782490831274677, "learning_rate": 1.1196320988063589e-05, "loss": 0.1455, "step": 13724 }, { "epoch": 2.8683385579937304, "grad_norm": 1.008999414879094, "learning_rate": 1.1195200960828138e-05, "loss": 0.1072, "step": 13725 }, { "epoch": 2.8685475444096133, "grad_norm": 0.9586467037043741, "learning_rate": 1.1194080918381867e-05, "loss": 0.1111, "step": 13726 }, { "epoch": 2.8687565308254963, "grad_norm": 1.0128029131584648, "learning_rate": 1.1192960860739026e-05, "loss": 0.1305, "step": 13727 }, { "epoch": 2.8689655172413793, "grad_norm": 0.8198195308794829, "learning_rate": 1.1191840787913874e-05, "loss": 0.0986, "step": 13728 }, { "epoch": 2.8691745036572622, "grad_norm": 0.8452972290460576, "learning_rate": 1.1190720699920669e-05, "loss": 0.0865, "step": 13729 }, { "epoch": 2.869383490073145, "grad_norm": 0.9538227659428369, "learning_rate": 1.1189600596773655e-05, "loss": 0.1135, "step": 13730 }, { "epoch": 2.869592476489028, "grad_norm": 1.1360557144165246, "learning_rate": 1.1188480478487098e-05, "loss": 0.1551, "step": 13731 }, { "epoch": 2.869801462904911, "grad_norm": 0.8751898635914628, "learning_rate": 1.1187360345075247e-05, "loss": 0.1177, "step": 13732 }, { "epoch": 2.870010449320794, "grad_norm": 1.1005243890679919, "learning_rate": 1.1186240196552356e-05, "loss": 0.1155, "step": 13733 }, { "epoch": 2.870219435736677, "grad_norm": 1.1449912284495507, "learning_rate": 1.1185120032932689e-05, "loss": 0.1577, "step": 13734 }, { "epoch": 2.87042842215256, "grad_norm": 1.06865542250599, "learning_rate": 1.1183999854230493e-05, "loss": 0.1228, "step": 13735 }, { "epoch": 2.870637408568443, "grad_norm": 0.9348006354762902, "learning_rate": 1.1182879660460026e-05, "loss": 0.1238, "step": 13736 }, { "epoch": 2.870846394984326, "grad_norm": 1.5737535085866747, "learning_rate": 1.1181759451635551e-05, "loss": 0.1259, "step": 13737 }, { "epoch": 2.871055381400209, "grad_norm": 1.1727787087895702, "learning_rate": 1.1180639227771318e-05, "loss": 0.1478, "step": 13738 }, { "epoch": 2.871264367816092, "grad_norm": 1.1656072336724426, "learning_rate": 1.1179518988881584e-05, "loss": 0.1371, "step": 13739 }, { "epoch": 2.871473354231975, "grad_norm": 0.9522324276977985, "learning_rate": 1.1178398734980606e-05, "loss": 0.1179, "step": 13740 }, { "epoch": 2.8716823406478578, "grad_norm": 1.04934777273312, "learning_rate": 1.1177278466082644e-05, "loss": 0.1112, "step": 13741 }, { "epoch": 2.8718913270637407, "grad_norm": 1.0414928597099737, "learning_rate": 1.117615818220195e-05, "loss": 0.1218, "step": 13742 }, { "epoch": 2.8721003134796237, "grad_norm": 1.029781252520412, "learning_rate": 1.1175037883352788e-05, "loss": 0.1145, "step": 13743 }, { "epoch": 2.8723092998955067, "grad_norm": 0.920407983020673, "learning_rate": 1.117391756954941e-05, "loss": 0.1222, "step": 13744 }, { "epoch": 2.8725182863113896, "grad_norm": 0.7422093717392604, "learning_rate": 1.1172797240806074e-05, "loss": 0.0941, "step": 13745 }, { "epoch": 2.8727272727272726, "grad_norm": 0.9715830478572133, "learning_rate": 1.1171676897137043e-05, "loss": 0.1274, "step": 13746 }, { "epoch": 2.8729362591431555, "grad_norm": 0.7656351302455469, "learning_rate": 1.1170556538556573e-05, "loss": 0.1016, "step": 13747 }, { "epoch": 2.8731452455590385, "grad_norm": 1.001877370163599, "learning_rate": 1.1169436165078914e-05, "loss": 0.1306, "step": 13748 }, { "epoch": 2.873354231974922, "grad_norm": 1.0489387495549194, "learning_rate": 1.1168315776718337e-05, "loss": 0.1368, "step": 13749 }, { "epoch": 2.873563218390805, "grad_norm": 1.0342230614072268, "learning_rate": 1.1167195373489095e-05, "loss": 0.1658, "step": 13750 }, { "epoch": 2.873772204806688, "grad_norm": 1.0889378525852262, "learning_rate": 1.1166074955405446e-05, "loss": 0.1322, "step": 13751 }, { "epoch": 2.873981191222571, "grad_norm": 0.8506885292676862, "learning_rate": 1.1164954522481653e-05, "loss": 0.1006, "step": 13752 }, { "epoch": 2.8741901776384537, "grad_norm": 0.9739040923158111, "learning_rate": 1.116383407473197e-05, "loss": 0.1249, "step": 13753 }, { "epoch": 2.8743991640543367, "grad_norm": 1.1966797892622778, "learning_rate": 1.116271361217066e-05, "loss": 0.1097, "step": 13754 }, { "epoch": 2.8746081504702197, "grad_norm": 0.7363414477328998, "learning_rate": 1.1161593134811981e-05, "loss": 0.0933, "step": 13755 }, { "epoch": 2.8748171368861026, "grad_norm": 0.8865278606155031, "learning_rate": 1.1160472642670195e-05, "loss": 0.104, "step": 13756 }, { "epoch": 2.8750261233019856, "grad_norm": 1.0054557461313864, "learning_rate": 1.1159352135759558e-05, "loss": 0.117, "step": 13757 }, { "epoch": 2.8752351097178686, "grad_norm": 1.033521222019647, "learning_rate": 1.1158231614094337e-05, "loss": 0.108, "step": 13758 }, { "epoch": 2.8754440961337515, "grad_norm": 0.8986364715566205, "learning_rate": 1.1157111077688785e-05, "loss": 0.1244, "step": 13759 }, { "epoch": 2.8756530825496345, "grad_norm": 0.9207061804367195, "learning_rate": 1.1155990526557168e-05, "loss": 0.1155, "step": 13760 }, { "epoch": 2.8758620689655174, "grad_norm": 1.0314521345960193, "learning_rate": 1.1154869960713741e-05, "loss": 0.1377, "step": 13761 }, { "epoch": 2.8760710553814004, "grad_norm": 0.9273606809902278, "learning_rate": 1.1153749380172771e-05, "loss": 0.1122, "step": 13762 }, { "epoch": 2.8762800417972834, "grad_norm": 0.9236672895265603, "learning_rate": 1.1152628784948518e-05, "loss": 0.1117, "step": 13763 }, { "epoch": 2.8764890282131663, "grad_norm": 0.969706460867398, "learning_rate": 1.115150817505524e-05, "loss": 0.1254, "step": 13764 }, { "epoch": 2.8766980146290493, "grad_norm": 0.9683719265674464, "learning_rate": 1.1150387550507204e-05, "loss": 0.1324, "step": 13765 }, { "epoch": 2.8769070010449322, "grad_norm": 0.9934336444581681, "learning_rate": 1.1149266911318666e-05, "loss": 0.1246, "step": 13766 }, { "epoch": 2.877115987460815, "grad_norm": 0.9616691969411874, "learning_rate": 1.114814625750389e-05, "loss": 0.0723, "step": 13767 }, { "epoch": 2.877324973876698, "grad_norm": 1.0181305643462866, "learning_rate": 1.1147025589077139e-05, "loss": 0.0986, "step": 13768 }, { "epoch": 2.877533960292581, "grad_norm": 0.8895169037312911, "learning_rate": 1.1145904906052674e-05, "loss": 0.1173, "step": 13769 }, { "epoch": 2.877742946708464, "grad_norm": 1.0507158053308396, "learning_rate": 1.1144784208444759e-05, "loss": 0.1288, "step": 13770 }, { "epoch": 2.877951933124347, "grad_norm": 0.9566971324166911, "learning_rate": 1.1143663496267656e-05, "loss": 0.1214, "step": 13771 }, { "epoch": 2.87816091954023, "grad_norm": 1.0079989039823711, "learning_rate": 1.1142542769535626e-05, "loss": 0.1119, "step": 13772 }, { "epoch": 2.878369905956113, "grad_norm": 0.9247886778894471, "learning_rate": 1.1141422028262933e-05, "loss": 0.1122, "step": 13773 }, { "epoch": 2.878578892371996, "grad_norm": 1.0770754439955852, "learning_rate": 1.1140301272463843e-05, "loss": 0.1262, "step": 13774 }, { "epoch": 2.878787878787879, "grad_norm": 0.9413433318414876, "learning_rate": 1.1139180502152616e-05, "loss": 0.1214, "step": 13775 }, { "epoch": 2.878996865203762, "grad_norm": 1.0951910931162887, "learning_rate": 1.113805971734352e-05, "loss": 0.1383, "step": 13776 }, { "epoch": 2.879205851619645, "grad_norm": 1.004929000962255, "learning_rate": 1.1136938918050813e-05, "loss": 0.1305, "step": 13777 }, { "epoch": 2.879414838035528, "grad_norm": 1.0557004020411527, "learning_rate": 1.113581810428876e-05, "loss": 0.1361, "step": 13778 }, { "epoch": 2.8796238244514107, "grad_norm": 0.9296605827669986, "learning_rate": 1.113469727607163e-05, "loss": 0.1119, "step": 13779 }, { "epoch": 2.8798328108672937, "grad_norm": 1.0325359578526543, "learning_rate": 1.1133576433413682e-05, "loss": 0.1025, "step": 13780 }, { "epoch": 2.8800417972831767, "grad_norm": 1.0741099215494025, "learning_rate": 1.113245557632918e-05, "loss": 0.1224, "step": 13781 }, { "epoch": 2.8802507836990596, "grad_norm": 1.0019727308254074, "learning_rate": 1.1131334704832395e-05, "loss": 0.1069, "step": 13782 }, { "epoch": 2.8804597701149426, "grad_norm": 0.8106718317908874, "learning_rate": 1.1130213818937586e-05, "loss": 0.0788, "step": 13783 }, { "epoch": 2.8806687565308255, "grad_norm": 1.0535624700527342, "learning_rate": 1.1129092918659019e-05, "loss": 0.1277, "step": 13784 }, { "epoch": 2.8808777429467085, "grad_norm": 0.9514959215550403, "learning_rate": 1.1127972004010963e-05, "loss": 0.1122, "step": 13785 }, { "epoch": 2.8810867293625915, "grad_norm": 1.049366654092098, "learning_rate": 1.112685107500768e-05, "loss": 0.1199, "step": 13786 }, { "epoch": 2.8812957157784744, "grad_norm": 0.8162011370609, "learning_rate": 1.1125730131663433e-05, "loss": 0.0982, "step": 13787 }, { "epoch": 2.8815047021943574, "grad_norm": 0.8663181338250354, "learning_rate": 1.1124609173992495e-05, "loss": 0.1104, "step": 13788 }, { "epoch": 2.8817136886102404, "grad_norm": 0.9179834840859575, "learning_rate": 1.1123488202009126e-05, "loss": 0.1136, "step": 13789 }, { "epoch": 2.8819226750261233, "grad_norm": 0.753952783454638, "learning_rate": 1.1122367215727596e-05, "loss": 0.0967, "step": 13790 }, { "epoch": 2.8821316614420063, "grad_norm": 1.0167834585299145, "learning_rate": 1.1121246215162166e-05, "loss": 0.1356, "step": 13791 }, { "epoch": 2.8823406478578892, "grad_norm": 0.9455059281982616, "learning_rate": 1.112012520032711e-05, "loss": 0.1066, "step": 13792 }, { "epoch": 2.882549634273772, "grad_norm": 1.0752969095290488, "learning_rate": 1.1119004171236686e-05, "loss": 0.1355, "step": 13793 }, { "epoch": 2.882758620689655, "grad_norm": 0.9300903459071564, "learning_rate": 1.1117883127905171e-05, "loss": 0.1241, "step": 13794 }, { "epoch": 2.882967607105538, "grad_norm": 1.1072638718670886, "learning_rate": 1.1116762070346823e-05, "loss": 0.1084, "step": 13795 }, { "epoch": 2.883176593521421, "grad_norm": 1.0295964209700563, "learning_rate": 1.1115640998575912e-05, "loss": 0.1211, "step": 13796 }, { "epoch": 2.883385579937304, "grad_norm": 0.9285551749174623, "learning_rate": 1.1114519912606708e-05, "loss": 0.1172, "step": 13797 }, { "epoch": 2.883594566353187, "grad_norm": 0.890843269772137, "learning_rate": 1.1113398812453478e-05, "loss": 0.1305, "step": 13798 }, { "epoch": 2.88380355276907, "grad_norm": 0.8414762911027353, "learning_rate": 1.1112277698130485e-05, "loss": 0.0922, "step": 13799 }, { "epoch": 2.884012539184953, "grad_norm": 0.994279701578608, "learning_rate": 1.1111156569652004e-05, "loss": 0.1244, "step": 13800 }, { "epoch": 2.884221525600836, "grad_norm": 1.1842139783502739, "learning_rate": 1.1110035427032297e-05, "loss": 0.1456, "step": 13801 }, { "epoch": 2.884430512016719, "grad_norm": 0.84796884147198, "learning_rate": 1.1108914270285636e-05, "loss": 0.1077, "step": 13802 }, { "epoch": 2.884639498432602, "grad_norm": 1.029750858142759, "learning_rate": 1.1107793099426287e-05, "loss": 0.1236, "step": 13803 }, { "epoch": 2.8848484848484848, "grad_norm": 0.8288531605557322, "learning_rate": 1.1106671914468524e-05, "loss": 0.1056, "step": 13804 }, { "epoch": 2.8850574712643677, "grad_norm": 1.1002220732498695, "learning_rate": 1.1105550715426607e-05, "loss": 0.1111, "step": 13805 }, { "epoch": 2.8852664576802507, "grad_norm": 0.9682049052556453, "learning_rate": 1.1104429502314813e-05, "loss": 0.122, "step": 13806 }, { "epoch": 2.8854754440961337, "grad_norm": 0.9771739402426985, "learning_rate": 1.1103308275147409e-05, "loss": 0.1062, "step": 13807 }, { "epoch": 2.8856844305120166, "grad_norm": 0.9339911901418011, "learning_rate": 1.110218703393866e-05, "loss": 0.0977, "step": 13808 }, { "epoch": 2.8858934169278996, "grad_norm": 1.2467552333236671, "learning_rate": 1.1101065778702842e-05, "loss": 0.1246, "step": 13809 }, { "epoch": 2.8861024033437825, "grad_norm": 0.9614129691680172, "learning_rate": 1.1099944509454221e-05, "loss": 0.1311, "step": 13810 }, { "epoch": 2.8863113897596655, "grad_norm": 1.0525583680246877, "learning_rate": 1.1098823226207068e-05, "loss": 0.1334, "step": 13811 }, { "epoch": 2.8865203761755485, "grad_norm": 0.9989076501143873, "learning_rate": 1.1097701928975653e-05, "loss": 0.1155, "step": 13812 }, { "epoch": 2.8867293625914314, "grad_norm": 1.0074678637801773, "learning_rate": 1.1096580617774246e-05, "loss": 0.1348, "step": 13813 }, { "epoch": 2.8869383490073144, "grad_norm": 0.9063170699856449, "learning_rate": 1.1095459292617117e-05, "loss": 0.1238, "step": 13814 }, { "epoch": 2.8871473354231973, "grad_norm": 0.9149968768167924, "learning_rate": 1.1094337953518537e-05, "loss": 0.0987, "step": 13815 }, { "epoch": 2.8873563218390803, "grad_norm": 0.9056678800184018, "learning_rate": 1.109321660049278e-05, "loss": 0.122, "step": 13816 }, { "epoch": 2.8875653082549633, "grad_norm": 0.9190278511210174, "learning_rate": 1.109209523355411e-05, "loss": 0.1182, "step": 13817 }, { "epoch": 2.8877742946708462, "grad_norm": 0.8883634107701581, "learning_rate": 1.1090973852716805e-05, "loss": 0.1123, "step": 13818 }, { "epoch": 2.887983281086729, "grad_norm": 0.7408640977383446, "learning_rate": 1.1089852457995133e-05, "loss": 0.0792, "step": 13819 }, { "epoch": 2.888192267502612, "grad_norm": 0.6922137737720419, "learning_rate": 1.1088731049403365e-05, "loss": 0.1001, "step": 13820 }, { "epoch": 2.888401253918495, "grad_norm": 0.9070793763429195, "learning_rate": 1.1087609626955775e-05, "loss": 0.1031, "step": 13821 }, { "epoch": 2.888610240334378, "grad_norm": 0.8267906247622211, "learning_rate": 1.1086488190666635e-05, "loss": 0.1047, "step": 13822 }, { "epoch": 2.888819226750261, "grad_norm": 0.8846932047154685, "learning_rate": 1.1085366740550212e-05, "loss": 0.1069, "step": 13823 }, { "epoch": 2.889028213166144, "grad_norm": 0.9267979259011654, "learning_rate": 1.1084245276620784e-05, "loss": 0.0945, "step": 13824 }, { "epoch": 2.889237199582027, "grad_norm": 1.033385507632334, "learning_rate": 1.1083123798892623e-05, "loss": 0.1101, "step": 13825 }, { "epoch": 2.88944618599791, "grad_norm": 0.9226002970793424, "learning_rate": 1.1082002307379998e-05, "loss": 0.1201, "step": 13826 }, { "epoch": 2.889655172413793, "grad_norm": 0.9944045797219809, "learning_rate": 1.1080880802097185e-05, "loss": 0.1288, "step": 13827 }, { "epoch": 2.889864158829676, "grad_norm": 1.0595075642361707, "learning_rate": 1.1079759283058456e-05, "loss": 0.1227, "step": 13828 }, { "epoch": 2.890073145245559, "grad_norm": 0.8627768085147044, "learning_rate": 1.107863775027808e-05, "loss": 0.1031, "step": 13829 }, { "epoch": 2.8902821316614418, "grad_norm": 0.8755147247879961, "learning_rate": 1.1077516203770337e-05, "loss": 0.1033, "step": 13830 }, { "epoch": 2.8904911180773247, "grad_norm": 1.153459999041574, "learning_rate": 1.1076394643549495e-05, "loss": 0.1249, "step": 13831 }, { "epoch": 2.8907001044932077, "grad_norm": 0.8858731799356206, "learning_rate": 1.1075273069629833e-05, "loss": 0.1241, "step": 13832 }, { "epoch": 2.8909090909090907, "grad_norm": 1.0610493686273998, "learning_rate": 1.107415148202562e-05, "loss": 0.1203, "step": 13833 }, { "epoch": 2.8911180773249736, "grad_norm": 1.014009093518424, "learning_rate": 1.1073029880751133e-05, "loss": 0.1152, "step": 13834 }, { "epoch": 2.8913270637408566, "grad_norm": 0.8782151261991135, "learning_rate": 1.1071908265820645e-05, "loss": 0.1081, "step": 13835 }, { "epoch": 2.8915360501567395, "grad_norm": 1.0080844043352426, "learning_rate": 1.107078663724843e-05, "loss": 0.118, "step": 13836 }, { "epoch": 2.891745036572623, "grad_norm": 0.869402087857064, "learning_rate": 1.1069664995048761e-05, "loss": 0.1194, "step": 13837 }, { "epoch": 2.891954022988506, "grad_norm": 1.0636057831799521, "learning_rate": 1.1068543339235918e-05, "loss": 0.1312, "step": 13838 }, { "epoch": 2.892163009404389, "grad_norm": 0.9950690821575635, "learning_rate": 1.106742166982417e-05, "loss": 0.1216, "step": 13839 }, { "epoch": 2.892371995820272, "grad_norm": 0.8073340017364572, "learning_rate": 1.1066299986827795e-05, "loss": 0.1182, "step": 13840 }, { "epoch": 2.892580982236155, "grad_norm": 0.9297741727864982, "learning_rate": 1.1065178290261068e-05, "loss": 0.117, "step": 13841 }, { "epoch": 2.8927899686520377, "grad_norm": 0.9585849170303228, "learning_rate": 1.1064056580138262e-05, "loss": 0.1149, "step": 13842 }, { "epoch": 2.8929989550679207, "grad_norm": 1.1575992295400075, "learning_rate": 1.1062934856473655e-05, "loss": 0.1211, "step": 13843 }, { "epoch": 2.8932079414838037, "grad_norm": 1.1197984333534796, "learning_rate": 1.1061813119281521e-05, "loss": 0.103, "step": 13844 }, { "epoch": 2.8934169278996866, "grad_norm": 1.1987859276645125, "learning_rate": 1.1060691368576142e-05, "loss": 0.1482, "step": 13845 }, { "epoch": 2.8936259143155696, "grad_norm": 0.8440733422813511, "learning_rate": 1.1059569604371784e-05, "loss": 0.1071, "step": 13846 }, { "epoch": 2.8938349007314526, "grad_norm": 0.7996669253421921, "learning_rate": 1.105844782668273e-05, "loss": 0.0937, "step": 13847 }, { "epoch": 2.8940438871473355, "grad_norm": 0.8913356126034511, "learning_rate": 1.1057326035523255e-05, "loss": 0.1284, "step": 13848 }, { "epoch": 2.8942528735632185, "grad_norm": 1.088429867443506, "learning_rate": 1.1056204230907636e-05, "loss": 0.1385, "step": 13849 }, { "epoch": 2.8944618599791014, "grad_norm": 1.0096489966614515, "learning_rate": 1.1055082412850146e-05, "loss": 0.1166, "step": 13850 }, { "epoch": 2.8946708463949844, "grad_norm": 0.9899618950190009, "learning_rate": 1.1053960581365068e-05, "loss": 0.1242, "step": 13851 }, { "epoch": 2.8948798328108674, "grad_norm": 1.029753099884889, "learning_rate": 1.1052838736466676e-05, "loss": 0.1518, "step": 13852 }, { "epoch": 2.8950888192267503, "grad_norm": 1.0244405250737232, "learning_rate": 1.1051716878169245e-05, "loss": 0.1192, "step": 13853 }, { "epoch": 2.8952978056426333, "grad_norm": 0.9287878020640626, "learning_rate": 1.1050595006487055e-05, "loss": 0.1369, "step": 13854 }, { "epoch": 2.8955067920585162, "grad_norm": 0.9430604090274373, "learning_rate": 1.1049473121434387e-05, "loss": 0.1105, "step": 13855 }, { "epoch": 2.895715778474399, "grad_norm": 0.8024527681645359, "learning_rate": 1.1048351223025508e-05, "loss": 0.1043, "step": 13856 }, { "epoch": 2.895924764890282, "grad_norm": 0.9289483719192261, "learning_rate": 1.1047229311274707e-05, "loss": 0.1224, "step": 13857 }, { "epoch": 2.896133751306165, "grad_norm": 0.8361265052886766, "learning_rate": 1.1046107386196258e-05, "loss": 0.0816, "step": 13858 }, { "epoch": 2.896342737722048, "grad_norm": 0.9629493636711254, "learning_rate": 1.104498544780444e-05, "loss": 0.1223, "step": 13859 }, { "epoch": 2.896551724137931, "grad_norm": 0.990832547142249, "learning_rate": 1.104386349611353e-05, "loss": 0.1295, "step": 13860 }, { "epoch": 2.896760710553814, "grad_norm": 1.0470012137434805, "learning_rate": 1.1042741531137809e-05, "loss": 0.1368, "step": 13861 }, { "epoch": 2.896969696969697, "grad_norm": 0.980362267819122, "learning_rate": 1.1041619552891547e-05, "loss": 0.1363, "step": 13862 }, { "epoch": 2.89717868338558, "grad_norm": 0.7696548696848888, "learning_rate": 1.1040497561389038e-05, "loss": 0.0998, "step": 13863 }, { "epoch": 2.897387669801463, "grad_norm": 0.927240871743269, "learning_rate": 1.103937555664455e-05, "loss": 0.1118, "step": 13864 }, { "epoch": 2.897596656217346, "grad_norm": 0.9770085435323637, "learning_rate": 1.1038253538672365e-05, "loss": 0.128, "step": 13865 }, { "epoch": 2.897805642633229, "grad_norm": 0.8352670074025607, "learning_rate": 1.1037131507486764e-05, "loss": 0.1116, "step": 13866 }, { "epoch": 2.898014629049112, "grad_norm": 0.7629673346004318, "learning_rate": 1.1036009463102027e-05, "loss": 0.0972, "step": 13867 }, { "epoch": 2.8982236154649947, "grad_norm": 0.8956711753463081, "learning_rate": 1.1034887405532429e-05, "loss": 0.1041, "step": 13868 }, { "epoch": 2.8984326018808777, "grad_norm": 0.869557903759444, "learning_rate": 1.1033765334792254e-05, "loss": 0.1057, "step": 13869 }, { "epoch": 2.8986415882967607, "grad_norm": 0.8758981635456764, "learning_rate": 1.1032643250895783e-05, "loss": 0.1271, "step": 13870 }, { "epoch": 2.8988505747126436, "grad_norm": 0.9714859416534194, "learning_rate": 1.1031521153857294e-05, "loss": 0.1344, "step": 13871 }, { "epoch": 2.8990595611285266, "grad_norm": 1.1041948860502946, "learning_rate": 1.1030399043691067e-05, "loss": 0.1621, "step": 13872 }, { "epoch": 2.8992685475444095, "grad_norm": 0.976786506370853, "learning_rate": 1.1029276920411387e-05, "loss": 0.1193, "step": 13873 }, { "epoch": 2.8994775339602925, "grad_norm": 1.0521491639485736, "learning_rate": 1.1028154784032525e-05, "loss": 0.1262, "step": 13874 }, { "epoch": 2.8996865203761755, "grad_norm": 0.9183256932401401, "learning_rate": 1.1027032634568775e-05, "loss": 0.1267, "step": 13875 }, { "epoch": 2.8998955067920584, "grad_norm": 1.1418351108470648, "learning_rate": 1.1025910472034409e-05, "loss": 0.1441, "step": 13876 }, { "epoch": 2.9001044932079414, "grad_norm": 0.9117369370971304, "learning_rate": 1.1024788296443712e-05, "loss": 0.1141, "step": 13877 }, { "epoch": 2.9003134796238244, "grad_norm": 0.9640267882548285, "learning_rate": 1.1023666107810965e-05, "loss": 0.1101, "step": 13878 }, { "epoch": 2.9005224660397073, "grad_norm": 1.2446664912871683, "learning_rate": 1.1022543906150447e-05, "loss": 0.1634, "step": 13879 }, { "epoch": 2.9007314524555903, "grad_norm": 1.0100189033909948, "learning_rate": 1.1021421691476441e-05, "loss": 0.1244, "step": 13880 }, { "epoch": 2.9009404388714732, "grad_norm": 0.9937892904430604, "learning_rate": 1.1020299463803232e-05, "loss": 0.1112, "step": 13881 }, { "epoch": 2.901149425287356, "grad_norm": 1.0295396315769842, "learning_rate": 1.1019177223145099e-05, "loss": 0.1099, "step": 13882 }, { "epoch": 2.901358411703239, "grad_norm": 0.8761762741347812, "learning_rate": 1.1018054969516325e-05, "loss": 0.1072, "step": 13883 }, { "epoch": 2.901567398119122, "grad_norm": 0.7827361238215692, "learning_rate": 1.1016932702931193e-05, "loss": 0.0813, "step": 13884 }, { "epoch": 2.901776384535005, "grad_norm": 0.8848467347200485, "learning_rate": 1.1015810423403987e-05, "loss": 0.1024, "step": 13885 }, { "epoch": 2.901985370950888, "grad_norm": 1.0162961098565988, "learning_rate": 1.1014688130948983e-05, "loss": 0.1383, "step": 13886 }, { "epoch": 2.902194357366771, "grad_norm": 1.057603652681328, "learning_rate": 1.1013565825580474e-05, "loss": 0.1404, "step": 13887 }, { "epoch": 2.902403343782654, "grad_norm": 0.903184082186043, "learning_rate": 1.1012443507312736e-05, "loss": 0.1188, "step": 13888 }, { "epoch": 2.9026123301985374, "grad_norm": 2.656044165843754, "learning_rate": 1.1011321176160054e-05, "loss": 0.1161, "step": 13889 }, { "epoch": 2.9028213166144203, "grad_norm": 1.0569242841685127, "learning_rate": 1.1010198832136712e-05, "loss": 0.1388, "step": 13890 }, { "epoch": 2.9030303030303033, "grad_norm": 1.1594548597549348, "learning_rate": 1.1009076475256997e-05, "loss": 0.1333, "step": 13891 }, { "epoch": 2.9032392894461863, "grad_norm": 1.0177854454322441, "learning_rate": 1.1007954105535182e-05, "loss": 0.1218, "step": 13892 }, { "epoch": 2.903448275862069, "grad_norm": 0.9063592332633246, "learning_rate": 1.1006831722985564e-05, "loss": 0.1258, "step": 13893 }, { "epoch": 2.903657262277952, "grad_norm": 0.9104985561449136, "learning_rate": 1.1005709327622418e-05, "loss": 0.1235, "step": 13894 }, { "epoch": 2.903866248693835, "grad_norm": 0.7904063657291582, "learning_rate": 1.1004586919460032e-05, "loss": 0.1056, "step": 13895 }, { "epoch": 2.904075235109718, "grad_norm": 0.7700871672409393, "learning_rate": 1.100346449851269e-05, "loss": 0.088, "step": 13896 }, { "epoch": 2.904284221525601, "grad_norm": 1.0102208748681756, "learning_rate": 1.100234206479468e-05, "loss": 0.1162, "step": 13897 }, { "epoch": 2.904493207941484, "grad_norm": 1.2386559427488653, "learning_rate": 1.1001219618320276e-05, "loss": 0.1277, "step": 13898 }, { "epoch": 2.904702194357367, "grad_norm": 0.8837777331581727, "learning_rate": 1.1000097159103775e-05, "loss": 0.1224, "step": 13899 }, { "epoch": 2.90491118077325, "grad_norm": 0.9086926049655925, "learning_rate": 1.0998974687159456e-05, "loss": 0.1237, "step": 13900 }, { "epoch": 2.905120167189133, "grad_norm": 0.9224544927779961, "learning_rate": 1.0997852202501607e-05, "loss": 0.1246, "step": 13901 }, { "epoch": 2.905329153605016, "grad_norm": 0.9494196156705782, "learning_rate": 1.099672970514451e-05, "loss": 0.1041, "step": 13902 }, { "epoch": 2.905538140020899, "grad_norm": 0.7371992013464019, "learning_rate": 1.0995607195102453e-05, "loss": 0.091, "step": 13903 }, { "epoch": 2.905747126436782, "grad_norm": 0.9628447850315998, "learning_rate": 1.099448467238972e-05, "loss": 0.1095, "step": 13904 }, { "epoch": 2.9059561128526648, "grad_norm": 0.8626972350764996, "learning_rate": 1.0993362137020598e-05, "loss": 0.1164, "step": 13905 }, { "epoch": 2.9061650992685477, "grad_norm": 0.9539491102733281, "learning_rate": 1.0992239589009372e-05, "loss": 0.1125, "step": 13906 }, { "epoch": 2.9063740856844307, "grad_norm": 0.9215805642330053, "learning_rate": 1.099111702837033e-05, "loss": 0.1091, "step": 13907 }, { "epoch": 2.9065830721003136, "grad_norm": 0.8145906799409621, "learning_rate": 1.098999445511776e-05, "loss": 0.1024, "step": 13908 }, { "epoch": 2.9067920585161966, "grad_norm": 0.7709872356289518, "learning_rate": 1.0988871869265944e-05, "loss": 0.1169, "step": 13909 }, { "epoch": 2.9070010449320796, "grad_norm": 1.0070891462552183, "learning_rate": 1.0987749270829172e-05, "loss": 0.1269, "step": 13910 }, { "epoch": 2.9072100313479625, "grad_norm": 0.8211054593890866, "learning_rate": 1.0986626659821727e-05, "loss": 0.0981, "step": 13911 }, { "epoch": 2.9074190177638455, "grad_norm": 0.9229821312486776, "learning_rate": 1.0985504036257899e-05, "loss": 0.1186, "step": 13912 }, { "epoch": 2.9076280041797284, "grad_norm": 0.9079658451983781, "learning_rate": 1.0984381400151975e-05, "loss": 0.1156, "step": 13913 }, { "epoch": 2.9078369905956114, "grad_norm": 0.9628708194208198, "learning_rate": 1.0983258751518245e-05, "loss": 0.1151, "step": 13914 }, { "epoch": 2.9080459770114944, "grad_norm": 0.9598880055156184, "learning_rate": 1.098213609037099e-05, "loss": 0.1172, "step": 13915 }, { "epoch": 2.9082549634273773, "grad_norm": 1.383269871041611, "learning_rate": 1.0981013416724502e-05, "loss": 0.1633, "step": 13916 }, { "epoch": 2.9084639498432603, "grad_norm": 1.0945606472264224, "learning_rate": 1.0979890730593065e-05, "loss": 0.1287, "step": 13917 }, { "epoch": 2.9086729362591432, "grad_norm": 0.8779107791653054, "learning_rate": 1.0978768031990976e-05, "loss": 0.0998, "step": 13918 }, { "epoch": 2.908881922675026, "grad_norm": 1.1019342206869596, "learning_rate": 1.0977645320932511e-05, "loss": 0.1085, "step": 13919 }, { "epoch": 2.909090909090909, "grad_norm": 1.1165433518726189, "learning_rate": 1.097652259743197e-05, "loss": 0.1276, "step": 13920 }, { "epoch": 2.909299895506792, "grad_norm": 1.0377945487886664, "learning_rate": 1.097539986150363e-05, "loss": 0.1449, "step": 13921 }, { "epoch": 2.909508881922675, "grad_norm": 0.9874495461673456, "learning_rate": 1.0974277113161789e-05, "loss": 0.1257, "step": 13922 }, { "epoch": 2.909717868338558, "grad_norm": 0.8583338819715565, "learning_rate": 1.097315435242073e-05, "loss": 0.1082, "step": 13923 }, { "epoch": 2.909926854754441, "grad_norm": 1.2421168190899512, "learning_rate": 1.0972031579294746e-05, "loss": 0.1097, "step": 13924 }, { "epoch": 2.910135841170324, "grad_norm": 0.8504834568320468, "learning_rate": 1.0970908793798121e-05, "loss": 0.0977, "step": 13925 }, { "epoch": 2.910344827586207, "grad_norm": 0.9158905751703517, "learning_rate": 1.096978599594515e-05, "loss": 0.0929, "step": 13926 }, { "epoch": 2.91055381400209, "grad_norm": 0.6871508011674282, "learning_rate": 1.096866318575012e-05, "loss": 0.082, "step": 13927 }, { "epoch": 2.910762800417973, "grad_norm": 1.6252750019098943, "learning_rate": 1.0967540363227319e-05, "loss": 0.1293, "step": 13928 }, { "epoch": 2.910971786833856, "grad_norm": 0.9523726587034433, "learning_rate": 1.0966417528391035e-05, "loss": 0.1163, "step": 13929 }, { "epoch": 2.911180773249739, "grad_norm": 1.010268456599523, "learning_rate": 1.0965294681255565e-05, "loss": 0.1381, "step": 13930 }, { "epoch": 2.9113897596656217, "grad_norm": 1.119200541032923, "learning_rate": 1.0964171821835192e-05, "loss": 0.1309, "step": 13931 }, { "epoch": 2.9115987460815047, "grad_norm": 0.9885304789988366, "learning_rate": 1.096304895014421e-05, "loss": 0.1306, "step": 13932 }, { "epoch": 2.9118077324973877, "grad_norm": 0.9919369952164437, "learning_rate": 1.096192606619691e-05, "loss": 0.1448, "step": 13933 }, { "epoch": 2.9120167189132706, "grad_norm": 1.0650775266655337, "learning_rate": 1.0960803170007576e-05, "loss": 0.1411, "step": 13934 }, { "epoch": 2.9122257053291536, "grad_norm": 0.9361666519738194, "learning_rate": 1.0959680261590506e-05, "loss": 0.1342, "step": 13935 }, { "epoch": 2.9124346917450366, "grad_norm": 0.865975412846615, "learning_rate": 1.095855734095999e-05, "loss": 0.1112, "step": 13936 }, { "epoch": 2.9126436781609195, "grad_norm": 1.1129502624832739, "learning_rate": 1.0957434408130312e-05, "loss": 0.1463, "step": 13937 }, { "epoch": 2.9128526645768025, "grad_norm": 0.9040846839120673, "learning_rate": 1.0956311463115773e-05, "loss": 0.1197, "step": 13938 }, { "epoch": 2.9130616509926854, "grad_norm": 1.1488469440162181, "learning_rate": 1.0955188505930657e-05, "loss": 0.1499, "step": 13939 }, { "epoch": 2.9132706374085684, "grad_norm": 0.8152091454862026, "learning_rate": 1.0954065536589256e-05, "loss": 0.1155, "step": 13940 }, { "epoch": 2.9134796238244514, "grad_norm": 0.8968966553545102, "learning_rate": 1.0952942555105866e-05, "loss": 0.1291, "step": 13941 }, { "epoch": 2.9136886102403343, "grad_norm": 1.0492405293786826, "learning_rate": 1.0951819561494774e-05, "loss": 0.1429, "step": 13942 }, { "epoch": 2.9138975966562173, "grad_norm": 0.991370873546482, "learning_rate": 1.0950696555770273e-05, "loss": 0.1194, "step": 13943 }, { "epoch": 2.9141065830721002, "grad_norm": 0.8135032057261651, "learning_rate": 1.0949573537946661e-05, "loss": 0.1042, "step": 13944 }, { "epoch": 2.914315569487983, "grad_norm": 0.9480672084028992, "learning_rate": 1.094845050803822e-05, "loss": 0.1281, "step": 13945 }, { "epoch": 2.914524555903866, "grad_norm": 1.1365337407618687, "learning_rate": 1.094732746605925e-05, "loss": 0.1653, "step": 13946 }, { "epoch": 2.914733542319749, "grad_norm": 0.9890801688575499, "learning_rate": 1.094620441202404e-05, "loss": 0.1211, "step": 13947 }, { "epoch": 2.914942528735632, "grad_norm": 1.1827742288339749, "learning_rate": 1.0945081345946885e-05, "loss": 0.1298, "step": 13948 }, { "epoch": 2.915151515151515, "grad_norm": 0.7166474788884117, "learning_rate": 1.0943958267842071e-05, "loss": 0.108, "step": 13949 }, { "epoch": 2.915360501567398, "grad_norm": 0.834512672073112, "learning_rate": 1.0942835177723903e-05, "loss": 0.1106, "step": 13950 }, { "epoch": 2.915569487983281, "grad_norm": 0.9360805772355044, "learning_rate": 1.0941712075606662e-05, "loss": 0.136, "step": 13951 }, { "epoch": 2.915778474399164, "grad_norm": 0.8037370292191274, "learning_rate": 1.094058896150465e-05, "loss": 0.1027, "step": 13952 }, { "epoch": 2.915987460815047, "grad_norm": 0.8499876341879558, "learning_rate": 1.0939465835432155e-05, "loss": 0.0961, "step": 13953 }, { "epoch": 2.91619644723093, "grad_norm": 0.9680821322518876, "learning_rate": 1.0938342697403478e-05, "loss": 0.1187, "step": 13954 }, { "epoch": 2.916405433646813, "grad_norm": 1.0684920039321915, "learning_rate": 1.0937219547432897e-05, "loss": 0.1248, "step": 13955 }, { "epoch": 2.916614420062696, "grad_norm": 0.7506457094922451, "learning_rate": 1.0936096385534724e-05, "loss": 0.1131, "step": 13956 }, { "epoch": 2.9168234064785787, "grad_norm": 0.9745518791083297, "learning_rate": 1.0934973211723243e-05, "loss": 0.1038, "step": 13957 }, { "epoch": 2.9170323928944617, "grad_norm": 1.1680955863250102, "learning_rate": 1.0933850026012751e-05, "loss": 0.1398, "step": 13958 }, { "epoch": 2.9172413793103447, "grad_norm": 0.8746964027965615, "learning_rate": 1.0932726828417542e-05, "loss": 0.1234, "step": 13959 }, { "epoch": 2.9174503657262276, "grad_norm": 0.9793704413275306, "learning_rate": 1.093160361895191e-05, "loss": 0.1279, "step": 13960 }, { "epoch": 2.9176593521421106, "grad_norm": 0.8409745395659984, "learning_rate": 1.0930480397630146e-05, "loss": 0.119, "step": 13961 }, { "epoch": 2.9178683385579935, "grad_norm": 0.9351560119719865, "learning_rate": 1.0929357164466552e-05, "loss": 0.111, "step": 13962 }, { "epoch": 2.9180773249738765, "grad_norm": 0.9983517486278268, "learning_rate": 1.0928233919475416e-05, "loss": 0.1266, "step": 13963 }, { "epoch": 2.9182863113897595, "grad_norm": 0.9481937798057763, "learning_rate": 1.092711066267104e-05, "loss": 0.1248, "step": 13964 }, { "epoch": 2.9184952978056424, "grad_norm": 0.8720694405967653, "learning_rate": 1.0925987394067714e-05, "loss": 0.0952, "step": 13965 }, { "epoch": 2.9187042842215254, "grad_norm": 0.847671534496832, "learning_rate": 1.0924864113679739e-05, "loss": 0.1127, "step": 13966 }, { "epoch": 2.9189132706374084, "grad_norm": 0.7953761723535079, "learning_rate": 1.09237408215214e-05, "loss": 0.0999, "step": 13967 }, { "epoch": 2.9191222570532913, "grad_norm": 0.9040517002283743, "learning_rate": 1.0922617517607003e-05, "loss": 0.1305, "step": 13968 }, { "epoch": 2.9193312434691743, "grad_norm": 0.9505626915953537, "learning_rate": 1.092149420195084e-05, "loss": 0.1092, "step": 13969 }, { "epoch": 2.9195402298850572, "grad_norm": 0.8998177796827285, "learning_rate": 1.0920370874567206e-05, "loss": 0.1136, "step": 13970 }, { "epoch": 2.91974921630094, "grad_norm": 1.07813239650212, "learning_rate": 1.0919247535470396e-05, "loss": 0.1328, "step": 13971 }, { "epoch": 2.919958202716823, "grad_norm": 0.9428125570400437, "learning_rate": 1.0918124184674714e-05, "loss": 0.1304, "step": 13972 }, { "epoch": 2.920167189132706, "grad_norm": 0.75148808373854, "learning_rate": 1.0917000822194445e-05, "loss": 0.0878, "step": 13973 }, { "epoch": 2.920376175548589, "grad_norm": 0.7123783191693976, "learning_rate": 1.0915877448043895e-05, "loss": 0.0784, "step": 13974 }, { "epoch": 2.920585161964472, "grad_norm": 0.8335957142336744, "learning_rate": 1.0914754062237355e-05, "loss": 0.1006, "step": 13975 }, { "epoch": 2.920794148380355, "grad_norm": 1.0392326870253543, "learning_rate": 1.0913630664789125e-05, "loss": 0.1093, "step": 13976 }, { "epoch": 2.9210031347962384, "grad_norm": 1.0765931405398268, "learning_rate": 1.0912507255713502e-05, "loss": 0.1548, "step": 13977 }, { "epoch": 2.9212121212121214, "grad_norm": 0.8697443200195529, "learning_rate": 1.091138383502478e-05, "loss": 0.1207, "step": 13978 }, { "epoch": 2.9214211076280043, "grad_norm": 1.039381300013078, "learning_rate": 1.091026040273726e-05, "loss": 0.1365, "step": 13979 }, { "epoch": 2.9216300940438873, "grad_norm": 0.9210353091507552, "learning_rate": 1.0909136958865236e-05, "loss": 0.1066, "step": 13980 }, { "epoch": 2.9218390804597703, "grad_norm": 0.9576325992910844, "learning_rate": 1.0908013503423007e-05, "loss": 0.1315, "step": 13981 }, { "epoch": 2.922048066875653, "grad_norm": 0.9466048596601664, "learning_rate": 1.0906890036424873e-05, "loss": 0.1136, "step": 13982 }, { "epoch": 2.922257053291536, "grad_norm": 0.9510554701670786, "learning_rate": 1.0905766557885132e-05, "loss": 0.1398, "step": 13983 }, { "epoch": 2.922466039707419, "grad_norm": 0.7624324682616562, "learning_rate": 1.0904643067818077e-05, "loss": 0.0999, "step": 13984 }, { "epoch": 2.922675026123302, "grad_norm": 1.0647196316198886, "learning_rate": 1.090351956623801e-05, "loss": 0.1359, "step": 13985 }, { "epoch": 2.922884012539185, "grad_norm": 0.9554916324699904, "learning_rate": 1.0902396053159227e-05, "loss": 0.1298, "step": 13986 }, { "epoch": 2.923092998955068, "grad_norm": 0.9932466055409289, "learning_rate": 1.0901272528596032e-05, "loss": 0.1048, "step": 13987 }, { "epoch": 2.923301985370951, "grad_norm": 0.8101987349988481, "learning_rate": 1.0900148992562715e-05, "loss": 0.1006, "step": 13988 }, { "epoch": 2.923510971786834, "grad_norm": 1.05011241646804, "learning_rate": 1.0899025445073586e-05, "loss": 0.1394, "step": 13989 }, { "epoch": 2.923719958202717, "grad_norm": 0.9459315204246218, "learning_rate": 1.0897901886142932e-05, "loss": 0.0969, "step": 13990 }, { "epoch": 2.9239289446186, "grad_norm": 1.0419545491285762, "learning_rate": 1.0896778315785057e-05, "loss": 0.1342, "step": 13991 }, { "epoch": 2.924137931034483, "grad_norm": 0.9633243938442249, "learning_rate": 1.0895654734014263e-05, "loss": 0.1264, "step": 13992 }, { "epoch": 2.924346917450366, "grad_norm": 0.8242601996554644, "learning_rate": 1.0894531140844849e-05, "loss": 0.1074, "step": 13993 }, { "epoch": 2.9245559038662488, "grad_norm": 1.1467781949214215, "learning_rate": 1.0893407536291107e-05, "loss": 0.1576, "step": 13994 }, { "epoch": 2.9247648902821317, "grad_norm": 0.8488922448813553, "learning_rate": 1.0892283920367348e-05, "loss": 0.0984, "step": 13995 }, { "epoch": 2.9249738766980147, "grad_norm": 1.0916441401313473, "learning_rate": 1.0891160293087864e-05, "loss": 0.1106, "step": 13996 }, { "epoch": 2.9251828631138976, "grad_norm": 1.166351524415322, "learning_rate": 1.0890036654466955e-05, "loss": 0.1302, "step": 13997 }, { "epoch": 2.9253918495297806, "grad_norm": 0.9509580004100056, "learning_rate": 1.0888913004518925e-05, "loss": 0.1118, "step": 13998 }, { "epoch": 2.9256008359456636, "grad_norm": 1.043308357388821, "learning_rate": 1.0887789343258073e-05, "loss": 0.1591, "step": 13999 }, { "epoch": 2.9258098223615465, "grad_norm": 0.938065113478569, "learning_rate": 1.0886665670698693e-05, "loss": 0.1019, "step": 14000 }, { "epoch": 2.9260188087774295, "grad_norm": 1.037101071174952, "learning_rate": 1.0885541986855097e-05, "loss": 0.1339, "step": 14001 }, { "epoch": 2.9262277951933124, "grad_norm": 0.8294881607963007, "learning_rate": 1.0884418291741579e-05, "loss": 0.1002, "step": 14002 }, { "epoch": 2.9264367816091954, "grad_norm": 0.8490762320450379, "learning_rate": 1.0883294585372437e-05, "loss": 0.0811, "step": 14003 }, { "epoch": 2.9266457680250784, "grad_norm": 1.1870546392048038, "learning_rate": 1.0882170867761978e-05, "loss": 0.117, "step": 14004 }, { "epoch": 2.9268547544409613, "grad_norm": 1.0767718636529744, "learning_rate": 1.0881047138924503e-05, "loss": 0.1394, "step": 14005 }, { "epoch": 2.9270637408568443, "grad_norm": 0.9715629249340866, "learning_rate": 1.0879923398874305e-05, "loss": 0.1124, "step": 14006 }, { "epoch": 2.9272727272727272, "grad_norm": 0.9830385965250438, "learning_rate": 1.0878799647625695e-05, "loss": 0.1153, "step": 14007 }, { "epoch": 2.92748171368861, "grad_norm": 1.2804836276619729, "learning_rate": 1.0877675885192972e-05, "loss": 0.1578, "step": 14008 }, { "epoch": 2.927690700104493, "grad_norm": 0.9976874705335811, "learning_rate": 1.0876552111590433e-05, "loss": 0.1319, "step": 14009 }, { "epoch": 2.927899686520376, "grad_norm": 1.1893857986785907, "learning_rate": 1.0875428326832385e-05, "loss": 0.1293, "step": 14010 }, { "epoch": 2.928108672936259, "grad_norm": 0.8534797080276624, "learning_rate": 1.0874304530933129e-05, "loss": 0.1059, "step": 14011 }, { "epoch": 2.928317659352142, "grad_norm": 1.0997691003864682, "learning_rate": 1.0873180723906962e-05, "loss": 0.1256, "step": 14012 }, { "epoch": 2.928526645768025, "grad_norm": 0.8852271720906005, "learning_rate": 1.0872056905768194e-05, "loss": 0.1063, "step": 14013 }, { "epoch": 2.928735632183908, "grad_norm": 1.0164424839803283, "learning_rate": 1.0870933076531122e-05, "loss": 0.1329, "step": 14014 }, { "epoch": 2.928944618599791, "grad_norm": 0.8900319561993932, "learning_rate": 1.0869809236210052e-05, "loss": 0.1006, "step": 14015 }, { "epoch": 2.929153605015674, "grad_norm": 1.0253750102554273, "learning_rate": 1.0868685384819283e-05, "loss": 0.1285, "step": 14016 }, { "epoch": 2.929362591431557, "grad_norm": 0.923559744252799, "learning_rate": 1.086756152237312e-05, "loss": 0.107, "step": 14017 }, { "epoch": 2.92957157784744, "grad_norm": 0.9083580229143632, "learning_rate": 1.0866437648885864e-05, "loss": 0.1116, "step": 14018 }, { "epoch": 2.929780564263323, "grad_norm": 0.9192326662115823, "learning_rate": 1.0865313764371823e-05, "loss": 0.1273, "step": 14019 }, { "epoch": 2.9299895506792057, "grad_norm": 0.8172367237146032, "learning_rate": 1.0864189868845296e-05, "loss": 0.0949, "step": 14020 }, { "epoch": 2.9301985370950887, "grad_norm": 0.8628921907724523, "learning_rate": 1.0863065962320584e-05, "loss": 0.1049, "step": 14021 }, { "epoch": 2.9304075235109717, "grad_norm": 1.1840160668490982, "learning_rate": 1.0861942044811997e-05, "loss": 0.1237, "step": 14022 }, { "epoch": 2.9306165099268546, "grad_norm": 0.9439419809531826, "learning_rate": 1.0860818116333838e-05, "loss": 0.1334, "step": 14023 }, { "epoch": 2.9308254963427376, "grad_norm": 1.0120730281473422, "learning_rate": 1.0859694176900402e-05, "loss": 0.1063, "step": 14024 }, { "epoch": 2.9310344827586206, "grad_norm": 0.9712528521233184, "learning_rate": 1.0858570226526001e-05, "loss": 0.1307, "step": 14025 }, { "epoch": 2.9312434691745035, "grad_norm": 1.0662780116083417, "learning_rate": 1.0857446265224938e-05, "loss": 0.1162, "step": 14026 }, { "epoch": 2.9314524555903865, "grad_norm": 0.8422718791250685, "learning_rate": 1.0856322293011517e-05, "loss": 0.1007, "step": 14027 }, { "epoch": 2.9316614420062694, "grad_norm": 1.0834163661987724, "learning_rate": 1.085519830990004e-05, "loss": 0.1163, "step": 14028 }, { "epoch": 2.9318704284221524, "grad_norm": 1.0772641787725674, "learning_rate": 1.0854074315904814e-05, "loss": 0.1351, "step": 14029 }, { "epoch": 2.932079414838036, "grad_norm": 0.8533513525779312, "learning_rate": 1.0852950311040138e-05, "loss": 0.1072, "step": 14030 }, { "epoch": 2.9322884012539188, "grad_norm": 0.9802859022135955, "learning_rate": 1.0851826295320329e-05, "loss": 0.1164, "step": 14031 }, { "epoch": 2.9324973876698017, "grad_norm": 1.0152743060255773, "learning_rate": 1.0850702268759678e-05, "loss": 0.1264, "step": 14032 }, { "epoch": 2.9327063740856847, "grad_norm": 0.8582267733510728, "learning_rate": 1.08495782313725e-05, "loss": 0.0921, "step": 14033 }, { "epoch": 2.9329153605015676, "grad_norm": 0.9577394818679678, "learning_rate": 1.0848454183173094e-05, "loss": 0.1051, "step": 14034 }, { "epoch": 2.9331243469174506, "grad_norm": 0.8660358576486641, "learning_rate": 1.084733012417577e-05, "loss": 0.1227, "step": 14035 }, { "epoch": 2.9333333333333336, "grad_norm": 1.088874155758411, "learning_rate": 1.0846206054394827e-05, "loss": 0.1317, "step": 14036 }, { "epoch": 2.9335423197492165, "grad_norm": 1.022838019843161, "learning_rate": 1.0845081973844578e-05, "loss": 0.135, "step": 14037 }, { "epoch": 2.9337513061650995, "grad_norm": 0.9611927372163407, "learning_rate": 1.0843957882539324e-05, "loss": 0.1048, "step": 14038 }, { "epoch": 2.9339602925809825, "grad_norm": 1.1134892362939255, "learning_rate": 1.0842833780493374e-05, "loss": 0.1332, "step": 14039 }, { "epoch": 2.9341692789968654, "grad_norm": 1.3848527136771318, "learning_rate": 1.084170966772103e-05, "loss": 0.149, "step": 14040 }, { "epoch": 2.9343782654127484, "grad_norm": 1.0818335843222486, "learning_rate": 1.0840585544236604e-05, "loss": 0.1604, "step": 14041 }, { "epoch": 2.9345872518286313, "grad_norm": 1.083467584723042, "learning_rate": 1.0839461410054393e-05, "loss": 0.1446, "step": 14042 }, { "epoch": 2.9347962382445143, "grad_norm": 1.1074866755644537, "learning_rate": 1.0838337265188711e-05, "loss": 0.1309, "step": 14043 }, { "epoch": 2.9350052246603973, "grad_norm": 1.0805898865121664, "learning_rate": 1.0837213109653864e-05, "loss": 0.1406, "step": 14044 }, { "epoch": 2.93521421107628, "grad_norm": 1.0775605485405042, "learning_rate": 1.0836088943464155e-05, "loss": 0.1699, "step": 14045 }, { "epoch": 2.935423197492163, "grad_norm": 0.9425610633289243, "learning_rate": 1.0834964766633895e-05, "loss": 0.1286, "step": 14046 }, { "epoch": 2.935632183908046, "grad_norm": 1.0624735910893897, "learning_rate": 1.0833840579177391e-05, "loss": 0.1001, "step": 14047 }, { "epoch": 2.935841170323929, "grad_norm": 0.842918079003667, "learning_rate": 1.0832716381108943e-05, "loss": 0.0882, "step": 14048 }, { "epoch": 2.936050156739812, "grad_norm": 1.004165665651416, "learning_rate": 1.0831592172442863e-05, "loss": 0.129, "step": 14049 }, { "epoch": 2.936259143155695, "grad_norm": 1.1700512307161828, "learning_rate": 1.0830467953193459e-05, "loss": 0.1535, "step": 14050 }, { "epoch": 2.936468129571578, "grad_norm": 0.8754476288592252, "learning_rate": 1.0829343723375042e-05, "loss": 0.1078, "step": 14051 }, { "epoch": 2.936677115987461, "grad_norm": 1.0147621110062297, "learning_rate": 1.0828219483001911e-05, "loss": 0.1187, "step": 14052 }, { "epoch": 2.936886102403344, "grad_norm": 1.0571317724048608, "learning_rate": 1.082709523208838e-05, "loss": 0.1111, "step": 14053 }, { "epoch": 2.937095088819227, "grad_norm": 0.8719132884828449, "learning_rate": 1.0825970970648757e-05, "loss": 0.112, "step": 14054 }, { "epoch": 2.93730407523511, "grad_norm": 0.9188787667413507, "learning_rate": 1.0824846698697342e-05, "loss": 0.1228, "step": 14055 }, { "epoch": 2.937513061650993, "grad_norm": 1.0004536480816992, "learning_rate": 1.0823722416248456e-05, "loss": 0.1261, "step": 14056 }, { "epoch": 2.9377220480668758, "grad_norm": 0.7518357832867822, "learning_rate": 1.0822598123316394e-05, "loss": 0.1006, "step": 14057 }, { "epoch": 2.9379310344827587, "grad_norm": 0.8159194661958402, "learning_rate": 1.0821473819915476e-05, "loss": 0.0963, "step": 14058 }, { "epoch": 2.9381400208986417, "grad_norm": 0.8633906754070252, "learning_rate": 1.0820349506060003e-05, "loss": 0.1148, "step": 14059 }, { "epoch": 2.9383490073145246, "grad_norm": 0.8729667228787831, "learning_rate": 1.0819225181764288e-05, "loss": 0.1119, "step": 14060 }, { "epoch": 2.9385579937304076, "grad_norm": 1.1685311390612394, "learning_rate": 1.0818100847042636e-05, "loss": 0.1508, "step": 14061 }, { "epoch": 2.9387669801462906, "grad_norm": 0.903766844354163, "learning_rate": 1.081697650190936e-05, "loss": 0.1115, "step": 14062 }, { "epoch": 2.9389759665621735, "grad_norm": 0.8615995043506315, "learning_rate": 1.0815852146378762e-05, "loss": 0.1081, "step": 14063 }, { "epoch": 2.9391849529780565, "grad_norm": 0.9349714345909864, "learning_rate": 1.0814727780465162e-05, "loss": 0.135, "step": 14064 }, { "epoch": 2.9393939393939394, "grad_norm": 0.658947856725578, "learning_rate": 1.081360340418286e-05, "loss": 0.0763, "step": 14065 }, { "epoch": 2.9396029258098224, "grad_norm": 0.9904894390799064, "learning_rate": 1.0812479017546168e-05, "loss": 0.093, "step": 14066 }, { "epoch": 2.9398119122257054, "grad_norm": 0.9411123358636276, "learning_rate": 1.0811354620569399e-05, "loss": 0.1299, "step": 14067 }, { "epoch": 2.9400208986415883, "grad_norm": 0.9923763220986366, "learning_rate": 1.081023021326686e-05, "loss": 0.1145, "step": 14068 }, { "epoch": 2.9402298850574713, "grad_norm": 1.023057173560944, "learning_rate": 1.0809105795652859e-05, "loss": 0.1084, "step": 14069 }, { "epoch": 2.9404388714733543, "grad_norm": 0.9316431029918928, "learning_rate": 1.080798136774171e-05, "loss": 0.1178, "step": 14070 }, { "epoch": 2.940647857889237, "grad_norm": 0.9501673057312294, "learning_rate": 1.0806856929547722e-05, "loss": 0.1255, "step": 14071 }, { "epoch": 2.94085684430512, "grad_norm": 0.8431868852310935, "learning_rate": 1.0805732481085203e-05, "loss": 0.1148, "step": 14072 }, { "epoch": 2.941065830721003, "grad_norm": 1.0426091481715773, "learning_rate": 1.0804608022368463e-05, "loss": 0.1292, "step": 14073 }, { "epoch": 2.941274817136886, "grad_norm": 0.774114241168452, "learning_rate": 1.080348355341182e-05, "loss": 0.1067, "step": 14074 }, { "epoch": 2.941483803552769, "grad_norm": 1.0195605873819664, "learning_rate": 1.0802359074229574e-05, "loss": 0.1065, "step": 14075 }, { "epoch": 2.941692789968652, "grad_norm": 1.0364325340937137, "learning_rate": 1.0801234584836042e-05, "loss": 0.1313, "step": 14076 }, { "epoch": 2.941901776384535, "grad_norm": 0.9913054537557017, "learning_rate": 1.0800110085245534e-05, "loss": 0.1202, "step": 14077 }, { "epoch": 2.942110762800418, "grad_norm": 1.1460758411323855, "learning_rate": 1.079898557547236e-05, "loss": 0.126, "step": 14078 }, { "epoch": 2.942319749216301, "grad_norm": 0.9857938692335926, "learning_rate": 1.0797861055530832e-05, "loss": 0.1253, "step": 14079 }, { "epoch": 2.942528735632184, "grad_norm": 0.9143451855973322, "learning_rate": 1.0796736525435264e-05, "loss": 0.1056, "step": 14080 }, { "epoch": 2.942737722048067, "grad_norm": 0.8687143899078703, "learning_rate": 1.079561198519996e-05, "loss": 0.1158, "step": 14081 }, { "epoch": 2.94294670846395, "grad_norm": 0.8921268620900664, "learning_rate": 1.0794487434839238e-05, "loss": 0.1267, "step": 14082 }, { "epoch": 2.9431556948798328, "grad_norm": 0.9770595823385121, "learning_rate": 1.0793362874367409e-05, "loss": 0.1154, "step": 14083 }, { "epoch": 2.9433646812957157, "grad_norm": 1.0717311981200943, "learning_rate": 1.079223830379878e-05, "loss": 0.122, "step": 14084 }, { "epoch": 2.9435736677115987, "grad_norm": 0.9195532056997027, "learning_rate": 1.079111372314767e-05, "loss": 0.1102, "step": 14085 }, { "epoch": 2.9437826541274816, "grad_norm": 1.0952026232407566, "learning_rate": 1.0789989132428389e-05, "loss": 0.1367, "step": 14086 }, { "epoch": 2.9439916405433646, "grad_norm": 0.7838052673396347, "learning_rate": 1.0788864531655243e-05, "loss": 0.0936, "step": 14087 }, { "epoch": 2.9442006269592476, "grad_norm": 0.8557745570908735, "learning_rate": 1.0787739920842553e-05, "loss": 0.1067, "step": 14088 }, { "epoch": 2.9444096133751305, "grad_norm": 0.8571808475922252, "learning_rate": 1.0786615300004627e-05, "loss": 0.1047, "step": 14089 }, { "epoch": 2.9446185997910135, "grad_norm": 0.963381720162549, "learning_rate": 1.0785490669155776e-05, "loss": 0.1229, "step": 14090 }, { "epoch": 2.9448275862068964, "grad_norm": 1.0453984172884927, "learning_rate": 1.0784366028310316e-05, "loss": 0.1296, "step": 14091 }, { "epoch": 2.9450365726227794, "grad_norm": 1.0266425407628255, "learning_rate": 1.0783241377482561e-05, "loss": 0.1297, "step": 14092 }, { "epoch": 2.9452455590386624, "grad_norm": 0.9103949341286383, "learning_rate": 1.0782116716686816e-05, "loss": 0.1096, "step": 14093 }, { "epoch": 2.9454545454545453, "grad_norm": 0.9698576801443325, "learning_rate": 1.0780992045937406e-05, "loss": 0.1309, "step": 14094 }, { "epoch": 2.9456635318704283, "grad_norm": 0.8837662596741882, "learning_rate": 1.0779867365248635e-05, "loss": 0.105, "step": 14095 }, { "epoch": 2.9458725182863112, "grad_norm": 0.9876510715304383, "learning_rate": 1.0778742674634821e-05, "loss": 0.1217, "step": 14096 }, { "epoch": 2.946081504702194, "grad_norm": 1.98633637570482, "learning_rate": 1.0777617974110273e-05, "loss": 0.1283, "step": 14097 }, { "epoch": 2.946290491118077, "grad_norm": 0.9115522562118719, "learning_rate": 1.0776493263689312e-05, "loss": 0.1056, "step": 14098 }, { "epoch": 2.94649947753396, "grad_norm": 0.9633055649882347, "learning_rate": 1.077536854338624e-05, "loss": 0.1195, "step": 14099 }, { "epoch": 2.946708463949843, "grad_norm": 0.7845144702243105, "learning_rate": 1.0774243813215384e-05, "loss": 0.0808, "step": 14100 }, { "epoch": 2.946917450365726, "grad_norm": 0.9107168658954428, "learning_rate": 1.077311907319105e-05, "loss": 0.1178, "step": 14101 }, { "epoch": 2.947126436781609, "grad_norm": 1.0081942995500228, "learning_rate": 1.0771994323327553e-05, "loss": 0.1522, "step": 14102 }, { "epoch": 2.947335423197492, "grad_norm": 0.8528971775928196, "learning_rate": 1.0770869563639209e-05, "loss": 0.108, "step": 14103 }, { "epoch": 2.947544409613375, "grad_norm": 1.155286065155871, "learning_rate": 1.0769744794140334e-05, "loss": 0.1402, "step": 14104 }, { "epoch": 2.947753396029258, "grad_norm": 0.9519474698423535, "learning_rate": 1.0768620014845235e-05, "loss": 0.123, "step": 14105 }, { "epoch": 2.947962382445141, "grad_norm": 1.0667326951731346, "learning_rate": 1.0767495225768234e-05, "loss": 0.1182, "step": 14106 }, { "epoch": 2.948171368861024, "grad_norm": 1.014320648089915, "learning_rate": 1.0766370426923645e-05, "loss": 0.1392, "step": 14107 }, { "epoch": 2.948380355276907, "grad_norm": 0.879492421098354, "learning_rate": 1.0765245618325779e-05, "loss": 0.1131, "step": 14108 }, { "epoch": 2.9485893416927897, "grad_norm": 0.766098495446188, "learning_rate": 1.0764120799988953e-05, "loss": 0.0991, "step": 14109 }, { "epoch": 2.9487983281086727, "grad_norm": 0.8691066038613472, "learning_rate": 1.0762995971927484e-05, "loss": 0.1224, "step": 14110 }, { "epoch": 2.9490073145245557, "grad_norm": 1.1067760119439904, "learning_rate": 1.0761871134155682e-05, "loss": 0.1314, "step": 14111 }, { "epoch": 2.9492163009404386, "grad_norm": 0.9686853597949027, "learning_rate": 1.076074628668787e-05, "loss": 0.1274, "step": 14112 }, { "epoch": 2.9494252873563216, "grad_norm": 0.8832961629750046, "learning_rate": 1.0759621429538356e-05, "loss": 0.117, "step": 14113 }, { "epoch": 2.9496342737722046, "grad_norm": 0.8637772718876762, "learning_rate": 1.075849656272146e-05, "loss": 0.1227, "step": 14114 }, { "epoch": 2.9498432601880875, "grad_norm": 0.8663782794411005, "learning_rate": 1.0757371686251496e-05, "loss": 0.1163, "step": 14115 }, { "epoch": 2.9500522466039705, "grad_norm": 0.9579989783180256, "learning_rate": 1.0756246800142785e-05, "loss": 0.1295, "step": 14116 }, { "epoch": 2.9502612330198534, "grad_norm": 0.790216123664618, "learning_rate": 1.0755121904409631e-05, "loss": 0.1029, "step": 14117 }, { "epoch": 2.950470219435737, "grad_norm": 0.9550966535379041, "learning_rate": 1.0753996999066362e-05, "loss": 0.1155, "step": 14118 }, { "epoch": 2.95067920585162, "grad_norm": 1.021220120022599, "learning_rate": 1.0752872084127289e-05, "loss": 0.1348, "step": 14119 }, { "epoch": 2.9508881922675028, "grad_norm": 0.9993448314108266, "learning_rate": 1.075174715960673e-05, "loss": 0.1241, "step": 14120 }, { "epoch": 2.9510971786833857, "grad_norm": 1.058087364066997, "learning_rate": 1.0750622225518999e-05, "loss": 0.1485, "step": 14121 }, { "epoch": 2.9513061650992687, "grad_norm": 0.785773952835844, "learning_rate": 1.0749497281878414e-05, "loss": 0.102, "step": 14122 }, { "epoch": 2.9515151515151516, "grad_norm": 0.8350088594922657, "learning_rate": 1.074837232869929e-05, "loss": 0.1058, "step": 14123 }, { "epoch": 2.9517241379310346, "grad_norm": 1.022400594593084, "learning_rate": 1.0747247365995945e-05, "loss": 0.1343, "step": 14124 }, { "epoch": 2.9519331243469176, "grad_norm": 1.056817678593667, "learning_rate": 1.07461223937827e-05, "loss": 0.1533, "step": 14125 }, { "epoch": 2.9521421107628005, "grad_norm": 1.0376138509346933, "learning_rate": 1.0744997412073865e-05, "loss": 0.1285, "step": 14126 }, { "epoch": 2.9523510971786835, "grad_norm": 1.2350846570502065, "learning_rate": 1.0743872420883764e-05, "loss": 0.1426, "step": 14127 }, { "epoch": 2.9525600835945665, "grad_norm": 0.9882796276242712, "learning_rate": 1.074274742022671e-05, "loss": 0.1409, "step": 14128 }, { "epoch": 2.9527690700104494, "grad_norm": 0.8971015587965151, "learning_rate": 1.0741622410117018e-05, "loss": 0.1073, "step": 14129 }, { "epoch": 2.9529780564263324, "grad_norm": 0.9720974728228837, "learning_rate": 1.0740497390569011e-05, "loss": 0.0888, "step": 14130 }, { "epoch": 2.9531870428422153, "grad_norm": 1.1167840022869888, "learning_rate": 1.0739372361597008e-05, "loss": 0.1103, "step": 14131 }, { "epoch": 2.9533960292580983, "grad_norm": 1.003186455765926, "learning_rate": 1.0738247323215315e-05, "loss": 0.1361, "step": 14132 }, { "epoch": 2.9536050156739813, "grad_norm": 1.0295014330905585, "learning_rate": 1.0737122275438265e-05, "loss": 0.1196, "step": 14133 }, { "epoch": 2.953814002089864, "grad_norm": 0.8741468389649412, "learning_rate": 1.0735997218280166e-05, "loss": 0.1232, "step": 14134 }, { "epoch": 2.954022988505747, "grad_norm": 1.0797176654999638, "learning_rate": 1.073487215175534e-05, "loss": 0.1269, "step": 14135 }, { "epoch": 2.95423197492163, "grad_norm": 0.8618298214992385, "learning_rate": 1.0733747075878105e-05, "loss": 0.1196, "step": 14136 }, { "epoch": 2.954440961337513, "grad_norm": 0.8806028217185611, "learning_rate": 1.073262199066278e-05, "loss": 0.0936, "step": 14137 }, { "epoch": 2.954649947753396, "grad_norm": 1.0665117430698248, "learning_rate": 1.0731496896123676e-05, "loss": 0.1275, "step": 14138 }, { "epoch": 2.954858934169279, "grad_norm": 0.9537067003866143, "learning_rate": 1.0730371792275126e-05, "loss": 0.1031, "step": 14139 }, { "epoch": 2.955067920585162, "grad_norm": 0.9454913223129963, "learning_rate": 1.0729246679131437e-05, "loss": 0.1111, "step": 14140 }, { "epoch": 2.955276907001045, "grad_norm": 0.9710445253335769, "learning_rate": 1.0728121556706932e-05, "loss": 0.1486, "step": 14141 }, { "epoch": 2.955485893416928, "grad_norm": 1.2468842902525847, "learning_rate": 1.072699642501593e-05, "loss": 0.1432, "step": 14142 }, { "epoch": 2.955694879832811, "grad_norm": 0.9540479394688688, "learning_rate": 1.0725871284072749e-05, "loss": 0.1157, "step": 14143 }, { "epoch": 2.955903866248694, "grad_norm": 0.9486528469796385, "learning_rate": 1.0724746133891706e-05, "loss": 0.1162, "step": 14144 }, { "epoch": 2.956112852664577, "grad_norm": 0.9645037416954625, "learning_rate": 1.0723620974487128e-05, "loss": 0.1201, "step": 14145 }, { "epoch": 2.9563218390804598, "grad_norm": 0.7831468362589963, "learning_rate": 1.0722495805873327e-05, "loss": 0.1039, "step": 14146 }, { "epoch": 2.9565308254963427, "grad_norm": 0.780742124653605, "learning_rate": 1.0721370628064625e-05, "loss": 0.0903, "step": 14147 }, { "epoch": 2.9567398119122257, "grad_norm": 0.9843079483046298, "learning_rate": 1.0720245441075344e-05, "loss": 0.1113, "step": 14148 }, { "epoch": 2.9569487983281086, "grad_norm": 1.0143451269544337, "learning_rate": 1.07191202449198e-05, "loss": 0.1229, "step": 14149 }, { "epoch": 2.9571577847439916, "grad_norm": 1.029995980057999, "learning_rate": 1.0717995039612312e-05, "loss": 0.1193, "step": 14150 }, { "epoch": 2.9573667711598746, "grad_norm": 1.0145465388183532, "learning_rate": 1.0716869825167206e-05, "loss": 0.1309, "step": 14151 }, { "epoch": 2.9575757575757575, "grad_norm": 1.2132382601951228, "learning_rate": 1.0715744601598796e-05, "loss": 0.1345, "step": 14152 }, { "epoch": 2.9577847439916405, "grad_norm": 0.8819847904925213, "learning_rate": 1.0714619368921407e-05, "loss": 0.1087, "step": 14153 }, { "epoch": 2.9579937304075234, "grad_norm": 0.9568101402471483, "learning_rate": 1.0713494127149356e-05, "loss": 0.1209, "step": 14154 }, { "epoch": 2.9582027168234064, "grad_norm": 0.8522341177468612, "learning_rate": 1.0712368876296965e-05, "loss": 0.1079, "step": 14155 }, { "epoch": 2.9584117032392894, "grad_norm": 0.893363554165517, "learning_rate": 1.071124361637855e-05, "loss": 0.1295, "step": 14156 }, { "epoch": 2.9586206896551723, "grad_norm": 1.0478336322672308, "learning_rate": 1.0710118347408443e-05, "loss": 0.1167, "step": 14157 }, { "epoch": 2.9588296760710553, "grad_norm": 0.979845862098776, "learning_rate": 1.0708993069400955e-05, "loss": 0.1282, "step": 14158 }, { "epoch": 2.9590386624869383, "grad_norm": 1.0016315406060325, "learning_rate": 1.0707867782370409e-05, "loss": 0.1147, "step": 14159 }, { "epoch": 2.959247648902821, "grad_norm": 0.8289674215340301, "learning_rate": 1.0706742486331128e-05, "loss": 0.103, "step": 14160 }, { "epoch": 2.959456635318704, "grad_norm": 1.1114350360077232, "learning_rate": 1.0705617181297432e-05, "loss": 0.1447, "step": 14161 }, { "epoch": 2.959665621734587, "grad_norm": 1.070266196459044, "learning_rate": 1.0704491867283637e-05, "loss": 0.1437, "step": 14162 }, { "epoch": 2.95987460815047, "grad_norm": 0.8818557695591976, "learning_rate": 1.0703366544304077e-05, "loss": 0.1175, "step": 14163 }, { "epoch": 2.960083594566353, "grad_norm": 1.0585783022798392, "learning_rate": 1.0702241212373064e-05, "loss": 0.138, "step": 14164 }, { "epoch": 2.960292580982236, "grad_norm": 0.7441495006873731, "learning_rate": 1.070111587150492e-05, "loss": 0.0937, "step": 14165 }, { "epoch": 2.960501567398119, "grad_norm": 1.0161106423469124, "learning_rate": 1.0699990521713967e-05, "loss": 0.1324, "step": 14166 }, { "epoch": 2.960710553814002, "grad_norm": 0.8703211611134298, "learning_rate": 1.0698865163014533e-05, "loss": 0.1164, "step": 14167 }, { "epoch": 2.960919540229885, "grad_norm": 0.8675928727933101, "learning_rate": 1.0697739795420932e-05, "loss": 0.1079, "step": 14168 }, { "epoch": 2.961128526645768, "grad_norm": 0.9551374389532568, "learning_rate": 1.0696614418947493e-05, "loss": 0.1173, "step": 14169 }, { "epoch": 2.961337513061651, "grad_norm": 0.9948385689513815, "learning_rate": 1.069548903360853e-05, "loss": 0.1147, "step": 14170 }, { "epoch": 2.9615464994775342, "grad_norm": 0.9110252036462605, "learning_rate": 1.0694363639418373e-05, "loss": 0.1083, "step": 14171 }, { "epoch": 2.961755485893417, "grad_norm": 0.9572769708996469, "learning_rate": 1.0693238236391339e-05, "loss": 0.12, "step": 14172 }, { "epoch": 2.9619644723093, "grad_norm": 0.8964681815777412, "learning_rate": 1.0692112824541756e-05, "loss": 0.1025, "step": 14173 }, { "epoch": 2.962173458725183, "grad_norm": 0.9294540112260267, "learning_rate": 1.069098740388394e-05, "loss": 0.1113, "step": 14174 }, { "epoch": 2.962382445141066, "grad_norm": 0.9826237771649159, "learning_rate": 1.068986197443222e-05, "loss": 0.1214, "step": 14175 }, { "epoch": 2.962591431556949, "grad_norm": 0.9345877661300291, "learning_rate": 1.0688736536200917e-05, "loss": 0.1051, "step": 14176 }, { "epoch": 2.962800417972832, "grad_norm": 0.9367408589331429, "learning_rate": 1.068761108920435e-05, "loss": 0.1219, "step": 14177 }, { "epoch": 2.963009404388715, "grad_norm": 0.8707925904977716, "learning_rate": 1.0686485633456846e-05, "loss": 0.1342, "step": 14178 }, { "epoch": 2.963218390804598, "grad_norm": 0.8717944843783499, "learning_rate": 1.0685360168972729e-05, "loss": 0.1124, "step": 14179 }, { "epoch": 2.963427377220481, "grad_norm": 0.7733443500034491, "learning_rate": 1.0684234695766319e-05, "loss": 0.1046, "step": 14180 }, { "epoch": 2.963636363636364, "grad_norm": 1.0358507049680357, "learning_rate": 1.0683109213851944e-05, "loss": 0.1269, "step": 14181 }, { "epoch": 2.963845350052247, "grad_norm": 0.9643206615265345, "learning_rate": 1.068198372324392e-05, "loss": 0.1295, "step": 14182 }, { "epoch": 2.9640543364681298, "grad_norm": 1.0044762790869584, "learning_rate": 1.0680858223956578e-05, "loss": 0.1011, "step": 14183 }, { "epoch": 2.9642633228840127, "grad_norm": 0.8976506327367058, "learning_rate": 1.0679732716004238e-05, "loss": 0.091, "step": 14184 }, { "epoch": 2.9644723092998957, "grad_norm": 0.9297139874389712, "learning_rate": 1.0678607199401228e-05, "loss": 0.0992, "step": 14185 }, { "epoch": 2.9646812957157787, "grad_norm": 0.9559213984578283, "learning_rate": 1.0677481674161865e-05, "loss": 0.1174, "step": 14186 }, { "epoch": 2.9648902821316616, "grad_norm": 1.0907911355704698, "learning_rate": 1.067635614030048e-05, "loss": 0.1365, "step": 14187 }, { "epoch": 2.9650992685475446, "grad_norm": 1.0584612386335124, "learning_rate": 1.0675230597831395e-05, "loss": 0.1142, "step": 14188 }, { "epoch": 2.9653082549634275, "grad_norm": 1.1252204610974967, "learning_rate": 1.067410504676893e-05, "loss": 0.1004, "step": 14189 }, { "epoch": 2.9655172413793105, "grad_norm": 0.9940054358839788, "learning_rate": 1.0672979487127413e-05, "loss": 0.116, "step": 14190 }, { "epoch": 2.9657262277951935, "grad_norm": 0.8749196135012494, "learning_rate": 1.0671853918921174e-05, "loss": 0.1045, "step": 14191 }, { "epoch": 2.9659352142110764, "grad_norm": 0.967728636558518, "learning_rate": 1.0670728342164525e-05, "loss": 0.1164, "step": 14192 }, { "epoch": 2.9661442006269594, "grad_norm": 0.9853757926879667, "learning_rate": 1.0669602756871802e-05, "loss": 0.1265, "step": 14193 }, { "epoch": 2.9663531870428423, "grad_norm": 0.9642063921249653, "learning_rate": 1.0668477163057325e-05, "loss": 0.1244, "step": 14194 }, { "epoch": 2.9665621734587253, "grad_norm": 1.0456460628609385, "learning_rate": 1.066735156073542e-05, "loss": 0.1438, "step": 14195 }, { "epoch": 2.9667711598746083, "grad_norm": 0.9260142947531087, "learning_rate": 1.0666225949920413e-05, "loss": 0.1545, "step": 14196 }, { "epoch": 2.9669801462904912, "grad_norm": 1.0207903025176996, "learning_rate": 1.0665100330626625e-05, "loss": 0.1244, "step": 14197 }, { "epoch": 2.967189132706374, "grad_norm": 0.8943026129779927, "learning_rate": 1.0663974702868386e-05, "loss": 0.1063, "step": 14198 }, { "epoch": 2.967398119122257, "grad_norm": 0.8372327105974148, "learning_rate": 1.0662849066660017e-05, "loss": 0.1215, "step": 14199 }, { "epoch": 2.96760710553814, "grad_norm": 1.1419499085607023, "learning_rate": 1.0661723422015848e-05, "loss": 0.1481, "step": 14200 }, { "epoch": 2.967816091954023, "grad_norm": 0.872620770137727, "learning_rate": 1.06605977689502e-05, "loss": 0.1213, "step": 14201 }, { "epoch": 2.968025078369906, "grad_norm": 0.8570714330302329, "learning_rate": 1.0659472107477407e-05, "loss": 0.1242, "step": 14202 }, { "epoch": 2.968234064785789, "grad_norm": 0.7526062165806895, "learning_rate": 1.0658346437611786e-05, "loss": 0.1031, "step": 14203 }, { "epoch": 2.968443051201672, "grad_norm": 0.9450928707006213, "learning_rate": 1.0657220759367666e-05, "loss": 0.1148, "step": 14204 }, { "epoch": 2.968652037617555, "grad_norm": 0.9944466426245765, "learning_rate": 1.0656095072759374e-05, "loss": 0.1349, "step": 14205 }, { "epoch": 2.968861024033438, "grad_norm": 0.77566821584075, "learning_rate": 1.0654969377801237e-05, "loss": 0.1241, "step": 14206 }, { "epoch": 2.969070010449321, "grad_norm": 0.8426067050094473, "learning_rate": 1.0653843674507573e-05, "loss": 0.1073, "step": 14207 }, { "epoch": 2.969278996865204, "grad_norm": 0.8738550093928271, "learning_rate": 1.0652717962892723e-05, "loss": 0.1084, "step": 14208 }, { "epoch": 2.9694879832810868, "grad_norm": 0.8081792631483578, "learning_rate": 1.0651592242971e-05, "loss": 0.1106, "step": 14209 }, { "epoch": 2.9696969696969697, "grad_norm": 1.0957664190321061, "learning_rate": 1.065046651475674e-05, "loss": 0.1173, "step": 14210 }, { "epoch": 2.9699059561128527, "grad_norm": 0.9429601551378727, "learning_rate": 1.0649340778264261e-05, "loss": 0.124, "step": 14211 }, { "epoch": 2.9701149425287356, "grad_norm": 0.9957138225387469, "learning_rate": 1.0648215033507899e-05, "loss": 0.1163, "step": 14212 }, { "epoch": 2.9703239289446186, "grad_norm": 0.7912288708469581, "learning_rate": 1.064708928050197e-05, "loss": 0.1028, "step": 14213 }, { "epoch": 2.9705329153605016, "grad_norm": 0.8426270160040216, "learning_rate": 1.0645963519260814e-05, "loss": 0.1053, "step": 14214 }, { "epoch": 2.9707419017763845, "grad_norm": 0.9142996842141554, "learning_rate": 1.0644837749798748e-05, "loss": 0.1229, "step": 14215 }, { "epoch": 2.9709508881922675, "grad_norm": 0.9474981952600148, "learning_rate": 1.0643711972130104e-05, "loss": 0.1448, "step": 14216 }, { "epoch": 2.9711598746081505, "grad_norm": 0.8986888909522562, "learning_rate": 1.0642586186269207e-05, "loss": 0.125, "step": 14217 }, { "epoch": 2.9713688610240334, "grad_norm": 0.901279893782542, "learning_rate": 1.0641460392230385e-05, "loss": 0.1179, "step": 14218 }, { "epoch": 2.9715778474399164, "grad_norm": 0.8421042040548112, "learning_rate": 1.0640334590027966e-05, "loss": 0.1047, "step": 14219 }, { "epoch": 2.9717868338557993, "grad_norm": 0.8591208257735051, "learning_rate": 1.0639208779676277e-05, "loss": 0.1049, "step": 14220 }, { "epoch": 2.9719958202716823, "grad_norm": 1.0098772229006452, "learning_rate": 1.0638082961189646e-05, "loss": 0.124, "step": 14221 }, { "epoch": 2.9722048066875653, "grad_norm": 0.7930529294524065, "learning_rate": 1.0636957134582403e-05, "loss": 0.0923, "step": 14222 }, { "epoch": 2.972413793103448, "grad_norm": 0.7778022041746483, "learning_rate": 1.0635831299868871e-05, "loss": 0.091, "step": 14223 }, { "epoch": 2.972622779519331, "grad_norm": 0.8652601910438629, "learning_rate": 1.0634705457063383e-05, "loss": 0.1192, "step": 14224 }, { "epoch": 2.972831765935214, "grad_norm": 0.7706308485248332, "learning_rate": 1.0633579606180262e-05, "loss": 0.0845, "step": 14225 }, { "epoch": 2.973040752351097, "grad_norm": 0.8449566531612961, "learning_rate": 1.0632453747233843e-05, "loss": 0.0997, "step": 14226 }, { "epoch": 2.97324973876698, "grad_norm": 0.980207146508853, "learning_rate": 1.063132788023845e-05, "loss": 0.1284, "step": 14227 }, { "epoch": 2.973458725182863, "grad_norm": 1.0668533286127855, "learning_rate": 1.063020200520841e-05, "loss": 0.1272, "step": 14228 }, { "epoch": 2.973667711598746, "grad_norm": 0.8419844344475867, "learning_rate": 1.0629076122158052e-05, "loss": 0.1066, "step": 14229 }, { "epoch": 2.973876698014629, "grad_norm": 1.1249993753704028, "learning_rate": 1.0627950231101712e-05, "loss": 0.1364, "step": 14230 }, { "epoch": 2.974085684430512, "grad_norm": 0.8247343183691731, "learning_rate": 1.0626824332053707e-05, "loss": 0.0949, "step": 14231 }, { "epoch": 2.974294670846395, "grad_norm": 0.8323241513361078, "learning_rate": 1.0625698425028377e-05, "loss": 0.0938, "step": 14232 }, { "epoch": 2.974503657262278, "grad_norm": 0.931041567635466, "learning_rate": 1.062457251004004e-05, "loss": 0.1204, "step": 14233 }, { "epoch": 2.974712643678161, "grad_norm": 0.9602426309886745, "learning_rate": 1.0623446587103035e-05, "loss": 0.1233, "step": 14234 }, { "epoch": 2.9749216300940438, "grad_norm": 1.104514822613494, "learning_rate": 1.0622320656231686e-05, "loss": 0.1268, "step": 14235 }, { "epoch": 2.9751306165099267, "grad_norm": 1.022964124272635, "learning_rate": 1.0621194717440324e-05, "loss": 0.1251, "step": 14236 }, { "epoch": 2.9753396029258097, "grad_norm": 1.0393113821554107, "learning_rate": 1.0620068770743274e-05, "loss": 0.1188, "step": 14237 }, { "epoch": 2.9755485893416926, "grad_norm": 1.0657209871061173, "learning_rate": 1.0618942816154874e-05, "loss": 0.1107, "step": 14238 }, { "epoch": 2.9757575757575756, "grad_norm": 0.9737651647625211, "learning_rate": 1.0617816853689444e-05, "loss": 0.106, "step": 14239 }, { "epoch": 2.9759665621734586, "grad_norm": 0.9384719538718657, "learning_rate": 1.061669088336132e-05, "loss": 0.1281, "step": 14240 }, { "epoch": 2.9761755485893415, "grad_norm": 1.14853857249678, "learning_rate": 1.0615564905184829e-05, "loss": 0.1396, "step": 14241 }, { "epoch": 2.9763845350052245, "grad_norm": 1.0018698830116457, "learning_rate": 1.0614438919174306e-05, "loss": 0.1365, "step": 14242 }, { "epoch": 2.9765935214211074, "grad_norm": 0.9139011107825981, "learning_rate": 1.061331292534407e-05, "loss": 0.1077, "step": 14243 }, { "epoch": 2.9768025078369904, "grad_norm": 1.0349896346443, "learning_rate": 1.0612186923708463e-05, "loss": 0.1254, "step": 14244 }, { "epoch": 2.9770114942528734, "grad_norm": 0.9667422326464014, "learning_rate": 1.0611060914281808e-05, "loss": 0.1302, "step": 14245 }, { "epoch": 2.9772204806687563, "grad_norm": 0.9736594920991207, "learning_rate": 1.0609934897078435e-05, "loss": 0.1121, "step": 14246 }, { "epoch": 2.9774294670846393, "grad_norm": 0.9681239271058703, "learning_rate": 1.0608808872112679e-05, "loss": 0.1047, "step": 14247 }, { "epoch": 2.9776384535005223, "grad_norm": 0.8749986217913717, "learning_rate": 1.0607682839398869e-05, "loss": 0.1064, "step": 14248 }, { "epoch": 2.977847439916405, "grad_norm": 0.8418576644596463, "learning_rate": 1.0606556798951333e-05, "loss": 0.0909, "step": 14249 }, { "epoch": 2.978056426332288, "grad_norm": 1.018812804854642, "learning_rate": 1.0605430750784404e-05, "loss": 0.137, "step": 14250 }, { "epoch": 2.978265412748171, "grad_norm": 1.2777472901754385, "learning_rate": 1.060430469491241e-05, "loss": 0.1598, "step": 14251 }, { "epoch": 2.978474399164054, "grad_norm": 0.9521362845155852, "learning_rate": 1.0603178631349687e-05, "loss": 0.1337, "step": 14252 }, { "epoch": 2.978683385579937, "grad_norm": 0.9334273880198243, "learning_rate": 1.0602052560110559e-05, "loss": 0.1132, "step": 14253 }, { "epoch": 2.97889237199582, "grad_norm": 1.0637052488624055, "learning_rate": 1.0600926481209363e-05, "loss": 0.1315, "step": 14254 }, { "epoch": 2.979101358411703, "grad_norm": 0.8542738028638904, "learning_rate": 1.0599800394660427e-05, "loss": 0.114, "step": 14255 }, { "epoch": 2.979310344827586, "grad_norm": 0.8480193154283284, "learning_rate": 1.0598674300478085e-05, "loss": 0.1154, "step": 14256 }, { "epoch": 2.979519331243469, "grad_norm": 0.8913708606398678, "learning_rate": 1.0597548198676664e-05, "loss": 0.1018, "step": 14257 }, { "epoch": 2.979728317659352, "grad_norm": 1.0191212092058068, "learning_rate": 1.0596422089270497e-05, "loss": 0.1201, "step": 14258 }, { "epoch": 2.9799373040752353, "grad_norm": 0.804894791777023, "learning_rate": 1.0595295972273917e-05, "loss": 0.1096, "step": 14259 }, { "epoch": 2.9801462904911182, "grad_norm": 0.998646641762501, "learning_rate": 1.0594169847701259e-05, "loss": 0.1529, "step": 14260 }, { "epoch": 2.980355276907001, "grad_norm": 0.9048985172696519, "learning_rate": 1.0593043715566845e-05, "loss": 0.1282, "step": 14261 }, { "epoch": 2.980564263322884, "grad_norm": 0.8762684758523258, "learning_rate": 1.0591917575885015e-05, "loss": 0.1229, "step": 14262 }, { "epoch": 2.980773249738767, "grad_norm": 0.811929239877453, "learning_rate": 1.0590791428670099e-05, "loss": 0.1017, "step": 14263 }, { "epoch": 2.98098223615465, "grad_norm": 0.9066729106800834, "learning_rate": 1.0589665273936427e-05, "loss": 0.1083, "step": 14264 }, { "epoch": 2.981191222570533, "grad_norm": 0.9752218715270384, "learning_rate": 1.0588539111698333e-05, "loss": 0.1281, "step": 14265 }, { "epoch": 2.981400208986416, "grad_norm": 0.941015313964094, "learning_rate": 1.0587412941970152e-05, "loss": 0.144, "step": 14266 }, { "epoch": 2.981609195402299, "grad_norm": 1.0256013400465194, "learning_rate": 1.0586286764766208e-05, "loss": 0.1183, "step": 14267 }, { "epoch": 2.981818181818182, "grad_norm": 0.8921720385071481, "learning_rate": 1.0585160580100841e-05, "loss": 0.1054, "step": 14268 }, { "epoch": 2.982027168234065, "grad_norm": 0.8079110399258291, "learning_rate": 1.0584034387988382e-05, "loss": 0.0958, "step": 14269 }, { "epoch": 2.982236154649948, "grad_norm": 0.7107663033074726, "learning_rate": 1.0582908188443157e-05, "loss": 0.0976, "step": 14270 }, { "epoch": 2.982445141065831, "grad_norm": 1.2962919856441149, "learning_rate": 1.0581781981479511e-05, "loss": 0.149, "step": 14271 }, { "epoch": 2.9826541274817138, "grad_norm": 0.9154634975234653, "learning_rate": 1.0580655767111766e-05, "loss": 0.1136, "step": 14272 }, { "epoch": 2.9828631138975967, "grad_norm": 0.9613277470107678, "learning_rate": 1.0579529545354259e-05, "loss": 0.1218, "step": 14273 }, { "epoch": 2.9830721003134797, "grad_norm": 0.8251210728092685, "learning_rate": 1.0578403316221321e-05, "loss": 0.1071, "step": 14274 }, { "epoch": 2.9832810867293627, "grad_norm": 1.0380784784686083, "learning_rate": 1.0577277079727291e-05, "loss": 0.1485, "step": 14275 }, { "epoch": 2.9834900731452456, "grad_norm": 0.8898505665329938, "learning_rate": 1.0576150835886494e-05, "loss": 0.1218, "step": 14276 }, { "epoch": 2.9836990595611286, "grad_norm": 1.0512678602930607, "learning_rate": 1.057502458471327e-05, "loss": 0.1322, "step": 14277 }, { "epoch": 2.9839080459770115, "grad_norm": 0.8171241750683851, "learning_rate": 1.0573898326221948e-05, "loss": 0.1196, "step": 14278 }, { "epoch": 2.9841170323928945, "grad_norm": 0.8185658437071647, "learning_rate": 1.0572772060426861e-05, "loss": 0.1338, "step": 14279 }, { "epoch": 2.9843260188087775, "grad_norm": 0.8886503324624138, "learning_rate": 1.0571645787342346e-05, "loss": 0.1171, "step": 14280 }, { "epoch": 2.9845350052246604, "grad_norm": 0.9379171528423336, "learning_rate": 1.0570519506982738e-05, "loss": 0.1149, "step": 14281 }, { "epoch": 2.9847439916405434, "grad_norm": 0.8843921347179461, "learning_rate": 1.0569393219362363e-05, "loss": 0.1381, "step": 14282 }, { "epoch": 2.9849529780564263, "grad_norm": 1.1344457496232003, "learning_rate": 1.0568266924495563e-05, "loss": 0.1441, "step": 14283 }, { "epoch": 2.9851619644723093, "grad_norm": 0.7968900653385439, "learning_rate": 1.0567140622396666e-05, "loss": 0.1047, "step": 14284 }, { "epoch": 2.9853709508881923, "grad_norm": 0.8230792983077105, "learning_rate": 1.0566014313080006e-05, "loss": 0.1194, "step": 14285 }, { "epoch": 2.9855799373040752, "grad_norm": 0.870981888836939, "learning_rate": 1.0564887996559923e-05, "loss": 0.1209, "step": 14286 }, { "epoch": 2.985788923719958, "grad_norm": 0.957107995008654, "learning_rate": 1.0563761672850747e-05, "loss": 0.1402, "step": 14287 }, { "epoch": 2.985997910135841, "grad_norm": 0.9740982228872482, "learning_rate": 1.0562635341966812e-05, "loss": 0.1225, "step": 14288 }, { "epoch": 2.986206896551724, "grad_norm": 0.8019123330088493, "learning_rate": 1.0561509003922454e-05, "loss": 0.1037, "step": 14289 }, { "epoch": 2.986415882967607, "grad_norm": 0.9225441899782636, "learning_rate": 1.0560382658732008e-05, "loss": 0.1217, "step": 14290 }, { "epoch": 2.98662486938349, "grad_norm": 0.9988168869110123, "learning_rate": 1.0559256306409803e-05, "loss": 0.1285, "step": 14291 }, { "epoch": 2.986833855799373, "grad_norm": 0.817272813474259, "learning_rate": 1.055812994697018e-05, "loss": 0.1127, "step": 14292 }, { "epoch": 2.987042842215256, "grad_norm": 0.9365487632284818, "learning_rate": 1.0557003580427471e-05, "loss": 0.1245, "step": 14293 }, { "epoch": 2.987251828631139, "grad_norm": 0.7497094836229369, "learning_rate": 1.055587720679601e-05, "loss": 0.0995, "step": 14294 }, { "epoch": 2.987460815047022, "grad_norm": 0.9761050651653731, "learning_rate": 1.0554750826090136e-05, "loss": 0.1328, "step": 14295 }, { "epoch": 2.987669801462905, "grad_norm": 0.7280844227927108, "learning_rate": 1.0553624438324179e-05, "loss": 0.0907, "step": 14296 }, { "epoch": 2.987878787878788, "grad_norm": 0.7874776365144488, "learning_rate": 1.0552498043512476e-05, "loss": 0.1122, "step": 14297 }, { "epoch": 2.9880877742946708, "grad_norm": 1.1249359852726954, "learning_rate": 1.0551371641669363e-05, "loss": 0.0961, "step": 14298 }, { "epoch": 2.9882967607105537, "grad_norm": 0.8735255537109219, "learning_rate": 1.0550245232809175e-05, "loss": 0.1228, "step": 14299 }, { "epoch": 2.9885057471264367, "grad_norm": 1.080299709976865, "learning_rate": 1.0549118816946245e-05, "loss": 0.1337, "step": 14300 }, { "epoch": 2.9887147335423196, "grad_norm": 0.9670879494742158, "learning_rate": 1.0547992394094912e-05, "loss": 0.1216, "step": 14301 }, { "epoch": 2.9889237199582026, "grad_norm": 1.256848493373903, "learning_rate": 1.054686596426951e-05, "loss": 0.1336, "step": 14302 }, { "epoch": 2.9891327063740856, "grad_norm": 0.9376850335623081, "learning_rate": 1.0545739527484374e-05, "loss": 0.1186, "step": 14303 }, { "epoch": 2.9893416927899685, "grad_norm": 1.0951704370573165, "learning_rate": 1.0544613083753838e-05, "loss": 0.1058, "step": 14304 }, { "epoch": 2.9895506792058515, "grad_norm": 0.8519821830542602, "learning_rate": 1.0543486633092244e-05, "loss": 0.1107, "step": 14305 }, { "epoch": 2.9897596656217345, "grad_norm": 0.9151870536465375, "learning_rate": 1.0542360175513917e-05, "loss": 0.1415, "step": 14306 }, { "epoch": 2.9899686520376174, "grad_norm": 0.8108796287883225, "learning_rate": 1.0541233711033205e-05, "loss": 0.1078, "step": 14307 }, { "epoch": 2.9901776384535004, "grad_norm": 0.8653652532663852, "learning_rate": 1.0540107239664437e-05, "loss": 0.117, "step": 14308 }, { "epoch": 2.9903866248693833, "grad_norm": 0.8289604099668275, "learning_rate": 1.053898076142195e-05, "loss": 0.1065, "step": 14309 }, { "epoch": 2.9905956112852663, "grad_norm": 0.9030292813838966, "learning_rate": 1.0537854276320083e-05, "loss": 0.1097, "step": 14310 }, { "epoch": 2.9908045977011493, "grad_norm": 1.0616008230998972, "learning_rate": 1.053672778437317e-05, "loss": 0.1346, "step": 14311 }, { "epoch": 2.9910135841170327, "grad_norm": 0.9792396335322553, "learning_rate": 1.0535601285595545e-05, "loss": 0.1273, "step": 14312 }, { "epoch": 2.9912225705329156, "grad_norm": 0.9431226064833987, "learning_rate": 1.0534474780001551e-05, "loss": 0.1076, "step": 14313 }, { "epoch": 2.9914315569487986, "grad_norm": 0.9670110650973591, "learning_rate": 1.0533348267605519e-05, "loss": 0.1046, "step": 14314 }, { "epoch": 2.9916405433646815, "grad_norm": 0.7395785272611303, "learning_rate": 1.0532221748421786e-05, "loss": 0.1076, "step": 14315 }, { "epoch": 2.9918495297805645, "grad_norm": 1.0601040153876182, "learning_rate": 1.0531095222464694e-05, "loss": 0.1223, "step": 14316 }, { "epoch": 2.9920585161964475, "grad_norm": 1.0144060058109878, "learning_rate": 1.0529968689748575e-05, "loss": 0.1375, "step": 14317 }, { "epoch": 2.9922675026123304, "grad_norm": 0.8922218745331555, "learning_rate": 1.0528842150287763e-05, "loss": 0.1202, "step": 14318 }, { "epoch": 2.9924764890282134, "grad_norm": 0.8991687878921598, "learning_rate": 1.0527715604096602e-05, "loss": 0.1148, "step": 14319 }, { "epoch": 2.9926854754440964, "grad_norm": 0.8829803628897903, "learning_rate": 1.0526589051189427e-05, "loss": 0.107, "step": 14320 }, { "epoch": 2.9928944618599793, "grad_norm": 0.9410498528366715, "learning_rate": 1.052546249158057e-05, "loss": 0.1191, "step": 14321 }, { "epoch": 2.9931034482758623, "grad_norm": 0.8521329191569426, "learning_rate": 1.0524335925284374e-05, "loss": 0.1149, "step": 14322 }, { "epoch": 2.9933124346917452, "grad_norm": 1.0079190957553528, "learning_rate": 1.0523209352315178e-05, "loss": 0.1437, "step": 14323 }, { "epoch": 2.993521421107628, "grad_norm": 0.8096509422884268, "learning_rate": 1.0522082772687312e-05, "loss": 0.106, "step": 14324 }, { "epoch": 2.993730407523511, "grad_norm": 0.9830979400114077, "learning_rate": 1.052095618641512e-05, "loss": 0.1122, "step": 14325 }, { "epoch": 2.993939393939394, "grad_norm": 0.8444692342886864, "learning_rate": 1.0519829593512938e-05, "loss": 0.1122, "step": 14326 }, { "epoch": 2.994148380355277, "grad_norm": 0.9714443219339398, "learning_rate": 1.05187029939951e-05, "loss": 0.1242, "step": 14327 }, { "epoch": 2.99435736677116, "grad_norm": 0.9679828938722007, "learning_rate": 1.0517576387875948e-05, "loss": 0.1235, "step": 14328 }, { "epoch": 2.994566353187043, "grad_norm": 1.119320305198932, "learning_rate": 1.0516449775169819e-05, "loss": 0.1446, "step": 14329 }, { "epoch": 2.994775339602926, "grad_norm": 0.8930357152460998, "learning_rate": 1.051532315589105e-05, "loss": 0.1229, "step": 14330 }, { "epoch": 2.994984326018809, "grad_norm": 0.8900930338201579, "learning_rate": 1.051419653005398e-05, "loss": 0.1152, "step": 14331 }, { "epoch": 2.995193312434692, "grad_norm": 1.0920557797394719, "learning_rate": 1.0513069897672945e-05, "loss": 0.1129, "step": 14332 }, { "epoch": 2.995402298850575, "grad_norm": 1.0375450657239103, "learning_rate": 1.0511943258762285e-05, "loss": 0.1252, "step": 14333 }, { "epoch": 2.995611285266458, "grad_norm": 0.9209453773542376, "learning_rate": 1.0510816613336338e-05, "loss": 0.1182, "step": 14334 }, { "epoch": 2.9958202716823408, "grad_norm": 1.0524151163603186, "learning_rate": 1.0509689961409446e-05, "loss": 0.1449, "step": 14335 }, { "epoch": 2.9960292580982237, "grad_norm": 0.8359847802020872, "learning_rate": 1.0508563302995939e-05, "loss": 0.1168, "step": 14336 }, { "epoch": 2.9962382445141067, "grad_norm": 0.9241271823459721, "learning_rate": 1.0507436638110162e-05, "loss": 0.1314, "step": 14337 }, { "epoch": 2.9964472309299897, "grad_norm": 0.9000049305814828, "learning_rate": 1.0506309966766453e-05, "loss": 0.121, "step": 14338 }, { "epoch": 2.9966562173458726, "grad_norm": 1.0382610659355493, "learning_rate": 1.0505183288979148e-05, "loss": 0.1308, "step": 14339 }, { "epoch": 2.9968652037617556, "grad_norm": 0.9250827489742453, "learning_rate": 1.0504056604762588e-05, "loss": 0.1166, "step": 14340 }, { "epoch": 2.9970741901776385, "grad_norm": 0.9894984507966921, "learning_rate": 1.0502929914131111e-05, "loss": 0.1119, "step": 14341 }, { "epoch": 2.9972831765935215, "grad_norm": 0.9885752521434532, "learning_rate": 1.0501803217099054e-05, "loss": 0.1177, "step": 14342 }, { "epoch": 2.9974921630094045, "grad_norm": 0.7958773621561516, "learning_rate": 1.0500676513680761e-05, "loss": 0.0984, "step": 14343 }, { "epoch": 2.9977011494252874, "grad_norm": 0.9127927259332201, "learning_rate": 1.049954980389057e-05, "loss": 0.1226, "step": 14344 }, { "epoch": 2.9979101358411704, "grad_norm": 0.756874944550809, "learning_rate": 1.049842308774281e-05, "loss": 0.091, "step": 14345 }, { "epoch": 2.9981191222570533, "grad_norm": 0.7412089222555813, "learning_rate": 1.0497296365251836e-05, "loss": 0.097, "step": 14346 }, { "epoch": 2.9983281086729363, "grad_norm": 0.9447053258466681, "learning_rate": 1.0496169636431979e-05, "loss": 0.1157, "step": 14347 }, { "epoch": 2.9985370950888193, "grad_norm": 1.4083116652337746, "learning_rate": 1.0495042901297576e-05, "loss": 0.1118, "step": 14348 }, { "epoch": 2.9987460815047022, "grad_norm": 0.9770876742391301, "learning_rate": 1.0493916159862971e-05, "loss": 0.1045, "step": 14349 }, { "epoch": 2.998955067920585, "grad_norm": 0.8969671628797252, "learning_rate": 1.0492789412142506e-05, "loss": 0.1153, "step": 14350 }, { "epoch": 2.999164054336468, "grad_norm": 1.1403610045502492, "learning_rate": 1.049166265815051e-05, "loss": 0.1369, "step": 14351 }, { "epoch": 2.999373040752351, "grad_norm": 1.0324103151887987, "learning_rate": 1.0490535897901334e-05, "loss": 0.1371, "step": 14352 }, { "epoch": 2.999582027168234, "grad_norm": 0.9603486445233026, "learning_rate": 1.0489409131409314e-05, "loss": 0.1183, "step": 14353 }, { "epoch": 2.999791013584117, "grad_norm": 1.007628370857741, "learning_rate": 1.0488282358688788e-05, "loss": 0.1125, "step": 14354 }, { "epoch": 3.0, "grad_norm": 0.7518053688675155, "learning_rate": 1.0487155579754098e-05, "loss": 0.0726, "step": 14355 } ], "logging_steps": 1.0, "max_steps": 28710, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 5000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.381824342294528e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }