{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5432517941199166, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8813431424399217, "epoch": 0.003703989505363068, "grad_norm": 0.0770227387547493, "learning_rate": 4.4e-06, "loss": 0.9689, "mean_token_accuracy": 0.7800739134351412, "num_tokens": 2121076.0, "step": 12 }, { "entropy": 0.8696938330928484, "epoch": 0.007407979010726136, "grad_norm": 0.07951901108026505, "learning_rate": 9.2e-06, "loss": 0.9476, "mean_token_accuracy": 0.7851626686751842, "num_tokens": 4244776.0, "step": 24 }, { "entropy": 0.9179414622485638, "epoch": 0.011111968516089204, "grad_norm": 0.07233385741710663, "learning_rate": 1.4000000000000001e-05, "loss": 0.9521, "mean_token_accuracy": 0.783088818192482, "num_tokens": 6391196.0, "step": 36 }, { "entropy": 0.9946566758056482, "epoch": 0.014815958021452273, "grad_norm": 0.061557795852422714, "learning_rate": 1.88e-05, "loss": 0.9499, "mean_token_accuracy": 0.7822788568834463, "num_tokens": 8503157.0, "step": 48 }, { "entropy": 0.9443877513209978, "epoch": 0.01851994752681534, "grad_norm": 0.056875523179769516, "learning_rate": 2.36e-05, "loss": 0.9141, "mean_token_accuracy": 0.7903395928442478, "num_tokens": 10635230.0, "step": 60 }, { "entropy": 0.9328317133088907, "epoch": 0.02222393703217841, "grad_norm": 0.06272337585687637, "learning_rate": 2.84e-05, "loss": 0.9189, "mean_token_accuracy": 0.7886765040457249, "num_tokens": 12765188.0, "step": 72 }, { "entropy": 0.9268471834560236, "epoch": 0.025927926537541477, "grad_norm": 0.06604354083538055, "learning_rate": 3.32e-05, "loss": 0.9058, "mean_token_accuracy": 0.7908310927450657, "num_tokens": 14906939.0, "step": 84 }, { "entropy": 0.9278339172403017, "epoch": 0.029631916042904546, "grad_norm": 0.07385515421628952, "learning_rate": 3.8e-05, "loss": 0.9058, "mean_token_accuracy": 0.7903257831931114, "num_tokens": 17063752.0, "step": 96 }, { "entropy": 0.920132706562678, "epoch": 0.03333590554826761, "grad_norm": 0.08544895797967911, "learning_rate": 4.2800000000000004e-05, "loss": 0.8985, "mean_token_accuracy": 0.7918331262965997, "num_tokens": 19212379.0, "step": 108 }, { "entropy": 0.9165268304447333, "epoch": 0.03703989505363068, "grad_norm": 0.07440265268087387, "learning_rate": 4.76e-05, "loss": 0.8934, "mean_token_accuracy": 0.7929873218139013, "num_tokens": 21338398.0, "step": 120 }, { "entropy": 0.9263193979859352, "epoch": 0.04074388455899375, "grad_norm": 0.07524750381708145, "learning_rate": 5.2400000000000007e-05, "loss": 0.9026, "mean_token_accuracy": 0.7904796029130617, "num_tokens": 23447525.0, "step": 132 }, { "entropy": 0.9026983591417471, "epoch": 0.04444787406435682, "grad_norm": 0.0853290930390358, "learning_rate": 5.72e-05, "loss": 0.8807, "mean_token_accuracy": 0.7949702950815359, "num_tokens": 25576292.0, "step": 144 }, { "entropy": 0.91398652891318, "epoch": 0.048151863569719885, "grad_norm": 0.08753567934036255, "learning_rate": 6.2e-05, "loss": 0.893, "mean_token_accuracy": 0.7922145264844099, "num_tokens": 27677943.0, "step": 156 }, { "entropy": 0.9047610275447369, "epoch": 0.051855853075082954, "grad_norm": 0.09057004004716873, "learning_rate": 6.680000000000001e-05, "loss": 0.8825, "mean_token_accuracy": 0.7944387656946977, "num_tokens": 29799957.0, "step": 168 }, { "entropy": 0.8962277062237263, "epoch": 0.05555984258044602, "grad_norm": 0.09516258537769318, "learning_rate": 7.16e-05, "loss": 0.8759, "mean_token_accuracy": 0.7956550841530164, "num_tokens": 31935987.0, "step": 180 }, { "entropy": 0.8993425803879896, "epoch": 0.05926383208580909, "grad_norm": 0.09167130291461945, "learning_rate": 7.64e-05, "loss": 0.8781, "mean_token_accuracy": 0.7949150291581949, "num_tokens": 34073688.0, "step": 192 }, { "entropy": 0.8955355410774549, "epoch": 0.06296782159117216, "grad_norm": 0.10528211295604706, "learning_rate": 8.120000000000001e-05, "loss": 0.8769, "mean_token_accuracy": 0.7950745013852915, "num_tokens": 36211776.0, "step": 204 }, { "entropy": 0.890474279721578, "epoch": 0.06667181109653522, "grad_norm": 0.09239654988050461, "learning_rate": 8.6e-05, "loss": 0.8681, "mean_token_accuracy": 0.7962082512676716, "num_tokens": 38314176.0, "step": 216 }, { "entropy": 0.8846843503415585, "epoch": 0.0703758006018983, "grad_norm": 0.08734577894210815, "learning_rate": 9.080000000000001e-05, "loss": 0.864, "mean_token_accuracy": 0.7977614911894003, "num_tokens": 40427803.0, "step": 228 }, { "entropy": 0.8876174452404181, "epoch": 0.07407979010726136, "grad_norm": 0.09199592471122742, "learning_rate": 9.56e-05, "loss": 0.8663, "mean_token_accuracy": 0.7968363290031751, "num_tokens": 42601795.0, "step": 240 }, { "entropy": 0.8842576717336973, "epoch": 0.07778377961262443, "grad_norm": 0.09066279232501984, "learning_rate": 9.999999821283761e-05, "loss": 0.8596, "mean_token_accuracy": 0.7992542050778866, "num_tokens": 44720639.0, "step": 252 }, { "entropy": 0.8819859276215235, "epoch": 0.0814877691179875, "grad_norm": 0.09380369633436203, "learning_rate": 9.999969796985704e-05, "loss": 0.8606, "mean_token_accuracy": 0.7987109025319418, "num_tokens": 46822627.0, "step": 264 }, { "entropy": 0.8764691551526388, "epoch": 0.08519175862335057, "grad_norm": 0.08952156454324722, "learning_rate": 9.999888302765345e-05, "loss": 0.8564, "mean_token_accuracy": 0.7992806943754355, "num_tokens": 48939905.0, "step": 276 }, { "entropy": 0.8867798671126366, "epoch": 0.08889574812871363, "grad_norm": 0.0936591774225235, "learning_rate": 9.999755339461591e-05, "loss": 0.8668, "mean_token_accuracy": 0.7967317899068197, "num_tokens": 51041899.0, "step": 288 }, { "entropy": 0.8787145999570688, "epoch": 0.09259973763407671, "grad_norm": 0.0926186814904213, "learning_rate": 9.999570908443172e-05, "loss": 0.8574, "mean_token_accuracy": 0.7985943108797073, "num_tokens": 53173319.0, "step": 300 }, { "entropy": 0.8810293078422546, "epoch": 0.09630372713943977, "grad_norm": 0.09375683963298798, "learning_rate": 9.99933501160863e-05, "loss": 0.8602, "mean_token_accuracy": 0.7984230530758699, "num_tokens": 55298285.0, "step": 312 }, { "entropy": 0.8722913352151712, "epoch": 0.10000771664480285, "grad_norm": 0.09732625633478165, "learning_rate": 9.999047651386295e-05, "loss": 0.8545, "mean_token_accuracy": 0.7996316676338514, "num_tokens": 57422076.0, "step": 324 }, { "entropy": 0.8687134782473246, "epoch": 0.10371170615016591, "grad_norm": 0.08659056574106216, "learning_rate": 9.99870883073427e-05, "loss": 0.8453, "mean_token_accuracy": 0.8012902028858662, "num_tokens": 59585248.0, "step": 336 }, { "entropy": 0.8830971730252107, "epoch": 0.10741569565552897, "grad_norm": 0.08845651149749756, "learning_rate": 9.998318553140387e-05, "loss": 0.8627, "mean_token_accuracy": 0.7974087223410606, "num_tokens": 61673684.0, "step": 348 }, { "entropy": 0.8765653309722742, "epoch": 0.11111968516089205, "grad_norm": 0.08888445049524307, "learning_rate": 9.997876822622186e-05, "loss": 0.8556, "mean_token_accuracy": 0.7995972596108913, "num_tokens": 63792291.0, "step": 360 }, { "entropy": 0.871281652400891, "epoch": 0.11482367466625511, "grad_norm": 0.09176748991012573, "learning_rate": 9.99738364372686e-05, "loss": 0.8534, "mean_token_accuracy": 0.7989814169704914, "num_tokens": 65901685.0, "step": 372 }, { "entropy": 0.8688261434435844, "epoch": 0.11852766417161818, "grad_norm": 0.09121166169643402, "learning_rate": 9.996839021531213e-05, "loss": 0.8467, "mean_token_accuracy": 0.8007690558830897, "num_tokens": 68051060.0, "step": 384 }, { "entropy": 0.8706431252261003, "epoch": 0.12223165367698124, "grad_norm": 0.09300525486469269, "learning_rate": 9.996242961641615e-05, "loss": 0.8536, "mean_token_accuracy": 0.7998151158293089, "num_tokens": 70196769.0, "step": 396 }, { "entropy": 0.8776354491710663, "epoch": 0.12593564318234432, "grad_norm": 0.08426107466220856, "learning_rate": 9.995595470193933e-05, "loss": 0.8545, "mean_token_accuracy": 0.7993121594190598, "num_tokens": 72307999.0, "step": 408 }, { "entropy": 0.8773620029290518, "epoch": 0.1296396326877074, "grad_norm": 0.09252817928791046, "learning_rate": 9.994896553853472e-05, "loss": 0.8581, "mean_token_accuracy": 0.7982841742535433, "num_tokens": 74422316.0, "step": 420 }, { "entropy": 0.8797574390967687, "epoch": 0.13334362219307044, "grad_norm": 0.08833932876586914, "learning_rate": 9.994146219814912e-05, "loss": 0.8583, "mean_token_accuracy": 0.7982658172647158, "num_tokens": 76535665.0, "step": 432 }, { "entropy": 0.8648973529537519, "epoch": 0.13704761169843352, "grad_norm": 0.08894747495651245, "learning_rate": 9.993344475802226e-05, "loss": 0.8449, "mean_token_accuracy": 0.8004408640166124, "num_tokens": 78656429.0, "step": 444 }, { "entropy": 0.8582859995464484, "epoch": 0.1407516012037966, "grad_norm": 0.09439876675605774, "learning_rate": 9.992491330068606e-05, "loss": 0.8369, "mean_token_accuracy": 0.8031645379960537, "num_tokens": 80814117.0, "step": 456 }, { "entropy": 0.8811340468625227, "epoch": 0.14445559070915967, "grad_norm": 0.0849665030837059, "learning_rate": 9.99158679139637e-05, "loss": 0.858, "mean_token_accuracy": 0.7984087901810805, "num_tokens": 82925792.0, "step": 468 }, { "entropy": 0.8554788815478483, "epoch": 0.14815958021452272, "grad_norm": 0.0906689241528511, "learning_rate": 9.990630869096883e-05, "loss": 0.8383, "mean_token_accuracy": 0.8024374966820081, "num_tokens": 85082604.0, "step": 480 }, { "entropy": 0.8597272150218487, "epoch": 0.1518635697198858, "grad_norm": 0.08773977309465408, "learning_rate": 9.989623573010455e-05, "loss": 0.8391, "mean_token_accuracy": 0.8020844186345736, "num_tokens": 87207502.0, "step": 492 }, { "entropy": 0.8616875174144903, "epoch": 0.15556755922524887, "grad_norm": 0.08853704482316971, "learning_rate": 9.988564913506238e-05, "loss": 0.8414, "mean_token_accuracy": 0.801638551056385, "num_tokens": 89348327.0, "step": 504 }, { "entropy": 0.8478845059871674, "epoch": 0.15927154873061192, "grad_norm": 0.09445828199386597, "learning_rate": 9.987454901482122e-05, "loss": 0.8285, "mean_token_accuracy": 0.8042828006048998, "num_tokens": 91504049.0, "step": 516 }, { "entropy": 0.8598806957403818, "epoch": 0.162975538235975, "grad_norm": 0.09136994183063507, "learning_rate": 9.986293548364622e-05, "loss": 0.8411, "mean_token_accuracy": 0.8020358892778555, "num_tokens": 93626935.0, "step": 528 }, { "entropy": 0.8653969628115495, "epoch": 0.16667952774133807, "grad_norm": 0.09781411290168762, "learning_rate": 9.985080866108762e-05, "loss": 0.8444, "mean_token_accuracy": 0.8006485402584076, "num_tokens": 95703749.0, "step": 540 }, { "entropy": 0.8538096360862255, "epoch": 0.17038351724670114, "grad_norm": 0.08897630125284195, "learning_rate": 9.983816867197953e-05, "loss": 0.8359, "mean_token_accuracy": 0.8030762349565824, "num_tokens": 97836700.0, "step": 552 }, { "entropy": 0.8514401825765768, "epoch": 0.1740875067520642, "grad_norm": 0.09062942117452621, "learning_rate": 9.982501564643852e-05, "loss": 0.8295, "mean_token_accuracy": 0.8046696037054062, "num_tokens": 99924036.0, "step": 564 }, { "entropy": 0.8512975362439951, "epoch": 0.17779149625742727, "grad_norm": 0.0891498252749443, "learning_rate": 9.98113497198625e-05, "loss": 0.8304, "mean_token_accuracy": 0.8033984725673994, "num_tokens": 102057627.0, "step": 576 }, { "entropy": 0.8456763414045175, "epoch": 0.18149548576279034, "grad_norm": 0.0918072909116745, "learning_rate": 9.979717103292912e-05, "loss": 0.8262, "mean_token_accuracy": 0.8045558805267016, "num_tokens": 104162821.0, "step": 588 }, { "entropy": 0.8628777662913004, "epoch": 0.18519947526815342, "grad_norm": 0.09555666148662567, "learning_rate": 9.978247973159448e-05, "loss": 0.8427, "mean_token_accuracy": 0.8011121414601803, "num_tokens": 106260558.0, "step": 600 }, { "entropy": 0.8535663560032845, "epoch": 0.18890346477351647, "grad_norm": 0.09658853709697723, "learning_rate": 9.97672759670915e-05, "loss": 0.8311, "mean_token_accuracy": 0.8037117148439089, "num_tokens": 108347746.0, "step": 612 }, { "entropy": 0.850828155875206, "epoch": 0.19260745427887954, "grad_norm": 0.09168772399425507, "learning_rate": 9.975155989592844e-05, "loss": 0.8351, "mean_token_accuracy": 0.8032047462960085, "num_tokens": 110475744.0, "step": 624 }, { "entropy": 0.8446941214303175, "epoch": 0.19631144378424262, "grad_norm": 0.09660109132528305, "learning_rate": 9.973533167988728e-05, "loss": 0.8261, "mean_token_accuracy": 0.805509191006422, "num_tokens": 112562500.0, "step": 636 }, { "entropy": 0.8449327821532885, "epoch": 0.2000154332896057, "grad_norm": 0.0988537073135376, "learning_rate": 9.971859148602202e-05, "loss": 0.8235, "mean_token_accuracy": 0.8050300491352876, "num_tokens": 114663477.0, "step": 648 }, { "entropy": 0.8418795031805834, "epoch": 0.20371942279496874, "grad_norm": 0.10221028327941895, "learning_rate": 9.970133948665702e-05, "loss": 0.8259, "mean_token_accuracy": 0.8041424031058947, "num_tokens": 116824983.0, "step": 660 }, { "entropy": 0.8358608248333136, "epoch": 0.20742341230033182, "grad_norm": 0.09639787673950195, "learning_rate": 9.968357585938515e-05, "loss": 0.815, "mean_token_accuracy": 0.8072937255104383, "num_tokens": 118942813.0, "step": 672 }, { "entropy": 0.8451284418503443, "epoch": 0.2111274018056949, "grad_norm": 0.09338120371103287, "learning_rate": 9.966530078706599e-05, "loss": 0.8267, "mean_token_accuracy": 0.8043443597853184, "num_tokens": 121069569.0, "step": 684 }, { "entropy": 0.8439769372344017, "epoch": 0.21483139131105794, "grad_norm": 0.10154031217098236, "learning_rate": 9.964651445782405e-05, "loss": 0.8258, "mean_token_accuracy": 0.8052287250757217, "num_tokens": 123193984.0, "step": 696 }, { "entropy": 0.8514358488221964, "epoch": 0.21853538081642102, "grad_norm": 0.09599766135215759, "learning_rate": 9.962721706504663e-05, "loss": 0.8331, "mean_token_accuracy": 0.8030676891406378, "num_tokens": 125349706.0, "step": 708 }, { "entropy": 0.8406069638828436, "epoch": 0.2222393703217841, "grad_norm": 0.09738138318061829, "learning_rate": 9.9607408807382e-05, "loss": 0.8218, "mean_token_accuracy": 0.8066203904648622, "num_tokens": 127475845.0, "step": 720 }, { "entropy": 0.8436915278434753, "epoch": 0.22594335982714717, "grad_norm": 0.0990961417555809, "learning_rate": 9.958708988873729e-05, "loss": 0.8254, "mean_token_accuracy": 0.8044916093349457, "num_tokens": 129575388.0, "step": 732 }, { "entropy": 0.8330078596870104, "epoch": 0.22964734933251021, "grad_norm": 0.09744027256965637, "learning_rate": 9.956626051827643e-05, "loss": 0.8138, "mean_token_accuracy": 0.8071556240320206, "num_tokens": 131698547.0, "step": 744 }, { "entropy": 0.8432958399256071, "epoch": 0.2333513388378733, "grad_norm": 0.1007818877696991, "learning_rate": 9.954492091041788e-05, "loss": 0.8227, "mean_token_accuracy": 0.804592452943325, "num_tokens": 133841198.0, "step": 756 }, { "entropy": 0.836149275302887, "epoch": 0.23705532834323637, "grad_norm": 0.09519964456558228, "learning_rate": 9.952307128483256e-05, "loss": 0.8178, "mean_token_accuracy": 0.8064361910025278, "num_tokens": 135976771.0, "step": 768 }, { "entropy": 0.8368441437681516, "epoch": 0.24075931784859944, "grad_norm": 0.10342419147491455, "learning_rate": 9.950071186644159e-05, "loss": 0.8176, "mean_token_accuracy": 0.8063248756031195, "num_tokens": 138094553.0, "step": 780 }, { "entropy": 0.8424511601527532, "epoch": 0.2444633073539625, "grad_norm": 0.0969947949051857, "learning_rate": 9.94778428854138e-05, "loss": 0.8208, "mean_token_accuracy": 0.8055132577816645, "num_tokens": 140199892.0, "step": 792 }, { "entropy": 0.8211217597126961, "epoch": 0.24816729685932556, "grad_norm": 0.09834083914756775, "learning_rate": 9.945446457716359e-05, "loss": 0.8014, "mean_token_accuracy": 0.8097951101760069, "num_tokens": 142340717.0, "step": 804 }, { "entropy": 0.8515114995340506, "epoch": 0.25187128636468864, "grad_norm": 0.10160677880048752, "learning_rate": 9.943057718234836e-05, "loss": 0.8317, "mean_token_accuracy": 0.802549467732509, "num_tokens": 144490208.0, "step": 816 }, { "entropy": 0.8411018749078115, "epoch": 0.2555752758700517, "grad_norm": 0.10110847651958466, "learning_rate": 9.940618094686603e-05, "loss": 0.8243, "mean_token_accuracy": 0.8038219797114531, "num_tokens": 146598967.0, "step": 828 }, { "entropy": 0.8269097929199537, "epoch": 0.2592792653754148, "grad_norm": 0.10856305807828903, "learning_rate": 9.938127612185261e-05, "loss": 0.8078, "mean_token_accuracy": 0.808723546564579, "num_tokens": 148706602.0, "step": 840 }, { "entropy": 0.8350271495680014, "epoch": 0.26298325488077784, "grad_norm": 0.10275018215179443, "learning_rate": 9.935586296367953e-05, "loss": 0.8144, "mean_token_accuracy": 0.8073840402066708, "num_tokens": 150831294.0, "step": 852 }, { "entropy": 0.8402173941334089, "epoch": 0.2666872443861409, "grad_norm": 0.10895514488220215, "learning_rate": 9.932994173395103e-05, "loss": 0.823, "mean_token_accuracy": 0.8050992513696352, "num_tokens": 152914180.0, "step": 864 }, { "entropy": 0.835850744197766, "epoch": 0.270391233891504, "grad_norm": 0.1103854700922966, "learning_rate": 9.930351269950143e-05, "loss": 0.8149, "mean_token_accuracy": 0.8063218059639136, "num_tokens": 155043841.0, "step": 876 }, { "entropy": 0.8304483393828074, "epoch": 0.27409522339686704, "grad_norm": 0.09872893989086151, "learning_rate": 9.927657613239247e-05, "loss": 0.8124, "mean_token_accuracy": 0.8073564060032368, "num_tokens": 157194273.0, "step": 888 }, { "entropy": 0.8408761695027351, "epoch": 0.2777992129022301, "grad_norm": 0.10328029841184616, "learning_rate": 9.924913230991044e-05, "loss": 0.8201, "mean_token_accuracy": 0.8051391566793124, "num_tokens": 159325166.0, "step": 900 }, { "entropy": 0.8261769798894724, "epoch": 0.2815032024075932, "grad_norm": 0.1065843477845192, "learning_rate": 9.922118151456327e-05, "loss": 0.8074, "mean_token_accuracy": 0.8085715671380361, "num_tokens": 161429666.0, "step": 912 }, { "entropy": 0.8249383146564165, "epoch": 0.28520719191295624, "grad_norm": 0.10588902980089188, "learning_rate": 9.919272403407782e-05, "loss": 0.8061, "mean_token_accuracy": 0.8078173498312632, "num_tokens": 163588436.0, "step": 924 }, { "entropy": 0.8257463040451208, "epoch": 0.28891118141831934, "grad_norm": 0.10355869680643082, "learning_rate": 9.91637601613967e-05, "loss": 0.8065, "mean_token_accuracy": 0.8088524142901102, "num_tokens": 165703594.0, "step": 936 }, { "entropy": 0.8323748372495174, "epoch": 0.2926151709236824, "grad_norm": 0.11059477180242538, "learning_rate": 9.913429019467534e-05, "loss": 0.8135, "mean_token_accuracy": 0.8072844197352728, "num_tokens": 167834747.0, "step": 948 }, { "entropy": 0.8198406957089901, "epoch": 0.29631916042904544, "grad_norm": 0.11167987436056137, "learning_rate": 9.910431443727897e-05, "loss": 0.8005, "mean_token_accuracy": 0.8094635804494222, "num_tokens": 169954318.0, "step": 960 }, { "entropy": 0.8273561559617519, "epoch": 0.30002314993440854, "grad_norm": 0.11263331025838852, "learning_rate": 9.907383319777945e-05, "loss": 0.8066, "mean_token_accuracy": 0.8084728009998798, "num_tokens": 172056219.0, "step": 972 }, { "entropy": 0.8382978811860085, "epoch": 0.3037271394397716, "grad_norm": 0.1031041070818901, "learning_rate": 9.904284678995206e-05, "loss": 0.8204, "mean_token_accuracy": 0.8051142630477747, "num_tokens": 174209607.0, "step": 984 }, { "entropy": 0.8209519870579243, "epoch": 0.30743112894513464, "grad_norm": 0.10300373286008835, "learning_rate": 9.901135553277232e-05, "loss": 0.798, "mean_token_accuracy": 0.8102124035358429, "num_tokens": 176340332.0, "step": 996 }, { "entropy": 0.8192687357465426, "epoch": 0.31113511845049774, "grad_norm": 0.11473007500171661, "learning_rate": 9.897935975041273e-05, "loss": 0.8024, "mean_token_accuracy": 0.8089515815178553, "num_tokens": 178465989.0, "step": 1008 }, { "entropy": 0.8185019778708617, "epoch": 0.3148391079558608, "grad_norm": 0.10434567183256149, "learning_rate": 9.894685977223934e-05, "loss": 0.7995, "mean_token_accuracy": 0.8095376479128996, "num_tokens": 180608399.0, "step": 1020 }, { "entropy": 0.8181099407374859, "epoch": 0.31854309746122383, "grad_norm": 0.10841691493988037, "learning_rate": 9.891385593280847e-05, "loss": 0.7983, "mean_token_accuracy": 0.8103865322967371, "num_tokens": 182744332.0, "step": 1032 }, { "entropy": 0.8311556254824003, "epoch": 0.32224708696658694, "grad_norm": 0.11030473560094833, "learning_rate": 9.888034857186315e-05, "loss": 0.8107, "mean_token_accuracy": 0.8080282248556614, "num_tokens": 184878580.0, "step": 1044 }, { "entropy": 0.8324671102066835, "epoch": 0.32595107647195, "grad_norm": 0.11505813896656036, "learning_rate": 9.884633803432972e-05, "loss": 0.816, "mean_token_accuracy": 0.8058654479682446, "num_tokens": 186987179.0, "step": 1056 }, { "entropy": 0.8222536593675613, "epoch": 0.3296550659773131, "grad_norm": 0.11003649234771729, "learning_rate": 9.881182467031427e-05, "loss": 0.8024, "mean_token_accuracy": 0.8091614345709482, "num_tokens": 189146940.0, "step": 1068 }, { "entropy": 0.8215553897122542, "epoch": 0.33335905548267614, "grad_norm": 0.11053642630577087, "learning_rate": 9.877680883509895e-05, "loss": 0.8045, "mean_token_accuracy": 0.8081031031906605, "num_tokens": 191273140.0, "step": 1080 }, { "entropy": 0.8301792852580547, "epoch": 0.3370630449880392, "grad_norm": 0.11618710309267044, "learning_rate": 9.874129088913842e-05, "loss": 0.8111, "mean_token_accuracy": 0.8073916758100191, "num_tokens": 193419995.0, "step": 1092 }, { "entropy": 0.8200336868564287, "epoch": 0.3407670344934023, "grad_norm": 0.11068252474069595, "learning_rate": 9.870527119805603e-05, "loss": 0.7987, "mean_token_accuracy": 0.8100821475187937, "num_tokens": 195570082.0, "step": 1104 }, { "entropy": 0.8287765470643839, "epoch": 0.34447102399876534, "grad_norm": 0.10776914656162262, "learning_rate": 9.866875013264023e-05, "loss": 0.8118, "mean_token_accuracy": 0.8070496221383413, "num_tokens": 197675630.0, "step": 1116 }, { "entropy": 0.8167796346048514, "epoch": 0.3481750135041284, "grad_norm": 0.10457509756088257, "learning_rate": 9.863172806884051e-05, "loss": 0.798, "mean_token_accuracy": 0.8097474686801434, "num_tokens": 199804447.0, "step": 1128 }, { "entropy": 0.8168069496750832, "epoch": 0.3518790030094915, "grad_norm": 0.10983676463365555, "learning_rate": 9.859420538776376e-05, "loss": 0.7981, "mean_token_accuracy": 0.8102098839978377, "num_tokens": 201919269.0, "step": 1140 }, { "entropy": 0.808088593184948, "epoch": 0.35558299251485453, "grad_norm": 0.11442640423774719, "learning_rate": 9.855618247567018e-05, "loss": 0.7926, "mean_token_accuracy": 0.8110264576971531, "num_tokens": 204057926.0, "step": 1152 }, { "entropy": 0.8169751837849617, "epoch": 0.3592869820202176, "grad_norm": 0.10822667181491852, "learning_rate": 9.851765972396943e-05, "loss": 0.7967, "mean_token_accuracy": 0.8103730641305447, "num_tokens": 206175396.0, "step": 1164 }, { "entropy": 0.814443551003933, "epoch": 0.3629909715255807, "grad_norm": 0.11191259324550629, "learning_rate": 9.847863752921649e-05, "loss": 0.7933, "mean_token_accuracy": 0.8108923596640428, "num_tokens": 208298979.0, "step": 1176 }, { "entropy": 0.807010448227326, "epoch": 0.36669496103094373, "grad_norm": 0.11479055136442184, "learning_rate": 9.843911629310764e-05, "loss": 0.7892, "mean_token_accuracy": 0.8113560838003954, "num_tokens": 210441373.0, "step": 1188 }, { "entropy": 0.8038304224610329, "epoch": 0.37039895053630684, "grad_norm": 0.12469020485877991, "learning_rate": 9.839909642247637e-05, "loss": 0.7845, "mean_token_accuracy": 0.8129827156662941, "num_tokens": 212573549.0, "step": 1200 }, { "entropy": 0.8132020806272825, "epoch": 0.3741029400416699, "grad_norm": 0.11616237461566925, "learning_rate": 9.835857832928908e-05, "loss": 0.7946, "mean_token_accuracy": 0.8103696592152119, "num_tokens": 214646922.0, "step": 1212 }, { "entropy": 0.8121326218048731, "epoch": 0.37780692954703293, "grad_norm": 0.11775597929954529, "learning_rate": 9.831756243064088e-05, "loss": 0.7945, "mean_token_accuracy": 0.8108564031620821, "num_tokens": 216767641.0, "step": 1224 }, { "entropy": 0.8172868477801482, "epoch": 0.38151091905239604, "grad_norm": 0.11528316140174866, "learning_rate": 9.827604914875139e-05, "loss": 0.7979, "mean_token_accuracy": 0.8101303689181805, "num_tokens": 218905400.0, "step": 1236 }, { "entropy": 0.8001810808976492, "epoch": 0.3852149085577591, "grad_norm": 0.1265256702899933, "learning_rate": 9.823403891096024e-05, "loss": 0.7804, "mean_token_accuracy": 0.813588964442412, "num_tokens": 220993960.0, "step": 1248 }, { "entropy": 0.8007968428234259, "epoch": 0.38891889806312213, "grad_norm": 0.11904493719339371, "learning_rate": 9.819153214972279e-05, "loss": 0.7826, "mean_token_accuracy": 0.8128631959358851, "num_tokens": 223114091.0, "step": 1260 }, { "entropy": 0.8208761115868887, "epoch": 0.39262288756848523, "grad_norm": 0.12480930238962173, "learning_rate": 9.814852930260561e-05, "loss": 0.7983, "mean_token_accuracy": 0.8095660296579202, "num_tokens": 225247556.0, "step": 1272 }, { "entropy": 0.815931453059117, "epoch": 0.3963268770738483, "grad_norm": 0.11885613203048706, "learning_rate": 9.810503081228202e-05, "loss": 0.7996, "mean_token_accuracy": 0.8100833334028721, "num_tokens": 227351759.0, "step": 1284 }, { "entropy": 0.8205481581389904, "epoch": 0.4000308665792114, "grad_norm": 0.11844924092292786, "learning_rate": 9.80610371265275e-05, "loss": 0.8019, "mean_token_accuracy": 0.808446753770113, "num_tokens": 229441295.0, "step": 1296 }, { "entropy": 0.8117605733374754, "epoch": 0.40373485608457443, "grad_norm": 0.11241784691810608, "learning_rate": 9.801654869821512e-05, "loss": 0.7932, "mean_token_accuracy": 0.81082005550464, "num_tokens": 231566470.0, "step": 1308 }, { "entropy": 0.8152283951640129, "epoch": 0.4074388455899375, "grad_norm": 0.13755977153778076, "learning_rate": 9.797156598531085e-05, "loss": 0.7936, "mean_token_accuracy": 0.8106731151541074, "num_tokens": 233658940.0, "step": 1320 }, { "entropy": 0.8065759042898814, "epoch": 0.4111428350953006, "grad_norm": 0.11853731423616409, "learning_rate": 9.79260894508688e-05, "loss": 0.7894, "mean_token_accuracy": 0.8116564750671387, "num_tokens": 235792409.0, "step": 1332 }, { "entropy": 0.8189149846633276, "epoch": 0.41484682460066363, "grad_norm": 0.11743967235088348, "learning_rate": 9.788011956302656e-05, "loss": 0.798, "mean_token_accuracy": 0.8094673914213976, "num_tokens": 237882728.0, "step": 1344 }, { "entropy": 0.8161345782379309, "epoch": 0.4185508141060267, "grad_norm": 0.12426267564296722, "learning_rate": 9.783365679500027e-05, "loss": 0.7954, "mean_token_accuracy": 0.8102164330581824, "num_tokens": 240024867.0, "step": 1356 }, { "entropy": 0.7942858090003332, "epoch": 0.4222548036113898, "grad_norm": 0.11311879754066467, "learning_rate": 9.778670162507986e-05, "loss": 0.7759, "mean_token_accuracy": 0.8148905138174692, "num_tokens": 242149516.0, "step": 1368 }, { "entropy": 0.803426214804252, "epoch": 0.42595879311675283, "grad_norm": 0.11434963345527649, "learning_rate": 9.773925453662403e-05, "loss": 0.785, "mean_token_accuracy": 0.8126038151482741, "num_tokens": 244256116.0, "step": 1380 }, { "entropy": 0.7926236540079117, "epoch": 0.4296627826221159, "grad_norm": 0.10715149343013763, "learning_rate": 9.769131601805534e-05, "loss": 0.7733, "mean_token_accuracy": 0.8152074652413527, "num_tokens": 246406475.0, "step": 1392 }, { "entropy": 0.8100047335028648, "epoch": 0.433366772127479, "grad_norm": 0.11607677489519119, "learning_rate": 9.76428865628551e-05, "loss": 0.7907, "mean_token_accuracy": 0.8116505754490694, "num_tokens": 248494590.0, "step": 1404 }, { "entropy": 0.8006227687001228, "epoch": 0.43707076163284203, "grad_norm": 0.11636164784431458, "learning_rate": 9.75939666695584e-05, "loss": 0.7815, "mean_token_accuracy": 0.8131669908761978, "num_tokens": 250624649.0, "step": 1416 }, { "entropy": 0.8028407072027525, "epoch": 0.44077475113820513, "grad_norm": 0.12113169580698013, "learning_rate": 9.75445568417489e-05, "loss": 0.7819, "mean_token_accuracy": 0.812740029146274, "num_tokens": 252753433.0, "step": 1428 }, { "entropy": 0.811029980580012, "epoch": 0.4444787406435682, "grad_norm": 0.1194261834025383, "learning_rate": 9.74946575880537e-05, "loss": 0.7925, "mean_token_accuracy": 0.8101853169500828, "num_tokens": 254919606.0, "step": 1440 }, { "entropy": 0.8054500371217728, "epoch": 0.44818273014893123, "grad_norm": 0.12146873027086258, "learning_rate": 9.744426942213799e-05, "loss": 0.788, "mean_token_accuracy": 0.8125789103408655, "num_tokens": 257037052.0, "step": 1452 }, { "entropy": 0.7984961271286011, "epoch": 0.45188671965429433, "grad_norm": 0.12276361137628555, "learning_rate": 9.739339286269995e-05, "loss": 0.7787, "mean_token_accuracy": 0.8136009263495604, "num_tokens": 259200523.0, "step": 1464 }, { "entropy": 0.8060410469770432, "epoch": 0.4555907091596574, "grad_norm": 0.12075335532426834, "learning_rate": 9.734202843346522e-05, "loss": 0.7882, "mean_token_accuracy": 0.8113779984414577, "num_tokens": 261303525.0, "step": 1476 }, { "entropy": 0.8057870579262575, "epoch": 0.45929469866502043, "grad_norm": 0.12227274477481842, "learning_rate": 9.729017666318165e-05, "loss": 0.7868, "mean_token_accuracy": 0.8118944627543291, "num_tokens": 263437688.0, "step": 1488 }, { "entropy": 0.8005211018025875, "epoch": 0.46299868817038353, "grad_norm": 0.11424333602190018, "learning_rate": 9.723783808561378e-05, "loss": 0.7791, "mean_token_accuracy": 0.8138281057278315, "num_tokens": 265568368.0, "step": 1500 }, { "entropy": 0.8023173411687216, "epoch": 0.4667026776757466, "grad_norm": 0.13906393945217133, "learning_rate": 9.718501323953737e-05, "loss": 0.7853, "mean_token_accuracy": 0.811311274766922, "num_tokens": 267706317.0, "step": 1512 }, { "entropy": 0.8110432215034962, "epoch": 0.47040666718110963, "grad_norm": 0.11983723938465118, "learning_rate": 9.713170266873384e-05, "loss": 0.7893, "mean_token_accuracy": 0.8107657519479593, "num_tokens": 269835542.0, "step": 1524 }, { "entropy": 0.7883443137009939, "epoch": 0.47411065668647273, "grad_norm": 0.13033325970172882, "learning_rate": 9.70779069219847e-05, "loss": 0.7734, "mean_token_accuracy": 0.8156957700848579, "num_tokens": 271983602.0, "step": 1536 }, { "entropy": 0.7950549945235252, "epoch": 0.4778146461918358, "grad_norm": 0.10897476971149445, "learning_rate": 9.702362655306587e-05, "loss": 0.775, "mean_token_accuracy": 0.8149407207965851, "num_tokens": 274107343.0, "step": 1548 }, { "entropy": 0.7990003265440464, "epoch": 0.4815186356971989, "grad_norm": 0.11902206391096115, "learning_rate": 9.696886212074202e-05, "loss": 0.7808, "mean_token_accuracy": 0.8132348544895649, "num_tokens": 276222435.0, "step": 1560 }, { "entropy": 0.801575344055891, "epoch": 0.48522262520256193, "grad_norm": 0.11533886194229126, "learning_rate": 9.691361418876075e-05, "loss": 0.7795, "mean_token_accuracy": 0.8133863943318526, "num_tokens": 278355706.0, "step": 1572 }, { "entropy": 0.7944580602149168, "epoch": 0.488926614707925, "grad_norm": 0.12622949481010437, "learning_rate": 9.685788332584685e-05, "loss": 0.7752, "mean_token_accuracy": 0.814650778969129, "num_tokens": 280460375.0, "step": 1584 }, { "entropy": 0.8005058703323206, "epoch": 0.4926306042132881, "grad_norm": 0.12395934015512466, "learning_rate": 9.68016701056964e-05, "loss": 0.7828, "mean_token_accuracy": 0.8131459752718607, "num_tokens": 282590228.0, "step": 1596 }, { "entropy": 0.8098582600553831, "epoch": 0.49633459371865113, "grad_norm": 0.13313600420951843, "learning_rate": 9.674497510697097e-05, "loss": 0.7903, "mean_token_accuracy": 0.8115353050331274, "num_tokens": 284669419.0, "step": 1608 }, { "entropy": 0.805266530563434, "epoch": 0.5000385832240142, "grad_norm": 0.11492042243480682, "learning_rate": 9.668779891329147e-05, "loss": 0.7851, "mean_token_accuracy": 0.8125738625725111, "num_tokens": 286824939.0, "step": 1620 }, { "entropy": 0.7950548666218916, "epoch": 0.5037425727293773, "grad_norm": 0.12213092297315598, "learning_rate": 9.663014211323233e-05, "loss": 0.7738, "mean_token_accuracy": 0.8150150341292223, "num_tokens": 288958154.0, "step": 1632 }, { "entropy": 0.7952163182199001, "epoch": 0.5074465622347404, "grad_norm": 0.11614470928907394, "learning_rate": 9.657200530031533e-05, "loss": 0.7776, "mean_token_accuracy": 0.8137414579590162, "num_tokens": 291092060.0, "step": 1644 }, { "entropy": 0.8064338949819406, "epoch": 0.5111505517401034, "grad_norm": 0.1288743019104004, "learning_rate": 9.651338907300354e-05, "loss": 0.7836, "mean_token_accuracy": 0.8120651505887508, "num_tokens": 293214294.0, "step": 1656 }, { "entropy": 0.8109964442749819, "epoch": 0.5148545412454665, "grad_norm": 0.12403728067874908, "learning_rate": 9.645429403469512e-05, "loss": 0.794, "mean_token_accuracy": 0.8106129181881746, "num_tokens": 295328150.0, "step": 1668 }, { "entropy": 0.8038183848063151, "epoch": 0.5185585307508296, "grad_norm": 0.12052307277917862, "learning_rate": 9.639472079371717e-05, "loss": 0.7841, "mean_token_accuracy": 0.8124721460044384, "num_tokens": 297455756.0, "step": 1680 }, { "entropy": 0.7984983747204145, "epoch": 0.5222625202561926, "grad_norm": 0.12845173478126526, "learning_rate": 9.63346699633194e-05, "loss": 0.7794, "mean_token_accuracy": 0.8130777689317862, "num_tokens": 299602538.0, "step": 1692 }, { "entropy": 0.7976608164608479, "epoch": 0.5259665097615557, "grad_norm": 0.1210513561964035, "learning_rate": 9.627414216166787e-05, "loss": 0.7789, "mean_token_accuracy": 0.8136931844055653, "num_tokens": 301711090.0, "step": 1704 }, { "entropy": 0.787033441166083, "epoch": 0.5296704992669188, "grad_norm": 0.11822398006916046, "learning_rate": 9.621313801183858e-05, "loss": 0.7655, "mean_token_accuracy": 0.8165569653113683, "num_tokens": 303823411.0, "step": 1716 }, { "entropy": 0.7807075269520283, "epoch": 0.5333744887722818, "grad_norm": 0.1327759176492691, "learning_rate": 9.61516581418111e-05, "loss": 0.7621, "mean_token_accuracy": 0.8171720070143541, "num_tokens": 305981408.0, "step": 1728 }, { "entropy": 0.7996973978976408, "epoch": 0.5370784782776449, "grad_norm": 0.12808099389076233, "learning_rate": 9.608970318446208e-05, "loss": 0.7815, "mean_token_accuracy": 0.8129920872549216, "num_tokens": 308103143.0, "step": 1740 }, { "entropy": 0.7994385659694672, "epoch": 0.540782467783008, "grad_norm": 0.12494061887264252, "learning_rate": 9.602727377755875e-05, "loss": 0.7776, "mean_token_accuracy": 0.8130986665685972, "num_tokens": 310257572.0, "step": 1752 }, { "entropy": 0.7863332827885946, "epoch": 0.544486457288371, "grad_norm": 0.11875750869512558, "learning_rate": 9.596437056375231e-05, "loss": 0.7668, "mean_token_accuracy": 0.8158110665778319, "num_tokens": 312373153.0, "step": 1764 }, { "entropy": 0.7789370119571686, "epoch": 0.5481904467937341, "grad_norm": 0.12376753985881805, "learning_rate": 9.590099419057141e-05, "loss": 0.7592, "mean_token_accuracy": 0.8181358501315117, "num_tokens": 314480636.0, "step": 1776 }, { "entropy": 0.7844561214248339, "epoch": 0.5518944362990972, "grad_norm": 0.13164328038692474, "learning_rate": 9.583714531041538e-05, "loss": 0.7677, "mean_token_accuracy": 0.8156558784345785, "num_tokens": 316566703.0, "step": 1788 }, { "entropy": 0.7855853562553724, "epoch": 0.5555984258044602, "grad_norm": 0.13665804266929626, "learning_rate": 9.577282458054755e-05, "loss": 0.7639, "mean_token_accuracy": 0.8170899252096812, "num_tokens": 318671313.0, "step": 1800 }, { "entropy": 0.7836192175745964, "epoch": 0.5593024153098233, "grad_norm": 0.12808938324451447, "learning_rate": 9.570803266308854e-05, "loss": 0.7644, "mean_token_accuracy": 0.8164806043108305, "num_tokens": 320785587.0, "step": 1812 }, { "entropy": 0.7717621214687824, "epoch": 0.5630064048151864, "grad_norm": 0.11582314968109131, "learning_rate": 9.564277022500936e-05, "loss": 0.753, "mean_token_accuracy": 0.8190883584320545, "num_tokens": 322901535.0, "step": 1824 }, { "entropy": 0.7969209514558315, "epoch": 0.5667103943205494, "grad_norm": 0.1297702044248581, "learning_rate": 9.557703793812458e-05, "loss": 0.7776, "mean_token_accuracy": 0.8134247288107872, "num_tokens": 325009488.0, "step": 1836 }, { "entropy": 0.7894351184368134, "epoch": 0.5704143838259125, "grad_norm": 0.13040970265865326, "learning_rate": 9.551083647908546e-05, "loss": 0.7707, "mean_token_accuracy": 0.8147525365153948, "num_tokens": 327151753.0, "step": 1848 }, { "entropy": 0.7986459036668142, "epoch": 0.5741183733312756, "grad_norm": 0.12837360799312592, "learning_rate": 9.544416652937287e-05, "loss": 0.7798, "mean_token_accuracy": 0.8129371106624603, "num_tokens": 329288710.0, "step": 1860 }, { "entropy": 0.782096286614736, "epoch": 0.5778223628366387, "grad_norm": 0.1294112354516983, "learning_rate": 9.53770287752904e-05, "loss": 0.7621, "mean_token_accuracy": 0.8168822092314562, "num_tokens": 331435486.0, "step": 1872 }, { "entropy": 0.802633331467708, "epoch": 0.5815263523420017, "grad_norm": 0.1259497106075287, "learning_rate": 9.53094239079572e-05, "loss": 0.7828, "mean_token_accuracy": 0.8125941964487234, "num_tokens": 333550517.0, "step": 1884 }, { "entropy": 0.7907603432734808, "epoch": 0.5852303418473648, "grad_norm": 0.1232752725481987, "learning_rate": 9.524135262330098e-05, "loss": 0.7692, "mean_token_accuracy": 0.8152645404140154, "num_tokens": 335680200.0, "step": 1896 }, { "entropy": 0.785350481669108, "epoch": 0.5889343313527279, "grad_norm": 0.12669149041175842, "learning_rate": 9.517281562205067e-05, "loss": 0.769, "mean_token_accuracy": 0.8154158430794874, "num_tokens": 337807097.0, "step": 1908 }, { "entropy": 0.7851421398421129, "epoch": 0.5926383208580909, "grad_norm": 0.1276937574148178, "learning_rate": 9.510381360972938e-05, "loss": 0.7641, "mean_token_accuracy": 0.8159417261679968, "num_tokens": 339954087.0, "step": 1920 }, { "entropy": 0.7850158450504144, "epoch": 0.596342310363454, "grad_norm": 0.132017120718956, "learning_rate": 9.503434729664705e-05, "loss": 0.765, "mean_token_accuracy": 0.8161118167142073, "num_tokens": 342092293.0, "step": 1932 }, { "entropy": 0.7878543138504028, "epoch": 0.6000462998688171, "grad_norm": 0.12451501935720444, "learning_rate": 9.49644173978931e-05, "loss": 0.7704, "mean_token_accuracy": 0.8147625786562761, "num_tokens": 344240873.0, "step": 1944 }, { "entropy": 0.7971215297778448, "epoch": 0.6037502893741801, "grad_norm": 0.12397521734237671, "learning_rate": 9.489402463332923e-05, "loss": 0.7757, "mean_token_accuracy": 0.814307109763225, "num_tokens": 346363936.0, "step": 1956 }, { "entropy": 0.7801007392505804, "epoch": 0.6074542788795432, "grad_norm": 0.12504300475120544, "learning_rate": 9.482316972758181e-05, "loss": 0.7628, "mean_token_accuracy": 0.8172982657949129, "num_tokens": 348481250.0, "step": 1968 }, { "entropy": 0.7930209897458553, "epoch": 0.6111582683849063, "grad_norm": 0.1284765601158142, "learning_rate": 9.475185341003455e-05, "loss": 0.7747, "mean_token_accuracy": 0.8143445054690043, "num_tokens": 350608362.0, "step": 1980 }, { "entropy": 0.7802144425610701, "epoch": 0.6148622578902693, "grad_norm": 0.12513010203838348, "learning_rate": 9.468007641482094e-05, "loss": 0.7647, "mean_token_accuracy": 0.8167619270582994, "num_tokens": 352709516.0, "step": 1992 }, { "entropy": 0.7944958060979843, "epoch": 0.6185662473956324, "grad_norm": 0.13269025087356567, "learning_rate": 9.460783948081675e-05, "loss": 0.7739, "mean_token_accuracy": 0.8146844121317068, "num_tokens": 354822759.0, "step": 2004 }, { "entropy": 0.7854583573838075, "epoch": 0.6222702369009955, "grad_norm": 0.1287117898464203, "learning_rate": 9.453514335163231e-05, "loss": 0.7637, "mean_token_accuracy": 0.8166800700128078, "num_tokens": 356911949.0, "step": 2016 }, { "entropy": 0.7777937439580759, "epoch": 0.6259742264063585, "grad_norm": 0.12762552499771118, "learning_rate": 9.446198877560497e-05, "loss": 0.7571, "mean_token_accuracy": 0.8182835541665554, "num_tokens": 359049122.0, "step": 2028 }, { "entropy": 0.7750151492655277, "epoch": 0.6296782159117216, "grad_norm": 0.1277306228876114, "learning_rate": 9.438837650579137e-05, "loss": 0.7553, "mean_token_accuracy": 0.8175727687776089, "num_tokens": 361210090.0, "step": 2040 }, { "entropy": 0.7802457685271899, "epoch": 0.6333822054170847, "grad_norm": 0.12481208890676498, "learning_rate": 9.431430729995963e-05, "loss": 0.7598, "mean_token_accuracy": 0.8178850611050924, "num_tokens": 363337396.0, "step": 2052 }, { "entropy": 0.7862026058137417, "epoch": 0.6370861949224477, "grad_norm": 0.13092230260372162, "learning_rate": 9.42397819205816e-05, "loss": 0.7688, "mean_token_accuracy": 0.8156153832872709, "num_tokens": 365477630.0, "step": 2064 }, { "entropy": 0.7690726555883884, "epoch": 0.6407901844278108, "grad_norm": 0.11987863481044769, "learning_rate": 9.416480113482504e-05, "loss": 0.7521, "mean_token_accuracy": 0.8195419311523438, "num_tokens": 367643528.0, "step": 2076 }, { "entropy": 0.7762465241054693, "epoch": 0.6444941739331739, "grad_norm": 0.13100093603134155, "learning_rate": 9.408936571454566e-05, "loss": 0.7563, "mean_token_accuracy": 0.8186973209182421, "num_tokens": 369782358.0, "step": 2088 }, { "entropy": 0.7813232329984506, "epoch": 0.648198163438537, "grad_norm": 0.13405582308769226, "learning_rate": 9.401347643627915e-05, "loss": 0.7632, "mean_token_accuracy": 0.8167420464257399, "num_tokens": 371928210.0, "step": 2100 }, { "entropy": 0.7875241699318091, "epoch": 0.6519021529439, "grad_norm": 0.12580129504203796, "learning_rate": 9.393713408123332e-05, "loss": 0.7699, "mean_token_accuracy": 0.8153914275268713, "num_tokens": 374058726.0, "step": 2112 }, { "entropy": 0.7834992570181688, "epoch": 0.6556061424492631, "grad_norm": 0.1281077116727829, "learning_rate": 9.38603394352799e-05, "loss": 0.7638, "mean_token_accuracy": 0.8171246275305748, "num_tokens": 376203701.0, "step": 2124 }, { "entropy": 0.7805231350163618, "epoch": 0.6593101319546262, "grad_norm": 0.14219819009304047, "learning_rate": 9.378309328894662e-05, "loss": 0.7616, "mean_token_accuracy": 0.8166681937873363, "num_tokens": 378335272.0, "step": 2136 }, { "entropy": 0.7874154051144918, "epoch": 0.6630141214599892, "grad_norm": 0.1304338425397873, "learning_rate": 9.370539643740883e-05, "loss": 0.7637, "mean_token_accuracy": 0.8164051709075769, "num_tokens": 380467266.0, "step": 2148 }, { "entropy": 0.7733189848562082, "epoch": 0.6667181109653523, "grad_norm": 0.13076983392238617, "learning_rate": 9.36272496804816e-05, "loss": 0.7571, "mean_token_accuracy": 0.8175699549416701, "num_tokens": 382603869.0, "step": 2160 }, { "entropy": 0.776911374181509, "epoch": 0.6704221004707154, "grad_norm": 0.13455283641815186, "learning_rate": 9.354865382261128e-05, "loss": 0.7563, "mean_token_accuracy": 0.8178863686819872, "num_tokens": 384767350.0, "step": 2172 }, { "entropy": 0.7755288705229759, "epoch": 0.6741260899760784, "grad_norm": 0.1210244670510292, "learning_rate": 9.346960967286728e-05, "loss": 0.7556, "mean_token_accuracy": 0.8183015001316866, "num_tokens": 386934062.0, "step": 2184 }, { "entropy": 0.7806575360397497, "epoch": 0.6778300794814415, "grad_norm": 0.1282687783241272, "learning_rate": 9.339011804493378e-05, "loss": 0.7604, "mean_token_accuracy": 0.8167587071657181, "num_tokens": 389047648.0, "step": 2196 }, { "entropy": 0.7758863245447477, "epoch": 0.6815340689868046, "grad_norm": 0.13144470751285553, "learning_rate": 9.331017975710132e-05, "loss": 0.7582, "mean_token_accuracy": 0.8180744908750057, "num_tokens": 391155544.0, "step": 2208 }, { "entropy": 0.7667204054693381, "epoch": 0.6852380584921676, "grad_norm": 0.12816905975341797, "learning_rate": 9.322979563225833e-05, "loss": 0.7464, "mean_token_accuracy": 0.8206619794170061, "num_tokens": 393293375.0, "step": 2220 }, { "entropy": 0.7751789751152197, "epoch": 0.6889420479975307, "grad_norm": 0.14497670531272888, "learning_rate": 9.314896649788277e-05, "loss": 0.7561, "mean_token_accuracy": 0.8184558848539988, "num_tokens": 395411494.0, "step": 2232 }, { "entropy": 0.7844450324773788, "epoch": 0.6926460375028938, "grad_norm": 0.13543701171875, "learning_rate": 9.306769318603348e-05, "loss": 0.765, "mean_token_accuracy": 0.8167361902693907, "num_tokens": 397518224.0, "step": 2244 }, { "entropy": 0.7686401257912318, "epoch": 0.6963500270082568, "grad_norm": 0.13837109506130219, "learning_rate": 9.298597653334178e-05, "loss": 0.7487, "mean_token_accuracy": 0.8197805831829706, "num_tokens": 399648867.0, "step": 2256 }, { "entropy": 0.7626761943101883, "epoch": 0.7000540165136199, "grad_norm": 0.1293450891971588, "learning_rate": 9.290381738100265e-05, "loss": 0.7441, "mean_token_accuracy": 0.8214877719680468, "num_tokens": 401743610.0, "step": 2268 }, { "entropy": 0.7687733148535093, "epoch": 0.703758006018983, "grad_norm": 0.1483583152294159, "learning_rate": 9.282121657476627e-05, "loss": 0.7506, "mean_token_accuracy": 0.8195670247077942, "num_tokens": 403850723.0, "step": 2280 }, { "entropy": 0.7698837158580621, "epoch": 0.707461995524346, "grad_norm": 0.13035227358341217, "learning_rate": 9.273817496492917e-05, "loss": 0.7503, "mean_token_accuracy": 0.8193908805648485, "num_tokens": 405973443.0, "step": 2292 }, { "entropy": 0.770644503335158, "epoch": 0.7111659850297091, "grad_norm": 0.13103176653385162, "learning_rate": 9.265469340632557e-05, "loss": 0.7538, "mean_token_accuracy": 0.8188917512694994, "num_tokens": 408105892.0, "step": 2304 }, { "entropy": 0.7735109639664491, "epoch": 0.7148699745350722, "grad_norm": 0.1398514360189438, "learning_rate": 9.257077275831853e-05, "loss": 0.7518, "mean_token_accuracy": 0.819186095148325, "num_tokens": 410296037.0, "step": 2316 }, { "entropy": 0.771765373647213, "epoch": 0.7185739640404352, "grad_norm": 0.14083071053028107, "learning_rate": 9.248641388479111e-05, "loss": 0.7526, "mean_token_accuracy": 0.8195289969444275, "num_tokens": 412380775.0, "step": 2328 }, { "entropy": 0.7654011559983095, "epoch": 0.7222779535457983, "grad_norm": 0.1325056552886963, "learning_rate": 9.240161765413748e-05, "loss": 0.7457, "mean_token_accuracy": 0.8208736081918081, "num_tokens": 414535472.0, "step": 2340 }, { "entropy": 0.7590547104676565, "epoch": 0.7259819430511614, "grad_norm": 0.1285925805568695, "learning_rate": 9.231638493925402e-05, "loss": 0.7407, "mean_token_accuracy": 0.8210619514187177, "num_tokens": 416686381.0, "step": 2352 }, { "entropy": 0.7812760807573795, "epoch": 0.7296859325565245, "grad_norm": 0.12959104776382446, "learning_rate": 9.223071661753024e-05, "loss": 0.7615, "mean_token_accuracy": 0.8172919216255347, "num_tokens": 418808712.0, "step": 2364 }, { "entropy": 0.7873292689522108, "epoch": 0.7333899220618875, "grad_norm": 0.1371845006942749, "learning_rate": 9.214461357083985e-05, "loss": 0.7677, "mean_token_accuracy": 0.8155596914390723, "num_tokens": 420927036.0, "step": 2376 }, { "entropy": 0.7580399426321188, "epoch": 0.7370939115672506, "grad_norm": 0.14509984850883484, "learning_rate": 9.205807668553164e-05, "loss": 0.7373, "mean_token_accuracy": 0.8222843247155348, "num_tokens": 423036030.0, "step": 2388 }, { "entropy": 0.7687507805724939, "epoch": 0.7407979010726137, "grad_norm": 0.15484097599983215, "learning_rate": 9.197110685242034e-05, "loss": 0.747, "mean_token_accuracy": 0.8196417490641276, "num_tokens": 425187345.0, "step": 2400 }, { "entropy": 0.764864676942428, "epoch": 0.7445018905779767, "grad_norm": 0.12323542684316635, "learning_rate": 9.188370496677745e-05, "loss": 0.7452, "mean_token_accuracy": 0.8201479675869147, "num_tokens": 427331828.0, "step": 2412 }, { "entropy": 0.7658702706297239, "epoch": 0.7482058800833398, "grad_norm": 0.12649372220039368, "learning_rate": 9.179587192832209e-05, "loss": 0.7474, "mean_token_accuracy": 0.8195582069456577, "num_tokens": 429448072.0, "step": 2424 }, { "entropy": 0.7575242506961027, "epoch": 0.7519098695887029, "grad_norm": 0.13158433139324188, "learning_rate": 9.170760864121162e-05, "loss": 0.7366, "mean_token_accuracy": 0.8224081955850124, "num_tokens": 431533949.0, "step": 2436 }, { "entropy": 0.7688459927837054, "epoch": 0.7556138590940659, "grad_norm": 0.12840636074543, "learning_rate": 9.161891601403245e-05, "loss": 0.7487, "mean_token_accuracy": 0.8195722003777822, "num_tokens": 433658430.0, "step": 2448 }, { "entropy": 0.7660383395850658, "epoch": 0.759317848599429, "grad_norm": 0.12874674797058105, "learning_rate": 9.152979495979063e-05, "loss": 0.7493, "mean_token_accuracy": 0.8193290382623672, "num_tokens": 435741465.0, "step": 2460 }, { "entropy": 0.7668347917497158, "epoch": 0.7630218381047921, "grad_norm": 0.12673811614513397, "learning_rate": 9.144024639590245e-05, "loss": 0.7473, "mean_token_accuracy": 0.8204079742232958, "num_tokens": 437908304.0, "step": 2472 }, { "entropy": 0.7687832986315092, "epoch": 0.7667258276101551, "grad_norm": 0.12845873832702637, "learning_rate": 9.135027124418499e-05, "loss": 0.7536, "mean_token_accuracy": 0.8192412157853445, "num_tokens": 440028485.0, "step": 2484 }, { "entropy": 0.7862997514506181, "epoch": 0.7704298171155182, "grad_norm": 0.13068512082099915, "learning_rate": 9.125987043084665e-05, "loss": 0.7642, "mean_token_accuracy": 0.8162732509275278, "num_tokens": 442179533.0, "step": 2496 }, { "entropy": 0.7685239054262638, "epoch": 0.7741338066208813, "grad_norm": 0.13230957090854645, "learning_rate": 9.116904488647764e-05, "loss": 0.75, "mean_token_accuracy": 0.8193069696426392, "num_tokens": 444264359.0, "step": 2508 }, { "entropy": 0.76015779748559, "epoch": 0.7778377961262443, "grad_norm": 0.13136623799800873, "learning_rate": 9.107779554604035e-05, "loss": 0.7418, "mean_token_accuracy": 0.8217526078224182, "num_tokens": 446400739.0, "step": 2520 }, { "entropy": 0.7768492065370083, "epoch": 0.7815417856316074, "grad_norm": 0.13927006721496582, "learning_rate": 9.098612334885972e-05, "loss": 0.7581, "mean_token_accuracy": 0.8174782631297907, "num_tokens": 448488906.0, "step": 2532 }, { "entropy": 0.7547270730137825, "epoch": 0.7852457751369705, "grad_norm": 0.14124959707260132, "learning_rate": 9.089402923861366e-05, "loss": 0.7356, "mean_token_accuracy": 0.8228224813938141, "num_tokens": 450609731.0, "step": 2544 }, { "entropy": 0.7677095470329126, "epoch": 0.7889497646423335, "grad_norm": 0.14042387902736664, "learning_rate": 9.080151416332319e-05, "loss": 0.747, "mean_token_accuracy": 0.8197631984949112, "num_tokens": 452710661.0, "step": 2556 }, { "entropy": 0.768154501914978, "epoch": 0.7926537541476966, "grad_norm": 0.12782582640647888, "learning_rate": 9.070857907534287e-05, "loss": 0.7473, "mean_token_accuracy": 0.8194541210929552, "num_tokens": 454884003.0, "step": 2568 }, { "entropy": 0.7609333768486977, "epoch": 0.7963577436530597, "grad_norm": 0.141867533326149, "learning_rate": 9.061522493135079e-05, "loss": 0.7407, "mean_token_accuracy": 0.82136203845342, "num_tokens": 457030737.0, "step": 2580 }, { "entropy": 0.7599871419370174, "epoch": 0.8000617331584228, "grad_norm": 0.13808685541152954, "learning_rate": 9.052145269233887e-05, "loss": 0.7402, "mean_token_accuracy": 0.8215218285719553, "num_tokens": 459177274.0, "step": 2592 }, { "entropy": 0.7648821386198202, "epoch": 0.8037657226637858, "grad_norm": 0.15076588094234467, "learning_rate": 9.042726332360292e-05, "loss": 0.7454, "mean_token_accuracy": 0.8200170012811819, "num_tokens": 461306463.0, "step": 2604 }, { "entropy": 0.7687475743393103, "epoch": 0.8074697121691489, "grad_norm": 0.12965603172779083, "learning_rate": 9.033265779473268e-05, "loss": 0.7492, "mean_token_accuracy": 0.819452028721571, "num_tokens": 463455408.0, "step": 2616 }, { "entropy": 0.7646373882889748, "epoch": 0.811173701674512, "grad_norm": 0.1345282644033432, "learning_rate": 9.023763707960188e-05, "loss": 0.7434, "mean_token_accuracy": 0.8209528836111227, "num_tokens": 465550715.0, "step": 2628 }, { "entropy": 0.76267567401131, "epoch": 0.814877691179875, "grad_norm": 0.1315702497959137, "learning_rate": 9.01422021563582e-05, "loss": 0.7427, "mean_token_accuracy": 0.820941454420487, "num_tokens": 467676160.0, "step": 2640 }, { "entropy": 0.7417604538301626, "epoch": 0.8185816806852381, "grad_norm": 0.12949636578559875, "learning_rate": 9.00463540074132e-05, "loss": 0.7243, "mean_token_accuracy": 0.8254370614886284, "num_tokens": 469811393.0, "step": 2652 }, { "entropy": 0.7587257462243239, "epoch": 0.8222856701906012, "grad_norm": 0.13278073072433472, "learning_rate": 8.995009361943218e-05, "loss": 0.7384, "mean_token_accuracy": 0.8231843349834284, "num_tokens": 471930105.0, "step": 2664 }, { "entropy": 0.7431893559793631, "epoch": 0.8259896596959642, "grad_norm": 0.14032749831676483, "learning_rate": 8.985342198332407e-05, "loss": 0.7247, "mean_token_accuracy": 0.8248631035288175, "num_tokens": 474075700.0, "step": 2676 }, { "entropy": 0.7719796399275461, "epoch": 0.8296936492013273, "grad_norm": 0.14027121663093567, "learning_rate": 8.975634009423122e-05, "loss": 0.752, "mean_token_accuracy": 0.8185311630368233, "num_tokens": 476204230.0, "step": 2688 }, { "entropy": 0.7616085906823477, "epoch": 0.8333976387066904, "grad_norm": 0.1393400877714157, "learning_rate": 8.965884895151908e-05, "loss": 0.7443, "mean_token_accuracy": 0.8211442058285078, "num_tokens": 478348559.0, "step": 2700 }, { "entropy": 0.7600347759823004, "epoch": 0.8371016282120534, "grad_norm": 0.1410454660654068, "learning_rate": 8.956094955876607e-05, "loss": 0.7373, "mean_token_accuracy": 0.8219936129947504, "num_tokens": 480446587.0, "step": 2712 }, { "entropy": 0.7613047709067663, "epoch": 0.8408056177174165, "grad_norm": 0.140048548579216, "learning_rate": 8.946264292375306e-05, "loss": 0.7426, "mean_token_accuracy": 0.8213416139284769, "num_tokens": 482581346.0, "step": 2724 }, { "entropy": 0.7594624037543932, "epoch": 0.8445096072227796, "grad_norm": 0.1277952641248703, "learning_rate": 8.936393005845316e-05, "loss": 0.739, "mean_token_accuracy": 0.8218294816712538, "num_tokens": 484695228.0, "step": 2736 }, { "entropy": 0.7544240554173788, "epoch": 0.8482135967281426, "grad_norm": 0.154397651553154, "learning_rate": 8.926481197902122e-05, "loss": 0.7341, "mean_token_accuracy": 0.8225894048810005, "num_tokens": 486862761.0, "step": 2748 }, { "entropy": 0.7661119649807612, "epoch": 0.8519175862335057, "grad_norm": 0.14195363223552704, "learning_rate": 8.916528970578333e-05, "loss": 0.746, "mean_token_accuracy": 0.8203906156122684, "num_tokens": 489014996.0, "step": 2760 }, { "entropy": 0.7686087116599083, "epoch": 0.8556215757388688, "grad_norm": 0.13611237704753876, "learning_rate": 8.906536426322646e-05, "loss": 0.7473, "mean_token_accuracy": 0.8198287424941858, "num_tokens": 491157432.0, "step": 2772 }, { "entropy": 0.7590742185711861, "epoch": 0.8593255652442318, "grad_norm": 0.12830516695976257, "learning_rate": 8.896503667998777e-05, "loss": 0.7414, "mean_token_accuracy": 0.8214252628386021, "num_tokens": 493327333.0, "step": 2784 }, { "entropy": 0.758670142541329, "epoch": 0.8630295547495949, "grad_norm": 0.13577738404273987, "learning_rate": 8.886430798884406e-05, "loss": 0.74, "mean_token_accuracy": 0.8217101270953814, "num_tokens": 495441259.0, "step": 2796 }, { "entropy": 0.7604162829617659, "epoch": 0.866733544254958, "grad_norm": 0.1389269232749939, "learning_rate": 8.876317922670119e-05, "loss": 0.7385, "mean_token_accuracy": 0.8216200309495131, "num_tokens": 497558061.0, "step": 2808 }, { "entropy": 0.7654056176543236, "epoch": 0.870437533760321, "grad_norm": 0.1383410394191742, "learning_rate": 8.866165143458334e-05, "loss": 0.7493, "mean_token_accuracy": 0.819673765450716, "num_tokens": 499689660.0, "step": 2820 }, { "entropy": 0.7578516465922197, "epoch": 0.8741415232656841, "grad_norm": 0.144080251455307, "learning_rate": 8.855972565762236e-05, "loss": 0.7373, "mean_token_accuracy": 0.822392825037241, "num_tokens": 501811384.0, "step": 2832 }, { "entropy": 0.7520838553706805, "epoch": 0.8778455127710472, "grad_norm": 0.1445675939321518, "learning_rate": 8.845740294504691e-05, "loss": 0.7321, "mean_token_accuracy": 0.8238844747344652, "num_tokens": 503931258.0, "step": 2844 }, { "entropy": 0.7716477451225122, "epoch": 0.8815495022764103, "grad_norm": 0.13848181068897247, "learning_rate": 8.835468435017183e-05, "loss": 0.75, "mean_token_accuracy": 0.8195607153077921, "num_tokens": 506070704.0, "step": 2856 }, { "entropy": 0.7465110706786314, "epoch": 0.8852534917817733, "grad_norm": 0.1347406953573227, "learning_rate": 8.825157093038708e-05, "loss": 0.7295, "mean_token_accuracy": 0.8236632930735747, "num_tokens": 508203294.0, "step": 2868 }, { "entropy": 0.7651778521637121, "epoch": 0.8889574812871364, "grad_norm": 0.13069529831409454, "learning_rate": 8.814806374714702e-05, "loss": 0.7449, "mean_token_accuracy": 0.8204664116104444, "num_tokens": 510395477.0, "step": 2880 }, { "entropy": 0.7543973810970783, "epoch": 0.8926614707924995, "grad_norm": 0.12907341122627258, "learning_rate": 8.804416386595943e-05, "loss": 0.733, "mean_token_accuracy": 0.8230690496663252, "num_tokens": 512509055.0, "step": 2892 }, { "entropy": 0.7565698722998301, "epoch": 0.8963654602978625, "grad_norm": 0.14323952794075012, "learning_rate": 8.793987235637453e-05, "loss": 0.7376, "mean_token_accuracy": 0.8218328369160494, "num_tokens": 514596287.0, "step": 2904 }, { "entropy": 0.7765912314256033, "epoch": 0.9000694498032256, "grad_norm": 0.13393579423427582, "learning_rate": 8.783519029197398e-05, "loss": 0.7567, "mean_token_accuracy": 0.8180692394574484, "num_tokens": 516697723.0, "step": 2916 }, { "entropy": 0.748100645840168, "epoch": 0.9037734393085887, "grad_norm": 0.1474265605211258, "learning_rate": 8.773011875035983e-05, "loss": 0.7294, "mean_token_accuracy": 0.8240405097603798, "num_tokens": 518833866.0, "step": 2928 }, { "entropy": 0.7694602260986964, "epoch": 0.9074774288139517, "grad_norm": 0.14337094128131866, "learning_rate": 8.762465881314346e-05, "loss": 0.7481, "mean_token_accuracy": 0.8199893161654472, "num_tokens": 520946745.0, "step": 2940 }, { "entropy": 0.7642699517309666, "epoch": 0.9111814183193148, "grad_norm": 0.1440202295780182, "learning_rate": 8.751881156593434e-05, "loss": 0.7471, "mean_token_accuracy": 0.8201769590377808, "num_tokens": 523082474.0, "step": 2952 }, { "entropy": 0.7573205133279165, "epoch": 0.9148854078246779, "grad_norm": 0.14229246973991394, "learning_rate": 8.7412578098329e-05, "loss": 0.7388, "mean_token_accuracy": 0.8218303211033344, "num_tokens": 525230014.0, "step": 2964 }, { "entropy": 0.774663812170426, "epoch": 0.9185893973300409, "grad_norm": 0.1373312920331955, "learning_rate": 8.730595950389968e-05, "loss": 0.7537, "mean_token_accuracy": 0.8188973863919576, "num_tokens": 527358964.0, "step": 2976 }, { "entropy": 0.7440223582088947, "epoch": 0.922293386835404, "grad_norm": 0.13869309425354004, "learning_rate": 8.71989568801832e-05, "loss": 0.7243, "mean_token_accuracy": 0.8259313181042671, "num_tokens": 529446921.0, "step": 2988 }, { "entropy": 0.7558054054776827, "epoch": 0.9259973763407671, "grad_norm": 0.14386014640331268, "learning_rate": 8.709157132866954e-05, "loss": 0.7371, "mean_token_accuracy": 0.8218563583989938, "num_tokens": 531572822.0, "step": 3000 }, { "entropy": 0.7559870270391306, "epoch": 0.9297013658461301, "grad_norm": 0.14299409091472626, "learning_rate": 8.698380395479058e-05, "loss": 0.7346, "mean_token_accuracy": 0.8229275730748972, "num_tokens": 533671253.0, "step": 3012 }, { "entropy": 0.7501622214913368, "epoch": 0.9334053553514932, "grad_norm": 0.1446056067943573, "learning_rate": 8.68756558679087e-05, "loss": 0.7307, "mean_token_accuracy": 0.8238628643254439, "num_tokens": 535799339.0, "step": 3024 }, { "entropy": 0.7404041352371374, "epoch": 0.9371093448568563, "grad_norm": 0.14985917508602142, "learning_rate": 8.676712818130534e-05, "loss": 0.7227, "mean_token_accuracy": 0.8250208596388499, "num_tokens": 537936260.0, "step": 3036 }, { "entropy": 0.7513143469889959, "epoch": 0.9408133343622193, "grad_norm": 0.13818083703517914, "learning_rate": 8.665822201216958e-05, "loss": 0.7306, "mean_token_accuracy": 0.8236184107760588, "num_tokens": 540092247.0, "step": 3048 }, { "entropy": 0.7599398357172807, "epoch": 0.9445173238675824, "grad_norm": 0.14524659514427185, "learning_rate": 8.654893848158658e-05, "loss": 0.7398, "mean_token_accuracy": 0.8216308690607548, "num_tokens": 542253810.0, "step": 3060 }, { "entropy": 0.7556075366834799, "epoch": 0.9482213133729455, "grad_norm": 0.14592809975147247, "learning_rate": 8.643927871452611e-05, "loss": 0.7387, "mean_token_accuracy": 0.8219700244565805, "num_tokens": 544371064.0, "step": 3072 }, { "entropy": 0.7573518306016922, "epoch": 0.9519253028783086, "grad_norm": 0.136802539229393, "learning_rate": 8.632924383983096e-05, "loss": 0.7383, "mean_token_accuracy": 0.8221282474696636, "num_tokens": 546525688.0, "step": 3084 }, { "entropy": 0.760393563657999, "epoch": 0.9556292923836716, "grad_norm": 0.14470767974853516, "learning_rate": 8.621883499020523e-05, "loss": 0.7405, "mean_token_accuracy": 0.8215649748841921, "num_tokens": 548650119.0, "step": 3096 }, { "entropy": 0.7766042277216911, "epoch": 0.9593332818890347, "grad_norm": 0.15178759396076202, "learning_rate": 8.610805330220275e-05, "loss": 0.755, "mean_token_accuracy": 0.818018895884355, "num_tokens": 550763680.0, "step": 3108 }, { "entropy": 0.7451968391736349, "epoch": 0.9630372713943978, "grad_norm": 0.16293193399906158, "learning_rate": 8.599689991621543e-05, "loss": 0.7257, "mean_token_accuracy": 0.8251891148587068, "num_tokens": 552911674.0, "step": 3120 }, { "entropy": 0.7404080505172411, "epoch": 0.9667412608997608, "grad_norm": 0.14965808391571045, "learning_rate": 8.588537597646139e-05, "loss": 0.7215, "mean_token_accuracy": 0.8259551251928011, "num_tokens": 555010287.0, "step": 3132 }, { "entropy": 0.7473993599414825, "epoch": 0.9704452504051239, "grad_norm": 0.13951192796230316, "learning_rate": 8.577348263097324e-05, "loss": 0.7281, "mean_token_accuracy": 0.8238498754799366, "num_tokens": 557140449.0, "step": 3144 }, { "entropy": 0.749595433473587, "epoch": 0.974149239910487, "grad_norm": 0.14272676408290863, "learning_rate": 8.566122103158636e-05, "loss": 0.7301, "mean_token_accuracy": 0.8237165659666061, "num_tokens": 559258810.0, "step": 3156 }, { "entropy": 0.7361926498512427, "epoch": 0.97785322941585, "grad_norm": 0.14749550819396973, "learning_rate": 8.554859233392682e-05, "loss": 0.7167, "mean_token_accuracy": 0.8265771567821503, "num_tokens": 561389026.0, "step": 3168 }, { "entropy": 0.7617857754230499, "epoch": 0.9815572189212131, "grad_norm": 0.14244970679283142, "learning_rate": 8.543559769739974e-05, "loss": 0.7436, "mean_token_accuracy": 0.8205315048495928, "num_tokens": 563506691.0, "step": 3180 }, { "entropy": 0.7460702173411846, "epoch": 0.9852612084265762, "grad_norm": 0.15570423007011414, "learning_rate": 8.532223828517716e-05, "loss": 0.7265, "mean_token_accuracy": 0.8244366881748041, "num_tokens": 565591234.0, "step": 3192 }, { "entropy": 0.7432848749061426, "epoch": 0.9889651979319392, "grad_norm": 0.1433694213628769, "learning_rate": 8.520851526418614e-05, "loss": 0.7252, "mean_token_accuracy": 0.8244002535939217, "num_tokens": 567734464.0, "step": 3204 }, { "entropy": 0.7543482234080633, "epoch": 0.9926691874373023, "grad_norm": 0.1491028368473053, "learning_rate": 8.509442980509678e-05, "loss": 0.7326, "mean_token_accuracy": 0.8236272583405176, "num_tokens": 569859027.0, "step": 3216 }, { "entropy": 0.7251827741662661, "epoch": 0.9963731769426654, "grad_norm": 0.14663253724575043, "learning_rate": 8.497998308231012e-05, "loss": 0.706, "mean_token_accuracy": 0.8289556242525578, "num_tokens": 571982689.0, "step": 3228 }, { "entropy": 0.7527169965683146, "epoch": 1.0, "grad_norm": 0.1720859855413437, "learning_rate": 8.486517627394606e-05, "loss": 0.7331, "mean_token_accuracy": 0.823117880111045, "num_tokens": 574035358.0, "step": 3240 }, { "entropy": 0.7543971252938112, "epoch": 1.003703989505363, "grad_norm": 0.14148341119289398, "learning_rate": 8.475001056183124e-05, "loss": 0.731, "mean_token_accuracy": 0.8228112831711769, "num_tokens": 576181278.0, "step": 3252 }, { "entropy": 0.7378036280473074, "epoch": 1.0074079790107262, "grad_norm": 0.13494503498077393, "learning_rate": 8.463448713148687e-05, "loss": 0.7178, "mean_token_accuracy": 0.825627734263738, "num_tokens": 578356052.0, "step": 3264 }, { "entropy": 0.7363200634717941, "epoch": 1.0111119685160892, "grad_norm": 0.14111706614494324, "learning_rate": 8.451860717211653e-05, "loss": 0.7176, "mean_token_accuracy": 0.8264128789305687, "num_tokens": 580483875.0, "step": 3276 }, { "entropy": 0.7324805557727814, "epoch": 1.0148159580214522, "grad_norm": 0.14493153989315033, "learning_rate": 8.440237187659391e-05, "loss": 0.7122, "mean_token_accuracy": 0.8274283918241659, "num_tokens": 582624687.0, "step": 3288 }, { "entropy": 0.7307334840297699, "epoch": 1.0185199475268154, "grad_norm": 0.14563046395778656, "learning_rate": 8.42857824414506e-05, "loss": 0.7103, "mean_token_accuracy": 0.8277510106563568, "num_tokens": 584748734.0, "step": 3300 }, { "entropy": 0.7457285039126873, "epoch": 1.0222239370321784, "grad_norm": 0.15820999443531036, "learning_rate": 8.416884006686366e-05, "loss": 0.7266, "mean_token_accuracy": 0.8236699538926283, "num_tokens": 586852542.0, "step": 3312 }, { "entropy": 0.7318388596177101, "epoch": 1.0259279265375414, "grad_norm": 0.15957045555114746, "learning_rate": 8.405154595664332e-05, "loss": 0.7107, "mean_token_accuracy": 0.8276907838881016, "num_tokens": 588984878.0, "step": 3324 }, { "entropy": 0.7418795588115851, "epoch": 1.0296319160429046, "grad_norm": 0.14202064275741577, "learning_rate": 8.39339013182207e-05, "loss": 0.7207, "mean_token_accuracy": 0.8251368477940559, "num_tokens": 591095187.0, "step": 3336 }, { "entropy": 0.7447669468820095, "epoch": 1.0333359055482676, "grad_norm": 0.15226991474628448, "learning_rate": 8.381590736263512e-05, "loss": 0.724, "mean_token_accuracy": 0.8251153019567331, "num_tokens": 593222885.0, "step": 3348 }, { "entropy": 0.7374236173927784, "epoch": 1.0370398950536306, "grad_norm": 0.14633263647556305, "learning_rate": 8.369756530452191e-05, "loss": 0.7177, "mean_token_accuracy": 0.8263744947810968, "num_tokens": 595340453.0, "step": 3360 }, { "entropy": 0.7381995966037115, "epoch": 1.0407438845589938, "grad_norm": 0.1505565196275711, "learning_rate": 8.35788763620997e-05, "loss": 0.7174, "mean_token_accuracy": 0.8263110890984535, "num_tokens": 597449422.0, "step": 3372 }, { "entropy": 0.7414120820661386, "epoch": 1.0444478740643568, "grad_norm": 0.143823504447937, "learning_rate": 8.345984175715802e-05, "loss": 0.7195, "mean_token_accuracy": 0.8251637890934944, "num_tokens": 599594017.0, "step": 3384 }, { "entropy": 0.7337635308504105, "epoch": 1.0481518635697198, "grad_norm": 0.14950762689113617, "learning_rate": 8.334046271504465e-05, "loss": 0.7151, "mean_token_accuracy": 0.8268262160321077, "num_tokens": 601723800.0, "step": 3396 }, { "entropy": 0.737778523315986, "epoch": 1.051855853075083, "grad_norm": 0.14158302545547485, "learning_rate": 8.3220740464653e-05, "loss": 0.7155, "mean_token_accuracy": 0.8266446044047674, "num_tokens": 603851467.0, "step": 3408 }, { "entropy": 0.7279616557061672, "epoch": 1.055559842580446, "grad_norm": 0.14092348515987396, "learning_rate": 8.310067623840951e-05, "loss": 0.7091, "mean_token_accuracy": 0.8280529901385307, "num_tokens": 605974553.0, "step": 3420 }, { "entropy": 0.7397043655316035, "epoch": 1.059263832085809, "grad_norm": 0.1458451896905899, "learning_rate": 8.298027127226093e-05, "loss": 0.7186, "mean_token_accuracy": 0.8254076987504959, "num_tokens": 608111416.0, "step": 3432 }, { "entropy": 0.7327801200250784, "epoch": 1.0629678215911722, "grad_norm": 0.14519493281841278, "learning_rate": 8.28595268056616e-05, "loss": 0.7121, "mean_token_accuracy": 0.8273726465801398, "num_tokens": 610224373.0, "step": 3444 }, { "entropy": 0.7508813291788101, "epoch": 1.0666718110965352, "grad_norm": 0.1475428193807602, "learning_rate": 8.273844408156066e-05, "loss": 0.7283, "mean_token_accuracy": 0.8238155034681162, "num_tokens": 612367458.0, "step": 3456 }, { "entropy": 0.7356340115269026, "epoch": 1.0703758006018984, "grad_norm": 0.15480412542819977, "learning_rate": 8.261702434638936e-05, "loss": 0.7165, "mean_token_accuracy": 0.8265450286368529, "num_tokens": 614500562.0, "step": 3468 }, { "entropy": 0.7494183418651422, "epoch": 1.0740797901072614, "grad_norm": 0.14762075245380402, "learning_rate": 8.249526885004809e-05, "loss": 0.7274, "mean_token_accuracy": 0.8240708733598391, "num_tokens": 616628140.0, "step": 3480 }, { "entropy": 0.7446132637560368, "epoch": 1.0777837796126244, "grad_norm": 0.15463441610336304, "learning_rate": 8.237317884589361e-05, "loss": 0.7247, "mean_token_accuracy": 0.8243404676516851, "num_tokens": 618719252.0, "step": 3492 }, { "entropy": 0.7410028763115406, "epoch": 1.0814877691179876, "grad_norm": 0.15182152390480042, "learning_rate": 8.225075559072614e-05, "loss": 0.7195, "mean_token_accuracy": 0.8264857692023119, "num_tokens": 620851273.0, "step": 3504 }, { "entropy": 0.7477393746376038, "epoch": 1.0851917586233506, "grad_norm": 0.14437253773212433, "learning_rate": 8.212800034477637e-05, "loss": 0.7256, "mean_token_accuracy": 0.8241480415066084, "num_tokens": 622956290.0, "step": 3516 }, { "entropy": 0.734037263939778, "epoch": 1.0888957481287136, "grad_norm": 0.15787097811698914, "learning_rate": 8.200491437169251e-05, "loss": 0.713, "mean_token_accuracy": 0.8271950905521711, "num_tokens": 625080917.0, "step": 3528 }, { "entropy": 0.7377768655618032, "epoch": 1.0925997376340768, "grad_norm": 0.14514772593975067, "learning_rate": 8.188149893852732e-05, "loss": 0.7162, "mean_token_accuracy": 0.8267666287720203, "num_tokens": 627213629.0, "step": 3540 }, { "entropy": 0.7321221468349298, "epoch": 1.0963037271394398, "grad_norm": 0.15606361627578735, "learning_rate": 8.175775531572501e-05, "loss": 0.7107, "mean_token_accuracy": 0.8280278158684572, "num_tokens": 629338939.0, "step": 3552 }, { "entropy": 0.7345453649759293, "epoch": 1.1000077166448028, "grad_norm": 0.14273642003536224, "learning_rate": 8.163368477710825e-05, "loss": 0.7143, "mean_token_accuracy": 0.8269894048571587, "num_tokens": 631433430.0, "step": 3564 }, { "entropy": 0.7408470747371515, "epoch": 1.103711706150166, "grad_norm": 0.16014736890792847, "learning_rate": 8.150928859986488e-05, "loss": 0.7194, "mean_token_accuracy": 0.8258067518472672, "num_tokens": 633560482.0, "step": 3576 }, { "entropy": 0.7225839781264464, "epoch": 1.107415695655529, "grad_norm": 0.15194731950759888, "learning_rate": 8.138456806453503e-05, "loss": 0.7012, "mean_token_accuracy": 0.8298800686995188, "num_tokens": 635693516.0, "step": 3588 }, { "entropy": 0.7254953247805437, "epoch": 1.111119685160892, "grad_norm": 0.15114106237888336, "learning_rate": 8.125952445499765e-05, "loss": 0.7036, "mean_token_accuracy": 0.8291740665833155, "num_tokens": 637840171.0, "step": 3600 }, { "entropy": 0.7438126876950264, "epoch": 1.1148236746662552, "grad_norm": 0.15659384429454803, "learning_rate": 8.113415905845751e-05, "loss": 0.7238, "mean_token_accuracy": 0.8248664475977421, "num_tokens": 639953063.0, "step": 3612 }, { "entropy": 0.7412439535061518, "epoch": 1.1185276641716182, "grad_norm": 0.14707545936107635, "learning_rate": 8.100847316543185e-05, "loss": 0.7207, "mean_token_accuracy": 0.825270589441061, "num_tokens": 642061202.0, "step": 3624 }, { "entropy": 0.7216433795789877, "epoch": 1.1222316536769812, "grad_norm": 0.14075824618339539, "learning_rate": 8.088246806973712e-05, "loss": 0.7012, "mean_token_accuracy": 0.8303951819737753, "num_tokens": 644170583.0, "step": 3636 }, { "entropy": 0.7379123046994209, "epoch": 1.1259356431823444, "grad_norm": 0.14188188314437866, "learning_rate": 8.075614506847563e-05, "loss": 0.7167, "mean_token_accuracy": 0.8262317391733328, "num_tokens": 646270524.0, "step": 3648 }, { "entropy": 0.7330505475401878, "epoch": 1.1296396326877074, "grad_norm": 0.14915235340595245, "learning_rate": 8.062950546202228e-05, "loss": 0.7143, "mean_token_accuracy": 0.8266897139449915, "num_tokens": 648408593.0, "step": 3660 }, { "entropy": 0.7359581738710403, "epoch": 1.1333436221930704, "grad_norm": 0.1471003293991089, "learning_rate": 8.050255055401105e-05, "loss": 0.7131, "mean_token_accuracy": 0.8272727802395821, "num_tokens": 650550976.0, "step": 3672 }, { "entropy": 0.7366929526130358, "epoch": 1.1370476116984336, "grad_norm": 0.14605937898159027, "learning_rate": 8.03752816513217e-05, "loss": 0.7169, "mean_token_accuracy": 0.825990212460359, "num_tokens": 652692582.0, "step": 3684 }, { "entropy": 0.7435482144355774, "epoch": 1.1407516012037966, "grad_norm": 0.15462058782577515, "learning_rate": 8.024770006406628e-05, "loss": 0.7251, "mean_token_accuracy": 0.823987594495217, "num_tokens": 654848825.0, "step": 3696 }, { "entropy": 0.7348724926511446, "epoch": 1.1444555907091596, "grad_norm": 0.1494790017604828, "learning_rate": 8.011980710557554e-05, "loss": 0.7139, "mean_token_accuracy": 0.8266541995108128, "num_tokens": 656948636.0, "step": 3708 }, { "entropy": 0.7282378102342287, "epoch": 1.1481595802145228, "grad_norm": 0.16433827579021454, "learning_rate": 7.999160409238563e-05, "loss": 0.7075, "mean_token_accuracy": 0.8285117484629154, "num_tokens": 659061066.0, "step": 3720 }, { "entropy": 0.738138652096192, "epoch": 1.1518635697198858, "grad_norm": 0.15777455270290375, "learning_rate": 7.986309234422427e-05, "loss": 0.7177, "mean_token_accuracy": 0.8264009902874628, "num_tokens": 661175184.0, "step": 3732 }, { "entropy": 0.730212123443683, "epoch": 1.1555675592252488, "grad_norm": 0.1504729837179184, "learning_rate": 7.973427318399746e-05, "loss": 0.7146, "mean_token_accuracy": 0.827114554742972, "num_tokens": 663279250.0, "step": 3744 }, { "entropy": 0.7331996709108353, "epoch": 1.159271548730612, "grad_norm": 0.15760678052902222, "learning_rate": 7.960514793777559e-05, "loss": 0.7107, "mean_token_accuracy": 0.8281191426018873, "num_tokens": 665384207.0, "step": 3756 }, { "entropy": 0.7236106917262077, "epoch": 1.162975538235975, "grad_norm": 0.15555773675441742, "learning_rate": 7.947571793478e-05, "loss": 0.7033, "mean_token_accuracy": 0.8294017761945724, "num_tokens": 667490801.0, "step": 3768 }, { "entropy": 0.7413870220383009, "epoch": 1.166679527741338, "grad_norm": 0.14847822487354279, "learning_rate": 7.934598450736919e-05, "loss": 0.7201, "mean_token_accuracy": 0.8256005557874838, "num_tokens": 669641877.0, "step": 3780 }, { "entropy": 0.7133117939035097, "epoch": 1.1703835172467012, "grad_norm": 0.14297045767307281, "learning_rate": 7.921594899102505e-05, "loss": 0.6898, "mean_token_accuracy": 0.8321465166906515, "num_tokens": 671760517.0, "step": 3792 }, { "entropy": 0.7260085244973501, "epoch": 1.1740875067520642, "grad_norm": 0.154536172747612, "learning_rate": 7.908561272433932e-05, "loss": 0.7061, "mean_token_accuracy": 0.828648411979278, "num_tokens": 673879141.0, "step": 3804 }, { "entropy": 0.7265941066046556, "epoch": 1.1777914962574272, "grad_norm": 0.15274272859096527, "learning_rate": 7.895497704899957e-05, "loss": 0.7083, "mean_token_accuracy": 0.8286111789445082, "num_tokens": 676023807.0, "step": 3816 }, { "entropy": 0.7224672473967075, "epoch": 1.1814954857627904, "grad_norm": 0.16113263368606567, "learning_rate": 7.882404330977556e-05, "loss": 0.7002, "mean_token_accuracy": 0.8300817757844925, "num_tokens": 678114097.0, "step": 3828 }, { "entropy": 0.731824230402708, "epoch": 1.1851994752681534, "grad_norm": 0.16188302636146545, "learning_rate": 7.869281285450527e-05, "loss": 0.7136, "mean_token_accuracy": 0.827064195026954, "num_tokens": 680225565.0, "step": 3840 }, { "entropy": 0.745010394603014, "epoch": 1.1889034647735164, "grad_norm": 0.1467347890138626, "learning_rate": 7.856128703408118e-05, "loss": 0.7239, "mean_token_accuracy": 0.8247304347654184, "num_tokens": 682344124.0, "step": 3852 }, { "entropy": 0.7087207237879435, "epoch": 1.1926074542788796, "grad_norm": 0.14708933234214783, "learning_rate": 7.842946720243617e-05, "loss": 0.6868, "mean_token_accuracy": 0.8324318118393421, "num_tokens": 684423002.0, "step": 3864 }, { "entropy": 0.7357664232452711, "epoch": 1.1963114437842426, "grad_norm": 0.13852429389953613, "learning_rate": 7.829735471652978e-05, "loss": 0.7142, "mean_token_accuracy": 0.8267920973400275, "num_tokens": 686526635.0, "step": 3876 }, { "entropy": 0.717096570879221, "epoch": 1.2000154332896056, "grad_norm": 0.15155161917209625, "learning_rate": 7.816495093633405e-05, "loss": 0.6966, "mean_token_accuracy": 0.8317411964138349, "num_tokens": 688658046.0, "step": 3888 }, { "entropy": 0.7423039426406225, "epoch": 1.2037194227949688, "grad_norm": 0.16276352107524872, "learning_rate": 7.80322572248197e-05, "loss": 0.7213, "mean_token_accuracy": 0.8252335165937742, "num_tokens": 690786172.0, "step": 3900 }, { "entropy": 0.711784016340971, "epoch": 1.2074234123003318, "grad_norm": 0.1595115065574646, "learning_rate": 7.7899274947942e-05, "loss": 0.6916, "mean_token_accuracy": 0.8321591466665268, "num_tokens": 692906988.0, "step": 3912 }, { "entropy": 0.7300878415505091, "epoch": 1.2111274018056948, "grad_norm": 0.14089138805866241, "learning_rate": 7.77660054746267e-05, "loss": 0.7072, "mean_token_accuracy": 0.8286360974113146, "num_tokens": 695033002.0, "step": 3924 }, { "entropy": 0.7247926443815231, "epoch": 1.214831391311058, "grad_norm": 0.1512719839811325, "learning_rate": 7.763245017675596e-05, "loss": 0.7032, "mean_token_accuracy": 0.8293510377407074, "num_tokens": 697153179.0, "step": 3936 }, { "entropy": 0.7274364518622557, "epoch": 1.218535380816421, "grad_norm": 0.14784738421440125, "learning_rate": 7.749861042915424e-05, "loss": 0.7094, "mean_token_accuracy": 0.8280140981078148, "num_tokens": 699267739.0, "step": 3948 }, { "entropy": 0.733709204941988, "epoch": 1.222239370321784, "grad_norm": 0.1493990421295166, "learning_rate": 7.736448760957418e-05, "loss": 0.7153, "mean_token_accuracy": 0.826848104596138, "num_tokens": 701376610.0, "step": 3960 }, { "entropy": 0.7143348765869936, "epoch": 1.2259433598271472, "grad_norm": 0.16152378916740417, "learning_rate": 7.72300830986823e-05, "loss": 0.692, "mean_token_accuracy": 0.8318944051861763, "num_tokens": 703516019.0, "step": 3972 }, { "entropy": 0.7241542227566242, "epoch": 1.2296473493325102, "grad_norm": 0.1497247815132141, "learning_rate": 7.709539828004492e-05, "loss": 0.7011, "mean_token_accuracy": 0.8292028916378816, "num_tokens": 705617753.0, "step": 3984 }, { "entropy": 0.7197034644583861, "epoch": 1.2333513388378732, "grad_norm": 0.1511349231004715, "learning_rate": 7.696043454011387e-05, "loss": 0.7001, "mean_token_accuracy": 0.8302300423383713, "num_tokens": 707732735.0, "step": 3996 }, { "entropy": 0.7363270024458567, "epoch": 1.2370553283432364, "grad_norm": 0.15632647275924683, "learning_rate": 7.682519326821215e-05, "loss": 0.716, "mean_token_accuracy": 0.8261935313542684, "num_tokens": 709835226.0, "step": 4008 }, { "entropy": 0.7174325250089169, "epoch": 1.2407593178485994, "grad_norm": 0.13901802897453308, "learning_rate": 7.668967585651974e-05, "loss": 0.6954, "mean_token_accuracy": 0.8306252273420492, "num_tokens": 711988818.0, "step": 4020 }, { "entropy": 0.7259048347671827, "epoch": 1.2444633073539624, "grad_norm": 0.1487317979335785, "learning_rate": 7.65538837000592e-05, "loss": 0.7057, "mean_token_accuracy": 0.8284634724259377, "num_tokens": 714096477.0, "step": 4032 }, { "entropy": 0.7280755999187628, "epoch": 1.2481672968593256, "grad_norm": 0.14473356306552887, "learning_rate": 7.64178181966813e-05, "loss": 0.7081, "mean_token_accuracy": 0.8283463989694914, "num_tokens": 716245082.0, "step": 4044 }, { "entropy": 0.7249644311765829, "epoch": 1.2518712863646886, "grad_norm": 0.1691472977399826, "learning_rate": 7.62814807470507e-05, "loss": 0.7051, "mean_token_accuracy": 0.8292301384111246, "num_tokens": 718361640.0, "step": 4056 }, { "entropy": 0.720829289406538, "epoch": 1.2555752758700516, "grad_norm": 0.14902262389659882, "learning_rate": 7.614487275463143e-05, "loss": 0.6981, "mean_token_accuracy": 0.8298750383158525, "num_tokens": 720489178.0, "step": 4068 }, { "entropy": 0.7293098631004492, "epoch": 1.2592792653754148, "grad_norm": 0.1404629945755005, "learning_rate": 7.600799562567258e-05, "loss": 0.7089, "mean_token_accuracy": 0.8281568810343742, "num_tokens": 722608223.0, "step": 4080 }, { "entropy": 0.722664033373197, "epoch": 1.2629832548807778, "grad_norm": 0.14373779296875, "learning_rate": 7.587085076919369e-05, "loss": 0.7024, "mean_token_accuracy": 0.8302177861332893, "num_tokens": 724757494.0, "step": 4092 }, { "entropy": 0.7234973087906837, "epoch": 1.2666872443861408, "grad_norm": 0.14617067575454712, "learning_rate": 7.573343959697029e-05, "loss": 0.7046, "mean_token_accuracy": 0.8289206735789776, "num_tokens": 726922696.0, "step": 4104 }, { "entropy": 0.7261980349818865, "epoch": 1.270391233891504, "grad_norm": 0.1495673805475235, "learning_rate": 7.55957635235194e-05, "loss": 0.706, "mean_token_accuracy": 0.828493465979894, "num_tokens": 729052352.0, "step": 4116 }, { "entropy": 0.7174485189219316, "epoch": 1.274095223396867, "grad_norm": 0.14732122421264648, "learning_rate": 7.545782396608496e-05, "loss": 0.6992, "mean_token_accuracy": 0.8296906674901644, "num_tokens": 731197978.0, "step": 4128 }, { "entropy": 0.7247343001266321, "epoch": 1.27779921290223, "grad_norm": 0.14333774149417877, "learning_rate": 7.53196223446232e-05, "loss": 0.7061, "mean_token_accuracy": 0.8288652760287126, "num_tokens": 733333853.0, "step": 4140 }, { "entropy": 0.7190474011003971, "epoch": 1.2815032024075932, "grad_norm": 0.15297874808311462, "learning_rate": 7.518116008178805e-05, "loss": 0.7012, "mean_token_accuracy": 0.8297313017149767, "num_tokens": 735431034.0, "step": 4152 }, { "entropy": 0.7267656040688356, "epoch": 1.2852071919129562, "grad_norm": 0.14727260172367096, "learning_rate": 7.50424386029165e-05, "loss": 0.7044, "mean_token_accuracy": 0.8287480349342028, "num_tokens": 737589354.0, "step": 4164 }, { "entropy": 0.7389783896505833, "epoch": 1.2889111814183194, "grad_norm": 0.15076899528503418, "learning_rate": 7.490345933601395e-05, "loss": 0.7211, "mean_token_accuracy": 0.8257643903295199, "num_tokens": 739720304.0, "step": 4176 }, { "entropy": 0.7206626770397028, "epoch": 1.2926151709236824, "grad_norm": 0.16268907487392426, "learning_rate": 7.476422371173942e-05, "loss": 0.6993, "mean_token_accuracy": 0.8301717328528563, "num_tokens": 741856208.0, "step": 4188 }, { "entropy": 0.7273670248687267, "epoch": 1.2963191604290454, "grad_norm": 0.158157080411911, "learning_rate": 7.462473316339093e-05, "loss": 0.7062, "mean_token_accuracy": 0.8289822128911813, "num_tokens": 743969037.0, "step": 4200 }, { "entropy": 0.7194506376981735, "epoch": 1.3000231499344086, "grad_norm": 0.16528646647930145, "learning_rate": 7.44849891268907e-05, "loss": 0.699, "mean_token_accuracy": 0.8304046653211117, "num_tokens": 746092753.0, "step": 4212 }, { "entropy": 0.7224392394224802, "epoch": 1.3037271394397716, "grad_norm": 0.15017327666282654, "learning_rate": 7.434499304077036e-05, "loss": 0.7023, "mean_token_accuracy": 0.8293930192788442, "num_tokens": 748204673.0, "step": 4224 }, { "entropy": 0.7125812520583471, "epoch": 1.3074311289451346, "grad_norm": 0.15608710050582886, "learning_rate": 7.420474634615617e-05, "loss": 0.6921, "mean_token_accuracy": 0.8319937810301781, "num_tokens": 750325027.0, "step": 4236 }, { "entropy": 0.7182938729723295, "epoch": 1.3111351184504978, "grad_norm": 0.15669798851013184, "learning_rate": 7.406425048675409e-05, "loss": 0.6978, "mean_token_accuracy": 0.8300301494697729, "num_tokens": 752470674.0, "step": 4248 }, { "entropy": 0.7105309578279654, "epoch": 1.3148391079558608, "grad_norm": 0.15128910541534424, "learning_rate": 7.392350690883509e-05, "loss": 0.6895, "mean_token_accuracy": 0.8324328971405824, "num_tokens": 754594955.0, "step": 4260 }, { "entropy": 0.7113361358642578, "epoch": 1.3185430974612238, "grad_norm": 0.15378808975219727, "learning_rate": 7.378251706122013e-05, "loss": 0.6903, "mean_token_accuracy": 0.8316868332525095, "num_tokens": 756717367.0, "step": 4272 }, { "entropy": 0.7267830123504003, "epoch": 1.322247086966587, "grad_norm": 0.151032954454422, "learning_rate": 7.364128239526525e-05, "loss": 0.7045, "mean_token_accuracy": 0.8282621925075849, "num_tokens": 758856430.0, "step": 4284 }, { "entropy": 0.721363440155983, "epoch": 1.32595107647195, "grad_norm": 0.16554951667785645, "learning_rate": 7.349980436484672e-05, "loss": 0.702, "mean_token_accuracy": 0.8293261242409548, "num_tokens": 761009855.0, "step": 4296 }, { "entropy": 0.717485940704743, "epoch": 1.3296550659773132, "grad_norm": 0.14937981963157654, "learning_rate": 7.335808442634596e-05, "loss": 0.6975, "mean_token_accuracy": 0.8307504417995611, "num_tokens": 763137834.0, "step": 4308 }, { "entropy": 0.7333954150478045, "epoch": 1.3333590554826762, "grad_norm": 0.14564256370067596, "learning_rate": 7.321612403863465e-05, "loss": 0.7109, "mean_token_accuracy": 0.8272267108162245, "num_tokens": 765243912.0, "step": 4320 }, { "entropy": 0.7177773999671141, "epoch": 1.3370630449880392, "grad_norm": 0.14997075498104095, "learning_rate": 7.30739246630596e-05, "loss": 0.6971, "mean_token_accuracy": 0.8304961547255516, "num_tokens": 767342231.0, "step": 4332 }, { "entropy": 0.7231075142820677, "epoch": 1.3407670344934024, "grad_norm": 0.15372461080551147, "learning_rate": 7.293148776342787e-05, "loss": 0.7022, "mean_token_accuracy": 0.8295708782970905, "num_tokens": 769459559.0, "step": 4344 }, { "entropy": 0.727438664684693, "epoch": 1.3444710239987654, "grad_norm": 0.1568097472190857, "learning_rate": 7.278881480599151e-05, "loss": 0.7063, "mean_token_accuracy": 0.8285723961889744, "num_tokens": 771586827.0, "step": 4356 }, { "entropy": 0.7256615745524565, "epoch": 1.3481750135041284, "grad_norm": 0.15446576476097107, "learning_rate": 7.264590725943263e-05, "loss": 0.7042, "mean_token_accuracy": 0.8298846408724785, "num_tokens": 773753359.0, "step": 4368 }, { "entropy": 0.716560627023379, "epoch": 1.3518790030094916, "grad_norm": 0.15649054944515228, "learning_rate": 7.250276659484814e-05, "loss": 0.6973, "mean_token_accuracy": 0.8308561046918234, "num_tokens": 775838605.0, "step": 4380 }, { "entropy": 0.7301006155709425, "epoch": 1.3555829925148546, "grad_norm": 0.15302646160125732, "learning_rate": 7.235939428573473e-05, "loss": 0.7084, "mean_token_accuracy": 0.8278401518861452, "num_tokens": 777947001.0, "step": 4392 }, { "entropy": 0.7269327379763126, "epoch": 1.3592869820202176, "grad_norm": 0.15263816714286804, "learning_rate": 7.221579180797365e-05, "loss": 0.7051, "mean_token_accuracy": 0.8292628613611063, "num_tokens": 780073853.0, "step": 4404 }, { "entropy": 0.7197130558391412, "epoch": 1.3629909715255808, "grad_norm": 0.1532004326581955, "learning_rate": 7.207196063981552e-05, "loss": 0.6983, "mean_token_accuracy": 0.8306596241891384, "num_tokens": 782197116.0, "step": 4416 }, { "entropy": 0.7153556197881699, "epoch": 1.3666949610309438, "grad_norm": 0.16322393715381622, "learning_rate": 7.192790226186505e-05, "loss": 0.694, "mean_token_accuracy": 0.8311380048592886, "num_tokens": 784336823.0, "step": 4428 }, { "entropy": 0.7153538229564825, "epoch": 1.3703989505363068, "grad_norm": 0.17276719212532043, "learning_rate": 7.178361815706594e-05, "loss": 0.6943, "mean_token_accuracy": 0.830880576123794, "num_tokens": 786476106.0, "step": 4440 }, { "entropy": 0.7140049921969572, "epoch": 1.37410294004167, "grad_norm": 0.1465296596288681, "learning_rate": 7.163910981068547e-05, "loss": 0.6914, "mean_token_accuracy": 0.8320667843023936, "num_tokens": 788592462.0, "step": 4452 }, { "entropy": 0.7188820218046507, "epoch": 1.377806929547033, "grad_norm": 0.1437097191810608, "learning_rate": 7.14943787102993e-05, "loss": 0.6992, "mean_token_accuracy": 0.8301914831002554, "num_tokens": 790710211.0, "step": 4464 }, { "entropy": 0.7231453433632851, "epoch": 1.381510919052396, "grad_norm": 0.15103840827941895, "learning_rate": 7.134942634577614e-05, "loss": 0.7028, "mean_token_accuracy": 0.8289699579278628, "num_tokens": 792853918.0, "step": 4476 }, { "entropy": 0.7152188581724962, "epoch": 1.3852149085577592, "grad_norm": 0.15957361459732056, "learning_rate": 7.12042542092624e-05, "loss": 0.6933, "mean_token_accuracy": 0.8314299285411835, "num_tokens": 794995486.0, "step": 4488 }, { "entropy": 0.7186589650809765, "epoch": 1.3889188980631222, "grad_norm": 0.15625914931297302, "learning_rate": 7.105886379516679e-05, "loss": 0.7, "mean_token_accuracy": 0.8297496847808361, "num_tokens": 797143590.0, "step": 4500 }, { "entropy": 0.7220030228296915, "epoch": 1.3926228875684852, "grad_norm": 0.15391391515731812, "learning_rate": 7.091325660014505e-05, "loss": 0.7001, "mean_token_accuracy": 0.8300996559361616, "num_tokens": 799290589.0, "step": 4512 }, { "entropy": 0.7087641693651676, "epoch": 1.3963268770738484, "grad_norm": 0.1555260866880417, "learning_rate": 7.076743412308441e-05, "loss": 0.6868, "mean_token_accuracy": 0.8325341766079267, "num_tokens": 801404554.0, "step": 4524 }, { "entropy": 0.734593483308951, "epoch": 1.4000308665792114, "grad_norm": 0.16026608645915985, "learning_rate": 7.062139786508827e-05, "loss": 0.7146, "mean_token_accuracy": 0.8272804208099842, "num_tokens": 803507633.0, "step": 4536 }, { "entropy": 0.7193813882768154, "epoch": 1.4037348560845744, "grad_norm": 0.15414464473724365, "learning_rate": 7.047514932946068e-05, "loss": 0.6982, "mean_token_accuracy": 0.83012605458498, "num_tokens": 805648024.0, "step": 4548 }, { "entropy": 0.7172121778130531, "epoch": 1.4074388455899376, "grad_norm": 0.15337833762168884, "learning_rate": 7.032869002169088e-05, "loss": 0.6981, "mean_token_accuracy": 0.8307264360288779, "num_tokens": 807789002.0, "step": 4560 }, { "entropy": 0.7184814922511578, "epoch": 1.4111428350953006, "grad_norm": 0.15992848575115204, "learning_rate": 7.01820214494378e-05, "loss": 0.6974, "mean_token_accuracy": 0.8301586744685968, "num_tokens": 809914470.0, "step": 4572 }, { "entropy": 0.7108126245439053, "epoch": 1.4148468246006636, "grad_norm": 0.15666691958904266, "learning_rate": 7.00351451225146e-05, "loss": 0.6883, "mean_token_accuracy": 0.832117979725202, "num_tokens": 812044112.0, "step": 4584 }, { "entropy": 0.7207119353115559, "epoch": 1.4185508141060268, "grad_norm": 0.162274569272995, "learning_rate": 6.9888062552873e-05, "loss": 0.6988, "mean_token_accuracy": 0.830579354117314, "num_tokens": 814207854.0, "step": 4596 }, { "entropy": 0.7099998084207376, "epoch": 1.4222548036113898, "grad_norm": 0.15512961149215698, "learning_rate": 6.974077525458785e-05, "loss": 0.6901, "mean_token_accuracy": 0.8324882164597511, "num_tokens": 816326311.0, "step": 4608 }, { "entropy": 0.7227374029656252, "epoch": 1.4259587931167528, "grad_norm": 0.15393690764904022, "learning_rate": 6.95932847438415e-05, "loss": 0.7023, "mean_token_accuracy": 0.8297974628706773, "num_tokens": 818475940.0, "step": 4620 }, { "entropy": 0.7139190497497717, "epoch": 1.429662782622116, "grad_norm": 0.14897596836090088, "learning_rate": 6.944559253890809e-05, "loss": 0.6923, "mean_token_accuracy": 0.8322372958064079, "num_tokens": 820570551.0, "step": 4632 }, { "entropy": 0.7113192056616148, "epoch": 1.433366772127479, "grad_norm": 0.16115012764930725, "learning_rate": 6.92977001601381e-05, "loss": 0.6919, "mean_token_accuracy": 0.8324420315523943, "num_tokens": 822687816.0, "step": 4644 }, { "entropy": 0.7261552785833677, "epoch": 1.437070761632842, "grad_norm": 0.17303551733493805, "learning_rate": 6.914960912994257e-05, "loss": 0.7047, "mean_token_accuracy": 0.8293804277976354, "num_tokens": 824846735.0, "step": 4656 }, { "entropy": 0.7077195967237154, "epoch": 1.4407747511382052, "grad_norm": 0.15039795637130737, "learning_rate": 6.900132097277748e-05, "loss": 0.687, "mean_token_accuracy": 0.8330833079914252, "num_tokens": 827013414.0, "step": 4668 }, { "entropy": 0.7014664622644583, "epoch": 1.4444787406435682, "grad_norm": 0.1548740714788437, "learning_rate": 6.885283721512803e-05, "loss": 0.6819, "mean_token_accuracy": 0.834053193529447, "num_tokens": 829191903.0, "step": 4680 }, { "entropy": 0.7116682541867098, "epoch": 1.4481827301489312, "grad_norm": 0.15452542901039124, "learning_rate": 6.870415938549292e-05, "loss": 0.6917, "mean_token_accuracy": 0.8317574722071489, "num_tokens": 831331900.0, "step": 4692 }, { "entropy": 0.7132287646333376, "epoch": 1.4518867196542944, "grad_norm": 0.15533322095870972, "learning_rate": 6.855528901436871e-05, "loss": 0.6894, "mean_token_accuracy": 0.8324517086148262, "num_tokens": 833489694.0, "step": 4704 }, { "entropy": 0.7113682615260283, "epoch": 1.4555907091596574, "grad_norm": 0.1611347496509552, "learning_rate": 6.840622763423391e-05, "loss": 0.6906, "mean_token_accuracy": 0.8323202182849249, "num_tokens": 835603454.0, "step": 4716 }, { "entropy": 0.6990887063244978, "epoch": 1.4592946986650204, "grad_norm": 0.1729213446378708, "learning_rate": 6.825697677953332e-05, "loss": 0.6819, "mean_token_accuracy": 0.8341849880913893, "num_tokens": 837735389.0, "step": 4728 }, { "entropy": 0.7129435998698076, "epoch": 1.4629986881703836, "grad_norm": 0.1657145619392395, "learning_rate": 6.810753798666223e-05, "loss": 0.6948, "mean_token_accuracy": 0.8312515988945961, "num_tokens": 839880792.0, "step": 4740 }, { "entropy": 0.7213788678248724, "epoch": 1.4667026776757466, "grad_norm": 0.1646701693534851, "learning_rate": 6.795791279395052e-05, "loss": 0.6996, "mean_token_accuracy": 0.8302897252142429, "num_tokens": 842005360.0, "step": 4752 }, { "entropy": 0.7203125320374966, "epoch": 1.4704066671811096, "grad_norm": 0.1686820685863495, "learning_rate": 6.780810274164691e-05, "loss": 0.6968, "mean_token_accuracy": 0.8303221389651299, "num_tokens": 844137345.0, "step": 4764 }, { "entropy": 0.7195424201587836, "epoch": 1.4741106566864728, "grad_norm": 0.1486455649137497, "learning_rate": 6.765810937190306e-05, "loss": 0.6999, "mean_token_accuracy": 0.8301795323689779, "num_tokens": 846276636.0, "step": 4776 }, { "entropy": 0.7059193042417368, "epoch": 1.4778146461918358, "grad_norm": 0.16239939630031586, "learning_rate": 6.750793422875771e-05, "loss": 0.6847, "mean_token_accuracy": 0.833564005792141, "num_tokens": 848408984.0, "step": 4788 }, { "entropy": 0.724933902422587, "epoch": 1.4815186356971988, "grad_norm": 0.16981275379657745, "learning_rate": 6.73575788581208e-05, "loss": 0.7054, "mean_token_accuracy": 0.8286726363003254, "num_tokens": 850546298.0, "step": 4800 }, { "entropy": 0.7115325927734375, "epoch": 1.485222625202562, "grad_norm": 0.17294248938560486, "learning_rate": 6.720704480775753e-05, "loss": 0.6899, "mean_token_accuracy": 0.8321196387211481, "num_tokens": 852661935.0, "step": 4812 }, { "entropy": 0.7177861680587133, "epoch": 1.488926614707925, "grad_norm": 0.160500168800354, "learning_rate": 6.705633362727243e-05, "loss": 0.6948, "mean_token_accuracy": 0.8320040429631869, "num_tokens": 854775802.0, "step": 4824 }, { "entropy": 0.704534916828076, "epoch": 1.492630604213288, "grad_norm": 0.16029898822307587, "learning_rate": 6.690544686809342e-05, "loss": 0.6824, "mean_token_accuracy": 0.8342838796476523, "num_tokens": 856867232.0, "step": 4836 }, { "entropy": 0.7108530278007189, "epoch": 1.4963345937186512, "grad_norm": 0.16785788536071777, "learning_rate": 6.675438608345583e-05, "loss": 0.69, "mean_token_accuracy": 0.832078884045283, "num_tokens": 859022001.0, "step": 4848 }, { "entropy": 0.7115449421107769, "epoch": 1.5000385832240142, "grad_norm": 0.15710817277431488, "learning_rate": 6.660315282838643e-05, "loss": 0.6926, "mean_token_accuracy": 0.8314018162588278, "num_tokens": 861145193.0, "step": 4860 }, { "entropy": 0.7248165036241213, "epoch": 1.5037425727293772, "grad_norm": 0.16734077036380768, "learning_rate": 6.645174865968742e-05, "loss": 0.7031, "mean_token_accuracy": 0.8290742263197899, "num_tokens": 863278162.0, "step": 4872 }, { "entropy": 0.7192536381383737, "epoch": 1.5074465622347404, "grad_norm": 0.15687040984630585, "learning_rate": 6.630017513592035e-05, "loss": 0.6977, "mean_token_accuracy": 0.8306297038992246, "num_tokens": 865417354.0, "step": 4884 }, { "entropy": 0.7058528165022532, "epoch": 1.5111505517401034, "grad_norm": 0.17756927013397217, "learning_rate": 6.614843381739014e-05, "loss": 0.6867, "mean_token_accuracy": 0.8327944378058115, "num_tokens": 867563022.0, "step": 4896 }, { "entropy": 0.7131319008767605, "epoch": 1.5148545412454664, "grad_norm": 0.15551723539829254, "learning_rate": 6.5996526266129e-05, "loss": 0.6927, "mean_token_accuracy": 0.8321846400698026, "num_tokens": 869656511.0, "step": 4908 }, { "entropy": 0.7125682967404524, "epoch": 1.5185585307508296, "grad_norm": 0.15141348540782928, "learning_rate": 6.584445404588038e-05, "loss": 0.6929, "mean_token_accuracy": 0.831573948264122, "num_tokens": 871824051.0, "step": 4920 }, { "entropy": 0.7077928557991982, "epoch": 1.5222625202561926, "grad_norm": 0.16390904784202576, "learning_rate": 6.569221872208277e-05, "loss": 0.6869, "mean_token_accuracy": 0.8330778690675894, "num_tokens": 873933807.0, "step": 4932 }, { "entropy": 0.7208421056469282, "epoch": 1.5259665097615556, "grad_norm": 0.1592690497636795, "learning_rate": 6.553982186185374e-05, "loss": 0.7006, "mean_token_accuracy": 0.8300425770382086, "num_tokens": 876062457.0, "step": 4944 }, { "entropy": 0.7219262520472208, "epoch": 1.5296704992669188, "grad_norm": 0.16316470503807068, "learning_rate": 6.538726503397362e-05, "loss": 0.6995, "mean_token_accuracy": 0.8302759416401386, "num_tokens": 878177430.0, "step": 4956 }, { "entropy": 0.7148293716212114, "epoch": 1.5333744887722818, "grad_norm": 0.16070803999900818, "learning_rate": 6.523454980886957e-05, "loss": 0.6935, "mean_token_accuracy": 0.8319490551948547, "num_tokens": 880334120.0, "step": 4968 }, { "entropy": 0.7131251158813635, "epoch": 1.5370784782776448, "grad_norm": 0.15670515596866608, "learning_rate": 6.508167775859918e-05, "loss": 0.692, "mean_token_accuracy": 0.8319077553848425, "num_tokens": 882447090.0, "step": 4980 }, { "entropy": 0.7086505778133869, "epoch": 1.540782467783008, "grad_norm": 0.15983329713344574, "learning_rate": 6.49286504568345e-05, "loss": 0.6881, "mean_token_accuracy": 0.8326895671586195, "num_tokens": 884569531.0, "step": 4992 } ], "logging_steps": 12, "max_steps": 12000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.9347755764237206e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }