| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.13661868748475237, | |
| "eval_steps": 10, | |
| "global_step": 560, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00024396194193705782, | |
| "grad_norm": 4.489601135253906, | |
| "learning_rate": 2.4999420463141455e-07, | |
| "loss": 3.306, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00024396194193705782, | |
| "eval_loss": 2.9874014854431152, | |
| "eval_runtime": 85.0857, | |
| "eval_samples_per_second": 3.009, | |
| "eval_steps_per_second": 0.752, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00048792388387411563, | |
| "grad_norm": 4.510775566101074, | |
| "learning_rate": 2.4998840671678217e-07, | |
| "loss": 2.7398, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0007318858258111735, | |
| "grad_norm": 5.426989555358887, | |
| "learning_rate": 2.499826062544247e-07, | |
| "loss": 3.3804, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0009758477677482313, | |
| "grad_norm": 3.3621065616607666, | |
| "learning_rate": 2.4997680324266246e-07, | |
| "loss": 2.8696, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0012198097096852891, | |
| "grad_norm": 4.033163070678711, | |
| "learning_rate": 2.499709976798144e-07, | |
| "loss": 3.4674, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.001463771651622347, | |
| "grad_norm": 3.398115634918213, | |
| "learning_rate": 2.4996518956419777e-07, | |
| "loss": 3.3783, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0017077335935594047, | |
| "grad_norm": 2.1340525150299072, | |
| "learning_rate": 2.499593788941286e-07, | |
| "loss": 2.6618, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0019516955354964625, | |
| "grad_norm": 2.8017404079437256, | |
| "learning_rate": 2.499535656679212e-07, | |
| "loss": 3.1798, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0021956574774335204, | |
| "grad_norm": 2.3321993350982666, | |
| "learning_rate": 2.499477498838886e-07, | |
| "loss": 3.1456, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0024396194193705783, | |
| "grad_norm": 1.805264949798584, | |
| "learning_rate": 2.4994193154034227e-07, | |
| "loss": 2.7338, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0024396194193705783, | |
| "eval_loss": 2.8181073665618896, | |
| "eval_runtime": 82.6944, | |
| "eval_samples_per_second": 3.096, | |
| "eval_steps_per_second": 0.774, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.002683581361307636, | |
| "grad_norm": 2.295260190963745, | |
| "learning_rate": 2.499361106355922e-07, | |
| "loss": 2.876, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.002927543303244694, | |
| "grad_norm": 1.9248020648956299, | |
| "learning_rate": 2.499302871679468e-07, | |
| "loss": 3.0328, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0031715052451817514, | |
| "grad_norm": 1.5429915189743042, | |
| "learning_rate": 2.4992446113571303e-07, | |
| "loss": 2.7445, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0034154671871188093, | |
| "grad_norm": 1.659693717956543, | |
| "learning_rate": 2.4991863253719657e-07, | |
| "loss": 2.9829, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.003659429129055867, | |
| "grad_norm": 1.6439993381500244, | |
| "learning_rate": 2.4991280137070126e-07, | |
| "loss": 2.7864, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.003903391070992925, | |
| "grad_norm": 1.731806993484497, | |
| "learning_rate": 2.499069676345297e-07, | |
| "loss": 2.9397, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.004147353012929983, | |
| "grad_norm": 1.397567629814148, | |
| "learning_rate": 2.499011313269829e-07, | |
| "loss": 2.7251, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.004391314954867041, | |
| "grad_norm": 1.5513560771942139, | |
| "learning_rate": 2.498952924463603e-07, | |
| "loss": 2.9325, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.004635276896804099, | |
| "grad_norm": 1.7622836828231812, | |
| "learning_rate": 2.498894509909601e-07, | |
| "loss": 2.7478, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0048792388387411565, | |
| "grad_norm": 1.4812703132629395, | |
| "learning_rate": 2.4988360695907864e-07, | |
| "loss": 2.7757, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0048792388387411565, | |
| "eval_loss": 2.6998705863952637, | |
| "eval_runtime": 82.4721, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.005123200780678214, | |
| "grad_norm": 1.8131576776504517, | |
| "learning_rate": 2.49877760349011e-07, | |
| "loss": 2.5977, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.005367162722615272, | |
| "grad_norm": 1.084616780281067, | |
| "learning_rate": 2.498719111590508e-07, | |
| "loss": 2.3549, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.00561112466455233, | |
| "grad_norm": 1.909567952156067, | |
| "learning_rate": 2.498660593874899e-07, | |
| "loss": 3.0033, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.005855086606489388, | |
| "grad_norm": 1.3913578987121582, | |
| "learning_rate": 2.4986020503261886e-07, | |
| "loss": 2.6655, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.006099048548426446, | |
| "grad_norm": 1.499211311340332, | |
| "learning_rate": 2.498543480927266e-07, | |
| "loss": 2.7763, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.006343010490363503, | |
| "grad_norm": 1.3160173892974854, | |
| "learning_rate": 2.4984848856610065e-07, | |
| "loss": 2.7146, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.006586972432300561, | |
| "grad_norm": 1.4656383991241455, | |
| "learning_rate": 2.4984262645102706e-07, | |
| "loss": 2.8071, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.006830934374237619, | |
| "grad_norm": 1.5149258375167847, | |
| "learning_rate": 2.4983676174579014e-07, | |
| "loss": 2.8613, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0070748963161746765, | |
| "grad_norm": 1.3552800416946411, | |
| "learning_rate": 2.498308944486729e-07, | |
| "loss": 2.6073, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.007318858258111734, | |
| "grad_norm": 1.8789068460464478, | |
| "learning_rate": 2.4982502455795676e-07, | |
| "loss": 2.8036, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.007318858258111734, | |
| "eval_loss": 2.5998635292053223, | |
| "eval_runtime": 82.4021, | |
| "eval_samples_per_second": 3.107, | |
| "eval_steps_per_second": 0.777, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.007562820200048792, | |
| "grad_norm": 1.5838264226913452, | |
| "learning_rate": 2.498191520719216e-07, | |
| "loss": 2.7908, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.00780678214198585, | |
| "grad_norm": 1.575810194015503, | |
| "learning_rate": 2.4981327698884575e-07, | |
| "loss": 2.5728, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.008050744083922909, | |
| "grad_norm": 1.1017578840255737, | |
| "learning_rate": 2.498073993070061e-07, | |
| "loss": 2.519, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.008294706025859966, | |
| "grad_norm": 1.5795230865478516, | |
| "learning_rate": 2.49801519024678e-07, | |
| "loss": 2.8713, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.008538667967797023, | |
| "grad_norm": 1.3720916509628296, | |
| "learning_rate": 2.497956361401352e-07, | |
| "loss": 2.6911, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.008782629909734082, | |
| "grad_norm": 1.3356428146362305, | |
| "learning_rate": 2.4978975065165004e-07, | |
| "loss": 2.5879, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.009026591851671139, | |
| "grad_norm": 2.031726121902466, | |
| "learning_rate": 2.497838625574932e-07, | |
| "loss": 2.9549, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.009270553793608197, | |
| "grad_norm": 1.4513427019119263, | |
| "learning_rate": 2.497779718559339e-07, | |
| "loss": 2.8033, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.009514515735545254, | |
| "grad_norm": 1.4715417623519897, | |
| "learning_rate": 2.497720785452398e-07, | |
| "loss": 2.5233, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.009758477677482313, | |
| "grad_norm": 1.3367327451705933, | |
| "learning_rate": 2.497661826236771e-07, | |
| "loss": 2.6558, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.009758477677482313, | |
| "eval_loss": 2.5077946186065674, | |
| "eval_runtime": 82.473, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01000243961941937, | |
| "grad_norm": 1.8079203367233276, | |
| "learning_rate": 2.497602840895103e-07, | |
| "loss": 2.7062, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.010246401561356429, | |
| "grad_norm": 1.297031283378601, | |
| "learning_rate": 2.4975438294100266e-07, | |
| "loss": 2.4938, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.010490363503293486, | |
| "grad_norm": 2.0549025535583496, | |
| "learning_rate": 2.497484791764155e-07, | |
| "loss": 2.8457, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.010734325445230545, | |
| "grad_norm": 2.118145227432251, | |
| "learning_rate": 2.4974257279400897e-07, | |
| "loss": 2.6677, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.010978287387167602, | |
| "grad_norm": 1.6570909023284912, | |
| "learning_rate": 2.497366637920414e-07, | |
| "loss": 2.7371, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.01122224932910466, | |
| "grad_norm": 2.100497007369995, | |
| "learning_rate": 2.497307521687697e-07, | |
| "loss": 2.7241, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.011466211271041717, | |
| "grad_norm": 1.4280970096588135, | |
| "learning_rate": 2.497248379224492e-07, | |
| "loss": 2.53, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.011710173212978776, | |
| "grad_norm": 1.6932001113891602, | |
| "learning_rate": 2.497189210513339e-07, | |
| "loss": 2.815, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.011954135154915833, | |
| "grad_norm": 1.8314259052276611, | |
| "learning_rate": 2.497130015536758e-07, | |
| "loss": 2.7558, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.012198097096852892, | |
| "grad_norm": 1.3970531225204468, | |
| "learning_rate": 2.497070794277257e-07, | |
| "loss": 2.5018, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.012198097096852892, | |
| "eval_loss": 2.428684949874878, | |
| "eval_runtime": 82.4898, | |
| "eval_samples_per_second": 3.103, | |
| "eval_steps_per_second": 0.776, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.012442059038789949, | |
| "grad_norm": 2.6512253284454346, | |
| "learning_rate": 2.497011546717327e-07, | |
| "loss": 2.6264, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.012686020980727006, | |
| "grad_norm": 1.2588273286819458, | |
| "learning_rate": 2.496952272839445e-07, | |
| "loss": 2.3415, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.012929982922664065, | |
| "grad_norm": 1.6063730716705322, | |
| "learning_rate": 2.4968929726260705e-07, | |
| "loss": 2.4857, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.013173944864601122, | |
| "grad_norm": 1.344925045967102, | |
| "learning_rate": 2.4968336460596485e-07, | |
| "loss": 2.4742, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.01341790680653818, | |
| "grad_norm": 1.141435146331787, | |
| "learning_rate": 2.4967742931226075e-07, | |
| "loss": 2.2668, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.013661868748475237, | |
| "grad_norm": 1.4117755889892578, | |
| "learning_rate": 2.4967149137973625e-07, | |
| "loss": 2.4959, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.013905830690412296, | |
| "grad_norm": 1.292641520500183, | |
| "learning_rate": 2.496655508066309e-07, | |
| "loss": 2.3535, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.014149792632349353, | |
| "grad_norm": 1.3126784563064575, | |
| "learning_rate": 2.4965960759118313e-07, | |
| "loss": 2.3842, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.014393754574286412, | |
| "grad_norm": 1.4474728107452393, | |
| "learning_rate": 2.4965366173162953e-07, | |
| "loss": 2.5879, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.014637716516223469, | |
| "grad_norm": 1.4832170009613037, | |
| "learning_rate": 2.4964771322620516e-07, | |
| "loss": 2.5618, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.014637716516223469, | |
| "eval_loss": 2.370858907699585, | |
| "eval_runtime": 82.5008, | |
| "eval_samples_per_second": 3.103, | |
| "eval_steps_per_second": 0.776, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.014881678458160527, | |
| "grad_norm": 3.201457977294922, | |
| "learning_rate": 2.4964176207314356e-07, | |
| "loss": 2.4885, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.015125640400097584, | |
| "grad_norm": 1.71616530418396, | |
| "learning_rate": 2.496358082706767e-07, | |
| "loss": 2.5725, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.015369602342034643, | |
| "grad_norm": 1.8521833419799805, | |
| "learning_rate": 2.4962985181703483e-07, | |
| "loss": 2.4873, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.0156135642839717, | |
| "grad_norm": 1.3216487169265747, | |
| "learning_rate": 2.496238927104469e-07, | |
| "loss": 2.4606, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.015857526225908757, | |
| "grad_norm": 1.3042290210723877, | |
| "learning_rate": 2.4961793094913995e-07, | |
| "loss": 2.5374, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.016101488167845818, | |
| "grad_norm": 1.4314343929290771, | |
| "learning_rate": 2.4961196653133975e-07, | |
| "loss": 2.5357, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.016345450109782875, | |
| "grad_norm": 1.6834615468978882, | |
| "learning_rate": 2.4960599945527027e-07, | |
| "loss": 2.2217, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.01658941205171993, | |
| "grad_norm": 1.1532320976257324, | |
| "learning_rate": 2.49600029719154e-07, | |
| "loss": 2.4338, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.01683337399365699, | |
| "grad_norm": 1.009998083114624, | |
| "learning_rate": 2.495940573212118e-07, | |
| "loss": 2.18, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.017077335935594046, | |
| "grad_norm": 1.4564158916473389, | |
| "learning_rate": 2.4958808225966306e-07, | |
| "loss": 2.3481, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.017077335935594046, | |
| "eval_loss": 2.3252947330474854, | |
| "eval_runtime": 82.5205, | |
| "eval_samples_per_second": 3.102, | |
| "eval_steps_per_second": 0.776, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.017321297877531106, | |
| "grad_norm": 1.5508018732070923, | |
| "learning_rate": 2.4958210453272533e-07, | |
| "loss": 2.5017, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.017565259819468163, | |
| "grad_norm": 1.2118752002716064, | |
| "learning_rate": 2.4957612413861483e-07, | |
| "loss": 2.5198, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.01780922176140522, | |
| "grad_norm": 1.1424647569656372, | |
| "learning_rate": 2.4957014107554603e-07, | |
| "loss": 2.4275, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.018053183703342277, | |
| "grad_norm": 1.1591920852661133, | |
| "learning_rate": 2.4956415534173195e-07, | |
| "loss": 2.2197, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.018297145645279338, | |
| "grad_norm": 1.1998584270477295, | |
| "learning_rate": 2.495581669353838e-07, | |
| "loss": 2.2739, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.018541107587216395, | |
| "grad_norm": 1.0552688837051392, | |
| "learning_rate": 2.4955217585471147e-07, | |
| "loss": 2.4085, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.01878506952915345, | |
| "grad_norm": 1.0630302429199219, | |
| "learning_rate": 2.495461820979229e-07, | |
| "loss": 2.3622, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.01902903147109051, | |
| "grad_norm": 1.6193199157714844, | |
| "learning_rate": 2.4954018566322477e-07, | |
| "loss": 2.4233, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.01927299341302757, | |
| "grad_norm": 1.0845212936401367, | |
| "learning_rate": 2.4953418654882195e-07, | |
| "loss": 2.2942, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.019516955354964626, | |
| "grad_norm": 0.9362667202949524, | |
| "learning_rate": 2.495281847529178e-07, | |
| "loss": 2.3475, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.019516955354964626, | |
| "eval_loss": 2.2899718284606934, | |
| "eval_runtime": 82.7023, | |
| "eval_samples_per_second": 3.095, | |
| "eval_steps_per_second": 0.774, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.019760917296901683, | |
| "grad_norm": 1.1640156507492065, | |
| "learning_rate": 2.4952218027371403e-07, | |
| "loss": 2.4911, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.02000487923883874, | |
| "grad_norm": 1.158489465713501, | |
| "learning_rate": 2.495161731094107e-07, | |
| "loss": 2.3697, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.0202488411807758, | |
| "grad_norm": 1.056389570236206, | |
| "learning_rate": 2.4951016325820637e-07, | |
| "loss": 2.3726, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.020492803122712858, | |
| "grad_norm": 1.1232126951217651, | |
| "learning_rate": 2.4950415071829794e-07, | |
| "loss": 2.3631, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.020736765064649915, | |
| "grad_norm": 1.4430733919143677, | |
| "learning_rate": 2.4949813548788067e-07, | |
| "loss": 2.4389, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.02098072700658697, | |
| "grad_norm": 1.4792566299438477, | |
| "learning_rate": 2.4949211756514816e-07, | |
| "loss": 2.5851, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.02122468894852403, | |
| "grad_norm": 0.8782404661178589, | |
| "learning_rate": 2.494860969482926e-07, | |
| "loss": 2.3258, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.02146865089046109, | |
| "grad_norm": 0.9481968879699707, | |
| "learning_rate": 2.4948007363550424e-07, | |
| "loss": 2.3977, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.021712612832398146, | |
| "grad_norm": 1.0738717317581177, | |
| "learning_rate": 2.4947404762497197e-07, | |
| "loss": 2.0767, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.021956574774335203, | |
| "grad_norm": 1.3180803060531616, | |
| "learning_rate": 2.49468018914883e-07, | |
| "loss": 2.4085, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.021956574774335203, | |
| "eval_loss": 2.2610299587249756, | |
| "eval_runtime": 82.4227, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 0.776, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.02220053671627226, | |
| "grad_norm": 0.9324449896812439, | |
| "learning_rate": 2.4946198750342283e-07, | |
| "loss": 2.3142, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.02244449865820932, | |
| "grad_norm": 1.5807453393936157, | |
| "learning_rate": 2.4945595338877547e-07, | |
| "loss": 2.3756, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.022688460600146378, | |
| "grad_norm": 1.279068112373352, | |
| "learning_rate": 2.494499165691231e-07, | |
| "loss": 2.2482, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.022932422542083435, | |
| "grad_norm": 1.6906729936599731, | |
| "learning_rate": 2.4944387704264644e-07, | |
| "loss": 2.3038, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.02317638448402049, | |
| "grad_norm": 1.2444514036178589, | |
| "learning_rate": 2.494378348075246e-07, | |
| "loss": 2.1946, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.023420346425957552, | |
| "grad_norm": 0.9085439443588257, | |
| "learning_rate": 2.494317898619349e-07, | |
| "loss": 2.0829, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.02366430836789461, | |
| "grad_norm": 1.0624847412109375, | |
| "learning_rate": 2.4942574220405314e-07, | |
| "loss": 2.2917, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.023908270309831666, | |
| "grad_norm": 0.9223533868789673, | |
| "learning_rate": 2.4941969183205344e-07, | |
| "loss": 2.4056, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.024152232251768723, | |
| "grad_norm": 4.390754699707031, | |
| "learning_rate": 2.494136387441083e-07, | |
| "loss": 2.4544, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.024396194193705784, | |
| "grad_norm": 0.999297559261322, | |
| "learning_rate": 2.494075829383886e-07, | |
| "loss": 2.2129, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.024396194193705784, | |
| "eval_loss": 2.2379603385925293, | |
| "eval_runtime": 82.4809, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02464015613564284, | |
| "grad_norm": 0.9902031421661377, | |
| "learning_rate": 2.494015244130635e-07, | |
| "loss": 2.0829, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.024884118077579898, | |
| "grad_norm": 1.0847697257995605, | |
| "learning_rate": 2.493954631663007e-07, | |
| "loss": 2.2557, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.025128080019516955, | |
| "grad_norm": 0.8790014982223511, | |
| "learning_rate": 2.493893991962659e-07, | |
| "loss": 2.2532, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.02537204196145401, | |
| "grad_norm": 0.8715433478355408, | |
| "learning_rate": 2.493833325011235e-07, | |
| "loss": 2.348, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.025616003903391072, | |
| "grad_norm": 0.9393193125724792, | |
| "learning_rate": 2.4937726307903606e-07, | |
| "loss": 2.3185, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.02585996584532813, | |
| "grad_norm": 1.0732641220092773, | |
| "learning_rate": 2.493711909281646e-07, | |
| "loss": 2.3618, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.026103927787265186, | |
| "grad_norm": 1.067499041557312, | |
| "learning_rate": 2.493651160466685e-07, | |
| "loss": 2.446, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.026347889729202243, | |
| "grad_norm": 1.0830148458480835, | |
| "learning_rate": 2.493590384327053e-07, | |
| "loss": 2.4026, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.026591851671139304, | |
| "grad_norm": 1.4239816665649414, | |
| "learning_rate": 2.49352958084431e-07, | |
| "loss": 2.2756, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.02683581361307636, | |
| "grad_norm": 0.7910580635070801, | |
| "learning_rate": 2.49346875e-07, | |
| "loss": 2.1831, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.02683581361307636, | |
| "eval_loss": 2.2186379432678223, | |
| "eval_runtime": 82.443, | |
| "eval_samples_per_second": 3.105, | |
| "eval_steps_per_second": 0.776, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.027079775555013418, | |
| "grad_norm": 1.1051030158996582, | |
| "learning_rate": 2.49340789177565e-07, | |
| "loss": 2.2737, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.027323737496950475, | |
| "grad_norm": 1.0497264862060547, | |
| "learning_rate": 2.4933470061527687e-07, | |
| "loss": 2.2902, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.027567699438887535, | |
| "grad_norm": 0.87137770652771, | |
| "learning_rate": 2.493286093112851e-07, | |
| "loss": 2.1482, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.027811661380824592, | |
| "grad_norm": 1.1766656637191772, | |
| "learning_rate": 2.493225152637374e-07, | |
| "loss": 2.353, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.02805562332276165, | |
| "grad_norm": 0.9225324988365173, | |
| "learning_rate": 2.4931641847077963e-07, | |
| "loss": 2.1774, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.028299585264698706, | |
| "grad_norm": 0.8458165526390076, | |
| "learning_rate": 2.493103189305562e-07, | |
| "loss": 2.112, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.028543547206635766, | |
| "grad_norm": 0.975180983543396, | |
| "learning_rate": 2.493042166412099e-07, | |
| "loss": 2.331, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.028787509148572824, | |
| "grad_norm": 0.879942774772644, | |
| "learning_rate": 2.492981116008816e-07, | |
| "loss": 2.2939, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.02903147109050988, | |
| "grad_norm": 1.5170252323150635, | |
| "learning_rate": 2.492920038077106e-07, | |
| "loss": 2.435, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.029275433032446938, | |
| "grad_norm": 1.0312517881393433, | |
| "learning_rate": 2.492858932598346e-07, | |
| "loss": 2.2375, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.029275433032446938, | |
| "eval_loss": 2.201087474822998, | |
| "eval_runtime": 82.5159, | |
| "eval_samples_per_second": 3.102, | |
| "eval_steps_per_second": 0.776, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.029519394974383995, | |
| "grad_norm": 0.999895453453064, | |
| "learning_rate": 2.4927977995538954e-07, | |
| "loss": 2.2366, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.029763356916321055, | |
| "grad_norm": 0.75639408826828, | |
| "learning_rate": 2.4927366389250973e-07, | |
| "loss": 2.1457, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.030007318858258112, | |
| "grad_norm": 1.004939079284668, | |
| "learning_rate": 2.4926754506932774e-07, | |
| "loss": 2.2746, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.03025128080019517, | |
| "grad_norm": 0.9373717308044434, | |
| "learning_rate": 2.4926142348397453e-07, | |
| "loss": 2.2899, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.030495242742132226, | |
| "grad_norm": 22.97978401184082, | |
| "learning_rate": 2.492552991345792e-07, | |
| "loss": 2.3833, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.030739204684069286, | |
| "grad_norm": 1.2713708877563477, | |
| "learning_rate": 2.4924917201926936e-07, | |
| "loss": 2.3817, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.030983166626006343, | |
| "grad_norm": 1.0373756885528564, | |
| "learning_rate": 2.492430421361708e-07, | |
| "loss": 2.0911, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.0312271285679434, | |
| "grad_norm": 1.1061681509017944, | |
| "learning_rate": 2.4923690948340783e-07, | |
| "loss": 2.2405, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.03147109050988046, | |
| "grad_norm": 0.8704845309257507, | |
| "learning_rate": 2.4923077405910264e-07, | |
| "loss": 2.3703, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.031715052451817514, | |
| "grad_norm": 0.8377108573913574, | |
| "learning_rate": 2.4922463586137616e-07, | |
| "loss": 2.1756, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.031715052451817514, | |
| "eval_loss": 2.1864898204803467, | |
| "eval_runtime": 82.5766, | |
| "eval_samples_per_second": 3.1, | |
| "eval_steps_per_second": 0.775, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03195901439375457, | |
| "grad_norm": 0.9043044447898865, | |
| "learning_rate": 2.4921849488834745e-07, | |
| "loss": 2.3177, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.032202976335691635, | |
| "grad_norm": 0.9630913138389587, | |
| "learning_rate": 2.4921235113813376e-07, | |
| "loss": 2.3257, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.03244693827762869, | |
| "grad_norm": 0.9598456025123596, | |
| "learning_rate": 2.492062046088508e-07, | |
| "loss": 2.1792, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.03269090021956575, | |
| "grad_norm": 0.9462944865226746, | |
| "learning_rate": 2.4920005529861254e-07, | |
| "loss": 2.1268, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.032934862161502806, | |
| "grad_norm": 0.9108706116676331, | |
| "learning_rate": 2.491939032055311e-07, | |
| "loss": 2.2696, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.03317882410343986, | |
| "grad_norm": 1.834842324256897, | |
| "learning_rate": 2.491877483277171e-07, | |
| "loss": 2.3871, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.03342278604537692, | |
| "grad_norm": 0.9213249087333679, | |
| "learning_rate": 2.4918159066327943e-07, | |
| "loss": 2.1749, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.03366674798731398, | |
| "grad_norm": 0.8780300617218018, | |
| "learning_rate": 2.49175430210325e-07, | |
| "loss": 2.2413, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.033910709929251034, | |
| "grad_norm": 118.5926284790039, | |
| "learning_rate": 2.491692669669594e-07, | |
| "loss": 2.2684, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.03415467187118809, | |
| "grad_norm": 0.8691902160644531, | |
| "learning_rate": 2.4916310093128616e-07, | |
| "loss": 2.1863, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03415467187118809, | |
| "eval_loss": 2.174090623855591, | |
| "eval_runtime": 82.5703, | |
| "eval_samples_per_second": 3.1, | |
| "eval_steps_per_second": 0.775, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.034398633813125155, | |
| "grad_norm": 0.8839126825332642, | |
| "learning_rate": 2.491569321014073e-07, | |
| "loss": 2.2502, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.03464259575506221, | |
| "grad_norm": 1.0043957233428955, | |
| "learning_rate": 2.49150760475423e-07, | |
| "loss": 2.2624, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.03488655769699927, | |
| "grad_norm": 0.7548794746398926, | |
| "learning_rate": 2.4914458605143187e-07, | |
| "loss": 2.2126, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.035130519638936326, | |
| "grad_norm": 1.0142180919647217, | |
| "learning_rate": 2.491384088275306e-07, | |
| "loss": 2.1853, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.03537448158087338, | |
| "grad_norm": 0.9584774971008301, | |
| "learning_rate": 2.491322288018143e-07, | |
| "loss": 2.0712, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.03561844352281044, | |
| "grad_norm": 1.2019151449203491, | |
| "learning_rate": 2.4912604597237626e-07, | |
| "loss": 2.1924, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.0358624054647475, | |
| "grad_norm": 0.8215965628623962, | |
| "learning_rate": 2.4911986033730807e-07, | |
| "loss": 2.1359, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.036106367406684554, | |
| "grad_norm": 0.8709486722946167, | |
| "learning_rate": 2.491136718946997e-07, | |
| "loss": 2.4015, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.03635032934862162, | |
| "grad_norm": 0.7629618644714355, | |
| "learning_rate": 2.4910748064263914e-07, | |
| "loss": 2.2006, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.036594291290558675, | |
| "grad_norm": 1.1265887022018433, | |
| "learning_rate": 2.491012865792129e-07, | |
| "loss": 2.2315, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.036594291290558675, | |
| "eval_loss": 2.162862539291382, | |
| "eval_runtime": 82.4917, | |
| "eval_samples_per_second": 3.103, | |
| "eval_steps_per_second": 0.776, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.03683825323249573, | |
| "grad_norm": 0.8692772388458252, | |
| "learning_rate": 2.490950897025056e-07, | |
| "loss": 2.1676, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.03708221517443279, | |
| "grad_norm": 0.865197479724884, | |
| "learning_rate": 2.4908889001060015e-07, | |
| "loss": 2.2783, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.037326177116369846, | |
| "grad_norm": 0.922754168510437, | |
| "learning_rate": 2.490826875015777e-07, | |
| "loss": 2.3296, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.0375701390583069, | |
| "grad_norm": 0.9516766667366028, | |
| "learning_rate": 2.490764821735178e-07, | |
| "loss": 2.3431, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.03781410100024396, | |
| "grad_norm": 0.9996930360794067, | |
| "learning_rate": 2.4907027402449803e-07, | |
| "loss": 2.2365, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.03805806294218102, | |
| "grad_norm": 0.7445939779281616, | |
| "learning_rate": 2.4906406305259434e-07, | |
| "loss": 2.1925, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.038302024884118074, | |
| "grad_norm": 0.8426290154457092, | |
| "learning_rate": 2.4905784925588094e-07, | |
| "loss": 2.2349, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.03854598682605514, | |
| "grad_norm": 1.0926883220672607, | |
| "learning_rate": 2.4905163263243023e-07, | |
| "loss": 2.2426, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.038789948767992195, | |
| "grad_norm": 1.0484980344772339, | |
| "learning_rate": 2.4904541318031294e-07, | |
| "loss": 2.1593, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.03903391070992925, | |
| "grad_norm": 0.8670967817306519, | |
| "learning_rate": 2.49039190897598e-07, | |
| "loss": 2.2036, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03903391070992925, | |
| "eval_loss": 2.152311086654663, | |
| "eval_runtime": 82.7226, | |
| "eval_samples_per_second": 3.095, | |
| "eval_steps_per_second": 0.774, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03927787265186631, | |
| "grad_norm": 0.8905733227729797, | |
| "learning_rate": 2.490329657823525e-07, | |
| "loss": 2.0864, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.039521834593803366, | |
| "grad_norm": 0.8731813430786133, | |
| "learning_rate": 2.490267378326419e-07, | |
| "loss": 2.218, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.03976579653574042, | |
| "grad_norm": 0.7908375263214111, | |
| "learning_rate": 2.490205070465299e-07, | |
| "loss": 2.1147, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.04000975847767748, | |
| "grad_norm": 0.9735328555107117, | |
| "learning_rate": 2.4901427342207823e-07, | |
| "loss": 2.1494, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.04025372041961454, | |
| "grad_norm": 0.7927246689796448, | |
| "learning_rate": 2.490080369573472e-07, | |
| "loss": 2.1533, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.0404976823615516, | |
| "grad_norm": 0.8604568839073181, | |
| "learning_rate": 2.4900179765039496e-07, | |
| "loss": 2.1147, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.04074164430348866, | |
| "grad_norm": 1.905282735824585, | |
| "learning_rate": 2.489955554992782e-07, | |
| "loss": 2.1228, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.040985606245425715, | |
| "grad_norm": 0.8245707154273987, | |
| "learning_rate": 2.489893105020518e-07, | |
| "loss": 2.0648, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.04122956818736277, | |
| "grad_norm": 0.9537479281425476, | |
| "learning_rate": 2.489830626567686e-07, | |
| "loss": 2.2827, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.04147353012929983, | |
| "grad_norm": 0.8153314590454102, | |
| "learning_rate": 2.4897681196148e-07, | |
| "loss": 2.1892, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04147353012929983, | |
| "eval_loss": 2.14363694190979, | |
| "eval_runtime": 84.0031, | |
| "eval_samples_per_second": 3.048, | |
| "eval_steps_per_second": 0.762, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.041717492071236886, | |
| "grad_norm": 0.9862874150276184, | |
| "learning_rate": 2.4897055841423537e-07, | |
| "loss": 2.1974, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.04196145401317394, | |
| "grad_norm": 0.7293988466262817, | |
| "learning_rate": 2.489643020130825e-07, | |
| "loss": 2.1471, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.042205415955111, | |
| "grad_norm": 0.981706976890564, | |
| "learning_rate": 2.4895804275606724e-07, | |
| "loss": 2.1939, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.04244937789704806, | |
| "grad_norm": 0.7976970076560974, | |
| "learning_rate": 2.489517806412337e-07, | |
| "loss": 2.3129, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.04269333983898512, | |
| "grad_norm": 1.0813937187194824, | |
| "learning_rate": 2.4894551566662435e-07, | |
| "loss": 2.4131, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.04293730178092218, | |
| "grad_norm": 0.7610160708427429, | |
| "learning_rate": 2.4893924783027967e-07, | |
| "loss": 2.2202, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.043181263722859235, | |
| "grad_norm": 0.8845246434211731, | |
| "learning_rate": 2.4893297713023835e-07, | |
| "loss": 2.0994, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.04342522566479629, | |
| "grad_norm": 0.8146999478340149, | |
| "learning_rate": 2.4892670356453745e-07, | |
| "loss": 2.2112, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.04366918760673335, | |
| "grad_norm": 53.56632995605469, | |
| "learning_rate": 2.4892042713121207e-07, | |
| "loss": 2.2625, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.043913149548670406, | |
| "grad_norm": 0.8157463073730469, | |
| "learning_rate": 2.4891414782829566e-07, | |
| "loss": 2.206, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.043913149548670406, | |
| "eval_loss": 2.1358835697174072, | |
| "eval_runtime": 84.0237, | |
| "eval_samples_per_second": 3.047, | |
| "eval_steps_per_second": 0.762, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.04415711149060746, | |
| "grad_norm": 0.776892364025116, | |
| "learning_rate": 2.4890786565381976e-07, | |
| "loss": 2.1461, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.04440107343254452, | |
| "grad_norm": 1.0113416910171509, | |
| "learning_rate": 2.489015806058142e-07, | |
| "loss": 2.2795, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.044645035374481584, | |
| "grad_norm": 1.4400883913040161, | |
| "learning_rate": 2.4889529268230683e-07, | |
| "loss": 2.252, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.04488899731641864, | |
| "grad_norm": 1.5715802907943726, | |
| "learning_rate": 2.4888900188132405e-07, | |
| "loss": 2.254, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.0451329592583557, | |
| "grad_norm": 0.8854827284812927, | |
| "learning_rate": 2.4888270820089003e-07, | |
| "loss": 2.2534, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.045376921200292755, | |
| "grad_norm": 0.8536761403083801, | |
| "learning_rate": 2.488764116390274e-07, | |
| "loss": 2.243, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.04562088314222981, | |
| "grad_norm": 1.156900405883789, | |
| "learning_rate": 2.488701121937568e-07, | |
| "loss": 2.2924, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.04586484508416687, | |
| "grad_norm": 0.9670676589012146, | |
| "learning_rate": 2.488638098630973e-07, | |
| "loss": 2.1767, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.046108807026103926, | |
| "grad_norm": 0.6777328848838806, | |
| "learning_rate": 2.4885750464506606e-07, | |
| "loss": 2.2506, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.04635276896804098, | |
| "grad_norm": 1.1340001821517944, | |
| "learning_rate": 2.488511965376782e-07, | |
| "loss": 2.2462, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04635276896804098, | |
| "eval_loss": 2.1293463706970215, | |
| "eval_runtime": 82.7369, | |
| "eval_samples_per_second": 3.094, | |
| "eval_steps_per_second": 0.774, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04659673090997804, | |
| "grad_norm": 0.9202519655227661, | |
| "learning_rate": 2.488448855389473e-07, | |
| "loss": 2.264, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.046840692851915104, | |
| "grad_norm": 0.7520004510879517, | |
| "learning_rate": 2.48838571646885e-07, | |
| "loss": 2.184, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.04708465479385216, | |
| "grad_norm": 0.9756651520729065, | |
| "learning_rate": 2.488322548595012e-07, | |
| "loss": 2.2218, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.04732861673578922, | |
| "grad_norm": 1.1608117818832397, | |
| "learning_rate": 2.488259351748038e-07, | |
| "loss": 2.3409, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.047572578677726275, | |
| "grad_norm": 0.9417536854743958, | |
| "learning_rate": 2.48819612590799e-07, | |
| "loss": 2.2483, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.04781654061966333, | |
| "grad_norm": 0.8184861540794373, | |
| "learning_rate": 2.4881328710549126e-07, | |
| "loss": 2.1357, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.04806050256160039, | |
| "grad_norm": 0.8574642539024353, | |
| "learning_rate": 2.48806958716883e-07, | |
| "loss": 2.2997, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.048304464503537446, | |
| "grad_norm": 0.762095034122467, | |
| "learning_rate": 2.488006274229749e-07, | |
| "loss": 2.1704, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.0485484264454745, | |
| "grad_norm": 0.876278281211853, | |
| "learning_rate": 2.4879429322176583e-07, | |
| "loss": 2.1739, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.04879238838741157, | |
| "grad_norm": 0.7141769528388977, | |
| "learning_rate": 2.4878795611125284e-07, | |
| "loss": 2.1301, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.04879238838741157, | |
| "eval_loss": 2.1227312088012695, | |
| "eval_runtime": 82.5491, | |
| "eval_samples_per_second": 3.101, | |
| "eval_steps_per_second": 0.775, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.049036350329348624, | |
| "grad_norm": 0.793533444404602, | |
| "learning_rate": 2.487816160894311e-07, | |
| "loss": 2.105, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.04928031227128568, | |
| "grad_norm": 1.4064488410949707, | |
| "learning_rate": 2.4877527315429387e-07, | |
| "loss": 2.2669, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.04952427421322274, | |
| "grad_norm": 0.7336916923522949, | |
| "learning_rate": 2.4876892730383267e-07, | |
| "loss": 2.2367, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.049768236155159795, | |
| "grad_norm": 5.0201287269592285, | |
| "learning_rate": 2.4876257853603717e-07, | |
| "loss": 2.3826, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.05001219809709685, | |
| "grad_norm": 1.0578281879425049, | |
| "learning_rate": 2.4875622684889513e-07, | |
| "loss": 2.178, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.05025616003903391, | |
| "grad_norm": 0.7075783610343933, | |
| "learning_rate": 2.4874987224039246e-07, | |
| "loss": 2.2244, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.050500121980970966, | |
| "grad_norm": 1.0379695892333984, | |
| "learning_rate": 2.4874351470851334e-07, | |
| "loss": 2.2114, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.05074408392290802, | |
| "grad_norm": 0.8745626211166382, | |
| "learning_rate": 2.4873715425123986e-07, | |
| "loss": 2.2984, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.05098804586484509, | |
| "grad_norm": 0.7211839556694031, | |
| "learning_rate": 2.4873079086655244e-07, | |
| "loss": 2.2514, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.051232007806782144, | |
| "grad_norm": 1.0270675420761108, | |
| "learning_rate": 2.487244245524296e-07, | |
| "loss": 2.0273, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.051232007806782144, | |
| "eval_loss": 2.1163525581359863, | |
| "eval_runtime": 82.5285, | |
| "eval_samples_per_second": 3.102, | |
| "eval_steps_per_second": 0.775, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0514759697487192, | |
| "grad_norm": 0.7041739225387573, | |
| "learning_rate": 2.487180553068481e-07, | |
| "loss": 2.2021, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.05171993169065626, | |
| "grad_norm": 0.7967678904533386, | |
| "learning_rate": 2.487116831277826e-07, | |
| "loss": 2.0156, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.051963893632593315, | |
| "grad_norm": 0.9619562029838562, | |
| "learning_rate": 2.4870530801320607e-07, | |
| "loss": 2.0662, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.05220785557453037, | |
| "grad_norm": 0.6969118714332581, | |
| "learning_rate": 2.486989299610895e-07, | |
| "loss": 2.1831, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.05245181751646743, | |
| "grad_norm": 0.9280874729156494, | |
| "learning_rate": 2.4869254896940207e-07, | |
| "loss": 2.1238, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.052695779458404486, | |
| "grad_norm": 0.8420175909996033, | |
| "learning_rate": 2.4868616503611124e-07, | |
| "loss": 2.0776, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.05293974140034155, | |
| "grad_norm": 0.737402081489563, | |
| "learning_rate": 2.486797781591823e-07, | |
| "loss": 1.9902, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.05318370334227861, | |
| "grad_norm": 1.4963791370391846, | |
| "learning_rate": 2.4867338833657884e-07, | |
| "loss": 2.1924, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.053427665284215664, | |
| "grad_norm": 0.8707871437072754, | |
| "learning_rate": 2.4866699556626256e-07, | |
| "loss": 2.3329, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.05367162722615272, | |
| "grad_norm": 2.2541556358337402, | |
| "learning_rate": 2.486605998461933e-07, | |
| "loss": 2.2628, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05367162722615272, | |
| "eval_loss": 2.110635995864868, | |
| "eval_runtime": 82.4261, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 0.776, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05391558916808978, | |
| "grad_norm": 0.7243703603744507, | |
| "learning_rate": 2.4865420117432884e-07, | |
| "loss": 2.2738, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.054159551110026835, | |
| "grad_norm": 2.9058852195739746, | |
| "learning_rate": 2.4864779954862536e-07, | |
| "loss": 2.2954, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.05440351305196389, | |
| "grad_norm": 0.6986073851585388, | |
| "learning_rate": 2.486413949670369e-07, | |
| "loss": 2.062, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.05464747499390095, | |
| "grad_norm": 0.8654193878173828, | |
| "learning_rate": 2.486349874275158e-07, | |
| "loss": 1.8318, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.054891436935838006, | |
| "grad_norm": 1.1562286615371704, | |
| "learning_rate": 2.486285769280123e-07, | |
| "loss": 2.3011, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.05513539887777507, | |
| "grad_norm": 0.9906584024429321, | |
| "learning_rate": 2.48622163466475e-07, | |
| "loss": 2.0338, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.05537936081971213, | |
| "grad_norm": 0.7650761008262634, | |
| "learning_rate": 2.486157470408504e-07, | |
| "loss": 2.2399, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.055623322761649184, | |
| "grad_norm": 0.8273106217384338, | |
| "learning_rate": 2.4860932764908314e-07, | |
| "loss": 2.1346, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.05586728470358624, | |
| "grad_norm": 0.8235612511634827, | |
| "learning_rate": 2.486029052891161e-07, | |
| "loss": 2.1117, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.0561112466455233, | |
| "grad_norm": 0.7551500201225281, | |
| "learning_rate": 2.4859647995889003e-07, | |
| "loss": 2.0045, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0561112466455233, | |
| "eval_loss": 2.105764150619507, | |
| "eval_runtime": 82.5574, | |
| "eval_samples_per_second": 3.101, | |
| "eval_steps_per_second": 0.775, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.056355208587460355, | |
| "grad_norm": 0.6984049081802368, | |
| "learning_rate": 2.4859005165634397e-07, | |
| "loss": 1.9406, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.05659917052939741, | |
| "grad_norm": 0.7007377743721008, | |
| "learning_rate": 2.4858362037941493e-07, | |
| "loss": 2.1012, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.05684313247133447, | |
| "grad_norm": 0.7448357343673706, | |
| "learning_rate": 2.485771861260381e-07, | |
| "loss": 2.0201, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.05708709441327153, | |
| "grad_norm": 0.8038643002510071, | |
| "learning_rate": 2.485707488941467e-07, | |
| "loss": 2.113, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.05733105635520859, | |
| "grad_norm": 0.683526873588562, | |
| "learning_rate": 2.48564308681672e-07, | |
| "loss": 2.2006, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.05757501829714565, | |
| "grad_norm": 1.10641610622406, | |
| "learning_rate": 2.485578654865435e-07, | |
| "loss": 2.1394, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.057818980239082704, | |
| "grad_norm": 0.9854748249053955, | |
| "learning_rate": 2.485514193066886e-07, | |
| "loss": 2.0461, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.05806294218101976, | |
| "grad_norm": 0.9628323316574097, | |
| "learning_rate": 2.485449701400329e-07, | |
| "loss": 2.0795, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.05830690412295682, | |
| "grad_norm": 0.7303637862205505, | |
| "learning_rate": 2.485385179845001e-07, | |
| "loss": 2.1231, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.058550866064893875, | |
| "grad_norm": 1.024084448814392, | |
| "learning_rate": 2.4853206283801187e-07, | |
| "loss": 2.2952, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.058550866064893875, | |
| "eval_loss": 2.1005725860595703, | |
| "eval_runtime": 82.6605, | |
| "eval_samples_per_second": 3.097, | |
| "eval_steps_per_second": 0.774, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.05879482800683093, | |
| "grad_norm": 0.7839118242263794, | |
| "learning_rate": 2.4852560469848794e-07, | |
| "loss": 2.1039, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.05903878994876799, | |
| "grad_norm": 0.8608528971672058, | |
| "learning_rate": 2.4851914356384624e-07, | |
| "loss": 2.0228, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.05928275189070505, | |
| "grad_norm": 0.7604301571846008, | |
| "learning_rate": 2.485126794320027e-07, | |
| "loss": 2.085, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.05952671383264211, | |
| "grad_norm": 0.6948022842407227, | |
| "learning_rate": 2.4850621230087125e-07, | |
| "loss": 2.1667, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.05977067577457917, | |
| "grad_norm": 0.9847801327705383, | |
| "learning_rate": 2.4849974216836405e-07, | |
| "loss": 2.132, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.060014637716516224, | |
| "grad_norm": 0.647404670715332, | |
| "learning_rate": 2.4849326903239115e-07, | |
| "loss": 2.0881, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.06025859965845328, | |
| "grad_norm": 0.8329553604125977, | |
| "learning_rate": 2.4848679289086074e-07, | |
| "loss": 2.1774, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.06050256160039034, | |
| "grad_norm": 0.6948926448822021, | |
| "learning_rate": 2.4848031374167913e-07, | |
| "loss": 2.1431, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.060746523542327395, | |
| "grad_norm": 1.132636547088623, | |
| "learning_rate": 2.484738315827505e-07, | |
| "loss": 2.1713, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.06099048548426445, | |
| "grad_norm": 0.7370600700378418, | |
| "learning_rate": 2.484673464119773e-07, | |
| "loss": 2.1122, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.06099048548426445, | |
| "eval_loss": 2.0957703590393066, | |
| "eval_runtime": 82.5102, | |
| "eval_samples_per_second": 3.103, | |
| "eval_steps_per_second": 0.776, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.061234447426201516, | |
| "grad_norm": 0.9531499147415161, | |
| "learning_rate": 2.484608582272598e-07, | |
| "loss": 2.1948, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.06147840936813857, | |
| "grad_norm": 49.09546661376953, | |
| "learning_rate": 2.4845436702649656e-07, | |
| "loss": 2.3097, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.06172237131007563, | |
| "grad_norm": 0.9064823389053345, | |
| "learning_rate": 2.48447872807584e-07, | |
| "loss": 2.1416, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.06196633325201269, | |
| "grad_norm": 0.8501002788543701, | |
| "learning_rate": 2.484413755684167e-07, | |
| "loss": 2.1266, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.062210295193949744, | |
| "grad_norm": 0.9969101548194885, | |
| "learning_rate": 2.484348753068872e-07, | |
| "loss": 2.1585, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.0624542571358868, | |
| "grad_norm": 0.7964323163032532, | |
| "learning_rate": 2.484283720208861e-07, | |
| "loss": 2.0506, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.06269821907782386, | |
| "grad_norm": 0.8279253840446472, | |
| "learning_rate": 2.4842186570830207e-07, | |
| "loss": 2.0881, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.06294218101976091, | |
| "grad_norm": 0.751754879951477, | |
| "learning_rate": 2.484153563670218e-07, | |
| "loss": 1.93, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.06318614296169797, | |
| "grad_norm": 0.7673382759094238, | |
| "learning_rate": 2.4840884399493006e-07, | |
| "loss": 2.0995, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.06343010490363503, | |
| "grad_norm": 0.6529645919799805, | |
| "learning_rate": 2.4840232858990943e-07, | |
| "loss": 2.0436, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.06343010490363503, | |
| "eval_loss": 2.0920004844665527, | |
| "eval_runtime": 82.4308, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 0.776, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.06367406684557209, | |
| "grad_norm": 1.109803557395935, | |
| "learning_rate": 2.4839581014984084e-07, | |
| "loss": 2.1963, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.06391802878750914, | |
| "grad_norm": 0.8001452088356018, | |
| "learning_rate": 2.48389288672603e-07, | |
| "loss": 2.2763, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.0641619907294462, | |
| "grad_norm": 0.8246157169342041, | |
| "learning_rate": 2.483827641560728e-07, | |
| "loss": 2.0909, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.06440595267138327, | |
| "grad_norm": 1.1484432220458984, | |
| "learning_rate": 2.48376236598125e-07, | |
| "loss": 2.0256, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.06464991461332033, | |
| "grad_norm": 0.7669978737831116, | |
| "learning_rate": 2.4836970599663255e-07, | |
| "loss": 2.119, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.06489387655525738, | |
| "grad_norm": 0.8040069341659546, | |
| "learning_rate": 2.4836317234946626e-07, | |
| "loss": 2.2469, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.06513783849719444, | |
| "grad_norm": 1.2191132307052612, | |
| "learning_rate": 2.48356635654495e-07, | |
| "loss": 2.1695, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.0653818004391315, | |
| "grad_norm": 0.7855357527732849, | |
| "learning_rate": 2.4835009590958575e-07, | |
| "loss": 2.165, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.06562576238106856, | |
| "grad_norm": 0.692392110824585, | |
| "learning_rate": 2.483435531126034e-07, | |
| "loss": 2.1611, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.06586972432300561, | |
| "grad_norm": 0.7394700646400452, | |
| "learning_rate": 2.483370072614108e-07, | |
| "loss": 2.0499, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06586972432300561, | |
| "eval_loss": 2.0882444381713867, | |
| "eval_runtime": 82.4623, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06611368626494267, | |
| "grad_norm": 0.8597064018249512, | |
| "learning_rate": 2.483304583538689e-07, | |
| "loss": 2.1721, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.06635764820687973, | |
| "grad_norm": 0.6948525905609131, | |
| "learning_rate": 2.4832390638783666e-07, | |
| "loss": 2.1164, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.06660161014881678, | |
| "grad_norm": 0.9271346926689148, | |
| "learning_rate": 2.4831735136117095e-07, | |
| "loss": 2.222, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.06684557209075384, | |
| "grad_norm": 0.7124070525169373, | |
| "learning_rate": 2.4831079327172674e-07, | |
| "loss": 2.1953, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.0670895340326909, | |
| "grad_norm": 0.7730213403701782, | |
| "learning_rate": 2.4830423211735686e-07, | |
| "loss": 2.1612, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.06733349597462795, | |
| "grad_norm": 0.7319573760032654, | |
| "learning_rate": 2.482976678959123e-07, | |
| "loss": 2.1091, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.06757745791656501, | |
| "grad_norm": 0.7052650451660156, | |
| "learning_rate": 2.4829110060524197e-07, | |
| "loss": 2.0802, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.06782141985850207, | |
| "grad_norm": 0.6924635171890259, | |
| "learning_rate": 2.482845302431927e-07, | |
| "loss": 2.0242, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.06806538180043913, | |
| "grad_norm": 0.8777127861976624, | |
| "learning_rate": 2.4827795680760933e-07, | |
| "loss": 2.1926, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.06830934374237618, | |
| "grad_norm": 0.6327698230743408, | |
| "learning_rate": 2.482713802963348e-07, | |
| "loss": 2.0477, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.06830934374237618, | |
| "eval_loss": 2.0836799144744873, | |
| "eval_runtime": 82.5559, | |
| "eval_samples_per_second": 3.101, | |
| "eval_steps_per_second": 0.775, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.06855330568431325, | |
| "grad_norm": 0.8194009065628052, | |
| "learning_rate": 2.4826480070720985e-07, | |
| "loss": 2.1662, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.06879726762625031, | |
| "grad_norm": 2.9831783771514893, | |
| "learning_rate": 2.482582180380734e-07, | |
| "loss": 2.0296, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.06904122956818737, | |
| "grad_norm": 0.7184981107711792, | |
| "learning_rate": 2.482516322867622e-07, | |
| "loss": 2.0806, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.06928519151012442, | |
| "grad_norm": 0.9403382539749146, | |
| "learning_rate": 2.48245043451111e-07, | |
| "loss": 2.267, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.06952915345206148, | |
| "grad_norm": 0.7837559580802917, | |
| "learning_rate": 2.482384515289525e-07, | |
| "loss": 2.1853, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.06977311539399854, | |
| "grad_norm": 0.6912328004837036, | |
| "learning_rate": 2.482318565181174e-07, | |
| "loss": 2.1157, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.0700170773359356, | |
| "grad_norm": 0.7220721244812012, | |
| "learning_rate": 2.4822525841643453e-07, | |
| "loss": 2.1604, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.07026103927787265, | |
| "grad_norm": 0.6910116672515869, | |
| "learning_rate": 2.482186572217303e-07, | |
| "loss": 2.3089, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.07050500121980971, | |
| "grad_norm": 0.6678460836410522, | |
| "learning_rate": 2.482120529318294e-07, | |
| "loss": 2.144, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.07074896316174677, | |
| "grad_norm": 0.8362970948219299, | |
| "learning_rate": 2.482054455445545e-07, | |
| "loss": 2.1724, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.07074896316174677, | |
| "eval_loss": 2.079866409301758, | |
| "eval_runtime": 82.5785, | |
| "eval_samples_per_second": 3.1, | |
| "eval_steps_per_second": 0.775, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.07099292510368382, | |
| "grad_norm": 0.6900479793548584, | |
| "learning_rate": 2.481988350577259e-07, | |
| "loss": 2.0982, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.07123688704562088, | |
| "grad_norm": 0.737310528755188, | |
| "learning_rate": 2.481922214691622e-07, | |
| "loss": 2.0816, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.07148084898755794, | |
| "grad_norm": 0.6768659949302673, | |
| "learning_rate": 2.481856047766798e-07, | |
| "loss": 2.1823, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.071724810929495, | |
| "grad_norm": 0.6438968181610107, | |
| "learning_rate": 2.4817898497809304e-07, | |
| "loss": 2.0001, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.07196877287143205, | |
| "grad_norm": 0.9217244982719421, | |
| "learning_rate": 2.4817236207121427e-07, | |
| "loss": 2.1251, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.07221273481336911, | |
| "grad_norm": 0.984214186668396, | |
| "learning_rate": 2.4816573605385374e-07, | |
| "loss": 2.2588, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.07245669675530617, | |
| "grad_norm": 0.6501964330673218, | |
| "learning_rate": 2.481591069238197e-07, | |
| "loss": 1.9671, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.07270065869724324, | |
| "grad_norm": 0.7080546617507935, | |
| "learning_rate": 2.481524746789182e-07, | |
| "loss": 2.1113, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.0729446206391803, | |
| "grad_norm": 0.759996235370636, | |
| "learning_rate": 2.4814583931695343e-07, | |
| "loss": 2.1671, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.07318858258111735, | |
| "grad_norm": 0.7538090348243713, | |
| "learning_rate": 2.4813920083572734e-07, | |
| "loss": 2.0294, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07318858258111735, | |
| "eval_loss": 2.0766212940216064, | |
| "eval_runtime": 82.5747, | |
| "eval_samples_per_second": 3.1, | |
| "eval_steps_per_second": 0.775, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07343254452305441, | |
| "grad_norm": 1.0901976823806763, | |
| "learning_rate": 2.481325592330399e-07, | |
| "loss": 2.1481, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.07367650646499146, | |
| "grad_norm": 0.5969391465187073, | |
| "learning_rate": 2.4812591450668896e-07, | |
| "loss": 2.0585, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.07392046840692852, | |
| "grad_norm": 1.0443135499954224, | |
| "learning_rate": 2.4811926665447034e-07, | |
| "loss": 2.1669, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.07416443034886558, | |
| "grad_norm": 0.7203410863876343, | |
| "learning_rate": 2.481126156741779e-07, | |
| "loss": 2.1098, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.07440839229080264, | |
| "grad_norm": 0.8156256079673767, | |
| "learning_rate": 2.481059615636031e-07, | |
| "loss": 2.0854, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.07465235423273969, | |
| "grad_norm": 0.9032734036445618, | |
| "learning_rate": 2.480993043205356e-07, | |
| "loss": 2.0709, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.07489631617467675, | |
| "grad_norm": 0.7365292310714722, | |
| "learning_rate": 2.4809264394276297e-07, | |
| "loss": 2.0466, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.0751402781166138, | |
| "grad_norm": 1.4419785737991333, | |
| "learning_rate": 2.4808598042807057e-07, | |
| "loss": 2.1844, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.07538424005855086, | |
| "grad_norm": 0.8487168550491333, | |
| "learning_rate": 2.4807931377424167e-07, | |
| "loss": 2.0844, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.07562820200048792, | |
| "grad_norm": 0.6521626114845276, | |
| "learning_rate": 2.4807264397905757e-07, | |
| "loss": 2.1657, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.07562820200048792, | |
| "eval_loss": 2.0734453201293945, | |
| "eval_runtime": 82.6342, | |
| "eval_samples_per_second": 3.098, | |
| "eval_steps_per_second": 0.774, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.07587216394242498, | |
| "grad_norm": 1.0338906049728394, | |
| "learning_rate": 2.480659710402974e-07, | |
| "loss": 2.0817, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.07611612588436203, | |
| "grad_norm": 0.7622800469398499, | |
| "learning_rate": 2.480592949557383e-07, | |
| "loss": 2.2028, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.07636008782629909, | |
| "grad_norm": 0.698598325252533, | |
| "learning_rate": 2.4805261572315513e-07, | |
| "loss": 2.1197, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.07660404976823615, | |
| "grad_norm": 0.9301127195358276, | |
| "learning_rate": 2.480459333403207e-07, | |
| "loss": 2.0719, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.07684801171017322, | |
| "grad_norm": 0.7141032814979553, | |
| "learning_rate": 2.480392478050059e-07, | |
| "loss": 1.9979, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.07709197365211028, | |
| "grad_norm": 0.8011495471000671, | |
| "learning_rate": 2.4803255911497927e-07, | |
| "loss": 2.2752, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.07733593559404733, | |
| "grad_norm": 0.6986822485923767, | |
| "learning_rate": 2.4802586726800744e-07, | |
| "loss": 2.1448, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.07757989753598439, | |
| "grad_norm": 0.7055565118789673, | |
| "learning_rate": 2.4801917226185476e-07, | |
| "loss": 2.1929, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.07782385947792145, | |
| "grad_norm": 0.6355963945388794, | |
| "learning_rate": 2.480124740942837e-07, | |
| "loss": 2.0868, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.0780678214198585, | |
| "grad_norm": 0.6996073126792908, | |
| "learning_rate": 2.480057727630543e-07, | |
| "loss": 2.1996, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.0780678214198585, | |
| "eval_loss": 2.070326805114746, | |
| "eval_runtime": 82.8377, | |
| "eval_samples_per_second": 3.09, | |
| "eval_steps_per_second": 0.773, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.07831178336179556, | |
| "grad_norm": 0.9807422757148743, | |
| "learning_rate": 2.479990682659248e-07, | |
| "loss": 2.1283, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.07855574530373262, | |
| "grad_norm": 0.660064697265625, | |
| "learning_rate": 2.4799236060065104e-07, | |
| "loss": 2.1273, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.07879970724566968, | |
| "grad_norm": 0.6279289722442627, | |
| "learning_rate": 2.47985649764987e-07, | |
| "loss": 2.0789, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.07904366918760673, | |
| "grad_norm": 0.8380366563796997, | |
| "learning_rate": 2.4797893575668437e-07, | |
| "loss": 2.0606, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.07928763112954379, | |
| "grad_norm": 0.6436744928359985, | |
| "learning_rate": 2.4797221857349267e-07, | |
| "loss": 2.1684, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.07953159307148085, | |
| "grad_norm": 0.9657268524169922, | |
| "learning_rate": 2.4796549821315954e-07, | |
| "loss": 2.1487, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.0797755550134179, | |
| "grad_norm": 0.7531183362007141, | |
| "learning_rate": 2.479587746734302e-07, | |
| "loss": 2.2133, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.08001951695535496, | |
| "grad_norm": 0.6298527121543884, | |
| "learning_rate": 2.4795204795204794e-07, | |
| "loss": 2.1123, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.08026347889729202, | |
| "grad_norm": 1.2394922971725464, | |
| "learning_rate": 2.479453180467538e-07, | |
| "loss": 2.1458, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.08050744083922907, | |
| "grad_norm": 0.7295296788215637, | |
| "learning_rate": 2.479385849552867e-07, | |
| "loss": 2.127, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.08050744083922907, | |
| "eval_loss": 2.0674209594726562, | |
| "eval_runtime": 82.4919, | |
| "eval_samples_per_second": 3.103, | |
| "eval_steps_per_second": 0.776, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.08075140278116613, | |
| "grad_norm": 0.7010200619697571, | |
| "learning_rate": 2.479318486753834e-07, | |
| "loss": 1.9559, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.0809953647231032, | |
| "grad_norm": 0.9205324053764343, | |
| "learning_rate": 2.479251092047787e-07, | |
| "loss": 2.3111, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.08123932666504026, | |
| "grad_norm": 0.7850201725959778, | |
| "learning_rate": 2.4791836654120494e-07, | |
| "loss": 2.0908, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.08148328860697732, | |
| "grad_norm": 0.8352589011192322, | |
| "learning_rate": 2.4791162068239256e-07, | |
| "loss": 2.0713, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.08172725054891437, | |
| "grad_norm": 1.1444090604782104, | |
| "learning_rate": 2.4790487162606977e-07, | |
| "loss": 2.2373, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.08197121249085143, | |
| "grad_norm": 0.6652919054031372, | |
| "learning_rate": 2.478981193699626e-07, | |
| "loss": 2.0393, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.08221517443278849, | |
| "grad_norm": 1.2522311210632324, | |
| "learning_rate": 2.478913639117949e-07, | |
| "loss": 2.1158, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.08245913637472554, | |
| "grad_norm": 0.6700438261032104, | |
| "learning_rate": 2.478846052492885e-07, | |
| "loss": 2.1475, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.0827030983166626, | |
| "grad_norm": 0.594170868396759, | |
| "learning_rate": 2.478778433801629e-07, | |
| "loss": 2.2158, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.08294706025859966, | |
| "grad_norm": 0.7548016905784607, | |
| "learning_rate": 2.478710783021355e-07, | |
| "loss": 2.1094, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.08294706025859966, | |
| "eval_loss": 2.0647270679473877, | |
| "eval_runtime": 82.4405, | |
| "eval_samples_per_second": 3.105, | |
| "eval_steps_per_second": 0.776, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.08319102220053672, | |
| "grad_norm": 0.6635969877243042, | |
| "learning_rate": 2.4786431001292156e-07, | |
| "loss": 2.0065, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.08343498414247377, | |
| "grad_norm": 0.5917369723320007, | |
| "learning_rate": 2.478575385102342e-07, | |
| "loss": 2.1334, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.08367894608441083, | |
| "grad_norm": 0.7114012241363525, | |
| "learning_rate": 2.4785076379178427e-07, | |
| "loss": 2.2898, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.08392290802634789, | |
| "grad_norm": 0.6210088729858398, | |
| "learning_rate": 2.478439858552805e-07, | |
| "loss": 2.1155, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.08416686996828494, | |
| "grad_norm": 3.0671420097351074, | |
| "learning_rate": 2.4783720469842943e-07, | |
| "loss": 2.2391, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.084410831910222, | |
| "grad_norm": 0.6841104030609131, | |
| "learning_rate": 2.4783042031893544e-07, | |
| "loss": 1.9859, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.08465479385215906, | |
| "grad_norm": 0.8301260471343994, | |
| "learning_rate": 2.478236327145007e-07, | |
| "loss": 2.1063, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.08489875579409611, | |
| "grad_norm": 0.688525378704071, | |
| "learning_rate": 2.4781684188282526e-07, | |
| "loss": 2.0468, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.08514271773603319, | |
| "grad_norm": 0.7203090190887451, | |
| "learning_rate": 2.4781004782160693e-07, | |
| "loss": 2.1424, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.08538667967797024, | |
| "grad_norm": 0.6987703442573547, | |
| "learning_rate": 2.478032505285412e-07, | |
| "loss": 2.0616, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.08538667967797024, | |
| "eval_loss": 2.0618653297424316, | |
| "eval_runtime": 82.4978, | |
| "eval_samples_per_second": 3.103, | |
| "eval_steps_per_second": 0.776, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0856306416199073, | |
| "grad_norm": 0.8080345392227173, | |
| "learning_rate": 2.4779645000132166e-07, | |
| "loss": 2.1916, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.08587460356184436, | |
| "grad_norm": 0.6830054521560669, | |
| "learning_rate": 2.477896462376395e-07, | |
| "loss": 2.0308, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.08611856550378141, | |
| "grad_norm": 0.9455615282058716, | |
| "learning_rate": 2.4778283923518366e-07, | |
| "loss": 2.2529, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.08636252744571847, | |
| "grad_norm": 1.0438265800476074, | |
| "learning_rate": 2.477760289916411e-07, | |
| "loss": 2.1359, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.08660648938765553, | |
| "grad_norm": 0.7835482358932495, | |
| "learning_rate": 2.477692155046964e-07, | |
| "loss": 2.2987, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.08685045132959258, | |
| "grad_norm": 1.003544569015503, | |
| "learning_rate": 2.47762398772032e-07, | |
| "loss": 2.2476, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.08709441327152964, | |
| "grad_norm": 0.7639005780220032, | |
| "learning_rate": 2.4775557879132803e-07, | |
| "loss": 2.0089, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.0873383752134667, | |
| "grad_norm": 0.6183759570121765, | |
| "learning_rate": 2.4774875556026265e-07, | |
| "loss": 2.0263, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.08758233715540376, | |
| "grad_norm": 0.9802903532981873, | |
| "learning_rate": 2.477419290765115e-07, | |
| "loss": 2.0125, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.08782629909734081, | |
| "grad_norm": 0.7445787787437439, | |
| "learning_rate": 2.4773509933774833e-07, | |
| "loss": 1.9287, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.08782629909734081, | |
| "eval_loss": 2.060185194015503, | |
| "eval_runtime": 82.5332, | |
| "eval_samples_per_second": 3.102, | |
| "eval_steps_per_second": 0.775, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.08807026103927787, | |
| "grad_norm": 0.6245632767677307, | |
| "learning_rate": 2.4772826634164435e-07, | |
| "loss": 1.9036, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.08831422298121493, | |
| "grad_norm": 0.822571873664856, | |
| "learning_rate": 2.4772143008586876e-07, | |
| "loss": 2.1521, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.08855818492315198, | |
| "grad_norm": 0.7154176235198975, | |
| "learning_rate": 2.4771459056808844e-07, | |
| "loss": 2.1507, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.08880214686508904, | |
| "grad_norm": 0.7716903686523438, | |
| "learning_rate": 2.477077477859681e-07, | |
| "loss": 2.0469, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.0890461088070261, | |
| "grad_norm": 0.8829085230827332, | |
| "learning_rate": 2.4770090173717014e-07, | |
| "loss": 2.0799, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.08929007074896317, | |
| "grad_norm": 0.6315395832061768, | |
| "learning_rate": 2.4769405241935484e-07, | |
| "loss": 2.0283, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.08953403269090023, | |
| "grad_norm": 0.6415708661079407, | |
| "learning_rate": 2.476871998301802e-07, | |
| "loss": 2.0164, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.08977799463283728, | |
| "grad_norm": 0.6306458115577698, | |
| "learning_rate": 2.476803439673019e-07, | |
| "loss": 2.1085, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.09002195657477434, | |
| "grad_norm": 0.8052152991294861, | |
| "learning_rate": 2.476734848283735e-07, | |
| "loss": 2.1536, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.0902659185167114, | |
| "grad_norm": 0.7443030476570129, | |
| "learning_rate": 2.476666224110462e-07, | |
| "loss": 2.2221, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0902659185167114, | |
| "eval_loss": 2.057771682739258, | |
| "eval_runtime": 82.4526, | |
| "eval_samples_per_second": 3.105, | |
| "eval_steps_per_second": 0.776, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.09050988045864845, | |
| "grad_norm": 0.717291533946991, | |
| "learning_rate": 2.476597567129691e-07, | |
| "loss": 1.9786, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.09075384240058551, | |
| "grad_norm": 0.7804278135299683, | |
| "learning_rate": 2.4765288773178894e-07, | |
| "loss": 2.1336, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.09099780434252257, | |
| "grad_norm": 0.5851336121559143, | |
| "learning_rate": 2.476460154651503e-07, | |
| "loss": 2.0796, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.09124176628445962, | |
| "grad_norm": 0.7063292860984802, | |
| "learning_rate": 2.4763913991069527e-07, | |
| "loss": 2.1067, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.09148572822639668, | |
| "grad_norm": 0.6482515335083008, | |
| "learning_rate": 2.4763226106606407e-07, | |
| "loss": 2.0642, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.09172969016833374, | |
| "grad_norm": 1.2077556848526, | |
| "learning_rate": 2.476253789288943e-07, | |
| "loss": 1.9154, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.0919736521102708, | |
| "grad_norm": 0.8584334254264832, | |
| "learning_rate": 2.4761849349682154e-07, | |
| "loss": 1.9837, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.09221761405220785, | |
| "grad_norm": 0.5971949696540833, | |
| "learning_rate": 2.4761160476747895e-07, | |
| "loss": 1.9219, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.09246157599414491, | |
| "grad_norm": 1.1772984266281128, | |
| "learning_rate": 2.4760471273849755e-07, | |
| "loss": 2.1976, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.09270553793608197, | |
| "grad_norm": 1.0061829090118408, | |
| "learning_rate": 2.47597817407506e-07, | |
| "loss": 2.1878, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.09270553793608197, | |
| "eval_loss": 2.0550973415374756, | |
| "eval_runtime": 82.389, | |
| "eval_samples_per_second": 3.107, | |
| "eval_steps_per_second": 0.777, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.09294949987801902, | |
| "grad_norm": 89.18840789794922, | |
| "learning_rate": 2.475909187721307e-07, | |
| "loss": 2.1648, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.09319346181995608, | |
| "grad_norm": 1.159109115600586, | |
| "learning_rate": 2.4758401682999573e-07, | |
| "loss": 2.1707, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.09343742376189315, | |
| "grad_norm": 0.7915341258049011, | |
| "learning_rate": 2.475771115787231e-07, | |
| "loss": 2.11, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.09368138570383021, | |
| "grad_norm": 0.691881537437439, | |
| "learning_rate": 2.475702030159322e-07, | |
| "loss": 1.9882, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.09392534764576727, | |
| "grad_norm": 1.018052577972412, | |
| "learning_rate": 2.475632911392405e-07, | |
| "loss": 2.0549, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.09416930958770432, | |
| "grad_norm": 0.9836434721946716, | |
| "learning_rate": 2.475563759462629e-07, | |
| "loss": 2.177, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.09441327152964138, | |
| "grad_norm": 0.7274807691574097, | |
| "learning_rate": 2.475494574346122e-07, | |
| "loss": 2.1897, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.09465723347157844, | |
| "grad_norm": 0.853111982345581, | |
| "learning_rate": 2.475425356018988e-07, | |
| "loss": 2.0191, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.0949011954135155, | |
| "grad_norm": 0.7096747159957886, | |
| "learning_rate": 2.475356104457307e-07, | |
| "loss": 2.0043, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.09514515735545255, | |
| "grad_norm": 0.59073406457901, | |
| "learning_rate": 2.4752868196371393e-07, | |
| "loss": 2.1815, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.09514515735545255, | |
| "eval_loss": 2.0534627437591553, | |
| "eval_runtime": 82.449, | |
| "eval_samples_per_second": 3.105, | |
| "eval_steps_per_second": 0.776, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.09538911929738961, | |
| "grad_norm": 1.0871241092681885, | |
| "learning_rate": 2.47521750153452e-07, | |
| "loss": 2.3593, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.09563308123932666, | |
| "grad_norm": 0.7196955680847168, | |
| "learning_rate": 2.4751481501254606e-07, | |
| "loss": 2.1606, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.09587704318126372, | |
| "grad_norm": 0.6455821394920349, | |
| "learning_rate": 2.4750787653859505e-07, | |
| "loss": 2.1609, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.09612100512320078, | |
| "grad_norm": 0.8399761915206909, | |
| "learning_rate": 2.475009347291956e-07, | |
| "loss": 2.2308, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.09636496706513784, | |
| "grad_norm": 0.7365739941596985, | |
| "learning_rate": 2.47493989581942e-07, | |
| "loss": 2.1575, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.09660892900707489, | |
| "grad_norm": 0.7569345235824585, | |
| "learning_rate": 2.4748704109442635e-07, | |
| "loss": 2.1495, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.09685289094901195, | |
| "grad_norm": 0.6441726088523865, | |
| "learning_rate": 2.4748008926423817e-07, | |
| "loss": 2.0264, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.097096852890949, | |
| "grad_norm": 0.6600158214569092, | |
| "learning_rate": 2.474731340889649e-07, | |
| "loss": 2.1404, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.09734081483288606, | |
| "grad_norm": 0.6377738118171692, | |
| "learning_rate": 2.4746617556619163e-07, | |
| "loss": 2.0164, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.09758477677482313, | |
| "grad_norm": 0.7105040550231934, | |
| "learning_rate": 2.4745921369350094e-07, | |
| "loss": 2.1193, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.09758477677482313, | |
| "eval_loss": 2.051236391067505, | |
| "eval_runtime": 82.3788, | |
| "eval_samples_per_second": 3.108, | |
| "eval_steps_per_second": 0.777, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.09782873871676019, | |
| "grad_norm": 0.9596676230430603, | |
| "learning_rate": 2.474522484684733e-07, | |
| "loss": 2.1332, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.09807270065869725, | |
| "grad_norm": 0.7559407353401184, | |
| "learning_rate": 2.4744527988868673e-07, | |
| "loss": 2.0184, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.0983166626006343, | |
| "grad_norm": 0.9651991128921509, | |
| "learning_rate": 2.4743830795171695e-07, | |
| "loss": 2.0775, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.09856062454257136, | |
| "grad_norm": 0.6360098719596863, | |
| "learning_rate": 2.474313326551373e-07, | |
| "loss": 1.9806, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.09880458648450842, | |
| "grad_norm": 0.8414661288261414, | |
| "learning_rate": 2.474243539965189e-07, | |
| "loss": 2.0598, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.09904854842644548, | |
| "grad_norm": 0.593665361404419, | |
| "learning_rate": 2.4741737197343045e-07, | |
| "loss": 2.1291, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.09929251036838253, | |
| "grad_norm": 0.6151170134544373, | |
| "learning_rate": 2.4741038658343824e-07, | |
| "loss": 2.0288, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.09953647231031959, | |
| "grad_norm": 0.8486893773078918, | |
| "learning_rate": 2.474033978241063e-07, | |
| "loss": 2.1914, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.09978043425225665, | |
| "grad_norm": 0.5917589664459229, | |
| "learning_rate": 2.4739640569299634e-07, | |
| "loss": 1.997, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.1000243961941937, | |
| "grad_norm": 0.6412562727928162, | |
| "learning_rate": 2.4738941018766753e-07, | |
| "loss": 2.1638, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1000243961941937, | |
| "eval_loss": 2.0491929054260254, | |
| "eval_runtime": 82.4035, | |
| "eval_samples_per_second": 3.107, | |
| "eval_steps_per_second": 0.777, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.10026835813613076, | |
| "grad_norm": 0.8086695075035095, | |
| "learning_rate": 2.47382411305677e-07, | |
| "loss": 2.0884, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.10051232007806782, | |
| "grad_norm": 0.7479122281074524, | |
| "learning_rate": 2.4737540904457914e-07, | |
| "loss": 2.1612, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.10075628202000488, | |
| "grad_norm": 0.6791818141937256, | |
| "learning_rate": 2.4736840340192635e-07, | |
| "loss": 2.0849, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.10100024396194193, | |
| "grad_norm": 0.7554202675819397, | |
| "learning_rate": 2.4736139437526835e-07, | |
| "loss": 2.0364, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.10124420590387899, | |
| "grad_norm": 1.182301640510559, | |
| "learning_rate": 2.4735438196215273e-07, | |
| "loss": 2.0642, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.10148816784581605, | |
| "grad_norm": 0.7850328087806702, | |
| "learning_rate": 2.4734736616012457e-07, | |
| "loss": 2.0743, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.10173212978775312, | |
| "grad_norm": 0.8438737988471985, | |
| "learning_rate": 2.4734034696672667e-07, | |
| "loss": 2.1803, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.10197609172969017, | |
| "grad_norm": 5.938900947570801, | |
| "learning_rate": 2.473333243794993e-07, | |
| "loss": 2.1007, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.10222005367162723, | |
| "grad_norm": 0.7063493728637695, | |
| "learning_rate": 2.4732629839598054e-07, | |
| "loss": 2.0102, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.10246401561356429, | |
| "grad_norm": 0.7309712171554565, | |
| "learning_rate": 2.4731926901370596e-07, | |
| "loss": 2.0201, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.10246401561356429, | |
| "eval_loss": 2.047513484954834, | |
| "eval_runtime": 82.2828, | |
| "eval_samples_per_second": 3.111, | |
| "eval_steps_per_second": 0.778, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.10270797755550135, | |
| "grad_norm": 0.6622780561447144, | |
| "learning_rate": 2.473122362302088e-07, | |
| "loss": 2.1224, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.1029519394974384, | |
| "grad_norm": 0.5755351185798645, | |
| "learning_rate": 2.4730520004301997e-07, | |
| "loss": 2.0315, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.10319590143937546, | |
| "grad_norm": 0.6571397185325623, | |
| "learning_rate": 2.472981604496678e-07, | |
| "loss": 2.1613, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.10343986338131252, | |
| "grad_norm": 0.6562328338623047, | |
| "learning_rate": 2.472911174476784e-07, | |
| "loss": 2.1833, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.10368382532324957, | |
| "grad_norm": 0.6494048237800598, | |
| "learning_rate": 2.4728407103457554e-07, | |
| "loss": 2.0679, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.10392778726518663, | |
| "grad_norm": 1.0681666135787964, | |
| "learning_rate": 2.472770212078803e-07, | |
| "loss": 2.1321, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.10417174920712369, | |
| "grad_norm": 0.7042210698127747, | |
| "learning_rate": 2.4726996796511157e-07, | |
| "loss": 2.1927, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.10441571114906074, | |
| "grad_norm": 0.6480265259742737, | |
| "learning_rate": 2.4726291130378586e-07, | |
| "loss": 2.3393, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.1046596730909978, | |
| "grad_norm": 0.6662938594818115, | |
| "learning_rate": 2.472558512214172e-07, | |
| "loss": 2.0135, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.10490363503293486, | |
| "grad_norm": 0.6291259527206421, | |
| "learning_rate": 2.4724878771551725e-07, | |
| "loss": 2.1968, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.10490363503293486, | |
| "eval_loss": 2.0453972816467285, | |
| "eval_runtime": 82.4758, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.10514759697487192, | |
| "grad_norm": 3.447300910949707, | |
| "learning_rate": 2.4724172078359513e-07, | |
| "loss": 2.0465, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.10539155891680897, | |
| "grad_norm": 1.3743339776992798, | |
| "learning_rate": 2.4723465042315776e-07, | |
| "loss": 2.0544, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.10563552085874603, | |
| "grad_norm": 0.8059394359588623, | |
| "learning_rate": 2.4722757663170946e-07, | |
| "loss": 2.1522, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.1058794828006831, | |
| "grad_norm": 0.6414276361465454, | |
| "learning_rate": 2.4722049940675223e-07, | |
| "loss": 2.1575, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.10612344474262016, | |
| "grad_norm": 1.0368949174880981, | |
| "learning_rate": 2.472134187457856e-07, | |
| "loss": 1.9911, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.10636740668455721, | |
| "grad_norm": 0.6836427450180054, | |
| "learning_rate": 2.4720633464630656e-07, | |
| "loss": 2.074, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.10661136862649427, | |
| "grad_norm": 0.6823791861534119, | |
| "learning_rate": 2.4719924710581e-07, | |
| "loss": 2.0613, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.10685533056843133, | |
| "grad_norm": 0.6797323822975159, | |
| "learning_rate": 2.4719215612178795e-07, | |
| "loss": 2.0521, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.10709929251036839, | |
| "grad_norm": 0.6761137247085571, | |
| "learning_rate": 2.471850616917303e-07, | |
| "loss": 2.0434, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.10734325445230544, | |
| "grad_norm": 0.6031590104103088, | |
| "learning_rate": 2.4717796381312446e-07, | |
| "loss": 2.1448, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.10734325445230544, | |
| "eval_loss": 2.0439488887786865, | |
| "eval_runtime": 82.3918, | |
| "eval_samples_per_second": 3.107, | |
| "eval_steps_per_second": 0.777, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.1075872163942425, | |
| "grad_norm": 0.7096999287605286, | |
| "learning_rate": 2.471708624834553e-07, | |
| "loss": 2.236, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.10783117833617956, | |
| "grad_norm": 0.7802643775939941, | |
| "learning_rate": 2.471637577002053e-07, | |
| "loss": 2.0246, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.10807514027811661, | |
| "grad_norm": 0.8068674802780151, | |
| "learning_rate": 2.471566494608545e-07, | |
| "loss": 2.1481, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.10831910222005367, | |
| "grad_norm": 0.792898416519165, | |
| "learning_rate": 2.4714953776288044e-07, | |
| "loss": 1.9699, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.10856306416199073, | |
| "grad_norm": 0.6124379634857178, | |
| "learning_rate": 2.471424226037583e-07, | |
| "loss": 2.1199, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.10880702610392778, | |
| "grad_norm": 1.238782525062561, | |
| "learning_rate": 2.471353039809606e-07, | |
| "loss": 2.1193, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.10905098804586484, | |
| "grad_norm": 0.5847099423408508, | |
| "learning_rate": 2.471281818919577e-07, | |
| "loss": 2.2197, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.1092949499878019, | |
| "grad_norm": 0.7409020066261292, | |
| "learning_rate": 2.4712105633421726e-07, | |
| "loss": 2.1878, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.10953891192973896, | |
| "grad_norm": 0.8623873591423035, | |
| "learning_rate": 2.471139273052045e-07, | |
| "loss": 2.102, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.10978287387167601, | |
| "grad_norm": 0.9536486864089966, | |
| "learning_rate": 2.471067948023822e-07, | |
| "loss": 2.0028, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.10978287387167601, | |
| "eval_loss": 2.042297601699829, | |
| "eval_runtime": 82.3539, | |
| "eval_samples_per_second": 3.109, | |
| "eval_steps_per_second": 0.777, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.11002683581361308, | |
| "grad_norm": 2.532158136367798, | |
| "learning_rate": 2.4709965882321085e-07, | |
| "loss": 2.1556, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.11027079775555014, | |
| "grad_norm": 0.7216641306877136, | |
| "learning_rate": 2.470925193651481e-07, | |
| "loss": 2.2463, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.1105147596974872, | |
| "grad_norm": 0.7760343551635742, | |
| "learning_rate": 2.470853764256495e-07, | |
| "loss": 2.102, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.11075872163942425, | |
| "grad_norm": 0.8911886811256409, | |
| "learning_rate": 2.4707823000216777e-07, | |
| "loss": 2.3057, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.11100268358136131, | |
| "grad_norm": 0.7775437235832214, | |
| "learning_rate": 2.470710800921534e-07, | |
| "loss": 2.0745, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.11124664552329837, | |
| "grad_norm": 0.6954789757728577, | |
| "learning_rate": 2.470639266930543e-07, | |
| "loss": 1.9749, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.11149060746523543, | |
| "grad_norm": 0.5103456974029541, | |
| "learning_rate": 2.4705676980231577e-07, | |
| "loss": 2.053, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.11173456940717248, | |
| "grad_norm": 0.9708994626998901, | |
| "learning_rate": 2.4704960941738093e-07, | |
| "loss": 2.0905, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.11197853134910954, | |
| "grad_norm": 0.7622889876365662, | |
| "learning_rate": 2.4704244553569005e-07, | |
| "loss": 2.2957, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.1122224932910466, | |
| "grad_norm": 0.7484825849533081, | |
| "learning_rate": 2.470352781546811e-07, | |
| "loss": 2.0953, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1122224932910466, | |
| "eval_loss": 2.0406336784362793, | |
| "eval_runtime": 82.4811, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.11246645523298365, | |
| "grad_norm": 0.9012355208396912, | |
| "learning_rate": 2.4702810727178955e-07, | |
| "loss": 2.092, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.11271041717492071, | |
| "grad_norm": 0.9759025573730469, | |
| "learning_rate": 2.470209328844483e-07, | |
| "loss": 2.2126, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.11295437911685777, | |
| "grad_norm": 0.6701992154121399, | |
| "learning_rate": 2.470137549900877e-07, | |
| "loss": 2.0805, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.11319834105879482, | |
| "grad_norm": 0.9370368123054504, | |
| "learning_rate": 2.4700657358613573e-07, | |
| "loss": 2.1702, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.11344230300073188, | |
| "grad_norm": 0.7537663578987122, | |
| "learning_rate": 2.4699938867001765e-07, | |
| "loss": 2.1348, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.11368626494266894, | |
| "grad_norm": 28.197704315185547, | |
| "learning_rate": 2.469922002391564e-07, | |
| "loss": 2.1985, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.113930226884606, | |
| "grad_norm": 0.7190595269203186, | |
| "learning_rate": 2.4698500829097235e-07, | |
| "loss": 2.065, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.11417418882654307, | |
| "grad_norm": 0.6859692931175232, | |
| "learning_rate": 2.469778128228832e-07, | |
| "loss": 2.2628, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.11441815076848012, | |
| "grad_norm": 1.0769257545471191, | |
| "learning_rate": 2.4697061383230436e-07, | |
| "loss": 2.099, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.11466211271041718, | |
| "grad_norm": 0.7370059490203857, | |
| "learning_rate": 2.469634113166485e-07, | |
| "loss": 2.0264, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.11466211271041718, | |
| "eval_loss": 2.03948712348938, | |
| "eval_runtime": 82.5195, | |
| "eval_samples_per_second": 3.102, | |
| "eval_steps_per_second": 0.776, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.11490607465235424, | |
| "grad_norm": 1.2912418842315674, | |
| "learning_rate": 2.4695620527332587e-07, | |
| "loss": 1.9958, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.1151500365942913, | |
| "grad_norm": 0.7174116373062134, | |
| "learning_rate": 2.4694899569974417e-07, | |
| "loss": 2.1067, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.11539399853622835, | |
| "grad_norm": 0.8682342171669006, | |
| "learning_rate": 2.4694178259330843e-07, | |
| "loss": 2.0431, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.11563796047816541, | |
| "grad_norm": 0.7805954217910767, | |
| "learning_rate": 2.4693456595142144e-07, | |
| "loss": 1.936, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.11588192242010247, | |
| "grad_norm": 0.6474030017852783, | |
| "learning_rate": 2.46927345771483e-07, | |
| "loss": 1.9181, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.11612588436203952, | |
| "grad_norm": 1.033121943473816, | |
| "learning_rate": 2.4692012205089086e-07, | |
| "loss": 2.1393, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.11636984630397658, | |
| "grad_norm": 0.6728079915046692, | |
| "learning_rate": 2.469128947870398e-07, | |
| "loss": 2.0636, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.11661380824591364, | |
| "grad_norm": 0.7054829001426697, | |
| "learning_rate": 2.4690566397732225e-07, | |
| "loss": 2.0255, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.11685777018785069, | |
| "grad_norm": 0.6209118962287903, | |
| "learning_rate": 2.4689842961912813e-07, | |
| "loss": 2.2017, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.11710173212978775, | |
| "grad_norm": 0.8064582943916321, | |
| "learning_rate": 2.4689119170984457e-07, | |
| "loss": 2.0825, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.11710173212978775, | |
| "eval_loss": 2.0377635955810547, | |
| "eval_runtime": 82.47, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.11734569407172481, | |
| "grad_norm": 0.6298145651817322, | |
| "learning_rate": 2.4688395024685635e-07, | |
| "loss": 1.9996, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.11758965601366186, | |
| "grad_norm": 0.6311830282211304, | |
| "learning_rate": 2.4687670522754556e-07, | |
| "loss": 2.025, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.11783361795559892, | |
| "grad_norm": 0.7297334671020508, | |
| "learning_rate": 2.468694566492918e-07, | |
| "loss": 2.2315, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.11807757989753598, | |
| "grad_norm": 0.8511336445808411, | |
| "learning_rate": 2.468622045094721e-07, | |
| "loss": 2.0218, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.11832154183947305, | |
| "grad_norm": 1.0287209749221802, | |
| "learning_rate": 2.4685494880546075e-07, | |
| "loss": 2.0803, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.1185655037814101, | |
| "grad_norm": 1.0278816223144531, | |
| "learning_rate": 2.468476895346296e-07, | |
| "loss": 2.0773, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.11880946572334716, | |
| "grad_norm": 0.6326528787612915, | |
| "learning_rate": 2.468404266943481e-07, | |
| "loss": 1.9748, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.11905342766528422, | |
| "grad_norm": 0.7467640042304993, | |
| "learning_rate": 2.4683316028198264e-07, | |
| "loss": 2.1406, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.11929738960722128, | |
| "grad_norm": 2.9902219772338867, | |
| "learning_rate": 2.4682589029489734e-07, | |
| "loss": 2.2492, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.11954135154915833, | |
| "grad_norm": 0.7884283661842346, | |
| "learning_rate": 2.468186167304538e-07, | |
| "loss": 2.125, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.11954135154915833, | |
| "eval_loss": 2.0366787910461426, | |
| "eval_runtime": 82.4749, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.11978531349109539, | |
| "grad_norm": 0.8623523116111755, | |
| "learning_rate": 2.4681133958601076e-07, | |
| "loss": 2.0296, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.12002927543303245, | |
| "grad_norm": 1.98282790184021, | |
| "learning_rate": 2.4680405885892456e-07, | |
| "loss": 1.9251, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.1202732373749695, | |
| "grad_norm": 0.6307066082954407, | |
| "learning_rate": 2.4679677454654887e-07, | |
| "loss": 2.0975, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.12051719931690656, | |
| "grad_norm": 0.6835567951202393, | |
| "learning_rate": 2.4678948664623467e-07, | |
| "loss": 2.0954, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.12076116125884362, | |
| "grad_norm": 0.6431268453598022, | |
| "learning_rate": 2.467821951553305e-07, | |
| "loss": 1.9843, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.12100512320078068, | |
| "grad_norm": 0.6810166835784912, | |
| "learning_rate": 2.467749000711822e-07, | |
| "loss": 1.9317, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.12124908514271773, | |
| "grad_norm": 0.6672168970108032, | |
| "learning_rate": 2.467676013911329e-07, | |
| "loss": 1.8837, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.12149304708465479, | |
| "grad_norm": 0.6653776168823242, | |
| "learning_rate": 2.467602991125233e-07, | |
| "loss": 2.1792, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.12173700902659185, | |
| "grad_norm": 0.7671879529953003, | |
| "learning_rate": 2.467529932326913e-07, | |
| "loss": 2.0863, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.1219809709685289, | |
| "grad_norm": 0.7360325455665588, | |
| "learning_rate": 2.467456837489723e-07, | |
| "loss": 2.2625, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1219809709685289, | |
| "eval_loss": 2.0352420806884766, | |
| "eval_runtime": 82.4773, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.12222493291046596, | |
| "grad_norm": 1.1433802843093872, | |
| "learning_rate": 2.46738370658699e-07, | |
| "loss": 2.0469, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.12246889485240303, | |
| "grad_norm": 0.774438738822937, | |
| "learning_rate": 2.467310539592016e-07, | |
| "loss": 2.159, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.12271285679434009, | |
| "grad_norm": 0.5934178233146667, | |
| "learning_rate": 2.467237336478074e-07, | |
| "loss": 2.1611, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.12295681873627715, | |
| "grad_norm": 0.76198810338974, | |
| "learning_rate": 2.4671640972184124e-07, | |
| "loss": 2.065, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.1232007806782142, | |
| "grad_norm": 2.332010507583618, | |
| "learning_rate": 2.4670908217862535e-07, | |
| "loss": 2.2185, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.12344474262015126, | |
| "grad_norm": 0.590925395488739, | |
| "learning_rate": 2.4670175101547916e-07, | |
| "loss": 1.9858, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.12368870456208832, | |
| "grad_norm": 0.6301215887069702, | |
| "learning_rate": 2.466944162297197e-07, | |
| "loss": 2.0693, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.12393266650402537, | |
| "grad_norm": 0.6086316704750061, | |
| "learning_rate": 2.466870778186611e-07, | |
| "loss": 2.1286, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.12417662844596243, | |
| "grad_norm": 0.5990573167800903, | |
| "learning_rate": 2.466797357796149e-07, | |
| "loss": 1.9842, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.12442059038789949, | |
| "grad_norm": 0.7765055894851685, | |
| "learning_rate": 2.466723901098901e-07, | |
| "loss": 2.0005, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.12442059038789949, | |
| "eval_loss": 2.033975124359131, | |
| "eval_runtime": 82.4194, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 0.777, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.12466455232983654, | |
| "grad_norm": 0.7282994389533997, | |
| "learning_rate": 2.466650408067929e-07, | |
| "loss": 2.0578, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.1249085142717736, | |
| "grad_norm": 0.6947245597839355, | |
| "learning_rate": 2.4665768786762685e-07, | |
| "loss": 2.1235, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.12515247621371067, | |
| "grad_norm": 1.0526394844055176, | |
| "learning_rate": 2.4665033128969293e-07, | |
| "loss": 2.0394, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.12539643815564772, | |
| "grad_norm": 0.7054687142372131, | |
| "learning_rate": 2.466429710702893e-07, | |
| "loss": 2.1627, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.1256404000975848, | |
| "grad_norm": 1.6336970329284668, | |
| "learning_rate": 2.466356072067116e-07, | |
| "loss": 1.9868, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.12588436203952183, | |
| "grad_norm": 0.5906395316123962, | |
| "learning_rate": 2.4662823969625266e-07, | |
| "loss": 2.0956, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.1261283239814589, | |
| "grad_norm": 0.6716820001602173, | |
| "learning_rate": 2.466208685362027e-07, | |
| "loss": 1.7824, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.12637228592339594, | |
| "grad_norm": 0.6512242555618286, | |
| "learning_rate": 2.4661349372384934e-07, | |
| "loss": 1.9613, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.12661624786533301, | |
| "grad_norm": 0.6093871593475342, | |
| "learning_rate": 2.466061152564773e-07, | |
| "loss": 2.133, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.12686020980727006, | |
| "grad_norm": 0.6348795890808105, | |
| "learning_rate": 2.4659873313136873e-07, | |
| "loss": 1.9917, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.12686020980727006, | |
| "eval_loss": 2.032553195953369, | |
| "eval_runtime": 82.4844, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.12710417174920713, | |
| "grad_norm": 0.7311941385269165, | |
| "learning_rate": 2.465913473458031e-07, | |
| "loss": 1.989, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.12734813369114417, | |
| "grad_norm": 0.6399905681610107, | |
| "learning_rate": 2.465839578970572e-07, | |
| "loss": 2.0483, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.12759209563308124, | |
| "grad_norm": 0.6352974772453308, | |
| "learning_rate": 2.46576564782405e-07, | |
| "loss": 2.1475, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.12783605757501829, | |
| "grad_norm": 1.216728687286377, | |
| "learning_rate": 2.4656916799911783e-07, | |
| "loss": 2.1861, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.12808001951695536, | |
| "grad_norm": 16.101139068603516, | |
| "learning_rate": 2.4656176754446437e-07, | |
| "loss": 2.1574, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.1283239814588924, | |
| "grad_norm": 0.6247526407241821, | |
| "learning_rate": 2.4655436341571053e-07, | |
| "loss": 2.1521, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.12856794340082947, | |
| "grad_norm": 0.6813322305679321, | |
| "learning_rate": 2.4654695561011943e-07, | |
| "loss": 2.1452, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.12881190534276654, | |
| "grad_norm": 1.005342721939087, | |
| "learning_rate": 2.4653954412495173e-07, | |
| "loss": 2.0553, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.12905586728470358, | |
| "grad_norm": 0.6594173312187195, | |
| "learning_rate": 2.46532128957465e-07, | |
| "loss": 2.0046, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.12929982922664066, | |
| "grad_norm": 0.6536688208580017, | |
| "learning_rate": 2.465247101049144e-07, | |
| "loss": 2.0084, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.12929982922664066, | |
| "eval_loss": 2.0311105251312256, | |
| "eval_runtime": 82.4257, | |
| "eval_samples_per_second": 3.106, | |
| "eval_steps_per_second": 0.776, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1295437911685777, | |
| "grad_norm": 0.6012054085731506, | |
| "learning_rate": 2.465172875645522e-07, | |
| "loss": 2.2109, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.12978775311051477, | |
| "grad_norm": 0.7324656248092651, | |
| "learning_rate": 2.4650986133362793e-07, | |
| "loss": 2.0499, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.1300317150524518, | |
| "grad_norm": 0.6712090969085693, | |
| "learning_rate": 2.465024314093885e-07, | |
| "loss": 1.9613, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.13027567699438888, | |
| "grad_norm": 0.8287779688835144, | |
| "learning_rate": 2.4649499778907805e-07, | |
| "loss": 2.0791, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.13051963893632593, | |
| "grad_norm": 0.7180325388908386, | |
| "learning_rate": 2.4648756046993777e-07, | |
| "loss": 2.1055, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.130763600878263, | |
| "grad_norm": 0.8076268434524536, | |
| "learning_rate": 2.4648011944920643e-07, | |
| "loss": 2.0083, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.13100756282020004, | |
| "grad_norm": 0.6361818909645081, | |
| "learning_rate": 2.464726747241198e-07, | |
| "loss": 2.069, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.1312515247621371, | |
| "grad_norm": 0.6086621284484863, | |
| "learning_rate": 2.46465226291911e-07, | |
| "loss": 1.9604, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.13149548670407415, | |
| "grad_norm": 0.5492677092552185, | |
| "learning_rate": 2.4645777414981045e-07, | |
| "loss": 2.0372, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.13173944864601123, | |
| "grad_norm": 0.7199727892875671, | |
| "learning_rate": 2.4645031829504564e-07, | |
| "loss": 2.0154, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.13173944864601123, | |
| "eval_loss": 2.030088186264038, | |
| "eval_runtime": 82.3631, | |
| "eval_samples_per_second": 3.108, | |
| "eval_steps_per_second": 0.777, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.13198341058794827, | |
| "grad_norm": 1.0681871175765991, | |
| "learning_rate": 2.464428587248415e-07, | |
| "loss": 2.1332, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.13222737252988534, | |
| "grad_norm": 2.4582200050354004, | |
| "learning_rate": 2.4643539543642e-07, | |
| "loss": 2.0242, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.13247133447182238, | |
| "grad_norm": 0.556507408618927, | |
| "learning_rate": 2.4642792842700055e-07, | |
| "loss": 2.0096, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.13271529641375945, | |
| "grad_norm": 0.6344817280769348, | |
| "learning_rate": 2.464204576937995e-07, | |
| "loss": 1.9323, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.13295925835569652, | |
| "grad_norm": 0.6195700764656067, | |
| "learning_rate": 2.4641298323403077e-07, | |
| "loss": 2.1069, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.13320322029763357, | |
| "grad_norm": 0.7518613338470459, | |
| "learning_rate": 2.464055050449052e-07, | |
| "loss": 2.0032, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.13344718223957064, | |
| "grad_norm": 0.710758626461029, | |
| "learning_rate": 2.46398023123631e-07, | |
| "loss": 2.1871, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.13369114418150768, | |
| "grad_norm": 0.6719003915786743, | |
| "learning_rate": 2.463905374674136e-07, | |
| "loss": 1.9962, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.13393510612344475, | |
| "grad_norm": 0.9001761078834534, | |
| "learning_rate": 2.4638304807345555e-07, | |
| "loss": 2.0175, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.1341790680653818, | |
| "grad_norm": 0.8284860849380493, | |
| "learning_rate": 2.463755549389567e-07, | |
| "loss": 2.0108, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1341790680653818, | |
| "eval_loss": 2.0290093421936035, | |
| "eval_runtime": 82.3581, | |
| "eval_samples_per_second": 3.108, | |
| "eval_steps_per_second": 0.777, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.13442303000731887, | |
| "grad_norm": 0.5253940224647522, | |
| "learning_rate": 2.46368058061114e-07, | |
| "loss": 2.055, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.1346669919492559, | |
| "grad_norm": 0.7425602078437805, | |
| "learning_rate": 2.4636055743712173e-07, | |
| "loss": 2.1348, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.13491095389119298, | |
| "grad_norm": 0.5845130681991577, | |
| "learning_rate": 2.4635305306417126e-07, | |
| "loss": 2.0106, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.13515491583313002, | |
| "grad_norm": 0.6662041544914246, | |
| "learning_rate": 2.463455449394512e-07, | |
| "loss": 2.1305, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.1353988777750671, | |
| "grad_norm": 0.6484659910202026, | |
| "learning_rate": 2.4633803306014726e-07, | |
| "loss": 2.1217, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.13564283971700414, | |
| "grad_norm": 0.7985727787017822, | |
| "learning_rate": 2.4633051742344244e-07, | |
| "loss": 2.1338, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.1358868016589412, | |
| "grad_norm": 0.6523579359054565, | |
| "learning_rate": 2.463229980265169e-07, | |
| "loss": 2.0912, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.13613076360087825, | |
| "grad_norm": 0.7691523432731628, | |
| "learning_rate": 2.4631547486654805e-07, | |
| "loss": 1.9642, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.13637472554281532, | |
| "grad_norm": 0.7172589898109436, | |
| "learning_rate": 2.4630794794071024e-07, | |
| "loss": 1.974, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.13661868748475237, | |
| "grad_norm": 0.9193669557571411, | |
| "learning_rate": 2.4630041724617526e-07, | |
| "loss": 2.0944, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.13661868748475237, | |
| "eval_loss": 2.027602195739746, | |
| "eval_runtime": 82.4834, | |
| "eval_samples_per_second": 3.104, | |
| "eval_steps_per_second": 0.776, | |
| "step": 560 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 4099, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.767317518548992e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |