{ "best_global_step": 1392, "best_metric": 0.05742287, "best_model_checkpoint": "/workspace/output/v0-20250602-172327/checkpoint-1392", "epoch": 1.0, "eval_steps": 50, "global_step": 1392, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007183908045977011, "grad_norm": 2.2841062545776367, "learning_rate": 1.4285714285714287e-07, "loss": 1.1600542068481445, "memory(GiB)": 10.89, "step": 1, "token_acc": 0.7039106145251397, "train_speed(iter/s)": 0.058022 }, { "epoch": 0.0014367816091954023, "grad_norm": 1.9468196630477905, "learning_rate": 2.8571428571428575e-07, "loss": 1.1409335136413574, "memory(GiB)": 11.66, "step": 2, "token_acc": 0.691699604743083, "train_speed(iter/s)": 0.109588 }, { "epoch": 0.0021551724137931034, "grad_norm": 9.093587875366211, "learning_rate": 4.285714285714286e-07, "loss": 1.1661465167999268, "memory(GiB)": 11.66, "step": 3, "token_acc": 0.717479674796748, "train_speed(iter/s)": 0.156027 }, { "epoch": 0.0028735632183908046, "grad_norm": 2.2898786067962646, "learning_rate": 5.714285714285715e-07, "loss": 1.1043263673782349, "memory(GiB)": 14.0, "step": 4, "token_acc": 0.7023809523809523, "train_speed(iter/s)": 0.198077 }, { "epoch": 0.0035919540229885057, "grad_norm": 2.0568299293518066, "learning_rate": 7.142857142857143e-07, "loss": 1.13688325881958, "memory(GiB)": 16.6, "step": 5, "token_acc": 0.6909090909090909, "train_speed(iter/s)": 0.235945 }, { "epoch": 0.004310344827586207, "grad_norm": 2.1850244998931885, "learning_rate": 8.571428571428572e-07, "loss": 1.1642564535140991, "memory(GiB)": 16.6, "step": 6, "token_acc": 0.6863468634686347, "train_speed(iter/s)": 0.270722 }, { "epoch": 0.005028735632183908, "grad_norm": 2.2342655658721924, "learning_rate": 1.0000000000000002e-06, "loss": 1.1676537990570068, "memory(GiB)": 16.6, "step": 7, "token_acc": 0.7145454545454546, "train_speed(iter/s)": 0.302324 }, { "epoch": 0.005747126436781609, "grad_norm": 2.8290085792541504, "learning_rate": 1.142857142857143e-06, "loss": 1.1567661762237549, "memory(GiB)": 16.6, "step": 8, "token_acc": 0.7142857142857143, "train_speed(iter/s)": 0.332067 }, { "epoch": 0.00646551724137931, "grad_norm": 4.12807559967041, "learning_rate": 1.2857142857142856e-06, "loss": 1.1599122285842896, "memory(GiB)": 16.6, "step": 9, "token_acc": 0.6553784860557769, "train_speed(iter/s)": 0.359527 }, { "epoch": 0.007183908045977011, "grad_norm": 8.037861824035645, "learning_rate": 1.4285714285714286e-06, "loss": 1.0911784172058105, "memory(GiB)": 16.6, "step": 10, "token_acc": 0.7160068846815835, "train_speed(iter/s)": 0.385134 }, { "epoch": 0.007902298850574713, "grad_norm": 1.854526162147522, "learning_rate": 1.5714285714285714e-06, "loss": 1.1166468858718872, "memory(GiB)": 16.6, "step": 11, "token_acc": 0.7025210084033613, "train_speed(iter/s)": 0.409045 }, { "epoch": 0.008620689655172414, "grad_norm": 1.9229704141616821, "learning_rate": 1.7142857142857145e-06, "loss": 1.1075730323791504, "memory(GiB)": 16.6, "step": 12, "token_acc": 0.7184873949579832, "train_speed(iter/s)": 0.431354 }, { "epoch": 0.009339080459770116, "grad_norm": 2.3131110668182373, "learning_rate": 1.8571428571428573e-06, "loss": 1.095297932624817, "memory(GiB)": 16.6, "step": 13, "token_acc": 0.6679462571976967, "train_speed(iter/s)": 0.452187 }, { "epoch": 0.010057471264367816, "grad_norm": 1.6835637092590332, "learning_rate": 2.0000000000000003e-06, "loss": 1.0489076375961304, "memory(GiB)": 16.6, "step": 14, "token_acc": 0.6977687626774848, "train_speed(iter/s)": 0.471729 }, { "epoch": 0.010775862068965518, "grad_norm": 2.728545904159546, "learning_rate": 2.1428571428571427e-06, "loss": 1.102731466293335, "memory(GiB)": 16.6, "step": 15, "token_acc": 0.704119850187266, "train_speed(iter/s)": 0.49011 }, { "epoch": 0.011494252873563218, "grad_norm": 2.705237627029419, "learning_rate": 2.285714285714286e-06, "loss": 1.007725715637207, "memory(GiB)": 16.6, "step": 16, "token_acc": 0.6903765690376569, "train_speed(iter/s)": 0.507422 }, { "epoch": 0.01221264367816092, "grad_norm": 2.3735504150390625, "learning_rate": 2.428571428571429e-06, "loss": 0.9896494746208191, "memory(GiB)": 16.6, "step": 17, "token_acc": 0.6953781512605042, "train_speed(iter/s)": 0.52366 }, { "epoch": 0.01293103448275862, "grad_norm": 1.8568823337554932, "learning_rate": 2.571428571428571e-06, "loss": 0.9329249858856201, "memory(GiB)": 16.6, "step": 18, "token_acc": 0.6954128440366972, "train_speed(iter/s)": 0.539094 }, { "epoch": 0.013649425287356323, "grad_norm": 1.9218499660491943, "learning_rate": 2.7142857142857144e-06, "loss": 0.9208242893218994, "memory(GiB)": 19.32, "step": 19, "token_acc": 0.7178217821782178, "train_speed(iter/s)": 0.553574 }, { "epoch": 0.014367816091954023, "grad_norm": 1.478088140487671, "learning_rate": 2.8571428571428573e-06, "loss": 0.8970478177070618, "memory(GiB)": 19.32, "step": 20, "token_acc": 0.7373540856031129, "train_speed(iter/s)": 0.567336 }, { "epoch": 0.015086206896551725, "grad_norm": 1.8505417108535767, "learning_rate": 3e-06, "loss": 0.8795347213745117, "memory(GiB)": 19.32, "step": 21, "token_acc": 0.7376760563380281, "train_speed(iter/s)": 0.580274 }, { "epoch": 0.015804597701149427, "grad_norm": 1.8054825067520142, "learning_rate": 3.142857142857143e-06, "loss": 0.8449476957321167, "memory(GiB)": 19.32, "step": 22, "token_acc": 0.7405940594059406, "train_speed(iter/s)": 0.592472 }, { "epoch": 0.016522988505747127, "grad_norm": 2.823409080505371, "learning_rate": 3.285714285714286e-06, "loss": 0.8313107490539551, "memory(GiB)": 19.32, "step": 23, "token_acc": 0.7145833333333333, "train_speed(iter/s)": 0.604281 }, { "epoch": 0.017241379310344827, "grad_norm": 2.051133632659912, "learning_rate": 3.428571428571429e-06, "loss": 0.7995213866233826, "memory(GiB)": 19.32, "step": 24, "token_acc": 0.7479674796747967, "train_speed(iter/s)": 0.61564 }, { "epoch": 0.017959770114942528, "grad_norm": 1.728045105934143, "learning_rate": 3.5714285714285718e-06, "loss": 0.7689487338066101, "memory(GiB)": 19.32, "step": 25, "token_acc": 0.7403508771929824, "train_speed(iter/s)": 0.626282 }, { "epoch": 0.01867816091954023, "grad_norm": 2.002279758453369, "learning_rate": 3.7142857142857146e-06, "loss": 0.7477456331253052, "memory(GiB)": 19.32, "step": 26, "token_acc": 0.7206896551724138, "train_speed(iter/s)": 0.636509 }, { "epoch": 0.01939655172413793, "grad_norm": 1.4157384634017944, "learning_rate": 3.857142857142858e-06, "loss": 0.7123377323150635, "memory(GiB)": 19.32, "step": 27, "token_acc": 0.7624113475177305, "train_speed(iter/s)": 0.646348 }, { "epoch": 0.020114942528735632, "grad_norm": 1.5171438455581665, "learning_rate": 4.000000000000001e-06, "loss": 0.7045837640762329, "memory(GiB)": 19.32, "step": 28, "token_acc": 0.7557692307692307, "train_speed(iter/s)": 0.655715 }, { "epoch": 0.020833333333333332, "grad_norm": 1.4624277353286743, "learning_rate": 4.1428571428571435e-06, "loss": 0.6961330771446228, "memory(GiB)": 19.32, "step": 29, "token_acc": 0.7419898819561551, "train_speed(iter/s)": 0.664732 }, { "epoch": 0.021551724137931036, "grad_norm": 1.382030963897705, "learning_rate": 4.2857142857142855e-06, "loss": 0.6795358061790466, "memory(GiB)": 19.32, "step": 30, "token_acc": 0.7483660130718954, "train_speed(iter/s)": 0.673162 }, { "epoch": 0.022270114942528736, "grad_norm": 0.7986149787902832, "learning_rate": 4.428571428571429e-06, "loss": 0.6588076949119568, "memory(GiB)": 19.32, "step": 31, "token_acc": 0.7810077519379846, "train_speed(iter/s)": 0.67957 }, { "epoch": 0.022988505747126436, "grad_norm": 3.235369920730591, "learning_rate": 4.571428571428572e-06, "loss": 0.6534308195114136, "memory(GiB)": 22.04, "step": 32, "token_acc": 0.7418397626112759, "train_speed(iter/s)": 0.68618 }, { "epoch": 0.023706896551724137, "grad_norm": 0.9043703079223633, "learning_rate": 4.714285714285715e-06, "loss": 0.6173244714736938, "memory(GiB)": 22.04, "step": 33, "token_acc": 0.7704280155642024, "train_speed(iter/s)": 0.69268 }, { "epoch": 0.02442528735632184, "grad_norm": 0.9317988753318787, "learning_rate": 4.857142857142858e-06, "loss": 0.6101970672607422, "memory(GiB)": 22.04, "step": 34, "token_acc": 0.7844827586206896, "train_speed(iter/s)": 0.698581 }, { "epoch": 0.02514367816091954, "grad_norm": 1.2440056800842285, "learning_rate": 5e-06, "loss": 0.5939048528671265, "memory(GiB)": 22.04, "step": 35, "token_acc": 0.7711267605633803, "train_speed(iter/s)": 0.704449 }, { "epoch": 0.02586206896551724, "grad_norm": 0.7959598302841187, "learning_rate": 5.142857142857142e-06, "loss": 0.5946186780929565, "memory(GiB)": 22.04, "step": 36, "token_acc": 0.8024691358024691, "train_speed(iter/s)": 0.710975 }, { "epoch": 0.02658045977011494, "grad_norm": 1.0198500156402588, "learning_rate": 5.285714285714286e-06, "loss": 0.5879278182983398, "memory(GiB)": 22.04, "step": 37, "token_acc": 0.7632933104631218, "train_speed(iter/s)": 0.717008 }, { "epoch": 0.027298850574712645, "grad_norm": 1.319176197052002, "learning_rate": 5.428571428571429e-06, "loss": 0.5616503357887268, "memory(GiB)": 22.04, "step": 38, "token_acc": 0.7900763358778626, "train_speed(iter/s)": 0.722112 }, { "epoch": 0.028017241379310345, "grad_norm": 2.255368232727051, "learning_rate": 5.571428571428572e-06, "loss": 0.5551696419715881, "memory(GiB)": 22.04, "step": 39, "token_acc": 0.7962616822429907, "train_speed(iter/s)": 0.72696 }, { "epoch": 0.028735632183908046, "grad_norm": 1.2048120498657227, "learning_rate": 5.7142857142857145e-06, "loss": 0.5641672015190125, "memory(GiB)": 22.04, "step": 40, "token_acc": 0.7755905511811023, "train_speed(iter/s)": 0.731667 }, { "epoch": 0.029454022988505746, "grad_norm": 1.2680882215499878, "learning_rate": 5.857142857142858e-06, "loss": 0.53426593542099, "memory(GiB)": 22.04, "step": 41, "token_acc": 0.7771317829457365, "train_speed(iter/s)": 0.73598 }, { "epoch": 0.03017241379310345, "grad_norm": 1.5466786623001099, "learning_rate": 6e-06, "loss": 0.5407324433326721, "memory(GiB)": 22.04, "step": 42, "token_acc": 0.7844112769485904, "train_speed(iter/s)": 0.741551 }, { "epoch": 0.03089080459770115, "grad_norm": 1.1284416913986206, "learning_rate": 6.142857142857144e-06, "loss": 0.5273822546005249, "memory(GiB)": 22.04, "step": 43, "token_acc": 0.7950530035335689, "train_speed(iter/s)": 0.74689 }, { "epoch": 0.031609195402298854, "grad_norm": 1.2674559354782104, "learning_rate": 6.285714285714286e-06, "loss": 0.49984538555145264, "memory(GiB)": 22.04, "step": 44, "token_acc": 0.8225469728601252, "train_speed(iter/s)": 0.750789 }, { "epoch": 0.032327586206896554, "grad_norm": 5.208553314208984, "learning_rate": 6.4285714285714295e-06, "loss": 0.48178571462631226, "memory(GiB)": 22.04, "step": 45, "token_acc": 0.8046875, "train_speed(iter/s)": 0.754392 }, { "epoch": 0.033045977011494254, "grad_norm": 1.5044139623641968, "learning_rate": 6.571428571428572e-06, "loss": 0.4803074598312378, "memory(GiB)": 22.04, "step": 46, "token_acc": 0.8210332103321033, "train_speed(iter/s)": 0.759248 }, { "epoch": 0.033764367816091954, "grad_norm": 1.5763964653015137, "learning_rate": 6.714285714285714e-06, "loss": 0.4811801016330719, "memory(GiB)": 22.04, "step": 47, "token_acc": 0.826530612244898, "train_speed(iter/s)": 0.763808 }, { "epoch": 0.034482758620689655, "grad_norm": 1.5099002122879028, "learning_rate": 6.857142857142858e-06, "loss": 0.45788246393203735, "memory(GiB)": 22.04, "step": 48, "token_acc": 0.8535262206148282, "train_speed(iter/s)": 0.768382 }, { "epoch": 0.035201149425287355, "grad_norm": 1.7129669189453125, "learning_rate": 7e-06, "loss": 0.43941259384155273, "memory(GiB)": 22.04, "step": 49, "token_acc": 0.8255578093306288, "train_speed(iter/s)": 0.772838 }, { "epoch": 0.035919540229885055, "grad_norm": 2.7886550426483154, "learning_rate": 7.1428571428571436e-06, "loss": 0.4306374788284302, "memory(GiB)": 22.04, "step": 50, "token_acc": 0.8442028985507246, "train_speed(iter/s)": 0.777264 }, { "epoch": 0.035919540229885055, "eval_loss": 0.417650043964386, "eval_runtime": 6.0008, "eval_samples_per_second": 74.99, "eval_steps_per_second": 2.5, "eval_token_acc": 0.8405500749250749, "step": 50 }, { "epoch": 0.036637931034482756, "grad_norm": 2.2179479598999023, "learning_rate": 7.285714285714286e-06, "loss": 0.41859811544418335, "memory(GiB)": 22.04, "step": 51, "token_acc": 0.8393764025038384, "train_speed(iter/s)": 0.602938 }, { "epoch": 0.03735632183908046, "grad_norm": 3.0027248859405518, "learning_rate": 7.428571428571429e-06, "loss": 0.4272707402706146, "memory(GiB)": 22.04, "step": 52, "token_acc": 0.8485370051635112, "train_speed(iter/s)": 0.607356 }, { "epoch": 0.03807471264367816, "grad_norm": 1.9224092960357666, "learning_rate": 7.571428571428572e-06, "loss": 0.3921546936035156, "memory(GiB)": 22.04, "step": 53, "token_acc": 0.8393194706994329, "train_speed(iter/s)": 0.611585 }, { "epoch": 0.03879310344827586, "grad_norm": 1.4286688566207886, "learning_rate": 7.714285714285716e-06, "loss": 0.40434399247169495, "memory(GiB)": 22.04, "step": 54, "token_acc": 0.8651252408477842, "train_speed(iter/s)": 0.61635 }, { "epoch": 0.039511494252873564, "grad_norm": 1.2184127569198608, "learning_rate": 7.857142857142858e-06, "loss": 0.38974618911743164, "memory(GiB)": 22.04, "step": 55, "token_acc": 0.8517034068136272, "train_speed(iter/s)": 0.620202 }, { "epoch": 0.040229885057471264, "grad_norm": 1.7397710084915161, "learning_rate": 8.000000000000001e-06, "loss": 0.3863411545753479, "memory(GiB)": 22.04, "step": 56, "token_acc": 0.8636363636363636, "train_speed(iter/s)": 0.624095 }, { "epoch": 0.040948275862068964, "grad_norm": 1.4375176429748535, "learning_rate": 8.142857142857143e-06, "loss": 0.3879658281803131, "memory(GiB)": 22.04, "step": 57, "token_acc": 0.8411053540587219, "train_speed(iter/s)": 0.627926 }, { "epoch": 0.041666666666666664, "grad_norm": 1.3894975185394287, "learning_rate": 8.285714285714287e-06, "loss": 0.3751828670501709, "memory(GiB)": 22.04, "step": 58, "token_acc": 0.8551236749116607, "train_speed(iter/s)": 0.631639 }, { "epoch": 0.042385057471264365, "grad_norm": 1.4215877056121826, "learning_rate": 8.428571428571429e-06, "loss": 0.361402690410614, "memory(GiB)": 22.04, "step": 59, "token_acc": 0.8843283582089553, "train_speed(iter/s)": 0.634921 }, { "epoch": 0.04310344827586207, "grad_norm": 1.1520580053329468, "learning_rate": 8.571428571428571e-06, "loss": 0.3440661132335663, "memory(GiB)": 22.04, "step": 60, "token_acc": 0.8522727272727273, "train_speed(iter/s)": 0.638257 }, { "epoch": 0.04382183908045977, "grad_norm": 1.2310643196105957, "learning_rate": 8.714285714285715e-06, "loss": 0.33500269055366516, "memory(GiB)": 22.04, "step": 61, "token_acc": 0.876984126984127, "train_speed(iter/s)": 0.642375 }, { "epoch": 0.04454022988505747, "grad_norm": 2.312145709991455, "learning_rate": 8.857142857142858e-06, "loss": 0.35877761244773865, "memory(GiB)": 22.04, "step": 62, "token_acc": 0.8598484848484849, "train_speed(iter/s)": 0.646563 }, { "epoch": 0.04525862068965517, "grad_norm": 2.1113927364349365, "learning_rate": 9e-06, "loss": 0.3470730781555176, "memory(GiB)": 22.04, "step": 63, "token_acc": 0.8632326820603907, "train_speed(iter/s)": 0.650663 }, { "epoch": 0.04597701149425287, "grad_norm": 1.3712527751922607, "learning_rate": 9.142857142857144e-06, "loss": 0.3441311717033386, "memory(GiB)": 22.04, "step": 64, "token_acc": 0.8793774319066148, "train_speed(iter/s)": 0.654559 }, { "epoch": 0.04669540229885057, "grad_norm": 1.4694899320602417, "learning_rate": 9.285714285714288e-06, "loss": 0.34587639570236206, "memory(GiB)": 22.04, "step": 65, "token_acc": 0.8624229979466119, "train_speed(iter/s)": 0.658521 }, { "epoch": 0.04741379310344827, "grad_norm": 1.3747105598449707, "learning_rate": 9.42857142857143e-06, "loss": 0.33066898584365845, "memory(GiB)": 22.04, "step": 66, "token_acc": 0.8765690376569037, "train_speed(iter/s)": 0.662401 }, { "epoch": 0.048132183908045974, "grad_norm": 1.7012337446212769, "learning_rate": 9.571428571428573e-06, "loss": 0.32754647731781006, "memory(GiB)": 22.04, "step": 67, "token_acc": 0.8752293577981651, "train_speed(iter/s)": 0.666163 }, { "epoch": 0.04885057471264368, "grad_norm": 1.235076904296875, "learning_rate": 9.714285714285715e-06, "loss": 0.3348221778869629, "memory(GiB)": 22.04, "step": 68, "token_acc": 0.8900414937759336, "train_speed(iter/s)": 0.669933 }, { "epoch": 0.04956896551724138, "grad_norm": 1.377700924873352, "learning_rate": 9.857142857142859e-06, "loss": 0.3319036364555359, "memory(GiB)": 22.04, "step": 69, "token_acc": 0.8724584103512015, "train_speed(iter/s)": 0.673584 }, { "epoch": 0.05028735632183908, "grad_norm": 3.6680097579956055, "learning_rate": 1e-05, "loss": 0.3383405804634094, "memory(GiB)": 22.04, "step": 70, "token_acc": 0.8456057007125891, "train_speed(iter/s)": 0.677199 }, { "epoch": 0.05100574712643678, "grad_norm": 1.5148518085479736, "learning_rate": 9.999985881887463e-06, "loss": 0.33105501532554626, "memory(GiB)": 22.04, "step": 71, "token_acc": 0.8606701940035273, "train_speed(iter/s)": 0.680572 }, { "epoch": 0.05172413793103448, "grad_norm": 1.4275816679000854, "learning_rate": 9.99994352762958e-06, "loss": 0.31913650035858154, "memory(GiB)": 22.04, "step": 72, "token_acc": 0.8698884758364313, "train_speed(iter/s)": 0.68403 }, { "epoch": 0.05244252873563218, "grad_norm": 1.5467592477798462, "learning_rate": 9.999872937465538e-06, "loss": 0.30536019802093506, "memory(GiB)": 22.04, "step": 73, "token_acc": 0.874031007751938, "train_speed(iter/s)": 0.687441 }, { "epoch": 0.05316091954022988, "grad_norm": 1.98291015625, "learning_rate": 9.999774111793974e-06, "loss": 0.3247784376144409, "memory(GiB)": 22.04, "step": 74, "token_acc": 0.8786324786324786, "train_speed(iter/s)": 0.690705 }, { "epoch": 0.05387931034482758, "grad_norm": 1.427363395690918, "learning_rate": 9.999647051172982e-06, "loss": 0.2954842448234558, "memory(GiB)": 22.04, "step": 75, "token_acc": 0.8933333333333333, "train_speed(iter/s)": 0.693996 }, { "epoch": 0.05459770114942529, "grad_norm": 1.354496955871582, "learning_rate": 9.999491756320105e-06, "loss": 0.31551671028137207, "memory(GiB)": 22.04, "step": 76, "token_acc": 0.8818011257035647, "train_speed(iter/s)": 0.697211 }, { "epoch": 0.05531609195402299, "grad_norm": 1.326243281364441, "learning_rate": 9.999308228112332e-06, "loss": 0.30212467908859253, "memory(GiB)": 22.04, "step": 77, "token_acc": 0.891566265060241, "train_speed(iter/s)": 0.700273 }, { "epoch": 0.05603448275862069, "grad_norm": 1.0829932689666748, "learning_rate": 9.99909646758609e-06, "loss": 0.3023262619972229, "memory(GiB)": 22.04, "step": 78, "token_acc": 0.891588785046729, "train_speed(iter/s)": 0.703409 }, { "epoch": 0.05675287356321839, "grad_norm": 1.196694016456604, "learning_rate": 9.998856475937242e-06, "loss": 0.30286282300949097, "memory(GiB)": 22.04, "step": 79, "token_acc": 0.8821656050955414, "train_speed(iter/s)": 0.706501 }, { "epoch": 0.05747126436781609, "grad_norm": 0.9569135904312134, "learning_rate": 9.99858825452108e-06, "loss": 0.31772279739379883, "memory(GiB)": 22.04, "step": 80, "token_acc": 0.8616600790513834, "train_speed(iter/s)": 0.709422 }, { "epoch": 0.05818965517241379, "grad_norm": 1.0102065801620483, "learning_rate": 9.998291804852318e-06, "loss": 0.3182288110256195, "memory(GiB)": 22.04, "step": 81, "token_acc": 0.8916666666666667, "train_speed(iter/s)": 0.712357 }, { "epoch": 0.05890804597701149, "grad_norm": 1.0702964067459106, "learning_rate": 9.997967128605078e-06, "loss": 0.28996023535728455, "memory(GiB)": 22.04, "step": 82, "token_acc": 0.9082217973231358, "train_speed(iter/s)": 0.71467 }, { "epoch": 0.05962643678160919, "grad_norm": 1.4451472759246826, "learning_rate": 9.997614227612887e-06, "loss": 0.29703807830810547, "memory(GiB)": 22.04, "step": 83, "token_acc": 0.8817427385892116, "train_speed(iter/s)": 0.717506 }, { "epoch": 0.0603448275862069, "grad_norm": 0.9890632629394531, "learning_rate": 9.997233103868664e-06, "loss": 0.2898198366165161, "memory(GiB)": 22.04, "step": 84, "token_acc": 0.8695652173913043, "train_speed(iter/s)": 0.720337 }, { "epoch": 0.0610632183908046, "grad_norm": 1.093158483505249, "learning_rate": 9.996823759524705e-06, "loss": 0.28111356496810913, "memory(GiB)": 22.04, "step": 85, "token_acc": 0.8967495219885278, "train_speed(iter/s)": 0.723099 }, { "epoch": 0.0617816091954023, "grad_norm": 0.9716314673423767, "learning_rate": 9.996386196892683e-06, "loss": 0.2728654146194458, "memory(GiB)": 22.04, "step": 86, "token_acc": 0.9053763440860215, "train_speed(iter/s)": 0.725865 }, { "epoch": 0.0625, "grad_norm": 1.7202712297439575, "learning_rate": 9.995920418443619e-06, "loss": 0.29384976625442505, "memory(GiB)": 22.04, "step": 87, "token_acc": 0.8682008368200836, "train_speed(iter/s)": 0.728457 }, { "epoch": 0.06321839080459771, "grad_norm": 1.438616156578064, "learning_rate": 9.995426426807875e-06, "loss": 0.2719009220600128, "memory(GiB)": 22.04, "step": 88, "token_acc": 0.8966074313408724, "train_speed(iter/s)": 0.7306 }, { "epoch": 0.0639367816091954, "grad_norm": 1.0650871992111206, "learning_rate": 9.994904224775149e-06, "loss": 0.28917616605758667, "memory(GiB)": 22.04, "step": 89, "token_acc": 0.8870967741935484, "train_speed(iter/s)": 0.732648 }, { "epoch": 0.06465517241379311, "grad_norm": 0.9939769506454468, "learning_rate": 9.994353815294438e-06, "loss": 0.2857409715652466, "memory(GiB)": 22.04, "step": 90, "token_acc": 0.8875968992248062, "train_speed(iter/s)": 0.73472 }, { "epoch": 0.0653735632183908, "grad_norm": 0.9182151556015015, "learning_rate": 9.993775201474041e-06, "loss": 0.27646404504776, "memory(GiB)": 22.04, "step": 91, "token_acc": 0.8943781942078365, "train_speed(iter/s)": 0.736647 }, { "epoch": 0.06609195402298851, "grad_norm": 0.9665948748588562, "learning_rate": 9.993168386581533e-06, "loss": 0.26815876364707947, "memory(GiB)": 22.04, "step": 92, "token_acc": 0.8838951310861424, "train_speed(iter/s)": 0.739035 }, { "epoch": 0.0668103448275862, "grad_norm": 1.7117060422897339, "learning_rate": 9.992533374043747e-06, "loss": 0.2667180597782135, "memory(GiB)": 22.04, "step": 93, "token_acc": 0.9050505050505051, "train_speed(iter/s)": 0.74155 }, { "epoch": 0.06752873563218391, "grad_norm": 1.398851990699768, "learning_rate": 9.991870167446751e-06, "loss": 0.2699788212776184, "memory(GiB)": 22.04, "step": 94, "token_acc": 0.8870967741935484, "train_speed(iter/s)": 0.74364 }, { "epoch": 0.06824712643678162, "grad_norm": 2.527916431427002, "learning_rate": 9.99117877053584e-06, "loss": 0.27772611379623413, "memory(GiB)": 22.04, "step": 95, "token_acc": 0.8987108655616943, "train_speed(iter/s)": 0.745375 }, { "epoch": 0.06896551724137931, "grad_norm": 1.4383560419082642, "learning_rate": 9.990459187215498e-06, "loss": 0.2658151388168335, "memory(GiB)": 22.04, "step": 96, "token_acc": 0.898876404494382, "train_speed(iter/s)": 0.746584 }, { "epoch": 0.06968390804597702, "grad_norm": 1.0126614570617676, "learning_rate": 9.989711421549389e-06, "loss": 0.26273298263549805, "memory(GiB)": 22.04, "step": 97, "token_acc": 0.8758278145695364, "train_speed(iter/s)": 0.748079 }, { "epoch": 0.07040229885057471, "grad_norm": 2.330467939376831, "learning_rate": 9.98893547776033e-06, "loss": 0.274189293384552, "memory(GiB)": 22.04, "step": 98, "token_acc": 0.8962075848303394, "train_speed(iter/s)": 0.750194 }, { "epoch": 0.07112068965517242, "grad_norm": 1.8854765892028809, "learning_rate": 9.988131360230266e-06, "loss": 0.27949923276901245, "memory(GiB)": 22.04, "step": 99, "token_acc": 0.8977072310405644, "train_speed(iter/s)": 0.752505 }, { "epoch": 0.07183908045977011, "grad_norm": 1.2201083898544312, "learning_rate": 9.987299073500245e-06, "loss": 0.2654486894607544, "memory(GiB)": 22.04, "step": 100, "token_acc": 0.9048507462686567, "train_speed(iter/s)": 0.754696 }, { "epoch": 0.07183908045977011, "eval_loss": 0.2732755243778229, "eval_runtime": 5.8759, "eval_samples_per_second": 76.584, "eval_steps_per_second": 2.553, "eval_token_acc": 0.8946366133866134, "step": 100 }, { "epoch": 0.07255747126436782, "grad_norm": 1.4662714004516602, "learning_rate": 9.986438622270392e-06, "loss": 0.2716202139854431, "memory(GiB)": 22.04, "step": 101, "token_acc": 0.8986732417517905, "train_speed(iter/s)": 0.66087 }, { "epoch": 0.07327586206896551, "grad_norm": 1.2047909498214722, "learning_rate": 9.985550011399889e-06, "loss": 0.2638857364654541, "memory(GiB)": 22.04, "step": 102, "token_acc": 0.8876221498371335, "train_speed(iter/s)": 0.66326 }, { "epoch": 0.07399425287356322, "grad_norm": 1.2716830968856812, "learning_rate": 9.984633245906939e-06, "loss": 0.24802210927009583, "memory(GiB)": 22.04, "step": 103, "token_acc": 0.9063097514340345, "train_speed(iter/s)": 0.66577 }, { "epoch": 0.07471264367816093, "grad_norm": 0.8747836947441101, "learning_rate": 9.98368833096874e-06, "loss": 0.2696152925491333, "memory(GiB)": 22.04, "step": 104, "token_acc": 0.8978494623655914, "train_speed(iter/s)": 0.668209 }, { "epoch": 0.07543103448275862, "grad_norm": 1.3527655601501465, "learning_rate": 9.982715271921457e-06, "loss": 0.26998597383499146, "memory(GiB)": 22.04, "step": 105, "token_acc": 0.8945518453427065, "train_speed(iter/s)": 0.670484 }, { "epoch": 0.07614942528735633, "grad_norm": 1.0049402713775635, "learning_rate": 9.981714074260196e-06, "loss": 0.2559014558792114, "memory(GiB)": 22.04, "step": 106, "token_acc": 0.8954983922829582, "train_speed(iter/s)": 0.672536 }, { "epoch": 0.07686781609195402, "grad_norm": 0.9462708234786987, "learning_rate": 9.980684743638965e-06, "loss": 0.2650679349899292, "memory(GiB)": 22.04, "step": 107, "token_acc": 0.8864696734059098, "train_speed(iter/s)": 0.674564 }, { "epoch": 0.07758620689655173, "grad_norm": 1.2705525159835815, "learning_rate": 9.979627285870644e-06, "loss": 0.28446877002716064, "memory(GiB)": 22.04, "step": 108, "token_acc": 0.8618421052631579, "train_speed(iter/s)": 0.676606 }, { "epoch": 0.07830459770114942, "grad_norm": 1.0003091096878052, "learning_rate": 9.978541706926959e-06, "loss": 0.2499096691608429, "memory(GiB)": 22.04, "step": 109, "token_acc": 0.9106463878326996, "train_speed(iter/s)": 0.678525 }, { "epoch": 0.07902298850574713, "grad_norm": 1.380895972251892, "learning_rate": 9.977428012938437e-06, "loss": 0.2661605775356293, "memory(GiB)": 22.04, "step": 110, "token_acc": 0.8923611111111112, "train_speed(iter/s)": 0.680567 }, { "epoch": 0.07974137931034483, "grad_norm": 1.1094906330108643, "learning_rate": 9.976286210194382e-06, "loss": 0.26611244678497314, "memory(GiB)": 22.04, "step": 111, "token_acc": 0.8957446808510638, "train_speed(iter/s)": 0.68194 }, { "epoch": 0.08045977011494253, "grad_norm": 1.2140064239501953, "learning_rate": 9.975116305142836e-06, "loss": 0.26161032915115356, "memory(GiB)": 22.04, "step": 112, "token_acc": 0.9033457249070632, "train_speed(iter/s)": 0.683722 }, { "epoch": 0.08117816091954023, "grad_norm": 0.9756461977958679, "learning_rate": 9.973918304390536e-06, "loss": 0.24689538776874542, "memory(GiB)": 22.04, "step": 113, "token_acc": 0.9110671936758893, "train_speed(iter/s)": 0.685708 }, { "epoch": 0.08189655172413793, "grad_norm": 1.3619202375411987, "learning_rate": 9.97269221470289e-06, "loss": 0.2549363076686859, "memory(GiB)": 22.04, "step": 114, "token_acc": 0.9290322580645162, "train_speed(iter/s)": 0.687578 }, { "epoch": 0.08261494252873564, "grad_norm": 1.1069204807281494, "learning_rate": 9.971438043003923e-06, "loss": 0.2543719708919525, "memory(GiB)": 22.04, "step": 115, "token_acc": 0.9036608863198459, "train_speed(iter/s)": 0.689509 }, { "epoch": 0.08333333333333333, "grad_norm": 1.19921875, "learning_rate": 9.97015579637625e-06, "loss": 0.23842032253742218, "memory(GiB)": 22.04, "step": 116, "token_acc": 0.9, "train_speed(iter/s)": 0.691207 }, { "epoch": 0.08405172413793104, "grad_norm": 1.343320369720459, "learning_rate": 9.968845482061036e-06, "loss": 0.25210797786712646, "memory(GiB)": 22.04, "step": 117, "token_acc": 0.903353057199211, "train_speed(iter/s)": 0.693259 }, { "epoch": 0.08477011494252873, "grad_norm": 1.121742606163025, "learning_rate": 9.967507107457942e-06, "loss": 0.23466835916042328, "memory(GiB)": 22.04, "step": 118, "token_acc": 0.9310344827586207, "train_speed(iter/s)": 0.695395 }, { "epoch": 0.08548850574712644, "grad_norm": 1.1817830801010132, "learning_rate": 9.9661406801251e-06, "loss": 0.24389104545116425, "memory(GiB)": 22.04, "step": 119, "token_acc": 0.9288617886178862, "train_speed(iter/s)": 0.697485 }, { "epoch": 0.08620689655172414, "grad_norm": 1.2986172437667847, "learning_rate": 9.96474620777906e-06, "loss": 0.25543078780174255, "memory(GiB)": 22.04, "step": 120, "token_acc": 0.9047619047619048, "train_speed(iter/s)": 0.69954 }, { "epoch": 0.08692528735632184, "grad_norm": 4.396556854248047, "learning_rate": 9.963323698294749e-06, "loss": 0.24473431706428528, "memory(GiB)": 22.04, "step": 121, "token_acc": 0.8974358974358975, "train_speed(iter/s)": 0.701591 }, { "epoch": 0.08764367816091954, "grad_norm": 1.4296936988830566, "learning_rate": 9.961873159705426e-06, "loss": 0.24880093336105347, "memory(GiB)": 22.04, "step": 122, "token_acc": 0.9111531190926276, "train_speed(iter/s)": 0.703637 }, { "epoch": 0.08836206896551724, "grad_norm": 2.046653985977173, "learning_rate": 9.960394600202636e-06, "loss": 0.2361898273229599, "memory(GiB)": 22.04, "step": 123, "token_acc": 0.9112050739957717, "train_speed(iter/s)": 0.705639 }, { "epoch": 0.08908045977011494, "grad_norm": 1.4354192018508911, "learning_rate": 9.95888802813617e-06, "loss": 0.24984179437160492, "memory(GiB)": 22.04, "step": 124, "token_acc": 0.896551724137931, "train_speed(iter/s)": 0.707569 }, { "epoch": 0.08979885057471264, "grad_norm": 1.751117467880249, "learning_rate": 9.95735345201401e-06, "loss": 0.23743167519569397, "memory(GiB)": 22.04, "step": 125, "token_acc": 0.8884955752212389, "train_speed(iter/s)": 0.709451 }, { "epoch": 0.09051724137931035, "grad_norm": 1.8865572214126587, "learning_rate": 9.955790880502278e-06, "loss": 0.2536635398864746, "memory(GiB)": 22.04, "step": 126, "token_acc": 0.8923395445134575, "train_speed(iter/s)": 0.711318 }, { "epoch": 0.09123563218390805, "grad_norm": 1.197823405265808, "learning_rate": 9.954200322425204e-06, "loss": 0.24290497601032257, "memory(GiB)": 22.04, "step": 127, "token_acc": 0.9105691056910569, "train_speed(iter/s)": 0.713213 }, { "epoch": 0.09195402298850575, "grad_norm": 1.3979065418243408, "learning_rate": 9.952581786765057e-06, "loss": 0.24988895654678345, "memory(GiB)": 22.04, "step": 128, "token_acc": 0.8971631205673759, "train_speed(iter/s)": 0.715112 }, { "epoch": 0.09267241379310345, "grad_norm": 2.2524890899658203, "learning_rate": 9.950935282662103e-06, "loss": 0.25248128175735474, "memory(GiB)": 22.04, "step": 129, "token_acc": 0.8884297520661157, "train_speed(iter/s)": 0.716993 }, { "epoch": 0.09339080459770115, "grad_norm": 1.5302817821502686, "learning_rate": 9.949260819414557e-06, "loss": 0.2514447271823883, "memory(GiB)": 22.04, "step": 130, "token_acc": 0.903353057199211, "train_speed(iter/s)": 0.718846 }, { "epoch": 0.09410919540229885, "grad_norm": 1.8826956748962402, "learning_rate": 9.94755840647852e-06, "loss": 0.24213311076164246, "memory(GiB)": 22.04, "step": 131, "token_acc": 0.9007936507936508, "train_speed(iter/s)": 0.720693 }, { "epoch": 0.09482758620689655, "grad_norm": 2.9384260177612305, "learning_rate": 9.945828053467939e-06, "loss": 0.24626880884170532, "memory(GiB)": 22.04, "step": 132, "token_acc": 0.9084507042253521, "train_speed(iter/s)": 0.722518 }, { "epoch": 0.09554597701149425, "grad_norm": 1.701797604560852, "learning_rate": 9.944069770154537e-06, "loss": 0.24730336666107178, "memory(GiB)": 22.04, "step": 133, "token_acc": 0.9292543021032504, "train_speed(iter/s)": 0.724328 }, { "epoch": 0.09626436781609195, "grad_norm": 1.4743934869766235, "learning_rate": 9.942283566467773e-06, "loss": 0.24133077263832092, "memory(GiB)": 22.04, "step": 134, "token_acc": 0.9066901408450704, "train_speed(iter/s)": 0.726089 }, { "epoch": 0.09698275862068965, "grad_norm": 1.677595853805542, "learning_rate": 9.940469452494778e-06, "loss": 0.24493783712387085, "memory(GiB)": 22.04, "step": 135, "token_acc": 0.896551724137931, "train_speed(iter/s)": 0.727734 }, { "epoch": 0.09770114942528736, "grad_norm": 1.1534054279327393, "learning_rate": 9.938627438480295e-06, "loss": 0.2394789457321167, "memory(GiB)": 22.04, "step": 136, "token_acc": 0.9106029106029107, "train_speed(iter/s)": 0.729447 }, { "epoch": 0.09841954022988506, "grad_norm": 1.5104615688323975, "learning_rate": 9.93675753482663e-06, "loss": 0.23206910490989685, "memory(GiB)": 22.04, "step": 137, "token_acc": 0.9213286713286714, "train_speed(iter/s)": 0.731158 }, { "epoch": 0.09913793103448276, "grad_norm": 1.6190274953842163, "learning_rate": 9.93485975209359e-06, "loss": 0.24004344642162323, "memory(GiB)": 22.04, "step": 138, "token_acc": 0.90625, "train_speed(iter/s)": 0.732875 }, { "epoch": 0.09985632183908046, "grad_norm": 1.8998982906341553, "learning_rate": 9.932934100998415e-06, "loss": 0.24400366842746735, "memory(GiB)": 22.04, "step": 139, "token_acc": 0.9151943462897526, "train_speed(iter/s)": 0.734576 }, { "epoch": 0.10057471264367816, "grad_norm": 1.7702089548110962, "learning_rate": 9.930980592415728e-06, "loss": 0.2416125386953354, "memory(GiB)": 22.04, "step": 140, "token_acc": 0.9063136456211812, "train_speed(iter/s)": 0.73626 }, { "epoch": 0.10129310344827586, "grad_norm": 1.6240352392196655, "learning_rate": 9.928999237377475e-06, "loss": 0.2210443615913391, "memory(GiB)": 22.04, "step": 141, "token_acc": 0.9295774647887324, "train_speed(iter/s)": 0.737927 }, { "epoch": 0.10201149425287356, "grad_norm": 1.750473141670227, "learning_rate": 9.926990047072849e-06, "loss": 0.23541398346424103, "memory(GiB)": 22.04, "step": 142, "token_acc": 0.9232323232323232, "train_speed(iter/s)": 0.73958 }, { "epoch": 0.10272988505747127, "grad_norm": 2.170330762863159, "learning_rate": 9.924953032848243e-06, "loss": 0.23631441593170166, "memory(GiB)": 22.04, "step": 143, "token_acc": 0.9084249084249084, "train_speed(iter/s)": 0.741182 }, { "epoch": 0.10344827586206896, "grad_norm": 3.16756010055542, "learning_rate": 9.922888206207174e-06, "loss": 0.2369396686553955, "memory(GiB)": 22.04, "step": 144, "token_acc": 0.9170212765957447, "train_speed(iter/s)": 0.742521 }, { "epoch": 0.10416666666666667, "grad_norm": 1.6588138341903687, "learning_rate": 9.920795578810223e-06, "loss": 0.24720795452594757, "memory(GiB)": 22.04, "step": 145, "token_acc": 0.9009174311926605, "train_speed(iter/s)": 0.743792 }, { "epoch": 0.10488505747126436, "grad_norm": 1.4406559467315674, "learning_rate": 9.918675162474974e-06, "loss": 0.2379404455423355, "memory(GiB)": 22.04, "step": 146, "token_acc": 0.9310344827586207, "train_speed(iter/s)": 0.745059 }, { "epoch": 0.10560344827586207, "grad_norm": 0.9256932139396667, "learning_rate": 9.916526969175932e-06, "loss": 0.2219916582107544, "memory(GiB)": 22.04, "step": 147, "token_acc": 0.9032738095238095, "train_speed(iter/s)": 0.745947 }, { "epoch": 0.10632183908045977, "grad_norm": 1.5220218896865845, "learning_rate": 9.914351011044472e-06, "loss": 0.2362460196018219, "memory(GiB)": 22.04, "step": 148, "token_acc": 0.9182389937106918, "train_speed(iter/s)": 0.746791 }, { "epoch": 0.10704022988505747, "grad_norm": 1.2706156969070435, "learning_rate": 9.912147300368766e-06, "loss": 0.22389021515846252, "memory(GiB)": 22.04, "step": 149, "token_acc": 0.9005736137667304, "train_speed(iter/s)": 0.748291 }, { "epoch": 0.10775862068965517, "grad_norm": 1.3866983652114868, "learning_rate": 9.909915849593705e-06, "loss": 0.22584214806556702, "memory(GiB)": 22.04, "step": 150, "token_acc": 0.9125, "train_speed(iter/s)": 0.749763 }, { "epoch": 0.10775862068965517, "eval_loss": 0.2299415022134781, "eval_runtime": 6.5344, "eval_samples_per_second": 68.867, "eval_steps_per_second": 2.296, "eval_token_acc": 0.9113854895104895, "step": 150 }, { "epoch": 0.10847701149425287, "grad_norm": 12.256382942199707, "learning_rate": 9.90765667132084e-06, "loss": 0.23844251036643982, "memory(GiB)": 22.04, "step": 151, "token_acc": 0.9123992413466098, "train_speed(iter/s)": 0.684127 }, { "epoch": 0.10919540229885058, "grad_norm": 2.986100673675537, "learning_rate": 9.905369778308304e-06, "loss": 0.24036195874214172, "memory(GiB)": 22.04, "step": 152, "token_acc": 0.9271653543307087, "train_speed(iter/s)": 0.685741 }, { "epoch": 0.10991379310344827, "grad_norm": 2.924368381500244, "learning_rate": 9.903055183470742e-06, "loss": 0.2319793403148651, "memory(GiB)": 22.04, "step": 153, "token_acc": 0.90234375, "train_speed(iter/s)": 0.687336 }, { "epoch": 0.11063218390804598, "grad_norm": 2.1302788257598877, "learning_rate": 9.900712899879237e-06, "loss": 0.23484164476394653, "memory(GiB)": 22.04, "step": 154, "token_acc": 0.9186507936507936, "train_speed(iter/s)": 0.688927 }, { "epoch": 0.11135057471264367, "grad_norm": 1.3652247190475464, "learning_rate": 9.89834294076124e-06, "loss": 0.23756006360054016, "memory(GiB)": 22.04, "step": 155, "token_acc": 0.908289241622575, "train_speed(iter/s)": 0.690504 }, { "epoch": 0.11206896551724138, "grad_norm": 2.664670944213867, "learning_rate": 9.895945319500488e-06, "loss": 0.22808465361595154, "memory(GiB)": 22.05, "step": 156, "token_acc": 0.9087837837837838, "train_speed(iter/s)": 0.692077 }, { "epoch": 0.11278735632183907, "grad_norm": 1.7507610321044922, "learning_rate": 9.89352004963694e-06, "loss": 0.23546862602233887, "memory(GiB)": 22.05, "step": 157, "token_acc": 0.9147424511545293, "train_speed(iter/s)": 0.693609 }, { "epoch": 0.11350574712643678, "grad_norm": 2.3135080337524414, "learning_rate": 9.891067144866687e-06, "loss": 0.2246210277080536, "memory(GiB)": 22.05, "step": 158, "token_acc": 0.8988970588235294, "train_speed(iter/s)": 0.695157 }, { "epoch": 0.11422413793103449, "grad_norm": 1.9778283834457397, "learning_rate": 9.888586619041882e-06, "loss": 0.22059768438339233, "memory(GiB)": 22.05, "step": 159, "token_acc": 0.9342359767891683, "train_speed(iter/s)": 0.696678 }, { "epoch": 0.11494252873563218, "grad_norm": 1.4737626314163208, "learning_rate": 9.886078486170665e-06, "loss": 0.24834270775318146, "memory(GiB)": 22.05, "step": 160, "token_acc": 0.8983739837398373, "train_speed(iter/s)": 0.698137 }, { "epoch": 0.11566091954022989, "grad_norm": 1.9049218893051147, "learning_rate": 9.883542760417074e-06, "loss": 0.21647751331329346, "memory(GiB)": 22.05, "step": 161, "token_acc": 0.9174664107485605, "train_speed(iter/s)": 0.699602 }, { "epoch": 0.11637931034482758, "grad_norm": 2.1588926315307617, "learning_rate": 9.880979456100974e-06, "loss": 0.22561410069465637, "memory(GiB)": 22.05, "step": 162, "token_acc": 0.9189686924493554, "train_speed(iter/s)": 0.700649 }, { "epoch": 0.11709770114942529, "grad_norm": 1.5971635580062866, "learning_rate": 9.878388587697975e-06, "loss": 0.21962696313858032, "memory(GiB)": 22.05, "step": 163, "token_acc": 0.9114688128772636, "train_speed(iter/s)": 0.701723 }, { "epoch": 0.11781609195402298, "grad_norm": 2.276752233505249, "learning_rate": 9.875770169839343e-06, "loss": 0.22907713055610657, "memory(GiB)": 22.05, "step": 164, "token_acc": 0.9047619047619048, "train_speed(iter/s)": 0.702452 }, { "epoch": 0.11853448275862069, "grad_norm": 2.0477635860443115, "learning_rate": 9.873124217311926e-06, "loss": 0.21524345874786377, "memory(GiB)": 22.05, "step": 165, "token_acc": 0.9027303754266212, "train_speed(iter/s)": 0.703729 }, { "epoch": 0.11925287356321838, "grad_norm": 1.342482566833496, "learning_rate": 9.870450745058066e-06, "loss": 0.22757768630981445, "memory(GiB)": 22.05, "step": 166, "token_acc": 0.9208494208494209, "train_speed(iter/s)": 0.705185 }, { "epoch": 0.11997126436781609, "grad_norm": 1.1608365774154663, "learning_rate": 9.867749768175516e-06, "loss": 0.21952572464942932, "memory(GiB)": 22.05, "step": 167, "token_acc": 0.9310344827586207, "train_speed(iter/s)": 0.706523 }, { "epoch": 0.1206896551724138, "grad_norm": 1.0210541486740112, "learning_rate": 9.865021301917358e-06, "loss": 0.21645024418830872, "memory(GiB)": 22.05, "step": 168, "token_acc": 0.9223985890652557, "train_speed(iter/s)": 0.707958 }, { "epoch": 0.12140804597701149, "grad_norm": 1.1041332483291626, "learning_rate": 9.862265361691901e-06, "loss": 0.2084260880947113, "memory(GiB)": 22.05, "step": 169, "token_acc": 0.914018691588785, "train_speed(iter/s)": 0.708944 }, { "epoch": 0.1221264367816092, "grad_norm": 1.215216875076294, "learning_rate": 9.859481963062623e-06, "loss": 0.22357940673828125, "memory(GiB)": 22.05, "step": 170, "token_acc": 0.9032258064516129, "train_speed(iter/s)": 0.710025 }, { "epoch": 0.12284482758620689, "grad_norm": 1.5652254819869995, "learning_rate": 9.856671121748053e-06, "loss": 0.2163272500038147, "memory(GiB)": 22.05, "step": 171, "token_acc": 0.9006849315068494, "train_speed(iter/s)": 0.711037 }, { "epoch": 0.1235632183908046, "grad_norm": 1.2962990999221802, "learning_rate": 9.853832853621703e-06, "loss": 0.2195151448249817, "memory(GiB)": 22.05, "step": 172, "token_acc": 0.9030206677265501, "train_speed(iter/s)": 0.711892 }, { "epoch": 0.12428160919540229, "grad_norm": 1.3539279699325562, "learning_rate": 9.850967174711968e-06, "loss": 0.21610432863235474, "memory(GiB)": 22.05, "step": 173, "token_acc": 0.9276672694394213, "train_speed(iter/s)": 0.713253 }, { "epoch": 0.125, "grad_norm": 1.3578412532806396, "learning_rate": 9.848074101202037e-06, "loss": 0.21898093819618225, "memory(GiB)": 22.05, "step": 174, "token_acc": 0.9072847682119205, "train_speed(iter/s)": 0.714622 }, { "epoch": 0.1257183908045977, "grad_norm": 1.556525707244873, "learning_rate": 9.845153649429808e-06, "loss": 0.2233959138393402, "memory(GiB)": 22.05, "step": 175, "token_acc": 0.9019607843137255, "train_speed(iter/s)": 0.715989 }, { "epoch": 0.12643678160919541, "grad_norm": 1.494619369506836, "learning_rate": 9.842205835887785e-06, "loss": 0.21436643600463867, "memory(GiB)": 22.05, "step": 176, "token_acc": 0.9348659003831418, "train_speed(iter/s)": 0.717342 }, { "epoch": 0.1271551724137931, "grad_norm": 1.1375237703323364, "learning_rate": 9.839230677222997e-06, "loss": 0.20291903614997864, "memory(GiB)": 22.05, "step": 177, "token_acc": 0.940677966101695, "train_speed(iter/s)": 0.718703 }, { "epoch": 0.1278735632183908, "grad_norm": 0.9925621151924133, "learning_rate": 9.836228190236892e-06, "loss": 0.21897557377815247, "memory(GiB)": 22.05, "step": 178, "token_acc": 0.9257246376811594, "train_speed(iter/s)": 0.720043 }, { "epoch": 0.1285919540229885, "grad_norm": 1.253004550933838, "learning_rate": 9.833198391885248e-06, "loss": 0.21895527839660645, "memory(GiB)": 22.05, "step": 179, "token_acc": 0.9105839416058394, "train_speed(iter/s)": 0.72138 }, { "epoch": 0.12931034482758622, "grad_norm": 1.329412579536438, "learning_rate": 9.83014129927808e-06, "loss": 0.20048490166664124, "memory(GiB)": 22.05, "step": 180, "token_acc": 0.9262948207171314, "train_speed(iter/s)": 0.722713 }, { "epoch": 0.1300287356321839, "grad_norm": 1.1390221118927002, "learning_rate": 9.82705692967954e-06, "loss": 0.20828093588352203, "memory(GiB)": 22.05, "step": 181, "token_acc": 0.9254457050243112, "train_speed(iter/s)": 0.724034 }, { "epoch": 0.1307471264367816, "grad_norm": 1.2132861614227295, "learning_rate": 9.823945300507815e-06, "loss": 0.20966939628124237, "memory(GiB)": 24.84, "step": 182, "token_acc": 0.9181818181818182, "train_speed(iter/s)": 0.725322 }, { "epoch": 0.1314655172413793, "grad_norm": 1.0235892534255981, "learning_rate": 9.820806429335042e-06, "loss": 0.22209616005420685, "memory(GiB)": 24.84, "step": 183, "token_acc": 0.9330453563714903, "train_speed(iter/s)": 0.726627 }, { "epoch": 0.13218390804597702, "grad_norm": 0.9361897706985474, "learning_rate": 9.817640333887194e-06, "loss": 0.21441872417926788, "memory(GiB)": 24.84, "step": 184, "token_acc": 0.9308600337268128, "train_speed(iter/s)": 0.727901 }, { "epoch": 0.13290229885057472, "grad_norm": 1.107532262802124, "learning_rate": 9.814447032043987e-06, "loss": 0.19575703144073486, "memory(GiB)": 24.84, "step": 185, "token_acc": 0.926, "train_speed(iter/s)": 0.729167 }, { "epoch": 0.1336206896551724, "grad_norm": 1.1905796527862549, "learning_rate": 9.81122654183878e-06, "loss": 0.20645347237586975, "memory(GiB)": 24.84, "step": 186, "token_acc": 0.9278169014084507, "train_speed(iter/s)": 0.730429 }, { "epoch": 0.1343390804597701, "grad_norm": 1.1426783800125122, "learning_rate": 9.807978881458468e-06, "loss": 0.20688220858573914, "memory(GiB)": 24.84, "step": 187, "token_acc": 0.9305084745762712, "train_speed(iter/s)": 0.731682 }, { "epoch": 0.13505747126436782, "grad_norm": 1.831527829170227, "learning_rate": 9.804704069243389e-06, "loss": 0.21625418961048126, "memory(GiB)": 24.84, "step": 188, "token_acc": 0.9337016574585635, "train_speed(iter/s)": 0.732917 }, { "epoch": 0.13577586206896552, "grad_norm": 1.3148528337478638, "learning_rate": 9.801402123687205e-06, "loss": 0.19783540070056915, "memory(GiB)": 24.84, "step": 189, "token_acc": 0.9301848049281314, "train_speed(iter/s)": 0.734146 }, { "epoch": 0.13649425287356323, "grad_norm": 1.074392318725586, "learning_rate": 9.798073063436815e-06, "loss": 0.19579976797103882, "memory(GiB)": 24.84, "step": 190, "token_acc": 0.9358288770053476, "train_speed(iter/s)": 0.735385 }, { "epoch": 0.1372126436781609, "grad_norm": 1.2053247690200806, "learning_rate": 9.794716907292237e-06, "loss": 0.20485402643680573, "memory(GiB)": 24.84, "step": 191, "token_acc": 0.9156626506024096, "train_speed(iter/s)": 0.736625 }, { "epoch": 0.13793103448275862, "grad_norm": 1.155835509300232, "learning_rate": 9.791333674206507e-06, "loss": 0.20083120465278625, "memory(GiB)": 24.84, "step": 192, "token_acc": 0.9337349397590361, "train_speed(iter/s)": 0.737845 }, { "epoch": 0.13864942528735633, "grad_norm": 0.9754124879837036, "learning_rate": 9.787923383285571e-06, "loss": 0.20589718222618103, "memory(GiB)": 24.84, "step": 193, "token_acc": 0.9323770491803278, "train_speed(iter/s)": 0.739068 }, { "epoch": 0.13936781609195403, "grad_norm": 2.156139373779297, "learning_rate": 9.784486053788179e-06, "loss": 0.20241686701774597, "memory(GiB)": 24.84, "step": 194, "token_acc": 0.9229249011857708, "train_speed(iter/s)": 0.740268 }, { "epoch": 0.1400862068965517, "grad_norm": 1.343120813369751, "learning_rate": 9.78102170512577e-06, "loss": 0.20425817370414734, "memory(GiB)": 24.84, "step": 195, "token_acc": 0.9396378269617707, "train_speed(iter/s)": 0.741462 }, { "epoch": 0.14080459770114942, "grad_norm": 2.005476713180542, "learning_rate": 9.77753035686237e-06, "loss": 0.20664328336715698, "memory(GiB)": 24.84, "step": 196, "token_acc": 0.9352640545144804, "train_speed(iter/s)": 0.742637 }, { "epoch": 0.14152298850574713, "grad_norm": 2.070678949356079, "learning_rate": 9.77401202871448e-06, "loss": 0.19421425461769104, "memory(GiB)": 24.84, "step": 197, "token_acc": 0.9307432432432432, "train_speed(iter/s)": 0.743779 }, { "epoch": 0.14224137931034483, "grad_norm": 1.4514005184173584, "learning_rate": 9.770466740550963e-06, "loss": 0.20578676462173462, "memory(GiB)": 24.84, "step": 198, "token_acc": 0.9137645107794361, "train_speed(iter/s)": 0.744554 }, { "epoch": 0.14295977011494254, "grad_norm": 2.5176234245300293, "learning_rate": 9.766894512392923e-06, "loss": 0.2052486091852188, "memory(GiB)": 24.84, "step": 199, "token_acc": 0.9154929577464789, "train_speed(iter/s)": 0.745549 }, { "epoch": 0.14367816091954022, "grad_norm": 1.0861202478408813, "learning_rate": 9.763295364413616e-06, "loss": 0.19605907797813416, "memory(GiB)": 24.84, "step": 200, "token_acc": 0.9277777777777778, "train_speed(iter/s)": 0.746454 }, { "epoch": 0.14367816091954022, "eval_loss": 0.19427108764648438, "eval_runtime": 5.9574, "eval_samples_per_second": 75.536, "eval_steps_per_second": 2.518, "eval_token_acc": 0.9256056443556444, "step": 200 }, { "epoch": 0.14439655172413793, "grad_norm": 1.9137955904006958, "learning_rate": 9.759669316938307e-06, "loss": 0.20330177247524261, "memory(GiB)": 24.84, "step": 201, "token_acc": 0.9277164439279488, "train_speed(iter/s)": 0.696902 }, { "epoch": 0.14511494252873564, "grad_norm": 1.0583440065383911, "learning_rate": 9.756016390444174e-06, "loss": 0.18788671493530273, "memory(GiB)": 24.84, "step": 202, "token_acc": 0.9367720465890182, "train_speed(iter/s)": 0.698104 }, { "epoch": 0.14583333333333334, "grad_norm": 2.1851727962493896, "learning_rate": 9.752336605560191e-06, "loss": 0.19478628039360046, "memory(GiB)": 24.84, "step": 203, "token_acc": 0.9202334630350194, "train_speed(iter/s)": 0.699315 }, { "epoch": 0.14655172413793102, "grad_norm": 1.3496832847595215, "learning_rate": 9.748629983067004e-06, "loss": 0.19495636224746704, "memory(GiB)": 24.84, "step": 204, "token_acc": 0.9329268292682927, "train_speed(iter/s)": 0.700512 }, { "epoch": 0.14727011494252873, "grad_norm": 1.5665887594223022, "learning_rate": 9.744896543896818e-06, "loss": 0.19407391548156738, "memory(GiB)": 24.84, "step": 205, "token_acc": 0.9244060475161987, "train_speed(iter/s)": 0.701701 }, { "epoch": 0.14798850574712644, "grad_norm": 1.0636086463928223, "learning_rate": 9.741136309133279e-06, "loss": 0.18881715834140778, "memory(GiB)": 24.84, "step": 206, "token_acc": 0.9448979591836735, "train_speed(iter/s)": 0.702887 }, { "epoch": 0.14870689655172414, "grad_norm": 0.8942740559577942, "learning_rate": 9.737349300011353e-06, "loss": 0.18940697610378265, "memory(GiB)": 24.84, "step": 207, "token_acc": 0.9279112754158965, "train_speed(iter/s)": 0.704063 }, { "epoch": 0.14942528735632185, "grad_norm": 1.4068875312805176, "learning_rate": 9.733535537917211e-06, "loss": 0.20070528984069824, "memory(GiB)": 24.84, "step": 208, "token_acc": 0.9351851851851852, "train_speed(iter/s)": 0.705177 }, { "epoch": 0.15014367816091953, "grad_norm": 1.3585190773010254, "learning_rate": 9.729695044388098e-06, "loss": 0.19485361874103546, "memory(GiB)": 24.84, "step": 209, "token_acc": 0.9329608938547486, "train_speed(iter/s)": 0.706313 }, { "epoch": 0.15086206896551724, "grad_norm": 1.2172582149505615, "learning_rate": 9.725827841112226e-06, "loss": 0.19421790540218353, "memory(GiB)": 24.84, "step": 210, "token_acc": 0.9247104247104247, "train_speed(iter/s)": 0.707456 }, { "epoch": 0.15158045977011494, "grad_norm": 1.1872955560684204, "learning_rate": 9.721933949928637e-06, "loss": 0.19092409312725067, "memory(GiB)": 24.84, "step": 211, "token_acc": 0.9229480737018425, "train_speed(iter/s)": 0.708534 }, { "epoch": 0.15229885057471265, "grad_norm": 1.304781436920166, "learning_rate": 9.718013392827087e-06, "loss": 0.20113182067871094, "memory(GiB)": 24.84, "step": 212, "token_acc": 0.935969868173258, "train_speed(iter/s)": 0.709589 }, { "epoch": 0.15301724137931033, "grad_norm": 1.1453309059143066, "learning_rate": 9.714066191947928e-06, "loss": 0.1901583969593048, "memory(GiB)": 24.84, "step": 213, "token_acc": 0.9121338912133892, "train_speed(iter/s)": 0.710487 }, { "epoch": 0.15373563218390804, "grad_norm": 1.0522037744522095, "learning_rate": 9.710092369581966e-06, "loss": 0.1946502923965454, "memory(GiB)": 24.84, "step": 214, "token_acc": 0.9356521739130435, "train_speed(iter/s)": 0.711314 }, { "epoch": 0.15445402298850575, "grad_norm": 1.404204249382019, "learning_rate": 9.70609194817035e-06, "loss": 0.20639878511428833, "memory(GiB)": 24.84, "step": 215, "token_acc": 0.923469387755102, "train_speed(iter/s)": 0.712431 }, { "epoch": 0.15517241379310345, "grad_norm": 1.0829970836639404, "learning_rate": 9.702064950304442e-06, "loss": 0.20797958970069885, "memory(GiB)": 24.84, "step": 216, "token_acc": 0.9083044982698962, "train_speed(iter/s)": 0.713538 }, { "epoch": 0.15589080459770116, "grad_norm": 1.0975240468978882, "learning_rate": 9.698011398725682e-06, "loss": 0.1960044503211975, "memory(GiB)": 24.84, "step": 217, "token_acc": 0.9078694817658349, "train_speed(iter/s)": 0.714638 }, { "epoch": 0.15660919540229884, "grad_norm": 1.3524855375289917, "learning_rate": 9.693931316325473e-06, "loss": 0.20202916860580444, "memory(GiB)": 24.84, "step": 218, "token_acc": 0.9351145038167938, "train_speed(iter/s)": 0.715577 }, { "epoch": 0.15732758620689655, "grad_norm": 1.0087871551513672, "learning_rate": 9.689824726145038e-06, "loss": 0.19543080031871796, "memory(GiB)": 24.84, "step": 219, "token_acc": 0.9270462633451957, "train_speed(iter/s)": 0.716444 }, { "epoch": 0.15804597701149425, "grad_norm": 1.1391695737838745, "learning_rate": 9.685691651375297e-06, "loss": 0.17688420414924622, "memory(GiB)": 24.84, "step": 220, "token_acc": 0.9424860853432282, "train_speed(iter/s)": 0.717273 }, { "epoch": 0.15876436781609196, "grad_norm": 0.9525985717773438, "learning_rate": 9.681532115356737e-06, "loss": 0.17621678113937378, "memory(GiB)": 24.84, "step": 221, "token_acc": 0.9316081330868762, "train_speed(iter/s)": 0.718102 }, { "epoch": 0.15948275862068967, "grad_norm": 2.5463855266571045, "learning_rate": 9.677346141579277e-06, "loss": 0.18204565346240997, "memory(GiB)": 24.84, "step": 222, "token_acc": 0.9230769230769231, "train_speed(iter/s)": 0.719063 }, { "epoch": 0.16020114942528735, "grad_norm": 2.046905517578125, "learning_rate": 9.673133753682138e-06, "loss": 0.18908774852752686, "memory(GiB)": 24.84, "step": 223, "token_acc": 0.9324090121317158, "train_speed(iter/s)": 0.720073 }, { "epoch": 0.16091954022988506, "grad_norm": 1.7669357061386108, "learning_rate": 9.668894975453705e-06, "loss": 0.21247610449790955, "memory(GiB)": 24.84, "step": 224, "token_acc": 0.9352226720647774, "train_speed(iter/s)": 0.721125 }, { "epoch": 0.16163793103448276, "grad_norm": 1.6063449382781982, "learning_rate": 9.664629830831396e-06, "loss": 0.192172572016716, "memory(GiB)": 24.84, "step": 225, "token_acc": 0.9331941544885177, "train_speed(iter/s)": 0.72216 }, { "epoch": 0.16235632183908047, "grad_norm": 1.2727521657943726, "learning_rate": 9.66033834390153e-06, "loss": 0.19713172316551208, "memory(GiB)": 24.84, "step": 226, "token_acc": 0.9326315789473684, "train_speed(iter/s)": 0.723158 }, { "epoch": 0.16307471264367815, "grad_norm": 1.2347190380096436, "learning_rate": 9.656020538899183e-06, "loss": 0.18433819711208344, "memory(GiB)": 24.84, "step": 227, "token_acc": 0.9275618374558304, "train_speed(iter/s)": 0.72401 }, { "epoch": 0.16379310344827586, "grad_norm": 1.074959397315979, "learning_rate": 9.65167644020806e-06, "loss": 0.19094997644424438, "memory(GiB)": 24.84, "step": 228, "token_acc": 0.9243542435424354, "train_speed(iter/s)": 0.724833 }, { "epoch": 0.16451149425287356, "grad_norm": 1.0248912572860718, "learning_rate": 9.647306072360349e-06, "loss": 0.1931467354297638, "memory(GiB)": 24.84, "step": 229, "token_acc": 0.9365942028985508, "train_speed(iter/s)": 0.725639 }, { "epoch": 0.16522988505747127, "grad_norm": 1.5024125576019287, "learning_rate": 9.64290946003659e-06, "loss": 0.18492433428764343, "memory(GiB)": 24.84, "step": 230, "token_acc": 0.9217687074829932, "train_speed(iter/s)": 0.726512 }, { "epoch": 0.16594827586206898, "grad_norm": 1.0916630029678345, "learning_rate": 9.638486628065528e-06, "loss": 0.17669491469860077, "memory(GiB)": 24.84, "step": 231, "token_acc": 0.9160714285714285, "train_speed(iter/s)": 0.727514 }, { "epoch": 0.16666666666666666, "grad_norm": 0.8625556826591492, "learning_rate": 9.63403760142398e-06, "loss": 0.18543057143688202, "memory(GiB)": 24.84, "step": 232, "token_acc": 0.937888198757764, "train_speed(iter/s)": 0.728504 }, { "epoch": 0.16738505747126436, "grad_norm": 1.178329586982727, "learning_rate": 9.629562405236687e-06, "loss": 0.17150650918483734, "memory(GiB)": 24.84, "step": 233, "token_acc": 0.9346642468239564, "train_speed(iter/s)": 0.729483 }, { "epoch": 0.16810344827586207, "grad_norm": 1.4486019611358643, "learning_rate": 9.625061064776183e-06, "loss": 0.18622320890426636, "memory(GiB)": 24.84, "step": 234, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 0.730487 }, { "epoch": 0.16882183908045978, "grad_norm": 1.210326910018921, "learning_rate": 9.620533605462636e-06, "loss": 0.171561598777771, "memory(GiB)": 24.84, "step": 235, "token_acc": 0.9441340782122905, "train_speed(iter/s)": 0.731482 }, { "epoch": 0.16954022988505746, "grad_norm": 2.732008457183838, "learning_rate": 9.61598005286372e-06, "loss": 0.18958157300949097, "memory(GiB)": 24.84, "step": 236, "token_acc": 0.9454191033138402, "train_speed(iter/s)": 0.732467 }, { "epoch": 0.17025862068965517, "grad_norm": 1.7039433717727661, "learning_rate": 9.611400432694463e-06, "loss": 0.18519675731658936, "memory(GiB)": 24.84, "step": 237, "token_acc": 0.9108695652173913, "train_speed(iter/s)": 0.733453 }, { "epoch": 0.17097701149425287, "grad_norm": 1.149326205253601, "learning_rate": 9.606794770817102e-06, "loss": 0.18558377027511597, "memory(GiB)": 24.84, "step": 238, "token_acc": 0.9157706093189965, "train_speed(iter/s)": 0.734407 }, { "epoch": 0.17169540229885058, "grad_norm": 1.4714226722717285, "learning_rate": 9.602163093240936e-06, "loss": 0.1851988136768341, "memory(GiB)": 24.84, "step": 239, "token_acc": 0.9257246376811594, "train_speed(iter/s)": 0.735329 }, { "epoch": 0.1724137931034483, "grad_norm": 1.1995127201080322, "learning_rate": 9.597505426122184e-06, "loss": 0.1687508523464203, "memory(GiB)": 24.84, "step": 240, "token_acc": 0.9267822736030829, "train_speed(iter/s)": 0.736264 }, { "epoch": 0.17313218390804597, "grad_norm": 1.4919582605361938, "learning_rate": 9.592821795763835e-06, "loss": 0.19475847482681274, "memory(GiB)": 24.84, "step": 241, "token_acc": 0.9214876033057852, "train_speed(iter/s)": 0.737166 }, { "epoch": 0.17385057471264367, "grad_norm": 1.3282195329666138, "learning_rate": 9.588112228615495e-06, "loss": 0.1831132024526596, "memory(GiB)": 24.84, "step": 242, "token_acc": 0.9398230088495575, "train_speed(iter/s)": 0.738104 }, { "epoch": 0.17456896551724138, "grad_norm": 1.2160499095916748, "learning_rate": 9.583376751273248e-06, "loss": 0.17874905467033386, "memory(GiB)": 24.84, "step": 243, "token_acc": 0.9233511586452763, "train_speed(iter/s)": 0.739006 }, { "epoch": 0.1752873563218391, "grad_norm": 1.2552424669265747, "learning_rate": 9.57861539047949e-06, "loss": 0.18485480546951294, "memory(GiB)": 24.84, "step": 244, "token_acc": 0.9389763779527559, "train_speed(iter/s)": 0.739933 }, { "epoch": 0.17600574712643677, "grad_norm": 1.4071519374847412, "learning_rate": 9.57382817312279e-06, "loss": 0.18947458267211914, "memory(GiB)": 24.84, "step": 245, "token_acc": 0.9542124542124543, "train_speed(iter/s)": 0.740849 }, { "epoch": 0.17672413793103448, "grad_norm": 1.2728570699691772, "learning_rate": 9.569015126237744e-06, "loss": 0.18542245030403137, "memory(GiB)": 24.84, "step": 246, "token_acc": 0.9360568383658969, "train_speed(iter/s)": 0.741782 }, { "epoch": 0.17744252873563218, "grad_norm": 1.1807173490524292, "learning_rate": 9.564176277004804e-06, "loss": 0.1766074001789093, "memory(GiB)": 24.84, "step": 247, "token_acc": 0.9345991561181435, "train_speed(iter/s)": 0.742717 }, { "epoch": 0.1781609195402299, "grad_norm": 1.1560941934585571, "learning_rate": 9.559311652750135e-06, "loss": 0.17736376821994781, "memory(GiB)": 24.84, "step": 248, "token_acc": 0.9405320813771518, "train_speed(iter/s)": 0.743577 }, { "epoch": 0.1788793103448276, "grad_norm": 2.1547412872314453, "learning_rate": 9.554421280945467e-06, "loss": 0.18056851625442505, "memory(GiB)": 24.84, "step": 249, "token_acc": 0.9414556962025317, "train_speed(iter/s)": 0.744207 }, { "epoch": 0.17959770114942528, "grad_norm": 1.1441009044647217, "learning_rate": 9.549505189207924e-06, "loss": 0.16143842041492462, "memory(GiB)": 24.84, "step": 250, "token_acc": 0.9271948608137045, "train_speed(iter/s)": 0.744526 }, { "epoch": 0.17959770114942528, "eval_loss": 0.17048770189285278, "eval_runtime": 6.2165, "eval_samples_per_second": 72.388, "eval_steps_per_second": 2.413, "eval_token_acc": 0.935064935064935, "step": 250 }, { "epoch": 0.18031609195402298, "grad_norm": 1.4486165046691895, "learning_rate": 9.54456340529988e-06, "loss": 0.17096185684204102, "memory(GiB)": 24.84, "step": 251, "token_acc": 0.9339611536197764, "train_speed(iter/s)": 0.704131 }, { "epoch": 0.1810344827586207, "grad_norm": 2.873619318008423, "learning_rate": 9.539595957128803e-06, "loss": 0.18871326744556427, "memory(GiB)": 24.84, "step": 252, "token_acc": 0.9222614840989399, "train_speed(iter/s)": 0.705082 }, { "epoch": 0.1817528735632184, "grad_norm": 0.9712240099906921, "learning_rate": 9.534602872747086e-06, "loss": 0.16412265598773956, "memory(GiB)": 24.84, "step": 253, "token_acc": 0.9384359400998337, "train_speed(iter/s)": 0.70602 }, { "epoch": 0.1824712643678161, "grad_norm": 1.2170720100402832, "learning_rate": 9.529584180351902e-06, "loss": 0.17214810848236084, "memory(GiB)": 24.84, "step": 254, "token_acc": 0.9409020217729394, "train_speed(iter/s)": 0.706984 }, { "epoch": 0.18318965517241378, "grad_norm": 1.0731936693191528, "learning_rate": 9.524539908285039e-06, "loss": 0.1753038465976715, "memory(GiB)": 24.84, "step": 255, "token_acc": 0.9347826086956522, "train_speed(iter/s)": 0.707936 }, { "epoch": 0.1839080459770115, "grad_norm": 1.1383955478668213, "learning_rate": 9.519470085032733e-06, "loss": 0.17063133418560028, "memory(GiB)": 24.84, "step": 256, "token_acc": 0.9366336633663367, "train_speed(iter/s)": 0.708876 }, { "epoch": 0.1846264367816092, "grad_norm": 1.2984424829483032, "learning_rate": 9.51437473922552e-06, "loss": 0.17558182775974274, "memory(GiB)": 24.84, "step": 257, "token_acc": 0.9285714285714286, "train_speed(iter/s)": 0.709806 }, { "epoch": 0.1853448275862069, "grad_norm": 1.277177333831787, "learning_rate": 9.509253899638066e-06, "loss": 0.17482249438762665, "memory(GiB)": 24.84, "step": 258, "token_acc": 0.913322632423756, "train_speed(iter/s)": 0.710734 }, { "epoch": 0.18606321839080459, "grad_norm": 1.174424409866333, "learning_rate": 9.504107595189007e-06, "loss": 0.17518410086631775, "memory(GiB)": 24.84, "step": 259, "token_acc": 0.9433551198257081, "train_speed(iter/s)": 0.711677 }, { "epoch": 0.1867816091954023, "grad_norm": 1.3053314685821533, "learning_rate": 9.498935854940785e-06, "loss": 0.1856999397277832, "memory(GiB)": 24.84, "step": 260, "token_acc": 0.9481327800829875, "train_speed(iter/s)": 0.71263 }, { "epoch": 0.1875, "grad_norm": 1.4047025442123413, "learning_rate": 9.493738708099485e-06, "loss": 0.1603386402130127, "memory(GiB)": 24.84, "step": 261, "token_acc": 0.9285714285714286, "train_speed(iter/s)": 0.713515 }, { "epoch": 0.1882183908045977, "grad_norm": 1.3200582265853882, "learning_rate": 9.488516184014667e-06, "loss": 0.1756541132926941, "memory(GiB)": 24.84, "step": 262, "token_acc": 0.9107806691449815, "train_speed(iter/s)": 0.714448 }, { "epoch": 0.18893678160919541, "grad_norm": 1.688560962677002, "learning_rate": 9.483268312179206e-06, "loss": 0.17776352167129517, "memory(GiB)": 24.84, "step": 263, "token_acc": 0.9245647969052224, "train_speed(iter/s)": 0.715354 }, { "epoch": 0.1896551724137931, "grad_norm": 0.9999760985374451, "learning_rate": 9.477995122229117e-06, "loss": 0.16564805805683136, "memory(GiB)": 24.84, "step": 264, "token_acc": 0.9478827361563518, "train_speed(iter/s)": 0.716006 }, { "epoch": 0.1903735632183908, "grad_norm": 2.7515392303466797, "learning_rate": 9.4726966439434e-06, "loss": 0.1725974678993225, "memory(GiB)": 24.84, "step": 265, "token_acc": 0.9252173913043479, "train_speed(iter/s)": 0.716798 }, { "epoch": 0.1910919540229885, "grad_norm": 1.3686915636062622, "learning_rate": 9.467372907243858e-06, "loss": 0.17224182188510895, "memory(GiB)": 24.84, "step": 266, "token_acc": 0.9381044487427466, "train_speed(iter/s)": 0.717716 }, { "epoch": 0.19181034482758622, "grad_norm": 4.453400611877441, "learning_rate": 9.462023942194934e-06, "loss": 0.16680872440338135, "memory(GiB)": 24.84, "step": 267, "token_acc": 0.9460966542750929, "train_speed(iter/s)": 0.718615 }, { "epoch": 0.1925287356321839, "grad_norm": 2.131878614425659, "learning_rate": 9.456649779003548e-06, "loss": 0.18635231256484985, "memory(GiB)": 24.84, "step": 268, "token_acc": 0.9403578528827038, "train_speed(iter/s)": 0.719514 }, { "epoch": 0.1932471264367816, "grad_norm": 1.34542977809906, "learning_rate": 9.451250448018916e-06, "loss": 0.1784970760345459, "memory(GiB)": 24.84, "step": 269, "token_acc": 0.945518453427065, "train_speed(iter/s)": 0.720419 }, { "epoch": 0.1939655172413793, "grad_norm": 1.3527719974517822, "learning_rate": 9.44582597973238e-06, "loss": 0.17781509459018707, "memory(GiB)": 24.84, "step": 270, "token_acc": 0.9228007181328546, "train_speed(iter/s)": 0.721276 }, { "epoch": 0.19468390804597702, "grad_norm": 1.4100406169891357, "learning_rate": 9.440376404777245e-06, "loss": 0.1739485263824463, "memory(GiB)": 24.84, "step": 271, "token_acc": 0.9339285714285714, "train_speed(iter/s)": 0.72209 }, { "epoch": 0.19540229885057472, "grad_norm": 1.158107042312622, "learning_rate": 9.434901753928593e-06, "loss": 0.1613406240940094, "memory(GiB)": 24.84, "step": 272, "token_acc": 0.9208400646203554, "train_speed(iter/s)": 0.722952 }, { "epoch": 0.1961206896551724, "grad_norm": 4.168455123901367, "learning_rate": 9.429402058103122e-06, "loss": 0.1677890121936798, "memory(GiB)": 24.84, "step": 273, "token_acc": 0.9330628803245437, "train_speed(iter/s)": 0.723815 }, { "epoch": 0.1968390804597701, "grad_norm": 2.258901357650757, "learning_rate": 9.423877348358956e-06, "loss": 0.17392060160636902, "memory(GiB)": 24.84, "step": 274, "token_acc": 0.9246861924686193, "train_speed(iter/s)": 0.724686 }, { "epoch": 0.19755747126436782, "grad_norm": 1.8273433446884155, "learning_rate": 9.418327655895492e-06, "loss": 0.16346615552902222, "memory(GiB)": 24.84, "step": 275, "token_acc": 0.9379432624113475, "train_speed(iter/s)": 0.725383 }, { "epoch": 0.19827586206896552, "grad_norm": 1.3173205852508545, "learning_rate": 9.4127530120532e-06, "loss": 0.17696520686149597, "memory(GiB)": 24.84, "step": 276, "token_acc": 0.9193548387096774, "train_speed(iter/s)": 0.725941 }, { "epoch": 0.19899425287356323, "grad_norm": 1.2824018001556396, "learning_rate": 9.407153448313458e-06, "loss": 0.16345083713531494, "memory(GiB)": 24.84, "step": 277, "token_acc": 0.9309090909090909, "train_speed(iter/s)": 0.726561 }, { "epoch": 0.1997126436781609, "grad_norm": 1.6922494173049927, "learning_rate": 9.401528996298375e-06, "loss": 0.15776585042476654, "memory(GiB)": 24.84, "step": 278, "token_acc": 0.9403453689167975, "train_speed(iter/s)": 0.727343 }, { "epoch": 0.20043103448275862, "grad_norm": 0.9524480104446411, "learning_rate": 9.395879687770611e-06, "loss": 0.16145414113998413, "memory(GiB)": 24.84, "step": 279, "token_acc": 0.9430147058823529, "train_speed(iter/s)": 0.728155 }, { "epoch": 0.20114942528735633, "grad_norm": 1.1509290933609009, "learning_rate": 9.390205554633193e-06, "loss": 0.1712127923965454, "memory(GiB)": 24.84, "step": 280, "token_acc": 0.9474708171206225, "train_speed(iter/s)": 0.72897 }, { "epoch": 0.20186781609195403, "grad_norm": 1.0565576553344727, "learning_rate": 9.384506628929344e-06, "loss": 0.1632165014743805, "memory(GiB)": 24.84, "step": 281, "token_acc": 0.9375, "train_speed(iter/s)": 0.729789 }, { "epoch": 0.2025862068965517, "grad_norm": 1.1710087060928345, "learning_rate": 9.378782942842292e-06, "loss": 0.16626501083374023, "memory(GiB)": 24.84, "step": 282, "token_acc": 0.9517819706498952, "train_speed(iter/s)": 0.730619 }, { "epoch": 0.20330459770114942, "grad_norm": 0.9994138479232788, "learning_rate": 9.373034528695097e-06, "loss": 0.1653590202331543, "memory(GiB)": 24.84, "step": 283, "token_acc": 0.9293361884368309, "train_speed(iter/s)": 0.731354 }, { "epoch": 0.20402298850574713, "grad_norm": 1.1102218627929688, "learning_rate": 9.367261418950459e-06, "loss": 0.16029050946235657, "memory(GiB)": 24.84, "step": 284, "token_acc": 0.9244935543278084, "train_speed(iter/s)": 0.73203 }, { "epoch": 0.20474137931034483, "grad_norm": 0.8043876886367798, "learning_rate": 9.361463646210545e-06, "loss": 0.1460798680782318, "memory(GiB)": 24.84, "step": 285, "token_acc": 0.9461400359066428, "train_speed(iter/s)": 0.732693 }, { "epoch": 0.20545977011494254, "grad_norm": 1.192240595817566, "learning_rate": 9.355641243216798e-06, "loss": 0.16191384196281433, "memory(GiB)": 24.84, "step": 286, "token_acc": 0.9152542372881356, "train_speed(iter/s)": 0.733387 }, { "epoch": 0.20617816091954022, "grad_norm": 1.0879827737808228, "learning_rate": 9.349794242849752e-06, "loss": 0.16262328624725342, "memory(GiB)": 24.84, "step": 287, "token_acc": 0.9291338582677166, "train_speed(iter/s)": 0.734173 }, { "epoch": 0.20689655172413793, "grad_norm": 1.5752301216125488, "learning_rate": 9.343922678128854e-06, "loss": 0.16404327750205994, "memory(GiB)": 24.84, "step": 288, "token_acc": 0.9306049822064056, "train_speed(iter/s)": 0.734977 }, { "epoch": 0.20761494252873564, "grad_norm": 1.449565052986145, "learning_rate": 9.338026582212268e-06, "loss": 0.16222915053367615, "memory(GiB)": 24.84, "step": 289, "token_acc": 0.9401869158878504, "train_speed(iter/s)": 0.735765 }, { "epoch": 0.20833333333333334, "grad_norm": 1.4924116134643555, "learning_rate": 9.332105988396692e-06, "loss": 0.16684061288833618, "memory(GiB)": 24.84, "step": 290, "token_acc": 0.9400826446280992, "train_speed(iter/s)": 0.736548 }, { "epoch": 0.20905172413793102, "grad_norm": 1.2434477806091309, "learning_rate": 9.326160930117168e-06, "loss": 0.15447336435317993, "memory(GiB)": 24.84, "step": 291, "token_acc": 0.9461697722567288, "train_speed(iter/s)": 0.737336 }, { "epoch": 0.20977011494252873, "grad_norm": 1.0727328062057495, "learning_rate": 9.3201914409469e-06, "loss": 0.15826071798801422, "memory(GiB)": 24.84, "step": 292, "token_acc": 0.9403747870528109, "train_speed(iter/s)": 0.738124 }, { "epoch": 0.21048850574712644, "grad_norm": 1.5840122699737549, "learning_rate": 9.314197554597053e-06, "loss": 0.17694279551506042, "memory(GiB)": 24.84, "step": 293, "token_acc": 0.9330783938814532, "train_speed(iter/s)": 0.738912 }, { "epoch": 0.21120689655172414, "grad_norm": 1.4127275943756104, "learning_rate": 9.308179304916573e-06, "loss": 0.15642927587032318, "memory(GiB)": 24.84, "step": 294, "token_acc": 0.95, "train_speed(iter/s)": 0.739699 }, { "epoch": 0.21192528735632185, "grad_norm": 1.0951980352401733, "learning_rate": 9.30213672589199e-06, "loss": 0.14578227698802948, "memory(GiB)": 24.84, "step": 295, "token_acc": 0.9647058823529412, "train_speed(iter/s)": 0.740478 }, { "epoch": 0.21264367816091953, "grad_norm": 1.294354796409607, "learning_rate": 9.29606985164723e-06, "loss": 0.15017825365066528, "memory(GiB)": 24.84, "step": 296, "token_acc": 0.9403578528827038, "train_speed(iter/s)": 0.741245 }, { "epoch": 0.21336206896551724, "grad_norm": 0.9250004887580872, "learning_rate": 9.289978716443417e-06, "loss": 0.15283486247062683, "memory(GiB)": 24.84, "step": 297, "token_acc": 0.9375, "train_speed(iter/s)": 0.742018 }, { "epoch": 0.21408045977011494, "grad_norm": 1.07221519947052, "learning_rate": 9.283863354678683e-06, "loss": 0.14911124110221863, "memory(GiB)": 24.84, "step": 298, "token_acc": 0.9491833030852994, "train_speed(iter/s)": 0.74279 }, { "epoch": 0.21479885057471265, "grad_norm": 1.3063242435455322, "learning_rate": 9.277723800887977e-06, "loss": 0.15536969900131226, "memory(GiB)": 24.84, "step": 299, "token_acc": 0.9352014010507881, "train_speed(iter/s)": 0.74351 }, { "epoch": 0.21551724137931033, "grad_norm": 1.4165470600128174, "learning_rate": 9.27156008974286e-06, "loss": 0.1630294770002365, "memory(GiB)": 24.84, "step": 300, "token_acc": 0.9467455621301775, "train_speed(iter/s)": 0.744028 }, { "epoch": 0.21551724137931033, "eval_loss": 0.1468629688024521, "eval_runtime": 7.5826, "eval_samples_per_second": 59.347, "eval_steps_per_second": 1.978, "eval_token_acc": 0.9429164585414586, "step": 300 }, { "epoch": 0.21623563218390804, "grad_norm": 1.3495118618011475, "learning_rate": 9.265372256051322e-06, "loss": 0.1614147126674652, "memory(GiB)": 24.84, "step": 301, "token_acc": 0.9440047253396338, "train_speed(iter/s)": 0.70714 }, { "epoch": 0.21695402298850575, "grad_norm": 1.419327974319458, "learning_rate": 9.259160334757575e-06, "loss": 0.15543609857559204, "memory(GiB)": 24.84, "step": 302, "token_acc": 0.9326923076923077, "train_speed(iter/s)": 0.707779 }, { "epoch": 0.21767241379310345, "grad_norm": 1.401384711265564, "learning_rate": 9.25292436094186e-06, "loss": 0.15874376893043518, "memory(GiB)": 24.84, "step": 303, "token_acc": 0.9469914040114613, "train_speed(iter/s)": 0.70857 }, { "epoch": 0.21839080459770116, "grad_norm": 0.9685477614402771, "learning_rate": 9.246664369820249e-06, "loss": 0.14868557453155518, "memory(GiB)": 24.84, "step": 304, "token_acc": 0.9526881720430107, "train_speed(iter/s)": 0.709367 }, { "epoch": 0.21910919540229884, "grad_norm": 1.2144882678985596, "learning_rate": 9.240380396744446e-06, "loss": 0.14889603853225708, "memory(GiB)": 24.84, "step": 305, "token_acc": 0.9509981851179673, "train_speed(iter/s)": 0.710147 }, { "epoch": 0.21982758620689655, "grad_norm": 1.1610817909240723, "learning_rate": 9.234072477201588e-06, "loss": 0.15774932503700256, "memory(GiB)": 24.84, "step": 306, "token_acc": 0.9507640067911715, "train_speed(iter/s)": 0.710947 }, { "epoch": 0.22054597701149425, "grad_norm": 1.3675167560577393, "learning_rate": 9.227740646814038e-06, "loss": 0.14327308535575867, "memory(GiB)": 24.84, "step": 307, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 0.711742 }, { "epoch": 0.22126436781609196, "grad_norm": 1.2221105098724365, "learning_rate": 9.2213849413392e-06, "loss": 0.1559586077928543, "memory(GiB)": 24.84, "step": 308, "token_acc": 0.9414225941422594, "train_speed(iter/s)": 0.712526 }, { "epoch": 0.22198275862068967, "grad_norm": 1.3934344053268433, "learning_rate": 9.215005396669294e-06, "loss": 0.14346933364868164, "memory(GiB)": 24.84, "step": 309, "token_acc": 0.961335676625659, "train_speed(iter/s)": 0.713299 }, { "epoch": 0.22270114942528735, "grad_norm": 1.1291680335998535, "learning_rate": 9.208602048831176e-06, "loss": 0.15378274023532867, "memory(GiB)": 24.84, "step": 310, "token_acc": 0.9485420240137221, "train_speed(iter/s)": 0.714085 }, { "epoch": 0.22341954022988506, "grad_norm": 1.0991337299346924, "learning_rate": 9.202174933986118e-06, "loss": 0.13361436128616333, "memory(GiB)": 24.84, "step": 311, "token_acc": 0.9522968197879859, "train_speed(iter/s)": 0.714872 }, { "epoch": 0.22413793103448276, "grad_norm": 1.4599182605743408, "learning_rate": 9.195724088429611e-06, "loss": 0.15119382739067078, "memory(GiB)": 24.84, "step": 312, "token_acc": 0.9489603024574669, "train_speed(iter/s)": 0.715645 }, { "epoch": 0.22485632183908047, "grad_norm": 1.0968987941741943, "learning_rate": 9.189249548591165e-06, "loss": 0.1554754376411438, "memory(GiB)": 24.84, "step": 313, "token_acc": 0.9321486268174475, "train_speed(iter/s)": 0.71625 }, { "epoch": 0.22557471264367815, "grad_norm": 1.2048288583755493, "learning_rate": 9.18275135103409e-06, "loss": 0.14621251821517944, "memory(GiB)": 24.84, "step": 314, "token_acc": 0.9372384937238494, "train_speed(iter/s)": 0.716811 }, { "epoch": 0.22629310344827586, "grad_norm": 0.8819748759269714, "learning_rate": 9.176229532455298e-06, "loss": 0.146192729473114, "memory(GiB)": 24.84, "step": 315, "token_acc": 0.9393346379647749, "train_speed(iter/s)": 0.717555 }, { "epoch": 0.22701149425287356, "grad_norm": 1.466875433921814, "learning_rate": 9.169684129685099e-06, "loss": 0.14800411462783813, "memory(GiB)": 24.84, "step": 316, "token_acc": 0.9448669201520913, "train_speed(iter/s)": 0.718317 }, { "epoch": 0.22772988505747127, "grad_norm": 1.4240273237228394, "learning_rate": 9.163115179686986e-06, "loss": 0.14824992418289185, "memory(GiB)": 24.84, "step": 317, "token_acc": 0.9435626102292769, "train_speed(iter/s)": 0.719079 }, { "epoch": 0.22844827586206898, "grad_norm": 1.168806791305542, "learning_rate": 9.156522719557428e-06, "loss": 0.1498476266860962, "memory(GiB)": 24.84, "step": 318, "token_acc": 0.9430255402750491, "train_speed(iter/s)": 0.719838 }, { "epoch": 0.22916666666666666, "grad_norm": 0.9806177020072937, "learning_rate": 9.149906786525662e-06, "loss": 0.13878831267356873, "memory(GiB)": 24.84, "step": 319, "token_acc": 0.9403292181069959, "train_speed(iter/s)": 0.720588 }, { "epoch": 0.22988505747126436, "grad_norm": 1.280739188194275, "learning_rate": 9.143267417953486e-06, "loss": 0.15622420608997345, "memory(GiB)": 24.84, "step": 320, "token_acc": 0.9269162210338681, "train_speed(iter/s)": 0.721332 }, { "epoch": 0.23060344827586207, "grad_norm": 1.0431073904037476, "learning_rate": 9.136604651335039e-06, "loss": 0.16762125492095947, "memory(GiB)": 24.84, "step": 321, "token_acc": 0.9236499068901304, "train_speed(iter/s)": 0.722084 }, { "epoch": 0.23132183908045978, "grad_norm": 1.000288963317871, "learning_rate": 9.129918524296596e-06, "loss": 0.14830325543880463, "memory(GiB)": 24.84, "step": 322, "token_acc": 0.9489194499017681, "train_speed(iter/s)": 0.722832 }, { "epoch": 0.23204022988505746, "grad_norm": 0.8490732312202454, "learning_rate": 9.123209074596353e-06, "loss": 0.13322216272354126, "memory(GiB)": 24.84, "step": 323, "token_acc": 0.9477124183006536, "train_speed(iter/s)": 0.723554 }, { "epoch": 0.23275862068965517, "grad_norm": 1.0365772247314453, "learning_rate": 9.11647634012422e-06, "loss": 0.1322324424982071, "memory(GiB)": 24.84, "step": 324, "token_acc": 0.9540918163672655, "train_speed(iter/s)": 0.724271 }, { "epoch": 0.23347701149425287, "grad_norm": 1.242830753326416, "learning_rate": 9.109720358901599e-06, "loss": 0.15137039124965668, "memory(GiB)": 24.84, "step": 325, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.724922 }, { "epoch": 0.23419540229885058, "grad_norm": 1.2658153772354126, "learning_rate": 9.102941169081167e-06, "loss": 0.16147923469543457, "memory(GiB)": 24.84, "step": 326, "token_acc": 0.9320754716981132, "train_speed(iter/s)": 0.725491 }, { "epoch": 0.2349137931034483, "grad_norm": 0.9772785902023315, "learning_rate": 9.096138808946673e-06, "loss": 0.150155708193779, "memory(GiB)": 24.84, "step": 327, "token_acc": 0.9445628997867804, "train_speed(iter/s)": 0.725986 }, { "epoch": 0.23563218390804597, "grad_norm": 1.3678042888641357, "learning_rate": 9.089313316912708e-06, "loss": 0.1660783290863037, "memory(GiB)": 24.84, "step": 328, "token_acc": 0.923728813559322, "train_speed(iter/s)": 0.726611 }, { "epoch": 0.23635057471264367, "grad_norm": 1.0742008686065674, "learning_rate": 9.082464731524502e-06, "loss": 0.15340426564216614, "memory(GiB)": 24.84, "step": 329, "token_acc": 0.9625212947189097, "train_speed(iter/s)": 0.72734 }, { "epoch": 0.23706896551724138, "grad_norm": 1.1345187425613403, "learning_rate": 9.075593091457692e-06, "loss": 0.13481304049491882, "memory(GiB)": 24.84, "step": 330, "token_acc": 0.9552529182879378, "train_speed(iter/s)": 0.728059 }, { "epoch": 0.2377873563218391, "grad_norm": 1.908892273902893, "learning_rate": 9.068698435518117e-06, "loss": 0.15422780811786652, "memory(GiB)": 24.84, "step": 331, "token_acc": 0.9430255402750491, "train_speed(iter/s)": 0.728757 }, { "epoch": 0.23850574712643677, "grad_norm": 1.0237723588943481, "learning_rate": 9.061780802641582e-06, "loss": 0.1384231597185135, "memory(GiB)": 24.84, "step": 332, "token_acc": 0.9425925925925925, "train_speed(iter/s)": 0.729142 }, { "epoch": 0.23922413793103448, "grad_norm": 1.636013150215149, "learning_rate": 9.05484023189366e-06, "loss": 0.14707940816879272, "memory(GiB)": 24.84, "step": 333, "token_acc": 0.9398797595190381, "train_speed(iter/s)": 0.729846 }, { "epoch": 0.23994252873563218, "grad_norm": 1.134458303451538, "learning_rate": 9.047876762469451e-06, "loss": 0.1455414593219757, "memory(GiB)": 24.84, "step": 334, "token_acc": 0.9405204460966543, "train_speed(iter/s)": 0.730555 }, { "epoch": 0.2406609195402299, "grad_norm": 1.7651512622833252, "learning_rate": 9.040890433693377e-06, "loss": 0.15867596864700317, "memory(GiB)": 24.84, "step": 335, "token_acc": 0.9552980132450332, "train_speed(iter/s)": 0.73126 }, { "epoch": 0.2413793103448276, "grad_norm": 1.384774923324585, "learning_rate": 9.033881285018945e-06, "loss": 0.1463089883327484, "memory(GiB)": 24.84, "step": 336, "token_acc": 0.9376218323586745, "train_speed(iter/s)": 0.73195 }, { "epoch": 0.24209770114942528, "grad_norm": 1.2504887580871582, "learning_rate": 9.026849356028534e-06, "loss": 0.13326561450958252, "memory(GiB)": 24.84, "step": 337, "token_acc": 0.9637404580152672, "train_speed(iter/s)": 0.73242 }, { "epoch": 0.24281609195402298, "grad_norm": 1.190233588218689, "learning_rate": 9.019794686433174e-06, "loss": 0.14306066930294037, "memory(GiB)": 24.84, "step": 338, "token_acc": 0.956442831215971, "train_speed(iter/s)": 0.732884 }, { "epoch": 0.2435344827586207, "grad_norm": 1.2957282066345215, "learning_rate": 9.01271731607231e-06, "loss": 0.14537212252616882, "memory(GiB)": 24.84, "step": 339, "token_acc": 0.9456740442655935, "train_speed(iter/s)": 0.733356 }, { "epoch": 0.2442528735632184, "grad_norm": 0.9001214504241943, "learning_rate": 9.005617284913586e-06, "loss": 0.13776838779449463, "memory(GiB)": 24.84, "step": 340, "token_acc": 0.9586206896551724, "train_speed(iter/s)": 0.733852 }, { "epoch": 0.2449712643678161, "grad_norm": 1.2198249101638794, "learning_rate": 8.998494633052622e-06, "loss": 0.13208453357219696, "memory(GiB)": 24.84, "step": 341, "token_acc": 0.9432485322896281, "train_speed(iter/s)": 0.734532 }, { "epoch": 0.24568965517241378, "grad_norm": 0.9589635729789734, "learning_rate": 8.991349400712772e-06, "loss": 0.12399931252002716, "memory(GiB)": 24.84, "step": 342, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.735207 }, { "epoch": 0.2464080459770115, "grad_norm": 1.498348593711853, "learning_rate": 8.984181628244917e-06, "loss": 0.1473434418439865, "memory(GiB)": 24.84, "step": 343, "token_acc": 0.9547325102880658, "train_speed(iter/s)": 0.735889 }, { "epoch": 0.2471264367816092, "grad_norm": 1.0425798892974854, "learning_rate": 8.976991356127225e-06, "loss": 0.14547964930534363, "memory(GiB)": 24.84, "step": 344, "token_acc": 0.9533582089552238, "train_speed(iter/s)": 0.736565 }, { "epoch": 0.2478448275862069, "grad_norm": 1.4299161434173584, "learning_rate": 8.969778624964922e-06, "loss": 0.13614286482334137, "memory(GiB)": 24.84, "step": 345, "token_acc": 0.9354838709677419, "train_speed(iter/s)": 0.737251 }, { "epoch": 0.24856321839080459, "grad_norm": 1.174292802810669, "learning_rate": 8.962543475490068e-06, "loss": 0.14519433677196503, "memory(GiB)": 24.84, "step": 346, "token_acc": 0.9247863247863248, "train_speed(iter/s)": 0.737922 }, { "epoch": 0.2492816091954023, "grad_norm": 1.3678618669509888, "learning_rate": 8.955285948561328e-06, "loss": 0.13812494277954102, "memory(GiB)": 24.84, "step": 347, "token_acc": 0.9428571428571428, "train_speed(iter/s)": 0.738597 }, { "epoch": 0.25, "grad_norm": 0.94019615650177, "learning_rate": 8.948006085163735e-06, "loss": 0.14584122598171234, "memory(GiB)": 24.84, "step": 348, "token_acc": 0.9423791821561338, "train_speed(iter/s)": 0.739272 }, { "epoch": 0.2507183908045977, "grad_norm": 1.9481747150421143, "learning_rate": 8.940703926408456e-06, "loss": 0.1363934576511383, "memory(GiB)": 24.84, "step": 349, "token_acc": 0.9465648854961832, "train_speed(iter/s)": 0.739804 }, { "epoch": 0.2514367816091954, "grad_norm": 1.3570866584777832, "learning_rate": 8.933379513532575e-06, "loss": 0.14979156851768494, "memory(GiB)": 24.84, "step": 350, "token_acc": 0.9377593360995851, "train_speed(iter/s)": 0.740288 }, { "epoch": 0.2514367816091954, "eval_loss": 0.13180726766586304, "eval_runtime": 5.8552, "eval_samples_per_second": 76.855, "eval_steps_per_second": 2.562, "eval_token_acc": 0.9493787462537463, "step": 350 }, { "epoch": 0.2521551724137931, "grad_norm": 1.162318468093872, "learning_rate": 8.926032887898846e-06, "loss": 0.14043065905570984, "memory(GiB)": 24.84, "step": 351, "token_acc": 0.9523305084745762, "train_speed(iter/s)": 0.711805 }, { "epoch": 0.25287356321839083, "grad_norm": 1.2196840047836304, "learning_rate": 8.91866409099546e-06, "loss": 0.1394709199666977, "memory(GiB)": 24.84, "step": 352, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.712493 }, { "epoch": 0.2535919540229885, "grad_norm": 1.1888127326965332, "learning_rate": 8.911273164435824e-06, "loss": 0.127862811088562, "memory(GiB)": 24.84, "step": 353, "token_acc": 0.9547325102880658, "train_speed(iter/s)": 0.713182 }, { "epoch": 0.2543103448275862, "grad_norm": 0.9853198528289795, "learning_rate": 8.903860149958308e-06, "loss": 0.14018258452415466, "memory(GiB)": 24.84, "step": 354, "token_acc": 0.9576427255985267, "train_speed(iter/s)": 0.713804 }, { "epoch": 0.2550287356321839, "grad_norm": 1.4263728857040405, "learning_rate": 8.896425089426022e-06, "loss": 0.1291871815919876, "memory(GiB)": 24.84, "step": 355, "token_acc": 0.9562737642585551, "train_speed(iter/s)": 0.714324 }, { "epoch": 0.2557471264367816, "grad_norm": 1.0120909214019775, "learning_rate": 8.888968024826575e-06, "loss": 0.12034127116203308, "memory(GiB)": 24.84, "step": 356, "token_acc": 0.9578783151326054, "train_speed(iter/s)": 0.714839 }, { "epoch": 0.25646551724137934, "grad_norm": 1.3119267225265503, "learning_rate": 8.881488998271834e-06, "loss": 0.12918488681316376, "memory(GiB)": 24.84, "step": 357, "token_acc": 0.9621342512908778, "train_speed(iter/s)": 0.715261 }, { "epoch": 0.257183908045977, "grad_norm": 1.77956223487854, "learning_rate": 8.873988051997702e-06, "loss": 0.12913912534713745, "memory(GiB)": 24.84, "step": 358, "token_acc": 0.9693053311793215, "train_speed(iter/s)": 0.715895 }, { "epoch": 0.2579022988505747, "grad_norm": 1.343458652496338, "learning_rate": 8.866465228363853e-06, "loss": 0.13815584778785706, "memory(GiB)": 24.84, "step": 359, "token_acc": 0.9286871961102107, "train_speed(iter/s)": 0.716564 }, { "epoch": 0.25862068965517243, "grad_norm": 1.3716161251068115, "learning_rate": 8.85892056985352e-06, "loss": 0.1388726681470871, "memory(GiB)": 24.84, "step": 360, "token_acc": 0.930379746835443, "train_speed(iter/s)": 0.717194 }, { "epoch": 0.2593390804597701, "grad_norm": 2.883368730545044, "learning_rate": 8.851354119073234e-06, "loss": 0.1283245086669922, "memory(GiB)": 24.84, "step": 361, "token_acc": 0.9595375722543352, "train_speed(iter/s)": 0.717844 }, { "epoch": 0.2600574712643678, "grad_norm": 1.2361292839050293, "learning_rate": 8.8437659187526e-06, "loss": 0.1457204520702362, "memory(GiB)": 24.84, "step": 362, "token_acc": 0.9493670886075949, "train_speed(iter/s)": 0.718491 }, { "epoch": 0.2607758620689655, "grad_norm": 4.618409156799316, "learning_rate": 8.836156011744046e-06, "loss": 0.12857215106487274, "memory(GiB)": 24.84, "step": 363, "token_acc": 0.9573643410852714, "train_speed(iter/s)": 0.719153 }, { "epoch": 0.2614942528735632, "grad_norm": 1.2343487739562988, "learning_rate": 8.828524441022575e-06, "loss": 0.13959184288978577, "memory(GiB)": 24.84, "step": 364, "token_acc": 0.95, "train_speed(iter/s)": 0.719745 }, { "epoch": 0.26221264367816094, "grad_norm": 1.1168897151947021, "learning_rate": 8.820871249685543e-06, "loss": 0.14101894199848175, "memory(GiB)": 24.84, "step": 365, "token_acc": 0.9481216457960644, "train_speed(iter/s)": 0.720205 }, { "epoch": 0.2629310344827586, "grad_norm": 1.8235266208648682, "learning_rate": 8.813196480952393e-06, "loss": 0.13574296236038208, "memory(GiB)": 24.84, "step": 366, "token_acc": 0.9592592592592593, "train_speed(iter/s)": 0.720841 }, { "epoch": 0.2636494252873563, "grad_norm": 1.5005167722702026, "learning_rate": 8.805500178164426e-06, "loss": 0.1379730999469757, "memory(GiB)": 24.84, "step": 367, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.721491 }, { "epoch": 0.26436781609195403, "grad_norm": 7.426710605621338, "learning_rate": 8.797782384784549e-06, "loss": 0.1413784623146057, "memory(GiB)": 24.84, "step": 368, "token_acc": 0.9470899470899471, "train_speed(iter/s)": 0.722133 }, { "epoch": 0.2650862068965517, "grad_norm": 1.0051600933074951, "learning_rate": 8.790043144397032e-06, "loss": 0.13160303235054016, "memory(GiB)": 24.84, "step": 369, "token_acc": 0.9548133595284872, "train_speed(iter/s)": 0.722743 }, { "epoch": 0.26580459770114945, "grad_norm": 1.5079478025436401, "learning_rate": 8.782282500707262e-06, "loss": 0.1274179220199585, "memory(GiB)": 24.84, "step": 370, "token_acc": 0.9544658493870403, "train_speed(iter/s)": 0.723382 }, { "epoch": 0.2665229885057471, "grad_norm": 1.214949369430542, "learning_rate": 8.774500497541495e-06, "loss": 0.1233191192150116, "memory(GiB)": 24.84, "step": 371, "token_acc": 0.9623931623931624, "train_speed(iter/s)": 0.724021 }, { "epoch": 0.2672413793103448, "grad_norm": 3.4449706077575684, "learning_rate": 8.766697178846611e-06, "loss": 0.13986286520957947, "memory(GiB)": 24.84, "step": 372, "token_acc": 0.9507042253521126, "train_speed(iter/s)": 0.724647 }, { "epoch": 0.26795977011494254, "grad_norm": 3.4131948947906494, "learning_rate": 8.75887258868986e-06, "loss": 0.14523467421531677, "memory(GiB)": 24.84, "step": 373, "token_acc": 0.9506802721088435, "train_speed(iter/s)": 0.725284 }, { "epoch": 0.2686781609195402, "grad_norm": 1.250696063041687, "learning_rate": 8.751026771258622e-06, "loss": 0.13420426845550537, "memory(GiB)": 24.84, "step": 374, "token_acc": 0.956989247311828, "train_speed(iter/s)": 0.725884 }, { "epoch": 0.26939655172413796, "grad_norm": 1.5008834600448608, "learning_rate": 8.743159770860151e-06, "loss": 0.12397860735654831, "memory(GiB)": 24.84, "step": 375, "token_acc": 0.9521276595744681, "train_speed(iter/s)": 0.726464 }, { "epoch": 0.27011494252873564, "grad_norm": 1.7179456949234009, "learning_rate": 8.735271631921322e-06, "loss": 0.12368256598711014, "memory(GiB)": 24.84, "step": 376, "token_acc": 0.9641319942611191, "train_speed(iter/s)": 0.727074 }, { "epoch": 0.2708333333333333, "grad_norm": 1.0828380584716797, "learning_rate": 8.727362398988393e-06, "loss": 0.12909603118896484, "memory(GiB)": 24.84, "step": 377, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.727606 }, { "epoch": 0.27155172413793105, "grad_norm": 1.1656800508499146, "learning_rate": 8.719432116726738e-06, "loss": 0.13731162250041962, "memory(GiB)": 24.84, "step": 378, "token_acc": 0.947841726618705, "train_speed(iter/s)": 0.72821 }, { "epoch": 0.27227011494252873, "grad_norm": 1.1647826433181763, "learning_rate": 8.711480829920603e-06, "loss": 0.13747452199459076, "memory(GiB)": 24.84, "step": 379, "token_acc": 0.9542682926829268, "train_speed(iter/s)": 0.728692 }, { "epoch": 0.27298850574712646, "grad_norm": 1.438542366027832, "learning_rate": 8.703508583472855e-06, "loss": 0.13408952951431274, "memory(GiB)": 24.84, "step": 380, "token_acc": 0.9495967741935484, "train_speed(iter/s)": 0.729137 }, { "epoch": 0.27370689655172414, "grad_norm": 1.0917822122573853, "learning_rate": 8.69551542240472e-06, "loss": 0.14070478081703186, "memory(GiB)": 24.84, "step": 381, "token_acc": 0.9432387312186978, "train_speed(iter/s)": 0.729609 }, { "epoch": 0.2744252873563218, "grad_norm": 0.8537577986717224, "learning_rate": 8.68750139185554e-06, "loss": 0.12624122202396393, "memory(GiB)": 24.84, "step": 382, "token_acc": 0.9562841530054644, "train_speed(iter/s)": 0.730232 }, { "epoch": 0.27514367816091956, "grad_norm": 1.1926109790802002, "learning_rate": 8.679466537082507e-06, "loss": 0.13841408491134644, "memory(GiB)": 24.84, "step": 383, "token_acc": 0.9527972027972028, "train_speed(iter/s)": 0.730852 }, { "epoch": 0.27586206896551724, "grad_norm": 1.0448687076568604, "learning_rate": 8.671410903460416e-06, "loss": 0.1290646493434906, "memory(GiB)": 24.84, "step": 384, "token_acc": 0.9356136820925554, "train_speed(iter/s)": 0.731467 }, { "epoch": 0.2765804597701149, "grad_norm": 1.2338109016418457, "learning_rate": 8.663334536481402e-06, "loss": 0.14118263125419617, "memory(GiB)": 24.84, "step": 385, "token_acc": 0.9426987060998152, "train_speed(iter/s)": 0.732077 }, { "epoch": 0.27729885057471265, "grad_norm": 1.052526831626892, "learning_rate": 8.65523748175469e-06, "loss": 0.12158431112766266, "memory(GiB)": 24.84, "step": 386, "token_acc": 0.9559082892416225, "train_speed(iter/s)": 0.732672 }, { "epoch": 0.27801724137931033, "grad_norm": 1.0157711505889893, "learning_rate": 8.647119785006333e-06, "loss": 0.12292643636465073, "memory(GiB)": 24.84, "step": 387, "token_acc": 0.9449715370018975, "train_speed(iter/s)": 0.733261 }, { "epoch": 0.27873563218390807, "grad_norm": 0.8534294366836548, "learning_rate": 8.63898149207895e-06, "loss": 0.1265827715396881, "memory(GiB)": 24.84, "step": 388, "token_acc": 0.9568106312292359, "train_speed(iter/s)": 0.733857 }, { "epoch": 0.27945402298850575, "grad_norm": 1.395734190940857, "learning_rate": 8.63082264893148e-06, "loss": 0.13306191563606262, "memory(GiB)": 24.84, "step": 389, "token_acc": 0.9467455621301775, "train_speed(iter/s)": 0.734431 }, { "epoch": 0.2801724137931034, "grad_norm": 1.4615795612335205, "learning_rate": 8.622643301638902e-06, "loss": 0.13135965168476105, "memory(GiB)": 24.84, "step": 390, "token_acc": 0.9552469135802469, "train_speed(iter/s)": 0.735021 }, { "epoch": 0.28089080459770116, "grad_norm": 0.8823055624961853, "learning_rate": 8.614443496392e-06, "loss": 0.1303437501192093, "memory(GiB)": 24.84, "step": 391, "token_acc": 0.9581818181818181, "train_speed(iter/s)": 0.735584 }, { "epoch": 0.28160919540229884, "grad_norm": 1.7396676540374756, "learning_rate": 8.606223279497081e-06, "loss": 0.12720996141433716, "memory(GiB)": 24.84, "step": 392, "token_acc": 0.9344262295081968, "train_speed(iter/s)": 0.736007 }, { "epoch": 0.2823275862068966, "grad_norm": 0.8407917618751526, "learning_rate": 8.597982697375726e-06, "loss": 0.1184331476688385, "memory(GiB)": 24.84, "step": 393, "token_acc": 0.9581749049429658, "train_speed(iter/s)": 0.736468 }, { "epoch": 0.28304597701149425, "grad_norm": 0.9657503366470337, "learning_rate": 8.589721796564521e-06, "loss": 0.13019074499607086, "memory(GiB)": 24.84, "step": 394, "token_acc": 0.9630314232902033, "train_speed(iter/s)": 0.736881 }, { "epoch": 0.28376436781609193, "grad_norm": 0.8997913599014282, "learning_rate": 8.581440623714794e-06, "loss": 0.1275487244129181, "memory(GiB)": 24.84, "step": 395, "token_acc": 0.9623352165725048, "train_speed(iter/s)": 0.73741 }, { "epoch": 0.28448275862068967, "grad_norm": 1.6580129861831665, "learning_rate": 8.57313922559236e-06, "loss": 0.13062232732772827, "memory(GiB)": 24.84, "step": 396, "token_acc": 0.9549356223175965, "train_speed(iter/s)": 0.737998 }, { "epoch": 0.28520114942528735, "grad_norm": 1.4013501405715942, "learning_rate": 8.564817649077246e-06, "loss": 0.1358243227005005, "memory(GiB)": 24.84, "step": 397, "token_acc": 0.9475806451612904, "train_speed(iter/s)": 0.738559 }, { "epoch": 0.2859195402298851, "grad_norm": 1.1134705543518066, "learning_rate": 8.556475941163436e-06, "loss": 0.13466456532478333, "memory(GiB)": 24.84, "step": 398, "token_acc": 0.9408284023668639, "train_speed(iter/s)": 0.739146 }, { "epoch": 0.28663793103448276, "grad_norm": 1.2859421968460083, "learning_rate": 8.548114148958596e-06, "loss": 0.12429073452949524, "memory(GiB)": 24.84, "step": 399, "token_acc": 0.9463667820069204, "train_speed(iter/s)": 0.73973 }, { "epoch": 0.28735632183908044, "grad_norm": 0.9572249054908752, "learning_rate": 8.539732319683817e-06, "loss": 0.13029971718788147, "memory(GiB)": 24.84, "step": 400, "token_acc": 0.9544715447154472, "train_speed(iter/s)": 0.740189 }, { "epoch": 0.28735632183908044, "eval_loss": 0.12088574469089508, "eval_runtime": 6.4054, "eval_samples_per_second": 70.253, "eval_steps_per_second": 2.342, "eval_token_acc": 0.9526254995004995, "step": 400 }, { "epoch": 0.2880747126436782, "grad_norm": 1.2613950967788696, "learning_rate": 8.531330500673341e-06, "loss": 0.13224560022354126, "memory(GiB)": 24.84, "step": 401, "token_acc": 0.9545401288810779, "train_speed(iter/s)": 0.714656 }, { "epoch": 0.28879310344827586, "grad_norm": 7.855434894561768, "learning_rate": 8.5229087393743e-06, "loss": 0.13132476806640625, "memory(GiB)": 24.84, "step": 402, "token_acc": 0.9297912713472486, "train_speed(iter/s)": 0.715259 }, { "epoch": 0.28951149425287354, "grad_norm": 1.420209288597107, "learning_rate": 8.514467083346445e-06, "loss": 0.12822335958480835, "memory(GiB)": 24.84, "step": 403, "token_acc": 0.9626865671641791, "train_speed(iter/s)": 0.715858 }, { "epoch": 0.29022988505747127, "grad_norm": 1.0696207284927368, "learning_rate": 8.506005580261872e-06, "loss": 0.1286643147468567, "memory(GiB)": 24.84, "step": 404, "token_acc": 0.9611451942740287, "train_speed(iter/s)": 0.716459 }, { "epoch": 0.29094827586206895, "grad_norm": 2.1789097785949707, "learning_rate": 8.497524277904764e-06, "loss": 0.13200733065605164, "memory(GiB)": 24.84, "step": 405, "token_acc": 0.9579349904397706, "train_speed(iter/s)": 0.717057 }, { "epoch": 0.2916666666666667, "grad_norm": 1.969804048538208, "learning_rate": 8.489023224171114e-06, "loss": 0.1297609508037567, "memory(GiB)": 24.84, "step": 406, "token_acc": 0.9559082892416225, "train_speed(iter/s)": 0.717651 }, { "epoch": 0.29238505747126436, "grad_norm": 1.2139972448349, "learning_rate": 8.480502467068455e-06, "loss": 0.13814041018486023, "memory(GiB)": 24.84, "step": 407, "token_acc": 0.9497816593886463, "train_speed(iter/s)": 0.718235 }, { "epoch": 0.29310344827586204, "grad_norm": 1.3743432760238647, "learning_rate": 8.47196205471559e-06, "loss": 0.13162177801132202, "memory(GiB)": 24.84, "step": 408, "token_acc": 0.95, "train_speed(iter/s)": 0.718759 }, { "epoch": 0.2938218390804598, "grad_norm": 1.9120649099349976, "learning_rate": 8.463402035342319e-06, "loss": 0.12039199471473694, "memory(GiB)": 24.84, "step": 409, "token_acc": 0.94106463878327, "train_speed(iter/s)": 0.719211 }, { "epoch": 0.29454022988505746, "grad_norm": 1.3268839120864868, "learning_rate": 8.45482245728917e-06, "loss": 0.12348617613315582, "memory(GiB)": 24.84, "step": 410, "token_acc": 0.946515397082658, "train_speed(iter/s)": 0.719669 }, { "epoch": 0.2952586206896552, "grad_norm": 1.4535937309265137, "learning_rate": 8.446223369007122e-06, "loss": 0.12629663944244385, "memory(GiB)": 24.84, "step": 411, "token_acc": 0.9453441295546559, "train_speed(iter/s)": 0.720056 }, { "epoch": 0.2959770114942529, "grad_norm": 1.4548380374908447, "learning_rate": 8.437604819057336e-06, "loss": 0.12674254179000854, "memory(GiB)": 24.84, "step": 412, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.720632 }, { "epoch": 0.29669540229885055, "grad_norm": 0.879306972026825, "learning_rate": 8.42896685611087e-06, "loss": 0.12116341292858124, "memory(GiB)": 24.84, "step": 413, "token_acc": 0.9556650246305419, "train_speed(iter/s)": 0.721213 }, { "epoch": 0.2974137931034483, "grad_norm": 1.3626656532287598, "learning_rate": 8.420309528948422e-06, "loss": 0.1393766850233078, "memory(GiB)": 24.84, "step": 414, "token_acc": 0.9502982107355865, "train_speed(iter/s)": 0.721513 }, { "epoch": 0.29813218390804597, "grad_norm": 1.4237494468688965, "learning_rate": 8.411632886460036e-06, "loss": 0.11848053336143494, "memory(GiB)": 24.84, "step": 415, "token_acc": 0.9538461538461539, "train_speed(iter/s)": 0.721718 }, { "epoch": 0.2988505747126437, "grad_norm": 0.781672477722168, "learning_rate": 8.40293697764484e-06, "loss": 0.11938994377851486, "memory(GiB)": 24.84, "step": 416, "token_acc": 0.9474671669793621, "train_speed(iter/s)": 0.722191 }, { "epoch": 0.2995689655172414, "grad_norm": 1.3910363912582397, "learning_rate": 8.394221851610761e-06, "loss": 0.12929438054561615, "memory(GiB)": 24.84, "step": 417, "token_acc": 0.9577205882352942, "train_speed(iter/s)": 0.722749 }, { "epoch": 0.30028735632183906, "grad_norm": 1.5534672737121582, "learning_rate": 8.385487557574253e-06, "loss": 0.13545769453048706, "memory(GiB)": 24.84, "step": 418, "token_acc": 0.9433962264150944, "train_speed(iter/s)": 0.72332 }, { "epoch": 0.3010057471264368, "grad_norm": 0.9145299792289734, "learning_rate": 8.37673414486001e-06, "loss": 0.12323573231697083, "memory(GiB)": 24.84, "step": 419, "token_acc": 0.9472759226713533, "train_speed(iter/s)": 0.723882 }, { "epoch": 0.3017241379310345, "grad_norm": 1.9065254926681519, "learning_rate": 8.367961662900704e-06, "loss": 0.11988499015569687, "memory(GiB)": 24.84, "step": 420, "token_acc": 0.963302752293578, "train_speed(iter/s)": 0.724451 }, { "epoch": 0.3024425287356322, "grad_norm": 1.4019211530685425, "learning_rate": 8.359170161236686e-06, "loss": 0.13407179713249207, "memory(GiB)": 24.84, "step": 421, "token_acc": 0.9555984555984556, "train_speed(iter/s)": 0.725012 }, { "epoch": 0.3031609195402299, "grad_norm": 1.6576014757156372, "learning_rate": 8.35035968951572e-06, "loss": 0.12769007682800293, "memory(GiB)": 24.84, "step": 422, "token_acc": 0.959758551307847, "train_speed(iter/s)": 0.725577 }, { "epoch": 0.30387931034482757, "grad_norm": 1.0823659896850586, "learning_rate": 8.341530297492703e-06, "loss": 0.1196344792842865, "memory(GiB)": 24.84, "step": 423, "token_acc": 0.933456561922366, "train_speed(iter/s)": 0.726142 }, { "epoch": 0.3045977011494253, "grad_norm": 1.3176158666610718, "learning_rate": 8.33268203502937e-06, "loss": 0.124687060713768, "memory(GiB)": 24.84, "step": 424, "token_acc": 0.9619565217391305, "train_speed(iter/s)": 0.726705 }, { "epoch": 0.305316091954023, "grad_norm": 1.2056509256362915, "learning_rate": 8.323814952094028e-06, "loss": 0.11561831086874008, "memory(GiB)": 24.84, "step": 425, "token_acc": 0.9559748427672956, "train_speed(iter/s)": 0.727268 }, { "epoch": 0.30603448275862066, "grad_norm": 1.11044442653656, "learning_rate": 8.314929098761268e-06, "loss": 0.1227533370256424, "memory(GiB)": 24.84, "step": 426, "token_acc": 0.9491833030852994, "train_speed(iter/s)": 0.727826 }, { "epoch": 0.3067528735632184, "grad_norm": 1.3345277309417725, "learning_rate": 8.306024525211682e-06, "loss": 0.11534899473190308, "memory(GiB)": 24.84, "step": 427, "token_acc": 0.9538152610441767, "train_speed(iter/s)": 0.728383 }, { "epoch": 0.3074712643678161, "grad_norm": 1.4172611236572266, "learning_rate": 8.297101281731576e-06, "loss": 0.11006147414445877, "memory(GiB)": 24.84, "step": 428, "token_acc": 0.9604989604989606, "train_speed(iter/s)": 0.72894 }, { "epoch": 0.3081896551724138, "grad_norm": 1.4628186225891113, "learning_rate": 8.288159418712693e-06, "loss": 0.12456518411636353, "memory(GiB)": 24.84, "step": 429, "token_acc": 0.9560878243512974, "train_speed(iter/s)": 0.729496 }, { "epoch": 0.3089080459770115, "grad_norm": 1.8314918279647827, "learning_rate": 8.279198986651925e-06, "loss": 0.13092833757400513, "memory(GiB)": 24.84, "step": 430, "token_acc": 0.9439579684763573, "train_speed(iter/s)": 0.730048 }, { "epoch": 0.30962643678160917, "grad_norm": 1.2581682205200195, "learning_rate": 8.270220036151028e-06, "loss": 0.11373047530651093, "memory(GiB)": 24.84, "step": 431, "token_acc": 0.9459459459459459, "train_speed(iter/s)": 0.73054 }, { "epoch": 0.3103448275862069, "grad_norm": 1.175366759300232, "learning_rate": 8.261222617916335e-06, "loss": 0.12694357335567474, "memory(GiB)": 24.84, "step": 432, "token_acc": 0.9505703422053232, "train_speed(iter/s)": 0.730949 }, { "epoch": 0.3110632183908046, "grad_norm": 1.7992173433303833, "learning_rate": 8.25220678275847e-06, "loss": 0.13344240188598633, "memory(GiB)": 24.84, "step": 433, "token_acc": 0.9420505200594353, "train_speed(iter/s)": 0.731349 }, { "epoch": 0.3117816091954023, "grad_norm": 1.4579970836639404, "learning_rate": 8.243172581592066e-06, "loss": 0.1397123634815216, "memory(GiB)": 24.84, "step": 434, "token_acc": 0.9555555555555556, "train_speed(iter/s)": 0.731812 }, { "epoch": 0.3125, "grad_norm": 0.9540843963623047, "learning_rate": 8.234120065435466e-06, "loss": 0.123537577688694, "memory(GiB)": 24.84, "step": 435, "token_acc": 0.9417637271214643, "train_speed(iter/s)": 0.732346 }, { "epoch": 0.3132183908045977, "grad_norm": 1.1826317310333252, "learning_rate": 8.22504928541045e-06, "loss": 0.1237822026014328, "memory(GiB)": 24.84, "step": 436, "token_acc": 0.9609236234458259, "train_speed(iter/s)": 0.73289 }, { "epoch": 0.3139367816091954, "grad_norm": 1.1137676239013672, "learning_rate": 8.215960292741933e-06, "loss": 0.12027280777692795, "memory(GiB)": 24.84, "step": 437, "token_acc": 0.95, "train_speed(iter/s)": 0.733431 }, { "epoch": 0.3146551724137931, "grad_norm": 0.808922290802002, "learning_rate": 8.206853138757687e-06, "loss": 0.1091887354850769, "memory(GiB)": 24.84, "step": 438, "token_acc": 0.967479674796748, "train_speed(iter/s)": 0.733972 }, { "epoch": 0.31537356321839083, "grad_norm": 2.142683506011963, "learning_rate": 8.19772787488804e-06, "loss": 0.12552979588508606, "memory(GiB)": 24.84, "step": 439, "token_acc": 0.9605734767025089, "train_speed(iter/s)": 0.734507 }, { "epoch": 0.3160919540229885, "grad_norm": 0.9239856600761414, "learning_rate": 8.188584552665592e-06, "loss": 0.11206837743520737, "memory(GiB)": 24.84, "step": 440, "token_acc": 0.9518796992481203, "train_speed(iter/s)": 0.735038 }, { "epoch": 0.3168103448275862, "grad_norm": 0.9072840809822083, "learning_rate": 8.179423223724926e-06, "loss": 0.12953786551952362, "memory(GiB)": 24.84, "step": 441, "token_acc": 0.9407114624505929, "train_speed(iter/s)": 0.735569 }, { "epoch": 0.3175287356321839, "grad_norm": 1.2857810258865356, "learning_rate": 8.17024393980231e-06, "loss": 0.12499482929706573, "memory(GiB)": 24.84, "step": 442, "token_acc": 0.9560229445506692, "train_speed(iter/s)": 0.736104 }, { "epoch": 0.3182471264367816, "grad_norm": 1.0931156873703003, "learning_rate": 8.161046752735408e-06, "loss": 0.12824667990207672, "memory(GiB)": 24.84, "step": 443, "token_acc": 0.9454191033138402, "train_speed(iter/s)": 0.736633 }, { "epoch": 0.31896551724137934, "grad_norm": 1.3693287372589111, "learning_rate": 8.15183171446299e-06, "loss": 0.11433771252632141, "memory(GiB)": 24.84, "step": 444, "token_acc": 0.9428571428571428, "train_speed(iter/s)": 0.737164 }, { "epoch": 0.319683908045977, "grad_norm": 0.84522545337677, "learning_rate": 8.142598877024637e-06, "loss": 0.12468549609184265, "memory(GiB)": 24.84, "step": 445, "token_acc": 0.9584086799276673, "train_speed(iter/s)": 0.737657 }, { "epoch": 0.3204022988505747, "grad_norm": 1.6560345888137817, "learning_rate": 8.133348292560442e-06, "loss": 0.12423508614301682, "memory(GiB)": 24.84, "step": 446, "token_acc": 0.9536679536679536, "train_speed(iter/s)": 0.738029 }, { "epoch": 0.32112068965517243, "grad_norm": 1.1307737827301025, "learning_rate": 8.12408001331072e-06, "loss": 0.11594332009553909, "memory(GiB)": 24.84, "step": 447, "token_acc": 0.9567901234567902, "train_speed(iter/s)": 0.738422 }, { "epoch": 0.3218390804597701, "grad_norm": 0.8652155995368958, "learning_rate": 8.114794091615718e-06, "loss": 0.11533249914646149, "memory(GiB)": 24.84, "step": 448, "token_acc": 0.9618320610687023, "train_speed(iter/s)": 0.738776 }, { "epoch": 0.3225574712643678, "grad_norm": 0.8272719979286194, "learning_rate": 8.10549057991531e-06, "loss": 0.10936818271875381, "memory(GiB)": 24.84, "step": 449, "token_acc": 0.9625984251968503, "train_speed(iter/s)": 0.739292 }, { "epoch": 0.3232758620689655, "grad_norm": 0.9610937833786011, "learning_rate": 8.096169530748708e-06, "loss": 0.11316747963428497, "memory(GiB)": 24.84, "step": 450, "token_acc": 0.9566724436741768, "train_speed(iter/s)": 0.73979 }, { "epoch": 0.3232758620689655, "eval_loss": 0.11268799751996994, "eval_runtime": 5.9925, "eval_samples_per_second": 75.094, "eval_steps_per_second": 2.503, "eval_token_acc": 0.9559815184815185, "step": 450 }, { "epoch": 0.3239942528735632, "grad_norm": 1.001347541809082, "learning_rate": 8.086830996754156e-06, "loss": 0.10513995587825775, "memory(GiB)": 24.84, "step": 451, "token_acc": 0.9599437807449052, "train_speed(iter/s)": 0.71743 }, { "epoch": 0.32471264367816094, "grad_norm": 1.046526551246643, "learning_rate": 8.077475030668647e-06, "loss": 0.1085362508893013, "memory(GiB)": 24.84, "step": 452, "token_acc": 0.9617590822179732, "train_speed(iter/s)": 0.717959 }, { "epoch": 0.3254310344827586, "grad_norm": 1.0407040119171143, "learning_rate": 8.068101685327615e-06, "loss": 0.10881409049034119, "memory(GiB)": 24.84, "step": 453, "token_acc": 0.9716312056737588, "train_speed(iter/s)": 0.718483 }, { "epoch": 0.3261494252873563, "grad_norm": 1.1814591884613037, "learning_rate": 8.058711013664633e-06, "loss": 0.11983378231525421, "memory(GiB)": 24.84, "step": 454, "token_acc": 0.9508928571428571, "train_speed(iter/s)": 0.719011 }, { "epoch": 0.32686781609195403, "grad_norm": 1.0421934127807617, "learning_rate": 8.049303068711127e-06, "loss": 0.10917912423610687, "memory(GiB)": 24.84, "step": 455, "token_acc": 0.9677966101694915, "train_speed(iter/s)": 0.719539 }, { "epoch": 0.3275862068965517, "grad_norm": 1.2311826944351196, "learning_rate": 8.039877903596069e-06, "loss": 0.11370255798101425, "memory(GiB)": 24.84, "step": 456, "token_acc": 0.9684418145956607, "train_speed(iter/s)": 0.72006 }, { "epoch": 0.32830459770114945, "grad_norm": 1.287265419960022, "learning_rate": 8.030435571545675e-06, "loss": 0.10762807726860046, "memory(GiB)": 24.84, "step": 457, "token_acc": 0.9619047619047619, "train_speed(iter/s)": 0.720585 }, { "epoch": 0.3290229885057471, "grad_norm": 0.8744949698448181, "learning_rate": 8.020976125883105e-06, "loss": 0.11789572238922119, "memory(GiB)": 24.84, "step": 458, "token_acc": 0.9698681732580038, "train_speed(iter/s)": 0.721111 }, { "epoch": 0.3297413793103448, "grad_norm": 1.1591928005218506, "learning_rate": 8.01149962002817e-06, "loss": 0.11805061995983124, "memory(GiB)": 24.84, "step": 459, "token_acc": 0.9520833333333333, "train_speed(iter/s)": 0.721632 }, { "epoch": 0.33045977011494254, "grad_norm": 0.978761613368988, "learning_rate": 8.002006107497018e-06, "loss": 0.10133609175682068, "memory(GiB)": 24.84, "step": 460, "token_acc": 0.9664310954063604, "train_speed(iter/s)": 0.722156 }, { "epoch": 0.3311781609195402, "grad_norm": 1.2160905599594116, "learning_rate": 7.992495641901842e-06, "loss": 0.12343230098485947, "memory(GiB)": 24.84, "step": 461, "token_acc": 0.9532062391681109, "train_speed(iter/s)": 0.722656 }, { "epoch": 0.33189655172413796, "grad_norm": 1.046201229095459, "learning_rate": 7.982968276950568e-06, "loss": 0.11839239299297333, "memory(GiB)": 24.84, "step": 462, "token_acc": 0.958029197080292, "train_speed(iter/s)": 0.723149 }, { "epoch": 0.33261494252873564, "grad_norm": 1.1655502319335938, "learning_rate": 7.973424066446566e-06, "loss": 0.12283087521791458, "memory(GiB)": 24.84, "step": 463, "token_acc": 0.9292929292929293, "train_speed(iter/s)": 0.72353 }, { "epoch": 0.3333333333333333, "grad_norm": 0.9541414976119995, "learning_rate": 7.963863064288326e-06, "loss": 0.11290963739156723, "memory(GiB)": 24.84, "step": 464, "token_acc": 0.9639065817409767, "train_speed(iter/s)": 0.723928 }, { "epoch": 0.33405172413793105, "grad_norm": 0.98385089635849, "learning_rate": 7.954285324469172e-06, "loss": 0.11938171088695526, "memory(GiB)": 24.84, "step": 465, "token_acc": 0.966044142614601, "train_speed(iter/s)": 0.724319 }, { "epoch": 0.33477011494252873, "grad_norm": 1.1710288524627686, "learning_rate": 7.944690901076949e-06, "loss": 0.12381379306316376, "memory(GiB)": 24.84, "step": 466, "token_acc": 0.9422222222222222, "train_speed(iter/s)": 0.724756 }, { "epoch": 0.33548850574712646, "grad_norm": 1.1105443239212036, "learning_rate": 7.935079848293712e-06, "loss": 0.10833810269832611, "memory(GiB)": 24.84, "step": 467, "token_acc": 0.9729241877256317, "train_speed(iter/s)": 0.72513 }, { "epoch": 0.33620689655172414, "grad_norm": 1.1232632398605347, "learning_rate": 7.925452220395436e-06, "loss": 0.11596910655498505, "memory(GiB)": 24.84, "step": 468, "token_acc": 0.9440715883668904, "train_speed(iter/s)": 0.725513 }, { "epoch": 0.3369252873563218, "grad_norm": 1.4992798566818237, "learning_rate": 7.915808071751692e-06, "loss": 0.12260337173938751, "memory(GiB)": 24.84, "step": 469, "token_acc": 0.9487179487179487, "train_speed(iter/s)": 0.726019 }, { "epoch": 0.33764367816091956, "grad_norm": 0.8139585256576538, "learning_rate": 7.906147456825349e-06, "loss": 0.11935368925333023, "memory(GiB)": 24.84, "step": 470, "token_acc": 0.9611451942740287, "train_speed(iter/s)": 0.726522 }, { "epoch": 0.33836206896551724, "grad_norm": 1.0991419553756714, "learning_rate": 7.89647043017227e-06, "loss": 0.12071461975574493, "memory(GiB)": 24.84, "step": 471, "token_acc": 0.951171875, "train_speed(iter/s)": 0.727022 }, { "epoch": 0.3390804597701149, "grad_norm": 0.7816020846366882, "learning_rate": 7.886777046440993e-06, "loss": 0.11949585378170013, "memory(GiB)": 24.84, "step": 472, "token_acc": 0.9515503875968992, "train_speed(iter/s)": 0.727527 }, { "epoch": 0.33979885057471265, "grad_norm": 1.219448208808899, "learning_rate": 7.877067360372432e-06, "loss": 0.10941040515899658, "memory(GiB)": 24.84, "step": 473, "token_acc": 0.9529837251356239, "train_speed(iter/s)": 0.72803 }, { "epoch": 0.34051724137931033, "grad_norm": 1.106952428817749, "learning_rate": 7.867341426799562e-06, "loss": 0.10911481082439423, "memory(GiB)": 24.84, "step": 474, "token_acc": 0.9436008676789588, "train_speed(iter/s)": 0.728535 }, { "epoch": 0.34123563218390807, "grad_norm": 0.9474596381187439, "learning_rate": 7.857599300647114e-06, "loss": 0.10300856828689575, "memory(GiB)": 24.84, "step": 475, "token_acc": 0.9598603839441536, "train_speed(iter/s)": 0.729034 }, { "epoch": 0.34195402298850575, "grad_norm": 0.9893588423728943, "learning_rate": 7.847841036931263e-06, "loss": 0.12188372015953064, "memory(GiB)": 24.84, "step": 476, "token_acc": 0.9423791821561338, "train_speed(iter/s)": 0.729536 }, { "epoch": 0.3426724137931034, "grad_norm": 1.3593332767486572, "learning_rate": 7.838066690759311e-06, "loss": 0.11190907657146454, "memory(GiB)": 24.84, "step": 477, "token_acc": 0.945179584120983, "train_speed(iter/s)": 0.730035 }, { "epoch": 0.34339080459770116, "grad_norm": 1.2279316186904907, "learning_rate": 7.828276317329388e-06, "loss": 0.1265583038330078, "memory(GiB)": 24.84, "step": 478, "token_acc": 0.9607476635514018, "train_speed(iter/s)": 0.730529 }, { "epoch": 0.34410919540229884, "grad_norm": 1.0916123390197754, "learning_rate": 7.818469971930134e-06, "loss": 0.1107066199183464, "memory(GiB)": 24.84, "step": 479, "token_acc": 0.9549393414211439, "train_speed(iter/s)": 0.731018 }, { "epoch": 0.3448275862068966, "grad_norm": 0.8232234120368958, "learning_rate": 7.80864770994038e-06, "loss": 0.10538312792778015, "memory(GiB)": 24.84, "step": 480, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.7315 }, { "epoch": 0.34554597701149425, "grad_norm": 1.018763780593872, "learning_rate": 7.798809586828848e-06, "loss": 0.11191593110561371, "memory(GiB)": 24.84, "step": 481, "token_acc": 0.9629629629629629, "train_speed(iter/s)": 0.731989 }, { "epoch": 0.34626436781609193, "grad_norm": 1.0069042444229126, "learning_rate": 7.788955658153829e-06, "loss": 0.11850462853908539, "memory(GiB)": 24.84, "step": 482, "token_acc": 0.9489194499017681, "train_speed(iter/s)": 0.732478 }, { "epoch": 0.34698275862068967, "grad_norm": 1.1425495147705078, "learning_rate": 7.779085979562874e-06, "loss": 0.12226521968841553, "memory(GiB)": 24.84, "step": 483, "token_acc": 0.9512195121951219, "train_speed(iter/s)": 0.732957 }, { "epoch": 0.34770114942528735, "grad_norm": 0.8342289924621582, "learning_rate": 7.769200606792476e-06, "loss": 0.09935799241065979, "memory(GiB)": 24.84, "step": 484, "token_acc": 0.9515789473684211, "train_speed(iter/s)": 0.733431 }, { "epoch": 0.3484195402298851, "grad_norm": 0.9965822100639343, "learning_rate": 7.759299595667752e-06, "loss": 0.11027398705482483, "memory(GiB)": 24.84, "step": 485, "token_acc": 0.9683098591549296, "train_speed(iter/s)": 0.733862 }, { "epoch": 0.34913793103448276, "grad_norm": 1.1083300113677979, "learning_rate": 7.749383002102147e-06, "loss": 0.10343383997678757, "memory(GiB)": 24.84, "step": 486, "token_acc": 0.9665492957746479, "train_speed(iter/s)": 0.734209 }, { "epoch": 0.34985632183908044, "grad_norm": 1.4335342645645142, "learning_rate": 7.739450882097088e-06, "loss": 0.11565598100423813, "memory(GiB)": 24.84, "step": 487, "token_acc": 0.947265625, "train_speed(iter/s)": 0.734525 }, { "epoch": 0.3505747126436782, "grad_norm": 0.8502451777458191, "learning_rate": 7.72950329174169e-06, "loss": 0.1068037748336792, "memory(GiB)": 24.84, "step": 488, "token_acc": 0.962671905697446, "train_speed(iter/s)": 0.734913 }, { "epoch": 0.35129310344827586, "grad_norm": 2.1923625469207764, "learning_rate": 7.719540287212439e-06, "loss": 0.11190034449100494, "memory(GiB)": 24.84, "step": 489, "token_acc": 0.96900826446281, "train_speed(iter/s)": 0.735396 }, { "epoch": 0.35201149425287354, "grad_norm": 1.6235222816467285, "learning_rate": 7.709561924772855e-06, "loss": 0.11709795147180557, "memory(GiB)": 24.84, "step": 490, "token_acc": 0.96484375, "train_speed(iter/s)": 0.735875 }, { "epoch": 0.35272988505747127, "grad_norm": 1.1556147336959839, "learning_rate": 7.6995682607732e-06, "loss": 0.11861775815486908, "memory(GiB)": 24.84, "step": 491, "token_acc": 0.9528301886792453, "train_speed(iter/s)": 0.736341 }, { "epoch": 0.35344827586206895, "grad_norm": 1.3356729745864868, "learning_rate": 7.689559351650142e-06, "loss": 0.11623478680849075, "memory(GiB)": 24.84, "step": 492, "token_acc": 0.9508506616257089, "train_speed(iter/s)": 0.736817 }, { "epoch": 0.3541666666666667, "grad_norm": 1.3273471593856812, "learning_rate": 7.679535253926445e-06, "loss": 0.11295489221811295, "memory(GiB)": 24.84, "step": 493, "token_acc": 0.9526515151515151, "train_speed(iter/s)": 0.737292 }, { "epoch": 0.35488505747126436, "grad_norm": 1.0412935018539429, "learning_rate": 7.66949602421064e-06, "loss": 0.12274104356765747, "memory(GiB)": 24.84, "step": 494, "token_acc": 0.9581056466302368, "train_speed(iter/s)": 0.737759 }, { "epoch": 0.35560344827586204, "grad_norm": 1.3617368936538696, "learning_rate": 7.659441719196724e-06, "loss": 0.11050333082675934, "memory(GiB)": 24.84, "step": 495, "token_acc": 0.9505494505494505, "train_speed(iter/s)": 0.73823 }, { "epoch": 0.3563218390804598, "grad_norm": 1.4376667737960815, "learning_rate": 7.649372395663816e-06, "loss": 0.10986854135990143, "memory(GiB)": 24.84, "step": 496, "token_acc": 0.967391304347826, "train_speed(iter/s)": 0.738698 }, { "epoch": 0.35704022988505746, "grad_norm": 0.988973081111908, "learning_rate": 7.639288110475855e-06, "loss": 0.10603369772434235, "memory(GiB)": 24.84, "step": 497, "token_acc": 0.952, "train_speed(iter/s)": 0.739169 }, { "epoch": 0.3577586206896552, "grad_norm": 1.1493233442306519, "learning_rate": 7.629188920581267e-06, "loss": 0.10952651500701904, "memory(GiB)": 24.84, "step": 498, "token_acc": 0.9571428571428572, "train_speed(iter/s)": 0.739636 }, { "epoch": 0.3584770114942529, "grad_norm": 1.5167906284332275, "learning_rate": 7.619074883012656e-06, "loss": 0.11344001442193985, "memory(GiB)": 24.84, "step": 499, "token_acc": 0.9624060150375939, "train_speed(iter/s)": 0.739895 }, { "epoch": 0.35919540229885055, "grad_norm": 1.1103161573410034, "learning_rate": 7.608946054886468e-06, "loss": 0.11735260486602783, "memory(GiB)": 24.84, "step": 500, "token_acc": 0.9580838323353293, "train_speed(iter/s)": 0.740249 }, { "epoch": 0.35919540229885055, "eval_loss": 0.10537128895521164, "eval_runtime": 6.0582, "eval_samples_per_second": 74.28, "eval_steps_per_second": 2.476, "eval_token_acc": 0.959119005994006, "step": 500 }, { "epoch": 0.3599137931034483, "grad_norm": 1.4167964458465576, "learning_rate": 7.598802493402678e-06, "loss": 0.11653858423233032, "memory(GiB)": 24.84, "step": 501, "token_acc": 0.9606503298774741, "train_speed(iter/s)": 0.720031 }, { "epoch": 0.36063218390804597, "grad_norm": 1.4989160299301147, "learning_rate": 7.588644255844464e-06, "loss": 0.11281104385852814, "memory(GiB)": 24.84, "step": 502, "token_acc": 0.9664310954063604, "train_speed(iter/s)": 0.720446 }, { "epoch": 0.3613505747126437, "grad_norm": 1.948373556137085, "learning_rate": 7.578471399577879e-06, "loss": 0.10534854233264923, "memory(GiB)": 24.84, "step": 503, "token_acc": 0.9626373626373627, "train_speed(iter/s)": 0.720781 }, { "epoch": 0.3620689655172414, "grad_norm": 2.1170334815979004, "learning_rate": 7.568283982051538e-06, "loss": 0.10974562913179398, "memory(GiB)": 24.84, "step": 504, "token_acc": 0.964349376114082, "train_speed(iter/s)": 0.721119 }, { "epoch": 0.36278735632183906, "grad_norm": 1.8701902627944946, "learning_rate": 7.558082060796283e-06, "loss": 0.1069445013999939, "memory(GiB)": 24.84, "step": 505, "token_acc": 0.9538745387453874, "train_speed(iter/s)": 0.72154 }, { "epoch": 0.3635057471264368, "grad_norm": 0.9538026452064514, "learning_rate": 7.5478656934248626e-06, "loss": 0.10211368650197983, "memory(GiB)": 24.84, "step": 506, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.722014 }, { "epoch": 0.3642241379310345, "grad_norm": 1.016895055770874, "learning_rate": 7.537634937631606e-06, "loss": 0.10614500939846039, "memory(GiB)": 24.84, "step": 507, "token_acc": 0.9488117001828154, "train_speed(iter/s)": 0.722482 }, { "epoch": 0.3649425287356322, "grad_norm": 1.0671054124832153, "learning_rate": 7.527389851192099e-06, "loss": 0.11848550289869308, "memory(GiB)": 24.84, "step": 508, "token_acc": 0.9475890985324947, "train_speed(iter/s)": 0.722951 }, { "epoch": 0.3656609195402299, "grad_norm": 1.0452548265457153, "learning_rate": 7.517130491962854e-06, "loss": 0.11080736666917801, "memory(GiB)": 24.84, "step": 509, "token_acc": 0.9593345656192237, "train_speed(iter/s)": 0.723413 }, { "epoch": 0.36637931034482757, "grad_norm": 0.7399190664291382, "learning_rate": 7.506856917880989e-06, "loss": 0.11159782111644745, "memory(GiB)": 24.84, "step": 510, "token_acc": 0.956442831215971, "train_speed(iter/s)": 0.723874 }, { "epoch": 0.3670977011494253, "grad_norm": 0.9241609573364258, "learning_rate": 7.496569186963889e-06, "loss": 0.10804389417171478, "memory(GiB)": 24.84, "step": 511, "token_acc": 0.9644194756554307, "train_speed(iter/s)": 0.724341 }, { "epoch": 0.367816091954023, "grad_norm": 0.9984833002090454, "learning_rate": 7.486267357308896e-06, "loss": 0.10174515843391418, "memory(GiB)": 24.84, "step": 512, "token_acc": 0.9686520376175548, "train_speed(iter/s)": 0.724798 }, { "epoch": 0.36853448275862066, "grad_norm": 1.0489730834960938, "learning_rate": 7.475951487092962e-06, "loss": 0.10034982860088348, "memory(GiB)": 24.84, "step": 513, "token_acc": 0.9730215827338129, "train_speed(iter/s)": 0.725246 }, { "epoch": 0.3692528735632184, "grad_norm": 1.4646326303482056, "learning_rate": 7.465621634572336e-06, "loss": 0.11048506200313568, "memory(GiB)": 24.84, "step": 514, "token_acc": 0.951310861423221, "train_speed(iter/s)": 0.725703 }, { "epoch": 0.3699712643678161, "grad_norm": 1.743573546409607, "learning_rate": 7.455277858082227e-06, "loss": 0.11295029520988464, "memory(GiB)": 24.84, "step": 515, "token_acc": 0.9628099173553719, "train_speed(iter/s)": 0.726165 }, { "epoch": 0.3706896551724138, "grad_norm": 1.2521836757659912, "learning_rate": 7.444920216036473e-06, "loss": 0.11555603891611099, "memory(GiB)": 24.84, "step": 516, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.726619 }, { "epoch": 0.3714080459770115, "grad_norm": 1.2082592248916626, "learning_rate": 7.434548766927219e-06, "loss": 0.10131315886974335, "memory(GiB)": 24.84, "step": 517, "token_acc": 0.9724612736660929, "train_speed(iter/s)": 0.727073 }, { "epoch": 0.37212643678160917, "grad_norm": 1.573310136795044, "learning_rate": 7.4241635693245766e-06, "loss": 0.11855731904506683, "memory(GiB)": 24.84, "step": 518, "token_acc": 0.9476861167002012, "train_speed(iter/s)": 0.727476 }, { "epoch": 0.3728448275862069, "grad_norm": 1.3655012845993042, "learning_rate": 7.413764681876302e-06, "loss": 0.1125379204750061, "memory(GiB)": 24.84, "step": 519, "token_acc": 0.9359267734553776, "train_speed(iter/s)": 0.727807 }, { "epoch": 0.3735632183908046, "grad_norm": 1.0757932662963867, "learning_rate": 7.40335216330746e-06, "loss": 0.103436179459095, "memory(GiB)": 24.84, "step": 520, "token_acc": 0.9453551912568307, "train_speed(iter/s)": 0.728095 }, { "epoch": 0.3742816091954023, "grad_norm": 1.2344474792480469, "learning_rate": 7.392926072420097e-06, "loss": 0.12005594372749329, "memory(GiB)": 24.84, "step": 521, "token_acc": 0.9609665427509294, "train_speed(iter/s)": 0.72842 }, { "epoch": 0.375, "grad_norm": 1.1030434370040894, "learning_rate": 7.382486468092899e-06, "loss": 0.10706955194473267, "memory(GiB)": 24.84, "step": 522, "token_acc": 0.9446564885496184, "train_speed(iter/s)": 0.728875 }, { "epoch": 0.3757183908045977, "grad_norm": 2.1924386024475098, "learning_rate": 7.372033409280872e-06, "loss": 0.10846179723739624, "memory(GiB)": 24.84, "step": 523, "token_acc": 0.9695817490494296, "train_speed(iter/s)": 0.729329 }, { "epoch": 0.3764367816091954, "grad_norm": 1.2033382654190063, "learning_rate": 7.361566955014999e-06, "loss": 0.11224424093961716, "memory(GiB)": 24.84, "step": 524, "token_acc": 0.966, "train_speed(iter/s)": 0.729781 }, { "epoch": 0.3771551724137931, "grad_norm": 1.7320388555526733, "learning_rate": 7.351087164401914e-06, "loss": 0.10968895256519318, "memory(GiB)": 24.84, "step": 525, "token_acc": 0.948019801980198, "train_speed(iter/s)": 0.730233 }, { "epoch": 0.37787356321839083, "grad_norm": 0.8668650388717651, "learning_rate": 7.340594096623559e-06, "loss": 0.10040594637393951, "memory(GiB)": 24.84, "step": 526, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.730681 }, { "epoch": 0.3785919540229885, "grad_norm": 1.5609307289123535, "learning_rate": 7.330087810936861e-06, "loss": 0.10753326863050461, "memory(GiB)": 24.84, "step": 527, "token_acc": 0.9540229885057471, "train_speed(iter/s)": 0.73113 }, { "epoch": 0.3793103448275862, "grad_norm": 1.3372551202774048, "learning_rate": 7.319568366673389e-06, "loss": 0.10418304800987244, "memory(GiB)": 24.84, "step": 528, "token_acc": 0.9630390143737166, "train_speed(iter/s)": 0.73157 }, { "epoch": 0.3800287356321839, "grad_norm": 1.0397417545318604, "learning_rate": 7.30903582323902e-06, "loss": 0.1025521457195282, "memory(GiB)": 24.84, "step": 529, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.732019 }, { "epoch": 0.3807471264367816, "grad_norm": 1.0986452102661133, "learning_rate": 7.2984902401136115e-06, "loss": 0.10805949568748474, "memory(GiB)": 24.84, "step": 530, "token_acc": 0.9409190371991247, "train_speed(iter/s)": 0.732461 }, { "epoch": 0.38146551724137934, "grad_norm": 1.1119999885559082, "learning_rate": 7.287931676850652e-06, "loss": 0.10854840278625488, "memory(GiB)": 24.84, "step": 531, "token_acc": 0.9511201629327902, "train_speed(iter/s)": 0.732907 }, { "epoch": 0.382183908045977, "grad_norm": 0.9873979687690735, "learning_rate": 7.277360193076936e-06, "loss": 0.1023559421300888, "memory(GiB)": 24.84, "step": 532, "token_acc": 0.9664570230607966, "train_speed(iter/s)": 0.733346 }, { "epoch": 0.3829022988505747, "grad_norm": 0.9861406683921814, "learning_rate": 7.266775848492223e-06, "loss": 0.10380479693412781, "memory(GiB)": 24.84, "step": 533, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.733791 }, { "epoch": 0.38362068965517243, "grad_norm": 0.9794347286224365, "learning_rate": 7.256178702868899e-06, "loss": 0.10223473608493805, "memory(GiB)": 24.84, "step": 534, "token_acc": 0.9597197898423818, "train_speed(iter/s)": 0.734213 }, { "epoch": 0.3843390804597701, "grad_norm": 1.1636738777160645, "learning_rate": 7.2455688160516445e-06, "loss": 0.09840905666351318, "memory(GiB)": 24.84, "step": 535, "token_acc": 0.9617486338797814, "train_speed(iter/s)": 0.734656 }, { "epoch": 0.3850574712643678, "grad_norm": 1.0179938077926636, "learning_rate": 7.234946247957087e-06, "loss": 0.10833137482404709, "memory(GiB)": 24.84, "step": 536, "token_acc": 0.9590643274853801, "train_speed(iter/s)": 0.7351 }, { "epoch": 0.3857758620689655, "grad_norm": 1.055955410003662, "learning_rate": 7.224311058573472e-06, "loss": 0.10779173672199249, "memory(GiB)": 24.84, "step": 537, "token_acc": 0.9646464646464646, "train_speed(iter/s)": 0.735537 }, { "epoch": 0.3864942528735632, "grad_norm": 0.9162089824676514, "learning_rate": 7.213663307960321e-06, "loss": 0.09521356225013733, "memory(GiB)": 24.84, "step": 538, "token_acc": 0.957089552238806, "train_speed(iter/s)": 0.735975 }, { "epoch": 0.38721264367816094, "grad_norm": 1.0770106315612793, "learning_rate": 7.20300305624809e-06, "loss": 0.10021792352199554, "memory(GiB)": 24.84, "step": 539, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.73636 }, { "epoch": 0.3879310344827586, "grad_norm": 1.5047006607055664, "learning_rate": 7.192330363637832e-06, "loss": 0.09770134836435318, "memory(GiB)": 24.84, "step": 540, "token_acc": 0.9545454545454546, "train_speed(iter/s)": 0.736668 }, { "epoch": 0.3886494252873563, "grad_norm": 1.2360405921936035, "learning_rate": 7.181645290400856e-06, "loss": 0.10725338757038116, "memory(GiB)": 24.84, "step": 541, "token_acc": 0.9548387096774194, "train_speed(iter/s)": 0.736976 }, { "epoch": 0.38936781609195403, "grad_norm": 1.1240206956863403, "learning_rate": 7.170947896878392e-06, "loss": 0.09974702447652817, "memory(GiB)": 24.84, "step": 542, "token_acc": 0.969811320754717, "train_speed(iter/s)": 0.737354 }, { "epoch": 0.3900862068965517, "grad_norm": 1.3412690162658691, "learning_rate": 7.160238243481238e-06, "loss": 0.1082761213183403, "memory(GiB)": 24.84, "step": 543, "token_acc": 0.9572490706319703, "train_speed(iter/s)": 0.737791 }, { "epoch": 0.39080459770114945, "grad_norm": 0.9703657031059265, "learning_rate": 7.149516390689433e-06, "loss": 0.10282774269580841, "memory(GiB)": 24.84, "step": 544, "token_acc": 0.969551282051282, "train_speed(iter/s)": 0.738222 }, { "epoch": 0.3915229885057471, "grad_norm": 1.1522157192230225, "learning_rate": 7.138782399051908e-06, "loss": 0.10382555425167084, "memory(GiB)": 24.84, "step": 545, "token_acc": 0.9520153550863724, "train_speed(iter/s)": 0.738658 }, { "epoch": 0.3922413793103448, "grad_norm": 1.0163986682891846, "learning_rate": 7.12803632918614e-06, "loss": 0.09793849289417267, "memory(GiB)": 24.84, "step": 546, "token_acc": 0.9649122807017544, "train_speed(iter/s)": 0.739094 }, { "epoch": 0.39295977011494254, "grad_norm": 0.9924953579902649, "learning_rate": 7.117278241777823e-06, "loss": 0.09201813489198685, "memory(GiB)": 24.84, "step": 547, "token_acc": 0.9554794520547946, "train_speed(iter/s)": 0.73953 }, { "epoch": 0.3936781609195402, "grad_norm": 0.7964367270469666, "learning_rate": 7.1065081975805086e-06, "loss": 0.10161037743091583, "memory(GiB)": 24.84, "step": 548, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.739961 }, { "epoch": 0.39439655172413796, "grad_norm": 1.0971744060516357, "learning_rate": 7.095726257415276e-06, "loss": 0.11514529585838318, "memory(GiB)": 24.84, "step": 549, "token_acc": 0.953125, "train_speed(iter/s)": 0.740393 }, { "epoch": 0.39511494252873564, "grad_norm": 0.87525874376297, "learning_rate": 7.084932482170385e-06, "loss": 0.09710817784070969, "memory(GiB)": 24.84, "step": 550, "token_acc": 0.9768683274021353, "train_speed(iter/s)": 0.740809 }, { "epoch": 0.39511494252873564, "eval_loss": 0.09873759001493454, "eval_runtime": 6.6255, "eval_samples_per_second": 67.919, "eval_steps_per_second": 2.264, "eval_token_acc": 0.962037962037962, "step": 550 }, { "epoch": 0.3958333333333333, "grad_norm": 0.9809340238571167, "learning_rate": 7.0741269328009286e-06, "loss": 0.0923876017332077, "memory(GiB)": 24.84, "step": 551, "token_acc": 0.9642484039466047, "train_speed(iter/s)": 0.721969 }, { "epoch": 0.39655172413793105, "grad_norm": 1.020236611366272, "learning_rate": 7.063309670328491e-06, "loss": 0.10057920962572098, "memory(GiB)": 24.84, "step": 552, "token_acc": 0.9571150097465887, "train_speed(iter/s)": 0.722404 }, { "epoch": 0.39727011494252873, "grad_norm": 1.0687503814697266, "learning_rate": 7.052480755840803e-06, "loss": 0.10533179342746735, "memory(GiB)": 24.84, "step": 553, "token_acc": 0.9615384615384616, "train_speed(iter/s)": 0.722832 }, { "epoch": 0.39798850574712646, "grad_norm": 0.9639756083488464, "learning_rate": 7.041640250491398e-06, "loss": 0.09748411178588867, "memory(GiB)": 24.84, "step": 554, "token_acc": 0.9575645756457565, "train_speed(iter/s)": 0.723263 }, { "epoch": 0.39870689655172414, "grad_norm": 1.140478253364563, "learning_rate": 7.030788215499268e-06, "loss": 0.09069720655679703, "memory(GiB)": 24.84, "step": 555, "token_acc": 0.9584664536741214, "train_speed(iter/s)": 0.723673 }, { "epoch": 0.3994252873563218, "grad_norm": 1.185470700263977, "learning_rate": 7.019924712148511e-06, "loss": 0.10478450357913971, "memory(GiB)": 24.84, "step": 556, "token_acc": 0.9574468085106383, "train_speed(iter/s)": 0.724012 }, { "epoch": 0.40014367816091956, "grad_norm": 0.8778942227363586, "learning_rate": 7.009049801787994e-06, "loss": 0.10495404899120331, "memory(GiB)": 24.84, "step": 557, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.724351 }, { "epoch": 0.40086206896551724, "grad_norm": 1.2196749448776245, "learning_rate": 6.998163545830998e-06, "loss": 0.11238733679056168, "memory(GiB)": 24.84, "step": 558, "token_acc": 0.9507299270072993, "train_speed(iter/s)": 0.724685 }, { "epoch": 0.4015804597701149, "grad_norm": 0.9775468111038208, "learning_rate": 6.987266005754879e-06, "loss": 0.09895237535238266, "memory(GiB)": 24.84, "step": 559, "token_acc": 0.9649122807017544, "train_speed(iter/s)": 0.725118 }, { "epoch": 0.40229885057471265, "grad_norm": 1.2810943126678467, "learning_rate": 6.976357243100718e-06, "loss": 0.10445723682641983, "memory(GiB)": 24.84, "step": 560, "token_acc": 0.9582417582417583, "train_speed(iter/s)": 0.725543 }, { "epoch": 0.40301724137931033, "grad_norm": 1.1017495393753052, "learning_rate": 6.965437319472965e-06, "loss": 0.11268270015716553, "memory(GiB)": 24.84, "step": 561, "token_acc": 0.9693877551020408, "train_speed(iter/s)": 0.725961 }, { "epoch": 0.40373563218390807, "grad_norm": 0.9425846934318542, "learning_rate": 6.954506296539112e-06, "loss": 0.11186347901821136, "memory(GiB)": 24.84, "step": 562, "token_acc": 0.9499036608863198, "train_speed(iter/s)": 0.726389 }, { "epoch": 0.40445402298850575, "grad_norm": 1.0702611207962036, "learning_rate": 6.943564236029317e-06, "loss": 0.11253111809492111, "memory(GiB)": 24.84, "step": 563, "token_acc": 0.9422680412371134, "train_speed(iter/s)": 0.726812 }, { "epoch": 0.4051724137931034, "grad_norm": 0.9865430593490601, "learning_rate": 6.9326111997360775e-06, "loss": 0.10423590242862701, "memory(GiB)": 24.84, "step": 564, "token_acc": 0.9633911368015414, "train_speed(iter/s)": 0.727237 }, { "epoch": 0.40589080459770116, "grad_norm": 1.1423591375350952, "learning_rate": 6.9216472495138785e-06, "loss": 0.09942566603422165, "memory(GiB)": 24.84, "step": 565, "token_acc": 0.9600886917960089, "train_speed(iter/s)": 0.727666 }, { "epoch": 0.40660919540229884, "grad_norm": 2.046635866165161, "learning_rate": 6.910672447278827e-06, "loss": 0.11045810580253601, "memory(GiB)": 24.84, "step": 566, "token_acc": 0.9670710571923743, "train_speed(iter/s)": 0.728088 }, { "epoch": 0.4073275862068966, "grad_norm": 2.7038323879241943, "learning_rate": 6.8996868550083226e-06, "loss": 0.09416159242391586, "memory(GiB)": 24.84, "step": 567, "token_acc": 0.95625, "train_speed(iter/s)": 0.728505 }, { "epoch": 0.40804597701149425, "grad_norm": 1.3885408639907837, "learning_rate": 6.8886905347406985e-06, "loss": 0.10328912734985352, "memory(GiB)": 24.84, "step": 568, "token_acc": 0.9665354330708661, "train_speed(iter/s)": 0.728926 }, { "epoch": 0.40876436781609193, "grad_norm": 1.1590747833251953, "learning_rate": 6.877683548574866e-06, "loss": 0.10196081548929214, "memory(GiB)": 24.84, "step": 569, "token_acc": 0.9635761589403974, "train_speed(iter/s)": 0.729344 }, { "epoch": 0.40948275862068967, "grad_norm": 0.9205074906349182, "learning_rate": 6.866665958669976e-06, "loss": 0.10381410270929337, "memory(GiB)": 24.84, "step": 570, "token_acc": 0.9636363636363636, "train_speed(iter/s)": 0.729751 }, { "epoch": 0.41020114942528735, "grad_norm": 1.4095512628555298, "learning_rate": 6.855637827245055e-06, "loss": 0.10683813691139221, "memory(GiB)": 24.84, "step": 571, "token_acc": 0.9523809523809523, "train_speed(iter/s)": 0.730148 }, { "epoch": 0.4109195402298851, "grad_norm": 0.887482225894928, "learning_rate": 6.844599216578667e-06, "loss": 0.10551828145980835, "memory(GiB)": 24.84, "step": 572, "token_acc": 0.9620689655172414, "train_speed(iter/s)": 0.730444 }, { "epoch": 0.41163793103448276, "grad_norm": 0.9398730993270874, "learning_rate": 6.833550189008546e-06, "loss": 0.10359399020671844, "memory(GiB)": 24.84, "step": 573, "token_acc": 0.96045197740113, "train_speed(iter/s)": 0.730747 }, { "epoch": 0.41235632183908044, "grad_norm": 1.8816733360290527, "learning_rate": 6.822490806931262e-06, "loss": 0.10712971538305283, "memory(GiB)": 24.84, "step": 574, "token_acc": 0.9537815126050421, "train_speed(iter/s)": 0.731057 }, { "epoch": 0.4130747126436782, "grad_norm": 1.9917737245559692, "learning_rate": 6.811421132801855e-06, "loss": 0.10952628403902054, "memory(GiB)": 24.84, "step": 575, "token_acc": 0.9591078066914498, "train_speed(iter/s)": 0.731363 }, { "epoch": 0.41379310344827586, "grad_norm": 0.8376147747039795, "learning_rate": 6.800341229133486e-06, "loss": 0.10060207545757294, "memory(GiB)": 24.84, "step": 576, "token_acc": 0.9606003752345216, "train_speed(iter/s)": 0.731598 }, { "epoch": 0.41451149425287354, "grad_norm": 0.8417714834213257, "learning_rate": 6.789251158497084e-06, "loss": 0.0937974601984024, "memory(GiB)": 24.84, "step": 577, "token_acc": 0.9586206896551724, "train_speed(iter/s)": 0.732006 }, { "epoch": 0.41522988505747127, "grad_norm": 1.2017229795455933, "learning_rate": 6.778150983520999e-06, "loss": 0.0943036824464798, "memory(GiB)": 24.84, "step": 578, "token_acc": 0.9616788321167883, "train_speed(iter/s)": 0.732418 }, { "epoch": 0.41594827586206895, "grad_norm": 0.9254114031791687, "learning_rate": 6.767040766890636e-06, "loss": 0.09632664918899536, "memory(GiB)": 24.84, "step": 579, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.732825 }, { "epoch": 0.4166666666666667, "grad_norm": 0.895348072052002, "learning_rate": 6.755920571348111e-06, "loss": 0.10044130682945251, "memory(GiB)": 24.84, "step": 580, "token_acc": 0.9610894941634242, "train_speed(iter/s)": 0.733237 }, { "epoch": 0.41738505747126436, "grad_norm": 1.0386415719985962, "learning_rate": 6.744790459691894e-06, "loss": 0.09722594916820526, "memory(GiB)": 24.84, "step": 581, "token_acc": 0.9474835886214442, "train_speed(iter/s)": 0.733424 }, { "epoch": 0.41810344827586204, "grad_norm": 1.049390435218811, "learning_rate": 6.73365049477645e-06, "loss": 0.09305921941995621, "memory(GiB)": 24.84, "step": 582, "token_acc": 0.9541984732824428, "train_speed(iter/s)": 0.733819 }, { "epoch": 0.4188218390804598, "grad_norm": 0.8428685069084167, "learning_rate": 6.722500739511895e-06, "loss": 0.08820360898971558, "memory(GiB)": 24.84, "step": 583, "token_acc": 0.9664179104477612, "train_speed(iter/s)": 0.734198 }, { "epoch": 0.41954022988505746, "grad_norm": 1.6562708616256714, "learning_rate": 6.711341256863623e-06, "loss": 0.10299413651227951, "memory(GiB)": 24.84, "step": 584, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.734606 }, { "epoch": 0.4202586206896552, "grad_norm": 1.0888352394104004, "learning_rate": 6.700172109851971e-06, "loss": 0.09153769165277481, "memory(GiB)": 24.84, "step": 585, "token_acc": 0.9521829521829522, "train_speed(iter/s)": 0.735013 }, { "epoch": 0.4209770114942529, "grad_norm": 0.9358983635902405, "learning_rate": 6.688993361551847e-06, "loss": 0.10442180931568146, "memory(GiB)": 24.84, "step": 586, "token_acc": 0.9709618874773139, "train_speed(iter/s)": 0.73542 }, { "epoch": 0.42169540229885055, "grad_norm": 1.1094099283218384, "learning_rate": 6.677805075092381e-06, "loss": 0.10711309313774109, "memory(GiB)": 24.84, "step": 587, "token_acc": 0.9617486338797814, "train_speed(iter/s)": 0.735826 }, { "epoch": 0.4224137931034483, "grad_norm": 0.8663936853408813, "learning_rate": 6.66660731365657e-06, "loss": 0.10309429466724396, "memory(GiB)": 24.84, "step": 588, "token_acc": 0.9469026548672567, "train_speed(iter/s)": 0.73623 }, { "epoch": 0.42313218390804597, "grad_norm": 0.9223802089691162, "learning_rate": 6.655400140480916e-06, "loss": 0.09803730249404907, "memory(GiB)": 24.84, "step": 589, "token_acc": 0.9599303135888502, "train_speed(iter/s)": 0.736631 }, { "epoch": 0.4238505747126437, "grad_norm": 0.7408332228660583, "learning_rate": 6.64418361885507e-06, "loss": 0.08790215849876404, "memory(GiB)": 24.84, "step": 590, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.737014 }, { "epoch": 0.4245689655172414, "grad_norm": 1.3856441974639893, "learning_rate": 6.632957812121479e-06, "loss": 0.10292988270521164, "memory(GiB)": 24.84, "step": 591, "token_acc": 0.9522900763358778, "train_speed(iter/s)": 0.737418 }, { "epoch": 0.42528735632183906, "grad_norm": 0.7161735892295837, "learning_rate": 6.621722783675024e-06, "loss": 0.09429532289505005, "memory(GiB)": 24.84, "step": 592, "token_acc": 0.962457337883959, "train_speed(iter/s)": 0.737802 }, { "epoch": 0.4260057471264368, "grad_norm": 1.0052870512008667, "learning_rate": 6.610478596962665e-06, "loss": 0.09552145004272461, "memory(GiB)": 24.84, "step": 593, "token_acc": 0.9617706237424547, "train_speed(iter/s)": 0.738123 }, { "epoch": 0.4267241379310345, "grad_norm": 1.5970730781555176, "learning_rate": 6.599225315483076e-06, "loss": 0.1061292514204979, "memory(GiB)": 24.84, "step": 594, "token_acc": 0.9704251386321626, "train_speed(iter/s)": 0.738404 }, { "epoch": 0.4274425287356322, "grad_norm": 1.2523112297058105, "learning_rate": 6.587963002786299e-06, "loss": 0.09640787541866302, "memory(GiB)": 24.84, "step": 595, "token_acc": 0.9662027833001988, "train_speed(iter/s)": 0.738795 }, { "epoch": 0.4281609195402299, "grad_norm": 1.0037275552749634, "learning_rate": 6.576691722473368e-06, "loss": 0.1000569686293602, "memory(GiB)": 24.84, "step": 596, "token_acc": 0.9652650822669104, "train_speed(iter/s)": 0.739191 }, { "epoch": 0.42887931034482757, "grad_norm": 1.2037330865859985, "learning_rate": 6.5654115381959695e-06, "loss": 0.09520967304706573, "memory(GiB)": 24.84, "step": 597, "token_acc": 0.9569672131147541, "train_speed(iter/s)": 0.739566 }, { "epoch": 0.4295977011494253, "grad_norm": 1.9629340171813965, "learning_rate": 6.554122513656065e-06, "loss": 0.09098273515701294, "memory(GiB)": 24.84, "step": 598, "token_acc": 0.9554030874785592, "train_speed(iter/s)": 0.739963 }, { "epoch": 0.430316091954023, "grad_norm": 0.9623202681541443, "learning_rate": 6.542824712605542e-06, "loss": 0.08370727300643921, "memory(GiB)": 24.84, "step": 599, "token_acc": 0.9743150684931506, "train_speed(iter/s)": 0.74036 }, { "epoch": 0.43103448275862066, "grad_norm": 0.7939316630363464, "learning_rate": 6.531518198845854e-06, "loss": 0.0960487648844719, "memory(GiB)": 24.84, "step": 600, "token_acc": 0.9700934579439252, "train_speed(iter/s)": 0.740751 }, { "epoch": 0.43103448275862066, "eval_loss": 0.09290853142738342, "eval_runtime": 5.8572, "eval_samples_per_second": 76.829, "eval_steps_per_second": 2.561, "eval_token_acc": 0.9634271978021978, "step": 600 }, { "epoch": 0.4317528735632184, "grad_norm": 2.171105146408081, "learning_rate": 6.520203036227651e-06, "loss": 0.08810974657535553, "memory(GiB)": 24.84, "step": 601, "token_acc": 0.9660399529964747, "train_speed(iter/s)": 0.723805 }, { "epoch": 0.4324712643678161, "grad_norm": 0.7150639891624451, "learning_rate": 6.508879288650431e-06, "loss": 0.09494931995868683, "memory(GiB)": 24.84, "step": 602, "token_acc": 0.9670138888888888, "train_speed(iter/s)": 0.724206 }, { "epoch": 0.4331896551724138, "grad_norm": 1.0675841569900513, "learning_rate": 6.4975470200621705e-06, "loss": 0.10132480412721634, "memory(GiB)": 24.84, "step": 603, "token_acc": 0.9478957915831663, "train_speed(iter/s)": 0.724605 }, { "epoch": 0.4339080459770115, "grad_norm": 0.9373019933700562, "learning_rate": 6.486206294458966e-06, "loss": 0.10160164535045624, "memory(GiB)": 24.84, "step": 604, "token_acc": 0.9537037037037037, "train_speed(iter/s)": 0.724997 }, { "epoch": 0.43462643678160917, "grad_norm": 1.0563969612121582, "learning_rate": 6.474857175884673e-06, "loss": 0.08329151570796967, "memory(GiB)": 24.84, "step": 605, "token_acc": 0.9787234042553191, "train_speed(iter/s)": 0.725396 }, { "epoch": 0.4353448275862069, "grad_norm": 1.2491768598556519, "learning_rate": 6.463499728430549e-06, "loss": 0.097022645175457, "memory(GiB)": 24.84, "step": 606, "token_acc": 0.954954954954955, "train_speed(iter/s)": 0.725797 }, { "epoch": 0.4360632183908046, "grad_norm": 0.7747812867164612, "learning_rate": 6.4521340162348765e-06, "loss": 0.09988689422607422, "memory(GiB)": 24.84, "step": 607, "token_acc": 0.9552238805970149, "train_speed(iter/s)": 0.726197 }, { "epoch": 0.4367816091954023, "grad_norm": 0.8826586604118347, "learning_rate": 6.4407601034826225e-06, "loss": 0.0974455177783966, "memory(GiB)": 24.84, "step": 608, "token_acc": 0.9584055459272097, "train_speed(iter/s)": 0.726594 }, { "epoch": 0.4375, "grad_norm": 0.9875901937484741, "learning_rate": 6.429378054405055e-06, "loss": 0.09550222754478455, "memory(GiB)": 24.84, "step": 609, "token_acc": 0.9705304518664047, "train_speed(iter/s)": 0.726927 }, { "epoch": 0.4382183908045977, "grad_norm": 0.8828912377357483, "learning_rate": 6.417987933279397e-06, "loss": 0.10582631081342697, "memory(GiB)": 24.84, "step": 610, "token_acc": 0.9539347408829175, "train_speed(iter/s)": 0.72721 }, { "epoch": 0.4389367816091954, "grad_norm": 1.0183696746826172, "learning_rate": 6.406589804428449e-06, "loss": 0.09873131662607193, "memory(GiB)": 24.84, "step": 611, "token_acc": 0.9486652977412731, "train_speed(iter/s)": 0.727519 }, { "epoch": 0.4396551724137931, "grad_norm": 1.5483784675598145, "learning_rate": 6.395183732220242e-06, "loss": 0.10141580551862717, "memory(GiB)": 24.84, "step": 612, "token_acc": 0.9634146341463414, "train_speed(iter/s)": 0.727907 }, { "epoch": 0.44037356321839083, "grad_norm": 0.9236054420471191, "learning_rate": 6.3837697810676595e-06, "loss": 0.08817378431558609, "memory(GiB)": 24.84, "step": 613, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.7283 }, { "epoch": 0.4410919540229885, "grad_norm": 1.0316143035888672, "learning_rate": 6.372348015428077e-06, "loss": 0.09336516261100769, "memory(GiB)": 24.84, "step": 614, "token_acc": 0.9618181818181818, "train_speed(iter/s)": 0.728692 }, { "epoch": 0.4418103448275862, "grad_norm": 0.9575990438461304, "learning_rate": 6.360918499803008e-06, "loss": 0.09915097057819366, "memory(GiB)": 24.84, "step": 615, "token_acc": 0.9652650822669104, "train_speed(iter/s)": 0.729085 }, { "epoch": 0.4425287356321839, "grad_norm": 1.1113587617874146, "learning_rate": 6.349481298737723e-06, "loss": 0.10027844458818436, "memory(GiB)": 24.84, "step": 616, "token_acc": 0.9576427255985267, "train_speed(iter/s)": 0.729443 }, { "epoch": 0.4432471264367816, "grad_norm": 1.0846248865127563, "learning_rate": 6.3380364768209036e-06, "loss": 0.09093832969665527, "memory(GiB)": 24.84, "step": 617, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.72983 }, { "epoch": 0.44396551724137934, "grad_norm": 1.0515748262405396, "learning_rate": 6.32658409868426e-06, "loss": 0.10288532823324203, "memory(GiB)": 24.84, "step": 618, "token_acc": 0.95578231292517, "train_speed(iter/s)": 0.73022 }, { "epoch": 0.444683908045977, "grad_norm": 1.4630143642425537, "learning_rate": 6.315124229002177e-06, "loss": 0.10114451497793198, "memory(GiB)": 24.84, "step": 619, "token_acc": 0.9625246548323472, "train_speed(iter/s)": 0.730578 }, { "epoch": 0.4454022988505747, "grad_norm": 1.1701664924621582, "learning_rate": 6.303656932491349e-06, "loss": 0.09213127195835114, "memory(GiB)": 24.84, "step": 620, "token_acc": 0.9700996677740864, "train_speed(iter/s)": 0.730962 }, { "epoch": 0.44612068965517243, "grad_norm": 1.2215163707733154, "learning_rate": 6.292182273910409e-06, "loss": 0.10057064890861511, "memory(GiB)": 24.84, "step": 621, "token_acc": 0.9487704918032787, "train_speed(iter/s)": 0.731343 }, { "epoch": 0.4468390804597701, "grad_norm": 1.5336759090423584, "learning_rate": 6.280700318059563e-06, "loss": 0.09580475836992264, "memory(GiB)": 24.84, "step": 622, "token_acc": 0.9669565217391304, "train_speed(iter/s)": 0.731722 }, { "epoch": 0.4475574712643678, "grad_norm": 1.1412867307662964, "learning_rate": 6.269211129780232e-06, "loss": 0.09715986251831055, "memory(GiB)": 24.84, "step": 623, "token_acc": 0.9652777777777778, "train_speed(iter/s)": 0.732089 }, { "epoch": 0.4482758620689655, "grad_norm": 1.362949252128601, "learning_rate": 6.257714773954674e-06, "loss": 0.1033032089471817, "memory(GiB)": 24.84, "step": 624, "token_acc": 0.9481327800829875, "train_speed(iter/s)": 0.732354 }, { "epoch": 0.4489942528735632, "grad_norm": 1.1167316436767578, "learning_rate": 6.2462113155056305e-06, "loss": 0.09575332701206207, "memory(GiB)": 24.84, "step": 625, "token_acc": 0.9612244897959183, "train_speed(iter/s)": 0.73267 }, { "epoch": 0.44971264367816094, "grad_norm": 1.5427743196487427, "learning_rate": 6.234700819395946e-06, "loss": 0.09764684736728668, "memory(GiB)": 24.84, "step": 626, "token_acc": 0.9626168224299065, "train_speed(iter/s)": 0.733038 }, { "epoch": 0.4504310344827586, "grad_norm": 0.9193015098571777, "learning_rate": 6.223183350628215e-06, "loss": 0.09733188152313232, "memory(GiB)": 24.84, "step": 627, "token_acc": 0.9584086799276673, "train_speed(iter/s)": 0.733413 }, { "epoch": 0.4511494252873563, "grad_norm": 1.0162497758865356, "learning_rate": 6.211658974244407e-06, "loss": 0.094648577272892, "memory(GiB)": 24.84, "step": 628, "token_acc": 0.9559386973180076, "train_speed(iter/s)": 0.733789 }, { "epoch": 0.45186781609195403, "grad_norm": 0.9464552402496338, "learning_rate": 6.2001277553254946e-06, "loss": 0.09657412767410278, "memory(GiB)": 24.84, "step": 629, "token_acc": 0.956, "train_speed(iter/s)": 0.734131 }, { "epoch": 0.4525862068965517, "grad_norm": 1.2065213918685913, "learning_rate": 6.1885897589911e-06, "loss": 0.09302497655153275, "memory(GiB)": 24.84, "step": 630, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.734402 }, { "epoch": 0.45330459770114945, "grad_norm": 0.9411286115646362, "learning_rate": 6.177045050399114e-06, "loss": 0.09372767060995102, "memory(GiB)": 24.84, "step": 631, "token_acc": 0.9584158415841584, "train_speed(iter/s)": 0.734684 }, { "epoch": 0.4540229885057471, "grad_norm": 0.8479830622673035, "learning_rate": 6.1654936947453355e-06, "loss": 0.08492769300937653, "memory(GiB)": 24.84, "step": 632, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.734917 }, { "epoch": 0.4547413793103448, "grad_norm": 0.9935433864593506, "learning_rate": 6.153935757263098e-06, "loss": 0.101004958152771, "memory(GiB)": 24.84, "step": 633, "token_acc": 0.9673539518900344, "train_speed(iter/s)": 0.73529 }, { "epoch": 0.45545977011494254, "grad_norm": 0.9869242906570435, "learning_rate": 6.142371303222909e-06, "loss": 0.09348718076944351, "memory(GiB)": 24.84, "step": 634, "token_acc": 0.9553752535496958, "train_speed(iter/s)": 0.735666 }, { "epoch": 0.4561781609195402, "grad_norm": 0.9835852384567261, "learning_rate": 6.130800397932073e-06, "loss": 0.0949326828122139, "memory(GiB)": 24.84, "step": 635, "token_acc": 0.9664310954063604, "train_speed(iter/s)": 0.736034 }, { "epoch": 0.45689655172413796, "grad_norm": 1.014649748802185, "learning_rate": 6.119223106734328e-06, "loss": 0.09672580659389496, "memory(GiB)": 24.84, "step": 636, "token_acc": 0.9649415692821369, "train_speed(iter/s)": 0.736397 }, { "epoch": 0.45761494252873564, "grad_norm": 1.0924326181411743, "learning_rate": 6.107639495009472e-06, "loss": 0.09272860735654831, "memory(GiB)": 24.84, "step": 637, "token_acc": 0.9717314487632509, "train_speed(iter/s)": 0.736771 }, { "epoch": 0.4583333333333333, "grad_norm": 1.1298500299453735, "learning_rate": 6.0960496281729995e-06, "loss": 0.08880304545164108, "memory(GiB)": 24.84, "step": 638, "token_acc": 0.9558232931726908, "train_speed(iter/s)": 0.737135 }, { "epoch": 0.45905172413793105, "grad_norm": 1.64706289768219, "learning_rate": 6.084453571675728e-06, "loss": 0.09650009125471115, "memory(GiB)": 24.84, "step": 639, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.737492 }, { "epoch": 0.45977011494252873, "grad_norm": 0.961529552936554, "learning_rate": 6.072851391003432e-06, "loss": 0.09986139088869095, "memory(GiB)": 24.84, "step": 640, "token_acc": 0.951417004048583, "train_speed(iter/s)": 0.737865 }, { "epoch": 0.46048850574712646, "grad_norm": 0.878676176071167, "learning_rate": 6.061243151676465e-06, "loss": 0.09690164029598236, "memory(GiB)": 24.84, "step": 641, "token_acc": 0.9506903353057199, "train_speed(iter/s)": 0.738233 }, { "epoch": 0.46120689655172414, "grad_norm": 0.8117820024490356, "learning_rate": 6.0496289192494e-06, "loss": 0.09799845516681671, "memory(GiB)": 24.84, "step": 642, "token_acc": 0.96, "train_speed(iter/s)": 0.738599 }, { "epoch": 0.4619252873563218, "grad_norm": 1.2709945440292358, "learning_rate": 6.038008759310654e-06, "loss": 0.0991658866405487, "memory(GiB)": 24.84, "step": 643, "token_acc": 0.9588336192109777, "train_speed(iter/s)": 0.738968 }, { "epoch": 0.46264367816091956, "grad_norm": 1.0069496631622314, "learning_rate": 6.026382737482116e-06, "loss": 0.09862448275089264, "memory(GiB)": 24.84, "step": 644, "token_acc": 0.9718543046357616, "train_speed(iter/s)": 0.739336 }, { "epoch": 0.46336206896551724, "grad_norm": 1.1386953592300415, "learning_rate": 6.0147509194187815e-06, "loss": 0.09559030830860138, "memory(GiB)": 24.84, "step": 645, "token_acc": 0.9639830508474576, "train_speed(iter/s)": 0.73969 }, { "epoch": 0.4640804597701149, "grad_norm": 1.0747205018997192, "learning_rate": 6.003113370808375e-06, "loss": 0.09457142651081085, "memory(GiB)": 24.84, "step": 646, "token_acc": 0.9607250755287009, "train_speed(iter/s)": 0.739974 }, { "epoch": 0.46479885057471265, "grad_norm": 1.3741015195846558, "learning_rate": 5.991470157370988e-06, "loss": 0.09248257428407669, "memory(GiB)": 24.84, "step": 647, "token_acc": 0.9644970414201184, "train_speed(iter/s)": 0.74023 }, { "epoch": 0.46551724137931033, "grad_norm": 0.7507222890853882, "learning_rate": 5.979821344858695e-06, "loss": 0.09867667406797409, "memory(GiB)": 24.84, "step": 648, "token_acc": 0.970703125, "train_speed(iter/s)": 0.740571 }, { "epoch": 0.46623563218390807, "grad_norm": 0.8737552762031555, "learning_rate": 5.968166999055197e-06, "loss": 0.08857963979244232, "memory(GiB)": 24.84, "step": 649, "token_acc": 0.9702602230483272, "train_speed(iter/s)": 0.740931 }, { "epoch": 0.46695402298850575, "grad_norm": 1.0540691614151, "learning_rate": 5.956507185775441e-06, "loss": 0.09258486330509186, "memory(GiB)": 24.84, "step": 650, "token_acc": 0.9662522202486679, "train_speed(iter/s)": 0.741285 }, { "epoch": 0.46695402298850575, "eval_loss": 0.08939895778894424, "eval_runtime": 6.4814, "eval_samples_per_second": 69.429, "eval_steps_per_second": 2.314, "eval_token_acc": 0.9650193556443556, "step": 650 }, { "epoch": 0.4676724137931034, "grad_norm": 1.2081449031829834, "learning_rate": 5.944841970865249e-06, "loss": 0.0846780464053154, "memory(GiB)": 26.38, "step": 651, "token_acc": 0.9697356215213359, "train_speed(iter/s)": 0.724349 }, { "epoch": 0.46839080459770116, "grad_norm": 1.2141385078430176, "learning_rate": 5.933171420200946e-06, "loss": 0.09899264574050903, "memory(GiB)": 26.38, "step": 652, "token_acc": 0.9535398230088495, "train_speed(iter/s)": 0.724705 }, { "epoch": 0.46910919540229884, "grad_norm": 1.5502980947494507, "learning_rate": 5.921495599688994e-06, "loss": 0.093675896525383, "memory(GiB)": 26.38, "step": 653, "token_acc": 0.9662522202486679, "train_speed(iter/s)": 0.725061 }, { "epoch": 0.4698275862068966, "grad_norm": 3.112104892730713, "learning_rate": 5.909814575265609e-06, "loss": 0.08393201977014542, "memory(GiB)": 26.38, "step": 654, "token_acc": 0.9567099567099567, "train_speed(iter/s)": 0.72542 }, { "epoch": 0.47054597701149425, "grad_norm": 1.364339828491211, "learning_rate": 5.898128412896401e-06, "loss": 0.08788499981164932, "memory(GiB)": 26.38, "step": 655, "token_acc": 0.9685767097966729, "train_speed(iter/s)": 0.725789 }, { "epoch": 0.47126436781609193, "grad_norm": 0.9216952919960022, "learning_rate": 5.88643717857599e-06, "loss": 0.10707274079322815, "memory(GiB)": 26.38, "step": 656, "token_acc": 0.9634146341463414, "train_speed(iter/s)": 0.726156 }, { "epoch": 0.47198275862068967, "grad_norm": 0.9463977217674255, "learning_rate": 5.87474093832764e-06, "loss": 0.08701065927743912, "memory(GiB)": 26.38, "step": 657, "token_acc": 0.9589322381930184, "train_speed(iter/s)": 0.726512 }, { "epoch": 0.47270114942528735, "grad_norm": 1.0429399013519287, "learning_rate": 5.863039758202889e-06, "loss": 0.08990433812141418, "memory(GiB)": 26.38, "step": 658, "token_acc": 0.974910394265233, "train_speed(iter/s)": 0.726879 }, { "epoch": 0.4734195402298851, "grad_norm": 1.5515447854995728, "learning_rate": 5.851333704281164e-06, "loss": 0.1023520678281784, "memory(GiB)": 26.38, "step": 659, "token_acc": 0.9665354330708661, "train_speed(iter/s)": 0.727244 }, { "epoch": 0.47413793103448276, "grad_norm": 1.421204686164856, "learning_rate": 5.839622842669423e-06, "loss": 0.09511786699295044, "memory(GiB)": 26.38, "step": 660, "token_acc": 0.9481641468682506, "train_speed(iter/s)": 0.727542 }, { "epoch": 0.47485632183908044, "grad_norm": 0.8577045798301697, "learning_rate": 5.827907239501769e-06, "loss": 0.09023083746433258, "memory(GiB)": 26.38, "step": 661, "token_acc": 0.9637023593466425, "train_speed(iter/s)": 0.727798 }, { "epoch": 0.4755747126436782, "grad_norm": 1.2591403722763062, "learning_rate": 5.816186960939084e-06, "loss": 0.09647996723651886, "memory(GiB)": 26.38, "step": 662, "token_acc": 0.959409594095941, "train_speed(iter/s)": 0.728056 }, { "epoch": 0.47629310344827586, "grad_norm": 1.2893247604370117, "learning_rate": 5.804462073168652e-06, "loss": 0.09710188210010529, "memory(GiB)": 26.38, "step": 663, "token_acc": 0.9628975265017667, "train_speed(iter/s)": 0.728203 }, { "epoch": 0.47701149425287354, "grad_norm": 0.8880581259727478, "learning_rate": 5.7927326424037875e-06, "loss": 0.0993897020816803, "memory(GiB)": 26.38, "step": 664, "token_acc": 0.9641638225255973, "train_speed(iter/s)": 0.728554 }, { "epoch": 0.47772988505747127, "grad_norm": 0.9053446054458618, "learning_rate": 5.7809987348834605e-06, "loss": 0.08985472470521927, "memory(GiB)": 26.38, "step": 665, "token_acc": 0.9487750556792873, "train_speed(iter/s)": 0.7289 }, { "epoch": 0.47844827586206895, "grad_norm": 1.212175726890564, "learning_rate": 5.7692604168719225e-06, "loss": 0.09166769683361053, "memory(GiB)": 26.38, "step": 666, "token_acc": 0.9631675874769797, "train_speed(iter/s)": 0.729251 }, { "epoch": 0.4791666666666667, "grad_norm": 0.8622274994850159, "learning_rate": 5.7575177546583294e-06, "loss": 0.09530209004878998, "memory(GiB)": 26.38, "step": 667, "token_acc": 0.9635193133047211, "train_speed(iter/s)": 0.729601 }, { "epoch": 0.47988505747126436, "grad_norm": 1.227569580078125, "learning_rate": 5.745770814556373e-06, "loss": 0.09925785660743713, "memory(GiB)": 26.38, "step": 668, "token_acc": 0.9650924024640657, "train_speed(iter/s)": 0.729954 }, { "epoch": 0.48060344827586204, "grad_norm": 0.7272898554801941, "learning_rate": 5.734019662903901e-06, "loss": 0.10001900047063828, "memory(GiB)": 26.38, "step": 669, "token_acc": 0.9617486338797814, "train_speed(iter/s)": 0.730303 }, { "epoch": 0.4813218390804598, "grad_norm": 0.7615684270858765, "learning_rate": 5.722264366062549e-06, "loss": 0.09268024563789368, "memory(GiB)": 26.38, "step": 670, "token_acc": 0.9633204633204633, "train_speed(iter/s)": 0.730647 }, { "epoch": 0.48204022988505746, "grad_norm": 0.87273770570755, "learning_rate": 5.710504990417355e-06, "loss": 0.08621759712696075, "memory(GiB)": 26.38, "step": 671, "token_acc": 0.965034965034965, "train_speed(iter/s)": 0.730992 }, { "epoch": 0.4827586206896552, "grad_norm": 1.1019201278686523, "learning_rate": 5.698741602376395e-06, "loss": 0.09552071988582611, "memory(GiB)": 26.38, "step": 672, "token_acc": 0.9570815450643777, "train_speed(iter/s)": 0.731339 }, { "epoch": 0.4834770114942529, "grad_norm": 1.027260184288025, "learning_rate": 5.686974268370405e-06, "loss": 0.08426695317029953, "memory(GiB)": 26.38, "step": 673, "token_acc": 0.9646840148698885, "train_speed(iter/s)": 0.73168 }, { "epoch": 0.48419540229885055, "grad_norm": 1.377021074295044, "learning_rate": 5.675203054852403e-06, "loss": 0.08640296757221222, "memory(GiB)": 26.38, "step": 674, "token_acc": 0.974609375, "train_speed(iter/s)": 0.732027 }, { "epoch": 0.4849137931034483, "grad_norm": 0.8988898992538452, "learning_rate": 5.6634280282973165e-06, "loss": 0.0929965078830719, "memory(GiB)": 26.38, "step": 675, "token_acc": 0.9597806215722121, "train_speed(iter/s)": 0.732311 }, { "epoch": 0.48563218390804597, "grad_norm": 1.1238445043563843, "learning_rate": 5.651649255201603e-06, "loss": 0.08780014514923096, "memory(GiB)": 26.38, "step": 676, "token_acc": 0.9661354581673307, "train_speed(iter/s)": 0.732557 }, { "epoch": 0.4863505747126437, "grad_norm": 1.3038151264190674, "learning_rate": 5.639866802082883e-06, "loss": 0.08624088019132614, "memory(GiB)": 26.38, "step": 677, "token_acc": 0.9744094488188977, "train_speed(iter/s)": 0.732898 }, { "epoch": 0.4870689655172414, "grad_norm": 0.8883510231971741, "learning_rate": 5.628080735479553e-06, "loss": 0.08130700886249542, "memory(GiB)": 26.38, "step": 678, "token_acc": 0.9780775716694773, "train_speed(iter/s)": 0.733246 }, { "epoch": 0.48778735632183906, "grad_norm": 1.0704052448272705, "learning_rate": 5.616291121950421e-06, "loss": 0.09129241108894348, "memory(GiB)": 26.38, "step": 679, "token_acc": 0.9695238095238096, "train_speed(iter/s)": 0.733583 }, { "epoch": 0.4885057471264368, "grad_norm": 0.855258584022522, "learning_rate": 5.604498028074323e-06, "loss": 0.08300517499446869, "memory(GiB)": 26.38, "step": 680, "token_acc": 0.9712121212121212, "train_speed(iter/s)": 0.73392 }, { "epoch": 0.4892241379310345, "grad_norm": 1.3169938325881958, "learning_rate": 5.592701520449751e-06, "loss": 0.08334079384803772, "memory(GiB)": 26.38, "step": 681, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 0.734268 }, { "epoch": 0.4899425287356322, "grad_norm": 1.0568863153457642, "learning_rate": 5.580901665694471e-06, "loss": 0.08151838928461075, "memory(GiB)": 26.38, "step": 682, "token_acc": 0.957169459962756, "train_speed(iter/s)": 0.734591 }, { "epoch": 0.4906609195402299, "grad_norm": 1.4351849555969238, "learning_rate": 5.5690985304451575e-06, "loss": 0.10071972012519836, "memory(GiB)": 26.38, "step": 683, "token_acc": 0.96, "train_speed(iter/s)": 0.734829 }, { "epoch": 0.49137931034482757, "grad_norm": 1.045121669769287, "learning_rate": 5.557292181357003e-06, "loss": 0.07871466130018234, "memory(GiB)": 26.38, "step": 684, "token_acc": 0.9737302977232924, "train_speed(iter/s)": 0.73509 }, { "epoch": 0.4920977011494253, "grad_norm": 1.1799269914627075, "learning_rate": 5.545482685103356e-06, "loss": 0.08088584244251251, "memory(GiB)": 26.38, "step": 685, "token_acc": 0.9672447013487476, "train_speed(iter/s)": 0.735311 }, { "epoch": 0.492816091954023, "grad_norm": 0.9605402946472168, "learning_rate": 5.533670108375334e-06, "loss": 0.08731529116630554, "memory(GiB)": 26.38, "step": 686, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.735653 }, { "epoch": 0.49353448275862066, "grad_norm": 0.8562530279159546, "learning_rate": 5.521854517881454e-06, "loss": 0.08568824827671051, "memory(GiB)": 26.38, "step": 687, "token_acc": 0.960377358490566, "train_speed(iter/s)": 0.735971 }, { "epoch": 0.4942528735632184, "grad_norm": 1.0218334197998047, "learning_rate": 5.510035980347249e-06, "loss": 0.09573349356651306, "memory(GiB)": 26.38, "step": 688, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.736307 }, { "epoch": 0.4949712643678161, "grad_norm": 1.1240662336349487, "learning_rate": 5.498214562514896e-06, "loss": 0.0932910218834877, "memory(GiB)": 26.38, "step": 689, "token_acc": 0.9567010309278351, "train_speed(iter/s)": 0.736628 }, { "epoch": 0.4956896551724138, "grad_norm": 1.1241310834884644, "learning_rate": 5.486390331142841e-06, "loss": 0.09082062542438507, "memory(GiB)": 26.38, "step": 690, "token_acc": 0.9654545454545455, "train_speed(iter/s)": 0.736955 }, { "epoch": 0.4964080459770115, "grad_norm": 0.9445090293884277, "learning_rate": 5.474563353005411e-06, "loss": 0.08835384994745255, "memory(GiB)": 26.38, "step": 691, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.737286 }, { "epoch": 0.49712643678160917, "grad_norm": 1.9841448068618774, "learning_rate": 5.462733694892452e-06, "loss": 0.08324095606803894, "memory(GiB)": 26.38, "step": 692, "token_acc": 0.9689578713968958, "train_speed(iter/s)": 0.7376 }, { "epoch": 0.4978448275862069, "grad_norm": 1.1443192958831787, "learning_rate": 5.450901423608942e-06, "loss": 0.09350195527076721, "memory(GiB)": 26.38, "step": 693, "token_acc": 0.9613821138211383, "train_speed(iter/s)": 0.73793 }, { "epoch": 0.4985632183908046, "grad_norm": 1.1371794939041138, "learning_rate": 5.439066605974615e-06, "loss": 0.09789994359016418, "memory(GiB)": 26.38, "step": 694, "token_acc": 0.9570552147239264, "train_speed(iter/s)": 0.738257 }, { "epoch": 0.4992816091954023, "grad_norm": 1.4195266962051392, "learning_rate": 5.427229308823585e-06, "loss": 0.09036225080490112, "memory(GiB)": 26.38, "step": 695, "token_acc": 0.9661733615221987, "train_speed(iter/s)": 0.738588 }, { "epoch": 0.5, "grad_norm": 0.9720412492752075, "learning_rate": 5.415389599003972e-06, "loss": 0.09468471258878708, "memory(GiB)": 26.38, "step": 696, "token_acc": 0.9667359667359667, "train_speed(iter/s)": 0.738913 }, { "epoch": 0.5007183908045977, "grad_norm": 0.7829233407974243, "learning_rate": 5.403547543377516e-06, "loss": 0.0856652706861496, "memory(GiB)": 26.38, "step": 697, "token_acc": 0.9668989547038328, "train_speed(iter/s)": 0.739176 }, { "epoch": 0.5014367816091954, "grad_norm": 1.0836762189865112, "learning_rate": 5.391703208819209e-06, "loss": 0.10220605134963989, "memory(GiB)": 26.38, "step": 698, "token_acc": 0.9530516431924883, "train_speed(iter/s)": 0.739416 }, { "epoch": 0.5021551724137931, "grad_norm": 0.8385603427886963, "learning_rate": 5.379856662216907e-06, "loss": 0.07701939344406128, "memory(GiB)": 26.38, "step": 699, "token_acc": 0.9819967266775778, "train_speed(iter/s)": 0.739701 }, { "epoch": 0.5028735632183908, "grad_norm": 0.9613054990768433, "learning_rate": 5.368007970470964e-06, "loss": 0.08547097444534302, "memory(GiB)": 26.38, "step": 700, "token_acc": 0.984873949579832, "train_speed(iter/s)": 0.740021 }, { "epoch": 0.5028735632183908, "eval_loss": 0.08450771868228912, "eval_runtime": 5.9027, "eval_samples_per_second": 76.236, "eval_steps_per_second": 2.541, "eval_token_acc": 0.9663305444555444, "step": 700 }, { "epoch": 0.5035919540229885, "grad_norm": 0.9990906119346619, "learning_rate": 5.356157200493843e-06, "loss": 0.08602029085159302, "memory(GiB)": 26.38, "step": 701, "token_acc": 0.9686145918725846, "train_speed(iter/s)": 0.724958 }, { "epoch": 0.5043103448275862, "grad_norm": 0.9589906930923462, "learning_rate": 5.344304419209748e-06, "loss": 0.0914459377527237, "memory(GiB)": 26.38, "step": 702, "token_acc": 0.9791304347826087, "train_speed(iter/s)": 0.725223 }, { "epoch": 0.5050287356321839, "grad_norm": 1.2270933389663696, "learning_rate": 5.332449693554239e-06, "loss": 0.08678223937749863, "memory(GiB)": 26.38, "step": 703, "token_acc": 0.974903474903475, "train_speed(iter/s)": 0.725567 }, { "epoch": 0.5057471264367817, "grad_norm": 1.0488221645355225, "learning_rate": 5.3205930904738544e-06, "loss": 0.080626480281353, "memory(GiB)": 26.38, "step": 704, "token_acc": 0.9775280898876404, "train_speed(iter/s)": 0.725895 }, { "epoch": 0.5064655172413793, "grad_norm": 1.1411386728286743, "learning_rate": 5.308734676925739e-06, "loss": 0.08206845074892044, "memory(GiB)": 26.38, "step": 705, "token_acc": 0.9685767097966729, "train_speed(iter/s)": 0.726241 }, { "epoch": 0.507183908045977, "grad_norm": 1.178661584854126, "learning_rate": 5.296874519877256e-06, "loss": 0.08915100991725922, "memory(GiB)": 26.38, "step": 706, "token_acc": 0.9639934533551555, "train_speed(iter/s)": 0.726577 }, { "epoch": 0.5079022988505747, "grad_norm": 1.2857904434204102, "learning_rate": 5.285012686305623e-06, "loss": 0.08666995167732239, "memory(GiB)": 26.38, "step": 707, "token_acc": 0.9640287769784173, "train_speed(iter/s)": 0.726914 }, { "epoch": 0.5086206896551724, "grad_norm": 0.8724091053009033, "learning_rate": 5.273149243197517e-06, "loss": 0.08778765797615051, "memory(GiB)": 26.38, "step": 708, "token_acc": 0.9678571428571429, "train_speed(iter/s)": 0.727253 }, { "epoch": 0.5093390804597702, "grad_norm": 1.6754868030548096, "learning_rate": 5.261284257548708e-06, "loss": 0.0866551399230957, "memory(GiB)": 26.38, "step": 709, "token_acc": 0.9688644688644689, "train_speed(iter/s)": 0.72759 }, { "epoch": 0.5100574712643678, "grad_norm": 0.9349995851516724, "learning_rate": 5.2494177963636785e-06, "loss": 0.08291681110858917, "memory(GiB)": 26.38, "step": 710, "token_acc": 0.964110929853181, "train_speed(iter/s)": 0.727927 }, { "epoch": 0.5107758620689655, "grad_norm": 0.7074158787727356, "learning_rate": 5.237549926655243e-06, "loss": 0.08082785457372665, "memory(GiB)": 26.38, "step": 711, "token_acc": 0.9691119691119691, "train_speed(iter/s)": 0.728264 }, { "epoch": 0.5114942528735632, "grad_norm": 1.5507152080535889, "learning_rate": 5.225680715444168e-06, "loss": 0.09177494794130325, "memory(GiB)": 26.38, "step": 712, "token_acc": 0.9574898785425101, "train_speed(iter/s)": 0.728576 }, { "epoch": 0.5122126436781609, "grad_norm": 1.712868332862854, "learning_rate": 5.213810229758799e-06, "loss": 0.08526857197284698, "memory(GiB)": 26.38, "step": 713, "token_acc": 0.9763113367174281, "train_speed(iter/s)": 0.728831 }, { "epoch": 0.5129310344827587, "grad_norm": 1.7338165044784546, "learning_rate": 5.201938536634674e-06, "loss": 0.08553456515073776, "memory(GiB)": 26.38, "step": 714, "token_acc": 0.959349593495935, "train_speed(iter/s)": 0.729073 }, { "epoch": 0.5136494252873564, "grad_norm": 0.8703961372375488, "learning_rate": 5.190065703114157e-06, "loss": 0.08357927203178406, "memory(GiB)": 26.38, "step": 715, "token_acc": 0.9792746113989638, "train_speed(iter/s)": 0.729397 }, { "epoch": 0.514367816091954, "grad_norm": 1.0394001007080078, "learning_rate": 5.178191796246043e-06, "loss": 0.08742557466030121, "memory(GiB)": 26.38, "step": 716, "token_acc": 0.9594882729211087, "train_speed(iter/s)": 0.729721 }, { "epoch": 0.5150862068965517, "grad_norm": 1.1961150169372559, "learning_rate": 5.166316883085196e-06, "loss": 0.10443438589572906, "memory(GiB)": 26.38, "step": 717, "token_acc": 0.9747081712062257, "train_speed(iter/s)": 0.730055 }, { "epoch": 0.5158045977011494, "grad_norm": 1.1000189781188965, "learning_rate": 5.154441030692162e-06, "loss": 0.08611233532428741, "memory(GiB)": 26.38, "step": 718, "token_acc": 0.9612403100775194, "train_speed(iter/s)": 0.730381 }, { "epoch": 0.5165229885057471, "grad_norm": 0.959223747253418, "learning_rate": 5.1425643061327856e-06, "loss": 0.08313649892807007, "memory(GiB)": 26.38, "step": 719, "token_acc": 0.9725776965265083, "train_speed(iter/s)": 0.730703 }, { "epoch": 0.5172413793103449, "grad_norm": 7.051178455352783, "learning_rate": 5.1306867764778445e-06, "loss": 0.08746903389692307, "memory(GiB)": 26.38, "step": 720, "token_acc": 0.9553903345724907, "train_speed(iter/s)": 0.731033 }, { "epoch": 0.5179597701149425, "grad_norm": 0.7597294449806213, "learning_rate": 5.118808508802654e-06, "loss": 0.08713261783123016, "memory(GiB)": 26.38, "step": 721, "token_acc": 0.9596491228070175, "train_speed(iter/s)": 0.731358 }, { "epoch": 0.5186781609195402, "grad_norm": 0.8657106757164001, "learning_rate": 5.106929570186706e-06, "loss": 0.07960385084152222, "memory(GiB)": 26.38, "step": 722, "token_acc": 0.9676898222940227, "train_speed(iter/s)": 0.731683 }, { "epoch": 0.5193965517241379, "grad_norm": 0.9900314807891846, "learning_rate": 5.095050027713275e-06, "loss": 0.09159831702709198, "memory(GiB)": 26.38, "step": 723, "token_acc": 0.9695238095238096, "train_speed(iter/s)": 0.732008 }, { "epoch": 0.5201149425287356, "grad_norm": 1.2320846319198608, "learning_rate": 5.083169948469049e-06, "loss": 0.09346140921115875, "memory(GiB)": 26.38, "step": 724, "token_acc": 0.9700934579439252, "train_speed(iter/s)": 0.732334 }, { "epoch": 0.5208333333333334, "grad_norm": 1.8474129438400269, "learning_rate": 5.071289399543745e-06, "loss": 0.07544177770614624, "memory(GiB)": 26.38, "step": 725, "token_acc": 0.964527027027027, "train_speed(iter/s)": 0.732655 }, { "epoch": 0.521551724137931, "grad_norm": 2.4312589168548584, "learning_rate": 5.059408448029737e-06, "loss": 0.07533755898475647, "memory(GiB)": 26.38, "step": 726, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.732977 }, { "epoch": 0.5222701149425287, "grad_norm": 0.9773052930831909, "learning_rate": 5.0475271610216655e-06, "loss": 0.086385577917099, "memory(GiB)": 26.38, "step": 727, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.733222 }, { "epoch": 0.5229885057471264, "grad_norm": 1.276046872138977, "learning_rate": 5.0356456056160715e-06, "loss": 0.08255019038915634, "memory(GiB)": 26.38, "step": 728, "token_acc": 0.9794871794871794, "train_speed(iter/s)": 0.733432 }, { "epoch": 0.5237068965517241, "grad_norm": 0.9188868403434753, "learning_rate": 5.023763848911009e-06, "loss": 0.08063976466655731, "memory(GiB)": 26.38, "step": 729, "token_acc": 0.9696356275303644, "train_speed(iter/s)": 0.733754 }, { "epoch": 0.5244252873563219, "grad_norm": 1.1689414978027344, "learning_rate": 5.0118819580056686e-06, "loss": 0.08695630729198456, "memory(GiB)": 26.38, "step": 730, "token_acc": 0.9692622950819673, "train_speed(iter/s)": 0.734074 }, { "epoch": 0.5251436781609196, "grad_norm": 1.1059610843658447, "learning_rate": 5e-06, "loss": 0.08829709142446518, "memory(GiB)": 26.38, "step": 731, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.734399 }, { "epoch": 0.5258620689655172, "grad_norm": 0.9323937296867371, "learning_rate": 4.988118041994332e-06, "loss": 0.08130931854248047, "memory(GiB)": 26.38, "step": 732, "token_acc": 0.9684418145956607, "train_speed(iter/s)": 0.734723 }, { "epoch": 0.5265804597701149, "grad_norm": 0.7500733733177185, "learning_rate": 4.976236151088993e-06, "loss": 0.08099929988384247, "memory(GiB)": 26.38, "step": 733, "token_acc": 0.9628252788104089, "train_speed(iter/s)": 0.735051 }, { "epoch": 0.5272988505747126, "grad_norm": 1.2932260036468506, "learning_rate": 4.964354394383929e-06, "loss": 0.0899437814950943, "memory(GiB)": 26.38, "step": 734, "token_acc": 0.9656419529837251, "train_speed(iter/s)": 0.735374 }, { "epoch": 0.5280172413793104, "grad_norm": 1.9139082431793213, "learning_rate": 4.952472838978335e-06, "loss": 0.08275993168354034, "memory(GiB)": 26.38, "step": 735, "token_acc": 0.976491862567812, "train_speed(iter/s)": 0.735695 }, { "epoch": 0.5287356321839081, "grad_norm": 0.8697855472564697, "learning_rate": 4.940591551970264e-06, "loss": 0.09371650218963623, "memory(GiB)": 26.38, "step": 736, "token_acc": 0.9575221238938053, "train_speed(iter/s)": 0.736001 }, { "epoch": 0.5294540229885057, "grad_norm": 0.7562043070793152, "learning_rate": 4.928710600456255e-06, "loss": 0.07644882053136826, "memory(GiB)": 26.38, "step": 737, "token_acc": 0.9625246548323472, "train_speed(iter/s)": 0.73623 }, { "epoch": 0.5301724137931034, "grad_norm": 0.918316125869751, "learning_rate": 4.9168300515309515e-06, "loss": 0.08956922590732574, "memory(GiB)": 26.38, "step": 738, "token_acc": 0.9551656920077972, "train_speed(iter/s)": 0.736475 }, { "epoch": 0.5308908045977011, "grad_norm": 0.9238330721855164, "learning_rate": 4.904949972286728e-06, "loss": 0.08229527622461319, "memory(GiB)": 26.38, "step": 739, "token_acc": 0.968503937007874, "train_speed(iter/s)": 0.736705 }, { "epoch": 0.5316091954022989, "grad_norm": 1.691611409187317, "learning_rate": 4.8930704298132965e-06, "loss": 0.07667367160320282, "memory(GiB)": 26.38, "step": 740, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.737003 }, { "epoch": 0.5323275862068966, "grad_norm": 0.9107336401939392, "learning_rate": 4.881191491197348e-06, "loss": 0.09261763095855713, "memory(GiB)": 26.38, "step": 741, "token_acc": 0.959915611814346, "train_speed(iter/s)": 0.737324 }, { "epoch": 0.5330459770114943, "grad_norm": 1.6203936338424683, "learning_rate": 4.869313223522159e-06, "loss": 0.09311778843402863, "memory(GiB)": 26.38, "step": 742, "token_acc": 0.9588785046728971, "train_speed(iter/s)": 0.737634 }, { "epoch": 0.5337643678160919, "grad_norm": 1.2814760208129883, "learning_rate": 4.857435693867215e-06, "loss": 0.07843939960002899, "memory(GiB)": 26.38, "step": 743, "token_acc": 0.9573712255772646, "train_speed(iter/s)": 0.737952 }, { "epoch": 0.5344827586206896, "grad_norm": 1.0772955417633057, "learning_rate": 4.845558969307839e-06, "loss": 0.08776114881038666, "memory(GiB)": 26.38, "step": 744, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.738273 }, { "epoch": 0.5352011494252874, "grad_norm": 0.9527371525764465, "learning_rate": 4.833683116914805e-06, "loss": 0.07761067897081375, "memory(GiB)": 26.38, "step": 745, "token_acc": 0.9772296015180265, "train_speed(iter/s)": 0.738594 }, { "epoch": 0.5359195402298851, "grad_norm": 0.787468671798706, "learning_rate": 4.821808203753959e-06, "loss": 0.08057382702827454, "memory(GiB)": 26.38, "step": 746, "token_acc": 0.9726027397260274, "train_speed(iter/s)": 0.738766 }, { "epoch": 0.5366379310344828, "grad_norm": 0.869594931602478, "learning_rate": 4.809934296885845e-06, "loss": 0.08612678945064545, "memory(GiB)": 26.38, "step": 747, "token_acc": 0.974739970282318, "train_speed(iter/s)": 0.739066 }, { "epoch": 0.5373563218390804, "grad_norm": 1.6743704080581665, "learning_rate": 4.798061463365327e-06, "loss": 0.08545944839715958, "memory(GiB)": 26.38, "step": 748, "token_acc": 0.9653061224489796, "train_speed(iter/s)": 0.739375 }, { "epoch": 0.5380747126436781, "grad_norm": 0.7055721879005432, "learning_rate": 4.786189770241203e-06, "loss": 0.07462334632873535, "memory(GiB)": 26.38, "step": 749, "token_acc": 0.9659318637274549, "train_speed(iter/s)": 0.73962 }, { "epoch": 0.5387931034482759, "grad_norm": 0.8918562531471252, "learning_rate": 4.774319284555833e-06, "loss": 0.0909460112452507, "memory(GiB)": 26.38, "step": 750, "token_acc": 0.9566160520607375, "train_speed(iter/s)": 0.739843 }, { "epoch": 0.5387931034482759, "eval_loss": 0.07929463684558868, "eval_runtime": 5.9254, "eval_samples_per_second": 75.944, "eval_steps_per_second": 2.531, "eval_token_acc": 0.9692807192807192, "step": 750 }, { "epoch": 0.5395114942528736, "grad_norm": 1.576403260231018, "learning_rate": 4.762450073344756e-06, "loss": 0.08758231997489929, "memory(GiB)": 26.38, "step": 751, "token_acc": 0.9700704225352113, "train_speed(iter/s)": 0.72646 }, { "epoch": 0.5402298850574713, "grad_norm": 0.7552700042724609, "learning_rate": 4.7505822036363214e-06, "loss": 0.08544185757637024, "memory(GiB)": 26.38, "step": 752, "token_acc": 0.9796954314720813, "train_speed(iter/s)": 0.726782 }, { "epoch": 0.540948275862069, "grad_norm": 0.7325831055641174, "learning_rate": 4.738715742451292e-06, "loss": 0.0729229524731636, "memory(GiB)": 26.38, "step": 753, "token_acc": 0.9722735674676525, "train_speed(iter/s)": 0.72708 }, { "epoch": 0.5416666666666666, "grad_norm": 0.9097095131874084, "learning_rate": 4.726850756802486e-06, "loss": 0.08259224891662598, "memory(GiB)": 26.38, "step": 754, "token_acc": 0.9644351464435147, "train_speed(iter/s)": 0.72731 }, { "epoch": 0.5423850574712644, "grad_norm": 0.7437594532966614, "learning_rate": 4.71498731369438e-06, "loss": 0.07856935262680054, "memory(GiB)": 26.38, "step": 755, "token_acc": 0.9643435980551054, "train_speed(iter/s)": 0.727547 }, { "epoch": 0.5431034482758621, "grad_norm": 0.8596389889717102, "learning_rate": 4.703125480122747e-06, "loss": 0.08305235952138901, "memory(GiB)": 26.38, "step": 756, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.727778 }, { "epoch": 0.5438218390804598, "grad_norm": 0.787600040435791, "learning_rate": 4.6912653230742644e-06, "loss": 0.07904721796512604, "memory(GiB)": 26.38, "step": 757, "token_acc": 0.9663716814159292, "train_speed(iter/s)": 0.728054 }, { "epoch": 0.5445402298850575, "grad_norm": 1.0864022970199585, "learning_rate": 4.679406909526147e-06, "loss": 0.08282336592674255, "memory(GiB)": 26.38, "step": 758, "token_acc": 0.9676258992805755, "train_speed(iter/s)": 0.728352 }, { "epoch": 0.5452586206896551, "grad_norm": 1.4616643190383911, "learning_rate": 4.667550306445762e-06, "loss": 0.08429908752441406, "memory(GiB)": 26.38, "step": 759, "token_acc": 0.9614678899082569, "train_speed(iter/s)": 0.728668 }, { "epoch": 0.5459770114942529, "grad_norm": 1.2560566663742065, "learning_rate": 4.655695580790254e-06, "loss": 0.08331495523452759, "memory(GiB)": 26.38, "step": 760, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.728986 }, { "epoch": 0.5466954022988506, "grad_norm": 1.037241816520691, "learning_rate": 4.6438427995061575e-06, "loss": 0.07832711189985275, "memory(GiB)": 26.38, "step": 761, "token_acc": 0.9822380106571936, "train_speed(iter/s)": 0.729297 }, { "epoch": 0.5474137931034483, "grad_norm": 1.051702618598938, "learning_rate": 4.631992029529037e-06, "loss": 0.0843583270907402, "memory(GiB)": 26.38, "step": 762, "token_acc": 0.9644859813084112, "train_speed(iter/s)": 0.72961 }, { "epoch": 0.548132183908046, "grad_norm": 1.119262933731079, "learning_rate": 4.620143337783095e-06, "loss": 0.07720677554607391, "memory(GiB)": 26.38, "step": 763, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.729923 }, { "epoch": 0.5488505747126436, "grad_norm": 5.273931980133057, "learning_rate": 4.608296791180793e-06, "loss": 0.07975953072309494, "memory(GiB)": 26.38, "step": 764, "token_acc": 0.9663120567375887, "train_speed(iter/s)": 0.730237 }, { "epoch": 0.5495689655172413, "grad_norm": 0.9366029500961304, "learning_rate": 4.596452456622484e-06, "loss": 0.0839240700006485, "memory(GiB)": 26.38, "step": 765, "token_acc": 0.9617391304347827, "train_speed(iter/s)": 0.730502 }, { "epoch": 0.5502873563218391, "grad_norm": 1.2278259992599487, "learning_rate": 4.584610400996028e-06, "loss": 0.09087572246789932, "memory(GiB)": 26.38, "step": 766, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.730729 }, { "epoch": 0.5510057471264368, "grad_norm": 1.0272120237350464, "learning_rate": 4.572770691176415e-06, "loss": 0.07903590053319931, "memory(GiB)": 26.38, "step": 767, "token_acc": 0.9792027729636048, "train_speed(iter/s)": 0.730959 }, { "epoch": 0.5517241379310345, "grad_norm": 0.839047908782959, "learning_rate": 4.560933394025386e-06, "loss": 0.07923568785190582, "memory(GiB)": 26.38, "step": 768, "token_acc": 0.9622302158273381, "train_speed(iter/s)": 0.731252 }, { "epoch": 0.5524425287356322, "grad_norm": 0.8700577020645142, "learning_rate": 4.549098576391061e-06, "loss": 0.07561814785003662, "memory(GiB)": 26.38, "step": 769, "token_acc": 0.9597806215722121, "train_speed(iter/s)": 0.73156 }, { "epoch": 0.5531609195402298, "grad_norm": 0.8948972225189209, "learning_rate": 4.537266305107549e-06, "loss": 0.07530146837234497, "memory(GiB)": 26.38, "step": 770, "token_acc": 0.9681050656660413, "train_speed(iter/s)": 0.731873 }, { "epoch": 0.5538793103448276, "grad_norm": 0.9745829701423645, "learning_rate": 4.525436646994591e-06, "loss": 0.09233404695987701, "memory(GiB)": 26.38, "step": 771, "token_acc": 0.9602649006622517, "train_speed(iter/s)": 0.732179 }, { "epoch": 0.5545977011494253, "grad_norm": 1.1843253374099731, "learning_rate": 4.513609668857162e-06, "loss": 0.07765445858240128, "memory(GiB)": 26.38, "step": 772, "token_acc": 0.9643527204502814, "train_speed(iter/s)": 0.732491 }, { "epoch": 0.555316091954023, "grad_norm": 0.7652274966239929, "learning_rate": 4.5017854374851045e-06, "loss": 0.08124075829982758, "memory(GiB)": 26.38, "step": 773, "token_acc": 0.9847328244274809, "train_speed(iter/s)": 0.732797 }, { "epoch": 0.5560344827586207, "grad_norm": 1.5534379482269287, "learning_rate": 4.489964019652752e-06, "loss": 0.08491243422031403, "memory(GiB)": 26.38, "step": 774, "token_acc": 0.9759704251386322, "train_speed(iter/s)": 0.733105 }, { "epoch": 0.5567528735632183, "grad_norm": 1.5987989902496338, "learning_rate": 4.478145482118547e-06, "loss": 0.08500392735004425, "memory(GiB)": 26.38, "step": 775, "token_acc": 0.9608695652173913, "train_speed(iter/s)": 0.733414 }, { "epoch": 0.5574712643678161, "grad_norm": 0.9491287469863892, "learning_rate": 4.4663298916246665e-06, "loss": 0.08585185557603836, "memory(GiB)": 26.38, "step": 776, "token_acc": 0.9680851063829787, "train_speed(iter/s)": 0.733722 }, { "epoch": 0.5581896551724138, "grad_norm": 1.2485532760620117, "learning_rate": 4.4545173148966456e-06, "loss": 0.07644348591566086, "memory(GiB)": 26.38, "step": 777, "token_acc": 0.9672727272727273, "train_speed(iter/s)": 0.734031 }, { "epoch": 0.5589080459770115, "grad_norm": 0.9336556196212769, "learning_rate": 4.442707818642999e-06, "loss": 0.07450053095817566, "memory(GiB)": 26.38, "step": 778, "token_acc": 0.9750479846449136, "train_speed(iter/s)": 0.734335 }, { "epoch": 0.5596264367816092, "grad_norm": 0.8087511658668518, "learning_rate": 4.430901469554844e-06, "loss": 0.09334324300289154, "memory(GiB)": 26.38, "step": 779, "token_acc": 0.9426386233269598, "train_speed(iter/s)": 0.734629 }, { "epoch": 0.5603448275862069, "grad_norm": 1.7350176572799683, "learning_rate": 4.419098334305529e-06, "loss": 0.07801170647144318, "memory(GiB)": 26.38, "step": 780, "token_acc": 0.9796296296296296, "train_speed(iter/s)": 0.734844 }, { "epoch": 0.5610632183908046, "grad_norm": 1.11349356174469, "learning_rate": 4.407298479550249e-06, "loss": 0.07234872877597809, "memory(GiB)": 26.38, "step": 781, "token_acc": 0.9753787878787878, "train_speed(iter/s)": 0.735098 }, { "epoch": 0.5617816091954023, "grad_norm": 0.9656155705451965, "learning_rate": 4.395501971925677e-06, "loss": 0.07652012258768082, "memory(GiB)": 26.38, "step": 782, "token_acc": 0.9758842443729904, "train_speed(iter/s)": 0.735399 }, { "epoch": 0.5625, "grad_norm": 1.0723203420639038, "learning_rate": 4.383708878049579e-06, "loss": 0.07185139507055283, "memory(GiB)": 26.38, "step": 783, "token_acc": 0.9633911368015414, "train_speed(iter/s)": 0.735702 }, { "epoch": 0.5632183908045977, "grad_norm": 1.076810359954834, "learning_rate": 4.371919264520449e-06, "loss": 0.08683423697948456, "memory(GiB)": 26.38, "step": 784, "token_acc": 0.9766606822262118, "train_speed(iter/s)": 0.736005 }, { "epoch": 0.5639367816091954, "grad_norm": 0.9423619508743286, "learning_rate": 4.36013319791712e-06, "loss": 0.07961373031139374, "memory(GiB)": 26.38, "step": 785, "token_acc": 0.9727891156462585, "train_speed(iter/s)": 0.736307 }, { "epoch": 0.5646551724137931, "grad_norm": 0.8225990533828735, "learning_rate": 4.348350744798399e-06, "loss": 0.07736529409885406, "memory(GiB)": 26.38, "step": 786, "token_acc": 0.9626998223801065, "train_speed(iter/s)": 0.736611 }, { "epoch": 0.5653735632183908, "grad_norm": 1.3943365812301636, "learning_rate": 4.336571971702686e-06, "loss": 0.06967116892337799, "memory(GiB)": 26.38, "step": 787, "token_acc": 0.969811320754717, "train_speed(iter/s)": 0.736915 }, { "epoch": 0.5660919540229885, "grad_norm": 1.106191873550415, "learning_rate": 4.324796945147598e-06, "loss": 0.07285456359386444, "memory(GiB)": 26.38, "step": 788, "token_acc": 0.9621928166351607, "train_speed(iter/s)": 0.737217 }, { "epoch": 0.5668103448275862, "grad_norm": 1.0031057596206665, "learning_rate": 4.313025731629596e-06, "loss": 0.08482204377651215, "memory(GiB)": 26.38, "step": 789, "token_acc": 0.9674502712477396, "train_speed(iter/s)": 0.737513 }, { "epoch": 0.5675287356321839, "grad_norm": 1.1120381355285645, "learning_rate": 4.301258397623606e-06, "loss": 0.07426780462265015, "memory(GiB)": 26.38, "step": 790, "token_acc": 0.9617590822179732, "train_speed(iter/s)": 0.737813 }, { "epoch": 0.5682471264367817, "grad_norm": 1.4168410301208496, "learning_rate": 4.289495009582647e-06, "loss": 0.08724476397037506, "memory(GiB)": 26.38, "step": 791, "token_acc": 0.9672131147540983, "train_speed(iter/s)": 0.738099 }, { "epoch": 0.5689655172413793, "grad_norm": 2.6042821407318115, "learning_rate": 4.2777356339374526e-06, "loss": 0.08321377635002136, "memory(GiB)": 26.38, "step": 792, "token_acc": 0.9733124018838305, "train_speed(iter/s)": 0.738317 }, { "epoch": 0.569683908045977, "grad_norm": 0.9230087399482727, "learning_rate": 4.2659803370961e-06, "loss": 0.08587909489870071, "memory(GiB)": 26.38, "step": 793, "token_acc": 0.9631901840490797, "train_speed(iter/s)": 0.738547 }, { "epoch": 0.5704022988505747, "grad_norm": 1.4581927061080933, "learning_rate": 4.254229185443628e-06, "loss": 0.09046661108732224, "memory(GiB)": 26.38, "step": 794, "token_acc": 0.9554140127388535, "train_speed(iter/s)": 0.738763 }, { "epoch": 0.5711206896551724, "grad_norm": 0.9677795171737671, "learning_rate": 4.242482245341671e-06, "loss": 0.08434668928384781, "memory(GiB)": 26.38, "step": 795, "token_acc": 0.9581749049429658, "train_speed(iter/s)": 0.739039 }, { "epoch": 0.5718390804597702, "grad_norm": 1.080803394317627, "learning_rate": 4.230739583128078e-06, "loss": 0.07593526691198349, "memory(GiB)": 26.38, "step": 796, "token_acc": 0.9579579579579579, "train_speed(iter/s)": 0.739337 }, { "epoch": 0.5725574712643678, "grad_norm": 0.8187927603721619, "learning_rate": 4.2190012651165395e-06, "loss": 0.07999290525913239, "memory(GiB)": 26.38, "step": 797, "token_acc": 0.9717741935483871, "train_speed(iter/s)": 0.739638 }, { "epoch": 0.5732758620689655, "grad_norm": 0.901893138885498, "learning_rate": 4.2072673575962125e-06, "loss": 0.07469315826892853, "memory(GiB)": 26.38, "step": 798, "token_acc": 0.9675572519083969, "train_speed(iter/s)": 0.739931 }, { "epoch": 0.5739942528735632, "grad_norm": 0.6968883872032166, "learning_rate": 4.195537926831351e-06, "loss": 0.06346748769283295, "memory(GiB)": 26.38, "step": 799, "token_acc": 0.979047619047619, "train_speed(iter/s)": 0.740226 }, { "epoch": 0.5747126436781609, "grad_norm": 0.8265519738197327, "learning_rate": 4.183813039060919e-06, "loss": 0.07633837312459946, "memory(GiB)": 26.38, "step": 800, "token_acc": 0.9730337078651685, "train_speed(iter/s)": 0.740521 }, { "epoch": 0.5747126436781609, "eval_loss": 0.0765056163072586, "eval_runtime": 6.7504, "eval_samples_per_second": 66.663, "eval_steps_per_second": 2.222, "eval_token_acc": 0.9698114385614386, "step": 800 }, { "epoch": 0.5754310344827587, "grad_norm": 1.5826358795166016, "learning_rate": 4.1720927604982335e-06, "loss": 0.08891771733760834, "memory(GiB)": 26.38, "step": 801, "token_acc": 0.9698326655668159, "train_speed(iter/s)": 0.727531 }, { "epoch": 0.5761494252873564, "grad_norm": 0.8925449252128601, "learning_rate": 4.160377157330579e-06, "loss": 0.08540396392345428, "memory(GiB)": 26.38, "step": 802, "token_acc": 0.9586206896551724, "train_speed(iter/s)": 0.727832 }, { "epoch": 0.576867816091954, "grad_norm": 0.996103823184967, "learning_rate": 4.148666295718837e-06, "loss": 0.0744364857673645, "memory(GiB)": 26.38, "step": 803, "token_acc": 0.9737827715355806, "train_speed(iter/s)": 0.728128 }, { "epoch": 0.5775862068965517, "grad_norm": 1.1274832487106323, "learning_rate": 4.136960241797113e-06, "loss": 0.08627703785896301, "memory(GiB)": 26.38, "step": 804, "token_acc": 0.9694041867954911, "train_speed(iter/s)": 0.728425 }, { "epoch": 0.5783045977011494, "grad_norm": 1.07513427734375, "learning_rate": 4.125259061672362e-06, "loss": 0.08148907870054245, "memory(GiB)": 26.38, "step": 805, "token_acc": 0.9680284191829485, "train_speed(iter/s)": 0.728717 }, { "epoch": 0.5790229885057471, "grad_norm": 4.584576606750488, "learning_rate": 4.113562821424012e-06, "loss": 0.07782547175884247, "memory(GiB)": 26.38, "step": 806, "token_acc": 0.9663157894736842, "train_speed(iter/s)": 0.729013 }, { "epoch": 0.5797413793103449, "grad_norm": 1.0343949794769287, "learning_rate": 4.101871587103601e-06, "loss": 0.0845600962638855, "memory(GiB)": 26.38, "step": 807, "token_acc": 0.9609856262833676, "train_speed(iter/s)": 0.729297 }, { "epoch": 0.5804597701149425, "grad_norm": 1.1807146072387695, "learning_rate": 4.090185424734392e-06, "loss": 0.08332620561122894, "memory(GiB)": 26.38, "step": 808, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.729575 }, { "epoch": 0.5811781609195402, "grad_norm": 0.8950625061988831, "learning_rate": 4.0785044003110075e-06, "loss": 0.0728270411491394, "memory(GiB)": 26.38, "step": 809, "token_acc": 0.9700374531835206, "train_speed(iter/s)": 0.729791 }, { "epoch": 0.5818965517241379, "grad_norm": 1.0409996509552002, "learning_rate": 4.066828579799054e-06, "loss": 0.0695987194776535, "memory(GiB)": 26.38, "step": 810, "token_acc": 0.9843505477308294, "train_speed(iter/s)": 0.730021 }, { "epoch": 0.5826149425287356, "grad_norm": 0.7431297898292542, "learning_rate": 4.055158029134752e-06, "loss": 0.07150119543075562, "memory(GiB)": 26.38, "step": 811, "token_acc": 0.9781021897810219, "train_speed(iter/s)": 0.730227 }, { "epoch": 0.5833333333333334, "grad_norm": 0.7088935971260071, "learning_rate": 4.043492814224559e-06, "loss": 0.06911292672157288, "memory(GiB)": 26.38, "step": 812, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.730498 }, { "epoch": 0.584051724137931, "grad_norm": 0.9799219965934753, "learning_rate": 4.031833000944803e-06, "loss": 0.06988340616226196, "memory(GiB)": 26.38, "step": 813, "token_acc": 0.9754601226993865, "train_speed(iter/s)": 0.730789 }, { "epoch": 0.5847701149425287, "grad_norm": 1.0006918907165527, "learning_rate": 4.020178655141307e-06, "loss": 0.08526722341775894, "memory(GiB)": 26.38, "step": 814, "token_acc": 0.963855421686747, "train_speed(iter/s)": 0.731082 }, { "epoch": 0.5854885057471264, "grad_norm": 0.8677977919578552, "learning_rate": 4.008529842629015e-06, "loss": 0.07396367192268372, "memory(GiB)": 26.38, "step": 815, "token_acc": 0.9727272727272728, "train_speed(iter/s)": 0.731376 }, { "epoch": 0.5862068965517241, "grad_norm": 1.0174881219863892, "learning_rate": 3.9968866291916254e-06, "loss": 0.07588553428649902, "memory(GiB)": 26.38, "step": 816, "token_acc": 0.9632495164410058, "train_speed(iter/s)": 0.73167 }, { "epoch": 0.5869252873563219, "grad_norm": 1.2207756042480469, "learning_rate": 3.98524908058122e-06, "loss": 0.078942209482193, "memory(GiB)": 26.38, "step": 817, "token_acc": 0.9640831758034026, "train_speed(iter/s)": 0.731964 }, { "epoch": 0.5876436781609196, "grad_norm": 0.8932569026947021, "learning_rate": 3.973617262517886e-06, "loss": 0.07280304282903671, "memory(GiB)": 26.38, "step": 818, "token_acc": 0.9730769230769231, "train_speed(iter/s)": 0.732256 }, { "epoch": 0.5883620689655172, "grad_norm": 0.8267220258712769, "learning_rate": 3.9619912406893475e-06, "loss": 0.07472781836986542, "memory(GiB)": 26.38, "step": 819, "token_acc": 0.9733333333333334, "train_speed(iter/s)": 0.732548 }, { "epoch": 0.5890804597701149, "grad_norm": 0.8482570648193359, "learning_rate": 3.950371080750602e-06, "loss": 0.07623612135648727, "memory(GiB)": 26.38, "step": 820, "token_acc": 0.9671179883945842, "train_speed(iter/s)": 0.732807 }, { "epoch": 0.5897988505747126, "grad_norm": 0.8856959342956543, "learning_rate": 3.9387568483235366e-06, "loss": 0.07946889102458954, "memory(GiB)": 26.38, "step": 821, "token_acc": 0.9758203799654577, "train_speed(iter/s)": 0.733056 }, { "epoch": 0.5905172413793104, "grad_norm": 0.8410401344299316, "learning_rate": 3.927148608996569e-06, "loss": 0.08084297180175781, "memory(GiB)": 26.38, "step": 822, "token_acc": 0.967391304347826, "train_speed(iter/s)": 0.733341 }, { "epoch": 0.5912356321839081, "grad_norm": 0.9992573857307434, "learning_rate": 3.9155464283242725e-06, "loss": 0.07132023572921753, "memory(GiB)": 26.38, "step": 823, "token_acc": 0.9771241830065359, "train_speed(iter/s)": 0.733624 }, { "epoch": 0.5919540229885057, "grad_norm": 0.8616142272949219, "learning_rate": 3.903950371827001e-06, "loss": 0.07894314080476761, "memory(GiB)": 26.38, "step": 824, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.733912 }, { "epoch": 0.5926724137931034, "grad_norm": 0.8870595693588257, "learning_rate": 3.892360504990529e-06, "loss": 0.06379452347755432, "memory(GiB)": 26.38, "step": 825, "token_acc": 0.9717868338557993, "train_speed(iter/s)": 0.734191 }, { "epoch": 0.5933908045977011, "grad_norm": 1.62287175655365, "learning_rate": 3.880776893265673e-06, "loss": 0.08804817497730255, "memory(GiB)": 26.38, "step": 826, "token_acc": 0.9533898305084746, "train_speed(iter/s)": 0.734345 }, { "epoch": 0.5941091954022989, "grad_norm": 1.1681334972381592, "learning_rate": 3.869199602067927e-06, "loss": 0.08473111689090729, "memory(GiB)": 26.38, "step": 827, "token_acc": 0.9714795008912656, "train_speed(iter/s)": 0.734631 }, { "epoch": 0.5948275862068966, "grad_norm": 0.9178427457809448, "learning_rate": 3.85762869677709e-06, "loss": 0.07324652373790741, "memory(GiB)": 26.38, "step": 828, "token_acc": 0.9674952198852772, "train_speed(iter/s)": 0.734913 }, { "epoch": 0.5955459770114943, "grad_norm": 0.9400637149810791, "learning_rate": 3.846064242736903e-06, "loss": 0.07684402167797089, "memory(GiB)": 26.38, "step": 829, "token_acc": 0.9706390328151986, "train_speed(iter/s)": 0.735199 }, { "epoch": 0.5962643678160919, "grad_norm": 1.0145355463027954, "learning_rate": 3.834506305254667e-06, "loss": 0.07441005110740662, "memory(GiB)": 26.38, "step": 830, "token_acc": 0.9616161616161616, "train_speed(iter/s)": 0.735484 }, { "epoch": 0.5969827586206896, "grad_norm": 1.2723599672317505, "learning_rate": 3.822954949600887e-06, "loss": 0.08558730781078339, "memory(GiB)": 26.38, "step": 831, "token_acc": 0.9587242026266416, "train_speed(iter/s)": 0.735761 }, { "epoch": 0.5977011494252874, "grad_norm": 0.7831535935401917, "learning_rate": 3.811410241008902e-06, "loss": 0.08163666725158691, "memory(GiB)": 26.38, "step": 832, "token_acc": 0.9801980198019802, "train_speed(iter/s)": 0.735969 }, { "epoch": 0.5984195402298851, "grad_norm": 1.0889716148376465, "learning_rate": 3.7998722446745067e-06, "loss": 0.09394948184490204, "memory(GiB)": 26.38, "step": 833, "token_acc": 0.9587242026266416, "train_speed(iter/s)": 0.736196 }, { "epoch": 0.5991379310344828, "grad_norm": 2.05246639251709, "learning_rate": 3.788341025755595e-06, "loss": 0.08053284883499146, "memory(GiB)": 26.38, "step": 834, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.736481 }, { "epoch": 0.5998563218390804, "grad_norm": 0.8820657730102539, "learning_rate": 3.776816649371786e-06, "loss": 0.08754882216453552, "memory(GiB)": 26.38, "step": 835, "token_acc": 0.9676767676767677, "train_speed(iter/s)": 0.736764 }, { "epoch": 0.6005747126436781, "grad_norm": 0.9096137285232544, "learning_rate": 3.765299180604055e-06, "loss": 0.08012093603610992, "memory(GiB)": 26.38, "step": 836, "token_acc": 0.9706896551724138, "train_speed(iter/s)": 0.737045 }, { "epoch": 0.6012931034482759, "grad_norm": 0.6771939992904663, "learning_rate": 3.7537886844943716e-06, "loss": 0.07971130311489105, "memory(GiB)": 26.38, "step": 837, "token_acc": 0.9648760330578512, "train_speed(iter/s)": 0.73732 }, { "epoch": 0.6020114942528736, "grad_norm": 1.0950560569763184, "learning_rate": 3.7422852260453274e-06, "loss": 0.07390794903039932, "memory(GiB)": 26.38, "step": 838, "token_acc": 0.9759615384615384, "train_speed(iter/s)": 0.737594 }, { "epoch": 0.6027298850574713, "grad_norm": 0.81920325756073, "learning_rate": 3.73078887021977e-06, "loss": 0.07399514317512512, "memory(GiB)": 26.38, "step": 839, "token_acc": 0.9698492462311558, "train_speed(iter/s)": 0.737876 }, { "epoch": 0.603448275862069, "grad_norm": 0.6766725778579712, "learning_rate": 3.719299681940437e-06, "loss": 0.07421375811100006, "memory(GiB)": 26.38, "step": 840, "token_acc": 0.9738219895287958, "train_speed(iter/s)": 0.738155 }, { "epoch": 0.6041666666666666, "grad_norm": 0.831007719039917, "learning_rate": 3.7078177260895915e-06, "loss": 0.07379022985696793, "memory(GiB)": 26.38, "step": 841, "token_acc": 0.9699248120300752, "train_speed(iter/s)": 0.738436 }, { "epoch": 0.6048850574712644, "grad_norm": 0.7913525104522705, "learning_rate": 3.696343067508651e-06, "loss": 0.07241155207157135, "memory(GiB)": 26.38, "step": 842, "token_acc": 0.9774127310061602, "train_speed(iter/s)": 0.738718 }, { "epoch": 0.6056034482758621, "grad_norm": 0.7500242590904236, "learning_rate": 3.6848757709978235e-06, "loss": 0.07627473771572113, "memory(GiB)": 26.38, "step": 843, "token_acc": 0.955637707948244, "train_speed(iter/s)": 0.738996 }, { "epoch": 0.6063218390804598, "grad_norm": 0.7661165595054626, "learning_rate": 3.673415901315743e-06, "loss": 0.0779729038476944, "memory(GiB)": 26.38, "step": 844, "token_acc": 0.9653284671532847, "train_speed(iter/s)": 0.739274 }, { "epoch": 0.6070402298850575, "grad_norm": 0.8785233497619629, "learning_rate": 3.6619635231790985e-06, "loss": 0.07867603003978729, "memory(GiB)": 26.38, "step": 845, "token_acc": 0.96415770609319, "train_speed(iter/s)": 0.739554 }, { "epoch": 0.6077586206896551, "grad_norm": 0.7675284743309021, "learning_rate": 3.650518701262278e-06, "loss": 0.0788523331284523, "memory(GiB)": 26.38, "step": 846, "token_acc": 0.9670781893004116, "train_speed(iter/s)": 0.739808 }, { "epoch": 0.6084770114942529, "grad_norm": 0.7727171182632446, "learning_rate": 3.6390815001969947e-06, "loss": 0.07159574329853058, "memory(GiB)": 26.38, "step": 847, "token_acc": 0.9755301794453507, "train_speed(iter/s)": 0.740008 }, { "epoch": 0.6091954022988506, "grad_norm": 0.7569255232810974, "learning_rate": 3.6276519845719237e-06, "loss": 0.07040233910083771, "memory(GiB)": 27.35, "step": 848, "token_acc": 0.9708029197080292, "train_speed(iter/s)": 0.740219 }, { "epoch": 0.6099137931034483, "grad_norm": 0.8898181915283203, "learning_rate": 3.6162302189323422e-06, "loss": 0.07441496104001999, "memory(GiB)": 27.35, "step": 849, "token_acc": 0.9728867623604466, "train_speed(iter/s)": 0.740405 }, { "epoch": 0.610632183908046, "grad_norm": 0.9040642976760864, "learning_rate": 3.6048162677797595e-06, "loss": 0.079707071185112, "memory(GiB)": 27.35, "step": 850, "token_acc": 0.9709618874773139, "train_speed(iter/s)": 0.740682 }, { "epoch": 0.610632183908046, "eval_loss": 0.07404874265193939, "eval_runtime": 5.9394, "eval_samples_per_second": 75.765, "eval_steps_per_second": 2.526, "eval_token_acc": 0.9701236263736264, "step": 850 }, { "epoch": 0.6113505747126436, "grad_norm": 0.8705517053604126, "learning_rate": 3.5934101955715516e-06, "loss": 0.0729433074593544, "memory(GiB)": 27.35, "step": 851, "token_acc": 0.9727035366722051, "train_speed(iter/s)": 0.729282 }, { "epoch": 0.6120689655172413, "grad_norm": 1.0056146383285522, "learning_rate": 3.582012066720605e-06, "loss": 0.08413244783878326, "memory(GiB)": 27.35, "step": 852, "token_acc": 0.9717514124293786, "train_speed(iter/s)": 0.729551 }, { "epoch": 0.6127873563218391, "grad_norm": 0.7215927839279175, "learning_rate": 3.5706219455949462e-06, "loss": 0.06868686527013779, "memory(GiB)": 27.35, "step": 853, "token_acc": 0.9718045112781954, "train_speed(iter/s)": 0.729816 }, { "epoch": 0.6135057471264368, "grad_norm": 1.8421415090560913, "learning_rate": 3.559239896517379e-06, "loss": 0.07760631293058395, "memory(GiB)": 27.35, "step": 854, "token_acc": 0.9728813559322034, "train_speed(iter/s)": 0.73008 }, { "epoch": 0.6142241379310345, "grad_norm": 1.2173426151275635, "learning_rate": 3.547865983765123e-06, "loss": 0.07828463613986969, "memory(GiB)": 27.35, "step": 855, "token_acc": 0.9780701754385965, "train_speed(iter/s)": 0.730358 }, { "epoch": 0.6149425287356322, "grad_norm": 0.7620123624801636, "learning_rate": 3.536500271569452e-06, "loss": 0.06961657106876373, "memory(GiB)": 27.35, "step": 856, "token_acc": 0.9690346083788707, "train_speed(iter/s)": 0.730633 }, { "epoch": 0.6156609195402298, "grad_norm": 4.035293102264404, "learning_rate": 3.5251428241153263e-06, "loss": 0.07695697993040085, "memory(GiB)": 27.35, "step": 857, "token_acc": 0.9597197898423818, "train_speed(iter/s)": 0.730903 }, { "epoch": 0.6163793103448276, "grad_norm": 1.5998101234436035, "learning_rate": 3.5137937055410343e-06, "loss": 0.07928101718425751, "memory(GiB)": 27.35, "step": 858, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.731179 }, { "epoch": 0.6170977011494253, "grad_norm": 0.9437355399131775, "learning_rate": 3.5024529799378316e-06, "loss": 0.07389257848262787, "memory(GiB)": 27.35, "step": 859, "token_acc": 0.9704433497536946, "train_speed(iter/s)": 0.731456 }, { "epoch": 0.617816091954023, "grad_norm": 1.1408931016921997, "learning_rate": 3.4911207113495703e-06, "loss": 0.07798676192760468, "memory(GiB)": 27.35, "step": 860, "token_acc": 0.9720149253731343, "train_speed(iter/s)": 0.73173 }, { "epoch": 0.6185344827586207, "grad_norm": 0.9332766532897949, "learning_rate": 3.4797969637723494e-06, "loss": 0.07393112033605576, "memory(GiB)": 27.35, "step": 861, "token_acc": 0.9771784232365145, "train_speed(iter/s)": 0.731991 }, { "epoch": 0.6192528735632183, "grad_norm": 0.9934490919113159, "learning_rate": 3.4684818011541484e-06, "loss": 0.07751727104187012, "memory(GiB)": 27.35, "step": 862, "token_acc": 0.9674952198852772, "train_speed(iter/s)": 0.732269 }, { "epoch": 0.6199712643678161, "grad_norm": 0.9399784207344055, "learning_rate": 3.4571752873944585e-06, "loss": 0.07377184927463531, "memory(GiB)": 27.35, "step": 863, "token_acc": 0.9690909090909091, "train_speed(iter/s)": 0.732548 }, { "epoch": 0.6206896551724138, "grad_norm": 0.9637095332145691, "learning_rate": 3.4458774863439366e-06, "loss": 0.07155682146549225, "memory(GiB)": 27.35, "step": 864, "token_acc": 0.9732313575525813, "train_speed(iter/s)": 0.73281 }, { "epoch": 0.6214080459770115, "grad_norm": 0.9337314367294312, "learning_rate": 3.4345884618040326e-06, "loss": 0.07261928170919418, "memory(GiB)": 27.35, "step": 865, "token_acc": 0.9651162790697675, "train_speed(iter/s)": 0.733038 }, { "epoch": 0.6221264367816092, "grad_norm": 0.825974702835083, "learning_rate": 3.423308277526633e-06, "loss": 0.07454828172922134, "memory(GiB)": 27.35, "step": 866, "token_acc": 0.9706896551724138, "train_speed(iter/s)": 0.733282 }, { "epoch": 0.6228448275862069, "grad_norm": 0.7263330221176147, "learning_rate": 3.412036997213703e-06, "loss": 0.07075491547584534, "memory(GiB)": 27.35, "step": 867, "token_acc": 0.9817184643510055, "train_speed(iter/s)": 0.733515 }, { "epoch": 0.6235632183908046, "grad_norm": 0.821790337562561, "learning_rate": 3.4007746845169253e-06, "loss": 0.07192401587963104, "memory(GiB)": 27.35, "step": 868, "token_acc": 0.9689213893967094, "train_speed(iter/s)": 0.733743 }, { "epoch": 0.6242816091954023, "grad_norm": 0.9073222875595093, "learning_rate": 3.389521403037337e-06, "loss": 0.06897731125354767, "memory(GiB)": 27.35, "step": 869, "token_acc": 0.9809688581314879, "train_speed(iter/s)": 0.733991 }, { "epoch": 0.625, "grad_norm": 1.672826886177063, "learning_rate": 3.3782772163249767e-06, "loss": 0.07837481796741486, "memory(GiB)": 27.35, "step": 870, "token_acc": 0.9661654135338346, "train_speed(iter/s)": 0.734254 }, { "epoch": 0.6257183908045977, "grad_norm": 1.1848864555358887, "learning_rate": 3.3670421878785213e-06, "loss": 0.06657271832227707, "memory(GiB)": 27.35, "step": 871, "token_acc": 0.9757462686567164, "train_speed(iter/s)": 0.734517 }, { "epoch": 0.6264367816091954, "grad_norm": 0.7704547643661499, "learning_rate": 3.3558163811449317e-06, "loss": 0.06876201927661896, "memory(GiB)": 27.35, "step": 872, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.734786 }, { "epoch": 0.6271551724137931, "grad_norm": 0.9547246694564819, "learning_rate": 3.3445998595190852e-06, "loss": 0.07085442543029785, "memory(GiB)": 27.35, "step": 873, "token_acc": 0.974757281553398, "train_speed(iter/s)": 0.735032 }, { "epoch": 0.6278735632183908, "grad_norm": 1.0655597448349, "learning_rate": 3.3333926863434317e-06, "loss": 0.07397568225860596, "memory(GiB)": 27.35, "step": 874, "token_acc": 0.9751434034416826, "train_speed(iter/s)": 0.735234 }, { "epoch": 0.6285919540229885, "grad_norm": 0.7517561912536621, "learning_rate": 3.3221949249076203e-06, "loss": 0.07189616560935974, "memory(GiB)": 27.35, "step": 875, "token_acc": 0.9709618874773139, "train_speed(iter/s)": 0.735436 }, { "epoch": 0.6293103448275862, "grad_norm": 0.8901945352554321, "learning_rate": 3.311006638448155e-06, "loss": 0.07607901096343994, "memory(GiB)": 27.35, "step": 876, "token_acc": 0.9582463465553236, "train_speed(iter/s)": 0.735677 }, { "epoch": 0.6300287356321839, "grad_norm": 0.7642549276351929, "learning_rate": 3.299827890148031e-06, "loss": 0.06628035008907318, "memory(GiB)": 27.35, "step": 877, "token_acc": 0.9797160243407708, "train_speed(iter/s)": 0.735945 }, { "epoch": 0.6307471264367817, "grad_norm": 1.1910935640335083, "learning_rate": 3.288658743136378e-06, "loss": 0.07509252429008484, "memory(GiB)": 27.35, "step": 878, "token_acc": 0.9772329246935202, "train_speed(iter/s)": 0.73621 }, { "epoch": 0.6314655172413793, "grad_norm": 0.8790740370750427, "learning_rate": 3.277499260488107e-06, "loss": 0.07344061881303787, "memory(GiB)": 27.35, "step": 879, "token_acc": 0.9764065335753176, "train_speed(iter/s)": 0.736476 }, { "epoch": 0.632183908045977, "grad_norm": 0.9910126328468323, "learning_rate": 3.2663495052235505e-06, "loss": 0.07324628531932831, "memory(GiB)": 27.35, "step": 880, "token_acc": 0.9658634538152611, "train_speed(iter/s)": 0.736742 }, { "epoch": 0.6329022988505747, "grad_norm": 0.9645206928253174, "learning_rate": 3.2552095403081075e-06, "loss": 0.06720315665006638, "memory(GiB)": 27.35, "step": 881, "token_acc": 0.9882583170254403, "train_speed(iter/s)": 0.736994 }, { "epoch": 0.6336206896551724, "grad_norm": 1.0749855041503906, "learning_rate": 3.2440794286518896e-06, "loss": 0.07151204347610474, "memory(GiB)": 27.35, "step": 882, "token_acc": 0.9769357495881383, "train_speed(iter/s)": 0.737241 }, { "epoch": 0.6343390804597702, "grad_norm": 0.8955501914024353, "learning_rate": 3.232959233109365e-06, "loss": 0.07083980739116669, "memory(GiB)": 27.35, "step": 883, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.737501 }, { "epoch": 0.6350574712643678, "grad_norm": 0.9717711806297302, "learning_rate": 3.2218490164790015e-06, "loss": 0.07736843079328537, "memory(GiB)": 27.35, "step": 884, "token_acc": 0.9722814498933902, "train_speed(iter/s)": 0.737749 }, { "epoch": 0.6357758620689655, "grad_norm": 1.2647638320922852, "learning_rate": 3.2107488415029157e-06, "loss": 0.07825569808483124, "memory(GiB)": 27.35, "step": 885, "token_acc": 0.9723661485319517, "train_speed(iter/s)": 0.73793 }, { "epoch": 0.6364942528735632, "grad_norm": 1.4382153749465942, "learning_rate": 3.199658770866515e-06, "loss": 0.07771571725606918, "memory(GiB)": 27.35, "step": 886, "token_acc": 0.9681528662420382, "train_speed(iter/s)": 0.738156 }, { "epoch": 0.6372126436781609, "grad_norm": 1.3645250797271729, "learning_rate": 3.1885788671981455e-06, "loss": 0.08029500395059586, "memory(GiB)": 27.35, "step": 887, "token_acc": 0.9610619469026549, "train_speed(iter/s)": 0.738424 }, { "epoch": 0.6379310344827587, "grad_norm": 1.110384464263916, "learning_rate": 3.1775091930687374e-06, "loss": 0.07799994945526123, "memory(GiB)": 27.35, "step": 888, "token_acc": 0.96045197740113, "train_speed(iter/s)": 0.738686 }, { "epoch": 0.6386494252873564, "grad_norm": 0.99549800157547, "learning_rate": 3.1664498109914554e-06, "loss": 0.06818181276321411, "memory(GiB)": 27.35, "step": 889, "token_acc": 0.9698046181172292, "train_speed(iter/s)": 0.738948 }, { "epoch": 0.639367816091954, "grad_norm": 0.9263333678245544, "learning_rate": 3.1554007834213357e-06, "loss": 0.07762879133224487, "memory(GiB)": 27.35, "step": 890, "token_acc": 0.9618473895582329, "train_speed(iter/s)": 0.739209 }, { "epoch": 0.6400862068965517, "grad_norm": 0.7713167667388916, "learning_rate": 3.1443621727549456e-06, "loss": 0.0694110319018364, "memory(GiB)": 27.35, "step": 891, "token_acc": 0.9720670391061452, "train_speed(iter/s)": 0.739466 }, { "epoch": 0.6408045977011494, "grad_norm": 1.0443989038467407, "learning_rate": 3.1333340413300263e-06, "loss": 0.07597457617521286, "memory(GiB)": 27.35, "step": 892, "token_acc": 0.9704861111111112, "train_speed(iter/s)": 0.73972 }, { "epoch": 0.6415229885057471, "grad_norm": 1.1122690439224243, "learning_rate": 3.1223164514251345e-06, "loss": 0.07015012204647064, "memory(GiB)": 27.35, "step": 893, "token_acc": 0.974910394265233, "train_speed(iter/s)": 0.739979 }, { "epoch": 0.6422413793103449, "grad_norm": 0.7885139584541321, "learning_rate": 3.1113094652593023e-06, "loss": 0.07805472612380981, "memory(GiB)": 27.35, "step": 894, "token_acc": 0.967128027681661, "train_speed(iter/s)": 0.740229 }, { "epoch": 0.6429597701149425, "grad_norm": 1.0381572246551514, "learning_rate": 3.100313144991678e-06, "loss": 0.06929145753383636, "memory(GiB)": 27.35, "step": 895, "token_acc": 0.9612141652613828, "train_speed(iter/s)": 0.740483 }, { "epoch": 0.6436781609195402, "grad_norm": 0.939305305480957, "learning_rate": 3.0893275527211742e-06, "loss": 0.07788573950529099, "memory(GiB)": 27.35, "step": 896, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.740742 }, { "epoch": 0.6443965517241379, "grad_norm": 1.6638479232788086, "learning_rate": 3.078352750486123e-06, "loss": 0.067719966173172, "memory(GiB)": 27.35, "step": 897, "token_acc": 0.9822380106571936, "train_speed(iter/s)": 0.740992 }, { "epoch": 0.6451149425287356, "grad_norm": 1.492928147315979, "learning_rate": 3.067388800263923e-06, "loss": 0.08594544231891632, "memory(GiB)": 27.35, "step": 898, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.741254 }, { "epoch": 0.6458333333333334, "grad_norm": 1.201831579208374, "learning_rate": 3.0564357639706853e-06, "loss": 0.08354887366294861, "memory(GiB)": 27.35, "step": 899, "token_acc": 0.9661399548532731, "train_speed(iter/s)": 0.741514 }, { "epoch": 0.646551724137931, "grad_norm": 0.7791788578033447, "learning_rate": 3.04549370346089e-06, "loss": 0.06699979305267334, "memory(GiB)": 27.35, "step": 900, "token_acc": 0.97, "train_speed(iter/s)": 0.741773 }, { "epoch": 0.646551724137931, "eval_loss": 0.07144851982593536, "eval_runtime": 6.9864, "eval_samples_per_second": 64.41, "eval_steps_per_second": 2.147, "eval_token_acc": 0.9714035964035964, "step": 900 }, { "epoch": 0.6472701149425287, "grad_norm": 1.0211906433105469, "learning_rate": 3.0345626805270343e-06, "loss": 0.07246020436286926, "memory(GiB)": 27.35, "step": 901, "token_acc": 0.9743770568876352, "train_speed(iter/s)": 0.729812 }, { "epoch": 0.6479885057471264, "grad_norm": 1.9874002933502197, "learning_rate": 3.0236427568992845e-06, "loss": 0.08506324887275696, "memory(GiB)": 27.35, "step": 902, "token_acc": 0.9633911368015414, "train_speed(iter/s)": 0.730077 }, { "epoch": 0.6487068965517241, "grad_norm": 1.0115997791290283, "learning_rate": 3.012733994245122e-06, "loss": 0.07167153805494308, "memory(GiB)": 27.35, "step": 903, "token_acc": 0.9681528662420382, "train_speed(iter/s)": 0.730337 }, { "epoch": 0.6494252873563219, "grad_norm": 0.9378190040588379, "learning_rate": 3.0018364541690048e-06, "loss": 0.0688634067773819, "memory(GiB)": 27.35, "step": 904, "token_acc": 0.9800362976406534, "train_speed(iter/s)": 0.730596 }, { "epoch": 0.6501436781609196, "grad_norm": 1.5176596641540527, "learning_rate": 2.9909501982120088e-06, "loss": 0.07639572024345398, "memory(GiB)": 27.35, "step": 905, "token_acc": 0.9632034632034632, "train_speed(iter/s)": 0.730855 }, { "epoch": 0.6508620689655172, "grad_norm": 1.3800952434539795, "learning_rate": 2.9800752878514903e-06, "loss": 0.0702010840177536, "memory(GiB)": 27.35, "step": 906, "token_acc": 0.9775510204081632, "train_speed(iter/s)": 0.73097 }, { "epoch": 0.6515804597701149, "grad_norm": 1.3987656831741333, "learning_rate": 2.969211784500734e-06, "loss": 0.0734957903623581, "memory(GiB)": 27.35, "step": 907, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.731234 }, { "epoch": 0.6522988505747126, "grad_norm": 0.9565284848213196, "learning_rate": 2.958359749508603e-06, "loss": 0.06743159145116806, "memory(GiB)": 27.35, "step": 908, "token_acc": 0.975328947368421, "train_speed(iter/s)": 0.731499 }, { "epoch": 0.6530172413793104, "grad_norm": 0.8853782415390015, "learning_rate": 2.9475192441591982e-06, "loss": 0.06398328393697739, "memory(GiB)": 27.35, "step": 909, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.73176 }, { "epoch": 0.6537356321839081, "grad_norm": 1.1950054168701172, "learning_rate": 2.936690329671511e-06, "loss": 0.07268309593200684, "memory(GiB)": 27.35, "step": 910, "token_acc": 0.9637096774193549, "train_speed(iter/s)": 0.732022 }, { "epoch": 0.6544540229885057, "grad_norm": 1.138431429862976, "learning_rate": 2.925873067199072e-06, "loss": 0.07469640672206879, "memory(GiB)": 27.35, "step": 911, "token_acc": 0.9651162790697675, "train_speed(iter/s)": 0.732284 }, { "epoch": 0.6551724137931034, "grad_norm": 0.8434069156646729, "learning_rate": 2.915067517829615e-06, "loss": 0.06984487175941467, "memory(GiB)": 27.35, "step": 912, "token_acc": 0.9639175257731959, "train_speed(iter/s)": 0.732545 }, { "epoch": 0.6558908045977011, "grad_norm": 1.0258926153182983, "learning_rate": 2.9042737425847235e-06, "loss": 0.06321363151073456, "memory(GiB)": 27.35, "step": 913, "token_acc": 0.9825479930191972, "train_speed(iter/s)": 0.732804 }, { "epoch": 0.6566091954022989, "grad_norm": 0.9659363627433777, "learning_rate": 2.893491802419492e-06, "loss": 0.0705588236451149, "memory(GiB)": 27.35, "step": 914, "token_acc": 0.9835766423357665, "train_speed(iter/s)": 0.733061 }, { "epoch": 0.6573275862068966, "grad_norm": 0.7851347327232361, "learning_rate": 2.88272175822218e-06, "loss": 0.07048597186803818, "memory(GiB)": 27.35, "step": 915, "token_acc": 0.9706546275395034, "train_speed(iter/s)": 0.733319 }, { "epoch": 0.6580459770114943, "grad_norm": 1.5002787113189697, "learning_rate": 2.871963670813861e-06, "loss": 0.06722084432840347, "memory(GiB)": 27.35, "step": 916, "token_acc": 0.9686274509803922, "train_speed(iter/s)": 0.733578 }, { "epoch": 0.6587643678160919, "grad_norm": 1.4123423099517822, "learning_rate": 2.8612176009480943e-06, "loss": 0.07297956943511963, "memory(GiB)": 27.35, "step": 917, "token_acc": 0.9656357388316151, "train_speed(iter/s)": 0.733837 }, { "epoch": 0.6594827586206896, "grad_norm": 0.713733434677124, "learning_rate": 2.850483609310567e-06, "loss": 0.06324170529842377, "memory(GiB)": 27.35, "step": 918, "token_acc": 0.9814814814814815, "train_speed(iter/s)": 0.734088 }, { "epoch": 0.6602011494252874, "grad_norm": 1.68779718875885, "learning_rate": 2.839761756518764e-06, "loss": 0.0684061273932457, "memory(GiB)": 27.35, "step": 919, "token_acc": 0.9672977624784854, "train_speed(iter/s)": 0.734345 }, { "epoch": 0.6609195402298851, "grad_norm": 3.1292431354522705, "learning_rate": 2.829052103121611e-06, "loss": 0.07612736523151398, "memory(GiB)": 27.35, "step": 920, "token_acc": 0.9800399201596807, "train_speed(iter/s)": 0.734603 }, { "epoch": 0.6616379310344828, "grad_norm": 1.7842977046966553, "learning_rate": 2.818354709599145e-06, "loss": 0.06936296075582504, "memory(GiB)": 27.35, "step": 921, "token_acc": 0.9776119402985075, "train_speed(iter/s)": 0.734816 }, { "epoch": 0.6623563218390804, "grad_norm": 0.9584528803825378, "learning_rate": 2.807669636362169e-06, "loss": 0.0691850483417511, "memory(GiB)": 27.35, "step": 922, "token_acc": 0.9694323144104804, "train_speed(iter/s)": 0.735005 }, { "epoch": 0.6630747126436781, "grad_norm": 1.0446701049804688, "learning_rate": 2.796996943751913e-06, "loss": 0.07746624201536179, "memory(GiB)": 27.35, "step": 923, "token_acc": 0.9584717607973422, "train_speed(iter/s)": 0.735195 }, { "epoch": 0.6637931034482759, "grad_norm": 1.7424674034118652, "learning_rate": 2.7863366920396805e-06, "loss": 0.07230167090892792, "memory(GiB)": 27.35, "step": 924, "token_acc": 0.978688524590164, "train_speed(iter/s)": 0.735389 }, { "epoch": 0.6645114942528736, "grad_norm": 0.9550201296806335, "learning_rate": 2.7756889414265298e-06, "loss": 0.06837642937898636, "memory(GiB)": 27.35, "step": 925, "token_acc": 0.974910394265233, "train_speed(iter/s)": 0.735591 }, { "epoch": 0.6652298850574713, "grad_norm": 1.0424323081970215, "learning_rate": 2.765053752042915e-06, "loss": 0.07300032675266266, "memory(GiB)": 27.35, "step": 926, "token_acc": 0.9745454545454545, "train_speed(iter/s)": 0.735774 }, { "epoch": 0.665948275862069, "grad_norm": 1.429948091506958, "learning_rate": 2.754431183948357e-06, "loss": 0.07673048228025436, "memory(GiB)": 27.35, "step": 927, "token_acc": 0.9794238683127572, "train_speed(iter/s)": 0.735989 }, { "epoch": 0.6666666666666666, "grad_norm": 0.856723427772522, "learning_rate": 2.7438212971311016e-06, "loss": 0.060781329870224, "memory(GiB)": 27.35, "step": 928, "token_acc": 0.9808306709265175, "train_speed(iter/s)": 0.736238 }, { "epoch": 0.6673850574712644, "grad_norm": 0.8478904366493225, "learning_rate": 2.7332241515077784e-06, "loss": 0.06739235669374466, "memory(GiB)": 27.35, "step": 929, "token_acc": 0.9791271347248577, "train_speed(iter/s)": 0.736488 }, { "epoch": 0.6681034482758621, "grad_norm": 1.0149424076080322, "learning_rate": 2.722639806923066e-06, "loss": 0.06998412311077118, "memory(GiB)": 27.35, "step": 930, "token_acc": 0.9740484429065744, "train_speed(iter/s)": 0.736736 }, { "epoch": 0.6688218390804598, "grad_norm": 0.8605524897575378, "learning_rate": 2.7120683231493503e-06, "loss": 0.06101036071777344, "memory(GiB)": 27.35, "step": 931, "token_acc": 0.9810126582278481, "train_speed(iter/s)": 0.736991 }, { "epoch": 0.6695402298850575, "grad_norm": 0.963371753692627, "learning_rate": 2.7015097598863906e-06, "loss": 0.07041098177433014, "memory(GiB)": 27.35, "step": 932, "token_acc": 0.9783549783549783, "train_speed(iter/s)": 0.737243 }, { "epoch": 0.6702586206896551, "grad_norm": 0.9109320044517517, "learning_rate": 2.6909641767609806e-06, "loss": 0.0688549131155014, "memory(GiB)": 27.35, "step": 933, "token_acc": 0.967391304347826, "train_speed(iter/s)": 0.737496 }, { "epoch": 0.6709770114942529, "grad_norm": 0.8072447180747986, "learning_rate": 2.680431633326614e-06, "loss": 0.0739441066980362, "memory(GiB)": 27.35, "step": 934, "token_acc": 0.9742489270386266, "train_speed(iter/s)": 0.737749 }, { "epoch": 0.6716954022988506, "grad_norm": 1.3862199783325195, "learning_rate": 2.669912189063141e-06, "loss": 0.08049742132425308, "memory(GiB)": 27.35, "step": 935, "token_acc": 0.9529190207156308, "train_speed(iter/s)": 0.737982 }, { "epoch": 0.6724137931034483, "grad_norm": 1.1754916906356812, "learning_rate": 2.659405903376442e-06, "loss": 0.0697251707315445, "memory(GiB)": 27.35, "step": 936, "token_acc": 0.969258589511754, "train_speed(iter/s)": 0.738159 }, { "epoch": 0.673132183908046, "grad_norm": 0.9116896986961365, "learning_rate": 2.6489128355980877e-06, "loss": 0.06314043700695038, "memory(GiB)": 27.53, "step": 937, "token_acc": 0.971107544141252, "train_speed(iter/s)": 0.738392 }, { "epoch": 0.6738505747126436, "grad_norm": 1.4913427829742432, "learning_rate": 2.6384330449850028e-06, "loss": 0.06786030530929565, "memory(GiB)": 27.53, "step": 938, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 0.738639 }, { "epoch": 0.6745689655172413, "grad_norm": 1.0333104133605957, "learning_rate": 2.62796659071913e-06, "loss": 0.07139422744512558, "memory(GiB)": 27.53, "step": 939, "token_acc": 0.9676767676767677, "train_speed(iter/s)": 0.738889 }, { "epoch": 0.6752873563218391, "grad_norm": 1.1019221544265747, "learning_rate": 2.617513531907103e-06, "loss": 0.06909745186567307, "memory(GiB)": 27.53, "step": 940, "token_acc": 0.9740484429065744, "train_speed(iter/s)": 0.739135 }, { "epoch": 0.6760057471264368, "grad_norm": 1.024092197418213, "learning_rate": 2.607073927579905e-06, "loss": 0.0762176364660263, "memory(GiB)": 27.53, "step": 941, "token_acc": 0.962, "train_speed(iter/s)": 0.739386 }, { "epoch": 0.6767241379310345, "grad_norm": 5.8833909034729, "learning_rate": 2.5966478366925406e-06, "loss": 0.07184892147779465, "memory(GiB)": 27.53, "step": 942, "token_acc": 0.9597701149425287, "train_speed(iter/s)": 0.739637 }, { "epoch": 0.6774425287356322, "grad_norm": 0.7874391078948975, "learning_rate": 2.5862353181236998e-06, "loss": 0.08487916737794876, "memory(GiB)": 27.53, "step": 943, "token_acc": 0.9637681159420289, "train_speed(iter/s)": 0.739881 }, { "epoch": 0.6781609195402298, "grad_norm": 0.780238926410675, "learning_rate": 2.5758364306754247e-06, "loss": 0.06558934599161148, "memory(GiB)": 27.53, "step": 944, "token_acc": 0.9802867383512545, "train_speed(iter/s)": 0.740128 }, { "epoch": 0.6788793103448276, "grad_norm": 0.76177978515625, "learning_rate": 2.5654512330727837e-06, "loss": 0.06106223911046982, "memory(GiB)": 27.53, "step": 945, "token_acc": 0.9702048417132216, "train_speed(iter/s)": 0.740372 }, { "epoch": 0.6795977011494253, "grad_norm": 1.1186301708221436, "learning_rate": 2.5550797839635283e-06, "loss": 0.0648101419210434, "memory(GiB)": 27.53, "step": 946, "token_acc": 0.9646365422396856, "train_speed(iter/s)": 0.740617 }, { "epoch": 0.680316091954023, "grad_norm": 0.7485531568527222, "learning_rate": 2.5447221419177743e-06, "loss": 0.06456213444471359, "memory(GiB)": 27.53, "step": 947, "token_acc": 0.9716446124763705, "train_speed(iter/s)": 0.740866 }, { "epoch": 0.6810344827586207, "grad_norm": 1.0539956092834473, "learning_rate": 2.5343783654276644e-06, "loss": 0.06967061758041382, "memory(GiB)": 27.53, "step": 948, "token_acc": 0.9727767695099818, "train_speed(iter/s)": 0.741114 }, { "epoch": 0.6817528735632183, "grad_norm": 1.308974027633667, "learning_rate": 2.5240485129070403e-06, "loss": 0.07887134701013565, "memory(GiB)": 27.53, "step": 949, "token_acc": 0.9727126805778491, "train_speed(iter/s)": 0.741361 }, { "epoch": 0.6824712643678161, "grad_norm": 0.9638488292694092, "learning_rate": 2.5137326426911067e-06, "loss": 0.06689711660146713, "memory(GiB)": 27.53, "step": 950, "token_acc": 0.9741824440619621, "train_speed(iter/s)": 0.74161 }, { "epoch": 0.6824712643678161, "eval_loss": 0.06753702461719513, "eval_runtime": 5.9759, "eval_samples_per_second": 75.302, "eval_steps_per_second": 2.51, "eval_token_acc": 0.9731518481518482, "step": 950 }, { "epoch": 0.6831896551724138, "grad_norm": 0.8586934208869934, "learning_rate": 2.503430813036112e-06, "loss": 0.07164427638053894, "memory(GiB)": 27.53, "step": 951, "token_acc": 0.9742003268736866, "train_speed(iter/s)": 0.730832 }, { "epoch": 0.6839080459770115, "grad_norm": 0.8321962952613831, "learning_rate": 2.493143082119013e-06, "loss": 0.07098487764596939, "memory(GiB)": 27.53, "step": 952, "token_acc": 0.9809688581314879, "train_speed(iter/s)": 0.731007 }, { "epoch": 0.6846264367816092, "grad_norm": 1.8721705675125122, "learning_rate": 2.4828695080371474e-06, "loss": 0.061981748789548874, "memory(GiB)": 27.53, "step": 953, "token_acc": 0.9762376237623762, "train_speed(iter/s)": 0.731233 }, { "epoch": 0.6853448275862069, "grad_norm": 0.8859158158302307, "learning_rate": 2.472610148807903e-06, "loss": 0.07842051982879639, "memory(GiB)": 27.53, "step": 954, "token_acc": 0.9502262443438914, "train_speed(iter/s)": 0.731482 }, { "epoch": 0.6860632183908046, "grad_norm": 0.7436713576316833, "learning_rate": 2.4623650623683954e-06, "loss": 0.07546351850032806, "memory(GiB)": 27.53, "step": 955, "token_acc": 0.9810671256454389, "train_speed(iter/s)": 0.731728 }, { "epoch": 0.6867816091954023, "grad_norm": 0.7586796283721924, "learning_rate": 2.452134306575139e-06, "loss": 0.06865014135837555, "memory(GiB)": 27.53, "step": 956, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.731967 }, { "epoch": 0.6875, "grad_norm": 1.7641451358795166, "learning_rate": 2.441917939203718e-06, "loss": 0.06773155927658081, "memory(GiB)": 27.53, "step": 957, "token_acc": 0.9771863117870723, "train_speed(iter/s)": 0.732207 }, { "epoch": 0.6882183908045977, "grad_norm": 0.805242121219635, "learning_rate": 2.431716017948462e-06, "loss": 0.07488800585269928, "memory(GiB)": 27.53, "step": 958, "token_acc": 0.971042471042471, "train_speed(iter/s)": 0.732451 }, { "epoch": 0.6889367816091954, "grad_norm": 0.7790842652320862, "learning_rate": 2.421528600422121e-06, "loss": 0.06653250753879547, "memory(GiB)": 27.53, "step": 959, "token_acc": 0.9754716981132076, "train_speed(iter/s)": 0.732695 }, { "epoch": 0.6896551724137931, "grad_norm": 1.1953816413879395, "learning_rate": 2.4113557441555384e-06, "loss": 0.07856148481369019, "memory(GiB)": 27.53, "step": 960, "token_acc": 0.9653465346534653, "train_speed(iter/s)": 0.732941 }, { "epoch": 0.6903735632183908, "grad_norm": 0.8254369497299194, "learning_rate": 2.401197506597323e-06, "loss": 0.06287199258804321, "memory(GiB)": 27.53, "step": 961, "token_acc": 0.9766187050359713, "train_speed(iter/s)": 0.733178 }, { "epoch": 0.6910919540229885, "grad_norm": 0.8673030138015747, "learning_rate": 2.391053945113533e-06, "loss": 0.05875011160969734, "memory(GiB)": 27.53, "step": 962, "token_acc": 0.9689213893967094, "train_speed(iter/s)": 0.733419 }, { "epoch": 0.6918103448275862, "grad_norm": 1.8953784704208374, "learning_rate": 2.380925116987345e-06, "loss": 0.06381504237651825, "memory(GiB)": 27.53, "step": 963, "token_acc": 0.9774590163934426, "train_speed(iter/s)": 0.733662 }, { "epoch": 0.6925287356321839, "grad_norm": 1.2154247760772705, "learning_rate": 2.370811079418735e-06, "loss": 0.07270558178424835, "memory(GiB)": 27.53, "step": 964, "token_acc": 0.963963963963964, "train_speed(iter/s)": 0.733909 }, { "epoch": 0.6932471264367817, "grad_norm": 1.2279831171035767, "learning_rate": 2.360711889524148e-06, "loss": 0.07697765529155731, "memory(GiB)": 27.53, "step": 965, "token_acc": 0.9704724409448819, "train_speed(iter/s)": 0.734144 }, { "epoch": 0.6939655172413793, "grad_norm": 0.9836029410362244, "learning_rate": 2.350627604336186e-06, "loss": 0.07986192405223846, "memory(GiB)": 27.53, "step": 966, "token_acc": 0.9712389380530974, "train_speed(iter/s)": 0.734388 }, { "epoch": 0.694683908045977, "grad_norm": 1.3690550327301025, "learning_rate": 2.340558280803277e-06, "loss": 0.06743729114532471, "memory(GiB)": 27.53, "step": 967, "token_acc": 0.9739663093415007, "train_speed(iter/s)": 0.734605 }, { "epoch": 0.6954022988505747, "grad_norm": 1.2348272800445557, "learning_rate": 2.330503975789361e-06, "loss": 0.07859884202480316, "memory(GiB)": 27.53, "step": 968, "token_acc": 0.9722675367047309, "train_speed(iter/s)": 0.73485 }, { "epoch": 0.6961206896551724, "grad_norm": 0.9127457737922668, "learning_rate": 2.320464746073558e-06, "loss": 0.07455815374851227, "memory(GiB)": 27.53, "step": 969, "token_acc": 0.9747747747747748, "train_speed(iter/s)": 0.735097 }, { "epoch": 0.6968390804597702, "grad_norm": 1.050852656364441, "learning_rate": 2.3104406483498593e-06, "loss": 0.07323621958494186, "memory(GiB)": 27.53, "step": 970, "token_acc": 0.9841269841269841, "train_speed(iter/s)": 0.735341 }, { "epoch": 0.6975574712643678, "grad_norm": 0.9614119529724121, "learning_rate": 2.300431739226801e-06, "loss": 0.0704912394285202, "memory(GiB)": 27.53, "step": 971, "token_acc": 0.9780621572212066, "train_speed(iter/s)": 0.735577 }, { "epoch": 0.6982758620689655, "grad_norm": 0.9490033984184265, "learning_rate": 2.290438075227146e-06, "loss": 0.06801324337720871, "memory(GiB)": 27.53, "step": 972, "token_acc": 0.9752883031301482, "train_speed(iter/s)": 0.735821 }, { "epoch": 0.6989942528735632, "grad_norm": 0.9407207369804382, "learning_rate": 2.2804597127875628e-06, "loss": 0.05997643619775772, "memory(GiB)": 27.53, "step": 973, "token_acc": 0.9770642201834863, "train_speed(iter/s)": 0.736067 }, { "epoch": 0.6997126436781609, "grad_norm": 1.1746290922164917, "learning_rate": 2.270496708258309e-06, "loss": 0.07695136964321136, "memory(GiB)": 27.53, "step": 974, "token_acc": 0.9749552772808586, "train_speed(iter/s)": 0.736308 }, { "epoch": 0.7004310344827587, "grad_norm": 1.5772321224212646, "learning_rate": 2.2605491179029142e-06, "loss": 0.07204243540763855, "memory(GiB)": 27.53, "step": 975, "token_acc": 0.9683168316831683, "train_speed(iter/s)": 0.736552 }, { "epoch": 0.7011494252873564, "grad_norm": 1.4518944025039673, "learning_rate": 2.2506169978978543e-06, "loss": 0.07276761531829834, "memory(GiB)": 27.53, "step": 976, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.736797 }, { "epoch": 0.701867816091954, "grad_norm": 0.9329469799995422, "learning_rate": 2.240700404332247e-06, "loss": 0.07031139731407166, "memory(GiB)": 27.53, "step": 977, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.737041 }, { "epoch": 0.7025862068965517, "grad_norm": 0.7543947696685791, "learning_rate": 2.230799393207526e-06, "loss": 0.07064704596996307, "memory(GiB)": 27.53, "step": 978, "token_acc": 0.9628180039138943, "train_speed(iter/s)": 0.737251 }, { "epoch": 0.7033045977011494, "grad_norm": 0.8129311203956604, "learning_rate": 2.220914020437128e-06, "loss": 0.06692536920309067, "memory(GiB)": 27.53, "step": 979, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.73743 }, { "epoch": 0.7040229885057471, "grad_norm": 0.6810854077339172, "learning_rate": 2.2110443418461723e-06, "loss": 0.06268423795700073, "memory(GiB)": 27.53, "step": 980, "token_acc": 0.9753521126760564, "train_speed(iter/s)": 0.737584 }, { "epoch": 0.7047413793103449, "grad_norm": 0.877329409122467, "learning_rate": 2.2011904131711536e-06, "loss": 0.06396004557609558, "memory(GiB)": 27.53, "step": 981, "token_acc": 0.9634146341463414, "train_speed(iter/s)": 0.737768 }, { "epoch": 0.7054597701149425, "grad_norm": 0.8136486411094666, "learning_rate": 2.191352290059621e-06, "loss": 0.06448061019182205, "memory(GiB)": 27.53, "step": 982, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.737974 }, { "epoch": 0.7061781609195402, "grad_norm": 0.7139348387718201, "learning_rate": 2.1815300280698686e-06, "loss": 0.06881536543369293, "memory(GiB)": 27.53, "step": 983, "token_acc": 0.9686888454011742, "train_speed(iter/s)": 0.738211 }, { "epoch": 0.7068965517241379, "grad_norm": 0.7779189348220825, "learning_rate": 2.171723682670613e-06, "loss": 0.06893943250179291, "memory(GiB)": 27.53, "step": 984, "token_acc": 0.9744597249508841, "train_speed(iter/s)": 0.738442 }, { "epoch": 0.7076149425287356, "grad_norm": 0.8389714360237122, "learning_rate": 2.1619333092406908e-06, "loss": 0.06623788177967072, "memory(GiB)": 27.53, "step": 985, "token_acc": 0.9743119266055046, "train_speed(iter/s)": 0.738678 }, { "epoch": 0.7083333333333334, "grad_norm": 0.813179075717926, "learning_rate": 2.152158963068739e-06, "loss": 0.060092538595199585, "memory(GiB)": 27.53, "step": 986, "token_acc": 0.9783236994219653, "train_speed(iter/s)": 0.738911 }, { "epoch": 0.709051724137931, "grad_norm": 0.932761013507843, "learning_rate": 2.1424006993528868e-06, "loss": 0.06810237467288971, "memory(GiB)": 27.53, "step": 987, "token_acc": 0.9709618874773139, "train_speed(iter/s)": 0.739146 }, { "epoch": 0.7097701149425287, "grad_norm": 0.7076575756072998, "learning_rate": 2.1326585732004384e-06, "loss": 0.0634394958615303, "memory(GiB)": 27.53, "step": 988, "token_acc": 0.9724770642201835, "train_speed(iter/s)": 0.73935 }, { "epoch": 0.7104885057471264, "grad_norm": 0.8703423142433167, "learning_rate": 2.122932639627569e-06, "loss": 0.07366350293159485, "memory(GiB)": 27.53, "step": 989, "token_acc": 0.9653061224489796, "train_speed(iter/s)": 0.739435 }, { "epoch": 0.7112068965517241, "grad_norm": 1.3596391677856445, "learning_rate": 2.1132229535590092e-06, "loss": 0.07654288411140442, "memory(GiB)": 27.53, "step": 990, "token_acc": 0.9753521126760564, "train_speed(iter/s)": 0.73967 }, { "epoch": 0.7119252873563219, "grad_norm": 0.8865165114402771, "learning_rate": 2.103529569827732e-06, "loss": 0.06531214714050293, "memory(GiB)": 27.53, "step": 991, "token_acc": 0.9658634538152611, "train_speed(iter/s)": 0.739907 }, { "epoch": 0.7126436781609196, "grad_norm": 0.8934274911880493, "learning_rate": 2.093852543174652e-06, "loss": 0.07630772143602371, "memory(GiB)": 27.53, "step": 992, "token_acc": 0.9760589318600368, "train_speed(iter/s)": 0.740139 }, { "epoch": 0.7133620689655172, "grad_norm": 0.7911908626556396, "learning_rate": 2.0841919282483097e-06, "loss": 0.07262256741523743, "memory(GiB)": 27.53, "step": 993, "token_acc": 0.9636015325670498, "train_speed(iter/s)": 0.740375 }, { "epoch": 0.7140804597701149, "grad_norm": 0.7536477446556091, "learning_rate": 2.0745477796045664e-06, "loss": 0.06478007882833481, "memory(GiB)": 27.53, "step": 994, "token_acc": 0.9792027729636048, "train_speed(iter/s)": 0.740606 }, { "epoch": 0.7147988505747126, "grad_norm": 0.8501970171928406, "learning_rate": 2.064920151706289e-06, "loss": 0.06475847959518433, "memory(GiB)": 27.53, "step": 995, "token_acc": 0.9790575916230366, "train_speed(iter/s)": 0.740841 }, { "epoch": 0.7155172413793104, "grad_norm": 1.2013850212097168, "learning_rate": 2.0553090989230527e-06, "loss": 0.07680854201316833, "memory(GiB)": 27.53, "step": 996, "token_acc": 0.9673704414587332, "train_speed(iter/s)": 0.741078 }, { "epoch": 0.7162356321839081, "grad_norm": 1.0963459014892578, "learning_rate": 2.0457146755308276e-06, "loss": 0.07540680468082428, "memory(GiB)": 27.53, "step": 997, "token_acc": 0.9769585253456221, "train_speed(iter/s)": 0.741314 }, { "epoch": 0.7169540229885057, "grad_norm": 0.9125537872314453, "learning_rate": 2.036136935711674e-06, "loss": 0.07029087096452713, "memory(GiB)": 27.53, "step": 998, "token_acc": 0.9797047970479705, "train_speed(iter/s)": 0.741552 }, { "epoch": 0.7176724137931034, "grad_norm": 0.7235585451126099, "learning_rate": 2.0265759335534357e-06, "loss": 0.0661415308713913, "memory(GiB)": 27.53, "step": 999, "token_acc": 0.9812286689419796, "train_speed(iter/s)": 0.741789 }, { "epoch": 0.7183908045977011, "grad_norm": 1.0036190748214722, "learning_rate": 2.017031723049432e-06, "loss": 0.06854017078876495, "memory(GiB)": 27.53, "step": 1000, "token_acc": 0.9855072463768116, "train_speed(iter/s)": 0.742026 }, { "epoch": 0.7183908045977011, "eval_loss": 0.06475164741277695, "eval_runtime": 6.0606, "eval_samples_per_second": 74.25, "eval_steps_per_second": 2.475, "eval_token_acc": 0.974541083916084, "step": 1000 }, { "epoch": 0.7191091954022989, "grad_norm": 0.7189743518829346, "learning_rate": 2.0075043580981594e-06, "loss": 0.06608723104000092, "memory(GiB)": 27.53, "step": 1001, "token_acc": 0.9751538097491718, "train_speed(iter/s)": 0.73141 }, { "epoch": 0.7198275862068966, "grad_norm": 0.8305124640464783, "learning_rate": 1.997993892502983e-06, "loss": 0.05840001255273819, "memory(GiB)": 27.53, "step": 1002, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.731642 }, { "epoch": 0.7205459770114943, "grad_norm": 0.8408995270729065, "learning_rate": 1.9885003799718304e-06, "loss": 0.06957124173641205, "memory(GiB)": 27.53, "step": 1003, "token_acc": 0.9745098039215686, "train_speed(iter/s)": 0.731845 }, { "epoch": 0.7212643678160919, "grad_norm": 0.841541588306427, "learning_rate": 1.979023874116895e-06, "loss": 0.056044772267341614, "memory(GiB)": 27.53, "step": 1004, "token_acc": 0.9755639097744361, "train_speed(iter/s)": 0.732012 }, { "epoch": 0.7219827586206896, "grad_norm": 0.8542435765266418, "learning_rate": 1.9695644284543276e-06, "loss": 0.07118597626686096, "memory(GiB)": 27.53, "step": 1005, "token_acc": 0.9567567567567568, "train_speed(iter/s)": 0.73223 }, { "epoch": 0.7227011494252874, "grad_norm": 1.0306808948516846, "learning_rate": 1.9601220964039324e-06, "loss": 0.07137622684240341, "memory(GiB)": 27.53, "step": 1006, "token_acc": 0.9756592292089249, "train_speed(iter/s)": 0.732461 }, { "epoch": 0.7234195402298851, "grad_norm": 0.801401674747467, "learning_rate": 1.950696931288874e-06, "loss": 0.06239210441708565, "memory(GiB)": 27.53, "step": 1007, "token_acc": 0.9818481848184818, "train_speed(iter/s)": 0.73269 }, { "epoch": 0.7241379310344828, "grad_norm": 1.4448184967041016, "learning_rate": 1.9412889863353683e-06, "loss": 0.07149198651313782, "memory(GiB)": 27.53, "step": 1008, "token_acc": 0.9681818181818181, "train_speed(iter/s)": 0.73291 }, { "epoch": 0.7248563218390804, "grad_norm": 0.9691747426986694, "learning_rate": 1.9318983146723893e-06, "loss": 0.06526925414800644, "memory(GiB)": 27.53, "step": 1009, "token_acc": 0.9624217118997912, "train_speed(iter/s)": 0.733135 }, { "epoch": 0.7255747126436781, "grad_norm": 0.8037576675415039, "learning_rate": 1.9225249693313547e-06, "loss": 0.06869252026081085, "memory(GiB)": 27.53, "step": 1010, "token_acc": 0.9754098360655737, "train_speed(iter/s)": 0.733362 }, { "epoch": 0.7262931034482759, "grad_norm": 0.8885784149169922, "learning_rate": 1.9131690032458454e-06, "loss": 0.059362154453992844, "memory(GiB)": 27.53, "step": 1011, "token_acc": 0.9736408566721582, "train_speed(iter/s)": 0.733578 }, { "epoch": 0.7270114942528736, "grad_norm": 0.9508578181266785, "learning_rate": 1.9038304692512943e-06, "loss": 0.0682445615530014, "memory(GiB)": 27.53, "step": 1012, "token_acc": 0.9769736842105263, "train_speed(iter/s)": 0.733806 }, { "epoch": 0.7277298850574713, "grad_norm": 1.0318766832351685, "learning_rate": 1.8945094200846903e-06, "loss": 0.07013137638568878, "memory(GiB)": 27.53, "step": 1013, "token_acc": 0.9816666666666667, "train_speed(iter/s)": 0.734033 }, { "epoch": 0.728448275862069, "grad_norm": 1.0861876010894775, "learning_rate": 1.8852059083842838e-06, "loss": 0.06541885435581207, "memory(GiB)": 27.53, "step": 1014, "token_acc": 0.9880341880341881, "train_speed(iter/s)": 0.734255 }, { "epoch": 0.7291666666666666, "grad_norm": 0.7606948018074036, "learning_rate": 1.875919986689282e-06, "loss": 0.07041481137275696, "memory(GiB)": 27.53, "step": 1015, "token_acc": 0.9704251386321626, "train_speed(iter/s)": 0.734487 }, { "epoch": 0.7298850574712644, "grad_norm": 0.8685198426246643, "learning_rate": 1.8666517074395607e-06, "loss": 0.06969735026359558, "memory(GiB)": 27.53, "step": 1016, "token_acc": 0.9603174603174603, "train_speed(iter/s)": 0.7347 }, { "epoch": 0.7306034482758621, "grad_norm": 0.8790197968482971, "learning_rate": 1.8574011229753646e-06, "loss": 0.06057700142264366, "memory(GiB)": 27.53, "step": 1017, "token_acc": 0.9851380042462845, "train_speed(iter/s)": 0.734921 }, { "epoch": 0.7313218390804598, "grad_norm": 0.9350599646568298, "learning_rate": 1.8481682855370098e-06, "loss": 0.07691513001918793, "memory(GiB)": 27.53, "step": 1018, "token_acc": 0.9763113367174281, "train_speed(iter/s)": 0.735141 }, { "epoch": 0.7320402298850575, "grad_norm": 1.0176793336868286, "learning_rate": 1.8389532472645921e-06, "loss": 0.06679132580757141, "memory(GiB)": 27.53, "step": 1019, "token_acc": 0.9624724061810155, "train_speed(iter/s)": 0.735366 }, { "epoch": 0.7327586206896551, "grad_norm": 1.176027536392212, "learning_rate": 1.829756060197692e-06, "loss": 0.06635869294404984, "memory(GiB)": 27.53, "step": 1020, "token_acc": 0.9766949152542372, "train_speed(iter/s)": 0.735597 }, { "epoch": 0.7334770114942529, "grad_norm": 0.9165264368057251, "learning_rate": 1.8205767762750752e-06, "loss": 0.06345489621162415, "memory(GiB)": 27.53, "step": 1021, "token_acc": 0.9710391822827938, "train_speed(iter/s)": 0.735826 }, { "epoch": 0.7341954022988506, "grad_norm": 0.8176271915435791, "learning_rate": 1.8114154473344081e-06, "loss": 0.06469245254993439, "memory(GiB)": 27.53, "step": 1022, "token_acc": 0.9766990291262136, "train_speed(iter/s)": 0.736055 }, { "epoch": 0.7349137931034483, "grad_norm": 1.868074893951416, "learning_rate": 1.80227212511196e-06, "loss": 0.06870341300964355, "memory(GiB)": 27.53, "step": 1023, "token_acc": 0.9718543046357616, "train_speed(iter/s)": 0.736278 }, { "epoch": 0.735632183908046, "grad_norm": 0.9473834037780762, "learning_rate": 1.7931468612423142e-06, "loss": 0.06299549341201782, "memory(GiB)": 27.53, "step": 1024, "token_acc": 0.9727463312368972, "train_speed(iter/s)": 0.736504 }, { "epoch": 0.7363505747126436, "grad_norm": 0.7801326513290405, "learning_rate": 1.7840397072580678e-06, "loss": 0.06288623064756393, "memory(GiB)": 27.53, "step": 1025, "token_acc": 0.9689922480620154, "train_speed(iter/s)": 0.736729 }, { "epoch": 0.7370689655172413, "grad_norm": 1.2157001495361328, "learning_rate": 1.7749507145895518e-06, "loss": 0.06309308856725693, "memory(GiB)": 27.53, "step": 1026, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.736953 }, { "epoch": 0.7377873563218391, "grad_norm": 0.6853142976760864, "learning_rate": 1.7658799345645356e-06, "loss": 0.060926783829927444, "memory(GiB)": 27.53, "step": 1027, "token_acc": 0.97131931166348, "train_speed(iter/s)": 0.737173 }, { "epoch": 0.7385057471264368, "grad_norm": 1.2152513265609741, "learning_rate": 1.756827418407936e-06, "loss": 0.07400523126125336, "memory(GiB)": 27.53, "step": 1028, "token_acc": 0.9711864406779661, "train_speed(iter/s)": 0.737393 }, { "epoch": 0.7392241379310345, "grad_norm": 0.8265789747238159, "learning_rate": 1.7477932172415317e-06, "loss": 0.06996798515319824, "memory(GiB)": 27.53, "step": 1029, "token_acc": 0.9663299663299664, "train_speed(iter/s)": 0.737614 }, { "epoch": 0.7399425287356322, "grad_norm": 0.9905454516410828, "learning_rate": 1.7387773820836668e-06, "loss": 0.06717927008867264, "memory(GiB)": 27.53, "step": 1030, "token_acc": 0.9633911368015414, "train_speed(iter/s)": 0.737838 }, { "epoch": 0.7406609195402298, "grad_norm": 0.9193287491798401, "learning_rate": 1.7297799638489726e-06, "loss": 0.05790258198976517, "memory(GiB)": 27.53, "step": 1031, "token_acc": 0.9801801801801802, "train_speed(iter/s)": 0.738041 }, { "epoch": 0.7413793103448276, "grad_norm": 0.9760376811027527, "learning_rate": 1.7208010133480751e-06, "loss": 0.06887291371822357, "memory(GiB)": 27.53, "step": 1032, "token_acc": 0.9610655737704918, "train_speed(iter/s)": 0.738214 }, { "epoch": 0.7420977011494253, "grad_norm": 1.4744527339935303, "learning_rate": 1.7118405812873074e-06, "loss": 0.06396161019802094, "memory(GiB)": 27.53, "step": 1033, "token_acc": 0.9731543624161074, "train_speed(iter/s)": 0.738361 }, { "epoch": 0.742816091954023, "grad_norm": 1.1088379621505737, "learning_rate": 1.7028987182684248e-06, "loss": 0.06751073896884918, "memory(GiB)": 27.53, "step": 1034, "token_acc": 0.9718804920913884, "train_speed(iter/s)": 0.738567 }, { "epoch": 0.7435344827586207, "grad_norm": 1.1442047357559204, "learning_rate": 1.6939754747883202e-06, "loss": 0.060918912291526794, "memory(GiB)": 27.53, "step": 1035, "token_acc": 0.9789983844911146, "train_speed(iter/s)": 0.738776 }, { "epoch": 0.7442528735632183, "grad_norm": 0.9042192101478577, "learning_rate": 1.6850709012387328e-06, "loss": 0.07109712064266205, "memory(GiB)": 27.53, "step": 1036, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.738944 }, { "epoch": 0.7449712643678161, "grad_norm": 0.8448064923286438, "learning_rate": 1.6761850479059727e-06, "loss": 0.05915457010269165, "memory(GiB)": 27.53, "step": 1037, "token_acc": 0.9686956521739131, "train_speed(iter/s)": 0.739119 }, { "epoch": 0.7456896551724138, "grad_norm": 1.4796496629714966, "learning_rate": 1.6673179649706312e-06, "loss": 0.05865549296140671, "memory(GiB)": 27.53, "step": 1038, "token_acc": 0.9733606557377049, "train_speed(iter/s)": 0.739311 }, { "epoch": 0.7464080459770115, "grad_norm": 1.0158050060272217, "learning_rate": 1.6584697025072993e-06, "loss": 0.0645514577627182, "memory(GiB)": 27.53, "step": 1039, "token_acc": 0.969551282051282, "train_speed(iter/s)": 0.739478 }, { "epoch": 0.7471264367816092, "grad_norm": 0.8095449805259705, "learning_rate": 1.64964031048428e-06, "loss": 0.06641900539398193, "memory(GiB)": 27.53, "step": 1040, "token_acc": 0.9867986798679867, "train_speed(iter/s)": 0.739614 }, { "epoch": 0.7478448275862069, "grad_norm": 0.973346471786499, "learning_rate": 1.6408298387633148e-06, "loss": 0.0671592578291893, "memory(GiB)": 27.53, "step": 1041, "token_acc": 0.9829351535836177, "train_speed(iter/s)": 0.739838 }, { "epoch": 0.7485632183908046, "grad_norm": 0.8007359504699707, "learning_rate": 1.632038337099297e-06, "loss": 0.0655389130115509, "memory(GiB)": 27.53, "step": 1042, "token_acc": 0.969335604770017, "train_speed(iter/s)": 0.740064 }, { "epoch": 0.7492816091954023, "grad_norm": 1.000349521636963, "learning_rate": 1.6232658551399895e-06, "loss": 0.07195691019296646, "memory(GiB)": 27.53, "step": 1043, "token_acc": 0.9654510556621881, "train_speed(iter/s)": 0.74029 }, { "epoch": 0.75, "grad_norm": 0.8394086360931396, "learning_rate": 1.6145124424257497e-06, "loss": 0.06920784711837769, "memory(GiB)": 27.53, "step": 1044, "token_acc": 0.968013468013468, "train_speed(iter/s)": 0.740514 }, { "epoch": 0.7507183908045977, "grad_norm": 0.9954524636268616, "learning_rate": 1.6057781483892403e-06, "loss": 0.06193302199244499, "memory(GiB)": 27.53, "step": 1045, "token_acc": 0.972972972972973, "train_speed(iter/s)": 0.740734 }, { "epoch": 0.7514367816091954, "grad_norm": 0.9962290525436401, "learning_rate": 1.5970630223551614e-06, "loss": 0.07023295015096664, "memory(GiB)": 27.53, "step": 1046, "token_acc": 0.9703389830508474, "train_speed(iter/s)": 0.740947 }, { "epoch": 0.7521551724137931, "grad_norm": 1.1021766662597656, "learning_rate": 1.5883671135399652e-06, "loss": 0.07473604381084442, "memory(GiB)": 27.53, "step": 1047, "token_acc": 0.9663366336633663, "train_speed(iter/s)": 0.741157 }, { "epoch": 0.7528735632183908, "grad_norm": 1.160988211631775, "learning_rate": 1.5796904710515792e-06, "loss": 0.06676337122917175, "memory(GiB)": 27.53, "step": 1048, "token_acc": 0.9826923076923076, "train_speed(iter/s)": 0.741378 }, { "epoch": 0.7535919540229885, "grad_norm": 0.7549822330474854, "learning_rate": 1.5710331438891302e-06, "loss": 0.06035991758108139, "memory(GiB)": 27.53, "step": 1049, "token_acc": 0.9843400447427293, "train_speed(iter/s)": 0.741594 }, { "epoch": 0.7543103448275862, "grad_norm": 0.802387535572052, "learning_rate": 1.5623951809426663e-06, "loss": 0.058977894484996796, "memory(GiB)": 27.53, "step": 1050, "token_acc": 0.9822485207100592, "train_speed(iter/s)": 0.741811 }, { "epoch": 0.7543103448275862, "eval_loss": 0.06309526413679123, "eval_runtime": 6.9684, "eval_samples_per_second": 64.577, "eval_steps_per_second": 2.153, "eval_token_acc": 0.9746347402597403, "step": 1050 }, { "epoch": 0.7550287356321839, "grad_norm": 1.8970974683761597, "learning_rate": 1.553776630992878e-06, "loss": 0.06898986548185349, "memory(GiB)": 27.53, "step": 1051, "token_acc": 0.9748678802113917, "train_speed(iter/s)": 0.731475 }, { "epoch": 0.7557471264367817, "grad_norm": 0.8297277092933655, "learning_rate": 1.5451775427108302e-06, "loss": 0.06425133347511292, "memory(GiB)": 27.53, "step": 1052, "token_acc": 0.9816053511705686, "train_speed(iter/s)": 0.731644 }, { "epoch": 0.7564655172413793, "grad_norm": 0.8298529386520386, "learning_rate": 1.5365979646576811e-06, "loss": 0.06671276688575745, "memory(GiB)": 27.53, "step": 1053, "token_acc": 0.9625246548323472, "train_speed(iter/s)": 0.731818 }, { "epoch": 0.757183908045977, "grad_norm": 0.7884525656700134, "learning_rate": 1.5280379452844124e-06, "loss": 0.06285654008388519, "memory(GiB)": 27.53, "step": 1054, "token_acc": 0.9836363636363636, "train_speed(iter/s)": 0.731964 }, { "epoch": 0.7579022988505747, "grad_norm": 0.9429306983947754, "learning_rate": 1.5194975329315465e-06, "loss": 0.06005372107028961, "memory(GiB)": 27.53, "step": 1055, "token_acc": 0.9724950884086444, "train_speed(iter/s)": 0.732147 }, { "epoch": 0.7586206896551724, "grad_norm": 0.8783925771713257, "learning_rate": 1.510976775828887e-06, "loss": 0.06929872930049896, "memory(GiB)": 27.53, "step": 1056, "token_acc": 0.97265625, "train_speed(iter/s)": 0.732372 }, { "epoch": 0.7593390804597702, "grad_norm": 0.8660690784454346, "learning_rate": 1.5024757220952368e-06, "loss": 0.07088316977024078, "memory(GiB)": 27.53, "step": 1057, "token_acc": 0.9678714859437751, "train_speed(iter/s)": 0.732598 }, { "epoch": 0.7600574712643678, "grad_norm": 1.5405594110488892, "learning_rate": 1.493994419738129e-06, "loss": 0.06411121040582657, "memory(GiB)": 27.53, "step": 1058, "token_acc": 0.9697624190064795, "train_speed(iter/s)": 0.732824 }, { "epoch": 0.7607758620689655, "grad_norm": 0.9672781229019165, "learning_rate": 1.4855329166535577e-06, "loss": 0.0711001455783844, "memory(GiB)": 27.53, "step": 1059, "token_acc": 0.969758064516129, "train_speed(iter/s)": 0.733047 }, { "epoch": 0.7614942528735632, "grad_norm": 0.8302735090255737, "learning_rate": 1.4770912606257003e-06, "loss": 0.061911702156066895, "memory(GiB)": 27.53, "step": 1060, "token_acc": 0.9813874788494078, "train_speed(iter/s)": 0.733271 }, { "epoch": 0.7622126436781609, "grad_norm": 1.2248096466064453, "learning_rate": 1.4686694993266598e-06, "loss": 0.060176782310009, "memory(GiB)": 27.53, "step": 1061, "token_acc": 0.9818913480885312, "train_speed(iter/s)": 0.733494 }, { "epoch": 0.7629310344827587, "grad_norm": 0.8550724983215332, "learning_rate": 1.4602676803161842e-06, "loss": 0.0658993199467659, "memory(GiB)": 27.53, "step": 1062, "token_acc": 0.9783464566929134, "train_speed(iter/s)": 0.733717 }, { "epoch": 0.7636494252873564, "grad_norm": 0.8928701877593994, "learning_rate": 1.4518858510414047e-06, "loss": 0.06941978633403778, "memory(GiB)": 27.53, "step": 1063, "token_acc": 0.9709302325581395, "train_speed(iter/s)": 0.73394 }, { "epoch": 0.764367816091954, "grad_norm": 0.799699068069458, "learning_rate": 1.4435240588365645e-06, "loss": 0.06068361550569534, "memory(GiB)": 27.53, "step": 1064, "token_acc": 0.9846743295019157, "train_speed(iter/s)": 0.734162 }, { "epoch": 0.7650862068965517, "grad_norm": 0.860525906085968, "learning_rate": 1.4351823509227553e-06, "loss": 0.08098623901605606, "memory(GiB)": 27.53, "step": 1065, "token_acc": 0.9658886894075404, "train_speed(iter/s)": 0.734384 }, { "epoch": 0.7658045977011494, "grad_norm": 0.9887138605117798, "learning_rate": 1.4268607744076419e-06, "loss": 0.06658250093460083, "memory(GiB)": 27.53, "step": 1066, "token_acc": 0.9781818181818182, "train_speed(iter/s)": 0.734607 }, { "epoch": 0.7665229885057471, "grad_norm": 0.7189052104949951, "learning_rate": 1.4185593762852069e-06, "loss": 0.06160145252943039, "memory(GiB)": 27.53, "step": 1067, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.734828 }, { "epoch": 0.7672413793103449, "grad_norm": 0.8860306739807129, "learning_rate": 1.41027820343548e-06, "loss": 0.06241428479552269, "memory(GiB)": 27.53, "step": 1068, "token_acc": 0.9751332149200711, "train_speed(iter/s)": 0.735051 }, { "epoch": 0.7679597701149425, "grad_norm": 1.2726569175720215, "learning_rate": 1.402017302624274e-06, "loss": 0.07558304071426392, "memory(GiB)": 27.53, "step": 1069, "token_acc": 0.9637404580152672, "train_speed(iter/s)": 0.735154 }, { "epoch": 0.7686781609195402, "grad_norm": 0.7346631288528442, "learning_rate": 1.3937767205029196e-06, "loss": 0.06206774711608887, "memory(GiB)": 27.53, "step": 1070, "token_acc": 0.9813084112149533, "train_speed(iter/s)": 0.735373 }, { "epoch": 0.7693965517241379, "grad_norm": 0.9243918657302856, "learning_rate": 1.3855565036080015e-06, "loss": 0.06939692795276642, "memory(GiB)": 27.53, "step": 1071, "token_acc": 0.9659735349716446, "train_speed(iter/s)": 0.735594 }, { "epoch": 0.7701149425287356, "grad_norm": 0.7735506892204285, "learning_rate": 1.3773566983610992e-06, "loss": 0.056131474673748016, "memory(GiB)": 27.53, "step": 1072, "token_acc": 0.9798165137614679, "train_speed(iter/s)": 0.735816 }, { "epoch": 0.7708333333333334, "grad_norm": 0.740021824836731, "learning_rate": 1.369177351068523e-06, "loss": 0.06647099554538727, "memory(GiB)": 27.53, "step": 1073, "token_acc": 0.9830508474576272, "train_speed(iter/s)": 0.736033 }, { "epoch": 0.771551724137931, "grad_norm": 1.0878987312316895, "learning_rate": 1.3610185079210514e-06, "loss": 0.060843415558338165, "memory(GiB)": 27.53, "step": 1074, "token_acc": 0.9702048417132216, "train_speed(iter/s)": 0.736251 }, { "epoch": 0.7722701149425287, "grad_norm": 0.9356815814971924, "learning_rate": 1.3528802149936688e-06, "loss": 0.05744868889451027, "memory(GiB)": 27.53, "step": 1075, "token_acc": 0.9683544303797469, "train_speed(iter/s)": 0.736461 }, { "epoch": 0.7729885057471264, "grad_norm": 1.2200024127960205, "learning_rate": 1.34476251824531e-06, "loss": 0.07507891952991486, "memory(GiB)": 27.53, "step": 1076, "token_acc": 0.9776422764227642, "train_speed(iter/s)": 0.736681 }, { "epoch": 0.7737068965517241, "grad_norm": 0.8535330891609192, "learning_rate": 1.3366654635185983e-06, "loss": 0.07292214035987854, "memory(GiB)": 27.53, "step": 1077, "token_acc": 0.9731800766283525, "train_speed(iter/s)": 0.736898 }, { "epoch": 0.7744252873563219, "grad_norm": 0.8664982318878174, "learning_rate": 1.3285890965395853e-06, "loss": 0.06771373748779297, "memory(GiB)": 27.53, "step": 1078, "token_acc": 0.9647495361781077, "train_speed(iter/s)": 0.737117 }, { "epoch": 0.7751436781609196, "grad_norm": 0.7551286816596985, "learning_rate": 1.3205334629174937e-06, "loss": 0.06171279028058052, "memory(GiB)": 27.53, "step": 1079, "token_acc": 0.9738430583501007, "train_speed(iter/s)": 0.737335 }, { "epoch": 0.7758620689655172, "grad_norm": 0.9341418743133545, "learning_rate": 1.3124986081444625e-06, "loss": 0.06472261995077133, "memory(GiB)": 27.53, "step": 1080, "token_acc": 0.9814126394052045, "train_speed(iter/s)": 0.737552 }, { "epoch": 0.7765804597701149, "grad_norm": 0.8532360196113586, "learning_rate": 1.3044845775952814e-06, "loss": 0.05375043302774429, "memory(GiB)": 27.53, "step": 1081, "token_acc": 0.9751908396946565, "train_speed(iter/s)": 0.737771 }, { "epoch": 0.7772988505747126, "grad_norm": 0.8796221613883972, "learning_rate": 1.296491416527147e-06, "loss": 0.057583458721637726, "memory(GiB)": 27.53, "step": 1082, "token_acc": 0.9754601226993865, "train_speed(iter/s)": 0.737974 }, { "epoch": 0.7780172413793104, "grad_norm": 0.9244908094406128, "learning_rate": 1.288519170079398e-06, "loss": 0.0699407234787941, "memory(GiB)": 27.53, "step": 1083, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.738176 }, { "epoch": 0.7787356321839081, "grad_norm": 1.0184824466705322, "learning_rate": 1.2805678832732627e-06, "loss": 0.06631313264369965, "memory(GiB)": 27.53, "step": 1084, "token_acc": 0.9794050343249427, "train_speed(iter/s)": 0.738338 }, { "epoch": 0.7794540229885057, "grad_norm": 0.8213652968406677, "learning_rate": 1.2726376010116082e-06, "loss": 0.060121290385723114, "memory(GiB)": 27.53, "step": 1085, "token_acc": 0.9825242718446602, "train_speed(iter/s)": 0.738495 }, { "epoch": 0.7801724137931034, "grad_norm": 0.9579296708106995, "learning_rate": 1.264728368078678e-06, "loss": 0.06418740749359131, "memory(GiB)": 27.53, "step": 1086, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.738694 }, { "epoch": 0.7808908045977011, "grad_norm": 1.043710708618164, "learning_rate": 1.2568402291398501e-06, "loss": 0.06634148955345154, "memory(GiB)": 27.53, "step": 1087, "token_acc": 0.961456102783726, "train_speed(iter/s)": 0.738911 }, { "epoch": 0.7816091954022989, "grad_norm": 1.3617234230041504, "learning_rate": 1.248973228741378e-06, "loss": 0.06447525322437286, "memory(GiB)": 27.53, "step": 1088, "token_acc": 0.9771784232365145, "train_speed(iter/s)": 0.739125 }, { "epoch": 0.7823275862068966, "grad_norm": 1.207967758178711, "learning_rate": 1.2411274113101418e-06, "loss": 0.07293020933866501, "memory(GiB)": 27.53, "step": 1089, "token_acc": 0.9701789264413518, "train_speed(iter/s)": 0.739292 }, { "epoch": 0.7830459770114943, "grad_norm": 0.9223197102546692, "learning_rate": 1.2333028211533916e-06, "loss": 0.05626610666513443, "memory(GiB)": 27.53, "step": 1090, "token_acc": 0.9782608695652174, "train_speed(iter/s)": 0.739429 }, { "epoch": 0.7837643678160919, "grad_norm": 1.0617729425430298, "learning_rate": 1.2254995024585064e-06, "loss": 0.057823069393634796, "memory(GiB)": 27.53, "step": 1091, "token_acc": 0.9758203799654577, "train_speed(iter/s)": 0.739585 }, { "epoch": 0.7844827586206896, "grad_norm": 0.8646804094314575, "learning_rate": 1.21771749929274e-06, "loss": 0.06638962775468826, "memory(GiB)": 27.53, "step": 1092, "token_acc": 0.9667832167832168, "train_speed(iter/s)": 0.739781 }, { "epoch": 0.7852011494252874, "grad_norm": 1.138935923576355, "learning_rate": 1.2099568556029694e-06, "loss": 0.0645303875207901, "memory(GiB)": 27.53, "step": 1093, "token_acc": 0.983177570093458, "train_speed(iter/s)": 0.739992 }, { "epoch": 0.7859195402298851, "grad_norm": 0.978295087814331, "learning_rate": 1.2022176152154525e-06, "loss": 0.06063437461853027, "memory(GiB)": 27.53, "step": 1094, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.740205 }, { "epoch": 0.7866379310344828, "grad_norm": 0.948611855506897, "learning_rate": 1.1944998218355762e-06, "loss": 0.0647997111082077, "memory(GiB)": 27.53, "step": 1095, "token_acc": 0.9753086419753086, "train_speed(iter/s)": 0.74042 }, { "epoch": 0.7873563218390804, "grad_norm": 1.0813990831375122, "learning_rate": 1.1868035190476085e-06, "loss": 0.058523524552583694, "memory(GiB)": 27.53, "step": 1096, "token_acc": 0.9809885931558935, "train_speed(iter/s)": 0.740634 }, { "epoch": 0.7880747126436781, "grad_norm": 1.109010934829712, "learning_rate": 1.1791287503144582e-06, "loss": 0.07644788920879364, "memory(GiB)": 27.53, "step": 1097, "token_acc": 0.9758064516129032, "train_speed(iter/s)": 0.740844 }, { "epoch": 0.7887931034482759, "grad_norm": 1.329890489578247, "learning_rate": 1.1714755589774252e-06, "loss": 0.06804901361465454, "memory(GiB)": 27.53, "step": 1098, "token_acc": 0.9757462686567164, "train_speed(iter/s)": 0.741055 }, { "epoch": 0.7895114942528736, "grad_norm": 0.8452627062797546, "learning_rate": 1.1638439882559554e-06, "loss": 0.06748746335506439, "memory(GiB)": 27.53, "step": 1099, "token_acc": 0.9696356275303644, "train_speed(iter/s)": 0.741267 }, { "epoch": 0.7902298850574713, "grad_norm": 1.2553056478500366, "learning_rate": 1.1562340812474004e-06, "loss": 0.06922227144241333, "memory(GiB)": 27.53, "step": 1100, "token_acc": 0.9724919093851133, "train_speed(iter/s)": 0.741481 }, { "epoch": 0.7902298850574713, "eval_loss": 0.0602780357003212, "eval_runtime": 6.0557, "eval_samples_per_second": 74.311, "eval_steps_per_second": 2.477, "eval_token_acc": 0.9760083666333667, "step": 1100 }, { "epoch": 0.790948275862069, "grad_norm": 0.9310344457626343, "learning_rate": 1.1486458809267664e-06, "loss": 0.0685189738869667, "memory(GiB)": 27.53, "step": 1101, "token_acc": 0.9752574878655144, "train_speed(iter/s)": 0.732078 }, { "epoch": 0.7916666666666666, "grad_norm": 0.966113269329071, "learning_rate": 1.1410794301464817e-06, "loss": 0.06058147922158241, "memory(GiB)": 27.53, "step": 1102, "token_acc": 0.9805996472663139, "train_speed(iter/s)": 0.732233 }, { "epoch": 0.7923850574712644, "grad_norm": 1.177695631980896, "learning_rate": 1.1335347716361479e-06, "loss": 0.06445039808750153, "memory(GiB)": 27.53, "step": 1103, "token_acc": 0.983271375464684, "train_speed(iter/s)": 0.73245 }, { "epoch": 0.7931034482758621, "grad_norm": 0.8376126289367676, "learning_rate": 1.1260119480023008e-06, "loss": 0.06240818277001381, "memory(GiB)": 27.53, "step": 1104, "token_acc": 0.9705882352941176, "train_speed(iter/s)": 0.732667 }, { "epoch": 0.7938218390804598, "grad_norm": 0.9004363417625427, "learning_rate": 1.1185110017281664e-06, "loss": 0.05705379694700241, "memory(GiB)": 27.53, "step": 1105, "token_acc": 0.9718804920913884, "train_speed(iter/s)": 0.732846 }, { "epoch": 0.7945402298850575, "grad_norm": 1.18400239944458, "learning_rate": 1.1110319751734271e-06, "loss": 0.05766066908836365, "memory(GiB)": 27.53, "step": 1106, "token_acc": 0.968944099378882, "train_speed(iter/s)": 0.732989 }, { "epoch": 0.7952586206896551, "grad_norm": 1.017158031463623, "learning_rate": 1.1035749105739791e-06, "loss": 0.05735167860984802, "memory(GiB)": 27.53, "step": 1107, "token_acc": 0.9792099792099792, "train_speed(iter/s)": 0.733171 }, { "epoch": 0.7959770114942529, "grad_norm": 0.7798944711685181, "learning_rate": 1.0961398500416926e-06, "loss": 0.06587515771389008, "memory(GiB)": 27.53, "step": 1108, "token_acc": 0.9713656387665198, "train_speed(iter/s)": 0.733351 }, { "epoch": 0.7966954022988506, "grad_norm": 1.1261645555496216, "learning_rate": 1.0887268355641768e-06, "loss": 0.07397356629371643, "memory(GiB)": 27.53, "step": 1109, "token_acc": 0.975095785440613, "train_speed(iter/s)": 0.733527 }, { "epoch": 0.7974137931034483, "grad_norm": 0.9878758788108826, "learning_rate": 1.0813359090045412e-06, "loss": 0.06216038390994072, "memory(GiB)": 27.53, "step": 1110, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.733719 }, { "epoch": 0.798132183908046, "grad_norm": 0.8857271075248718, "learning_rate": 1.0739671121011568e-06, "loss": 0.06646867096424103, "memory(GiB)": 27.53, "step": 1111, "token_acc": 0.9725738396624473, "train_speed(iter/s)": 0.733934 }, { "epoch": 0.7988505747126436, "grad_norm": 2.7538540363311768, "learning_rate": 1.0666204864674263e-06, "loss": 0.061214983463287354, "memory(GiB)": 27.53, "step": 1112, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.734149 }, { "epoch": 0.7995689655172413, "grad_norm": 1.0783498287200928, "learning_rate": 1.0592960735915447e-06, "loss": 0.058804914355278015, "memory(GiB)": 27.53, "step": 1113, "token_acc": 0.9803571428571428, "train_speed(iter/s)": 0.734353 }, { "epoch": 0.8002873563218391, "grad_norm": 1.07695734500885, "learning_rate": 1.0519939148362667e-06, "loss": 0.06513554602861404, "memory(GiB)": 27.53, "step": 1114, "token_acc": 0.9622266401590457, "train_speed(iter/s)": 0.734562 }, { "epoch": 0.8010057471264368, "grad_norm": 0.9451240301132202, "learning_rate": 1.0447140514386728e-06, "loss": 0.0637657642364502, "memory(GiB)": 27.53, "step": 1115, "token_acc": 0.959915611814346, "train_speed(iter/s)": 0.734777 }, { "epoch": 0.8017241379310345, "grad_norm": 0.8732826709747314, "learning_rate": 1.0374565245099328e-06, "loss": 0.06785064935684204, "memory(GiB)": 27.53, "step": 1116, "token_acc": 0.9744094488188977, "train_speed(iter/s)": 0.73499 }, { "epoch": 0.8024425287356322, "grad_norm": 1.222367525100708, "learning_rate": 1.0302213750350797e-06, "loss": 0.06828437745571136, "memory(GiB)": 27.53, "step": 1117, "token_acc": 0.9681274900398407, "train_speed(iter/s)": 0.735204 }, { "epoch": 0.8031609195402298, "grad_norm": 1.1875789165496826, "learning_rate": 1.0230086438727771e-06, "loss": 0.061946235597133636, "memory(GiB)": 27.53, "step": 1118, "token_acc": 0.9712643678160919, "train_speed(iter/s)": 0.735416 }, { "epoch": 0.8038793103448276, "grad_norm": 0.7366891503334045, "learning_rate": 1.015818371755085e-06, "loss": 0.05584091693162918, "memory(GiB)": 27.53, "step": 1119, "token_acc": 0.9792899408284024, "train_speed(iter/s)": 0.735628 }, { "epoch": 0.8045977011494253, "grad_norm": 0.8571769595146179, "learning_rate": 1.0086505992872304e-06, "loss": 0.06919243931770325, "memory(GiB)": 27.53, "step": 1120, "token_acc": 0.9772727272727273, "train_speed(iter/s)": 0.735841 }, { "epoch": 0.805316091954023, "grad_norm": 2.2154505252838135, "learning_rate": 1.0015053669473806e-06, "loss": 0.06767885386943817, "memory(GiB)": 27.53, "step": 1121, "token_acc": 0.9803220035778175, "train_speed(iter/s)": 0.736046 }, { "epoch": 0.8060344827586207, "grad_norm": 1.0961288213729858, "learning_rate": 9.943827150864143e-07, "loss": 0.05914837494492531, "memory(GiB)": 27.53, "step": 1122, "token_acc": 0.9819168173598554, "train_speed(iter/s)": 0.736261 }, { "epoch": 0.8067528735632183, "grad_norm": 1.135534405708313, "learning_rate": 9.87282683927691e-07, "loss": 0.06583724915981293, "memory(GiB)": 27.53, "step": 1123, "token_acc": 0.9650924024640657, "train_speed(iter/s)": 0.736469 }, { "epoch": 0.8074712643678161, "grad_norm": 0.73138827085495, "learning_rate": 9.80205313566827e-07, "loss": 0.0608791708946228, "memory(GiB)": 27.53, "step": 1124, "token_acc": 0.9806949806949807, "train_speed(iter/s)": 0.736681 }, { "epoch": 0.8081896551724138, "grad_norm": 0.8352106213569641, "learning_rate": 9.731506439714672e-07, "loss": 0.05746348202228546, "memory(GiB)": 27.53, "step": 1125, "token_acc": 0.9753694581280788, "train_speed(iter/s)": 0.736876 }, { "epoch": 0.8089080459770115, "grad_norm": 0.829933226108551, "learning_rate": 9.66118714981058e-07, "loss": 0.06432738900184631, "memory(GiB)": 27.53, "step": 1126, "token_acc": 0.9789674952198852, "train_speed(iter/s)": 0.737065 }, { "epoch": 0.8096264367816092, "grad_norm": 2.4665262699127197, "learning_rate": 9.591095663066253e-07, "loss": 0.05785127729177475, "memory(GiB)": 27.53, "step": 1127, "token_acc": 0.9718076285240465, "train_speed(iter/s)": 0.737274 }, { "epoch": 0.8103448275862069, "grad_norm": 0.8788185715675354, "learning_rate": 9.521232375305494e-07, "loss": 0.06040353700518608, "memory(GiB)": 27.53, "step": 1128, "token_acc": 0.9878787878787879, "train_speed(iter/s)": 0.737469 }, { "epoch": 0.8110632183908046, "grad_norm": 0.8666103482246399, "learning_rate": 9.451597681063412e-07, "loss": 0.06088443845510483, "memory(GiB)": 27.53, "step": 1129, "token_acc": 0.982, "train_speed(iter/s)": 0.737679 }, { "epoch": 0.8117816091954023, "grad_norm": 0.9043155908584595, "learning_rate": 9.382191973584193e-07, "loss": 0.0699068158864975, "memory(GiB)": 27.53, "step": 1130, "token_acc": 0.9654036243822076, "train_speed(iter/s)": 0.737889 }, { "epoch": 0.8125, "grad_norm": 0.9043624997138977, "learning_rate": 9.313015644818851e-07, "loss": 0.06146276742219925, "memory(GiB)": 27.53, "step": 1131, "token_acc": 0.9782178217821782, "train_speed(iter/s)": 0.7381 }, { "epoch": 0.8132183908045977, "grad_norm": 0.9881229996681213, "learning_rate": 9.244069085423074e-07, "loss": 0.06280240416526794, "memory(GiB)": 27.53, "step": 1132, "token_acc": 0.966804979253112, "train_speed(iter/s)": 0.738309 }, { "epoch": 0.8139367816091954, "grad_norm": 0.6198493838310242, "learning_rate": 9.175352684754979e-07, "loss": 0.05480733513832092, "memory(GiB)": 27.53, "step": 1133, "token_acc": 0.9763313609467456, "train_speed(iter/s)": 0.738518 }, { "epoch": 0.8146551724137931, "grad_norm": 0.8474046587944031, "learning_rate": 9.106866830872929e-07, "loss": 0.06325137615203857, "memory(GiB)": 27.53, "step": 1134, "token_acc": 0.9720558882235529, "train_speed(iter/s)": 0.738727 }, { "epoch": 0.8153735632183908, "grad_norm": 1.0299392938613892, "learning_rate": 9.038611910533296e-07, "loss": 0.064010851085186, "memory(GiB)": 27.53, "step": 1135, "token_acc": 0.9769230769230769, "train_speed(iter/s)": 0.738937 }, { "epoch": 0.8160919540229885, "grad_norm": 1.0258629322052002, "learning_rate": 8.970588309188343e-07, "loss": 0.06662111729383469, "memory(GiB)": 27.53, "step": 1136, "token_acc": 0.9738430583501007, "train_speed(iter/s)": 0.739146 }, { "epoch": 0.8168103448275862, "grad_norm": 0.7691630125045776, "learning_rate": 8.902796410984027e-07, "loss": 0.05417942255735397, "memory(GiB)": 27.53, "step": 1137, "token_acc": 0.9717314487632509, "train_speed(iter/s)": 0.739334 }, { "epoch": 0.8175287356321839, "grad_norm": 0.7807369232177734, "learning_rate": 8.835236598757796e-07, "loss": 0.05261258780956268, "memory(GiB)": 27.53, "step": 1138, "token_acc": 0.9808027923211169, "train_speed(iter/s)": 0.739487 }, { "epoch": 0.8182471264367817, "grad_norm": 1.2055937051773071, "learning_rate": 8.767909254036472e-07, "loss": 0.06012888625264168, "memory(GiB)": 27.53, "step": 1139, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.739634 }, { "epoch": 0.8189655172413793, "grad_norm": 0.9839240908622742, "learning_rate": 8.70081475703406e-07, "loss": 0.061926163733005524, "memory(GiB)": 27.53, "step": 1140, "token_acc": 0.9739292364990689, "train_speed(iter/s)": 0.739815 }, { "epoch": 0.819683908045977, "grad_norm": 0.8166204690933228, "learning_rate": 8.633953486649632e-07, "loss": 0.06147310882806778, "memory(GiB)": 27.53, "step": 1141, "token_acc": 0.968421052631579, "train_speed(iter/s)": 0.740006 }, { "epoch": 0.8204022988505747, "grad_norm": 0.8148682713508606, "learning_rate": 8.567325820465156e-07, "loss": 0.05559694766998291, "memory(GiB)": 27.53, "step": 1142, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.740146 }, { "epoch": 0.8211206896551724, "grad_norm": 1.326007604598999, "learning_rate": 8.500932134743384e-07, "loss": 0.0626634806394577, "memory(GiB)": 27.53, "step": 1143, "token_acc": 0.9744597249508841, "train_speed(iter/s)": 0.740334 }, { "epoch": 0.8218390804597702, "grad_norm": 0.7195452451705933, "learning_rate": 8.434772804425734e-07, "loss": 0.05876833200454712, "memory(GiB)": 27.53, "step": 1144, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.740542 }, { "epoch": 0.8225574712643678, "grad_norm": 1.6010862588882446, "learning_rate": 8.368848203130159e-07, "loss": 0.0655226856470108, "memory(GiB)": 27.53, "step": 1145, "token_acc": 0.9735234215885947, "train_speed(iter/s)": 0.7407 }, { "epoch": 0.8232758620689655, "grad_norm": 0.9265581369400024, "learning_rate": 8.303158703149023e-07, "loss": 0.06240857392549515, "memory(GiB)": 27.53, "step": 1146, "token_acc": 0.9690909090909091, "train_speed(iter/s)": 0.740851 }, { "epoch": 0.8239942528735632, "grad_norm": 0.9456865787506104, "learning_rate": 8.237704675447028e-07, "loss": 0.04688794165849686, "memory(GiB)": 27.53, "step": 1147, "token_acc": 0.9850427350427351, "train_speed(iter/s)": 0.741004 }, { "epoch": 0.8247126436781609, "grad_norm": 0.9523940682411194, "learning_rate": 8.172486489659115e-07, "loss": 0.058565545827150345, "memory(GiB)": 27.53, "step": 1148, "token_acc": 0.9796334012219959, "train_speed(iter/s)": 0.741207 }, { "epoch": 0.8254310344827587, "grad_norm": 0.9443383812904358, "learning_rate": 8.107504514088365e-07, "loss": 0.06394673138856888, "memory(GiB)": 27.53, "step": 1149, "token_acc": 0.988, "train_speed(iter/s)": 0.74141 }, { "epoch": 0.8261494252873564, "grad_norm": 0.9836805462837219, "learning_rate": 8.042759115703891e-07, "loss": 0.0608956404030323, "memory(GiB)": 27.53, "step": 1150, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.741616 }, { "epoch": 0.8261494252873564, "eval_loss": 0.06049403175711632, "eval_runtime": 7.1737, "eval_samples_per_second": 62.729, "eval_steps_per_second": 2.091, "eval_token_acc": 0.9757586163836164, "step": 1150 }, { "epoch": 0.826867816091954, "grad_norm": 0.8555424809455872, "learning_rate": 7.978250660138837e-07, "loss": 0.06006840616464615, "memory(GiB)": 27.53, "step": 1151, "token_acc": 0.9762878168438267, "train_speed(iter/s)": 0.732133 }, { "epoch": 0.8275862068965517, "grad_norm": 0.8766492605209351, "learning_rate": 7.913979511688252e-07, "loss": 0.064631886780262, "memory(GiB)": 27.53, "step": 1152, "token_acc": 0.9808429118773946, "train_speed(iter/s)": 0.732322 }, { "epoch": 0.8283045977011494, "grad_norm": 1.008776068687439, "learning_rate": 7.849946033307066e-07, "loss": 0.056242458522319794, "memory(GiB)": 27.53, "step": 1153, "token_acc": 0.9720430107526882, "train_speed(iter/s)": 0.732479 }, { "epoch": 0.8290229885057471, "grad_norm": 0.9027056694030762, "learning_rate": 7.78615058660801e-07, "loss": 0.06753557175397873, "memory(GiB)": 27.53, "step": 1154, "token_acc": 0.9653179190751445, "train_speed(iter/s)": 0.732626 }, { "epoch": 0.8297413793103449, "grad_norm": 1.129051685333252, "learning_rate": 7.722593531859613e-07, "loss": 0.07091376185417175, "memory(GiB)": 27.53, "step": 1155, "token_acc": 0.9696312364425163, "train_speed(iter/s)": 0.732786 }, { "epoch": 0.8304597701149425, "grad_norm": 1.1257957220077515, "learning_rate": 7.659275227984142e-07, "loss": 0.06541141867637634, "memory(GiB)": 27.53, "step": 1156, "token_acc": 0.9644351464435147, "train_speed(iter/s)": 0.732925 }, { "epoch": 0.8311781609195402, "grad_norm": 0.8217150568962097, "learning_rate": 7.596196032555553e-07, "loss": 0.05764450877904892, "memory(GiB)": 27.53, "step": 1157, "token_acc": 0.9777365491651205, "train_speed(iter/s)": 0.733102 }, { "epoch": 0.8318965517241379, "grad_norm": 0.8348180055618286, "learning_rate": 7.533356301797523e-07, "loss": 0.0645107850432396, "memory(GiB)": 27.53, "step": 1158, "token_acc": 0.9659863945578231, "train_speed(iter/s)": 0.733306 }, { "epoch": 0.8326149425287356, "grad_norm": 1.062096118927002, "learning_rate": 7.470756390581412e-07, "loss": 0.061027683317661285, "memory(GiB)": 27.53, "step": 1159, "token_acc": 0.980836236933798, "train_speed(iter/s)": 0.733512 }, { "epoch": 0.8333333333333334, "grad_norm": 0.8153855800628662, "learning_rate": 7.408396652424271e-07, "loss": 0.059401340782642365, "memory(GiB)": 27.53, "step": 1160, "token_acc": 0.978494623655914, "train_speed(iter/s)": 0.733695 }, { "epoch": 0.834051724137931, "grad_norm": 1.9926213026046753, "learning_rate": 7.346277439486793e-07, "loss": 0.06306315213441849, "memory(GiB)": 27.53, "step": 1161, "token_acc": 0.9688995215311005, "train_speed(iter/s)": 0.733857 }, { "epoch": 0.8347701149425287, "grad_norm": 0.9178330302238464, "learning_rate": 7.28439910257141e-07, "loss": 0.06488300859928131, "memory(GiB)": 27.53, "step": 1162, "token_acc": 0.9753593429158111, "train_speed(iter/s)": 0.734023 }, { "epoch": 0.8354885057471264, "grad_norm": 0.9518449306488037, "learning_rate": 7.222761991120241e-07, "loss": 0.06367714703083038, "memory(GiB)": 27.53, "step": 1163, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.73418 }, { "epoch": 0.8362068965517241, "grad_norm": 1.0256438255310059, "learning_rate": 7.161366453213181e-07, "loss": 0.06149526685476303, "memory(GiB)": 27.53, "step": 1164, "token_acc": 0.9746376811594203, "train_speed(iter/s)": 0.734365 }, { "epoch": 0.8369252873563219, "grad_norm": 0.8647089004516602, "learning_rate": 7.10021283556585e-07, "loss": 0.061459969729185104, "memory(GiB)": 27.53, "step": 1165, "token_acc": 0.9817444219066938, "train_speed(iter/s)": 0.734569 }, { "epoch": 0.8376436781609196, "grad_norm": 1.167632818222046, "learning_rate": 7.03930148352771e-07, "loss": 0.05457035079598427, "memory(GiB)": 27.53, "step": 1166, "token_acc": 0.9779411764705882, "train_speed(iter/s)": 0.734768 }, { "epoch": 0.8383620689655172, "grad_norm": 0.7766239643096924, "learning_rate": 6.978632741080105e-07, "loss": 0.06211092323064804, "memory(GiB)": 27.53, "step": 1167, "token_acc": 0.9726495726495726, "train_speed(iter/s)": 0.734969 }, { "epoch": 0.8390804597701149, "grad_norm": 0.774737536907196, "learning_rate": 6.918206950834283e-07, "loss": 0.06451723724603653, "memory(GiB)": 27.53, "step": 1168, "token_acc": 0.9678068410462777, "train_speed(iter/s)": 0.73517 }, { "epoch": 0.8397988505747126, "grad_norm": 1.1929470300674438, "learning_rate": 6.858024454029482e-07, "loss": 0.06695246696472168, "memory(GiB)": 27.53, "step": 1169, "token_acc": 0.9692622950819673, "train_speed(iter/s)": 0.73537 }, { "epoch": 0.8405172413793104, "grad_norm": 0.9960474371910095, "learning_rate": 6.798085590531012e-07, "loss": 0.06451299041509628, "memory(GiB)": 27.53, "step": 1170, "token_acc": 0.9756637168141593, "train_speed(iter/s)": 0.735571 }, { "epoch": 0.8412356321839081, "grad_norm": 0.969942569732666, "learning_rate": 6.738390698828329e-07, "loss": 0.07123501598834991, "memory(GiB)": 27.53, "step": 1171, "token_acc": 0.9779559118236473, "train_speed(iter/s)": 0.735771 }, { "epoch": 0.8419540229885057, "grad_norm": 0.9232319593429565, "learning_rate": 6.678940116033095e-07, "loss": 0.0613936185836792, "memory(GiB)": 27.53, "step": 1172, "token_acc": 0.9632034632034632, "train_speed(iter/s)": 0.735969 }, { "epoch": 0.8426724137931034, "grad_norm": 0.8907992243766785, "learning_rate": 6.619734177877324e-07, "loss": 0.060066014528274536, "memory(GiB)": 27.53, "step": 1173, "token_acc": 0.9738955823293173, "train_speed(iter/s)": 0.736157 }, { "epoch": 0.8433908045977011, "grad_norm": 0.8520158529281616, "learning_rate": 6.560773218711458e-07, "loss": 0.061589814722537994, "memory(GiB)": 27.53, "step": 1174, "token_acc": 0.9831649831649831, "train_speed(iter/s)": 0.736355 }, { "epoch": 0.8441091954022989, "grad_norm": 2.6921842098236084, "learning_rate": 6.50205757150249e-07, "loss": 0.06938831508159637, "memory(GiB)": 27.53, "step": 1175, "token_acc": 0.9706457925636007, "train_speed(iter/s)": 0.736555 }, { "epoch": 0.8448275862068966, "grad_norm": 0.834119439125061, "learning_rate": 6.443587567832044e-07, "loss": 0.06207148730754852, "memory(GiB)": 27.53, "step": 1176, "token_acc": 0.9658886894075404, "train_speed(iter/s)": 0.736755 }, { "epoch": 0.8455459770114943, "grad_norm": 1.0323419570922852, "learning_rate": 6.385363537894568e-07, "loss": 0.06357185542583466, "memory(GiB)": 27.53, "step": 1177, "token_acc": 0.9865642994241842, "train_speed(iter/s)": 0.736951 }, { "epoch": 0.8462643678160919, "grad_norm": 1.7030569314956665, "learning_rate": 6.327385810495423e-07, "loss": 0.06190510466694832, "memory(GiB)": 27.53, "step": 1178, "token_acc": 0.9794392523364486, "train_speed(iter/s)": 0.737147 }, { "epoch": 0.8469827586206896, "grad_norm": 1.5382062196731567, "learning_rate": 6.269654713049051e-07, "loss": 0.05926360934972763, "memory(GiB)": 27.53, "step": 1179, "token_acc": 0.9780439121756487, "train_speed(iter/s)": 0.737345 }, { "epoch": 0.8477011494252874, "grad_norm": 1.0161781311035156, "learning_rate": 6.212170571577087e-07, "loss": 0.05069392919540405, "memory(GiB)": 27.53, "step": 1180, "token_acc": 0.9850746268656716, "train_speed(iter/s)": 0.737541 }, { "epoch": 0.8484195402298851, "grad_norm": 1.0050320625305176, "learning_rate": 6.154933710706573e-07, "loss": 0.06414249539375305, "memory(GiB)": 27.53, "step": 1181, "token_acc": 0.9789473684210527, "train_speed(iter/s)": 0.737726 }, { "epoch": 0.8491379310344828, "grad_norm": 1.1402781009674072, "learning_rate": 6.097944453668081e-07, "loss": 0.06061258167028427, "memory(GiB)": 27.53, "step": 1182, "token_acc": 0.9738317757009346, "train_speed(iter/s)": 0.737922 }, { "epoch": 0.8498563218390804, "grad_norm": 0.8242668509483337, "learning_rate": 6.04120312229391e-07, "loss": 0.06691639125347137, "memory(GiB)": 27.53, "step": 1183, "token_acc": 0.984375, "train_speed(iter/s)": 0.738119 }, { "epoch": 0.8505747126436781, "grad_norm": 0.9708530902862549, "learning_rate": 5.984710037016267e-07, "loss": 0.07632363587617874, "memory(GiB)": 27.53, "step": 1184, "token_acc": 0.9679245283018868, "train_speed(iter/s)": 0.738316 }, { "epoch": 0.8512931034482759, "grad_norm": 0.9939351081848145, "learning_rate": 5.92846551686544e-07, "loss": 0.0564611442387104, "memory(GiB)": 27.53, "step": 1185, "token_acc": 0.9806678383128296, "train_speed(iter/s)": 0.738509 }, { "epoch": 0.8520114942528736, "grad_norm": 1.9591580629348755, "learning_rate": 5.872469879468024e-07, "loss": 0.06975357234477997, "memory(GiB)": 27.53, "step": 1186, "token_acc": 0.967479674796748, "train_speed(iter/s)": 0.738705 }, { "epoch": 0.8527298850574713, "grad_norm": 1.0492730140686035, "learning_rate": 5.816723441045085e-07, "loss": 0.06531441956758499, "memory(GiB)": 27.53, "step": 1187, "token_acc": 0.9644760213143873, "train_speed(iter/s)": 0.738902 }, { "epoch": 0.853448275862069, "grad_norm": 0.9370294213294983, "learning_rate": 5.761226516410434e-07, "loss": 0.0555318221449852, "memory(GiB)": 27.53, "step": 1188, "token_acc": 0.9841897233201581, "train_speed(iter/s)": 0.739098 }, { "epoch": 0.8541666666666666, "grad_norm": 0.8762693405151367, "learning_rate": 5.705979418968799e-07, "loss": 0.06147278845310211, "memory(GiB)": 27.53, "step": 1189, "token_acc": 0.979757085020243, "train_speed(iter/s)": 0.739283 }, { "epoch": 0.8548850574712644, "grad_norm": 0.8018141388893127, "learning_rate": 5.650982460714083e-07, "loss": 0.0700703114271164, "memory(GiB)": 27.53, "step": 1190, "token_acc": 0.98046875, "train_speed(iter/s)": 0.739434 }, { "epoch": 0.8556034482758621, "grad_norm": 0.8933513760566711, "learning_rate": 5.596235952227569e-07, "loss": 0.0644611269235611, "memory(GiB)": 27.53, "step": 1191, "token_acc": 0.969551282051282, "train_speed(iter/s)": 0.739547 }, { "epoch": 0.8563218390804598, "grad_norm": 0.8305102586746216, "learning_rate": 5.54174020267621e-07, "loss": 0.06323353946208954, "memory(GiB)": 27.53, "step": 1192, "token_acc": 0.9804618117229129, "train_speed(iter/s)": 0.739666 }, { "epoch": 0.8570402298850575, "grad_norm": 1.0223325490951538, "learning_rate": 5.487495519810853e-07, "loss": 0.06272368133068085, "memory(GiB)": 27.53, "step": 1193, "token_acc": 0.9775474956822107, "train_speed(iter/s)": 0.739847 }, { "epoch": 0.8577586206896551, "grad_norm": 0.7861616611480713, "learning_rate": 5.433502209964531e-07, "loss": 0.06440359354019165, "memory(GiB)": 27.53, "step": 1194, "token_acc": 0.9719298245614035, "train_speed(iter/s)": 0.740041 }, { "epoch": 0.8584770114942529, "grad_norm": 0.95345538854599, "learning_rate": 5.379760578050669e-07, "loss": 0.05991306155920029, "memory(GiB)": 27.53, "step": 1195, "token_acc": 0.9789103690685413, "train_speed(iter/s)": 0.740235 }, { "epoch": 0.8591954022988506, "grad_norm": 0.7367057204246521, "learning_rate": 5.326270927561444e-07, "loss": 0.06630540639162064, "memory(GiB)": 27.53, "step": 1196, "token_acc": 0.9679245283018868, "train_speed(iter/s)": 0.74043 }, { "epoch": 0.8599137931034483, "grad_norm": 0.8215988874435425, "learning_rate": 5.273033560566015e-07, "loss": 0.06189655140042305, "memory(GiB)": 27.53, "step": 1197, "token_acc": 0.9795597484276729, "train_speed(iter/s)": 0.740626 }, { "epoch": 0.860632183908046, "grad_norm": 0.7549877762794495, "learning_rate": 5.22004877770883e-07, "loss": 0.05559377372264862, "memory(GiB)": 27.53, "step": 1198, "token_acc": 0.9825479930191972, "train_speed(iter/s)": 0.740812 }, { "epoch": 0.8613505747126436, "grad_norm": 0.9911515712738037, "learning_rate": 5.167316878207956e-07, "loss": 0.0596562996506691, "memory(GiB)": 27.53, "step": 1199, "token_acc": 0.9749518304431599, "train_speed(iter/s)": 0.740955 }, { "epoch": 0.8620689655172413, "grad_norm": 1.0757217407226562, "learning_rate": 5.114838159853336e-07, "loss": 0.06485432386398315, "memory(GiB)": 27.53, "step": 1200, "token_acc": 0.9658444022770398, "train_speed(iter/s)": 0.741099 }, { "epoch": 0.8620689655172413, "eval_loss": 0.05923636630177498, "eval_runtime": 6.9312, "eval_samples_per_second": 64.924, "eval_steps_per_second": 2.164, "eval_token_acc": 0.9764766483516484, "step": 1200 }, { "epoch": 0.8627873563218391, "grad_norm": 0.77975994348526, "learning_rate": 5.062612919005166e-07, "loss": 0.056971967220306396, "memory(GiB)": 27.53, "step": 1201, "token_acc": 0.9775227729800071, "train_speed(iter/s)": 0.732047 }, { "epoch": 0.8635057471264368, "grad_norm": 0.9517463445663452, "learning_rate": 5.010641450592158e-07, "loss": 0.064048632979393, "memory(GiB)": 27.53, "step": 1202, "token_acc": 0.9745222929936306, "train_speed(iter/s)": 0.732232 }, { "epoch": 0.8642241379310345, "grad_norm": 1.1465250253677368, "learning_rate": 4.958924048109937e-07, "loss": 0.058454856276512146, "memory(GiB)": 27.53, "step": 1203, "token_acc": 0.9844961240310077, "train_speed(iter/s)": 0.732426 }, { "epoch": 0.8649425287356322, "grad_norm": 1.638034462928772, "learning_rate": 4.907461003619346e-07, "loss": 0.056312303990125656, "memory(GiB)": 27.53, "step": 1204, "token_acc": 0.9776632302405498, "train_speed(iter/s)": 0.732602 }, { "epoch": 0.8656609195402298, "grad_norm": 0.8262189030647278, "learning_rate": 4.856252607744816e-07, "loss": 0.06050460413098335, "memory(GiB)": 27.53, "step": 1205, "token_acc": 0.9806201550387597, "train_speed(iter/s)": 0.732745 }, { "epoch": 0.8663793103448276, "grad_norm": 0.9084823131561279, "learning_rate": 4.805299149672682e-07, "loss": 0.05974102020263672, "memory(GiB)": 27.53, "step": 1206, "token_acc": 0.972457627118644, "train_speed(iter/s)": 0.732836 }, { "epoch": 0.8670977011494253, "grad_norm": 0.7485952377319336, "learning_rate": 4.754600917149621e-07, "loss": 0.060270585119724274, "memory(GiB)": 27.53, "step": 1207, "token_acc": 0.9800362976406534, "train_speed(iter/s)": 0.732979 }, { "epoch": 0.867816091954023, "grad_norm": 0.718009889125824, "learning_rate": 4.7041581964809733e-07, "loss": 0.056654684245586395, "memory(GiB)": 27.53, "step": 1208, "token_acc": 0.9686192468619247, "train_speed(iter/s)": 0.733174 }, { "epoch": 0.8685344827586207, "grad_norm": 0.7669220566749573, "learning_rate": 4.6539712725291507e-07, "loss": 0.053363144397735596, "memory(GiB)": 27.53, "step": 1209, "token_acc": 0.9745916515426497, "train_speed(iter/s)": 0.733369 }, { "epoch": 0.8692528735632183, "grad_norm": 1.1938389539718628, "learning_rate": 4.6040404287119924e-07, "loss": 0.06623540818691254, "memory(GiB)": 27.53, "step": 1210, "token_acc": 0.9603960396039604, "train_speed(iter/s)": 0.733565 }, { "epoch": 0.8699712643678161, "grad_norm": 0.736906886100769, "learning_rate": 4.5543659470012105e-07, "loss": 0.05583645775914192, "memory(GiB)": 27.53, "step": 1211, "token_acc": 0.9817850637522769, "train_speed(iter/s)": 0.733756 }, { "epoch": 0.8706896551724138, "grad_norm": 0.9487413763999939, "learning_rate": 4.504948107920781e-07, "loss": 0.0670657753944397, "memory(GiB)": 27.53, "step": 1212, "token_acc": 0.9867424242424242, "train_speed(iter/s)": 0.73395 }, { "epoch": 0.8714080459770115, "grad_norm": 1.3528481721878052, "learning_rate": 4.455787190545341e-07, "loss": 0.05982538312673569, "memory(GiB)": 27.53, "step": 1213, "token_acc": 0.9709401709401709, "train_speed(iter/s)": 0.734145 }, { "epoch": 0.8721264367816092, "grad_norm": 0.9580056071281433, "learning_rate": 4.4068834724986466e-07, "loss": 0.060927875339984894, "memory(GiB)": 27.53, "step": 1214, "token_acc": 0.9770992366412213, "train_speed(iter/s)": 0.734311 }, { "epoch": 0.8728448275862069, "grad_norm": 0.8393829464912415, "learning_rate": 4.358237229951967e-07, "loss": 0.054675690829753876, "memory(GiB)": 27.53, "step": 1215, "token_acc": 0.9729166666666667, "train_speed(iter/s)": 0.734458 }, { "epoch": 0.8735632183908046, "grad_norm": 1.006168246269226, "learning_rate": 4.309848737622568e-07, "loss": 0.060198381543159485, "memory(GiB)": 27.53, "step": 1216, "token_acc": 0.9763406940063092, "train_speed(iter/s)": 0.7346 }, { "epoch": 0.8742816091954023, "grad_norm": 0.883871853351593, "learning_rate": 4.2617182687721026e-07, "loss": 0.06357082724571228, "memory(GiB)": 27.53, "step": 1217, "token_acc": 0.9644268774703557, "train_speed(iter/s)": 0.734764 }, { "epoch": 0.875, "grad_norm": 0.7668992877006531, "learning_rate": 4.213846095205126e-07, "loss": 0.061284035444259644, "memory(GiB)": 27.53, "step": 1218, "token_acc": 0.9653284671532847, "train_speed(iter/s)": 0.734953 }, { "epoch": 0.8757183908045977, "grad_norm": 0.8512526154518127, "learning_rate": 4.1662324872675354e-07, "loss": 0.06336931139230728, "memory(GiB)": 27.53, "step": 1219, "token_acc": 0.9728033472803347, "train_speed(iter/s)": 0.735144 }, { "epoch": 0.8764367816091954, "grad_norm": 0.8747227787971497, "learning_rate": 4.1188777138450487e-07, "loss": 0.05642282962799072, "memory(GiB)": 27.53, "step": 1220, "token_acc": 0.9702970297029703, "train_speed(iter/s)": 0.735336 }, { "epoch": 0.8771551724137931, "grad_norm": 0.7609629034996033, "learning_rate": 4.071782042361655e-07, "loss": 0.054869066923856735, "memory(GiB)": 27.53, "step": 1221, "token_acc": 0.9764065335753176, "train_speed(iter/s)": 0.735528 }, { "epoch": 0.8778735632183908, "grad_norm": 0.7779321670532227, "learning_rate": 4.024945738778163e-07, "loss": 0.05740607902407646, "memory(GiB)": 27.53, "step": 1222, "token_acc": 0.9781312127236581, "train_speed(iter/s)": 0.735721 }, { "epoch": 0.8785919540229885, "grad_norm": 0.9035853147506714, "learning_rate": 3.9783690675906484e-07, "loss": 0.07089881598949432, "memory(GiB)": 27.53, "step": 1223, "token_acc": 0.9843444227005871, "train_speed(iter/s)": 0.735912 }, { "epoch": 0.8793103448275862, "grad_norm": 0.6317605376243591, "learning_rate": 3.9320522918289973e-07, "loss": 0.05423438549041748, "memory(GiB)": 27.53, "step": 1224, "token_acc": 0.9728033472803347, "train_speed(iter/s)": 0.736103 }, { "epoch": 0.8800287356321839, "grad_norm": 0.8873939514160156, "learning_rate": 3.885995673055376e-07, "loss": 0.06413359194993973, "memory(GiB)": 27.53, "step": 1225, "token_acc": 0.9790209790209791, "train_speed(iter/s)": 0.736277 }, { "epoch": 0.8807471264367817, "grad_norm": 0.7498215436935425, "learning_rate": 3.8401994713628044e-07, "loss": 0.05634697526693344, "memory(GiB)": 27.53, "step": 1226, "token_acc": 0.9812382739212008, "train_speed(iter/s)": 0.736469 }, { "epoch": 0.8814655172413793, "grad_norm": 1.839269995689392, "learning_rate": 3.79466394537365e-07, "loss": 0.05372678488492966, "memory(GiB)": 27.53, "step": 1227, "token_acc": 0.9793388429752066, "train_speed(iter/s)": 0.736662 }, { "epoch": 0.882183908045977, "grad_norm": 0.7894226908683777, "learning_rate": 3.7493893522381866e-07, "loss": 0.06510858237743378, "memory(GiB)": 27.53, "step": 1228, "token_acc": 0.9854014598540146, "train_speed(iter/s)": 0.736854 }, { "epoch": 0.8829022988505747, "grad_norm": 0.7895011901855469, "learning_rate": 3.704375947633132e-07, "loss": 0.05636715888977051, "memory(GiB)": 27.53, "step": 1229, "token_acc": 0.9658119658119658, "train_speed(iter/s)": 0.737036 }, { "epoch": 0.8836206896551724, "grad_norm": 1.6432998180389404, "learning_rate": 3.6596239857602136e-07, "loss": 0.0587158128619194, "memory(GiB)": 27.53, "step": 1230, "token_acc": 0.9796672828096118, "train_speed(iter/s)": 0.737224 }, { "epoch": 0.8843390804597702, "grad_norm": 1.0105592012405396, "learning_rate": 3.6151337193447323e-07, "loss": 0.05733828991651535, "memory(GiB)": 27.53, "step": 1231, "token_acc": 0.9707865168539326, "train_speed(iter/s)": 0.737414 }, { "epoch": 0.8850574712643678, "grad_norm": 0.7537611722946167, "learning_rate": 3.570905399634111e-07, "loss": 0.0548020601272583, "memory(GiB)": 27.53, "step": 1232, "token_acc": 0.97632058287796, "train_speed(iter/s)": 0.737485 }, { "epoch": 0.8857758620689655, "grad_norm": 0.9540972709655762, "learning_rate": 3.526939276396507e-07, "loss": 0.05054333806037903, "memory(GiB)": 27.53, "step": 1233, "token_acc": 0.9839285714285714, "train_speed(iter/s)": 0.737668 }, { "epoch": 0.8864942528735632, "grad_norm": 1.8226970434188843, "learning_rate": 3.483235597919404e-07, "loss": 0.06525330245494843, "memory(GiB)": 27.53, "step": 1234, "token_acc": 0.9683168316831683, "train_speed(iter/s)": 0.737861 }, { "epoch": 0.8872126436781609, "grad_norm": 0.8861536383628845, "learning_rate": 3.4397946110081793e-07, "loss": 0.0580095499753952, "memory(GiB)": 27.53, "step": 1235, "token_acc": 0.9753593429158111, "train_speed(iter/s)": 0.738053 }, { "epoch": 0.8879310344827587, "grad_norm": 1.169097661972046, "learning_rate": 3.396616560984711e-07, "loss": 0.06127054989337921, "memory(GiB)": 27.53, "step": 1236, "token_acc": 0.9774011299435028, "train_speed(iter/s)": 0.738246 }, { "epoch": 0.8886494252873564, "grad_norm": 0.8153477311134338, "learning_rate": 3.3537016916860455e-07, "loss": 0.06256256997585297, "memory(GiB)": 27.53, "step": 1237, "token_acc": 0.959409594095941, "train_speed(iter/s)": 0.738437 }, { "epoch": 0.889367816091954, "grad_norm": 1.680334448814392, "learning_rate": 3.31105024546296e-07, "loss": 0.06008529290556908, "memory(GiB)": 27.53, "step": 1238, "token_acc": 0.9732313575525813, "train_speed(iter/s)": 0.738622 }, { "epoch": 0.8900862068965517, "grad_norm": 1.6373730897903442, "learning_rate": 3.268662463178629e-07, "loss": 0.06018136069178581, "memory(GiB)": 27.53, "step": 1239, "token_acc": 0.9627659574468085, "train_speed(iter/s)": 0.73881 }, { "epoch": 0.8908045977011494, "grad_norm": 1.5795143842697144, "learning_rate": 3.226538584207228e-07, "loss": 0.06247794255614281, "memory(GiB)": 27.53, "step": 1240, "token_acc": 0.9690909090909091, "train_speed(iter/s)": 0.738999 }, { "epoch": 0.8915229885057471, "grad_norm": 1.0028352737426758, "learning_rate": 3.1846788464326315e-07, "loss": 0.06984129548072815, "memory(GiB)": 27.53, "step": 1241, "token_acc": 0.9742710120068611, "train_speed(iter/s)": 0.739174 }, { "epoch": 0.8922413793103449, "grad_norm": 0.8782671093940735, "learning_rate": 3.1430834862470395e-07, "loss": 0.048934560269117355, "memory(GiB)": 27.53, "step": 1242, "token_acc": 0.9929203539823008, "train_speed(iter/s)": 0.739299 }, { "epoch": 0.8929597701149425, "grad_norm": 0.8642504811286926, "learning_rate": 3.101752738549635e-07, "loss": 0.0636415034532547, "memory(GiB)": 27.53, "step": 1243, "token_acc": 0.9671772428884027, "train_speed(iter/s)": 0.739426 }, { "epoch": 0.8936781609195402, "grad_norm": 0.7848920822143555, "learning_rate": 3.0606868367452746e-07, "loss": 0.061694104224443436, "memory(GiB)": 27.53, "step": 1244, "token_acc": 0.9689213893967094, "train_speed(iter/s)": 0.739586 }, { "epoch": 0.8943965517241379, "grad_norm": 0.9380229115486145, "learning_rate": 3.0198860127431784e-07, "loss": 0.06426016986370087, "memory(GiB)": 27.53, "step": 1245, "token_acc": 0.9696969696969697, "train_speed(iter/s)": 0.739775 }, { "epoch": 0.8951149425287356, "grad_norm": 0.7808113098144531, "learning_rate": 2.9793504969555965e-07, "loss": 0.0645836740732193, "memory(GiB)": 27.53, "step": 1246, "token_acc": 0.9769094138543517, "train_speed(iter/s)": 0.739965 }, { "epoch": 0.8958333333333334, "grad_norm": 0.7816265225410461, "learning_rate": 2.9390805182965055e-07, "loss": 0.06212135776877403, "memory(GiB)": 27.53, "step": 1247, "token_acc": 0.9784313725490196, "train_speed(iter/s)": 0.740155 }, { "epoch": 0.896551724137931, "grad_norm": 0.9339035749435425, "learning_rate": 2.899076304180348e-07, "loss": 0.060722026973962784, "memory(GiB)": 27.53, "step": 1248, "token_acc": 0.9800995024875622, "train_speed(iter/s)": 0.740346 }, { "epoch": 0.8972701149425287, "grad_norm": 0.8073630928993225, "learning_rate": 2.8593380805207237e-07, "loss": 0.0638742595911026, "memory(GiB)": 27.53, "step": 1249, "token_acc": 0.983402489626556, "train_speed(iter/s)": 0.740534 }, { "epoch": 0.8979885057471264, "grad_norm": 1.051198959350586, "learning_rate": 2.819866071729127e-07, "loss": 0.06869731843471527, "memory(GiB)": 27.53, "step": 1250, "token_acc": 0.9735099337748344, "train_speed(iter/s)": 0.740724 }, { "epoch": 0.8979885057471264, "eval_loss": 0.058398567140102386, "eval_runtime": 6.102, "eval_samples_per_second": 73.746, "eval_steps_per_second": 2.458, "eval_token_acc": 0.977022977022977, "step": 1250 }, { "epoch": 0.8987068965517241, "grad_norm": 0.7977092266082764, "learning_rate": 2.7806605007136445e-07, "loss": 0.05325540155172348, "memory(GiB)": 27.53, "step": 1251, "token_acc": 0.978822434256458, "train_speed(iter/s)": 0.732623 }, { "epoch": 0.8994252873563219, "grad_norm": 1.0758461952209473, "learning_rate": 2.7417215888777493e-07, "loss": 0.06523766368627548, "memory(GiB)": 27.53, "step": 1252, "token_acc": 0.978688524590164, "train_speed(iter/s)": 0.732815 }, { "epoch": 0.9001436781609196, "grad_norm": 0.8811194896697998, "learning_rate": 2.7030495561190195e-07, "loss": 0.06441573053598404, "memory(GiB)": 27.53, "step": 1253, "token_acc": 0.9660194174757282, "train_speed(iter/s)": 0.733002 }, { "epoch": 0.9008620689655172, "grad_norm": 0.7800517082214355, "learning_rate": 2.6646446208279054e-07, "loss": 0.05558375269174576, "memory(GiB)": 27.53, "step": 1254, "token_acc": 0.9722703639514731, "train_speed(iter/s)": 0.733193 }, { "epoch": 0.9015804597701149, "grad_norm": 1.2402375936508179, "learning_rate": 2.6265069998864747e-07, "loss": 0.05886194109916687, "memory(GiB)": 27.53, "step": 1255, "token_acc": 0.9760589318600368, "train_speed(iter/s)": 0.733376 }, { "epoch": 0.9022988505747126, "grad_norm": 0.7730839848518372, "learning_rate": 2.5886369086672193e-07, "loss": 0.0604819729924202, "memory(GiB)": 27.53, "step": 1256, "token_acc": 0.9818913480885312, "train_speed(iter/s)": 0.733561 }, { "epoch": 0.9030172413793104, "grad_norm": 0.8705950379371643, "learning_rate": 2.551034561031823e-07, "loss": 0.05901094153523445, "memory(GiB)": 27.53, "step": 1257, "token_acc": 0.985480943738657, "train_speed(iter/s)": 0.733739 }, { "epoch": 0.9037356321839081, "grad_norm": 0.8001240491867065, "learning_rate": 2.513700169329963e-07, "loss": 0.05922878161072731, "memory(GiB)": 27.53, "step": 1258, "token_acc": 0.9707903780068728, "train_speed(iter/s)": 0.733875 }, { "epoch": 0.9044540229885057, "grad_norm": 1.331525206565857, "learning_rate": 2.476633944398088e-07, "loss": 0.06329378485679626, "memory(GiB)": 27.53, "step": 1259, "token_acc": 0.9840142095914742, "train_speed(iter/s)": 0.734017 }, { "epoch": 0.9051724137931034, "grad_norm": 0.8982879519462585, "learning_rate": 2.439836095558262e-07, "loss": 0.06487162411212921, "memory(GiB)": 27.53, "step": 1260, "token_acc": 0.9543568464730291, "train_speed(iter/s)": 0.734154 }, { "epoch": 0.9058908045977011, "grad_norm": 0.8225810527801514, "learning_rate": 2.4033068306169526e-07, "loss": 0.05531305819749832, "memory(GiB)": 27.53, "step": 1261, "token_acc": 0.9818548387096774, "train_speed(iter/s)": 0.7343 }, { "epoch": 0.9066091954022989, "grad_norm": 0.8299042582511902, "learning_rate": 2.3670463558638556e-07, "loss": 0.050242260098457336, "memory(GiB)": 27.53, "step": 1262, "token_acc": 0.9872495446265938, "train_speed(iter/s)": 0.734487 }, { "epoch": 0.9073275862068966, "grad_norm": 0.7936691045761108, "learning_rate": 2.3310548760707652e-07, "loss": 0.055468641221523285, "memory(GiB)": 27.53, "step": 1263, "token_acc": 0.975736568457539, "train_speed(iter/s)": 0.734667 }, { "epoch": 0.9080459770114943, "grad_norm": 0.7519497871398926, "learning_rate": 2.2953325944903848e-07, "loss": 0.056500159204006195, "memory(GiB)": 27.53, "step": 1264, "token_acc": 0.9893992932862191, "train_speed(iter/s)": 0.734851 }, { "epoch": 0.9087643678160919, "grad_norm": 1.3798526525497437, "learning_rate": 2.2598797128551953e-07, "loss": 0.058134254068136215, "memory(GiB)": 27.53, "step": 1265, "token_acc": 0.9689119170984456, "train_speed(iter/s)": 0.735021 }, { "epoch": 0.9094827586206896, "grad_norm": 0.9769287705421448, "learning_rate": 2.2246964313763053e-07, "loss": 0.06381552666425705, "memory(GiB)": 27.53, "step": 1266, "token_acc": 0.9768595041322314, "train_speed(iter/s)": 0.735201 }, { "epoch": 0.9102011494252874, "grad_norm": 0.8272403478622437, "learning_rate": 2.1897829487423139e-07, "loss": 0.06090264767408371, "memory(GiB)": 27.53, "step": 1267, "token_acc": 0.9650092081031307, "train_speed(iter/s)": 0.735379 }, { "epoch": 0.9109195402298851, "grad_norm": 1.0967930555343628, "learning_rate": 2.1551394621182277e-07, "loss": 0.056975144892930984, "memory(GiB)": 27.53, "step": 1268, "token_acc": 0.9887820512820513, "train_speed(iter/s)": 0.735557 }, { "epoch": 0.9116379310344828, "grad_norm": 1.2879087924957275, "learning_rate": 2.1207661671443003e-07, "loss": 0.05660143494606018, "memory(GiB)": 27.53, "step": 1269, "token_acc": 0.9757085020242915, "train_speed(iter/s)": 0.735735 }, { "epoch": 0.9123563218390804, "grad_norm": 0.9542869925498962, "learning_rate": 2.08666325793494e-07, "loss": 0.05737178027629852, "memory(GiB)": 27.53, "step": 1270, "token_acc": 0.9859154929577465, "train_speed(iter/s)": 0.735902 }, { "epoch": 0.9130747126436781, "grad_norm": 1.1606589555740356, "learning_rate": 2.052830927077637e-07, "loss": 0.06072818487882614, "memory(GiB)": 27.53, "step": 1271, "token_acc": 0.9729241877256317, "train_speed(iter/s)": 0.736056 }, { "epoch": 0.9137931034482759, "grad_norm": 1.0437860488891602, "learning_rate": 2.0192693656318597e-07, "loss": 0.06738600879907608, "memory(GiB)": 27.53, "step": 1272, "token_acc": 0.9801801801801802, "train_speed(iter/s)": 0.736211 }, { "epoch": 0.9145114942528736, "grad_norm": 1.048069715499878, "learning_rate": 1.9859787631279547e-07, "loss": 0.06182829663157463, "memory(GiB)": 27.53, "step": 1273, "token_acc": 0.9801587301587301, "train_speed(iter/s)": 0.736358 }, { "epoch": 0.9152298850574713, "grad_norm": 1.5536963939666748, "learning_rate": 1.9529593075661267e-07, "loss": 0.06614789366722107, "memory(GiB)": 27.53, "step": 1274, "token_acc": 0.9659574468085106, "train_speed(iter/s)": 0.736525 }, { "epoch": 0.915948275862069, "grad_norm": 0.8652864098548889, "learning_rate": 1.9202111854153217e-07, "loss": 0.06004370376467705, "memory(GiB)": 27.53, "step": 1275, "token_acc": 0.9639278557114228, "train_speed(iter/s)": 0.736698 }, { "epoch": 0.9166666666666666, "grad_norm": 1.7868647575378418, "learning_rate": 1.8877345816122162e-07, "loss": 0.06562146544456482, "memory(GiB)": 27.53, "step": 1276, "token_acc": 0.9725776965265083, "train_speed(iter/s)": 0.736877 }, { "epoch": 0.9173850574712644, "grad_norm": 1.0015677213668823, "learning_rate": 1.8555296795601364e-07, "loss": 0.061878688633441925, "memory(GiB)": 27.53, "step": 1277, "token_acc": 0.9753593429158111, "train_speed(iter/s)": 0.737061 }, { "epoch": 0.9181034482758621, "grad_norm": 0.9807928204536438, "learning_rate": 1.8235966611280687e-07, "loss": 0.06513012200593948, "memory(GiB)": 27.53, "step": 1278, "token_acc": 0.9645868465430016, "train_speed(iter/s)": 0.737243 }, { "epoch": 0.9188218390804598, "grad_norm": 0.8640570044517517, "learning_rate": 1.7919357066495836e-07, "loss": 0.05770382657647133, "memory(GiB)": 27.53, "step": 1279, "token_acc": 0.9797794117647058, "train_speed(iter/s)": 0.737426 }, { "epoch": 0.9195402298850575, "grad_norm": 0.9149330854415894, "learning_rate": 1.760546994921858e-07, "loss": 0.058014001697301865, "memory(GiB)": 27.53, "step": 1280, "token_acc": 0.9754098360655737, "train_speed(iter/s)": 0.73761 }, { "epoch": 0.9202586206896551, "grad_norm": 1.2631354331970215, "learning_rate": 1.7294307032046264e-07, "loss": 0.054404065012931824, "memory(GiB)": 27.53, "step": 1281, "token_acc": 0.9671361502347418, "train_speed(iter/s)": 0.737795 }, { "epoch": 0.9209770114942529, "grad_norm": 1.0816254615783691, "learning_rate": 1.6985870072192156e-07, "loss": 0.0670335441827774, "memory(GiB)": 27.53, "step": 1282, "token_acc": 0.9683168316831683, "train_speed(iter/s)": 0.73798 }, { "epoch": 0.9216954022988506, "grad_norm": 0.767786979675293, "learning_rate": 1.6680160811475332e-07, "loss": 0.059241876006126404, "memory(GiB)": 27.53, "step": 1283, "token_acc": 0.9823151125401929, "train_speed(iter/s)": 0.738163 }, { "epoch": 0.9224137931034483, "grad_norm": 0.6482214331626892, "learning_rate": 1.6377180976310968e-07, "loss": 0.05744243785738945, "memory(GiB)": 27.53, "step": 1284, "token_acc": 0.9825918762088974, "train_speed(iter/s)": 0.738346 }, { "epoch": 0.923132183908046, "grad_norm": 1.257673740386963, "learning_rate": 1.6076932277700352e-07, "loss": 0.05228473246097565, "memory(GiB)": 27.53, "step": 1285, "token_acc": 0.9798994974874372, "train_speed(iter/s)": 0.738525 }, { "epoch": 0.9238505747126436, "grad_norm": 0.7550585865974426, "learning_rate": 1.5779416411221437e-07, "loss": 0.06041378527879715, "memory(GiB)": 27.53, "step": 1286, "token_acc": 0.9657039711191335, "train_speed(iter/s)": 0.738709 }, { "epoch": 0.9245689655172413, "grad_norm": 0.8612072467803955, "learning_rate": 1.548463505701925e-07, "loss": 0.05853103846311569, "memory(GiB)": 27.53, "step": 1287, "token_acc": 0.9720430107526882, "train_speed(iter/s)": 0.738889 }, { "epoch": 0.9252873563218391, "grad_norm": 0.8939786553382874, "learning_rate": 1.5192589879796383e-07, "loss": 0.052785225212574005, "memory(GiB)": 27.53, "step": 1288, "token_acc": 0.9758364312267658, "train_speed(iter/s)": 0.73907 }, { "epoch": 0.9260057471264368, "grad_norm": 0.9680473208427429, "learning_rate": 1.4903282528803354e-07, "loss": 0.0600341260433197, "memory(GiB)": 27.53, "step": 1289, "token_acc": 0.9754716981132076, "train_speed(iter/s)": 0.73925 }, { "epoch": 0.9267241379310345, "grad_norm": 0.859541654586792, "learning_rate": 1.4616714637829822e-07, "loss": 0.056286148726940155, "memory(GiB)": 27.53, "step": 1290, "token_acc": 0.9635701275045537, "train_speed(iter/s)": 0.739435 }, { "epoch": 0.9274425287356322, "grad_norm": 0.8557973504066467, "learning_rate": 1.4332887825194818e-07, "loss": 0.06338023394346237, "memory(GiB)": 27.53, "step": 1291, "token_acc": 0.967984934086629, "train_speed(iter/s)": 0.73962 }, { "epoch": 0.9281609195402298, "grad_norm": 0.7078872323036194, "learning_rate": 1.4051803693737876e-07, "loss": 0.057185396552085876, "memory(GiB)": 27.53, "step": 1292, "token_acc": 0.9725490196078431, "train_speed(iter/s)": 0.739805 }, { "epoch": 0.9288793103448276, "grad_norm": 0.9672695398330688, "learning_rate": 1.3773463830809962e-07, "loss": 0.06533121317625046, "memory(GiB)": 27.53, "step": 1293, "token_acc": 0.9754098360655737, "train_speed(iter/s)": 0.739973 }, { "epoch": 0.9295977011494253, "grad_norm": 0.8326073288917542, "learning_rate": 1.3497869808264453e-07, "loss": 0.06311263889074326, "memory(GiB)": 27.53, "step": 1294, "token_acc": 0.974903474903475, "train_speed(iter/s)": 0.740104 }, { "epoch": 0.930316091954023, "grad_norm": 0.783066987991333, "learning_rate": 1.3225023182448392e-07, "loss": 0.0621393546462059, "memory(GiB)": 27.53, "step": 1295, "token_acc": 0.9781879194630873, "train_speed(iter/s)": 0.740244 }, { "epoch": 0.9310344827586207, "grad_norm": 1.3005826473236084, "learning_rate": 1.2954925494193472e-07, "loss": 0.05702809989452362, "memory(GiB)": 27.53, "step": 1296, "token_acc": 0.9805309734513274, "train_speed(iter/s)": 0.740388 }, { "epoch": 0.9317528735632183, "grad_norm": 1.3443634510040283, "learning_rate": 1.268757826880751e-07, "loss": 0.06281372904777527, "memory(GiB)": 27.53, "step": 1297, "token_acc": 0.9681908548707754, "train_speed(iter/s)": 0.740526 }, { "epoch": 0.9324712643678161, "grad_norm": 0.7878507971763611, "learning_rate": 1.2422983016065816e-07, "loss": 0.06288642436265945, "memory(GiB)": 27.53, "step": 1298, "token_acc": 0.9695238095238096, "train_speed(iter/s)": 0.740675 }, { "epoch": 0.9331896551724138, "grad_norm": 0.9865723848342896, "learning_rate": 1.2161141230202678e-07, "loss": 0.06864608079195023, "memory(GiB)": 27.53, "step": 1299, "token_acc": 0.9694501018329938, "train_speed(iter/s)": 0.740857 }, { "epoch": 0.9339080459770115, "grad_norm": 0.8490484356880188, "learning_rate": 1.1902054389902662e-07, "loss": 0.05831685662269592, "memory(GiB)": 27.53, "step": 1300, "token_acc": 0.9707903780068728, "train_speed(iter/s)": 0.741038 }, { "epoch": 0.9339080459770115, "eval_loss": 0.05767093598842621, "eval_runtime": 6.8648, "eval_samples_per_second": 65.552, "eval_steps_per_second": 2.185, "eval_token_acc": 0.9774288211788211, "step": 1300 }, { "epoch": 0.9346264367816092, "grad_norm": 0.7069272398948669, "learning_rate": 1.164572395829272e-07, "loss": 0.057239726185798645, "memory(GiB)": 27.53, "step": 1301, "token_acc": 0.9794883885417894, "train_speed(iter/s)": 0.732688 }, { "epoch": 0.9353448275862069, "grad_norm": 1.0305601358413696, "learning_rate": 1.1392151382933647e-07, "loss": 0.07142940163612366, "memory(GiB)": 27.53, "step": 1302, "token_acc": 0.976068376068376, "train_speed(iter/s)": 0.732848 }, { "epoch": 0.9360632183908046, "grad_norm": 1.0441395044326782, "learning_rate": 1.1141338095811804e-07, "loss": 0.0627642348408699, "memory(GiB)": 27.53, "step": 1303, "token_acc": 0.9672489082969432, "train_speed(iter/s)": 0.733034 }, { "epoch": 0.9367816091954023, "grad_norm": 0.9573545455932617, "learning_rate": 1.0893285513331353e-07, "loss": 0.05876416340470314, "memory(GiB)": 27.53, "step": 1304, "token_acc": 0.9769503546099291, "train_speed(iter/s)": 0.733216 }, { "epoch": 0.9375, "grad_norm": 0.8136374950408936, "learning_rate": 1.064799503630598e-07, "loss": 0.05297144129872322, "memory(GiB)": 27.53, "step": 1305, "token_acc": 0.9852507374631269, "train_speed(iter/s)": 0.733396 }, { "epoch": 0.9382183908045977, "grad_norm": 1.1049655675888062, "learning_rate": 1.0405468049951184e-07, "loss": 0.06218315288424492, "memory(GiB)": 27.53, "step": 1306, "token_acc": 0.9859402460456942, "train_speed(iter/s)": 0.733575 }, { "epoch": 0.9389367816091954, "grad_norm": 1.506242275238037, "learning_rate": 1.0165705923876113e-07, "loss": 0.054496388882398605, "memory(GiB)": 27.53, "step": 1307, "token_acc": 0.9851576994434137, "train_speed(iter/s)": 0.733759 }, { "epoch": 0.9396551724137931, "grad_norm": 0.7370707988739014, "learning_rate": 9.928710012076404e-08, "loss": 0.06542385369539261, "memory(GiB)": 27.53, "step": 1308, "token_acc": 0.9759519038076152, "train_speed(iter/s)": 0.73391 }, { "epoch": 0.9403735632183908, "grad_norm": 0.8851759433746338, "learning_rate": 9.694481652925913e-08, "loss": 0.05810582637786865, "memory(GiB)": 27.53, "step": 1309, "token_acc": 0.975, "train_speed(iter/s)": 0.734033 }, { "epoch": 0.9410919540229885, "grad_norm": 0.8321619629859924, "learning_rate": 9.463022169169666e-08, "loss": 0.054976917803287506, "memory(GiB)": 27.53, "step": 1310, "token_acc": 0.9836660617059891, "train_speed(iter/s)": 0.734208 }, { "epoch": 0.9418103448275862, "grad_norm": 0.8394550681114197, "learning_rate": 9.234332867916029e-08, "loss": 0.06433315575122833, "memory(GiB)": 27.53, "step": 1311, "token_acc": 0.9742063492063492, "train_speed(iter/s)": 0.734381 }, { "epoch": 0.9425287356321839, "grad_norm": 0.9813358187675476, "learning_rate": 9.008415040629548e-08, "loss": 0.0565367192029953, "memory(GiB)": 27.53, "step": 1312, "token_acc": 0.9745222929936306, "train_speed(iter/s)": 0.734436 }, { "epoch": 0.9432471264367817, "grad_norm": 0.7716550827026367, "learning_rate": 8.785269963123455e-08, "loss": 0.05793631821870804, "memory(GiB)": 27.53, "step": 1313, "token_acc": 0.978978978978979, "train_speed(iter/s)": 0.734614 }, { "epoch": 0.9439655172413793, "grad_norm": 1.0411561727523804, "learning_rate": 8.564898895552843e-08, "loss": 0.06137952581048012, "memory(GiB)": 27.53, "step": 1314, "token_acc": 0.9708333333333333, "train_speed(iter/s)": 0.734795 }, { "epoch": 0.944683908045977, "grad_norm": 0.8512791991233826, "learning_rate": 8.347303082406999e-08, "loss": 0.05875416845083237, "memory(GiB)": 27.53, "step": 1315, "token_acc": 0.9759450171821306, "train_speed(iter/s)": 0.734977 }, { "epoch": 0.9454022988505747, "grad_norm": 0.9030290246009827, "learning_rate": 8.132483752502806e-08, "loss": 0.06552864611148834, "memory(GiB)": 27.53, "step": 1316, "token_acc": 0.9720558882235529, "train_speed(iter/s)": 0.73516 }, { "epoch": 0.9461206896551724, "grad_norm": 0.7311325669288635, "learning_rate": 7.920442118977689e-08, "loss": 0.053215667605400085, "memory(GiB)": 27.53, "step": 1317, "token_acc": 0.9730250481695568, "train_speed(iter/s)": 0.735341 }, { "epoch": 0.9468390804597702, "grad_norm": 0.9692714214324951, "learning_rate": 7.711179379282674e-08, "loss": 0.05555614456534386, "memory(GiB)": 27.53, "step": 1318, "token_acc": 0.9763779527559056, "train_speed(iter/s)": 0.735523 }, { "epoch": 0.9475574712643678, "grad_norm": 0.810583233833313, "learning_rate": 7.50469671517573e-08, "loss": 0.06023557111620903, "memory(GiB)": 27.53, "step": 1319, "token_acc": 0.9837728194726166, "train_speed(iter/s)": 0.735705 }, { "epoch": 0.9482758620689655, "grad_norm": 0.8417066335678101, "learning_rate": 7.300995292715107e-08, "loss": 0.06228820979595184, "memory(GiB)": 27.53, "step": 1320, "token_acc": 0.9698275862068966, "train_speed(iter/s)": 0.735888 }, { "epoch": 0.9489942528735632, "grad_norm": 1.3338690996170044, "learning_rate": 7.10007626225262e-08, "loss": 0.06068476662039757, "memory(GiB)": 27.53, "step": 1321, "token_acc": 0.98, "train_speed(iter/s)": 0.736071 }, { "epoch": 0.9497126436781609, "grad_norm": 0.736926794052124, "learning_rate": 6.901940758427206e-08, "loss": 0.0573776476085186, "memory(GiB)": 27.53, "step": 1322, "token_acc": 0.9753787878787878, "train_speed(iter/s)": 0.736253 }, { "epoch": 0.9504310344827587, "grad_norm": 0.7601249814033508, "learning_rate": 6.706589900158655e-08, "loss": 0.05783964321017265, "memory(GiB)": 27.53, "step": 1323, "token_acc": 0.9786096256684492, "train_speed(iter/s)": 0.736435 }, { "epoch": 0.9511494252873564, "grad_norm": 1.0031280517578125, "learning_rate": 6.514024790641116e-08, "loss": 0.055593341588974, "memory(GiB)": 27.53, "step": 1324, "token_acc": 0.9929701230228472, "train_speed(iter/s)": 0.736595 }, { "epoch": 0.951867816091954, "grad_norm": 0.8535168170928955, "learning_rate": 6.324246517336985e-08, "loss": 0.059594884514808655, "memory(GiB)": 27.53, "step": 1325, "token_acc": 0.9655172413793104, "train_speed(iter/s)": 0.736726 }, { "epoch": 0.9525862068965517, "grad_norm": 0.8462489247322083, "learning_rate": 6.137256151970583e-08, "loss": 0.060249876230955124, "memory(GiB)": 27.53, "step": 1326, "token_acc": 0.9744680851063829, "train_speed(iter/s)": 0.736861 }, { "epoch": 0.9533045977011494, "grad_norm": 1.2636990547180176, "learning_rate": 5.95305475052238e-08, "loss": 0.05781608819961548, "memory(GiB)": 27.53, "step": 1327, "token_acc": 0.9824868651488616, "train_speed(iter/s)": 0.737025 }, { "epoch": 0.9540229885057471, "grad_norm": 0.889110267162323, "learning_rate": 5.771643353222778e-08, "loss": 0.05524569749832153, "memory(GiB)": 27.53, "step": 1328, "token_acc": 0.9718004338394793, "train_speed(iter/s)": 0.737205 }, { "epoch": 0.9547413793103449, "grad_norm": 0.988017737865448, "learning_rate": 5.5930229845464476e-08, "loss": 0.060014329850673676, "memory(GiB)": 27.53, "step": 1329, "token_acc": 0.9619771863117871, "train_speed(iter/s)": 0.737384 }, { "epoch": 0.9554597701149425, "grad_norm": 1.0546913146972656, "learning_rate": 5.417194653206337e-08, "loss": 0.05399401858448982, "memory(GiB)": 27.53, "step": 1330, "token_acc": 0.978984238178634, "train_speed(iter/s)": 0.737565 }, { "epoch": 0.9561781609195402, "grad_norm": 1.1142860651016235, "learning_rate": 5.244159352148059e-08, "loss": 0.07161542773246765, "memory(GiB)": 27.53, "step": 1331, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.737745 }, { "epoch": 0.9568965517241379, "grad_norm": 1.162352204322815, "learning_rate": 5.073918058544458e-08, "loss": 0.06970338523387909, "memory(GiB)": 27.53, "step": 1332, "token_acc": 0.9770867430441899, "train_speed(iter/s)": 0.737926 }, { "epoch": 0.9576149425287356, "grad_norm": 0.8481786847114563, "learning_rate": 4.906471733789775e-08, "loss": 0.06448915600776672, "memory(GiB)": 27.53, "step": 1333, "token_acc": 0.9773913043478261, "train_speed(iter/s)": 0.738104 }, { "epoch": 0.9583333333333334, "grad_norm": 1.1345182657241821, "learning_rate": 4.741821323494489e-08, "loss": 0.05117519199848175, "memory(GiB)": 27.53, "step": 1334, "token_acc": 0.9820143884892086, "train_speed(iter/s)": 0.738282 }, { "epoch": 0.959051724137931, "grad_norm": 0.9734980463981628, "learning_rate": 4.579967757479709e-08, "loss": 0.06262022256851196, "memory(GiB)": 27.53, "step": 1335, "token_acc": 0.9667359667359667, "train_speed(iter/s)": 0.73846 }, { "epoch": 0.9597701149425287, "grad_norm": 0.6853424310684204, "learning_rate": 4.4209119497722883e-08, "loss": 0.0505802296102047, "memory(GiB)": 27.53, "step": 1336, "token_acc": 0.9805996472663139, "train_speed(iter/s)": 0.738633 }, { "epoch": 0.9604885057471264, "grad_norm": 0.8033921718597412, "learning_rate": 4.264654798599277e-08, "loss": 0.05293594300746918, "memory(GiB)": 27.53, "step": 1337, "token_acc": 0.9711538461538461, "train_speed(iter/s)": 0.738808 }, { "epoch": 0.9612068965517241, "grad_norm": 1.5980924367904663, "learning_rate": 4.1111971863830866e-08, "loss": 0.05291295051574707, "memory(GiB)": 27.53, "step": 1338, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.738989 }, { "epoch": 0.9619252873563219, "grad_norm": 0.9982359409332275, "learning_rate": 3.960539979736444e-08, "loss": 0.05443057417869568, "memory(GiB)": 27.53, "step": 1339, "token_acc": 0.9801192842942346, "train_speed(iter/s)": 0.739166 }, { "epoch": 0.9626436781609196, "grad_norm": 0.8254309892654419, "learning_rate": 3.812684029457614e-08, "loss": 0.05976217985153198, "memory(GiB)": 27.53, "step": 1340, "token_acc": 0.9842381786339754, "train_speed(iter/s)": 0.739337 }, { "epoch": 0.9633620689655172, "grad_norm": 0.8409211039543152, "learning_rate": 3.667630170525238e-08, "loss": 0.055016301572322845, "memory(GiB)": 27.53, "step": 1341, "token_acc": 0.9716417910447761, "train_speed(iter/s)": 0.739513 }, { "epoch": 0.9640804597701149, "grad_norm": 0.8342769145965576, "learning_rate": 3.525379222094061e-08, "loss": 0.06828568875789642, "memory(GiB)": 27.53, "step": 1342, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.739687 }, { "epoch": 0.9647988505747126, "grad_norm": 0.8231976628303528, "learning_rate": 3.385931987490043e-08, "loss": 0.05468888580799103, "memory(GiB)": 27.53, "step": 1343, "token_acc": 0.9757914338919925, "train_speed(iter/s)": 0.739865 }, { "epoch": 0.9655172413793104, "grad_norm": 0.8693992495536804, "learning_rate": 3.249289254205867e-08, "loss": 0.0670827329158783, "memory(GiB)": 27.53, "step": 1344, "token_acc": 0.9700854700854701, "train_speed(iter/s)": 0.740035 }, { "epoch": 0.9662356321839081, "grad_norm": 0.9941868782043457, "learning_rate": 3.1154517938965514e-08, "loss": 0.07684467732906342, "memory(GiB)": 27.53, "step": 1345, "token_acc": 0.974559686888454, "train_speed(iter/s)": 0.740155 }, { "epoch": 0.9669540229885057, "grad_norm": 0.8865212798118591, "learning_rate": 2.984420362375007e-08, "loss": 0.06662784516811371, "memory(GiB)": 27.53, "step": 1346, "token_acc": 0.9734345351043643, "train_speed(iter/s)": 0.740309 }, { "epoch": 0.9676724137931034, "grad_norm": 0.8056420087814331, "learning_rate": 2.8561956996078778e-08, "loss": 0.05641065537929535, "memory(GiB)": 27.53, "step": 1347, "token_acc": 0.9711191335740073, "train_speed(iter/s)": 0.740461 }, { "epoch": 0.9683908045977011, "grad_norm": 0.8888630867004395, "learning_rate": 2.7307785297111533e-08, "loss": 0.06275849044322968, "memory(GiB)": 27.53, "step": 1348, "token_acc": 0.9731958762886598, "train_speed(iter/s)": 0.740615 }, { "epoch": 0.9691091954022989, "grad_norm": 0.9468866586685181, "learning_rate": 2.60816956094645e-08, "loss": 0.06882379949092865, "memory(GiB)": 27.53, "step": 1349, "token_acc": 0.9799599198396793, "train_speed(iter/s)": 0.740794 }, { "epoch": 0.9698275862068966, "grad_norm": 0.9178730249404907, "learning_rate": 2.488369485716513e-08, "loss": 0.057931914925575256, "memory(GiB)": 27.53, "step": 1350, "token_acc": 0.9744897959183674, "train_speed(iter/s)": 0.740963 }, { "epoch": 0.9698275862068966, "eval_loss": 0.05748336389660835, "eval_runtime": 6.0391, "eval_samples_per_second": 74.515, "eval_steps_per_second": 2.484, "eval_token_acc": 0.9775224775224776, "step": 1350 }, { "epoch": 0.9705459770114943, "grad_norm": 0.8806797862052917, "learning_rate": 2.371378980561889e-08, "loss": 0.05770386755466461, "memory(GiB)": 27.53, "step": 1351, "token_acc": 0.9784452296819788, "train_speed(iter/s)": 0.73334 }, { "epoch": 0.9712643678160919, "grad_norm": 1.1500838994979858, "learning_rate": 2.2571987061564827e-08, "loss": 0.06308402866125107, "memory(GiB)": 27.53, "step": 1352, "token_acc": 0.9736379613356766, "train_speed(iter/s)": 0.733513 }, { "epoch": 0.9719827586206896, "grad_norm": 0.8968740105628967, "learning_rate": 2.1458293073042814e-08, "loss": 0.06336413323879242, "memory(GiB)": 27.53, "step": 1353, "token_acc": 0.9822064056939501, "train_speed(iter/s)": 0.73369 }, { "epoch": 0.9727011494252874, "grad_norm": 0.8824398517608643, "learning_rate": 2.0372714129356375e-08, "loss": 0.06427834182977676, "memory(GiB)": 27.53, "step": 1354, "token_acc": 0.9674134419551935, "train_speed(iter/s)": 0.733862 }, { "epoch": 0.9734195402298851, "grad_norm": 0.8443480730056763, "learning_rate": 1.9315256361036038e-08, "loss": 0.05200262367725372, "memory(GiB)": 27.53, "step": 1355, "token_acc": 0.9759229534510433, "train_speed(iter/s)": 0.734027 }, { "epoch": 0.9741379310344828, "grad_norm": 0.7583639025688171, "learning_rate": 1.8285925739803812e-08, "loss": 0.05369126796722412, "memory(GiB)": 27.53, "step": 1356, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.7342 }, { "epoch": 0.9748563218390804, "grad_norm": 0.9542580842971802, "learning_rate": 1.72847280785432e-08, "loss": 0.070169597864151, "memory(GiB)": 27.53, "step": 1357, "token_acc": 0.9836660617059891, "train_speed(iter/s)": 0.734372 }, { "epoch": 0.9755747126436781, "grad_norm": 0.799308180809021, "learning_rate": 1.631166903126147e-08, "loss": 0.052556391805410385, "memory(GiB)": 27.53, "step": 1358, "token_acc": 0.987719298245614, "train_speed(iter/s)": 0.734539 }, { "epoch": 0.9762931034482759, "grad_norm": 0.6654383540153503, "learning_rate": 1.536675409306243e-08, "loss": 0.060013167560100555, "memory(GiB)": 27.53, "step": 1359, "token_acc": 0.9813780260707635, "train_speed(iter/s)": 0.734714 }, { "epoch": 0.9770114942528736, "grad_norm": 1.054984450340271, "learning_rate": 1.4449988600111486e-08, "loss": 0.058879557996988297, "memory(GiB)": 27.53, "step": 1360, "token_acc": 0.975, "train_speed(iter/s)": 0.734881 }, { "epoch": 0.9777298850574713, "grad_norm": 0.6225143671035767, "learning_rate": 1.3561377729608417e-08, "loss": 0.05292601138353348, "memory(GiB)": 27.53, "step": 1361, "token_acc": 0.9796747967479674, "train_speed(iter/s)": 0.735009 }, { "epoch": 0.978448275862069, "grad_norm": 0.96588534116745, "learning_rate": 1.2700926499756295e-08, "loss": 0.06171078979969025, "memory(GiB)": 27.53, "step": 1362, "token_acc": 0.9680638722554891, "train_speed(iter/s)": 0.73515 }, { "epoch": 0.9791666666666666, "grad_norm": 0.8536303639411926, "learning_rate": 1.1868639769734847e-08, "loss": 0.06248531863093376, "memory(GiB)": 27.53, "step": 1363, "token_acc": 0.9821073558648111, "train_speed(iter/s)": 0.735283 }, { "epoch": 0.9798850574712644, "grad_norm": 0.9707383513450623, "learning_rate": 1.1064522239669916e-08, "loss": 0.06458102911710739, "memory(GiB)": 27.53, "step": 1364, "token_acc": 0.9760869565217392, "train_speed(iter/s)": 0.73542 }, { "epoch": 0.9806034482758621, "grad_norm": 0.8585712313652039, "learning_rate": 1.0288578450611264e-08, "loss": 0.05405408516526222, "memory(GiB)": 27.53, "step": 1365, "token_acc": 0.9853420195439739, "train_speed(iter/s)": 0.735569 }, { "epoch": 0.9813218390804598, "grad_norm": 0.7936934232711792, "learning_rate": 9.54081278450314e-09, "loss": 0.05549629032611847, "memory(GiB)": 27.53, "step": 1366, "token_acc": 0.9739776951672863, "train_speed(iter/s)": 0.735745 }, { "epoch": 0.9820402298850575, "grad_norm": 1.0785877704620361, "learning_rate": 8.821229464160974e-09, "loss": 0.06314409524202347, "memory(GiB)": 27.53, "step": 1367, "token_acc": 0.9743589743589743, "train_speed(iter/s)": 0.735916 }, { "epoch": 0.9827586206896551, "grad_norm": 0.7016350030899048, "learning_rate": 8.129832553249173e-09, "loss": 0.06725485622882843, "memory(GiB)": 27.53, "step": 1368, "token_acc": 0.9633911368015414, "train_speed(iter/s)": 0.736088 }, { "epoch": 0.9834770114942529, "grad_norm": 0.9044017791748047, "learning_rate": 7.46662595625447e-09, "loss": 0.0704030841588974, "memory(GiB)": 27.53, "step": 1369, "token_acc": 0.9734151329243353, "train_speed(iter/s)": 0.736261 }, { "epoch": 0.9841954022988506, "grad_norm": 1.334763526916504, "learning_rate": 6.831613418468163e-09, "loss": 0.06056500971317291, "memory(GiB)": 27.53, "step": 1370, "token_acc": 0.9757914338919925, "train_speed(iter/s)": 0.736433 }, { "epoch": 0.9849137931034483, "grad_norm": 1.045500636100769, "learning_rate": 6.224798525960029e-09, "loss": 0.057177480310201645, "memory(GiB)": 27.53, "step": 1371, "token_acc": 0.9676375404530745, "train_speed(iter/s)": 0.736606 }, { "epoch": 0.985632183908046, "grad_norm": 0.9330273866653442, "learning_rate": 5.646184705563884e-09, "loss": 0.06270989775657654, "memory(GiB)": 27.53, "step": 1372, "token_acc": 0.9805970149253731, "train_speed(iter/s)": 0.736779 }, { "epoch": 0.9863505747126436, "grad_norm": 0.8979822397232056, "learning_rate": 5.095775224853161e-09, "loss": 0.063373863697052, "memory(GiB)": 27.53, "step": 1373, "token_acc": 0.9738805970149254, "train_speed(iter/s)": 0.736953 }, { "epoch": 0.9870689655172413, "grad_norm": 6.230584621429443, "learning_rate": 4.573573192125369e-09, "loss": 0.060350656509399414, "memory(GiB)": 27.53, "step": 1374, "token_acc": 0.982824427480916, "train_speed(iter/s)": 0.737128 }, { "epoch": 0.9877873563218391, "grad_norm": 0.8359853625297546, "learning_rate": 4.079581556382661e-09, "loss": 0.05626758933067322, "memory(GiB)": 27.53, "step": 1375, "token_acc": 0.9748549323017408, "train_speed(iter/s)": 0.737303 }, { "epoch": 0.9885057471264368, "grad_norm": 1.8882805109024048, "learning_rate": 3.613803107317959e-09, "loss": 0.06091444194316864, "memory(GiB)": 27.53, "step": 1376, "token_acc": 0.9797979797979798, "train_speed(iter/s)": 0.737477 }, { "epoch": 0.9892241379310345, "grad_norm": 1.5816795825958252, "learning_rate": 3.176240475294967e-09, "loss": 0.06356977671384811, "memory(GiB)": 27.53, "step": 1377, "token_acc": 0.9803921568627451, "train_speed(iter/s)": 0.737652 }, { "epoch": 0.9899425287356322, "grad_norm": 0.8664565682411194, "learning_rate": 2.7668961313376263e-09, "loss": 0.05936390906572342, "memory(GiB)": 27.53, "step": 1378, "token_acc": 0.9891304347826086, "train_speed(iter/s)": 0.737827 }, { "epoch": 0.9906609195402298, "grad_norm": 1.353153109550476, "learning_rate": 2.385772387114016e-09, "loss": 0.05402779579162598, "memory(GiB)": 27.53, "step": 1379, "token_acc": 0.9771528998242531, "train_speed(iter/s)": 0.738001 }, { "epoch": 0.9913793103448276, "grad_norm": 0.7358653545379639, "learning_rate": 2.0328713949230304e-09, "loss": 0.05669607222080231, "memory(GiB)": 27.53, "step": 1380, "token_acc": 0.9731663685152058, "train_speed(iter/s)": 0.738175 }, { "epoch": 0.9920977011494253, "grad_norm": 0.7442265152931213, "learning_rate": 1.708195147683278e-09, "loss": 0.05867166072130203, "memory(GiB)": 27.53, "step": 1381, "token_acc": 0.9834558823529411, "train_speed(iter/s)": 0.738349 }, { "epoch": 0.992816091954023, "grad_norm": 0.8357276916503906, "learning_rate": 1.4117454789208673e-09, "loss": 0.06443289667367935, "memory(GiB)": 27.53, "step": 1382, "token_acc": 0.974903474903475, "train_speed(iter/s)": 0.738518 }, { "epoch": 0.9935344827586207, "grad_norm": 0.8258159756660461, "learning_rate": 1.1435240627594157e-09, "loss": 0.06076141446828842, "memory(GiB)": 27.53, "step": 1383, "token_acc": 0.9822294022617124, "train_speed(iter/s)": 0.738651 }, { "epoch": 0.9942528735632183, "grad_norm": 0.8782249093055725, "learning_rate": 9.03532413911723e-10, "loss": 0.05992450192570686, "memory(GiB)": 27.53, "step": 1384, "token_acc": 0.9731404958677686, "train_speed(iter/s)": 0.738785 }, { "epoch": 0.9949712643678161, "grad_norm": 0.9968899488449097, "learning_rate": 6.917718876686685e-10, "loss": 0.060633670538663864, "memory(GiB)": 27.53, "step": 1385, "token_acc": 0.9814471243042672, "train_speed(iter/s)": 0.738921 }, { "epoch": 0.9956896551724138, "grad_norm": 1.1431612968444824, "learning_rate": 5.08243679894771e-10, "loss": 0.05440877377986908, "memory(GiB)": 27.53, "step": 1386, "token_acc": 0.9817813765182186, "train_speed(iter/s)": 0.739071 }, { "epoch": 0.9964080459770115, "grad_norm": 0.7624646425247192, "learning_rate": 3.5294882701764064e-10, "loss": 0.05500268191099167, "memory(GiB)": 27.53, "step": 1387, "token_acc": 0.9817184643510055, "train_speed(iter/s)": 0.739235 }, { "epoch": 0.9971264367816092, "grad_norm": 0.7336913347244263, "learning_rate": 2.2588820602631457e-10, "loss": 0.06436701118946075, "memory(GiB)": 27.53, "step": 1388, "token_acc": 0.9783889980353635, "train_speed(iter/s)": 0.739406 }, { "epoch": 0.9978448275862069, "grad_norm": 1.0420558452606201, "learning_rate": 1.2706253446237437e-10, "loss": 0.05881688371300697, "memory(GiB)": 27.53, "step": 1389, "token_acc": 0.9782971619365609, "train_speed(iter/s)": 0.739576 }, { "epoch": 0.9985632183908046, "grad_norm": 0.8520898222923279, "learning_rate": 5.6472370419391464e-11, "loss": 0.061375364661216736, "memory(GiB)": 27.53, "step": 1390, "token_acc": 0.9678800856531049, "train_speed(iter/s)": 0.739747 }, { "epoch": 0.9992816091954023, "grad_norm": 0.761191725730896, "learning_rate": 1.4118112536820605e-11, "loss": 0.058890827000141144, "memory(GiB)": 27.53, "step": 1391, "token_acc": 0.9688644688644689, "train_speed(iter/s)": 0.739914 }, { "epoch": 1.0, "grad_norm": 0.7942994832992554, "learning_rate": 0.0, "loss": 0.054856009781360626, "memory(GiB)": 27.53, "step": 1392, "token_acc": 0.9777777777777777, "train_speed(iter/s)": 0.740079 }, { "epoch": 1.0, "eval_loss": 0.05742287263274193, "eval_runtime": 7.0633, "eval_samples_per_second": 63.709, "eval_steps_per_second": 2.124, "eval_token_acc": 0.977538086913087, "step": 1392 } ], "logging_steps": 1, "max_steps": 1392, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2326918753746944e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }