{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 315,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 7.553986895560367,
      "learning_rate": 5e-06,
      "loss": 1.5771,
      "step": 5
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.906740207205835,
      "learning_rate": 1.125e-05,
      "loss": 1.3506,
      "step": 10
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.8767819770258468,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 1.1889,
      "step": 15
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.5446745909703954,
      "learning_rate": 2.375e-05,
      "loss": 1.0966,
      "step": 20
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.5140030398932292,
      "learning_rate": 3.0000000000000004e-05,
      "loss": 1.0404,
      "step": 25
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.4675005019349853,
      "learning_rate": 3.625e-05,
      "loss": 1.0091,
      "step": 30
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.44935151084519126,
      "learning_rate": 3.9995070884147604e-05,
      "loss": 0.9838,
      "step": 35
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.45169510024799914,
      "learning_rate": 3.9939646229673775e-05,
      "loss": 0.9736,
      "step": 40
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.42767814157802786,
      "learning_rate": 3.98228068051382e-05,
      "loss": 0.9652,
      "step": 45
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.4359248691134457,
      "learning_rate": 3.964491247983392e-05,
      "loss": 0.9755,
      "step": 50
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.4096569254948115,
      "learning_rate": 3.940651117416824e-05,
      "loss": 0.9617,
      "step": 55
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.39321526767386744,
      "learning_rate": 3.9108337172049794e-05,
      "loss": 0.9436,
      "step": 60
    },
    {
      "epoch": 1.032,
      "grad_norm": 0.5454178809247112,
      "learning_rate": 3.875130885926973e-05,
      "loss": 0.8933,
      "step": 65
    },
    {
      "epoch": 1.112,
      "grad_norm": 0.4889310546790741,
      "learning_rate": 3.83365258948432e-05,
      "loss": 0.8369,
      "step": 70
    },
    {
      "epoch": 1.192,
      "grad_norm": 0.43597726561431077,
      "learning_rate": 3.786526582402313e-05,
      "loss": 0.8081,
      "step": 75
    },
    {
      "epoch": 1.272,
      "grad_norm": 0.43639248467106284,
      "learning_rate": 3.733898014341858e-05,
      "loss": 0.8166,
      "step": 80
    },
    {
      "epoch": 1.3519999999999999,
      "grad_norm": 0.3826009932791081,
      "learning_rate": 3.6759289830337246e-05,
      "loss": 0.8047,
      "step": 85
    },
    {
      "epoch": 1.432,
      "grad_norm": 0.36002961067047146,
      "learning_rate": 3.612798035012161e-05,
      "loss": 0.7919,
      "step": 90
    },
    {
      "epoch": 1.512,
      "grad_norm": 0.46361237648534737,
      "learning_rate": 3.544699615685671e-05,
      "loss": 0.7863,
      "step": 95
    },
    {
      "epoch": 1.592,
      "grad_norm": 0.3866372632245251,
      "learning_rate": 3.4718434704387174e-05,
      "loss": 0.8034,
      "step": 100
    },
    {
      "epoch": 1.6720000000000002,
      "grad_norm": 0.3878544531373144,
      "learning_rate": 3.394453998609001e-05,
      "loss": 0.777,
      "step": 105
    },
    {
      "epoch": 1.752,
      "grad_norm": 0.43357660375765866,
      "learning_rate": 3.312769562330075e-05,
      "loss": 0.796,
      "step": 110
    },
    {
      "epoch": 1.8319999999999999,
      "grad_norm": 0.38664388005160477,
      "learning_rate": 3.227041752368091e-05,
      "loss": 0.7816,
      "step": 115
    },
    {
      "epoch": 1.912,
      "grad_norm": 0.3823673140467302,
      "learning_rate": 3.1375346132139135e-05,
      "loss": 0.7766,
      "step": 120
    },
    {
      "epoch": 1.992,
      "grad_norm": 0.37828406727880126,
      "learning_rate": 3.0445238298173492e-05,
      "loss": 0.7966,
      "step": 125
    },
    {
      "epoch": 2.064,
      "grad_norm": 0.6875770696894508,
      "learning_rate": 2.9482958784683883e-05,
      "loss": 0.6479,
      "step": 130
    },
    {
      "epoch": 2.144,
      "grad_norm": 0.5536819048526282,
      "learning_rate": 2.849147144440747e-05,
      "loss": 0.6117,
      "step": 135
    },
    {
      "epoch": 2.224,
      "grad_norm": 0.5248020590041124,
      "learning_rate": 2.7473830091154243e-05,
      "loss": 0.6098,
      "step": 140
    },
    {
      "epoch": 2.304,
      "grad_norm": 0.4504880059072583,
      "learning_rate": 2.6433169093959405e-05,
      "loss": 0.6007,
      "step": 145
    },
    {
      "epoch": 2.384,
      "grad_norm": 0.42667978139991347,
      "learning_rate": 2.5372693723123075e-05,
      "loss": 0.6083,
      "step": 150
    },
    {
      "epoch": 2.464,
      "grad_norm": 0.432582533447793,
      "learning_rate": 2.4295670277871736e-05,
      "loss": 0.6029,
      "step": 155
    },
    {
      "epoch": 2.544,
      "grad_norm": 0.4098794172365012,
      "learning_rate": 2.320541602604851e-05,
      "loss": 0.5893,
      "step": 160
    },
    {
      "epoch": 2.624,
      "grad_norm": 0.4308341436869616,
      "learning_rate": 2.210528898681851e-05,
      "loss": 0.5806,
      "step": 165
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 0.43820711621775427,
      "learning_rate": 2.099867758785866e-05,
      "loss": 0.6151,
      "step": 170
    },
    {
      "epoch": 2.784,
      "grad_norm": 0.41576971172263233,
      "learning_rate": 1.988899022888841e-05,
      "loss": 0.601,
      "step": 175
    },
    {
      "epoch": 2.864,
      "grad_norm": 0.406055148224806,
      "learning_rate": 1.877964478368577e-05,
      "loss": 0.6103,
      "step": 180
    },
    {
      "epoch": 2.944,
      "grad_norm": 0.40196819912431503,
      "learning_rate": 1.7674058072923075e-05,
      "loss": 0.6014,
      "step": 185
    },
    {
      "epoch": 3.016,
      "grad_norm": 0.7427654443430163,
      "learning_rate": 1.6575635340246203e-05,
      "loss": 0.5633,
      "step": 190
    },
    {
      "epoch": 3.096,
      "grad_norm": 0.6302486308462233,
      "learning_rate": 1.548775976401152e-05,
      "loss": 0.4498,
      "step": 195
    },
    {
      "epoch": 3.176,
      "grad_norm": 0.6453164328082227,
      "learning_rate": 1.4413782036984616e-05,
      "loss": 0.4413,
      "step": 200
    },
    {
      "epoch": 3.2560000000000002,
      "grad_norm": 0.4777050134578628,
      "learning_rate": 1.3357010046095741e-05,
      "loss": 0.4268,
      "step": 205
    },
    {
      "epoch": 3.336,
      "grad_norm": 0.45826237793958197,
      "learning_rate": 1.2320698684038599e-05,
      "loss": 0.4376,
      "step": 210
    },
    {
      "epoch": 3.416,
      "grad_norm": 0.5034497132926059,
      "learning_rate": 1.1308039824093197e-05,
      "loss": 0.4402,
      "step": 215
    },
    {
      "epoch": 3.496,
      "grad_norm": 0.41252483370606546,
      "learning_rate": 1.0322152489050508e-05,
      "loss": 0.4294,
      "step": 220
    },
    {
      "epoch": 3.576,
      "grad_norm": 0.4695635107153035,
      "learning_rate": 9.366073244519124e-06,
      "loss": 0.4348,
      "step": 225
    },
    {
      "epoch": 3.656,
      "grad_norm": 0.4709707441227059,
      "learning_rate": 8.442746846202711e-06,
      "loss": 0.437,
      "step": 230
    },
    {
      "epoch": 3.7359999999999998,
      "grad_norm": 0.6313760297797569,
      "learning_rate": 7.5550171699549945e-06,
      "loss": 0.4356,
      "step": 235
    },
    {
      "epoch": 3.816,
      "grad_norm": 0.4346437262602198,
      "learning_rate": 6.705618452548057e-06,
      "loss": 0.4368,
      "step": 240
    },
    {
      "epoch": 3.896,
      "grad_norm": 0.42927897911166574,
      "learning_rate": 5.897166870132658e-06,
      "loss": 0.4295,
      "step": 245
    },
    {
      "epoch": 3.976,
      "grad_norm": 0.39740342880248314,
      "learning_rate": 5.132152480329072e-06,
      "loss": 0.4439,
      "step": 250
    },
    {
      "epoch": 4.048,
      "grad_norm": 0.6330823851675211,
      "learning_rate": 4.412931552767295e-06,
      "loss": 0.3693,
      "step": 255
    },
    {
      "epoch": 4.128,
      "grad_norm": 0.7298997105017592,
      "learning_rate": 3.741719311698608e-06,
      "loss": 0.3449,
      "step": 260
    },
    {
      "epoch": 4.208,
      "grad_norm": 0.4908230958897118,
      "learning_rate": 3.120583113031579e-06,
      "loss": 0.3457,
      "step": 265
    },
    {
      "epoch": 4.288,
      "grad_norm": 0.4620570668309974,
      "learning_rate": 2.551436076807501e-06,
      "loss": 0.3326,
      "step": 270
    },
    {
      "epoch": 4.368,
      "grad_norm": 0.44029177637407785,
      "learning_rate": 2.036031194727346e-06,
      "loss": 0.3267,
      "step": 275
    },
    {
      "epoch": 4.448,
      "grad_norm": 0.4285706823245102,
      "learning_rate": 1.5759559308793448e-06,
      "loss": 0.3433,
      "step": 280
    },
    {
      "epoch": 4.5280000000000005,
      "grad_norm": 0.44416591740232586,
      "learning_rate": 1.172627332297076e-06,
      "loss": 0.3388,
      "step": 285
    },
    {
      "epoch": 4.608,
      "grad_norm": 0.4088428558019627,
      "learning_rate": 8.272876644077188e-07,
      "loss": 0.33,
      "step": 290
    },
    {
      "epoch": 4.688,
      "grad_norm": 0.4102958065555951,
      "learning_rate": 5.410005848134315e-07,
      "loss": 0.3273,
      "step": 295
    },
    {
      "epoch": 4.768,
      "grad_norm": 0.4530496321154963,
      "learning_rate": 3.1464786719075825e-07,
      "loss": 0.3376,
      "step": 300
    },
    {
      "epoch": 4.848,
      "grad_norm": 0.4054203925196989,
      "learning_rate": 1.4892668539853606e-07,
      "loss": 0.3341,
      "step": 305
    },
    {
      "epoch": 4.928,
      "grad_norm": 0.39471491526358593,
      "learning_rate": 4.434746615932018e-08,
      "loss": 0.3297,
      "step": 310
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.3906649028386399,
      "learning_rate": 1.2323169282257852e-09,
      "loss": 0.3311,
      "step": 315
    },
    {
      "epoch": 5.0,
      "step": 315,
      "total_flos": 616445810049024.0,
      "train_loss": 0.650969924624004,
      "train_runtime": 29756.3874,
      "train_samples_per_second": 1.344,
      "train_steps_per_second": 0.011
    }
  ],
  "logging_steps": 5,
  "max_steps": 315,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 616445810049024.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}