| { | |
| "best_global_step": 2985, | |
| "best_metric": 2.4361066818237305, | |
| "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-2000", | |
| "epoch": 0.06, | |
| "eval_steps": 5, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0001, | |
| "eval_loss": 3.320133686065674, | |
| "eval_runtime": 33.1817, | |
| "eval_samples_per_second": 3.526, | |
| "eval_steps_per_second": 1.778, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0002, | |
| "eval_loss": 3.319335460662842, | |
| "eval_runtime": 33.1229, | |
| "eval_samples_per_second": 3.532, | |
| "eval_steps_per_second": 1.781, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0003, | |
| "eval_loss": 3.318042516708374, | |
| "eval_runtime": 33.3382, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0004, | |
| "eval_loss": 3.31443190574646, | |
| "eval_runtime": 33.2423, | |
| "eval_samples_per_second": 3.52, | |
| "eval_steps_per_second": 1.775, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0005, | |
| "grad_norm": 0.8831791054097137, | |
| "learning_rate": 4.8e-08, | |
| "loss": 3.4942, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0005, | |
| "eval_loss": 3.3073768615722656, | |
| "eval_runtime": 33.3914, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0006, | |
| "eval_loss": 3.299119472503662, | |
| "eval_runtime": 33.4042, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0007, | |
| "eval_loss": 3.2837445735931396, | |
| "eval_runtime": 33.3171, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.0008, | |
| "eval_loss": 3.26920747756958, | |
| "eval_runtime": 33.2887, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.772, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0009, | |
| "eval_loss": 3.2481868267059326, | |
| "eval_runtime": 33.3291, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 0.5545255682809549, | |
| "learning_rate": 9.8e-08, | |
| "loss": 3.4174, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "eval_loss": 3.2263057231903076, | |
| "eval_runtime": 33.3242, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0011, | |
| "eval_loss": 3.2074711322784424, | |
| "eval_runtime": 33.3412, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0012, | |
| "eval_loss": 3.1877729892730713, | |
| "eval_runtime": 33.5109, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0013, | |
| "eval_loss": 3.153503894805908, | |
| "eval_runtime": 33.4747, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.763, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.0014, | |
| "eval_loss": 3.1214191913604736, | |
| "eval_runtime": 33.5956, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0015, | |
| "grad_norm": 0.5083106511895727, | |
| "learning_rate": 1.4800000000000003e-07, | |
| "loss": 3.2951, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.0015, | |
| "eval_loss": 3.101821184158325, | |
| "eval_runtime": 33.6, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.0016, | |
| "eval_loss": 3.0797102451324463, | |
| "eval_runtime": 33.5302, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.76, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0017, | |
| "eval_loss": 3.0523691177368164, | |
| "eval_runtime": 33.5031, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.0018, | |
| "eval_loss": 3.022620677947998, | |
| "eval_runtime": 33.6265, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.755, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0019, | |
| "eval_loss": 2.991481065750122, | |
| "eval_runtime": 33.5519, | |
| "eval_samples_per_second": 3.487, | |
| "eval_steps_per_second": 1.758, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 0.28367624064943, | |
| "learning_rate": 1.9800000000000003e-07, | |
| "loss": 3.1531, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "eval_loss": 2.9630048274993896, | |
| "eval_runtime": 33.734, | |
| "eval_samples_per_second": 3.468, | |
| "eval_steps_per_second": 1.749, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0021, | |
| "eval_loss": 2.93916916847229, | |
| "eval_runtime": 33.4897, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.0022, | |
| "eval_loss": 2.9186832904815674, | |
| "eval_runtime": 33.5154, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0023, | |
| "eval_loss": 2.8985302448272705, | |
| "eval_runtime": 33.5846, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.0024, | |
| "eval_loss": 2.8786001205444336, | |
| "eval_runtime": 33.5482, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 0.19615444236413476, | |
| "learning_rate": 2.48e-07, | |
| "loss": 3.0101, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "eval_loss": 2.860034704208374, | |
| "eval_runtime": 33.5143, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.0026, | |
| "eval_loss": 2.843663454055786, | |
| "eval_runtime": 33.5082, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0027, | |
| "eval_loss": 2.82882022857666, | |
| "eval_runtime": 33.4921, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.762, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.0028, | |
| "eval_loss": 2.8154728412628174, | |
| "eval_runtime": 33.6656, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.753, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.0029, | |
| "eval_loss": 2.801098346710205, | |
| "eval_runtime": 33.5229, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "grad_norm": 0.5710572013823593, | |
| "learning_rate": 2.9800000000000005e-07, | |
| "loss": 2.876, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "eval_loss": 2.789198160171509, | |
| "eval_runtime": 33.4535, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.764, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0031, | |
| "eval_loss": 2.7789695262908936, | |
| "eval_runtime": 33.6409, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.0032, | |
| "eval_loss": 2.7694201469421387, | |
| "eval_runtime": 33.4266, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0033, | |
| "eval_loss": 2.7600762844085693, | |
| "eval_runtime": 33.4725, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.763, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.0034, | |
| "eval_loss": 2.7517828941345215, | |
| "eval_runtime": 33.6223, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0035, | |
| "grad_norm": 0.151307501186972, | |
| "learning_rate": 3.48e-07, | |
| "loss": 2.811, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.0035, | |
| "eval_loss": 2.743870258331299, | |
| "eval_runtime": 33.5221, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.0036, | |
| "eval_loss": 2.7366557121276855, | |
| "eval_runtime": 33.5448, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0037, | |
| "eval_loss": 2.7298200130462646, | |
| "eval_runtime": 33.5428, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.0038, | |
| "eval_loss": 2.722888708114624, | |
| "eval_runtime": 33.6302, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.754, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.0039, | |
| "eval_loss": 2.714289426803589, | |
| "eval_runtime": 33.5594, | |
| "eval_samples_per_second": 3.486, | |
| "eval_steps_per_second": 1.758, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 0.10362348542700331, | |
| "learning_rate": 3.9800000000000004e-07, | |
| "loss": 2.7606, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "eval_loss": 2.7078425884246826, | |
| "eval_runtime": 33.6447, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0041, | |
| "eval_loss": 2.7014663219451904, | |
| "eval_runtime": 33.565, | |
| "eval_samples_per_second": 3.486, | |
| "eval_steps_per_second": 1.758, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.0042, | |
| "eval_loss": 2.6956119537353516, | |
| "eval_runtime": 33.5938, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0043, | |
| "eval_loss": 2.6901819705963135, | |
| "eval_runtime": 33.5009, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.0044, | |
| "eval_loss": 2.684842824935913, | |
| "eval_runtime": 33.5857, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0045, | |
| "grad_norm": 0.08395542059093342, | |
| "learning_rate": 4.4800000000000004e-07, | |
| "loss": 2.727, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.0045, | |
| "eval_loss": 2.679893732070923, | |
| "eval_runtime": 33.5333, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.759, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.0046, | |
| "eval_loss": 2.6749234199523926, | |
| "eval_runtime": 33.6847, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.752, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0047, | |
| "eval_loss": 2.670543670654297, | |
| "eval_runtime": 33.5814, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.0048, | |
| "eval_loss": 2.6663973331451416, | |
| "eval_runtime": 33.5943, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.0049, | |
| "eval_loss": 2.662304162979126, | |
| "eval_runtime": 33.5309, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.76, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.06968304771462097, | |
| "learning_rate": 4.98e-07, | |
| "loss": 2.6931, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "eval_loss": 2.65859317779541, | |
| "eval_runtime": 33.4663, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0051, | |
| "eval_loss": 2.654831886291504, | |
| "eval_runtime": 33.5962, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.0052, | |
| "eval_loss": 2.6509766578674316, | |
| "eval_runtime": 33.5064, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0053, | |
| "eval_loss": 2.6467387676239014, | |
| "eval_runtime": 33.5346, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.759, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.0054, | |
| "eval_loss": 2.6428205966949463, | |
| "eval_runtime": 33.5418, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0055, | |
| "grad_norm": 0.05704195405230526, | |
| "learning_rate": 5.480000000000001e-07, | |
| "loss": 2.674, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.0055, | |
| "eval_loss": 2.6392645835876465, | |
| "eval_runtime": 33.6509, | |
| "eval_samples_per_second": 3.477, | |
| "eval_steps_per_second": 1.753, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.0056, | |
| "eval_loss": 2.6361024379730225, | |
| "eval_runtime": 33.6973, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0057, | |
| "eval_loss": 2.6328718662261963, | |
| "eval_runtime": 33.5639, | |
| "eval_samples_per_second": 3.486, | |
| "eval_steps_per_second": 1.758, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.0058, | |
| "eval_loss": 2.629871129989624, | |
| "eval_runtime": 33.5243, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.0059, | |
| "eval_loss": 2.6271257400512695, | |
| "eval_runtime": 33.6427, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 0.05013991368613539, | |
| "learning_rate": 5.98e-07, | |
| "loss": 2.6504, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_loss": 2.6243784427642822, | |
| "eval_runtime": 33.5815, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0061, | |
| "eval_loss": 2.621882915496826, | |
| "eval_runtime": 33.7331, | |
| "eval_samples_per_second": 3.468, | |
| "eval_steps_per_second": 1.749, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.0062, | |
| "eval_loss": 2.6194233894348145, | |
| "eval_runtime": 33.594, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.0063, | |
| "eval_loss": 2.6167914867401123, | |
| "eval_runtime": 33.5521, | |
| "eval_samples_per_second": 3.487, | |
| "eval_steps_per_second": 1.758, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "eval_loss": 2.6143040657043457, | |
| "eval_runtime": 33.58, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.0065, | |
| "grad_norm": 0.04696879401248375, | |
| "learning_rate": 6.48e-07, | |
| "loss": 2.6372, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.0065, | |
| "eval_loss": 2.611804246902466, | |
| "eval_runtime": 33.5371, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.759, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.0066, | |
| "eval_loss": 2.6093685626983643, | |
| "eval_runtime": 33.8057, | |
| "eval_samples_per_second": 3.461, | |
| "eval_steps_per_second": 1.745, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0067, | |
| "eval_loss": 2.607069492340088, | |
| "eval_runtime": 33.5819, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.0068, | |
| "eval_loss": 2.604562520980835, | |
| "eval_runtime": 33.5971, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.0069, | |
| "eval_loss": 2.6024069786071777, | |
| "eval_runtime": 33.5107, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 0.04335213523196003, | |
| "learning_rate": 6.98e-07, | |
| "loss": 2.6173, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "eval_loss": 2.6002795696258545, | |
| "eval_runtime": 33.6194, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0071, | |
| "eval_loss": 2.598109245300293, | |
| "eval_runtime": 33.807, | |
| "eval_samples_per_second": 3.461, | |
| "eval_steps_per_second": 1.745, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.0072, | |
| "eval_loss": 2.596126079559326, | |
| "eval_runtime": 33.5287, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.0073, | |
| "eval_loss": 2.5941832065582275, | |
| "eval_runtime": 33.5456, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.0074, | |
| "eval_loss": 2.592336893081665, | |
| "eval_runtime": 33.6972, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 0.04553004087145917, | |
| "learning_rate": 7.480000000000001e-07, | |
| "loss": 2.608, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "eval_loss": 2.590573310852051, | |
| "eval_runtime": 33.6132, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.0076, | |
| "eval_loss": 2.5888302326202393, | |
| "eval_runtime": 33.6363, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.0077, | |
| "eval_loss": 2.5870487689971924, | |
| "eval_runtime": 33.6309, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.754, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.0078, | |
| "eval_loss": 2.5851986408233643, | |
| "eval_runtime": 33.5237, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.0079, | |
| "eval_loss": 2.583341598510742, | |
| "eval_runtime": 33.4914, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.762, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 0.04067489002091025, | |
| "learning_rate": 7.98e-07, | |
| "loss": 2.6034, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "eval_loss": 2.5816242694854736, | |
| "eval_runtime": 33.6305, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.754, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0081, | |
| "eval_loss": 2.5800209045410156, | |
| "eval_runtime": 33.9049, | |
| "eval_samples_per_second": 3.451, | |
| "eval_steps_per_second": 1.74, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.0082, | |
| "eval_loss": 2.5783472061157227, | |
| "eval_runtime": 33.6847, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.752, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.0083, | |
| "eval_loss": 2.5765581130981445, | |
| "eval_runtime": 33.5467, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.0084, | |
| "eval_loss": 2.574805974960327, | |
| "eval_runtime": 33.6837, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.752, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.0085, | |
| "grad_norm": 0.03957021300313725, | |
| "learning_rate": 8.480000000000001e-07, | |
| "loss": 2.5881, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.0085, | |
| "eval_loss": 2.5732243061065674, | |
| "eval_runtime": 33.6883, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.751, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.0086, | |
| "eval_loss": 2.5712339878082275, | |
| "eval_runtime": 34.0087, | |
| "eval_samples_per_second": 3.44, | |
| "eval_steps_per_second": 1.735, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.0087, | |
| "eval_loss": 2.5696043968200684, | |
| "eval_runtime": 33.5522, | |
| "eval_samples_per_second": 3.487, | |
| "eval_steps_per_second": 1.758, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.0088, | |
| "eval_loss": 2.568011522293091, | |
| "eval_runtime": 33.7026, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.0089, | |
| "eval_loss": 2.5661723613739014, | |
| "eval_runtime": 33.7143, | |
| "eval_samples_per_second": 3.47, | |
| "eval_steps_per_second": 1.75, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 0.04518058243135632, | |
| "learning_rate": 8.980000000000001e-07, | |
| "loss": 2.577, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "eval_loss": 2.5647170543670654, | |
| "eval_runtime": 33.6066, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.756, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0091, | |
| "eval_loss": 2.5629138946533203, | |
| "eval_runtime": 33.695, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.0092, | |
| "eval_loss": 2.561223268508911, | |
| "eval_runtime": 33.7639, | |
| "eval_samples_per_second": 3.465, | |
| "eval_steps_per_second": 1.747, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.0093, | |
| "eval_loss": 2.559941053390503, | |
| "eval_runtime": 33.5726, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.0094, | |
| "eval_loss": 2.5585126876831055, | |
| "eval_runtime": 33.5393, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.0095, | |
| "grad_norm": 0.04841685552742973, | |
| "learning_rate": 9.480000000000001e-07, | |
| "loss": 2.5614, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.0095, | |
| "eval_loss": 2.557070732116699, | |
| "eval_runtime": 33.5396, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "eval_loss": 2.5551016330718994, | |
| "eval_runtime": 33.8951, | |
| "eval_samples_per_second": 3.452, | |
| "eval_steps_per_second": 1.741, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.0097, | |
| "eval_loss": 2.553600311279297, | |
| "eval_runtime": 33.6678, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.752, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.0098, | |
| "eval_loss": 2.5523183345794678, | |
| "eval_runtime": 33.6551, | |
| "eval_samples_per_second": 3.476, | |
| "eval_steps_per_second": 1.753, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.0099, | |
| "eval_loss": 2.5510056018829346, | |
| "eval_runtime": 33.6214, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.043993000876628545, | |
| "learning_rate": 9.98e-07, | |
| "loss": 2.5613, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_loss": 2.5498273372650146, | |
| "eval_runtime": 33.6069, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.756, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0101, | |
| "eval_loss": 2.548828601837158, | |
| "eval_runtime": 33.7909, | |
| "eval_samples_per_second": 3.462, | |
| "eval_steps_per_second": 1.746, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.0102, | |
| "eval_loss": 2.5474376678466797, | |
| "eval_runtime": 33.543, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.0103, | |
| "eval_loss": 2.5464441776275635, | |
| "eval_runtime": 33.6579, | |
| "eval_samples_per_second": 3.476, | |
| "eval_steps_per_second": 1.753, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.0104, | |
| "eval_loss": 2.5453498363494873, | |
| "eval_runtime": 33.4841, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.0105, | |
| "grad_norm": 0.04663602312001795, | |
| "learning_rate": 1.0480000000000002e-06, | |
| "loss": 2.5521, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.0105, | |
| "eval_loss": 2.5442492961883545, | |
| "eval_runtime": 33.5915, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.0106, | |
| "eval_loss": 2.5432002544403076, | |
| "eval_runtime": 33.6717, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.752, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.0107, | |
| "eval_loss": 2.542072057723999, | |
| "eval_runtime": 33.6153, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.0108, | |
| "eval_loss": 2.541541814804077, | |
| "eval_runtime": 34.4505, | |
| "eval_samples_per_second": 3.396, | |
| "eval_steps_per_second": 1.713, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.0109, | |
| "eval_loss": 2.540494203567505, | |
| "eval_runtime": 33.6369, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 0.044473565671350655, | |
| "learning_rate": 1.0980000000000001e-06, | |
| "loss": 2.5433, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "eval_loss": 2.539369821548462, | |
| "eval_runtime": 33.5742, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0111, | |
| "eval_loss": 2.5384223461151123, | |
| "eval_runtime": 33.9094, | |
| "eval_samples_per_second": 3.45, | |
| "eval_steps_per_second": 1.74, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.0112, | |
| "eval_loss": 2.5375945568084717, | |
| "eval_runtime": 33.6016, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.0113, | |
| "eval_loss": 2.536487340927124, | |
| "eval_runtime": 34.3561, | |
| "eval_samples_per_second": 3.406, | |
| "eval_steps_per_second": 1.717, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.0114, | |
| "eval_loss": 2.5356836318969727, | |
| "eval_runtime": 34.5074, | |
| "eval_samples_per_second": 3.391, | |
| "eval_steps_per_second": 1.71, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.0115, | |
| "grad_norm": 0.04668528198599521, | |
| "learning_rate": 1.148e-06, | |
| "loss": 2.5496, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.0115, | |
| "eval_loss": 2.5347819328308105, | |
| "eval_runtime": 33.5932, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.0116, | |
| "eval_loss": 2.534010410308838, | |
| "eval_runtime": 33.8124, | |
| "eval_samples_per_second": 3.46, | |
| "eval_steps_per_second": 1.745, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.0117, | |
| "eval_loss": 2.5331332683563232, | |
| "eval_runtime": 33.5617, | |
| "eval_samples_per_second": 3.486, | |
| "eval_steps_per_second": 1.758, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.0118, | |
| "eval_loss": 2.5322561264038086, | |
| "eval_runtime": 33.8081, | |
| "eval_samples_per_second": 3.461, | |
| "eval_steps_per_second": 1.745, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.0119, | |
| "eval_loss": 2.5314669609069824, | |
| "eval_runtime": 33.7053, | |
| "eval_samples_per_second": 3.471, | |
| "eval_steps_per_second": 1.75, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 0.043769011241975755, | |
| "learning_rate": 1.1980000000000002e-06, | |
| "loss": 2.5455, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "eval_loss": 2.5307207107543945, | |
| "eval_runtime": 33.6848, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.752, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0121, | |
| "eval_loss": 2.530006170272827, | |
| "eval_runtime": 33.686, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.751, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.0122, | |
| "eval_loss": 2.529109239578247, | |
| "eval_runtime": 33.7013, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.0123, | |
| "eval_loss": 2.5284457206726074, | |
| "eval_runtime": 33.6733, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.752, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.0124, | |
| "eval_loss": 2.5276710987091064, | |
| "eval_runtime": 33.624, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 0.04196307636615052, | |
| "learning_rate": 1.248e-06, | |
| "loss": 2.5273, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "eval_loss": 2.526918411254883, | |
| "eval_runtime": 33.5952, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.0126, | |
| "eval_loss": 2.5262696743011475, | |
| "eval_runtime": 33.7522, | |
| "eval_samples_per_second": 3.466, | |
| "eval_steps_per_second": 1.748, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.0127, | |
| "eval_loss": 2.5255067348480225, | |
| "eval_runtime": 33.7929, | |
| "eval_samples_per_second": 3.462, | |
| "eval_steps_per_second": 1.746, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "eval_loss": 2.524789810180664, | |
| "eval_runtime": 33.7139, | |
| "eval_samples_per_second": 3.47, | |
| "eval_steps_per_second": 1.75, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.0129, | |
| "eval_loss": 2.524181604385376, | |
| "eval_runtime": 33.7772, | |
| "eval_samples_per_second": 3.464, | |
| "eval_steps_per_second": 1.747, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 0.04719575571491393, | |
| "learning_rate": 1.2980000000000001e-06, | |
| "loss": 2.5226, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "eval_loss": 2.5235090255737305, | |
| "eval_runtime": 33.6972, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0131, | |
| "eval_loss": 2.5227887630462646, | |
| "eval_runtime": 33.8073, | |
| "eval_samples_per_second": 3.461, | |
| "eval_steps_per_second": 1.745, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.0132, | |
| "eval_loss": 2.522101402282715, | |
| "eval_runtime": 33.7192, | |
| "eval_samples_per_second": 3.47, | |
| "eval_steps_per_second": 1.75, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.0133, | |
| "eval_loss": 2.5215632915496826, | |
| "eval_runtime": 33.7966, | |
| "eval_samples_per_second": 3.462, | |
| "eval_steps_per_second": 1.746, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.0134, | |
| "eval_loss": 2.5208749771118164, | |
| "eval_runtime": 33.7485, | |
| "eval_samples_per_second": 3.467, | |
| "eval_steps_per_second": 1.748, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.0135, | |
| "grad_norm": 0.044734235617461586, | |
| "learning_rate": 1.348e-06, | |
| "loss": 2.5273, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.0135, | |
| "eval_loss": 2.5201478004455566, | |
| "eval_runtime": 33.8972, | |
| "eval_samples_per_second": 3.452, | |
| "eval_steps_per_second": 1.741, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.0136, | |
| "eval_loss": 2.5197227001190186, | |
| "eval_runtime": 33.6652, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.753, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.0137, | |
| "eval_loss": 2.519151449203491, | |
| "eval_runtime": 33.6031, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.0138, | |
| "eval_loss": 2.5185396671295166, | |
| "eval_runtime": 33.6292, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.754, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.0139, | |
| "eval_loss": 2.517947196960449, | |
| "eval_runtime": 33.5987, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 0.04124740305893712, | |
| "learning_rate": 1.3980000000000002e-06, | |
| "loss": 2.5214, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "eval_loss": 2.5173356533050537, | |
| "eval_runtime": 33.6657, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.753, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0141, | |
| "eval_loss": 2.5167977809906006, | |
| "eval_runtime": 33.5728, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.0142, | |
| "eval_loss": 2.5162267684936523, | |
| "eval_runtime": 33.2779, | |
| "eval_samples_per_second": 3.516, | |
| "eval_steps_per_second": 1.773, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.0143, | |
| "eval_loss": 2.5155909061431885, | |
| "eval_runtime": 33.4627, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.0144, | |
| "eval_loss": 2.515427589416504, | |
| "eval_runtime": 33.439, | |
| "eval_samples_per_second": 3.499, | |
| "eval_steps_per_second": 1.764, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.0145, | |
| "grad_norm": 0.04140679897915697, | |
| "learning_rate": 1.4480000000000002e-06, | |
| "loss": 2.5192, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.0145, | |
| "eval_loss": 2.514657735824585, | |
| "eval_runtime": 33.3527, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.0146, | |
| "eval_loss": 2.5141184329986572, | |
| "eval_runtime": 33.3623, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.768, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.0147, | |
| "eval_loss": 2.5135021209716797, | |
| "eval_runtime": 36.2875, | |
| "eval_samples_per_second": 3.224, | |
| "eval_steps_per_second": 1.626, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.0148, | |
| "eval_loss": 2.5130276679992676, | |
| "eval_runtime": 33.3738, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.0149, | |
| "eval_loss": 2.5123140811920166, | |
| "eval_runtime": 33.7458, | |
| "eval_samples_per_second": 3.467, | |
| "eval_steps_per_second": 1.748, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.03921746155872101, | |
| "learning_rate": 1.498e-06, | |
| "loss": 2.5077, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "eval_loss": 2.5117204189300537, | |
| "eval_runtime": 33.3164, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0151, | |
| "eval_loss": 2.5113115310668945, | |
| "eval_runtime": 33.464, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.0152, | |
| "eval_loss": 2.510754108428955, | |
| "eval_runtime": 33.426, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.0153, | |
| "eval_loss": 2.510148525238037, | |
| "eval_runtime": 33.5135, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.0154, | |
| "eval_loss": 2.5096797943115234, | |
| "eval_runtime": 33.5467, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.0155, | |
| "grad_norm": 0.038161493704092234, | |
| "learning_rate": 1.548e-06, | |
| "loss": 2.5127, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.0155, | |
| "eval_loss": 2.5091397762298584, | |
| "eval_runtime": 33.6296, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.754, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.0156, | |
| "eval_loss": 2.5085766315460205, | |
| "eval_runtime": 33.6417, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.0157, | |
| "eval_loss": 2.5081799030303955, | |
| "eval_runtime": 33.5831, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.0158, | |
| "eval_loss": 2.5075252056121826, | |
| "eval_runtime": 33.5806, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.0159, | |
| "eval_loss": 2.5069563388824463, | |
| "eval_runtime": 33.6257, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.755, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.04372605860022339, | |
| "learning_rate": 1.5980000000000002e-06, | |
| "loss": 2.5019, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "eval_loss": 2.5065925121307373, | |
| "eval_runtime": 33.6041, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.0161, | |
| "eval_loss": 2.5059759616851807, | |
| "eval_runtime": 33.6116, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.0162, | |
| "eval_loss": 2.505453109741211, | |
| "eval_runtime": 33.5794, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.0163, | |
| "eval_loss": 2.505023241043091, | |
| "eval_runtime": 33.461, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.763, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.0164, | |
| "eval_loss": 2.5042824745178223, | |
| "eval_runtime": 33.5988, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.0165, | |
| "grad_norm": 0.041497520045134684, | |
| "learning_rate": 1.6480000000000001e-06, | |
| "loss": 2.4977, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.0165, | |
| "eval_loss": 2.5039255619049072, | |
| "eval_runtime": 33.6107, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.0166, | |
| "eval_loss": 2.503436803817749, | |
| "eval_runtime": 33.6213, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.0167, | |
| "eval_loss": 2.5028321743011475, | |
| "eval_runtime": 33.5009, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.0168, | |
| "eval_loss": 2.5022666454315186, | |
| "eval_runtime": 33.6392, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.0169, | |
| "eval_loss": 2.5018374919891357, | |
| "eval_runtime": 33.5928, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 0.040226840781059835, | |
| "learning_rate": 1.6980000000000003e-06, | |
| "loss": 2.4968, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "eval_loss": 2.5012588500976562, | |
| "eval_runtime": 33.5216, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.0171, | |
| "eval_loss": 2.5006515979766846, | |
| "eval_runtime": 33.4799, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.762, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.0172, | |
| "eval_loss": 2.5001821517944336, | |
| "eval_runtime": 33.6067, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.756, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.0173, | |
| "eval_loss": 2.499708652496338, | |
| "eval_runtime": 33.5478, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.0174, | |
| "eval_loss": 2.4992101192474365, | |
| "eval_runtime": 33.3608, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 0.043360400185163274, | |
| "learning_rate": 1.7480000000000002e-06, | |
| "loss": 2.4947, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "eval_loss": 2.49912428855896, | |
| "eval_runtime": 33.3782, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.768, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.0176, | |
| "eval_loss": 2.498539686203003, | |
| "eval_runtime": 33.4271, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.0177, | |
| "eval_loss": 2.4980475902557373, | |
| "eval_runtime": 33.508, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.0178, | |
| "eval_loss": 2.4972891807556152, | |
| "eval_runtime": 33.5801, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.0179, | |
| "eval_loss": 2.496943473815918, | |
| "eval_runtime": 33.4984, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.761, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 0.040565773819723885, | |
| "learning_rate": 1.798e-06, | |
| "loss": 2.4878, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "eval_loss": 2.496464252471924, | |
| "eval_runtime": 33.6538, | |
| "eval_samples_per_second": 3.477, | |
| "eval_steps_per_second": 1.753, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0181, | |
| "eval_loss": 2.496126890182495, | |
| "eval_runtime": 33.6415, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.0182, | |
| "eval_loss": 2.4957361221313477, | |
| "eval_runtime": 33.7646, | |
| "eval_samples_per_second": 3.465, | |
| "eval_steps_per_second": 1.747, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.0183, | |
| "eval_loss": 2.4954254627227783, | |
| "eval_runtime": 33.5639, | |
| "eval_samples_per_second": 3.486, | |
| "eval_steps_per_second": 1.758, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.0184, | |
| "eval_loss": 2.4948976039886475, | |
| "eval_runtime": 33.6038, | |
| "eval_samples_per_second": 3.482, | |
| "eval_steps_per_second": 1.756, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.0185, | |
| "grad_norm": 0.039370814834696136, | |
| "learning_rate": 1.8480000000000001e-06, | |
| "loss": 2.4986, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.0185, | |
| "eval_loss": 2.494521379470825, | |
| "eval_runtime": 33.7082, | |
| "eval_samples_per_second": 3.471, | |
| "eval_steps_per_second": 1.75, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.0186, | |
| "eval_loss": 2.4939730167388916, | |
| "eval_runtime": 33.6147, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.0187, | |
| "eval_loss": 2.49343204498291, | |
| "eval_runtime": 35.1502, | |
| "eval_samples_per_second": 3.329, | |
| "eval_steps_per_second": 1.679, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.0188, | |
| "eval_loss": 2.493082046508789, | |
| "eval_runtime": 33.6381, | |
| "eval_samples_per_second": 3.478, | |
| "eval_steps_per_second": 1.754, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.0189, | |
| "eval_loss": 2.492797374725342, | |
| "eval_runtime": 33.7089, | |
| "eval_samples_per_second": 3.471, | |
| "eval_steps_per_second": 1.75, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 0.04019472793080496, | |
| "learning_rate": 1.898e-06, | |
| "loss": 2.481, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "eval_loss": 2.4925599098205566, | |
| "eval_runtime": 33.5096, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.0191, | |
| "eval_loss": 2.4918878078460693, | |
| "eval_runtime": 33.4921, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.762, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "eval_loss": 2.4916608333587646, | |
| "eval_runtime": 33.5126, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.0193, | |
| "eval_loss": 2.491708517074585, | |
| "eval_runtime": 33.6466, | |
| "eval_samples_per_second": 3.477, | |
| "eval_steps_per_second": 1.754, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.0194, | |
| "eval_loss": 2.4911839962005615, | |
| "eval_runtime": 33.6119, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.0195, | |
| "grad_norm": 0.04683912756161822, | |
| "learning_rate": 1.9480000000000002e-06, | |
| "loss": 2.4879, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.0195, | |
| "eval_loss": 2.490492343902588, | |
| "eval_runtime": 33.4389, | |
| "eval_samples_per_second": 3.499, | |
| "eval_steps_per_second": 1.764, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.0196, | |
| "eval_loss": 2.490133285522461, | |
| "eval_runtime": 33.361, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.0197, | |
| "eval_loss": 2.4896316528320312, | |
| "eval_runtime": 33.5863, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.0198, | |
| "eval_loss": 2.489122152328491, | |
| "eval_runtime": 33.6173, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.0199, | |
| "eval_loss": 2.488906145095825, | |
| "eval_runtime": 33.6531, | |
| "eval_samples_per_second": 3.477, | |
| "eval_steps_per_second": 1.753, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.047671496052023164, | |
| "learning_rate": 1.998e-06, | |
| "loss": 2.4879, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_loss": 2.488457202911377, | |
| "eval_runtime": 33.7763, | |
| "eval_samples_per_second": 3.464, | |
| "eval_steps_per_second": 1.747, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0201, | |
| "eval_loss": 2.4881434440612793, | |
| "eval_runtime": 33.6922, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.751, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.0202, | |
| "eval_loss": 2.4879722595214844, | |
| "eval_runtime": 33.6857, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.751, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.0203, | |
| "eval_loss": 2.4876134395599365, | |
| "eval_runtime": 33.7945, | |
| "eval_samples_per_second": 3.462, | |
| "eval_steps_per_second": 1.746, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.0204, | |
| "eval_loss": 2.4872164726257324, | |
| "eval_runtime": 33.7811, | |
| "eval_samples_per_second": 3.463, | |
| "eval_steps_per_second": 1.747, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.0205, | |
| "grad_norm": 0.04204734602618554, | |
| "learning_rate": 2.048e-06, | |
| "loss": 2.4708, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.0205, | |
| "eval_loss": 2.48695707321167, | |
| "eval_runtime": 33.821, | |
| "eval_samples_per_second": 3.459, | |
| "eval_steps_per_second": 1.744, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.0206, | |
| "eval_loss": 2.486564874649048, | |
| "eval_runtime": 33.82, | |
| "eval_samples_per_second": 3.459, | |
| "eval_steps_per_second": 1.745, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.0207, | |
| "eval_loss": 2.486281633377075, | |
| "eval_runtime": 33.927, | |
| "eval_samples_per_second": 3.449, | |
| "eval_steps_per_second": 1.739, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.0208, | |
| "eval_loss": 2.4860103130340576, | |
| "eval_runtime": 33.9697, | |
| "eval_samples_per_second": 3.444, | |
| "eval_steps_per_second": 1.737, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.0209, | |
| "eval_loss": 2.4855759143829346, | |
| "eval_runtime": 33.9097, | |
| "eval_samples_per_second": 3.45, | |
| "eval_steps_per_second": 1.74, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 0.03813289834436041, | |
| "learning_rate": 2.098e-06, | |
| "loss": 2.4799, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "eval_loss": 2.485349416732788, | |
| "eval_runtime": 34.0131, | |
| "eval_samples_per_second": 3.44, | |
| "eval_steps_per_second": 1.735, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.0211, | |
| "eval_loss": 2.48506498336792, | |
| "eval_runtime": 34.036, | |
| "eval_samples_per_second": 3.438, | |
| "eval_steps_per_second": 1.733, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.0212, | |
| "eval_loss": 2.484771966934204, | |
| "eval_runtime": 34.0842, | |
| "eval_samples_per_second": 3.433, | |
| "eval_steps_per_second": 1.731, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.0213, | |
| "eval_loss": 2.4846508502960205, | |
| "eval_runtime": 34.0289, | |
| "eval_samples_per_second": 3.438, | |
| "eval_steps_per_second": 1.734, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.0214, | |
| "eval_loss": 2.484158992767334, | |
| "eval_runtime": 34.0038, | |
| "eval_samples_per_second": 3.441, | |
| "eval_steps_per_second": 1.735, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.0215, | |
| "grad_norm": 0.04289680570208033, | |
| "learning_rate": 2.148e-06, | |
| "loss": 2.4822, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.0215, | |
| "eval_loss": 2.483947992324829, | |
| "eval_runtime": 33.9604, | |
| "eval_samples_per_second": 3.445, | |
| "eval_steps_per_second": 1.737, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.0216, | |
| "eval_loss": 2.4836008548736572, | |
| "eval_runtime": 33.9465, | |
| "eval_samples_per_second": 3.447, | |
| "eval_steps_per_second": 1.738, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.0217, | |
| "eval_loss": 2.483187675476074, | |
| "eval_runtime": 34.1344, | |
| "eval_samples_per_second": 3.428, | |
| "eval_steps_per_second": 1.728, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.0218, | |
| "eval_loss": 2.4829964637756348, | |
| "eval_runtime": 34.0915, | |
| "eval_samples_per_second": 3.432, | |
| "eval_steps_per_second": 1.731, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.0219, | |
| "eval_loss": 2.482805013656616, | |
| "eval_runtime": 33.9291, | |
| "eval_samples_per_second": 3.448, | |
| "eval_steps_per_second": 1.739, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 0.03972633299982532, | |
| "learning_rate": 2.198e-06, | |
| "loss": 2.4871, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "eval_loss": 2.482428550720215, | |
| "eval_runtime": 33.7324, | |
| "eval_samples_per_second": 3.468, | |
| "eval_steps_per_second": 1.749, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0221, | |
| "eval_loss": 2.4822213649749756, | |
| "eval_runtime": 33.7954, | |
| "eval_samples_per_second": 3.462, | |
| "eval_steps_per_second": 1.746, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.0222, | |
| "eval_loss": 2.481689214706421, | |
| "eval_runtime": 33.7787, | |
| "eval_samples_per_second": 3.464, | |
| "eval_steps_per_second": 1.747, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.0223, | |
| "eval_loss": 2.481731414794922, | |
| "eval_runtime": 33.6129, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "eval_loss": 2.4812448024749756, | |
| "eval_runtime": 33.511, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 0.041792864961431496, | |
| "learning_rate": 2.2480000000000003e-06, | |
| "loss": 2.4766, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "eval_loss": 2.4809837341308594, | |
| "eval_runtime": 33.7009, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.0226, | |
| "eval_loss": 2.480768918991089, | |
| "eval_runtime": 33.6615, | |
| "eval_samples_per_second": 3.476, | |
| "eval_steps_per_second": 1.753, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.0227, | |
| "eval_loss": 2.480337381362915, | |
| "eval_runtime": 33.6203, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.0228, | |
| "eval_loss": 2.4803271293640137, | |
| "eval_runtime": 33.6559, | |
| "eval_samples_per_second": 3.476, | |
| "eval_steps_per_second": 1.753, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.0229, | |
| "eval_loss": 2.4799482822418213, | |
| "eval_runtime": 33.5023, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 0.035383899567194975, | |
| "learning_rate": 2.2980000000000003e-06, | |
| "loss": 2.4749, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "eval_loss": 2.479668140411377, | |
| "eval_runtime": 33.4615, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.763, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.0231, | |
| "eval_loss": 2.4794092178344727, | |
| "eval_runtime": 33.4264, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.0232, | |
| "eval_loss": 2.4790964126586914, | |
| "eval_runtime": 33.4165, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.0233, | |
| "eval_loss": 2.4789323806762695, | |
| "eval_runtime": 33.2576, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.0234, | |
| "eval_loss": 2.4786429405212402, | |
| "eval_runtime": 33.3028, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.0235, | |
| "grad_norm": 0.034819138532107045, | |
| "learning_rate": 2.3480000000000002e-06, | |
| "loss": 2.4874, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.0235, | |
| "eval_loss": 2.4784486293792725, | |
| "eval_runtime": 33.3374, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.0236, | |
| "eval_loss": 2.478088855743408, | |
| "eval_runtime": 33.2864, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.0237, | |
| "eval_loss": 2.477979898452759, | |
| "eval_runtime": 33.4245, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.0238, | |
| "eval_loss": 2.4778709411621094, | |
| "eval_runtime": 33.2611, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.0239, | |
| "eval_loss": 2.477571487426758, | |
| "eval_runtime": 33.3418, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 0.037748109041694296, | |
| "learning_rate": 2.398e-06, | |
| "loss": 2.4666, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "eval_loss": 2.4772226810455322, | |
| "eval_runtime": 33.3603, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.0241, | |
| "eval_loss": 2.4769959449768066, | |
| "eval_runtime": 33.21, | |
| "eval_samples_per_second": 3.523, | |
| "eval_steps_per_second": 1.777, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.0242, | |
| "eval_loss": 2.4768526554107666, | |
| "eval_runtime": 33.4359, | |
| "eval_samples_per_second": 3.499, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.0243, | |
| "eval_loss": 2.476616382598877, | |
| "eval_runtime": 33.3341, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.0244, | |
| "eval_loss": 2.476250171661377, | |
| "eval_runtime": 33.3422, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.0245, | |
| "grad_norm": 0.042904100843004035, | |
| "learning_rate": 2.448e-06, | |
| "loss": 2.4698, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.0245, | |
| "eval_loss": 2.475933790206909, | |
| "eval_runtime": 33.3238, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.0246, | |
| "eval_loss": 2.475733995437622, | |
| "eval_runtime": 33.337, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.0247, | |
| "eval_loss": 2.4756155014038086, | |
| "eval_runtime": 33.3642, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.0248, | |
| "eval_loss": 2.475208044052124, | |
| "eval_runtime": 33.3567, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.0249, | |
| "eval_loss": 2.4751882553100586, | |
| "eval_runtime": 33.2409, | |
| "eval_samples_per_second": 3.52, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.04198064762114288, | |
| "learning_rate": 2.498e-06, | |
| "loss": 2.4544, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "eval_loss": 2.4749433994293213, | |
| "eval_runtime": 33.219, | |
| "eval_samples_per_second": 3.522, | |
| "eval_steps_per_second": 1.776, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.0251, | |
| "eval_loss": 2.475109577178955, | |
| "eval_runtime": 33.293, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.0252, | |
| "eval_loss": 2.474750280380249, | |
| "eval_runtime": 33.5388, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.0253, | |
| "eval_loss": 2.4743547439575195, | |
| "eval_runtime": 33.3597, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.0254, | |
| "eval_loss": 2.4740777015686035, | |
| "eval_runtime": 33.3283, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.0255, | |
| "grad_norm": 0.03252077443949688, | |
| "learning_rate": 2.5480000000000004e-06, | |
| "loss": 2.4647, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.0255, | |
| "eval_loss": 2.473674774169922, | |
| "eval_runtime": 33.2492, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "eval_loss": 2.4734930992126465, | |
| "eval_runtime": 33.2934, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.0257, | |
| "eval_loss": 2.4735071659088135, | |
| "eval_runtime": 33.466, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.0258, | |
| "eval_loss": 2.4733572006225586, | |
| "eval_runtime": 33.248, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.0259, | |
| "eval_loss": 2.4730312824249268, | |
| "eval_runtime": 33.3551, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 0.034740776600877266, | |
| "learning_rate": 2.598e-06, | |
| "loss": 2.4625, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "eval_loss": 2.4726204872131348, | |
| "eval_runtime": 33.3147, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.0261, | |
| "eval_loss": 2.4729621410369873, | |
| "eval_runtime": 33.3118, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.0262, | |
| "eval_loss": 2.4726085662841797, | |
| "eval_runtime": 33.4111, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.0263, | |
| "eval_loss": 2.4724133014678955, | |
| "eval_runtime": 33.3144, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.0264, | |
| "eval_loss": 2.471963405609131, | |
| "eval_runtime": 33.3272, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.0265, | |
| "grad_norm": 0.039738232523319775, | |
| "learning_rate": 2.648e-06, | |
| "loss": 2.4734, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.0265, | |
| "eval_loss": 2.4717814922332764, | |
| "eval_runtime": 33.2395, | |
| "eval_samples_per_second": 3.52, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.0266, | |
| "eval_loss": 2.471389055252075, | |
| "eval_runtime": 33.2159, | |
| "eval_samples_per_second": 3.522, | |
| "eval_steps_per_second": 1.776, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.0267, | |
| "eval_loss": 2.4711251258850098, | |
| "eval_runtime": 33.4193, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.0268, | |
| "eval_loss": 2.470979928970337, | |
| "eval_runtime": 33.2748, | |
| "eval_samples_per_second": 3.516, | |
| "eval_steps_per_second": 1.773, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.0269, | |
| "eval_loss": 2.4706759452819824, | |
| "eval_runtime": 33.3367, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 0.036968596903604725, | |
| "learning_rate": 2.6980000000000003e-06, | |
| "loss": 2.4642, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "eval_loss": 2.470658302307129, | |
| "eval_runtime": 33.3288, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.0271, | |
| "eval_loss": 2.4704952239990234, | |
| "eval_runtime": 33.3162, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.0272, | |
| "eval_loss": 2.470270872116089, | |
| "eval_runtime": 33.35, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.0273, | |
| "eval_loss": 2.4699764251708984, | |
| "eval_runtime": 33.3696, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.0274, | |
| "eval_loss": 2.469688653945923, | |
| "eval_runtime": 33.4143, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 0.03899590922475157, | |
| "learning_rate": 2.748e-06, | |
| "loss": 2.4579, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "eval_loss": 2.469435691833496, | |
| "eval_runtime": 33.34, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.0276, | |
| "eval_loss": 2.469395160675049, | |
| "eval_runtime": 33.2655, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.0277, | |
| "eval_loss": 2.46889328956604, | |
| "eval_runtime": 33.3344, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.0278, | |
| "eval_loss": 2.468695640563965, | |
| "eval_runtime": 33.4003, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.0279, | |
| "eval_loss": 2.4685797691345215, | |
| "eval_runtime": 33.252, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 0.03498385470366268, | |
| "learning_rate": 2.798e-06, | |
| "loss": 2.472, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "eval_loss": 2.468594789505005, | |
| "eval_runtime": 33.5555, | |
| "eval_samples_per_second": 3.487, | |
| "eval_steps_per_second": 1.758, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.0281, | |
| "eval_loss": 2.4685287475585938, | |
| "eval_runtime": 33.3147, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.0282, | |
| "eval_loss": 2.467956304550171, | |
| "eval_runtime": 33.3679, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.0283, | |
| "eval_loss": 2.467761993408203, | |
| "eval_runtime": 33.3242, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.0284, | |
| "eval_loss": 2.467660903930664, | |
| "eval_runtime": 33.3677, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.0285, | |
| "grad_norm": 0.03333480906358989, | |
| "learning_rate": 2.848e-06, | |
| "loss": 2.4676, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.0285, | |
| "eval_loss": 2.4673027992248535, | |
| "eval_runtime": 33.3388, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.0286, | |
| "eval_loss": 2.467072010040283, | |
| "eval_runtime": 33.3596, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.0287, | |
| "eval_loss": 2.4668517112731934, | |
| "eval_runtime": 33.5136, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "eval_loss": 2.4666786193847656, | |
| "eval_runtime": 33.3405, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.0289, | |
| "eval_loss": 2.4667794704437256, | |
| "eval_runtime": 33.3333, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 0.03480548121480933, | |
| "learning_rate": 2.8980000000000005e-06, | |
| "loss": 2.4524, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "eval_loss": 2.466280460357666, | |
| "eval_runtime": 33.4727, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.763, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.0291, | |
| "eval_loss": 2.4659922122955322, | |
| "eval_runtime": 33.3309, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.0292, | |
| "eval_loss": 2.4657278060913086, | |
| "eval_runtime": 33.326, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.0293, | |
| "eval_loss": 2.4654440879821777, | |
| "eval_runtime": 33.3457, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.0294, | |
| "eval_loss": 2.465367317199707, | |
| "eval_runtime": 33.2824, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.773, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.0295, | |
| "grad_norm": 0.03652712436191979, | |
| "learning_rate": 2.9480000000000004e-06, | |
| "loss": 2.466, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.0295, | |
| "eval_loss": 2.465318202972412, | |
| "eval_runtime": 33.3264, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.0296, | |
| "eval_loss": 2.465156316757202, | |
| "eval_runtime": 33.2661, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.0297, | |
| "eval_loss": 2.4648799896240234, | |
| "eval_runtime": 33.4782, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.762, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.0298, | |
| "eval_loss": 2.4646074771881104, | |
| "eval_runtime": 33.3194, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.0299, | |
| "eval_loss": 2.464465856552124, | |
| "eval_runtime": 33.3466, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.03778721361564108, | |
| "learning_rate": 2.9980000000000003e-06, | |
| "loss": 2.4684, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_loss": 2.464305877685547, | |
| "eval_runtime": 33.25, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0301, | |
| "eval_loss": 2.464261531829834, | |
| "eval_runtime": 33.3761, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.0302, | |
| "eval_loss": 2.464185953140259, | |
| "eval_runtime": 33.4957, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.761, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.0303, | |
| "eval_loss": 2.4639229774475098, | |
| "eval_runtime": 33.2475, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.0304, | |
| "eval_loss": 2.4636595249176025, | |
| "eval_runtime": 33.3124, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.0305, | |
| "grad_norm": 0.035809836530372154, | |
| "learning_rate": 3.0480000000000003e-06, | |
| "loss": 2.4631, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.0305, | |
| "eval_loss": 2.46356201171875, | |
| "eval_runtime": 33.3423, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.0306, | |
| "eval_loss": 2.463318347930908, | |
| "eval_runtime": 33.3917, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.0307, | |
| "eval_loss": 2.4631264209747314, | |
| "eval_runtime": 33.4053, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.0308, | |
| "eval_loss": 2.462981700897217, | |
| "eval_runtime": 33.2608, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.0309, | |
| "eval_loss": 2.462719202041626, | |
| "eval_runtime": 33.3259, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 0.05979367258550731, | |
| "learning_rate": 3.0980000000000007e-06, | |
| "loss": 2.46, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "eval_loss": 2.462733268737793, | |
| "eval_runtime": 33.3195, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.0311, | |
| "eval_loss": 2.4625959396362305, | |
| "eval_runtime": 33.3704, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.0312, | |
| "eval_loss": 2.462366819381714, | |
| "eval_runtime": 33.4047, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.0313, | |
| "eval_loss": 2.4618427753448486, | |
| "eval_runtime": 33.3896, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.0314, | |
| "eval_loss": 2.4616317749023438, | |
| "eval_runtime": 33.3414, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.0315, | |
| "grad_norm": 0.031804244667956116, | |
| "learning_rate": 3.1480000000000006e-06, | |
| "loss": 2.4477, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.0315, | |
| "eval_loss": 2.4615368843078613, | |
| "eval_runtime": 33.3548, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.0316, | |
| "eval_loss": 2.461198091506958, | |
| "eval_runtime": 33.2416, | |
| "eval_samples_per_second": 3.52, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.0317, | |
| "eval_loss": 2.4611523151397705, | |
| "eval_runtime": 33.3445, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.0318, | |
| "eval_loss": 2.4609127044677734, | |
| "eval_runtime": 33.3175, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.0319, | |
| "eval_loss": 2.4608800411224365, | |
| "eval_runtime": 33.3052, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.03365841309984822, | |
| "learning_rate": 3.198e-06, | |
| "loss": 2.4523, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_loss": 2.460757255554199, | |
| "eval_runtime": 33.2636, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.0321, | |
| "eval_loss": 2.4605917930603027, | |
| "eval_runtime": 33.4595, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.763, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.0322, | |
| "eval_loss": 2.4604575634002686, | |
| "eval_runtime": 33.2706, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.773, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.0323, | |
| "eval_loss": 2.4603111743927, | |
| "eval_runtime": 33.405, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.0324, | |
| "eval_loss": 2.460045337677002, | |
| "eval_runtime": 33.2598, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 0.03534600587541967, | |
| "learning_rate": 3.248e-06, | |
| "loss": 2.45, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "eval_loss": 2.460045099258423, | |
| "eval_runtime": 33.2663, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.0326, | |
| "eval_loss": 2.4599287509918213, | |
| "eval_runtime": 33.2545, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.0327, | |
| "eval_loss": 2.459611654281616, | |
| "eval_runtime": 33.4189, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.0328, | |
| "eval_loss": 2.4594151973724365, | |
| "eval_runtime": 33.284, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.773, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.0329, | |
| "eval_loss": 2.4589221477508545, | |
| "eval_runtime": 33.4033, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 0.032596527761614855, | |
| "learning_rate": 3.298e-06, | |
| "loss": 2.4422, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "eval_loss": 2.4589502811431885, | |
| "eval_runtime": 33.2986, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.0331, | |
| "eval_loss": 2.4588239192962646, | |
| "eval_runtime": 33.4046, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.0332, | |
| "eval_loss": 2.458603620529175, | |
| "eval_runtime": 33.3448, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.0333, | |
| "eval_loss": 2.458559513092041, | |
| "eval_runtime": 33.368, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.0334, | |
| "eval_loss": 2.458500862121582, | |
| "eval_runtime": 33.2335, | |
| "eval_samples_per_second": 3.521, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.0335, | |
| "grad_norm": 0.03339611698643194, | |
| "learning_rate": 3.348e-06, | |
| "loss": 2.447, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.0335, | |
| "eval_loss": 2.458252191543579, | |
| "eval_runtime": 33.3623, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.0336, | |
| "eval_loss": 2.4580931663513184, | |
| "eval_runtime": 33.2532, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.0337, | |
| "eval_loss": 2.4578795433044434, | |
| "eval_runtime": 33.3214, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.0338, | |
| "eval_loss": 2.4576218128204346, | |
| "eval_runtime": 33.248, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.0339, | |
| "eval_loss": 2.4576828479766846, | |
| "eval_runtime": 33.3499, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 0.03028181865357742, | |
| "learning_rate": 3.3980000000000003e-06, | |
| "loss": 2.4582, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "eval_loss": 2.457383155822754, | |
| "eval_runtime": 33.2574, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.0341, | |
| "eval_loss": 2.4572579860687256, | |
| "eval_runtime": 33.2947, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.0342, | |
| "eval_loss": 2.4584450721740723, | |
| "eval_runtime": 33.3296, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.0343, | |
| "eval_loss": 2.458603858947754, | |
| "eval_runtime": 33.3017, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.0344, | |
| "eval_loss": 2.4579555988311768, | |
| "eval_runtime": 33.292, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.0345, | |
| "grad_norm": 0.03734241446236971, | |
| "learning_rate": 3.4480000000000003e-06, | |
| "loss": 2.4501, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.0345, | |
| "eval_loss": 2.4574153423309326, | |
| "eval_runtime": 33.4313, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.0346, | |
| "eval_loss": 2.456867218017578, | |
| "eval_runtime": 33.2833, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.773, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.0347, | |
| "eval_loss": 2.4567270278930664, | |
| "eval_runtime": 33.3694, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.0348, | |
| "eval_loss": 2.456348180770874, | |
| "eval_runtime": 33.3416, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.0349, | |
| "eval_loss": 2.4563136100769043, | |
| "eval_runtime": 33.3531, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.030782538004837847, | |
| "learning_rate": 3.4980000000000002e-06, | |
| "loss": 2.4509, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "eval_loss": 2.455827236175537, | |
| "eval_runtime": 33.3143, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.0351, | |
| "eval_loss": 2.4558639526367188, | |
| "eval_runtime": 33.3716, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "eval_loss": 2.4555938243865967, | |
| "eval_runtime": 33.2966, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.0353, | |
| "eval_loss": 2.4551546573638916, | |
| "eval_runtime": 33.3145, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.0354, | |
| "eval_loss": 2.454957962036133, | |
| "eval_runtime": 33.3201, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.0355, | |
| "grad_norm": 0.03281862515471333, | |
| "learning_rate": 3.548e-06, | |
| "loss": 2.4439, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.0355, | |
| "eval_loss": 2.455031394958496, | |
| "eval_runtime": 33.264, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.0356, | |
| "eval_loss": 2.4550724029541016, | |
| "eval_runtime": 33.3734, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.0357, | |
| "eval_loss": 2.454719305038452, | |
| "eval_runtime": 33.3267, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.0358, | |
| "eval_loss": 2.4547033309936523, | |
| "eval_runtime": 33.2651, | |
| "eval_samples_per_second": 3.517, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.0359, | |
| "eval_loss": 2.454416275024414, | |
| "eval_runtime": 33.3612, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 0.031756006482001914, | |
| "learning_rate": 3.5980000000000005e-06, | |
| "loss": 2.4493, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "eval_loss": 2.454286813735962, | |
| "eval_runtime": 33.326, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.0361, | |
| "eval_loss": 2.4541101455688477, | |
| "eval_runtime": 33.2597, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.0362, | |
| "eval_loss": 2.4541351795196533, | |
| "eval_runtime": 33.2421, | |
| "eval_samples_per_second": 3.52, | |
| "eval_steps_per_second": 1.775, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.0363, | |
| "eval_loss": 2.4537973403930664, | |
| "eval_runtime": 33.3201, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.0364, | |
| "eval_loss": 2.4534847736358643, | |
| "eval_runtime": 33.2973, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.0365, | |
| "grad_norm": 0.03128096989289917, | |
| "learning_rate": 3.6480000000000005e-06, | |
| "loss": 2.4526, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.0365, | |
| "eval_loss": 2.453655481338501, | |
| "eval_runtime": 33.3755, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.0366, | |
| "eval_loss": 2.4534049034118652, | |
| "eval_runtime": 33.332, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.0367, | |
| "eval_loss": 2.4529781341552734, | |
| "eval_runtime": 33.3325, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.0368, | |
| "eval_loss": 2.454005241394043, | |
| "eval_runtime": 33.3975, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.767, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.0369, | |
| "eval_loss": 2.4538745880126953, | |
| "eval_runtime": 33.3, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 0.02999582338402207, | |
| "learning_rate": 3.6980000000000004e-06, | |
| "loss": 2.4309, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "eval_loss": 2.4534404277801514, | |
| "eval_runtime": 33.2825, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.773, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.0371, | |
| "eval_loss": 2.4529800415039062, | |
| "eval_runtime": 33.513, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.0372, | |
| "eval_loss": 2.453007221221924, | |
| "eval_runtime": 33.3414, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.0373, | |
| "eval_loss": 2.452350616455078, | |
| "eval_runtime": 33.3625, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.0374, | |
| "eval_loss": 2.4522666931152344, | |
| "eval_runtime": 33.3116, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.0409025592520596, | |
| "learning_rate": 3.7480000000000004e-06, | |
| "loss": 2.442, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "eval_loss": 2.4521546363830566, | |
| "eval_runtime": 33.3782, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.0376, | |
| "eval_loss": 2.4520437717437744, | |
| "eval_runtime": 33.2887, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.0377, | |
| "eval_loss": 2.4519331455230713, | |
| "eval_runtime": 33.3746, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.0378, | |
| "eval_loss": 2.451744556427002, | |
| "eval_runtime": 33.3214, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.0379, | |
| "eval_loss": 2.451737642288208, | |
| "eval_runtime": 33.3457, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 0.03431980647954774, | |
| "learning_rate": 3.7980000000000007e-06, | |
| "loss": 2.4477, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "eval_loss": 2.4515624046325684, | |
| "eval_runtime": 33.312, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0381, | |
| "eval_loss": 2.4512295722961426, | |
| "eval_runtime": 33.3607, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.0382, | |
| "eval_loss": 2.4510445594787598, | |
| "eval_runtime": 33.339, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.0383, | |
| "eval_loss": 2.4508397579193115, | |
| "eval_runtime": 33.3996, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "eval_loss": 2.4510440826416016, | |
| "eval_runtime": 33.2905, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.0385, | |
| "grad_norm": 0.03587224652231601, | |
| "learning_rate": 3.848e-06, | |
| "loss": 2.4433, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.0385, | |
| "eval_loss": 2.450984239578247, | |
| "eval_runtime": 33.3263, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.0386, | |
| "eval_loss": 2.45090651512146, | |
| "eval_runtime": 33.3244, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.0387, | |
| "eval_loss": 2.450443983078003, | |
| "eval_runtime": 33.3023, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.772, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.0388, | |
| "eval_loss": 2.450309991836548, | |
| "eval_runtime": 33.4354, | |
| "eval_samples_per_second": 3.499, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.0389, | |
| "eval_loss": 2.4500510692596436, | |
| "eval_runtime": 33.3238, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 0.027239293031380653, | |
| "learning_rate": 3.898e-06, | |
| "loss": 2.4347, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "eval_loss": 2.4498231410980225, | |
| "eval_runtime": 33.3306, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.0391, | |
| "eval_loss": 2.449704170227051, | |
| "eval_runtime": 33.3865, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.0392, | |
| "eval_loss": 2.44974684715271, | |
| "eval_runtime": 33.419, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.0393, | |
| "eval_loss": 2.450090169906616, | |
| "eval_runtime": 33.5315, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.76, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.0394, | |
| "eval_loss": 2.4494845867156982, | |
| "eval_runtime": 33.4607, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.763, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.0395, | |
| "grad_norm": 0.031553482039351585, | |
| "learning_rate": 3.948e-06, | |
| "loss": 2.4466, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.0395, | |
| "eval_loss": 2.449598550796509, | |
| "eval_runtime": 33.4853, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.0396, | |
| "eval_loss": 2.449420213699341, | |
| "eval_runtime": 33.4626, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.0397, | |
| "eval_loss": 2.449462890625, | |
| "eval_runtime": 33.4049, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.0398, | |
| "eval_loss": 2.449423313140869, | |
| "eval_runtime": 33.5823, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.0399, | |
| "eval_loss": 2.4491324424743652, | |
| "eval_runtime": 33.662, | |
| "eval_samples_per_second": 3.476, | |
| "eval_steps_per_second": 1.753, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.03314009226524554, | |
| "learning_rate": 3.9980000000000005e-06, | |
| "loss": 2.4391, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 2.449084520339966, | |
| "eval_runtime": 33.5872, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.0401, | |
| "eval_loss": 2.449021577835083, | |
| "eval_runtime": 33.5048, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.0402, | |
| "eval_loss": 2.449159622192383, | |
| "eval_runtime": 33.4845, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.0403, | |
| "eval_loss": 2.448726177215576, | |
| "eval_runtime": 33.9926, | |
| "eval_samples_per_second": 3.442, | |
| "eval_steps_per_second": 1.736, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.0404, | |
| "eval_loss": 2.4484922885894775, | |
| "eval_runtime": 33.6594, | |
| "eval_samples_per_second": 3.476, | |
| "eval_steps_per_second": 1.753, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.0405, | |
| "grad_norm": 0.029877786947315705, | |
| "learning_rate": 4.048e-06, | |
| "loss": 2.438, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.0405, | |
| "eval_loss": 2.4485254287719727, | |
| "eval_runtime": 33.6812, | |
| "eval_samples_per_second": 3.474, | |
| "eval_steps_per_second": 1.752, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.0406, | |
| "eval_loss": 2.448495388031006, | |
| "eval_runtime": 33.9733, | |
| "eval_samples_per_second": 3.444, | |
| "eval_steps_per_second": 1.737, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.0407, | |
| "eval_loss": 2.4482643604278564, | |
| "eval_runtime": 33.9957, | |
| "eval_samples_per_second": 3.442, | |
| "eval_steps_per_second": 1.736, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.0408, | |
| "eval_loss": 2.4481942653656006, | |
| "eval_runtime": 34.3014, | |
| "eval_samples_per_second": 3.411, | |
| "eval_steps_per_second": 1.72, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.0409, | |
| "eval_loss": 2.448082208633423, | |
| "eval_runtime": 34.0411, | |
| "eval_samples_per_second": 3.437, | |
| "eval_steps_per_second": 1.733, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 0.031175983773220776, | |
| "learning_rate": 4.098e-06, | |
| "loss": 2.4332, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "eval_loss": 2.4478490352630615, | |
| "eval_runtime": 33.9245, | |
| "eval_samples_per_second": 3.449, | |
| "eval_steps_per_second": 1.739, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.0411, | |
| "eval_loss": 2.4480035305023193, | |
| "eval_runtime": 34.0079, | |
| "eval_samples_per_second": 3.44, | |
| "eval_steps_per_second": 1.735, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.0412, | |
| "eval_loss": 2.447685718536377, | |
| "eval_runtime": 33.999, | |
| "eval_samples_per_second": 3.441, | |
| "eval_steps_per_second": 1.735, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.0413, | |
| "eval_loss": 2.447507619857788, | |
| "eval_runtime": 34.1446, | |
| "eval_samples_per_second": 3.427, | |
| "eval_steps_per_second": 1.728, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.0414, | |
| "eval_loss": 2.447322130203247, | |
| "eval_runtime": 33.7479, | |
| "eval_samples_per_second": 3.467, | |
| "eval_steps_per_second": 1.748, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.0415, | |
| "grad_norm": 0.02904850084773878, | |
| "learning_rate": 4.148000000000001e-06, | |
| "loss": 2.4481, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.0415, | |
| "eval_loss": 2.4471347332000732, | |
| "eval_runtime": 33.917, | |
| "eval_samples_per_second": 3.45, | |
| "eval_steps_per_second": 1.74, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "eval_loss": 2.447152853012085, | |
| "eval_runtime": 33.8287, | |
| "eval_samples_per_second": 3.459, | |
| "eval_steps_per_second": 1.744, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.0417, | |
| "eval_loss": 2.4469242095947266, | |
| "eval_runtime": 33.7591, | |
| "eval_samples_per_second": 3.466, | |
| "eval_steps_per_second": 1.748, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.0418, | |
| "eval_loss": 2.4471774101257324, | |
| "eval_runtime": 33.7879, | |
| "eval_samples_per_second": 3.463, | |
| "eval_steps_per_second": 1.746, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.0419, | |
| "eval_loss": 2.447988986968994, | |
| "eval_runtime": 33.6878, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.751, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 0.033838990669225626, | |
| "learning_rate": 4.198e-06, | |
| "loss": 2.4386, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "eval_loss": 2.4477100372314453, | |
| "eval_runtime": 33.6345, | |
| "eval_samples_per_second": 3.479, | |
| "eval_steps_per_second": 1.754, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.0421, | |
| "eval_loss": 2.447394847869873, | |
| "eval_runtime": 33.6221, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.0422, | |
| "eval_loss": 2.4470951557159424, | |
| "eval_runtime": 33.6689, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.752, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.0423, | |
| "eval_loss": 2.4467623233795166, | |
| "eval_runtime": 33.6979, | |
| "eval_samples_per_second": 3.472, | |
| "eval_steps_per_second": 1.751, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.0424, | |
| "eval_loss": 2.4469833374023438, | |
| "eval_runtime": 33.8632, | |
| "eval_samples_per_second": 3.455, | |
| "eval_steps_per_second": 1.742, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 0.0382703849144026, | |
| "learning_rate": 4.248000000000001e-06, | |
| "loss": 2.4313, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "eval_loss": 2.447753667831421, | |
| "eval_runtime": 33.7269, | |
| "eval_samples_per_second": 3.469, | |
| "eval_steps_per_second": 1.749, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.0426, | |
| "eval_loss": 2.447281837463379, | |
| "eval_runtime": 33.7037, | |
| "eval_samples_per_second": 3.471, | |
| "eval_steps_per_second": 1.751, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.0427, | |
| "eval_loss": 2.4472267627716064, | |
| "eval_runtime": 33.6873, | |
| "eval_samples_per_second": 3.473, | |
| "eval_steps_per_second": 1.751, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.0428, | |
| "eval_loss": 2.446859836578369, | |
| "eval_runtime": 33.6738, | |
| "eval_samples_per_second": 3.475, | |
| "eval_steps_per_second": 1.752, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.0429, | |
| "eval_loss": 2.446655035018921, | |
| "eval_runtime": 33.6536, | |
| "eval_samples_per_second": 3.477, | |
| "eval_steps_per_second": 1.753, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 0.027126678960545086, | |
| "learning_rate": 4.298e-06, | |
| "loss": 2.4298, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "eval_loss": 2.4463651180267334, | |
| "eval_runtime": 33.6454, | |
| "eval_samples_per_second": 3.477, | |
| "eval_steps_per_second": 1.754, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.0431, | |
| "eval_loss": 2.4461581707000732, | |
| "eval_runtime": 33.6166, | |
| "eval_samples_per_second": 3.48, | |
| "eval_steps_per_second": 1.755, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.0432, | |
| "eval_loss": 2.4461660385131836, | |
| "eval_runtime": 33.5484, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.0433, | |
| "eval_loss": 2.4458513259887695, | |
| "eval_runtime": 33.6579, | |
| "eval_samples_per_second": 3.476, | |
| "eval_steps_per_second": 1.753, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.0434, | |
| "eval_loss": 2.4454855918884277, | |
| "eval_runtime": 33.5647, | |
| "eval_samples_per_second": 3.486, | |
| "eval_steps_per_second": 1.758, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.0435, | |
| "grad_norm": 0.030565328679921875, | |
| "learning_rate": 4.3480000000000006e-06, | |
| "loss": 2.4387, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.0435, | |
| "eval_loss": 2.445688009262085, | |
| "eval_runtime": 33.5164, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.0436, | |
| "eval_loss": 2.4456729888916016, | |
| "eval_runtime": 33.4724, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.0437, | |
| "eval_loss": 2.4460015296936035, | |
| "eval_runtime": 33.3984, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.0438, | |
| "eval_loss": 2.4460256099700928, | |
| "eval_runtime": 33.4582, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.0439, | |
| "eval_loss": 2.4456872940063477, | |
| "eval_runtime": 33.444, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 0.03864046787827566, | |
| "learning_rate": 4.398000000000001e-06, | |
| "loss": 2.445, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "eval_loss": 2.4454870223999023, | |
| "eval_runtime": 33.4474, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.0441, | |
| "eval_loss": 2.4453113079071045, | |
| "eval_runtime": 33.4062, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.0442, | |
| "eval_loss": 2.4448771476745605, | |
| "eval_runtime": 33.3542, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.0443, | |
| "eval_loss": 2.444946765899658, | |
| "eval_runtime": 33.3997, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.0444, | |
| "eval_loss": 2.445194959640503, | |
| "eval_runtime": 33.3669, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.0445, | |
| "grad_norm": 0.026792091668494698, | |
| "learning_rate": 4.4480000000000004e-06, | |
| "loss": 2.4339, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.0445, | |
| "eval_loss": 2.445009469985962, | |
| "eval_runtime": 33.4467, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.0446, | |
| "eval_loss": 2.4450981616973877, | |
| "eval_runtime": 33.4513, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.0447, | |
| "eval_loss": 2.444899082183838, | |
| "eval_runtime": 33.3869, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "eval_loss": 2.4448494911193848, | |
| "eval_runtime": 33.486, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.0449, | |
| "eval_loss": 2.444640636444092, | |
| "eval_runtime": 33.4202, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.027104711228224686, | |
| "learning_rate": 4.498e-06, | |
| "loss": 2.4326, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "eval_loss": 2.444633722305298, | |
| "eval_runtime": 33.4154, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.0451, | |
| "eval_loss": 2.44467830657959, | |
| "eval_runtime": 33.4237, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.0452, | |
| "eval_loss": 2.444413900375366, | |
| "eval_runtime": 33.3694, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.0453, | |
| "eval_loss": 2.444222927093506, | |
| "eval_runtime": 33.3585, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.0454, | |
| "eval_loss": 2.444108724594116, | |
| "eval_runtime": 33.3346, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.0455, | |
| "grad_norm": 0.033569645173308425, | |
| "learning_rate": 4.548e-06, | |
| "loss": 2.4342, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.0455, | |
| "eval_loss": 2.443859577178955, | |
| "eval_runtime": 33.3636, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.0456, | |
| "eval_loss": 2.4441120624542236, | |
| "eval_runtime": 33.2442, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.775, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.0457, | |
| "eval_loss": 2.4439260959625244, | |
| "eval_runtime": 33.2924, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.0458, | |
| "eval_loss": 2.4439032077789307, | |
| "eval_runtime": 33.4004, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.0459, | |
| "eval_loss": 2.443621873855591, | |
| "eval_runtime": 33.3314, | |
| "eval_samples_per_second": 3.51, | |
| "eval_steps_per_second": 1.77, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 0.02648413187023774, | |
| "learning_rate": 4.598e-06, | |
| "loss": 2.4368, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "eval_loss": 2.4436306953430176, | |
| "eval_runtime": 33.372, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.0461, | |
| "eval_loss": 2.4436404705047607, | |
| "eval_runtime": 33.3039, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.772, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.0462, | |
| "eval_loss": 2.44333815574646, | |
| "eval_runtime": 33.3059, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.771, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.0463, | |
| "eval_loss": 2.443415880203247, | |
| "eval_runtime": 33.4065, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.0464, | |
| "eval_loss": 2.443068742752075, | |
| "eval_runtime": 33.2818, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.773, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.0465, | |
| "grad_norm": 0.0351440602227012, | |
| "learning_rate": 4.648e-06, | |
| "loss": 2.4381, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.0465, | |
| "eval_loss": 2.443199634552002, | |
| "eval_runtime": 33.3538, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.0466, | |
| "eval_loss": 2.4433047771453857, | |
| "eval_runtime": 33.4816, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.0467, | |
| "eval_loss": 2.443272113800049, | |
| "eval_runtime": 33.5015, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.0468, | |
| "eval_loss": 2.443246603012085, | |
| "eval_runtime": 33.5753, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.0469, | |
| "eval_loss": 2.4432363510131836, | |
| "eval_runtime": 33.2869, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.772, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 0.02695670446644145, | |
| "learning_rate": 4.698000000000001e-06, | |
| "loss": 2.4303, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "eval_loss": 2.4429421424865723, | |
| "eval_runtime": 33.3556, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.0471, | |
| "eval_loss": 2.4427566528320312, | |
| "eval_runtime": 33.3612, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.0472, | |
| "eval_loss": 2.4425995349884033, | |
| "eval_runtime": 33.353, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.0473, | |
| "eval_loss": 2.4426395893096924, | |
| "eval_runtime": 33.4669, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.0474, | |
| "eval_loss": 2.4425301551818848, | |
| "eval_runtime": 33.3803, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 0.031232764672567994, | |
| "learning_rate": 4.748e-06, | |
| "loss": 2.4284, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "eval_loss": 2.4426214694976807, | |
| "eval_runtime": 33.3013, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.772, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.0476, | |
| "eval_loss": 2.442599296569824, | |
| "eval_runtime": 33.3419, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.77, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.0477, | |
| "eval_loss": 2.442364454269409, | |
| "eval_runtime": 33.3677, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.0478, | |
| "eval_loss": 2.4425458908081055, | |
| "eval_runtime": 33.3892, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.0479, | |
| "eval_loss": 2.4425549507141113, | |
| "eval_runtime": 33.4202, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.027127721086561404, | |
| "learning_rate": 4.7980000000000005e-06, | |
| "loss": 2.4291, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "eval_loss": 2.4425251483917236, | |
| "eval_runtime": 33.3802, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.0481, | |
| "eval_loss": 2.4424123764038086, | |
| "eval_runtime": 33.3283, | |
| "eval_samples_per_second": 3.511, | |
| "eval_steps_per_second": 1.77, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.0482, | |
| "eval_loss": 2.4421849250793457, | |
| "eval_runtime": 33.4172, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.0483, | |
| "eval_loss": 2.4419970512390137, | |
| "eval_runtime": 33.4642, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.0484, | |
| "eval_loss": 2.4419567584991455, | |
| "eval_runtime": 33.3663, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.0485, | |
| "grad_norm": 0.026032952013136927, | |
| "learning_rate": 4.848000000000001e-06, | |
| "loss": 2.4256, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.0485, | |
| "eval_loss": 2.441688299179077, | |
| "eval_runtime": 33.3169, | |
| "eval_samples_per_second": 3.512, | |
| "eval_steps_per_second": 1.771, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.0486, | |
| "eval_loss": 2.4417548179626465, | |
| "eval_runtime": 33.3476, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.0487, | |
| "eval_loss": 2.441769599914551, | |
| "eval_runtime": 33.4488, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.0488, | |
| "eval_loss": 2.4415283203125, | |
| "eval_runtime": 33.4555, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.0489, | |
| "eval_loss": 2.4416847229003906, | |
| "eval_runtime": 33.2459, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 1.775, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 0.02804626155591942, | |
| "learning_rate": 4.898e-06, | |
| "loss": 2.4334, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "eval_loss": 2.4414188861846924, | |
| "eval_runtime": 33.2989, | |
| "eval_samples_per_second": 3.514, | |
| "eval_steps_per_second": 1.772, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.0491, | |
| "eval_loss": 2.4416472911834717, | |
| "eval_runtime": 33.3676, | |
| "eval_samples_per_second": 3.506, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.0492, | |
| "eval_loss": 2.4414844512939453, | |
| "eval_runtime": 33.4116, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.0493, | |
| "eval_loss": 2.441408395767212, | |
| "eval_runtime": 33.6104, | |
| "eval_samples_per_second": 3.481, | |
| "eval_steps_per_second": 1.755, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.0494, | |
| "eval_loss": 2.4413650035858154, | |
| "eval_runtime": 33.3838, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.0495, | |
| "grad_norm": 0.025351866385684634, | |
| "learning_rate": 4.948000000000001e-06, | |
| "loss": 2.4356, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.0495, | |
| "eval_loss": 2.4411768913269043, | |
| "eval_runtime": 33.3857, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.0496, | |
| "eval_loss": 2.441201686859131, | |
| "eval_runtime": 33.4117, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.0497, | |
| "eval_loss": 2.4408698081970215, | |
| "eval_runtime": 33.3015, | |
| "eval_samples_per_second": 3.513, | |
| "eval_steps_per_second": 1.772, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.0498, | |
| "eval_loss": 2.440950393676758, | |
| "eval_runtime": 33.379, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.768, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.0499, | |
| "eval_loss": 2.4407267570495605, | |
| "eval_runtime": 33.2561, | |
| "eval_samples_per_second": 3.518, | |
| "eval_steps_per_second": 1.774, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.029743600833546286, | |
| "learning_rate": 4.998e-06, | |
| "loss": 2.4369, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 2.4408068656921387, | |
| "eval_runtime": 33.3807, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.0501, | |
| "eval_loss": 2.4407401084899902, | |
| "eval_runtime": 33.2295, | |
| "eval_samples_per_second": 3.521, | |
| "eval_steps_per_second": 1.776, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.0502, | |
| "eval_loss": 2.4409286975860596, | |
| "eval_runtime": 33.3925, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.0503, | |
| "eval_loss": 2.4407782554626465, | |
| "eval_runtime": 33.4498, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.0504, | |
| "eval_loss": 2.4407856464385986, | |
| "eval_runtime": 33.4899, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.0505, | |
| "grad_norm": 0.027292319342276494, | |
| "learning_rate": 5.048000000000001e-06, | |
| "loss": 2.4263, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.0505, | |
| "eval_loss": 2.440830945968628, | |
| "eval_runtime": 33.3428, | |
| "eval_samples_per_second": 3.509, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.0506, | |
| "eval_loss": 2.44069504737854, | |
| "eval_runtime": 33.2895, | |
| "eval_samples_per_second": 3.515, | |
| "eval_steps_per_second": 1.772, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.0507, | |
| "eval_loss": 2.4408159255981445, | |
| "eval_runtime": 33.3488, | |
| "eval_samples_per_second": 3.508, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.0508, | |
| "eval_loss": 2.440523386001587, | |
| "eval_runtime": 33.3582, | |
| "eval_samples_per_second": 3.507, | |
| "eval_steps_per_second": 1.769, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.0509, | |
| "eval_loss": 2.4403724670410156, | |
| "eval_runtime": 33.5287, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 0.02495087994166461, | |
| "learning_rate": 5.098000000000001e-06, | |
| "loss": 2.428, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "eval_loss": 2.440495252609253, | |
| "eval_runtime": 34.4575, | |
| "eval_samples_per_second": 3.395, | |
| "eval_steps_per_second": 1.712, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.0511, | |
| "eval_loss": 2.440384864807129, | |
| "eval_runtime": 34.0144, | |
| "eval_samples_per_second": 3.44, | |
| "eval_steps_per_second": 1.735, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "eval_loss": 2.4405176639556885, | |
| "eval_runtime": 34.5852, | |
| "eval_samples_per_second": 3.383, | |
| "eval_steps_per_second": 1.706, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.0513, | |
| "eval_loss": 2.4402472972869873, | |
| "eval_runtime": 34.2689, | |
| "eval_samples_per_second": 3.414, | |
| "eval_steps_per_second": 1.722, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.0514, | |
| "eval_loss": 2.440459966659546, | |
| "eval_runtime": 33.3821, | |
| "eval_samples_per_second": 3.505, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.0515, | |
| "grad_norm": 0.029728034222700407, | |
| "learning_rate": 5.1480000000000005e-06, | |
| "loss": 2.439, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.0515, | |
| "eval_loss": 2.440525531768799, | |
| "eval_runtime": 34.3072, | |
| "eval_samples_per_second": 3.41, | |
| "eval_steps_per_second": 1.72, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.0516, | |
| "eval_loss": 2.440373420715332, | |
| "eval_runtime": 33.5748, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.0517, | |
| "eval_loss": 2.4405770301818848, | |
| "eval_runtime": 35.2655, | |
| "eval_samples_per_second": 3.318, | |
| "eval_steps_per_second": 1.673, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.0518, | |
| "eval_loss": 2.4402198791503906, | |
| "eval_runtime": 34.9918, | |
| "eval_samples_per_second": 3.344, | |
| "eval_steps_per_second": 1.686, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.0519, | |
| "eval_loss": 2.440136194229126, | |
| "eval_runtime": 33.4873, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 0.02473354917836018, | |
| "learning_rate": 5.198000000000001e-06, | |
| "loss": 2.427, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "eval_loss": 2.440282106399536, | |
| "eval_runtime": 33.4628, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.0521, | |
| "eval_loss": 2.440448045730591, | |
| "eval_runtime": 33.4191, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.0522, | |
| "eval_loss": 2.440248966217041, | |
| "eval_runtime": 33.4911, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.0523, | |
| "eval_loss": 2.440030336380005, | |
| "eval_runtime": 33.4921, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.0524, | |
| "eval_loss": 2.4397685527801514, | |
| "eval_runtime": 33.4491, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 0.026533778128592735, | |
| "learning_rate": 5.248000000000001e-06, | |
| "loss": 2.4214, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "eval_loss": 2.43971848487854, | |
| "eval_runtime": 33.3975, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.0526, | |
| "eval_loss": 2.4398951530456543, | |
| "eval_runtime": 33.4912, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.0527, | |
| "eval_loss": 2.43975830078125, | |
| "eval_runtime": 33.4071, | |
| "eval_samples_per_second": 3.502, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.0528, | |
| "eval_loss": 2.439666271209717, | |
| "eval_runtime": 33.4208, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.0529, | |
| "eval_loss": 2.439816951751709, | |
| "eval_runtime": 33.5111, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 0.024723120971366967, | |
| "learning_rate": 5.298000000000001e-06, | |
| "loss": 2.4241, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "eval_loss": 2.4398183822631836, | |
| "eval_runtime": 33.506, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.0531, | |
| "eval_loss": 2.4402668476104736, | |
| "eval_runtime": 34.1298, | |
| "eval_samples_per_second": 3.428, | |
| "eval_steps_per_second": 1.729, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.0532, | |
| "eval_loss": 2.4400885105133057, | |
| "eval_runtime": 33.436, | |
| "eval_samples_per_second": 3.499, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.0533, | |
| "eval_loss": 2.439871311187744, | |
| "eval_runtime": 33.3874, | |
| "eval_samples_per_second": 3.504, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.0534, | |
| "eval_loss": 2.4393365383148193, | |
| "eval_runtime": 33.5258, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.0535, | |
| "grad_norm": 0.02173239513971497, | |
| "learning_rate": 5.348000000000001e-06, | |
| "loss": 2.4295, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.0535, | |
| "eval_loss": 2.439133405685425, | |
| "eval_runtime": 33.4962, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.0536, | |
| "eval_loss": 2.439093589782715, | |
| "eval_runtime": 33.4708, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.0537, | |
| "eval_loss": 2.439096212387085, | |
| "eval_runtime": 33.4284, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.0538, | |
| "eval_loss": 2.4389584064483643, | |
| "eval_runtime": 33.4749, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.0539, | |
| "eval_loss": 2.438805103302002, | |
| "eval_runtime": 33.478, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 0.023851331909406925, | |
| "learning_rate": 5.398e-06, | |
| "loss": 2.4302, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "eval_loss": 2.4386403560638428, | |
| "eval_runtime": 33.4276, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.0541, | |
| "eval_loss": 2.438568115234375, | |
| "eval_runtime": 33.528, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.0542, | |
| "eval_loss": 2.438894510269165, | |
| "eval_runtime": 33.5228, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.0543, | |
| "eval_loss": 2.4387168884277344, | |
| "eval_runtime": 33.4663, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "eval_loss": 2.4385879039764404, | |
| "eval_runtime": 33.513, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.0545, | |
| "grad_norm": 0.02728082451264937, | |
| "learning_rate": 5.448e-06, | |
| "loss": 2.4308, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.0545, | |
| "eval_loss": 2.4388349056243896, | |
| "eval_runtime": 33.4525, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.0546, | |
| "eval_loss": 2.438887357711792, | |
| "eval_runtime": 33.428, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.0547, | |
| "eval_loss": 2.438713312149048, | |
| "eval_runtime": 33.5229, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.0548, | |
| "eval_loss": 2.438657283782959, | |
| "eval_runtime": 33.4169, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.0549, | |
| "eval_loss": 2.438544988632202, | |
| "eval_runtime": 33.4944, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.025461121075693184, | |
| "learning_rate": 5.498e-06, | |
| "loss": 2.4379, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "eval_loss": 2.4386098384857178, | |
| "eval_runtime": 33.6782, | |
| "eval_samples_per_second": 3.474, | |
| "eval_steps_per_second": 1.752, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.0551, | |
| "eval_loss": 2.438521146774292, | |
| "eval_runtime": 33.5161, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.0552, | |
| "eval_loss": 2.438474178314209, | |
| "eval_runtime": 33.4773, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.0553, | |
| "eval_loss": 2.4382379055023193, | |
| "eval_runtime": 33.4869, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.0554, | |
| "eval_loss": 2.438157796859741, | |
| "eval_runtime": 33.543, | |
| "eval_samples_per_second": 3.488, | |
| "eval_steps_per_second": 1.759, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.0555, | |
| "grad_norm": 0.0234055445054481, | |
| "learning_rate": 5.548e-06, | |
| "loss": 2.4326, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.0555, | |
| "eval_loss": 2.438048839569092, | |
| "eval_runtime": 33.5073, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.0556, | |
| "eval_loss": 2.4379706382751465, | |
| "eval_runtime": 33.4567, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.0557, | |
| "eval_loss": 2.4379332065582275, | |
| "eval_runtime": 33.5172, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.0558, | |
| "eval_loss": 2.4380111694335938, | |
| "eval_runtime": 33.5913, | |
| "eval_samples_per_second": 3.483, | |
| "eval_steps_per_second": 1.756, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.0559, | |
| "eval_loss": 2.4379403591156006, | |
| "eval_runtime": 33.5223, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.024691045411267393, | |
| "learning_rate": 5.5980000000000004e-06, | |
| "loss": 2.4297, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "eval_loss": 2.43778657913208, | |
| "eval_runtime": 33.524, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.0561, | |
| "eval_loss": 2.4376559257507324, | |
| "eval_runtime": 33.58, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.0562, | |
| "eval_loss": 2.437596559524536, | |
| "eval_runtime": 33.5756, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.0563, | |
| "eval_loss": 2.437690496444702, | |
| "eval_runtime": 33.5056, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.0564, | |
| "eval_loss": 2.437558174133301, | |
| "eval_runtime": 33.4948, | |
| "eval_samples_per_second": 3.493, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.0565, | |
| "grad_norm": 0.02500330428035899, | |
| "learning_rate": 5.648e-06, | |
| "loss": 2.4281, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.0565, | |
| "eval_loss": 2.437875747680664, | |
| "eval_runtime": 33.4492, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.0566, | |
| "eval_loss": 2.438183546066284, | |
| "eval_runtime": 33.5208, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.0567, | |
| "eval_loss": 2.4375228881835938, | |
| "eval_runtime": 33.5319, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.0568, | |
| "eval_loss": 2.437365770339966, | |
| "eval_runtime": 33.4734, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.0569, | |
| "eval_loss": 2.4376399517059326, | |
| "eval_runtime": 33.4578, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 0.023953363978697285, | |
| "learning_rate": 5.698e-06, | |
| "loss": 2.4341, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "eval_loss": 2.437318801879883, | |
| "eval_runtime": 33.4551, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.0571, | |
| "eval_loss": 2.437349319458008, | |
| "eval_runtime": 33.4482, | |
| "eval_samples_per_second": 3.498, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.0572, | |
| "eval_loss": 2.437500476837158, | |
| "eval_runtime": 33.5179, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.0573, | |
| "eval_loss": 2.4371414184570312, | |
| "eval_runtime": 33.4246, | |
| "eval_samples_per_second": 3.5, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.0574, | |
| "eval_loss": 2.4371588230133057, | |
| "eval_runtime": 33.5686, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.758, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 0.023037224733864405, | |
| "learning_rate": 5.748e-06, | |
| "loss": 2.4201, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "eval_loss": 2.4373178482055664, | |
| "eval_runtime": 33.4813, | |
| "eval_samples_per_second": 3.494, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "eval_loss": 2.4371204376220703, | |
| "eval_runtime": 33.5096, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.0577, | |
| "eval_loss": 2.43719482421875, | |
| "eval_runtime": 33.4709, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.0578, | |
| "eval_loss": 2.4369635581970215, | |
| "eval_runtime": 33.5125, | |
| "eval_samples_per_second": 3.491, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.0579, | |
| "eval_loss": 2.4367122650146484, | |
| "eval_runtime": 33.5349, | |
| "eval_samples_per_second": 3.489, | |
| "eval_steps_per_second": 1.759, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 0.023843041578218274, | |
| "learning_rate": 5.798e-06, | |
| "loss": 2.4322, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "eval_loss": 2.436885118484497, | |
| "eval_runtime": 33.5038, | |
| "eval_samples_per_second": 3.492, | |
| "eval_steps_per_second": 1.761, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.0581, | |
| "eval_loss": 2.4368388652801514, | |
| "eval_runtime": 33.4337, | |
| "eval_samples_per_second": 3.499, | |
| "eval_steps_per_second": 1.765, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.0582, | |
| "eval_loss": 2.436776638031006, | |
| "eval_runtime": 33.5783, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.0583, | |
| "eval_loss": 2.4369046688079834, | |
| "eval_runtime": 33.5764, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.0584, | |
| "eval_loss": 2.4369351863861084, | |
| "eval_runtime": 33.5715, | |
| "eval_samples_per_second": 3.485, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.0585, | |
| "grad_norm": 0.030212978437899864, | |
| "learning_rate": 5.848000000000001e-06, | |
| "loss": 2.4318, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.0585, | |
| "eval_loss": 2.4367170333862305, | |
| "eval_runtime": 33.455, | |
| "eval_samples_per_second": 3.497, | |
| "eval_steps_per_second": 1.764, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.0586, | |
| "eval_loss": 2.4367101192474365, | |
| "eval_runtime": 33.3973, | |
| "eval_samples_per_second": 3.503, | |
| "eval_steps_per_second": 1.767, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.0587, | |
| "eval_loss": 2.436723470687866, | |
| "eval_runtime": 33.4183, | |
| "eval_samples_per_second": 3.501, | |
| "eval_steps_per_second": 1.766, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.0588, | |
| "eval_loss": 2.4368371963500977, | |
| "eval_runtime": 33.5269, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.0589, | |
| "eval_loss": 2.436763286590576, | |
| "eval_runtime": 33.4623, | |
| "eval_samples_per_second": 3.496, | |
| "eval_steps_per_second": 1.763, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 0.024293450378328845, | |
| "learning_rate": 5.898e-06, | |
| "loss": 2.4221, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "eval_loss": 2.436692714691162, | |
| "eval_runtime": 33.523, | |
| "eval_samples_per_second": 3.49, | |
| "eval_steps_per_second": 1.76, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.0591, | |
| "eval_loss": 2.436657667160034, | |
| "eval_runtime": 34.902, | |
| "eval_samples_per_second": 3.352, | |
| "eval_steps_per_second": 1.69, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.0592, | |
| "eval_loss": 2.436432123184204, | |
| "eval_runtime": 33.4808, | |
| "eval_samples_per_second": 3.495, | |
| "eval_steps_per_second": 1.762, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.0593, | |
| "eval_loss": 2.436782121658325, | |
| "eval_runtime": 34.5166, | |
| "eval_samples_per_second": 3.39, | |
| "eval_steps_per_second": 1.709, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.0594, | |
| "eval_loss": 2.4366602897644043, | |
| "eval_runtime": 33.7416, | |
| "eval_samples_per_second": 3.468, | |
| "eval_steps_per_second": 1.749, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.0595, | |
| "grad_norm": 0.028294127858427973, | |
| "learning_rate": 5.9480000000000005e-06, | |
| "loss": 2.4196, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.0595, | |
| "eval_loss": 2.436668872833252, | |
| "eval_runtime": 35.1904, | |
| "eval_samples_per_second": 3.325, | |
| "eval_steps_per_second": 1.677, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.0596, | |
| "eval_loss": 2.436310052871704, | |
| "eval_runtime": 33.583, | |
| "eval_samples_per_second": 3.484, | |
| "eval_steps_per_second": 1.757, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.0597, | |
| "eval_loss": 2.4361066818237305, | |
| "eval_runtime": 34.1148, | |
| "eval_samples_per_second": 3.43, | |
| "eval_steps_per_second": 1.729, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.0598, | |
| "eval_loss": 2.436128854751587, | |
| "eval_runtime": 33.7895, | |
| "eval_samples_per_second": 3.463, | |
| "eval_steps_per_second": 1.746, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.0599, | |
| "eval_loss": 2.436457872390747, | |
| "eval_runtime": 34.0525, | |
| "eval_samples_per_second": 3.436, | |
| "eval_steps_per_second": 1.733, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.02242795270420928, | |
| "learning_rate": 5.998000000000001e-06, | |
| "loss": 2.4245, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": 2.436203718185425, | |
| "eval_runtime": 33.6471, | |
| "eval_samples_per_second": 3.477, | |
| "eval_steps_per_second": 1.753, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 50000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.355905264309764e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |