{ "best_global_step": 2985, "best_metric": 2.4361066818237305, "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-2000", "epoch": 0.06, "eval_steps": 5, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001, "eval_loss": 3.320133686065674, "eval_runtime": 33.1817, "eval_samples_per_second": 3.526, "eval_steps_per_second": 1.778, "step": 5 }, { "epoch": 0.0002, "eval_loss": 3.319335460662842, "eval_runtime": 33.1229, "eval_samples_per_second": 3.532, "eval_steps_per_second": 1.781, "step": 10 }, { "epoch": 0.0003, "eval_loss": 3.318042516708374, "eval_runtime": 33.3382, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 15 }, { "epoch": 0.0004, "eval_loss": 3.31443190574646, "eval_runtime": 33.2423, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.775, "step": 20 }, { "epoch": 0.0005, "grad_norm": 0.8831791054097137, "learning_rate": 4.8e-08, "loss": 3.4942, "step": 25 }, { "epoch": 0.0005, "eval_loss": 3.3073768615722656, "eval_runtime": 33.3914, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 25 }, { "epoch": 0.0006, "eval_loss": 3.299119472503662, "eval_runtime": 33.4042, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 30 }, { "epoch": 0.0007, "eval_loss": 3.2837445735931396, "eval_runtime": 33.3171, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 35 }, { "epoch": 0.0008, "eval_loss": 3.26920747756958, "eval_runtime": 33.2887, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.772, "step": 40 }, { "epoch": 0.0009, "eval_loss": 3.2481868267059326, "eval_runtime": 33.3291, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 45 }, { "epoch": 0.001, "grad_norm": 0.5545255682809549, "learning_rate": 9.8e-08, "loss": 3.4174, "step": 50 }, { "epoch": 0.001, "eval_loss": 3.2263057231903076, "eval_runtime": 33.3242, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 50 }, { "epoch": 0.0011, "eval_loss": 3.2074711322784424, "eval_runtime": 33.3412, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 55 }, { "epoch": 0.0012, "eval_loss": 3.1877729892730713, "eval_runtime": 33.5109, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 60 }, { "epoch": 0.0013, "eval_loss": 3.153503894805908, "eval_runtime": 33.4747, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.763, "step": 65 }, { "epoch": 0.0014, "eval_loss": 3.1214191913604736, "eval_runtime": 33.5956, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 70 }, { "epoch": 0.0015, "grad_norm": 0.5083106511895727, "learning_rate": 1.4800000000000003e-07, "loss": 3.2951, "step": 75 }, { "epoch": 0.0015, "eval_loss": 3.101821184158325, "eval_runtime": 33.6, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 75 }, { "epoch": 0.0016, "eval_loss": 3.0797102451324463, "eval_runtime": 33.5302, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.76, "step": 80 }, { "epoch": 0.0017, "eval_loss": 3.0523691177368164, "eval_runtime": 33.5031, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 85 }, { "epoch": 0.0018, "eval_loss": 3.022620677947998, "eval_runtime": 33.6265, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.755, "step": 90 }, { "epoch": 0.0019, "eval_loss": 2.991481065750122, "eval_runtime": 33.5519, "eval_samples_per_second": 3.487, "eval_steps_per_second": 1.758, "step": 95 }, { "epoch": 0.002, "grad_norm": 0.28367624064943, "learning_rate": 1.9800000000000003e-07, "loss": 3.1531, "step": 100 }, { "epoch": 0.002, "eval_loss": 2.9630048274993896, "eval_runtime": 33.734, "eval_samples_per_second": 3.468, "eval_steps_per_second": 1.749, "step": 100 }, { "epoch": 0.0021, "eval_loss": 2.93916916847229, "eval_runtime": 33.4897, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 105 }, { "epoch": 0.0022, "eval_loss": 2.9186832904815674, "eval_runtime": 33.5154, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 110 }, { "epoch": 0.0023, "eval_loss": 2.8985302448272705, "eval_runtime": 33.5846, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 115 }, { "epoch": 0.0024, "eval_loss": 2.8786001205444336, "eval_runtime": 33.5482, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 120 }, { "epoch": 0.0025, "grad_norm": 0.19615444236413476, "learning_rate": 2.48e-07, "loss": 3.0101, "step": 125 }, { "epoch": 0.0025, "eval_loss": 2.860034704208374, "eval_runtime": 33.5143, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 125 }, { "epoch": 0.0026, "eval_loss": 2.843663454055786, "eval_runtime": 33.5082, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 130 }, { "epoch": 0.0027, "eval_loss": 2.82882022857666, "eval_runtime": 33.4921, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.762, "step": 135 }, { "epoch": 0.0028, "eval_loss": 2.8154728412628174, "eval_runtime": 33.6656, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.753, "step": 140 }, { "epoch": 0.0029, "eval_loss": 2.801098346710205, "eval_runtime": 33.5229, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 145 }, { "epoch": 0.003, "grad_norm": 0.5710572013823593, "learning_rate": 2.9800000000000005e-07, "loss": 2.876, "step": 150 }, { "epoch": 0.003, "eval_loss": 2.789198160171509, "eval_runtime": 33.4535, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.764, "step": 150 }, { "epoch": 0.0031, "eval_loss": 2.7789695262908936, "eval_runtime": 33.6409, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 155 }, { "epoch": 0.0032, "eval_loss": 2.7694201469421387, "eval_runtime": 33.4266, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 160 }, { "epoch": 0.0033, "eval_loss": 2.7600762844085693, "eval_runtime": 33.4725, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.763, "step": 165 }, { "epoch": 0.0034, "eval_loss": 2.7517828941345215, "eval_runtime": 33.6223, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 170 }, { "epoch": 0.0035, "grad_norm": 0.151307501186972, "learning_rate": 3.48e-07, "loss": 2.811, "step": 175 }, { "epoch": 0.0035, "eval_loss": 2.743870258331299, "eval_runtime": 33.5221, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 175 }, { "epoch": 0.0036, "eval_loss": 2.7366557121276855, "eval_runtime": 33.5448, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 180 }, { "epoch": 0.0037, "eval_loss": 2.7298200130462646, "eval_runtime": 33.5428, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 185 }, { "epoch": 0.0038, "eval_loss": 2.722888708114624, "eval_runtime": 33.6302, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.754, "step": 190 }, { "epoch": 0.0039, "eval_loss": 2.714289426803589, "eval_runtime": 33.5594, "eval_samples_per_second": 3.486, "eval_steps_per_second": 1.758, "step": 195 }, { "epoch": 0.004, "grad_norm": 0.10362348542700331, "learning_rate": 3.9800000000000004e-07, "loss": 2.7606, "step": 200 }, { "epoch": 0.004, "eval_loss": 2.7078425884246826, "eval_runtime": 33.6447, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 200 }, { "epoch": 0.0041, "eval_loss": 2.7014663219451904, "eval_runtime": 33.565, "eval_samples_per_second": 3.486, "eval_steps_per_second": 1.758, "step": 205 }, { "epoch": 0.0042, "eval_loss": 2.6956119537353516, "eval_runtime": 33.5938, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 210 }, { "epoch": 0.0043, "eval_loss": 2.6901819705963135, "eval_runtime": 33.5009, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 215 }, { "epoch": 0.0044, "eval_loss": 2.684842824935913, "eval_runtime": 33.5857, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 220 }, { "epoch": 0.0045, "grad_norm": 0.08395542059093342, "learning_rate": 4.4800000000000004e-07, "loss": 2.727, "step": 225 }, { "epoch": 0.0045, "eval_loss": 2.679893732070923, "eval_runtime": 33.5333, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.759, "step": 225 }, { "epoch": 0.0046, "eval_loss": 2.6749234199523926, "eval_runtime": 33.6847, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.752, "step": 230 }, { "epoch": 0.0047, "eval_loss": 2.670543670654297, "eval_runtime": 33.5814, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 235 }, { "epoch": 0.0048, "eval_loss": 2.6663973331451416, "eval_runtime": 33.5943, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 240 }, { "epoch": 0.0049, "eval_loss": 2.662304162979126, "eval_runtime": 33.5309, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.76, "step": 245 }, { "epoch": 0.005, "grad_norm": 0.06968304771462097, "learning_rate": 4.98e-07, "loss": 2.6931, "step": 250 }, { "epoch": 0.005, "eval_loss": 2.65859317779541, "eval_runtime": 33.4663, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 250 }, { "epoch": 0.0051, "eval_loss": 2.654831886291504, "eval_runtime": 33.5962, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 255 }, { "epoch": 0.0052, "eval_loss": 2.6509766578674316, "eval_runtime": 33.5064, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 260 }, { "epoch": 0.0053, "eval_loss": 2.6467387676239014, "eval_runtime": 33.5346, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.759, "step": 265 }, { "epoch": 0.0054, "eval_loss": 2.6428205966949463, "eval_runtime": 33.5418, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 270 }, { "epoch": 0.0055, "grad_norm": 0.05704195405230526, "learning_rate": 5.480000000000001e-07, "loss": 2.674, "step": 275 }, { "epoch": 0.0055, "eval_loss": 2.6392645835876465, "eval_runtime": 33.6509, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.753, "step": 275 }, { "epoch": 0.0056, "eval_loss": 2.6361024379730225, "eval_runtime": 33.6973, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 280 }, { "epoch": 0.0057, "eval_loss": 2.6328718662261963, "eval_runtime": 33.5639, "eval_samples_per_second": 3.486, "eval_steps_per_second": 1.758, "step": 285 }, { "epoch": 0.0058, "eval_loss": 2.629871129989624, "eval_runtime": 33.5243, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 290 }, { "epoch": 0.0059, "eval_loss": 2.6271257400512695, "eval_runtime": 33.6427, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 295 }, { "epoch": 0.006, "grad_norm": 0.05013991368613539, "learning_rate": 5.98e-07, "loss": 2.6504, "step": 300 }, { "epoch": 0.006, "eval_loss": 2.6243784427642822, "eval_runtime": 33.5815, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 300 }, { "epoch": 0.0061, "eval_loss": 2.621882915496826, "eval_runtime": 33.7331, "eval_samples_per_second": 3.468, "eval_steps_per_second": 1.749, "step": 305 }, { "epoch": 0.0062, "eval_loss": 2.6194233894348145, "eval_runtime": 33.594, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 310 }, { "epoch": 0.0063, "eval_loss": 2.6167914867401123, "eval_runtime": 33.5521, "eval_samples_per_second": 3.487, "eval_steps_per_second": 1.758, "step": 315 }, { "epoch": 0.0064, "eval_loss": 2.6143040657043457, "eval_runtime": 33.58, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 320 }, { "epoch": 0.0065, "grad_norm": 0.04696879401248375, "learning_rate": 6.48e-07, "loss": 2.6372, "step": 325 }, { "epoch": 0.0065, "eval_loss": 2.611804246902466, "eval_runtime": 33.5371, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.759, "step": 325 }, { "epoch": 0.0066, "eval_loss": 2.6093685626983643, "eval_runtime": 33.8057, "eval_samples_per_second": 3.461, "eval_steps_per_second": 1.745, "step": 330 }, { "epoch": 0.0067, "eval_loss": 2.607069492340088, "eval_runtime": 33.5819, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 335 }, { "epoch": 0.0068, "eval_loss": 2.604562520980835, "eval_runtime": 33.5971, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 340 }, { "epoch": 0.0069, "eval_loss": 2.6024069786071777, "eval_runtime": 33.5107, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 345 }, { "epoch": 0.007, "grad_norm": 0.04335213523196003, "learning_rate": 6.98e-07, "loss": 2.6173, "step": 350 }, { "epoch": 0.007, "eval_loss": 2.6002795696258545, "eval_runtime": 33.6194, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 350 }, { "epoch": 0.0071, "eval_loss": 2.598109245300293, "eval_runtime": 33.807, "eval_samples_per_second": 3.461, "eval_steps_per_second": 1.745, "step": 355 }, { "epoch": 0.0072, "eval_loss": 2.596126079559326, "eval_runtime": 33.5287, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 360 }, { "epoch": 0.0073, "eval_loss": 2.5941832065582275, "eval_runtime": 33.5456, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 365 }, { "epoch": 0.0074, "eval_loss": 2.592336893081665, "eval_runtime": 33.6972, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 370 }, { "epoch": 0.0075, "grad_norm": 0.04553004087145917, "learning_rate": 7.480000000000001e-07, "loss": 2.608, "step": 375 }, { "epoch": 0.0075, "eval_loss": 2.590573310852051, "eval_runtime": 33.6132, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 375 }, { "epoch": 0.0076, "eval_loss": 2.5888302326202393, "eval_runtime": 33.6363, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 380 }, { "epoch": 0.0077, "eval_loss": 2.5870487689971924, "eval_runtime": 33.6309, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.754, "step": 385 }, { "epoch": 0.0078, "eval_loss": 2.5851986408233643, "eval_runtime": 33.5237, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 390 }, { "epoch": 0.0079, "eval_loss": 2.583341598510742, "eval_runtime": 33.4914, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.762, "step": 395 }, { "epoch": 0.008, "grad_norm": 0.04067489002091025, "learning_rate": 7.98e-07, "loss": 2.6034, "step": 400 }, { "epoch": 0.008, "eval_loss": 2.5816242694854736, "eval_runtime": 33.6305, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.754, "step": 400 }, { "epoch": 0.0081, "eval_loss": 2.5800209045410156, "eval_runtime": 33.9049, "eval_samples_per_second": 3.451, "eval_steps_per_second": 1.74, "step": 405 }, { "epoch": 0.0082, "eval_loss": 2.5783472061157227, "eval_runtime": 33.6847, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.752, "step": 410 }, { "epoch": 0.0083, "eval_loss": 2.5765581130981445, "eval_runtime": 33.5467, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 415 }, { "epoch": 0.0084, "eval_loss": 2.574805974960327, "eval_runtime": 33.6837, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.752, "step": 420 }, { "epoch": 0.0085, "grad_norm": 0.03957021300313725, "learning_rate": 8.480000000000001e-07, "loss": 2.5881, "step": 425 }, { "epoch": 0.0085, "eval_loss": 2.5732243061065674, "eval_runtime": 33.6883, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.751, "step": 425 }, { "epoch": 0.0086, "eval_loss": 2.5712339878082275, "eval_runtime": 34.0087, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.735, "step": 430 }, { "epoch": 0.0087, "eval_loss": 2.5696043968200684, "eval_runtime": 33.5522, "eval_samples_per_second": 3.487, "eval_steps_per_second": 1.758, "step": 435 }, { "epoch": 0.0088, "eval_loss": 2.568011522293091, "eval_runtime": 33.7026, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 440 }, { "epoch": 0.0089, "eval_loss": 2.5661723613739014, "eval_runtime": 33.7143, "eval_samples_per_second": 3.47, "eval_steps_per_second": 1.75, "step": 445 }, { "epoch": 0.009, "grad_norm": 0.04518058243135632, "learning_rate": 8.980000000000001e-07, "loss": 2.577, "step": 450 }, { "epoch": 0.009, "eval_loss": 2.5647170543670654, "eval_runtime": 33.6066, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.756, "step": 450 }, { "epoch": 0.0091, "eval_loss": 2.5629138946533203, "eval_runtime": 33.695, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 455 }, { "epoch": 0.0092, "eval_loss": 2.561223268508911, "eval_runtime": 33.7639, "eval_samples_per_second": 3.465, "eval_steps_per_second": 1.747, "step": 460 }, { "epoch": 0.0093, "eval_loss": 2.559941053390503, "eval_runtime": 33.5726, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 465 }, { "epoch": 0.0094, "eval_loss": 2.5585126876831055, "eval_runtime": 33.5393, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 470 }, { "epoch": 0.0095, "grad_norm": 0.04841685552742973, "learning_rate": 9.480000000000001e-07, "loss": 2.5614, "step": 475 }, { "epoch": 0.0095, "eval_loss": 2.557070732116699, "eval_runtime": 33.5396, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 475 }, { "epoch": 0.0096, "eval_loss": 2.5551016330718994, "eval_runtime": 33.8951, "eval_samples_per_second": 3.452, "eval_steps_per_second": 1.741, "step": 480 }, { "epoch": 0.0097, "eval_loss": 2.553600311279297, "eval_runtime": 33.6678, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.752, "step": 485 }, { "epoch": 0.0098, "eval_loss": 2.5523183345794678, "eval_runtime": 33.6551, "eval_samples_per_second": 3.476, "eval_steps_per_second": 1.753, "step": 490 }, { "epoch": 0.0099, "eval_loss": 2.5510056018829346, "eval_runtime": 33.6214, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 495 }, { "epoch": 0.01, "grad_norm": 0.043993000876628545, "learning_rate": 9.98e-07, "loss": 2.5613, "step": 500 }, { "epoch": 0.01, "eval_loss": 2.5498273372650146, "eval_runtime": 33.6069, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.756, "step": 500 }, { "epoch": 0.0101, "eval_loss": 2.548828601837158, "eval_runtime": 33.7909, "eval_samples_per_second": 3.462, "eval_steps_per_second": 1.746, "step": 505 }, { "epoch": 0.0102, "eval_loss": 2.5474376678466797, "eval_runtime": 33.543, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 510 }, { "epoch": 0.0103, "eval_loss": 2.5464441776275635, "eval_runtime": 33.6579, "eval_samples_per_second": 3.476, "eval_steps_per_second": 1.753, "step": 515 }, { "epoch": 0.0104, "eval_loss": 2.5453498363494873, "eval_runtime": 33.4841, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 520 }, { "epoch": 0.0105, "grad_norm": 0.04663602312001795, "learning_rate": 1.0480000000000002e-06, "loss": 2.5521, "step": 525 }, { "epoch": 0.0105, "eval_loss": 2.5442492961883545, "eval_runtime": 33.5915, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 525 }, { "epoch": 0.0106, "eval_loss": 2.5432002544403076, "eval_runtime": 33.6717, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.752, "step": 530 }, { "epoch": 0.0107, "eval_loss": 2.542072057723999, "eval_runtime": 33.6153, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 535 }, { "epoch": 0.0108, "eval_loss": 2.541541814804077, "eval_runtime": 34.4505, "eval_samples_per_second": 3.396, "eval_steps_per_second": 1.713, "step": 540 }, { "epoch": 0.0109, "eval_loss": 2.540494203567505, "eval_runtime": 33.6369, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 545 }, { "epoch": 0.011, "grad_norm": 0.044473565671350655, "learning_rate": 1.0980000000000001e-06, "loss": 2.5433, "step": 550 }, { "epoch": 0.011, "eval_loss": 2.539369821548462, "eval_runtime": 33.5742, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 550 }, { "epoch": 0.0111, "eval_loss": 2.5384223461151123, "eval_runtime": 33.9094, "eval_samples_per_second": 3.45, "eval_steps_per_second": 1.74, "step": 555 }, { "epoch": 0.0112, "eval_loss": 2.5375945568084717, "eval_runtime": 33.6016, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 560 }, { "epoch": 0.0113, "eval_loss": 2.536487340927124, "eval_runtime": 34.3561, "eval_samples_per_second": 3.406, "eval_steps_per_second": 1.717, "step": 565 }, { "epoch": 0.0114, "eval_loss": 2.5356836318969727, "eval_runtime": 34.5074, "eval_samples_per_second": 3.391, "eval_steps_per_second": 1.71, "step": 570 }, { "epoch": 0.0115, "grad_norm": 0.04668528198599521, "learning_rate": 1.148e-06, "loss": 2.5496, "step": 575 }, { "epoch": 0.0115, "eval_loss": 2.5347819328308105, "eval_runtime": 33.5932, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 575 }, { "epoch": 0.0116, "eval_loss": 2.534010410308838, "eval_runtime": 33.8124, "eval_samples_per_second": 3.46, "eval_steps_per_second": 1.745, "step": 580 }, { "epoch": 0.0117, "eval_loss": 2.5331332683563232, "eval_runtime": 33.5617, "eval_samples_per_second": 3.486, "eval_steps_per_second": 1.758, "step": 585 }, { "epoch": 0.0118, "eval_loss": 2.5322561264038086, "eval_runtime": 33.8081, "eval_samples_per_second": 3.461, "eval_steps_per_second": 1.745, "step": 590 }, { "epoch": 0.0119, "eval_loss": 2.5314669609069824, "eval_runtime": 33.7053, "eval_samples_per_second": 3.471, "eval_steps_per_second": 1.75, "step": 595 }, { "epoch": 0.012, "grad_norm": 0.043769011241975755, "learning_rate": 1.1980000000000002e-06, "loss": 2.5455, "step": 600 }, { "epoch": 0.012, "eval_loss": 2.5307207107543945, "eval_runtime": 33.6848, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.752, "step": 600 }, { "epoch": 0.0121, "eval_loss": 2.530006170272827, "eval_runtime": 33.686, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.751, "step": 605 }, { "epoch": 0.0122, "eval_loss": 2.529109239578247, "eval_runtime": 33.7013, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 610 }, { "epoch": 0.0123, "eval_loss": 2.5284457206726074, "eval_runtime": 33.6733, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.752, "step": 615 }, { "epoch": 0.0124, "eval_loss": 2.5276710987091064, "eval_runtime": 33.624, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 620 }, { "epoch": 0.0125, "grad_norm": 0.04196307636615052, "learning_rate": 1.248e-06, "loss": 2.5273, "step": 625 }, { "epoch": 0.0125, "eval_loss": 2.526918411254883, "eval_runtime": 33.5952, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 625 }, { "epoch": 0.0126, "eval_loss": 2.5262696743011475, "eval_runtime": 33.7522, "eval_samples_per_second": 3.466, "eval_steps_per_second": 1.748, "step": 630 }, { "epoch": 0.0127, "eval_loss": 2.5255067348480225, "eval_runtime": 33.7929, "eval_samples_per_second": 3.462, "eval_steps_per_second": 1.746, "step": 635 }, { "epoch": 0.0128, "eval_loss": 2.524789810180664, "eval_runtime": 33.7139, "eval_samples_per_second": 3.47, "eval_steps_per_second": 1.75, "step": 640 }, { "epoch": 0.0129, "eval_loss": 2.524181604385376, "eval_runtime": 33.7772, "eval_samples_per_second": 3.464, "eval_steps_per_second": 1.747, "step": 645 }, { "epoch": 0.013, "grad_norm": 0.04719575571491393, "learning_rate": 1.2980000000000001e-06, "loss": 2.5226, "step": 650 }, { "epoch": 0.013, "eval_loss": 2.5235090255737305, "eval_runtime": 33.6972, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 650 }, { "epoch": 0.0131, "eval_loss": 2.5227887630462646, "eval_runtime": 33.8073, "eval_samples_per_second": 3.461, "eval_steps_per_second": 1.745, "step": 655 }, { "epoch": 0.0132, "eval_loss": 2.522101402282715, "eval_runtime": 33.7192, "eval_samples_per_second": 3.47, "eval_steps_per_second": 1.75, "step": 660 }, { "epoch": 0.0133, "eval_loss": 2.5215632915496826, "eval_runtime": 33.7966, "eval_samples_per_second": 3.462, "eval_steps_per_second": 1.746, "step": 665 }, { "epoch": 0.0134, "eval_loss": 2.5208749771118164, "eval_runtime": 33.7485, "eval_samples_per_second": 3.467, "eval_steps_per_second": 1.748, "step": 670 }, { "epoch": 0.0135, "grad_norm": 0.044734235617461586, "learning_rate": 1.348e-06, "loss": 2.5273, "step": 675 }, { "epoch": 0.0135, "eval_loss": 2.5201478004455566, "eval_runtime": 33.8972, "eval_samples_per_second": 3.452, "eval_steps_per_second": 1.741, "step": 675 }, { "epoch": 0.0136, "eval_loss": 2.5197227001190186, "eval_runtime": 33.6652, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.753, "step": 680 }, { "epoch": 0.0137, "eval_loss": 2.519151449203491, "eval_runtime": 33.6031, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 685 }, { "epoch": 0.0138, "eval_loss": 2.5185396671295166, "eval_runtime": 33.6292, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.754, "step": 690 }, { "epoch": 0.0139, "eval_loss": 2.517947196960449, "eval_runtime": 33.5987, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 695 }, { "epoch": 0.014, "grad_norm": 0.04124740305893712, "learning_rate": 1.3980000000000002e-06, "loss": 2.5214, "step": 700 }, { "epoch": 0.014, "eval_loss": 2.5173356533050537, "eval_runtime": 33.6657, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.753, "step": 700 }, { "epoch": 0.0141, "eval_loss": 2.5167977809906006, "eval_runtime": 33.5728, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 705 }, { "epoch": 0.0142, "eval_loss": 2.5162267684936523, "eval_runtime": 33.2779, "eval_samples_per_second": 3.516, "eval_steps_per_second": 1.773, "step": 710 }, { "epoch": 0.0143, "eval_loss": 2.5155909061431885, "eval_runtime": 33.4627, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 715 }, { "epoch": 0.0144, "eval_loss": 2.515427589416504, "eval_runtime": 33.439, "eval_samples_per_second": 3.499, "eval_steps_per_second": 1.764, "step": 720 }, { "epoch": 0.0145, "grad_norm": 0.04140679897915697, "learning_rate": 1.4480000000000002e-06, "loss": 2.5192, "step": 725 }, { "epoch": 0.0145, "eval_loss": 2.514657735824585, "eval_runtime": 33.3527, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 725 }, { "epoch": 0.0146, "eval_loss": 2.5141184329986572, "eval_runtime": 33.3623, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.768, "step": 730 }, { "epoch": 0.0147, "eval_loss": 2.5135021209716797, "eval_runtime": 36.2875, "eval_samples_per_second": 3.224, "eval_steps_per_second": 1.626, "step": 735 }, { "epoch": 0.0148, "eval_loss": 2.5130276679992676, "eval_runtime": 33.3738, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 740 }, { "epoch": 0.0149, "eval_loss": 2.5123140811920166, "eval_runtime": 33.7458, "eval_samples_per_second": 3.467, "eval_steps_per_second": 1.748, "step": 745 }, { "epoch": 0.015, "grad_norm": 0.03921746155872101, "learning_rate": 1.498e-06, "loss": 2.5077, "step": 750 }, { "epoch": 0.015, "eval_loss": 2.5117204189300537, "eval_runtime": 33.3164, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 750 }, { "epoch": 0.0151, "eval_loss": 2.5113115310668945, "eval_runtime": 33.464, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 755 }, { "epoch": 0.0152, "eval_loss": 2.510754108428955, "eval_runtime": 33.426, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 760 }, { "epoch": 0.0153, "eval_loss": 2.510148525238037, "eval_runtime": 33.5135, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 765 }, { "epoch": 0.0154, "eval_loss": 2.5096797943115234, "eval_runtime": 33.5467, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 770 }, { "epoch": 0.0155, "grad_norm": 0.038161493704092234, "learning_rate": 1.548e-06, "loss": 2.5127, "step": 775 }, { "epoch": 0.0155, "eval_loss": 2.5091397762298584, "eval_runtime": 33.6296, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.754, "step": 775 }, { "epoch": 0.0156, "eval_loss": 2.5085766315460205, "eval_runtime": 33.6417, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 780 }, { "epoch": 0.0157, "eval_loss": 2.5081799030303955, "eval_runtime": 33.5831, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 785 }, { "epoch": 0.0158, "eval_loss": 2.5075252056121826, "eval_runtime": 33.5806, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 790 }, { "epoch": 0.0159, "eval_loss": 2.5069563388824463, "eval_runtime": 33.6257, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.755, "step": 795 }, { "epoch": 0.016, "grad_norm": 0.04372605860022339, "learning_rate": 1.5980000000000002e-06, "loss": 2.5019, "step": 800 }, { "epoch": 0.016, "eval_loss": 2.5065925121307373, "eval_runtime": 33.6041, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 800 }, { "epoch": 0.0161, "eval_loss": 2.5059759616851807, "eval_runtime": 33.6116, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 805 }, { "epoch": 0.0162, "eval_loss": 2.505453109741211, "eval_runtime": 33.5794, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 810 }, { "epoch": 0.0163, "eval_loss": 2.505023241043091, "eval_runtime": 33.461, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.763, "step": 815 }, { "epoch": 0.0164, "eval_loss": 2.5042824745178223, "eval_runtime": 33.5988, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 820 }, { "epoch": 0.0165, "grad_norm": 0.041497520045134684, "learning_rate": 1.6480000000000001e-06, "loss": 2.4977, "step": 825 }, { "epoch": 0.0165, "eval_loss": 2.5039255619049072, "eval_runtime": 33.6107, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 825 }, { "epoch": 0.0166, "eval_loss": 2.503436803817749, "eval_runtime": 33.6213, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 830 }, { "epoch": 0.0167, "eval_loss": 2.5028321743011475, "eval_runtime": 33.5009, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 835 }, { "epoch": 0.0168, "eval_loss": 2.5022666454315186, "eval_runtime": 33.6392, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 840 }, { "epoch": 0.0169, "eval_loss": 2.5018374919891357, "eval_runtime": 33.5928, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 845 }, { "epoch": 0.017, "grad_norm": 0.040226840781059835, "learning_rate": 1.6980000000000003e-06, "loss": 2.4968, "step": 850 }, { "epoch": 0.017, "eval_loss": 2.5012588500976562, "eval_runtime": 33.5216, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 850 }, { "epoch": 0.0171, "eval_loss": 2.5006515979766846, "eval_runtime": 33.4799, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.762, "step": 855 }, { "epoch": 0.0172, "eval_loss": 2.5001821517944336, "eval_runtime": 33.6067, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.756, "step": 860 }, { "epoch": 0.0173, "eval_loss": 2.499708652496338, "eval_runtime": 33.5478, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 865 }, { "epoch": 0.0174, "eval_loss": 2.4992101192474365, "eval_runtime": 33.3608, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 870 }, { "epoch": 0.0175, "grad_norm": 0.043360400185163274, "learning_rate": 1.7480000000000002e-06, "loss": 2.4947, "step": 875 }, { "epoch": 0.0175, "eval_loss": 2.49912428855896, "eval_runtime": 33.3782, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.768, "step": 875 }, { "epoch": 0.0176, "eval_loss": 2.498539686203003, "eval_runtime": 33.4271, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 880 }, { "epoch": 0.0177, "eval_loss": 2.4980475902557373, "eval_runtime": 33.508, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 885 }, { "epoch": 0.0178, "eval_loss": 2.4972891807556152, "eval_runtime": 33.5801, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 890 }, { "epoch": 0.0179, "eval_loss": 2.496943473815918, "eval_runtime": 33.4984, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.761, "step": 895 }, { "epoch": 0.018, "grad_norm": 0.040565773819723885, "learning_rate": 1.798e-06, "loss": 2.4878, "step": 900 }, { "epoch": 0.018, "eval_loss": 2.496464252471924, "eval_runtime": 33.6538, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.753, "step": 900 }, { "epoch": 0.0181, "eval_loss": 2.496126890182495, "eval_runtime": 33.6415, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 905 }, { "epoch": 0.0182, "eval_loss": 2.4957361221313477, "eval_runtime": 33.7646, "eval_samples_per_second": 3.465, "eval_steps_per_second": 1.747, "step": 910 }, { "epoch": 0.0183, "eval_loss": 2.4954254627227783, "eval_runtime": 33.5639, "eval_samples_per_second": 3.486, "eval_steps_per_second": 1.758, "step": 915 }, { "epoch": 0.0184, "eval_loss": 2.4948976039886475, "eval_runtime": 33.6038, "eval_samples_per_second": 3.482, "eval_steps_per_second": 1.756, "step": 920 }, { "epoch": 0.0185, "grad_norm": 0.039370814834696136, "learning_rate": 1.8480000000000001e-06, "loss": 2.4986, "step": 925 }, { "epoch": 0.0185, "eval_loss": 2.494521379470825, "eval_runtime": 33.7082, "eval_samples_per_second": 3.471, "eval_steps_per_second": 1.75, "step": 925 }, { "epoch": 0.0186, "eval_loss": 2.4939730167388916, "eval_runtime": 33.6147, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 930 }, { "epoch": 0.0187, "eval_loss": 2.49343204498291, "eval_runtime": 35.1502, "eval_samples_per_second": 3.329, "eval_steps_per_second": 1.679, "step": 935 }, { "epoch": 0.0188, "eval_loss": 2.493082046508789, "eval_runtime": 33.6381, "eval_samples_per_second": 3.478, "eval_steps_per_second": 1.754, "step": 940 }, { "epoch": 0.0189, "eval_loss": 2.492797374725342, "eval_runtime": 33.7089, "eval_samples_per_second": 3.471, "eval_steps_per_second": 1.75, "step": 945 }, { "epoch": 0.019, "grad_norm": 0.04019472793080496, "learning_rate": 1.898e-06, "loss": 2.481, "step": 950 }, { "epoch": 0.019, "eval_loss": 2.4925599098205566, "eval_runtime": 33.5096, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 950 }, { "epoch": 0.0191, "eval_loss": 2.4918878078460693, "eval_runtime": 33.4921, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.762, "step": 955 }, { "epoch": 0.0192, "eval_loss": 2.4916608333587646, "eval_runtime": 33.5126, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 960 }, { "epoch": 0.0193, "eval_loss": 2.491708517074585, "eval_runtime": 33.6466, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.754, "step": 965 }, { "epoch": 0.0194, "eval_loss": 2.4911839962005615, "eval_runtime": 33.6119, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 970 }, { "epoch": 0.0195, "grad_norm": 0.04683912756161822, "learning_rate": 1.9480000000000002e-06, "loss": 2.4879, "step": 975 }, { "epoch": 0.0195, "eval_loss": 2.490492343902588, "eval_runtime": 33.4389, "eval_samples_per_second": 3.499, "eval_steps_per_second": 1.764, "step": 975 }, { "epoch": 0.0196, "eval_loss": 2.490133285522461, "eval_runtime": 33.361, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 980 }, { "epoch": 0.0197, "eval_loss": 2.4896316528320312, "eval_runtime": 33.5863, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 985 }, { "epoch": 0.0198, "eval_loss": 2.489122152328491, "eval_runtime": 33.6173, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 990 }, { "epoch": 0.0199, "eval_loss": 2.488906145095825, "eval_runtime": 33.6531, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.753, "step": 995 }, { "epoch": 0.02, "grad_norm": 0.047671496052023164, "learning_rate": 1.998e-06, "loss": 2.4879, "step": 1000 }, { "epoch": 0.02, "eval_loss": 2.488457202911377, "eval_runtime": 33.7763, "eval_samples_per_second": 3.464, "eval_steps_per_second": 1.747, "step": 1000 }, { "epoch": 0.0201, "eval_loss": 2.4881434440612793, "eval_runtime": 33.6922, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.751, "step": 1005 }, { "epoch": 0.0202, "eval_loss": 2.4879722595214844, "eval_runtime": 33.6857, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.751, "step": 1010 }, { "epoch": 0.0203, "eval_loss": 2.4876134395599365, "eval_runtime": 33.7945, "eval_samples_per_second": 3.462, "eval_steps_per_second": 1.746, "step": 1015 }, { "epoch": 0.0204, "eval_loss": 2.4872164726257324, "eval_runtime": 33.7811, "eval_samples_per_second": 3.463, "eval_steps_per_second": 1.747, "step": 1020 }, { "epoch": 0.0205, "grad_norm": 0.04204734602618554, "learning_rate": 2.048e-06, "loss": 2.4708, "step": 1025 }, { "epoch": 0.0205, "eval_loss": 2.48695707321167, "eval_runtime": 33.821, "eval_samples_per_second": 3.459, "eval_steps_per_second": 1.744, "step": 1025 }, { "epoch": 0.0206, "eval_loss": 2.486564874649048, "eval_runtime": 33.82, "eval_samples_per_second": 3.459, "eval_steps_per_second": 1.745, "step": 1030 }, { "epoch": 0.0207, "eval_loss": 2.486281633377075, "eval_runtime": 33.927, "eval_samples_per_second": 3.449, "eval_steps_per_second": 1.739, "step": 1035 }, { "epoch": 0.0208, "eval_loss": 2.4860103130340576, "eval_runtime": 33.9697, "eval_samples_per_second": 3.444, "eval_steps_per_second": 1.737, "step": 1040 }, { "epoch": 0.0209, "eval_loss": 2.4855759143829346, "eval_runtime": 33.9097, "eval_samples_per_second": 3.45, "eval_steps_per_second": 1.74, "step": 1045 }, { "epoch": 0.021, "grad_norm": 0.03813289834436041, "learning_rate": 2.098e-06, "loss": 2.4799, "step": 1050 }, { "epoch": 0.021, "eval_loss": 2.485349416732788, "eval_runtime": 34.0131, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.735, "step": 1050 }, { "epoch": 0.0211, "eval_loss": 2.48506498336792, "eval_runtime": 34.036, "eval_samples_per_second": 3.438, "eval_steps_per_second": 1.733, "step": 1055 }, { "epoch": 0.0212, "eval_loss": 2.484771966934204, "eval_runtime": 34.0842, "eval_samples_per_second": 3.433, "eval_steps_per_second": 1.731, "step": 1060 }, { "epoch": 0.0213, "eval_loss": 2.4846508502960205, "eval_runtime": 34.0289, "eval_samples_per_second": 3.438, "eval_steps_per_second": 1.734, "step": 1065 }, { "epoch": 0.0214, "eval_loss": 2.484158992767334, "eval_runtime": 34.0038, "eval_samples_per_second": 3.441, "eval_steps_per_second": 1.735, "step": 1070 }, { "epoch": 0.0215, "grad_norm": 0.04289680570208033, "learning_rate": 2.148e-06, "loss": 2.4822, "step": 1075 }, { "epoch": 0.0215, "eval_loss": 2.483947992324829, "eval_runtime": 33.9604, "eval_samples_per_second": 3.445, "eval_steps_per_second": 1.737, "step": 1075 }, { "epoch": 0.0216, "eval_loss": 2.4836008548736572, "eval_runtime": 33.9465, "eval_samples_per_second": 3.447, "eval_steps_per_second": 1.738, "step": 1080 }, { "epoch": 0.0217, "eval_loss": 2.483187675476074, "eval_runtime": 34.1344, "eval_samples_per_second": 3.428, "eval_steps_per_second": 1.728, "step": 1085 }, { "epoch": 0.0218, "eval_loss": 2.4829964637756348, "eval_runtime": 34.0915, "eval_samples_per_second": 3.432, "eval_steps_per_second": 1.731, "step": 1090 }, { "epoch": 0.0219, "eval_loss": 2.482805013656616, "eval_runtime": 33.9291, "eval_samples_per_second": 3.448, "eval_steps_per_second": 1.739, "step": 1095 }, { "epoch": 0.022, "grad_norm": 0.03972633299982532, "learning_rate": 2.198e-06, "loss": 2.4871, "step": 1100 }, { "epoch": 0.022, "eval_loss": 2.482428550720215, "eval_runtime": 33.7324, "eval_samples_per_second": 3.468, "eval_steps_per_second": 1.749, "step": 1100 }, { "epoch": 0.0221, "eval_loss": 2.4822213649749756, "eval_runtime": 33.7954, "eval_samples_per_second": 3.462, "eval_steps_per_second": 1.746, "step": 1105 }, { "epoch": 0.0222, "eval_loss": 2.481689214706421, "eval_runtime": 33.7787, "eval_samples_per_second": 3.464, "eval_steps_per_second": 1.747, "step": 1110 }, { "epoch": 0.0223, "eval_loss": 2.481731414794922, "eval_runtime": 33.6129, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 1115 }, { "epoch": 0.0224, "eval_loss": 2.4812448024749756, "eval_runtime": 33.511, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 1120 }, { "epoch": 0.0225, "grad_norm": 0.041792864961431496, "learning_rate": 2.2480000000000003e-06, "loss": 2.4766, "step": 1125 }, { "epoch": 0.0225, "eval_loss": 2.4809837341308594, "eval_runtime": 33.7009, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 1125 }, { "epoch": 0.0226, "eval_loss": 2.480768918991089, "eval_runtime": 33.6615, "eval_samples_per_second": 3.476, "eval_steps_per_second": 1.753, "step": 1130 }, { "epoch": 0.0227, "eval_loss": 2.480337381362915, "eval_runtime": 33.6203, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 1135 }, { "epoch": 0.0228, "eval_loss": 2.4803271293640137, "eval_runtime": 33.6559, "eval_samples_per_second": 3.476, "eval_steps_per_second": 1.753, "step": 1140 }, { "epoch": 0.0229, "eval_loss": 2.4799482822418213, "eval_runtime": 33.5023, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 1145 }, { "epoch": 0.023, "grad_norm": 0.035383899567194975, "learning_rate": 2.2980000000000003e-06, "loss": 2.4749, "step": 1150 }, { "epoch": 0.023, "eval_loss": 2.479668140411377, "eval_runtime": 33.4615, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.763, "step": 1150 }, { "epoch": 0.0231, "eval_loss": 2.4794092178344727, "eval_runtime": 33.4264, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 1155 }, { "epoch": 0.0232, "eval_loss": 2.4790964126586914, "eval_runtime": 33.4165, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.766, "step": 1160 }, { "epoch": 0.0233, "eval_loss": 2.4789323806762695, "eval_runtime": 33.2576, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1165 }, { "epoch": 0.0234, "eval_loss": 2.4786429405212402, "eval_runtime": 33.3028, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.772, "step": 1170 }, { "epoch": 0.0235, "grad_norm": 0.034819138532107045, "learning_rate": 2.3480000000000002e-06, "loss": 2.4874, "step": 1175 }, { "epoch": 0.0235, "eval_loss": 2.4784486293792725, "eval_runtime": 33.3374, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1175 }, { "epoch": 0.0236, "eval_loss": 2.478088855743408, "eval_runtime": 33.2864, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.772, "step": 1180 }, { "epoch": 0.0237, "eval_loss": 2.477979898452759, "eval_runtime": 33.4245, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 1185 }, { "epoch": 0.0238, "eval_loss": 2.4778709411621094, "eval_runtime": 33.2611, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1190 }, { "epoch": 0.0239, "eval_loss": 2.477571487426758, "eval_runtime": 33.3418, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1195 }, { "epoch": 0.024, "grad_norm": 0.037748109041694296, "learning_rate": 2.398e-06, "loss": 2.4666, "step": 1200 }, { "epoch": 0.024, "eval_loss": 2.4772226810455322, "eval_runtime": 33.3603, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 1200 }, { "epoch": 0.0241, "eval_loss": 2.4769959449768066, "eval_runtime": 33.21, "eval_samples_per_second": 3.523, "eval_steps_per_second": 1.777, "step": 1205 }, { "epoch": 0.0242, "eval_loss": 2.4768526554107666, "eval_runtime": 33.4359, "eval_samples_per_second": 3.499, "eval_steps_per_second": 1.765, "step": 1210 }, { "epoch": 0.0243, "eval_loss": 2.476616382598877, "eval_runtime": 33.3341, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1215 }, { "epoch": 0.0244, "eval_loss": 2.476250171661377, "eval_runtime": 33.3422, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1220 }, { "epoch": 0.0245, "grad_norm": 0.042904100843004035, "learning_rate": 2.448e-06, "loss": 2.4698, "step": 1225 }, { "epoch": 0.0245, "eval_loss": 2.475933790206909, "eval_runtime": 33.3238, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1225 }, { "epoch": 0.0246, "eval_loss": 2.475733995437622, "eval_runtime": 33.337, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1230 }, { "epoch": 0.0247, "eval_loss": 2.4756155014038086, "eval_runtime": 33.3642, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.768, "step": 1235 }, { "epoch": 0.0248, "eval_loss": 2.475208044052124, "eval_runtime": 33.3567, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 1240 }, { "epoch": 0.0249, "eval_loss": 2.4751882553100586, "eval_runtime": 33.2409, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.775, "step": 1245 }, { "epoch": 0.025, "grad_norm": 0.04198064762114288, "learning_rate": 2.498e-06, "loss": 2.4544, "step": 1250 }, { "epoch": 0.025, "eval_loss": 2.4749433994293213, "eval_runtime": 33.219, "eval_samples_per_second": 3.522, "eval_steps_per_second": 1.776, "step": 1250 }, { "epoch": 0.0251, "eval_loss": 2.475109577178955, "eval_runtime": 33.293, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1255 }, { "epoch": 0.0252, "eval_loss": 2.474750280380249, "eval_runtime": 33.5388, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 1260 }, { "epoch": 0.0253, "eval_loss": 2.4743547439575195, "eval_runtime": 33.3597, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 1265 }, { "epoch": 0.0254, "eval_loss": 2.4740777015686035, "eval_runtime": 33.3283, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1270 }, { "epoch": 0.0255, "grad_norm": 0.03252077443949688, "learning_rate": 2.5480000000000004e-06, "loss": 2.4647, "step": 1275 }, { "epoch": 0.0255, "eval_loss": 2.473674774169922, "eval_runtime": 33.2492, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.774, "step": 1275 }, { "epoch": 0.0256, "eval_loss": 2.4734930992126465, "eval_runtime": 33.2934, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1280 }, { "epoch": 0.0257, "eval_loss": 2.4735071659088135, "eval_runtime": 33.466, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 1285 }, { "epoch": 0.0258, "eval_loss": 2.4733572006225586, "eval_runtime": 33.248, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.775, "step": 1290 }, { "epoch": 0.0259, "eval_loss": 2.4730312824249268, "eval_runtime": 33.3551, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 1295 }, { "epoch": 0.026, "grad_norm": 0.034740776600877266, "learning_rate": 2.598e-06, "loss": 2.4625, "step": 1300 }, { "epoch": 0.026, "eval_loss": 2.4726204872131348, "eval_runtime": 33.3147, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1300 }, { "epoch": 0.0261, "eval_loss": 2.4729621410369873, "eval_runtime": 33.3118, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1305 }, { "epoch": 0.0262, "eval_loss": 2.4726085662841797, "eval_runtime": 33.4111, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 1310 }, { "epoch": 0.0263, "eval_loss": 2.4724133014678955, "eval_runtime": 33.3144, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1315 }, { "epoch": 0.0264, "eval_loss": 2.471963405609131, "eval_runtime": 33.3272, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1320 }, { "epoch": 0.0265, "grad_norm": 0.039738232523319775, "learning_rate": 2.648e-06, "loss": 2.4734, "step": 1325 }, { "epoch": 0.0265, "eval_loss": 2.4717814922332764, "eval_runtime": 33.2395, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.775, "step": 1325 }, { "epoch": 0.0266, "eval_loss": 2.471389055252075, "eval_runtime": 33.2159, "eval_samples_per_second": 3.522, "eval_steps_per_second": 1.776, "step": 1330 }, { "epoch": 0.0267, "eval_loss": 2.4711251258850098, "eval_runtime": 33.4193, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 1335 }, { "epoch": 0.0268, "eval_loss": 2.470979928970337, "eval_runtime": 33.2748, "eval_samples_per_second": 3.516, "eval_steps_per_second": 1.773, "step": 1340 }, { "epoch": 0.0269, "eval_loss": 2.4706759452819824, "eval_runtime": 33.3367, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1345 }, { "epoch": 0.027, "grad_norm": 0.036968596903604725, "learning_rate": 2.6980000000000003e-06, "loss": 2.4642, "step": 1350 }, { "epoch": 0.027, "eval_loss": 2.470658302307129, "eval_runtime": 33.3288, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1350 }, { "epoch": 0.0271, "eval_loss": 2.4704952239990234, "eval_runtime": 33.3162, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1355 }, { "epoch": 0.0272, "eval_loss": 2.470270872116089, "eval_runtime": 33.35, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 1360 }, { "epoch": 0.0273, "eval_loss": 2.4699764251708984, "eval_runtime": 33.3696, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1365 }, { "epoch": 0.0274, "eval_loss": 2.469688653945923, "eval_runtime": 33.4143, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.766, "step": 1370 }, { "epoch": 0.0275, "grad_norm": 0.03899590922475157, "learning_rate": 2.748e-06, "loss": 2.4579, "step": 1375 }, { "epoch": 0.0275, "eval_loss": 2.469435691833496, "eval_runtime": 33.34, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1375 }, { "epoch": 0.0276, "eval_loss": 2.469395160675049, "eval_runtime": 33.2655, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.774, "step": 1380 }, { "epoch": 0.0277, "eval_loss": 2.46889328956604, "eval_runtime": 33.3344, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1385 }, { "epoch": 0.0278, "eval_loss": 2.468695640563965, "eval_runtime": 33.4003, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 1390 }, { "epoch": 0.0279, "eval_loss": 2.4685797691345215, "eval_runtime": 33.252, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.774, "step": 1395 }, { "epoch": 0.028, "grad_norm": 0.03498385470366268, "learning_rate": 2.798e-06, "loss": 2.472, "step": 1400 }, { "epoch": 0.028, "eval_loss": 2.468594789505005, "eval_runtime": 33.5555, "eval_samples_per_second": 3.487, "eval_steps_per_second": 1.758, "step": 1400 }, { "epoch": 0.0281, "eval_loss": 2.4685287475585938, "eval_runtime": 33.3147, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1405 }, { "epoch": 0.0282, "eval_loss": 2.467956304550171, "eval_runtime": 33.3679, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1410 }, { "epoch": 0.0283, "eval_loss": 2.467761993408203, "eval_runtime": 33.3242, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1415 }, { "epoch": 0.0284, "eval_loss": 2.467660903930664, "eval_runtime": 33.3677, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1420 }, { "epoch": 0.0285, "grad_norm": 0.03333480906358989, "learning_rate": 2.848e-06, "loss": 2.4676, "step": 1425 }, { "epoch": 0.0285, "eval_loss": 2.4673027992248535, "eval_runtime": 33.3388, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1425 }, { "epoch": 0.0286, "eval_loss": 2.467072010040283, "eval_runtime": 33.3596, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 1430 }, { "epoch": 0.0287, "eval_loss": 2.4668517112731934, "eval_runtime": 33.5136, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 1435 }, { "epoch": 0.0288, "eval_loss": 2.4666786193847656, "eval_runtime": 33.3405, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1440 }, { "epoch": 0.0289, "eval_loss": 2.4667794704437256, "eval_runtime": 33.3333, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1445 }, { "epoch": 0.029, "grad_norm": 0.03480548121480933, "learning_rate": 2.8980000000000005e-06, "loss": 2.4524, "step": 1450 }, { "epoch": 0.029, "eval_loss": 2.466280460357666, "eval_runtime": 33.4727, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.763, "step": 1450 }, { "epoch": 0.0291, "eval_loss": 2.4659922122955322, "eval_runtime": 33.3309, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1455 }, { "epoch": 0.0292, "eval_loss": 2.4657278060913086, "eval_runtime": 33.326, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1460 }, { "epoch": 0.0293, "eval_loss": 2.4654440879821777, "eval_runtime": 33.3457, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.769, "step": 1465 }, { "epoch": 0.0294, "eval_loss": 2.465367317199707, "eval_runtime": 33.2824, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.773, "step": 1470 }, { "epoch": 0.0295, "grad_norm": 0.03652712436191979, "learning_rate": 2.9480000000000004e-06, "loss": 2.466, "step": 1475 }, { "epoch": 0.0295, "eval_loss": 2.465318202972412, "eval_runtime": 33.3264, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1475 }, { "epoch": 0.0296, "eval_loss": 2.465156316757202, "eval_runtime": 33.2661, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.774, "step": 1480 }, { "epoch": 0.0297, "eval_loss": 2.4648799896240234, "eval_runtime": 33.4782, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.762, "step": 1485 }, { "epoch": 0.0298, "eval_loss": 2.4646074771881104, "eval_runtime": 33.3194, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1490 }, { "epoch": 0.0299, "eval_loss": 2.464465856552124, "eval_runtime": 33.3466, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.769, "step": 1495 }, { "epoch": 0.03, "grad_norm": 0.03778721361564108, "learning_rate": 2.9980000000000003e-06, "loss": 2.4684, "step": 1500 }, { "epoch": 0.03, "eval_loss": 2.464305877685547, "eval_runtime": 33.25, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.774, "step": 1500 }, { "epoch": 0.0301, "eval_loss": 2.464261531829834, "eval_runtime": 33.3761, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.768, "step": 1505 }, { "epoch": 0.0302, "eval_loss": 2.464185953140259, "eval_runtime": 33.4957, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.761, "step": 1510 }, { "epoch": 0.0303, "eval_loss": 2.4639229774475098, "eval_runtime": 33.2475, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.775, "step": 1515 }, { "epoch": 0.0304, "eval_loss": 2.4636595249176025, "eval_runtime": 33.3124, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1520 }, { "epoch": 0.0305, "grad_norm": 0.035809836530372154, "learning_rate": 3.0480000000000003e-06, "loss": 2.4631, "step": 1525 }, { "epoch": 0.0305, "eval_loss": 2.46356201171875, "eval_runtime": 33.3423, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1525 }, { "epoch": 0.0306, "eval_loss": 2.463318347930908, "eval_runtime": 33.3917, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 1530 }, { "epoch": 0.0307, "eval_loss": 2.4631264209747314, "eval_runtime": 33.4053, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 1535 }, { "epoch": 0.0308, "eval_loss": 2.462981700897217, "eval_runtime": 33.2608, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1540 }, { "epoch": 0.0309, "eval_loss": 2.462719202041626, "eval_runtime": 33.3259, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1545 }, { "epoch": 0.031, "grad_norm": 0.05979367258550731, "learning_rate": 3.0980000000000007e-06, "loss": 2.46, "step": 1550 }, { "epoch": 0.031, "eval_loss": 2.462733268737793, "eval_runtime": 33.3195, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1550 }, { "epoch": 0.0311, "eval_loss": 2.4625959396362305, "eval_runtime": 33.3704, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1555 }, { "epoch": 0.0312, "eval_loss": 2.462366819381714, "eval_runtime": 33.4047, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 1560 }, { "epoch": 0.0313, "eval_loss": 2.4618427753448486, "eval_runtime": 33.3896, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 1565 }, { "epoch": 0.0314, "eval_loss": 2.4616317749023438, "eval_runtime": 33.3414, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1570 }, { "epoch": 0.0315, "grad_norm": 0.031804244667956116, "learning_rate": 3.1480000000000006e-06, "loss": 2.4477, "step": 1575 }, { "epoch": 0.0315, "eval_loss": 2.4615368843078613, "eval_runtime": 33.3548, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 1575 }, { "epoch": 0.0316, "eval_loss": 2.461198091506958, "eval_runtime": 33.2416, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.775, "step": 1580 }, { "epoch": 0.0317, "eval_loss": 2.4611523151397705, "eval_runtime": 33.3445, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.769, "step": 1585 }, { "epoch": 0.0318, "eval_loss": 2.4609127044677734, "eval_runtime": 33.3175, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1590 }, { "epoch": 0.0319, "eval_loss": 2.4608800411224365, "eval_runtime": 33.3052, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.771, "step": 1595 }, { "epoch": 0.032, "grad_norm": 0.03365841309984822, "learning_rate": 3.198e-06, "loss": 2.4523, "step": 1600 }, { "epoch": 0.032, "eval_loss": 2.460757255554199, "eval_runtime": 33.2636, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.774, "step": 1600 }, { "epoch": 0.0321, "eval_loss": 2.4605917930603027, "eval_runtime": 33.4595, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.763, "step": 1605 }, { "epoch": 0.0322, "eval_loss": 2.4604575634002686, "eval_runtime": 33.2706, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.773, "step": 1610 }, { "epoch": 0.0323, "eval_loss": 2.4603111743927, "eval_runtime": 33.405, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 1615 }, { "epoch": 0.0324, "eval_loss": 2.460045337677002, "eval_runtime": 33.2598, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1620 }, { "epoch": 0.0325, "grad_norm": 0.03534600587541967, "learning_rate": 3.248e-06, "loss": 2.45, "step": 1625 }, { "epoch": 0.0325, "eval_loss": 2.460045099258423, "eval_runtime": 33.2663, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.774, "step": 1625 }, { "epoch": 0.0326, "eval_loss": 2.4599287509918213, "eval_runtime": 33.2545, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1630 }, { "epoch": 0.0327, "eval_loss": 2.459611654281616, "eval_runtime": 33.4189, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 1635 }, { "epoch": 0.0328, "eval_loss": 2.4594151973724365, "eval_runtime": 33.284, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.773, "step": 1640 }, { "epoch": 0.0329, "eval_loss": 2.4589221477508545, "eval_runtime": 33.4033, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 1645 }, { "epoch": 0.033, "grad_norm": 0.032596527761614855, "learning_rate": 3.298e-06, "loss": 2.4422, "step": 1650 }, { "epoch": 0.033, "eval_loss": 2.4589502811431885, "eval_runtime": 33.2986, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1650 }, { "epoch": 0.0331, "eval_loss": 2.4588239192962646, "eval_runtime": 33.4046, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 1655 }, { "epoch": 0.0332, "eval_loss": 2.458603620529175, "eval_runtime": 33.3448, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.769, "step": 1660 }, { "epoch": 0.0333, "eval_loss": 2.458559513092041, "eval_runtime": 33.368, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1665 }, { "epoch": 0.0334, "eval_loss": 2.458500862121582, "eval_runtime": 33.2335, "eval_samples_per_second": 3.521, "eval_steps_per_second": 1.775, "step": 1670 }, { "epoch": 0.0335, "grad_norm": 0.03339611698643194, "learning_rate": 3.348e-06, "loss": 2.447, "step": 1675 }, { "epoch": 0.0335, "eval_loss": 2.458252191543579, "eval_runtime": 33.3623, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.768, "step": 1675 }, { "epoch": 0.0336, "eval_loss": 2.4580931663513184, "eval_runtime": 33.2532, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1680 }, { "epoch": 0.0337, "eval_loss": 2.4578795433044434, "eval_runtime": 33.3214, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1685 }, { "epoch": 0.0338, "eval_loss": 2.4576218128204346, "eval_runtime": 33.248, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.775, "step": 1690 }, { "epoch": 0.0339, "eval_loss": 2.4576828479766846, "eval_runtime": 33.3499, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 1695 }, { "epoch": 0.034, "grad_norm": 0.03028181865357742, "learning_rate": 3.3980000000000003e-06, "loss": 2.4582, "step": 1700 }, { "epoch": 0.034, "eval_loss": 2.457383155822754, "eval_runtime": 33.2574, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1700 }, { "epoch": 0.0341, "eval_loss": 2.4572579860687256, "eval_runtime": 33.2947, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1705 }, { "epoch": 0.0342, "eval_loss": 2.4584450721740723, "eval_runtime": 33.3296, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1710 }, { "epoch": 0.0343, "eval_loss": 2.458603858947754, "eval_runtime": 33.3017, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.772, "step": 1715 }, { "epoch": 0.0344, "eval_loss": 2.4579555988311768, "eval_runtime": 33.292, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1720 }, { "epoch": 0.0345, "grad_norm": 0.03734241446236971, "learning_rate": 3.4480000000000003e-06, "loss": 2.4501, "step": 1725 }, { "epoch": 0.0345, "eval_loss": 2.4574153423309326, "eval_runtime": 33.4313, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 1725 }, { "epoch": 0.0346, "eval_loss": 2.456867218017578, "eval_runtime": 33.2833, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.773, "step": 1730 }, { "epoch": 0.0347, "eval_loss": 2.4567270278930664, "eval_runtime": 33.3694, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1735 }, { "epoch": 0.0348, "eval_loss": 2.456348180770874, "eval_runtime": 33.3416, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1740 }, { "epoch": 0.0349, "eval_loss": 2.4563136100769043, "eval_runtime": 33.3531, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 1745 }, { "epoch": 0.035, "grad_norm": 0.030782538004837847, "learning_rate": 3.4980000000000002e-06, "loss": 2.4509, "step": 1750 }, { "epoch": 0.035, "eval_loss": 2.455827236175537, "eval_runtime": 33.3143, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1750 }, { "epoch": 0.0351, "eval_loss": 2.4558639526367188, "eval_runtime": 33.3716, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1755 }, { "epoch": 0.0352, "eval_loss": 2.4555938243865967, "eval_runtime": 33.2966, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1760 }, { "epoch": 0.0353, "eval_loss": 2.4551546573638916, "eval_runtime": 33.3145, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1765 }, { "epoch": 0.0354, "eval_loss": 2.454957962036133, "eval_runtime": 33.3201, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1770 }, { "epoch": 0.0355, "grad_norm": 0.03281862515471333, "learning_rate": 3.548e-06, "loss": 2.4439, "step": 1775 }, { "epoch": 0.0355, "eval_loss": 2.455031394958496, "eval_runtime": 33.264, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.774, "step": 1775 }, { "epoch": 0.0356, "eval_loss": 2.4550724029541016, "eval_runtime": 33.3734, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1780 }, { "epoch": 0.0357, "eval_loss": 2.454719305038452, "eval_runtime": 33.3267, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1785 }, { "epoch": 0.0358, "eval_loss": 2.4547033309936523, "eval_runtime": 33.2651, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.774, "step": 1790 }, { "epoch": 0.0359, "eval_loss": 2.454416275024414, "eval_runtime": 33.3612, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 1795 }, { "epoch": 0.036, "grad_norm": 0.031756006482001914, "learning_rate": 3.5980000000000005e-06, "loss": 2.4493, "step": 1800 }, { "epoch": 0.036, "eval_loss": 2.454286813735962, "eval_runtime": 33.326, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1800 }, { "epoch": 0.0361, "eval_loss": 2.4541101455688477, "eval_runtime": 33.2597, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 1805 }, { "epoch": 0.0362, "eval_loss": 2.4541351795196533, "eval_runtime": 33.2421, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.775, "step": 1810 }, { "epoch": 0.0363, "eval_loss": 2.4537973403930664, "eval_runtime": 33.3201, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1815 }, { "epoch": 0.0364, "eval_loss": 2.4534847736358643, "eval_runtime": 33.2973, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1820 }, { "epoch": 0.0365, "grad_norm": 0.03128096989289917, "learning_rate": 3.6480000000000005e-06, "loss": 2.4526, "step": 1825 }, { "epoch": 0.0365, "eval_loss": 2.453655481338501, "eval_runtime": 33.3755, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1825 }, { "epoch": 0.0366, "eval_loss": 2.4534049034118652, "eval_runtime": 33.332, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1830 }, { "epoch": 0.0367, "eval_loss": 2.4529781341552734, "eval_runtime": 33.3325, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1835 }, { "epoch": 0.0368, "eval_loss": 2.454005241394043, "eval_runtime": 33.3975, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.767, "step": 1840 }, { "epoch": 0.0369, "eval_loss": 2.4538745880126953, "eval_runtime": 33.3, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 1845 }, { "epoch": 0.037, "grad_norm": 0.02999582338402207, "learning_rate": 3.6980000000000004e-06, "loss": 2.4309, "step": 1850 }, { "epoch": 0.037, "eval_loss": 2.4534404277801514, "eval_runtime": 33.2825, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.773, "step": 1850 }, { "epoch": 0.0371, "eval_loss": 2.4529800415039062, "eval_runtime": 33.513, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 1855 }, { "epoch": 0.0372, "eval_loss": 2.453007221221924, "eval_runtime": 33.3414, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1860 }, { "epoch": 0.0373, "eval_loss": 2.452350616455078, "eval_runtime": 33.3625, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.768, "step": 1865 }, { "epoch": 0.0374, "eval_loss": 2.4522666931152344, "eval_runtime": 33.3116, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1870 }, { "epoch": 0.0375, "grad_norm": 0.0409025592520596, "learning_rate": 3.7480000000000004e-06, "loss": 2.442, "step": 1875 }, { "epoch": 0.0375, "eval_loss": 2.4521546363830566, "eval_runtime": 33.3782, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.768, "step": 1875 }, { "epoch": 0.0376, "eval_loss": 2.4520437717437744, "eval_runtime": 33.2887, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.772, "step": 1880 }, { "epoch": 0.0377, "eval_loss": 2.4519331455230713, "eval_runtime": 33.3746, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 1885 }, { "epoch": 0.0378, "eval_loss": 2.451744556427002, "eval_runtime": 33.3214, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1890 }, { "epoch": 0.0379, "eval_loss": 2.451737642288208, "eval_runtime": 33.3457, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.769, "step": 1895 }, { "epoch": 0.038, "grad_norm": 0.03431980647954774, "learning_rate": 3.7980000000000007e-06, "loss": 2.4477, "step": 1900 }, { "epoch": 0.038, "eval_loss": 2.4515624046325684, "eval_runtime": 33.312, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 1900 }, { "epoch": 0.0381, "eval_loss": 2.4512295722961426, "eval_runtime": 33.3607, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 1905 }, { "epoch": 0.0382, "eval_loss": 2.4510445594787598, "eval_runtime": 33.339, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 1910 }, { "epoch": 0.0383, "eval_loss": 2.4508397579193115, "eval_runtime": 33.3996, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 1915 }, { "epoch": 0.0384, "eval_loss": 2.4510440826416016, "eval_runtime": 33.2905, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.772, "step": 1920 }, { "epoch": 0.0385, "grad_norm": 0.03587224652231601, "learning_rate": 3.848e-06, "loss": 2.4433, "step": 1925 }, { "epoch": 0.0385, "eval_loss": 2.450984239578247, "eval_runtime": 33.3263, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1925 }, { "epoch": 0.0386, "eval_loss": 2.45090651512146, "eval_runtime": 33.3244, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 1930 }, { "epoch": 0.0387, "eval_loss": 2.450443983078003, "eval_runtime": 33.3023, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.772, "step": 1935 }, { "epoch": 0.0388, "eval_loss": 2.450309991836548, "eval_runtime": 33.4354, "eval_samples_per_second": 3.499, "eval_steps_per_second": 1.765, "step": 1940 }, { "epoch": 0.0389, "eval_loss": 2.4500510692596436, "eval_runtime": 33.3238, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.771, "step": 1945 }, { "epoch": 0.039, "grad_norm": 0.027239293031380653, "learning_rate": 3.898e-06, "loss": 2.4347, "step": 1950 }, { "epoch": 0.039, "eval_loss": 2.4498231410980225, "eval_runtime": 33.3306, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 1950 }, { "epoch": 0.0391, "eval_loss": 2.449704170227051, "eval_runtime": 33.3865, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 1955 }, { "epoch": 0.0392, "eval_loss": 2.44974684715271, "eval_runtime": 33.419, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 1960 }, { "epoch": 0.0393, "eval_loss": 2.450090169906616, "eval_runtime": 33.5315, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.76, "step": 1965 }, { "epoch": 0.0394, "eval_loss": 2.4494845867156982, "eval_runtime": 33.4607, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.763, "step": 1970 }, { "epoch": 0.0395, "grad_norm": 0.031553482039351585, "learning_rate": 3.948e-06, "loss": 2.4466, "step": 1975 }, { "epoch": 0.0395, "eval_loss": 2.449598550796509, "eval_runtime": 33.4853, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 1975 }, { "epoch": 0.0396, "eval_loss": 2.449420213699341, "eval_runtime": 33.4626, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 1980 }, { "epoch": 0.0397, "eval_loss": 2.449462890625, "eval_runtime": 33.4049, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 1985 }, { "epoch": 0.0398, "eval_loss": 2.449423313140869, "eval_runtime": 33.5823, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 1990 }, { "epoch": 0.0399, "eval_loss": 2.4491324424743652, "eval_runtime": 33.662, "eval_samples_per_second": 3.476, "eval_steps_per_second": 1.753, "step": 1995 }, { "epoch": 0.04, "grad_norm": 0.03314009226524554, "learning_rate": 3.9980000000000005e-06, "loss": 2.4391, "step": 2000 }, { "epoch": 0.04, "eval_loss": 2.449084520339966, "eval_runtime": 33.5872, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.757, "step": 2000 }, { "epoch": 0.0401, "eval_loss": 2.449021577835083, "eval_runtime": 33.5048, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 2005 }, { "epoch": 0.0402, "eval_loss": 2.449159622192383, "eval_runtime": 33.4845, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 2010 }, { "epoch": 0.0403, "eval_loss": 2.448726177215576, "eval_runtime": 33.9926, "eval_samples_per_second": 3.442, "eval_steps_per_second": 1.736, "step": 2015 }, { "epoch": 0.0404, "eval_loss": 2.4484922885894775, "eval_runtime": 33.6594, "eval_samples_per_second": 3.476, "eval_steps_per_second": 1.753, "step": 2020 }, { "epoch": 0.0405, "grad_norm": 0.029877786947315705, "learning_rate": 4.048e-06, "loss": 2.438, "step": 2025 }, { "epoch": 0.0405, "eval_loss": 2.4485254287719727, "eval_runtime": 33.6812, "eval_samples_per_second": 3.474, "eval_steps_per_second": 1.752, "step": 2025 }, { "epoch": 0.0406, "eval_loss": 2.448495388031006, "eval_runtime": 33.9733, "eval_samples_per_second": 3.444, "eval_steps_per_second": 1.737, "step": 2030 }, { "epoch": 0.0407, "eval_loss": 2.4482643604278564, "eval_runtime": 33.9957, "eval_samples_per_second": 3.442, "eval_steps_per_second": 1.736, "step": 2035 }, { "epoch": 0.0408, "eval_loss": 2.4481942653656006, "eval_runtime": 34.3014, "eval_samples_per_second": 3.411, "eval_steps_per_second": 1.72, "step": 2040 }, { "epoch": 0.0409, "eval_loss": 2.448082208633423, "eval_runtime": 34.0411, "eval_samples_per_second": 3.437, "eval_steps_per_second": 1.733, "step": 2045 }, { "epoch": 0.041, "grad_norm": 0.031175983773220776, "learning_rate": 4.098e-06, "loss": 2.4332, "step": 2050 }, { "epoch": 0.041, "eval_loss": 2.4478490352630615, "eval_runtime": 33.9245, "eval_samples_per_second": 3.449, "eval_steps_per_second": 1.739, "step": 2050 }, { "epoch": 0.0411, "eval_loss": 2.4480035305023193, "eval_runtime": 34.0079, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.735, "step": 2055 }, { "epoch": 0.0412, "eval_loss": 2.447685718536377, "eval_runtime": 33.999, "eval_samples_per_second": 3.441, "eval_steps_per_second": 1.735, "step": 2060 }, { "epoch": 0.0413, "eval_loss": 2.447507619857788, "eval_runtime": 34.1446, "eval_samples_per_second": 3.427, "eval_steps_per_second": 1.728, "step": 2065 }, { "epoch": 0.0414, "eval_loss": 2.447322130203247, "eval_runtime": 33.7479, "eval_samples_per_second": 3.467, "eval_steps_per_second": 1.748, "step": 2070 }, { "epoch": 0.0415, "grad_norm": 0.02904850084773878, "learning_rate": 4.148000000000001e-06, "loss": 2.4481, "step": 2075 }, { "epoch": 0.0415, "eval_loss": 2.4471347332000732, "eval_runtime": 33.917, "eval_samples_per_second": 3.45, "eval_steps_per_second": 1.74, "step": 2075 }, { "epoch": 0.0416, "eval_loss": 2.447152853012085, "eval_runtime": 33.8287, "eval_samples_per_second": 3.459, "eval_steps_per_second": 1.744, "step": 2080 }, { "epoch": 0.0417, "eval_loss": 2.4469242095947266, "eval_runtime": 33.7591, "eval_samples_per_second": 3.466, "eval_steps_per_second": 1.748, "step": 2085 }, { "epoch": 0.0418, "eval_loss": 2.4471774101257324, "eval_runtime": 33.7879, "eval_samples_per_second": 3.463, "eval_steps_per_second": 1.746, "step": 2090 }, { "epoch": 0.0419, "eval_loss": 2.447988986968994, "eval_runtime": 33.6878, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.751, "step": 2095 }, { "epoch": 0.042, "grad_norm": 0.033838990669225626, "learning_rate": 4.198e-06, "loss": 2.4386, "step": 2100 }, { "epoch": 0.042, "eval_loss": 2.4477100372314453, "eval_runtime": 33.6345, "eval_samples_per_second": 3.479, "eval_steps_per_second": 1.754, "step": 2100 }, { "epoch": 0.0421, "eval_loss": 2.447394847869873, "eval_runtime": 33.6221, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 2105 }, { "epoch": 0.0422, "eval_loss": 2.4470951557159424, "eval_runtime": 33.6689, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.752, "step": 2110 }, { "epoch": 0.0423, "eval_loss": 2.4467623233795166, "eval_runtime": 33.6979, "eval_samples_per_second": 3.472, "eval_steps_per_second": 1.751, "step": 2115 }, { "epoch": 0.0424, "eval_loss": 2.4469833374023438, "eval_runtime": 33.8632, "eval_samples_per_second": 3.455, "eval_steps_per_second": 1.742, "step": 2120 }, { "epoch": 0.0425, "grad_norm": 0.0382703849144026, "learning_rate": 4.248000000000001e-06, "loss": 2.4313, "step": 2125 }, { "epoch": 0.0425, "eval_loss": 2.447753667831421, "eval_runtime": 33.7269, "eval_samples_per_second": 3.469, "eval_steps_per_second": 1.749, "step": 2125 }, { "epoch": 0.0426, "eval_loss": 2.447281837463379, "eval_runtime": 33.7037, "eval_samples_per_second": 3.471, "eval_steps_per_second": 1.751, "step": 2130 }, { "epoch": 0.0427, "eval_loss": 2.4472267627716064, "eval_runtime": 33.6873, "eval_samples_per_second": 3.473, "eval_steps_per_second": 1.751, "step": 2135 }, { "epoch": 0.0428, "eval_loss": 2.446859836578369, "eval_runtime": 33.6738, "eval_samples_per_second": 3.475, "eval_steps_per_second": 1.752, "step": 2140 }, { "epoch": 0.0429, "eval_loss": 2.446655035018921, "eval_runtime": 33.6536, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.753, "step": 2145 }, { "epoch": 0.043, "grad_norm": 0.027126678960545086, "learning_rate": 4.298e-06, "loss": 2.4298, "step": 2150 }, { "epoch": 0.043, "eval_loss": 2.4463651180267334, "eval_runtime": 33.6454, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.754, "step": 2150 }, { "epoch": 0.0431, "eval_loss": 2.4461581707000732, "eval_runtime": 33.6166, "eval_samples_per_second": 3.48, "eval_steps_per_second": 1.755, "step": 2155 }, { "epoch": 0.0432, "eval_loss": 2.4461660385131836, "eval_runtime": 33.5484, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 2160 }, { "epoch": 0.0433, "eval_loss": 2.4458513259887695, "eval_runtime": 33.6579, "eval_samples_per_second": 3.476, "eval_steps_per_second": 1.753, "step": 2165 }, { "epoch": 0.0434, "eval_loss": 2.4454855918884277, "eval_runtime": 33.5647, "eval_samples_per_second": 3.486, "eval_steps_per_second": 1.758, "step": 2170 }, { "epoch": 0.0435, "grad_norm": 0.030565328679921875, "learning_rate": 4.3480000000000006e-06, "loss": 2.4387, "step": 2175 }, { "epoch": 0.0435, "eval_loss": 2.445688009262085, "eval_runtime": 33.5164, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 2175 }, { "epoch": 0.0436, "eval_loss": 2.4456729888916016, "eval_runtime": 33.4724, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.763, "step": 2180 }, { "epoch": 0.0437, "eval_loss": 2.4460015296936035, "eval_runtime": 33.3984, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.767, "step": 2185 }, { "epoch": 0.0438, "eval_loss": 2.4460256099700928, "eval_runtime": 33.4582, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.763, "step": 2190 }, { "epoch": 0.0439, "eval_loss": 2.4456872940063477, "eval_runtime": 33.444, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2195 }, { "epoch": 0.044, "grad_norm": 0.03864046787827566, "learning_rate": 4.398000000000001e-06, "loss": 2.445, "step": 2200 }, { "epoch": 0.044, "eval_loss": 2.4454870223999023, "eval_runtime": 33.4474, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2200 }, { "epoch": 0.0441, "eval_loss": 2.4453113079071045, "eval_runtime": 33.4062, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 2205 }, { "epoch": 0.0442, "eval_loss": 2.4448771476745605, "eval_runtime": 33.3542, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 2210 }, { "epoch": 0.0443, "eval_loss": 2.444946765899658, "eval_runtime": 33.3997, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 2215 }, { "epoch": 0.0444, "eval_loss": 2.445194959640503, "eval_runtime": 33.3669, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 2220 }, { "epoch": 0.0445, "grad_norm": 0.026792091668494698, "learning_rate": 4.4480000000000004e-06, "loss": 2.4339, "step": 2225 }, { "epoch": 0.0445, "eval_loss": 2.445009469985962, "eval_runtime": 33.4467, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2225 }, { "epoch": 0.0446, "eval_loss": 2.4450981616973877, "eval_runtime": 33.4513, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2230 }, { "epoch": 0.0447, "eval_loss": 2.444899082183838, "eval_runtime": 33.3869, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 2235 }, { "epoch": 0.0448, "eval_loss": 2.4448494911193848, "eval_runtime": 33.486, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 2240 }, { "epoch": 0.0449, "eval_loss": 2.444640636444092, "eval_runtime": 33.4202, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 2245 }, { "epoch": 0.045, "grad_norm": 0.027104711228224686, "learning_rate": 4.498e-06, "loss": 2.4326, "step": 2250 }, { "epoch": 0.045, "eval_loss": 2.444633722305298, "eval_runtime": 33.4154, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.766, "step": 2250 }, { "epoch": 0.0451, "eval_loss": 2.44467830657959, "eval_runtime": 33.4237, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 2255 }, { "epoch": 0.0452, "eval_loss": 2.444413900375366, "eval_runtime": 33.3694, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 2260 }, { "epoch": 0.0453, "eval_loss": 2.444222927093506, "eval_runtime": 33.3585, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 2265 }, { "epoch": 0.0454, "eval_loss": 2.444108724594116, "eval_runtime": 33.3346, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 2270 }, { "epoch": 0.0455, "grad_norm": 0.033569645173308425, "learning_rate": 4.548e-06, "loss": 2.4342, "step": 2275 }, { "epoch": 0.0455, "eval_loss": 2.443859577178955, "eval_runtime": 33.3636, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.768, "step": 2275 }, { "epoch": 0.0456, "eval_loss": 2.4441120624542236, "eval_runtime": 33.2442, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.775, "step": 2280 }, { "epoch": 0.0457, "eval_loss": 2.4439260959625244, "eval_runtime": 33.2924, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 2285 }, { "epoch": 0.0458, "eval_loss": 2.4439032077789307, "eval_runtime": 33.4004, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.766, "step": 2290 }, { "epoch": 0.0459, "eval_loss": 2.443621873855591, "eval_runtime": 33.3314, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.77, "step": 2295 }, { "epoch": 0.046, "grad_norm": 0.02648413187023774, "learning_rate": 4.598e-06, "loss": 2.4368, "step": 2300 }, { "epoch": 0.046, "eval_loss": 2.4436306953430176, "eval_runtime": 33.372, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 2300 }, { "epoch": 0.0461, "eval_loss": 2.4436404705047607, "eval_runtime": 33.3039, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.772, "step": 2305 }, { "epoch": 0.0462, "eval_loss": 2.44333815574646, "eval_runtime": 33.3059, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.771, "step": 2310 }, { "epoch": 0.0463, "eval_loss": 2.443415880203247, "eval_runtime": 33.4065, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 2315 }, { "epoch": 0.0464, "eval_loss": 2.443068742752075, "eval_runtime": 33.2818, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.773, "step": 2320 }, { "epoch": 0.0465, "grad_norm": 0.0351440602227012, "learning_rate": 4.648e-06, "loss": 2.4381, "step": 2325 }, { "epoch": 0.0465, "eval_loss": 2.443199634552002, "eval_runtime": 33.3538, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 2325 }, { "epoch": 0.0466, "eval_loss": 2.4433047771453857, "eval_runtime": 33.4816, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 2330 }, { "epoch": 0.0467, "eval_loss": 2.443272113800049, "eval_runtime": 33.5015, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 2335 }, { "epoch": 0.0468, "eval_loss": 2.443246603012085, "eval_runtime": 33.5753, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 2340 }, { "epoch": 0.0469, "eval_loss": 2.4432363510131836, "eval_runtime": 33.2869, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.772, "step": 2345 }, { "epoch": 0.047, "grad_norm": 0.02695670446644145, "learning_rate": 4.698000000000001e-06, "loss": 2.4303, "step": 2350 }, { "epoch": 0.047, "eval_loss": 2.4429421424865723, "eval_runtime": 33.3556, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 2350 }, { "epoch": 0.0471, "eval_loss": 2.4427566528320312, "eval_runtime": 33.3612, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 2355 }, { "epoch": 0.0472, "eval_loss": 2.4425995349884033, "eval_runtime": 33.353, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 2360 }, { "epoch": 0.0473, "eval_loss": 2.4426395893096924, "eval_runtime": 33.4669, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2365 }, { "epoch": 0.0474, "eval_loss": 2.4425301551818848, "eval_runtime": 33.3803, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.768, "step": 2370 }, { "epoch": 0.0475, "grad_norm": 0.031232764672567994, "learning_rate": 4.748e-06, "loss": 2.4284, "step": 2375 }, { "epoch": 0.0475, "eval_loss": 2.4426214694976807, "eval_runtime": 33.3013, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.772, "step": 2375 }, { "epoch": 0.0476, "eval_loss": 2.442599296569824, "eval_runtime": 33.3419, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.77, "step": 2380 }, { "epoch": 0.0477, "eval_loss": 2.442364454269409, "eval_runtime": 33.3677, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 2385 }, { "epoch": 0.0478, "eval_loss": 2.4425458908081055, "eval_runtime": 33.3892, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 2390 }, { "epoch": 0.0479, "eval_loss": 2.4425549507141113, "eval_runtime": 33.4202, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 2395 }, { "epoch": 0.048, "grad_norm": 0.027127721086561404, "learning_rate": 4.7980000000000005e-06, "loss": 2.4291, "step": 2400 }, { "epoch": 0.048, "eval_loss": 2.4425251483917236, "eval_runtime": 33.3802, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.768, "step": 2400 }, { "epoch": 0.0481, "eval_loss": 2.4424123764038086, "eval_runtime": 33.3283, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.77, "step": 2405 }, { "epoch": 0.0482, "eval_loss": 2.4421849250793457, "eval_runtime": 33.4172, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.766, "step": 2410 }, { "epoch": 0.0483, "eval_loss": 2.4419970512390137, "eval_runtime": 33.4642, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2415 }, { "epoch": 0.0484, "eval_loss": 2.4419567584991455, "eval_runtime": 33.3663, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.768, "step": 2420 }, { "epoch": 0.0485, "grad_norm": 0.026032952013136927, "learning_rate": 4.848000000000001e-06, "loss": 2.4256, "step": 2425 }, { "epoch": 0.0485, "eval_loss": 2.441688299179077, "eval_runtime": 33.3169, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.771, "step": 2425 }, { "epoch": 0.0486, "eval_loss": 2.4417548179626465, "eval_runtime": 33.3476, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 2430 }, { "epoch": 0.0487, "eval_loss": 2.441769599914551, "eval_runtime": 33.4488, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2435 }, { "epoch": 0.0488, "eval_loss": 2.4415283203125, "eval_runtime": 33.4555, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.764, "step": 2440 }, { "epoch": 0.0489, "eval_loss": 2.4416847229003906, "eval_runtime": 33.2459, "eval_samples_per_second": 3.519, "eval_steps_per_second": 1.775, "step": 2445 }, { "epoch": 0.049, "grad_norm": 0.02804626155591942, "learning_rate": 4.898e-06, "loss": 2.4334, "step": 2450 }, { "epoch": 0.049, "eval_loss": 2.4414188861846924, "eval_runtime": 33.2989, "eval_samples_per_second": 3.514, "eval_steps_per_second": 1.772, "step": 2450 }, { "epoch": 0.0491, "eval_loss": 2.4416472911834717, "eval_runtime": 33.3676, "eval_samples_per_second": 3.506, "eval_steps_per_second": 1.768, "step": 2455 }, { "epoch": 0.0492, "eval_loss": 2.4414844512939453, "eval_runtime": 33.4116, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 2460 }, { "epoch": 0.0493, "eval_loss": 2.441408395767212, "eval_runtime": 33.6104, "eval_samples_per_second": 3.481, "eval_steps_per_second": 1.755, "step": 2465 }, { "epoch": 0.0494, "eval_loss": 2.4413650035858154, "eval_runtime": 33.3838, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.767, "step": 2470 }, { "epoch": 0.0495, "grad_norm": 0.025351866385684634, "learning_rate": 4.948000000000001e-06, "loss": 2.4356, "step": 2475 }, { "epoch": 0.0495, "eval_loss": 2.4411768913269043, "eval_runtime": 33.3857, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 2475 }, { "epoch": 0.0496, "eval_loss": 2.441201686859131, "eval_runtime": 33.4117, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 2480 }, { "epoch": 0.0497, "eval_loss": 2.4408698081970215, "eval_runtime": 33.3015, "eval_samples_per_second": 3.513, "eval_steps_per_second": 1.772, "step": 2485 }, { "epoch": 0.0498, "eval_loss": 2.440950393676758, "eval_runtime": 33.379, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.768, "step": 2490 }, { "epoch": 0.0499, "eval_loss": 2.4407267570495605, "eval_runtime": 33.2561, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.774, "step": 2495 }, { "epoch": 0.05, "grad_norm": 0.029743600833546286, "learning_rate": 4.998e-06, "loss": 2.4369, "step": 2500 }, { "epoch": 0.05, "eval_loss": 2.4408068656921387, "eval_runtime": 33.3807, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.767, "step": 2500 }, { "epoch": 0.0501, "eval_loss": 2.4407401084899902, "eval_runtime": 33.2295, "eval_samples_per_second": 3.521, "eval_steps_per_second": 1.776, "step": 2505 }, { "epoch": 0.0502, "eval_loss": 2.4409286975860596, "eval_runtime": 33.3925, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 2510 }, { "epoch": 0.0503, "eval_loss": 2.4407782554626465, "eval_runtime": 33.4498, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2515 }, { "epoch": 0.0504, "eval_loss": 2.4407856464385986, "eval_runtime": 33.4899, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 2520 }, { "epoch": 0.0505, "grad_norm": 0.027292319342276494, "learning_rate": 5.048000000000001e-06, "loss": 2.4263, "step": 2525 }, { "epoch": 0.0505, "eval_loss": 2.440830945968628, "eval_runtime": 33.3428, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.769, "step": 2525 }, { "epoch": 0.0506, "eval_loss": 2.44069504737854, "eval_runtime": 33.2895, "eval_samples_per_second": 3.515, "eval_steps_per_second": 1.772, "step": 2530 }, { "epoch": 0.0507, "eval_loss": 2.4408159255981445, "eval_runtime": 33.3488, "eval_samples_per_second": 3.508, "eval_steps_per_second": 1.769, "step": 2535 }, { "epoch": 0.0508, "eval_loss": 2.440523386001587, "eval_runtime": 33.3582, "eval_samples_per_second": 3.507, "eval_steps_per_second": 1.769, "step": 2540 }, { "epoch": 0.0509, "eval_loss": 2.4403724670410156, "eval_runtime": 33.5287, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2545 }, { "epoch": 0.051, "grad_norm": 0.02495087994166461, "learning_rate": 5.098000000000001e-06, "loss": 2.428, "step": 2550 }, { "epoch": 0.051, "eval_loss": 2.440495252609253, "eval_runtime": 34.4575, "eval_samples_per_second": 3.395, "eval_steps_per_second": 1.712, "step": 2550 }, { "epoch": 0.0511, "eval_loss": 2.440384864807129, "eval_runtime": 34.0144, "eval_samples_per_second": 3.44, "eval_steps_per_second": 1.735, "step": 2555 }, { "epoch": 0.0512, "eval_loss": 2.4405176639556885, "eval_runtime": 34.5852, "eval_samples_per_second": 3.383, "eval_steps_per_second": 1.706, "step": 2560 }, { "epoch": 0.0513, "eval_loss": 2.4402472972869873, "eval_runtime": 34.2689, "eval_samples_per_second": 3.414, "eval_steps_per_second": 1.722, "step": 2565 }, { "epoch": 0.0514, "eval_loss": 2.440459966659546, "eval_runtime": 33.3821, "eval_samples_per_second": 3.505, "eval_steps_per_second": 1.767, "step": 2570 }, { "epoch": 0.0515, "grad_norm": 0.029728034222700407, "learning_rate": 5.1480000000000005e-06, "loss": 2.439, "step": 2575 }, { "epoch": 0.0515, "eval_loss": 2.440525531768799, "eval_runtime": 34.3072, "eval_samples_per_second": 3.41, "eval_steps_per_second": 1.72, "step": 2575 }, { "epoch": 0.0516, "eval_loss": 2.440373420715332, "eval_runtime": 33.5748, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 2580 }, { "epoch": 0.0517, "eval_loss": 2.4405770301818848, "eval_runtime": 35.2655, "eval_samples_per_second": 3.318, "eval_steps_per_second": 1.673, "step": 2585 }, { "epoch": 0.0518, "eval_loss": 2.4402198791503906, "eval_runtime": 34.9918, "eval_samples_per_second": 3.344, "eval_steps_per_second": 1.686, "step": 2590 }, { "epoch": 0.0519, "eval_loss": 2.440136194229126, "eval_runtime": 33.4873, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 2595 }, { "epoch": 0.052, "grad_norm": 0.02473354917836018, "learning_rate": 5.198000000000001e-06, "loss": 2.427, "step": 2600 }, { "epoch": 0.052, "eval_loss": 2.440282106399536, "eval_runtime": 33.4628, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2600 }, { "epoch": 0.0521, "eval_loss": 2.440448045730591, "eval_runtime": 33.4191, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 2605 }, { "epoch": 0.0522, "eval_loss": 2.440248966217041, "eval_runtime": 33.4911, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.762, "step": 2610 }, { "epoch": 0.0523, "eval_loss": 2.440030336380005, "eval_runtime": 33.4921, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.762, "step": 2615 }, { "epoch": 0.0524, "eval_loss": 2.4397685527801514, "eval_runtime": 33.4491, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2620 }, { "epoch": 0.0525, "grad_norm": 0.026533778128592735, "learning_rate": 5.248000000000001e-06, "loss": 2.4214, "step": 2625 }, { "epoch": 0.0525, "eval_loss": 2.43971848487854, "eval_runtime": 33.3975, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.767, "step": 2625 }, { "epoch": 0.0526, "eval_loss": 2.4398951530456543, "eval_runtime": 33.4912, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.762, "step": 2630 }, { "epoch": 0.0527, "eval_loss": 2.43975830078125, "eval_runtime": 33.4071, "eval_samples_per_second": 3.502, "eval_steps_per_second": 1.766, "step": 2635 }, { "epoch": 0.0528, "eval_loss": 2.439666271209717, "eval_runtime": 33.4208, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.765, "step": 2640 }, { "epoch": 0.0529, "eval_loss": 2.439816951751709, "eval_runtime": 33.5111, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 2645 }, { "epoch": 0.053, "grad_norm": 0.024723120971366967, "learning_rate": 5.298000000000001e-06, "loss": 2.4241, "step": 2650 }, { "epoch": 0.053, "eval_loss": 2.4398183822631836, "eval_runtime": 33.506, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 2650 }, { "epoch": 0.0531, "eval_loss": 2.4402668476104736, "eval_runtime": 34.1298, "eval_samples_per_second": 3.428, "eval_steps_per_second": 1.729, "step": 2655 }, { "epoch": 0.0532, "eval_loss": 2.4400885105133057, "eval_runtime": 33.436, "eval_samples_per_second": 3.499, "eval_steps_per_second": 1.765, "step": 2660 }, { "epoch": 0.0533, "eval_loss": 2.439871311187744, "eval_runtime": 33.3874, "eval_samples_per_second": 3.504, "eval_steps_per_second": 1.767, "step": 2665 }, { "epoch": 0.0534, "eval_loss": 2.4393365383148193, "eval_runtime": 33.5258, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2670 }, { "epoch": 0.0535, "grad_norm": 0.02173239513971497, "learning_rate": 5.348000000000001e-06, "loss": 2.4295, "step": 2675 }, { "epoch": 0.0535, "eval_loss": 2.439133405685425, "eval_runtime": 33.4962, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.761, "step": 2675 }, { "epoch": 0.0536, "eval_loss": 2.439093589782715, "eval_runtime": 33.4708, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2680 }, { "epoch": 0.0537, "eval_loss": 2.439096212387085, "eval_runtime": 33.4284, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 2685 }, { "epoch": 0.0538, "eval_loss": 2.4389584064483643, "eval_runtime": 33.4749, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.763, "step": 2690 }, { "epoch": 0.0539, "eval_loss": 2.438805103302002, "eval_runtime": 33.478, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.762, "step": 2695 }, { "epoch": 0.054, "grad_norm": 0.023851331909406925, "learning_rate": 5.398e-06, "loss": 2.4302, "step": 2700 }, { "epoch": 0.054, "eval_loss": 2.4386403560638428, "eval_runtime": 33.4276, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 2700 }, { "epoch": 0.0541, "eval_loss": 2.438568115234375, "eval_runtime": 33.528, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2705 }, { "epoch": 0.0542, "eval_loss": 2.438894510269165, "eval_runtime": 33.5228, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2710 }, { "epoch": 0.0543, "eval_loss": 2.4387168884277344, "eval_runtime": 33.4663, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2715 }, { "epoch": 0.0544, "eval_loss": 2.4385879039764404, "eval_runtime": 33.513, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 2720 }, { "epoch": 0.0545, "grad_norm": 0.02728082451264937, "learning_rate": 5.448e-06, "loss": 2.4308, "step": 2725 }, { "epoch": 0.0545, "eval_loss": 2.4388349056243896, "eval_runtime": 33.4525, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.764, "step": 2725 }, { "epoch": 0.0546, "eval_loss": 2.438887357711792, "eval_runtime": 33.428, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 2730 }, { "epoch": 0.0547, "eval_loss": 2.438713312149048, "eval_runtime": 33.5229, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2735 }, { "epoch": 0.0548, "eval_loss": 2.438657283782959, "eval_runtime": 33.4169, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.766, "step": 2740 }, { "epoch": 0.0549, "eval_loss": 2.438544988632202, "eval_runtime": 33.4944, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.761, "step": 2745 }, { "epoch": 0.055, "grad_norm": 0.025461121075693184, "learning_rate": 5.498e-06, "loss": 2.4379, "step": 2750 }, { "epoch": 0.055, "eval_loss": 2.4386098384857178, "eval_runtime": 33.6782, "eval_samples_per_second": 3.474, "eval_steps_per_second": 1.752, "step": 2750 }, { "epoch": 0.0551, "eval_loss": 2.438521146774292, "eval_runtime": 33.5161, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 2755 }, { "epoch": 0.0552, "eval_loss": 2.438474178314209, "eval_runtime": 33.4773, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.762, "step": 2760 }, { "epoch": 0.0553, "eval_loss": 2.4382379055023193, "eval_runtime": 33.4869, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 2765 }, { "epoch": 0.0554, "eval_loss": 2.438157796859741, "eval_runtime": 33.543, "eval_samples_per_second": 3.488, "eval_steps_per_second": 1.759, "step": 2770 }, { "epoch": 0.0555, "grad_norm": 0.0234055445054481, "learning_rate": 5.548e-06, "loss": 2.4326, "step": 2775 }, { "epoch": 0.0555, "eval_loss": 2.438048839569092, "eval_runtime": 33.5073, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 2775 }, { "epoch": 0.0556, "eval_loss": 2.4379706382751465, "eval_runtime": 33.4567, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.763, "step": 2780 }, { "epoch": 0.0557, "eval_loss": 2.4379332065582275, "eval_runtime": 33.5172, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 2785 }, { "epoch": 0.0558, "eval_loss": 2.4380111694335938, "eval_runtime": 33.5913, "eval_samples_per_second": 3.483, "eval_steps_per_second": 1.756, "step": 2790 }, { "epoch": 0.0559, "eval_loss": 2.4379403591156006, "eval_runtime": 33.5223, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2795 }, { "epoch": 0.056, "grad_norm": 0.024691045411267393, "learning_rate": 5.5980000000000004e-06, "loss": 2.4297, "step": 2800 }, { "epoch": 0.056, "eval_loss": 2.43778657913208, "eval_runtime": 33.524, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2800 }, { "epoch": 0.0561, "eval_loss": 2.4376559257507324, "eval_runtime": 33.58, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 2805 }, { "epoch": 0.0562, "eval_loss": 2.437596559524536, "eval_runtime": 33.5756, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 2810 }, { "epoch": 0.0563, "eval_loss": 2.437690496444702, "eval_runtime": 33.5056, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 2815 }, { "epoch": 0.0564, "eval_loss": 2.437558174133301, "eval_runtime": 33.4948, "eval_samples_per_second": 3.493, "eval_steps_per_second": 1.761, "step": 2820 }, { "epoch": 0.0565, "grad_norm": 0.02500330428035899, "learning_rate": 5.648e-06, "loss": 2.4281, "step": 2825 }, { "epoch": 0.0565, "eval_loss": 2.437875747680664, "eval_runtime": 33.4492, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2825 }, { "epoch": 0.0566, "eval_loss": 2.438183546066284, "eval_runtime": 33.5208, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2830 }, { "epoch": 0.0567, "eval_loss": 2.4375228881835938, "eval_runtime": 33.5319, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.76, "step": 2835 }, { "epoch": 0.0568, "eval_loss": 2.437365770339966, "eval_runtime": 33.4734, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.763, "step": 2840 }, { "epoch": 0.0569, "eval_loss": 2.4376399517059326, "eval_runtime": 33.4578, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.763, "step": 2845 }, { "epoch": 0.057, "grad_norm": 0.023953363978697285, "learning_rate": 5.698e-06, "loss": 2.4341, "step": 2850 }, { "epoch": 0.057, "eval_loss": 2.437318801879883, "eval_runtime": 33.4551, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.764, "step": 2850 }, { "epoch": 0.0571, "eval_loss": 2.437349319458008, "eval_runtime": 33.4482, "eval_samples_per_second": 3.498, "eval_steps_per_second": 1.764, "step": 2855 }, { "epoch": 0.0572, "eval_loss": 2.437500476837158, "eval_runtime": 33.5179, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.76, "step": 2860 }, { "epoch": 0.0573, "eval_loss": 2.4371414184570312, "eval_runtime": 33.4246, "eval_samples_per_second": 3.5, "eval_steps_per_second": 1.765, "step": 2865 }, { "epoch": 0.0574, "eval_loss": 2.4371588230133057, "eval_runtime": 33.5686, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.758, "step": 2870 }, { "epoch": 0.0575, "grad_norm": 0.023037224733864405, "learning_rate": 5.748e-06, "loss": 2.4201, "step": 2875 }, { "epoch": 0.0575, "eval_loss": 2.4373178482055664, "eval_runtime": 33.4813, "eval_samples_per_second": 3.494, "eval_steps_per_second": 1.762, "step": 2875 }, { "epoch": 0.0576, "eval_loss": 2.4371204376220703, "eval_runtime": 33.5096, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 2880 }, { "epoch": 0.0577, "eval_loss": 2.43719482421875, "eval_runtime": 33.4709, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2885 }, { "epoch": 0.0578, "eval_loss": 2.4369635581970215, "eval_runtime": 33.5125, "eval_samples_per_second": 3.491, "eval_steps_per_second": 1.761, "step": 2890 }, { "epoch": 0.0579, "eval_loss": 2.4367122650146484, "eval_runtime": 33.5349, "eval_samples_per_second": 3.489, "eval_steps_per_second": 1.759, "step": 2895 }, { "epoch": 0.058, "grad_norm": 0.023843041578218274, "learning_rate": 5.798e-06, "loss": 2.4322, "step": 2900 }, { "epoch": 0.058, "eval_loss": 2.436885118484497, "eval_runtime": 33.5038, "eval_samples_per_second": 3.492, "eval_steps_per_second": 1.761, "step": 2900 }, { "epoch": 0.0581, "eval_loss": 2.4368388652801514, "eval_runtime": 33.4337, "eval_samples_per_second": 3.499, "eval_steps_per_second": 1.765, "step": 2905 }, { "epoch": 0.0582, "eval_loss": 2.436776638031006, "eval_runtime": 33.5783, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 2910 }, { "epoch": 0.0583, "eval_loss": 2.4369046688079834, "eval_runtime": 33.5764, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 2915 }, { "epoch": 0.0584, "eval_loss": 2.4369351863861084, "eval_runtime": 33.5715, "eval_samples_per_second": 3.485, "eval_steps_per_second": 1.757, "step": 2920 }, { "epoch": 0.0585, "grad_norm": 0.030212978437899864, "learning_rate": 5.848000000000001e-06, "loss": 2.4318, "step": 2925 }, { "epoch": 0.0585, "eval_loss": 2.4367170333862305, "eval_runtime": 33.455, "eval_samples_per_second": 3.497, "eval_steps_per_second": 1.764, "step": 2925 }, { "epoch": 0.0586, "eval_loss": 2.4367101192474365, "eval_runtime": 33.3973, "eval_samples_per_second": 3.503, "eval_steps_per_second": 1.767, "step": 2930 }, { "epoch": 0.0587, "eval_loss": 2.436723470687866, "eval_runtime": 33.4183, "eval_samples_per_second": 3.501, "eval_steps_per_second": 1.766, "step": 2935 }, { "epoch": 0.0588, "eval_loss": 2.4368371963500977, "eval_runtime": 33.5269, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2940 }, { "epoch": 0.0589, "eval_loss": 2.436763286590576, "eval_runtime": 33.4623, "eval_samples_per_second": 3.496, "eval_steps_per_second": 1.763, "step": 2945 }, { "epoch": 0.059, "grad_norm": 0.024293450378328845, "learning_rate": 5.898e-06, "loss": 2.4221, "step": 2950 }, { "epoch": 0.059, "eval_loss": 2.436692714691162, "eval_runtime": 33.523, "eval_samples_per_second": 3.49, "eval_steps_per_second": 1.76, "step": 2950 }, { "epoch": 0.0591, "eval_loss": 2.436657667160034, "eval_runtime": 34.902, "eval_samples_per_second": 3.352, "eval_steps_per_second": 1.69, "step": 2955 }, { "epoch": 0.0592, "eval_loss": 2.436432123184204, "eval_runtime": 33.4808, "eval_samples_per_second": 3.495, "eval_steps_per_second": 1.762, "step": 2960 }, { "epoch": 0.0593, "eval_loss": 2.436782121658325, "eval_runtime": 34.5166, "eval_samples_per_second": 3.39, "eval_steps_per_second": 1.709, "step": 2965 }, { "epoch": 0.0594, "eval_loss": 2.4366602897644043, "eval_runtime": 33.7416, "eval_samples_per_second": 3.468, "eval_steps_per_second": 1.749, "step": 2970 }, { "epoch": 0.0595, "grad_norm": 0.028294127858427973, "learning_rate": 5.9480000000000005e-06, "loss": 2.4196, "step": 2975 }, { "epoch": 0.0595, "eval_loss": 2.436668872833252, "eval_runtime": 35.1904, "eval_samples_per_second": 3.325, "eval_steps_per_second": 1.677, "step": 2975 }, { "epoch": 0.0596, "eval_loss": 2.436310052871704, "eval_runtime": 33.583, "eval_samples_per_second": 3.484, "eval_steps_per_second": 1.757, "step": 2980 }, { "epoch": 0.0597, "eval_loss": 2.4361066818237305, "eval_runtime": 34.1148, "eval_samples_per_second": 3.43, "eval_steps_per_second": 1.729, "step": 2985 }, { "epoch": 0.0598, "eval_loss": 2.436128854751587, "eval_runtime": 33.7895, "eval_samples_per_second": 3.463, "eval_steps_per_second": 1.746, "step": 2990 }, { "epoch": 0.0599, "eval_loss": 2.436457872390747, "eval_runtime": 34.0525, "eval_samples_per_second": 3.436, "eval_steps_per_second": 1.733, "step": 2995 }, { "epoch": 0.06, "grad_norm": 0.02242795270420928, "learning_rate": 5.998000000000001e-06, "loss": 2.4245, "step": 3000 }, { "epoch": 0.06, "eval_loss": 2.436203718185425, "eval_runtime": 33.6471, "eval_samples_per_second": 3.477, "eval_steps_per_second": 1.753, "step": 3000 } ], "logging_steps": 25, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.355905264309764e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }