InnerLoopARMTForCausalLM_run_20 / trainer_state.json
irodkin's picture
Training checkpoint at step 3000
1f45be6 verified
{
"best_global_step": 2985,
"best_metric": 2.4361066818237305,
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_20/checkpoint-2000",
"epoch": 0.06,
"eval_steps": 5,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001,
"eval_loss": 3.320133686065674,
"eval_runtime": 33.1817,
"eval_samples_per_second": 3.526,
"eval_steps_per_second": 1.778,
"step": 5
},
{
"epoch": 0.0002,
"eval_loss": 3.319335460662842,
"eval_runtime": 33.1229,
"eval_samples_per_second": 3.532,
"eval_steps_per_second": 1.781,
"step": 10
},
{
"epoch": 0.0003,
"eval_loss": 3.318042516708374,
"eval_runtime": 33.3382,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 15
},
{
"epoch": 0.0004,
"eval_loss": 3.31443190574646,
"eval_runtime": 33.2423,
"eval_samples_per_second": 3.52,
"eval_steps_per_second": 1.775,
"step": 20
},
{
"epoch": 0.0005,
"grad_norm": 0.8831791054097137,
"learning_rate": 4.8e-08,
"loss": 3.4942,
"step": 25
},
{
"epoch": 0.0005,
"eval_loss": 3.3073768615722656,
"eval_runtime": 33.3914,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 25
},
{
"epoch": 0.0006,
"eval_loss": 3.299119472503662,
"eval_runtime": 33.4042,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 30
},
{
"epoch": 0.0007,
"eval_loss": 3.2837445735931396,
"eval_runtime": 33.3171,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 35
},
{
"epoch": 0.0008,
"eval_loss": 3.26920747756958,
"eval_runtime": 33.2887,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.772,
"step": 40
},
{
"epoch": 0.0009,
"eval_loss": 3.2481868267059326,
"eval_runtime": 33.3291,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 45
},
{
"epoch": 0.001,
"grad_norm": 0.5545255682809549,
"learning_rate": 9.8e-08,
"loss": 3.4174,
"step": 50
},
{
"epoch": 0.001,
"eval_loss": 3.2263057231903076,
"eval_runtime": 33.3242,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 50
},
{
"epoch": 0.0011,
"eval_loss": 3.2074711322784424,
"eval_runtime": 33.3412,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 55
},
{
"epoch": 0.0012,
"eval_loss": 3.1877729892730713,
"eval_runtime": 33.5109,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 60
},
{
"epoch": 0.0013,
"eval_loss": 3.153503894805908,
"eval_runtime": 33.4747,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.763,
"step": 65
},
{
"epoch": 0.0014,
"eval_loss": 3.1214191913604736,
"eval_runtime": 33.5956,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 70
},
{
"epoch": 0.0015,
"grad_norm": 0.5083106511895727,
"learning_rate": 1.4800000000000003e-07,
"loss": 3.2951,
"step": 75
},
{
"epoch": 0.0015,
"eval_loss": 3.101821184158325,
"eval_runtime": 33.6,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 75
},
{
"epoch": 0.0016,
"eval_loss": 3.0797102451324463,
"eval_runtime": 33.5302,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.76,
"step": 80
},
{
"epoch": 0.0017,
"eval_loss": 3.0523691177368164,
"eval_runtime": 33.5031,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 85
},
{
"epoch": 0.0018,
"eval_loss": 3.022620677947998,
"eval_runtime": 33.6265,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.755,
"step": 90
},
{
"epoch": 0.0019,
"eval_loss": 2.991481065750122,
"eval_runtime": 33.5519,
"eval_samples_per_second": 3.487,
"eval_steps_per_second": 1.758,
"step": 95
},
{
"epoch": 0.002,
"grad_norm": 0.28367624064943,
"learning_rate": 1.9800000000000003e-07,
"loss": 3.1531,
"step": 100
},
{
"epoch": 0.002,
"eval_loss": 2.9630048274993896,
"eval_runtime": 33.734,
"eval_samples_per_second": 3.468,
"eval_steps_per_second": 1.749,
"step": 100
},
{
"epoch": 0.0021,
"eval_loss": 2.93916916847229,
"eval_runtime": 33.4897,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 105
},
{
"epoch": 0.0022,
"eval_loss": 2.9186832904815674,
"eval_runtime": 33.5154,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 110
},
{
"epoch": 0.0023,
"eval_loss": 2.8985302448272705,
"eval_runtime": 33.5846,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 115
},
{
"epoch": 0.0024,
"eval_loss": 2.8786001205444336,
"eval_runtime": 33.5482,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 120
},
{
"epoch": 0.0025,
"grad_norm": 0.19615444236413476,
"learning_rate": 2.48e-07,
"loss": 3.0101,
"step": 125
},
{
"epoch": 0.0025,
"eval_loss": 2.860034704208374,
"eval_runtime": 33.5143,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 125
},
{
"epoch": 0.0026,
"eval_loss": 2.843663454055786,
"eval_runtime": 33.5082,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 130
},
{
"epoch": 0.0027,
"eval_loss": 2.82882022857666,
"eval_runtime": 33.4921,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.762,
"step": 135
},
{
"epoch": 0.0028,
"eval_loss": 2.8154728412628174,
"eval_runtime": 33.6656,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.753,
"step": 140
},
{
"epoch": 0.0029,
"eval_loss": 2.801098346710205,
"eval_runtime": 33.5229,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 145
},
{
"epoch": 0.003,
"grad_norm": 0.5710572013823593,
"learning_rate": 2.9800000000000005e-07,
"loss": 2.876,
"step": 150
},
{
"epoch": 0.003,
"eval_loss": 2.789198160171509,
"eval_runtime": 33.4535,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.764,
"step": 150
},
{
"epoch": 0.0031,
"eval_loss": 2.7789695262908936,
"eval_runtime": 33.6409,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 155
},
{
"epoch": 0.0032,
"eval_loss": 2.7694201469421387,
"eval_runtime": 33.4266,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 160
},
{
"epoch": 0.0033,
"eval_loss": 2.7600762844085693,
"eval_runtime": 33.4725,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.763,
"step": 165
},
{
"epoch": 0.0034,
"eval_loss": 2.7517828941345215,
"eval_runtime": 33.6223,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 170
},
{
"epoch": 0.0035,
"grad_norm": 0.151307501186972,
"learning_rate": 3.48e-07,
"loss": 2.811,
"step": 175
},
{
"epoch": 0.0035,
"eval_loss": 2.743870258331299,
"eval_runtime": 33.5221,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 175
},
{
"epoch": 0.0036,
"eval_loss": 2.7366557121276855,
"eval_runtime": 33.5448,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 180
},
{
"epoch": 0.0037,
"eval_loss": 2.7298200130462646,
"eval_runtime": 33.5428,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 185
},
{
"epoch": 0.0038,
"eval_loss": 2.722888708114624,
"eval_runtime": 33.6302,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.754,
"step": 190
},
{
"epoch": 0.0039,
"eval_loss": 2.714289426803589,
"eval_runtime": 33.5594,
"eval_samples_per_second": 3.486,
"eval_steps_per_second": 1.758,
"step": 195
},
{
"epoch": 0.004,
"grad_norm": 0.10362348542700331,
"learning_rate": 3.9800000000000004e-07,
"loss": 2.7606,
"step": 200
},
{
"epoch": 0.004,
"eval_loss": 2.7078425884246826,
"eval_runtime": 33.6447,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 200
},
{
"epoch": 0.0041,
"eval_loss": 2.7014663219451904,
"eval_runtime": 33.565,
"eval_samples_per_second": 3.486,
"eval_steps_per_second": 1.758,
"step": 205
},
{
"epoch": 0.0042,
"eval_loss": 2.6956119537353516,
"eval_runtime": 33.5938,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 210
},
{
"epoch": 0.0043,
"eval_loss": 2.6901819705963135,
"eval_runtime": 33.5009,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 215
},
{
"epoch": 0.0044,
"eval_loss": 2.684842824935913,
"eval_runtime": 33.5857,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 220
},
{
"epoch": 0.0045,
"grad_norm": 0.08395542059093342,
"learning_rate": 4.4800000000000004e-07,
"loss": 2.727,
"step": 225
},
{
"epoch": 0.0045,
"eval_loss": 2.679893732070923,
"eval_runtime": 33.5333,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.759,
"step": 225
},
{
"epoch": 0.0046,
"eval_loss": 2.6749234199523926,
"eval_runtime": 33.6847,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.752,
"step": 230
},
{
"epoch": 0.0047,
"eval_loss": 2.670543670654297,
"eval_runtime": 33.5814,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 235
},
{
"epoch": 0.0048,
"eval_loss": 2.6663973331451416,
"eval_runtime": 33.5943,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 240
},
{
"epoch": 0.0049,
"eval_loss": 2.662304162979126,
"eval_runtime": 33.5309,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.76,
"step": 245
},
{
"epoch": 0.005,
"grad_norm": 0.06968304771462097,
"learning_rate": 4.98e-07,
"loss": 2.6931,
"step": 250
},
{
"epoch": 0.005,
"eval_loss": 2.65859317779541,
"eval_runtime": 33.4663,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 250
},
{
"epoch": 0.0051,
"eval_loss": 2.654831886291504,
"eval_runtime": 33.5962,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 255
},
{
"epoch": 0.0052,
"eval_loss": 2.6509766578674316,
"eval_runtime": 33.5064,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 260
},
{
"epoch": 0.0053,
"eval_loss": 2.6467387676239014,
"eval_runtime": 33.5346,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.759,
"step": 265
},
{
"epoch": 0.0054,
"eval_loss": 2.6428205966949463,
"eval_runtime": 33.5418,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 270
},
{
"epoch": 0.0055,
"grad_norm": 0.05704195405230526,
"learning_rate": 5.480000000000001e-07,
"loss": 2.674,
"step": 275
},
{
"epoch": 0.0055,
"eval_loss": 2.6392645835876465,
"eval_runtime": 33.6509,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.753,
"step": 275
},
{
"epoch": 0.0056,
"eval_loss": 2.6361024379730225,
"eval_runtime": 33.6973,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 280
},
{
"epoch": 0.0057,
"eval_loss": 2.6328718662261963,
"eval_runtime": 33.5639,
"eval_samples_per_second": 3.486,
"eval_steps_per_second": 1.758,
"step": 285
},
{
"epoch": 0.0058,
"eval_loss": 2.629871129989624,
"eval_runtime": 33.5243,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 290
},
{
"epoch": 0.0059,
"eval_loss": 2.6271257400512695,
"eval_runtime": 33.6427,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 295
},
{
"epoch": 0.006,
"grad_norm": 0.05013991368613539,
"learning_rate": 5.98e-07,
"loss": 2.6504,
"step": 300
},
{
"epoch": 0.006,
"eval_loss": 2.6243784427642822,
"eval_runtime": 33.5815,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 300
},
{
"epoch": 0.0061,
"eval_loss": 2.621882915496826,
"eval_runtime": 33.7331,
"eval_samples_per_second": 3.468,
"eval_steps_per_second": 1.749,
"step": 305
},
{
"epoch": 0.0062,
"eval_loss": 2.6194233894348145,
"eval_runtime": 33.594,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 310
},
{
"epoch": 0.0063,
"eval_loss": 2.6167914867401123,
"eval_runtime": 33.5521,
"eval_samples_per_second": 3.487,
"eval_steps_per_second": 1.758,
"step": 315
},
{
"epoch": 0.0064,
"eval_loss": 2.6143040657043457,
"eval_runtime": 33.58,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 320
},
{
"epoch": 0.0065,
"grad_norm": 0.04696879401248375,
"learning_rate": 6.48e-07,
"loss": 2.6372,
"step": 325
},
{
"epoch": 0.0065,
"eval_loss": 2.611804246902466,
"eval_runtime": 33.5371,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.759,
"step": 325
},
{
"epoch": 0.0066,
"eval_loss": 2.6093685626983643,
"eval_runtime": 33.8057,
"eval_samples_per_second": 3.461,
"eval_steps_per_second": 1.745,
"step": 330
},
{
"epoch": 0.0067,
"eval_loss": 2.607069492340088,
"eval_runtime": 33.5819,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 335
},
{
"epoch": 0.0068,
"eval_loss": 2.604562520980835,
"eval_runtime": 33.5971,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 340
},
{
"epoch": 0.0069,
"eval_loss": 2.6024069786071777,
"eval_runtime": 33.5107,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 345
},
{
"epoch": 0.007,
"grad_norm": 0.04335213523196003,
"learning_rate": 6.98e-07,
"loss": 2.6173,
"step": 350
},
{
"epoch": 0.007,
"eval_loss": 2.6002795696258545,
"eval_runtime": 33.6194,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 350
},
{
"epoch": 0.0071,
"eval_loss": 2.598109245300293,
"eval_runtime": 33.807,
"eval_samples_per_second": 3.461,
"eval_steps_per_second": 1.745,
"step": 355
},
{
"epoch": 0.0072,
"eval_loss": 2.596126079559326,
"eval_runtime": 33.5287,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 360
},
{
"epoch": 0.0073,
"eval_loss": 2.5941832065582275,
"eval_runtime": 33.5456,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 365
},
{
"epoch": 0.0074,
"eval_loss": 2.592336893081665,
"eval_runtime": 33.6972,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 370
},
{
"epoch": 0.0075,
"grad_norm": 0.04553004087145917,
"learning_rate": 7.480000000000001e-07,
"loss": 2.608,
"step": 375
},
{
"epoch": 0.0075,
"eval_loss": 2.590573310852051,
"eval_runtime": 33.6132,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 375
},
{
"epoch": 0.0076,
"eval_loss": 2.5888302326202393,
"eval_runtime": 33.6363,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 380
},
{
"epoch": 0.0077,
"eval_loss": 2.5870487689971924,
"eval_runtime": 33.6309,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.754,
"step": 385
},
{
"epoch": 0.0078,
"eval_loss": 2.5851986408233643,
"eval_runtime": 33.5237,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 390
},
{
"epoch": 0.0079,
"eval_loss": 2.583341598510742,
"eval_runtime": 33.4914,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.762,
"step": 395
},
{
"epoch": 0.008,
"grad_norm": 0.04067489002091025,
"learning_rate": 7.98e-07,
"loss": 2.6034,
"step": 400
},
{
"epoch": 0.008,
"eval_loss": 2.5816242694854736,
"eval_runtime": 33.6305,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.754,
"step": 400
},
{
"epoch": 0.0081,
"eval_loss": 2.5800209045410156,
"eval_runtime": 33.9049,
"eval_samples_per_second": 3.451,
"eval_steps_per_second": 1.74,
"step": 405
},
{
"epoch": 0.0082,
"eval_loss": 2.5783472061157227,
"eval_runtime": 33.6847,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.752,
"step": 410
},
{
"epoch": 0.0083,
"eval_loss": 2.5765581130981445,
"eval_runtime": 33.5467,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 415
},
{
"epoch": 0.0084,
"eval_loss": 2.574805974960327,
"eval_runtime": 33.6837,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.752,
"step": 420
},
{
"epoch": 0.0085,
"grad_norm": 0.03957021300313725,
"learning_rate": 8.480000000000001e-07,
"loss": 2.5881,
"step": 425
},
{
"epoch": 0.0085,
"eval_loss": 2.5732243061065674,
"eval_runtime": 33.6883,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.751,
"step": 425
},
{
"epoch": 0.0086,
"eval_loss": 2.5712339878082275,
"eval_runtime": 34.0087,
"eval_samples_per_second": 3.44,
"eval_steps_per_second": 1.735,
"step": 430
},
{
"epoch": 0.0087,
"eval_loss": 2.5696043968200684,
"eval_runtime": 33.5522,
"eval_samples_per_second": 3.487,
"eval_steps_per_second": 1.758,
"step": 435
},
{
"epoch": 0.0088,
"eval_loss": 2.568011522293091,
"eval_runtime": 33.7026,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 440
},
{
"epoch": 0.0089,
"eval_loss": 2.5661723613739014,
"eval_runtime": 33.7143,
"eval_samples_per_second": 3.47,
"eval_steps_per_second": 1.75,
"step": 445
},
{
"epoch": 0.009,
"grad_norm": 0.04518058243135632,
"learning_rate": 8.980000000000001e-07,
"loss": 2.577,
"step": 450
},
{
"epoch": 0.009,
"eval_loss": 2.5647170543670654,
"eval_runtime": 33.6066,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.756,
"step": 450
},
{
"epoch": 0.0091,
"eval_loss": 2.5629138946533203,
"eval_runtime": 33.695,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 455
},
{
"epoch": 0.0092,
"eval_loss": 2.561223268508911,
"eval_runtime": 33.7639,
"eval_samples_per_second": 3.465,
"eval_steps_per_second": 1.747,
"step": 460
},
{
"epoch": 0.0093,
"eval_loss": 2.559941053390503,
"eval_runtime": 33.5726,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 465
},
{
"epoch": 0.0094,
"eval_loss": 2.5585126876831055,
"eval_runtime": 33.5393,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 470
},
{
"epoch": 0.0095,
"grad_norm": 0.04841685552742973,
"learning_rate": 9.480000000000001e-07,
"loss": 2.5614,
"step": 475
},
{
"epoch": 0.0095,
"eval_loss": 2.557070732116699,
"eval_runtime": 33.5396,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 475
},
{
"epoch": 0.0096,
"eval_loss": 2.5551016330718994,
"eval_runtime": 33.8951,
"eval_samples_per_second": 3.452,
"eval_steps_per_second": 1.741,
"step": 480
},
{
"epoch": 0.0097,
"eval_loss": 2.553600311279297,
"eval_runtime": 33.6678,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.752,
"step": 485
},
{
"epoch": 0.0098,
"eval_loss": 2.5523183345794678,
"eval_runtime": 33.6551,
"eval_samples_per_second": 3.476,
"eval_steps_per_second": 1.753,
"step": 490
},
{
"epoch": 0.0099,
"eval_loss": 2.5510056018829346,
"eval_runtime": 33.6214,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 495
},
{
"epoch": 0.01,
"grad_norm": 0.043993000876628545,
"learning_rate": 9.98e-07,
"loss": 2.5613,
"step": 500
},
{
"epoch": 0.01,
"eval_loss": 2.5498273372650146,
"eval_runtime": 33.6069,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.756,
"step": 500
},
{
"epoch": 0.0101,
"eval_loss": 2.548828601837158,
"eval_runtime": 33.7909,
"eval_samples_per_second": 3.462,
"eval_steps_per_second": 1.746,
"step": 505
},
{
"epoch": 0.0102,
"eval_loss": 2.5474376678466797,
"eval_runtime": 33.543,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 510
},
{
"epoch": 0.0103,
"eval_loss": 2.5464441776275635,
"eval_runtime": 33.6579,
"eval_samples_per_second": 3.476,
"eval_steps_per_second": 1.753,
"step": 515
},
{
"epoch": 0.0104,
"eval_loss": 2.5453498363494873,
"eval_runtime": 33.4841,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 520
},
{
"epoch": 0.0105,
"grad_norm": 0.04663602312001795,
"learning_rate": 1.0480000000000002e-06,
"loss": 2.5521,
"step": 525
},
{
"epoch": 0.0105,
"eval_loss": 2.5442492961883545,
"eval_runtime": 33.5915,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 525
},
{
"epoch": 0.0106,
"eval_loss": 2.5432002544403076,
"eval_runtime": 33.6717,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.752,
"step": 530
},
{
"epoch": 0.0107,
"eval_loss": 2.542072057723999,
"eval_runtime": 33.6153,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 535
},
{
"epoch": 0.0108,
"eval_loss": 2.541541814804077,
"eval_runtime": 34.4505,
"eval_samples_per_second": 3.396,
"eval_steps_per_second": 1.713,
"step": 540
},
{
"epoch": 0.0109,
"eval_loss": 2.540494203567505,
"eval_runtime": 33.6369,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 545
},
{
"epoch": 0.011,
"grad_norm": 0.044473565671350655,
"learning_rate": 1.0980000000000001e-06,
"loss": 2.5433,
"step": 550
},
{
"epoch": 0.011,
"eval_loss": 2.539369821548462,
"eval_runtime": 33.5742,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 550
},
{
"epoch": 0.0111,
"eval_loss": 2.5384223461151123,
"eval_runtime": 33.9094,
"eval_samples_per_second": 3.45,
"eval_steps_per_second": 1.74,
"step": 555
},
{
"epoch": 0.0112,
"eval_loss": 2.5375945568084717,
"eval_runtime": 33.6016,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 560
},
{
"epoch": 0.0113,
"eval_loss": 2.536487340927124,
"eval_runtime": 34.3561,
"eval_samples_per_second": 3.406,
"eval_steps_per_second": 1.717,
"step": 565
},
{
"epoch": 0.0114,
"eval_loss": 2.5356836318969727,
"eval_runtime": 34.5074,
"eval_samples_per_second": 3.391,
"eval_steps_per_second": 1.71,
"step": 570
},
{
"epoch": 0.0115,
"grad_norm": 0.04668528198599521,
"learning_rate": 1.148e-06,
"loss": 2.5496,
"step": 575
},
{
"epoch": 0.0115,
"eval_loss": 2.5347819328308105,
"eval_runtime": 33.5932,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 575
},
{
"epoch": 0.0116,
"eval_loss": 2.534010410308838,
"eval_runtime": 33.8124,
"eval_samples_per_second": 3.46,
"eval_steps_per_second": 1.745,
"step": 580
},
{
"epoch": 0.0117,
"eval_loss": 2.5331332683563232,
"eval_runtime": 33.5617,
"eval_samples_per_second": 3.486,
"eval_steps_per_second": 1.758,
"step": 585
},
{
"epoch": 0.0118,
"eval_loss": 2.5322561264038086,
"eval_runtime": 33.8081,
"eval_samples_per_second": 3.461,
"eval_steps_per_second": 1.745,
"step": 590
},
{
"epoch": 0.0119,
"eval_loss": 2.5314669609069824,
"eval_runtime": 33.7053,
"eval_samples_per_second": 3.471,
"eval_steps_per_second": 1.75,
"step": 595
},
{
"epoch": 0.012,
"grad_norm": 0.043769011241975755,
"learning_rate": 1.1980000000000002e-06,
"loss": 2.5455,
"step": 600
},
{
"epoch": 0.012,
"eval_loss": 2.5307207107543945,
"eval_runtime": 33.6848,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.752,
"step": 600
},
{
"epoch": 0.0121,
"eval_loss": 2.530006170272827,
"eval_runtime": 33.686,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.751,
"step": 605
},
{
"epoch": 0.0122,
"eval_loss": 2.529109239578247,
"eval_runtime": 33.7013,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 610
},
{
"epoch": 0.0123,
"eval_loss": 2.5284457206726074,
"eval_runtime": 33.6733,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.752,
"step": 615
},
{
"epoch": 0.0124,
"eval_loss": 2.5276710987091064,
"eval_runtime": 33.624,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 620
},
{
"epoch": 0.0125,
"grad_norm": 0.04196307636615052,
"learning_rate": 1.248e-06,
"loss": 2.5273,
"step": 625
},
{
"epoch": 0.0125,
"eval_loss": 2.526918411254883,
"eval_runtime": 33.5952,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 625
},
{
"epoch": 0.0126,
"eval_loss": 2.5262696743011475,
"eval_runtime": 33.7522,
"eval_samples_per_second": 3.466,
"eval_steps_per_second": 1.748,
"step": 630
},
{
"epoch": 0.0127,
"eval_loss": 2.5255067348480225,
"eval_runtime": 33.7929,
"eval_samples_per_second": 3.462,
"eval_steps_per_second": 1.746,
"step": 635
},
{
"epoch": 0.0128,
"eval_loss": 2.524789810180664,
"eval_runtime": 33.7139,
"eval_samples_per_second": 3.47,
"eval_steps_per_second": 1.75,
"step": 640
},
{
"epoch": 0.0129,
"eval_loss": 2.524181604385376,
"eval_runtime": 33.7772,
"eval_samples_per_second": 3.464,
"eval_steps_per_second": 1.747,
"step": 645
},
{
"epoch": 0.013,
"grad_norm": 0.04719575571491393,
"learning_rate": 1.2980000000000001e-06,
"loss": 2.5226,
"step": 650
},
{
"epoch": 0.013,
"eval_loss": 2.5235090255737305,
"eval_runtime": 33.6972,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 650
},
{
"epoch": 0.0131,
"eval_loss": 2.5227887630462646,
"eval_runtime": 33.8073,
"eval_samples_per_second": 3.461,
"eval_steps_per_second": 1.745,
"step": 655
},
{
"epoch": 0.0132,
"eval_loss": 2.522101402282715,
"eval_runtime": 33.7192,
"eval_samples_per_second": 3.47,
"eval_steps_per_second": 1.75,
"step": 660
},
{
"epoch": 0.0133,
"eval_loss": 2.5215632915496826,
"eval_runtime": 33.7966,
"eval_samples_per_second": 3.462,
"eval_steps_per_second": 1.746,
"step": 665
},
{
"epoch": 0.0134,
"eval_loss": 2.5208749771118164,
"eval_runtime": 33.7485,
"eval_samples_per_second": 3.467,
"eval_steps_per_second": 1.748,
"step": 670
},
{
"epoch": 0.0135,
"grad_norm": 0.044734235617461586,
"learning_rate": 1.348e-06,
"loss": 2.5273,
"step": 675
},
{
"epoch": 0.0135,
"eval_loss": 2.5201478004455566,
"eval_runtime": 33.8972,
"eval_samples_per_second": 3.452,
"eval_steps_per_second": 1.741,
"step": 675
},
{
"epoch": 0.0136,
"eval_loss": 2.5197227001190186,
"eval_runtime": 33.6652,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.753,
"step": 680
},
{
"epoch": 0.0137,
"eval_loss": 2.519151449203491,
"eval_runtime": 33.6031,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 685
},
{
"epoch": 0.0138,
"eval_loss": 2.5185396671295166,
"eval_runtime": 33.6292,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.754,
"step": 690
},
{
"epoch": 0.0139,
"eval_loss": 2.517947196960449,
"eval_runtime": 33.5987,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 695
},
{
"epoch": 0.014,
"grad_norm": 0.04124740305893712,
"learning_rate": 1.3980000000000002e-06,
"loss": 2.5214,
"step": 700
},
{
"epoch": 0.014,
"eval_loss": 2.5173356533050537,
"eval_runtime": 33.6657,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.753,
"step": 700
},
{
"epoch": 0.0141,
"eval_loss": 2.5167977809906006,
"eval_runtime": 33.5728,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 705
},
{
"epoch": 0.0142,
"eval_loss": 2.5162267684936523,
"eval_runtime": 33.2779,
"eval_samples_per_second": 3.516,
"eval_steps_per_second": 1.773,
"step": 710
},
{
"epoch": 0.0143,
"eval_loss": 2.5155909061431885,
"eval_runtime": 33.4627,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 715
},
{
"epoch": 0.0144,
"eval_loss": 2.515427589416504,
"eval_runtime": 33.439,
"eval_samples_per_second": 3.499,
"eval_steps_per_second": 1.764,
"step": 720
},
{
"epoch": 0.0145,
"grad_norm": 0.04140679897915697,
"learning_rate": 1.4480000000000002e-06,
"loss": 2.5192,
"step": 725
},
{
"epoch": 0.0145,
"eval_loss": 2.514657735824585,
"eval_runtime": 33.3527,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 725
},
{
"epoch": 0.0146,
"eval_loss": 2.5141184329986572,
"eval_runtime": 33.3623,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.768,
"step": 730
},
{
"epoch": 0.0147,
"eval_loss": 2.5135021209716797,
"eval_runtime": 36.2875,
"eval_samples_per_second": 3.224,
"eval_steps_per_second": 1.626,
"step": 735
},
{
"epoch": 0.0148,
"eval_loss": 2.5130276679992676,
"eval_runtime": 33.3738,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 740
},
{
"epoch": 0.0149,
"eval_loss": 2.5123140811920166,
"eval_runtime": 33.7458,
"eval_samples_per_second": 3.467,
"eval_steps_per_second": 1.748,
"step": 745
},
{
"epoch": 0.015,
"grad_norm": 0.03921746155872101,
"learning_rate": 1.498e-06,
"loss": 2.5077,
"step": 750
},
{
"epoch": 0.015,
"eval_loss": 2.5117204189300537,
"eval_runtime": 33.3164,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 750
},
{
"epoch": 0.0151,
"eval_loss": 2.5113115310668945,
"eval_runtime": 33.464,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 755
},
{
"epoch": 0.0152,
"eval_loss": 2.510754108428955,
"eval_runtime": 33.426,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 760
},
{
"epoch": 0.0153,
"eval_loss": 2.510148525238037,
"eval_runtime": 33.5135,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 765
},
{
"epoch": 0.0154,
"eval_loss": 2.5096797943115234,
"eval_runtime": 33.5467,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 770
},
{
"epoch": 0.0155,
"grad_norm": 0.038161493704092234,
"learning_rate": 1.548e-06,
"loss": 2.5127,
"step": 775
},
{
"epoch": 0.0155,
"eval_loss": 2.5091397762298584,
"eval_runtime": 33.6296,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.754,
"step": 775
},
{
"epoch": 0.0156,
"eval_loss": 2.5085766315460205,
"eval_runtime": 33.6417,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 780
},
{
"epoch": 0.0157,
"eval_loss": 2.5081799030303955,
"eval_runtime": 33.5831,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 785
},
{
"epoch": 0.0158,
"eval_loss": 2.5075252056121826,
"eval_runtime": 33.5806,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 790
},
{
"epoch": 0.0159,
"eval_loss": 2.5069563388824463,
"eval_runtime": 33.6257,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.755,
"step": 795
},
{
"epoch": 0.016,
"grad_norm": 0.04372605860022339,
"learning_rate": 1.5980000000000002e-06,
"loss": 2.5019,
"step": 800
},
{
"epoch": 0.016,
"eval_loss": 2.5065925121307373,
"eval_runtime": 33.6041,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 800
},
{
"epoch": 0.0161,
"eval_loss": 2.5059759616851807,
"eval_runtime": 33.6116,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 805
},
{
"epoch": 0.0162,
"eval_loss": 2.505453109741211,
"eval_runtime": 33.5794,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 810
},
{
"epoch": 0.0163,
"eval_loss": 2.505023241043091,
"eval_runtime": 33.461,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.763,
"step": 815
},
{
"epoch": 0.0164,
"eval_loss": 2.5042824745178223,
"eval_runtime": 33.5988,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 820
},
{
"epoch": 0.0165,
"grad_norm": 0.041497520045134684,
"learning_rate": 1.6480000000000001e-06,
"loss": 2.4977,
"step": 825
},
{
"epoch": 0.0165,
"eval_loss": 2.5039255619049072,
"eval_runtime": 33.6107,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 825
},
{
"epoch": 0.0166,
"eval_loss": 2.503436803817749,
"eval_runtime": 33.6213,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 830
},
{
"epoch": 0.0167,
"eval_loss": 2.5028321743011475,
"eval_runtime": 33.5009,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 835
},
{
"epoch": 0.0168,
"eval_loss": 2.5022666454315186,
"eval_runtime": 33.6392,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 840
},
{
"epoch": 0.0169,
"eval_loss": 2.5018374919891357,
"eval_runtime": 33.5928,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 845
},
{
"epoch": 0.017,
"grad_norm": 0.040226840781059835,
"learning_rate": 1.6980000000000003e-06,
"loss": 2.4968,
"step": 850
},
{
"epoch": 0.017,
"eval_loss": 2.5012588500976562,
"eval_runtime": 33.5216,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 850
},
{
"epoch": 0.0171,
"eval_loss": 2.5006515979766846,
"eval_runtime": 33.4799,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.762,
"step": 855
},
{
"epoch": 0.0172,
"eval_loss": 2.5001821517944336,
"eval_runtime": 33.6067,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.756,
"step": 860
},
{
"epoch": 0.0173,
"eval_loss": 2.499708652496338,
"eval_runtime": 33.5478,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 865
},
{
"epoch": 0.0174,
"eval_loss": 2.4992101192474365,
"eval_runtime": 33.3608,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 870
},
{
"epoch": 0.0175,
"grad_norm": 0.043360400185163274,
"learning_rate": 1.7480000000000002e-06,
"loss": 2.4947,
"step": 875
},
{
"epoch": 0.0175,
"eval_loss": 2.49912428855896,
"eval_runtime": 33.3782,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.768,
"step": 875
},
{
"epoch": 0.0176,
"eval_loss": 2.498539686203003,
"eval_runtime": 33.4271,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 880
},
{
"epoch": 0.0177,
"eval_loss": 2.4980475902557373,
"eval_runtime": 33.508,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 885
},
{
"epoch": 0.0178,
"eval_loss": 2.4972891807556152,
"eval_runtime": 33.5801,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 890
},
{
"epoch": 0.0179,
"eval_loss": 2.496943473815918,
"eval_runtime": 33.4984,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.761,
"step": 895
},
{
"epoch": 0.018,
"grad_norm": 0.040565773819723885,
"learning_rate": 1.798e-06,
"loss": 2.4878,
"step": 900
},
{
"epoch": 0.018,
"eval_loss": 2.496464252471924,
"eval_runtime": 33.6538,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.753,
"step": 900
},
{
"epoch": 0.0181,
"eval_loss": 2.496126890182495,
"eval_runtime": 33.6415,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 905
},
{
"epoch": 0.0182,
"eval_loss": 2.4957361221313477,
"eval_runtime": 33.7646,
"eval_samples_per_second": 3.465,
"eval_steps_per_second": 1.747,
"step": 910
},
{
"epoch": 0.0183,
"eval_loss": 2.4954254627227783,
"eval_runtime": 33.5639,
"eval_samples_per_second": 3.486,
"eval_steps_per_second": 1.758,
"step": 915
},
{
"epoch": 0.0184,
"eval_loss": 2.4948976039886475,
"eval_runtime": 33.6038,
"eval_samples_per_second": 3.482,
"eval_steps_per_second": 1.756,
"step": 920
},
{
"epoch": 0.0185,
"grad_norm": 0.039370814834696136,
"learning_rate": 1.8480000000000001e-06,
"loss": 2.4986,
"step": 925
},
{
"epoch": 0.0185,
"eval_loss": 2.494521379470825,
"eval_runtime": 33.7082,
"eval_samples_per_second": 3.471,
"eval_steps_per_second": 1.75,
"step": 925
},
{
"epoch": 0.0186,
"eval_loss": 2.4939730167388916,
"eval_runtime": 33.6147,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 930
},
{
"epoch": 0.0187,
"eval_loss": 2.49343204498291,
"eval_runtime": 35.1502,
"eval_samples_per_second": 3.329,
"eval_steps_per_second": 1.679,
"step": 935
},
{
"epoch": 0.0188,
"eval_loss": 2.493082046508789,
"eval_runtime": 33.6381,
"eval_samples_per_second": 3.478,
"eval_steps_per_second": 1.754,
"step": 940
},
{
"epoch": 0.0189,
"eval_loss": 2.492797374725342,
"eval_runtime": 33.7089,
"eval_samples_per_second": 3.471,
"eval_steps_per_second": 1.75,
"step": 945
},
{
"epoch": 0.019,
"grad_norm": 0.04019472793080496,
"learning_rate": 1.898e-06,
"loss": 2.481,
"step": 950
},
{
"epoch": 0.019,
"eval_loss": 2.4925599098205566,
"eval_runtime": 33.5096,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 950
},
{
"epoch": 0.0191,
"eval_loss": 2.4918878078460693,
"eval_runtime": 33.4921,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.762,
"step": 955
},
{
"epoch": 0.0192,
"eval_loss": 2.4916608333587646,
"eval_runtime": 33.5126,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 960
},
{
"epoch": 0.0193,
"eval_loss": 2.491708517074585,
"eval_runtime": 33.6466,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.754,
"step": 965
},
{
"epoch": 0.0194,
"eval_loss": 2.4911839962005615,
"eval_runtime": 33.6119,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 970
},
{
"epoch": 0.0195,
"grad_norm": 0.04683912756161822,
"learning_rate": 1.9480000000000002e-06,
"loss": 2.4879,
"step": 975
},
{
"epoch": 0.0195,
"eval_loss": 2.490492343902588,
"eval_runtime": 33.4389,
"eval_samples_per_second": 3.499,
"eval_steps_per_second": 1.764,
"step": 975
},
{
"epoch": 0.0196,
"eval_loss": 2.490133285522461,
"eval_runtime": 33.361,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 980
},
{
"epoch": 0.0197,
"eval_loss": 2.4896316528320312,
"eval_runtime": 33.5863,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 985
},
{
"epoch": 0.0198,
"eval_loss": 2.489122152328491,
"eval_runtime": 33.6173,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 990
},
{
"epoch": 0.0199,
"eval_loss": 2.488906145095825,
"eval_runtime": 33.6531,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.753,
"step": 995
},
{
"epoch": 0.02,
"grad_norm": 0.047671496052023164,
"learning_rate": 1.998e-06,
"loss": 2.4879,
"step": 1000
},
{
"epoch": 0.02,
"eval_loss": 2.488457202911377,
"eval_runtime": 33.7763,
"eval_samples_per_second": 3.464,
"eval_steps_per_second": 1.747,
"step": 1000
},
{
"epoch": 0.0201,
"eval_loss": 2.4881434440612793,
"eval_runtime": 33.6922,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.751,
"step": 1005
},
{
"epoch": 0.0202,
"eval_loss": 2.4879722595214844,
"eval_runtime": 33.6857,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.751,
"step": 1010
},
{
"epoch": 0.0203,
"eval_loss": 2.4876134395599365,
"eval_runtime": 33.7945,
"eval_samples_per_second": 3.462,
"eval_steps_per_second": 1.746,
"step": 1015
},
{
"epoch": 0.0204,
"eval_loss": 2.4872164726257324,
"eval_runtime": 33.7811,
"eval_samples_per_second": 3.463,
"eval_steps_per_second": 1.747,
"step": 1020
},
{
"epoch": 0.0205,
"grad_norm": 0.04204734602618554,
"learning_rate": 2.048e-06,
"loss": 2.4708,
"step": 1025
},
{
"epoch": 0.0205,
"eval_loss": 2.48695707321167,
"eval_runtime": 33.821,
"eval_samples_per_second": 3.459,
"eval_steps_per_second": 1.744,
"step": 1025
},
{
"epoch": 0.0206,
"eval_loss": 2.486564874649048,
"eval_runtime": 33.82,
"eval_samples_per_second": 3.459,
"eval_steps_per_second": 1.745,
"step": 1030
},
{
"epoch": 0.0207,
"eval_loss": 2.486281633377075,
"eval_runtime": 33.927,
"eval_samples_per_second": 3.449,
"eval_steps_per_second": 1.739,
"step": 1035
},
{
"epoch": 0.0208,
"eval_loss": 2.4860103130340576,
"eval_runtime": 33.9697,
"eval_samples_per_second": 3.444,
"eval_steps_per_second": 1.737,
"step": 1040
},
{
"epoch": 0.0209,
"eval_loss": 2.4855759143829346,
"eval_runtime": 33.9097,
"eval_samples_per_second": 3.45,
"eval_steps_per_second": 1.74,
"step": 1045
},
{
"epoch": 0.021,
"grad_norm": 0.03813289834436041,
"learning_rate": 2.098e-06,
"loss": 2.4799,
"step": 1050
},
{
"epoch": 0.021,
"eval_loss": 2.485349416732788,
"eval_runtime": 34.0131,
"eval_samples_per_second": 3.44,
"eval_steps_per_second": 1.735,
"step": 1050
},
{
"epoch": 0.0211,
"eval_loss": 2.48506498336792,
"eval_runtime": 34.036,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 1.733,
"step": 1055
},
{
"epoch": 0.0212,
"eval_loss": 2.484771966934204,
"eval_runtime": 34.0842,
"eval_samples_per_second": 3.433,
"eval_steps_per_second": 1.731,
"step": 1060
},
{
"epoch": 0.0213,
"eval_loss": 2.4846508502960205,
"eval_runtime": 34.0289,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 1.734,
"step": 1065
},
{
"epoch": 0.0214,
"eval_loss": 2.484158992767334,
"eval_runtime": 34.0038,
"eval_samples_per_second": 3.441,
"eval_steps_per_second": 1.735,
"step": 1070
},
{
"epoch": 0.0215,
"grad_norm": 0.04289680570208033,
"learning_rate": 2.148e-06,
"loss": 2.4822,
"step": 1075
},
{
"epoch": 0.0215,
"eval_loss": 2.483947992324829,
"eval_runtime": 33.9604,
"eval_samples_per_second": 3.445,
"eval_steps_per_second": 1.737,
"step": 1075
},
{
"epoch": 0.0216,
"eval_loss": 2.4836008548736572,
"eval_runtime": 33.9465,
"eval_samples_per_second": 3.447,
"eval_steps_per_second": 1.738,
"step": 1080
},
{
"epoch": 0.0217,
"eval_loss": 2.483187675476074,
"eval_runtime": 34.1344,
"eval_samples_per_second": 3.428,
"eval_steps_per_second": 1.728,
"step": 1085
},
{
"epoch": 0.0218,
"eval_loss": 2.4829964637756348,
"eval_runtime": 34.0915,
"eval_samples_per_second": 3.432,
"eval_steps_per_second": 1.731,
"step": 1090
},
{
"epoch": 0.0219,
"eval_loss": 2.482805013656616,
"eval_runtime": 33.9291,
"eval_samples_per_second": 3.448,
"eval_steps_per_second": 1.739,
"step": 1095
},
{
"epoch": 0.022,
"grad_norm": 0.03972633299982532,
"learning_rate": 2.198e-06,
"loss": 2.4871,
"step": 1100
},
{
"epoch": 0.022,
"eval_loss": 2.482428550720215,
"eval_runtime": 33.7324,
"eval_samples_per_second": 3.468,
"eval_steps_per_second": 1.749,
"step": 1100
},
{
"epoch": 0.0221,
"eval_loss": 2.4822213649749756,
"eval_runtime": 33.7954,
"eval_samples_per_second": 3.462,
"eval_steps_per_second": 1.746,
"step": 1105
},
{
"epoch": 0.0222,
"eval_loss": 2.481689214706421,
"eval_runtime": 33.7787,
"eval_samples_per_second": 3.464,
"eval_steps_per_second": 1.747,
"step": 1110
},
{
"epoch": 0.0223,
"eval_loss": 2.481731414794922,
"eval_runtime": 33.6129,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 1115
},
{
"epoch": 0.0224,
"eval_loss": 2.4812448024749756,
"eval_runtime": 33.511,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 1120
},
{
"epoch": 0.0225,
"grad_norm": 0.041792864961431496,
"learning_rate": 2.2480000000000003e-06,
"loss": 2.4766,
"step": 1125
},
{
"epoch": 0.0225,
"eval_loss": 2.4809837341308594,
"eval_runtime": 33.7009,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 1125
},
{
"epoch": 0.0226,
"eval_loss": 2.480768918991089,
"eval_runtime": 33.6615,
"eval_samples_per_second": 3.476,
"eval_steps_per_second": 1.753,
"step": 1130
},
{
"epoch": 0.0227,
"eval_loss": 2.480337381362915,
"eval_runtime": 33.6203,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 1135
},
{
"epoch": 0.0228,
"eval_loss": 2.4803271293640137,
"eval_runtime": 33.6559,
"eval_samples_per_second": 3.476,
"eval_steps_per_second": 1.753,
"step": 1140
},
{
"epoch": 0.0229,
"eval_loss": 2.4799482822418213,
"eval_runtime": 33.5023,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 1145
},
{
"epoch": 0.023,
"grad_norm": 0.035383899567194975,
"learning_rate": 2.2980000000000003e-06,
"loss": 2.4749,
"step": 1150
},
{
"epoch": 0.023,
"eval_loss": 2.479668140411377,
"eval_runtime": 33.4615,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.763,
"step": 1150
},
{
"epoch": 0.0231,
"eval_loss": 2.4794092178344727,
"eval_runtime": 33.4264,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 1155
},
{
"epoch": 0.0232,
"eval_loss": 2.4790964126586914,
"eval_runtime": 33.4165,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.766,
"step": 1160
},
{
"epoch": 0.0233,
"eval_loss": 2.4789323806762695,
"eval_runtime": 33.2576,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1165
},
{
"epoch": 0.0234,
"eval_loss": 2.4786429405212402,
"eval_runtime": 33.3028,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.772,
"step": 1170
},
{
"epoch": 0.0235,
"grad_norm": 0.034819138532107045,
"learning_rate": 2.3480000000000002e-06,
"loss": 2.4874,
"step": 1175
},
{
"epoch": 0.0235,
"eval_loss": 2.4784486293792725,
"eval_runtime": 33.3374,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1175
},
{
"epoch": 0.0236,
"eval_loss": 2.478088855743408,
"eval_runtime": 33.2864,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.772,
"step": 1180
},
{
"epoch": 0.0237,
"eval_loss": 2.477979898452759,
"eval_runtime": 33.4245,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 1185
},
{
"epoch": 0.0238,
"eval_loss": 2.4778709411621094,
"eval_runtime": 33.2611,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1190
},
{
"epoch": 0.0239,
"eval_loss": 2.477571487426758,
"eval_runtime": 33.3418,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1195
},
{
"epoch": 0.024,
"grad_norm": 0.037748109041694296,
"learning_rate": 2.398e-06,
"loss": 2.4666,
"step": 1200
},
{
"epoch": 0.024,
"eval_loss": 2.4772226810455322,
"eval_runtime": 33.3603,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 1200
},
{
"epoch": 0.0241,
"eval_loss": 2.4769959449768066,
"eval_runtime": 33.21,
"eval_samples_per_second": 3.523,
"eval_steps_per_second": 1.777,
"step": 1205
},
{
"epoch": 0.0242,
"eval_loss": 2.4768526554107666,
"eval_runtime": 33.4359,
"eval_samples_per_second": 3.499,
"eval_steps_per_second": 1.765,
"step": 1210
},
{
"epoch": 0.0243,
"eval_loss": 2.476616382598877,
"eval_runtime": 33.3341,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1215
},
{
"epoch": 0.0244,
"eval_loss": 2.476250171661377,
"eval_runtime": 33.3422,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1220
},
{
"epoch": 0.0245,
"grad_norm": 0.042904100843004035,
"learning_rate": 2.448e-06,
"loss": 2.4698,
"step": 1225
},
{
"epoch": 0.0245,
"eval_loss": 2.475933790206909,
"eval_runtime": 33.3238,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1225
},
{
"epoch": 0.0246,
"eval_loss": 2.475733995437622,
"eval_runtime": 33.337,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1230
},
{
"epoch": 0.0247,
"eval_loss": 2.4756155014038086,
"eval_runtime": 33.3642,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.768,
"step": 1235
},
{
"epoch": 0.0248,
"eval_loss": 2.475208044052124,
"eval_runtime": 33.3567,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 1240
},
{
"epoch": 0.0249,
"eval_loss": 2.4751882553100586,
"eval_runtime": 33.2409,
"eval_samples_per_second": 3.52,
"eval_steps_per_second": 1.775,
"step": 1245
},
{
"epoch": 0.025,
"grad_norm": 0.04198064762114288,
"learning_rate": 2.498e-06,
"loss": 2.4544,
"step": 1250
},
{
"epoch": 0.025,
"eval_loss": 2.4749433994293213,
"eval_runtime": 33.219,
"eval_samples_per_second": 3.522,
"eval_steps_per_second": 1.776,
"step": 1250
},
{
"epoch": 0.0251,
"eval_loss": 2.475109577178955,
"eval_runtime": 33.293,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1255
},
{
"epoch": 0.0252,
"eval_loss": 2.474750280380249,
"eval_runtime": 33.5388,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 1260
},
{
"epoch": 0.0253,
"eval_loss": 2.4743547439575195,
"eval_runtime": 33.3597,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 1265
},
{
"epoch": 0.0254,
"eval_loss": 2.4740777015686035,
"eval_runtime": 33.3283,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1270
},
{
"epoch": 0.0255,
"grad_norm": 0.03252077443949688,
"learning_rate": 2.5480000000000004e-06,
"loss": 2.4647,
"step": 1275
},
{
"epoch": 0.0255,
"eval_loss": 2.473674774169922,
"eval_runtime": 33.2492,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.774,
"step": 1275
},
{
"epoch": 0.0256,
"eval_loss": 2.4734930992126465,
"eval_runtime": 33.2934,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1280
},
{
"epoch": 0.0257,
"eval_loss": 2.4735071659088135,
"eval_runtime": 33.466,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 1285
},
{
"epoch": 0.0258,
"eval_loss": 2.4733572006225586,
"eval_runtime": 33.248,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.775,
"step": 1290
},
{
"epoch": 0.0259,
"eval_loss": 2.4730312824249268,
"eval_runtime": 33.3551,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 1295
},
{
"epoch": 0.026,
"grad_norm": 0.034740776600877266,
"learning_rate": 2.598e-06,
"loss": 2.4625,
"step": 1300
},
{
"epoch": 0.026,
"eval_loss": 2.4726204872131348,
"eval_runtime": 33.3147,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1300
},
{
"epoch": 0.0261,
"eval_loss": 2.4729621410369873,
"eval_runtime": 33.3118,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1305
},
{
"epoch": 0.0262,
"eval_loss": 2.4726085662841797,
"eval_runtime": 33.4111,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 1310
},
{
"epoch": 0.0263,
"eval_loss": 2.4724133014678955,
"eval_runtime": 33.3144,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1315
},
{
"epoch": 0.0264,
"eval_loss": 2.471963405609131,
"eval_runtime": 33.3272,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1320
},
{
"epoch": 0.0265,
"grad_norm": 0.039738232523319775,
"learning_rate": 2.648e-06,
"loss": 2.4734,
"step": 1325
},
{
"epoch": 0.0265,
"eval_loss": 2.4717814922332764,
"eval_runtime": 33.2395,
"eval_samples_per_second": 3.52,
"eval_steps_per_second": 1.775,
"step": 1325
},
{
"epoch": 0.0266,
"eval_loss": 2.471389055252075,
"eval_runtime": 33.2159,
"eval_samples_per_second": 3.522,
"eval_steps_per_second": 1.776,
"step": 1330
},
{
"epoch": 0.0267,
"eval_loss": 2.4711251258850098,
"eval_runtime": 33.4193,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 1335
},
{
"epoch": 0.0268,
"eval_loss": 2.470979928970337,
"eval_runtime": 33.2748,
"eval_samples_per_second": 3.516,
"eval_steps_per_second": 1.773,
"step": 1340
},
{
"epoch": 0.0269,
"eval_loss": 2.4706759452819824,
"eval_runtime": 33.3367,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1345
},
{
"epoch": 0.027,
"grad_norm": 0.036968596903604725,
"learning_rate": 2.6980000000000003e-06,
"loss": 2.4642,
"step": 1350
},
{
"epoch": 0.027,
"eval_loss": 2.470658302307129,
"eval_runtime": 33.3288,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1350
},
{
"epoch": 0.0271,
"eval_loss": 2.4704952239990234,
"eval_runtime": 33.3162,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1355
},
{
"epoch": 0.0272,
"eval_loss": 2.470270872116089,
"eval_runtime": 33.35,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 1360
},
{
"epoch": 0.0273,
"eval_loss": 2.4699764251708984,
"eval_runtime": 33.3696,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1365
},
{
"epoch": 0.0274,
"eval_loss": 2.469688653945923,
"eval_runtime": 33.4143,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.766,
"step": 1370
},
{
"epoch": 0.0275,
"grad_norm": 0.03899590922475157,
"learning_rate": 2.748e-06,
"loss": 2.4579,
"step": 1375
},
{
"epoch": 0.0275,
"eval_loss": 2.469435691833496,
"eval_runtime": 33.34,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1375
},
{
"epoch": 0.0276,
"eval_loss": 2.469395160675049,
"eval_runtime": 33.2655,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.774,
"step": 1380
},
{
"epoch": 0.0277,
"eval_loss": 2.46889328956604,
"eval_runtime": 33.3344,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1385
},
{
"epoch": 0.0278,
"eval_loss": 2.468695640563965,
"eval_runtime": 33.4003,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 1390
},
{
"epoch": 0.0279,
"eval_loss": 2.4685797691345215,
"eval_runtime": 33.252,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.774,
"step": 1395
},
{
"epoch": 0.028,
"grad_norm": 0.03498385470366268,
"learning_rate": 2.798e-06,
"loss": 2.472,
"step": 1400
},
{
"epoch": 0.028,
"eval_loss": 2.468594789505005,
"eval_runtime": 33.5555,
"eval_samples_per_second": 3.487,
"eval_steps_per_second": 1.758,
"step": 1400
},
{
"epoch": 0.0281,
"eval_loss": 2.4685287475585938,
"eval_runtime": 33.3147,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1405
},
{
"epoch": 0.0282,
"eval_loss": 2.467956304550171,
"eval_runtime": 33.3679,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1410
},
{
"epoch": 0.0283,
"eval_loss": 2.467761993408203,
"eval_runtime": 33.3242,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1415
},
{
"epoch": 0.0284,
"eval_loss": 2.467660903930664,
"eval_runtime": 33.3677,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1420
},
{
"epoch": 0.0285,
"grad_norm": 0.03333480906358989,
"learning_rate": 2.848e-06,
"loss": 2.4676,
"step": 1425
},
{
"epoch": 0.0285,
"eval_loss": 2.4673027992248535,
"eval_runtime": 33.3388,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1425
},
{
"epoch": 0.0286,
"eval_loss": 2.467072010040283,
"eval_runtime": 33.3596,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 1430
},
{
"epoch": 0.0287,
"eval_loss": 2.4668517112731934,
"eval_runtime": 33.5136,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 1435
},
{
"epoch": 0.0288,
"eval_loss": 2.4666786193847656,
"eval_runtime": 33.3405,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1440
},
{
"epoch": 0.0289,
"eval_loss": 2.4667794704437256,
"eval_runtime": 33.3333,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1445
},
{
"epoch": 0.029,
"grad_norm": 0.03480548121480933,
"learning_rate": 2.8980000000000005e-06,
"loss": 2.4524,
"step": 1450
},
{
"epoch": 0.029,
"eval_loss": 2.466280460357666,
"eval_runtime": 33.4727,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.763,
"step": 1450
},
{
"epoch": 0.0291,
"eval_loss": 2.4659922122955322,
"eval_runtime": 33.3309,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1455
},
{
"epoch": 0.0292,
"eval_loss": 2.4657278060913086,
"eval_runtime": 33.326,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1460
},
{
"epoch": 0.0293,
"eval_loss": 2.4654440879821777,
"eval_runtime": 33.3457,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.769,
"step": 1465
},
{
"epoch": 0.0294,
"eval_loss": 2.465367317199707,
"eval_runtime": 33.2824,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.773,
"step": 1470
},
{
"epoch": 0.0295,
"grad_norm": 0.03652712436191979,
"learning_rate": 2.9480000000000004e-06,
"loss": 2.466,
"step": 1475
},
{
"epoch": 0.0295,
"eval_loss": 2.465318202972412,
"eval_runtime": 33.3264,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1475
},
{
"epoch": 0.0296,
"eval_loss": 2.465156316757202,
"eval_runtime": 33.2661,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.774,
"step": 1480
},
{
"epoch": 0.0297,
"eval_loss": 2.4648799896240234,
"eval_runtime": 33.4782,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.762,
"step": 1485
},
{
"epoch": 0.0298,
"eval_loss": 2.4646074771881104,
"eval_runtime": 33.3194,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1490
},
{
"epoch": 0.0299,
"eval_loss": 2.464465856552124,
"eval_runtime": 33.3466,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.769,
"step": 1495
},
{
"epoch": 0.03,
"grad_norm": 0.03778721361564108,
"learning_rate": 2.9980000000000003e-06,
"loss": 2.4684,
"step": 1500
},
{
"epoch": 0.03,
"eval_loss": 2.464305877685547,
"eval_runtime": 33.25,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.774,
"step": 1500
},
{
"epoch": 0.0301,
"eval_loss": 2.464261531829834,
"eval_runtime": 33.3761,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.768,
"step": 1505
},
{
"epoch": 0.0302,
"eval_loss": 2.464185953140259,
"eval_runtime": 33.4957,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.761,
"step": 1510
},
{
"epoch": 0.0303,
"eval_loss": 2.4639229774475098,
"eval_runtime": 33.2475,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.775,
"step": 1515
},
{
"epoch": 0.0304,
"eval_loss": 2.4636595249176025,
"eval_runtime": 33.3124,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1520
},
{
"epoch": 0.0305,
"grad_norm": 0.035809836530372154,
"learning_rate": 3.0480000000000003e-06,
"loss": 2.4631,
"step": 1525
},
{
"epoch": 0.0305,
"eval_loss": 2.46356201171875,
"eval_runtime": 33.3423,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1525
},
{
"epoch": 0.0306,
"eval_loss": 2.463318347930908,
"eval_runtime": 33.3917,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 1530
},
{
"epoch": 0.0307,
"eval_loss": 2.4631264209747314,
"eval_runtime": 33.4053,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 1535
},
{
"epoch": 0.0308,
"eval_loss": 2.462981700897217,
"eval_runtime": 33.2608,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1540
},
{
"epoch": 0.0309,
"eval_loss": 2.462719202041626,
"eval_runtime": 33.3259,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1545
},
{
"epoch": 0.031,
"grad_norm": 0.05979367258550731,
"learning_rate": 3.0980000000000007e-06,
"loss": 2.46,
"step": 1550
},
{
"epoch": 0.031,
"eval_loss": 2.462733268737793,
"eval_runtime": 33.3195,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1550
},
{
"epoch": 0.0311,
"eval_loss": 2.4625959396362305,
"eval_runtime": 33.3704,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1555
},
{
"epoch": 0.0312,
"eval_loss": 2.462366819381714,
"eval_runtime": 33.4047,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 1560
},
{
"epoch": 0.0313,
"eval_loss": 2.4618427753448486,
"eval_runtime": 33.3896,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 1565
},
{
"epoch": 0.0314,
"eval_loss": 2.4616317749023438,
"eval_runtime": 33.3414,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1570
},
{
"epoch": 0.0315,
"grad_norm": 0.031804244667956116,
"learning_rate": 3.1480000000000006e-06,
"loss": 2.4477,
"step": 1575
},
{
"epoch": 0.0315,
"eval_loss": 2.4615368843078613,
"eval_runtime": 33.3548,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 1575
},
{
"epoch": 0.0316,
"eval_loss": 2.461198091506958,
"eval_runtime": 33.2416,
"eval_samples_per_second": 3.52,
"eval_steps_per_second": 1.775,
"step": 1580
},
{
"epoch": 0.0317,
"eval_loss": 2.4611523151397705,
"eval_runtime": 33.3445,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.769,
"step": 1585
},
{
"epoch": 0.0318,
"eval_loss": 2.4609127044677734,
"eval_runtime": 33.3175,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1590
},
{
"epoch": 0.0319,
"eval_loss": 2.4608800411224365,
"eval_runtime": 33.3052,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.771,
"step": 1595
},
{
"epoch": 0.032,
"grad_norm": 0.03365841309984822,
"learning_rate": 3.198e-06,
"loss": 2.4523,
"step": 1600
},
{
"epoch": 0.032,
"eval_loss": 2.460757255554199,
"eval_runtime": 33.2636,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.774,
"step": 1600
},
{
"epoch": 0.0321,
"eval_loss": 2.4605917930603027,
"eval_runtime": 33.4595,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.763,
"step": 1605
},
{
"epoch": 0.0322,
"eval_loss": 2.4604575634002686,
"eval_runtime": 33.2706,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.773,
"step": 1610
},
{
"epoch": 0.0323,
"eval_loss": 2.4603111743927,
"eval_runtime": 33.405,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 1615
},
{
"epoch": 0.0324,
"eval_loss": 2.460045337677002,
"eval_runtime": 33.2598,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1620
},
{
"epoch": 0.0325,
"grad_norm": 0.03534600587541967,
"learning_rate": 3.248e-06,
"loss": 2.45,
"step": 1625
},
{
"epoch": 0.0325,
"eval_loss": 2.460045099258423,
"eval_runtime": 33.2663,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.774,
"step": 1625
},
{
"epoch": 0.0326,
"eval_loss": 2.4599287509918213,
"eval_runtime": 33.2545,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1630
},
{
"epoch": 0.0327,
"eval_loss": 2.459611654281616,
"eval_runtime": 33.4189,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 1635
},
{
"epoch": 0.0328,
"eval_loss": 2.4594151973724365,
"eval_runtime": 33.284,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.773,
"step": 1640
},
{
"epoch": 0.0329,
"eval_loss": 2.4589221477508545,
"eval_runtime": 33.4033,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 1645
},
{
"epoch": 0.033,
"grad_norm": 0.032596527761614855,
"learning_rate": 3.298e-06,
"loss": 2.4422,
"step": 1650
},
{
"epoch": 0.033,
"eval_loss": 2.4589502811431885,
"eval_runtime": 33.2986,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1650
},
{
"epoch": 0.0331,
"eval_loss": 2.4588239192962646,
"eval_runtime": 33.4046,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 1655
},
{
"epoch": 0.0332,
"eval_loss": 2.458603620529175,
"eval_runtime": 33.3448,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.769,
"step": 1660
},
{
"epoch": 0.0333,
"eval_loss": 2.458559513092041,
"eval_runtime": 33.368,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1665
},
{
"epoch": 0.0334,
"eval_loss": 2.458500862121582,
"eval_runtime": 33.2335,
"eval_samples_per_second": 3.521,
"eval_steps_per_second": 1.775,
"step": 1670
},
{
"epoch": 0.0335,
"grad_norm": 0.03339611698643194,
"learning_rate": 3.348e-06,
"loss": 2.447,
"step": 1675
},
{
"epoch": 0.0335,
"eval_loss": 2.458252191543579,
"eval_runtime": 33.3623,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.768,
"step": 1675
},
{
"epoch": 0.0336,
"eval_loss": 2.4580931663513184,
"eval_runtime": 33.2532,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1680
},
{
"epoch": 0.0337,
"eval_loss": 2.4578795433044434,
"eval_runtime": 33.3214,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1685
},
{
"epoch": 0.0338,
"eval_loss": 2.4576218128204346,
"eval_runtime": 33.248,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.775,
"step": 1690
},
{
"epoch": 0.0339,
"eval_loss": 2.4576828479766846,
"eval_runtime": 33.3499,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 1695
},
{
"epoch": 0.034,
"grad_norm": 0.03028181865357742,
"learning_rate": 3.3980000000000003e-06,
"loss": 2.4582,
"step": 1700
},
{
"epoch": 0.034,
"eval_loss": 2.457383155822754,
"eval_runtime": 33.2574,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1700
},
{
"epoch": 0.0341,
"eval_loss": 2.4572579860687256,
"eval_runtime": 33.2947,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1705
},
{
"epoch": 0.0342,
"eval_loss": 2.4584450721740723,
"eval_runtime": 33.3296,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1710
},
{
"epoch": 0.0343,
"eval_loss": 2.458603858947754,
"eval_runtime": 33.3017,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.772,
"step": 1715
},
{
"epoch": 0.0344,
"eval_loss": 2.4579555988311768,
"eval_runtime": 33.292,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1720
},
{
"epoch": 0.0345,
"grad_norm": 0.03734241446236971,
"learning_rate": 3.4480000000000003e-06,
"loss": 2.4501,
"step": 1725
},
{
"epoch": 0.0345,
"eval_loss": 2.4574153423309326,
"eval_runtime": 33.4313,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 1725
},
{
"epoch": 0.0346,
"eval_loss": 2.456867218017578,
"eval_runtime": 33.2833,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.773,
"step": 1730
},
{
"epoch": 0.0347,
"eval_loss": 2.4567270278930664,
"eval_runtime": 33.3694,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1735
},
{
"epoch": 0.0348,
"eval_loss": 2.456348180770874,
"eval_runtime": 33.3416,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1740
},
{
"epoch": 0.0349,
"eval_loss": 2.4563136100769043,
"eval_runtime": 33.3531,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 1745
},
{
"epoch": 0.035,
"grad_norm": 0.030782538004837847,
"learning_rate": 3.4980000000000002e-06,
"loss": 2.4509,
"step": 1750
},
{
"epoch": 0.035,
"eval_loss": 2.455827236175537,
"eval_runtime": 33.3143,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1750
},
{
"epoch": 0.0351,
"eval_loss": 2.4558639526367188,
"eval_runtime": 33.3716,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1755
},
{
"epoch": 0.0352,
"eval_loss": 2.4555938243865967,
"eval_runtime": 33.2966,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1760
},
{
"epoch": 0.0353,
"eval_loss": 2.4551546573638916,
"eval_runtime": 33.3145,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1765
},
{
"epoch": 0.0354,
"eval_loss": 2.454957962036133,
"eval_runtime": 33.3201,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1770
},
{
"epoch": 0.0355,
"grad_norm": 0.03281862515471333,
"learning_rate": 3.548e-06,
"loss": 2.4439,
"step": 1775
},
{
"epoch": 0.0355,
"eval_loss": 2.455031394958496,
"eval_runtime": 33.264,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.774,
"step": 1775
},
{
"epoch": 0.0356,
"eval_loss": 2.4550724029541016,
"eval_runtime": 33.3734,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1780
},
{
"epoch": 0.0357,
"eval_loss": 2.454719305038452,
"eval_runtime": 33.3267,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1785
},
{
"epoch": 0.0358,
"eval_loss": 2.4547033309936523,
"eval_runtime": 33.2651,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.774,
"step": 1790
},
{
"epoch": 0.0359,
"eval_loss": 2.454416275024414,
"eval_runtime": 33.3612,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 1795
},
{
"epoch": 0.036,
"grad_norm": 0.031756006482001914,
"learning_rate": 3.5980000000000005e-06,
"loss": 2.4493,
"step": 1800
},
{
"epoch": 0.036,
"eval_loss": 2.454286813735962,
"eval_runtime": 33.326,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1800
},
{
"epoch": 0.0361,
"eval_loss": 2.4541101455688477,
"eval_runtime": 33.2597,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 1805
},
{
"epoch": 0.0362,
"eval_loss": 2.4541351795196533,
"eval_runtime": 33.2421,
"eval_samples_per_second": 3.52,
"eval_steps_per_second": 1.775,
"step": 1810
},
{
"epoch": 0.0363,
"eval_loss": 2.4537973403930664,
"eval_runtime": 33.3201,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1815
},
{
"epoch": 0.0364,
"eval_loss": 2.4534847736358643,
"eval_runtime": 33.2973,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1820
},
{
"epoch": 0.0365,
"grad_norm": 0.03128096989289917,
"learning_rate": 3.6480000000000005e-06,
"loss": 2.4526,
"step": 1825
},
{
"epoch": 0.0365,
"eval_loss": 2.453655481338501,
"eval_runtime": 33.3755,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1825
},
{
"epoch": 0.0366,
"eval_loss": 2.4534049034118652,
"eval_runtime": 33.332,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1830
},
{
"epoch": 0.0367,
"eval_loss": 2.4529781341552734,
"eval_runtime": 33.3325,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1835
},
{
"epoch": 0.0368,
"eval_loss": 2.454005241394043,
"eval_runtime": 33.3975,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.767,
"step": 1840
},
{
"epoch": 0.0369,
"eval_loss": 2.4538745880126953,
"eval_runtime": 33.3,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 1845
},
{
"epoch": 0.037,
"grad_norm": 0.02999582338402207,
"learning_rate": 3.6980000000000004e-06,
"loss": 2.4309,
"step": 1850
},
{
"epoch": 0.037,
"eval_loss": 2.4534404277801514,
"eval_runtime": 33.2825,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.773,
"step": 1850
},
{
"epoch": 0.0371,
"eval_loss": 2.4529800415039062,
"eval_runtime": 33.513,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 1855
},
{
"epoch": 0.0372,
"eval_loss": 2.453007221221924,
"eval_runtime": 33.3414,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1860
},
{
"epoch": 0.0373,
"eval_loss": 2.452350616455078,
"eval_runtime": 33.3625,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.768,
"step": 1865
},
{
"epoch": 0.0374,
"eval_loss": 2.4522666931152344,
"eval_runtime": 33.3116,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1870
},
{
"epoch": 0.0375,
"grad_norm": 0.0409025592520596,
"learning_rate": 3.7480000000000004e-06,
"loss": 2.442,
"step": 1875
},
{
"epoch": 0.0375,
"eval_loss": 2.4521546363830566,
"eval_runtime": 33.3782,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.768,
"step": 1875
},
{
"epoch": 0.0376,
"eval_loss": 2.4520437717437744,
"eval_runtime": 33.2887,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.772,
"step": 1880
},
{
"epoch": 0.0377,
"eval_loss": 2.4519331455230713,
"eval_runtime": 33.3746,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 1885
},
{
"epoch": 0.0378,
"eval_loss": 2.451744556427002,
"eval_runtime": 33.3214,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1890
},
{
"epoch": 0.0379,
"eval_loss": 2.451737642288208,
"eval_runtime": 33.3457,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.769,
"step": 1895
},
{
"epoch": 0.038,
"grad_norm": 0.03431980647954774,
"learning_rate": 3.7980000000000007e-06,
"loss": 2.4477,
"step": 1900
},
{
"epoch": 0.038,
"eval_loss": 2.4515624046325684,
"eval_runtime": 33.312,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1900
},
{
"epoch": 0.0381,
"eval_loss": 2.4512295722961426,
"eval_runtime": 33.3607,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 1905
},
{
"epoch": 0.0382,
"eval_loss": 2.4510445594787598,
"eval_runtime": 33.339,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 1910
},
{
"epoch": 0.0383,
"eval_loss": 2.4508397579193115,
"eval_runtime": 33.3996,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 1915
},
{
"epoch": 0.0384,
"eval_loss": 2.4510440826416016,
"eval_runtime": 33.2905,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.772,
"step": 1920
},
{
"epoch": 0.0385,
"grad_norm": 0.03587224652231601,
"learning_rate": 3.848e-06,
"loss": 2.4433,
"step": 1925
},
{
"epoch": 0.0385,
"eval_loss": 2.450984239578247,
"eval_runtime": 33.3263,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1925
},
{
"epoch": 0.0386,
"eval_loss": 2.45090651512146,
"eval_runtime": 33.3244,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 1930
},
{
"epoch": 0.0387,
"eval_loss": 2.450443983078003,
"eval_runtime": 33.3023,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.772,
"step": 1935
},
{
"epoch": 0.0388,
"eval_loss": 2.450309991836548,
"eval_runtime": 33.4354,
"eval_samples_per_second": 3.499,
"eval_steps_per_second": 1.765,
"step": 1940
},
{
"epoch": 0.0389,
"eval_loss": 2.4500510692596436,
"eval_runtime": 33.3238,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.771,
"step": 1945
},
{
"epoch": 0.039,
"grad_norm": 0.027239293031380653,
"learning_rate": 3.898e-06,
"loss": 2.4347,
"step": 1950
},
{
"epoch": 0.039,
"eval_loss": 2.4498231410980225,
"eval_runtime": 33.3306,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 1950
},
{
"epoch": 0.0391,
"eval_loss": 2.449704170227051,
"eval_runtime": 33.3865,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 1955
},
{
"epoch": 0.0392,
"eval_loss": 2.44974684715271,
"eval_runtime": 33.419,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 1960
},
{
"epoch": 0.0393,
"eval_loss": 2.450090169906616,
"eval_runtime": 33.5315,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.76,
"step": 1965
},
{
"epoch": 0.0394,
"eval_loss": 2.4494845867156982,
"eval_runtime": 33.4607,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.763,
"step": 1970
},
{
"epoch": 0.0395,
"grad_norm": 0.031553482039351585,
"learning_rate": 3.948e-06,
"loss": 2.4466,
"step": 1975
},
{
"epoch": 0.0395,
"eval_loss": 2.449598550796509,
"eval_runtime": 33.4853,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 1975
},
{
"epoch": 0.0396,
"eval_loss": 2.449420213699341,
"eval_runtime": 33.4626,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 1980
},
{
"epoch": 0.0397,
"eval_loss": 2.449462890625,
"eval_runtime": 33.4049,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 1985
},
{
"epoch": 0.0398,
"eval_loss": 2.449423313140869,
"eval_runtime": 33.5823,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 1990
},
{
"epoch": 0.0399,
"eval_loss": 2.4491324424743652,
"eval_runtime": 33.662,
"eval_samples_per_second": 3.476,
"eval_steps_per_second": 1.753,
"step": 1995
},
{
"epoch": 0.04,
"grad_norm": 0.03314009226524554,
"learning_rate": 3.9980000000000005e-06,
"loss": 2.4391,
"step": 2000
},
{
"epoch": 0.04,
"eval_loss": 2.449084520339966,
"eval_runtime": 33.5872,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.757,
"step": 2000
},
{
"epoch": 0.0401,
"eval_loss": 2.449021577835083,
"eval_runtime": 33.5048,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 2005
},
{
"epoch": 0.0402,
"eval_loss": 2.449159622192383,
"eval_runtime": 33.4845,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 2010
},
{
"epoch": 0.0403,
"eval_loss": 2.448726177215576,
"eval_runtime": 33.9926,
"eval_samples_per_second": 3.442,
"eval_steps_per_second": 1.736,
"step": 2015
},
{
"epoch": 0.0404,
"eval_loss": 2.4484922885894775,
"eval_runtime": 33.6594,
"eval_samples_per_second": 3.476,
"eval_steps_per_second": 1.753,
"step": 2020
},
{
"epoch": 0.0405,
"grad_norm": 0.029877786947315705,
"learning_rate": 4.048e-06,
"loss": 2.438,
"step": 2025
},
{
"epoch": 0.0405,
"eval_loss": 2.4485254287719727,
"eval_runtime": 33.6812,
"eval_samples_per_second": 3.474,
"eval_steps_per_second": 1.752,
"step": 2025
},
{
"epoch": 0.0406,
"eval_loss": 2.448495388031006,
"eval_runtime": 33.9733,
"eval_samples_per_second": 3.444,
"eval_steps_per_second": 1.737,
"step": 2030
},
{
"epoch": 0.0407,
"eval_loss": 2.4482643604278564,
"eval_runtime": 33.9957,
"eval_samples_per_second": 3.442,
"eval_steps_per_second": 1.736,
"step": 2035
},
{
"epoch": 0.0408,
"eval_loss": 2.4481942653656006,
"eval_runtime": 34.3014,
"eval_samples_per_second": 3.411,
"eval_steps_per_second": 1.72,
"step": 2040
},
{
"epoch": 0.0409,
"eval_loss": 2.448082208633423,
"eval_runtime": 34.0411,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 1.733,
"step": 2045
},
{
"epoch": 0.041,
"grad_norm": 0.031175983773220776,
"learning_rate": 4.098e-06,
"loss": 2.4332,
"step": 2050
},
{
"epoch": 0.041,
"eval_loss": 2.4478490352630615,
"eval_runtime": 33.9245,
"eval_samples_per_second": 3.449,
"eval_steps_per_second": 1.739,
"step": 2050
},
{
"epoch": 0.0411,
"eval_loss": 2.4480035305023193,
"eval_runtime": 34.0079,
"eval_samples_per_second": 3.44,
"eval_steps_per_second": 1.735,
"step": 2055
},
{
"epoch": 0.0412,
"eval_loss": 2.447685718536377,
"eval_runtime": 33.999,
"eval_samples_per_second": 3.441,
"eval_steps_per_second": 1.735,
"step": 2060
},
{
"epoch": 0.0413,
"eval_loss": 2.447507619857788,
"eval_runtime": 34.1446,
"eval_samples_per_second": 3.427,
"eval_steps_per_second": 1.728,
"step": 2065
},
{
"epoch": 0.0414,
"eval_loss": 2.447322130203247,
"eval_runtime": 33.7479,
"eval_samples_per_second": 3.467,
"eval_steps_per_second": 1.748,
"step": 2070
},
{
"epoch": 0.0415,
"grad_norm": 0.02904850084773878,
"learning_rate": 4.148000000000001e-06,
"loss": 2.4481,
"step": 2075
},
{
"epoch": 0.0415,
"eval_loss": 2.4471347332000732,
"eval_runtime": 33.917,
"eval_samples_per_second": 3.45,
"eval_steps_per_second": 1.74,
"step": 2075
},
{
"epoch": 0.0416,
"eval_loss": 2.447152853012085,
"eval_runtime": 33.8287,
"eval_samples_per_second": 3.459,
"eval_steps_per_second": 1.744,
"step": 2080
},
{
"epoch": 0.0417,
"eval_loss": 2.4469242095947266,
"eval_runtime": 33.7591,
"eval_samples_per_second": 3.466,
"eval_steps_per_second": 1.748,
"step": 2085
},
{
"epoch": 0.0418,
"eval_loss": 2.4471774101257324,
"eval_runtime": 33.7879,
"eval_samples_per_second": 3.463,
"eval_steps_per_second": 1.746,
"step": 2090
},
{
"epoch": 0.0419,
"eval_loss": 2.447988986968994,
"eval_runtime": 33.6878,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.751,
"step": 2095
},
{
"epoch": 0.042,
"grad_norm": 0.033838990669225626,
"learning_rate": 4.198e-06,
"loss": 2.4386,
"step": 2100
},
{
"epoch": 0.042,
"eval_loss": 2.4477100372314453,
"eval_runtime": 33.6345,
"eval_samples_per_second": 3.479,
"eval_steps_per_second": 1.754,
"step": 2100
},
{
"epoch": 0.0421,
"eval_loss": 2.447394847869873,
"eval_runtime": 33.6221,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 2105
},
{
"epoch": 0.0422,
"eval_loss": 2.4470951557159424,
"eval_runtime": 33.6689,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.752,
"step": 2110
},
{
"epoch": 0.0423,
"eval_loss": 2.4467623233795166,
"eval_runtime": 33.6979,
"eval_samples_per_second": 3.472,
"eval_steps_per_second": 1.751,
"step": 2115
},
{
"epoch": 0.0424,
"eval_loss": 2.4469833374023438,
"eval_runtime": 33.8632,
"eval_samples_per_second": 3.455,
"eval_steps_per_second": 1.742,
"step": 2120
},
{
"epoch": 0.0425,
"grad_norm": 0.0382703849144026,
"learning_rate": 4.248000000000001e-06,
"loss": 2.4313,
"step": 2125
},
{
"epoch": 0.0425,
"eval_loss": 2.447753667831421,
"eval_runtime": 33.7269,
"eval_samples_per_second": 3.469,
"eval_steps_per_second": 1.749,
"step": 2125
},
{
"epoch": 0.0426,
"eval_loss": 2.447281837463379,
"eval_runtime": 33.7037,
"eval_samples_per_second": 3.471,
"eval_steps_per_second": 1.751,
"step": 2130
},
{
"epoch": 0.0427,
"eval_loss": 2.4472267627716064,
"eval_runtime": 33.6873,
"eval_samples_per_second": 3.473,
"eval_steps_per_second": 1.751,
"step": 2135
},
{
"epoch": 0.0428,
"eval_loss": 2.446859836578369,
"eval_runtime": 33.6738,
"eval_samples_per_second": 3.475,
"eval_steps_per_second": 1.752,
"step": 2140
},
{
"epoch": 0.0429,
"eval_loss": 2.446655035018921,
"eval_runtime": 33.6536,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.753,
"step": 2145
},
{
"epoch": 0.043,
"grad_norm": 0.027126678960545086,
"learning_rate": 4.298e-06,
"loss": 2.4298,
"step": 2150
},
{
"epoch": 0.043,
"eval_loss": 2.4463651180267334,
"eval_runtime": 33.6454,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.754,
"step": 2150
},
{
"epoch": 0.0431,
"eval_loss": 2.4461581707000732,
"eval_runtime": 33.6166,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 1.755,
"step": 2155
},
{
"epoch": 0.0432,
"eval_loss": 2.4461660385131836,
"eval_runtime": 33.5484,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 2160
},
{
"epoch": 0.0433,
"eval_loss": 2.4458513259887695,
"eval_runtime": 33.6579,
"eval_samples_per_second": 3.476,
"eval_steps_per_second": 1.753,
"step": 2165
},
{
"epoch": 0.0434,
"eval_loss": 2.4454855918884277,
"eval_runtime": 33.5647,
"eval_samples_per_second": 3.486,
"eval_steps_per_second": 1.758,
"step": 2170
},
{
"epoch": 0.0435,
"grad_norm": 0.030565328679921875,
"learning_rate": 4.3480000000000006e-06,
"loss": 2.4387,
"step": 2175
},
{
"epoch": 0.0435,
"eval_loss": 2.445688009262085,
"eval_runtime": 33.5164,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 2175
},
{
"epoch": 0.0436,
"eval_loss": 2.4456729888916016,
"eval_runtime": 33.4724,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.763,
"step": 2180
},
{
"epoch": 0.0437,
"eval_loss": 2.4460015296936035,
"eval_runtime": 33.3984,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.767,
"step": 2185
},
{
"epoch": 0.0438,
"eval_loss": 2.4460256099700928,
"eval_runtime": 33.4582,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.763,
"step": 2190
},
{
"epoch": 0.0439,
"eval_loss": 2.4456872940063477,
"eval_runtime": 33.444,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2195
},
{
"epoch": 0.044,
"grad_norm": 0.03864046787827566,
"learning_rate": 4.398000000000001e-06,
"loss": 2.445,
"step": 2200
},
{
"epoch": 0.044,
"eval_loss": 2.4454870223999023,
"eval_runtime": 33.4474,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2200
},
{
"epoch": 0.0441,
"eval_loss": 2.4453113079071045,
"eval_runtime": 33.4062,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 2205
},
{
"epoch": 0.0442,
"eval_loss": 2.4448771476745605,
"eval_runtime": 33.3542,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 2210
},
{
"epoch": 0.0443,
"eval_loss": 2.444946765899658,
"eval_runtime": 33.3997,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 2215
},
{
"epoch": 0.0444,
"eval_loss": 2.445194959640503,
"eval_runtime": 33.3669,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 2220
},
{
"epoch": 0.0445,
"grad_norm": 0.026792091668494698,
"learning_rate": 4.4480000000000004e-06,
"loss": 2.4339,
"step": 2225
},
{
"epoch": 0.0445,
"eval_loss": 2.445009469985962,
"eval_runtime": 33.4467,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2225
},
{
"epoch": 0.0446,
"eval_loss": 2.4450981616973877,
"eval_runtime": 33.4513,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2230
},
{
"epoch": 0.0447,
"eval_loss": 2.444899082183838,
"eval_runtime": 33.3869,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 2235
},
{
"epoch": 0.0448,
"eval_loss": 2.4448494911193848,
"eval_runtime": 33.486,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 2240
},
{
"epoch": 0.0449,
"eval_loss": 2.444640636444092,
"eval_runtime": 33.4202,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 2245
},
{
"epoch": 0.045,
"grad_norm": 0.027104711228224686,
"learning_rate": 4.498e-06,
"loss": 2.4326,
"step": 2250
},
{
"epoch": 0.045,
"eval_loss": 2.444633722305298,
"eval_runtime": 33.4154,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.766,
"step": 2250
},
{
"epoch": 0.0451,
"eval_loss": 2.44467830657959,
"eval_runtime": 33.4237,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 2255
},
{
"epoch": 0.0452,
"eval_loss": 2.444413900375366,
"eval_runtime": 33.3694,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 2260
},
{
"epoch": 0.0453,
"eval_loss": 2.444222927093506,
"eval_runtime": 33.3585,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 2265
},
{
"epoch": 0.0454,
"eval_loss": 2.444108724594116,
"eval_runtime": 33.3346,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 2270
},
{
"epoch": 0.0455,
"grad_norm": 0.033569645173308425,
"learning_rate": 4.548e-06,
"loss": 2.4342,
"step": 2275
},
{
"epoch": 0.0455,
"eval_loss": 2.443859577178955,
"eval_runtime": 33.3636,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.768,
"step": 2275
},
{
"epoch": 0.0456,
"eval_loss": 2.4441120624542236,
"eval_runtime": 33.2442,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.775,
"step": 2280
},
{
"epoch": 0.0457,
"eval_loss": 2.4439260959625244,
"eval_runtime": 33.2924,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 2285
},
{
"epoch": 0.0458,
"eval_loss": 2.4439032077789307,
"eval_runtime": 33.4004,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 2290
},
{
"epoch": 0.0459,
"eval_loss": 2.443621873855591,
"eval_runtime": 33.3314,
"eval_samples_per_second": 3.51,
"eval_steps_per_second": 1.77,
"step": 2295
},
{
"epoch": 0.046,
"grad_norm": 0.02648413187023774,
"learning_rate": 4.598e-06,
"loss": 2.4368,
"step": 2300
},
{
"epoch": 0.046,
"eval_loss": 2.4436306953430176,
"eval_runtime": 33.372,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 2300
},
{
"epoch": 0.0461,
"eval_loss": 2.4436404705047607,
"eval_runtime": 33.3039,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.772,
"step": 2305
},
{
"epoch": 0.0462,
"eval_loss": 2.44333815574646,
"eval_runtime": 33.3059,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.771,
"step": 2310
},
{
"epoch": 0.0463,
"eval_loss": 2.443415880203247,
"eval_runtime": 33.4065,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 2315
},
{
"epoch": 0.0464,
"eval_loss": 2.443068742752075,
"eval_runtime": 33.2818,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.773,
"step": 2320
},
{
"epoch": 0.0465,
"grad_norm": 0.0351440602227012,
"learning_rate": 4.648e-06,
"loss": 2.4381,
"step": 2325
},
{
"epoch": 0.0465,
"eval_loss": 2.443199634552002,
"eval_runtime": 33.3538,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 2325
},
{
"epoch": 0.0466,
"eval_loss": 2.4433047771453857,
"eval_runtime": 33.4816,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 2330
},
{
"epoch": 0.0467,
"eval_loss": 2.443272113800049,
"eval_runtime": 33.5015,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 2335
},
{
"epoch": 0.0468,
"eval_loss": 2.443246603012085,
"eval_runtime": 33.5753,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 2340
},
{
"epoch": 0.0469,
"eval_loss": 2.4432363510131836,
"eval_runtime": 33.2869,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.772,
"step": 2345
},
{
"epoch": 0.047,
"grad_norm": 0.02695670446644145,
"learning_rate": 4.698000000000001e-06,
"loss": 2.4303,
"step": 2350
},
{
"epoch": 0.047,
"eval_loss": 2.4429421424865723,
"eval_runtime": 33.3556,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 2350
},
{
"epoch": 0.0471,
"eval_loss": 2.4427566528320312,
"eval_runtime": 33.3612,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 2355
},
{
"epoch": 0.0472,
"eval_loss": 2.4425995349884033,
"eval_runtime": 33.353,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 2360
},
{
"epoch": 0.0473,
"eval_loss": 2.4426395893096924,
"eval_runtime": 33.4669,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2365
},
{
"epoch": 0.0474,
"eval_loss": 2.4425301551818848,
"eval_runtime": 33.3803,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.768,
"step": 2370
},
{
"epoch": 0.0475,
"grad_norm": 0.031232764672567994,
"learning_rate": 4.748e-06,
"loss": 2.4284,
"step": 2375
},
{
"epoch": 0.0475,
"eval_loss": 2.4426214694976807,
"eval_runtime": 33.3013,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.772,
"step": 2375
},
{
"epoch": 0.0476,
"eval_loss": 2.442599296569824,
"eval_runtime": 33.3419,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.77,
"step": 2380
},
{
"epoch": 0.0477,
"eval_loss": 2.442364454269409,
"eval_runtime": 33.3677,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 2385
},
{
"epoch": 0.0478,
"eval_loss": 2.4425458908081055,
"eval_runtime": 33.3892,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 2390
},
{
"epoch": 0.0479,
"eval_loss": 2.4425549507141113,
"eval_runtime": 33.4202,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 2395
},
{
"epoch": 0.048,
"grad_norm": 0.027127721086561404,
"learning_rate": 4.7980000000000005e-06,
"loss": 2.4291,
"step": 2400
},
{
"epoch": 0.048,
"eval_loss": 2.4425251483917236,
"eval_runtime": 33.3802,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.768,
"step": 2400
},
{
"epoch": 0.0481,
"eval_loss": 2.4424123764038086,
"eval_runtime": 33.3283,
"eval_samples_per_second": 3.511,
"eval_steps_per_second": 1.77,
"step": 2405
},
{
"epoch": 0.0482,
"eval_loss": 2.4421849250793457,
"eval_runtime": 33.4172,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.766,
"step": 2410
},
{
"epoch": 0.0483,
"eval_loss": 2.4419970512390137,
"eval_runtime": 33.4642,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2415
},
{
"epoch": 0.0484,
"eval_loss": 2.4419567584991455,
"eval_runtime": 33.3663,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.768,
"step": 2420
},
{
"epoch": 0.0485,
"grad_norm": 0.026032952013136927,
"learning_rate": 4.848000000000001e-06,
"loss": 2.4256,
"step": 2425
},
{
"epoch": 0.0485,
"eval_loss": 2.441688299179077,
"eval_runtime": 33.3169,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 2425
},
{
"epoch": 0.0486,
"eval_loss": 2.4417548179626465,
"eval_runtime": 33.3476,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 2430
},
{
"epoch": 0.0487,
"eval_loss": 2.441769599914551,
"eval_runtime": 33.4488,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2435
},
{
"epoch": 0.0488,
"eval_loss": 2.4415283203125,
"eval_runtime": 33.4555,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.764,
"step": 2440
},
{
"epoch": 0.0489,
"eval_loss": 2.4416847229003906,
"eval_runtime": 33.2459,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.775,
"step": 2445
},
{
"epoch": 0.049,
"grad_norm": 0.02804626155591942,
"learning_rate": 4.898e-06,
"loss": 2.4334,
"step": 2450
},
{
"epoch": 0.049,
"eval_loss": 2.4414188861846924,
"eval_runtime": 33.2989,
"eval_samples_per_second": 3.514,
"eval_steps_per_second": 1.772,
"step": 2450
},
{
"epoch": 0.0491,
"eval_loss": 2.4416472911834717,
"eval_runtime": 33.3676,
"eval_samples_per_second": 3.506,
"eval_steps_per_second": 1.768,
"step": 2455
},
{
"epoch": 0.0492,
"eval_loss": 2.4414844512939453,
"eval_runtime": 33.4116,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 2460
},
{
"epoch": 0.0493,
"eval_loss": 2.441408395767212,
"eval_runtime": 33.6104,
"eval_samples_per_second": 3.481,
"eval_steps_per_second": 1.755,
"step": 2465
},
{
"epoch": 0.0494,
"eval_loss": 2.4413650035858154,
"eval_runtime": 33.3838,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.767,
"step": 2470
},
{
"epoch": 0.0495,
"grad_norm": 0.025351866385684634,
"learning_rate": 4.948000000000001e-06,
"loss": 2.4356,
"step": 2475
},
{
"epoch": 0.0495,
"eval_loss": 2.4411768913269043,
"eval_runtime": 33.3857,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 2475
},
{
"epoch": 0.0496,
"eval_loss": 2.441201686859131,
"eval_runtime": 33.4117,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 2480
},
{
"epoch": 0.0497,
"eval_loss": 2.4408698081970215,
"eval_runtime": 33.3015,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.772,
"step": 2485
},
{
"epoch": 0.0498,
"eval_loss": 2.440950393676758,
"eval_runtime": 33.379,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.768,
"step": 2490
},
{
"epoch": 0.0499,
"eval_loss": 2.4407267570495605,
"eval_runtime": 33.2561,
"eval_samples_per_second": 3.518,
"eval_steps_per_second": 1.774,
"step": 2495
},
{
"epoch": 0.05,
"grad_norm": 0.029743600833546286,
"learning_rate": 4.998e-06,
"loss": 2.4369,
"step": 2500
},
{
"epoch": 0.05,
"eval_loss": 2.4408068656921387,
"eval_runtime": 33.3807,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.767,
"step": 2500
},
{
"epoch": 0.0501,
"eval_loss": 2.4407401084899902,
"eval_runtime": 33.2295,
"eval_samples_per_second": 3.521,
"eval_steps_per_second": 1.776,
"step": 2505
},
{
"epoch": 0.0502,
"eval_loss": 2.4409286975860596,
"eval_runtime": 33.3925,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 2510
},
{
"epoch": 0.0503,
"eval_loss": 2.4407782554626465,
"eval_runtime": 33.4498,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2515
},
{
"epoch": 0.0504,
"eval_loss": 2.4407856464385986,
"eval_runtime": 33.4899,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 2520
},
{
"epoch": 0.0505,
"grad_norm": 0.027292319342276494,
"learning_rate": 5.048000000000001e-06,
"loss": 2.4263,
"step": 2525
},
{
"epoch": 0.0505,
"eval_loss": 2.440830945968628,
"eval_runtime": 33.3428,
"eval_samples_per_second": 3.509,
"eval_steps_per_second": 1.769,
"step": 2525
},
{
"epoch": 0.0506,
"eval_loss": 2.44069504737854,
"eval_runtime": 33.2895,
"eval_samples_per_second": 3.515,
"eval_steps_per_second": 1.772,
"step": 2530
},
{
"epoch": 0.0507,
"eval_loss": 2.4408159255981445,
"eval_runtime": 33.3488,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 2535
},
{
"epoch": 0.0508,
"eval_loss": 2.440523386001587,
"eval_runtime": 33.3582,
"eval_samples_per_second": 3.507,
"eval_steps_per_second": 1.769,
"step": 2540
},
{
"epoch": 0.0509,
"eval_loss": 2.4403724670410156,
"eval_runtime": 33.5287,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2545
},
{
"epoch": 0.051,
"grad_norm": 0.02495087994166461,
"learning_rate": 5.098000000000001e-06,
"loss": 2.428,
"step": 2550
},
{
"epoch": 0.051,
"eval_loss": 2.440495252609253,
"eval_runtime": 34.4575,
"eval_samples_per_second": 3.395,
"eval_steps_per_second": 1.712,
"step": 2550
},
{
"epoch": 0.0511,
"eval_loss": 2.440384864807129,
"eval_runtime": 34.0144,
"eval_samples_per_second": 3.44,
"eval_steps_per_second": 1.735,
"step": 2555
},
{
"epoch": 0.0512,
"eval_loss": 2.4405176639556885,
"eval_runtime": 34.5852,
"eval_samples_per_second": 3.383,
"eval_steps_per_second": 1.706,
"step": 2560
},
{
"epoch": 0.0513,
"eval_loss": 2.4402472972869873,
"eval_runtime": 34.2689,
"eval_samples_per_second": 3.414,
"eval_steps_per_second": 1.722,
"step": 2565
},
{
"epoch": 0.0514,
"eval_loss": 2.440459966659546,
"eval_runtime": 33.3821,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.767,
"step": 2570
},
{
"epoch": 0.0515,
"grad_norm": 0.029728034222700407,
"learning_rate": 5.1480000000000005e-06,
"loss": 2.439,
"step": 2575
},
{
"epoch": 0.0515,
"eval_loss": 2.440525531768799,
"eval_runtime": 34.3072,
"eval_samples_per_second": 3.41,
"eval_steps_per_second": 1.72,
"step": 2575
},
{
"epoch": 0.0516,
"eval_loss": 2.440373420715332,
"eval_runtime": 33.5748,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 2580
},
{
"epoch": 0.0517,
"eval_loss": 2.4405770301818848,
"eval_runtime": 35.2655,
"eval_samples_per_second": 3.318,
"eval_steps_per_second": 1.673,
"step": 2585
},
{
"epoch": 0.0518,
"eval_loss": 2.4402198791503906,
"eval_runtime": 34.9918,
"eval_samples_per_second": 3.344,
"eval_steps_per_second": 1.686,
"step": 2590
},
{
"epoch": 0.0519,
"eval_loss": 2.440136194229126,
"eval_runtime": 33.4873,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 2595
},
{
"epoch": 0.052,
"grad_norm": 0.02473354917836018,
"learning_rate": 5.198000000000001e-06,
"loss": 2.427,
"step": 2600
},
{
"epoch": 0.052,
"eval_loss": 2.440282106399536,
"eval_runtime": 33.4628,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2600
},
{
"epoch": 0.0521,
"eval_loss": 2.440448045730591,
"eval_runtime": 33.4191,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 2605
},
{
"epoch": 0.0522,
"eval_loss": 2.440248966217041,
"eval_runtime": 33.4911,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.762,
"step": 2610
},
{
"epoch": 0.0523,
"eval_loss": 2.440030336380005,
"eval_runtime": 33.4921,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.762,
"step": 2615
},
{
"epoch": 0.0524,
"eval_loss": 2.4397685527801514,
"eval_runtime": 33.4491,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2620
},
{
"epoch": 0.0525,
"grad_norm": 0.026533778128592735,
"learning_rate": 5.248000000000001e-06,
"loss": 2.4214,
"step": 2625
},
{
"epoch": 0.0525,
"eval_loss": 2.43971848487854,
"eval_runtime": 33.3975,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.767,
"step": 2625
},
{
"epoch": 0.0526,
"eval_loss": 2.4398951530456543,
"eval_runtime": 33.4912,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.762,
"step": 2630
},
{
"epoch": 0.0527,
"eval_loss": 2.43975830078125,
"eval_runtime": 33.4071,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 2635
},
{
"epoch": 0.0528,
"eval_loss": 2.439666271209717,
"eval_runtime": 33.4208,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.765,
"step": 2640
},
{
"epoch": 0.0529,
"eval_loss": 2.439816951751709,
"eval_runtime": 33.5111,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 2645
},
{
"epoch": 0.053,
"grad_norm": 0.024723120971366967,
"learning_rate": 5.298000000000001e-06,
"loss": 2.4241,
"step": 2650
},
{
"epoch": 0.053,
"eval_loss": 2.4398183822631836,
"eval_runtime": 33.506,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 2650
},
{
"epoch": 0.0531,
"eval_loss": 2.4402668476104736,
"eval_runtime": 34.1298,
"eval_samples_per_second": 3.428,
"eval_steps_per_second": 1.729,
"step": 2655
},
{
"epoch": 0.0532,
"eval_loss": 2.4400885105133057,
"eval_runtime": 33.436,
"eval_samples_per_second": 3.499,
"eval_steps_per_second": 1.765,
"step": 2660
},
{
"epoch": 0.0533,
"eval_loss": 2.439871311187744,
"eval_runtime": 33.3874,
"eval_samples_per_second": 3.504,
"eval_steps_per_second": 1.767,
"step": 2665
},
{
"epoch": 0.0534,
"eval_loss": 2.4393365383148193,
"eval_runtime": 33.5258,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2670
},
{
"epoch": 0.0535,
"grad_norm": 0.02173239513971497,
"learning_rate": 5.348000000000001e-06,
"loss": 2.4295,
"step": 2675
},
{
"epoch": 0.0535,
"eval_loss": 2.439133405685425,
"eval_runtime": 33.4962,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.761,
"step": 2675
},
{
"epoch": 0.0536,
"eval_loss": 2.439093589782715,
"eval_runtime": 33.4708,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2680
},
{
"epoch": 0.0537,
"eval_loss": 2.439096212387085,
"eval_runtime": 33.4284,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 2685
},
{
"epoch": 0.0538,
"eval_loss": 2.4389584064483643,
"eval_runtime": 33.4749,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.763,
"step": 2690
},
{
"epoch": 0.0539,
"eval_loss": 2.438805103302002,
"eval_runtime": 33.478,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.762,
"step": 2695
},
{
"epoch": 0.054,
"grad_norm": 0.023851331909406925,
"learning_rate": 5.398e-06,
"loss": 2.4302,
"step": 2700
},
{
"epoch": 0.054,
"eval_loss": 2.4386403560638428,
"eval_runtime": 33.4276,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 2700
},
{
"epoch": 0.0541,
"eval_loss": 2.438568115234375,
"eval_runtime": 33.528,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2705
},
{
"epoch": 0.0542,
"eval_loss": 2.438894510269165,
"eval_runtime": 33.5228,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2710
},
{
"epoch": 0.0543,
"eval_loss": 2.4387168884277344,
"eval_runtime": 33.4663,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2715
},
{
"epoch": 0.0544,
"eval_loss": 2.4385879039764404,
"eval_runtime": 33.513,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 2720
},
{
"epoch": 0.0545,
"grad_norm": 0.02728082451264937,
"learning_rate": 5.448e-06,
"loss": 2.4308,
"step": 2725
},
{
"epoch": 0.0545,
"eval_loss": 2.4388349056243896,
"eval_runtime": 33.4525,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.764,
"step": 2725
},
{
"epoch": 0.0546,
"eval_loss": 2.438887357711792,
"eval_runtime": 33.428,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 2730
},
{
"epoch": 0.0547,
"eval_loss": 2.438713312149048,
"eval_runtime": 33.5229,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2735
},
{
"epoch": 0.0548,
"eval_loss": 2.438657283782959,
"eval_runtime": 33.4169,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.766,
"step": 2740
},
{
"epoch": 0.0549,
"eval_loss": 2.438544988632202,
"eval_runtime": 33.4944,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.761,
"step": 2745
},
{
"epoch": 0.055,
"grad_norm": 0.025461121075693184,
"learning_rate": 5.498e-06,
"loss": 2.4379,
"step": 2750
},
{
"epoch": 0.055,
"eval_loss": 2.4386098384857178,
"eval_runtime": 33.6782,
"eval_samples_per_second": 3.474,
"eval_steps_per_second": 1.752,
"step": 2750
},
{
"epoch": 0.0551,
"eval_loss": 2.438521146774292,
"eval_runtime": 33.5161,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 2755
},
{
"epoch": 0.0552,
"eval_loss": 2.438474178314209,
"eval_runtime": 33.4773,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.762,
"step": 2760
},
{
"epoch": 0.0553,
"eval_loss": 2.4382379055023193,
"eval_runtime": 33.4869,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 2765
},
{
"epoch": 0.0554,
"eval_loss": 2.438157796859741,
"eval_runtime": 33.543,
"eval_samples_per_second": 3.488,
"eval_steps_per_second": 1.759,
"step": 2770
},
{
"epoch": 0.0555,
"grad_norm": 0.0234055445054481,
"learning_rate": 5.548e-06,
"loss": 2.4326,
"step": 2775
},
{
"epoch": 0.0555,
"eval_loss": 2.438048839569092,
"eval_runtime": 33.5073,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 2775
},
{
"epoch": 0.0556,
"eval_loss": 2.4379706382751465,
"eval_runtime": 33.4567,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.763,
"step": 2780
},
{
"epoch": 0.0557,
"eval_loss": 2.4379332065582275,
"eval_runtime": 33.5172,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 2785
},
{
"epoch": 0.0558,
"eval_loss": 2.4380111694335938,
"eval_runtime": 33.5913,
"eval_samples_per_second": 3.483,
"eval_steps_per_second": 1.756,
"step": 2790
},
{
"epoch": 0.0559,
"eval_loss": 2.4379403591156006,
"eval_runtime": 33.5223,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2795
},
{
"epoch": 0.056,
"grad_norm": 0.024691045411267393,
"learning_rate": 5.5980000000000004e-06,
"loss": 2.4297,
"step": 2800
},
{
"epoch": 0.056,
"eval_loss": 2.43778657913208,
"eval_runtime": 33.524,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2800
},
{
"epoch": 0.0561,
"eval_loss": 2.4376559257507324,
"eval_runtime": 33.58,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 2805
},
{
"epoch": 0.0562,
"eval_loss": 2.437596559524536,
"eval_runtime": 33.5756,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 2810
},
{
"epoch": 0.0563,
"eval_loss": 2.437690496444702,
"eval_runtime": 33.5056,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 2815
},
{
"epoch": 0.0564,
"eval_loss": 2.437558174133301,
"eval_runtime": 33.4948,
"eval_samples_per_second": 3.493,
"eval_steps_per_second": 1.761,
"step": 2820
},
{
"epoch": 0.0565,
"grad_norm": 0.02500330428035899,
"learning_rate": 5.648e-06,
"loss": 2.4281,
"step": 2825
},
{
"epoch": 0.0565,
"eval_loss": 2.437875747680664,
"eval_runtime": 33.4492,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2825
},
{
"epoch": 0.0566,
"eval_loss": 2.438183546066284,
"eval_runtime": 33.5208,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2830
},
{
"epoch": 0.0567,
"eval_loss": 2.4375228881835938,
"eval_runtime": 33.5319,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.76,
"step": 2835
},
{
"epoch": 0.0568,
"eval_loss": 2.437365770339966,
"eval_runtime": 33.4734,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.763,
"step": 2840
},
{
"epoch": 0.0569,
"eval_loss": 2.4376399517059326,
"eval_runtime": 33.4578,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.763,
"step": 2845
},
{
"epoch": 0.057,
"grad_norm": 0.023953363978697285,
"learning_rate": 5.698e-06,
"loss": 2.4341,
"step": 2850
},
{
"epoch": 0.057,
"eval_loss": 2.437318801879883,
"eval_runtime": 33.4551,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.764,
"step": 2850
},
{
"epoch": 0.0571,
"eval_loss": 2.437349319458008,
"eval_runtime": 33.4482,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 2855
},
{
"epoch": 0.0572,
"eval_loss": 2.437500476837158,
"eval_runtime": 33.5179,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.76,
"step": 2860
},
{
"epoch": 0.0573,
"eval_loss": 2.4371414184570312,
"eval_runtime": 33.4246,
"eval_samples_per_second": 3.5,
"eval_steps_per_second": 1.765,
"step": 2865
},
{
"epoch": 0.0574,
"eval_loss": 2.4371588230133057,
"eval_runtime": 33.5686,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.758,
"step": 2870
},
{
"epoch": 0.0575,
"grad_norm": 0.023037224733864405,
"learning_rate": 5.748e-06,
"loss": 2.4201,
"step": 2875
},
{
"epoch": 0.0575,
"eval_loss": 2.4373178482055664,
"eval_runtime": 33.4813,
"eval_samples_per_second": 3.494,
"eval_steps_per_second": 1.762,
"step": 2875
},
{
"epoch": 0.0576,
"eval_loss": 2.4371204376220703,
"eval_runtime": 33.5096,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 2880
},
{
"epoch": 0.0577,
"eval_loss": 2.43719482421875,
"eval_runtime": 33.4709,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2885
},
{
"epoch": 0.0578,
"eval_loss": 2.4369635581970215,
"eval_runtime": 33.5125,
"eval_samples_per_second": 3.491,
"eval_steps_per_second": 1.761,
"step": 2890
},
{
"epoch": 0.0579,
"eval_loss": 2.4367122650146484,
"eval_runtime": 33.5349,
"eval_samples_per_second": 3.489,
"eval_steps_per_second": 1.759,
"step": 2895
},
{
"epoch": 0.058,
"grad_norm": 0.023843041578218274,
"learning_rate": 5.798e-06,
"loss": 2.4322,
"step": 2900
},
{
"epoch": 0.058,
"eval_loss": 2.436885118484497,
"eval_runtime": 33.5038,
"eval_samples_per_second": 3.492,
"eval_steps_per_second": 1.761,
"step": 2900
},
{
"epoch": 0.0581,
"eval_loss": 2.4368388652801514,
"eval_runtime": 33.4337,
"eval_samples_per_second": 3.499,
"eval_steps_per_second": 1.765,
"step": 2905
},
{
"epoch": 0.0582,
"eval_loss": 2.436776638031006,
"eval_runtime": 33.5783,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 2910
},
{
"epoch": 0.0583,
"eval_loss": 2.4369046688079834,
"eval_runtime": 33.5764,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 2915
},
{
"epoch": 0.0584,
"eval_loss": 2.4369351863861084,
"eval_runtime": 33.5715,
"eval_samples_per_second": 3.485,
"eval_steps_per_second": 1.757,
"step": 2920
},
{
"epoch": 0.0585,
"grad_norm": 0.030212978437899864,
"learning_rate": 5.848000000000001e-06,
"loss": 2.4318,
"step": 2925
},
{
"epoch": 0.0585,
"eval_loss": 2.4367170333862305,
"eval_runtime": 33.455,
"eval_samples_per_second": 3.497,
"eval_steps_per_second": 1.764,
"step": 2925
},
{
"epoch": 0.0586,
"eval_loss": 2.4367101192474365,
"eval_runtime": 33.3973,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.767,
"step": 2930
},
{
"epoch": 0.0587,
"eval_loss": 2.436723470687866,
"eval_runtime": 33.4183,
"eval_samples_per_second": 3.501,
"eval_steps_per_second": 1.766,
"step": 2935
},
{
"epoch": 0.0588,
"eval_loss": 2.4368371963500977,
"eval_runtime": 33.5269,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2940
},
{
"epoch": 0.0589,
"eval_loss": 2.436763286590576,
"eval_runtime": 33.4623,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2945
},
{
"epoch": 0.059,
"grad_norm": 0.024293450378328845,
"learning_rate": 5.898e-06,
"loss": 2.4221,
"step": 2950
},
{
"epoch": 0.059,
"eval_loss": 2.436692714691162,
"eval_runtime": 33.523,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 1.76,
"step": 2950
},
{
"epoch": 0.0591,
"eval_loss": 2.436657667160034,
"eval_runtime": 34.902,
"eval_samples_per_second": 3.352,
"eval_steps_per_second": 1.69,
"step": 2955
},
{
"epoch": 0.0592,
"eval_loss": 2.436432123184204,
"eval_runtime": 33.4808,
"eval_samples_per_second": 3.495,
"eval_steps_per_second": 1.762,
"step": 2960
},
{
"epoch": 0.0593,
"eval_loss": 2.436782121658325,
"eval_runtime": 34.5166,
"eval_samples_per_second": 3.39,
"eval_steps_per_second": 1.709,
"step": 2965
},
{
"epoch": 0.0594,
"eval_loss": 2.4366602897644043,
"eval_runtime": 33.7416,
"eval_samples_per_second": 3.468,
"eval_steps_per_second": 1.749,
"step": 2970
},
{
"epoch": 0.0595,
"grad_norm": 0.028294127858427973,
"learning_rate": 5.9480000000000005e-06,
"loss": 2.4196,
"step": 2975
},
{
"epoch": 0.0595,
"eval_loss": 2.436668872833252,
"eval_runtime": 35.1904,
"eval_samples_per_second": 3.325,
"eval_steps_per_second": 1.677,
"step": 2975
},
{
"epoch": 0.0596,
"eval_loss": 2.436310052871704,
"eval_runtime": 33.583,
"eval_samples_per_second": 3.484,
"eval_steps_per_second": 1.757,
"step": 2980
},
{
"epoch": 0.0597,
"eval_loss": 2.4361066818237305,
"eval_runtime": 34.1148,
"eval_samples_per_second": 3.43,
"eval_steps_per_second": 1.729,
"step": 2985
},
{
"epoch": 0.0598,
"eval_loss": 2.436128854751587,
"eval_runtime": 33.7895,
"eval_samples_per_second": 3.463,
"eval_steps_per_second": 1.746,
"step": 2990
},
{
"epoch": 0.0599,
"eval_loss": 2.436457872390747,
"eval_runtime": 34.0525,
"eval_samples_per_second": 3.436,
"eval_steps_per_second": 1.733,
"step": 2995
},
{
"epoch": 0.06,
"grad_norm": 0.02242795270420928,
"learning_rate": 5.998000000000001e-06,
"loss": 2.4245,
"step": 3000
},
{
"epoch": 0.06,
"eval_loss": 2.436203718185425,
"eval_runtime": 33.6471,
"eval_samples_per_second": 3.477,
"eval_steps_per_second": 1.753,
"step": 3000
}
],
"logging_steps": 25,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.355905264309764e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}