tinyllama-salience / checkpoint-1000 /trainer_state.json
thebnbrkr's picture
Upload folder using huggingface_hub
45bdf15 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.881844380403458,
"eval_steps": 10,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01440922190201729,
"grad_norm": 0.888121485710144,
"learning_rate": 0.0002988472622478386,
"loss": 2.4115,
"step": 5
},
{
"epoch": 0.02881844380403458,
"grad_norm": 0.8970298767089844,
"learning_rate": 0.00029740634005763684,
"loss": 2.23,
"step": 10
},
{
"epoch": 0.02881844380403458,
"eval_loss": 2.165903091430664,
"eval_runtime": 1.8407,
"eval_samples_per_second": 84.207,
"eval_steps_per_second": 10.865,
"step": 10
},
{
"epoch": 0.043227665706051875,
"grad_norm": 1.078068494796753,
"learning_rate": 0.00029596541786743513,
"loss": 2.0155,
"step": 15
},
{
"epoch": 0.05763688760806916,
"grad_norm": 1.2069385051727295,
"learning_rate": 0.0002945244956772334,
"loss": 1.9346,
"step": 20
},
{
"epoch": 0.05763688760806916,
"eval_loss": 1.7943660020828247,
"eval_runtime": 1.7809,
"eval_samples_per_second": 87.035,
"eval_steps_per_second": 11.23,
"step": 20
},
{
"epoch": 0.07204610951008646,
"grad_norm": 1.3408102989196777,
"learning_rate": 0.0002930835734870317,
"loss": 1.7434,
"step": 25
},
{
"epoch": 0.08645533141210375,
"grad_norm": 1.4849472045898438,
"learning_rate": 0.00029164265129682994,
"loss": 1.5409,
"step": 30
},
{
"epoch": 0.08645533141210375,
"eval_loss": 1.5225533246994019,
"eval_runtime": 1.7657,
"eval_samples_per_second": 87.784,
"eval_steps_per_second": 11.327,
"step": 30
},
{
"epoch": 0.10086455331412104,
"grad_norm": 2.2489142417907715,
"learning_rate": 0.0002902017291066282,
"loss": 1.4702,
"step": 35
},
{
"epoch": 0.11527377521613832,
"grad_norm": 2.169492244720459,
"learning_rate": 0.00028876080691642647,
"loss": 1.398,
"step": 40
},
{
"epoch": 0.11527377521613832,
"eval_loss": 1.3084412813186646,
"eval_runtime": 1.7715,
"eval_samples_per_second": 87.495,
"eval_steps_per_second": 11.29,
"step": 40
},
{
"epoch": 0.12968299711815562,
"grad_norm": 2.665741205215454,
"learning_rate": 0.00028731988472622475,
"loss": 1.2839,
"step": 45
},
{
"epoch": 0.1440922190201729,
"grad_norm": 2.9897077083587646,
"learning_rate": 0.00028587896253602304,
"loss": 1.0893,
"step": 50
},
{
"epoch": 0.1440922190201729,
"eval_loss": 1.0865856409072876,
"eval_runtime": 1.7767,
"eval_samples_per_second": 87.241,
"eval_steps_per_second": 11.257,
"step": 50
},
{
"epoch": 0.1585014409221902,
"grad_norm": 2.3380894660949707,
"learning_rate": 0.0002844380403458213,
"loss": 1.0454,
"step": 55
},
{
"epoch": 0.1729106628242075,
"grad_norm": 2.571589469909668,
"learning_rate": 0.00028299711815561957,
"loss": 0.8605,
"step": 60
},
{
"epoch": 0.1729106628242075,
"eval_loss": 0.8586989641189575,
"eval_runtime": 1.7853,
"eval_samples_per_second": 86.822,
"eval_steps_per_second": 11.203,
"step": 60
},
{
"epoch": 0.1873198847262248,
"grad_norm": 3.0408358573913574,
"learning_rate": 0.00028155619596541786,
"loss": 0.8101,
"step": 65
},
{
"epoch": 0.2017291066282421,
"grad_norm": 2.4773292541503906,
"learning_rate": 0.0002801152737752161,
"loss": 0.7769,
"step": 70
},
{
"epoch": 0.2017291066282421,
"eval_loss": 0.6477732062339783,
"eval_runtime": 1.781,
"eval_samples_per_second": 87.029,
"eval_steps_per_second": 11.23,
"step": 70
},
{
"epoch": 0.21613832853025935,
"grad_norm": 2.5505621433258057,
"learning_rate": 0.0002786743515850144,
"loss": 0.6286,
"step": 75
},
{
"epoch": 0.23054755043227665,
"grad_norm": 2.5216686725616455,
"learning_rate": 0.00027723342939481267,
"loss": 0.5306,
"step": 80
},
{
"epoch": 0.23054755043227665,
"eval_loss": 0.4904349446296692,
"eval_runtime": 1.7712,
"eval_samples_per_second": 87.51,
"eval_steps_per_second": 11.292,
"step": 80
},
{
"epoch": 0.24495677233429394,
"grad_norm": 3.0844411849975586,
"learning_rate": 0.00027579250720461096,
"loss": 0.5331,
"step": 85
},
{
"epoch": 0.25936599423631124,
"grad_norm": 1.8952299356460571,
"learning_rate": 0.0002743515850144092,
"loss": 0.4093,
"step": 90
},
{
"epoch": 0.25936599423631124,
"eval_loss": 0.40096166729927063,
"eval_runtime": 1.773,
"eval_samples_per_second": 87.422,
"eval_steps_per_second": 11.28,
"step": 90
},
{
"epoch": 0.2737752161383285,
"grad_norm": 3.3445639610290527,
"learning_rate": 0.0002729106628242075,
"loss": 0.3654,
"step": 95
},
{
"epoch": 0.2881844380403458,
"grad_norm": 1.9506555795669556,
"learning_rate": 0.0002714697406340057,
"loss": 0.3458,
"step": 100
},
{
"epoch": 0.2881844380403458,
"eval_loss": 0.32525885105133057,
"eval_runtime": 1.7918,
"eval_samples_per_second": 86.503,
"eval_steps_per_second": 11.162,
"step": 100
},
{
"epoch": 0.3025936599423631,
"grad_norm": 1.9951375722885132,
"learning_rate": 0.000270028818443804,
"loss": 0.2672,
"step": 105
},
{
"epoch": 0.3170028818443804,
"grad_norm": 2.8618788719177246,
"learning_rate": 0.0002685878962536023,
"loss": 0.3316,
"step": 110
},
{
"epoch": 0.3170028818443804,
"eval_loss": 0.29092785716056824,
"eval_runtime": 1.7704,
"eval_samples_per_second": 87.549,
"eval_steps_per_second": 11.297,
"step": 110
},
{
"epoch": 0.3314121037463977,
"grad_norm": 2.436544179916382,
"learning_rate": 0.00026714697406340053,
"loss": 0.3176,
"step": 115
},
{
"epoch": 0.345821325648415,
"grad_norm": 1.1800215244293213,
"learning_rate": 0.0002657060518731988,
"loss": 0.2378,
"step": 120
},
{
"epoch": 0.345821325648415,
"eval_loss": 0.25983747839927673,
"eval_runtime": 1.7719,
"eval_samples_per_second": 87.476,
"eval_steps_per_second": 11.287,
"step": 120
},
{
"epoch": 0.36023054755043227,
"grad_norm": 1.0937371253967285,
"learning_rate": 0.0002642651296829971,
"loss": 0.2617,
"step": 125
},
{
"epoch": 0.3746397694524496,
"grad_norm": 1.5132169723510742,
"learning_rate": 0.0002628242074927954,
"loss": 0.2669,
"step": 130
},
{
"epoch": 0.3746397694524496,
"eval_loss": 0.2434806078672409,
"eval_runtime": 1.7851,
"eval_samples_per_second": 86.831,
"eval_steps_per_second": 11.204,
"step": 130
},
{
"epoch": 0.38904899135446686,
"grad_norm": 1.4011964797973633,
"learning_rate": 0.00026138328530259363,
"loss": 0.2684,
"step": 135
},
{
"epoch": 0.4034582132564842,
"grad_norm": 1.3246668577194214,
"learning_rate": 0.0002599423631123919,
"loss": 0.2377,
"step": 140
},
{
"epoch": 0.4034582132564842,
"eval_loss": 0.2348952293395996,
"eval_runtime": 1.7807,
"eval_samples_per_second": 87.042,
"eval_steps_per_second": 11.231,
"step": 140
},
{
"epoch": 0.41786743515850144,
"grad_norm": 2.753978967666626,
"learning_rate": 0.0002585014409221902,
"loss": 0.2716,
"step": 145
},
{
"epoch": 0.4322766570605187,
"grad_norm": 0.8502065539360046,
"learning_rate": 0.00025706051873198844,
"loss": 0.2282,
"step": 150
},
{
"epoch": 0.4322766570605187,
"eval_loss": 0.23007912933826447,
"eval_runtime": 1.766,
"eval_samples_per_second": 87.771,
"eval_steps_per_second": 11.325,
"step": 150
},
{
"epoch": 0.44668587896253603,
"grad_norm": 1.0703223943710327,
"learning_rate": 0.00025561959654178673,
"loss": 0.2374,
"step": 155
},
{
"epoch": 0.4610951008645533,
"grad_norm": 0.7980679869651794,
"learning_rate": 0.00025417867435158497,
"loss": 0.2151,
"step": 160
},
{
"epoch": 0.4610951008645533,
"eval_loss": 0.23066848516464233,
"eval_runtime": 1.7821,
"eval_samples_per_second": 86.978,
"eval_steps_per_second": 11.223,
"step": 160
},
{
"epoch": 0.4755043227665706,
"grad_norm": 1.1535905599594116,
"learning_rate": 0.00025273775216138326,
"loss": 0.2086,
"step": 165
},
{
"epoch": 0.4899135446685879,
"grad_norm": 0.9484102725982666,
"learning_rate": 0.00025129682997118155,
"loss": 0.2127,
"step": 170
},
{
"epoch": 0.4899135446685879,
"eval_loss": 0.2209121137857437,
"eval_runtime": 1.7727,
"eval_samples_per_second": 87.439,
"eval_steps_per_second": 11.283,
"step": 170
},
{
"epoch": 0.5043227665706052,
"grad_norm": 0.9559063911437988,
"learning_rate": 0.0002498559077809798,
"loss": 0.2134,
"step": 175
},
{
"epoch": 0.5187319884726225,
"grad_norm": 0.66960209608078,
"learning_rate": 0.00024841498559077807,
"loss": 0.2555,
"step": 180
},
{
"epoch": 0.5187319884726225,
"eval_loss": 0.21855449676513672,
"eval_runtime": 1.7887,
"eval_samples_per_second": 86.654,
"eval_steps_per_second": 11.181,
"step": 180
},
{
"epoch": 0.5331412103746398,
"grad_norm": 0.6968249082565308,
"learning_rate": 0.00024697406340057636,
"loss": 0.2199,
"step": 185
},
{
"epoch": 0.547550432276657,
"grad_norm": 0.6100601553916931,
"learning_rate": 0.00024553314121037465,
"loss": 0.2348,
"step": 190
},
{
"epoch": 0.547550432276657,
"eval_loss": 0.21677546203136444,
"eval_runtime": 1.7784,
"eval_samples_per_second": 87.156,
"eval_steps_per_second": 11.246,
"step": 190
},
{
"epoch": 0.5619596541786743,
"grad_norm": 0.6942987442016602,
"learning_rate": 0.00024409221902017288,
"loss": 0.2165,
"step": 195
},
{
"epoch": 0.5763688760806917,
"grad_norm": 0.8180645704269409,
"learning_rate": 0.00024265129682997117,
"loss": 0.2141,
"step": 200
},
{
"epoch": 0.5763688760806917,
"eval_loss": 0.21652507781982422,
"eval_runtime": 1.7751,
"eval_samples_per_second": 87.319,
"eval_steps_per_second": 11.267,
"step": 200
},
{
"epoch": 0.590778097982709,
"grad_norm": 0.6298684477806091,
"learning_rate": 0.00024121037463976943,
"loss": 0.2261,
"step": 205
},
{
"epoch": 0.6051873198847262,
"grad_norm": 2.6468467712402344,
"learning_rate": 0.0002397694524495677,
"loss": 0.245,
"step": 210
},
{
"epoch": 0.6051873198847262,
"eval_loss": 0.2188318818807602,
"eval_runtime": 1.7737,
"eval_samples_per_second": 87.389,
"eval_steps_per_second": 11.276,
"step": 210
},
{
"epoch": 0.6195965417867435,
"grad_norm": 0.6621644496917725,
"learning_rate": 0.00023832853025936598,
"loss": 0.1866,
"step": 215
},
{
"epoch": 0.6340057636887608,
"grad_norm": 0.5494632124900818,
"learning_rate": 0.00023688760806916425,
"loss": 0.2128,
"step": 220
},
{
"epoch": 0.6340057636887608,
"eval_loss": 0.21284270286560059,
"eval_runtime": 1.787,
"eval_samples_per_second": 86.739,
"eval_steps_per_second": 11.192,
"step": 220
},
{
"epoch": 0.6484149855907781,
"grad_norm": 0.5896772146224976,
"learning_rate": 0.00023544668587896253,
"loss": 0.2266,
"step": 225
},
{
"epoch": 0.6628242074927954,
"grad_norm": 0.45764079689979553,
"learning_rate": 0.0002340057636887608,
"loss": 0.2393,
"step": 230
},
{
"epoch": 0.6628242074927954,
"eval_loss": 0.21253199875354767,
"eval_runtime": 1.7769,
"eval_samples_per_second": 87.232,
"eval_steps_per_second": 11.256,
"step": 230
},
{
"epoch": 0.6772334293948127,
"grad_norm": 0.699612557888031,
"learning_rate": 0.00023256484149855909,
"loss": 0.2517,
"step": 235
},
{
"epoch": 0.69164265129683,
"grad_norm": 0.42438164353370667,
"learning_rate": 0.00023112391930835732,
"loss": 0.2016,
"step": 240
},
{
"epoch": 0.69164265129683,
"eval_loss": 0.21218827366828918,
"eval_runtime": 1.7865,
"eval_samples_per_second": 86.762,
"eval_steps_per_second": 11.195,
"step": 240
},
{
"epoch": 0.7060518731988472,
"grad_norm": 0.5191032886505127,
"learning_rate": 0.00022968299711815558,
"loss": 0.2218,
"step": 245
},
{
"epoch": 0.7204610951008645,
"grad_norm": 0.5536476969718933,
"learning_rate": 0.00022824207492795387,
"loss": 0.2145,
"step": 250
},
{
"epoch": 0.7204610951008645,
"eval_loss": 0.2105206400156021,
"eval_runtime": 1.8096,
"eval_samples_per_second": 85.654,
"eval_steps_per_second": 11.052,
"step": 250
},
{
"epoch": 0.7348703170028819,
"grad_norm": 0.5729750394821167,
"learning_rate": 0.00022680115273775213,
"loss": 0.2356,
"step": 255
},
{
"epoch": 0.7492795389048992,
"grad_norm": 0.442891389131546,
"learning_rate": 0.00022536023054755042,
"loss": 0.203,
"step": 260
},
{
"epoch": 0.7492795389048992,
"eval_loss": 0.20932228863239288,
"eval_runtime": 1.77,
"eval_samples_per_second": 87.57,
"eval_steps_per_second": 11.299,
"step": 260
},
{
"epoch": 0.7636887608069164,
"grad_norm": 0.556273341178894,
"learning_rate": 0.00022391930835734868,
"loss": 0.2092,
"step": 265
},
{
"epoch": 0.7780979827089337,
"grad_norm": 0.461923211812973,
"learning_rate": 0.00022247838616714695,
"loss": 0.2116,
"step": 270
},
{
"epoch": 0.7780979827089337,
"eval_loss": 0.2080857753753662,
"eval_runtime": 1.7792,
"eval_samples_per_second": 87.12,
"eval_steps_per_second": 11.241,
"step": 270
},
{
"epoch": 0.792507204610951,
"grad_norm": 0.5841118693351746,
"learning_rate": 0.00022103746397694523,
"loss": 0.2244,
"step": 275
},
{
"epoch": 0.8069164265129684,
"grad_norm": 0.5412226319313049,
"learning_rate": 0.0002195965417867435,
"loss": 0.1943,
"step": 280
},
{
"epoch": 0.8069164265129684,
"eval_loss": 0.208485409617424,
"eval_runtime": 1.7916,
"eval_samples_per_second": 86.515,
"eval_steps_per_second": 11.163,
"step": 280
},
{
"epoch": 0.8213256484149856,
"grad_norm": 0.5236246585845947,
"learning_rate": 0.00021815561959654179,
"loss": 0.2243,
"step": 285
},
{
"epoch": 0.8357348703170029,
"grad_norm": 0.48271429538726807,
"learning_rate": 0.00021671469740634002,
"loss": 0.2123,
"step": 290
},
{
"epoch": 0.8357348703170029,
"eval_loss": 0.20807716250419617,
"eval_runtime": 1.7753,
"eval_samples_per_second": 87.308,
"eval_steps_per_second": 11.266,
"step": 290
},
{
"epoch": 0.8501440922190202,
"grad_norm": 0.4914911985397339,
"learning_rate": 0.0002152737752161383,
"loss": 0.2252,
"step": 295
},
{
"epoch": 0.8645533141210374,
"grad_norm": 0.46419402956962585,
"learning_rate": 0.00021383285302593657,
"loss": 0.1999,
"step": 300
},
{
"epoch": 0.8645533141210374,
"eval_loss": 0.20786339044570923,
"eval_runtime": 1.7845,
"eval_samples_per_second": 86.858,
"eval_steps_per_second": 11.208,
"step": 300
},
{
"epoch": 0.8789625360230547,
"grad_norm": 0.540306568145752,
"learning_rate": 0.00021239193083573483,
"loss": 0.1856,
"step": 305
},
{
"epoch": 0.8933717579250721,
"grad_norm": 0.46772050857543945,
"learning_rate": 0.00021095100864553312,
"loss": 0.2185,
"step": 310
},
{
"epoch": 0.8933717579250721,
"eval_loss": 0.20842696726322174,
"eval_runtime": 1.7693,
"eval_samples_per_second": 87.604,
"eval_steps_per_second": 11.304,
"step": 310
},
{
"epoch": 0.9077809798270894,
"grad_norm": 0.5399373173713684,
"learning_rate": 0.00020951008645533138,
"loss": 0.2108,
"step": 315
},
{
"epoch": 0.9221902017291066,
"grad_norm": 0.5167156457901001,
"learning_rate": 0.00020806916426512967,
"loss": 0.2364,
"step": 320
},
{
"epoch": 0.9221902017291066,
"eval_loss": 0.2061736136674881,
"eval_runtime": 1.7797,
"eval_samples_per_second": 87.095,
"eval_steps_per_second": 11.238,
"step": 320
},
{
"epoch": 0.9365994236311239,
"grad_norm": 0.5894590616226196,
"learning_rate": 0.00020662824207492793,
"loss": 0.2185,
"step": 325
},
{
"epoch": 0.9510086455331412,
"grad_norm": 0.4573725163936615,
"learning_rate": 0.00020518731988472622,
"loss": 0.2109,
"step": 330
},
{
"epoch": 0.9510086455331412,
"eval_loss": 0.20550554990768433,
"eval_runtime": 1.787,
"eval_samples_per_second": 86.736,
"eval_steps_per_second": 11.192,
"step": 330
},
{
"epoch": 0.9654178674351584,
"grad_norm": 0.4973134696483612,
"learning_rate": 0.00020374639769452449,
"loss": 0.2354,
"step": 335
},
{
"epoch": 0.9798270893371758,
"grad_norm": 0.5064740180969238,
"learning_rate": 0.00020230547550432275,
"loss": 0.2263,
"step": 340
},
{
"epoch": 0.9798270893371758,
"eval_loss": 0.20669177174568176,
"eval_runtime": 1.7751,
"eval_samples_per_second": 87.32,
"eval_steps_per_second": 11.267,
"step": 340
},
{
"epoch": 0.9942363112391931,
"grad_norm": 0.6807605028152466,
"learning_rate": 0.00020086455331412104,
"loss": 0.1868,
"step": 345
},
{
"epoch": 1.0086455331412103,
"grad_norm": 0.551680326461792,
"learning_rate": 0.00019942363112391927,
"loss": 0.2149,
"step": 350
},
{
"epoch": 1.0086455331412103,
"eval_loss": 0.2055429071187973,
"eval_runtime": 1.7741,
"eval_samples_per_second": 87.371,
"eval_steps_per_second": 11.274,
"step": 350
},
{
"epoch": 1.0230547550432276,
"grad_norm": 0.4975515902042389,
"learning_rate": 0.00019798270893371756,
"loss": 0.2046,
"step": 355
},
{
"epoch": 1.037463976945245,
"grad_norm": 0.55193692445755,
"learning_rate": 0.00019654178674351582,
"loss": 0.2091,
"step": 360
},
{
"epoch": 1.037463976945245,
"eval_loss": 0.20510512590408325,
"eval_runtime": 1.785,
"eval_samples_per_second": 86.835,
"eval_steps_per_second": 11.205,
"step": 360
},
{
"epoch": 1.0518731988472623,
"grad_norm": 0.4859946668148041,
"learning_rate": 0.0001951008645533141,
"loss": 0.2124,
"step": 365
},
{
"epoch": 1.0662824207492796,
"grad_norm": 0.5230706334114075,
"learning_rate": 0.00019365994236311237,
"loss": 0.2044,
"step": 370
},
{
"epoch": 1.0662824207492796,
"eval_loss": 0.20450517535209656,
"eval_runtime": 1.7787,
"eval_samples_per_second": 87.14,
"eval_steps_per_second": 11.244,
"step": 370
},
{
"epoch": 1.080691642651297,
"grad_norm": 0.4757685363292694,
"learning_rate": 0.00019221902017291063,
"loss": 0.2041,
"step": 375
},
{
"epoch": 1.0951008645533142,
"grad_norm": 0.47648361325263977,
"learning_rate": 0.00019077809798270892,
"loss": 0.2186,
"step": 380
},
{
"epoch": 1.0951008645533142,
"eval_loss": 0.2045309692621231,
"eval_runtime": 1.7831,
"eval_samples_per_second": 86.927,
"eval_steps_per_second": 11.216,
"step": 380
},
{
"epoch": 1.1095100864553313,
"grad_norm": 0.6783398985862732,
"learning_rate": 0.00018933717579250719,
"loss": 0.2279,
"step": 385
},
{
"epoch": 1.1239193083573487,
"grad_norm": 0.6662940382957458,
"learning_rate": 0.00018789625360230547,
"loss": 0.1721,
"step": 390
},
{
"epoch": 1.1239193083573487,
"eval_loss": 0.2042209804058075,
"eval_runtime": 1.7959,
"eval_samples_per_second": 86.307,
"eval_steps_per_second": 11.136,
"step": 390
},
{
"epoch": 1.138328530259366,
"grad_norm": 0.5504616498947144,
"learning_rate": 0.00018645533141210374,
"loss": 0.2167,
"step": 395
},
{
"epoch": 1.1527377521613833,
"grad_norm": 0.400045782327652,
"learning_rate": 0.00018501440922190203,
"loss": 0.1859,
"step": 400
},
{
"epoch": 1.1527377521613833,
"eval_loss": 0.20452381670475006,
"eval_runtime": 1.7689,
"eval_samples_per_second": 87.624,
"eval_steps_per_second": 11.306,
"step": 400
},
{
"epoch": 1.1671469740634006,
"grad_norm": 0.4646718502044678,
"learning_rate": 0.00018357348703170026,
"loss": 0.1954,
"step": 405
},
{
"epoch": 1.181556195965418,
"grad_norm": 0.4777772128582001,
"learning_rate": 0.00018213256484149852,
"loss": 0.2099,
"step": 410
},
{
"epoch": 1.181556195965418,
"eval_loss": 0.2040420025587082,
"eval_runtime": 1.7756,
"eval_samples_per_second": 87.293,
"eval_steps_per_second": 11.264,
"step": 410
},
{
"epoch": 1.195965417867435,
"grad_norm": 0.5278341174125671,
"learning_rate": 0.0001806916426512968,
"loss": 0.1911,
"step": 415
},
{
"epoch": 1.2103746397694524,
"grad_norm": 0.459689199924469,
"learning_rate": 0.00017925072046109507,
"loss": 0.1766,
"step": 420
},
{
"epoch": 1.2103746397694524,
"eval_loss": 0.20372864603996277,
"eval_runtime": 1.7739,
"eval_samples_per_second": 87.38,
"eval_steps_per_second": 11.275,
"step": 420
},
{
"epoch": 1.2247838616714697,
"grad_norm": 0.5434823036193848,
"learning_rate": 0.00017780979827089336,
"loss": 0.186,
"step": 425
},
{
"epoch": 1.239193083573487,
"grad_norm": 0.4612482488155365,
"learning_rate": 0.00017636887608069162,
"loss": 0.2171,
"step": 430
},
{
"epoch": 1.239193083573487,
"eval_loss": 0.20268237590789795,
"eval_runtime": 1.7749,
"eval_samples_per_second": 87.328,
"eval_steps_per_second": 11.268,
"step": 430
},
{
"epoch": 1.2536023054755043,
"grad_norm": 0.5373527407646179,
"learning_rate": 0.0001749279538904899,
"loss": 0.2306,
"step": 435
},
{
"epoch": 1.2680115273775217,
"grad_norm": 0.6169385313987732,
"learning_rate": 0.00017348703170028817,
"loss": 0.2243,
"step": 440
},
{
"epoch": 1.2680115273775217,
"eval_loss": 0.2024766057729721,
"eval_runtime": 1.7891,
"eval_samples_per_second": 86.635,
"eval_steps_per_second": 11.179,
"step": 440
},
{
"epoch": 1.282420749279539,
"grad_norm": 0.49312788248062134,
"learning_rate": 0.00017204610951008644,
"loss": 0.1845,
"step": 445
},
{
"epoch": 1.2968299711815563,
"grad_norm": 0.4392940402030945,
"learning_rate": 0.00017060518731988473,
"loss": 0.2133,
"step": 450
},
{
"epoch": 1.2968299711815563,
"eval_loss": 0.20215217769145966,
"eval_runtime": 1.7855,
"eval_samples_per_second": 86.811,
"eval_steps_per_second": 11.201,
"step": 450
},
{
"epoch": 1.3112391930835736,
"grad_norm": 0.5321723818778992,
"learning_rate": 0.000169164265129683,
"loss": 0.1973,
"step": 455
},
{
"epoch": 1.3256484149855907,
"grad_norm": 0.46987223625183105,
"learning_rate": 0.00016772334293948128,
"loss": 0.1694,
"step": 460
},
{
"epoch": 1.3256484149855907,
"eval_loss": 0.20472992956638336,
"eval_runtime": 1.7828,
"eval_samples_per_second": 86.942,
"eval_steps_per_second": 11.218,
"step": 460
},
{
"epoch": 1.340057636887608,
"grad_norm": 0.5288825631141663,
"learning_rate": 0.0001662824207492795,
"loss": 0.1892,
"step": 465
},
{
"epoch": 1.3544668587896254,
"grad_norm": 0.5629428029060364,
"learning_rate": 0.0001648414985590778,
"loss": 0.189,
"step": 470
},
{
"epoch": 1.3544668587896254,
"eval_loss": 0.20317500829696655,
"eval_runtime": 1.7746,
"eval_samples_per_second": 87.341,
"eval_steps_per_second": 11.27,
"step": 470
},
{
"epoch": 1.3688760806916427,
"grad_norm": 0.4766279458999634,
"learning_rate": 0.00016340057636887606,
"loss": 0.1788,
"step": 475
},
{
"epoch": 1.38328530259366,
"grad_norm": 0.43711772561073303,
"learning_rate": 0.00016195965417867432,
"loss": 0.2054,
"step": 480
},
{
"epoch": 1.38328530259366,
"eval_loss": 0.20241950452327728,
"eval_runtime": 1.7822,
"eval_samples_per_second": 86.97,
"eval_steps_per_second": 11.222,
"step": 480
},
{
"epoch": 1.397694524495677,
"grad_norm": 0.6571159958839417,
"learning_rate": 0.0001605187319884726,
"loss": 0.1953,
"step": 485
},
{
"epoch": 1.4121037463976944,
"grad_norm": 0.5928535461425781,
"learning_rate": 0.00015907780979827087,
"loss": 0.2118,
"step": 490
},
{
"epoch": 1.4121037463976944,
"eval_loss": 0.20221129059791565,
"eval_runtime": 1.7713,
"eval_samples_per_second": 87.504,
"eval_steps_per_second": 11.291,
"step": 490
},
{
"epoch": 1.4265129682997117,
"grad_norm": 0.5120033025741577,
"learning_rate": 0.00015763688760806916,
"loss": 0.2251,
"step": 495
},
{
"epoch": 1.440922190201729,
"grad_norm": 0.5128481388092041,
"learning_rate": 0.00015619596541786743,
"loss": 0.2221,
"step": 500
},
{
"epoch": 1.440922190201729,
"eval_loss": 0.20349927246570587,
"eval_runtime": 1.7928,
"eval_samples_per_second": 86.457,
"eval_steps_per_second": 11.156,
"step": 500
},
{
"epoch": 1.4553314121037464,
"grad_norm": 0.4616795480251312,
"learning_rate": 0.0001547550432276657,
"loss": 0.2036,
"step": 505
},
{
"epoch": 1.4697406340057637,
"grad_norm": 0.5091164112091064,
"learning_rate": 0.00015331412103746398,
"loss": 0.2036,
"step": 510
},
{
"epoch": 1.4697406340057637,
"eval_loss": 0.2018601894378662,
"eval_runtime": 1.8038,
"eval_samples_per_second": 85.928,
"eval_steps_per_second": 11.088,
"step": 510
},
{
"epoch": 1.484149855907781,
"grad_norm": 0.48645836114883423,
"learning_rate": 0.0001518731988472622,
"loss": 0.2059,
"step": 515
},
{
"epoch": 1.4985590778097984,
"grad_norm": 0.46957656741142273,
"learning_rate": 0.0001504322766570605,
"loss": 0.2032,
"step": 520
},
{
"epoch": 1.4985590778097984,
"eval_loss": 0.2017858922481537,
"eval_runtime": 1.7969,
"eval_samples_per_second": 86.26,
"eval_steps_per_second": 11.13,
"step": 520
},
{
"epoch": 1.5129682997118157,
"grad_norm": 0.47207146883010864,
"learning_rate": 0.00014899135446685876,
"loss": 0.2121,
"step": 525
},
{
"epoch": 1.527377521613833,
"grad_norm": 0.4411737024784088,
"learning_rate": 0.00014755043227665705,
"loss": 0.2045,
"step": 530
},
{
"epoch": 1.527377521613833,
"eval_loss": 0.20163200795650482,
"eval_runtime": 1.7919,
"eval_samples_per_second": 86.499,
"eval_steps_per_second": 11.161,
"step": 530
},
{
"epoch": 1.54178674351585,
"grad_norm": 0.5532639026641846,
"learning_rate": 0.0001461095100864553,
"loss": 0.2184,
"step": 535
},
{
"epoch": 1.5561959654178674,
"grad_norm": 0.3798025846481323,
"learning_rate": 0.0001446685878962536,
"loss": 0.1842,
"step": 540
},
{
"epoch": 1.5561959654178674,
"eval_loss": 0.20085880160331726,
"eval_runtime": 1.7882,
"eval_samples_per_second": 86.68,
"eval_steps_per_second": 11.185,
"step": 540
},
{
"epoch": 1.5706051873198847,
"grad_norm": 0.37717685103416443,
"learning_rate": 0.00014322766570605186,
"loss": 0.18,
"step": 545
},
{
"epoch": 1.585014409221902,
"grad_norm": 0.3843863606452942,
"learning_rate": 0.00014178674351585013,
"loss": 0.1884,
"step": 550
},
{
"epoch": 1.585014409221902,
"eval_loss": 0.20081885159015656,
"eval_runtime": 1.7697,
"eval_samples_per_second": 87.587,
"eval_steps_per_second": 11.302,
"step": 550
},
{
"epoch": 1.5994236311239192,
"grad_norm": 0.44589126110076904,
"learning_rate": 0.00014034582132564841,
"loss": 0.2239,
"step": 555
},
{
"epoch": 1.6138328530259365,
"grad_norm": 0.47011956572532654,
"learning_rate": 0.00013890489913544668,
"loss": 0.1746,
"step": 560
},
{
"epoch": 1.6138328530259365,
"eval_loss": 0.20064498484134674,
"eval_runtime": 1.7785,
"eval_samples_per_second": 87.153,
"eval_steps_per_second": 11.246,
"step": 560
},
{
"epoch": 1.6282420749279538,
"grad_norm": 0.4640989899635315,
"learning_rate": 0.00013746397694524494,
"loss": 0.1897,
"step": 565
},
{
"epoch": 1.6426512968299711,
"grad_norm": 0.4140304625034332,
"learning_rate": 0.00013602305475504323,
"loss": 0.2143,
"step": 570
},
{
"epoch": 1.6426512968299711,
"eval_loss": 0.20059892535209656,
"eval_runtime": 1.7771,
"eval_samples_per_second": 87.22,
"eval_steps_per_second": 11.254,
"step": 570
},
{
"epoch": 1.6570605187319885,
"grad_norm": 0.5634166598320007,
"learning_rate": 0.0001345821325648415,
"loss": 0.1838,
"step": 575
},
{
"epoch": 1.6714697406340058,
"grad_norm": 0.4056183993816376,
"learning_rate": 0.00013314121037463975,
"loss": 0.1843,
"step": 580
},
{
"epoch": 1.6714697406340058,
"eval_loss": 0.20035365223884583,
"eval_runtime": 1.7767,
"eval_samples_per_second": 87.241,
"eval_steps_per_second": 11.257,
"step": 580
},
{
"epoch": 1.685878962536023,
"grad_norm": 0.4725261628627777,
"learning_rate": 0.00013170028818443804,
"loss": 0.2345,
"step": 585
},
{
"epoch": 1.7002881844380404,
"grad_norm": 0.4181499481201172,
"learning_rate": 0.0001302593659942363,
"loss": 0.2028,
"step": 590
},
{
"epoch": 1.7002881844380404,
"eval_loss": 0.2000962197780609,
"eval_runtime": 1.781,
"eval_samples_per_second": 87.028,
"eval_steps_per_second": 11.229,
"step": 590
},
{
"epoch": 1.7146974063400577,
"grad_norm": 0.5479499101638794,
"learning_rate": 0.00012881844380403456,
"loss": 0.2121,
"step": 595
},
{
"epoch": 1.729106628242075,
"grad_norm": 0.49796512722969055,
"learning_rate": 0.00012737752161383283,
"loss": 0.1993,
"step": 600
},
{
"epoch": 1.729106628242075,
"eval_loss": 0.2000320702791214,
"eval_runtime": 1.7839,
"eval_samples_per_second": 86.888,
"eval_steps_per_second": 11.211,
"step": 600
},
{
"epoch": 1.7435158501440924,
"grad_norm": 0.4585292339324951,
"learning_rate": 0.00012593659942363111,
"loss": 0.1885,
"step": 605
},
{
"epoch": 1.7579250720461095,
"grad_norm": 0.5045236945152283,
"learning_rate": 0.00012449567723342938,
"loss": 0.2005,
"step": 610
},
{
"epoch": 1.7579250720461095,
"eval_loss": 0.19937343895435333,
"eval_runtime": 1.7792,
"eval_samples_per_second": 87.119,
"eval_steps_per_second": 11.241,
"step": 610
},
{
"epoch": 1.7723342939481268,
"grad_norm": 0.4200640022754669,
"learning_rate": 0.00012305475504322767,
"loss": 0.2043,
"step": 615
},
{
"epoch": 1.7867435158501441,
"grad_norm": 0.44112369418144226,
"learning_rate": 0.00012161383285302593,
"loss": 0.2055,
"step": 620
},
{
"epoch": 1.7867435158501441,
"eval_loss": 0.19952693581581116,
"eval_runtime": 1.7818,
"eval_samples_per_second": 86.993,
"eval_steps_per_second": 11.225,
"step": 620
},
{
"epoch": 1.8011527377521612,
"grad_norm": 0.4573463499546051,
"learning_rate": 0.00012017291066282419,
"loss": 0.1886,
"step": 625
},
{
"epoch": 1.8155619596541785,
"grad_norm": 0.4851074516773224,
"learning_rate": 0.00011873198847262246,
"loss": 0.1967,
"step": 630
},
{
"epoch": 1.8155619596541785,
"eval_loss": 0.19990864396095276,
"eval_runtime": 1.7816,
"eval_samples_per_second": 87.0,
"eval_steps_per_second": 11.226,
"step": 630
},
{
"epoch": 1.8299711815561959,
"grad_norm": 0.4560496509075165,
"learning_rate": 0.00011729106628242074,
"loss": 0.2134,
"step": 635
},
{
"epoch": 1.8443804034582132,
"grad_norm": 0.5062277317047119,
"learning_rate": 0.00011585014409221902,
"loss": 0.1999,
"step": 640
},
{
"epoch": 1.8443804034582132,
"eval_loss": 0.19952501356601715,
"eval_runtime": 1.7865,
"eval_samples_per_second": 86.762,
"eval_steps_per_second": 11.195,
"step": 640
},
{
"epoch": 1.8587896253602305,
"grad_norm": 0.5651894807815552,
"learning_rate": 0.00011440922190201728,
"loss": 0.203,
"step": 645
},
{
"epoch": 1.8731988472622478,
"grad_norm": 0.4294355809688568,
"learning_rate": 0.00011296829971181555,
"loss": 0.1899,
"step": 650
},
{
"epoch": 1.8731988472622478,
"eval_loss": 0.19932051002979279,
"eval_runtime": 1.7807,
"eval_samples_per_second": 87.044,
"eval_steps_per_second": 11.231,
"step": 650
},
{
"epoch": 1.8876080691642652,
"grad_norm": 0.49073395133018494,
"learning_rate": 0.00011152737752161383,
"loss": 0.1719,
"step": 655
},
{
"epoch": 1.9020172910662825,
"grad_norm": 0.5511584877967834,
"learning_rate": 0.00011008645533141209,
"loss": 0.1819,
"step": 660
},
{
"epoch": 1.9020172910662825,
"eval_loss": 0.19900086522102356,
"eval_runtime": 1.7807,
"eval_samples_per_second": 87.044,
"eval_steps_per_second": 11.232,
"step": 660
},
{
"epoch": 1.9164265129682998,
"grad_norm": 0.5805220007896423,
"learning_rate": 0.00010864553314121037,
"loss": 0.2017,
"step": 665
},
{
"epoch": 1.9308357348703171,
"grad_norm": 0.44642174243927,
"learning_rate": 0.00010720461095100864,
"loss": 0.1737,
"step": 670
},
{
"epoch": 1.9308357348703171,
"eval_loss": 0.19866690039634705,
"eval_runtime": 1.7717,
"eval_samples_per_second": 87.485,
"eval_steps_per_second": 11.288,
"step": 670
},
{
"epoch": 1.9452449567723344,
"grad_norm": 0.42899489402770996,
"learning_rate": 0.0001057636887608069,
"loss": 0.1939,
"step": 675
},
{
"epoch": 1.9596541786743515,
"grad_norm": 0.4786996841430664,
"learning_rate": 0.00010432276657060518,
"loss": 0.2328,
"step": 680
},
{
"epoch": 1.9596541786743515,
"eval_loss": 0.1981932669878006,
"eval_runtime": 1.7915,
"eval_samples_per_second": 86.522,
"eval_steps_per_second": 11.164,
"step": 680
},
{
"epoch": 1.9740634005763689,
"grad_norm": 0.44985663890838623,
"learning_rate": 0.00010288184438040345,
"loss": 0.1819,
"step": 685
},
{
"epoch": 1.9884726224783862,
"grad_norm": 0.42518705129623413,
"learning_rate": 0.00010144092219020172,
"loss": 0.2063,
"step": 690
},
{
"epoch": 1.9884726224783862,
"eval_loss": 0.19816331565380096,
"eval_runtime": 1.778,
"eval_samples_per_second": 87.175,
"eval_steps_per_second": 11.248,
"step": 690
},
{
"epoch": 2.0028818443804033,
"grad_norm": 0.4421190619468689,
"learning_rate": 9.999999999999999e-05,
"loss": 0.2381,
"step": 695
},
{
"epoch": 2.0172910662824206,
"grad_norm": 0.44720008969306946,
"learning_rate": 9.855907780979825e-05,
"loss": 0.1827,
"step": 700
},
{
"epoch": 2.0172910662824206,
"eval_loss": 0.1987195909023285,
"eval_runtime": 1.8003,
"eval_samples_per_second": 86.095,
"eval_steps_per_second": 11.109,
"step": 700
},
{
"epoch": 2.031700288184438,
"grad_norm": 0.5390461683273315,
"learning_rate": 9.711815561959653e-05,
"loss": 0.2097,
"step": 705
},
{
"epoch": 2.0461095100864553,
"grad_norm": 0.4617297947406769,
"learning_rate": 9.56772334293948e-05,
"loss": 0.2101,
"step": 710
},
{
"epoch": 2.0461095100864553,
"eval_loss": 0.19854187965393066,
"eval_runtime": 1.7961,
"eval_samples_per_second": 86.299,
"eval_steps_per_second": 11.135,
"step": 710
},
{
"epoch": 2.0605187319884726,
"grad_norm": 0.4737541079521179,
"learning_rate": 9.423631123919308e-05,
"loss": 0.208,
"step": 715
},
{
"epoch": 2.07492795389049,
"grad_norm": 0.582775354385376,
"learning_rate": 9.279538904899135e-05,
"loss": 0.1854,
"step": 720
},
{
"epoch": 2.07492795389049,
"eval_loss": 0.19860170781612396,
"eval_runtime": 1.7927,
"eval_samples_per_second": 86.464,
"eval_steps_per_second": 11.157,
"step": 720
},
{
"epoch": 2.089337175792507,
"grad_norm": 0.532686173915863,
"learning_rate": 9.135446685878962e-05,
"loss": 0.1972,
"step": 725
},
{
"epoch": 2.1037463976945245,
"grad_norm": 0.5368837714195251,
"learning_rate": 8.991354466858788e-05,
"loss": 0.1933,
"step": 730
},
{
"epoch": 2.1037463976945245,
"eval_loss": 0.1989794820547104,
"eval_runtime": 1.787,
"eval_samples_per_second": 86.738,
"eval_steps_per_second": 11.192,
"step": 730
},
{
"epoch": 2.118155619596542,
"grad_norm": 0.4096311032772064,
"learning_rate": 8.847262247838615e-05,
"loss": 0.216,
"step": 735
},
{
"epoch": 2.132564841498559,
"grad_norm": 0.565958559513092,
"learning_rate": 8.703170028818443e-05,
"loss": 0.2091,
"step": 740
},
{
"epoch": 2.132564841498559,
"eval_loss": 0.1991860717535019,
"eval_runtime": 1.7801,
"eval_samples_per_second": 87.076,
"eval_steps_per_second": 11.236,
"step": 740
},
{
"epoch": 2.1469740634005765,
"grad_norm": 0.49229133129119873,
"learning_rate": 8.55907780979827e-05,
"loss": 0.1869,
"step": 745
},
{
"epoch": 2.161383285302594,
"grad_norm": 0.4366638660430908,
"learning_rate": 8.414985590778098e-05,
"loss": 0.1694,
"step": 750
},
{
"epoch": 2.161383285302594,
"eval_loss": 0.1993846446275711,
"eval_runtime": 1.7803,
"eval_samples_per_second": 87.063,
"eval_steps_per_second": 11.234,
"step": 750
},
{
"epoch": 2.175792507204611,
"grad_norm": 0.5318723320960999,
"learning_rate": 8.270893371757926e-05,
"loss": 0.1953,
"step": 755
},
{
"epoch": 2.1902017291066285,
"grad_norm": 0.4861218333244324,
"learning_rate": 8.12680115273775e-05,
"loss": 0.1721,
"step": 760
},
{
"epoch": 2.1902017291066285,
"eval_loss": 0.19942361116409302,
"eval_runtime": 1.8023,
"eval_samples_per_second": 86.001,
"eval_steps_per_second": 11.097,
"step": 760
},
{
"epoch": 2.2046109510086453,
"grad_norm": 0.5396477580070496,
"learning_rate": 7.982708933717578e-05,
"loss": 0.1927,
"step": 765
},
{
"epoch": 2.2190201729106627,
"grad_norm": 0.4643673598766327,
"learning_rate": 7.838616714697405e-05,
"loss": 0.1883,
"step": 770
},
{
"epoch": 2.2190201729106627,
"eval_loss": 0.19924961030483246,
"eval_runtime": 1.7928,
"eval_samples_per_second": 86.457,
"eval_steps_per_second": 11.156,
"step": 770
},
{
"epoch": 2.23342939481268,
"grad_norm": 0.4864201545715332,
"learning_rate": 7.694524495677233e-05,
"loss": 0.1801,
"step": 775
},
{
"epoch": 2.2478386167146973,
"grad_norm": 0.41535478830337524,
"learning_rate": 7.55043227665706e-05,
"loss": 0.1779,
"step": 780
},
{
"epoch": 2.2478386167146973,
"eval_loss": 0.19901590049266815,
"eval_runtime": 1.7833,
"eval_samples_per_second": 86.917,
"eval_steps_per_second": 11.215,
"step": 780
},
{
"epoch": 2.2622478386167146,
"grad_norm": 0.4977608025074005,
"learning_rate": 7.406340057636887e-05,
"loss": 0.1801,
"step": 785
},
{
"epoch": 2.276657060518732,
"grad_norm": 0.4228823781013489,
"learning_rate": 7.262247838616714e-05,
"loss": 0.1992,
"step": 790
},
{
"epoch": 2.276657060518732,
"eval_loss": 0.19835665822029114,
"eval_runtime": 1.7963,
"eval_samples_per_second": 86.289,
"eval_steps_per_second": 11.134,
"step": 790
},
{
"epoch": 2.2910662824207493,
"grad_norm": 0.5940558314323425,
"learning_rate": 7.118155619596542e-05,
"loss": 0.1877,
"step": 795
},
{
"epoch": 2.3054755043227666,
"grad_norm": 0.5785874724388123,
"learning_rate": 6.974063400576368e-05,
"loss": 0.1987,
"step": 800
},
{
"epoch": 2.3054755043227666,
"eval_loss": 0.19842053949832916,
"eval_runtime": 1.7908,
"eval_samples_per_second": 86.553,
"eval_steps_per_second": 11.168,
"step": 800
},
{
"epoch": 2.319884726224784,
"grad_norm": 0.4548996388912201,
"learning_rate": 6.829971181556196e-05,
"loss": 0.1903,
"step": 805
},
{
"epoch": 2.3342939481268012,
"grad_norm": 0.5720356106758118,
"learning_rate": 6.685878962536023e-05,
"loss": 0.1817,
"step": 810
},
{
"epoch": 2.3342939481268012,
"eval_loss": 0.19885385036468506,
"eval_runtime": 1.7713,
"eval_samples_per_second": 87.505,
"eval_steps_per_second": 11.291,
"step": 810
},
{
"epoch": 2.3487031700288186,
"grad_norm": 0.6115606427192688,
"learning_rate": 6.541786743515849e-05,
"loss": 0.1793,
"step": 815
},
{
"epoch": 2.363112391930836,
"grad_norm": 0.5724362134933472,
"learning_rate": 6.397694524495677e-05,
"loss": 0.2322,
"step": 820
},
{
"epoch": 2.363112391930836,
"eval_loss": 0.19859647750854492,
"eval_runtime": 1.7959,
"eval_samples_per_second": 86.31,
"eval_steps_per_second": 11.137,
"step": 820
},
{
"epoch": 2.377521613832853,
"grad_norm": 0.5599442720413208,
"learning_rate": 6.253602305475504e-05,
"loss": 0.2018,
"step": 825
},
{
"epoch": 2.39193083573487,
"grad_norm": 0.5310724377632141,
"learning_rate": 6.10951008645533e-05,
"loss": 0.1891,
"step": 830
},
{
"epoch": 2.39193083573487,
"eval_loss": 0.19837234914302826,
"eval_runtime": 1.7902,
"eval_samples_per_second": 86.584,
"eval_steps_per_second": 11.172,
"step": 830
},
{
"epoch": 2.4063400576368874,
"grad_norm": 0.49322766065597534,
"learning_rate": 5.9654178674351575e-05,
"loss": 0.1905,
"step": 835
},
{
"epoch": 2.4207492795389047,
"grad_norm": 0.5298819541931152,
"learning_rate": 5.821325648414985e-05,
"loss": 0.1884,
"step": 840
},
{
"epoch": 2.4207492795389047,
"eval_loss": 0.19790256023406982,
"eval_runtime": 1.7729,
"eval_samples_per_second": 87.428,
"eval_steps_per_second": 11.281,
"step": 840
},
{
"epoch": 2.435158501440922,
"grad_norm": 0.4365543723106384,
"learning_rate": 5.6772334293948125e-05,
"loss": 0.19,
"step": 845
},
{
"epoch": 2.4495677233429394,
"grad_norm": 0.5986719131469727,
"learning_rate": 5.533141210374639e-05,
"loss": 0.2034,
"step": 850
},
{
"epoch": 2.4495677233429394,
"eval_loss": 0.197686105966568,
"eval_runtime": 1.7835,
"eval_samples_per_second": 86.908,
"eval_steps_per_second": 11.214,
"step": 850
},
{
"epoch": 2.4639769452449567,
"grad_norm": 0.4506608247756958,
"learning_rate": 5.389048991354466e-05,
"loss": 0.1498,
"step": 855
},
{
"epoch": 2.478386167146974,
"grad_norm": 0.4216344654560089,
"learning_rate": 5.244956772334294e-05,
"loss": 0.1995,
"step": 860
},
{
"epoch": 2.478386167146974,
"eval_loss": 0.19728927314281464,
"eval_runtime": 1.7891,
"eval_samples_per_second": 86.636,
"eval_steps_per_second": 11.179,
"step": 860
},
{
"epoch": 2.4927953890489913,
"grad_norm": 0.5227183103561401,
"learning_rate": 5.100864553314121e-05,
"loss": 0.1796,
"step": 865
},
{
"epoch": 2.5072046109510087,
"grad_norm": 0.5168836712837219,
"learning_rate": 4.9567723342939476e-05,
"loss": 0.1684,
"step": 870
},
{
"epoch": 2.5072046109510087,
"eval_loss": 0.1969619244337082,
"eval_runtime": 1.7718,
"eval_samples_per_second": 87.481,
"eval_steps_per_second": 11.288,
"step": 870
},
{
"epoch": 2.521613832853026,
"grad_norm": 0.4830915629863739,
"learning_rate": 4.812680115273775e-05,
"loss": 0.1791,
"step": 875
},
{
"epoch": 2.5360230547550433,
"grad_norm": 0.46774524450302124,
"learning_rate": 4.668587896253602e-05,
"loss": 0.1686,
"step": 880
},
{
"epoch": 2.5360230547550433,
"eval_loss": 0.1968482881784439,
"eval_runtime": 1.7701,
"eval_samples_per_second": 87.564,
"eval_steps_per_second": 11.299,
"step": 880
},
{
"epoch": 2.5504322766570606,
"grad_norm": 0.5271076560020447,
"learning_rate": 4.524495677233429e-05,
"loss": 0.2292,
"step": 885
},
{
"epoch": 2.564841498559078,
"grad_norm": 0.5477223992347717,
"learning_rate": 4.3804034582132564e-05,
"loss": 0.2002,
"step": 890
},
{
"epoch": 2.564841498559078,
"eval_loss": 0.19697827100753784,
"eval_runtime": 1.7775,
"eval_samples_per_second": 87.202,
"eval_steps_per_second": 11.252,
"step": 890
},
{
"epoch": 2.5792507204610953,
"grad_norm": 0.5050249695777893,
"learning_rate": 4.236311239193083e-05,
"loss": 0.1842,
"step": 895
},
{
"epoch": 2.5936599423631126,
"grad_norm": 0.4689369201660156,
"learning_rate": 4.09221902017291e-05,
"loss": 0.1989,
"step": 900
},
{
"epoch": 2.5936599423631126,
"eval_loss": 0.19711896777153015,
"eval_runtime": 1.7779,
"eval_samples_per_second": 87.181,
"eval_steps_per_second": 11.249,
"step": 900
},
{
"epoch": 2.60806916426513,
"grad_norm": 0.4809912145137787,
"learning_rate": 3.948126801152737e-05,
"loss": 0.1896,
"step": 905
},
{
"epoch": 2.6224783861671472,
"grad_norm": 0.5310996174812317,
"learning_rate": 3.8040345821325645e-05,
"loss": 0.1837,
"step": 910
},
{
"epoch": 2.6224783861671472,
"eval_loss": 0.19707301259040833,
"eval_runtime": 1.7997,
"eval_samples_per_second": 86.127,
"eval_steps_per_second": 11.113,
"step": 910
},
{
"epoch": 2.636887608069164,
"grad_norm": 0.5194202065467834,
"learning_rate": 3.6599423631123914e-05,
"loss": 0.1965,
"step": 915
},
{
"epoch": 2.6512968299711814,
"grad_norm": 0.5268927812576294,
"learning_rate": 3.515850144092219e-05,
"loss": 0.1978,
"step": 920
},
{
"epoch": 2.6512968299711814,
"eval_loss": 0.19690388441085815,
"eval_runtime": 1.7802,
"eval_samples_per_second": 87.069,
"eval_steps_per_second": 11.235,
"step": 920
},
{
"epoch": 2.6657060518731988,
"grad_norm": 0.43076756596565247,
"learning_rate": 3.371757925072046e-05,
"loss": 0.1616,
"step": 925
},
{
"epoch": 2.680115273775216,
"grad_norm": 0.4744175374507904,
"learning_rate": 3.2276657060518727e-05,
"loss": 0.2015,
"step": 930
},
{
"epoch": 2.680115273775216,
"eval_loss": 0.1967049390077591,
"eval_runtime": 1.7776,
"eval_samples_per_second": 87.198,
"eval_steps_per_second": 11.251,
"step": 930
},
{
"epoch": 2.6945244956772334,
"grad_norm": 0.5074586868286133,
"learning_rate": 3.0835734870317e-05,
"loss": 0.2332,
"step": 935
},
{
"epoch": 2.7089337175792507,
"grad_norm": 0.6548565030097961,
"learning_rate": 2.939481268011527e-05,
"loss": 0.1998,
"step": 940
},
{
"epoch": 2.7089337175792507,
"eval_loss": 0.19677585363388062,
"eval_runtime": 1.7853,
"eval_samples_per_second": 86.821,
"eval_steps_per_second": 11.203,
"step": 940
},
{
"epoch": 2.723342939481268,
"grad_norm": 0.47033998370170593,
"learning_rate": 2.7953890489913543e-05,
"loss": 0.1756,
"step": 945
},
{
"epoch": 2.7377521613832854,
"grad_norm": 0.4205470383167267,
"learning_rate": 2.6512968299711815e-05,
"loss": 0.1866,
"step": 950
},
{
"epoch": 2.7377521613832854,
"eval_loss": 0.1967228651046753,
"eval_runtime": 1.7804,
"eval_samples_per_second": 87.058,
"eval_steps_per_second": 11.233,
"step": 950
},
{
"epoch": 2.7521613832853027,
"grad_norm": 0.47508129477500916,
"learning_rate": 2.5072046109510083e-05,
"loss": 0.1741,
"step": 955
},
{
"epoch": 2.76657060518732,
"grad_norm": 0.412384569644928,
"learning_rate": 2.3631123919308355e-05,
"loss": 0.2009,
"step": 960
},
{
"epoch": 2.76657060518732,
"eval_loss": 0.1967916190624237,
"eval_runtime": 1.7727,
"eval_samples_per_second": 87.439,
"eval_steps_per_second": 11.282,
"step": 960
},
{
"epoch": 2.7809798270893373,
"grad_norm": 0.6153486371040344,
"learning_rate": 2.2190201729106624e-05,
"loss": 0.2123,
"step": 965
},
{
"epoch": 2.795389048991354,
"grad_norm": 0.4739130139350891,
"learning_rate": 2.07492795389049e-05,
"loss": 0.1686,
"step": 970
},
{
"epoch": 2.795389048991354,
"eval_loss": 0.19692417979240417,
"eval_runtime": 1.7768,
"eval_samples_per_second": 87.234,
"eval_steps_per_second": 11.256,
"step": 970
},
{
"epoch": 2.8097982708933715,
"grad_norm": 0.5100451707839966,
"learning_rate": 1.9308357348703168e-05,
"loss": 0.1524,
"step": 975
},
{
"epoch": 2.824207492795389,
"grad_norm": 0.4990577697753906,
"learning_rate": 1.786743515850144e-05,
"loss": 0.189,
"step": 980
},
{
"epoch": 2.824207492795389,
"eval_loss": 0.19698160886764526,
"eval_runtime": 1.7793,
"eval_samples_per_second": 87.113,
"eval_steps_per_second": 11.24,
"step": 980
},
{
"epoch": 2.838616714697406,
"grad_norm": 0.5253724455833435,
"learning_rate": 1.6426512968299712e-05,
"loss": 0.1804,
"step": 985
},
{
"epoch": 2.8530259365994235,
"grad_norm": 0.4607682526111603,
"learning_rate": 1.4985590778097981e-05,
"loss": 0.174,
"step": 990
},
{
"epoch": 2.8530259365994235,
"eval_loss": 0.19698897004127502,
"eval_runtime": 1.7737,
"eval_samples_per_second": 87.387,
"eval_steps_per_second": 11.276,
"step": 990
},
{
"epoch": 2.867435158501441,
"grad_norm": 0.525158166885376,
"learning_rate": 1.3544668587896251e-05,
"loss": 0.1927,
"step": 995
},
{
"epoch": 2.881844380403458,
"grad_norm": 0.5077706575393677,
"learning_rate": 1.2103746397694523e-05,
"loss": 0.1794,
"step": 1000
},
{
"epoch": 2.881844380403458,
"eval_loss": 0.19700436294078827,
"eval_runtime": 1.8163,
"eval_samples_per_second": 85.339,
"eval_steps_per_second": 11.012,
"step": 1000
}
],
"logging_steps": 5,
"max_steps": 1041,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2719566413103104e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}