{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999973032011003, "eval_steps": 500, "global_step": 18540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026967988997060488, "grad_norm": 1.21619713306427, "learning_rate": 4.9911040721359096e-05, "loss": 0.6704, "step": 500 }, { "epoch": 0.053935977994120976, "grad_norm": 1.506378412246704, "learning_rate": 4.96433714360016e-05, "loss": 0.5236, "step": 1000 }, { "epoch": 0.08090396699118146, "grad_norm": 1.5647770166397095, "learning_rate": 4.919891096381639e-05, "loss": 0.5145, "step": 1500 }, { "epoch": 0.10787195598824195, "grad_norm": 0.6760592460632324, "learning_rate": 4.858084785774071e-05, "loss": 0.5134, "step": 2000 }, { "epoch": 0.13483994498530244, "grad_norm": 0.8161666989326477, "learning_rate": 4.779361609347228e-05, "loss": 0.5069, "step": 2500 }, { "epoch": 0.16180793398236293, "grad_norm": 0.7477109432220459, "learning_rate": 4.684492361710262e-05, "loss": 0.4985, "step": 3000 }, { "epoch": 0.18877592297942342, "grad_norm": 0.9579030275344849, "learning_rate": 4.5737776239310215e-05, "loss": 0.5048, "step": 3500 }, { "epoch": 0.2157439119764839, "grad_norm": 1.604243278503418, "learning_rate": 4.4481856357599805e-05, "loss": 0.4911, "step": 4000 }, { "epoch": 0.2427119009735444, "grad_norm": 0.525109052658081, "learning_rate": 4.3086173922863254e-05, "loss": 0.4954, "step": 4500 }, { "epoch": 0.2696798899706049, "grad_norm": 0.776738703250885, "learning_rate": 4.1560741540506945e-05, "loss": 0.491, "step": 5000 }, { "epoch": 0.29664787896766537, "grad_norm": 0.5928918123245239, "learning_rate": 3.9916502640166816e-05, "loss": 0.4927, "step": 5500 }, { "epoch": 0.32361586796472586, "grad_norm": 0.7824705243110657, "learning_rate": 3.816525296770396e-05, "loss": 0.4915, "step": 6000 }, { "epoch": 0.35058385696178634, "grad_norm": 0.6703963875770569, "learning_rate": 3.631955596269604e-05, "loss": 0.493, "step": 6500 }, { "epoch": 0.37755184595884683, "grad_norm": 0.8029218912124634, "learning_rate": 3.439265262850525e-05, "loss": 0.4891, "step": 7000 }, { "epoch": 0.4045198349559073, "grad_norm": 0.7873429656028748, "learning_rate": 3.2402412922624755e-05, "loss": 0.4893, "step": 7500 }, { "epoch": 0.4314878239529678, "grad_norm": 0.8047146201133728, "learning_rate": 3.035514266481141e-05, "loss": 0.4891, "step": 8000 }, { "epoch": 0.4584558129500283, "grad_norm": 0.7394176125526428, "learning_rate": 2.8269454691719026e-05, "loss": 0.4864, "step": 8500 }, { "epoch": 0.4854238019470888, "grad_norm": 0.6094326972961426, "learning_rate": 2.6160311698410382e-05, "loss": 0.4865, "step": 9000 }, { "epoch": 0.5123917909441493, "grad_norm": 1.3134502172470093, "learning_rate": 2.4042844645920752e-05, "loss": 0.4861, "step": 9500 }, { "epoch": 0.5393597799412098, "grad_norm": 1.6002802848815918, "learning_rate": 2.1932244211964456e-05, "loss": 0.4822, "step": 10000 }, { "epoch": 0.5663277689382703, "grad_norm": 0.8714670538902283, "learning_rate": 1.984365181323471e-05, "loss": 0.4878, "step": 10500 }, { "epoch": 0.5932957579353307, "grad_norm": 0.9478123188018799, "learning_rate": 1.779610742769174e-05, "loss": 0.4827, "step": 11000 }, { "epoch": 0.6202637469323913, "grad_norm": 0.9522613286972046, "learning_rate": 1.5800037269566696e-05, "loss": 0.4822, "step": 11500 }, { "epoch": 0.6472317359294517, "grad_norm": 0.8568246364593506, "learning_rate": 1.386591085629102e-05, "loss": 0.4808, "step": 12000 }, { "epoch": 0.6741997249265123, "grad_norm": 0.8014527559280396, "learning_rate": 1.201166023594709e-05, "loss": 0.4841, "step": 12500 }, { "epoch": 0.7011677139235727, "grad_norm": 0.7167889475822449, "learning_rate": 1.0250587775408596e-05, "loss": 0.4817, "step": 13000 }, { "epoch": 0.7281357029206332, "grad_norm": 0.9786660671234131, "learning_rate": 8.595327382791429e-06, "loss": 0.4762, "step": 13500 }, { "epoch": 0.7551036919176937, "grad_norm": 0.7599090337753296, "learning_rate": 7.05775387198132e-06, "loss": 0.4837, "step": 14000 }, { "epoch": 0.7820716809147542, "grad_norm": 0.6934608221054077, "learning_rate": 5.648897772892467e-06, "loss": 0.4765, "step": 14500 }, { "epoch": 0.8090396699118146, "grad_norm": 1.7073050737380981, "learning_rate": 4.378866198606929e-06, "loss": 0.4769, "step": 15000 }, { "epoch": 0.8360076589088752, "grad_norm": 1.0103133916854858, "learning_rate": 3.256770337093046e-06, "loss": 0.4779, "step": 15500 }, { "epoch": 0.8629756479059356, "grad_norm": 0.8780825138092041, "learning_rate": 2.2906600876759358e-06, "loss": 0.4783, "step": 16000 }, { "epoch": 0.8899436369029962, "grad_norm": 0.7215288877487183, "learning_rate": 1.4874663111773158e-06, "loss": 0.4784, "step": 16500 }, { "epoch": 0.9169116259000566, "grad_norm": 0.8022609353065491, "learning_rate": 8.529511080211772e-07, "loss": 0.4804, "step": 17000 }, { "epoch": 0.9438796148971171, "grad_norm": 0.8722068071365356, "learning_rate": 3.9166648100946724e-07, "loss": 0.4794, "step": 17500 }, { "epoch": 0.9708476038941776, "grad_norm": 0.787874162197113, "learning_rate": 1.0692167932047637e-07, "loss": 0.4803, "step": 18000 }, { "epoch": 0.9978155928912381, "grad_norm": 0.9449836611747742, "learning_rate": 7.928366524107e-10, "loss": 0.4768, "step": 18500 }, { "epoch": 0.999973032011003, "step": 18540, "total_flos": 3.0107423632254566e+17, "train_loss": 0.49351318332626853, "train_runtime": 10689.6882, "train_samples_per_second": 6.938, "train_steps_per_second": 1.734 } ], "logging_steps": 500, "max_steps": 18540, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0107423632254566e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }