{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.991394148020654, "eval_steps": 500, "global_step": 870, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03442340791738382, "grad_norm": 1101.9962158203125, "learning_rate": 2.2988505747126437e-06, "loss": 2.9358, "step": 10 }, { "epoch": 0.06884681583476764, "grad_norm": 112.010009765625, "learning_rate": 8.045977011494253e-06, "loss": 2.8836, "step": 20 }, { "epoch": 0.10327022375215146, "grad_norm": 70.39278411865234, "learning_rate": 1.3793103448275863e-05, "loss": 2.5828, "step": 30 }, { "epoch": 0.13769363166953527, "grad_norm": 10.689151763916016, "learning_rate": 1.9540229885057475e-05, "loss": 2.0904, "step": 40 }, { "epoch": 0.1721170395869191, "grad_norm": 21.27436637878418, "learning_rate": 2.5287356321839083e-05, "loss": 1.0867, "step": 50 }, { "epoch": 0.20654044750430292, "grad_norm": 191.7716064453125, "learning_rate": 3.103448275862069e-05, "loss": 0.4477, "step": 60 }, { "epoch": 0.24096385542168675, "grad_norm": 11.0482816696167, "learning_rate": 3.67816091954023e-05, "loss": 0.5028, "step": 70 }, { "epoch": 0.27538726333907054, "grad_norm": 2.7239110469818115, "learning_rate": 4.1379310344827587e-05, "loss": 0.3772, "step": 80 }, { "epoch": 0.3098106712564544, "grad_norm": 1.2920331954956055, "learning_rate": 4.7126436781609195e-05, "loss": 0.3437, "step": 90 }, { "epoch": 0.3442340791738382, "grad_norm": 0.8523675799369812, "learning_rate": 4.999496949392727e-05, "loss": 0.3507, "step": 100 }, { "epoch": 0.37865748709122204, "grad_norm": 0.7762532234191895, "learning_rate": 4.9954737591406555e-05, "loss": 0.2585, "step": 110 }, { "epoch": 0.41308089500860584, "grad_norm": 0.6530282497406006, "learning_rate": 4.9874338543634794e-05, "loss": 0.2455, "step": 120 }, { "epoch": 0.4475043029259897, "grad_norm": 1.0387413501739502, "learning_rate": 4.975390176091809e-05, "loss": 0.24, "step": 130 }, { "epoch": 0.4819277108433735, "grad_norm": 0.45220646262168884, "learning_rate": 4.959362109830007e-05, "loss": 0.2177, "step": 140 }, { "epoch": 0.5163511187607573, "grad_norm": 0.3468196392059326, "learning_rate": 4.9393754543532886e-05, "loss": 0.2083, "step": 150 }, { "epoch": 0.5507745266781411, "grad_norm": 1.368167519569397, "learning_rate": 4.915462380182008e-05, "loss": 0.2119, "step": 160 }, { "epoch": 0.5851979345955249, "grad_norm": 0.3761076331138611, "learning_rate": 4.887661377799989e-05, "loss": 0.1688, "step": 170 }, { "epoch": 0.6196213425129088, "grad_norm": 0.5200079083442688, "learning_rate": 4.856017195700247e-05, "loss": 0.2084, "step": 180 }, { "epoch": 0.6540447504302926, "grad_norm": 0.7359570264816284, "learning_rate": 4.820580768357815e-05, "loss": 0.2355, "step": 190 }, { "epoch": 0.6884681583476764, "grad_norm": 0.6369001865386963, "learning_rate": 4.781409134245608e-05, "loss": 0.1958, "step": 200 }, { "epoch": 0.7228915662650602, "grad_norm": 0.6961612105369568, "learning_rate": 4.7385653440253026e-05, "loss": 0.222, "step": 210 }, { "epoch": 0.7573149741824441, "grad_norm": 0.2700839638710022, "learning_rate": 4.692118359060992e-05, "loss": 0.1754, "step": 220 }, { "epoch": 0.7917383820998278, "grad_norm": 0.5857542157173157, "learning_rate": 4.642142940418973e-05, "loss": 0.1893, "step": 230 }, { "epoch": 0.8261617900172117, "grad_norm": 1.13412344455719, "learning_rate": 4.588719528532342e-05, "loss": 0.1897, "step": 240 }, { "epoch": 0.8605851979345955, "grad_norm": 0.42694586515426636, "learning_rate": 4.5319341137240626e-05, "loss": 0.1583, "step": 250 }, { "epoch": 0.8950086058519794, "grad_norm": 0.8095172047615051, "learning_rate": 4.471878097796958e-05, "loss": 0.1879, "step": 260 }, { "epoch": 0.9294320137693631, "grad_norm": 0.5348218679428101, "learning_rate": 4.408648146913357e-05, "loss": 0.1788, "step": 270 }, { "epoch": 0.963855421686747, "grad_norm": 0.42760810256004333, "learning_rate": 4.3423460360012495e-05, "loss": 0.1745, "step": 280 }, { "epoch": 0.9982788296041308, "grad_norm": 0.5095299482345581, "learning_rate": 4.2730784849373615e-05, "loss": 0.2198, "step": 290 }, { "epoch": 1.0309810671256454, "grad_norm": 0.24384614825248718, "learning_rate": 4.20095698677084e-05, "loss": 0.1812, "step": 300 }, { "epoch": 1.0654044750430294, "grad_norm": 0.40465590357780457, "learning_rate": 4.1260976282640466e-05, "loss": 0.1707, "step": 310 }, { "epoch": 1.0998278829604131, "grad_norm": 0.2638281285762787, "learning_rate": 4.048620903039308e-05, "loss": 0.1462, "step": 320 }, { "epoch": 1.1342512908777969, "grad_norm": 0.2884138524532318, "learning_rate": 3.968651517632376e-05, "loss": 0.1684, "step": 330 }, { "epoch": 1.1686746987951806, "grad_norm": 0.2671195864677429, "learning_rate": 3.886318190764797e-05, "loss": 0.1522, "step": 340 }, { "epoch": 1.2030981067125646, "grad_norm": 0.40242624282836914, "learning_rate": 3.801753446158256e-05, "loss": 0.1743, "step": 350 }, { "epoch": 1.2375215146299483, "grad_norm": 0.2599651515483856, "learning_rate": 3.715093399224398e-05, "loss": 0.1635, "step": 360 }, { "epoch": 1.2719449225473323, "grad_norm": 0.4389915466308594, "learning_rate": 3.626477537973462e-05, "loss": 0.2023, "step": 370 }, { "epoch": 1.306368330464716, "grad_norm": 0.6437344551086426, "learning_rate": 3.53604849849438e-05, "loss": 0.1639, "step": 380 }, { "epoch": 1.3407917383820998, "grad_norm": 0.2979753315448761, "learning_rate": 3.443951835367736e-05, "loss": 0.1639, "step": 390 }, { "epoch": 1.3752151462994837, "grad_norm": 1.0099725723266602, "learning_rate": 3.3503357873811105e-05, "loss": 0.1516, "step": 400 }, { "epoch": 1.4096385542168675, "grad_norm": 0.2964365780353546, "learning_rate": 3.2553510389239313e-05, "loss": 0.1872, "step": 410 }, { "epoch": 1.4440619621342514, "grad_norm": 0.32599982619285583, "learning_rate": 3.159150477445886e-05, "loss": 0.1705, "step": 420 }, { "epoch": 1.4784853700516352, "grad_norm": 0.46588459610939026, "learning_rate": 3.0618889473692756e-05, "loss": 0.1713, "step": 430 }, { "epoch": 1.512908777969019, "grad_norm": 0.4096398949623108, "learning_rate": 2.9637230008514494e-05, "loss": 0.1459, "step": 440 }, { "epoch": 1.5473321858864026, "grad_norm": 2.8078343868255615, "learning_rate": 2.864810645798442e-05, "loss": 0.166, "step": 450 }, { "epoch": 1.5817555938037866, "grad_norm": 0.25247690081596375, "learning_rate": 2.7653110915354542e-05, "loss": 0.1379, "step": 460 }, { "epoch": 1.6161790017211703, "grad_norm": 0.31816768646240234, "learning_rate": 2.6653844925435295e-05, "loss": 0.1596, "step": 470 }, { "epoch": 1.6506024096385543, "grad_norm": 0.29774752259254456, "learning_rate": 2.5651916906748995e-05, "loss": 0.1412, "step": 480 }, { "epoch": 1.685025817555938, "grad_norm": 0.4214092493057251, "learning_rate": 2.4648939562619423e-05, "loss": 0.1643, "step": 490 }, { "epoch": 1.7194492254733218, "grad_norm": 0.35638073086738586, "learning_rate": 2.3646527285364565e-05, "loss": 0.174, "step": 500 }, { "epoch": 1.7538726333907055, "grad_norm": 0.3502406179904938, "learning_rate": 2.2646293557770787e-05, "loss": 0.1862, "step": 510 }, { "epoch": 1.7882960413080895, "grad_norm": 0.2601950168609619, "learning_rate": 2.1649848356030945e-05, "loss": 0.1501, "step": 520 }, { "epoch": 1.8227194492254735, "grad_norm": 1.0533198118209839, "learning_rate": 2.0658795558326743e-05, "loss": 0.1461, "step": 530 }, { "epoch": 1.8571428571428572, "grad_norm": 0.36168837547302246, "learning_rate": 1.967473036322644e-05, "loss": 0.1522, "step": 540 }, { "epoch": 1.891566265060241, "grad_norm": 0.4589601457118988, "learning_rate": 1.869923672205322e-05, "loss": 0.1682, "step": 550 }, { "epoch": 1.9259896729776247, "grad_norm": 0.2802368700504303, "learning_rate": 1.7733884789357226e-05, "loss": 0.1657, "step": 560 }, { "epoch": 1.9604130808950087, "grad_norm": 0.4951450526714325, "learning_rate": 1.6780228395594777e-05, "loss": 0.1534, "step": 570 }, { "epoch": 1.9948364888123924, "grad_norm": 0.3530052602291107, "learning_rate": 1.58398025460829e-05, "loss": 0.1904, "step": 580 }, { "epoch": 2.027538726333907, "grad_norm": 0.32037100195884705, "learning_rate": 1.491412095025479e-05, "loss": 0.1301, "step": 590 }, { "epoch": 2.061962134251291, "grad_norm": 0.4082101285457611, "learning_rate": 1.4004673585193145e-05, "loss": 0.141, "step": 600 }, { "epoch": 2.0963855421686746, "grad_norm": 0.5846110582351685, "learning_rate": 1.3112924297363066e-05, "loss": 0.1167, "step": 610 }, { "epoch": 2.1308089500860588, "grad_norm": 0.3282460570335388, "learning_rate": 1.2240308446404797e-05, "loss": 0.136, "step": 620 }, { "epoch": 2.1652323580034425, "grad_norm": 0.3618537187576294, "learning_rate": 1.1388230594778888e-05, "loss": 0.1321, "step": 630 }, { "epoch": 2.1996557659208262, "grad_norm": 0.4230036437511444, "learning_rate": 1.0558062246982484e-05, "loss": 0.1474, "step": 640 }, { "epoch": 2.23407917383821, "grad_norm": 0.6187330484390259, "learning_rate": 9.751139641975682e-06, "loss": 0.1216, "step": 650 }, { "epoch": 2.2685025817555937, "grad_norm": 0.6022536754608154, "learning_rate": 8.96876160237133e-06, "loss": 0.1328, "step": 660 }, { "epoch": 2.3029259896729775, "grad_norm": 0.47852516174316406, "learning_rate": 8.212187443850203e-06, "loss": 0.1244, "step": 670 }, { "epoch": 2.337349397590361, "grad_norm": 0.8470497727394104, "learning_rate": 7.482634948166442e-06, "loss": 0.1135, "step": 680 }, { "epoch": 2.3717728055077454, "grad_norm": 0.5010228157043457, "learning_rate": 6.7812784030061e-06, "loss": 0.115, "step": 690 }, { "epoch": 2.406196213425129, "grad_norm": 0.3755123019218445, "learning_rate": 6.109246711853711e-06, "loss": 0.1305, "step": 700 }, { "epoch": 2.440619621342513, "grad_norm": 0.464460164308548, "learning_rate": 5.467621576909218e-06, "loss": 0.11, "step": 710 }, { "epoch": 2.4750430292598966, "grad_norm": 0.8106716871261597, "learning_rate": 4.8574357579801495e-06, "loss": 0.1152, "step": 720 }, { "epoch": 2.509466437177281, "grad_norm": 0.6007969975471497, "learning_rate": 4.2796714101514725e-06, "loss": 0.1486, "step": 730 }, { "epoch": 2.5438898450946645, "grad_norm": 0.4238184690475464, "learning_rate": 3.735258502908737e-06, "loss": 0.1256, "step": 740 }, { "epoch": 2.5783132530120483, "grad_norm": 0.6052628755569458, "learning_rate": 3.2250733232592337e-06, "loss": 0.1188, "step": 750 }, { "epoch": 2.612736660929432, "grad_norm": 0.41783905029296875, "learning_rate": 2.749937065260444e-06, "loss": 0.1155, "step": 760 }, { "epoch": 2.6471600688468158, "grad_norm": 0.47119778394699097, "learning_rate": 2.310614508226078e-06, "loss": 0.122, "step": 770 }, { "epoch": 2.6815834767641995, "grad_norm": 0.37864789366722107, "learning_rate": 1.907812785737312e-06, "loss": 0.1164, "step": 780 }, { "epoch": 2.7160068846815832, "grad_norm": 0.5162110924720764, "learning_rate": 1.5421802474406027e-06, "loss": 0.13, "step": 790 }, { "epoch": 2.7504302925989674, "grad_norm": 0.7263408303260803, "learning_rate": 1.214305415464076e-06, "loss": 0.1139, "step": 800 }, { "epoch": 2.784853700516351, "grad_norm": 0.46103435754776, "learning_rate": 9.247160371323449e-07, "loss": 0.1424, "step": 810 }, { "epoch": 2.819277108433735, "grad_norm": 0.4251267910003662, "learning_rate": 6.738782355044049e-07, "loss": 0.1089, "step": 820 }, { "epoch": 2.8537005163511187, "grad_norm": 0.3590181767940521, "learning_rate": 4.6219575910197156e-07, "loss": 0.1212, "step": 830 }, { "epoch": 2.888123924268503, "grad_norm": 0.30818095803260803, "learning_rate": 2.9000933203588153e-07, "loss": 0.1245, "step": 840 }, { "epoch": 2.9225473321858866, "grad_norm": 0.43673816323280334, "learning_rate": 1.575961055765379e-07, "loss": 0.1263, "step": 850 }, { "epoch": 2.9569707401032703, "grad_norm": 0.5171876549720764, "learning_rate": 6.516921205125537e-08, "loss": 0.1232, "step": 860 }, { "epoch": 2.991394148020654, "grad_norm": 0.31435626745224, "learning_rate": 1.2877421786458566e-08, "loss": 0.114, "step": 870 } ], "logging_steps": 10, "max_steps": 870, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3941567684085023e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }