{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.991394148020654,
  "eval_steps": 500,
  "global_step": 870,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03442340791738382,
      "grad_norm": 1101.9962158203125,
      "learning_rate": 2.2988505747126437e-06,
      "loss": 2.9358,
      "step": 10
    },
    {
      "epoch": 0.06884681583476764,
      "grad_norm": 112.010009765625,
      "learning_rate": 8.045977011494253e-06,
      "loss": 2.8836,
      "step": 20
    },
    {
      "epoch": 0.10327022375215146,
      "grad_norm": 70.39278411865234,
      "learning_rate": 1.3793103448275863e-05,
      "loss": 2.5828,
      "step": 30
    },
    {
      "epoch": 0.13769363166953527,
      "grad_norm": 10.689151763916016,
      "learning_rate": 1.9540229885057475e-05,
      "loss": 2.0904,
      "step": 40
    },
    {
      "epoch": 0.1721170395869191,
      "grad_norm": 21.27436637878418,
      "learning_rate": 2.5287356321839083e-05,
      "loss": 1.0867,
      "step": 50
    },
    {
      "epoch": 0.20654044750430292,
      "grad_norm": 191.7716064453125,
      "learning_rate": 3.103448275862069e-05,
      "loss": 0.4477,
      "step": 60
    },
    {
      "epoch": 0.24096385542168675,
      "grad_norm": 11.0482816696167,
      "learning_rate": 3.67816091954023e-05,
      "loss": 0.5028,
      "step": 70
    },
    {
      "epoch": 0.27538726333907054,
      "grad_norm": 2.7239110469818115,
      "learning_rate": 4.1379310344827587e-05,
      "loss": 0.3772,
      "step": 80
    },
    {
      "epoch": 0.3098106712564544,
      "grad_norm": 1.2920331954956055,
      "learning_rate": 4.7126436781609195e-05,
      "loss": 0.3437,
      "step": 90
    },
    {
      "epoch": 0.3442340791738382,
      "grad_norm": 0.8523675799369812,
      "learning_rate": 4.999496949392727e-05,
      "loss": 0.3507,
      "step": 100
    },
    {
      "epoch": 0.37865748709122204,
      "grad_norm": 0.7762532234191895,
      "learning_rate": 4.9954737591406555e-05,
      "loss": 0.2585,
      "step": 110
    },
    {
      "epoch": 0.41308089500860584,
      "grad_norm": 0.6530282497406006,
      "learning_rate": 4.9874338543634794e-05,
      "loss": 0.2455,
      "step": 120
    },
    {
      "epoch": 0.4475043029259897,
      "grad_norm": 1.0387413501739502,
      "learning_rate": 4.975390176091809e-05,
      "loss": 0.24,
      "step": 130
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.45220646262168884,
      "learning_rate": 4.959362109830007e-05,
      "loss": 0.2177,
      "step": 140
    },
    {
      "epoch": 0.5163511187607573,
      "grad_norm": 0.3468196392059326,
      "learning_rate": 4.9393754543532886e-05,
      "loss": 0.2083,
      "step": 150
    },
    {
      "epoch": 0.5507745266781411,
      "grad_norm": 1.368167519569397,
      "learning_rate": 4.915462380182008e-05,
      "loss": 0.2119,
      "step": 160
    },
    {
      "epoch": 0.5851979345955249,
      "grad_norm": 0.3761076331138611,
      "learning_rate": 4.887661377799989e-05,
      "loss": 0.1688,
      "step": 170
    },
    {
      "epoch": 0.6196213425129088,
      "grad_norm": 0.5200079083442688,
      "learning_rate": 4.856017195700247e-05,
      "loss": 0.2084,
      "step": 180
    },
    {
      "epoch": 0.6540447504302926,
      "grad_norm": 0.7359570264816284,
      "learning_rate": 4.820580768357815e-05,
      "loss": 0.2355,
      "step": 190
    },
    {
      "epoch": 0.6884681583476764,
      "grad_norm": 0.6369001865386963,
      "learning_rate": 4.781409134245608e-05,
      "loss": 0.1958,
      "step": 200
    },
    {
      "epoch": 0.7228915662650602,
      "grad_norm": 0.6961612105369568,
      "learning_rate": 4.7385653440253026e-05,
      "loss": 0.222,
      "step": 210
    },
    {
      "epoch": 0.7573149741824441,
      "grad_norm": 0.2700839638710022,
      "learning_rate": 4.692118359060992e-05,
      "loss": 0.1754,
      "step": 220
    },
    {
      "epoch": 0.7917383820998278,
      "grad_norm": 0.5857542157173157,
      "learning_rate": 4.642142940418973e-05,
      "loss": 0.1893,
      "step": 230
    },
    {
      "epoch": 0.8261617900172117,
      "grad_norm": 1.13412344455719,
      "learning_rate": 4.588719528532342e-05,
      "loss": 0.1897,
      "step": 240
    },
    {
      "epoch": 0.8605851979345955,
      "grad_norm": 0.42694586515426636,
      "learning_rate": 4.5319341137240626e-05,
      "loss": 0.1583,
      "step": 250
    },
    {
      "epoch": 0.8950086058519794,
      "grad_norm": 0.8095172047615051,
      "learning_rate": 4.471878097796958e-05,
      "loss": 0.1879,
      "step": 260
    },
    {
      "epoch": 0.9294320137693631,
      "grad_norm": 0.5348218679428101,
      "learning_rate": 4.408648146913357e-05,
      "loss": 0.1788,
      "step": 270
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 0.42760810256004333,
      "learning_rate": 4.3423460360012495e-05,
      "loss": 0.1745,
      "step": 280
    },
    {
      "epoch": 0.9982788296041308,
      "grad_norm": 0.5095299482345581,
      "learning_rate": 4.2730784849373615e-05,
      "loss": 0.2198,
      "step": 290
    },
    {
      "epoch": 1.0309810671256454,
      "grad_norm": 0.24384614825248718,
      "learning_rate": 4.20095698677084e-05,
      "loss": 0.1812,
      "step": 300
    },
    {
      "epoch": 1.0654044750430294,
      "grad_norm": 0.40465590357780457,
      "learning_rate": 4.1260976282640466e-05,
      "loss": 0.1707,
      "step": 310
    },
    {
      "epoch": 1.0998278829604131,
      "grad_norm": 0.2638281285762787,
      "learning_rate": 4.048620903039308e-05,
      "loss": 0.1462,
      "step": 320
    },
    {
      "epoch": 1.1342512908777969,
      "grad_norm": 0.2884138524532318,
      "learning_rate": 3.968651517632376e-05,
      "loss": 0.1684,
      "step": 330
    },
    {
      "epoch": 1.1686746987951806,
      "grad_norm": 0.2671195864677429,
      "learning_rate": 3.886318190764797e-05,
      "loss": 0.1522,
      "step": 340
    },
    {
      "epoch": 1.2030981067125646,
      "grad_norm": 0.40242624282836914,
      "learning_rate": 3.801753446158256e-05,
      "loss": 0.1743,
      "step": 350
    },
    {
      "epoch": 1.2375215146299483,
      "grad_norm": 0.2599651515483856,
      "learning_rate": 3.715093399224398e-05,
      "loss": 0.1635,
      "step": 360
    },
    {
      "epoch": 1.2719449225473323,
      "grad_norm": 0.4389915466308594,
      "learning_rate": 3.626477537973462e-05,
      "loss": 0.2023,
      "step": 370
    },
    {
      "epoch": 1.306368330464716,
      "grad_norm": 0.6437344551086426,
      "learning_rate": 3.53604849849438e-05,
      "loss": 0.1639,
      "step": 380
    },
    {
      "epoch": 1.3407917383820998,
      "grad_norm": 0.2979753315448761,
      "learning_rate": 3.443951835367736e-05,
      "loss": 0.1639,
      "step": 390
    },
    {
      "epoch": 1.3752151462994837,
      "grad_norm": 1.0099725723266602,
      "learning_rate": 3.3503357873811105e-05,
      "loss": 0.1516,
      "step": 400
    },
    {
      "epoch": 1.4096385542168675,
      "grad_norm": 0.2964365780353546,
      "learning_rate": 3.2553510389239313e-05,
      "loss": 0.1872,
      "step": 410
    },
    {
      "epoch": 1.4440619621342514,
      "grad_norm": 0.32599982619285583,
      "learning_rate": 3.159150477445886e-05,
      "loss": 0.1705,
      "step": 420
    },
    {
      "epoch": 1.4784853700516352,
      "grad_norm": 0.46588459610939026,
      "learning_rate": 3.0618889473692756e-05,
      "loss": 0.1713,
      "step": 430
    },
    {
      "epoch": 1.512908777969019,
      "grad_norm": 0.4096398949623108,
      "learning_rate": 2.9637230008514494e-05,
      "loss": 0.1459,
      "step": 440
    },
    {
      "epoch": 1.5473321858864026,
      "grad_norm": 2.8078343868255615,
      "learning_rate": 2.864810645798442e-05,
      "loss": 0.166,
      "step": 450
    },
    {
      "epoch": 1.5817555938037866,
      "grad_norm": 0.25247690081596375,
      "learning_rate": 2.7653110915354542e-05,
      "loss": 0.1379,
      "step": 460
    },
    {
      "epoch": 1.6161790017211703,
      "grad_norm": 0.31816768646240234,
      "learning_rate": 2.6653844925435295e-05,
      "loss": 0.1596,
      "step": 470
    },
    {
      "epoch": 1.6506024096385543,
      "grad_norm": 0.29774752259254456,
      "learning_rate": 2.5651916906748995e-05,
      "loss": 0.1412,
      "step": 480
    },
    {
      "epoch": 1.685025817555938,
      "grad_norm": 0.4214092493057251,
      "learning_rate": 2.4648939562619423e-05,
      "loss": 0.1643,
      "step": 490
    },
    {
      "epoch": 1.7194492254733218,
      "grad_norm": 0.35638073086738586,
      "learning_rate": 2.3646527285364565e-05,
      "loss": 0.174,
      "step": 500
    },
    {
      "epoch": 1.7538726333907055,
      "grad_norm": 0.3502406179904938,
      "learning_rate": 2.2646293557770787e-05,
      "loss": 0.1862,
      "step": 510
    },
    {
      "epoch": 1.7882960413080895,
      "grad_norm": 0.2601950168609619,
      "learning_rate": 2.1649848356030945e-05,
      "loss": 0.1501,
      "step": 520
    },
    {
      "epoch": 1.8227194492254735,
      "grad_norm": 1.0533198118209839,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 0.1461,
      "step": 530
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.36168837547302246,
      "learning_rate": 1.967473036322644e-05,
      "loss": 0.1522,
      "step": 540
    },
    {
      "epoch": 1.891566265060241,
      "grad_norm": 0.4589601457118988,
      "learning_rate": 1.869923672205322e-05,
      "loss": 0.1682,
      "step": 550
    },
    {
      "epoch": 1.9259896729776247,
      "grad_norm": 0.2802368700504303,
      "learning_rate": 1.7733884789357226e-05,
      "loss": 0.1657,
      "step": 560
    },
    {
      "epoch": 1.9604130808950087,
      "grad_norm": 0.4951450526714325,
      "learning_rate": 1.6780228395594777e-05,
      "loss": 0.1534,
      "step": 570
    },
    {
      "epoch": 1.9948364888123924,
      "grad_norm": 0.3530052602291107,
      "learning_rate": 1.58398025460829e-05,
      "loss": 0.1904,
      "step": 580
    },
    {
      "epoch": 2.027538726333907,
      "grad_norm": 0.32037100195884705,
      "learning_rate": 1.491412095025479e-05,
      "loss": 0.1301,
      "step": 590
    },
    {
      "epoch": 2.061962134251291,
      "grad_norm": 0.4082101285457611,
      "learning_rate": 1.4004673585193145e-05,
      "loss": 0.141,
      "step": 600
    },
    {
      "epoch": 2.0963855421686746,
      "grad_norm": 0.5846110582351685,
      "learning_rate": 1.3112924297363066e-05,
      "loss": 0.1167,
      "step": 610
    },
    {
      "epoch": 2.1308089500860588,
      "grad_norm": 0.3282460570335388,
      "learning_rate": 1.2240308446404797e-05,
      "loss": 0.136,
      "step": 620
    },
    {
      "epoch": 2.1652323580034425,
      "grad_norm": 0.3618537187576294,
      "learning_rate": 1.1388230594778888e-05,
      "loss": 0.1321,
      "step": 630
    },
    {
      "epoch": 2.1996557659208262,
      "grad_norm": 0.4230036437511444,
      "learning_rate": 1.0558062246982484e-05,
      "loss": 0.1474,
      "step": 640
    },
    {
      "epoch": 2.23407917383821,
      "grad_norm": 0.6187330484390259,
      "learning_rate": 9.751139641975682e-06,
      "loss": 0.1216,
      "step": 650
    },
    {
      "epoch": 2.2685025817555937,
      "grad_norm": 0.6022536754608154,
      "learning_rate": 8.96876160237133e-06,
      "loss": 0.1328,
      "step": 660
    },
    {
      "epoch": 2.3029259896729775,
      "grad_norm": 0.47852516174316406,
      "learning_rate": 8.212187443850203e-06,
      "loss": 0.1244,
      "step": 670
    },
    {
      "epoch": 2.337349397590361,
      "grad_norm": 0.8470497727394104,
      "learning_rate": 7.482634948166442e-06,
      "loss": 0.1135,
      "step": 680
    },
    {
      "epoch": 2.3717728055077454,
      "grad_norm": 0.5010228157043457,
      "learning_rate": 6.7812784030061e-06,
      "loss": 0.115,
      "step": 690
    },
    {
      "epoch": 2.406196213425129,
      "grad_norm": 0.3755123019218445,
      "learning_rate": 6.109246711853711e-06,
      "loss": 0.1305,
      "step": 700
    },
    {
      "epoch": 2.440619621342513,
      "grad_norm": 0.464460164308548,
      "learning_rate": 5.467621576909218e-06,
      "loss": 0.11,
      "step": 710
    },
    {
      "epoch": 2.4750430292598966,
      "grad_norm": 0.8106716871261597,
      "learning_rate": 4.8574357579801495e-06,
      "loss": 0.1152,
      "step": 720
    },
    {
      "epoch": 2.509466437177281,
      "grad_norm": 0.6007969975471497,
      "learning_rate": 4.2796714101514725e-06,
      "loss": 0.1486,
      "step": 730
    },
    {
      "epoch": 2.5438898450946645,
      "grad_norm": 0.4238184690475464,
      "learning_rate": 3.735258502908737e-06,
      "loss": 0.1256,
      "step": 740
    },
    {
      "epoch": 2.5783132530120483,
      "grad_norm": 0.6052628755569458,
      "learning_rate": 3.2250733232592337e-06,
      "loss": 0.1188,
      "step": 750
    },
    {
      "epoch": 2.612736660929432,
      "grad_norm": 0.41783905029296875,
      "learning_rate": 2.749937065260444e-06,
      "loss": 0.1155,
      "step": 760
    },
    {
      "epoch": 2.6471600688468158,
      "grad_norm": 0.47119778394699097,
      "learning_rate": 2.310614508226078e-06,
      "loss": 0.122,
      "step": 770
    },
    {
      "epoch": 2.6815834767641995,
      "grad_norm": 0.37864789366722107,
      "learning_rate": 1.907812785737312e-06,
      "loss": 0.1164,
      "step": 780
    },
    {
      "epoch": 2.7160068846815832,
      "grad_norm": 0.5162110924720764,
      "learning_rate": 1.5421802474406027e-06,
      "loss": 0.13,
      "step": 790
    },
    {
      "epoch": 2.7504302925989674,
      "grad_norm": 0.7263408303260803,
      "learning_rate": 1.214305415464076e-06,
      "loss": 0.1139,
      "step": 800
    },
    {
      "epoch": 2.784853700516351,
      "grad_norm": 0.46103435754776,
      "learning_rate": 9.247160371323449e-07,
      "loss": 0.1424,
      "step": 810
    },
    {
      "epoch": 2.819277108433735,
      "grad_norm": 0.4251267910003662,
      "learning_rate": 6.738782355044049e-07,
      "loss": 0.1089,
      "step": 820
    },
    {
      "epoch": 2.8537005163511187,
      "grad_norm": 0.3590181767940521,
      "learning_rate": 4.6219575910197156e-07,
      "loss": 0.1212,
      "step": 830
    },
    {
      "epoch": 2.888123924268503,
      "grad_norm": 0.30818095803260803,
      "learning_rate": 2.9000933203588153e-07,
      "loss": 0.1245,
      "step": 840
    },
    {
      "epoch": 2.9225473321858866,
      "grad_norm": 0.43673816323280334,
      "learning_rate": 1.575961055765379e-07,
      "loss": 0.1263,
      "step": 850
    },
    {
      "epoch": 2.9569707401032703,
      "grad_norm": 0.5171876549720764,
      "learning_rate": 6.516921205125537e-08,
      "loss": 0.1232,
      "step": 860
    },
    {
      "epoch": 2.991394148020654,
      "grad_norm": 0.31435626745224,
      "learning_rate": 1.2877421786458566e-08,
      "loss": 0.114,
      "step": 870
    }
  ],
  "logging_steps": 10,
  "max_steps": 870,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3941567684085023e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
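
The JSON above is the `trainer_state.json` that the Hugging Face `transformers` Trainer writes alongside a checkpoint; `log_history` holds one entry per logging step (every 10 steps here). A minimal sketch for inspecting it, assuming it is saved as `trainer_state.json` (the filename is an assumption, not part of the file itself):

```python
import json

# Load the trainer state dumped above (path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]  # one dict per logging step: epoch, grad_norm, learning_rate, loss, step
print(f"steps: {state['global_step']}, epochs: {state['epoch']:.2f}")
print(f"first logged loss: {history[0]['loss']}")   # 2.9358 at step 10
print(f"last logged loss:  {history[-1]['loss']}")  # 0.114 at step 870
print(f"min logged loss:   {min(e['loss'] for e in history)}")  # 0.1089 at step 820
```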