qwen3-8B-sft-mix-v20250921_05 / trainer_state.json
rulins's picture
Upload folder using huggingface_hub
0ca93ec verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 315,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08,
"grad_norm": 7.553986895560367,
"learning_rate": 5e-06,
"loss": 1.5771,
"step": 5
},
{
"epoch": 0.16,
"grad_norm": 1.906740207205835,
"learning_rate": 1.125e-05,
"loss": 1.3506,
"step": 10
},
{
"epoch": 0.24,
"grad_norm": 0.8767819770258468,
"learning_rate": 1.7500000000000002e-05,
"loss": 1.1889,
"step": 15
},
{
"epoch": 0.32,
"grad_norm": 0.5446745909703954,
"learning_rate": 2.375e-05,
"loss": 1.0966,
"step": 20
},
{
"epoch": 0.4,
"grad_norm": 0.5140030398932292,
"learning_rate": 3.0000000000000004e-05,
"loss": 1.0404,
"step": 25
},
{
"epoch": 0.48,
"grad_norm": 0.4675005019349853,
"learning_rate": 3.625e-05,
"loss": 1.0091,
"step": 30
},
{
"epoch": 0.56,
"grad_norm": 0.44935151084519126,
"learning_rate": 3.9995070884147604e-05,
"loss": 0.9838,
"step": 35
},
{
"epoch": 0.64,
"grad_norm": 0.45169510024799914,
"learning_rate": 3.9939646229673775e-05,
"loss": 0.9736,
"step": 40
},
{
"epoch": 0.72,
"grad_norm": 0.42767814157802786,
"learning_rate": 3.98228068051382e-05,
"loss": 0.9652,
"step": 45
},
{
"epoch": 0.8,
"grad_norm": 0.4359248691134457,
"learning_rate": 3.964491247983392e-05,
"loss": 0.9755,
"step": 50
},
{
"epoch": 0.88,
"grad_norm": 0.4096569254948115,
"learning_rate": 3.940651117416824e-05,
"loss": 0.9617,
"step": 55
},
{
"epoch": 0.96,
"grad_norm": 0.39321526767386744,
"learning_rate": 3.9108337172049794e-05,
"loss": 0.9436,
"step": 60
},
{
"epoch": 1.032,
"grad_norm": 0.5454178809247112,
"learning_rate": 3.875130885926973e-05,
"loss": 0.8933,
"step": 65
},
{
"epoch": 1.112,
"grad_norm": 0.4889310546790741,
"learning_rate": 3.83365258948432e-05,
"loss": 0.8369,
"step": 70
},
{
"epoch": 1.192,
"grad_norm": 0.43597726561431077,
"learning_rate": 3.786526582402313e-05,
"loss": 0.8081,
"step": 75
},
{
"epoch": 1.272,
"grad_norm": 0.43639248467106284,
"learning_rate": 3.733898014341858e-05,
"loss": 0.8166,
"step": 80
},
{
"epoch": 1.3519999999999999,
"grad_norm": 0.3826009932791081,
"learning_rate": 3.6759289830337246e-05,
"loss": 0.8047,
"step": 85
},
{
"epoch": 1.432,
"grad_norm": 0.36002961067047146,
"learning_rate": 3.612798035012161e-05,
"loss": 0.7919,
"step": 90
},
{
"epoch": 1.512,
"grad_norm": 0.46361237648534737,
"learning_rate": 3.544699615685671e-05,
"loss": 0.7863,
"step": 95
},
{
"epoch": 1.592,
"grad_norm": 0.3866372632245251,
"learning_rate": 3.4718434704387174e-05,
"loss": 0.8034,
"step": 100
},
{
"epoch": 1.6720000000000002,
"grad_norm": 0.3878544531373144,
"learning_rate": 3.394453998609001e-05,
"loss": 0.777,
"step": 105
},
{
"epoch": 1.752,
"grad_norm": 0.43357660375765866,
"learning_rate": 3.312769562330075e-05,
"loss": 0.796,
"step": 110
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.38664388005160477,
"learning_rate": 3.227041752368091e-05,
"loss": 0.7816,
"step": 115
},
{
"epoch": 1.912,
"grad_norm": 0.3823673140467302,
"learning_rate": 3.1375346132139135e-05,
"loss": 0.7766,
"step": 120
},
{
"epoch": 1.992,
"grad_norm": 0.37828406727880126,
"learning_rate": 3.0445238298173492e-05,
"loss": 0.7966,
"step": 125
},
{
"epoch": 2.064,
"grad_norm": 0.6875770696894508,
"learning_rate": 2.9482958784683883e-05,
"loss": 0.6479,
"step": 130
},
{
"epoch": 2.144,
"grad_norm": 0.5536819048526282,
"learning_rate": 2.849147144440747e-05,
"loss": 0.6117,
"step": 135
},
{
"epoch": 2.224,
"grad_norm": 0.5248020590041124,
"learning_rate": 2.7473830091154243e-05,
"loss": 0.6098,
"step": 140
},
{
"epoch": 2.304,
"grad_norm": 0.4504880059072583,
"learning_rate": 2.6433169093959405e-05,
"loss": 0.6007,
"step": 145
},
{
"epoch": 2.384,
"grad_norm": 0.42667978139991347,
"learning_rate": 2.5372693723123075e-05,
"loss": 0.6083,
"step": 150
},
{
"epoch": 2.464,
"grad_norm": 0.432582533447793,
"learning_rate": 2.4295670277871736e-05,
"loss": 0.6029,
"step": 155
},
{
"epoch": 2.544,
"grad_norm": 0.4098794172365012,
"learning_rate": 2.320541602604851e-05,
"loss": 0.5893,
"step": 160
},
{
"epoch": 2.624,
"grad_norm": 0.4308341436869616,
"learning_rate": 2.210528898681851e-05,
"loss": 0.5806,
"step": 165
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.43820711621775427,
"learning_rate": 2.099867758785866e-05,
"loss": 0.6151,
"step": 170
},
{
"epoch": 2.784,
"grad_norm": 0.41576971172263233,
"learning_rate": 1.988899022888841e-05,
"loss": 0.601,
"step": 175
},
{
"epoch": 2.864,
"grad_norm": 0.406055148224806,
"learning_rate": 1.877964478368577e-05,
"loss": 0.6103,
"step": 180
},
{
"epoch": 2.944,
"grad_norm": 0.40196819912431503,
"learning_rate": 1.7674058072923075e-05,
"loss": 0.6014,
"step": 185
},
{
"epoch": 3.016,
"grad_norm": 0.7427654443430163,
"learning_rate": 1.6575635340246203e-05,
"loss": 0.5633,
"step": 190
},
{
"epoch": 3.096,
"grad_norm": 0.6302486308462233,
"learning_rate": 1.548775976401152e-05,
"loss": 0.4498,
"step": 195
},
{
"epoch": 3.176,
"grad_norm": 0.6453164328082227,
"learning_rate": 1.4413782036984616e-05,
"loss": 0.4413,
"step": 200
},
{
"epoch": 3.2560000000000002,
"grad_norm": 0.4777050134578628,
"learning_rate": 1.3357010046095741e-05,
"loss": 0.4268,
"step": 205
},
{
"epoch": 3.336,
"grad_norm": 0.45826237793958197,
"learning_rate": 1.2320698684038599e-05,
"loss": 0.4376,
"step": 210
},
{
"epoch": 3.416,
"grad_norm": 0.5034497132926059,
"learning_rate": 1.1308039824093197e-05,
"loss": 0.4402,
"step": 215
},
{
"epoch": 3.496,
"grad_norm": 0.41252483370606546,
"learning_rate": 1.0322152489050508e-05,
"loss": 0.4294,
"step": 220
},
{
"epoch": 3.576,
"grad_norm": 0.4695635107153035,
"learning_rate": 9.366073244519124e-06,
"loss": 0.4348,
"step": 225
},
{
"epoch": 3.656,
"grad_norm": 0.4709707441227059,
"learning_rate": 8.442746846202711e-06,
"loss": 0.437,
"step": 230
},
{
"epoch": 3.7359999999999998,
"grad_norm": 0.6313760297797569,
"learning_rate": 7.5550171699549945e-06,
"loss": 0.4356,
"step": 235
},
{
"epoch": 3.816,
"grad_norm": 0.4346437262602198,
"learning_rate": 6.705618452548057e-06,
"loss": 0.4368,
"step": 240
},
{
"epoch": 3.896,
"grad_norm": 0.42927897911166574,
"learning_rate": 5.897166870132658e-06,
"loss": 0.4295,
"step": 245
},
{
"epoch": 3.976,
"grad_norm": 0.39740342880248314,
"learning_rate": 5.132152480329072e-06,
"loss": 0.4439,
"step": 250
},
{
"epoch": 4.048,
"grad_norm": 0.6330823851675211,
"learning_rate": 4.412931552767295e-06,
"loss": 0.3693,
"step": 255
},
{
"epoch": 4.128,
"grad_norm": 0.7298997105017592,
"learning_rate": 3.741719311698608e-06,
"loss": 0.3449,
"step": 260
},
{
"epoch": 4.208,
"grad_norm": 0.4908230958897118,
"learning_rate": 3.120583113031579e-06,
"loss": 0.3457,
"step": 265
},
{
"epoch": 4.288,
"grad_norm": 0.4620570668309974,
"learning_rate": 2.551436076807501e-06,
"loss": 0.3326,
"step": 270
},
{
"epoch": 4.368,
"grad_norm": 0.44029177637407785,
"learning_rate": 2.036031194727346e-06,
"loss": 0.3267,
"step": 275
},
{
"epoch": 4.448,
"grad_norm": 0.4285706823245102,
"learning_rate": 1.5759559308793448e-06,
"loss": 0.3433,
"step": 280
},
{
"epoch": 4.5280000000000005,
"grad_norm": 0.44416591740232586,
"learning_rate": 1.172627332297076e-06,
"loss": 0.3388,
"step": 285
},
{
"epoch": 4.608,
"grad_norm": 0.4088428558019627,
"learning_rate": 8.272876644077188e-07,
"loss": 0.33,
"step": 290
},
{
"epoch": 4.688,
"grad_norm": 0.4102958065555951,
"learning_rate": 5.410005848134315e-07,
"loss": 0.3273,
"step": 295
},
{
"epoch": 4.768,
"grad_norm": 0.4530496321154963,
"learning_rate": 3.1464786719075825e-07,
"loss": 0.3376,
"step": 300
},
{
"epoch": 4.848,
"grad_norm": 0.4054203925196989,
"learning_rate": 1.4892668539853606e-07,
"loss": 0.3341,
"step": 305
},
{
"epoch": 4.928,
"grad_norm": 0.39471491526358593,
"learning_rate": 4.434746615932018e-08,
"loss": 0.3297,
"step": 310
},
{
"epoch": 5.0,
"grad_norm": 0.3906649028386399,
"learning_rate": 1.2323169282257852e-09,
"loss": 0.3311,
"step": 315
},
{
"epoch": 5.0,
"step": 315,
"total_flos": 616445810049024.0,
"train_loss": 0.650969924624004,
"train_runtime": 29756.3874,
"train_samples_per_second": 1.344,
"train_steps_per_second": 0.011
}
],
"logging_steps": 5,
"max_steps": 315,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 616445810049024.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}