diff --git "a/Office_ckpt/trainer_state.json" "b/Office_ckpt/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Office_ckpt/trainer_state.json" @@ -0,0 +1,15652 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6006944444444444, + "eval_steps": 346, + "global_step": 1038, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "categorical_diversity": 1.0, + "completion_length": 5.07421875, + "epoch": 0.0005787037037037037, + "grad_norm": 2.5435311794281006, + "kl": 7.319450378417969e-05, + "learning_rate": 0.0, + "loss": 0.0, + "reward": 0.003938489011488855, + "reward_std": 0.12443657219409943, + "rewards/ndcg_rule_reward": -0.025358385406434536, + "rewards/rule_reward": 0.029296875, + "step": 1, + "token_diversity": 0.5703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.080078125, + "epoch": 0.0011574074074074073, + "grad_norm": 2.8393402099609375, + "kl": 0.00022602081298828125, + "learning_rate": 9.615384615384617e-08, + "loss": 0.0, + "reward": 0.004222166491672397, + "reward_std": 0.13271314650774002, + "rewards/ndcg_rule_reward": -0.02702783327549696, + "rewards/rule_reward": 0.03125, + "step": 2, + "token_diversity": 0.50571875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.001736111111111111, + "grad_norm": 2.1978094577789307, + "kl": 0.0002295970916748047, + "learning_rate": 1.9230769230769234e-07, + "loss": 0.0, + "reward": 0.0038152976194396615, + "reward_std": 0.11604835838079453, + "rewards/ndcg_rule_reward": -0.023528452962636948, + "rewards/rule_reward": 0.02734375, + "step": 3, + "token_diversity": 0.55078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.28515625, + "epoch": 0.0023148148148148147, + "grad_norm": 1.9839231967926025, + "kl": 0.0003814697265625, + "learning_rate": 2.884615384615385e-07, + "loss": 0.0, + "reward": 0.003702003159560263, + "reward_std": 0.11611336469650269, + "rewards/ndcg_rule_reward": -0.02364174649119377, + "rewards/rule_reward": 0.02734375, + "step": 4, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.0028935185185185184, + "grad_norm": 1.545789122581482, + "kl": 0.00012826919555664062, + "learning_rate": 3.846153846153847e-07, + "loss": 0.0, + "reward": 0.002662331215105951, + "reward_std": 0.11660568788647652, + "rewards/ndcg_rule_reward": -0.02468141820281744, + "rewards/rule_reward": 0.02734375, + "step": 5, + "token_diversity": 0.54171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.003472222222222222, + "grad_norm": 1.5978279113769531, + "kl": 0.00044155120849609375, + "learning_rate": 4.807692307692308e-07, + "loss": 0.0, + "reward": 0.004029628471471369, + "reward_std": 0.14117427915334702, + "rewards/ndcg_rule_reward": -0.02917349524796009, + "rewards/rule_reward": 0.033203125, + "step": 6, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.095703125, + "epoch": 0.004050925925925926, + "grad_norm": 1.784928321838379, + "kl": 0.0002574920654296875, + "learning_rate": 5.76923076923077e-07, + "loss": 0.0, + "reward": 0.003446195274591446, + "reward_std": 0.10780075564980507, + "rewards/ndcg_rule_reward": -0.021944429725408554, + "rewards/rule_reward": 0.025390625, + "step": 7, + "token_diversity": 0.55859375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.072265625, + "epoch": 0.004629629629629629, + "grad_norm": 3.472127676010132, + "kl": 0.0004978179931640625, + "learning_rate": 6.730769230769231e-07, + "loss": 0.0, + "reward": 0.004502338590100408, + "reward_std": 0.1578269973397255, + "rewards/ndcg_rule_reward": -0.03260703571140766, + "rewards/rule_reward": 0.037109375, + "step": 8, + "token_diversity": 0.5054375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.005208333333333333, + "grad_norm": 1.2353060245513916, + "kl": 0.000308990478515625, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0, + "reward": 0.003110437421128154, + "reward_std": 0.10796928033232689, + "rewards/ndcg_rule_reward": -0.022280187346041203, + "rewards/rule_reward": 0.025390625, + "step": 9, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.072265625, + "epoch": 0.005787037037037037, + "grad_norm": 1.52326500415802, + "kl": 0.0005555152893066406, + "learning_rate": 8.653846153846154e-07, + "loss": 0.0, + "reward": 0.0031200118246488273, + "reward_std": 0.11639609932899475, + "rewards/ndcg_rule_reward": -0.02422373928129673, + "rewards/rule_reward": 0.02734375, + "step": 10, + "token_diversity": 0.5390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.00636574074074074, + "grad_norm": 1.3322960138320923, + "kl": 0.00058746337890625, + "learning_rate": 9.615384615384617e-07, + "loss": 0.0, + "reward": 0.0038526543648913503, + "reward_std": 0.12443757057189941, + "rewards/ndcg_rule_reward": -0.025444219820201397, + "rewards/rule_reward": 0.029296875, + "step": 11, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.006944444444444444, + "grad_norm": 1.1985896825790405, + "kl": 0.00125885009765625, + "learning_rate": 1.0576923076923078e-06, + "loss": 0.0, + "reward": 0.0040072964038699865, + "reward_std": 0.1412099301815033, + "rewards/ndcg_rule_reward": -0.02919582836329937, + "rewards/rule_reward": 0.033203125, + "step": 12, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.007523148148148148, + "grad_norm": 0.9701127409934998, + "kl": 0.0017948150634765625, + "learning_rate": 1.153846153846154e-06, + "loss": 0.0, + "reward": 0.0038384387735277414, + "reward_std": 0.09918517246842384, + "rewards/ndcg_rule_reward": -0.01959906192496419, + "rewards/rule_reward": 0.0234375, + "step": 13, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.008101851851851851, + "grad_norm": 1.0480159521102905, + "kl": 0.0030472278594970703, + "learning_rate": 1.25e-06, + "loss": 0.0, + "reward": 0.002340657403692603, + "reward_std": 0.07465843856334686, + "rewards/ndcg_rule_reward": -0.015237467363476753, + "rewards/rule_reward": 0.017578125, + "step": 14, + "token_diversity": 0.55078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.008680555555555556, + "grad_norm": 1.7193124294281006, + "kl": 0.14227294921875, + "learning_rate": 1.3461538461538462e-06, + "loss": 0.0001, + "reward": 0.0035817292518913746, + "reward_std": 0.1330166459083557, + "rewards/ndcg_rule_reward": -0.027668270282447338, + "rewards/rule_reward": 0.03125, + "step": 15, + "token_diversity": 0.54296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.009259259259259259, + "grad_norm": 3.354548454284668, + "kl": 2.0926513671875, + "learning_rate": 1.4423076923076922e-06, + "loss": 0.0021, + "reward": 0.005657832603901625, + "reward_std": 0.16566532105207443, + "rewards/ndcg_rule_reward": -0.033404670655727386, + "rewards/rule_reward": 0.0390625, + "step": 16, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.009837962962962963, + "grad_norm": 1.584397554397583, + "kl": 0.3125443458557129, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0003, + "reward": 0.003980403067544103, + "reward_std": 0.12438932433724403, + "rewards/ndcg_rule_reward": -0.025316471233963966, + "rewards/rule_reward": 0.029296875, + "step": 17, + "token_diversity": 0.5625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.010416666666666666, + "grad_norm": 0.9085068702697754, + "kl": 0.0046405792236328125, + "learning_rate": 1.6346153846153848e-06, + "loss": 0.0, + "reward": 0.0020240373560227454, + "reward_std": 0.07478424534201622, + "rewards/ndcg_rule_reward": -0.015554087702184916, + "rewards/rule_reward": 0.017578125, + "step": 18, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.01099537037037037, + "grad_norm": 2.2751269340515137, + "kl": 1.7333984375, + "learning_rate": 1.7307692307692308e-06, + "loss": 0.0017, + "reward": 0.005158801097422838, + "reward_std": 0.17431050539016724, + "rewards/ndcg_rule_reward": -0.03585682436823845, + "rewards/rule_reward": 0.041015625, + "step": 19, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.011574074074074073, + "grad_norm": 2.152127742767334, + "kl": 0.5966796875, + "learning_rate": 1.826923076923077e-06, + "loss": 0.0006, + "reward": 0.003670619917102158, + "reward_std": 0.13294794037938118, + "rewards/ndcg_rule_reward": -0.027579380199313164, + "rewards/rule_reward": 0.03125, + "step": 20, + "token_diversity": 0.53696875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.012152777777777778, + "grad_norm": 1.9997245073318481, + "kl": 0.5584716796875, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.0006, + "reward": 0.004295771010220051, + "reward_std": 0.15788210928440094, + "rewards/ndcg_rule_reward": -0.03281360398977995, + "rewards/rule_reward": 0.037109375, + "step": 21, + "token_diversity": 0.46199999999999997 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.01273148148148148, + "grad_norm": 1.9098100662231445, + "kl": 2.8515625, + "learning_rate": 2.0192307692307692e-06, + "loss": 0.0029, + "reward": 0.004226544639095664, + "reward_std": 0.1411074846982956, + "rewards/ndcg_rule_reward": -0.028976581059396267, + "rewards/rule_reward": 0.033203125, + "step": 22, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.013310185185185185, + "grad_norm": 37.25251007080078, + "kl": 33.90625, + "learning_rate": 2.1153846153846155e-06, + "loss": 0.0339, + "reward": 0.0044616920640692115, + "reward_std": 0.13258373737335205, + "rewards/ndcg_rule_reward": -0.026788308285176754, + "rewards/rule_reward": 0.03125, + "step": 23, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.013888888888888888, + "grad_norm": 5.030327796936035, + "kl": 6.796875, + "learning_rate": 2.211538461538462e-06, + "loss": 0.0068, + "reward": 0.004255102248862386, + "reward_std": 0.1326795592904091, + "rewards/ndcg_rule_reward": -0.026994897983968258, + "rewards/rule_reward": 0.03125, + "step": 24, + "token_diversity": 0.56640625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.014467592592592593, + "grad_norm": 628.8905639648438, + "kl": 728.0004615783691, + "learning_rate": 2.307692307692308e-06, + "loss": 0.7286, + "reward": 0.0028623539255931973, + "reward_std": 0.09124944731593132, + "rewards/ndcg_rule_reward": -0.018622021190822124, + "rewards/rule_reward": 0.021484375, + "step": 25, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.015046296296296295, + "grad_norm": 6.23038387298584, + "kl": 8.6875, + "learning_rate": 2.403846153846154e-06, + "loss": 0.0087, + "reward": 0.0036050297785550356, + "reward_std": 0.1329449713230133, + "rewards/ndcg_rule_reward": -0.027644970454275608, + "rewards/rule_reward": 0.03125, + "step": 26, + "token_diversity": 0.54296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.015625, + "grad_norm": 1.0764398574829102, + "kl": 0.01055908203125, + "learning_rate": 2.5e-06, + "loss": 0.0, + "reward": 0.0030874772928655148, + "reward_std": 0.12480359151959419, + "rewards/ndcg_rule_reward": -0.026209398172795773, + "rewards/rule_reward": 0.029296875, + "step": 27, + "token_diversity": 0.5625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.016203703703703703, + "grad_norm": 2.2805721759796143, + "kl": 1.8603515625, + "learning_rate": 2.5961538461538465e-06, + "loss": 0.0019, + "reward": 0.0034119455376639962, + "reward_std": 0.09938351064920425, + "rewards/ndcg_rule_reward": -0.020025555044412613, + "rewards/rule_reward": 0.0234375, + "step": 28, + "token_diversity": 0.5546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.01678240740740741, + "grad_norm": 6.902415752410889, + "kl": 6.15234375, + "learning_rate": 2.6923076923076923e-06, + "loss": 0.0062, + "reward": 0.0037448315415531397, + "reward_std": 0.12450059875845909, + "rewards/ndcg_rule_reward": -0.025552045553922653, + "rewards/rule_reward": 0.029296875, + "step": 29, + "token_diversity": 0.59765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.017361111111111112, + "grad_norm": 232.36761474609375, + "kl": 349.765625, + "learning_rate": 2.7884615384615386e-06, + "loss": 0.3489, + "reward": 0.0030232337885536253, + "reward_std": 0.09957066923379898, + "rewards/ndcg_rule_reward": -0.020414266735315323, + "rewards/rule_reward": 0.0234375, + "step": 30, + "token_diversity": 0.54296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.017939814814814815, + "grad_norm": 3.705733299255371, + "kl": 5.484375, + "learning_rate": 2.8846153846153845e-06, + "loss": 0.0055, + "reward": 0.0037470401730388403, + "reward_std": 0.11607079952955246, + "rewards/ndcg_rule_reward": -0.02359671052545309, + "rewards/rule_reward": 0.02734375, + "step": 31, + "token_diversity": 0.58984375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.018518518518518517, + "grad_norm": 1.7853459119796753, + "kl": 4.4411773681640625, + "learning_rate": 2.980769230769231e-06, + "loss": 0.0044, + "reward": 0.004961613565683365, + "reward_std": 0.1576160416007042, + "rewards/ndcg_rule_reward": -0.03214776236563921, + "rewards/rule_reward": 0.037109375, + "step": 32, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.019097222222222224, + "grad_norm": 9.94151496887207, + "kl": 9.0859375, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0091, + "reward": 0.00389632151927799, + "reward_std": 0.13288239389657974, + "rewards/ndcg_rule_reward": -0.0273536778986454, + "rewards/rule_reward": 0.03125, + "step": 33, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.019675925925925927, + "grad_norm": 3.473350763320923, + "kl": 7.2548828125, + "learning_rate": 3.1730769230769233e-06, + "loss": 0.0073, + "reward": 0.0033643220085650682, + "reward_std": 0.0994311012327671, + "rewards/ndcg_rule_reward": -0.020073178224265575, + "rewards/rule_reward": 0.0234375, + "step": 34, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.02025462962962963, + "grad_norm": 50.731266021728516, + "kl": 38.502593994140625, + "learning_rate": 3.2692307692307696e-06, + "loss": 0.0386, + "reward": 0.003773641074076295, + "reward_std": 0.12452413886785507, + "rewards/ndcg_rule_reward": -0.02552323415875435, + "rewards/rule_reward": 0.029296875, + "step": 35, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.020833333333333332, + "grad_norm": 1.1886072158813477, + "kl": 3.296875, + "learning_rate": 3.365384615384616e-06, + "loss": 0.0033, + "reward": 0.003438280546106398, + "reward_std": 0.1078336276113987, + "rewards/ndcg_rule_reward": -0.021952344104647636, + "rewards/rule_reward": 0.025390625, + "step": 36, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.02141203703703704, + "grad_norm": 2.4624130725860596, + "kl": 5.23828125, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.0053, + "reward": 0.004174494417384267, + "reward_std": 0.1411292590200901, + "rewards/ndcg_rule_reward": -0.029028630815446377, + "rewards/rule_reward": 0.033203125, + "step": 37, + "token_diversity": 0.526 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.02199074074074074, + "grad_norm": 1.9708118438720703, + "kl": 3.27734375, + "learning_rate": 3.557692307692308e-06, + "loss": 0.0033, + "reward": 0.004310172167606652, + "reward_std": 0.1326315701007843, + "rewards/ndcg_rule_reward": -0.026939828880131245, + "rewards/rule_reward": 0.03125, + "step": 38, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.022569444444444444, + "grad_norm": 1.424256682395935, + "kl": 4.06640625, + "learning_rate": 3.653846153846154e-06, + "loss": 0.0041, + "reward": 0.00417801458388567, + "reward_std": 0.11587423086166382, + "rewards/ndcg_rule_reward": -0.023165734484791756, + "rewards/rule_reward": 0.02734375, + "step": 39, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.023148148148148147, + "grad_norm": 13.398024559020996, + "kl": 11.765625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0118, + "reward": 0.00465521402657032, + "reward_std": 0.15771086513996124, + "rewards/ndcg_rule_reward": -0.03245416097342968, + "rewards/rule_reward": 0.037109375, + "step": 40, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.023726851851851853, + "grad_norm": 1.2068060636520386, + "kl": 2.58984375, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0026, + "reward": 0.003186140093021095, + "reward_std": 0.09948527440428734, + "rewards/ndcg_rule_reward": -0.02025136025622487, + "rewards/rule_reward": 0.0234375, + "step": 41, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.024305555555555556, + "grad_norm": 0.9672524929046631, + "kl": 0.476806640625, + "learning_rate": 3.942307692307692e-06, + "loss": 0.0005, + "reward": 0.002964170416817069, + "reward_std": 0.10801388323307037, + "rewards/ndcg_rule_reward": -0.022426454350352287, + "rewards/rule_reward": 0.025390625, + "step": 42, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.02488425925925926, + "grad_norm": 1.6210402250289917, + "kl": 2.517578125, + "learning_rate": 4.0384615384615385e-06, + "loss": 0.0025, + "reward": 0.005058033857494593, + "reward_std": 0.15758205205202103, + "rewards/ndcg_rule_reward": -0.032051341608166695, + "rewards/rule_reward": 0.037109375, + "step": 43, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.02546296296296296, + "grad_norm": 1.6393539905548096, + "kl": 0.849609375, + "learning_rate": 4.134615384615385e-06, + "loss": 0.0008, + "reward": 0.00479107117280364, + "reward_std": 0.15766775608062744, + "rewards/ndcg_rule_reward": -0.0323183024302125, + "rewards/rule_reward": 0.037109375, + "step": 44, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.026041666666666668, + "grad_norm": 1.248025894165039, + "kl": 1.1785888671875, + "learning_rate": 4.230769230769231e-06, + "loss": 0.0012, + "reward": 0.005138740409165621, + "reward_std": 0.16594763845205307, + "rewards/ndcg_rule_reward": -0.033923760056495667, + "rewards/rule_reward": 0.0390625, + "step": 45, + "token_diversity": 0.59765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.02662037037037037, + "grad_norm": 2.5542960166931152, + "kl": 8.84765625, + "learning_rate": 4.326923076923077e-06, + "loss": 0.0088, + "reward": 0.0024778731749393046, + "reward_std": 0.09983935207128525, + "rewards/ndcg_rule_reward": -0.020959626883268356, + "rewards/rule_reward": 0.0234375, + "step": 46, + "token_diversity": 0.52934375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.027199074074074073, + "grad_norm": 1.769195556640625, + "kl": 7.8671875, + "learning_rate": 4.423076923076924e-06, + "loss": 0.0079, + "reward": 0.0026507247239351273, + "reward_std": 0.08292639628052711, + "rewards/ndcg_rule_reward": -0.016880524810403585, + "rewards/rule_reward": 0.01953125, + "step": 47, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.027777777777777776, + "grad_norm": 3.6618802547454834, + "kl": 16.5, + "learning_rate": 4.51923076923077e-06, + "loss": 0.0165, + "reward": 0.003835644922219217, + "reward_std": 0.12445050477981567, + "rewards/ndcg_rule_reward": -0.02546123042702675, + "rewards/rule_reward": 0.029296875, + "step": 48, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.028356481481481483, + "grad_norm": 1.5814570188522339, + "kl": 2.9921875, + "learning_rate": 4.615384615384616e-06, + "loss": 0.003, + "reward": 0.0038441268261522055, + "reward_std": 0.10761278122663498, + "rewards/ndcg_rule_reward": -0.02154649794101715, + "rewards/rule_reward": 0.025390625, + "step": 49, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.028935185185185185, + "grad_norm": 0.9833499789237976, + "kl": 1.792724609375, + "learning_rate": 4.711538461538462e-06, + "loss": 0.0018, + "reward": 0.003112397389486432, + "reward_std": 0.10794124752283096, + "rewards/ndcg_rule_reward": -0.02227822784334421, + "rewards/rule_reward": 0.025390625, + "step": 50, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.029513888888888888, + "grad_norm": 17.988117218017578, + "kl": 21.46875, + "learning_rate": 4.807692307692308e-06, + "loss": 0.0215, + "reward": 0.0045267033856362104, + "reward_std": 0.14095794409513474, + "rewards/ndcg_rule_reward": -0.02867642045021057, + "rewards/rule_reward": 0.033203125, + "step": 51, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.03009259259259259, + "grad_norm": 1.2161898612976074, + "kl": 0.98828125, + "learning_rate": 4.903846153846154e-06, + "loss": 0.001, + "reward": 0.004159182775765657, + "reward_std": 0.14953358471393585, + "rewards/ndcg_rule_reward": -0.03099706768989563, + "rewards/rule_reward": 0.03515625, + "step": 52, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.030671296296296297, + "grad_norm": 1.063670039176941, + "kl": 2.421875, + "learning_rate": 5e-06, + "loss": 0.0024, + "reward": 0.002522778115235269, + "reward_std": 0.07456038892269135, + "rewards/ndcg_rule_reward": -0.015055347234010696, + "rewards/rule_reward": 0.017578125, + "step": 53, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.03125, + "grad_norm": 1.63593327999115, + "kl": 7.59375, + "learning_rate": 5.096153846153846e-06, + "loss": 0.0076, + "reward": 0.004456004127860069, + "reward_std": 0.12415613606572151, + "rewards/ndcg_rule_reward": -0.02484087087213993, + "rewards/rule_reward": 0.029296875, + "step": 54, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.031828703703703706, + "grad_norm": 1.374873399734497, + "kl": 4.59375, + "learning_rate": 5.192307692307693e-06, + "loss": 0.0046, + "reward": 0.0029203647864051163, + "reward_std": 0.10805504024028778, + "rewards/ndcg_rule_reward": -0.0224702600389719, + "rewards/rule_reward": 0.025390625, + "step": 55, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.032407407407407406, + "grad_norm": 1.2758244276046753, + "kl": 1.78759765625, + "learning_rate": 5.288461538461539e-06, + "loss": 0.0018, + "reward": 0.003825745778158307, + "reward_std": 0.124473687261343, + "rewards/ndcg_rule_reward": -0.025471129454672337, + "rewards/rule_reward": 0.029296875, + "step": 56, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.03298611111111111, + "grad_norm": 1.663013219833374, + "kl": 6.70703125, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0067, + "reward": 0.004785498837009072, + "reward_std": 0.15767550468444824, + "rewards/ndcg_rule_reward": -0.03232387453317642, + "rewards/rule_reward": 0.037109375, + "step": 57, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.03356481481481482, + "grad_norm": 2.1676619052886963, + "kl": 3.0625, + "learning_rate": 5.480769230769232e-06, + "loss": 0.0031, + "reward": 0.00483974302187562, + "reward_std": 0.15767697989940643, + "rewards/ndcg_rule_reward": -0.03226963244378567, + "rewards/rule_reward": 0.037109375, + "step": 58, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.06640625, + "epoch": 0.03414351851851852, + "grad_norm": 2.5445492267608643, + "kl": 12.625, + "learning_rate": 5.576923076923077e-06, + "loss": 0.0126, + "reward": 0.0038912741001695395, + "reward_std": 0.1244259811937809, + "rewards/ndcg_rule_reward": -0.02540560159832239, + "rewards/rule_reward": 0.029296875, + "step": 59, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.034722222222222224, + "grad_norm": 3.8330349922180176, + "kl": 15.515625, + "learning_rate": 5.6730769230769235e-06, + "loss": 0.0155, + "reward": 0.00439235963858664, + "reward_std": 0.13260675594210625, + "rewards/ndcg_rule_reward": -0.026857641991227865, + "rewards/rule_reward": 0.03125, + "step": 60, + "token_diversity": 0.55078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.03530092592592592, + "grad_norm": 1.4106311798095703, + "kl": 0.31396484375, + "learning_rate": 5.769230769230769e-06, + "loss": 0.0003, + "reward": 0.0023132311180233955, + "reward_std": 0.07465843670070171, + "rewards/ndcg_rule_reward": -0.015264894580468535, + "rewards/rule_reward": 0.017578125, + "step": 61, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08203125, + "epoch": 0.03587962962962963, + "grad_norm": 1.6968085765838623, + "kl": 4.921875, + "learning_rate": 5.865384615384616e-06, + "loss": 0.0049, + "reward": 0.0032475602347403765, + "reward_std": 0.11628789827227592, + "rewards/ndcg_rule_reward": -0.024096189998090267, + "rewards/rule_reward": 0.02734375, + "step": 62, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.036458333333333336, + "grad_norm": 2.1102969646453857, + "kl": 6.21875, + "learning_rate": 5.961538461538462e-06, + "loss": 0.0062, + "reward": 0.004297895589843392, + "reward_std": 0.1326339691877365, + "rewards/ndcg_rule_reward": -0.026952105574309826, + "rewards/rule_reward": 0.03125, + "step": 63, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.037037037037037035, + "grad_norm": 1.3856912851333618, + "kl": 2.33203125, + "learning_rate": 6.057692307692308e-06, + "loss": 0.0023, + "reward": 0.0032866201363503933, + "reward_std": 0.11628668755292892, + "rewards/ndcg_rule_reward": -0.02405712939798832, + "rewards/rule_reward": 0.02734375, + "step": 64, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.03761574074074074, + "grad_norm": 1.6187721490859985, + "kl": 1.1484375, + "learning_rate": 6.153846153846155e-06, + "loss": 0.0011, + "reward": 0.003533161128871143, + "reward_std": 0.13302071765065193, + "rewards/ndcg_rule_reward": -0.027716838754713535, + "rewards/rule_reward": 0.03125, + "step": 65, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.03819444444444445, + "grad_norm": 1.429261565208435, + "kl": 4.828125, + "learning_rate": 6.25e-06, + "loss": 0.0048, + "reward": 0.004077733960002661, + "reward_std": 0.14116205275058746, + "rewards/ndcg_rule_reward": -0.0291253924369812, + "rewards/rule_reward": 0.033203125, + "step": 66, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.03877314814814815, + "grad_norm": 6.405779838562012, + "kl": 15.7890625, + "learning_rate": 6.3461538461538466e-06, + "loss": 0.0158, + "reward": 0.002580382162705064, + "reward_std": 0.11663408577442169, + "rewards/ndcg_rule_reward": -0.024763369467109442, + "rewards/rule_reward": 0.02734375, + "step": 67, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.03935185185185185, + "grad_norm": 11.978711128234863, + "kl": 18.828125, + "learning_rate": 6.442307692307693e-06, + "loss": 0.0189, + "reward": 0.004274305305443704, + "reward_std": 0.13267403468489647, + "rewards/ndcg_rule_reward": -0.02697569504380226, + "rewards/rule_reward": 0.03125, + "step": 68, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.03993055555555555, + "grad_norm": 99.22816467285156, + "kl": 199.90625, + "learning_rate": 6.538461538461539e-06, + "loss": 0.1999, + "reward": 0.003564358805306256, + "reward_std": 0.12459143996238708, + "rewards/ndcg_rule_reward": -0.02573251724243164, + "rewards/rule_reward": 0.029296875, + "step": 69, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.04050925925925926, + "grad_norm": 102.52205657958984, + "kl": 111.875, + "learning_rate": 6.6346153846153846e-06, + "loss": 0.1119, + "reward": 0.0035764046479016542, + "reward_std": 0.12458251416683197, + "rewards/ndcg_rule_reward": -0.025720471516251564, + "rewards/rule_reward": 0.029296875, + "step": 70, + "token_diversity": 0.5390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.041087962962962965, + "grad_norm": 16.701744079589844, + "kl": 30.78515625, + "learning_rate": 6.730769230769232e-06, + "loss": 0.0308, + "reward": 0.004775819135829806, + "reward_std": 0.15770866349339485, + "rewards/ndcg_rule_reward": -0.032333556562662125, + "rewards/rule_reward": 0.037109375, + "step": 71, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.041666666666666664, + "grad_norm": 4.927725315093994, + "kl": 17.25, + "learning_rate": 6.826923076923078e-06, + "loss": 0.0172, + "reward": 0.0038265507901087403, + "reward_std": 0.14969782531261444, + "rewards/ndcg_rule_reward": -0.031329698860645294, + "rewards/rule_reward": 0.03515625, + "step": 72, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.00390625, + "epoch": 0.04224537037037037, + "grad_norm": 1.778765320777893, + "kl": 6.875, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0069, + "reward": 0.004603820154443383, + "reward_std": 0.132532499730587, + "rewards/ndcg_rule_reward": -0.02664618007838726, + "rewards/rule_reward": 0.03125, + "step": 73, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.04282407407407408, + "grad_norm": 3.2308645248413086, + "kl": 12.21875, + "learning_rate": 7.01923076923077e-06, + "loss": 0.0122, + "reward": 0.0038241040892899036, + "reward_std": 0.11604535579681396, + "rewards/ndcg_rule_reward": -0.023519645910710096, + "rewards/rule_reward": 0.02734375, + "step": 74, + "token_diversity": 0.48209375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.043402777777777776, + "grad_norm": 1.1794108152389526, + "kl": 0.95703125, + "learning_rate": 7.115384615384616e-06, + "loss": 0.001, + "reward": 0.002560124034062028, + "reward_std": 0.1082034558057785, + "rewards/ndcg_rule_reward": -0.022830501198768616, + "rewards/rule_reward": 0.025390625, + "step": 75, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.04398148148148148, + "grad_norm": 1.1546603441238403, + "kl": 3.671875, + "learning_rate": 7.211538461538462e-06, + "loss": 0.0037, + "reward": 0.003118663327768445, + "reward_std": 0.09953746944665909, + "rewards/ndcg_rule_reward": -0.0203188369050622, + "rewards/rule_reward": 0.0234375, + "step": 76, + "token_diversity": 0.5546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.04456018518518518, + "grad_norm": 1.0587372779846191, + "kl": 0.546875, + "learning_rate": 7.307692307692308e-06, + "loss": 0.0005, + "reward": 0.0021968877408653498, + "reward_std": 0.0747254267334938, + "rewards/ndcg_rule_reward": -0.015381237491965294, + "rewards/rule_reward": 0.017578125, + "step": 77, + "token_diversity": 0.54296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.04513888888888889, + "grad_norm": 1.1947546005249023, + "kl": 1.6328125, + "learning_rate": 7.403846153846155e-06, + "loss": 0.0016, + "reward": 0.003409161581657827, + "reward_std": 0.11624178290367126, + "rewards/ndcg_rule_reward": -0.02393458876758814, + "rewards/rule_reward": 0.02734375, + "step": 78, + "token_diversity": 0.51790625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.990234375, + "epoch": 0.045717592592592594, + "grad_norm": 1.5698641538619995, + "kl": 7.390625, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0074, + "reward": 0.0034334120573475957, + "reward_std": 0.12463255971670151, + "rewards/ndcg_rule_reward": -0.025863463059067726, + "rewards/rule_reward": 0.029296875, + "step": 79, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.046296296296296294, + "grad_norm": 59.04481887817383, + "kl": 134.8515625, + "learning_rate": 7.5961538461538465e-06, + "loss": 0.1346, + "reward": 0.0033790116431191564, + "reward_std": 0.11625358462333679, + "rewards/ndcg_rule_reward": -0.02396473940461874, + "rewards/rule_reward": 0.02734375, + "step": 80, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.046875, + "grad_norm": 1.6113567352294922, + "kl": 4.671875, + "learning_rate": 7.692307692307694e-06, + "loss": 0.0047, + "reward": 0.0036947117187082767, + "reward_std": 0.12451837584376335, + "rewards/ndcg_rule_reward": -0.02560216374695301, + "rewards/rule_reward": 0.029296875, + "step": 81, + "token_diversity": 0.47381249999999997 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.047453703703703706, + "grad_norm": 1.400386095046997, + "kl": 4.21484375, + "learning_rate": 7.78846153846154e-06, + "loss": 0.0042, + "reward": 0.003292868728749454, + "reward_std": 0.13313165307044983, + "rewards/ndcg_rule_reward": -0.0279571320861578, + "rewards/rule_reward": 0.03125, + "step": 82, + "token_diversity": 0.50571875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.048032407407407406, + "grad_norm": 1.3092408180236816, + "kl": 3.25, + "learning_rate": 7.884615384615384e-06, + "loss": 0.0032, + "reward": 0.00427292799577117, + "reward_std": 0.13267014175653458, + "rewards/ndcg_rule_reward": -0.02697707060724497, + "rewards/rule_reward": 0.03125, + "step": 83, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.04861111111111111, + "grad_norm": 1.8711247444152832, + "kl": 7.6953125, + "learning_rate": 7.980769230769232e-06, + "loss": 0.0077, + "reward": 0.003945193835534155, + "reward_std": 0.14966098964214325, + "rewards/ndcg_rule_reward": -0.031211054883897305, + "rewards/rule_reward": 0.03515625, + "step": 84, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.958984375, + "epoch": 0.04918981481481482, + "grad_norm": 58.00933837890625, + "kl": 167.0625, + "learning_rate": 8.076923076923077e-06, + "loss": 0.1674, + "reward": 0.003582009579986334, + "reward_std": 0.13298995047807693, + "rewards/ndcg_rule_reward": -0.02766798995435238, + "rewards/rule_reward": 0.03125, + "step": 85, + "token_diversity": 0.48209375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.958984375, + "epoch": 0.04976851851851852, + "grad_norm": 2.0821170806884766, + "kl": 12.125, + "learning_rate": 8.173076923076923e-06, + "loss": 0.0121, + "reward": 0.004572001053020358, + "reward_std": 0.14937957376241684, + "rewards/ndcg_rule_reward": -0.030584249645471573, + "rewards/rule_reward": 0.03515625, + "step": 86, + "token_diversity": 0.44 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.050347222222222224, + "grad_norm": 1.5322024822235107, + "kl": 1.1171875, + "learning_rate": 8.26923076923077e-06, + "loss": 0.0011, + "reward": 0.0030444556614384055, + "reward_std": 0.13324758410453796, + "rewards/ndcg_rule_reward": -0.02820554468780756, + "rewards/rule_reward": 0.03125, + "step": 87, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.05092592592592592, + "grad_norm": 1.3778868913650513, + "kl": 4.703125, + "learning_rate": 8.365384615384616e-06, + "loss": 0.0047, + "reward": 0.0037951068952679634, + "reward_std": 0.14127695560455322, + "rewards/ndcg_rule_reward": -0.029408018104732037, + "rewards/rule_reward": 0.033203125, + "step": 88, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.05150462962962963, + "grad_norm": 4.087715148925781, + "kl": 21.53125, + "learning_rate": 8.461538461538462e-06, + "loss": 0.0215, + "reward": 0.0029565878794528544, + "reward_std": 0.09118866920471191, + "rewards/ndcg_rule_reward": -0.018527787178754807, + "rewards/rule_reward": 0.021484375, + "step": 89, + "token_diversity": 0.47884375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.052083333333333336, + "grad_norm": 1.3472102880477905, + "kl": 2.7265625, + "learning_rate": 8.557692307692308e-06, + "loss": 0.0027, + "reward": 0.0035585963632911444, + "reward_std": 0.12459585815668106, + "rewards/ndcg_rule_reward": -0.025738278403878212, + "rewards/rule_reward": 0.029296875, + "step": 90, + "token_diversity": 0.485625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.052662037037037035, + "grad_norm": 1.605459451675415, + "kl": 1.5234375, + "learning_rate": 8.653846153846155e-06, + "loss": 0.0015, + "reward": 0.004513848456554115, + "reward_std": 0.15782560408115387, + "rewards/ndcg_rule_reward": -0.03259552735835314, + "rewards/rule_reward": 0.037109375, + "step": 91, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.05324074074074074, + "grad_norm": 2.7568564414978027, + "kl": 13.0234375, + "learning_rate": 8.750000000000001e-06, + "loss": 0.013, + "reward": 0.004724844824522734, + "reward_std": 0.15768428891897202, + "rewards/ndcg_rule_reward": -0.0323845325037837, + "rewards/rule_reward": 0.037109375, + "step": 92, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.05381944444444445, + "grad_norm": 26.35652732849121, + "kl": 73.28125, + "learning_rate": 8.846153846153847e-06, + "loss": 0.0733, + "reward": 0.004340509185567498, + "reward_std": 0.14108021557331085, + "rewards/ndcg_rule_reward": -0.028862616047263145, + "rewards/rule_reward": 0.033203125, + "step": 93, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.05439814814814815, + "grad_norm": 1.33370840549469, + "kl": 4.40625, + "learning_rate": 8.942307692307693e-06, + "loss": 0.0044, + "reward": 0.004488698672503233, + "reward_std": 0.13253788650035858, + "rewards/ndcg_rule_reward": -0.026761301793158054, + "rewards/rule_reward": 0.03125, + "step": 94, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.05497685185185185, + "grad_norm": 1.0616055727005005, + "kl": 8.5625, + "learning_rate": 9.03846153846154e-06, + "loss": 0.0086, + "reward": 0.0026638712733983994, + "reward_std": 0.09131206944584846, + "rewards/ndcg_rule_reward": -0.018820504657924175, + "rewards/rule_reward": 0.021484375, + "step": 95, + "token_diversity": 0.51371875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.05555555555555555, + "grad_norm": 4.509409427642822, + "kl": 20.984375, + "learning_rate": 9.134615384615384e-06, + "loss": 0.0209, + "reward": 0.004679993959143758, + "reward_std": 0.12404875084757805, + "rewards/ndcg_rule_reward": -0.02461688034236431, + "rewards/rule_reward": 0.029296875, + "step": 96, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.96484375, + "epoch": 0.05613425925925926, + "grad_norm": 1.6430739164352417, + "kl": 7.46875, + "learning_rate": 9.230769230769232e-06, + "loss": 0.0075, + "reward": 0.0040221260860562325, + "reward_std": 0.12434807419776917, + "rewards/ndcg_rule_reward": -0.025274749845266342, + "rewards/rule_reward": 0.029296875, + "step": 97, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.056712962962962965, + "grad_norm": 2.3971657752990723, + "kl": 14.046875, + "learning_rate": 9.326923076923079e-06, + "loss": 0.014, + "reward": 0.005165487062186003, + "reward_std": 0.1490909680724144, + "rewards/ndcg_rule_reward": -0.02999076247215271, + "rewards/rule_reward": 0.03515625, + "step": 98, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.057291666666666664, + "grad_norm": 1.2210479974746704, + "kl": 5.86328125, + "learning_rate": 9.423076923076923e-06, + "loss": 0.0058, + "reward": 0.003192138858139515, + "reward_std": 0.11631110683083534, + "rewards/ndcg_rule_reward": -0.02415161207318306, + "rewards/rule_reward": 0.02734375, + "step": 99, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.05787037037037037, + "grad_norm": 2.441042423248291, + "kl": 9.515625, + "learning_rate": 9.51923076923077e-06, + "loss": 0.0095, + "reward": 0.002542472444474697, + "reward_std": 0.09138195961713791, + "rewards/ndcg_rule_reward": -0.01894190162420273, + "rewards/rule_reward": 0.021484375, + "step": 100, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.05844907407407408, + "grad_norm": 1.793097972869873, + "kl": 7.375, + "learning_rate": 9.615384615384616e-06, + "loss": 0.0074, + "reward": 0.0033193915151059628, + "reward_std": 0.12469805777072906, + "rewards/ndcg_rule_reward": -0.0259774848818779, + "rewards/rule_reward": 0.029296875, + "step": 101, + "token_diversity": 0.5390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.059027777777777776, + "grad_norm": 1.4509283304214478, + "kl": 8.390625, + "learning_rate": 9.711538461538462e-06, + "loss": 0.0084, + "reward": 0.00383947161026299, + "reward_std": 0.13287170231342316, + "rewards/ndcg_rule_reward": -0.027410528622567654, + "rewards/rule_reward": 0.03125, + "step": 102, + "token_diversity": 0.48990625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.01953125, + "epoch": 0.05960648148148148, + "grad_norm": 1.5125563144683838, + "kl": 8.34375, + "learning_rate": 9.807692307692308e-06, + "loss": 0.0083, + "reward": 0.004078189027495682, + "reward_std": 0.13276179507374763, + "rewards/ndcg_rule_reward": -0.02717181108891964, + "rewards/rule_reward": 0.03125, + "step": 103, + "token_diversity": 0.35807291666666663 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.06018518518518518, + "grad_norm": 2.024587392807007, + "kl": 8.78125, + "learning_rate": 9.903846153846155e-06, + "loss": 0.0088, + "reward": 0.005883355159312487, + "reward_std": 0.1908496767282486, + "rewards/ndcg_rule_reward": -0.0390385203063488, + "rewards/rule_reward": 0.044921875, + "step": 104, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.06076388888888889, + "grad_norm": 4.216940402984619, + "kl": 13.984375, + "learning_rate": 1e-05, + "loss": 0.0139, + "reward": 0.002244777337182313, + "reward_std": 0.08311399817466736, + "rewards/ndcg_rule_reward": -0.01728647295385599, + "rewards/rule_reward": 0.01953125, + "step": 105, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.061342592592592594, + "grad_norm": 4.706640243530273, + "kl": 23.203125, + "learning_rate": 9.999997804003204e-06, + "loss": 0.0233, + "reward": 0.00288363266736269, + "reward_std": 0.10807809978723526, + "rewards/ndcg_rule_reward": -0.02250699233263731, + "rewards/rule_reward": 0.025390625, + "step": 106, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.061921296296296294, + "grad_norm": 1.6674824953079224, + "kl": 8.25, + "learning_rate": 9.999991216014741e-06, + "loss": 0.0083, + "reward": 0.005252262344583869, + "reward_std": 0.14903102070093155, + "rewards/ndcg_rule_reward": -0.029903989285230637, + "rewards/rule_reward": 0.03515625, + "step": 107, + "token_diversity": 0.465625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.0625, + "grad_norm": 1.4467806816101074, + "kl": 6.09375, + "learning_rate": 9.9999802360404e-06, + "loss": 0.0061, + "reward": 0.004984318278729916, + "reward_std": 0.1491761952638626, + "rewards/ndcg_rule_reward": -0.03017193265259266, + "rewards/rule_reward": 0.03515625, + "step": 108, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.0630787037037037, + "grad_norm": 11.78942584991455, + "kl": 49.875, + "learning_rate": 9.999964864089827e-06, + "loss": 0.0499, + "reward": 0.004337949911132455, + "reward_std": 0.12422342225909233, + "rewards/ndcg_rule_reward": -0.024958926253020763, + "rewards/rule_reward": 0.029296875, + "step": 109, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.06365740740740741, + "grad_norm": 732.6746826171875, + "kl": 1082.359375, + "learning_rate": 9.999945100176522e-06, + "loss": 1.0859, + "reward": 0.0023598155239596963, + "reward_std": 0.08304723352193832, + "rewards/ndcg_rule_reward": -0.017171435058116913, + "rewards/rule_reward": 0.01953125, + "step": 110, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.0642361111111111, + "grad_norm": 1.8365117311477661, + "kl": 6.25, + "learning_rate": 9.999920944317846e-06, + "loss": 0.0063, + "reward": 0.004499781643971801, + "reward_std": 0.14936240762472153, + "rewards/ndcg_rule_reward": -0.030656468123197556, + "rewards/rule_reward": 0.03515625, + "step": 111, + "token_diversity": 0.45084375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.06481481481481481, + "grad_norm": 12.607120513916016, + "kl": 47.75, + "learning_rate": 9.999892396535019e-06, + "loss": 0.0475, + "reward": 0.004336328012868762, + "reward_std": 0.14950163662433624, + "rewards/ndcg_rule_reward": -0.030819921754300594, + "rewards/rule_reward": 0.03515625, + "step": 112, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.06539351851851852, + "grad_norm": 1.7263888120651245, + "kl": 7.515625, + "learning_rate": 9.999859456853116e-06, + "loss": 0.0075, + "reward": 0.003462518216110766, + "reward_std": 0.11618843674659729, + "rewards/ndcg_rule_reward": -0.023881232365965843, + "rewards/rule_reward": 0.02734375, + "step": 113, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.06597222222222222, + "grad_norm": 36.16665267944336, + "kl": 165.125, + "learning_rate": 9.999822125301071e-06, + "loss": 0.1646, + "reward": 0.004527646116912365, + "reward_std": 0.13258331269025803, + "rewards/ndcg_rule_reward": -0.02672235295176506, + "rewards/rule_reward": 0.03125, + "step": 114, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.06655092592592593, + "grad_norm": 4.791245937347412, + "kl": 22.84375, + "learning_rate": 9.99978040191168e-06, + "loss": 0.0229, + "reward": 0.003888698644004762, + "reward_std": 0.11599093675613403, + "rewards/ndcg_rule_reward": -0.023455051705241203, + "rewards/rule_reward": 0.02734375, + "step": 115, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.06712962962962964, + "grad_norm": 1.7056164741516113, + "kl": 10.0, + "learning_rate": 9.999734286721586e-06, + "loss": 0.01, + "reward": 0.00323127256706357, + "reward_std": 0.10792403295636177, + "rewards/ndcg_rule_reward": -0.022159352898597717, + "rewards/rule_reward": 0.025390625, + "step": 116, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.06770833333333333, + "grad_norm": 1.9072879552841187, + "kl": 13.0, + "learning_rate": 9.999683779771301e-06, + "loss": 0.013, + "reward": 0.005388270132243633, + "reward_std": 0.16581441462039948, + "rewards/ndcg_rule_reward": -0.03367423079907894, + "rewards/rule_reward": 0.0390625, + "step": 117, + "token_diversity": 0.42646875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.984375, + "epoch": 0.06828703703703703, + "grad_norm": 2.402801036834717, + "kl": 9.453125, + "learning_rate": 9.99962888110519e-06, + "loss": 0.0095, + "reward": 0.003335450543090701, + "reward_std": 0.12471820414066315, + "rewards/ndcg_rule_reward": -0.025961424224078655, + "rewards/rule_reward": 0.029296875, + "step": 118, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.06886574074074074, + "grad_norm": 2.1914889812469482, + "kl": 13.03125, + "learning_rate": 9.999569590771474e-06, + "loss": 0.0131, + "reward": 0.004541096976026893, + "reward_std": 0.14938419312238693, + "rewards/ndcg_rule_reward": -0.030615152791142464, + "rewards/rule_reward": 0.03515625, + "step": 119, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.06944444444444445, + "grad_norm": 6.816300868988037, + "kl": 25.421875, + "learning_rate": 9.999505908822234e-06, + "loss": 0.0256, + "reward": 0.003512683790177107, + "reward_std": 0.10777831450104713, + "rewards/ndcg_rule_reward": -0.02187794167548418, + "rewards/rule_reward": 0.025390625, + "step": 120, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.07002314814814815, + "grad_norm": 1.3076725006103516, + "kl": 6.34375, + "learning_rate": 9.99943783531341e-06, + "loss": 0.0063, + "reward": 0.0024060329888015985, + "reward_std": 0.09147000312805176, + "rewards/ndcg_rule_reward": -0.019078342244029045, + "rewards/rule_reward": 0.021484375, + "step": 121, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.07060185185185185, + "grad_norm": 1.938544750213623, + "kl": 15.890625, + "learning_rate": 9.999365370304797e-06, + "loss": 0.0159, + "reward": 0.004212364088743925, + "reward_std": 0.141114741563797, + "rewards/ndcg_rule_reward": -0.028990760445594788, + "rewards/rule_reward": 0.033203125, + "step": 122, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.07118055555555555, + "grad_norm": 1.3268145322799683, + "kl": 3.5546875, + "learning_rate": 9.999288513860049e-06, + "loss": 0.0036, + "reward": 0.0028923844220116735, + "reward_std": 0.09963967651128769, + "rewards/ndcg_rule_reward": -0.02054511569440365, + "rewards/rule_reward": 0.0234375, + "step": 123, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.07175925925925926, + "grad_norm": 1.6562062501907349, + "kl": 9.921875, + "learning_rate": 9.999207266046674e-06, + "loss": 0.0099, + "reward": 0.005143312504515052, + "reward_std": 0.14909812808036804, + "rewards/ndcg_rule_reward": -0.03001293633133173, + "rewards/rule_reward": 0.03515625, + "step": 124, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.07233796296296297, + "grad_norm": 1.5087007284164429, + "kl": 9.984375, + "learning_rate": 9.999121626936038e-06, + "loss": 0.01, + "reward": 0.005008111707866192, + "reward_std": 0.16603151708841324, + "rewards/ndcg_rule_reward": -0.034054387360811234, + "rewards/rule_reward": 0.0390625, + "step": 125, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.07291666666666667, + "grad_norm": 1.5372503995895386, + "kl": 6.8125, + "learning_rate": 9.999031596603374e-06, + "loss": 0.0068, + "reward": 0.0038908389979042113, + "reward_std": 0.12442348152399063, + "rewards/ndcg_rule_reward": -0.025406035594642162, + "rewards/rule_reward": 0.029296875, + "step": 126, + "token_diversity": 0.474375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.07349537037037036, + "grad_norm": 4.083111763000488, + "kl": 26.59375, + "learning_rate": 9.998937175127758e-06, + "loss": 0.0266, + "reward": 0.005179226398468018, + "reward_std": 0.16591478139162064, + "rewards/ndcg_rule_reward": -0.03388327360153198, + "rewards/rule_reward": 0.0390625, + "step": 127, + "token_diversity": 0.48665625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0078125, + "epoch": 0.07407407407407407, + "grad_norm": 2.0637423992156982, + "kl": 22.09375, + "learning_rate": 9.998838362592132e-06, + "loss": 0.022, + "reward": 0.0046057256404310465, + "reward_std": 0.14092295989394188, + "rewards/ndcg_rule_reward": -0.028597401455044746, + "rewards/rule_reward": 0.033203125, + "step": 128, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.07465277777777778, + "grad_norm": 1.0554078817367554, + "kl": 2.5859375, + "learning_rate": 9.998735159083295e-06, + "loss": 0.0026, + "reward": 0.0026687849313020706, + "reward_std": 0.08291643485426903, + "rewards/ndcg_rule_reward": -0.016862466000020504, + "rewards/rule_reward": 0.01953125, + "step": 129, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.07523148148148148, + "grad_norm": 4.304537296295166, + "kl": 37.375, + "learning_rate": 9.998627564691895e-06, + "loss": 0.0374, + "reward": 0.003715288359671831, + "reward_std": 0.1329115927219391, + "rewards/ndcg_rule_reward": -0.027534712105989456, + "rewards/rule_reward": 0.03125, + "step": 130, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.07581018518518519, + "grad_norm": 1.5055108070373535, + "kl": 2.8828125, + "learning_rate": 9.998515579512446e-06, + "loss": 0.0029, + "reward": 0.0042209934908896685, + "reward_std": 0.1326894611120224, + "rewards/ndcg_rule_reward": -0.027029006741940975, + "rewards/rule_reward": 0.03125, + "step": 131, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.0763888888888889, + "grad_norm": 10.909578323364258, + "kl": 61.4140625, + "learning_rate": 9.998399203643319e-06, + "loss": 0.0613, + "reward": 0.003810521215200424, + "reward_std": 0.12448476627469063, + "rewards/ndcg_rule_reward": -0.025486353784799576, + "rewards/rule_reward": 0.029296875, + "step": 132, + "token_diversity": 0.4861875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.07696759259259259, + "grad_norm": 394.364990234375, + "kl": 726.203125, + "learning_rate": 9.998278437186734e-06, + "loss": 0.7239, + "reward": 0.003510931273922324, + "reward_std": 0.1162053644657135, + "rewards/ndcg_rule_reward": -0.023832818493247032, + "rewards/rule_reward": 0.02734375, + "step": 133, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.0775462962962963, + "grad_norm": 96.32406616210938, + "kl": 242.0625, + "learning_rate": 9.998153280248772e-06, + "loss": 0.2434, + "reward": 0.005088977050036192, + "reward_std": 0.17437244206666946, + "rewards/ndcg_rule_reward": -0.03592665120959282, + "rewards/rule_reward": 0.041015625, + "step": 134, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.078125, + "grad_norm": 30.258102416992188, + "kl": 79.65625, + "learning_rate": 9.998023732939374e-06, + "loss": 0.0795, + "reward": 0.003652912680990994, + "reward_std": 0.10770386084914207, + "rewards/ndcg_rule_reward": -0.021737712435424328, + "rewards/rule_reward": 0.025390625, + "step": 135, + "token_diversity": 0.4140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.0787037037037037, + "grad_norm": 1.641474723815918, + "kl": 7.921875, + "learning_rate": 9.997889795372331e-06, + "loss": 0.0079, + "reward": 0.003243251470848918, + "reward_std": 0.11632728204131126, + "rewards/ndcg_rule_reward": -0.02410049829632044, + "rewards/rule_reward": 0.02734375, + "step": 136, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.041015625, + "epoch": 0.07928240740740741, + "grad_norm": 3.1413142681121826, + "kl": 20.78125, + "learning_rate": 9.997751467665295e-06, + "loss": 0.0208, + "reward": 0.004209124483168125, + "reward_std": 0.14112325012683868, + "rewards/ndcg_rule_reward": -0.0289939995855093, + "rewards/rule_reward": 0.033203125, + "step": 137, + "token_diversity": 0.4471219512195122 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0703125, + "epoch": 0.0798611111111111, + "grad_norm": 2.5232949256896973, + "kl": 12.0703125, + "learning_rate": 9.997608749939774e-06, + "loss": 0.0121, + "reward": 0.0029764444334432483, + "reward_std": 0.10804028809070587, + "rewards/ndcg_rule_reward": -0.02241418045014143, + "rewards/rule_reward": 0.025390625, + "step": 138, + "token_diversity": 0.54934375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.08043981481481481, + "grad_norm": 2.4950382709503174, + "kl": 22.5, + "learning_rate": 9.997461642321127e-06, + "loss": 0.0225, + "reward": 0.0040368190966546535, + "reward_std": 0.13280470296740532, + "rewards/ndcg_rule_reward": -0.027213181369006634, + "rewards/rule_reward": 0.03125, + "step": 139, + "token_diversity": 0.54296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.08101851851851852, + "grad_norm": 1.5677881240844727, + "kl": 4.3125, + "learning_rate": 9.99731014493858e-06, + "loss": 0.0043, + "reward": 0.003972131060436368, + "reward_std": 0.14122271165251732, + "rewards/ndcg_rule_reward": -0.02923099510371685, + "rewards/rule_reward": 0.033203125, + "step": 140, + "token_diversity": 0.5703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.08159722222222222, + "grad_norm": 1.612089991569519, + "kl": 2.904296875, + "learning_rate": 9.997154257925199e-06, + "loss": 0.0029, + "reward": 0.0026492217439226806, + "reward_std": 0.1081739217042923, + "rewards/ndcg_rule_reward": -0.022741403430700302, + "rewards/rule_reward": 0.025390625, + "step": 141, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.083984375, + "epoch": 0.08217592592592593, + "grad_norm": 16.21250343322754, + "kl": 38.25, + "learning_rate": 9.996993981417925e-06, + "loss": 0.0381, + "reward": 0.004577783634886146, + "reward_std": 0.14093907177448273, + "rewards/ndcg_rule_reward": -0.02862534113228321, + "rewards/rule_reward": 0.033203125, + "step": 142, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.08275462962962964, + "grad_norm": 1.3095362186431885, + "kl": 2.361328125, + "learning_rate": 9.996829315557535e-06, + "loss": 0.0024, + "reward": 0.0021285928087309003, + "reward_std": 0.09157603606581688, + "rewards/ndcg_rule_reward": -0.019355783239006996, + "rewards/rule_reward": 0.021484375, + "step": 143, + "token_diversity": 0.5546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.021484375, + "epoch": 0.08333333333333333, + "grad_norm": 2.3988914489746094, + "kl": 11.0, + "learning_rate": 9.996660260488678e-06, + "loss": 0.011, + "reward": 0.004407697590067983, + "reward_std": 0.17469431459903717, + "rewards/ndcg_rule_reward": -0.036607928574085236, + "rewards/rule_reward": 0.041015625, + "step": 144, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.08391203703703703, + "grad_norm": 1.3021409511566162, + "kl": 3.484375, + "learning_rate": 9.996486816359851e-06, + "loss": 0.0035, + "reward": 0.003054355620406568, + "reward_std": 0.09959182888269424, + "rewards/ndcg_rule_reward": -0.020383143797516823, + "rewards/rule_reward": 0.0234375, + "step": 145, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.08449074074074074, + "grad_norm": 2.1075050830841064, + "kl": 16.3125, + "learning_rate": 9.996308983323404e-06, + "loss": 0.0163, + "reward": 0.004171836655586958, + "reward_std": 0.1411641612648964, + "rewards/ndcg_rule_reward": -0.02903128881007433, + "rewards/rule_reward": 0.033203125, + "step": 146, + "token_diversity": 0.434 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.08506944444444445, + "grad_norm": 2.9504904747009277, + "kl": 18.78125, + "learning_rate": 9.996126761535547e-06, + "loss": 0.0187, + "reward": 0.004464008146896958, + "reward_std": 0.1494011953473091, + "rewards/ndcg_rule_reward": -0.030692243948578835, + "rewards/rule_reward": 0.03515625, + "step": 147, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.08564814814814815, + "grad_norm": 22.354555130004883, + "kl": 89.0, + "learning_rate": 9.995940151156344e-06, + "loss": 0.089, + "reward": 0.0056968373246490955, + "reward_std": 0.1909354254603386, + "rewards/ndcg_rule_reward": -0.03922503814101219, + "rewards/rule_reward": 0.044921875, + "step": 148, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.08622685185185185, + "grad_norm": 1.3824520111083984, + "kl": 8.0625, + "learning_rate": 9.995749152349715e-06, + "loss": 0.0081, + "reward": 0.0034701055847108364, + "reward_std": 0.10779714584350586, + "rewards/ndcg_rule_reward": -0.021920520812273026, + "rewards/rule_reward": 0.025390625, + "step": 149, + "token_diversity": 0.40331249999999996 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.08680555555555555, + "grad_norm": 1.2636902332305908, + "kl": 7.5625, + "learning_rate": 9.995553765283429e-06, + "loss": 0.0076, + "reward": 0.0022550441790372133, + "reward_std": 0.0915289968252182, + "rewards/ndcg_rule_reward": -0.019229331519454718, + "rewards/rule_reward": 0.021484375, + "step": 150, + "token_diversity": 0.4781875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.08738425925925926, + "grad_norm": 2.1100590229034424, + "kl": 14.546875, + "learning_rate": 9.995353990129115e-06, + "loss": 0.0146, + "reward": 0.003696698695421219, + "reward_std": 0.12451079487800598, + "rewards/ndcg_rule_reward": -0.025600175373256207, + "rewards/rule_reward": 0.029296875, + "step": 151, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.08796296296296297, + "grad_norm": 1.4393932819366455, + "kl": 4.6171875, + "learning_rate": 9.995149827062256e-06, + "loss": 0.0046, + "reward": 0.0031895197462290525, + "reward_std": 0.11631079390645027, + "rewards/ndcg_rule_reward": -0.024154230020940304, + "rewards/rule_reward": 0.02734375, + "step": 152, + "token_diversity": 0.49790625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.08854166666666667, + "grad_norm": 2.613635778427124, + "kl": 9.5, + "learning_rate": 9.994941276262188e-06, + "loss": 0.0095, + "reward": 0.00394737021997571, + "reward_std": 0.11597709730267525, + "rewards/ndcg_rule_reward": -0.023396380245685577, + "rewards/rule_reward": 0.02734375, + "step": 153, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.08912037037037036, + "grad_norm": 1.501265048980713, + "kl": 10.28125, + "learning_rate": 9.994728337912105e-06, + "loss": 0.0103, + "reward": 0.0034845280461013317, + "reward_std": 0.09935629740357399, + "rewards/ndcg_rule_reward": -0.019952972885221243, + "rewards/rule_reward": 0.0234375, + "step": 154, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.08969907407407407, + "grad_norm": 1.9355782270431519, + "kl": 11.34375, + "learning_rate": 9.994511012199047e-06, + "loss": 0.0114, + "reward": 0.004337949678301811, + "reward_std": 0.12422341853380203, + "rewards/ndcg_rule_reward": -0.02495892532169819, + "rewards/rule_reward": 0.029296875, + "step": 155, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.09027777777777778, + "grad_norm": 4.958339214324951, + "kl": 17.1875, + "learning_rate": 9.994289299313915e-06, + "loss": 0.0171, + "reward": 0.0026988020981661975, + "reward_std": 0.09133382886648178, + "rewards/ndcg_rule_reward": -0.018785573542118073, + "rewards/rule_reward": 0.021484375, + "step": 156, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.09085648148148148, + "grad_norm": 1.8624495267868042, + "kl": 5.390625, + "learning_rate": 9.99406319945146e-06, + "loss": 0.0054, + "reward": 0.004079842823557556, + "reward_std": 0.1411849483847618, + "rewards/ndcg_rule_reward": -0.029123282060027122, + "rewards/rule_reward": 0.033203125, + "step": 157, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.09143518518518519, + "grad_norm": 4.194841384887695, + "kl": 24.828125, + "learning_rate": 9.99383271281029e-06, + "loss": 0.0249, + "reward": 0.003905713325366378, + "reward_std": 0.1244528740644455, + "rewards/ndcg_rule_reward": -0.02539116144180298, + "rewards/rule_reward": 0.029296875, + "step": 158, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.0920138888888889, + "grad_norm": 7.076115608215332, + "kl": 29.375, + "learning_rate": 9.993597839592863e-06, + "loss": 0.0294, + "reward": 0.004882907727733254, + "reward_std": 0.1407802253961563, + "rewards/ndcg_rule_reward": -0.028320217039436102, + "rewards/rule_reward": 0.033203125, + "step": 159, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.09259259259259259, + "grad_norm": 48.21598815917969, + "kl": 186.125, + "learning_rate": 9.99335858000549e-06, + "loss": 0.1871, + "reward": 0.004547311807982624, + "reward_std": 0.14094552397727966, + "rewards/ndcg_rule_reward": -0.028655813075602055, + "rewards/rule_reward": 0.033203125, + "step": 160, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.0931712962962963, + "grad_norm": 1.3419537544250488, + "kl": 5.6875, + "learning_rate": 9.99311493425834e-06, + "loss": 0.0057, + "reward": 0.0038818351458758116, + "reward_std": 0.13285491988062859, + "rewards/ndcg_rule_reward": -0.027368164621293545, + "rewards/rule_reward": 0.03125, + "step": 161, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.09375, + "grad_norm": 1.4815242290496826, + "kl": 12.0, + "learning_rate": 9.992866902565428e-06, + "loss": 0.012, + "reward": 0.0032407627440989017, + "reward_std": 0.0994686670601368, + "rewards/ndcg_rule_reward": -0.02019673679023981, + "rewards/rule_reward": 0.0234375, + "step": 162, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.0943287037037037, + "grad_norm": 1.452575445175171, + "kl": 5.125, + "learning_rate": 9.992614485144624e-06, + "loss": 0.0051, + "reward": 0.00391158857382834, + "reward_std": 0.13284815847873688, + "rewards/ndcg_rule_reward": -0.027338411659002304, + "rewards/rule_reward": 0.03125, + "step": 163, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.09490740740740741, + "grad_norm": 2.2328052520751953, + "kl": 21.25, + "learning_rate": 9.992357682217653e-06, + "loss": 0.0213, + "reward": 0.004257158609107137, + "reward_std": 0.13267428427934647, + "rewards/ndcg_rule_reward": -0.026992841623723507, + "rewards/rule_reward": 0.03125, + "step": 164, + "token_diversity": 0.4505625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.0954861111111111, + "grad_norm": 1.3650492429733276, + "kl": 9.78125, + "learning_rate": 9.99209649401009e-06, + "loss": 0.0098, + "reward": 0.0034450748935341835, + "reward_std": 0.09937231615185738, + "rewards/ndcg_rule_reward": -0.019992425106465816, + "rewards/rule_reward": 0.0234375, + "step": 165, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.09606481481481481, + "grad_norm": 20.76105499267578, + "kl": 77.59375, + "learning_rate": 9.991830920751362e-06, + "loss": 0.0777, + "reward": 0.0057755589950829744, + "reward_std": 0.1740284562110901, + "rewards/ndcg_rule_reward": -0.035240067169070244, + "rewards/rule_reward": 0.041015625, + "step": 166, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.09664351851851852, + "grad_norm": 1.6667269468307495, + "kl": 15.09375, + "learning_rate": 9.991560962674749e-06, + "loss": 0.0151, + "reward": 0.0030245418893173337, + "reward_std": 0.11639701202511787, + "rewards/ndcg_rule_reward": -0.024319208227097988, + "rewards/rule_reward": 0.02734375, + "step": 167, + "token_diversity": 0.436 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.09722222222222222, + "grad_norm": 15.095471382141113, + "kl": 50.40625, + "learning_rate": 9.991286620017383e-06, + "loss": 0.0505, + "reward": 0.004082794999703765, + "reward_std": 0.12433501705527306, + "rewards/ndcg_rule_reward": -0.02521407976746559, + "rewards/rule_reward": 0.029296875, + "step": 168, + "token_diversity": 0.454375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.09780092592592593, + "grad_norm": 26.376192092895508, + "kl": 68.59375, + "learning_rate": 9.991007893020242e-06, + "loss": 0.0687, + "reward": 0.003520325990393758, + "reward_std": 0.11617536842823029, + "rewards/ndcg_rule_reward": -0.023823424242436886, + "rewards/rule_reward": 0.02734375, + "step": 169, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.09837962962962964, + "grad_norm": 1.5913351774215698, + "kl": 15.96875, + "learning_rate": 9.990724781928163e-06, + "loss": 0.016, + "reward": 0.004268725519068539, + "reward_std": 0.13266894966363907, + "rewards/ndcg_rule_reward": -0.026981275528669357, + "rewards/rule_reward": 0.03125, + "step": 170, + "token_diversity": 0.45399999999999996 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.09895833333333333, + "grad_norm": 1.7596466541290283, + "kl": 1.97265625, + "learning_rate": 9.99043728698983e-06, + "loss": 0.002, + "reward": 0.003880319301970303, + "reward_std": 0.10760163515806198, + "rewards/ndcg_rule_reward": -0.02151030581444502, + "rewards/rule_reward": 0.025390625, + "step": 171, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.09953703703703703, + "grad_norm": 1.8409804105758667, + "kl": 17.0, + "learning_rate": 9.990145408457777e-06, + "loss": 0.0169, + "reward": 0.004380995640531182, + "reward_std": 0.14942315965890884, + "rewards/ndcg_rule_reward": -0.03077525459229946, + "rewards/rule_reward": 0.03515625, + "step": 172, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.10011574074074074, + "grad_norm": 1.9636378288269043, + "kl": 20.390625, + "learning_rate": 9.98984914658839e-06, + "loss": 0.0204, + "reward": 0.003083456540480256, + "reward_std": 0.09954766556620598, + "rewards/ndcg_rule_reward": -0.020354043692350388, + "rewards/rule_reward": 0.0234375, + "step": 173, + "token_diversity": 0.54134375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.021484375, + "epoch": 0.10069444444444445, + "grad_norm": 1.5379183292388916, + "kl": 8.46875, + "learning_rate": 9.989548501641906e-06, + "loss": 0.0085, + "reward": 0.003977210493758321, + "reward_std": 0.13278799504041672, + "rewards/ndcg_rule_reward": -0.027272788807749748, + "rewards/rule_reward": 0.03125, + "step": 174, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.10127314814814815, + "grad_norm": 3.2257752418518066, + "kl": 22.828125, + "learning_rate": 9.989243473882411e-06, + "loss": 0.0229, + "reward": 0.003409460885450244, + "reward_std": 0.12466900423169136, + "rewards/ndcg_rule_reward": -0.025887414813041687, + "rewards/rule_reward": 0.029296875, + "step": 175, + "token_diversity": 0.48571875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.10185185185185185, + "grad_norm": 1.7014633417129517, + "kl": 8.0625, + "learning_rate": 9.98893406357784e-06, + "loss": 0.0081, + "reward": 0.0038972427137196064, + "reward_std": 0.14128346741199493, + "rewards/ndcg_rule_reward": -0.02930588275194168, + "rewards/rule_reward": 0.033203125, + "step": 176, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.10243055555555555, + "grad_norm": 1.4794011116027832, + "kl": 7.75, + "learning_rate": 9.98862027099998e-06, + "loss": 0.0078, + "reward": 0.0038933708565309644, + "reward_std": 0.11602075397968292, + "rewards/ndcg_rule_reward": -0.023450379259884357, + "rewards/rule_reward": 0.02734375, + "step": 177, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.10300925925925926, + "grad_norm": 1.6724566221237183, + "kl": 10.609375, + "learning_rate": 9.988302096424463e-06, + "loss": 0.0106, + "reward": 0.0034728283062577248, + "reward_std": 0.1330726593732834, + "rewards/ndcg_rule_reward": -0.0277771707624197, + "rewards/rule_reward": 0.03125, + "step": 178, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.10358796296296297, + "grad_norm": 2.08439040184021, + "kl": 10.3125, + "learning_rate": 9.987979540130777e-06, + "loss": 0.0103, + "reward": 0.0034548642579466105, + "reward_std": 0.12458901479840279, + "rewards/ndcg_rule_reward": -0.025842010974884033, + "rewards/rule_reward": 0.029296875, + "step": 179, + "token_diversity": 0.4538125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.10416666666666667, + "grad_norm": 1.4248733520507812, + "kl": 8.265625, + "learning_rate": 9.987652602402254e-06, + "loss": 0.0083, + "reward": 0.003748095128685236, + "reward_std": 0.11607903614640236, + "rewards/ndcg_rule_reward": -0.023595656268298626, + "rewards/rule_reward": 0.02734375, + "step": 180, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.10474537037037036, + "grad_norm": 47.853492736816406, + "kl": 67.8828125, + "learning_rate": 9.987321283526072e-06, + "loss": 0.068, + "reward": 0.0035767288645729423, + "reward_std": 0.11616173759102821, + "rewards/ndcg_rule_reward": -0.023767021484673023, + "rewards/rule_reward": 0.02734375, + "step": 181, + "token_diversity": 0.492 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.10532407407407407, + "grad_norm": 2.3171446323394775, + "kl": 13.6875, + "learning_rate": 9.986985583793268e-06, + "loss": 0.0137, + "reward": 0.004059256287291646, + "reward_std": 0.1412198841571808, + "rewards/ndcg_rule_reward": -0.029143869876861572, + "rewards/rule_reward": 0.033203125, + "step": 182, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.10590277777777778, + "grad_norm": 1.4396650791168213, + "kl": 6.9375, + "learning_rate": 9.986645503498713e-06, + "loss": 0.0069, + "reward": 0.003240579506382346, + "reward_std": 0.10789451003074646, + "rewards/ndcg_rule_reward": -0.02215004526078701, + "rewards/rule_reward": 0.025390625, + "step": 183, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.10648148148148148, + "grad_norm": 7.357369899749756, + "kl": 43.03125, + "learning_rate": 9.986301042941139e-06, + "loss": 0.043, + "reward": 0.0036137059796601534, + "reward_std": 0.12457402050495148, + "rewards/ndcg_rule_reward": -0.025683170184493065, + "rewards/rule_reward": 0.029296875, + "step": 184, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.10706018518518519, + "grad_norm": 1.3349391222000122, + "kl": 11.109375, + "learning_rate": 9.985952202423116e-06, + "loss": 0.0111, + "reward": 0.0030584702035412192, + "reward_std": 0.09116169437766075, + "rewards/ndcg_rule_reward": -0.018425905145704746, + "rewards/rule_reward": 0.021484375, + "step": 185, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.1076388888888889, + "grad_norm": 4.551126956939697, + "kl": 24.34375, + "learning_rate": 9.985598982251065e-06, + "loss": 0.0243, + "reward": 0.004258750705048442, + "reward_std": 0.1579279899597168, + "rewards/ndcg_rule_reward": -0.03285062499344349, + "rewards/rule_reward": 0.037109375, + "step": 186, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.10821759259259259, + "grad_norm": 1.6507372856140137, + "kl": 16.8125, + "learning_rate": 9.985241382735259e-06, + "loss": 0.0168, + "reward": 0.004550508572719991, + "reward_std": 0.13254791498184204, + "rewards/ndcg_rule_reward": -0.02669949270784855, + "rewards/rule_reward": 0.03125, + "step": 187, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.96484375, + "epoch": 0.1087962962962963, + "grad_norm": 2.5143704414367676, + "kl": 18.8125, + "learning_rate": 9.984879404189805e-06, + "loss": 0.0187, + "reward": 0.004058515420183539, + "reward_std": 0.14959433674812317, + "rewards/ndcg_rule_reward": -0.031097734346985817, + "rewards/rule_reward": 0.03515625, + "step": 188, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.109375, + "grad_norm": 2.250676155090332, + "kl": 14.484375, + "learning_rate": 9.98451304693267e-06, + "loss": 0.0145, + "reward": 0.0030996627174317837, + "reward_std": 0.10800182819366455, + "rewards/ndcg_rule_reward": -0.02229096181690693, + "rewards/rule_reward": 0.025390625, + "step": 189, + "token_diversity": 0.44265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.1099537037037037, + "grad_norm": 2.0291647911071777, + "kl": 18.4375, + "learning_rate": 9.984142311285662e-06, + "loss": 0.0185, + "reward": 0.004066008375957608, + "reward_std": 0.14958564937114716, + "rewards/ndcg_rule_reward": -0.03109024092555046, + "rewards/rule_reward": 0.03515625, + "step": 190, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.11053240740740741, + "grad_norm": 2.002106189727783, + "kl": 17.15625, + "learning_rate": 9.983767197574432e-06, + "loss": 0.0172, + "reward": 0.0037526650121435523, + "reward_std": 0.12449149042367935, + "rewards/ndcg_rule_reward": -0.025544210337102413, + "rewards/rule_reward": 0.029296875, + "step": 191, + "token_diversity": 0.49428125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.1111111111111111, + "grad_norm": 1.4461768865585327, + "kl": 8.640625, + "learning_rate": 9.98338770612848e-06, + "loss": 0.0086, + "reward": 0.004337949794717133, + "reward_std": 0.12422341853380203, + "rewards/ndcg_rule_reward": -0.024958926253020763, + "rewards/rule_reward": 0.029296875, + "step": 192, + "token_diversity": 0.4785625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.11168981481481481, + "grad_norm": 2.1024577617645264, + "kl": 21.46875, + "learning_rate": 9.983003837281152e-06, + "loss": 0.0215, + "reward": 0.0032424740493297577, + "reward_std": 0.09946837648749352, + "rewards/ndcg_rule_reward": -0.020195025019347668, + "rewards/rule_reward": 0.0234375, + "step": 193, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11226851851851852, + "grad_norm": 560.9481811523438, + "kl": 1214.0625, + "learning_rate": 9.98261559136964e-06, + "loss": 1.2179, + "reward": 0.005123803857713938, + "reward_std": 0.16593087464571, + "rewards/ndcg_rule_reward": -0.0339386947453022, + "rewards/rule_reward": 0.0390625, + "step": 194, + "token_diversity": 0.474 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11284722222222222, + "grad_norm": 275.1831970214844, + "kl": 570.375, + "learning_rate": 9.982222968734974e-06, + "loss": 0.5714, + "reward": 0.003612211439758539, + "reward_std": 0.12453244253993034, + "rewards/ndcg_rule_reward": -0.025684664025902748, + "rewards/rule_reward": 0.029296875, + "step": 195, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.11342592592592593, + "grad_norm": 2.6531403064727783, + "kl": 21.078125, + "learning_rate": 9.981825969722034e-06, + "loss": 0.021, + "reward": 0.004264455754309893, + "reward_std": 0.12424305826425552, + "rewards/ndcg_rule_reward": -0.02503241878002882, + "rewards/rule_reward": 0.029296875, + "step": 196, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.11400462962962964, + "grad_norm": 1.5272783041000366, + "kl": 6.6875, + "learning_rate": 9.981424594679545e-06, + "loss": 0.0067, + "reward": 0.004368629306554794, + "reward_std": 0.14947008341550827, + "rewards/ndcg_rule_reward": -0.03078762162476778, + "rewards/rule_reward": 0.03515625, + "step": 197, + "token_diversity": 0.5410625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11458333333333333, + "grad_norm": 32.62174987792969, + "kl": 137.25, + "learning_rate": 9.981018843960075e-06, + "loss": 0.137, + "reward": 0.003212661948055029, + "reward_std": 0.10793743655085564, + "rewards/ndcg_rule_reward": -0.022177962586283684, + "rewards/rule_reward": 0.025390625, + "step": 198, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11516203703703703, + "grad_norm": 2.359384298324585, + "kl": 7.546875, + "learning_rate": 9.980608717920034e-06, + "loss": 0.0076, + "reward": 0.0026564239524304867, + "reward_std": 0.0913468711078167, + "rewards/ndcg_rule_reward": -0.0188279515132308, + "rewards/rule_reward": 0.021484375, + "step": 199, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.11574074074074074, + "grad_norm": 20.020307540893555, + "kl": 81.59375, + "learning_rate": 9.980194216919676e-06, + "loss": 0.0819, + "reward": 0.004002627218142152, + "reward_std": 0.12436092272400856, + "rewards/ndcg_rule_reward": -0.025294248014688492, + "rewards/rule_reward": 0.029296875, + "step": 200, + "token_diversity": 0.46190624999999996 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.11631944444444445, + "grad_norm": 1.5459603071212769, + "kl": 7.3984375, + "learning_rate": 9.979775341323097e-06, + "loss": 0.0074, + "reward": 0.003290464694146067, + "reward_std": 0.09945726208388805, + "rewards/ndcg_rule_reward": -0.020147035364061594, + "rewards/rule_reward": 0.0234375, + "step": 201, + "token_diversity": 0.49371875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11689814814814815, + "grad_norm": 1.1216109991073608, + "kl": 7.1806640625, + "learning_rate": 9.97935209149824e-06, + "loss": 0.0072, + "reward": 0.0020954771316610277, + "reward_std": 0.06632944941520691, + "rewards/ndcg_rule_reward": -0.013529523275792599, + "rewards/rule_reward": 0.015625, + "step": 202, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11747685185185185, + "grad_norm": 1.8041737079620361, + "kl": 8.96875, + "learning_rate": 9.978924467816884e-06, + "loss": 0.009, + "reward": 0.0022699476685374975, + "reward_std": 0.07467754930257797, + "rewards/ndcg_rule_reward": -0.015308177098631859, + "rewards/rule_reward": 0.017578125, + "step": 203, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11805555555555555, + "grad_norm": 1.365405797958374, + "kl": 9.4375, + "learning_rate": 9.978492470654654e-06, + "loss": 0.0094, + "reward": 0.002904410124756396, + "reward_std": 0.09964073076844215, + "rewards/ndcg_rule_reward": -0.020533090457320213, + "rewards/rule_reward": 0.0234375, + "step": 204, + "token_diversity": 0.486 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.11863425925925926, + "grad_norm": 1.371785283088684, + "kl": 8.328125, + "learning_rate": 9.978056100391017e-06, + "loss": 0.0083, + "reward": 0.003921948606148362, + "reward_std": 0.14123956486582756, + "rewards/ndcg_rule_reward": -0.02928117662668228, + "rewards/rule_reward": 0.033203125, + "step": 205, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.11921296296296297, + "grad_norm": 1.685886263847351, + "kl": 7.3125, + "learning_rate": 9.977615357409281e-06, + "loss": 0.0073, + "reward": 0.0033032370265573263, + "reward_std": 0.11630560085177422, + "rewards/ndcg_rule_reward": -0.024040513671934605, + "rewards/rule_reward": 0.02734375, + "step": 206, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.11979166666666667, + "grad_norm": 1.5659021139144897, + "kl": 5.625, + "learning_rate": 9.977170242096588e-06, + "loss": 0.0056, + "reward": 0.003808901645243168, + "reward_std": 0.12444756180047989, + "rewards/ndcg_rule_reward": -0.025487973354756832, + "rewards/rule_reward": 0.029296875, + "step": 207, + "token_diversity": 0.4778125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.12037037037037036, + "grad_norm": 20.051677703857422, + "kl": 95.96875, + "learning_rate": 9.976720754843933e-06, + "loss": 0.096, + "reward": 0.0037238255608826876, + "reward_std": 0.13289638236165047, + "rewards/ndcg_rule_reward": -0.027526174671947956, + "rewards/rule_reward": 0.03125, + "step": 208, + "token_diversity": 0.47406506147540983 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.12094907407407407, + "grad_norm": 1.8089065551757812, + "kl": 9.84375, + "learning_rate": 9.976266896046143e-06, + "loss": 0.0098, + "reward": 0.002931955736130476, + "reward_std": 0.10805151611566544, + "rewards/ndcg_rule_reward": -0.02245866972953081, + "rewards/rule_reward": 0.025390625, + "step": 209, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.12152777777777778, + "grad_norm": 1.4354021549224854, + "kl": 7.3125, + "learning_rate": 9.975808666101887e-06, + "loss": 0.0073, + "reward": 0.0039137552957981825, + "reward_std": 0.11598599702119827, + "rewards/ndcg_rule_reward": -0.023429994471371174, + "rewards/rule_reward": 0.02734375, + "step": 210, + "token_diversity": 0.46665625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.12210648148148148, + "grad_norm": 4.380974292755127, + "kl": 28.40625, + "learning_rate": 9.975346065413673e-06, + "loss": 0.0285, + "reward": 0.003608238184824586, + "reward_std": 0.12455273419618607, + "rewards/ndcg_rule_reward": -0.025688636116683483, + "rewards/rule_reward": 0.029296875, + "step": 211, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.12268518518518519, + "grad_norm": 1.6632018089294434, + "kl": 12.546875, + "learning_rate": 9.974879094387849e-06, + "loss": 0.0126, + "reward": 0.0030388315208256245, + "reward_std": 0.11638538539409637, + "rewards/ndcg_rule_reward": -0.024304918013513088, + "rewards/rule_reward": 0.02734375, + "step": 212, + "token_diversity": 0.40665625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.1232638888888889, + "grad_norm": 2.027757406234741, + "kl": 9.03125, + "learning_rate": 9.974407753434604e-06, + "loss": 0.009, + "reward": 0.00334672664757818, + "reward_std": 0.10786322504281998, + "rewards/ndcg_rule_reward": -0.02204389963299036, + "rewards/rule_reward": 0.025390625, + "step": 213, + "token_diversity": 0.51334375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.96484375, + "epoch": 0.12384259259259259, + "grad_norm": 1.6451691389083862, + "kl": 16.21875, + "learning_rate": 9.973932042967957e-06, + "loss": 0.0162, + "reward": 0.004158308845944703, + "reward_std": 0.10745060443878174, + "rewards/ndcg_rule_reward": -0.02123231627047062, + "rewards/rule_reward": 0.025390625, + "step": 214, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.1244212962962963, + "grad_norm": 10.134576797485352, + "kl": 39.4375, + "learning_rate": 9.97345196340578e-06, + "loss": 0.0395, + "reward": 0.004512685351073742, + "reward_std": 0.1409619227051735, + "rewards/ndcg_rule_reward": -0.028690439648926258, + "rewards/rule_reward": 0.033203125, + "step": 215, + "token_diversity": 0.4825625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.125, + "grad_norm": 2.5761241912841797, + "kl": 23.0, + "learning_rate": 9.97296751516977e-06, + "loss": 0.023, + "reward": 0.004847373929806054, + "reward_std": 0.16608117148280144, + "rewards/ndcg_rule_reward": -0.034215125255286694, + "rewards/rule_reward": 0.0390625, + "step": 216, + "token_diversity": 0.45065625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.1255787037037037, + "grad_norm": 1.2402633428573608, + "kl": 11.265625, + "learning_rate": 9.972478698685463e-06, + "loss": 0.0113, + "reward": 0.0016850050305947661, + "reward_std": 0.06655768677592278, + "rewards/ndcg_rule_reward": -0.013939994852989912, + "rewards/rule_reward": 0.015625, + "step": 217, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.1261574074074074, + "grad_norm": 1.735680341720581, + "kl": 20.25, + "learning_rate": 9.971985514382239e-06, + "loss": 0.0202, + "reward": 0.002251421508844942, + "reward_std": 0.08311107009649277, + "rewards/ndcg_rule_reward": -0.017279828898608685, + "rewards/rule_reward": 0.01953125, + "step": 218, + "token_diversity": 0.478 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.1267361111111111, + "grad_norm": 38.04151153564453, + "kl": 92.625, + "learning_rate": 9.97148796269331e-06, + "loss": 0.093, + "reward": 0.0033794642658904195, + "reward_std": 0.11621838808059692, + "rewards/ndcg_rule_reward": -0.023964285850524902, + "rewards/rule_reward": 0.02734375, + "step": 219, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0078125, + "epoch": 0.12731481481481483, + "grad_norm": 30.989521026611328, + "kl": 164.875, + "learning_rate": 9.970986044055723e-06, + "loss": 0.1648, + "reward": 0.004176492337137461, + "reward_std": 0.1326996013522148, + "rewards/ndcg_rule_reward": -0.027073507197201252, + "rewards/rule_reward": 0.03125, + "step": 220, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.12789351851851852, + "grad_norm": 4.331508159637451, + "kl": 33.0625, + "learning_rate": 9.970479758910365e-06, + "loss": 0.033, + "reward": 0.0037427610950544477, + "reward_std": 0.13290971890091896, + "rewards/ndcg_rule_reward": -0.027507239021360874, + "rewards/rule_reward": 0.03125, + "step": 221, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.1284722222222222, + "grad_norm": 1.450467824935913, + "kl": 14.671875, + "learning_rate": 9.969969107701952e-06, + "loss": 0.0147, + "reward": 0.0029353881254792213, + "reward_std": 0.0912020318210125, + "rewards/ndcg_rule_reward": -0.01854898687452078, + "rewards/rule_reward": 0.021484375, + "step": 222, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.12905092592592593, + "grad_norm": 1.4840534925460815, + "kl": 17.875, + "learning_rate": 9.969454090879043e-06, + "loss": 0.0178, + "reward": 0.003450399381108582, + "reward_std": 0.09937350824475288, + "rewards/ndcg_rule_reward": -0.01998710073530674, + "rewards/rule_reward": 0.0234375, + "step": 223, + "token_diversity": 0.5546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.12962962962962962, + "grad_norm": 1.5277975797653198, + "kl": 21.25, + "learning_rate": 9.968934708894029e-06, + "loss": 0.0213, + "reward": 0.004833946004509926, + "reward_std": 0.14921987056732178, + "rewards/ndcg_rule_reward": -0.030322303995490074, + "rewards/rule_reward": 0.03515625, + "step": 224, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.13020833333333334, + "grad_norm": 0.9313162565231323, + "kl": 5.03125, + "learning_rate": 9.968410962203131e-06, + "loss": 0.005, + "reward": 0.0026372154243290424, + "reward_std": 0.07449373230338097, + "rewards/ndcg_rule_reward": -0.014940910041332245, + "rewards/rule_reward": 0.017578125, + "step": 225, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.13078703703703703, + "grad_norm": 1.8906068801879883, + "kl": 8.875, + "learning_rate": 9.96788285126641e-06, + "loss": 0.0089, + "reward": 0.0044950305018574, + "reward_std": 0.1409653276205063, + "rewards/ndcg_rule_reward": -0.028708094730973244, + "rewards/rule_reward": 0.033203125, + "step": 226, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.13136574074074073, + "grad_norm": 1.5521939992904663, + "kl": 13.125, + "learning_rate": 9.967350376547757e-06, + "loss": 0.0131, + "reward": 0.0032062861137092113, + "reward_std": 0.09953805804252625, + "rewards/ndcg_rule_reward": -0.020231214817613363, + "rewards/rule_reward": 0.0234375, + "step": 227, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.953125, + "epoch": 0.13194444444444445, + "grad_norm": 2.538470506668091, + "kl": 41.625, + "learning_rate": 9.966813538514898e-06, + "loss": 0.0417, + "reward": 0.00600956124253571, + "reward_std": 0.19074992090463638, + "rewards/ndcg_rule_reward": -0.03891231399029493, + "rewards/rule_reward": 0.044921875, + "step": 228, + "token_diversity": 0.52971875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.13252314814814814, + "grad_norm": 3.4107472896575928, + "kl": 23.875, + "learning_rate": 9.966272337639386e-06, + "loss": 0.0239, + "reward": 0.005032099550589919, + "reward_std": 0.1407046876847744, + "rewards/ndcg_rule_reward": -0.028171025216579437, + "rewards/rule_reward": 0.033203125, + "step": 229, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.13310185185185186, + "grad_norm": 94.10326385498047, + "kl": 380.5, + "learning_rate": 9.965726774396619e-06, + "loss": 0.3818, + "reward": 0.005005551967769861, + "reward_std": 0.14074178785085678, + "rewards/ndcg_rule_reward": -0.028197573497891426, + "rewards/rule_reward": 0.033203125, + "step": 230, + "token_diversity": 0.45428124999999997 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.13368055555555555, + "grad_norm": 1.6199374198913574, + "kl": 12.765625, + "learning_rate": 9.965176849265814e-06, + "loss": 0.0128, + "reward": 0.003759535844437778, + "reward_std": 0.11606846004724503, + "rewards/ndcg_rule_reward": -0.0235842140391469, + "rewards/rule_reward": 0.02734375, + "step": 231, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.13425925925925927, + "grad_norm": 2.1195833683013916, + "kl": 23.5, + "learning_rate": 9.964622562730026e-06, + "loss": 0.0235, + "reward": 0.004512006998993456, + "reward_std": 0.15781936421990395, + "rewards/ndcg_rule_reward": -0.03259736858308315, + "rewards/rule_reward": 0.037109375, + "step": 232, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.13483796296296297, + "grad_norm": 22.261951446533203, + "kl": 97.8125, + "learning_rate": 9.964063915276141e-06, + "loss": 0.0981, + "reward": 0.0037838652497157454, + "reward_std": 0.1244901530444622, + "rewards/ndcg_rule_reward": -0.02551301009953022, + "rewards/rule_reward": 0.029296875, + "step": 233, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.13541666666666666, + "grad_norm": 9.681098937988281, + "kl": 59.6875, + "learning_rate": 9.96350090739487e-06, + "loss": 0.0596, + "reward": 0.003092047991231084, + "reward_std": 0.09954698011279106, + "rewards/ndcg_rule_reward": -0.02034545224159956, + "rewards/rule_reward": 0.0234375, + "step": 234, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.13599537037037038, + "grad_norm": 1.5919853448867798, + "kl": 19.125, + "learning_rate": 9.962933539580763e-06, + "loss": 0.0191, + "reward": 0.004200189374387264, + "reward_std": 0.11586707085371017, + "rewards/ndcg_rule_reward": -0.023143560625612736, + "rewards/rule_reward": 0.02734375, + "step": 235, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.13657407407407407, + "grad_norm": 1.5562537908554077, + "kl": 9.203125, + "learning_rate": 9.96236181233219e-06, + "loss": 0.0092, + "reward": 0.0022028969833627343, + "reward_std": 0.06630209274590015, + "rewards/ndcg_rule_reward": -0.013422103598713875, + "rewards/rule_reward": 0.015625, + "step": 236, + "token_diversity": 0.41084375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.1371527777777778, + "grad_norm": 1.2431248426437378, + "kl": 4.296875, + "learning_rate": 9.961785726151363e-06, + "loss": 0.0043, + "reward": 0.0024378886446356773, + "reward_std": 0.09145256131887436, + "rewards/ndcg_rule_reward": -0.019046487286686897, + "rewards/rule_reward": 0.021484375, + "step": 237, + "token_diversity": 0.55078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.13773148148148148, + "grad_norm": 2.594188690185547, + "kl": 11.375, + "learning_rate": 9.961205281544308e-06, + "loss": 0.0114, + "reward": 0.0036354591138660908, + "reward_std": 0.11613690853118896, + "rewards/ndcg_rule_reward": -0.023708291351795197, + "rewards/rule_reward": 0.02734375, + "step": 238, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.13831018518518517, + "grad_norm": 1.7599163055419922, + "kl": 10.0625, + "learning_rate": 9.96062047902089e-06, + "loss": 0.0101, + "reward": 0.0029006176628172398, + "reward_std": 0.10809947922825813, + "rewards/ndcg_rule_reward": -0.022490007802844048, + "rewards/rule_reward": 0.025390625, + "step": 239, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.9765625, + "epoch": 0.1388888888888889, + "grad_norm": 1.9778149127960205, + "kl": 9.875, + "learning_rate": 9.9600313190948e-06, + "loss": 0.0099, + "reward": 0.004750518128275871, + "reward_std": 0.1830368936061859, + "rewards/ndcg_rule_reward": -0.03821823373436928, + "rewards/rule_reward": 0.04296875, + "step": 240, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.970703125, + "epoch": 0.1394675925925926, + "grad_norm": 1.676651120185852, + "kl": 17.578125, + "learning_rate": 9.959437802283552e-06, + "loss": 0.0176, + "reward": 0.0041133377235382795, + "reward_std": 0.13272428140044212, + "rewards/ndcg_rule_reward": -0.027136662043631077, + "rewards/rule_reward": 0.03125, + "step": 241, + "token_diversity": 0.47790625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.12109375, + "epoch": 0.1400462962962963, + "grad_norm": 2.1542255878448486, + "kl": 10.78125, + "learning_rate": 9.958839929108495e-06, + "loss": 0.0108, + "reward": 0.0063866013661026955, + "reward_std": 0.17775721848011017, + "rewards/ndcg_rule_reward": -0.03658214956521988, + "rewards/rule_reward": 0.04296875, + "step": 242, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.140625, + "grad_norm": 4.695531368255615, + "kl": 33.5, + "learning_rate": 9.958237700094795e-06, + "loss": 0.0335, + "reward": 0.0027214534347876906, + "reward_std": 0.11658339947462082, + "rewards/ndcg_rule_reward": -0.024622296914458275, + "rewards/rule_reward": 0.02734375, + "step": 243, + "token_diversity": 0.5021875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.576171875, + "epoch": 0.1412037037037037, + "grad_norm": 1.388343334197998, + "kl": 9.2734375, + "learning_rate": 9.957631115771451e-06, + "loss": 0.0093, + "reward": 0.004850620636716485, + "reward_std": 0.10268417000770569, + "rewards/ndcg_rule_reward": -0.020540005061775446, + "rewards/rule_reward": 0.025390625, + "step": 244, + "token_diversity": 0.48143749999999996 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.1417824074074074, + "grad_norm": 1.6832258701324463, + "kl": 16.21875, + "learning_rate": 9.957020176671289e-06, + "loss": 0.0162, + "reward": 0.004334785393439233, + "reward_std": 0.12418752908706665, + "rewards/ndcg_rule_reward": -0.024962089955806732, + "rewards/rule_reward": 0.029296875, + "step": 245, + "token_diversity": 0.48543749999999997 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.001953125, + "epoch": 0.1423611111111111, + "grad_norm": 2.8039603233337402, + "kl": 32.0625, + "learning_rate": 9.956404883330953e-06, + "loss": 0.0321, + "reward": 0.006673220079392195, + "reward_std": 0.15237951278686523, + "rewards/ndcg_rule_reward": -0.030436155386269093, + "rewards/rule_reward": 0.037109375, + "step": 246, + "token_diversity": 0.44618749999999996 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.14293981481481483, + "grad_norm": 1.0813724994659424, + "kl": 13.28125, + "learning_rate": 9.955785236290918e-06, + "loss": 0.0133, + "reward": 0.0018644927768036723, + "reward_std": 0.07486936822533607, + "rewards/ndcg_rule_reward": -0.015713632106781006, + "rewards/rule_reward": 0.017578125, + "step": 247, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 6.064453125, + "epoch": 0.14351851851851852, + "grad_norm": 1.1457293033599854, + "kl": 4.8125, + "learning_rate": 9.955161236095478e-06, + "loss": 0.0048, + "reward": 0.005041609751060605, + "reward_std": 0.09421968087553978, + "rewards/ndcg_rule_reward": -0.01839589048177004, + "rewards/rule_reward": 0.0234375, + "step": 248, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.75390625, + "epoch": 0.1440972222222222, + "grad_norm": 1.7724554538726807, + "kl": 9.625, + "learning_rate": 9.954532883292761e-06, + "loss": 0.0096, + "reward": 0.013758484739810228, + "reward_std": 0.16842082142829895, + "rewards/ndcg_rule_reward": -0.031163389794528484, + "rewards/rule_reward": 0.044921875, + "step": 249, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.47265625, + "epoch": 0.14467592592592593, + "grad_norm": 3.3048219680786133, + "kl": 17.15625, + "learning_rate": 9.953900178434703e-06, + "loss": 0.0171, + "reward": 0.012719136546365917, + "reward_std": 0.15107353031635284, + "rewards/ndcg_rule_reward": -0.028296489268541336, + "rewards/rule_reward": 0.041015625, + "step": 250, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.236328125, + "epoch": 0.14525462962962962, + "grad_norm": 2.0292885303497314, + "kl": 9.75, + "learning_rate": 9.953263122077077e-06, + "loss": 0.0097, + "reward": 0.009856057353317738, + "reward_std": 0.12414234131574631, + "rewards/ndcg_rule_reward": -0.023347068578004837, + "rewards/rule_reward": 0.033203125, + "step": 251, + "token_diversity": 0.457625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.14583333333333334, + "grad_norm": 2.009976387023926, + "kl": 14.25, + "learning_rate": 9.952621714779469e-06, + "loss": 0.0142, + "reward": 0.004389917012304068, + "reward_std": 0.14103099703788757, + "rewards/ndcg_rule_reward": -0.028813209384679794, + "rewards/rule_reward": 0.033203125, + "step": 252, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.56640625, + "epoch": 0.14641203703703703, + "grad_norm": 1.9624507427215576, + "kl": 27.8125, + "learning_rate": 9.951975957105293e-06, + "loss": 0.0278, + "reward": 0.010063253808766603, + "reward_std": 0.1324651539325714, + "rewards/ndcg_rule_reward": -0.02509299572557211, + "rewards/rule_reward": 0.03515625, + "step": 253, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.5390625, + "epoch": 0.14699074074074073, + "grad_norm": 24.71966552734375, + "kl": 35.40625, + "learning_rate": 9.95132584962178e-06, + "loss": 0.0355, + "reward": 0.004288436146453023, + "reward_std": 0.08611833304166794, + "rewards/ndcg_rule_reward": -0.017195938155055046, + "rewards/rule_reward": 0.021484375, + "step": 254, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.306640625, + "epoch": 0.14756944444444445, + "grad_norm": 1.854827642440796, + "kl": 14.15625, + "learning_rate": 9.950671392899985e-06, + "loss": 0.0142, + "reward": 0.00731650972738862, + "reward_std": 0.11299415677785873, + "rewards/ndcg_rule_reward": -0.021980365738272667, + "rewards/rule_reward": 0.029296875, + "step": 255, + "token_diversity": 0.406301738410596 + }, + { + "categorical_diversity": 1.0, + "completion_length": 6.189453125, + "epoch": 0.14814814814814814, + "grad_norm": 4.325963497161865, + "kl": 34.24609375, + "learning_rate": 9.95001258751478e-06, + "loss": 0.0344, + "reward": 0.016942830756306648, + "reward_std": 0.14708822220563889, + "rewards/ndcg_rule_reward": -0.026025920175015926, + "rewards/rule_reward": 0.04296875, + "step": 256, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.029296875, + "epoch": 0.14872685185185186, + "grad_norm": 35.80392837524414, + "kl": 130.5, + "learning_rate": 9.949349434044862e-06, + "loss": 0.1309, + "reward": 0.012386551359668374, + "reward_std": 0.14028532803058624, + "rewards/ndcg_rule_reward": -0.026675949804484844, + "rewards/rule_reward": 0.0390625, + "step": 257, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.14930555555555555, + "grad_norm": 2.5252249240875244, + "kl": 27.3125, + "learning_rate": 9.94868193307274e-06, + "loss": 0.0273, + "reward": 0.0038764423225075006, + "reward_std": 0.14126718044281006, + "rewards/ndcg_rule_reward": -0.029326682910323143, + "rewards/rule_reward": 0.033203125, + "step": 258, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.037109375, + "epoch": 0.14988425925925927, + "grad_norm": 1.5434321165084839, + "kl": 15.90625, + "learning_rate": 9.948010085184748e-06, + "loss": 0.0159, + "reward": 0.022044923272915184, + "reward_std": 0.115901879966259, + "rewards/ndcg_rule_reward": -0.020923828706145287, + "rewards/rule_reward": 0.04296875, + "step": 259, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.99609375, + "epoch": 0.15046296296296297, + "grad_norm": 2.283210277557373, + "kl": 20.5, + "learning_rate": 9.947333890971036e-06, + "loss": 0.0205, + "reward": 0.016226567095145583, + "reward_std": 0.14402683079242706, + "rewards/ndcg_rule_reward": -0.026742182672023773, + "rewards/rule_reward": 0.04296875, + "step": 260, + "token_diversity": 0.46044146825396826 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.15104166666666666, + "grad_norm": 2.640972852706909, + "kl": 22.5625, + "learning_rate": 9.946653351025575e-06, + "loss": 0.0225, + "reward": 0.0046374311204999685, + "reward_std": 0.14931488037109375, + "rewards/ndcg_rule_reward": -0.030518819577991962, + "rewards/rule_reward": 0.03515625, + "step": 261, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.01171875, + "epoch": 0.15162037037037038, + "grad_norm": 3.1356537342071533, + "kl": 32.09375, + "learning_rate": 9.945968465946145e-06, + "loss": 0.032, + "reward": 0.022381024435162544, + "reward_std": 0.1325329877436161, + "rewards/ndcg_rule_reward": -0.02449397649616003, + "rewards/rule_reward": 0.046875, + "step": 262, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.15219907407407407, + "grad_norm": 5.493335723876953, + "kl": 49.875, + "learning_rate": 9.945279236334353e-06, + "loss": 0.0498, + "reward": 0.003117907792329788, + "reward_std": 0.09953810647130013, + "rewards/ndcg_rule_reward": -0.020319592207670212, + "rewards/rule_reward": 0.0234375, + "step": 263, + "token_diversity": 0.5061875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.99609375, + "epoch": 0.1527777777777778, + "grad_norm": 3.1211965084075928, + "kl": 41.75, + "learning_rate": 9.944585662795614e-06, + "loss": 0.0417, + "reward": 0.006475519156083465, + "reward_std": 0.1271800436079502, + "rewards/ndcg_rule_reward": -0.02477448061108589, + "rewards/rule_reward": 0.03125, + "step": 264, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.15335648148148148, + "grad_norm": 1.614283561706543, + "kl": 10.59375, + "learning_rate": 9.943887745939164e-06, + "loss": 0.0106, + "reward": 0.0026239522267132998, + "reward_std": 0.09978962689638138, + "rewards/ndcg_rule_reward": -0.020813548006117344, + "rewards/rule_reward": 0.0234375, + "step": 265, + "token_diversity": 0.47046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.15393518518518517, + "grad_norm": 3.0231401920318604, + "kl": 25.578125, + "learning_rate": 9.943185486378054e-06, + "loss": 0.0256, + "reward": 0.004344483953900635, + "reward_std": 0.14101224392652512, + "rewards/ndcg_rule_reward": -0.028858641162514687, + "rewards/rule_reward": 0.033203125, + "step": 266, + "token_diversity": 0.41015625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.1545138888888889, + "grad_norm": 1.883553385734558, + "kl": 9.453125, + "learning_rate": 9.942478884729144e-06, + "loss": 0.0095, + "reward": 0.003986558178439736, + "reward_std": 0.14118992537260056, + "rewards/ndcg_rule_reward": -0.029216567054390907, + "rewards/rule_reward": 0.033203125, + "step": 267, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.1550925925925926, + "grad_norm": 1.5802838802337646, + "kl": 20.90625, + "learning_rate": 9.941767941613112e-06, + "loss": 0.0209, + "reward": 0.008829612634144723, + "reward_std": 0.11450044810771942, + "rewards/ndcg_rule_reward": -0.022420387715101242, + "rewards/rule_reward": 0.03125, + "step": 268, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.98828125, + "epoch": 0.1556712962962963, + "grad_norm": 3.3935928344726562, + "kl": 41.5, + "learning_rate": 9.941052657654453e-06, + "loss": 0.0415, + "reward": 0.00861932645784691, + "reward_std": 0.08935081958770752, + "rewards/ndcg_rule_reward": -0.016771298367530107, + "rewards/rule_reward": 0.025390625, + "step": 269, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.982421875, + "epoch": 0.15625, + "grad_norm": 3.1026437282562256, + "kl": 45.375, + "learning_rate": 9.940333033481472e-06, + "loss": 0.0453, + "reward": 0.004978100070729852, + "reward_std": 0.1491812765598297, + "rewards/ndcg_rule_reward": -0.030178150162100792, + "rewards/rule_reward": 0.03515625, + "step": 270, + "token_diversity": 0.48571875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.01953125, + "epoch": 0.1568287037037037, + "grad_norm": 2.3473997116088867, + "kl": 11.90625, + "learning_rate": 9.939609069726279e-06, + "loss": 0.0119, + "reward": 0.02837453316897154, + "reward_std": 0.13794220983982086, + "rewards/ndcg_rule_reward": -0.02631296683102846, + "rewards/rule_reward": 0.0546875, + "step": 271, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.025390625, + "epoch": 0.1574074074074074, + "grad_norm": 2.730544090270996, + "kl": 10.78125, + "learning_rate": 9.938880767024811e-06, + "loss": 0.0108, + "reward": 0.033001872012391686, + "reward_std": 0.12387988716363907, + "rewards/ndcg_rule_reward": -0.021685628220438957, + "rewards/rule_reward": 0.0546875, + "step": 272, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.01953125, + "epoch": 0.1579861111111111, + "grad_norm": 6.239088535308838, + "kl": 60.875, + "learning_rate": 9.938148126016805e-06, + "loss": 0.0609, + "reward": 0.027651492971926928, + "reward_std": 0.11293710768222809, + "rewards/ndcg_rule_reward": -0.021176633425056934, + "rewards/rule_reward": 0.048828125, + "step": 273, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 0.9375, + "completion_length": 5.01953125, + "epoch": 0.15856481481481483, + "grad_norm": 1.9355725049972534, + "kl": 17.40625, + "learning_rate": 9.93741114734581e-06, + "loss": 0.0174, + "reward": 0.03170574922114611, + "reward_std": 0.1689780130982399, + "rewards/ndcg_rule_reward": -0.03274737484753132, + "rewards/rule_reward": 0.064453125, + "step": 274, + "token_diversity": 0.3764722222222222 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.15914351851851852, + "grad_norm": 2.7340259552001953, + "kl": 30.6875, + "learning_rate": 9.936669831659189e-06, + "loss": 0.0307, + "reward": 0.04333976469933987, + "reward_std": 0.09160168468952179, + "rewards/ndcg_rule_reward": -0.01525398576632142, + "rewards/rule_reward": 0.05859375, + "step": 275, + "token_diversity": 0.43648616412213737 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.1597222222222222, + "grad_norm": 2.1990339756011963, + "kl": 16.734375, + "learning_rate": 9.935924179608112e-06, + "loss": 0.0167, + "reward": 0.00308064476121217, + "reward_std": 0.09115452691912651, + "rewards/ndcg_rule_reward": -0.01840373035520315, + "rewards/rule_reward": 0.021484375, + "step": 276, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.025390625, + "epoch": 0.16030092592592593, + "grad_norm": 2.580545663833618, + "kl": 40.25, + "learning_rate": 9.93517419184756e-06, + "loss": 0.0404, + "reward": 0.02614997373893857, + "reward_std": 0.13941475749015808, + "rewards/ndcg_rule_reward": -0.026584403589367867, + "rewards/rule_reward": 0.052734375, + "step": 277, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.009765625, + "epoch": 0.16087962962962962, + "grad_norm": 2.192462205886841, + "kl": 14.65625, + "learning_rate": 9.934419869036322e-06, + "loss": 0.0147, + "reward": 0.017584326677024364, + "reward_std": 0.15025058388710022, + "rewards/ndcg_rule_reward": -0.029290671460330486, + "rewards/rule_reward": 0.046875, + "step": 278, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.072265625, + "epoch": 0.16145833333333334, + "grad_norm": 2.0852701663970947, + "kl": 33.375, + "learning_rate": 9.933661211836992e-06, + "loss": 0.0334, + "reward": 0.05762724974192679, + "reward_std": 0.15366029739379883, + "rewards/ndcg_rule_reward": -0.02635712642222643, + "rewards/rule_reward": 0.083984375, + "step": 279, + "token_diversity": 0.4084201388888889 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.16203703703703703, + "grad_norm": 1.6333078145980835, + "kl": 14.703125, + "learning_rate": 9.932898220915971e-06, + "loss": 0.0147, + "reward": 0.0028474947321228683, + "reward_std": 0.09968330711126328, + "rewards/ndcg_rule_reward": -0.020590005442500114, + "rewards/rule_reward": 0.0234375, + "step": 280, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.16261574074074073, + "grad_norm": 2.533780574798584, + "kl": 36.875, + "learning_rate": 9.932130896943477e-06, + "loss": 0.0368, + "reward": 0.003568289102986455, + "reward_std": 0.10775629431009293, + "rewards/ndcg_rule_reward": -0.02182233612984419, + "rewards/rule_reward": 0.025390625, + "step": 281, + "token_diversity": 0.4140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0625, + "epoch": 0.16319444444444445, + "grad_norm": 3.0718259811401367, + "kl": 34.125, + "learning_rate": 9.931359240593521e-06, + "loss": 0.0342, + "reward": 0.0338636739179492, + "reward_std": 0.10034685954451561, + "rewards/ndcg_rule_reward": -0.020823827013373375, + "rewards/rule_reward": 0.0546875, + "step": 282, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.0859375, + "epoch": 0.16377314814814814, + "grad_norm": 3.667328357696533, + "kl": 35.0625, + "learning_rate": 9.930583252543925e-06, + "loss": 0.035, + "reward": 0.0633287001401186, + "reward_std": 0.18110592663288116, + "rewards/ndcg_rule_reward": -0.0304213035851717, + "rewards/rule_reward": 0.09375, + "step": 283, + "token_diversity": 0.4212541390728477 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0625, + "epoch": 0.16435185185185186, + "grad_norm": 1.9954578876495361, + "kl": 27.0625, + "learning_rate": 9.929802933476317e-06, + "loss": 0.0271, + "reward": 0.04779301304370165, + "reward_std": 0.12081345170736313, + "rewards/ndcg_rule_reward": -0.02056636568158865, + "rewards/rule_reward": 0.068359375, + "step": 284, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.033203125, + "epoch": 0.16493055555555555, + "grad_norm": 2.5151124000549316, + "kl": 23.71875, + "learning_rate": 9.929018284076128e-06, + "loss": 0.0237, + "reward": 0.022200040286406875, + "reward_std": 0.1661885567009449, + "rewards/ndcg_rule_reward": -0.032487460412085056, + "rewards/rule_reward": 0.0546875, + "step": 285, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.16550925925925927, + "grad_norm": 2.1797189712524414, + "kl": 16.625, + "learning_rate": 9.928229305032592e-06, + "loss": 0.0166, + "reward": 0.037083620205521584, + "reward_std": 0.14489015936851501, + "rewards/ndcg_rule_reward": -0.02932263072580099, + "rewards/rule_reward": 0.06640625, + "step": 286, + "token_diversity": 0.4631782945736434 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.16608796296296297, + "grad_norm": 2.1502304077148438, + "kl": 11.921875, + "learning_rate": 9.927435997038752e-06, + "loss": 0.0119, + "reward": 0.004561863490380347, + "reward_std": 0.149346224963665, + "rewards/ndcg_rule_reward": -0.030594386160373688, + "rewards/rule_reward": 0.03515625, + "step": 287, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.984375, + "completion_length": 5.09765625, + "epoch": 0.16666666666666666, + "grad_norm": 3.326375961303711, + "kl": 13.6875, + "learning_rate": 9.926638360791442e-06, + "loss": 0.0137, + "reward": 0.07790782488882542, + "reward_std": 0.1501917988061905, + "rewards/ndcg_rule_reward": -0.025607800111174583, + "rewards/rule_reward": 0.103515625, + "step": 288, + "token_diversity": 0.41711336678832117 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.16724537037037038, + "grad_norm": 2.452181339263916, + "kl": 20.046875, + "learning_rate": 9.925836396991309e-06, + "loss": 0.0201, + "reward": 0.0736471638083458, + "reward_std": 0.11638534069061279, + "rewards/ndcg_rule_reward": -0.01814971026033163, + "rewards/rule_reward": 0.091796875, + "step": 289, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0625, + "epoch": 0.16782407407407407, + "grad_norm": 2.9913487434387207, + "kl": 43.375, + "learning_rate": 9.925030106342794e-06, + "loss": 0.0434, + "reward": 0.04712013155221939, + "reward_std": 0.1503029614686966, + "rewards/ndcg_rule_reward": -0.02709861844778061, + "rewards/rule_reward": 0.07421875, + "step": 290, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.1684027777777778, + "grad_norm": 4.167126655578613, + "kl": 45.625, + "learning_rate": 9.924219489554145e-06, + "loss": 0.0456, + "reward": 0.0028138189227320254, + "reward_std": 0.09126139432191849, + "rewards/ndcg_rule_reward": -0.018670556135475636, + "rewards/rule_reward": 0.021484375, + "step": 291, + "token_diversity": 0.390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.060546875, + "epoch": 0.16898148148148148, + "grad_norm": 4.637401580810547, + "kl": 61.75, + "learning_rate": 9.923404547337403e-06, + "loss": 0.0619, + "reward": 0.03526002448052168, + "reward_std": 0.13277750462293625, + "rewards/ndcg_rule_reward": -0.02723997551947832, + "rewards/rule_reward": 0.0625, + "step": 292, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.177734375, + "epoch": 0.16956018518518517, + "grad_norm": 2.322012186050415, + "kl": 35.0, + "learning_rate": 9.922585280408417e-06, + "loss": 0.035, + "reward": 0.09230180131271482, + "reward_std": 0.0945015586912632, + "rewards/ndcg_rule_reward": -0.017073199152946472, + "rewards/rule_reward": 0.109375, + "step": 293, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.240234375, + "epoch": 0.1701388888888889, + "grad_norm": 134.68218994140625, + "kl": 246.1875, + "learning_rate": 9.921761689486825e-06, + "loss": 0.2468, + "reward": 0.12396114319562912, + "reward_std": 0.11672571673989296, + "rewards/ndcg_rule_reward": -0.02057010307908058, + "rewards/rule_reward": 0.14453125, + "step": 294, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.15234375, + "epoch": 0.1707175925925926, + "grad_norm": 2.347637414932251, + "kl": 40.0, + "learning_rate": 9.92093377529607e-06, + "loss": 0.04, + "reward": 0.09192857146263123, + "reward_std": 0.15372635424137115, + "rewards/ndcg_rule_reward": -0.023305803537368774, + "rewards/rule_reward": 0.115234375, + "step": 295, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.103515625, + "epoch": 0.1712962962962963, + "grad_norm": 1.8575600385665894, + "kl": 14.65625, + "learning_rate": 9.920101538563395e-06, + "loss": 0.0147, + "reward": 0.06265097158029675, + "reward_std": 0.09171288087964058, + "rewards/ndcg_rule_reward": -0.017427152022719383, + "rewards/rule_reward": 0.080078125, + "step": 296, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.171875, + "grad_norm": 1.4621894359588623, + "kl": 11.125, + "learning_rate": 9.919264980019829e-06, + "loss": 0.0111, + "reward": 0.06258983636507764, + "reward_std": 0.07490538619458675, + "rewards/ndcg_rule_reward": -0.013582037761807442, + "rewards/rule_reward": 0.076171875, + "step": 297, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.1724537037037037, + "grad_norm": 2.019887685775757, + "kl": 22.0, + "learning_rate": 9.918424100400208e-06, + "loss": 0.022, + "reward": 0.037845754995942116, + "reward_std": 0.11686724051833153, + "rewards/ndcg_rule_reward": -0.018794872798025608, + "rewards/rule_reward": 0.056640625, + "step": 298, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 0.921875, + "completion_length": 5.240234375, + "epoch": 0.1730324074074074, + "grad_norm": 2.5262982845306396, + "kl": 16.8125, + "learning_rate": 9.917578900443158e-06, + "loss": 0.0168, + "reward": 0.15875747799873352, + "reward_std": 0.16092239692807198, + "rewards/ndcg_rule_reward": -0.02483627386391163, + "rewards/rule_reward": 0.18359375, + "step": 299, + "token_diversity": 0.3854880136986301 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.048828125, + "epoch": 0.1736111111111111, + "grad_norm": 2.280848264694214, + "kl": 25.3125, + "learning_rate": 9.916729380891101e-06, + "loss": 0.0253, + "reward": 0.035497302655130625, + "reward_std": 0.1863340586423874, + "rewards/ndcg_rule_reward": -0.036768319085240364, + "rewards/rule_reward": 0.072265625, + "step": 300, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.17418981481481483, + "grad_norm": 2.2980709075927734, + "kl": 13.375, + "learning_rate": 9.915875542490257e-06, + "loss": 0.0134, + "reward": 0.03386740118730813, + "reward_std": 0.10818653553724289, + "rewards/ndcg_rule_reward": -0.022773224860429764, + "rewards/rule_reward": 0.056640625, + "step": 301, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.140625, + "epoch": 0.17476851851851852, + "grad_norm": 4.0855393409729, + "kl": 64.5, + "learning_rate": 9.915017385990633e-06, + "loss": 0.0644, + "reward": 0.088873241096735, + "reward_std": 0.13946667313575745, + "rewards/ndcg_rule_reward": -0.022454887628555298, + "rewards/rule_reward": 0.111328125, + "step": 302, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 0.859375, + "completion_length": 5.146484375, + "epoch": 0.1753472222222222, + "grad_norm": 4.1964569091796875, + "kl": 33.5, + "learning_rate": 9.914154912146035e-06, + "loss": 0.0335, + "reward": 0.06738504907116294, + "reward_std": 0.15764296799898148, + "rewards/ndcg_rule_reward": -0.03222431801259518, + "rewards/rule_reward": 0.099609375, + "step": 303, + "token_diversity": 0.3860176282051282 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.17592592592592593, + "grad_norm": 2.2484521865844727, + "kl": 16.75, + "learning_rate": 9.913288121714058e-06, + "loss": 0.0168, + "reward": 0.0020448610302992165, + "reward_std": 0.06640904769301414, + "rewards/ndcg_rule_reward": -0.0135801387950778, + "rewards/rule_reward": 0.015625, + "step": 304, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.17650462962962962, + "grad_norm": 2.29398512840271, + "kl": 24.25, + "learning_rate": 9.912417015456088e-06, + "loss": 0.0242, + "reward": 0.0037020101444795728, + "reward_std": 0.11608981713652611, + "rewards/ndcg_rule_reward": -0.023641739040613174, + "rewards/rule_reward": 0.02734375, + "step": 305, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 0.921875, + "completion_length": 5.125, + "epoch": 0.17708333333333334, + "grad_norm": 2.108675718307495, + "kl": 21.0, + "learning_rate": 9.911541594137307e-06, + "loss": 0.021, + "reward": 0.0859636589884758, + "reward_std": 0.13309656083583832, + "rewards/ndcg_rule_reward": -0.0234113410115242, + "rewards/rule_reward": 0.109375, + "step": 306, + "token_diversity": 0.3796237244897959 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.076171875, + "epoch": 0.17766203703703703, + "grad_norm": 1.9950133562088013, + "kl": 32.9375, + "learning_rate": 9.910661858526683e-06, + "loss": 0.033, + "reward": 0.05080530187115073, + "reward_std": 0.128240454941988, + "rewards/ndcg_rule_reward": -0.02146032266318798, + "rewards/rule_reward": 0.072265625, + "step": 307, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.17824074074074073, + "grad_norm": 2.848386287689209, + "kl": 11.5625, + "learning_rate": 9.909777809396973e-06, + "loss": 0.0116, + "reward": 0.0023799645132385194, + "reward_std": 0.09988906607031822, + "rewards/ndcg_rule_reward": -0.021057534962892532, + "rewards/rule_reward": 0.0234375, + "step": 308, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.193359375, + "epoch": 0.17881944444444445, + "grad_norm": 5.81364631652832, + "kl": 39.375, + "learning_rate": 9.908889447524724e-06, + "loss": 0.0394, + "reward": 0.12385539710521698, + "reward_std": 0.15909814089536667, + "rewards/ndcg_rule_reward": -0.030441475100815296, + "rewards/rule_reward": 0.154296875, + "step": 309, + "token_diversity": 0.3619877897350993 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.17939814814814814, + "grad_norm": 2.62302565574646, + "kl": 21.34375, + "learning_rate": 9.907996773690273e-06, + "loss": 0.0214, + "reward": 0.03335901163518429, + "reward_std": 0.07475556433200836, + "rewards/ndcg_rule_reward": -0.015469112899154425, + "rewards/rule_reward": 0.048828125, + "step": 310, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.044921875, + "epoch": 0.17997685185185186, + "grad_norm": 1.9853262901306152, + "kl": 15.125, + "learning_rate": 9.907099788677745e-06, + "loss": 0.0151, + "reward": 0.031753482995554805, + "reward_std": 0.1167396679520607, + "rewards/ndcg_rule_reward": -0.02293401677161455, + "rewards/rule_reward": 0.0546875, + "step": 311, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.06640625, + "epoch": 0.18055555555555555, + "grad_norm": 6.0446085929870605, + "kl": 49.375, + "learning_rate": 9.90619849327505e-06, + "loss": 0.0493, + "reward": 0.05124967987649143, + "reward_std": 0.13347267359495163, + "rewards/ndcg_rule_reward": -0.024922195822000504, + "rewards/rule_reward": 0.076171875, + "step": 312, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.984375, + "completion_length": 5.091796875, + "epoch": 0.18113425925925927, + "grad_norm": 3.324408531188965, + "kl": 39.5, + "learning_rate": 9.905292888273883e-06, + "loss": 0.0396, + "reward": 0.0741119459271431, + "reward_std": 0.17319459468126297, + "rewards/ndcg_rule_reward": -0.029403680004179478, + "rewards/rule_reward": 0.103515625, + "step": 313, + "token_diversity": 0.4494485294117647 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.236328125, + "epoch": 0.18171296296296297, + "grad_norm": 3.567931652069092, + "kl": 47.0, + "learning_rate": 9.904382974469729e-06, + "loss": 0.047, + "reward": 0.14899545907974243, + "reward_std": 0.14817840605974197, + "rewards/ndcg_rule_reward": -0.02483266592025757, + "rewards/rule_reward": 0.173828125, + "step": 314, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.18229166666666666, + "grad_norm": 2.289703369140625, + "kl": 23.71875, + "learning_rate": 9.90346875266185e-06, + "loss": 0.0237, + "reward": 0.0661206990480423, + "reward_std": 0.12459224462509155, + "rewards/ndcg_rule_reward": -0.025676174089312553, + "rewards/rule_reward": 0.091796875, + "step": 315, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.404296875, + "epoch": 0.18287037037037038, + "grad_norm": 1.9568909406661987, + "kl": 26.3125, + "learning_rate": 9.902550223653304e-06, + "loss": 0.0264, + "reward": 0.04691855260170996, + "reward_std": 0.10799572616815567, + "rewards/ndcg_rule_reward": -0.01948769809678197, + "rewards/rule_reward": 0.06640625, + "step": 316, + "token_diversity": 0.3472630033557047 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.064453125, + "epoch": 0.18344907407407407, + "grad_norm": 2.2962286472320557, + "kl": 28.875, + "learning_rate": 9.901627388250923e-06, + "loss": 0.0289, + "reward": 0.04964568559080362, + "reward_std": 0.16788789629936218, + "rewards/ndcg_rule_reward": -0.028479316271841526, + "rewards/rule_reward": 0.078125, + "step": 317, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.34765625, + "epoch": 0.1840277777777778, + "grad_norm": 2.946898937225342, + "kl": 41.0625, + "learning_rate": 9.900700247265323e-06, + "loss": 0.041, + "reward": 0.10634459182620049, + "reward_std": 0.12884031236171722, + "rewards/ndcg_rule_reward": -0.02256165351718664, + "rewards/rule_reward": 0.12890625, + "step": 318, + "token_diversity": 0.3832575158227848 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.166015625, + "epoch": 0.18460648148148148, + "grad_norm": 2.088124990463257, + "kl": 16.171875, + "learning_rate": 9.899768801510903e-06, + "loss": 0.0161, + "reward": 0.08778326958417892, + "reward_std": 0.09810368344187737, + "rewards/ndcg_rule_reward": -0.017685478553175926, + "rewards/rule_reward": 0.10546875, + "step": 319, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.18518518518518517, + "grad_norm": 1.8344197273254395, + "kl": 19.1875, + "learning_rate": 9.898833051805846e-06, + "loss": 0.0192, + "reward": 0.027734652627259493, + "reward_std": 0.1301904134452343, + "rewards/ndcg_rule_reward": -0.024999722838401794, + "rewards/rule_reward": 0.052734375, + "step": 320, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.109375, + "epoch": 0.1857638888888889, + "grad_norm": 1.8468724489212036, + "kl": 21.0, + "learning_rate": 9.897892998972113e-06, + "loss": 0.021, + "reward": 0.06345582648646086, + "reward_std": 0.14185261726379395, + "rewards/ndcg_rule_reward": -0.02834104746580124, + "rewards/rule_reward": 0.091796875, + "step": 321, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.080078125, + "epoch": 0.1863425925925926, + "grad_norm": 3.334296941757202, + "kl": 63.4375, + "learning_rate": 9.896948643835445e-06, + "loss": 0.0633, + "reward": 0.0543831754475832, + "reward_std": 0.16769029200077057, + "rewards/ndcg_rule_reward": -0.0296012032777071, + "rewards/rule_reward": 0.083984375, + "step": 322, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.109375, + "epoch": 0.1869212962962963, + "grad_norm": 2.6252074241638184, + "kl": 34.25, + "learning_rate": 9.89599998722536e-06, + "loss": 0.0342, + "reward": 0.07565136253833771, + "reward_std": 0.15373919904232025, + "rewards/ndcg_rule_reward": -0.02591113280504942, + "rewards/rule_reward": 0.1015625, + "step": 323, + "token_diversity": 0.37767137096774195 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.037109375, + "epoch": 0.1875, + "grad_norm": 1.9955849647521973, + "kl": 30.4375, + "learning_rate": 9.89504702997516e-06, + "loss": 0.0304, + "reward": 0.027395873330533504, + "reward_std": 0.11341925710439682, + "rewards/ndcg_rule_reward": -0.021432253532111645, + "rewards/rule_reward": 0.048828125, + "step": 324, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 0.828125, + "completion_length": 5.1640625, + "epoch": 0.1880787037037037, + "grad_norm": 2.9871320724487305, + "kl": 29.4375, + "learning_rate": 9.89408977292192e-06, + "loss": 0.0294, + "reward": 0.11557031143456697, + "reward_std": 0.16884496062994003, + "rewards/ndcg_rule_reward": -0.02700781263411045, + "rewards/rule_reward": 0.142578125, + "step": 325, + "token_diversity": 0.23418898809523808 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.166015625, + "epoch": 0.1886574074074074, + "grad_norm": 3.0413763523101807, + "kl": 21.125, + "learning_rate": 9.893128216906495e-06, + "loss": 0.0211, + "reward": 0.11808779463171959, + "reward_std": 0.1587602086365223, + "rewards/ndcg_rule_reward": -0.02839658223092556, + "rewards/rule_reward": 0.146484375, + "step": 326, + "token_diversity": 0.3552900326797386 + }, + { + "categorical_diversity": 0.8125, + "completion_length": 5.083984375, + "epoch": 0.1892361111111111, + "grad_norm": 1.7228572368621826, + "kl": 38.25, + "learning_rate": 9.892162362773513e-06, + "loss": 0.0382, + "reward": 0.05856395384762436, + "reward_std": 0.11323816329240799, + "rewards/ndcg_rule_reward": -0.017607921734452248, + "rewards/rule_reward": 0.076171875, + "step": 327, + "token_diversity": 0.2954419378698225 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.310546875, + "epoch": 0.18981481481481483, + "grad_norm": 2.0307257175445557, + "kl": 30.125, + "learning_rate": 9.891192211371376e-06, + "loss": 0.0301, + "reward": 0.18220766261219978, + "reward_std": 0.12176603823900223, + "rewards/ndcg_rule_reward": -0.02091733179986477, + "rewards/rule_reward": 0.203125, + "step": 328, + "token_diversity": 0.34339488636363635 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.19039351851851852, + "grad_norm": 2.5428011417388916, + "kl": 19.125, + "learning_rate": 9.89021776355227e-06, + "loss": 0.0192, + "reward": 0.03507700772024691, + "reward_std": 0.10761705413460732, + "rewards/ndcg_rule_reward": -0.021563617512583733, + "rewards/rule_reward": 0.056640625, + "step": 329, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 0.859375, + "completion_length": 5.150390625, + "epoch": 0.1909722222222222, + "grad_norm": 3.4596683979034424, + "kl": 41.3125, + "learning_rate": 9.889239020172144e-06, + "loss": 0.0415, + "reward": 0.09414373338222504, + "reward_std": 0.1285097934305668, + "rewards/ndcg_rule_reward": -0.021090644411742687, + "rewards/rule_reward": 0.115234375, + "step": 330, + "token_diversity": 0.3588773885350318 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.171875, + "epoch": 0.19155092592592593, + "grad_norm": 3.5842766761779785, + "kl": 33.0625, + "learning_rate": 9.888255982090728e-06, + "loss": 0.0332, + "reward": 0.10361173376441002, + "reward_std": 0.11421753466129303, + "rewards/ndcg_rule_reward": -0.021388263441622257, + "rewards/rule_reward": 0.125, + "step": 331, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 0.78125, + "completion_length": 5.328125, + "epoch": 0.19212962962962962, + "grad_norm": 2.411360502243042, + "kl": 51.5625, + "learning_rate": 9.88726865017152e-06, + "loss": 0.0516, + "reward": 0.17600750178098679, + "reward_std": 0.15323881804943085, + "rewards/ndcg_rule_reward": -0.02516437415033579, + "rewards/rule_reward": 0.201171875, + "step": 332, + "token_diversity": 0.28594941094941095 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.19270833333333334, + "grad_norm": 3.2192790508270264, + "kl": 42.625, + "learning_rate": 9.88627702528179e-06, + "loss": 0.0426, + "reward": 0.004675231873989105, + "reward_std": 0.1661001443862915, + "rewards/ndcg_rule_reward": -0.034387268126010895, + "rewards/rule_reward": 0.0390625, + "step": 333, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.19328703703703703, + "grad_norm": 1.8543038368225098, + "kl": 19.96875, + "learning_rate": 9.885281108292581e-06, + "loss": 0.02, + "reward": 0.03344812779687345, + "reward_std": 0.09158004075288773, + "rewards/ndcg_rule_reward": -0.01928624790161848, + "rewards/rule_reward": 0.052734375, + "step": 334, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 0.859375, + "completion_length": 5.11328125, + "epoch": 0.19386574074074073, + "grad_norm": 2.4688806533813477, + "kl": 21.65625, + "learning_rate": 9.884280900078706e-06, + "loss": 0.0216, + "reward": 0.0698515735566616, + "reward_std": 0.12978245317935944, + "rewards/ndcg_rule_reward": -0.02585155051201582, + "rewards/rule_reward": 0.095703125, + "step": 335, + "token_diversity": 0.36032046178343946 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.19444444444444445, + "grad_norm": 1.5724258422851562, + "kl": 30.375, + "learning_rate": 9.883276401518745e-06, + "loss": 0.0303, + "reward": 0.025546192890033126, + "reward_std": 0.09798404574394226, + "rewards/ndcg_rule_reward": -0.017422557808458805, + "rewards/rule_reward": 0.04296875, + "step": 336, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.158203125, + "epoch": 0.19502314814814814, + "grad_norm": 1.8742880821228027, + "kl": 14.125, + "learning_rate": 9.882267613495049e-06, + "loss": 0.0141, + "reward": 0.09555717650800943, + "reward_std": 0.12234438210725784, + "rewards/ndcg_rule_reward": -0.021630320232361555, + "rewards/rule_reward": 0.1171875, + "step": 337, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.056640625, + "epoch": 0.19560185185185186, + "grad_norm": 2.4353578090667725, + "kl": 18.71875, + "learning_rate": 9.881254536893737e-06, + "loss": 0.0187, + "reward": 0.03736602608114481, + "reward_std": 0.09830618649721146, + "rewards/ndcg_rule_reward": -0.017321473453193903, + "rewards/rule_reward": 0.0546875, + "step": 338, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.001953125, + "epoch": 0.19618055555555555, + "grad_norm": 6.301953315734863, + "kl": 66.5625, + "learning_rate": 9.880237172604695e-06, + "loss": 0.0666, + "reward": 0.0040418768767267466, + "reward_std": 0.10307006910443306, + "rewards/ndcg_rule_reward": -0.021348748356103897, + "rewards/rule_reward": 0.025390625, + "step": 339, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.080078125, + "epoch": 0.19675925925925927, + "grad_norm": 2.059166193008423, + "kl": 36.515625, + "learning_rate": 9.87921552152157e-06, + "loss": 0.0366, + "reward": 0.052646319032646716, + "reward_std": 0.1348443143069744, + "rewards/ndcg_rule_reward": -0.023525556549429893, + "rewards/rule_reward": 0.076171875, + "step": 340, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.314453125, + "epoch": 0.19733796296296297, + "grad_norm": 3.5588138103485107, + "kl": 30.4375, + "learning_rate": 9.878189584541783e-06, + "loss": 0.0305, + "reward": 0.06025354168377817, + "reward_std": 0.14594019949436188, + "rewards/ndcg_rule_reward": -0.027637080289423466, + "rewards/rule_reward": 0.087890625, + "step": 341, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.240234375, + "epoch": 0.19791666666666666, + "grad_norm": 2.4960527420043945, + "kl": 42.875, + "learning_rate": 9.877159362566516e-06, + "loss": 0.0428, + "reward": 0.11245053261518478, + "reward_std": 0.1200469583272934, + "rewards/ndcg_rule_reward": -0.018408842384815216, + "rewards/rule_reward": 0.130859375, + "step": 342, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.01171875, + "epoch": 0.19849537037037038, + "grad_norm": 3.102487325668335, + "kl": 52.1875, + "learning_rate": 9.876124856500711e-06, + "loss": 0.0523, + "reward": 0.015373238362371922, + "reward_std": 0.1573905386030674, + "rewards/ndcg_rule_reward": -0.029548635706305504, + "rewards/rule_reward": 0.044921875, + "step": 343, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.048828125, + "epoch": 0.19907407407407407, + "grad_norm": 2.588881731033325, + "kl": 32.3125, + "learning_rate": 9.87508606725308e-06, + "loss": 0.0323, + "reward": 0.033704821253195405, + "reward_std": 0.14954914152622223, + "rewards/ndcg_rule_reward": -0.028795180842280388, + "rewards/rule_reward": 0.0625, + "step": 344, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.1996527777777778, + "grad_norm": 2.4398913383483887, + "kl": 25.5, + "learning_rate": 9.874042995736095e-06, + "loss": 0.0255, + "reward": 0.035009863786399364, + "reward_std": 0.152865968644619, + "rewards/ndcg_rule_reward": -0.029443261213600636, + "rewards/rule_reward": 0.064453125, + "step": 345, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.20023148148148148, + "grad_norm": 2.0744125843048096, + "kl": 19.8125, + "learning_rate": 9.872995642865986e-06, + "loss": 0.0198, + "reward": 0.0034845282789319754, + "reward_std": 0.10778923332691193, + "rewards/ndcg_rule_reward": -0.02190609648823738, + "rewards/rule_reward": 0.025390625, + "step": 346, + "token_diversity": 0.48828125 + }, + { + "epoch": 0.20023148148148148, + "eval_categorical_diversity": 1.0, + "eval_completion_length": 5.0, + "eval_kl": 21.982142857142858, + "eval_loss": 0.022016065195202827, + "eval_reward": 0.0018845258625345184, + "eval_reward_std": 0.06300976407992376, + "eval_rewards/ndcg_rule_reward": -0.012941468392148034, + "eval_rewards/rule_reward": 0.014825994318181818, + "eval_runtime": 92.4916, + "eval_samples_per_second": 52.61, + "eval_steps_per_second": 0.054, + "eval_token_diversity": 0.3730215097402597, + "step": 346 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.056640625, + "epoch": 0.20081018518518517, + "grad_norm": 1.7568210363388062, + "kl": 18.03125, + "learning_rate": 9.871944009562746e-06, + "loss": 0.018, + "reward": 0.033647241769358516, + "reward_std": 0.08302770555019379, + "rewards/ndcg_rule_reward": -0.017134005669504404, + "rewards/rule_reward": 0.05078125, + "step": 347, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 0.859375, + "completion_length": 5.1015625, + "epoch": 0.2013888888888889, + "grad_norm": 4.167365550994873, + "kl": 35.875, + "learning_rate": 9.87088809675013e-06, + "loss": 0.0359, + "reward": 0.061002615839242935, + "reward_std": 0.1450544036924839, + "rewards/ndcg_rule_reward": -0.02884113695472479, + "rewards/rule_reward": 0.08984375, + "step": 348, + "token_diversity": 0.3050358280254777 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.2019675925925926, + "grad_norm": 4.415468215942383, + "kl": 45.5, + "learning_rate": 9.869827905355654e-06, + "loss": 0.0455, + "reward": 0.03480351180769503, + "reward_std": 0.14141946285963058, + "rewards/ndcg_rule_reward": -0.029649612493813038, + "rewards/rule_reward": 0.064453125, + "step": 349, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2025462962962963, + "grad_norm": 5.597947120666504, + "kl": 45.28125, + "learning_rate": 9.86876343631058e-06, + "loss": 0.0452, + "reward": 0.0025539204943925142, + "reward_std": 0.10820911079645157, + "rewards/ndcg_rule_reward": -0.022836703807115555, + "rewards/rule_reward": 0.025390625, + "step": 350, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.150390625, + "epoch": 0.203125, + "grad_norm": 3.0209696292877197, + "kl": 41.25, + "learning_rate": 9.867694690549943e-06, + "loss": 0.0411, + "reward": 0.09142064861953259, + "reward_std": 0.178264319896698, + "rewards/ndcg_rule_reward": -0.033579347655177116, + "rewards/rule_reward": 0.125, + "step": 351, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.056640625, + "epoch": 0.2037037037037037, + "grad_norm": 2.0246846675872803, + "kl": 17.28125, + "learning_rate": 9.866621669012526e-06, + "loss": 0.0173, + "reward": 0.03377925086533651, + "reward_std": 0.0914214551448822, + "rewards/ndcg_rule_reward": -0.018955121748149395, + "rewards/rule_reward": 0.052734375, + "step": 352, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.001953125, + "epoch": 0.2042824074074074, + "grad_norm": 2.1020636558532715, + "kl": 27.1875, + "learning_rate": 9.865544372640872e-06, + "loss": 0.0272, + "reward": 0.006115372525528073, + "reward_std": 0.144184410572052, + "rewards/ndcg_rule_reward": -0.029040878638625145, + "rewards/rule_reward": 0.03515625, + "step": 353, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2048611111111111, + "grad_norm": 1.9550526142120361, + "kl": 15.53125, + "learning_rate": 9.864462802381271e-06, + "loss": 0.0155, + "reward": 0.0031410150695592165, + "reward_std": 0.13318617641925812, + "rewards/ndcg_rule_reward": -0.028108986094594002, + "rewards/rule_reward": 0.03125, + "step": 354, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.19140625, + "epoch": 0.20543981481481483, + "grad_norm": 1.8346284627914429, + "kl": 18.1875, + "learning_rate": 9.86337695918378e-06, + "loss": 0.0182, + "reward": 0.09658420085906982, + "reward_std": 0.09968680143356323, + "rewards/ndcg_rule_reward": -0.020603297743946314, + "rewards/rule_reward": 0.1171875, + "step": 355, + "token_diversity": 0.359583025147929 + }, + { + "categorical_diversity": 0.9375, + "completion_length": 5.080078125, + "epoch": 0.20601851851851852, + "grad_norm": 2.019437789916992, + "kl": 30.375, + "learning_rate": 9.862286844002196e-06, + "loss": 0.0304, + "reward": 0.05423441343009472, + "reward_std": 0.14941637217998505, + "rewards/ndcg_rule_reward": -0.027796833775937557, + "rewards/rule_reward": 0.08203125, + "step": 356, + "token_diversity": 0.367462588028169 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2065972222222222, + "grad_norm": 3.6133551597595215, + "kl": 32.5625, + "learning_rate": 9.861192457794077e-06, + "loss": 0.0326, + "reward": 0.00439235963858664, + "reward_std": 0.12417382001876831, + "rewards/ndcg_rule_reward": -0.024904515594244003, + "rewards/rule_reward": 0.029296875, + "step": 357, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.080078125, + "epoch": 0.20717592592592593, + "grad_norm": 2.325145959854126, + "kl": 16.90625, + "learning_rate": 9.860093801520734e-06, + "loss": 0.0169, + "reward": 0.03420625883154571, + "reward_std": 0.11648494750261307, + "rewards/ndcg_rule_reward": -0.024387490935623646, + "rewards/rule_reward": 0.05859375, + "step": 358, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.103515625, + "epoch": 0.20775462962962962, + "grad_norm": 1.5265424251556396, + "kl": 12.046875, + "learning_rate": 9.85899087614722e-06, + "loss": 0.0121, + "reward": 0.06504173204302788, + "reward_std": 0.08295481652021408, + "rewards/ndcg_rule_reward": -0.016989514231681824, + "rewards/rule_reward": 0.08203125, + "step": 359, + "token_diversity": 0.5625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.046875, + "epoch": 0.20833333333333334, + "grad_norm": 1.2472352981567383, + "kl": 21.4375, + "learning_rate": 9.857883682642347e-06, + "loss": 0.0214, + "reward": 0.03197388956323266, + "reward_std": 0.08300863951444626, + "rewards/ndcg_rule_reward": -0.014901111833751202, + "rewards/rule_reward": 0.046875, + "step": 360, + "token_diversity": 0.3501233552631579 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.20891203703703703, + "grad_norm": 3.4119226932525635, + "kl": 36.375, + "learning_rate": 9.85677222197867e-06, + "loss": 0.0363, + "reward": 0.03328860132023692, + "reward_std": 0.149738110601902, + "rewards/ndcg_rule_reward": -0.02921140193939209, + "rewards/rule_reward": 0.0625, + "step": 361, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.75, + "completion_length": 5.146484375, + "epoch": 0.20949074074074073, + "grad_norm": 2.076490879058838, + "kl": 22.9375, + "learning_rate": 9.855656495132497e-06, + "loss": 0.023, + "reward": 0.08993777632713318, + "reward_std": 0.12004593387246132, + "rewards/ndcg_rule_reward": -0.021390347741544247, + "rewards/rule_reward": 0.111328125, + "step": 362, + "token_diversity": 0.26243589743589746 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.21006944444444445, + "grad_norm": 4.142364025115967, + "kl": 33.875, + "learning_rate": 9.854536503083878e-06, + "loss": 0.0339, + "reward": 0.002032152668107301, + "reward_std": 0.08321140706539154, + "rewards/ndcg_rule_reward": -0.01749909808859229, + "rewards/rule_reward": 0.01953125, + "step": 363, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.21064814814814814, + "grad_norm": 3.10528302192688, + "kl": 49.375, + "learning_rate": 9.853412246816614e-06, + "loss": 0.0494, + "reward": 0.004282727371901274, + "reward_std": 0.1494985818862915, + "rewards/ndcg_rule_reward": -0.03087352216243744, + "rewards/rule_reward": 0.03515625, + "step": 364, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.21122685185185186, + "grad_norm": 2.9563052654266357, + "kl": 17.78125, + "learning_rate": 9.852283727318251e-06, + "loss": 0.0178, + "reward": 0.061295462772250175, + "reward_std": 0.1396557055413723, + "rewards/ndcg_rule_reward": -0.02464203629642725, + "rewards/rule_reward": 0.0859375, + "step": 365, + "token_diversity": 0.40625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.21180555555555555, + "grad_norm": 2.2233715057373047, + "kl": 15.28125, + "learning_rate": 9.851150945580079e-06, + "loss": 0.0153, + "reward": 0.03361427900381386, + "reward_std": 0.10829183459281921, + "rewards/ndcg_rule_reward": -0.023026345297694206, + "rewards/rule_reward": 0.056640625, + "step": 366, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.21238425925925927, + "grad_norm": 1.7332439422607422, + "kl": 32.625, + "learning_rate": 9.85001390259713e-06, + "loss": 0.0326, + "reward": 0.003508315945509821, + "reward_std": 0.11618590727448463, + "rewards/ndcg_rule_reward": -0.023835433647036552, + "rewards/rule_reward": 0.02734375, + "step": 367, + "token_diversity": 0.41015625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.21296296296296297, + "grad_norm": 2.923234701156616, + "kl": 21.28125, + "learning_rate": 9.848872599368184e-06, + "loss": 0.0213, + "reward": 0.03443266870453954, + "reward_std": 0.11630784347653389, + "rewards/ndcg_rule_reward": -0.024161078967154026, + "rewards/rule_reward": 0.05859375, + "step": 368, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.1953125, + "epoch": 0.21354166666666666, + "grad_norm": 1.8317537307739258, + "kl": 32.125, + "learning_rate": 9.847727036895759e-06, + "loss": 0.0322, + "reward": 0.12439335137605667, + "reward_std": 0.13335807621479034, + "rewards/ndcg_rule_reward": -0.02404414303600788, + "rewards/rule_reward": 0.1484375, + "step": 369, + "token_diversity": 0.38507401315789475 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.080078125, + "epoch": 0.21412037037037038, + "grad_norm": 1.9261651039123535, + "kl": 20.71875, + "learning_rate": 9.846577216186114e-06, + "loss": 0.0207, + "reward": 0.033222980331629515, + "reward_std": 0.07479656487703323, + "rewards/ndcg_rule_reward": -0.015605142340064049, + "rewards/rule_reward": 0.048828125, + "step": 370, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.21469907407407407, + "grad_norm": 1.7321476936340332, + "kl": 21.625, + "learning_rate": 9.845423138249254e-06, + "loss": 0.0216, + "reward": 0.06608106987550855, + "reward_std": 0.11618304997682571, + "rewards/ndcg_rule_reward": -0.023762681987136602, + "rewards/rule_reward": 0.08984375, + "step": 371, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2152777777777778, + "grad_norm": 2.0231680870056152, + "kl": 29.9375, + "learning_rate": 9.844264804098914e-06, + "loss": 0.0299, + "reward": 0.00333169917576015, + "reward_std": 0.10783666744828224, + "rewards/ndcg_rule_reward": -0.022058925591409206, + "rewards/rule_reward": 0.025390625, + "step": 372, + "token_diversity": 0.55859375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.21585648148148148, + "grad_norm": 2.3529140949249268, + "kl": 43.25, + "learning_rate": 9.843102214752577e-06, + "loss": 0.0433, + "reward": 0.02677827957086265, + "reward_std": 0.13946230709552765, + "rewards/ndcg_rule_reward": -0.025956097058951855, + "rewards/rule_reward": 0.052734375, + "step": 373, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.154296875, + "epoch": 0.21643518518518517, + "grad_norm": 4.014507293701172, + "kl": 48.3125, + "learning_rate": 9.841935371231461e-06, + "loss": 0.0483, + "reward": 0.10119181498885155, + "reward_std": 0.13134601339697838, + "rewards/ndcg_rule_reward": -0.023808181285858154, + "rewards/rule_reward": 0.125, + "step": 374, + "token_diversity": 0.37987012987012986 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.2170138888888889, + "grad_norm": 3.6684515476226807, + "kl": 51.125, + "learning_rate": 9.840764274560518e-06, + "loss": 0.051, + "reward": 0.03280231752432883, + "reward_std": 0.19410854578018188, + "rewards/ndcg_rule_reward": -0.037510186433792114, + "rewards/rule_reward": 0.0703125, + "step": 375, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.046875, + "epoch": 0.2175925925925926, + "grad_norm": 2.6227309703826904, + "kl": 51.5, + "learning_rate": 9.839588925768436e-06, + "loss": 0.0514, + "reward": 0.03272962453775108, + "reward_std": 0.11633000895380974, + "rewards/ndcg_rule_reward": -0.021957875229418278, + "rewards/rule_reward": 0.0546875, + "step": 376, + "token_diversity": 0.3665707236842105 + }, + { + "categorical_diversity": 0.984375, + "completion_length": 5.10546875, + "epoch": 0.2181712962962963, + "grad_norm": 2.3185341358184814, + "kl": 62.5, + "learning_rate": 9.838409325887643e-06, + "loss": 0.0625, + "reward": 0.06969834305346012, + "reward_std": 0.14202401787042618, + "rewards/ndcg_rule_reward": -0.022098535671830177, + "rewards/rule_reward": 0.091796875, + "step": 377, + "token_diversity": 0.3674685251798561 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.10546875, + "epoch": 0.21875, + "grad_norm": 2.222018241882324, + "kl": 41.375, + "learning_rate": 9.837225475954298e-06, + "loss": 0.0413, + "reward": 0.06789068132638931, + "reward_std": 0.16578802466392517, + "rewards/ndcg_rule_reward": -0.03367181867361069, + "rewards/rule_reward": 0.1015625, + "step": 378, + "token_diversity": 0.37601461038961037 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.171875, + "epoch": 0.2193287037037037, + "grad_norm": 30.169761657714844, + "kl": 144.59375, + "learning_rate": 9.836037377008291e-06, + "loss": 0.1448, + "reward": 0.11217297986149788, + "reward_std": 0.16116736829280853, + "rewards/ndcg_rule_reward": -0.02845202013850212, + "rewards/rule_reward": 0.140625, + "step": 379, + "token_diversity": 0.35323660714285715 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09375, + "epoch": 0.2199074074074074, + "grad_norm": 2.4978578090667725, + "kl": 29.875, + "learning_rate": 9.834845030093247e-06, + "loss": 0.0298, + "reward": 0.06116405868669972, + "reward_std": 0.11125504225492477, + "rewards/ndcg_rule_reward": -0.020867194049060345, + "rewards/rule_reward": 0.08203125, + "step": 380, + "token_diversity": 0.38382711038961037 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.060546875, + "epoch": 0.2204861111111111, + "grad_norm": 2.546703815460205, + "kl": 20.0, + "learning_rate": 9.833648436256525e-06, + "loss": 0.02, + "reward": 0.046736134216189384, + "reward_std": 0.1556689292192459, + "rewards/ndcg_rule_reward": -0.02748261485248804, + "rewards/rule_reward": 0.07421875, + "step": 381, + "token_diversity": 0.4296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.087890625, + "epoch": 0.22106481481481483, + "grad_norm": 2.297092914581299, + "kl": 17.75, + "learning_rate": 9.832447596549209e-06, + "loss": 0.0178, + "reward": 0.062419090420007706, + "reward_std": 0.15278655290603638, + "rewards/ndcg_rule_reward": -0.02937778364866972, + "rewards/rule_reward": 0.091796875, + "step": 382, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.22164351851851852, + "grad_norm": 1.821342945098877, + "kl": 30.25, + "learning_rate": 9.831242512026114e-06, + "loss": 0.0303, + "reward": 0.06619180436246097, + "reward_std": 0.1076573021709919, + "rewards/ndcg_rule_reward": -0.02169881761074066, + "rewards/rule_reward": 0.087890625, + "step": 383, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.2222222222222222, + "grad_norm": 1.7041295766830444, + "kl": 29.5625, + "learning_rate": 9.830033183745786e-06, + "loss": 0.0295, + "reward": 0.06185254640877247, + "reward_std": 0.1193772628903389, + "rewards/ndcg_rule_reward": -0.022131827659904957, + "rewards/rule_reward": 0.083984375, + "step": 384, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.22280092592592593, + "grad_norm": 2.5458483695983887, + "kl": 28.25, + "learning_rate": 9.828819612770497e-06, + "loss": 0.0282, + "reward": 0.06660714000463486, + "reward_std": 0.1746513992547989, + "rewards/ndcg_rule_reward": -0.03300223406404257, + "rewards/rule_reward": 0.099609375, + "step": 385, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09765625, + "epoch": 0.22337962962962962, + "grad_norm": 2.303783893585205, + "kl": 21.875, + "learning_rate": 9.827601800166248e-06, + "loss": 0.0219, + "reward": 0.060924600809812546, + "reward_std": 0.11139670014381409, + "rewards/ndcg_rule_reward": -0.02110664453357458, + "rewards/rule_reward": 0.08203125, + "step": 386, + "token_diversity": 0.37219551282051283 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.22395833333333334, + "grad_norm": 8.944234848022461, + "kl": 76.625, + "learning_rate": 9.826379747002763e-06, + "loss": 0.0766, + "reward": 0.044827102683484554, + "reward_std": 0.12442599982023239, + "rewards/ndcg_rule_reward": -0.023532272316515446, + "rewards/rule_reward": 0.068359375, + "step": 387, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.22453703703703703, + "grad_norm": 2.2320995330810547, + "kl": 58.625, + "learning_rate": 9.82515345435349e-06, + "loss": 0.0585, + "reward": 0.03405286406632513, + "reward_std": 0.0912722498178482, + "rewards/ndcg_rule_reward": -0.0186815089546144, + "rewards/rule_reward": 0.052734375, + "step": 388, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.765625, + "completion_length": 9.0078125, + "epoch": 0.22511574074074073, + "grad_norm": 2.431715726852417, + "kl": 47.5, + "learning_rate": 9.823922923295606e-06, + "loss": 0.0475, + "reward": 0.11637420579791069, + "reward_std": 0.09173072874546051, + "rewards/ndcg_rule_reward": -0.014485168736428022, + "rewards/rule_reward": 0.130859375, + "step": 389, + "token_diversity": 0.276281310211946 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.22569444444444445, + "grad_norm": 1.7528918981552124, + "kl": 42.5, + "learning_rate": 9.822688154910006e-06, + "loss": 0.0425, + "reward": 0.038800097070634365, + "reward_std": 0.11285858228802681, + "rewards/ndcg_rule_reward": -0.021746776066720486, + "rewards/rule_reward": 0.060546875, + "step": 390, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.22627314814814814, + "grad_norm": 2.0721731185913086, + "kl": 37.0625, + "learning_rate": 9.821449150281308e-06, + "loss": 0.037, + "reward": 0.03540551825426519, + "reward_std": 0.14116528630256653, + "rewards/ndcg_rule_reward": -0.029047604650259018, + "rewards/rule_reward": 0.064453125, + "step": 391, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.095703125, + "epoch": 0.22685185185185186, + "grad_norm": 1.6777064800262451, + "kl": 25.6875, + "learning_rate": 9.820205910497853e-06, + "loss": 0.0256, + "reward": 0.06097695045173168, + "reward_std": 0.10295023769140244, + "rewards/ndcg_rule_reward": -0.019101174548268318, + "rewards/rule_reward": 0.080078125, + "step": 392, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.2109375, + "epoch": 0.22743055555555555, + "grad_norm": 14.170621871948242, + "kl": 90.5, + "learning_rate": 9.818958436651704e-06, + "loss": 0.0906, + "reward": 0.13369622826576233, + "reward_std": 0.1387079954147339, + "rewards/ndcg_rule_reward": -0.024506892077624798, + "rewards/rule_reward": 0.158203125, + "step": 393, + "token_diversity": 0.4484608208955224 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0390625, + "epoch": 0.22800925925925927, + "grad_norm": 2.0895776748657227, + "kl": 18.9375, + "learning_rate": 9.817706729838637e-06, + "loss": 0.0189, + "reward": 0.029065671609714627, + "reward_std": 0.163202702999115, + "rewards/ndcg_rule_reward": -0.03148120455443859, + "rewards/rule_reward": 0.060546875, + "step": 394, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.072265625, + "epoch": 0.22858796296296297, + "grad_norm": 5.781368732452393, + "kl": 81.0, + "learning_rate": 9.816450791158151e-06, + "loss": 0.0809, + "reward": 0.04879953316412866, + "reward_std": 0.13369085639715195, + "rewards/ndcg_rule_reward": -0.021512966603040695, + "rewards/rule_reward": 0.0703125, + "step": 395, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.22916666666666666, + "grad_norm": 2.982430934906006, + "kl": 35.875, + "learning_rate": 9.81519062171346e-06, + "loss": 0.0358, + "reward": 0.03670474817045033, + "reward_std": 0.1826503425836563, + "rewards/ndcg_rule_reward": -0.03751400113105774, + "rewards/rule_reward": 0.07421875, + "step": 396, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.22974537037037038, + "grad_norm": 2.070840835571289, + "kl": 28.75, + "learning_rate": 9.813926222611496e-06, + "loss": 0.0287, + "reward": 0.06455687992274761, + "reward_std": 0.1497422456741333, + "rewards/ndcg_rule_reward": -0.02919311635196209, + "rewards/rule_reward": 0.09375, + "step": 397, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.23032407407407407, + "grad_norm": 1.5737512111663818, + "kl": 21.40625, + "learning_rate": 9.812657594962907e-06, + "loss": 0.0214, + "reward": 0.03413732990156859, + "reward_std": 0.10807549953460693, + "rewards/ndcg_rule_reward": -0.022503296844661236, + "rewards/rule_reward": 0.056640625, + "step": 398, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.12890625, + "epoch": 0.2309027777777778, + "grad_norm": 2.303781270980835, + "kl": 25.03125, + "learning_rate": 9.81138473988205e-06, + "loss": 0.025, + "reward": 0.08168909139931202, + "reward_std": 0.1369650736451149, + "rewards/ndcg_rule_reward": -0.02377965673804283, + "rewards/rule_reward": 0.10546875, + "step": 399, + "token_diversity": 0.35326522435897434 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.23148148148148148, + "grad_norm": 2.0266664028167725, + "kl": 29.46875, + "learning_rate": 9.810107658487003e-06, + "loss": 0.0295, + "reward": 0.003004470723681152, + "reward_std": 0.09120534360408783, + "rewards/ndcg_rule_reward": -0.018479904159903526, + "rewards/rule_reward": 0.021484375, + "step": 400, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.23206018518518517, + "grad_norm": 2.052910327911377, + "kl": 57.125, + "learning_rate": 9.808826351899551e-06, + "loss": 0.0571, + "reward": 0.035416833125054836, + "reward_std": 0.12427539378404617, + "rewards/ndcg_rule_reward": -0.025130038149654865, + "rewards/rule_reward": 0.060546875, + "step": 401, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.08203125, + "epoch": 0.2326388888888889, + "grad_norm": 4.462392807006836, + "kl": 58.25, + "learning_rate": 9.807540821245193e-06, + "loss": 0.0581, + "reward": 0.05162348970770836, + "reward_std": 0.1332300342619419, + "rewards/ndcg_rule_reward": -0.024548384360969067, + "rewards/rule_reward": 0.076171875, + "step": 402, + "token_diversity": 0.3689903846153846 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.2332175925925926, + "grad_norm": 3.3033406734466553, + "kl": 39.75, + "learning_rate": 9.806251067653137e-06, + "loss": 0.0397, + "reward": 0.034376831259578466, + "reward_std": 0.11636364087462425, + "rewards/ndcg_rule_reward": -0.024216915480792522, + "rewards/rule_reward": 0.05859375, + "step": 403, + "token_diversity": 0.40625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.00390625, + "epoch": 0.2337962962962963, + "grad_norm": 2.0319666862487793, + "kl": 30.125, + "learning_rate": 9.8049570922563e-06, + "loss": 0.0302, + "reward": 0.008476212387904525, + "reward_std": 0.14609059691429138, + "rewards/ndcg_rule_reward": -0.028633163310587406, + "rewards/rule_reward": 0.037109375, + "step": 404, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.28125, + "epoch": 0.234375, + "grad_norm": 1.8892115354537964, + "kl": 24.1875, + "learning_rate": 9.80365889619131e-06, + "loss": 0.0242, + "reward": 0.15482324361801147, + "reward_std": 0.11991218477487564, + "rewards/ndcg_rule_reward": -0.022911133244633675, + "rewards/rule_reward": 0.177734375, + "step": 405, + "token_diversity": 0.35441028225806454 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.126953125, + "epoch": 0.2349537037037037, + "grad_norm": 1.843740701675415, + "kl": 48.875, + "learning_rate": 9.802356480598499e-06, + "loss": 0.0489, + "reward": 0.08071482740342617, + "reward_std": 0.09401141107082367, + "rewards/ndcg_rule_reward": -0.013035171199589968, + "rewards/rule_reward": 0.09375, + "step": 406, + "token_diversity": 0.3638174019607843 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.111328125, + "epoch": 0.2355324074074074, + "grad_norm": 1.6434470415115356, + "kl": 36.625, + "learning_rate": 9.801049846621906e-06, + "loss": 0.0367, + "reward": 0.06554663926362991, + "reward_std": 0.11638271063566208, + "rewards/ndcg_rule_reward": -0.024297107942402363, + "rewards/rule_reward": 0.08984375, + "step": 407, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.13671875, + "epoch": 0.2361111111111111, + "grad_norm": 2.252314329147339, + "kl": 27.0, + "learning_rate": 9.799738995409282e-06, + "loss": 0.027, + "reward": 0.0937039703130722, + "reward_std": 0.1366535909473896, + "rewards/ndcg_rule_reward": -0.021530402824282646, + "rewards/rule_reward": 0.115234375, + "step": 408, + "token_diversity": 0.41428571428571426 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.13671875, + "epoch": 0.23668981481481483, + "grad_norm": 2.2373576164245605, + "kl": 39.375, + "learning_rate": 9.798423928112071e-06, + "loss": 0.0394, + "reward": 0.0864595752209425, + "reward_std": 0.12293286249041557, + "rewards/ndcg_rule_reward": -0.019009176641702652, + "rewards/rule_reward": 0.10546875, + "step": 409, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.23726851851851852, + "grad_norm": 2.2682483196258545, + "kl": 25.6875, + "learning_rate": 9.797104645885432e-06, + "loss": 0.0257, + "reward": 0.0030561196617782116, + "reward_std": 0.10795318335294724, + "rewards/ndcg_rule_reward": -0.0223345048725605, + "rewards/rule_reward": 0.025390625, + "step": 410, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.15234375, + "epoch": 0.2378472222222222, + "grad_norm": 3.0925889015197754, + "kl": 51.25, + "learning_rate": 9.795781149888216e-06, + "loss": 0.0514, + "reward": 0.09549742192029953, + "reward_std": 0.14464505016803741, + "rewards/ndcg_rule_reward": -0.02559632994234562, + "rewards/rule_reward": 0.12109375, + "step": 411, + "token_diversity": 0.375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.23842592592592593, + "grad_norm": 1.448779582977295, + "kl": 24.625, + "learning_rate": 9.794453441282984e-06, + "loss": 0.0246, + "reward": 0.06022574566304684, + "reward_std": 0.07805665209889412, + "rewards/ndcg_rule_reward": -0.01399300480261445, + "rewards/rule_reward": 0.07421875, + "step": 412, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1328125, + "epoch": 0.23900462962962962, + "grad_norm": 2.365180730819702, + "kl": 39.875, + "learning_rate": 9.793121521235988e-06, + "loss": 0.0398, + "reward": 0.08316604420542717, + "reward_std": 0.10150743648409843, + "rewards/ndcg_rule_reward": -0.01644332893192768, + "rewards/rule_reward": 0.099609375, + "step": 413, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.109375, + "epoch": 0.23958333333333334, + "grad_norm": 2.091829776763916, + "kl": 33.5, + "learning_rate": 9.79178539091719e-06, + "loss": 0.0335, + "reward": 0.06713519152253866, + "reward_std": 0.15771637856960297, + "rewards/ndcg_rule_reward": -0.032474178820848465, + "rewards/rule_reward": 0.099609375, + "step": 414, + "token_diversity": 0.362880608974359 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.24016203703703703, + "grad_norm": 1.9657446146011353, + "kl": 30.0625, + "learning_rate": 9.790445051500245e-06, + "loss": 0.0301, + "reward": 0.06157074309885502, + "reward_std": 0.1335148960351944, + "rewards/ndcg_rule_reward": -0.024366757832467556, + "rewards/rule_reward": 0.0859375, + "step": 415, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.24074074074074073, + "grad_norm": 1.7061165571212769, + "kl": 38.375, + "learning_rate": 9.789100504162503e-06, + "loss": 0.0385, + "reward": 0.06289827823638916, + "reward_std": 0.10003563016653061, + "rewards/ndcg_rule_reward": -0.019132974557578564, + "rewards/rule_reward": 0.08203125, + "step": 416, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.24131944444444445, + "grad_norm": 2.0235512256622314, + "kl": 20.25, + "learning_rate": 9.787751750085014e-06, + "loss": 0.0203, + "reward": 0.002826948883011937, + "reward_std": 0.09969405084848404, + "rewards/ndcg_rule_reward": -0.020610551349818707, + "rewards/rule_reward": 0.0234375, + "step": 417, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.1328125, + "epoch": 0.24189814814814814, + "grad_norm": 4.255209445953369, + "kl": 35.875, + "learning_rate": 9.786398790452521e-06, + "loss": 0.0358, + "reward": 0.08828953467309475, + "reward_std": 0.13159815222024918, + "rewards/ndcg_rule_reward": -0.02303859405219555, + "rewards/rule_reward": 0.111328125, + "step": 418, + "token_diversity": 0.4038118708053691 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1875, + "epoch": 0.24247685185185186, + "grad_norm": 2.2750632762908936, + "kl": 32.8125, + "learning_rate": 9.785041626453465e-06, + "loss": 0.0329, + "reward": 0.06990430876612663, + "reward_std": 0.1313093602657318, + "rewards/ndcg_rule_reward": -0.023845691233873367, + "rewards/rule_reward": 0.09375, + "step": 419, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.041015625, + "epoch": 0.24305555555555555, + "grad_norm": 3.044367790222168, + "kl": 35.625, + "learning_rate": 9.783680259279971e-06, + "loss": 0.0356, + "reward": 0.033299060771241784, + "reward_std": 0.13287696242332458, + "rewards/ndcg_rule_reward": -0.025294690392911434, + "rewards/rule_reward": 0.05859375, + "step": 420, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.24363425925925927, + "grad_norm": 1.47986900806427, + "kl": 11.4375, + "learning_rate": 9.782314690127867e-06, + "loss": 0.0114, + "reward": 0.0021216569002717733, + "reward_std": 0.08315097168087959, + "rewards/ndcg_rule_reward": -0.017409592866897583, + "rewards/rule_reward": 0.01953125, + "step": 421, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1484375, + "epoch": 0.24421296296296297, + "grad_norm": 1.8438228368759155, + "kl": 23.3125, + "learning_rate": 9.780944920196668e-06, + "loss": 0.0233, + "reward": 0.06390015897341073, + "reward_std": 0.1331758201122284, + "rewards/ndcg_rule_reward": -0.025943594984710217, + "rewards/rule_reward": 0.08984375, + "step": 422, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.095703125, + "epoch": 0.24479166666666666, + "grad_norm": 2.1801111698150635, + "kl": 54.625, + "learning_rate": 9.779570950689575e-06, + "loss": 0.0546, + "reward": 0.06109368312172592, + "reward_std": 0.0944850891828537, + "rewards/ndcg_rule_reward": -0.0170313180424273, + "rewards/rule_reward": 0.078125, + "step": 423, + "token_diversity": 0.3955208333333333 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.24537037037037038, + "grad_norm": 4.444077014923096, + "kl": 50.375, + "learning_rate": 9.778192782813483e-06, + "loss": 0.0503, + "reward": 0.005288022803142667, + "reward_std": 0.16584919393062592, + "rewards/ndcg_rule_reward": -0.03377447836101055, + "rewards/rule_reward": 0.0390625, + "step": 424, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.24594907407407407, + "grad_norm": 1.609845757484436, + "kl": 14.09375, + "learning_rate": 9.77681041777897e-06, + "loss": 0.0141, + "reward": 0.03694845293648541, + "reward_std": 0.11916118860244751, + "rewards/ndcg_rule_reward": -0.02359842136502266, + "rewards/rule_reward": 0.060546875, + "step": 425, + "token_diversity": 0.4710210755813954 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2465277777777778, + "grad_norm": 2.985424757003784, + "kl": 32.375, + "learning_rate": 9.775423856800309e-06, + "loss": 0.0324, + "reward": 0.0035318336449563503, + "reward_std": 0.14145107567310333, + "rewards/ndcg_rule_reward": -0.029671291820704937, + "rewards/rule_reward": 0.033203125, + "step": 426, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.05078125, + "epoch": 0.24710648148148148, + "grad_norm": 2.2904326915740967, + "kl": 23.0625, + "learning_rate": 9.774033101095448e-06, + "loss": 0.0231, + "reward": 0.03359937050845474, + "reward_std": 0.08304836973547935, + "rewards/ndcg_rule_reward": -0.017181879840791225, + "rewards/rule_reward": 0.05078125, + "step": 427, + "token_diversity": 0.31483360389610393 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.24768518518518517, + "grad_norm": 2.730196237564087, + "kl": 16.4375, + "learning_rate": 9.772638151886029e-06, + "loss": 0.0164, + "reward": 0.03133898111991584, + "reward_std": 0.10855812579393387, + "rewards/ndcg_rule_reward": -0.021395393647253513, + "rewards/rule_reward": 0.052734375, + "step": 428, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2482638888888889, + "grad_norm": 1.9550676345825195, + "kl": 26.875, + "learning_rate": 9.77123901039737e-06, + "loss": 0.0269, + "reward": 0.003349365433678031, + "reward_std": 0.1078261211514473, + "rewards/ndcg_rule_reward": -0.022041259333491325, + "rewards/rule_reward": 0.025390625, + "step": 429, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.2488425925925926, + "grad_norm": 5.300313472747803, + "kl": 22.8125, + "learning_rate": 9.769835677858479e-06, + "loss": 0.0228, + "reward": 0.0037473628763109446, + "reward_std": 0.1413339599967003, + "rewards/ndcg_rule_reward": -0.02945576049387455, + "rewards/rule_reward": 0.033203125, + "step": 430, + "token_diversity": 0.41796875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2494212962962963, + "grad_norm": 1.9717464447021484, + "kl": 29.6875, + "learning_rate": 9.768428155502038e-06, + "loss": 0.0297, + "reward": 0.00349749147426337, + "reward_std": 0.12461387366056442, + "rewards/ndcg_rule_reward": -0.025799383409321308, + "rewards/rule_reward": 0.029296875, + "step": 431, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.25, + "grad_norm": 2.498180627822876, + "kl": 41.0, + "learning_rate": 9.767016444564414e-06, + "loss": 0.041, + "reward": 0.0666685663163662, + "reward_std": 0.13273610919713974, + "rewards/ndcg_rule_reward": -0.027081424370408058, + "rewards/rule_reward": 0.09375, + "step": 432, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.193359375, + "epoch": 0.2505787037037037, + "grad_norm": 1.9206674098968506, + "kl": 34.25, + "learning_rate": 9.765600546285654e-06, + "loss": 0.0343, + "reward": 0.12182331830263138, + "reward_std": 0.1131945252418518, + "rewards/ndcg_rule_reward": -0.020754799246788025, + "rewards/rule_reward": 0.142578125, + "step": 433, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.15234375, + "epoch": 0.2511574074074074, + "grad_norm": 5.309033393859863, + "kl": 51.0625, + "learning_rate": 9.764180461909478e-06, + "loss": 0.051, + "reward": 0.09654922410845757, + "reward_std": 0.09127453342080116, + "rewards/ndcg_rule_reward": -0.01868514670059085, + "rewards/rule_reward": 0.115234375, + "step": 434, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2517361111111111, + "grad_norm": 3.088524580001831, + "kl": 27.4375, + "learning_rate": 9.762756192683289e-06, + "loss": 0.0274, + "reward": 0.002389901434071362, + "reward_std": 0.08306640759110451, + "rewards/ndcg_rule_reward": -0.01714134868234396, + "rewards/rule_reward": 0.01953125, + "step": 435, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2523148148148148, + "grad_norm": 2.434518814086914, + "kl": 27.125, + "learning_rate": 9.761327739858161e-06, + "loss": 0.0272, + "reward": 0.034010773873887956, + "reward_std": 0.08288909494876862, + "rewards/ndcg_rule_reward": -0.016770475544035435, + "rewards/rule_reward": 0.05078125, + "step": 436, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.25289351851851855, + "grad_norm": 2.2443902492523193, + "kl": 19.0, + "learning_rate": 9.759895104688848e-06, + "loss": 0.019, + "reward": 0.03166707081254572, + "reward_std": 0.09159525111317635, + "rewards/ndcg_rule_reward": -0.017161055468022823, + "rewards/rule_reward": 0.048828125, + "step": 437, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.2534722222222222, + "grad_norm": 2.824049472808838, + "kl": 28.375, + "learning_rate": 9.758458288433772e-06, + "loss": 0.0283, + "reward": 0.05941580794751644, + "reward_std": 0.13650896772742271, + "rewards/ndcg_rule_reward": -0.024568566121160984, + "rewards/rule_reward": 0.083984375, + "step": 438, + "token_diversity": 0.40234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.25405092592592593, + "grad_norm": 2.6052937507629395, + "kl": 20.8125, + "learning_rate": 9.757017292355033e-06, + "loss": 0.0209, + "reward": 0.06305942498147488, + "reward_std": 0.10838097706437111, + "rewards/ndcg_rule_reward": -0.0209249472245574, + "rewards/rule_reward": 0.083984375, + "step": 439, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.25462962962962965, + "grad_norm": 3.301901340484619, + "kl": 32.9375, + "learning_rate": 9.755572117718396e-06, + "loss": 0.0329, + "reward": 0.0035751977702602744, + "reward_std": 0.12458135932683945, + "rewards/ndcg_rule_reward": -0.02572167757898569, + "rewards/rule_reward": 0.029296875, + "step": 440, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2552083333333333, + "grad_norm": 1.9000256061553955, + "kl": 25.5625, + "learning_rate": 9.754122765793306e-06, + "loss": 0.0256, + "reward": 0.003314285073429346, + "reward_std": 0.09945075958967209, + "rewards/ndcg_rule_reward": -0.020123216323554516, + "rewards/rule_reward": 0.0234375, + "step": 441, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.24609375, + "epoch": 0.25578703703703703, + "grad_norm": 2.016144275665283, + "kl": 22.0, + "learning_rate": 9.75266923785287e-06, + "loss": 0.022, + "reward": 0.15421147644519806, + "reward_std": 0.09191442653536797, + "rewards/ndcg_rule_reward": -0.015710405074059963, + "rewards/rule_reward": 0.169921875, + "step": 442, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.25636574074074076, + "grad_norm": 1.931097388267517, + "kl": 23.75, + "learning_rate": 9.751211535173862e-06, + "loss": 0.0237, + "reward": 0.02573490678332746, + "reward_std": 0.12312012165784836, + "rewards/ndcg_rule_reward": -0.023093219846487045, + "rewards/rule_reward": 0.048828125, + "step": 443, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2569444444444444, + "grad_norm": 1.5313100814819336, + "kl": 15.59375, + "learning_rate": 9.74974965903673e-06, + "loss": 0.0156, + "reward": 0.0029256039997562766, + "reward_std": 0.10805252194404602, + "rewards/ndcg_rule_reward": -0.022465019952505827, + "rewards/rule_reward": 0.025390625, + "step": 444, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.25752314814814814, + "grad_norm": 2.0321977138519287, + "kl": 12.1875, + "learning_rate": 9.748283610725581e-06, + "loss": 0.0122, + "reward": 0.002967443550005555, + "reward_std": 0.11644656211137772, + "rewards/ndcg_rule_reward": -0.024376305751502514, + "rewards/rule_reward": 0.02734375, + "step": 445, + "token_diversity": 0.5546875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.25810185185185186, + "grad_norm": 1.782600998878479, + "kl": 29.75, + "learning_rate": 9.746813391528193e-06, + "loss": 0.0298, + "reward": 0.0039120722794905305, + "reward_std": 0.12442222982645035, + "rewards/ndcg_rule_reward": -0.02538480330258608, + "rewards/rule_reward": 0.029296875, + "step": 446, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2586805555555556, + "grad_norm": 1.879508376121521, + "kl": 26.75, + "learning_rate": 9.745339002736006e-06, + "loss": 0.0268, + "reward": 0.003103030612692237, + "reward_std": 0.09111404791474342, + "rewards/ndcg_rule_reward": -0.018381345085799694, + "rewards/rule_reward": 0.021484375, + "step": 447, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.140625, + "epoch": 0.25925925925925924, + "grad_norm": 3.150132417678833, + "kl": 38.125, + "learning_rate": 9.743860445644114e-06, + "loss": 0.0381, + "reward": 0.08992466330528259, + "reward_std": 0.09662879630923271, + "rewards/ndcg_rule_reward": -0.017497209832072258, + "rewards/rule_reward": 0.107421875, + "step": 448, + "token_diversity": 0.3909293831168831 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.25983796296296297, + "grad_norm": 1.6823652982711792, + "kl": 25.1875, + "learning_rate": 9.742377721551286e-06, + "loss": 0.0252, + "reward": 0.03246398060582578, + "reward_std": 0.13330386579036713, + "rewards/ndcg_rule_reward": -0.026129769161343575, + "rewards/rule_reward": 0.05859375, + "step": 449, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.1484375, + "epoch": 0.2604166666666667, + "grad_norm": 1.7991300821304321, + "kl": 33.46875, + "learning_rate": 9.740890831759943e-06, + "loss": 0.0335, + "reward": 0.09415097418241203, + "reward_std": 0.1000010035932064, + "rewards/ndcg_rule_reward": -0.01913027511909604, + "rewards/rule_reward": 0.11328125, + "step": 450, + "token_diversity": 0.2972861842105263 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.091796875, + "epoch": 0.26099537037037035, + "grad_norm": 4.627768039703369, + "kl": 43.875, + "learning_rate": 9.739399777576169e-06, + "loss": 0.0439, + "reward": 0.06462949700653553, + "reward_std": 0.1581447720527649, + "rewards/ndcg_rule_reward": -0.03107362426817417, + "rewards/rule_reward": 0.095703125, + "step": 451, + "token_diversity": 0.4140625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.044921875, + "epoch": 0.26157407407407407, + "grad_norm": 2.7198376655578613, + "kl": 55.375, + "learning_rate": 9.737904560309699e-06, + "loss": 0.0554, + "reward": 0.03664623526856303, + "reward_std": 0.18263272941112518, + "rewards/ndcg_rule_reward": -0.03757251426577568, + "rewards/rule_reward": 0.07421875, + "step": 452, + "token_diversity": 0.34993912337662336 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.14453125, + "epoch": 0.2621527777777778, + "grad_norm": 2.252824544906616, + "kl": 46.125, + "learning_rate": 9.736405181273935e-06, + "loss": 0.0461, + "reward": 0.09280958957970142, + "reward_std": 0.13349512964487076, + "rewards/ndcg_rule_reward": -0.02437791135162115, + "rewards/rule_reward": 0.1171875, + "step": 453, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.04296875, + "epoch": 0.26273148148148145, + "grad_norm": 2.0872044563293457, + "kl": 35.25, + "learning_rate": 9.734901641785927e-06, + "loss": 0.0353, + "reward": 0.030265147564932704, + "reward_std": 0.11112204939126968, + "rewards/ndcg_rule_reward": -0.020516103133559227, + "rewards/rule_reward": 0.05078125, + "step": 454, + "token_diversity": 0.4028645833333333 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2633101851851852, + "grad_norm": 1.910973310470581, + "kl": 26.375, + "learning_rate": 9.733393943166386e-06, + "loss": 0.0264, + "reward": 0.03417039418127388, + "reward_std": 0.10802354291081429, + "rewards/ndcg_rule_reward": -0.022470230236649513, + "rewards/rule_reward": 0.056640625, + "step": 455, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2638888888888889, + "grad_norm": 1.7410237789154053, + "kl": 22.3125, + "learning_rate": 9.731882086739665e-06, + "loss": 0.0223, + "reward": 0.033854017267003655, + "reward_std": 0.10819248110055923, + "rewards/ndcg_rule_reward": -0.02278660424053669, + "rewards/rule_reward": 0.056640625, + "step": 456, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.2644675925925926, + "grad_norm": 2.802551746368408, + "kl": 40.0625, + "learning_rate": 9.730366073833785e-06, + "loss": 0.04, + "reward": 0.061424845829606056, + "reward_std": 0.11957806721329689, + "rewards/ndcg_rule_reward": -0.02255952823907137, + "rewards/rule_reward": 0.083984375, + "step": 457, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2650462962962963, + "grad_norm": 2.3497719764709473, + "kl": 26.0, + "learning_rate": 9.728845905780404e-06, + "loss": 0.026, + "reward": 0.03625813452526927, + "reward_std": 0.15760263055562973, + "rewards/ndcg_rule_reward": -0.032101238146424294, + "rewards/rule_reward": 0.068359375, + "step": 458, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.265625, + "grad_norm": 2.7191648483276367, + "kl": 17.59375, + "learning_rate": 9.727321583914839e-06, + "loss": 0.0176, + "reward": 0.03538976330310106, + "reward_std": 0.14113782718777657, + "rewards/ndcg_rule_reward": -0.02906335797160864, + "rewards/rule_reward": 0.064453125, + "step": 459, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2662037037037037, + "grad_norm": 1.9066585302352905, + "kl": 24.875, + "learning_rate": 9.72579310957605e-06, + "loss": 0.0249, + "reward": 0.0355301545932889, + "reward_std": 0.1242370530962944, + "rewards/ndcg_rule_reward": -0.025016718544065952, + "rewards/rule_reward": 0.060546875, + "step": 460, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2667824074074074, + "grad_norm": 2.517242431640625, + "kl": 26.3125, + "learning_rate": 9.72426048410665e-06, + "loss": 0.0262, + "reward": 0.03484846849460155, + "reward_std": 0.10771914571523666, + "rewards/ndcg_rule_reward": -0.021792156621813774, + "rewards/rule_reward": 0.056640625, + "step": 461, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.2673611111111111, + "grad_norm": 2.287604808807373, + "kl": 30.5, + "learning_rate": 9.72272370885289e-06, + "loss": 0.0304, + "reward": 0.05949860438704491, + "reward_std": 0.1299261599779129, + "rewards/ndcg_rule_reward": -0.024485770612955093, + "rewards/rule_reward": 0.083984375, + "step": 462, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2679398148148148, + "grad_norm": 5.4452009201049805, + "kl": 14.21875, + "learning_rate": 9.721182785164678e-06, + "loss": 0.0142, + "reward": 0.03484130930155516, + "reward_std": 0.11615573987364769, + "rewards/ndcg_rule_reward": -0.023752442561089993, + "rewards/rule_reward": 0.05859375, + "step": 463, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.26851851851851855, + "grad_norm": 1.4475369453430176, + "kl": 15.375, + "learning_rate": 9.719637714395554e-06, + "loss": 0.0154, + "reward": 0.06113094592001289, + "reward_std": 0.10290253907442093, + "rewards/ndcg_rule_reward": -0.018947179429233074, + "rewards/rule_reward": 0.080078125, + "step": 464, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.19140625, + "epoch": 0.2690972222222222, + "grad_norm": 2.6890571117401123, + "kl": 29.71875, + "learning_rate": 9.718088497902709e-06, + "loss": 0.0297, + "reward": 0.12195795401930809, + "reward_std": 0.15336255356669426, + "rewards/ndcg_rule_reward": -0.028432668186724186, + "rewards/rule_reward": 0.150390625, + "step": 465, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.103515625, + "epoch": 0.26967592592592593, + "grad_norm": 2.7954630851745605, + "kl": 31.375, + "learning_rate": 9.716535137046971e-06, + "loss": 0.0314, + "reward": 0.0735749900341034, + "reward_std": 0.1483936756849289, + "rewards/ndcg_rule_reward": -0.026034386828541756, + "rewards/rule_reward": 0.099609375, + "step": 466, + "token_diversity": 0.40625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0859375, + "epoch": 0.27025462962962965, + "grad_norm": 1.5028355121612549, + "kl": 32.875, + "learning_rate": 9.714977633192813e-06, + "loss": 0.0328, + "reward": 0.05715057882480323, + "reward_std": 0.11341387405991554, + "rewards/ndcg_rule_reward": -0.01902129640802741, + "rewards/rule_reward": 0.076171875, + "step": 467, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.134765625, + "epoch": 0.2708333333333333, + "grad_norm": 2.150573253631592, + "kl": 35.875, + "learning_rate": 9.713415987708342e-06, + "loss": 0.0359, + "reward": 0.0925789400935173, + "reward_std": 0.11655589193105698, + "rewards/ndcg_rule_reward": -0.020702304784208536, + "rewards/rule_reward": 0.11328125, + "step": 468, + "token_diversity": 0.3614309210526316 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.27141203703703703, + "grad_norm": 1.8941031694412231, + "kl": 51.125, + "learning_rate": 9.711850201965305e-06, + "loss": 0.0511, + "reward": 0.0330316498875618, + "reward_std": 0.12460019439458847, + "rewards/ndcg_rule_reward": -0.023608975112438202, + "rewards/rule_reward": 0.056640625, + "step": 469, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.27199074074074076, + "grad_norm": 3.3434576988220215, + "kl": 48.125, + "learning_rate": 9.71028027733909e-06, + "loss": 0.0481, + "reward": 0.0046489236410707235, + "reward_std": 0.15773824602365494, + "rewards/ndcg_rule_reward": -0.03246045298874378, + "rewards/rule_reward": 0.037109375, + "step": 470, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.14453125, + "epoch": 0.2725694444444444, + "grad_norm": 4.614661693572998, + "kl": 50.3125, + "learning_rate": 9.70870621520871e-06, + "loss": 0.0503, + "reward": 0.06442969292402267, + "reward_std": 0.0748501792550087, + "rewards/ndcg_rule_reward": -0.015648433938622475, + "rewards/rule_reward": 0.080078125, + "step": 471, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05859375, + "epoch": 0.27314814814814814, + "grad_norm": 2.8009233474731445, + "kl": 69.375, + "learning_rate": 9.707128016956826e-06, + "loss": 0.0695, + "reward": 0.042791286832652986, + "reward_std": 0.1322571597993374, + "rewards/ndcg_rule_reward": -0.02556808851659298, + "rewards/rule_reward": 0.068359375, + "step": 472, + "token_diversity": 0.4844933712121212 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05859375, + "epoch": 0.27372685185185186, + "grad_norm": 1.4481595754623413, + "kl": 21.96875, + "learning_rate": 9.705545683969722e-06, + "loss": 0.022, + "reward": 0.03237544547300786, + "reward_std": 0.04997270368039608, + "rewards/ndcg_rule_reward": -0.010593305341899395, + "rewards/rule_reward": 0.04296875, + "step": 473, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.001953125, + "epoch": 0.2743055555555556, + "grad_norm": 1.8964678049087524, + "kl": 47.75, + "learning_rate": 9.703959217637318e-06, + "loss": 0.0477, + "reward": 0.005268369568511844, + "reward_std": 0.11934925988316536, + "rewards/ndcg_rule_reward": -0.024028506129980087, + "rewards/rule_reward": 0.029296875, + "step": 474, + "token_diversity": 0.42799176356589147 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.27488425925925924, + "grad_norm": 2.490827798843384, + "kl": 31.8125, + "learning_rate": 9.702368619353162e-06, + "loss": 0.0319, + "reward": 0.032905255910009146, + "reward_std": 0.1415061242878437, + "rewards/ndcg_rule_reward": -0.02764161955565214, + "rewards/rule_reward": 0.060546875, + "step": 475, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.27546296296296297, + "grad_norm": 2.01771879196167, + "kl": 26.3125, + "learning_rate": 9.700773890514436e-06, + "loss": 0.0263, + "reward": 0.032617395161651075, + "reward_std": 0.14162323623895645, + "rewards/ndcg_rule_reward": -0.02792948018759489, + "rewards/rule_reward": 0.060546875, + "step": 476, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.140625, + "epoch": 0.2760416666666667, + "grad_norm": 2.051396608352661, + "kl": 21.4375, + "learning_rate": 9.699175032521949e-06, + "loss": 0.0214, + "reward": 0.056218404322862625, + "reward_std": 0.0894693173468113, + "rewards/ndcg_rule_reward": -0.01604722160845995, + "rewards/rule_reward": 0.072265625, + "step": 477, + "token_diversity": 0.37855113636363635 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1484375, + "epoch": 0.27662037037037035, + "grad_norm": 1.496404767036438, + "kl": 14.65625, + "learning_rate": 9.697572046780132e-06, + "loss": 0.0146, + "reward": 0.09445170685648918, + "reward_std": 0.09144873917102814, + "rewards/ndcg_rule_reward": -0.016876411624252796, + "rewards/rule_reward": 0.111328125, + "step": 478, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.27719907407407407, + "grad_norm": 2.364828586578369, + "kl": 18.40625, + "learning_rate": 9.695964934697047e-06, + "loss": 0.0184, + "reward": 0.03535892954096198, + "reward_std": 0.14957962930202484, + "rewards/ndcg_rule_reward": -0.031047320924699306, + "rewards/rule_reward": 0.06640625, + "step": 479, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2777777777777778, + "grad_norm": 2.05800461769104, + "kl": 34.0, + "learning_rate": 9.69435369768438e-06, + "loss": 0.0341, + "reward": 0.004306995659135282, + "reward_std": 0.13262956589460373, + "rewards/ndcg_rule_reward": -0.02694300375878811, + "rewards/rule_reward": 0.03125, + "step": 480, + "token_diversity": 0.4296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1484375, + "epoch": 0.27835648148148145, + "grad_norm": 2.434399127960205, + "kl": 21.375, + "learning_rate": 9.692738337157441e-06, + "loss": 0.0214, + "reward": 0.09460287541151047, + "reward_std": 0.1166275255382061, + "rewards/ndcg_rule_reward": -0.022584629245102406, + "rewards/rule_reward": 0.1171875, + "step": 481, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2789351851851852, + "grad_norm": 1.8652397394180298, + "kl": 19.75, + "learning_rate": 9.691118854535157e-06, + "loss": 0.0197, + "reward": 0.0036258248146623373, + "reward_std": 0.11614777147769928, + "rewards/ndcg_rule_reward": -0.02371792495250702, + "rewards/rule_reward": 0.02734375, + "step": 482, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2795138888888889, + "grad_norm": 1.8185368776321411, + "kl": 13.40625, + "learning_rate": 9.689495251240081e-06, + "loss": 0.0134, + "reward": 0.002189184306189418, + "reward_std": 0.09156538546085358, + "rewards/ndcg_rule_reward": -0.019295191392302513, + "rewards/rule_reward": 0.021484375, + "step": 483, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09765625, + "epoch": 0.2800925925925926, + "grad_norm": 1.904834508895874, + "kl": 15.1875, + "learning_rate": 9.687867528698385e-06, + "loss": 0.0152, + "reward": 0.06340267299674451, + "reward_std": 0.12500108405947685, + "rewards/ndcg_rule_reward": -0.024487948045134544, + "rewards/rule_reward": 0.087890625, + "step": 484, + "token_diversity": 0.37535511363636365 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2806712962962963, + "grad_norm": 2.002305269241333, + "kl": 18.34375, + "learning_rate": 9.686235688339857e-06, + "loss": 0.0183, + "reward": 0.0030415181536227465, + "reward_std": 0.11641855910420418, + "rewards/ndcg_rule_reward": -0.024302231147885323, + "rewards/rule_reward": 0.02734375, + "step": 485, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09765625, + "epoch": 0.28125, + "grad_norm": 2.7381179332733154, + "kl": 14.9375, + "learning_rate": 9.684599731597906e-06, + "loss": 0.0149, + "reward": 0.06316356151364744, + "reward_std": 0.10831543058156967, + "rewards/ndcg_rule_reward": -0.020820816047489643, + "rewards/rule_reward": 0.083984375, + "step": 486, + "token_diversity": 0.35907061688311687 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09765625, + "epoch": 0.2818287037037037, + "grad_norm": 2.9123165607452393, + "kl": 40.125, + "learning_rate": 9.68295965990955e-06, + "loss": 0.0401, + "reward": 0.06508899480104446, + "reward_std": 0.16631196439266205, + "rewards/ndcg_rule_reward": -0.03256725613027811, + "rewards/rule_reward": 0.09765625, + "step": 487, + "token_diversity": 0.2791940789473684 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.2824074074074074, + "grad_norm": 3.335521936416626, + "kl": 37.1875, + "learning_rate": 9.68131547471543e-06, + "loss": 0.0372, + "reward": 0.032038564793765545, + "reward_std": 0.0998116247355938, + "rewards/ndcg_rule_reward": -0.01874268427491188, + "rewards/rule_reward": 0.05078125, + "step": 488, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.2829861111111111, + "grad_norm": 2.1638388633728027, + "kl": 45.875, + "learning_rate": 9.679667177459794e-06, + "loss": 0.046, + "reward": 0.0332921848166734, + "reward_std": 0.14131514728069305, + "rewards/ndcg_rule_reward": -0.027254688553512096, + "rewards/rule_reward": 0.060546875, + "step": 489, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.13671875, + "epoch": 0.2835648148148148, + "grad_norm": 6.769766807556152, + "kl": 65.625, + "learning_rate": 9.678014769590505e-06, + "loss": 0.0657, + "reward": 0.08904050104320049, + "reward_std": 0.17200805991888046, + "rewards/ndcg_rule_reward": -0.03205324895679951, + "rewards/rule_reward": 0.12109375, + "step": 490, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.28414351851851855, + "grad_norm": 4.366808891296387, + "kl": 42.375, + "learning_rate": 9.676358252559034e-06, + "loss": 0.0424, + "reward": 0.06703627854585648, + "reward_std": 0.15774669498205185, + "rewards/ndcg_rule_reward": -0.03257308900356293, + "rewards/rule_reward": 0.099609375, + "step": 491, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.15234375, + "epoch": 0.2847222222222222, + "grad_norm": 1.682349681854248, + "kl": 27.9375, + "learning_rate": 9.674697627820469e-06, + "loss": 0.0279, + "reward": 0.09642495587468147, + "reward_std": 0.08291395753622055, + "rewards/ndcg_rule_reward": -0.016856294125318527, + "rewards/rule_reward": 0.11328125, + "step": 492, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 0.921875, + "completion_length": 5.2890625, + "epoch": 0.28530092592592593, + "grad_norm": 2.2712831497192383, + "kl": 43.125, + "learning_rate": 9.673032896833493e-06, + "loss": 0.0431, + "reward": 0.18121924996376038, + "reward_std": 0.08983420208096504, + "rewards/ndcg_rule_reward": -0.016046371776610613, + "rewards/rule_reward": 0.197265625, + "step": 493, + "token_diversity": 0.355736301369863 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.28587962962962965, + "grad_norm": 2.102579355239868, + "kl": 31.625, + "learning_rate": 9.671364061060408e-06, + "loss": 0.0317, + "reward": 0.03508615889586508, + "reward_std": 0.13286811485886574, + "rewards/ndcg_rule_reward": -0.027413838542997837, + "rewards/rule_reward": 0.0625, + "step": 494, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.109375, + "epoch": 0.2864583333333333, + "grad_norm": 1.853853702545166, + "kl": 31.6875, + "learning_rate": 9.669691121967117e-06, + "loss": 0.0317, + "reward": 0.06306852283887565, + "reward_std": 0.10834923014044762, + "rewards/ndcg_rule_reward": -0.02091585285961628, + "rewards/rule_reward": 0.083984375, + "step": 495, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.28703703703703703, + "grad_norm": 1.9968469142913818, + "kl": 29.4375, + "learning_rate": 9.668014081023128e-06, + "loss": 0.0294, + "reward": 0.06322618015110493, + "reward_std": 0.10826229304075241, + "rewards/ndcg_rule_reward": -0.020758191123604774, + "rewards/rule_reward": 0.083984375, + "step": 496, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.28761574074074076, + "grad_norm": 2.015817880630493, + "kl": 23.1875, + "learning_rate": 9.66633293970155e-06, + "loss": 0.0231, + "reward": 0.0019922169158235192, + "reward_std": 0.08322878554463387, + "rewards/ndcg_rule_reward": -0.017539033200591803, + "rewards/rule_reward": 0.01953125, + "step": 497, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.16796875, + "epoch": 0.2881944444444444, + "grad_norm": 2.3677031993865967, + "kl": 43.625, + "learning_rate": 9.664647699479095e-06, + "loss": 0.0435, + "reward": 0.10845508053898811, + "reward_std": 0.11346757039427757, + "rewards/ndcg_rule_reward": -0.018498041667044163, + "rewards/rule_reward": 0.126953125, + "step": 498, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.28877314814814814, + "grad_norm": 2.996192216873169, + "kl": 24.0, + "learning_rate": 9.662958361836079e-06, + "loss": 0.024, + "reward": 0.06640881672501564, + "reward_std": 0.14967354759573936, + "rewards/ndcg_rule_reward": -0.03124743513762951, + "rewards/rule_reward": 0.09765625, + "step": 499, + "token_diversity": 0.33690137987012986 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.19140625, + "epoch": 0.28935185185185186, + "grad_norm": 1.606724500656128, + "kl": 24.125, + "learning_rate": 9.66126492825641e-06, + "loss": 0.0241, + "reward": 0.12129479530267417, + "reward_std": 0.1200060099363327, + "rewards/ndcg_rule_reward": -0.021283335518091917, + "rewards/rule_reward": 0.142578125, + "step": 500, + "token_diversity": 0.34457236842105265 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2899305555555556, + "grad_norm": 2.2019898891448975, + "kl": 42.25, + "learning_rate": 9.6595674002276e-06, + "loss": 0.0422, + "reward": 0.03604414709843695, + "reward_std": 0.14924415946006775, + "rewards/ndcg_rule_reward": -0.03036210499703884, + "rewards/rule_reward": 0.06640625, + "step": 501, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.29050925925925924, + "grad_norm": 2.6926493644714355, + "kl": 23.5625, + "learning_rate": 9.657865779240756e-06, + "loss": 0.0235, + "reward": 0.032331530237570405, + "reward_std": 0.13334187865257263, + "rewards/ndcg_rule_reward": -0.026262219063937664, + "rewards/rule_reward": 0.05859375, + "step": 502, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 0.75, + "completion_length": 5.154296875, + "epoch": 0.29108796296296297, + "grad_norm": 3.0523340702056885, + "kl": 17.875, + "learning_rate": 9.65616006679058e-06, + "loss": 0.0179, + "reward": 0.0963062196969986, + "reward_std": 0.09980782866477966, + "rewards/ndcg_rule_reward": -0.02088127564638853, + "rewards/rule_reward": 0.1171875, + "step": 503, + "token_diversity": 0.2854368093922652 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.2916666666666667, + "grad_norm": 1.5897232294082642, + "kl": 28.0, + "learning_rate": 9.654450264375367e-06, + "loss": 0.0281, + "reward": 0.03435562853701413, + "reward_std": 0.09954050555825233, + "rewards/ndcg_rule_reward": -0.02033186936751008, + "rewards/rule_reward": 0.0546875, + "step": 504, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.099609375, + "epoch": 0.29224537037037035, + "grad_norm": 3.7746996879577637, + "kl": 22.71875, + "learning_rate": 9.652736373497001e-06, + "loss": 0.0227, + "reward": 0.06351873918902129, + "reward_std": 0.11653270944952965, + "rewards/ndcg_rule_reward": -0.022418759763240814, + "rewards/rule_reward": 0.0859375, + "step": 505, + "token_diversity": 0.3645148026315789 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.107421875, + "epoch": 0.29282407407407407, + "grad_norm": 1.3770694732666016, + "kl": 20.65625, + "learning_rate": 9.651018395660969e-06, + "loss": 0.0206, + "reward": 0.06449795141816139, + "reward_std": 0.06639857403934002, + "rewards/ndcg_rule_reward": -0.01362704811617732, + "rewards/rule_reward": 0.078125, + "step": 506, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.107421875, + "epoch": 0.2934027777777778, + "grad_norm": 2.2717835903167725, + "kl": 21.75, + "learning_rate": 9.649296332376335e-06, + "loss": 0.0217, + "reward": 0.06554005667567253, + "reward_std": 0.12481438368558884, + "rewards/ndcg_rule_reward": -0.026256815530359745, + "rewards/rule_reward": 0.091796875, + "step": 507, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 0.9375, + "completion_length": 5.12890625, + "epoch": 0.29398148148148145, + "grad_norm": 3.726142644882202, + "kl": 56.875, + "learning_rate": 9.64757018515576e-06, + "loss": 0.0566, + "reward": 0.07830704934895039, + "reward_std": 0.15503669530153275, + "rewards/ndcg_rule_reward": -0.027161698788404465, + "rewards/rule_reward": 0.10546875, + "step": 508, + "token_diversity": 0.3337673611111111 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.2945601851851852, + "grad_norm": 5.037398338317871, + "kl": 38.9375, + "learning_rate": 9.645839955515488e-06, + "loss": 0.039, + "reward": 0.02917284658178687, + "reward_std": 0.08635945245623589, + "rewards/ndcg_rule_reward": -0.015749029349535704, + "rewards/rule_reward": 0.044921875, + "step": 509, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.2951388888888889, + "grad_norm": 2.1146719455718994, + "kl": 19.09375, + "learning_rate": 9.644105644975352e-06, + "loss": 0.0191, + "reward": 0.033840869553387165, + "reward_std": 0.11659521237015724, + "rewards/ndcg_rule_reward": -0.024752880446612835, + "rewards/rule_reward": 0.05859375, + "step": 510, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.140625, + "epoch": 0.2957175925925926, + "grad_norm": 1.8227647542953491, + "kl": 34.3125, + "learning_rate": 9.642367255058767e-06, + "loss": 0.0343, + "reward": 0.08801347692497075, + "reward_std": 0.13142472505569458, + "rewards/ndcg_rule_reward": -0.023314655758440495, + "rewards/rule_reward": 0.111328125, + "step": 511, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09375, + "epoch": 0.2962962962962963, + "grad_norm": 5.452460765838623, + "kl": 22.625, + "learning_rate": 9.640624787292731e-06, + "loss": 0.0227, + "reward": 0.061403625761158764, + "reward_std": 0.14483441412448883, + "rewards/ndcg_rule_reward": -0.02844012901186943, + "rewards/rule_reward": 0.08984375, + "step": 512, + "token_diversity": 0.40716314935064934 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.296875, + "grad_norm": 1.2297004461288452, + "kl": 7.453125, + "learning_rate": 9.63887824320783e-06, + "loss": 0.0074, + "reward": 0.0024362875265069306, + "reward_std": 0.0745907761156559, + "rewards/ndcg_rule_reward": -0.015141838230192661, + "rewards/rule_reward": 0.017578125, + "step": 513, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.2974537037037037, + "grad_norm": 2.425574541091919, + "kl": 8.0625, + "learning_rate": 9.637127624338223e-06, + "loss": 0.008, + "reward": 0.03506093251053244, + "reward_std": 0.13291175663471222, + "rewards/ndcg_rule_reward": -0.027439069002866745, + "rewards/rule_reward": 0.0625, + "step": 514, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.2980324074074074, + "grad_norm": 1.750848412513733, + "kl": 9.140625, + "learning_rate": 9.635372932221649e-06, + "loss": 0.0091, + "reward": 0.03190175909548998, + "reward_std": 0.10831540077924728, + "rewards/ndcg_rule_reward": -0.020832614041864872, + "rewards/rule_reward": 0.052734375, + "step": 515, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.2986111111111111, + "grad_norm": 3.035336971282959, + "kl": 10.6953125, + "learning_rate": 9.633614168399432e-06, + "loss": 0.0107, + "reward": 0.003964080358855426, + "reward_std": 0.11600164324045181, + "rewards/ndcg_rule_reward": -0.023379670456051826, + "rewards/rule_reward": 0.02734375, + "step": 516, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0703125, + "epoch": 0.2991898148148148, + "grad_norm": 2.3131203651428223, + "kl": 7.578125, + "learning_rate": 9.631851334416467e-06, + "loss": 0.0076, + "reward": 0.04836429562419653, + "reward_std": 0.1001739390194416, + "rewards/ndcg_rule_reward": -0.018041951581835747, + "rewards/rule_reward": 0.06640625, + "step": 517, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.099609375, + "epoch": 0.29976851851851855, + "grad_norm": 1.9601541757583618, + "kl": 7.96875, + "learning_rate": 9.630084431821222e-06, + "loss": 0.008, + "reward": 0.06324928253889084, + "reward_std": 0.09985987842082977, + "rewards/ndcg_rule_reward": -0.01878196746110916, + "rewards/rule_reward": 0.08203125, + "step": 518, + "token_diversity": 0.36163651315789475 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3003472222222222, + "grad_norm": 2.2219107151031494, + "kl": 12.6875, + "learning_rate": 9.628313462165745e-06, + "loss": 0.0127, + "reward": 0.0031599716749042273, + "reward_std": 0.10791952908039093, + "rewards/ndcg_rule_reward": -0.022230652626603842, + "rewards/rule_reward": 0.025390625, + "step": 519, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03125, + "epoch": 0.30092592592592593, + "grad_norm": 2.0628890991210938, + "kl": 25.3125, + "learning_rate": 9.626538427005652e-06, + "loss": 0.0253, + "reward": 0.024311432149261236, + "reward_std": 0.12382516264915466, + "rewards/ndcg_rule_reward": -0.02256356831640005, + "rewards/rule_reward": 0.046875, + "step": 520, + "token_diversity": 0.4140625 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.04296875, + "epoch": 0.30150462962962965, + "grad_norm": 2.348862886428833, + "kl": 16.9375, + "learning_rate": 9.624759327900131e-06, + "loss": 0.0169, + "reward": 0.0300866044126451, + "reward_std": 0.11963483691215515, + "rewards/ndcg_rule_reward": -0.022647771053016186, + "rewards/rule_reward": 0.052734375, + "step": 521, + "token_diversity": 0.3855208333333333 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.146484375, + "epoch": 0.3020833333333333, + "grad_norm": 1.8148430585861206, + "kl": 18.25, + "learning_rate": 9.622976166411944e-06, + "loss": 0.0182, + "reward": 0.09121096134185791, + "reward_std": 0.07817601785063744, + "rewards/ndcg_rule_reward": -0.01425778679549694, + "rewards/rule_reward": 0.10546875, + "step": 522, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.30266203703703703, + "grad_norm": 3.2591190338134766, + "kl": 35.375, + "learning_rate": 9.621188944107413e-06, + "loss": 0.0354, + "reward": 0.03053025808185339, + "reward_std": 0.169872485101223, + "rewards/ndcg_rule_reward": -0.03392286691814661, + "rewards/rule_reward": 0.064453125, + "step": 523, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.30324074074074076, + "grad_norm": 2.0389156341552734, + "kl": 29.625, + "learning_rate": 9.619397662556434e-06, + "loss": 0.0296, + "reward": 0.033764035906642675, + "reward_std": 0.09139770269393921, + "rewards/ndcg_rule_reward": -0.018970340490341187, + "rewards/rule_reward": 0.052734375, + "step": 524, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.091796875, + "epoch": 0.3038194444444444, + "grad_norm": 4.02306604385376, + "kl": 78.25, + "learning_rate": 9.617602323332467e-06, + "loss": 0.0781, + "reward": 0.0589948205742985, + "reward_std": 0.11991523578763008, + "rewards/ndcg_rule_reward": -0.02108330186456442, + "rewards/rule_reward": 0.080078125, + "step": 525, + "token_diversity": 0.358141447368421 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.001953125, + "epoch": 0.30439814814814814, + "grad_norm": 2.930171251296997, + "kl": 33.75, + "learning_rate": 9.615802928012535e-06, + "loss": 0.0338, + "reward": 0.005916548892855644, + "reward_std": 0.14431944116950035, + "rewards/ndcg_rule_reward": -0.029239701107144356, + "rewards/rule_reward": 0.03515625, + "step": 526, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.30497685185185186, + "grad_norm": 3.392481803894043, + "kl": 27.1875, + "learning_rate": 9.613999478177224e-06, + "loss": 0.0272, + "reward": 0.06234593689441681, + "reward_std": 0.19205426424741745, + "rewards/ndcg_rule_reward": -0.03726343810558319, + "rewards/rule_reward": 0.099609375, + "step": 527, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.001953125, + "epoch": 0.3055555555555556, + "grad_norm": 2.547255039215088, + "kl": 35.25, + "learning_rate": 9.612191975410685e-06, + "loss": 0.0351, + "reward": 0.005017805844545364, + "reward_std": 0.16596291959285736, + "rewards/ndcg_rule_reward": -0.034044694155454636, + "rewards/rule_reward": 0.0390625, + "step": 528, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.30613425925925924, + "grad_norm": 3.065277576446533, + "kl": 19.25, + "learning_rate": 9.610380421300623e-06, + "loss": 0.0193, + "reward": 0.05822693929076195, + "reward_std": 0.09686610475182533, + "rewards/ndcg_rule_reward": -0.017944936640560627, + "rewards/rule_reward": 0.076171875, + "step": 529, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.30671296296296297, + "grad_norm": 2.642186164855957, + "kl": 17.4375, + "learning_rate": 9.608564817438304e-06, + "loss": 0.0174, + "reward": 0.06479532504454255, + "reward_std": 0.1664155274629593, + "rewards/ndcg_rule_reward": -0.03286092262715101, + "rewards/rule_reward": 0.09765625, + "step": 530, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08203125, + "epoch": 0.3072916666666667, + "grad_norm": 2.1777586936950684, + "kl": 20.8125, + "learning_rate": 9.606745165418554e-06, + "loss": 0.0208, + "reward": 0.035629829159006476, + "reward_std": 0.13263988494873047, + "rewards/ndcg_rule_reward": -0.02687016874551773, + "rewards/rule_reward": 0.0625, + "step": 531, + "token_diversity": 0.4296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.30787037037037035, + "grad_norm": 2.5238919258117676, + "kl": 13.4375, + "learning_rate": 9.604921466839753e-06, + "loss": 0.0134, + "reward": 0.03492635441944003, + "reward_std": 0.1413150280714035, + "rewards/ndcg_rule_reward": -0.02952677011489868, + "rewards/rule_reward": 0.064453125, + "step": 532, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.158203125, + "epoch": 0.30844907407407407, + "grad_norm": 2.2914199829101562, + "kl": 10.890625, + "learning_rate": 9.603093723303835e-06, + "loss": 0.0109, + "reward": 0.09633690863847733, + "reward_std": 0.0998314917087555, + "rewards/ndcg_rule_reward": -0.0208505867049098, + "rewards/rule_reward": 0.1171875, + "step": 533, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.431640625, + "epoch": 0.3090277777777778, + "grad_norm": 2.971076488494873, + "kl": 27.9375, + "learning_rate": 9.601261936416288e-06, + "loss": 0.028, + "reward": 0.05922744981944561, + "reward_std": 0.11325465142726898, + "rewards/ndcg_rule_reward": -0.020850677974522114, + "rewards/rule_reward": 0.080078125, + "step": 534, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.103515625, + "epoch": 0.30960648148148145, + "grad_norm": 1.7406997680664062, + "kl": 19.9375, + "learning_rate": 9.59942610778615e-06, + "loss": 0.0199, + "reward": 0.03384171717334539, + "reward_std": 0.09139911830425262, + "rewards/ndcg_rule_reward": -0.018892657943069935, + "rewards/rule_reward": 0.052734375, + "step": 535, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.095703125, + "epoch": 0.3101851851851852, + "grad_norm": 3.3891708850860596, + "kl": 42.875, + "learning_rate": 9.597586239026014e-06, + "loss": 0.0427, + "reward": 0.06272156722843647, + "reward_std": 0.1526600942015648, + "rewards/ndcg_rule_reward": -0.02907530963420868, + "rewards/rule_reward": 0.091796875, + "step": 536, + "token_diversity": 0.34269153225806454 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.10546875, + "epoch": 0.3107638888888889, + "grad_norm": 1.8164002895355225, + "kl": 19.84375, + "learning_rate": 9.595742331752014e-06, + "loss": 0.0199, + "reward": 0.06512774154543877, + "reward_std": 0.09976109117269516, + "rewards/ndcg_rule_reward": -0.020809759385883808, + "rewards/rule_reward": 0.0859375, + "step": 537, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.2109375, + "epoch": 0.3113425925925926, + "grad_norm": 2.24491548538208, + "kl": 14.8125, + "learning_rate": 9.593894387583837e-06, + "loss": 0.0148, + "reward": 0.12350413575768471, + "reward_std": 0.11693055182695389, + "rewards/ndcg_rule_reward": -0.021027110517024994, + "rewards/rule_reward": 0.14453125, + "step": 538, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.14453125, + "epoch": 0.3119212962962963, + "grad_norm": 3.157870292663574, + "kl": 29.75, + "learning_rate": 9.592042408144717e-06, + "loss": 0.0298, + "reward": 0.09373488277196884, + "reward_std": 0.1498645544052124, + "rewards/ndcg_rule_reward": -0.02735886164009571, + "rewards/rule_reward": 0.12109375, + "step": 539, + "token_diversity": 0.319078947368421 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0390625, + "epoch": 0.3125, + "grad_norm": 2.7818024158477783, + "kl": 22.8125, + "learning_rate": 9.590186395061427e-06, + "loss": 0.0228, + "reward": 0.0037739372346550226, + "reward_std": 0.13288599997758865, + "rewards/ndcg_rule_reward": -0.02747606299817562, + "rewards/rule_reward": 0.03125, + "step": 540, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.0859375, + "epoch": 0.3130787037037037, + "grad_norm": 2.4111781120300293, + "kl": 22.0, + "learning_rate": 9.58832634996429e-06, + "loss": 0.022, + "reward": 0.057726265862584114, + "reward_std": 0.1309950016438961, + "rewards/ndcg_rule_reward": -0.022351861000061035, + "rewards/rule_reward": 0.080078125, + "step": 541, + "token_diversity": 0.40609375000000003 + }, + { + "categorical_diversity": 0.78125, + "completion_length": 5.1328125, + "epoch": 0.3136574074074074, + "grad_norm": 2.028716564178467, + "kl": 20.96875, + "learning_rate": 9.586462274487166e-06, + "loss": 0.021, + "reward": 0.085474519059062, + "reward_std": 0.10838484019041061, + "rewards/ndcg_rule_reward": -0.01804111059755087, + "rewards/rule_reward": 0.103515625, + "step": 542, + "token_diversity": 0.299004329004329 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3142361111111111, + "grad_norm": 1.7396621704101562, + "kl": 8.21875, + "learning_rate": 9.584594170267454e-06, + "loss": 0.0082, + "reward": 0.0022770240902900696, + "reward_std": 0.08310193195939064, + "rewards/ndcg_rule_reward": -0.017254226375371218, + "rewards/rule_reward": 0.01953125, + "step": 543, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.3148148148148148, + "grad_norm": 2.1283483505249023, + "kl": 24.5, + "learning_rate": 9.582722038946098e-06, + "loss": 0.0245, + "reward": 0.06550050713121891, + "reward_std": 0.09960699453949928, + "rewards/ndcg_rule_reward": -0.020436986349523067, + "rewards/rule_reward": 0.0859375, + "step": 544, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 0.921875, + "completion_length": 5.07421875, + "epoch": 0.31539351851851855, + "grad_norm": 2.9911725521087646, + "kl": 38.0625, + "learning_rate": 9.580845882167574e-06, + "loss": 0.0381, + "reward": 0.051270291209220886, + "reward_std": 0.15335947275161743, + "rewards/ndcg_rule_reward": -0.026854708790779114, + "rewards/rule_reward": 0.078125, + "step": 545, + "token_diversity": 0.3835515202702703 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3159722222222222, + "grad_norm": 2.8609063625335693, + "kl": 25.59375, + "learning_rate": 9.578965701579894e-06, + "loss": 0.0256, + "reward": 0.035308271646499634, + "reward_std": 0.141167551279068, + "rewards/ndcg_rule_reward": -0.029144855216145515, + "rewards/rule_reward": 0.064453125, + "step": 546, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.373046875, + "epoch": 0.31655092592592593, + "grad_norm": 3.166027545928955, + "kl": 27.3125, + "learning_rate": 9.577081498834611e-06, + "loss": 0.0273, + "reward": 0.020622751442715526, + "reward_std": 0.10842239111661911, + "rewards/ndcg_rule_reward": -0.020392873790115118, + "rewards/rule_reward": 0.041015625, + "step": 547, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.0546875, + "epoch": 0.31712962962962965, + "grad_norm": 2.2806262969970703, + "kl": 24.25, + "learning_rate": 9.575193275586801e-06, + "loss": 0.0242, + "reward": 0.03950552036985755, + "reward_std": 0.12934338301420212, + "rewards/ndcg_rule_reward": -0.024947604164481163, + "rewards/rule_reward": 0.064453125, + "step": 548, + "token_diversity": 0.34927962662337664 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.3177083333333333, + "grad_norm": 1.8648982048034668, + "kl": 10.3125, + "learning_rate": 9.57330103349508e-06, + "loss": 0.0103, + "reward": 0.031592047307640314, + "reward_std": 0.09162087738513947, + "rewards/ndcg_rule_reward": -0.017236079089343548, + "rewards/rule_reward": 0.048828125, + "step": 549, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.31828703703703703, + "grad_norm": 1.3727304935455322, + "kl": 20.28125, + "learning_rate": 9.571404774221592e-06, + "loss": 0.0202, + "reward": 0.06512557598762214, + "reward_std": 0.08295661956071854, + "rewards/ndcg_rule_reward": -0.016905677039176226, + "rewards/rule_reward": 0.08203125, + "step": 550, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 6.546875, + "epoch": 0.31886574074074076, + "grad_norm": 1.9286141395568848, + "kl": 30.1875, + "learning_rate": 9.569504499432005e-06, + "loss": 0.0302, + "reward": 0.07430662610568106, + "reward_std": 0.1383843496441841, + "rewards/ndcg_rule_reward": -0.023349624127149582, + "rewards/rule_reward": 0.09765625, + "step": 551, + "token_diversity": 0.36490462662337664 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3194444444444444, + "grad_norm": 2.1324737071990967, + "kl": 14.75, + "learning_rate": 9.567600210795521e-06, + "loss": 0.0147, + "reward": 0.004354564473032951, + "reward_std": 0.14104070514440536, + "rewards/ndcg_rule_reward": -0.02884856052696705, + "rewards/rule_reward": 0.033203125, + "step": 552, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.32002314814814814, + "grad_norm": 1.7158550024032593, + "kl": 10.4375, + "learning_rate": 9.565691909984864e-06, + "loss": 0.0104, + "reward": 0.032165370881557465, + "reward_std": 0.11660493165254593, + "rewards/ndcg_rule_reward": -0.022522129118442535, + "rewards/rule_reward": 0.0546875, + "step": 553, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.32060185185185186, + "grad_norm": 3.2894654273986816, + "kl": 27.6875, + "learning_rate": 9.563779598676281e-06, + "loss": 0.0277, + "reward": 0.03388554439879954, + "reward_std": 0.16629517078399658, + "rewards/ndcg_rule_reward": -0.032520705834031105, + "rewards/rule_reward": 0.06640625, + "step": 554, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.3211805555555556, + "grad_norm": 2.3822553157806396, + "kl": 39.5, + "learning_rate": 9.561863278549547e-06, + "loss": 0.0396, + "reward": 0.0333860560785979, + "reward_std": 0.15811266005039215, + "rewards/ndcg_rule_reward": -0.031067069619894028, + "rewards/rule_reward": 0.064453125, + "step": 555, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.19140625, + "epoch": 0.32175925925925924, + "grad_norm": 2.5043349266052246, + "kl": 27.25, + "learning_rate": 9.559942951287952e-06, + "loss": 0.0273, + "reward": 0.12169612641446292, + "reward_std": 0.12825166061520576, + "rewards/ndcg_rule_reward": -0.02283512055873871, + "rewards/rule_reward": 0.14453125, + "step": 556, + "token_diversity": 0.34933035714285715 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.32233796296296297, + "grad_norm": 3.293795585632324, + "kl": 32.9375, + "learning_rate": 9.558018618578314e-06, + "loss": 0.0329, + "reward": 0.003804645035415888, + "reward_std": 0.1413041055202484, + "rewards/ndcg_rule_reward": -0.0293984804302454, + "rewards/rule_reward": 0.033203125, + "step": 557, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.3229166666666667, + "grad_norm": 3.385383367538452, + "kl": 30.9375, + "learning_rate": 9.556090282110956e-06, + "loss": 0.031, + "reward": 0.06384929269552231, + "reward_std": 0.11639968678355217, + "rewards/ndcg_rule_reward": -0.02208820730447769, + "rewards/rule_reward": 0.0859375, + "step": 558, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.19921875, + "epoch": 0.32349537037037035, + "grad_norm": 1.7744683027267456, + "kl": 28.875, + "learning_rate": 9.554157943579734e-06, + "loss": 0.0288, + "reward": 0.09509647451341152, + "reward_std": 0.10797267407178879, + "rewards/ndcg_rule_reward": -0.020137893967330456, + "rewards/rule_reward": 0.115234375, + "step": 559, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.32407407407407407, + "grad_norm": 1.8563107252120972, + "kl": 30.5625, + "learning_rate": 9.552221604682009e-06, + "loss": 0.0305, + "reward": 0.034223570954054594, + "reward_std": 0.10800132900476456, + "rewards/ndcg_rule_reward": -0.022417055442929268, + "rewards/rule_reward": 0.056640625, + "step": 560, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.07421875, + "epoch": 0.3246527777777778, + "grad_norm": 4.062830924987793, + "kl": 61.625, + "learning_rate": 9.550281267118659e-06, + "loss": 0.0617, + "reward": 0.05128183774650097, + "reward_std": 0.15231836587190628, + "rewards/ndcg_rule_reward": -0.026843163184821606, + "rewards/rule_reward": 0.078125, + "step": 561, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0703125, + "epoch": 0.32523148148148145, + "grad_norm": 2.303722620010376, + "kl": 36.125, + "learning_rate": 9.548336932594073e-06, + "loss": 0.0361, + "reward": 0.04829722875729203, + "reward_std": 0.12551606073975563, + "rewards/ndcg_rule_reward": -0.02201527263969183, + "rewards/rule_reward": 0.0703125, + "step": 562, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.099609375, + "epoch": 0.3258101851851852, + "grad_norm": 4.039846897125244, + "kl": 36.625, + "learning_rate": 9.546388602816152e-06, + "loss": 0.0367, + "reward": 0.06424392014741898, + "reward_std": 0.1414354108273983, + "rewards/ndcg_rule_reward": -0.02755295392125845, + "rewards/rule_reward": 0.091796875, + "step": 563, + "token_diversity": 0.3606085526315789 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.3263888888888889, + "grad_norm": 2.3382558822631836, + "kl": 25.4375, + "learning_rate": 9.544436279496307e-06, + "loss": 0.0254, + "reward": 0.03424880187958479, + "reward_std": 0.11643102020025253, + "rewards/ndcg_rule_reward": -0.024344947189092636, + "rewards/rule_reward": 0.05859375, + "step": 564, + "token_diversity": 0.34542410714285715 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3269675925925926, + "grad_norm": 2.604473829269409, + "kl": 39.375, + "learning_rate": 9.542479964349456e-06, + "loss": 0.0394, + "reward": 0.003135038772597909, + "reward_std": 0.09949584677815437, + "rewards/ndcg_rule_reward": -0.020302461460232735, + "rewards/rule_reward": 0.0234375, + "step": 565, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3275462962962963, + "grad_norm": 2.147420883178711, + "kl": 23.5, + "learning_rate": 9.540519659094026e-06, + "loss": 0.0235, + "reward": 0.03473195433616638, + "reward_std": 0.11619067937135696, + "rewards/ndcg_rule_reward": -0.02386179380118847, + "rewards/rule_reward": 0.05859375, + "step": 566, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.166015625, + "epoch": 0.328125, + "grad_norm": 3.076935291290283, + "kl": 26.4375, + "learning_rate": 9.538555365451942e-06, + "loss": 0.0264, + "reward": 0.1129762027412653, + "reward_std": 0.1357097029685974, + "rewards/ndcg_rule_reward": -0.021789425052702427, + "rewards/rule_reward": 0.134765625, + "step": 567, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.08984375, + "epoch": 0.3287037037037037, + "grad_norm": 2.243260145187378, + "kl": 10.875, + "learning_rate": 9.536587085148642e-06, + "loss": 0.0109, + "reward": 0.06189746968448162, + "reward_std": 0.12812091782689095, + "rewards/ndcg_rule_reward": -0.02404002845287323, + "rewards/rule_reward": 0.0859375, + "step": 568, + "token_diversity": 0.42767054738562094 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3292824074074074, + "grad_norm": 2.388979434967041, + "kl": 12.890625, + "learning_rate": 9.534614819913056e-06, + "loss": 0.0129, + "reward": 0.03498384170234203, + "reward_std": 0.14132781326770782, + "rewards/ndcg_rule_reward": -0.029469283297657967, + "rewards/rule_reward": 0.064453125, + "step": 569, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.16015625, + "epoch": 0.3298611111111111, + "grad_norm": 2.8915607929229736, + "kl": 16.59375, + "learning_rate": 9.532638571477624e-06, + "loss": 0.0166, + "reward": 0.10956600651843473, + "reward_std": 0.12032981961965561, + "rewards/ndcg_rule_reward": -0.0154339917935431, + "rewards/rule_reward": 0.125, + "step": 570, + "token_diversity": 0.4523795324427481 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03125, + "epoch": 0.3304398148148148, + "grad_norm": 2.4419169425964355, + "kl": 28.5625, + "learning_rate": 9.530658341578276e-06, + "loss": 0.0285, + "reward": 0.023833515355363488, + "reward_std": 0.10724089294672012, + "rewards/ndcg_rule_reward": -0.01913523580878973, + "rewards/rule_reward": 0.04296875, + "step": 571, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.109375, + "epoch": 0.33101851851851855, + "grad_norm": 2.2258975505828857, + "kl": 39.9375, + "learning_rate": 9.528674131954446e-06, + "loss": 0.04, + "reward": 0.07720089750364423, + "reward_std": 0.16594436764717102, + "rewards/ndcg_rule_reward": -0.028267850168049335, + "rewards/rule_reward": 0.10546875, + "step": 572, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3315972222222222, + "grad_norm": 1.7725082635879517, + "kl": 23.4375, + "learning_rate": 9.526685944349061e-06, + "loss": 0.0234, + "reward": 0.003786958404816687, + "reward_std": 0.1160242035984993, + "rewards/ndcg_rule_reward": -0.023556792177259922, + "rewards/rule_reward": 0.02734375, + "step": 573, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.138671875, + "epoch": 0.33217592592592593, + "grad_norm": 2.1755611896514893, + "kl": 42.75, + "learning_rate": 9.524693780508541e-06, + "loss": 0.0429, + "reward": 0.0679466500878334, + "reward_std": 0.11923607811331749, + "rewards/ndcg_rule_reward": -0.02385022398084402, + "rewards/rule_reward": 0.091796875, + "step": 574, + "token_diversity": 0.37155330882352944 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.060546875, + "epoch": 0.33275462962962965, + "grad_norm": 3.1273467540740967, + "kl": 34.125, + "learning_rate": 9.5226976421828e-06, + "loss": 0.0341, + "reward": 0.03417091770097613, + "reward_std": 0.09120496734976768, + "rewards/ndcg_rule_reward": -0.01856345497071743, + "rewards/rule_reward": 0.052734375, + "step": 575, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.375, + "epoch": 0.3333333333333333, + "grad_norm": 3.0349488258361816, + "kl": 42.8125, + "learning_rate": 9.520697531125247e-06, + "loss": 0.0428, + "reward": 0.15931133925914764, + "reward_std": 0.12482073530554771, + "rewards/ndcg_rule_reward": -0.026235539466142654, + "rewards/rule_reward": 0.185546875, + "step": 576, + "token_diversity": 0.40625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.115234375, + "epoch": 0.33391203703703703, + "grad_norm": 2.3963515758514404, + "kl": 57.5, + "learning_rate": 9.518693449092772e-06, + "loss": 0.0576, + "reward": 0.05760293058119714, + "reward_std": 0.12286267057061195, + "rewards/ndcg_rule_reward": -0.02247519325464964, + "rewards/rule_reward": 0.080078125, + "step": 577, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.33449074074074076, + "grad_norm": 1.8688056468963623, + "kl": 29.4375, + "learning_rate": 9.516685397845763e-06, + "loss": 0.0294, + "reward": 0.00426382344448939, + "reward_std": 0.1326703056693077, + "rewards/ndcg_rule_reward": -0.02698617661371827, + "rewards/rule_reward": 0.03125, + "step": 578, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.314453125, + "epoch": 0.3350694444444444, + "grad_norm": 6.981077194213867, + "kl": 35.1875, + "learning_rate": 9.514673379148087e-06, + "loss": 0.0352, + "reward": 0.17930486798286438, + "reward_std": 0.11392242833971977, + "rewards/ndcg_rule_reward": -0.01796074863523245, + "rewards/rule_reward": 0.197265625, + "step": 579, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.33564814814814814, + "grad_norm": 2.6982228755950928, + "kl": 35.1875, + "learning_rate": 9.512657394767097e-06, + "loss": 0.0351, + "reward": 0.034453109838068485, + "reward_std": 0.10790923610329628, + "rewards/ndcg_rule_reward": -0.022187513299286366, + "rewards/rule_reward": 0.056640625, + "step": 580, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.060546875, + "epoch": 0.33622685185185186, + "grad_norm": 2.776918888092041, + "kl": 17.75, + "learning_rate": 9.510637446473633e-06, + "loss": 0.0177, + "reward": 0.049025341868400574, + "reward_std": 0.14178475737571716, + "rewards/ndcg_rule_reward": -0.02519340720027685, + "rewards/rule_reward": 0.07421875, + "step": 581, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.138671875, + "epoch": 0.3368055555555556, + "grad_norm": 1.838842749595642, + "kl": 16.4375, + "learning_rate": 9.508613536042015e-06, + "loss": 0.0164, + "reward": 0.08852901682257652, + "reward_std": 0.13139373809099197, + "rewards/ndcg_rule_reward": -0.02475222945213318, + "rewards/rule_reward": 0.11328125, + "step": 582, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.33738425925925924, + "grad_norm": 1.654392957687378, + "kl": 16.40625, + "learning_rate": 9.506585665250043e-06, + "loss": 0.0164, + "reward": 0.06272399052977562, + "reward_std": 0.10008721426129341, + "rewards/ndcg_rule_reward": -0.01930725760757923, + "rewards/rule_reward": 0.08203125, + "step": 583, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09375, + "epoch": 0.33796296296296297, + "grad_norm": 2.0498366355895996, + "kl": 32.4375, + "learning_rate": 9.504553835878997e-06, + "loss": 0.0324, + "reward": 0.06226405082270503, + "reward_std": 0.16680222749710083, + "rewards/ndcg_rule_reward": -0.03148595057427883, + "rewards/rule_reward": 0.09375, + "step": 584, + "token_diversity": 0.412828947368421 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.146484375, + "epoch": 0.3385416666666667, + "grad_norm": 9.541601181030273, + "kl": 50.125, + "learning_rate": 9.502518049713633e-06, + "loss": 0.05, + "reward": 0.09616232733242214, + "reward_std": 0.12489963322877884, + "rewards/ndcg_rule_reward": -0.019072051160037518, + "rewards/rule_reward": 0.115234375, + "step": 585, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.33912037037037035, + "grad_norm": 2.0869557857513428, + "kl": 57.5, + "learning_rate": 9.500478308542183e-06, + "loss": 0.0576, + "reward": 0.03317308018449694, + "reward_std": 0.1329842507839203, + "rewards/ndcg_rule_reward": -0.025420669466257095, + "rewards/rule_reward": 0.05859375, + "step": 586, + "token_diversity": 0.3984375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.33969907407407407, + "grad_norm": 2.0894107818603516, + "kl": 32.25, + "learning_rate": 9.498434614156351e-06, + "loss": 0.0322, + "reward": 0.03629928221926093, + "reward_std": 0.15754982829093933, + "rewards/ndcg_rule_reward": -0.032060093246400356, + "rewards/rule_reward": 0.068359375, + "step": 587, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.3402777777777778, + "grad_norm": 2.6919944286346436, + "kl": 26.1875, + "learning_rate": 9.496386968351316e-06, + "loss": 0.0262, + "reward": 0.03251255233772099, + "reward_std": 0.11642995476722717, + "rewards/ndcg_rule_reward": -0.022174949757754803, + "rewards/rule_reward": 0.0546875, + "step": 588, + "token_diversity": 0.40625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.078125, + "epoch": 0.34085648148148145, + "grad_norm": 3.4467928409576416, + "kl": 40.4375, + "learning_rate": 9.49433537292573e-06, + "loss": 0.0404, + "reward": 0.05262420221697539, + "reward_std": 0.1328458935022354, + "rewards/ndcg_rule_reward": -0.023547672666609287, + "rewards/rule_reward": 0.076171875, + "step": 589, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.10546875, + "epoch": 0.3414351851851852, + "grad_norm": 3.0173559188842773, + "kl": 36.25, + "learning_rate": 9.492279829681713e-06, + "loss": 0.0363, + "reward": 0.06742288917303085, + "reward_std": 0.1744159311056137, + "rewards/ndcg_rule_reward": -0.03609273210167885, + "rewards/rule_reward": 0.103515625, + "step": 590, + "token_diversity": 0.3515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.3420138888888889, + "grad_norm": 3.408247470855713, + "kl": 59.125, + "learning_rate": 9.490220340424844e-06, + "loss": 0.0591, + "reward": 0.03606267785653472, + "reward_std": 0.14080633968114853, + "rewards/ndcg_rule_reward": -0.02839044388383627, + "rewards/rule_reward": 0.064453125, + "step": 591, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.228515625, + "epoch": 0.3425925925925926, + "grad_norm": 3.0965774059295654, + "kl": 49.75, + "learning_rate": 9.488156906964184e-06, + "loss": 0.0498, + "reward": 0.14553074911236763, + "reward_std": 0.17809796333312988, + "rewards/ndcg_rule_reward": -0.030250495299696922, + "rewards/rule_reward": 0.17578125, + "step": 592, + "token_diversity": 0.55078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3431712962962963, + "grad_norm": 2.2230701446533203, + "kl": 20.625, + "learning_rate": 9.486089531112247e-06, + "loss": 0.0206, + "reward": 0.034143781987950206, + "reward_std": 0.09963138401508331, + "rewards/ndcg_rule_reward": -0.020543714053928852, + "rewards/rule_reward": 0.0546875, + "step": 593, + "token_diversity": 0.39453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.041015625, + "epoch": 0.34375, + "grad_norm": 1.7566096782684326, + "kl": 26.6875, + "learning_rate": 9.484018214685012e-06, + "loss": 0.0267, + "reward": 0.03000744036398828, + "reward_std": 0.12801632285118103, + "rewards/ndcg_rule_reward": -0.0246800584718585, + "rewards/rule_reward": 0.0546875, + "step": 594, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3443287037037037, + "grad_norm": 1.4912998676300049, + "kl": 27.6875, + "learning_rate": 9.481942959501922e-06, + "loss": 0.0277, + "reward": 0.03415355971083045, + "reward_std": 0.09123851358890533, + "rewards/ndcg_rule_reward": -0.018580812960863113, + "rewards/rule_reward": 0.052734375, + "step": 595, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.3449074074074074, + "grad_norm": 1.7434568405151367, + "kl": 22.375, + "learning_rate": 9.47986376738588e-06, + "loss": 0.0224, + "reward": 0.029866517055779696, + "reward_std": 0.10291246324777603, + "rewards/ndcg_rule_reward": -0.01896160887554288, + "rewards/rule_reward": 0.048828125, + "step": 596, + "token_diversity": 0.4296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.3454861111111111, + "grad_norm": 3.1333117485046387, + "kl": 28.21875, + "learning_rate": 9.477780640163242e-06, + "loss": 0.0283, + "reward": 0.03426325786858797, + "reward_std": 0.17450397461652756, + "rewards/ndcg_rule_reward": -0.034096118062734604, + "rewards/rule_reward": 0.068359375, + "step": 597, + "token_diversity": 0.55859375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.17578125, + "epoch": 0.3460648148148148, + "grad_norm": 2.1764190196990967, + "kl": 23.1875, + "learning_rate": 9.475693579663827e-06, + "loss": 0.0231, + "reward": 0.11372351087629795, + "reward_std": 0.13550782948732376, + "rewards/ndcg_rule_reward": -0.022995241917669773, + "rewards/rule_reward": 0.13671875, + "step": 598, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09765625, + "epoch": 0.34664351851851855, + "grad_norm": 2.0353002548217773, + "kl": 16.375, + "learning_rate": 9.473602587720908e-06, + "loss": 0.0164, + "reward": 0.0636100263800472, + "reward_std": 0.13333051279187202, + "rewards/ndcg_rule_reward": -0.026233726181089878, + "rewards/rule_reward": 0.08984375, + "step": 599, + "token_diversity": 0.38034539473684215 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3472222222222222, + "grad_norm": 1.727420687675476, + "kl": 20.25, + "learning_rate": 9.471507666171204e-06, + "loss": 0.0203, + "reward": 0.0038841126952320337, + "reward_std": 0.11602244526147842, + "rewards/ndcg_rule_reward": -0.023459636606276035, + "rewards/rule_reward": 0.02734375, + "step": 600, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.140625, + "epoch": 0.34780092592592593, + "grad_norm": 2.17559814453125, + "kl": 26.984375, + "learning_rate": 9.469408816854898e-06, + "loss": 0.0271, + "reward": 0.08995408611372113, + "reward_std": 0.13403469324111938, + "rewards/ndcg_rule_reward": -0.023327162489295006, + "rewards/rule_reward": 0.11328125, + "step": 601, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0859375, + "epoch": 0.34837962962962965, + "grad_norm": 2.2457780838012695, + "kl": 22.21875, + "learning_rate": 9.46730604161561e-06, + "loss": 0.0222, + "reward": 0.056508916546590626, + "reward_std": 0.12210619822144508, + "rewards/ndcg_rule_reward": -0.021616080775856972, + "rewards/rule_reward": 0.078125, + "step": 602, + "token_diversity": 0.56640625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3489583333333333, + "grad_norm": 2.859098434448242, + "kl": 35.9375, + "learning_rate": 9.46519934230042e-06, + "loss": 0.0359, + "reward": 0.03593675186857581, + "reward_std": 0.14927316457033157, + "rewards/ndcg_rule_reward": -0.030469495803117752, + "rewards/rule_reward": 0.06640625, + "step": 603, + "token_diversity": 0.40234375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.34953703703703703, + "grad_norm": 1.7525756359100342, + "kl": 33.125, + "learning_rate": 9.463088720759847e-06, + "loss": 0.0331, + "reward": 0.06666306778788567, + "reward_std": 0.12427346408367157, + "rewards/ndcg_rule_reward": -0.025133803486824036, + "rewards/rule_reward": 0.091796875, + "step": 604, + "token_diversity": 0.3635856331168831 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.35011574074074076, + "grad_norm": 2.1779372692108154, + "kl": 35.5, + "learning_rate": 9.460974178847861e-06, + "loss": 0.0354, + "reward": 0.0640893466770649, + "reward_std": 0.12469431012868881, + "rewards/ndcg_rule_reward": -0.02380127552896738, + "rewards/rule_reward": 0.087890625, + "step": 605, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.060546875, + "epoch": 0.3506944444444444, + "grad_norm": 1.2236733436584473, + "kl": 42.5, + "learning_rate": 9.45885571842187e-06, + "loss": 0.0425, + "reward": 0.034330641967244446, + "reward_std": 0.08272159472107887, + "rewards/ndcg_rule_reward": -0.01645060582086444, + "rewards/rule_reward": 0.05078125, + "step": 606, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.53125, + "epoch": 0.35127314814814814, + "grad_norm": 2.754702568054199, + "kl": 39.125, + "learning_rate": 9.456733341342732e-06, + "loss": 0.0392, + "reward": 0.0351914269849658, + "reward_std": 0.12440508604049683, + "rewards/ndcg_rule_reward": -0.02535544615238905, + "rewards/rule_reward": 0.060546875, + "step": 607, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.35185185185185186, + "grad_norm": 2.822737455368042, + "kl": 45.8125, + "learning_rate": 9.454607049474734e-06, + "loss": 0.0459, + "reward": 0.028538569808006287, + "reward_std": 0.13820475339889526, + "rewards/ndcg_rule_reward": -0.026148930191993713, + "rewards/rule_reward": 0.0546875, + "step": 608, + "token_diversity": 0.5084134615384616 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.056640625, + "epoch": 0.3524305555555556, + "grad_norm": 2.358506202697754, + "kl": 55.5, + "learning_rate": 9.452476844685611e-06, + "loss": 0.0556, + "reward": 0.034468403086066246, + "reward_std": 0.09107794240117073, + "rewards/ndcg_rule_reward": -0.018265970051288605, + "rewards/rule_reward": 0.052734375, + "step": 609, + "token_diversity": 0.3568371815286624 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.35300925925925924, + "grad_norm": 2.935542106628418, + "kl": 35.9375, + "learning_rate": 9.450342728846531e-06, + "loss": 0.0359, + "reward": 0.032110358006320894, + "reward_std": 0.11662261188030243, + "rewards/ndcg_rule_reward": -0.02257714234292507, + "rewards/rule_reward": 0.0546875, + "step": 610, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.138671875, + "epoch": 0.35358796296296297, + "grad_norm": 2.7309248447418213, + "kl": 28.5, + "learning_rate": 9.448204703832102e-06, + "loss": 0.0285, + "reward": 0.08825276186689734, + "reward_std": 0.10606414079666138, + "rewards/ndcg_rule_reward": -0.01721598720178008, + "rewards/rule_reward": 0.10546875, + "step": 611, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.3541666666666667, + "grad_norm": 2.452486038208008, + "kl": 26.5, + "learning_rate": 9.446062771520358e-06, + "loss": 0.0265, + "reward": 0.06588658317923546, + "reward_std": 0.11621677502989769, + "rewards/ndcg_rule_reward": -0.023957165889441967, + "rewards/rule_reward": 0.08984375, + "step": 612, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.01953125, + "epoch": 0.35474537037037035, + "grad_norm": 2.162888526916504, + "kl": 24.375, + "learning_rate": 9.443916933792774e-06, + "loss": 0.0244, + "reward": 0.01729441632051021, + "reward_std": 0.11694952100515366, + "rewards/ndcg_rule_reward": -0.021768083795905113, + "rewards/rule_reward": 0.0390625, + "step": 613, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0859375, + "epoch": 0.35532407407407407, + "grad_norm": 2.8967158794403076, + "kl": 29.75, + "learning_rate": 9.441767192534246e-06, + "loss": 0.0298, + "reward": 0.055627521709538996, + "reward_std": 0.08985482156276703, + "rewards/ndcg_rule_reward": -0.014684980269521475, + "rewards/rule_reward": 0.0703125, + "step": 614, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3559027777777778, + "grad_norm": 1.982359766960144, + "kl": 22.5, + "learning_rate": 9.439613549633107e-06, + "loss": 0.0225, + "reward": 0.03426394728012383, + "reward_std": 0.09960442036390305, + "rewards/ndcg_rule_reward": -0.02042355202138424, + "rewards/rule_reward": 0.0546875, + "step": 615, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.046875, + "epoch": 0.35648148148148145, + "grad_norm": 1.876311182975769, + "kl": 29.0, + "learning_rate": 9.437456006981115e-06, + "loss": 0.029, + "reward": 0.03203399805352092, + "reward_std": 0.09981245547533035, + "rewards/ndcg_rule_reward": -0.01874725241214037, + "rewards/rule_reward": 0.05078125, + "step": 616, + "token_diversity": 0.3863075657894737 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.3570601851851852, + "grad_norm": 4.22296667098999, + "kl": 42.125, + "learning_rate": 9.435294566473453e-06, + "loss": 0.0421, + "reward": 0.06147258181590587, + "reward_std": 0.10829383134841919, + "rewards/ndcg_rule_reward": -0.018605546094477177, + "rewards/rule_reward": 0.080078125, + "step": 617, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.091796875, + "epoch": 0.3576388888888889, + "grad_norm": 2.0016965866088867, + "kl": 38.8125, + "learning_rate": 9.433129230008723e-06, + "loss": 0.0389, + "reward": 0.03539977804757655, + "reward_std": 0.14113470166921616, + "rewards/ndcg_rule_reward": -0.029053345322608948, + "rewards/rule_reward": 0.064453125, + "step": 618, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.3582175925925926, + "grad_norm": 92.8266372680664, + "kl": 266.625, + "learning_rate": 9.430959999488959e-06, + "loss": 0.2673, + "reward": 0.032507337629795074, + "reward_std": 0.16897976398468018, + "rewards/ndcg_rule_reward": -0.031945787370204926, + "rewards/rule_reward": 0.064453125, + "step": 619, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.3587962962962963, + "grad_norm": 2.5742554664611816, + "kl": 36.25, + "learning_rate": 9.428786876819607e-06, + "loss": 0.0362, + "reward": 0.06341015640646219, + "reward_std": 0.10820337384939194, + "rewards/ndcg_rule_reward": -0.020574217662215233, + "rewards/rule_reward": 0.083984375, + "step": 620, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.236328125, + "epoch": 0.359375, + "grad_norm": 3.194570541381836, + "kl": 36.875, + "learning_rate": 9.426609863909537e-06, + "loss": 0.0369, + "reward": 0.06012111157178879, + "reward_std": 0.13806777447462082, + "rewards/ndcg_rule_reward": -0.025816393084824085, + "rewards/rule_reward": 0.0859375, + "step": 621, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.3599537037037037, + "grad_norm": 1.6193530559539795, + "kl": 33.375, + "learning_rate": 9.424428962671033e-06, + "loss": 0.0334, + "reward": 0.06106346845626831, + "reward_std": 0.1028900146484375, + "rewards/ndcg_rule_reward": -0.019014655612409115, + "rewards/rule_reward": 0.080078125, + "step": 622, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3605324074074074, + "grad_norm": 2.1269876956939697, + "kl": 21.578125, + "learning_rate": 9.422244175019797e-06, + "loss": 0.0215, + "reward": 0.0031510867411270738, + "reward_std": 0.10795607045292854, + "rewards/ndcg_rule_reward": -0.022239538840949535, + "rewards/rule_reward": 0.025390625, + "step": 623, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.3611111111111111, + "grad_norm": 2.3070099353790283, + "kl": 28.625, + "learning_rate": 9.420055502874943e-06, + "loss": 0.0286, + "reward": 0.06458012573421001, + "reward_std": 0.0885394886136055, + "rewards/ndcg_rule_reward": -0.01354487705975771, + "rewards/rule_reward": 0.078125, + "step": 624, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.3616898148148148, + "grad_norm": 1.9678136110305786, + "kl": 26.4375, + "learning_rate": 9.417862948158997e-06, + "loss": 0.0264, + "reward": 0.03313503577373922, + "reward_std": 0.1413983628153801, + "rewards/ndcg_rule_reward": -0.027411839924752712, + "rewards/rule_reward": 0.060546875, + "step": 625, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.36226851851851855, + "grad_norm": 1.7019379138946533, + "kl": 19.3125, + "learning_rate": 9.4156665127979e-06, + "loss": 0.0193, + "reward": 0.06504585593938828, + "reward_std": 0.09137988835573196, + "rewards/ndcg_rule_reward": -0.018938513472676277, + "rewards/rule_reward": 0.083984375, + "step": 626, + "token_diversity": 0.35907061688311687 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.1875, + "epoch": 0.3628472222222222, + "grad_norm": 2.657409191131592, + "kl": 44.75, + "learning_rate": 9.413466198720995e-06, + "loss": 0.0447, + "reward": 0.11987235397100449, + "reward_std": 0.14515908062458038, + "rewards/ndcg_rule_reward": -0.02465889696031809, + "rewards/rule_reward": 0.14453125, + "step": 627, + "token_diversity": 0.3743832236842105 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.431640625, + "epoch": 0.36342592592592593, + "grad_norm": 1.8798941373825073, + "kl": 31.125, + "learning_rate": 9.411262007861033e-06, + "loss": 0.0311, + "reward": 0.003714669030159712, + "reward_std": 0.11608211696147919, + "rewards/ndcg_rule_reward": -0.023629081435501575, + "rewards/rule_reward": 0.02734375, + "step": 628, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.36400462962962965, + "grad_norm": 3.5634758472442627, + "kl": 35.0, + "learning_rate": 9.409053942154177e-06, + "loss": 0.035, + "reward": 0.004078588099218905, + "reward_std": 0.13276298344135284, + "rewards/ndcg_rule_reward": -0.02717141155153513, + "rewards/rule_reward": 0.03125, + "step": 629, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3645833333333333, + "grad_norm": 1.8220279216766357, + "kl": 16.5, + "learning_rate": 9.406842003539987e-06, + "loss": 0.0165, + "reward": 0.003442069632001221, + "reward_std": 0.11623802781105042, + "rewards/ndcg_rule_reward": -0.023901681415736675, + "rewards/rule_reward": 0.02734375, + "step": 630, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.115234375, + "epoch": 0.36516203703703703, + "grad_norm": 2.122192621231079, + "kl": 25.125, + "learning_rate": 9.404626193961427e-06, + "loss": 0.0251, + "reward": 0.06601387076079845, + "reward_std": 0.12461120635271072, + "rewards/ndcg_rule_reward": -0.0257829949259758, + "rewards/rule_reward": 0.091796875, + "step": 631, + "token_diversity": 0.4570023148148148 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.32421875, + "epoch": 0.36574074074074076, + "grad_norm": 2.4090213775634766, + "kl": 51.25, + "learning_rate": 9.402406515364859e-06, + "loss": 0.0511, + "reward": 0.0849645845592022, + "reward_std": 0.1427399292588234, + "rewards/ndcg_rule_reward": -0.024410415440797806, + "rewards/rule_reward": 0.109375, + "step": 632, + "token_diversity": 0.41796875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.3663194444444444, + "grad_norm": 2.2673394680023193, + "kl": 19.78125, + "learning_rate": 9.40018296970005e-06, + "loss": 0.0198, + "reward": 0.0656876340508461, + "reward_std": 0.13317399844527245, + "rewards/ndcg_rule_reward": -0.028062366880476475, + "rewards/rule_reward": 0.09375, + "step": 633, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.36689814814814814, + "grad_norm": 1.7280596494674683, + "kl": 21.4375, + "learning_rate": 9.397955558920156e-06, + "loss": 0.0214, + "reward": 0.03353353030979633, + "reward_std": 0.08313128352165222, + "rewards/ndcg_rule_reward": -0.017247721552848816, + "rewards/rule_reward": 0.05078125, + "step": 634, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.25390625, + "epoch": 0.36747685185185186, + "grad_norm": 1.9182792901992798, + "kl": 34.375, + "learning_rate": 9.395724284981736e-06, + "loss": 0.0344, + "reward": 0.06375482864677906, + "reward_std": 0.11642881482839584, + "rewards/ndcg_rule_reward": -0.02218266762793064, + "rewards/rule_reward": 0.0859375, + "step": 635, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3680555555555556, + "grad_norm": 2.999177932739258, + "kl": 27.8125, + "learning_rate": 9.393489149844732e-06, + "loss": 0.0278, + "reward": 0.0019264608854427934, + "reward_std": 0.09163377806544304, + "rewards/ndcg_rule_reward": -0.019557914696633816, + "rewards/rule_reward": 0.021484375, + "step": 636, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.02734375, + "epoch": 0.36863425925925924, + "grad_norm": 1.6009347438812256, + "kl": 29.3125, + "learning_rate": 9.391250155472492e-06, + "loss": 0.0293, + "reward": 0.021440579497721046, + "reward_std": 0.10803859308362007, + "rewards/ndcg_rule_reward": -0.01957504777237773, + "rewards/rule_reward": 0.041015625, + "step": 637, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.02734375, + "epoch": 0.36921296296296297, + "grad_norm": 3.2257320880889893, + "kl": 57.125, + "learning_rate": 9.389007303831738e-06, + "loss": 0.0573, + "reward": 0.02375204162672162, + "reward_std": 0.15744853019714355, + "rewards/ndcg_rule_reward": -0.02898233477026224, + "rewards/rule_reward": 0.052734375, + "step": 638, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.125, + "epoch": 0.3697916666666667, + "grad_norm": 14.39608097076416, + "kl": 67.125, + "learning_rate": 9.386760596892592e-06, + "loss": 0.0671, + "reward": 0.08227362437173724, + "reward_std": 0.1366051286458969, + "rewards/ndcg_rule_reward": -0.023195119574666023, + "rewards/rule_reward": 0.10546875, + "step": 639, + "token_diversity": 0.42725929054054057 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.37037037037037035, + "grad_norm": 2.3202054500579834, + "kl": 23.9375, + "learning_rate": 9.38451003662856e-06, + "loss": 0.024, + "reward": 0.03415447403676808, + "reward_std": 0.09120938926935196, + "rewards/ndcg_rule_reward": -0.018579899333417416, + "rewards/rule_reward": 0.052734375, + "step": 640, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.37094907407407407, + "grad_norm": 2.027688503265381, + "kl": 20.8125, + "learning_rate": 9.382255625016527e-06, + "loss": 0.0208, + "reward": 0.004088510642759502, + "reward_std": 0.12433372437953949, + "rewards/ndcg_rule_reward": -0.02520836517214775, + "rewards/rule_reward": 0.029296875, + "step": 641, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.3715277777777778, + "grad_norm": 1.843016505241394, + "kl": 27.9375, + "learning_rate": 9.379997364036768e-06, + "loss": 0.0279, + "reward": 0.03018070012331009, + "reward_std": 0.1027502790093422, + "rewards/ndcg_rule_reward": -0.01864742673933506, + "rewards/rule_reward": 0.048828125, + "step": 642, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.37210648148148145, + "grad_norm": 1.7096010446548462, + "kl": 21.125, + "learning_rate": 9.377735255672935e-06, + "loss": 0.0211, + "reward": 0.03228787437546998, + "reward_std": 0.10813264176249504, + "rewards/ndcg_rule_reward": -0.02044650260359049, + "rewards/rule_reward": 0.052734375, + "step": 643, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3726851851851852, + "grad_norm": 3.5796051025390625, + "kl": 36.6875, + "learning_rate": 9.375469301912062e-06, + "loss": 0.0367, + "reward": 0.0032446013065055013, + "reward_std": 0.12472570687532425, + "rewards/ndcg_rule_reward": -0.026052272878587246, + "rewards/rule_reward": 0.029296875, + "step": 644, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03125, + "epoch": 0.3732638888888889, + "grad_norm": 1.8853768110275269, + "kl": 55.75, + "learning_rate": 9.37319950474456e-06, + "loss": 0.0558, + "reward": 0.023976330878213048, + "reward_std": 0.11559434980154037, + "rewards/ndcg_rule_reward": -0.02094554528594017, + "rewards/rule_reward": 0.044921875, + "step": 645, + "token_diversity": 0.37890625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3738425925925926, + "grad_norm": 2.9414219856262207, + "kl": 27.5, + "learning_rate": 9.370925866164218e-06, + "loss": 0.0275, + "reward": 0.002227641933131963, + "reward_std": 0.08314459212124348, + "rewards/ndcg_rule_reward": -0.017303607426583767, + "rewards/rule_reward": 0.01953125, + "step": 646, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09765625, + "epoch": 0.3744212962962963, + "grad_norm": 2.3378453254699707, + "kl": 29.6875, + "learning_rate": 9.368648388168193e-06, + "loss": 0.0297, + "reward": 0.06300573667977005, + "reward_std": 0.09993290156126022, + "rewards/ndcg_rule_reward": -0.019025510177016258, + "rewards/rule_reward": 0.08203125, + "step": 647, + "token_diversity": 0.35907061688311687 + }, + { + "categorical_diversity": 0.921875, + "completion_length": 5.1796875, + "epoch": 0.375, + "grad_norm": 2.8274691104888916, + "kl": 35.0625, + "learning_rate": 9.36636707275702e-06, + "loss": 0.0351, + "reward": 0.11514878459274769, + "reward_std": 0.12357448041439056, + "rewards/ndcg_rule_reward": -0.01961683901026845, + "rewards/rule_reward": 0.134765625, + "step": 648, + "token_diversity": 0.3640303938356164 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.3755787037037037, + "grad_norm": 1.8769339323043823, + "kl": 33.3125, + "learning_rate": 9.364081921934607e-06, + "loss": 0.0334, + "reward": 0.03665972105227411, + "reward_std": 0.10245457291603088, + "rewards/ndcg_rule_reward": -0.01998090371489525, + "rewards/rule_reward": 0.056640625, + "step": 649, + "token_diversity": 0.55859375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.072265625, + "epoch": 0.3761574074074074, + "grad_norm": 2.113893747329712, + "kl": 29.1875, + "learning_rate": 9.361792937708223e-06, + "loss": 0.0292, + "reward": 0.05292976647615433, + "reward_std": 0.12397480756044388, + "rewards/ndcg_rule_reward": -0.02128898724913597, + "rewards/rule_reward": 0.07421875, + "step": 650, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3767361111111111, + "grad_norm": 1.986336350440979, + "kl": 25.5625, + "learning_rate": 9.359500122088511e-06, + "loss": 0.0256, + "reward": 0.003972191130742431, + "reward_std": 0.14119604229927063, + "rewards/ndcg_rule_reward": -0.0292309345677495, + "rewards/rule_reward": 0.033203125, + "step": 651, + "token_diversity": 0.4140625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.05078125, + "epoch": 0.3773148148148148, + "grad_norm": 1.297594666481018, + "kl": 21.15625, + "learning_rate": 9.357203477089475e-06, + "loss": 0.0212, + "reward": 0.033988600596785545, + "reward_std": 0.08289626240730286, + "rewards/ndcg_rule_reward": -0.01679265033453703, + "rewards/rule_reward": 0.05078125, + "step": 652, + "token_diversity": 0.37271712662337664 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.109375, + "epoch": 0.37789351851851855, + "grad_norm": 2.6925277709960938, + "kl": 30.875, + "learning_rate": 9.354903004728491e-06, + "loss": 0.0308, + "reward": 0.07451068982481956, + "reward_std": 0.1817079782485962, + "rewards/ndcg_rule_reward": -0.030958060175180435, + "rewards/rule_reward": 0.10546875, + "step": 653, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.029296875, + "epoch": 0.3784722222222222, + "grad_norm": 2.1739845275878906, + "kl": 19.65625, + "learning_rate": 9.352598707026284e-06, + "loss": 0.0196, + "reward": 0.02845374378375709, + "reward_std": 0.16253646463155746, + "rewards/ndcg_rule_reward": -0.03014000877737999, + "rewards/rule_reward": 0.05859375, + "step": 654, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 6.064453125, + "epoch": 0.37905092592592593, + "grad_norm": 2.022737979888916, + "kl": 26.59375, + "learning_rate": 9.35029058600695e-06, + "loss": 0.0266, + "reward": 0.0921734981238842, + "reward_std": 0.11695769056677818, + "rewards/ndcg_rule_reward": -0.021107753738760948, + "rewards/rule_reward": 0.11328125, + "step": 655, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.37962962962962965, + "grad_norm": 2.3203275203704834, + "kl": 22.75, + "learning_rate": 9.347978643697939e-06, + "loss": 0.0227, + "reward": 0.032660174067132175, + "reward_std": 0.1331801638007164, + "rewards/ndcg_rule_reward": -0.025933576747775078, + "rewards/rule_reward": 0.05859375, + "step": 656, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.044921875, + "epoch": 0.3802083333333333, + "grad_norm": 2.310659885406494, + "kl": 38.25, + "learning_rate": 9.345662882130056e-06, + "loss": 0.0382, + "reward": 0.0325435483828187, + "reward_std": 0.14163776859641075, + "rewards/ndcg_rule_reward": -0.0280033266171813, + "rewards/rule_reward": 0.060546875, + "step": 657, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.3515625, + "epoch": 0.38078703703703703, + "grad_norm": 3.7375102043151855, + "kl": 63.6875, + "learning_rate": 9.343343303337467e-06, + "loss": 0.0636, + "reward": 0.08555155573412776, + "reward_std": 0.16735760867595673, + "rewards/ndcg_rule_reward": -0.029682821594178677, + "rewards/rule_reward": 0.115234375, + "step": 658, + "token_diversity": 0.32906008687258687 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.38136574074074076, + "grad_norm": 3.8813889026641846, + "kl": 50.375, + "learning_rate": 9.341019909357685e-06, + "loss": 0.0503, + "reward": 0.0038866953691467643, + "reward_std": 0.12443394213914871, + "rewards/ndcg_rule_reward": -0.025410180911421776, + "rewards/rule_reward": 0.029296875, + "step": 659, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.158203125, + "epoch": 0.3819444444444444, + "grad_norm": 2.0068552494049072, + "kl": 41.25, + "learning_rate": 9.338692702231575e-06, + "loss": 0.0412, + "reward": 0.09759292006492615, + "reward_std": 0.11601068824529648, + "rewards/ndcg_rule_reward": -0.023500829935073853, + "rewards/rule_reward": 0.12109375, + "step": 660, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.38252314814814814, + "grad_norm": 1.3559945821762085, + "kl": 35.5, + "learning_rate": 9.336361684003353e-06, + "loss": 0.0355, + "reward": 0.03333699586801231, + "reward_std": 0.06636185944080353, + "rewards/ndcg_rule_reward": -0.013538005761802197, + "rewards/rule_reward": 0.046875, + "step": 661, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.38310185185185186, + "grad_norm": 2.38149356842041, + "kl": 40.875, + "learning_rate": 9.334026856720585e-06, + "loss": 0.0408, + "reward": 0.004184124409221113, + "reward_std": 0.1327216997742653, + "rewards/ndcg_rule_reward": -0.027065875008702278, + "rewards/rule_reward": 0.03125, + "step": 662, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.140625, + "epoch": 0.3836805555555556, + "grad_norm": 1.7263303995132446, + "kl": 38.25, + "learning_rate": 9.331688222434179e-06, + "loss": 0.0382, + "reward": 0.09070119634270668, + "reward_std": 0.12193620204925537, + "rewards/ndcg_rule_reward": -0.022580054588615894, + "rewards/rule_reward": 0.11328125, + "step": 663, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.38425925925925924, + "grad_norm": 1.7507152557373047, + "kl": 32.375, + "learning_rate": 9.329345783198386e-06, + "loss": 0.0324, + "reward": 0.035288035636767745, + "reward_std": 0.12434561550617218, + "rewards/ndcg_rule_reward": -0.02525884099304676, + "rewards/rule_reward": 0.060546875, + "step": 664, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.287109375, + "epoch": 0.38483796296296297, + "grad_norm": 2.5250420570373535, + "kl": 31.9375, + "learning_rate": 9.326999541070804e-06, + "loss": 0.0319, + "reward": 0.06685630232095718, + "reward_std": 0.15828849375247955, + "rewards/ndcg_rule_reward": -0.03275307081639767, + "rewards/rule_reward": 0.099609375, + "step": 665, + "token_diversity": 0.29926915322580644 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.287109375, + "epoch": 0.3854166666666667, + "grad_norm": 2.205615758895874, + "kl": 29.3515625, + "learning_rate": 9.324649498112371e-06, + "loss": 0.0293, + "reward": 0.06520174816250801, + "reward_std": 0.08290580660104752, + "rewards/ndcg_rule_reward": -0.016829502768814564, + "rewards/rule_reward": 0.08203125, + "step": 666, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.38599537037037035, + "grad_norm": 2.0986783504486084, + "kl": 22.0625, + "learning_rate": 9.322295656387357e-06, + "loss": 0.0221, + "reward": 0.03531491244211793, + "reward_std": 0.14119866490364075, + "rewards/ndcg_rule_reward": -0.029138214886188507, + "rewards/rule_reward": 0.064453125, + "step": 667, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.38657407407407407, + "grad_norm": 2.0125679969787598, + "kl": 40.0625, + "learning_rate": 9.319938017963378e-06, + "loss": 0.04, + "reward": 0.004220583010464907, + "reward_std": 0.12426247075200081, + "rewards/ndcg_rule_reward": -0.025076291523873806, + "rewards/rule_reward": 0.029296875, + "step": 668, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3871527777777778, + "grad_norm": 2.172351837158203, + "kl": 34.3125, + "learning_rate": 9.317576584911376e-06, + "loss": 0.0344, + "reward": 0.003994139609858394, + "reward_std": 0.14118264615535736, + "rewards/ndcg_rule_reward": -0.029208987019956112, + "rewards/rule_reward": 0.033203125, + "step": 669, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.38773148148148145, + "grad_norm": 2.1059563159942627, + "kl": 30.84375, + "learning_rate": 9.315211359305635e-06, + "loss": 0.0309, + "reward": 0.004458320327103138, + "reward_std": 0.14097358286380768, + "rewards/ndcg_rule_reward": -0.028744804672896862, + "rewards/rule_reward": 0.033203125, + "step": 670, + "token_diversity": 0.4296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.146484375, + "epoch": 0.3883101851851852, + "grad_norm": 2.604337215423584, + "kl": 37.75, + "learning_rate": 9.312842343223764e-06, + "loss": 0.0377, + "reward": 0.09233526955358684, + "reward_std": 0.10844901949167252, + "rewards/ndcg_rule_reward": -0.01899285474792123, + "rewards/rule_reward": 0.111328125, + "step": 671, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3888888888888889, + "grad_norm": 2.1649820804595947, + "kl": 23.0, + "learning_rate": 9.310469538746706e-06, + "loss": 0.023, + "reward": 0.002924749976955354, + "reward_std": 0.1080847941339016, + "rewards/ndcg_rule_reward": -0.02246587537229061, + "rewards/rule_reward": 0.025390625, + "step": 672, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.3894675925925926, + "grad_norm": 3.808959484100342, + "kl": 54.75, + "learning_rate": 9.308092947958725e-06, + "loss": 0.0547, + "reward": 0.03571016760542989, + "reward_std": 0.14944178611040115, + "rewards/ndcg_rule_reward": -0.030696086585521698, + "rewards/rule_reward": 0.06640625, + "step": 673, + "token_diversity": 0.35516436688311687 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.3900462962962963, + "grad_norm": 2.2641549110412598, + "kl": 26.90625, + "learning_rate": 9.305712572947418e-06, + "loss": 0.0269, + "reward": 0.03524340013973415, + "reward_std": 0.1328161358833313, + "rewards/ndcg_rule_reward": -0.027256598696112633, + "rewards/rule_reward": 0.0625, + "step": 674, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.146484375, + "epoch": 0.390625, + "grad_norm": 15.324199676513672, + "kl": 71.875, + "learning_rate": 9.303328415803703e-06, + "loss": 0.072, + "reward": 0.09233768656849861, + "reward_std": 0.10291510075330734, + "rewards/ndcg_rule_reward": -0.01899043843150139, + "rewards/rule_reward": 0.111328125, + "step": 675, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.3912037037037037, + "grad_norm": 1.602573037147522, + "kl": 26.875, + "learning_rate": 9.300940478621822e-06, + "loss": 0.0269, + "reward": 0.003445390844717622, + "reward_std": 0.10780258104205132, + "rewards/ndcg_rule_reward": -0.021945233456790447, + "rewards/rule_reward": 0.025390625, + "step": 676, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.3917824074074074, + "grad_norm": 1.9342790842056274, + "kl": 37.3125, + "learning_rate": 9.298548763499336e-06, + "loss": 0.0373, + "reward": 0.06440115347504616, + "reward_std": 0.13296069204807281, + "rewards/ndcg_rule_reward": -0.02544259186834097, + "rewards/rule_reward": 0.08984375, + "step": 677, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.3923611111111111, + "grad_norm": 2.671261787414551, + "kl": 32.875, + "learning_rate": 9.29615327253712e-06, + "loss": 0.0328, + "reward": 0.030986781464889646, + "reward_std": 0.1444493755698204, + "rewards/ndcg_rule_reward": -0.027606971561908722, + "rewards/rule_reward": 0.05859375, + "step": 678, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.109375, + "epoch": 0.3929398148148148, + "grad_norm": 1.939698576927185, + "kl": 42.625, + "learning_rate": 9.293754007839374e-06, + "loss": 0.0426, + "reward": 0.07341565983369946, + "reward_std": 0.10820600762963295, + "rewards/ndcg_rule_reward": -0.01838121935725212, + "rewards/rule_reward": 0.091796875, + "step": 679, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.01953125, + "epoch": 0.39351851851851855, + "grad_norm": 1.8456817865371704, + "kl": 45.5, + "learning_rate": 9.291350971513611e-06, + "loss": 0.0454, + "reward": 0.017954515758901834, + "reward_std": 0.1418665051460266, + "rewards/ndcg_rule_reward": -0.026967357844114304, + "rewards/rule_reward": 0.044921875, + "step": 680, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.078125, + "epoch": 0.3940972222222222, + "grad_norm": 1.7582858800888062, + "kl": 35.875, + "learning_rate": 9.288944165670651e-06, + "loss": 0.0359, + "reward": 0.05205258168280125, + "reward_std": 0.09153715521097183, + "rewards/ndcg_rule_reward": -0.01630679238587618, + "rewards/rule_reward": 0.068359375, + "step": 681, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.39467592592592593, + "grad_norm": 1.9435755014419556, + "kl": 21.09375, + "learning_rate": 9.286533592424632e-06, + "loss": 0.0211, + "reward": 0.003113290702458471, + "reward_std": 0.09953908994793892, + "rewards/ndcg_rule_reward": -0.02032421063631773, + "rewards/rule_reward": 0.0234375, + "step": 682, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.11328125, + "epoch": 0.39525462962962965, + "grad_norm": 7.795340061187744, + "kl": 30.125, + "learning_rate": 9.284119253892996e-06, + "loss": 0.0301, + "reward": 0.05091725755482912, + "reward_std": 0.15352775156497955, + "rewards/ndcg_rule_reward": -0.027207741513848305, + "rewards/rule_reward": 0.078125, + "step": 683, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.15234375, + "epoch": 0.3958333333333333, + "grad_norm": 2.6141557693481445, + "kl": 32.5625, + "learning_rate": 9.281701152196495e-06, + "loss": 0.0325, + "reward": 0.10237674787640572, + "reward_std": 0.16466327011585236, + "rewards/ndcg_rule_reward": -0.028482625260949135, + "rewards/rule_reward": 0.130859375, + "step": 684, + "token_diversity": 0.4400275735294118 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.078125, + "epoch": 0.39641203703703703, + "grad_norm": 2.6592795848846436, + "kl": 27.0, + "learning_rate": 9.279279289459188e-06, + "loss": 0.027, + "reward": 0.0534761028829962, + "reward_std": 0.14134396985173225, + "rewards/ndcg_rule_reward": -0.026602022349834442, + "rewards/rule_reward": 0.080078125, + "step": 685, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1484375, + "epoch": 0.39699074074074076, + "grad_norm": 3.426417112350464, + "kl": 17.1875, + "learning_rate": 9.276853667808437e-06, + "loss": 0.0172, + "reward": 0.09445209032855928, + "reward_std": 0.10829732194542885, + "rewards/ndcg_rule_reward": -0.020782284438610077, + "rewards/rule_reward": 0.115234375, + "step": 686, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.96875, + "completion_length": 5.16796875, + "epoch": 0.3975694444444444, + "grad_norm": 3.8784101009368896, + "kl": 39.0, + "learning_rate": 9.274424289374901e-06, + "loss": 0.0389, + "reward": 0.10855631157755852, + "reward_std": 0.11159270629286766, + "rewards/ndcg_rule_reward": -0.016443694941699505, + "rewards/rule_reward": 0.125, + "step": 687, + "token_diversity": 0.4201694542253521 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.39814814814814814, + "grad_norm": 2.3018152713775635, + "kl": 28.03125, + "learning_rate": 9.271991156292548e-06, + "loss": 0.028, + "reward": 0.0361772789619863, + "reward_std": 0.1660218983888626, + "rewards/ndcg_rule_reward": -0.03413522057235241, + "rewards/rule_reward": 0.0703125, + "step": 688, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.39872685185185186, + "grad_norm": 2.283848762512207, + "kl": 31.59375, + "learning_rate": 9.269554270698636e-06, + "loss": 0.0315, + "reward": 0.03183660900685936, + "reward_std": 0.12516817077994347, + "rewards/ndcg_rule_reward": -0.024804013781249523, + "rewards/rule_reward": 0.056640625, + "step": 689, + "token_diversity": 0.5390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.3993055555555556, + "grad_norm": 2.1971511840820312, + "kl": 19.0625, + "learning_rate": 9.267113634733723e-06, + "loss": 0.0191, + "reward": 0.02474905946291983, + "reward_std": 0.08153865858912468, + "rewards/ndcg_rule_reward": -0.014313442632555962, + "rewards/rule_reward": 0.0390625, + "step": 690, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09765625, + "epoch": 0.39988425925925924, + "grad_norm": 5.43479585647583, + "kl": 30.0625, + "learning_rate": 9.264669250541658e-06, + "loss": 0.03, + "reward": 0.0635112770833075, + "reward_std": 0.15022698044776917, + "rewards/ndcg_rule_reward": -0.030238726176321507, + "rewards/rule_reward": 0.09375, + "step": 691, + "token_diversity": 0.36862664473684215 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.2265625, + "epoch": 0.40046296296296297, + "grad_norm": 1.866894006729126, + "kl": 29.6875, + "learning_rate": 9.262221120269588e-06, + "loss": 0.0297, + "reward": 0.14386290684342384, + "reward_std": 0.11838347092270851, + "rewards/ndcg_rule_reward": -0.01824646256864071, + "rewards/rule_reward": 0.162109375, + "step": 692, + "token_diversity": 0.47265625 + }, + { + "epoch": 0.40046296296296297, + "eval_categorical_diversity": 1.0, + "eval_completion_length": 5.0, + "eval_kl": 25.946428571428573, + "eval_loss": 0.02599971741437912, + "eval_reward": 0.00202539944628181, + "eval_reward_std": 0.06572837242251867, + "eval_rewards/ndcg_rule_reward": -0.013447408897003956, + "eval_rewards/rule_reward": 0.015472808441558442, + "eval_runtime": 92.31, + "eval_samples_per_second": 52.714, + "eval_steps_per_second": 0.054, + "eval_token_diversity": 0.3752536525974026, + "step": 692 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.15234375, + "epoch": 0.4010416666666667, + "grad_norm": 1.9511610269546509, + "kl": 38.4375, + "learning_rate": 9.259769246067946e-06, + "loss": 0.0385, + "reward": 0.09792459383606911, + "reward_std": 0.12430201098322868, + "rewards/ndcg_rule_reward": -0.025122277438640594, + "rewards/rule_reward": 0.123046875, + "step": 693, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09765625, + "epoch": 0.40162037037037035, + "grad_norm": 2.113987445831299, + "kl": 12.1796875, + "learning_rate": 9.257313630090456e-06, + "loss": 0.0122, + "reward": 0.06405769661068916, + "reward_std": 0.14152583479881287, + "rewards/ndcg_rule_reward": -0.027739176526665688, + "rewards/rule_reward": 0.091796875, + "step": 694, + "token_diversity": 0.40198863636363635 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.40219907407407407, + "grad_norm": 2.046823024749756, + "kl": 13.46875, + "learning_rate": 9.254854274494128e-06, + "loss": 0.0134, + "reward": 0.06535656005144119, + "reward_std": 0.1080479547381401, + "rewards/ndcg_rule_reward": -0.022534062154591084, + "rewards/rule_reward": 0.087890625, + "step": 695, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1953125, + "epoch": 0.4027777777777778, + "grad_norm": 2.542956829071045, + "kl": 41.125, + "learning_rate": 9.252391181439257e-06, + "loss": 0.0411, + "reward": 0.12350138649344444, + "reward_std": 0.12533284723758698, + "rewards/ndcg_rule_reward": -0.02298299130052328, + "rewards/rule_reward": 0.146484375, + "step": 696, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1171875, + "epoch": 0.40335648148148145, + "grad_norm": 10.3110990524292, + "kl": 117.5, + "learning_rate": 9.24992435308942e-06, + "loss": 0.1175, + "reward": 0.08317902311682701, + "reward_std": 0.18926531821489334, + "rewards/ndcg_rule_reward": -0.03205535188317299, + "rewards/rule_reward": 0.115234375, + "step": 697, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.4039351851851852, + "grad_norm": 2.2776575088500977, + "kl": 39.375, + "learning_rate": 9.247453791611475e-06, + "loss": 0.0394, + "reward": 0.06256766617298126, + "reward_std": 0.15271514654159546, + "rewards/ndcg_rule_reward": -0.029229210689663887, + "rewards/rule_reward": 0.091796875, + "step": 698, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.4045138888888889, + "grad_norm": 2.112199544906616, + "kl": 18.65625, + "learning_rate": 9.244979499175562e-06, + "loss": 0.0186, + "reward": 0.0635810736566782, + "reward_std": 0.1333843097090721, + "rewards/ndcg_rule_reward": -0.02626267448067665, + "rewards/rule_reward": 0.08984375, + "step": 699, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4050925925925926, + "grad_norm": 1.6370484828948975, + "kl": 21.9375, + "learning_rate": 9.242501477955094e-06, + "loss": 0.022, + "reward": 0.03508269414305687, + "reward_std": 0.11604466289281845, + "rewards/ndcg_rule_reward": -0.02351105399429798, + "rewards/rule_reward": 0.05859375, + "step": 700, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4056712962962963, + "grad_norm": 1.8637104034423828, + "kl": 40.125, + "learning_rate": 9.240019730126764e-06, + "loss": 0.0401, + "reward": 0.03414966014679521, + "reward_std": 0.09963765740394592, + "rewards/ndcg_rule_reward": -0.020537841133773327, + "rewards/rule_reward": 0.0546875, + "step": 701, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.40625, + "grad_norm": 2.574402332305908, + "kl": 31.0, + "learning_rate": 9.237534257870537e-06, + "loss": 0.031, + "reward": 0.030946264509111643, + "reward_std": 0.14449549466371536, + "rewards/ndcg_rule_reward": -0.027647485956549644, + "rewards/rule_reward": 0.05859375, + "step": 702, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.19140625, + "epoch": 0.4068287037037037, + "grad_norm": 3.1649222373962402, + "kl": 43.125, + "learning_rate": 9.235045063369644e-06, + "loss": 0.0432, + "reward": 0.1230066604912281, + "reward_std": 0.1612660139799118, + "rewards/ndcg_rule_reward": -0.029337088577449322, + "rewards/rule_reward": 0.15234375, + "step": 703, + "token_diversity": 0.4221286525974026 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.4074074074074074, + "grad_norm": 1.5325844287872314, + "kl": 24.03125, + "learning_rate": 9.232552148810594e-06, + "loss": 0.0241, + "reward": 0.029356978717260063, + "reward_std": 0.1031399592757225, + "rewards/ndcg_rule_reward": -0.019471146166324615, + "rewards/rule_reward": 0.048828125, + "step": 704, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4079861111111111, + "grad_norm": 2.9474942684173584, + "kl": 34.25, + "learning_rate": 9.23005551638316e-06, + "loss": 0.0342, + "reward": 0.034867874695919454, + "reward_std": 0.12453135475516319, + "rewards/ndcg_rule_reward": -0.025678997859358788, + "rewards/rule_reward": 0.060546875, + "step": 705, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.4085648148148148, + "grad_norm": 2.267597198486328, + "kl": 15.59375, + "learning_rate": 9.22755516828038e-06, + "loss": 0.0156, + "reward": 0.06655294448137283, + "reward_std": 0.14116296917200089, + "rewards/ndcg_rule_reward": -0.029150180518627167, + "rewards/rule_reward": 0.095703125, + "step": 706, + "token_diversity": 0.35709212662337664 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.275390625, + "epoch": 0.40914351851851855, + "grad_norm": 2.2475013732910156, + "kl": 24.25, + "learning_rate": 9.225051106698555e-06, + "loss": 0.0242, + "reward": 0.036956289783120155, + "reward_std": 0.11914834752678871, + "rewards/ndcg_rule_reward": -0.023590581491589546, + "rewards/rule_reward": 0.060546875, + "step": 707, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4097222222222222, + "grad_norm": 2.0207207202911377, + "kl": 33.375, + "learning_rate": 9.222543333837251e-06, + "loss": 0.0333, + "reward": 0.03494434687308967, + "reward_std": 0.11609195917844772, + "rewards/ndcg_rule_reward": -0.02364940196275711, + "rewards/rule_reward": 0.05859375, + "step": 708, + "token_diversity": 0.5625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.091796875, + "epoch": 0.41030092592592593, + "grad_norm": 2.291900634765625, + "kl": 54.5, + "learning_rate": 9.220031851899293e-06, + "loss": 0.0546, + "reward": 0.06235622428357601, + "reward_std": 0.14097196981310844, + "rewards/ndcg_rule_reward": -0.025534399785101414, + "rewards/rule_reward": 0.087890625, + "step": 709, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.15234375, + "epoch": 0.41087962962962965, + "grad_norm": 3.049992561340332, + "kl": 10.875, + "learning_rate": 9.217516663090762e-06, + "loss": 0.0109, + "reward": 0.06503114470979199, + "reward_std": 0.09141329675912857, + "rewards/ndcg_rule_reward": -0.0189532358199358, + "rewards/rule_reward": 0.083984375, + "step": 710, + "token_diversity": 0.38047889610389607 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09765625, + "epoch": 0.4114583333333333, + "grad_norm": 1.413703441619873, + "kl": 25.1875, + "learning_rate": 9.214997769620998e-06, + "loss": 0.0251, + "reward": 0.06225856317905709, + "reward_std": 0.0750420019030571, + "rewards/ndcg_rule_reward": -0.013913311995565891, + "rewards/rule_reward": 0.076171875, + "step": 711, + "token_diversity": 0.39742288961038963 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.41203703703703703, + "grad_norm": 2.7036874294281006, + "kl": 51.5, + "learning_rate": 9.212475173702593e-06, + "loss": 0.0515, + "reward": 0.03292370890267193, + "reward_std": 0.14146170020103455, + "rewards/ndcg_rule_reward": -0.02762316633015871, + "rewards/rule_reward": 0.060546875, + "step": 712, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.150390625, + "epoch": 0.41261574074074076, + "grad_norm": 5.346015453338623, + "kl": 66.5, + "learning_rate": 9.209948877551393e-06, + "loss": 0.0664, + "reward": 0.08921198546886444, + "reward_std": 0.0717560164630413, + "rewards/ndcg_rule_reward": -0.012350515462458134, + "rewards/rule_reward": 0.1015625, + "step": 713, + "token_diversity": 0.31290584415584416 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.4131944444444444, + "grad_norm": 1.9914329051971436, + "kl": 22.0625, + "learning_rate": 9.207418883386492e-06, + "loss": 0.022, + "reward": 0.05831712298095226, + "reward_std": 0.12018423900008202, + "rewards/ndcg_rule_reward": -0.021761002019047737, + "rewards/rule_reward": 0.080078125, + "step": 714, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.984375, + "completion_length": 6.46484375, + "epoch": 0.41377314814814814, + "grad_norm": 2.4501781463623047, + "kl": 34.0, + "learning_rate": 9.204885193430234e-06, + "loss": 0.034, + "reward": 0.03483220096677542, + "reward_std": 0.10772297158837318, + "rewards/ndcg_rule_reward": -0.021808422170579433, + "rewards/rule_reward": 0.056640625, + "step": 715, + "token_diversity": 0.2922710414218566 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.41435185185185186, + "grad_norm": 2.7223048210144043, + "kl": 38.75, + "learning_rate": 9.20234780990821e-06, + "loss": 0.0387, + "reward": 0.03523960802704096, + "reward_std": 0.12435682490468025, + "rewards/ndcg_rule_reward": -0.025307269766926765, + "rewards/rule_reward": 0.060546875, + "step": 716, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.14453125, + "epoch": 0.4149305555555556, + "grad_norm": 2.833574056625366, + "kl": 41.625, + "learning_rate": 9.199806735049253e-06, + "loss": 0.0416, + "reward": 0.09249377995729446, + "reward_std": 0.17118311673402786, + "rewards/ndcg_rule_reward": -0.032506220042705536, + "rewards/rule_reward": 0.125, + "step": 717, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.607421875, + "epoch": 0.41550925925925924, + "grad_norm": 2.600550413131714, + "kl": 41.875, + "learning_rate": 9.197261971085441e-06, + "loss": 0.0419, + "reward": 0.08280140161514282, + "reward_std": 0.1437697410583496, + "rewards/ndcg_rule_reward": -0.024620477110147476, + "rewards/rule_reward": 0.107421875, + "step": 718, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.41608796296296297, + "grad_norm": 1.7597451210021973, + "kl": 25.125, + "learning_rate": 9.194713520252089e-06, + "loss": 0.0251, + "reward": 0.003964291187003255, + "reward_std": 0.12436969205737114, + "rewards/ndcg_rule_reward": -0.025332584977149963, + "rewards/rule_reward": 0.029296875, + "step": 719, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.4166666666666667, + "grad_norm": 2.321655511856079, + "kl": 36.875, + "learning_rate": 9.192161384787757e-06, + "loss": 0.037, + "reward": 0.02727325912564993, + "reward_std": 0.17289388179779053, + "rewards/ndcg_rule_reward": -0.033273616805672646, + "rewards/rule_reward": 0.060546875, + "step": 720, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.41724537037037035, + "grad_norm": 2.1324219703674316, + "kl": 30.25, + "learning_rate": 9.189605566934235e-06, + "loss": 0.0302, + "reward": 0.03502341965213418, + "reward_std": 0.11606483161449432, + "rewards/ndcg_rule_reward": -0.023570331744849682, + "rewards/rule_reward": 0.05859375, + "step": 721, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.19921875, + "epoch": 0.41782407407407407, + "grad_norm": 2.1975109577178955, + "kl": 65.875, + "learning_rate": 9.18704606893655e-06, + "loss": 0.0658, + "reward": 0.13040407001972198, + "reward_std": 0.17023011296987534, + "rewards/ndcg_rule_reward": -0.025845929980278015, + "rewards/rule_reward": 0.15625, + "step": 722, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4184027777777778, + "grad_norm": 2.4827208518981934, + "kl": 24.9375, + "learning_rate": 9.184482893042963e-06, + "loss": 0.0249, + "reward": 0.03478486044332385, + "reward_std": 0.11620408296585083, + "rewards/ndcg_rule_reward": -0.023808891884982586, + "rewards/rule_reward": 0.05859375, + "step": 723, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03125, + "epoch": 0.41898148148148145, + "grad_norm": 3.6052515506744385, + "kl": 20.625, + "learning_rate": 9.181916041504963e-06, + "loss": 0.0206, + "reward": 0.023790923412889242, + "reward_std": 0.13251357525587082, + "rewards/ndcg_rule_reward": -0.02503719925880432, + "rewards/rule_reward": 0.048828125, + "step": 724, + "token_diversity": 0.54296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4195601851851852, + "grad_norm": 2.651691198348999, + "kl": 27.6875, + "learning_rate": 9.179345516577271e-06, + "loss": 0.0277, + "reward": 0.03388810157775879, + "reward_std": 0.08292349800467491, + "rewards/ndcg_rule_reward": -0.01689314702525735, + "rewards/rule_reward": 0.05078125, + "step": 725, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.07421875, + "epoch": 0.4201388888888889, + "grad_norm": 2.392484664916992, + "kl": 41.875, + "learning_rate": 9.17677132051783e-06, + "loss": 0.042, + "reward": 0.0498765857773833, + "reward_std": 0.1087292730808258, + "rewards/ndcg_rule_reward": -0.020435916259884834, + "rewards/rule_reward": 0.0703125, + "step": 726, + "token_diversity": 0.37789163961038963 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.142578125, + "epoch": 0.4207175925925926, + "grad_norm": 2.3286185264587402, + "kl": 76.75, + "learning_rate": 9.174193455587814e-06, + "loss": 0.0765, + "reward": 0.09582061693072319, + "reward_std": 0.16138329356908798, + "rewards/ndcg_rule_reward": -0.027226263657212257, + "rewards/rule_reward": 0.123046875, + "step": 727, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.048828125, + "epoch": 0.4212962962962963, + "grad_norm": 3.167949676513672, + "kl": 40.75, + "learning_rate": 9.171611924051614e-06, + "loss": 0.0407, + "reward": 0.040749797597527504, + "reward_std": 0.14923537522554398, + "rewards/ndcg_rule_reward": -0.027609577402472496, + "rewards/rule_reward": 0.068359375, + "step": 728, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.421875, + "grad_norm": 2.4237587451934814, + "kl": 45.9375, + "learning_rate": 9.169026728176845e-06, + "loss": 0.0459, + "reward": 0.06298803165555, + "reward_std": 0.08311247825622559, + "rewards/ndcg_rule_reward": -0.015136968344449997, + "rewards/rule_reward": 0.078125, + "step": 729, + "token_diversity": 0.3984375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 4.994140625, + "epoch": 0.4224537037037037, + "grad_norm": 1.521089792251587, + "kl": 24.4375, + "learning_rate": 9.166437870234337e-06, + "loss": 0.0244, + "reward": 0.0035944220144301653, + "reward_std": 0.1161528043448925, + "rewards/ndcg_rule_reward": -0.02374932821840048, + "rewards/rule_reward": 0.02734375, + "step": 730, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.14453125, + "epoch": 0.4230324074074074, + "grad_norm": 2.0519745349884033, + "kl": 34.375, + "learning_rate": 9.163845352498141e-06, + "loss": 0.0344, + "reward": 0.09348005801439285, + "reward_std": 0.1415848210453987, + "rewards/ndcg_rule_reward": -0.025660565122961998, + "rewards/rule_reward": 0.119140625, + "step": 731, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.4236111111111111, + "grad_norm": 3.110650062561035, + "kl": 25.125, + "learning_rate": 9.161249177245524e-06, + "loss": 0.0252, + "reward": 0.03276698081754148, + "reward_std": 0.133182555437088, + "rewards/ndcg_rule_reward": -0.025826768949627876, + "rewards/rule_reward": 0.05859375, + "step": 732, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.13671875, + "epoch": 0.4241898148148148, + "grad_norm": 2.0835447311401367, + "kl": 34.75, + "learning_rate": 9.158649346756962e-06, + "loss": 0.0347, + "reward": 0.08775017783045769, + "reward_std": 0.11471054330468178, + "rewards/ndcg_rule_reward": -0.019671696238219738, + "rewards/rule_reward": 0.107421875, + "step": 733, + "token_diversity": 0.4050520833333333 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09765625, + "epoch": 0.42476851851851855, + "grad_norm": 2.0480997562408447, + "kl": 27.3125, + "learning_rate": 9.156045863316139e-06, + "loss": 0.0272, + "reward": 0.06373218004591763, + "reward_std": 0.11643737554550171, + "rewards/ndcg_rule_reward": -0.022205322049558163, + "rewards/rule_reward": 0.0859375, + "step": 734, + "token_diversity": 0.34669237012987014 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.201171875, + "epoch": 0.4253472222222222, + "grad_norm": 1.7535064220428467, + "kl": 21.125, + "learning_rate": 9.153438729209956e-06, + "loss": 0.0211, + "reward": 0.12665466219186783, + "reward_std": 0.11624051630496979, + "rewards/ndcg_rule_reward": -0.02178283128887415, + "rewards/rule_reward": 0.1484375, + "step": 735, + "token_diversity": 0.3912550403225806 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.42592592592592593, + "grad_norm": 8.178337097167969, + "kl": 47.4375, + "learning_rate": 9.150827946728514e-06, + "loss": 0.0475, + "reward": 0.03397458023391664, + "reward_std": 0.08290023729205132, + "rewards/ndcg_rule_reward": -0.016806667670607567, + "rewards/rule_reward": 0.05078125, + "step": 736, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.42650462962962965, + "grad_norm": 1.7766118049621582, + "kl": 25.5625, + "learning_rate": 9.148213518165121e-06, + "loss": 0.0256, + "reward": 0.0029325337382033467, + "reward_std": 0.10802771151065826, + "rewards/ndcg_rule_reward": -0.022458091378211975, + "rewards/rule_reward": 0.025390625, + "step": 737, + "token_diversity": 0.5390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.4270833333333333, + "grad_norm": 2.181314706802368, + "kl": 29.875, + "learning_rate": 9.145595445816289e-06, + "loss": 0.0299, + "reward": 0.03156466665677726, + "reward_std": 0.10844840481877327, + "rewards/ndcg_rule_reward": -0.021169708110392094, + "rewards/rule_reward": 0.052734375, + "step": 738, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.42766203703703703, + "grad_norm": 2.095871925354004, + "kl": 53.75, + "learning_rate": 9.142973731981727e-06, + "loss": 0.0537, + "reward": 0.06287880800664425, + "reward_std": 0.16654076427221298, + "rewards/ndcg_rule_reward": -0.03087119199335575, + "rewards/rule_reward": 0.09375, + "step": 739, + "token_diversity": 0.40625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.42824074074074076, + "grad_norm": 2.300292491912842, + "kl": 16.96875, + "learning_rate": 9.140348378964349e-06, + "loss": 0.0169, + "reward": 0.03335167083423585, + "reward_std": 0.08319147676229477, + "rewards/ndcg_rule_reward": -0.01742957765236497, + "rewards/rule_reward": 0.05078125, + "step": 740, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.4288194444444444, + "grad_norm": 1.7438085079193115, + "kl": 30.625, + "learning_rate": 9.137719389070259e-06, + "loss": 0.0306, + "reward": 0.06347411312162876, + "reward_std": 0.12494552507996559, + "rewards/ndcg_rule_reward": -0.024416510947048664, + "rewards/rule_reward": 0.087890625, + "step": 741, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0390625, + "epoch": 0.42939814814814814, + "grad_norm": 5.3754048347473145, + "kl": 49.875, + "learning_rate": 9.135086764608756e-06, + "loss": 0.0499, + "reward": 0.02847189037129283, + "reward_std": 0.1466580256819725, + "rewards/ndcg_rule_reward": -0.028168732300400734, + "rewards/rule_reward": 0.056640625, + "step": 742, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.001953125, + "epoch": 0.42997685185185186, + "grad_norm": 2.461094617843628, + "kl": 23.0, + "learning_rate": 9.132450507892339e-06, + "loss": 0.023, + "reward": 0.004348101560026407, + "reward_std": 0.13261358067393303, + "rewards/ndcg_rule_reward": -0.02690189890563488, + "rewards/rule_reward": 0.03125, + "step": 743, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.4305555555555556, + "grad_norm": 2.3856401443481445, + "kl": 31.75, + "learning_rate": 9.12981062123669e-06, + "loss": 0.0318, + "reward": 0.05920462682843208, + "reward_std": 0.14510203897953033, + "rewards/ndcg_rule_reward": -0.02673287410289049, + "rewards/rule_reward": 0.0859375, + "step": 744, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.43113425925925924, + "grad_norm": 1.9609482288360596, + "kl": 52.5, + "learning_rate": 9.127167106960682e-06, + "loss": 0.0525, + "reward": 0.061940996907651424, + "reward_std": 0.12774435058236122, + "rewards/ndcg_rule_reward": -0.0239965058863163, + "rewards/rule_reward": 0.0859375, + "step": 745, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.15234375, + "epoch": 0.43171296296296297, + "grad_norm": 1.8519879579544067, + "kl": 28.6875, + "learning_rate": 9.124519967386374e-06, + "loss": 0.0287, + "reward": 0.09617509692907333, + "reward_std": 0.08303020521998405, + "rewards/ndcg_rule_reward": -0.01710614375770092, + "rewards/rule_reward": 0.11328125, + "step": 746, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.4322916666666667, + "grad_norm": 3.719388961791992, + "kl": 39.625, + "learning_rate": 9.121869204839011e-06, + "loss": 0.0397, + "reward": 0.03355890337843448, + "reward_std": 0.14119598269462585, + "rewards/ndcg_rule_reward": -0.026987971737980843, + "rewards/rule_reward": 0.060546875, + "step": 747, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.43287037037037035, + "grad_norm": 2.299987554550171, + "kl": 19.09375, + "learning_rate": 9.11921482164702e-06, + "loss": 0.0191, + "reward": 0.02496818988583982, + "reward_std": 0.09823457151651382, + "rewards/ndcg_rule_reward": -0.018000562209635973, + "rewards/rule_reward": 0.04296875, + "step": 748, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.15625, + "epoch": 0.43344907407407407, + "grad_norm": 1.9344444274902344, + "kl": 27.5, + "learning_rate": 9.116556820142007e-06, + "loss": 0.0274, + "reward": 0.10152571648359299, + "reward_std": 0.12557445839047432, + "rewards/ndcg_rule_reward": -0.01761490758508444, + "rewards/rule_reward": 0.119140625, + "step": 749, + "token_diversity": 0.34369791666666666 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4340277777777778, + "grad_norm": 4.004100322723389, + "kl": 19.75, + "learning_rate": 9.113895202658757e-06, + "loss": 0.0198, + "reward": 0.034577333368360996, + "reward_std": 0.116267129778862, + "rewards/ndcg_rule_reward": -0.02401641756296158, + "rewards/rule_reward": 0.05859375, + "step": 750, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.822265625, + "epoch": 0.43460648148148145, + "grad_norm": 2.4787304401397705, + "kl": 28.875, + "learning_rate": 9.111229971535231e-06, + "loss": 0.0289, + "reward": 0.00257866526953876, + "reward_std": 0.07454708218574524, + "rewards/ndcg_rule_reward": -0.01499946042895317, + "rewards/rule_reward": 0.017578125, + "step": 751, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.4351851851851852, + "grad_norm": 1.8925715684890747, + "kl": 31.1875, + "learning_rate": 9.108561129112566e-06, + "loss": 0.0311, + "reward": 0.004154377616941929, + "reward_std": 0.12427862733602524, + "rewards/ndcg_rule_reward": -0.025142496451735497, + "rewards/rule_reward": 0.029296875, + "step": 752, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4357638888888889, + "grad_norm": 1.7521052360534668, + "kl": 30.15625, + "learning_rate": 9.105888677735069e-06, + "loss": 0.0301, + "reward": 0.03383369208313525, + "reward_std": 0.09140601754188538, + "rewards/ndcg_rule_reward": -0.018900682218372822, + "rewards/rule_reward": 0.052734375, + "step": 753, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4363425925925926, + "grad_norm": 2.0314860343933105, + "kl": 19.5625, + "learning_rate": 9.103212619750217e-06, + "loss": 0.0195, + "reward": 0.03444295737426728, + "reward_std": 0.09951380640268326, + "rewards/ndcg_rule_reward": -0.020244544371962547, + "rewards/rule_reward": 0.0546875, + "step": 754, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.4369212962962963, + "grad_norm": 2.5850162506103516, + "kl": 18.1875, + "learning_rate": 9.100532957508657e-06, + "loss": 0.0182, + "reward": 0.03261018544435501, + "reward_std": 0.13322902843356133, + "rewards/ndcg_rule_reward": -0.02598356455564499, + "rewards/rule_reward": 0.05859375, + "step": 755, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.19921875, + "epoch": 0.4375, + "grad_norm": 2.3379554748535156, + "kl": 25.1875, + "learning_rate": 9.0978496933642e-06, + "loss": 0.0253, + "reward": 0.12512602657079697, + "reward_std": 0.0748942457139492, + "rewards/ndcg_rule_reward": -0.01354584563523531, + "rewards/rule_reward": 0.138671875, + "step": 756, + "token_diversity": 0.30823863636363635 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.236328125, + "epoch": 0.4380787037037037, + "grad_norm": 3.22703218460083, + "kl": 43.25, + "learning_rate": 9.095162829673824e-06, + "loss": 0.0433, + "reward": 0.03405248571652919, + "reward_std": 0.09966876357793808, + "rewards/ndcg_rule_reward": -0.020635013468563557, + "rewards/rule_reward": 0.0546875, + "step": 757, + "token_diversity": 0.32017131024096385 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0546875, + "epoch": 0.4386574074074074, + "grad_norm": 2.955162525177002, + "kl": 35.0625, + "learning_rate": 9.092472368797662e-06, + "loss": 0.035, + "reward": 0.0387727078050375, + "reward_std": 0.12129965797066689, + "rewards/ndcg_rule_reward": -0.023727286607027054, + "rewards/rule_reward": 0.0625, + "step": 758, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.048828125, + "epoch": 0.4392361111111111, + "grad_norm": 2.095766305923462, + "kl": 51.375, + "learning_rate": 9.089778313099018e-06, + "loss": 0.0512, + "reward": 0.03734349994920194, + "reward_std": 0.14245515316724777, + "rewards/ndcg_rule_reward": -0.025156499817967415, + "rewards/rule_reward": 0.0625, + "step": 759, + "token_diversity": 0.43666507633587787 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.328125, + "epoch": 0.4398148148148148, + "grad_norm": 2.5411083698272705, + "kl": 29.3125, + "learning_rate": 9.087080664944342e-06, + "loss": 0.0293, + "reward": 0.059133823961019516, + "reward_std": 0.11098276823759079, + "rewards/ndcg_rule_reward": -0.018991176038980484, + "rewards/rule_reward": 0.078125, + "step": 760, + "token_diversity": 0.37582236842105265 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.150390625, + "epoch": 0.44039351851851855, + "grad_norm": 3.2106635570526123, + "kl": 23.4375, + "learning_rate": 9.084379426703245e-06, + "loss": 0.0234, + "reward": 0.09417941537685692, + "reward_std": 0.09159445017576218, + "rewards/ndcg_rule_reward": -0.017148712649941444, + "rewards/rule_reward": 0.111328125, + "step": 761, + "token_diversity": 0.38175403225806454 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.119140625, + "epoch": 0.4409722222222222, + "grad_norm": 242.9435577392578, + "kl": 773.75, + "learning_rate": 9.081674600748493e-06, + "loss": 0.7758, + "reward": 0.08113081753253937, + "reward_std": 0.14139487594366074, + "rewards/ndcg_rule_reward": -0.024337933398783207, + "rewards/rule_reward": 0.10546875, + "step": 762, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.05078125, + "epoch": 0.44155092592592593, + "grad_norm": 8.776951789855957, + "kl": 36.375, + "learning_rate": 9.078966189456e-06, + "loss": 0.0362, + "reward": 0.034933219430968165, + "reward_std": 0.12448546290397644, + "rewards/ndcg_rule_reward": -0.025613654404878616, + "rewards/rule_reward": 0.060546875, + "step": 763, + "token_diversity": 0.37200689935064934 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.44212962962962965, + "grad_norm": 2.561373472213745, + "kl": 29.625, + "learning_rate": 9.076254195204835e-06, + "loss": 0.0296, + "reward": 0.034742638701573014, + "reward_std": 0.10778345912694931, + "rewards/ndcg_rule_reward": -0.021897981874644756, + "rewards/rule_reward": 0.056640625, + "step": 764, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.193359375, + "epoch": 0.4427083333333333, + "grad_norm": 2.6652050018310547, + "kl": 42.125, + "learning_rate": 9.073538620377203e-06, + "loss": 0.0421, + "reward": 0.12037501484155655, + "reward_std": 0.09520300105214119, + "rewards/ndcg_rule_reward": -0.016343730501830578, + "rewards/rule_reward": 0.13671875, + "step": 765, + "token_diversity": 0.40234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.44328703703703703, + "grad_norm": 2.021252393722534, + "kl": 14.5, + "learning_rate": 9.070819467358469e-06, + "loss": 0.0145, + "reward": 0.033056239248253405, + "reward_std": 0.09174839034676552, + "rewards/ndcg_rule_reward": -0.01967813540250063, + "rewards/rule_reward": 0.052734375, + "step": 766, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.44386574074074076, + "grad_norm": 2.1661365032196045, + "kl": 33.6875, + "learning_rate": 9.068096738537128e-06, + "loss": 0.0336, + "reward": 0.05890633910894394, + "reward_std": 0.11338307708501816, + "rewards/ndcg_rule_reward": -0.021171782165765762, + "rewards/rule_reward": 0.080078125, + "step": 767, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4444444444444444, + "grad_norm": 1.2411118745803833, + "kl": 15.53125, + "learning_rate": 9.065370436304826e-06, + "loss": 0.0155, + "reward": 0.03281316690845415, + "reward_std": 0.049764095805585384, + "rewards/ndcg_rule_reward": -0.010155581287108362, + "rewards/rule_reward": 0.04296875, + "step": 768, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.44502314814814814, + "grad_norm": 2.398341178894043, + "kl": 29.625, + "learning_rate": 9.062640563056339e-06, + "loss": 0.0296, + "reward": 0.03525810455903411, + "reward_std": 0.12435204535722733, + "rewards/ndcg_rule_reward": -0.025288766250014305, + "rewards/rule_reward": 0.060546875, + "step": 769, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.44560185185185186, + "grad_norm": 1.958714246749878, + "kl": 26.1875, + "learning_rate": 9.059907121189588e-06, + "loss": 0.0262, + "reward": 0.03087929799221456, + "reward_std": 0.13608784601092339, + "rewards/ndcg_rule_reward": -0.025761328637599945, + "rewards/rule_reward": 0.056640625, + "step": 770, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.4461805555555556, + "grad_norm": 6.071366786956787, + "kl": 29.3125, + "learning_rate": 9.057170113105622e-06, + "loss": 0.0293, + "reward": 0.061661697924137115, + "reward_std": 0.1334657222032547, + "rewards/ndcg_rule_reward": -0.02427580300718546, + "rewards/rule_reward": 0.0859375, + "step": 771, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.44675925925925924, + "grad_norm": 3.1321635246276855, + "kl": 28.9375, + "learning_rate": 9.054429541208627e-06, + "loss": 0.0289, + "reward": 0.05916150473058224, + "reward_std": 0.11140165477991104, + "rewards/ndcg_rule_reward": -0.018963496200740337, + "rewards/rule_reward": 0.078125, + "step": 772, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.08203125, + "epoch": 0.44733796296296297, + "grad_norm": 2.897353410720825, + "kl": 31.84375, + "learning_rate": 9.051685407905916e-06, + "loss": 0.0319, + "reward": 0.05452421493828297, + "reward_std": 0.10747864097356796, + "rewards/ndcg_rule_reward": -0.019694535993039608, + "rewards/rule_reward": 0.07421875, + "step": 773, + "token_diversity": 0.35384537337662336 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4479166666666667, + "grad_norm": 2.615777015686035, + "kl": 36.375, + "learning_rate": 9.048937715607937e-06, + "loss": 0.0365, + "reward": 0.036102262791246176, + "reward_std": 0.15765098482370377, + "rewards/ndcg_rule_reward": -0.032257113605737686, + "rewards/rule_reward": 0.068359375, + "step": 774, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.14453125, + "epoch": 0.44849537037037035, + "grad_norm": 2.575727939605713, + "kl": 23.3125, + "learning_rate": 9.046186466728252e-06, + "loss": 0.0233, + "reward": 0.0922296866774559, + "reward_std": 0.11134485900402069, + "rewards/ndcg_rule_reward": -0.021051553543657064, + "rewards/rule_reward": 0.11328125, + "step": 775, + "token_diversity": 0.37012987012987014 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.287109375, + "epoch": 0.44907407407407407, + "grad_norm": 2.7171831130981445, + "kl": 33.0, + "learning_rate": 9.043431663683562e-06, + "loss": 0.0331, + "reward": 0.0658171798568219, + "reward_std": 0.13312383741140366, + "rewards/ndcg_rule_reward": -0.02793281991034746, + "rewards/rule_reward": 0.09375, + "step": 776, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1796875, + "epoch": 0.4496527777777778, + "grad_norm": 2.7017226219177246, + "kl": 38.0, + "learning_rate": 9.040673308893677e-06, + "loss": 0.038, + "reward": 0.11572858691215515, + "reward_std": 0.1316988468170166, + "rewards/ndcg_rule_reward": -0.020990164019167423, + "rewards/rule_reward": 0.13671875, + "step": 777, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.45023148148148145, + "grad_norm": 8.795379638671875, + "kl": 36.625, + "learning_rate": 9.037911404781533e-06, + "loss": 0.0366, + "reward": 0.060702499002218246, + "reward_std": 0.10309355333447456, + "rewards/ndcg_rule_reward": -0.019375626929104328, + "rewards/rule_reward": 0.080078125, + "step": 778, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.4508101851851852, + "grad_norm": 103.81069946289062, + "kl": 290.0, + "learning_rate": 9.035145953773187e-06, + "loss": 0.2902, + "reward": 0.06472730217501521, + "reward_std": 0.14962351322174072, + "rewards/ndcg_rule_reward": -0.0290226973593235, + "rewards/rule_reward": 0.09375, + "step": 779, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.14453125, + "epoch": 0.4513888888888889, + "grad_norm": 1.6299078464508057, + "kl": 32.8125, + "learning_rate": 9.032376958297804e-06, + "loss": 0.0327, + "reward": 0.09176024235785007, + "reward_std": 0.09189708158373833, + "rewards/ndcg_rule_reward": -0.015661634504795074, + "rewards/rule_reward": 0.107421875, + "step": 780, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1484375, + "epoch": 0.4519675925925926, + "grad_norm": 2.452291965484619, + "kl": 26.5, + "learning_rate": 9.029604420787666e-06, + "loss": 0.0265, + "reward": 0.09447570890188217, + "reward_std": 0.10826204717159271, + "rewards/ndcg_rule_reward": -0.02075866423547268, + "rewards/rule_reward": 0.115234375, + "step": 781, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.4525462962962963, + "grad_norm": 2.3213493824005127, + "kl": 31.65625, + "learning_rate": 9.026828343678166e-06, + "loss": 0.0317, + "reward": 0.0658830888569355, + "reward_std": 0.1162516251206398, + "rewards/ndcg_rule_reward": -0.023960668593645096, + "rewards/rule_reward": 0.08984375, + "step": 782, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.453125, + "grad_norm": 1.6824570894241333, + "kl": 21.71875, + "learning_rate": 9.024048729407811e-06, + "loss": 0.0217, + "reward": 0.003748458344489336, + "reward_std": 0.12450544908642769, + "rewards/ndcg_rule_reward": -0.02554841712117195, + "rewards/rule_reward": 0.029296875, + "step": 783, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.140625, + "epoch": 0.4537037037037037, + "grad_norm": 2.8716378211975098, + "kl": 77.0, + "learning_rate": 9.021265580418206e-06, + "loss": 0.0772, + "reward": 0.09106715023517609, + "reward_std": 0.12795889005064964, + "rewards/ndcg_rule_reward": -0.022214100696146488, + "rewards/rule_reward": 0.11328125, + "step": 784, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4542824074074074, + "grad_norm": 4.648709774017334, + "kl": 31.5625, + "learning_rate": 9.018478899154068e-06, + "loss": 0.0315, + "reward": 0.034294128650799394, + "reward_std": 0.09956727176904678, + "rewards/ndcg_rule_reward": -0.0203933734446764, + "rewards/rule_reward": 0.0546875, + "step": 785, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0390625, + "epoch": 0.4548611111111111, + "grad_norm": 1.6906728744506836, + "kl": 22.5, + "learning_rate": 9.015688688063211e-06, + "loss": 0.0225, + "reward": 0.02670531114563346, + "reward_std": 0.0717284046113491, + "rewards/ndcg_rule_reward": -0.012357189320027828, + "rewards/rule_reward": 0.0390625, + "step": 786, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1796875, + "epoch": 0.4554398148148148, + "grad_norm": 10.422670364379883, + "kl": 65.25, + "learning_rate": 9.012894949596558e-06, + "loss": 0.0653, + "reward": 0.11550496495328844, + "reward_std": 0.15701831877231598, + "rewards/ndcg_rule_reward": -0.027073164470493793, + "rewards/rule_reward": 0.142578125, + "step": 787, + "token_diversity": 0.27059659090909094 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.15234375, + "epoch": 0.45601851851851855, + "grad_norm": 1.9147040843963623, + "kl": 22.5, + "learning_rate": 9.01009768620812e-06, + "loss": 0.0225, + "reward": 0.09759648889303207, + "reward_std": 0.12443781271576881, + "rewards/ndcg_rule_reward": -0.02545038517564535, + "rewards/rule_reward": 0.123046875, + "step": 788, + "token_diversity": 0.35318587662337664 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.4565972222222222, + "grad_norm": 1.8728612661361694, + "kl": 24.15625, + "learning_rate": 9.007296900355013e-06, + "loss": 0.0242, + "reward": 0.03417639737017453, + "reward_std": 0.09123295173048973, + "rewards/ndcg_rule_reward": -0.018557977862656116, + "rewards/rule_reward": 0.052734375, + "step": 789, + "token_diversity": 0.37337662337662336 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.45717592592592593, + "grad_norm": 2.550112009048462, + "kl": 35.125, + "learning_rate": 9.004492594497443e-06, + "loss": 0.0351, + "reward": 0.03450432140380144, + "reward_std": 0.10789458826184273, + "rewards/ndcg_rule_reward": -0.022136306390166283, + "rewards/rule_reward": 0.056640625, + "step": 790, + "token_diversity": 0.28875811688311687 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1640625, + "epoch": 0.45775462962962965, + "grad_norm": 2.183406114578247, + "kl": 37.0, + "learning_rate": 9.001684771098709e-06, + "loss": 0.0369, + "reward": 0.10770512744784355, + "reward_std": 0.1395002156496048, + "rewards/ndcg_rule_reward": -0.023154246620833874, + "rewards/rule_reward": 0.130859375, + "step": 791, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.07421875, + "epoch": 0.4583333333333333, + "grad_norm": 5.73401403427124, + "kl": 49.4375, + "learning_rate": 8.998873432625197e-06, + "loss": 0.0493, + "reward": 0.05054328590631485, + "reward_std": 0.12529174610972404, + "rewards/ndcg_rule_reward": -0.0236754659563303, + "rewards/rule_reward": 0.07421875, + "step": 792, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.45891203703703703, + "grad_norm": 2.4695756435394287, + "kl": 22.3125, + "learning_rate": 8.996058581546386e-06, + "loss": 0.0223, + "reward": 0.038211017614230514, + "reward_std": 0.16065318882465363, + "rewards/ndcg_rule_reward": -0.03210148215293884, + "rewards/rule_reward": 0.0703125, + "step": 793, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.12890625, + "epoch": 0.45949074074074076, + "grad_norm": 2.464505910873413, + "kl": 31.625, + "learning_rate": 8.993240220334837e-06, + "loss": 0.0316, + "reward": 0.08452140027657151, + "reward_std": 0.1435280777513981, + "rewards/ndcg_rule_reward": -0.024853598326444626, + "rewards/rule_reward": 0.109375, + "step": 794, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.078125, + "epoch": 0.4600694444444444, + "grad_norm": 2.390979528427124, + "kl": 36.375, + "learning_rate": 8.990418351466193e-06, + "loss": 0.0364, + "reward": 0.05386140011250973, + "reward_std": 0.15801989287137985, + "rewards/ndcg_rule_reward": -0.030122973956167698, + "rewards/rule_reward": 0.083984375, + "step": 795, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.09375, + "epoch": 0.46064814814814814, + "grad_norm": 2.5036532878875732, + "kl": 19.5, + "learning_rate": 8.987592977419183e-06, + "loss": 0.0195, + "reward": 0.06079377233982086, + "reward_std": 0.094612717628479, + "rewards/ndcg_rule_reward": -0.01733122393488884, + "rewards/rule_reward": 0.078125, + "step": 796, + "token_diversity": 0.3766666666666667 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.46122685185185186, + "grad_norm": 1.439043402671814, + "kl": 21.3125, + "learning_rate": 8.98476410067561e-06, + "loss": 0.0213, + "reward": 0.031061848159879446, + "reward_std": 0.07505642250180244, + "rewards/ndcg_rule_reward": -0.013860028237104416, + "rewards/rule_reward": 0.044921875, + "step": 797, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.4618055555555556, + "grad_norm": 2.2577009201049805, + "kl": 40.875, + "learning_rate": 8.981931723720357e-06, + "loss": 0.0409, + "reward": 0.06632483936846256, + "reward_std": 0.15794534981250763, + "rewards/ndcg_rule_reward": -0.029378284700214863, + "rewards/rule_reward": 0.095703125, + "step": 798, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.19140625, + "epoch": 0.46238425925925924, + "grad_norm": 2.454644203186035, + "kl": 61.625, + "learning_rate": 8.979095849041377e-06, + "loss": 0.0617, + "reward": 0.12171208020299673, + "reward_std": 0.11980932578444481, + "rewards/ndcg_rule_reward": -0.02086604479700327, + "rewards/rule_reward": 0.142578125, + "step": 799, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.16796875, + "epoch": 0.46296296296296297, + "grad_norm": 2.553788661956787, + "kl": 39.3125, + "learning_rate": 8.976256479129703e-06, + "loss": 0.0393, + "reward": 0.11023608816321939, + "reward_std": 0.13378160446882248, + "rewards/ndcg_rule_reward": -0.022576412186026573, + "rewards/rule_reward": 0.1328125, + "step": 800, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.4635416666666667, + "grad_norm": 3.324281692504883, + "kl": 59.25, + "learning_rate": 8.973413616479429e-06, + "loss": 0.0593, + "reward": 0.06724102422595024, + "reward_std": 0.14925040304660797, + "rewards/ndcg_rule_reward": -0.030415222980082035, + "rewards/rule_reward": 0.09765625, + "step": 801, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.181640625, + "epoch": 0.46412037037037035, + "grad_norm": 2.329824686050415, + "kl": 46.4375, + "learning_rate": 8.970567263587729e-06, + "loss": 0.0464, + "reward": 0.11575455218553543, + "reward_std": 0.1401124782860279, + "rewards/ndcg_rule_reward": -0.022917321883141994, + "rewards/rule_reward": 0.138671875, + "step": 802, + "token_diversity": 0.3054939516129032 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.46469907407407407, + "grad_norm": 1.7119364738464355, + "kl": 31.5625, + "learning_rate": 8.967717422954828e-06, + "loss": 0.0315, + "reward": 0.06529762223362923, + "reward_std": 0.08284568414092064, + "rewards/ndcg_rule_reward": -0.016733622644096613, + "rewards/rule_reward": 0.08203125, + "step": 803, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.193359375, + "epoch": 0.4652777777777778, + "grad_norm": 2.083611488342285, + "kl": 59.5, + "learning_rate": 8.964864097084029e-06, + "loss": 0.0595, + "reward": 0.12218006327748299, + "reward_std": 0.13644016534090042, + "rewards/ndcg_rule_reward": -0.024304297752678394, + "rewards/rule_reward": 0.146484375, + "step": 804, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.46585648148148145, + "grad_norm": 1.8572242259979248, + "kl": 28.78125, + "learning_rate": 8.962007288481684e-06, + "loss": 0.0287, + "reward": 0.0031117245089262724, + "reward_std": 0.11636938527226448, + "rewards/ndcg_rule_reward": -0.02423202432692051, + "rewards/rule_reward": 0.02734375, + "step": 805, + "token_diversity": 0.5390625 + }, + { + "categorical_diversity": 0.75, + "completion_length": 5.1953125, + "epoch": 0.4664351851851852, + "grad_norm": 2.9933204650878906, + "kl": 30.5, + "learning_rate": 8.959146999657215e-06, + "loss": 0.0305, + "reward": 0.12218769639730453, + "reward_std": 0.07491195574402809, + "rewards/ndcg_rule_reward": -0.012577927205711603, + "rewards/rule_reward": 0.134765625, + "step": 806, + "token_diversity": 0.22995391705069124 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.4670138888888889, + "grad_norm": 1.9553673267364502, + "kl": 20.0625, + "learning_rate": 8.956283233123092e-06, + "loss": 0.02, + "reward": 0.00400664855260402, + "reward_std": 0.12438986822962761, + "rewards/ndcg_rule_reward": -0.025290227495133877, + "rewards/rule_reward": 0.029296875, + "step": 807, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.4675925925925926, + "grad_norm": 1.939219355583191, + "kl": 14.90625, + "learning_rate": 8.953415991394847e-06, + "loss": 0.0149, + "reward": 0.03398612973978743, + "reward_std": 0.09132706001400948, + "rewards/ndcg_rule_reward": -0.01874824333935976, + "rewards/rule_reward": 0.052734375, + "step": 808, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 0.75, + "completion_length": 5.193359375, + "epoch": 0.4681712962962963, + "grad_norm": 1.88022780418396, + "kl": 27.75, + "learning_rate": 8.95054527699106e-06, + "loss": 0.0278, + "reward": 0.1221294142305851, + "reward_std": 0.1214267909526825, + "rewards/ndcg_rule_reward": -0.02240182925015688, + "rewards/rule_reward": 0.14453125, + "step": 809, + "token_diversity": 0.2394637620444072 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.07421875, + "epoch": 0.46875, + "grad_norm": 2.0087437629699707, + "kl": 38.875, + "learning_rate": 8.947671092433363e-06, + "loss": 0.0388, + "reward": 0.05774922529235482, + "reward_std": 0.156358040869236, + "rewards/ndcg_rule_reward": -0.026235152035951614, + "rewards/rule_reward": 0.083984375, + "step": 810, + "token_diversity": 0.3693882042253521 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.4693287037037037, + "grad_norm": 1.9355723857879639, + "kl": 19.03125, + "learning_rate": 8.944793440246435e-06, + "loss": 0.0191, + "reward": 0.06542038172483444, + "reward_std": 0.0996398776769638, + "rewards/ndcg_rule_reward": -0.02051711641252041, + "rewards/rule_reward": 0.0859375, + "step": 811, + "token_diversity": 0.37667410714285715 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.4699074074074074, + "grad_norm": 2.203500986099243, + "kl": 39.375, + "learning_rate": 8.941912322958004e-06, + "loss": 0.0394, + "reward": 0.03298390633426607, + "reward_std": 0.13304618746042252, + "rewards/ndcg_rule_reward": -0.02560984343290329, + "rewards/rule_reward": 0.05859375, + "step": 812, + "token_diversity": 0.40625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.10546875, + "epoch": 0.4704861111111111, + "grad_norm": 1.8018699884414673, + "kl": 35.375, + "learning_rate": 8.939027743098838e-06, + "loss": 0.0354, + "reward": 0.06548691540956497, + "reward_std": 0.09957951307296753, + "rewards/ndcg_rule_reward": -0.02045058272778988, + "rewards/rule_reward": 0.0859375, + "step": 813, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05859375, + "epoch": 0.4710648148148148, + "grad_norm": 1.9764803647994995, + "kl": 29.3125, + "learning_rate": 8.936139703202749e-06, + "loss": 0.0293, + "reward": 0.040395828895270824, + "reward_std": 0.10610474646091461, + "rewards/ndcg_rule_reward": -0.0201510451734066, + "rewards/rule_reward": 0.060546875, + "step": 814, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.47164351851851855, + "grad_norm": 5.577699661254883, + "kl": 80.375, + "learning_rate": 8.933248205806588e-06, + "loss": 0.0804, + "reward": 0.05869780096691102, + "reward_std": 0.10505044460296631, + "rewards/ndcg_rule_reward": -0.01942720077931881, + "rewards/rule_reward": 0.078125, + "step": 815, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1484375, + "epoch": 0.4722222222222222, + "grad_norm": 1.8711254596710205, + "kl": 57.75, + "learning_rate": 8.930353253450242e-06, + "loss": 0.0577, + "reward": 0.0938970297574997, + "reward_std": 0.14425884187221527, + "rewards/ndcg_rule_reward": -0.027196713723242283, + "rewards/rule_reward": 0.12109375, + "step": 816, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.158203125, + "epoch": 0.47280092592592593, + "grad_norm": 4.66630220413208, + "kl": 30.5, + "learning_rate": 8.927454848676633e-06, + "loss": 0.0305, + "reward": 0.09650165354833007, + "reward_std": 0.0913153737783432, + "rewards/ndcg_rule_reward": -0.018732723779976368, + "rewards/rule_reward": 0.115234375, + "step": 817, + "token_diversity": 0.34201108870967745 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.47337962962962965, + "grad_norm": 2.2973520755767822, + "kl": 25.0, + "learning_rate": 8.924552994031717e-06, + "loss": 0.025, + "reward": 0.06223387457430363, + "reward_std": 0.08351271227002144, + "rewards/ndcg_rule_reward": -0.015891124960035086, + "rewards/rule_reward": 0.078125, + "step": 818, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05859375, + "epoch": 0.4739583333333333, + "grad_norm": 3.0857887268066406, + "kl": 62.0625, + "learning_rate": 8.92164769206448e-06, + "loss": 0.0621, + "reward": 0.04279433935880661, + "reward_std": 0.15342846512794495, + "rewards/ndcg_rule_reward": -0.027518163435161114, + "rewards/rule_reward": 0.0703125, + "step": 819, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.47453703703703703, + "grad_norm": 2.2928426265716553, + "kl": 35.875, + "learning_rate": 8.918738945326932e-06, + "loss": 0.0358, + "reward": 0.003449977491982281, + "reward_std": 0.11622869595885277, + "rewards/ndcg_rule_reward": -0.023893771693110466, + "rewards/rule_reward": 0.02734375, + "step": 820, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1640625, + "epoch": 0.47511574074074076, + "grad_norm": 2.65854811668396, + "kl": 28.3125, + "learning_rate": 8.915826756374118e-06, + "loss": 0.0283, + "reward": 0.10864568687975407, + "reward_std": 0.14232761785387993, + "rewards/ndcg_rule_reward": -0.02416681218892336, + "rewards/rule_reward": 0.1328125, + "step": 821, + "token_diversity": 0.4227807971014493 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.15234375, + "epoch": 0.4756944444444444, + "grad_norm": 2.528040885925293, + "kl": 19.90625, + "learning_rate": 8.912911127764097e-06, + "loss": 0.0199, + "reward": 0.09686580405104905, + "reward_std": 0.13321565091609955, + "rewards/ndcg_rule_reward": -0.028134193271398544, + "rewards/rule_reward": 0.125, + "step": 822, + "token_diversity": 0.40132913961038963 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.06640625, + "epoch": 0.47627314814814814, + "grad_norm": 2.1836676597595215, + "kl": 28.5625, + "learning_rate": 8.909992062057953e-06, + "loss": 0.0285, + "reward": 0.04698025435209274, + "reward_std": 0.12528583407402039, + "rewards/ndcg_rule_reward": -0.021379122510552406, + "rewards/rule_reward": 0.068359375, + "step": 823, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.14453125, + "epoch": 0.47685185185185186, + "grad_norm": 2.4627652168273926, + "kl": 16.78125, + "learning_rate": 8.907069561819793e-06, + "loss": 0.0168, + "reward": 0.09314800426363945, + "reward_std": 0.13618815317749977, + "rewards/ndcg_rule_reward": -0.025992628186941147, + "rewards/rule_reward": 0.119140625, + "step": 824, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.140625, + "epoch": 0.4774305555555556, + "grad_norm": 1.7860114574432373, + "kl": 34.375, + "learning_rate": 8.904143629616735e-06, + "loss": 0.0344, + "reward": 0.09102184837684035, + "reward_std": 0.12138354033231735, + "rewards/ndcg_rule_reward": -0.022259398363530636, + "rewards/rule_reward": 0.11328125, + "step": 825, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.47800925925925924, + "grad_norm": 1.9406418800354004, + "kl": 34.1875, + "learning_rate": 8.901214268018913e-06, + "loss": 0.0341, + "reward": 0.03570503764785826, + "reward_std": 0.14101215824484825, + "rewards/ndcg_rule_reward": -0.02874808758497238, + "rewards/rule_reward": 0.064453125, + "step": 826, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03125, + "epoch": 0.47858796296296297, + "grad_norm": 1.505709171295166, + "kl": 26.0625, + "learning_rate": 8.898281479599478e-06, + "loss": 0.026, + "reward": 0.022632361738942564, + "reward_std": 0.07412083074450493, + "rewards/ndcg_rule_reward": -0.012523890007287264, + "rewards/rule_reward": 0.03515625, + "step": 827, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.11328125, + "epoch": 0.4791666666666667, + "grad_norm": 10.672271728515625, + "kl": 72.4375, + "learning_rate": 8.895345266934584e-06, + "loss": 0.0724, + "reward": 0.07561642304062843, + "reward_std": 0.12015276402235031, + "rewards/ndcg_rule_reward": -0.020086701028048992, + "rewards/rule_reward": 0.095703125, + "step": 828, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.25390625, + "epoch": 0.47974537037037035, + "grad_norm": 3.211651086807251, + "kl": 29.375, + "learning_rate": 8.892405632603397e-06, + "loss": 0.0294, + "reward": 0.1653212532401085, + "reward_std": 0.16105425357818604, + "rewards/ndcg_rule_reward": -0.02608499303460121, + "rewards/rule_reward": 0.19140625, + "step": 829, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.48032407407407407, + "grad_norm": 2.1879026889801025, + "kl": 44.75, + "learning_rate": 8.889462579188093e-06, + "loss": 0.0447, + "reward": 0.003686090698465705, + "reward_std": 0.12454346939921379, + "rewards/ndcg_rule_reward": -0.025610785000026226, + "rewards/rule_reward": 0.029296875, + "step": 830, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.4809027777777778, + "grad_norm": 1.719310998916626, + "kl": 46.625, + "learning_rate": 8.88651610927384e-06, + "loss": 0.0467, + "reward": 0.03265139437280595, + "reward_std": 0.11638610064983368, + "rewards/ndcg_rule_reward": -0.022036105394363403, + "rewards/rule_reward": 0.0546875, + "step": 831, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.48148148148148145, + "grad_norm": 2.0606231689453125, + "kl": 29.0625, + "learning_rate": 8.883566225448816e-06, + "loss": 0.0291, + "reward": 0.06330214557237923, + "reward_std": 0.09137892350554466, + "rewards/ndcg_rule_reward": -0.016775980591773987, + "rewards/rule_reward": 0.080078125, + "step": 832, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.4820601851851852, + "grad_norm": 2.1343774795532227, + "kl": 51.5, + "learning_rate": 8.880612930304196e-06, + "loss": 0.0515, + "reward": 0.0327128671342507, + "reward_std": 0.13315899670124054, + "rewards/ndcg_rule_reward": -0.025880882516503334, + "rewards/rule_reward": 0.05859375, + "step": 833, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 0.9375, + "completion_length": 5.07421875, + "epoch": 0.4826388888888889, + "grad_norm": 2.8074498176574707, + "kl": 59.5, + "learning_rate": 8.877656226434148e-06, + "loss": 0.0595, + "reward": 0.05069036071654409, + "reward_std": 0.136733278632164, + "rewards/ndcg_rule_reward": -0.023528387770056725, + "rewards/rule_reward": 0.07421875, + "step": 834, + "token_diversity": 0.3688998287671233 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.4832175925925926, + "grad_norm": 3.765246629714966, + "kl": 36.875, + "learning_rate": 8.87469611643584e-06, + "loss": 0.0369, + "reward": 0.034035762306302786, + "reward_std": 0.09127507358789444, + "rewards/ndcg_rule_reward": -0.01869861362501979, + "rewards/rule_reward": 0.052734375, + "step": 835, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.4837962962962963, + "grad_norm": 1.360501766204834, + "kl": 39.125, + "learning_rate": 8.871732602909427e-06, + "loss": 0.0391, + "reward": 0.06624276237562299, + "reward_std": 0.10764491558074951, + "rewards/ndcg_rule_reward": -0.021647854708135128, + "rewards/rule_reward": 0.087890625, + "step": 836, + "token_diversity": 0.37662337662337664 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.484375, + "grad_norm": 1.5779470205307007, + "kl": 24.6875, + "learning_rate": 8.868765688458056e-06, + "loss": 0.0247, + "reward": 0.034044211148284376, + "reward_std": 0.09970639273524284, + "rewards/ndcg_rule_reward": -0.02064328920096159, + "rewards/rule_reward": 0.0546875, + "step": 837, + "token_diversity": 0.43450689935064934 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.4849537037037037, + "grad_norm": 3.4186694622039795, + "kl": 34.0625, + "learning_rate": 8.86579537568786e-06, + "loss": 0.0341, + "reward": 0.06723727146163583, + "reward_std": 0.1745055764913559, + "rewards/ndcg_rule_reward": -0.03627835027873516, + "rewards/rule_reward": 0.103515625, + "step": 838, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.4855324074074074, + "grad_norm": 4.106934547424316, + "kl": 20.4375, + "learning_rate": 8.86282166720796e-06, + "loss": 0.0205, + "reward": 0.02968467283062637, + "reward_std": 0.10296551510691643, + "rewards/ndcg_rule_reward": -0.01914345473051071, + "rewards/rule_reward": 0.048828125, + "step": 839, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.13671875, + "epoch": 0.4861111111111111, + "grad_norm": 2.1646828651428223, + "kl": 22.03125, + "learning_rate": 8.859844565630455e-06, + "loss": 0.022, + "reward": 0.08819332742132246, + "reward_std": 0.13690077140927315, + "rewards/ndcg_rule_reward": -0.023134799674153328, + "rewards/rule_reward": 0.111328125, + "step": 840, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.23828125, + "epoch": 0.4866898148148148, + "grad_norm": 2.602994441986084, + "kl": 28.5, + "learning_rate": 8.856864073570429e-06, + "loss": 0.0285, + "reward": 0.1513584554195404, + "reward_std": 0.13000492751598358, + "rewards/ndcg_rule_reward": -0.022469663061201572, + "rewards/rule_reward": 0.173828125, + "step": 841, + "token_diversity": 0.3713981331168831 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.037109375, + "epoch": 0.48726851851851855, + "grad_norm": 2.315544843673706, + "kl": 17.40625, + "learning_rate": 8.853880193645944e-06, + "loss": 0.0174, + "reward": 0.02924908953718841, + "reward_std": 0.10294320061802864, + "rewards/ndcg_rule_reward": -0.01957903616130352, + "rewards/rule_reward": 0.048828125, + "step": 842, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.4878472222222222, + "grad_norm": 4.181227207183838, + "kl": 31.8125, + "learning_rate": 8.850892928478032e-06, + "loss": 0.0317, + "reward": 0.06239286810159683, + "reward_std": 0.13597043976187706, + "rewards/ndcg_rule_reward": -0.025497756898403168, + "rewards/rule_reward": 0.087890625, + "step": 843, + "token_diversity": 0.43359375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.48842592592592593, + "grad_norm": 1.9557421207427979, + "kl": 40.875, + "learning_rate": 8.847902280690706e-06, + "loss": 0.0408, + "reward": 0.003464780980721116, + "reward_std": 0.10779594630002975, + "rewards/ndcg_rule_reward": -0.021925845183432102, + "rewards/rule_reward": 0.025390625, + "step": 844, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.060546875, + "epoch": 0.48900462962962965, + "grad_norm": 2.386265516281128, + "kl": 58.75, + "learning_rate": 8.844908252910947e-06, + "loss": 0.0588, + "reward": 0.05003530811518431, + "reward_std": 0.17827441543340683, + "rewards/ndcg_rule_reward": -0.030042816884815693, + "rewards/rule_reward": 0.080078125, + "step": 845, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.4895833333333333, + "grad_norm": 3.497624397277832, + "kl": 41.0625, + "learning_rate": 8.841910847768702e-06, + "loss": 0.0411, + "reward": 0.00240302924066782, + "reward_std": 0.09146055951714516, + "rewards/ndcg_rule_reward": -0.019081344828009605, + "rewards/rule_reward": 0.021484375, + "step": 846, + "token_diversity": 0.42578125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.49016203703703703, + "grad_norm": 1.7588211297988892, + "kl": 29.59375, + "learning_rate": 8.838910067896894e-06, + "loss": 0.0295, + "reward": 0.031936867628246546, + "reward_std": 0.09987546503543854, + "rewards/ndcg_rule_reward": -0.01884438470005989, + "rewards/rule_reward": 0.05078125, + "step": 847, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.49074074074074076, + "grad_norm": 3.1157913208007812, + "kl": 38.0625, + "learning_rate": 8.835905915931398e-06, + "loss": 0.038, + "reward": 0.06595351547002792, + "reward_std": 0.1330544464290142, + "rewards/ndcg_rule_reward": -0.02779647894203663, + "rewards/rule_reward": 0.09375, + "step": 848, + "token_diversity": 0.38184862012987014 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.4913194444444444, + "grad_norm": 2.072882652282715, + "kl": 42.375, + "learning_rate": 8.83289839451106e-06, + "loss": 0.0423, + "reward": 0.061654267366975546, + "reward_std": 0.1250297762453556, + "rewards/ndcg_rule_reward": -0.02233010809868574, + "rewards/rule_reward": 0.083984375, + "step": 849, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 6.130859375, + "epoch": 0.49189814814814814, + "grad_norm": 4.341796398162842, + "kl": 60.25, + "learning_rate": 8.829887506277684e-06, + "loss": 0.0603, + "reward": 0.08787684189155698, + "reward_std": 0.1220407672226429, + "rewards/ndcg_rule_reward": -0.021498157642781734, + "rewards/rule_reward": 0.109375, + "step": 850, + "token_diversity": 0.38113839285714285 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.49247685185185186, + "grad_norm": 5.0951642990112305, + "kl": 70.25, + "learning_rate": 8.82687325387603e-06, + "loss": 0.0701, + "reward": 0.03545542573556304, + "reward_std": 0.13269269838929176, + "rewards/ndcg_rule_reward": -0.027044571936130524, + "rewards/rule_reward": 0.0625, + "step": 851, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.14453125, + "epoch": 0.4930555555555556, + "grad_norm": 1.28134286403656, + "kl": 43.125, + "learning_rate": 8.82385563995381e-06, + "loss": 0.043, + "reward": 0.09164491668343544, + "reward_std": 0.07800480723381042, + "rewards/ndcg_rule_reward": -0.01382383517920971, + "rewards/rule_reward": 0.10546875, + "step": 852, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.49363425925925924, + "grad_norm": 2.161083459854126, + "kl": 33.0, + "learning_rate": 8.820834667161698e-06, + "loss": 0.033, + "reward": 0.032220579916611314, + "reward_std": 0.12498178705573082, + "rewards/ndcg_rule_reward": -0.02442004531621933, + "rewards/rule_reward": 0.056640625, + "step": 853, + "token_diversity": 0.546875 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.1796875, + "epoch": 0.49421296296296297, + "grad_norm": 2.4815847873687744, + "kl": 47.625, + "learning_rate": 8.817810338153306e-06, + "loss": 0.0476, + "reward": 0.11613878048956394, + "reward_std": 0.16779263317584991, + "rewards/ndcg_rule_reward": -0.028392470441758633, + "rewards/rule_reward": 0.14453125, + "step": 854, + "token_diversity": 0.3454166666666667 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.4947916666666667, + "grad_norm": 3.7689201831817627, + "kl": 46.5, + "learning_rate": 8.814782655585205e-06, + "loss": 0.0465, + "reward": 0.06454129377380013, + "reward_std": 0.13288848102092743, + "rewards/ndcg_rule_reward": -0.025302457623183727, + "rewards/rule_reward": 0.08984375, + "step": 855, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0859375, + "epoch": 0.49537037037037035, + "grad_norm": 1.8914357423782349, + "kl": 30.5625, + "learning_rate": 8.81175162211691e-06, + "loss": 0.0306, + "reward": 0.05576020834269002, + "reward_std": 0.09824039041996002, + "rewards/ndcg_rule_reward": -0.016505418345332146, + "rewards/rule_reward": 0.072265625, + "step": 856, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.49594907407407407, + "grad_norm": 2.276165008544922, + "kl": 14.21875, + "learning_rate": 8.80871724041087e-06, + "loss": 0.0142, + "reward": 0.03490390838123858, + "reward_std": 0.14139363169670105, + "rewards/ndcg_rule_reward": -0.029549213126301765, + "rewards/rule_reward": 0.064453125, + "step": 857, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.4965277777777778, + "grad_norm": 1.5532554388046265, + "kl": 29.96875, + "learning_rate": 8.805679513132487e-06, + "loss": 0.03, + "reward": 0.062460124492645264, + "reward_std": 0.07494524121284485, + "rewards/ndcg_rule_reward": -0.013711747247725725, + "rewards/rule_reward": 0.076171875, + "step": 858, + "token_diversity": 0.3984375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.49710648148148145, + "grad_norm": 1.8070755004882812, + "kl": 22.375, + "learning_rate": 8.802638442950095e-06, + "loss": 0.0223, + "reward": 0.06432512402534485, + "reward_std": 0.13297859951853752, + "rewards/ndcg_rule_reward": -0.025518626905977726, + "rewards/rule_reward": 0.08984375, + "step": 859, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.13671875, + "epoch": 0.4976851851851852, + "grad_norm": 2.2618556022644043, + "kl": 20.625, + "learning_rate": 8.799594032534966e-06, + "loss": 0.0206, + "reward": 0.08818922936916351, + "reward_std": 0.11479681730270386, + "rewards/ndcg_rule_reward": -0.021185768768191338, + "rewards/rule_reward": 0.109375, + "step": 860, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.4982638888888889, + "grad_norm": 2.299286365509033, + "kl": 43.375, + "learning_rate": 8.796546284561307e-06, + "loss": 0.0434, + "reward": 0.06644908711314201, + "reward_std": 0.14119123667478561, + "rewards/ndcg_rule_reward": -0.029254033230245113, + "rewards/rule_reward": 0.095703125, + "step": 861, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.23828125, + "epoch": 0.4988425925925926, + "grad_norm": 48.573246002197266, + "kl": 181.3125, + "learning_rate": 8.793495201706257e-06, + "loss": 0.1811, + "reward": 0.03473427495919168, + "reward_std": 0.10807033628225327, + "rewards/ndcg_rule_reward": -0.021906349807977676, + "rewards/rule_reward": 0.056640625, + "step": 862, + "token_diversity": 0.5625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.4994212962962963, + "grad_norm": 3.3822219371795654, + "kl": 34.0, + "learning_rate": 8.79044078664988e-06, + "loss": 0.0339, + "reward": 0.030581860919483006, + "reward_std": 0.12784650921821594, + "rewards/ndcg_rule_reward": -0.024105639196932316, + "rewards/rule_reward": 0.0546875, + "step": 863, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.5, + "grad_norm": 2.4986493587493896, + "kl": 16.71875, + "learning_rate": 8.787383042075173e-06, + "loss": 0.0168, + "reward": 0.05969221144914627, + "reward_std": 0.1382303461432457, + "rewards/ndcg_rule_reward": -0.026245292276144028, + "rewards/rule_reward": 0.0859375, + "step": 864, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.5005787037037037, + "grad_norm": 1.395385980606079, + "kl": 18.625, + "learning_rate": 8.784321970668054e-06, + "loss": 0.0186, + "reward": 0.06263764947652817, + "reward_std": 0.09168148040771484, + "rewards/ndcg_rule_reward": -0.017440469935536385, + "rewards/rule_reward": 0.080078125, + "step": 865, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.5011574074074074, + "grad_norm": 1.600346565246582, + "kl": 23.9375, + "learning_rate": 8.781257575117364e-06, + "loss": 0.0239, + "reward": 0.05867193092126399, + "reward_std": 0.10322289168834686, + "rewards/ndcg_rule_reward": -0.017499943263828754, + "rewards/rule_reward": 0.076171875, + "step": 866, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5017361111111112, + "grad_norm": 2.81894850730896, + "kl": 32.375, + "learning_rate": 8.778189858114866e-06, + "loss": 0.0324, + "reward": 0.034809600561857224, + "reward_std": 0.13301526755094528, + "rewards/ndcg_rule_reward": -0.027690401300787926, + "rewards/rule_reward": 0.0625, + "step": 867, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.5023148148148148, + "grad_norm": 2.3223862648010254, + "kl": 33.9375, + "learning_rate": 8.775118822355235e-06, + "loss": 0.0339, + "reward": 0.0628933422267437, + "reward_std": 0.1665085256099701, + "rewards/ndcg_rule_reward": -0.030856656841933727, + "rewards/rule_reward": 0.09375, + "step": 868, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.533203125, + "epoch": 0.5028935185185185, + "grad_norm": 2.923003673553467, + "kl": 49.6875, + "learning_rate": 8.772044470536067e-06, + "loss": 0.0498, + "reward": 0.05787594139110297, + "reward_std": 0.08861764147877693, + "rewards/ndcg_rule_reward": -0.016342810355126858, + "rewards/rule_reward": 0.07421875, + "step": 869, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.044921875, + "epoch": 0.5034722222222222, + "grad_norm": 2.038236141204834, + "kl": 57.5, + "learning_rate": 8.76896680535787e-06, + "loss": 0.0574, + "reward": 0.028825643006712198, + "reward_std": 0.14647973328828812, + "rewards/ndcg_rule_reward": -0.02781498432159424, + "rewards/rule_reward": 0.056640625, + "step": 870, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.5040509259259259, + "grad_norm": 2.164541244506836, + "kl": 41.75, + "learning_rate": 8.765885829524059e-06, + "loss": 0.0417, + "reward": 0.06124170927796513, + "reward_std": 0.11969400942325592, + "rewards/ndcg_rule_reward": -0.022742665372788906, + "rewards/rule_reward": 0.083984375, + "step": 871, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5046296296296297, + "grad_norm": 2.463217258453369, + "kl": 29.1875, + "learning_rate": 8.76280154574096e-06, + "loss": 0.0291, + "reward": 0.003338184207677841, + "reward_std": 0.10782836750149727, + "rewards/ndcg_rule_reward": -0.02205244079232216, + "rewards/rule_reward": 0.025390625, + "step": 872, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5052083333333334, + "grad_norm": 1.8813631534576416, + "kl": 38.375, + "learning_rate": 8.759713956717804e-06, + "loss": 0.0384, + "reward": 0.03184299229178578, + "reward_std": 0.10830153152346611, + "rewards/ndcg_rule_reward": -0.020891383290290833, + "rewards/rule_reward": 0.052734375, + "step": 873, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.5057870370370371, + "grad_norm": 3.3832814693450928, + "kl": 47.625, + "learning_rate": 8.756623065166725e-06, + "loss": 0.0475, + "reward": 0.05863526463508606, + "reward_std": 0.1116618923842907, + "rewards/ndcg_rule_reward": -0.019489736296236515, + "rewards/rule_reward": 0.078125, + "step": 874, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5063657407407407, + "grad_norm": 1.5462327003479004, + "kl": 19.9375, + "learning_rate": 8.75352887380276e-06, + "loss": 0.0199, + "reward": 0.0023365429951809347, + "reward_std": 0.08308857306838036, + "rewards/ndcg_rule_reward": -0.017194706946611404, + "rewards/rule_reward": 0.01953125, + "step": 875, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.5069444444444444, + "grad_norm": 2.5182316303253174, + "kl": 35.75, + "learning_rate": 8.75043138534384e-06, + "loss": 0.0358, + "reward": 0.06464494508691132, + "reward_std": 0.1496797874569893, + "rewards/ndcg_rule_reward": -0.029105055145919323, + "rewards/rule_reward": 0.09375, + "step": 876, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5075231481481481, + "grad_norm": 4.396366119384766, + "kl": 53.75, + "learning_rate": 8.747330602510796e-06, + "loss": 0.0539, + "reward": 0.004545045783743262, + "reward_std": 0.14095140993595123, + "rewards/ndcg_rule_reward": -0.028658080846071243, + "rewards/rule_reward": 0.033203125, + "step": 877, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.5081018518518519, + "grad_norm": 4.152349472045898, + "kl": 25.5625, + "learning_rate": 8.74422652802735e-06, + "loss": 0.0256, + "reward": 0.026639199000783265, + "reward_std": 0.13956715166568756, + "rewards/ndcg_rule_reward": -0.02609517890959978, + "rewards/rule_reward": 0.052734375, + "step": 878, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5086805555555556, + "grad_norm": 1.8979895114898682, + "kl": 18.78125, + "learning_rate": 8.741119164620122e-06, + "loss": 0.0187, + "reward": 0.0033059550914913416, + "reward_std": 0.10788234323263168, + "rewards/ndcg_rule_reward": -0.022084670141339302, + "rewards/rule_reward": 0.025390625, + "step": 879, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.03515625, + "epoch": 0.5092592592592593, + "grad_norm": 2.3659732341766357, + "kl": 31.0, + "learning_rate": 8.73800851501861e-06, + "loss": 0.0311, + "reward": 0.028070127125829458, + "reward_std": 0.13745469599962234, + "rewards/ndcg_rule_reward": -0.024664247408509254, + "rewards/rule_reward": 0.052734375, + "step": 880, + "token_diversity": 0.55859375 + }, + { + "categorical_diversity": 0.78125, + "completion_length": 5.29296875, + "epoch": 0.5098379629629629, + "grad_norm": 1.5759551525115967, + "kl": 31.125, + "learning_rate": 8.734894581955208e-06, + "loss": 0.0311, + "reward": 0.1836138442158699, + "reward_std": 0.10040636546909809, + "rewards/ndcg_rule_reward": -0.015604906249791384, + "rewards/rule_reward": 0.19921875, + "step": 881, + "token_diversity": 0.25 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.25390625, + "epoch": 0.5104166666666666, + "grad_norm": 2.227968454360962, + "kl": 29.8125, + "learning_rate": 8.73177736816519e-06, + "loss": 0.0298, + "reward": 0.16004294902086258, + "reward_std": 0.11605361476540565, + "rewards/ndcg_rule_reward": -0.023550784215331078, + "rewards/rule_reward": 0.18359375, + "step": 882, + "token_diversity": 0.36683238636363635 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5109953703703703, + "grad_norm": 2.6785225868225098, + "kl": 30.375, + "learning_rate": 8.728656876386714e-06, + "loss": 0.0304, + "reward": 0.002573281468357891, + "reward_std": 0.10820479318499565, + "rewards/ndcg_rule_reward": -0.022817344404757023, + "rewards/rule_reward": 0.025390625, + "step": 883, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.03125, + "epoch": 0.5115740740740741, + "grad_norm": 2.6763150691986084, + "kl": 36.375, + "learning_rate": 8.725533109360815e-06, + "loss": 0.0365, + "reward": 0.026352945831604302, + "reward_std": 0.11424426734447479, + "rewards/ndcg_rule_reward": -0.02052205801010132, + "rewards/rule_reward": 0.046875, + "step": 884, + "token_diversity": 0.4097222222222222 + }, + { + "categorical_diversity": 0.9375, + "completion_length": 5.259765625, + "epoch": 0.5121527777777778, + "grad_norm": 2.6149392127990723, + "kl": 44.125, + "learning_rate": 8.722406069831403e-06, + "loss": 0.0442, + "reward": 0.1665028571151197, + "reward_std": 0.12494411692023277, + "rewards/ndcg_rule_reward": -0.017090894747525454, + "rewards/rule_reward": 0.18359375, + "step": 885, + "token_diversity": 0.3663194444444444 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5127314814814815, + "grad_norm": 2.644972085952759, + "kl": 32.0625, + "learning_rate": 8.719275760545272e-06, + "loss": 0.0321, + "reward": 0.0040861808229237795, + "reward_std": 0.12432443723082542, + "rewards/ndcg_rule_reward": -0.02521069347858429, + "rewards/rule_reward": 0.029296875, + "step": 886, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5133101851851852, + "grad_norm": 2.2092552185058594, + "kl": 40.875, + "learning_rate": 8.716142184252076e-06, + "loss": 0.0408, + "reward": 0.00336432212498039, + "reward_std": 0.10783016309142113, + "rewards/ndcg_rule_reward": -0.022026302292943, + "rewards/rule_reward": 0.025390625, + "step": 887, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.087890625, + "epoch": 0.5138888888888888, + "grad_norm": 2.4863064289093018, + "kl": 52.625, + "learning_rate": 8.713005343704347e-06, + "loss": 0.0525, + "reward": 0.05641921563073993, + "reward_std": 0.09813696146011353, + "rewards/ndcg_rule_reward": -0.017799532040953636, + "rewards/rule_reward": 0.07421875, + "step": 888, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.2734375, + "epoch": 0.5144675925925926, + "grad_norm": 3.5822956562042236, + "kl": 71.75, + "learning_rate": 8.70986524165748e-06, + "loss": 0.0718, + "reward": 0.07609464228153229, + "reward_std": 0.16424249857664108, + "rewards/ndcg_rule_reward": -0.029374105855822563, + "rewards/rule_reward": 0.10546875, + "step": 889, + "token_diversity": 0.41015625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.5150462962962963, + "grad_norm": 2.055844783782959, + "kl": 39.875, + "learning_rate": 8.706721880869738e-06, + "loss": 0.0399, + "reward": 0.0340167245012708, + "reward_std": 0.09127883985638618, + "rewards/ndcg_rule_reward": -0.018717648461461067, + "rewards/rule_reward": 0.052734375, + "step": 890, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.107421875, + "epoch": 0.515625, + "grad_norm": 3.0937418937683105, + "kl": 44.0, + "learning_rate": 8.703575264102245e-06, + "loss": 0.044, + "reward": 0.06952877808362246, + "reward_std": 0.15220536291599274, + "rewards/ndcg_rule_reward": -0.030080599710345268, + "rewards/rule_reward": 0.099609375, + "step": 891, + "token_diversity": 0.47862160852713176 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.5162037037037037, + "grad_norm": 2.341226816177368, + "kl": 31.8125, + "learning_rate": 8.700425394118985e-06, + "loss": 0.0318, + "reward": 0.03529852838255465, + "reward_std": 0.14120539277791977, + "rewards/ndcg_rule_reward": -0.029154596850275993, + "rewards/rule_reward": 0.064453125, + "step": 892, + "token_diversity": 0.56640625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08203125, + "epoch": 0.5167824074074074, + "grad_norm": 2.57702374458313, + "kl": 51.0, + "learning_rate": 8.697272273686798e-06, + "loss": 0.0509, + "reward": 0.05568596348166466, + "reward_std": 0.1329556629061699, + "rewards/ndcg_rule_reward": -0.02243903838098049, + "rewards/rule_reward": 0.078125, + "step": 893, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.158203125, + "epoch": 0.5173611111111112, + "grad_norm": 1.6491646766662598, + "kl": 31.0, + "learning_rate": 8.694115905575385e-06, + "loss": 0.031, + "reward": 0.0962801236892119, + "reward_std": 0.08295808732509613, + "rewards/ndcg_rule_reward": -0.017001119442284107, + "rewards/rule_reward": 0.11328125, + "step": 894, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.20703125, + "epoch": 0.5179398148148148, + "grad_norm": 3.4315919876098633, + "kl": 35.25, + "learning_rate": 8.690956292557292e-06, + "loss": 0.0353, + "reward": 0.13338186219334602, + "reward_std": 0.12039848044514656, + "rewards/ndcg_rule_reward": -0.018961884081363678, + "rewards/rule_reward": 0.15234375, + "step": 895, + "token_diversity": 0.32848011363636365 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5185185185185185, + "grad_norm": 1.8174912929534912, + "kl": 27.75, + "learning_rate": 8.687793437407922e-06, + "loss": 0.0278, + "reward": 0.002596894046291709, + "reward_std": 0.09136735647916794, + "rewards/ndcg_rule_reward": -0.018887481652200222, + "rewards/rule_reward": 0.021484375, + "step": 896, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.5190972222222222, + "grad_norm": 2.037513494491577, + "kl": 27.625, + "learning_rate": 8.684627342905519e-06, + "loss": 0.0277, + "reward": 0.03621436981484294, + "reward_std": 0.14915582537651062, + "rewards/ndcg_rule_reward": -0.0301918787881732, + "rewards/rule_reward": 0.06640625, + "step": 897, + "token_diversity": 0.34993912337662336 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.13671875, + "epoch": 0.5196759259259259, + "grad_norm": 3.015190601348877, + "kl": 36.75, + "learning_rate": 8.681458011831178e-06, + "loss": 0.0368, + "reward": 0.0921320803463459, + "reward_std": 0.11720949038863182, + "rewards/ndcg_rule_reward": -0.021149168722331524, + "rewards/rule_reward": 0.11328125, + "step": 898, + "token_diversity": 0.38966783940397354 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5202546296296297, + "grad_norm": 2.982590913772583, + "kl": 17.59375, + "learning_rate": 8.678285446968837e-06, + "loss": 0.0176, + "reward": 0.002903486369177699, + "reward_std": 0.09963671490550041, + "rewards/ndcg_rule_reward": -0.020534013397991657, + "rewards/rule_reward": 0.0234375, + "step": 899, + "token_diversity": 0.4296875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5208333333333334, + "grad_norm": 2.0985352993011475, + "kl": 15.8125, + "learning_rate": 8.675109651105274e-06, + "loss": 0.0158, + "reward": 0.0026595033705234528, + "reward_std": 0.09974982216954231, + "rewards/ndcg_rule_reward": -0.020777996629476547, + "rewards/rule_reward": 0.0234375, + "step": 900, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5214120370370371, + "grad_norm": 4.024219989776611, + "kl": 30.0, + "learning_rate": 8.6719306270301e-06, + "loss": 0.03, + "reward": 0.004160868003964424, + "reward_std": 0.12430740892887115, + "rewards/ndcg_rule_reward": -0.025136006996035576, + "rewards/rule_reward": 0.029296875, + "step": 901, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5219907407407407, + "grad_norm": 1.8721940517425537, + "kl": 39.25, + "learning_rate": 8.668748377535768e-06, + "loss": 0.0392, + "reward": 0.03605373506434262, + "reward_std": 0.13240908086299896, + "rewards/ndcg_rule_reward": -0.026446263305842876, + "rewards/rule_reward": 0.0625, + "step": 902, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 0.921875, + "completion_length": 5.083984375, + "epoch": 0.5225694444444444, + "grad_norm": 2.6403236389160156, + "kl": 46.125, + "learning_rate": 8.66556290541756e-06, + "loss": 0.0461, + "reward": 0.06224600039422512, + "reward_std": 0.16415666043758392, + "rewards/ndcg_rule_reward": -0.02955087460577488, + "rewards/rule_reward": 0.091796875, + "step": 903, + "token_diversity": 0.33989583333333334 + }, + { + "categorical_diversity": 0.765625, + "completion_length": 5.201171875, + "epoch": 0.5231481481481481, + "grad_norm": 3.2631876468658447, + "kl": 35.625, + "learning_rate": 8.662374213473593e-06, + "loss": 0.0356, + "reward": 0.12939791288226843, + "reward_std": 0.13058381527662277, + "rewards/ndcg_rule_reward": -0.020992710255086422, + "rewards/rule_reward": 0.150390625, + "step": 904, + "token_diversity": 0.2730863764044944 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.037109375, + "epoch": 0.5237268518518519, + "grad_norm": 1.8324483633041382, + "kl": 34.0625, + "learning_rate": 8.65918230450481e-06, + "loss": 0.0342, + "reward": 0.027707983972504735, + "reward_std": 0.11780213564634323, + "rewards/ndcg_rule_reward": -0.021120142191648483, + "rewards/rule_reward": 0.048828125, + "step": 905, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.5243055555555556, + "grad_norm": 8.049781799316406, + "kl": 58.0, + "learning_rate": 8.655987181314976e-06, + "loss": 0.0581, + "reward": 0.05902427260298282, + "reward_std": 0.1301407478749752, + "rewards/ndcg_rule_reward": -0.024960103910416365, + "rewards/rule_reward": 0.083984375, + "step": 906, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.19921875, + "epoch": 0.5248842592592593, + "grad_norm": 3.5717782974243164, + "kl": 43.75, + "learning_rate": 8.652788846710689e-06, + "loss": 0.0437, + "reward": 0.12670358270406723, + "reward_std": 0.13305360823869705, + "rewards/ndcg_rule_reward": -0.025640161707997322, + "rewards/rule_reward": 0.15234375, + "step": 907, + "token_diversity": 0.3423108552631579 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5254629629629629, + "grad_norm": 1.730417251586914, + "kl": 38.0625, + "learning_rate": 8.649587303501357e-06, + "loss": 0.0381, + "reward": 0.003725372487679124, + "reward_std": 0.12447697669267654, + "rewards/ndcg_rule_reward": -0.025571501813828945, + "rewards/rule_reward": 0.029296875, + "step": 908, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.1015625, + "epoch": 0.5260416666666666, + "grad_norm": 1.7324106693267822, + "kl": 27.25, + "learning_rate": 8.646382554499212e-06, + "loss": 0.0272, + "reward": 0.06570820137858391, + "reward_std": 0.11633498966693878, + "rewards/ndcg_rule_reward": -0.024135545827448368, + "rewards/rule_reward": 0.08984375, + "step": 909, + "token_diversity": 0.3870231331168831 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.2890625, + "epoch": 0.5266203703703703, + "grad_norm": 4.060348987579346, + "kl": 32.375, + "learning_rate": 8.643174602519305e-06, + "loss": 0.0324, + "reward": 0.06624143302906305, + "reward_std": 0.1244937852025032, + "rewards/ndcg_rule_reward": -0.025555440224707127, + "rewards/rule_reward": 0.091796875, + "step": 910, + "token_diversity": 0.37860186688311687 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.08984375, + "epoch": 0.5271990740740741, + "grad_norm": 3.1938416957855225, + "kl": 53.625, + "learning_rate": 8.639963450379494e-06, + "loss": 0.0536, + "reward": 0.062071289867162704, + "reward_std": 0.1527738980948925, + "rewards/ndcg_rule_reward": -0.025819338858127594, + "rewards/rule_reward": 0.087890625, + "step": 911, + "token_diversity": 0.38666666666666666 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5277777777777778, + "grad_norm": 1.353822112083435, + "kl": 32.625, + "learning_rate": 8.636749100900452e-06, + "loss": 0.0327, + "reward": 0.0033664738293737173, + "reward_std": 0.10785651952028275, + "rewards/ndcg_rule_reward": -0.02202415093779564, + "rewards/rule_reward": 0.025390625, + "step": 912, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.5283564814814815, + "grad_norm": 2.4386589527130127, + "kl": 22.125, + "learning_rate": 8.63353155690566e-06, + "loss": 0.0221, + "reward": 0.06467343121767044, + "reward_std": 0.07473068311810493, + "rewards/ndcg_rule_reward": -0.01540469005703926, + "rewards/rule_reward": 0.080078125, + "step": 913, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.5289351851851852, + "grad_norm": 2.8052780628204346, + "kl": 31.25, + "learning_rate": 8.630310821221402e-06, + "loss": 0.0314, + "reward": 0.0640819133259356, + "reward_std": 0.133121520280838, + "rewards/ndcg_rule_reward": -0.025761833414435387, + "rewards/rule_reward": 0.08984375, + "step": 914, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.046875, + "epoch": 0.5295138888888888, + "grad_norm": 2.5229244232177734, + "kl": 49.125, + "learning_rate": 8.627086896676768e-06, + "loss": 0.0491, + "reward": 0.03356912545859814, + "reward_std": 0.13276592642068863, + "rewards/ndcg_rule_reward": -0.025024626404047012, + "rewards/rule_reward": 0.05859375, + "step": 915, + "token_diversity": 0.4214638157894737 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.5300925925925926, + "grad_norm": 1.899888515472412, + "kl": 49.3125, + "learning_rate": 8.623859786103653e-06, + "loss": 0.0493, + "reward": 0.06584719568490982, + "reward_std": 0.09943537041544914, + "rewards/ndcg_rule_reward": -0.02009029733017087, + "rewards/rule_reward": 0.0859375, + "step": 916, + "token_diversity": 0.4104606331168831 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.5306712962962963, + "grad_norm": 1.9847822189331055, + "kl": 33.375, + "learning_rate": 8.620629492336745e-06, + "loss": 0.0333, + "reward": 0.03434159909375012, + "reward_std": 0.0995459295809269, + "rewards/ndcg_rule_reward": -0.020345900673419237, + "rewards/rule_reward": 0.0546875, + "step": 917, + "token_diversity": 0.37459415584415584 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.53125, + "grad_norm": 1.826404333114624, + "kl": 30.1875, + "learning_rate": 8.617396018213528e-06, + "loss": 0.0302, + "reward": 0.0018794212373904884, + "reward_std": 0.058030154556035995, + "rewards/ndcg_rule_reward": -0.011792454170063138, + "rewards/rule_reward": 0.013671875, + "step": 918, + "token_diversity": 0.55078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.24609375, + "epoch": 0.5318287037037037, + "grad_norm": 2.0516068935394287, + "kl": 29.125, + "learning_rate": 8.614159366574285e-06, + "loss": 0.0291, + "reward": 0.09993986785411835, + "reward_std": 0.1441609412431717, + "rewards/ndcg_rule_reward": -0.028966374695301056, + "rewards/rule_reward": 0.12890625, + "step": 919, + "token_diversity": 0.55859375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.5324074074074074, + "grad_norm": 1.956230878829956, + "kl": 30.3125, + "learning_rate": 8.610919540262082e-06, + "loss": 0.0303, + "reward": 0.06051159091293812, + "reward_std": 0.10872844606637955, + "rewards/ndcg_rule_reward": -0.019566535018384457, + "rewards/rule_reward": 0.080078125, + "step": 920, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.16796875, + "epoch": 0.5329861111111112, + "grad_norm": 1.7732138633728027, + "kl": 42.0, + "learning_rate": 8.607676542122782e-06, + "loss": 0.0419, + "reward": 0.10909555107355118, + "reward_std": 0.1215980127453804, + "rewards/ndcg_rule_reward": -0.01981070265173912, + "rewards/rule_reward": 0.12890625, + "step": 921, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.203125, + "epoch": 0.5335648148148148, + "grad_norm": 2.358193874359131, + "kl": 24.0625, + "learning_rate": 8.60443037500503e-06, + "loss": 0.024, + "reward": 0.12803690880537033, + "reward_std": 0.13321011513471603, + "rewards/ndcg_rule_reward": -0.02821309771388769, + "rewards/rule_reward": 0.15625, + "step": 922, + "token_diversity": 0.39285714285714285 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5341435185185185, + "grad_norm": 1.8674097061157227, + "kl": 31.6875, + "learning_rate": 8.601181041760255e-06, + "loss": 0.0316, + "reward": 0.034978222101926804, + "reward_std": 0.11610548570752144, + "rewards/ndcg_rule_reward": -0.023615526035428047, + "rewards/rule_reward": 0.05859375, + "step": 923, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.5347222222222222, + "grad_norm": 1.6872804164886475, + "kl": 35.625, + "learning_rate": 8.597928545242666e-06, + "loss": 0.0356, + "reward": 0.030272829462774098, + "reward_std": 0.1111137792468071, + "rewards/ndcg_rule_reward": -0.0205084215849638, + "rewards/rule_reward": 0.05078125, + "step": 924, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5353009259259259, + "grad_norm": 1.7364166975021362, + "kl": 41.125, + "learning_rate": 8.594672888309252e-06, + "loss": 0.041, + "reward": 0.004100855207070708, + "reward_std": 0.132757980376482, + "rewards/ndcg_rule_reward": -0.027149146422743797, + "rewards/rule_reward": 0.03125, + "step": 925, + "token_diversity": 0.4140625 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.033203125, + "epoch": 0.5358796296296297, + "grad_norm": 5.825555801391602, + "kl": 37.9375, + "learning_rate": 8.591414073819779e-06, + "loss": 0.038, + "reward": 0.025084953755140305, + "reward_std": 0.10221502557396889, + "rewards/ndcg_rule_reward": -0.01788379531353712, + "rewards/rule_reward": 0.04296875, + "step": 926, + "token_diversity": 0.37239583333333337 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.5364583333333334, + "grad_norm": 1.737836241722107, + "kl": 32.0, + "learning_rate": 8.588152104636784e-06, + "loss": 0.0321, + "reward": 0.061219760216772556, + "reward_std": 0.1168367899954319, + "rewards/ndcg_rule_reward": -0.02081149071455002, + "rewards/rule_reward": 0.08203125, + "step": 927, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5370370370370371, + "grad_norm": 2.489253520965576, + "kl": 31.3125, + "learning_rate": 8.584886983625578e-06, + "loss": 0.0314, + "reward": 0.03455529804341495, + "reward_std": 0.12466903030872345, + "rewards/ndcg_rule_reward": -0.02599157951772213, + "rewards/rule_reward": 0.060546875, + "step": 928, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.10546875, + "epoch": 0.5376157407407407, + "grad_norm": 2.173468828201294, + "kl": 27.25, + "learning_rate": 8.581618713654239e-06, + "loss": 0.0272, + "reward": 0.0609355999622494, + "reward_std": 0.10295945405960083, + "rewards/ndcg_rule_reward": -0.019142521545290947, + "rewards/rule_reward": 0.080078125, + "step": 929, + "token_diversity": 0.53125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.181640625, + "epoch": 0.5381944444444444, + "grad_norm": 1.774932622909546, + "kl": 40.0, + "learning_rate": 8.578347297593609e-06, + "loss": 0.04, + "reward": 0.11992968060076237, + "reward_std": 0.12981809675693512, + "rewards/ndcg_rule_reward": -0.022648447658866644, + "rewards/rule_reward": 0.142578125, + "step": 930, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.5387731481481481, + "grad_norm": 1.9517405033111572, + "kl": 18.6875, + "learning_rate": 8.575072738317297e-06, + "loss": 0.0187, + "reward": 0.06123235635459423, + "reward_std": 0.1281222514808178, + "rewards/ndcg_rule_reward": -0.02470514364540577, + "rewards/rule_reward": 0.0859375, + "step": 931, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0703125, + "epoch": 0.5393518518518519, + "grad_norm": 2.2558069229125977, + "kl": 47.375, + "learning_rate": 8.571795038701673e-06, + "loss": 0.0474, + "reward": 0.04973088018596172, + "reward_std": 0.14166520535945892, + "rewards/ndcg_rule_reward": -0.024487869814038277, + "rewards/rule_reward": 0.07421875, + "step": 932, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5399305555555556, + "grad_norm": 3.8688697814941406, + "kl": 67.5, + "learning_rate": 8.568514201625862e-06, + "loss": 0.0673, + "reward": 0.03436300763860345, + "reward_std": 0.18289601802825928, + "rewards/ndcg_rule_reward": -0.03594949282705784, + "rewards/rule_reward": 0.0703125, + "step": 933, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.5405092592592593, + "grad_norm": 1.9365427494049072, + "kl": 45.375, + "learning_rate": 8.56523022997175e-06, + "loss": 0.0454, + "reward": 0.03614895371720195, + "reward_std": 0.13234735652804375, + "rewards/ndcg_rule_reward": -0.026351045351475477, + "rewards/rule_reward": 0.0625, + "step": 934, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.099609375, + "epoch": 0.5410879629629629, + "grad_norm": 4.02945613861084, + "kl": 48.25, + "learning_rate": 8.561943126623971e-06, + "loss": 0.0483, + "reward": 0.06163623929023743, + "reward_std": 0.13343996554613113, + "rewards/ndcg_rule_reward": -0.024301261641085148, + "rewards/rule_reward": 0.0859375, + "step": 935, + "token_diversity": 0.3653371710526316 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5416666666666666, + "grad_norm": 7.66828727722168, + "kl": 56.875, + "learning_rate": 8.558652894469911e-06, + "loss": 0.0569, + "reward": 0.03614403086248785, + "reward_std": 0.15760502219200134, + "rewards/ndcg_rule_reward": -0.03221534192562103, + "rewards/rule_reward": 0.068359375, + "step": 936, + "token_diversity": 0.5234375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.158203125, + "epoch": 0.5422453703703703, + "grad_norm": 1.542612910270691, + "kl": 28.1875, + "learning_rate": 8.55535953639971e-06, + "loss": 0.0282, + "reward": 0.09562916168943048, + "reward_std": 0.0664295181632042, + "rewards/ndcg_rule_reward": -0.013745838310569525, + "rewards/rule_reward": 0.109375, + "step": 937, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.5428240740740741, + "grad_norm": 1.4685895442962646, + "kl": 26.0625, + "learning_rate": 8.552063055306245e-06, + "loss": 0.026, + "reward": 0.03323006723076105, + "reward_std": 0.06639634817838669, + "rewards/ndcg_rule_reward": -0.0136449309065938, + "rewards/rule_reward": 0.046875, + "step": 938, + "token_diversity": 0.34151785714285715 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.576171875, + "epoch": 0.5434027777777778, + "grad_norm": 2.1992123126983643, + "kl": 36.6875, + "learning_rate": 8.548763454085145e-06, + "loss": 0.0367, + "reward": 0.11131540313363075, + "reward_std": 0.15210113674402237, + "rewards/ndcg_rule_reward": -0.025403342209756374, + "rewards/rule_reward": 0.13671875, + "step": 939, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.10546875, + "epoch": 0.5439814814814815, + "grad_norm": 1.5615034103393555, + "kl": 22.75, + "learning_rate": 8.545460735634773e-06, + "loss": 0.0228, + "reward": 0.06534721329808235, + "reward_std": 0.08282498270273209, + "rewards/ndcg_rule_reward": -0.016684035304933786, + "rewards/rule_reward": 0.08203125, + "step": 940, + "token_diversity": 0.3990675403225806 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.5445601851851852, + "grad_norm": 3.9787254333496094, + "kl": 56.75, + "learning_rate": 8.542154902856232e-06, + "loss": 0.0568, + "reward": 0.060459390515461564, + "reward_std": 0.1781388223171234, + "rewards/ndcg_rule_reward": -0.03329060971736908, + "rewards/rule_reward": 0.09375, + "step": 941, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5451388888888888, + "grad_norm": 1.5758682489395142, + "kl": 41.9375, + "learning_rate": 8.538845958653363e-06, + "loss": 0.042, + "reward": 0.0025772307999432087, + "reward_std": 0.0829460360109806, + "rewards/ndcg_rule_reward": -0.01695401966571808, + "rewards/rule_reward": 0.01953125, + "step": 942, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.10546875, + "epoch": 0.5457175925925926, + "grad_norm": 1.8302934169769287, + "kl": 27.875, + "learning_rate": 8.535533905932739e-06, + "loss": 0.0278, + "reward": 0.06634412333369255, + "reward_std": 0.12444477155804634, + "rewards/ndcg_rule_reward": -0.0254527498036623, + "rewards/rule_reward": 0.091796875, + "step": 943, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5462962962962963, + "grad_norm": 1.829110860824585, + "kl": 45.6875, + "learning_rate": 8.53221874760366e-06, + "loss": 0.0456, + "reward": 0.03308634855784476, + "reward_std": 0.132984958589077, + "rewards/ndcg_rule_reward": -0.02550740260630846, + "rewards/rule_reward": 0.05859375, + "step": 944, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.02734375, + "epoch": 0.546875, + "grad_norm": 2.266265392303467, + "kl": 28.5, + "learning_rate": 8.528900486578158e-06, + "loss": 0.0285, + "reward": 0.023442784324288368, + "reward_std": 0.10719309374690056, + "rewards/ndcg_rule_reward": -0.019525968469679356, + "rewards/rule_reward": 0.04296875, + "step": 945, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.107421875, + "epoch": 0.5474537037037037, + "grad_norm": 2.4309325218200684, + "kl": 24.34375, + "learning_rate": 8.525579125770992e-06, + "loss": 0.0244, + "reward": 0.07148994447197765, + "reward_std": 0.13265879824757576, + "rewards/ndcg_rule_reward": -0.024213183671236038, + "rewards/rule_reward": 0.095703125, + "step": 946, + "token_diversity": 0.33038651315789475 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.10546875, + "epoch": 0.5480324074074074, + "grad_norm": 1.5175871849060059, + "kl": 20.15625, + "learning_rate": 8.522254668099636e-06, + "loss": 0.0201, + "reward": 0.06568077951669693, + "reward_std": 0.10792097449302673, + "rewards/ndcg_rule_reward": -0.02220984734594822, + "rewards/rule_reward": 0.087890625, + "step": 947, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.5486111111111112, + "grad_norm": 2.3991429805755615, + "kl": 26.625, + "learning_rate": 8.518927116484294e-06, + "loss": 0.0266, + "reward": 0.03523298248182982, + "reward_std": 0.13280081376433372, + "rewards/ndcg_rule_reward": -0.027267013676464558, + "rewards/rule_reward": 0.0625, + "step": 948, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.091796875, + "epoch": 0.5491898148148148, + "grad_norm": 1.8640161752700806, + "kl": 19.34375, + "learning_rate": 8.515596473847882e-06, + "loss": 0.0193, + "reward": 0.05915585160255432, + "reward_std": 0.13011032715439796, + "rewards/ndcg_rule_reward": -0.024828524328768253, + "rewards/rule_reward": 0.083984375, + "step": 949, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.19140625, + "epoch": 0.5497685185185185, + "grad_norm": 1.5930529832839966, + "kl": 23.9375, + "learning_rate": 8.512262743116029e-06, + "loss": 0.0239, + "reward": 0.1191069707274437, + "reward_std": 0.1052250936627388, + "rewards/ndcg_rule_reward": -0.017611777875572443, + "rewards/rule_reward": 0.13671875, + "step": 950, + "token_diversity": 0.36204637096774195 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5503472222222222, + "grad_norm": 1.6826838254928589, + "kl": 23.25, + "learning_rate": 8.508925927217083e-06, + "loss": 0.0232, + "reward": 0.004581898218020797, + "reward_std": 0.13250893726944923, + "rewards/ndcg_rule_reward": -0.026668100617825985, + "rewards/rule_reward": 0.03125, + "step": 951, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.28125, + "epoch": 0.5509259259259259, + "grad_norm": 1.4887735843658447, + "kl": 35.0, + "learning_rate": 8.505586029082099e-06, + "loss": 0.0351, + "reward": 0.17546247690916061, + "reward_std": 0.11668215692043304, + "rewards/ndcg_rule_reward": -0.019850024953484535, + "rewards/rule_reward": 0.1953125, + "step": 952, + "token_diversity": 0.34121919014084506 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.107421875, + "epoch": 0.5515046296296297, + "grad_norm": 1.47955322265625, + "kl": 29.875, + "learning_rate": 8.502243051644838e-06, + "loss": 0.0299, + "reward": 0.06334402225911617, + "reward_std": 0.0997953861951828, + "rewards/ndcg_rule_reward": -0.018687224946916103, + "rewards/rule_reward": 0.08203125, + "step": 953, + "token_diversity": 0.35653409090909094 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.5520833333333334, + "grad_norm": 2.168396472930908, + "kl": 51.0, + "learning_rate": 8.498896997841766e-06, + "loss": 0.0509, + "reward": 0.06355216726660728, + "reward_std": 0.1333843655884266, + "rewards/ndcg_rule_reward": -0.02629158366471529, + "rewards/rule_reward": 0.08984375, + "step": 954, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.048828125, + "epoch": 0.5526620370370371, + "grad_norm": 3.3571674823760986, + "kl": 61.25, + "learning_rate": 8.495547870612055e-06, + "loss": 0.0614, + "reward": 0.03884717728942633, + "reward_std": 0.12952324748039246, + "rewards/ndcg_rule_reward": -0.021699695847928524, + "rewards/rule_reward": 0.060546875, + "step": 955, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.099609375, + "epoch": 0.5532407407407407, + "grad_norm": 1.8952791690826416, + "kl": 50.25, + "learning_rate": 8.492195672897572e-06, + "loss": 0.0502, + "reward": 0.0674569308757782, + "reward_std": 0.16580019146203995, + "rewards/ndcg_rule_reward": -0.030199317261576653, + "rewards/rule_reward": 0.09765625, + "step": 956, + "token_diversity": 0.46544471153846156 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.00390625, + "epoch": 0.5538194444444444, + "grad_norm": 2.056157112121582, + "kl": 46.625, + "learning_rate": 8.488840407642885e-06, + "loss": 0.0466, + "reward": 0.007572507718577981, + "reward_std": 0.12125032395124435, + "rewards/ndcg_rule_reward": -0.023677492514252663, + "rewards/rule_reward": 0.03125, + "step": 957, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.154296875, + "epoch": 0.5543981481481481, + "grad_norm": 1.7234184741973877, + "kl": 21.5625, + "learning_rate": 8.485482077795253e-06, + "loss": 0.0216, + "reward": 0.09670756012201309, + "reward_std": 0.09119486808776855, + "rewards/ndcg_rule_reward": -0.01852681301534176, + "rewards/rule_reward": 0.115234375, + "step": 958, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5549768518518519, + "grad_norm": 2.1351983547210693, + "kl": 31.6875, + "learning_rate": 8.48212068630463e-06, + "loss": 0.0317, + "reward": 0.0018189831753261387, + "reward_std": 0.058053158223629, + "rewards/ndcg_rule_reward": -0.011852892115712166, + "rewards/rule_reward": 0.013671875, + "step": 959, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5555555555555556, + "grad_norm": 1.6025824546813965, + "kl": 40.078125, + "learning_rate": 8.478756236123657e-06, + "loss": 0.04, + "reward": 0.004208604455925524, + "reward_std": 0.12426408380270004, + "rewards/ndcg_rule_reward": -0.025088271126151085, + "rewards/rule_reward": 0.029296875, + "step": 960, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 0.921875, + "completion_length": 5.12890625, + "epoch": 0.5561342592592593, + "grad_norm": 2.7239761352539062, + "kl": 61.0, + "learning_rate": 8.475388730207662e-06, + "loss": 0.0611, + "reward": 0.08449904504232109, + "reward_std": 0.16038894653320312, + "rewards/ndcg_rule_reward": -0.028782198205590248, + "rewards/rule_reward": 0.11328125, + "step": 961, + "token_diversity": 0.42388091216216217 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.5567129629629629, + "grad_norm": 1.7546428442001343, + "kl": 35.03125, + "learning_rate": 8.47201817151466e-06, + "loss": 0.0351, + "reward": 0.034396468894556165, + "reward_std": 0.10792317241430283, + "rewards/ndcg_rule_reward": -0.022244155406951904, + "rewards/rule_reward": 0.056640625, + "step": 962, + "token_diversity": 0.3161525974025974 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.5572916666666666, + "grad_norm": 1.6651250123977661, + "kl": 39.9375, + "learning_rate": 8.468644563005344e-06, + "loss": 0.0399, + "reward": 0.06595401838421822, + "reward_std": 0.11622798070311546, + "rewards/ndcg_rule_reward": -0.023889726027846336, + "rewards/rule_reward": 0.08984375, + "step": 963, + "token_diversity": 0.515625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5578703703703703, + "grad_norm": 1.5937695503234863, + "kl": 28.8125, + "learning_rate": 8.465267907643088e-06, + "loss": 0.0288, + "reward": 0.003524256986565888, + "reward_std": 0.09934734925627708, + "rewards/ndcg_rule_reward": -0.019913243129849434, + "rewards/rule_reward": 0.0234375, + "step": 964, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.044921875, + "epoch": 0.5584490740740741, + "grad_norm": 2.136192798614502, + "kl": 15.5, + "learning_rate": 8.46188820839394e-06, + "loss": 0.0155, + "reward": 0.03335018805228174, + "reward_std": 0.08319346606731415, + "rewards/ndcg_rule_reward": -0.017431057058274746, + "rewards/rule_reward": 0.05078125, + "step": 965, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09765625, + "epoch": 0.5590277777777778, + "grad_norm": 2.7365245819091797, + "kl": 43.5, + "learning_rate": 8.458505468226627e-06, + "loss": 0.0435, + "reward": 0.06425371509976685, + "reward_std": 0.1330036073923111, + "rewards/ndcg_rule_reward": -0.02559003420174122, + "rewards/rule_reward": 0.08984375, + "step": 966, + "token_diversity": 0.33822037337662336 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.5596064814814815, + "grad_norm": 1.316144347190857, + "kl": 14.375, + "learning_rate": 8.455119690112542e-06, + "loss": 0.0144, + "reward": 0.06492008658824489, + "reward_std": 0.0830292571336031, + "rewards/ndcg_rule_reward": -0.017111169872805476, + "rewards/rule_reward": 0.08203125, + "step": 967, + "token_diversity": 0.35841112012987014 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1953125, + "epoch": 0.5601851851851852, + "grad_norm": 1.4395174980163574, + "kl": 40.375, + "learning_rate": 8.451730877025746e-06, + "loss": 0.0404, + "reward": 0.1244526356458664, + "reward_std": 0.12487771734595299, + "rewards/ndcg_rule_reward": -0.02203173842281103, + "rewards/rule_reward": 0.146484375, + "step": 968, + "token_diversity": 0.32913961038961037 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.5607638888888888, + "grad_norm": 3.3808434009552, + "kl": 29.8125, + "learning_rate": 8.44833903194297e-06, + "loss": 0.0298, + "reward": 0.063437819480896, + "reward_std": 0.12499110773205757, + "rewards/ndcg_rule_reward": -0.024452805519104004, + "rewards/rule_reward": 0.087890625, + "step": 969, + "token_diversity": 0.4375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.046875, + "epoch": 0.5613425925925926, + "grad_norm": 4.363109111785889, + "kl": 52.25, + "learning_rate": 8.444944157843607e-06, + "loss": 0.0523, + "reward": 0.03371779597364366, + "reward_std": 0.15794430673122406, + "rewards/ndcg_rule_reward": -0.030735328793525696, + "rewards/rule_reward": 0.064453125, + "step": 970, + "token_diversity": 0.37088815789473684 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08203125, + "epoch": 0.5619212962962963, + "grad_norm": 2.2435812950134277, + "kl": 39.25, + "learning_rate": 8.441546257709708e-06, + "loss": 0.0392, + "reward": 0.05453253164887428, + "reward_std": 0.10664443671703339, + "rewards/ndcg_rule_reward": -0.01773309288546443, + "rewards/rule_reward": 0.072265625, + "step": 971, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5625, + "grad_norm": 2.310734748840332, + "kl": 23.74609375, + "learning_rate": 8.438145334525987e-06, + "loss": 0.0238, + "reward": 0.03429084620438516, + "reward_std": 0.0911591611802578, + "rewards/ndcg_rule_reward": -0.018443528562784195, + "rewards/rule_reward": 0.052734375, + "step": 972, + "token_diversity": 0.5 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5630787037037037, + "grad_norm": 2.869065999984741, + "kl": 34.25, + "learning_rate": 8.434741391279809e-06, + "loss": 0.0342, + "reward": 0.03172442887444049, + "reward_std": 0.0999743714928627, + "rewards/ndcg_rule_reward": -0.01905681937932968, + "rewards/rule_reward": 0.05078125, + "step": 973, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.076171875, + "epoch": 0.5636574074074074, + "grad_norm": 4.979064464569092, + "kl": 95.25, + "learning_rate": 8.431334430961192e-06, + "loss": 0.0951, + "reward": 0.055895778350532055, + "reward_std": 0.1478128917515278, + "rewards/ndcg_rule_reward": -0.02613547444343567, + "rewards/rule_reward": 0.08203125, + "step": 974, + "token_diversity": 0.41796875 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.15234375, + "epoch": 0.5642361111111112, + "grad_norm": 2.2436211109161377, + "kl": 28.5, + "learning_rate": 8.427924456562807e-06, + "loss": 0.0285, + "reward": 0.09755217656493187, + "reward_std": 0.1244494654238224, + "rewards/ndcg_rule_reward": -0.02549469657242298, + "rewards/rule_reward": 0.123046875, + "step": 975, + "token_diversity": 0.37012987012987014 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.5648148148148148, + "grad_norm": 2.0690014362335205, + "kl": 29.53125, + "learning_rate": 8.424511471079971e-06, + "loss": 0.0295, + "reward": 0.05816801148466766, + "reward_std": 0.10348125919699669, + "rewards/ndcg_rule_reward": -0.018003864213824272, + "rewards/rule_reward": 0.076171875, + "step": 976, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.08984375, + "epoch": 0.5653935185185185, + "grad_norm": 2.3147552013397217, + "kl": 48.5625, + "learning_rate": 8.421095477510648e-06, + "loss": 0.0484, + "reward": 0.05925705283880234, + "reward_std": 0.12167151272296906, + "rewards/ndcg_rule_reward": -0.022774194832891226, + "rewards/rule_reward": 0.08203125, + "step": 977, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1171875, + "epoch": 0.5659722222222222, + "grad_norm": 1.7914824485778809, + "kl": 25.53125, + "learning_rate": 8.417676478855438e-06, + "loss": 0.0255, + "reward": 0.07716352120041847, + "reward_std": 0.10324328392744064, + "rewards/ndcg_rule_reward": -0.0146333584561944, + "rewards/rule_reward": 0.091796875, + "step": 978, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5665509259259259, + "grad_norm": 2.865507125854492, + "kl": 40.5, + "learning_rate": 8.414254478117589e-06, + "loss": 0.0405, + "reward": 0.033018861431628466, + "reward_std": 0.13303284347057343, + "rewards/ndcg_rule_reward": -0.02557489089667797, + "rewards/rule_reward": 0.05859375, + "step": 979, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5671296296296297, + "grad_norm": 2.1047236919403076, + "kl": 21.75, + "learning_rate": 8.410829478302982e-06, + "loss": 0.0217, + "reward": 0.004456443013623357, + "reward_std": 0.16623662412166595, + "rewards/ndcg_rule_reward": -0.03460605815052986, + "rewards/rule_reward": 0.0390625, + "step": 980, + "token_diversity": 0.48828125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5677083333333334, + "grad_norm": 2.9495840072631836, + "kl": 38.25, + "learning_rate": 8.407401482420129e-06, + "loss": 0.0382, + "reward": 0.0035544168204069138, + "reward_std": 0.11615829542279243, + "rewards/ndcg_rule_reward": -0.023789334576576948, + "rewards/rule_reward": 0.02734375, + "step": 981, + "token_diversity": 0.44140625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.5682870370370371, + "grad_norm": 2.8638553619384766, + "kl": 17.625, + "learning_rate": 8.40397049348018e-06, + "loss": 0.0177, + "reward": 0.06354300118982792, + "reward_std": 0.11652971431612968, + "rewards/ndcg_rule_reward": -0.022394496016204357, + "rewards/rule_reward": 0.0859375, + "step": 982, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5688657407407407, + "grad_norm": 3.5625975131988525, + "kl": 37.0, + "learning_rate": 8.400536514496908e-06, + "loss": 0.037, + "reward": 0.035147922462783754, + "reward_std": 0.14124276489019394, + "rewards/ndcg_rule_reward": -0.029305201023817062, + "rewards/rule_reward": 0.064453125, + "step": 983, + "token_diversity": 0.453125 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.15234375, + "epoch": 0.5694444444444444, + "grad_norm": 1.3912975788116455, + "kl": 42.75, + "learning_rate": 8.39709954848672e-06, + "loss": 0.0428, + "reward": 0.09574976935982704, + "reward_std": 0.07481998205184937, + "rewards/ndcg_rule_reward": -0.015578349120914936, + "rewards/rule_reward": 0.111328125, + "step": 984, + "token_diversity": 0.35247564935064934 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.14453125, + "epoch": 0.5700231481481481, + "grad_norm": 1.9398061037063599, + "kl": 16.28125, + "learning_rate": 8.393659598468644e-06, + "loss": 0.0163, + "reward": 0.09149145893752575, + "reward_std": 0.09201466292142868, + "rewards/ndcg_rule_reward": -0.0159304141998291, + "rewards/rule_reward": 0.107421875, + "step": 985, + "token_diversity": 0.3900082236842105 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5706018518518519, + "grad_norm": 1.6475982666015625, + "kl": 20.3125, + "learning_rate": 8.39021666746432e-06, + "loss": 0.0203, + "reward": 0.00267536606406793, + "reward_std": 0.09134094417095184, + "rewards/ndcg_rule_reward": -0.018809008412063122, + "rewards/rule_reward": 0.021484375, + "step": 986, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.3515625, + "epoch": 0.5711805555555556, + "grad_norm": 2.012657403945923, + "kl": 48.875, + "learning_rate": 8.386770758498021e-06, + "loss": 0.0489, + "reward": 0.12823134195059538, + "reward_std": 0.15139725431799889, + "rewards/ndcg_rule_reward": -0.02606554003432393, + "rewards/rule_reward": 0.154296875, + "step": 987, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.5717592592592593, + "grad_norm": 3.2143003940582275, + "kl": 28.40625, + "learning_rate": 8.383321874596627e-06, + "loss": 0.0283, + "reward": 0.06359850149601698, + "reward_std": 0.12494384124875069, + "rewards/ndcg_rule_reward": -0.024292124435305595, + "rewards/rule_reward": 0.087890625, + "step": 988, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.09765625, + "epoch": 0.5723379629629629, + "grad_norm": 1.585476040840149, + "kl": 41.625, + "learning_rate": 8.379870018789632e-06, + "loss": 0.0416, + "reward": 0.06322591681964695, + "reward_std": 0.10826375335454941, + "rewards/ndcg_rule_reward": -0.02075845655053854, + "rewards/rule_reward": 0.083984375, + "step": 989, + "token_diversity": 0.4019325657894737 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1015625, + "epoch": 0.5729166666666666, + "grad_norm": 3.2331883907318115, + "kl": 42.125, + "learning_rate": 8.376415194109143e-06, + "loss": 0.0421, + "reward": 0.06738406792283058, + "reward_std": 0.15763338655233383, + "rewards/ndcg_rule_reward": -0.03222529962658882, + "rewards/rule_reward": 0.099609375, + "step": 990, + "token_diversity": 0.52734375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 8.712890625, + "epoch": 0.5734953703703703, + "grad_norm": 1.8067280054092407, + "kl": 13.09375, + "learning_rate": 8.372957403589873e-06, + "loss": 0.0131, + "reward": 0.06289966776967049, + "reward_std": 0.10839810594916344, + "rewards/ndcg_rule_reward": -0.021084708161652088, + "rewards/rule_reward": 0.083984375, + "step": 991, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.154296875, + "epoch": 0.5740740740740741, + "grad_norm": 4.850770950317383, + "kl": 47.8125, + "learning_rate": 8.369496650269141e-06, + "loss": 0.0478, + "reward": 0.09903142973780632, + "reward_std": 0.12994804978370667, + "rewards/ndcg_rule_reward": -0.024015444330871105, + "rewards/rule_reward": 0.123046875, + "step": 992, + "token_diversity": 0.41015625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5746527777777778, + "grad_norm": 7.275876045227051, + "kl": 44.375, + "learning_rate": 8.366032937186869e-06, + "loss": 0.0443, + "reward": 0.03243524406570941, + "reward_std": 0.12491356953978539, + "rewards/ndcg_rule_reward": -0.02420538105070591, + "rewards/rule_reward": 0.056640625, + "step": 993, + "token_diversity": 0.5078125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5752314814814815, + "grad_norm": 2.312795877456665, + "kl": 66.875, + "learning_rate": 8.362566267385576e-06, + "loss": 0.0669, + "reward": 0.03408701601438224, + "reward_std": 0.15776699781417847, + "rewards/ndcg_rule_reward": -0.030366109684109688, + "rewards/rule_reward": 0.064453125, + "step": 994, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.138671875, + "epoch": 0.5758101851851852, + "grad_norm": 2.6007814407348633, + "kl": 32.0625, + "learning_rate": 8.359096643910382e-06, + "loss": 0.0321, + "reward": 0.09226481383666396, + "reward_std": 0.13184945657849312, + "rewards/ndcg_rule_reward": -0.022969561628997326, + "rewards/rule_reward": 0.115234375, + "step": 995, + "token_diversity": 0.41796875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.04296875, + "epoch": 0.5763888888888888, + "grad_norm": 2.8727684020996094, + "kl": 37.9375, + "learning_rate": 8.355624069809e-06, + "loss": 0.038, + "reward": 0.03187486180104315, + "reward_std": 0.19452229142189026, + "rewards/ndcg_rule_reward": -0.03843763843178749, + "rewards/rule_reward": 0.0703125, + "step": 996, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5769675925925926, + "grad_norm": 1.9126839637756348, + "kl": 28.6875, + "learning_rate": 8.352148548131736e-06, + "loss": 0.0286, + "reward": 0.03564235707744956, + "reward_std": 0.12417382374405861, + "rewards/ndcg_rule_reward": -0.024904515594244003, + "rewards/rule_reward": 0.060546875, + "step": 997, + "token_diversity": 0.4765625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.1015625, + "epoch": 0.5775462962962963, + "grad_norm": 2.2811245918273926, + "kl": 24.53125, + "learning_rate": 8.348670081931477e-06, + "loss": 0.0246, + "reward": 0.0690665952861309, + "reward_std": 0.17983204126358032, + "rewards/ndcg_rule_reward": -0.0344490222632885, + "rewards/rule_reward": 0.103515625, + "step": 998, + "token_diversity": 0.34273538961038963 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.078125, + "epoch": 0.578125, + "grad_norm": 3.862781524658203, + "kl": 36.25, + "learning_rate": 8.345188674263713e-06, + "loss": 0.0362, + "reward": 0.05595764936879277, + "reward_std": 0.13186683505773544, + "rewards/ndcg_rule_reward": -0.024120476096868515, + "rewards/rule_reward": 0.080078125, + "step": 999, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5787037037037037, + "grad_norm": 1.6725515127182007, + "kl": 25.875, + "learning_rate": 8.341704328186501e-06, + "loss": 0.0259, + "reward": 0.03314903483260423, + "reward_std": 0.06642518192529678, + "rewards/ndcg_rule_reward": -0.013725962955504656, + "rewards/rule_reward": 0.046875, + "step": 1000, + "token_diversity": 0.45703125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.052734375, + "epoch": 0.5792824074074074, + "grad_norm": 6.140981674194336, + "kl": 61.5, + "learning_rate": 8.33821704676049e-06, + "loss": 0.0615, + "reward": 0.03466873907018453, + "reward_std": 0.10780730843544006, + "rewards/ndcg_rule_reward": -0.021971882320940495, + "rewards/rule_reward": 0.056640625, + "step": 1001, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.189453125, + "epoch": 0.5798611111111112, + "grad_norm": 2.3532798290252686, + "kl": 35.1875, + "learning_rate": 8.3347268330489e-06, + "loss": 0.0352, + "reward": 0.11962208151817322, + "reward_std": 0.13967174664139748, + "rewards/ndcg_rule_reward": -0.02490917406976223, + "rewards/rule_reward": 0.14453125, + "step": 1002, + "token_diversity": 0.35450487012987014 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.134765625, + "epoch": 0.5804398148148148, + "grad_norm": 2.11910343170166, + "kl": 32.0625, + "learning_rate": 8.331233690117533e-06, + "loss": 0.032, + "reward": 0.08667367487214506, + "reward_std": 0.14072906225919724, + "rewards/ndcg_rule_reward": -0.02660757303237915, + "rewards/rule_reward": 0.11328125, + "step": 1003, + "token_diversity": 0.36495535714285715 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.087890625, + "epoch": 0.5810185185185185, + "grad_norm": 1.6001932621002197, + "kl": 40.75, + "learning_rate": 8.327737621034761e-06, + "loss": 0.0408, + "reward": 0.056619087350554764, + "reward_std": 0.08964589238166809, + "rewards/ndcg_rule_reward": -0.015646533574908972, + "rewards/rule_reward": 0.072265625, + "step": 1004, + "token_diversity": 0.4453125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0625, + "epoch": 0.5815972222222222, + "grad_norm": 1.7848604917526245, + "kl": 38.8125, + "learning_rate": 8.324238628871526e-06, + "loss": 0.0388, + "reward": 0.044913413003087044, + "reward_std": 0.11839285865426064, + "rewards/ndcg_rule_reward": -0.02149283792823553, + "rewards/rule_reward": 0.06640625, + "step": 1005, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.095703125, + "epoch": 0.5821759259259259, + "grad_norm": 5.974712371826172, + "kl": 47.0625, + "learning_rate": 8.320736716701336e-06, + "loss": 0.0472, + "reward": 0.06170429289340973, + "reward_std": 0.13629010319709778, + "rewards/ndcg_rule_reward": -0.026186330243945122, + "rewards/rule_reward": 0.087890625, + "step": 1006, + "token_diversity": 0.484375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.5827546296296297, + "grad_norm": 3.2107667922973633, + "kl": 39.4375, + "learning_rate": 8.317231887600273e-06, + "loss": 0.0394, + "reward": 0.06040681153535843, + "reward_std": 0.11162005364894867, + "rewards/ndcg_rule_reward": -0.02162443706765771, + "rewards/rule_reward": 0.08203125, + "step": 1007, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.796875, + "completion_length": 5.1953125, + "epoch": 0.5833333333333334, + "grad_norm": 3.8458428382873535, + "kl": 38.25, + "learning_rate": 8.313724144646965e-06, + "loss": 0.0382, + "reward": 0.12171119987033308, + "reward_std": 0.1366414651274681, + "rewards/ndcg_rule_reward": -0.024773172102868557, + "rewards/rule_reward": 0.146484375, + "step": 1008, + "token_diversity": 0.2647270114942529 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09375, + "epoch": 0.5839120370370371, + "grad_norm": 3.192847728729248, + "kl": 38.75, + "learning_rate": 8.310213490922616e-06, + "loss": 0.0387, + "reward": 0.061976321740075946, + "reward_std": 0.13613354787230492, + "rewards/ndcg_rule_reward": -0.025914296507835388, + "rewards/rule_reward": 0.087890625, + "step": 1009, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5844907407407407, + "grad_norm": 6.030204772949219, + "kl": 42.625, + "learning_rate": 8.306699929510978e-06, + "loss": 0.0428, + "reward": 0.004470547661185265, + "reward_std": 0.13254518806934357, + "rewards/ndcg_rule_reward": -0.026779452338814735, + "rewards/rule_reward": 0.03125, + "step": 1010, + "token_diversity": 0.46484375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.1796875, + "epoch": 0.5850694444444444, + "grad_norm": 2.3431661128997803, + "kl": 22.1875, + "learning_rate": 8.303183463498357e-06, + "loss": 0.0222, + "reward": 0.11452769488096237, + "reward_std": 0.11418202891945839, + "rewards/ndcg_rule_reward": -0.016331681981682777, + "rewards/rule_reward": 0.130859375, + "step": 1011, + "token_diversity": 0.3638980263157895 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.046875, + "epoch": 0.5856481481481481, + "grad_norm": 1.966410517692566, + "kl": 27.375, + "learning_rate": 8.299664095973615e-06, + "loss": 0.0274, + "reward": 0.03345418535172939, + "reward_std": 0.10822586342692375, + "rewards/ndcg_rule_reward": -0.019280191510915756, + "rewards/rule_reward": 0.052734375, + "step": 1012, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.080078125, + "epoch": 0.5862268518518519, + "grad_norm": 2.0816638469696045, + "kl": 38.75, + "learning_rate": 8.296141830028157e-06, + "loss": 0.0386, + "reward": 0.05030527478083968, + "reward_std": 0.0999334380030632, + "rewards/ndcg_rule_reward": -0.016100979410111904, + "rewards/rule_reward": 0.06640625, + "step": 1013, + "token_diversity": 0.54296875 + }, + { + "categorical_diversity": 0.796875, + "completion_length": 5.193359375, + "epoch": 0.5868055555555556, + "grad_norm": 1.3074026107788086, + "kl": 27.625, + "learning_rate": 8.292616668755939e-06, + "loss": 0.0276, + "reward": 0.12148564867675304, + "reward_std": 0.09652867913246155, + "rewards/ndcg_rule_reward": -0.01718622539192438, + "rewards/rule_reward": 0.138671875, + "step": 1014, + "token_diversity": 0.2646251089799477 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.103515625, + "epoch": 0.5873842592592593, + "grad_norm": 2.221428871154785, + "kl": 31.375, + "learning_rate": 8.289088615253458e-06, + "loss": 0.0314, + "reward": 0.06503114884253591, + "reward_std": 0.09138612076640129, + "rewards/ndcg_rule_reward": -0.018953227903693914, + "rewards/rule_reward": 0.083984375, + "step": 1015, + "token_diversity": 0.4921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5879629629629629, + "grad_norm": 2.1486775875091553, + "kl": 20.5, + "learning_rate": 8.28555767261975e-06, + "loss": 0.0205, + "reward": 0.034633987001143396, + "reward_std": 0.11624680832028389, + "rewards/ndcg_rule_reward": -0.02395976148545742, + "rewards/rule_reward": 0.05859375, + "step": 1016, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09375, + "epoch": 0.5885416666666666, + "grad_norm": 2.1563897132873535, + "kl": 54.375, + "learning_rate": 8.282023843956392e-06, + "loss": 0.0544, + "reward": 0.061331362230703235, + "reward_std": 0.11120826378464699, + "rewards/ndcg_rule_reward": -0.020699886605143547, + "rewards/rule_reward": 0.08203125, + "step": 1017, + "token_diversity": 0.35709212662337664 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1484375, + "epoch": 0.5891203703703703, + "grad_norm": 2.4413561820983887, + "kl": 49.625, + "learning_rate": 8.278487132367494e-06, + "loss": 0.0497, + "reward": 0.09466747380793095, + "reward_std": 0.10820162668824196, + "rewards/ndcg_rule_reward": -0.02056690026074648, + "rewards/rule_reward": 0.115234375, + "step": 1018, + "token_diversity": 0.51171875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.09765625, + "epoch": 0.5896990740740741, + "grad_norm": 3.504434585571289, + "kl": 39.6875, + "learning_rate": 8.2749475409597e-06, + "loss": 0.0396, + "reward": 0.06514130160212517, + "reward_std": 0.15786123275756836, + "rewards/ndcg_rule_reward": -0.030561822466552258, + "rewards/rule_reward": 0.095703125, + "step": 1019, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.1171875, + "epoch": 0.5902777777777778, + "grad_norm": 2.2151448726654053, + "kl": 41.875, + "learning_rate": 8.271405072842183e-06, + "loss": 0.0419, + "reward": 0.07753012329339981, + "reward_std": 0.13094279915094376, + "rewards/ndcg_rule_reward": -0.02207924984395504, + "rewards/rule_reward": 0.099609375, + "step": 1020, + "token_diversity": 0.46875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.154296875, + "epoch": 0.5908564814814815, + "grad_norm": 2.1057279109954834, + "kl": 35.9375, + "learning_rate": 8.26785973112664e-06, + "loss": 0.0359, + "reward": 0.09958687797188759, + "reward_std": 0.11907833069562912, + "rewards/ndcg_rule_reward": -0.02345999889075756, + "rewards/rule_reward": 0.123046875, + "step": 1021, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.5914351851851852, + "grad_norm": 2.2893011569976807, + "kl": 43.375, + "learning_rate": 8.264311518927295e-06, + "loss": 0.0434, + "reward": 0.03453264129348099, + "reward_std": 0.09945220500230789, + "rewards/ndcg_rule_reward": -0.020154857076704502, + "rewards/rule_reward": 0.0546875, + "step": 1022, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 0.953125, + "completion_length": 5.037109375, + "epoch": 0.5920138888888888, + "grad_norm": 2.286802053451538, + "kl": 18.375, + "learning_rate": 8.260760439360894e-06, + "loss": 0.0183, + "reward": 0.029164484178181738, + "reward_std": 0.1114015281200409, + "rewards/ndcg_rule_reward": -0.021616763435304165, + "rewards/rule_reward": 0.05078125, + "step": 1023, + "token_diversity": 0.3995269982993197 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.05078125, + "epoch": 0.5925925925925926, + "grad_norm": 2.335581064224243, + "kl": 37.75, + "learning_rate": 8.257206495546701e-06, + "loss": 0.0377, + "reward": 0.03547460958361626, + "reward_std": 0.14111118018627167, + "rewards/ndcg_rule_reward": -0.028978517279028893, + "rewards/rule_reward": 0.064453125, + "step": 1024, + "token_diversity": 0.34212662337662336 + }, + { + "categorical_diversity": 0.875, + "completion_length": 5.09765625, + "epoch": 0.5931712962962963, + "grad_norm": 2.556367874145508, + "kl": 1.42578125, + "learning_rate": 8.253649690606495e-06, + "loss": 0.0014, + "reward": 0.06407167389988899, + "reward_std": 0.12470360845327377, + "rewards/ndcg_rule_reward": -0.023818946443498135, + "rewards/rule_reward": 0.087890625, + "step": 1025, + "token_diversity": 0.38047889610389607 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.201171875, + "epoch": 0.59375, + "grad_norm": 2.0514869689941406, + "kl": 1.33984375, + "learning_rate": 8.25009002766457e-06, + "loss": 0.0013, + "reward": 0.12622245773673058, + "reward_std": 0.11644013971090317, + "rewards/ndcg_rule_reward": -0.022215040400624275, + "rewards/rule_reward": 0.1484375, + "step": 1026, + "token_diversity": 0.49609375 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.046875, + "epoch": 0.5943287037037037, + "grad_norm": 2.2785866260528564, + "kl": 1.66015625, + "learning_rate": 8.246527509847729e-06, + "loss": 0.0017, + "reward": 0.033022947143763304, + "reward_std": 0.12463254481554031, + "rewards/ndcg_rule_reward": -0.02361767739057541, + "rewards/rule_reward": 0.056640625, + "step": 1027, + "token_diversity": 0.4091282894736842 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.23046875, + "epoch": 0.5949074074074074, + "grad_norm": 2.2549045085906982, + "kl": 2.8046875, + "learning_rate": 8.24296214028528e-06, + "loss": 0.0028, + "reward": 0.12272275984287262, + "reward_std": 0.0948888510465622, + "rewards/ndcg_rule_reward": -0.01790224015712738, + "rewards/rule_reward": 0.140625, + "step": 1028, + "token_diversity": 0.50390625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5954861111111112, + "grad_norm": 1.6789252758026123, + "kl": 1.95703125, + "learning_rate": 8.239393922109045e-06, + "loss": 0.002, + "reward": 0.0031951526179909706, + "reward_std": 0.0994829498231411, + "rewards/ndcg_rule_reward": -0.02024234738200903, + "rewards/rule_reward": 0.0234375, + "step": 1029, + "token_diversity": 0.47265625 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5960648148148148, + "grad_norm": 2.7565813064575195, + "kl": 6.984375, + "learning_rate": 8.235822858453337e-06, + "loss": 0.007, + "reward": 0.005207367241382599, + "reward_std": 0.1490745097398758, + "rewards/ndcg_rule_reward": -0.0299488827586174, + "rewards/rule_reward": 0.03515625, + "step": 1030, + "token_diversity": 0.4609375 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.431640625, + "epoch": 0.5966435185185185, + "grad_norm": 2.47145676612854, + "kl": 10.0, + "learning_rate": 8.232248952454978e-06, + "loss": 0.01, + "reward": 0.09615654568187892, + "reward_std": 0.10869968309998512, + "rewards/ndcg_rule_reward": -0.02298407955095172, + "rewards/rule_reward": 0.119140625, + "step": 1031, + "token_diversity": 0.48046875 + }, + { + "categorical_diversity": 0.890625, + "completion_length": 5.19921875, + "epoch": 0.5972222222222222, + "grad_norm": 2.4344985485076904, + "kl": 14.375, + "learning_rate": 8.228672207253277e-06, + "loss": 0.0144, + "reward": 0.1255122497677803, + "reward_std": 0.10836193710565567, + "rewards/ndcg_rule_reward": -0.020972132682800293, + "rewards/rule_reward": 0.146484375, + "step": 1032, + "token_diversity": 0.3784950657894737 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.419921875, + "epoch": 0.5978009259259259, + "grad_norm": 2.948340654373169, + "kl": 23.125, + "learning_rate": 8.225092625990047e-06, + "loss": 0.0231, + "reward": 0.0036409670719876885, + "reward_std": 0.12453600391745567, + "rewards/ndcg_rule_reward": -0.02565590664744377, + "rewards/rule_reward": 0.029296875, + "step": 1033, + "token_diversity": 0.44921875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 9.24609375, + "epoch": 0.5983796296296297, + "grad_norm": 3.0138611793518066, + "kl": 36.4375, + "learning_rate": 8.221510211809585e-06, + "loss": 0.0364, + "reward": 0.03366720490157604, + "reward_std": 0.09142060950398445, + "rewards/ndcg_rule_reward": -0.019067171961069107, + "rewards/rule_reward": 0.052734375, + "step": 1034, + "token_diversity": 0.421875 + }, + { + "categorical_diversity": 1.0, + "completion_length": 9.552734375, + "epoch": 0.5989583333333334, + "grad_norm": 3.2268106937408447, + "kl": 31.1875, + "learning_rate": 8.217924967858682e-06, + "loss": 0.0312, + "reward": 0.0019384484621696174, + "reward_std": 0.06641250476241112, + "rewards/ndcg_rule_reward": -0.013686551712453365, + "rewards/rule_reward": 0.015625, + "step": 1035, + "token_diversity": 0.51953125 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.0, + "epoch": 0.5995370370370371, + "grad_norm": 2.9225356578826904, + "kl": 31.25, + "learning_rate": 8.21433689728661e-06, + "loss": 0.0312, + "reward": 0.004780902061611414, + "reward_std": 0.13241463527083397, + "rewards/ndcg_rule_reward": -0.0264690974727273, + "rewards/rule_reward": 0.03125, + "step": 1036, + "token_diversity": 0.53515625 + }, + { + "categorical_diversity": 0.90625, + "completion_length": 5.3671875, + "epoch": 0.6001157407407407, + "grad_norm": 2.411831855773926, + "kl": 43.875, + "learning_rate": 8.210746003245126e-06, + "loss": 0.0438, + "reward": 0.00445587863214314, + "reward_std": 0.14937318861484528, + "rewards/ndcg_rule_reward": -0.030700373463332653, + "rewards/rule_reward": 0.03515625, + "step": 1037, + "token_diversity": 0.27571202531645567 + }, + { + "categorical_diversity": 1.0, + "completion_length": 5.05078125, + "epoch": 0.6006944444444444, + "grad_norm": 3.192974328994751, + "kl": 28.875, + "learning_rate": 8.207152288888464e-06, + "loss": 0.0289, + "reward": 0.03341930150054395, + "reward_std": 0.09996428713202477, + "rewards/ndcg_rule_reward": -0.021268192678689957, + "rewards/rule_reward": 0.0546875, + "step": 1038, + "token_diversity": 0.48828125 + }, + { + "epoch": 0.6006944444444444, + "eval_categorical_diversity": 1.0, + "eval_completion_length": 5.0, + "eval_kl": 26.121753246753247, + "eval_loss": 0.026170117780566216, + "eval_reward": 0.0020334478724723707, + "eval_reward_std": 0.06490304773407322, + "eval_rewards/ndcg_rule_reward": -0.0132491211951166, + "eval_rewards/rule_reward": 0.015282568993506494, + "eval_runtime": 92.1805, + "eval_samples_per_second": 52.788, + "eval_steps_per_second": 0.054, + "eval_token_diversity": 0.37469561688311687, + "step": 1038 + } + ], + "logging_steps": 1, + "max_steps": 3456, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 346, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}