watermelonhjg committed on
Commit
8594e3b
·
verified ·
1 Parent(s): 79c9688

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-3B-Instruct
3
+ datasets: xiaodongguaAIGC/X-R1-TAL-SCQ5K
4
+ library_name: transformers
5
+ tags:
6
+ - generated_from_trainer
7
+ - X-R1
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for None
12
+
13
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) on the [xiaodongguaAIGC/X-R1-TAL-SCQ5K](https://huggingface.co/datasets/xiaodongguaAIGC/X-R1-TAL-SCQ5K) dataset.
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="None", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/watermelonhjg/huggingface/runs/8spp7whw)
30
+
31
+
32
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.15.0
37
+ - Transformers: 4.48.2
38
+ - PyTorch: 2.5.1
39
+ - Datasets: 3.3.0
40
+ - Tokenizers: 0.21.0
41
+
42
+ ## Citations
43
+
44
+ Cite GRPO as:
45
+
46
+ ```bibtex
47
+ @article{zhihong2024deepseekmath,
48
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
49
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
50
+ year = 2024,
51
+ eprint = {arXiv:2402.03300},
52
+ }
53
+
54
+ ```
55
+
56
+ Cite TRL as:
57
+
58
+ ```bibtex
59
+ @misc{vonwerra2022trl,
60
+ title = {{TRL: Transformer Reinforcement Learning}},
61
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
62
+ year = 2020,
63
+ journal = {GitHub repository},
64
+ publisher = {GitHub},
65
+ howpublished = {\url{https://github.com/huggingface/trl}}
66
+ }
67
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.283822166296343,
4
+ "train_runtime": 92192.5552,
5
+ "train_samples": 6000,
6
+ "train_samples_per_second": 0.195,
7
+ "train_steps_per_second": 0.024
8
+ }
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-3B-Instruct",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 11008,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 70,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 36,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.48.2",
26
+ "use_cache": true,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 151936
29
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.48.2"
14
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d91d5ad0f17fdf5598034f8fea6a53af405ae40ee4066ea33099d333c2921384
3
+ size 4957560304
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d306f3ccebea9f8b304bb2c03004b191c34c6a64e13f4d52dbc17ffd72122abb
3
+ size 1214366696
model.safetensors.index.json ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6171877376
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
7
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
52
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
54
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
64
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
66
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
76
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
78
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
85
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
88
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
90
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
97
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
100
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
102
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
109
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
112
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
114
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
121
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
124
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
126
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
133
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
136
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
138
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
145
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
148
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
150
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
157
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
160
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
162
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
169
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
172
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
174
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
181
+ "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
184
+ "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
186
+ "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
189
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
193
+ "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
196
+ "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
198
+ "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
199
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
200
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
201
+ "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
202
+ "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
205
+ "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
206
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
208
+ "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
210
+ "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
211
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
212
+ "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
215
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
216
+ "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
217
+ "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
220
+ "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
222
+ "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
225
+ "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
227
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
228
+ "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
229
+ "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
230
+ "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
231
+ "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
232
+ "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
233
+ "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
234
+ "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
235
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
236
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
237
+ "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
238
+ "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
239
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
240
+ "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
241
+ "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
242
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
243
+ "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
244
+ "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
245
+ "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
246
+ "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
247
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
248
+ "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
249
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
250
+ "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
251
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
252
+ "model.layers.27.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
253
+ "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
254
+ "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
255
+ "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
256
+ "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
257
+ "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
258
+ "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
259
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
262
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
263
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
264
+ "model.layers.28.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
265
+ "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.28.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
268
+ "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.28.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
270
+ "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
272
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
277
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
280
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
282
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
284
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
285
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
286
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
288
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
289
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
290
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
292
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
294
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
295
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
296
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
298
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
301
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.30.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
304
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
306
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
307
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
313
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.31.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
316
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.31.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
318
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
325
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.32.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
328
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
330
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.33.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
337
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
339
+ "model.layers.33.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
340
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
341
+ "model.layers.33.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
342
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
343
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
344
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
345
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
346
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
347
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
348
+ "model.layers.34.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
349
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
350
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
351
+ "model.layers.34.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
352
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
353
+ "model.layers.34.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
354
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
355
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
356
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
357
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
358
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
359
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
360
+ "model.layers.35.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
361
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
362
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
363
+ "model.layers.35.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
364
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
365
+ "model.layers.35.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
366
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
367
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
368
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
369
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
370
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
373
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
376
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
378
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
379
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
384
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
385
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
386
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
388
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
390
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
391
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
392
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
397
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
398
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
400
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
402
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
403
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
404
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
405
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
406
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
407
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
408
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
409
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
410
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
411
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
412
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
413
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
414
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
415
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
416
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
417
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
418
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
419
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
420
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
421
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
422
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
423
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
424
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
425
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
426
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
427
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
428
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
429
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
430
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
431
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
432
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
433
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
434
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
435
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
436
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
437
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
438
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
439
+ "model.norm.weight": "model-00002-of-00002.safetensors"
440
+ }
441
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eee858c5123a4279c3e1f7b81247343f356ac767940b2692a928ad929543214
3
+ size 11422063
tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "padding_side": "left",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.283822166296343,
4
+ "train_runtime": 92192.5552,
5
+ "train_samples": 6000,
6
+ "train_samples_per_second": 0.195,
7
+ "train_steps_per_second": 0.024
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2967 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 10,
6
+ "global_step": 2250,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 358.0412570953369,
13
+ "epoch": 0.013333333333333334,
14
+ "grad_norm": 0.1626528948545456,
15
+ "kl": 0.0003269195556640625,
16
+ "learning_rate": 1.3333333333333334e-07,
17
+ "loss": 0.0023,
18
+ "reward": 0.16000000322237612,
19
+ "reward_std": 0.25307216234505175,
20
+ "rewards/accuracy_reward": 0.09875000147148967,
21
+ "rewards/format_reward": 0.061250001285225154,
22
+ "step": 10
23
+ },
24
+ {
25
+ "completion_length": 365.58875579833983,
26
+ "epoch": 0.02666666666666667,
27
+ "grad_norm": 0.0887719914317131,
28
+ "kl": 0.0004455804824829102,
29
+ "learning_rate": 2.6666666666666667e-07,
30
+ "loss": 0.0173,
31
+ "reward": 0.1725000019185245,
32
+ "reward_std": 0.20696474984288216,
33
+ "rewards/accuracy_reward": 0.11375000216066837,
34
+ "rewards/format_reward": 0.058750000689178704,
35
+ "step": 20
36
+ },
37
+ {
38
+ "completion_length": 364.56750640869143,
39
+ "epoch": 0.04,
40
+ "grad_norm": 0.4593224823474884,
41
+ "kl": 0.003619217872619629,
42
+ "learning_rate": 4e-07,
43
+ "loss": 0.0084,
44
+ "reward": 0.2150000055320561,
45
+ "reward_std": 0.31464808210730555,
46
+ "rewards/accuracy_reward": 0.12000000262632966,
47
+ "rewards/format_reward": 0.09500000132247806,
48
+ "step": 30
49
+ },
50
+ {
51
+ "completion_length": 380.52500648498534,
52
+ "epoch": 0.05333333333333334,
53
+ "grad_norm": 0.3159937262535095,
54
+ "kl": 0.04821491241455078,
55
+ "learning_rate": 5.333333333333333e-07,
56
+ "loss": 0.0439,
57
+ "reward": 0.323750005569309,
58
+ "reward_std": 0.38860970810055734,
59
+ "rewards/accuracy_reward": 0.13250000309199095,
60
+ "rewards/format_reward": 0.19125000201165676,
61
+ "step": 40
62
+ },
63
+ {
64
+ "completion_length": 322.1050077438355,
65
+ "epoch": 0.06666666666666667,
66
+ "grad_norm": 0.6419838070869446,
67
+ "kl": 0.013416862487792969,
68
+ "learning_rate": 6.666666666666666e-07,
69
+ "loss": 0.0551,
70
+ "reward": 0.31375000393018126,
71
+ "reward_std": 0.3664026964455843,
72
+ "rewards/accuracy_reward": 0.11875000335276127,
73
+ "rewards/format_reward": 0.19500000402331352,
74
+ "step": 50
75
+ },
76
+ {
77
+ "completion_length": 313.8512550354004,
78
+ "epoch": 0.08,
79
+ "grad_norm": 0.43794259428977966,
80
+ "kl": 0.031152725219726562,
81
+ "learning_rate": 8e-07,
82
+ "loss": 0.044,
83
+ "reward": 0.5187500116415322,
84
+ "reward_std": 0.3655149843543768,
85
+ "rewards/accuracy_reward": 0.14000000162050127,
86
+ "rewards/format_reward": 0.37875000573694706,
87
+ "step": 60
88
+ },
89
+ {
90
+ "completion_length": 334.03500576019286,
91
+ "epoch": 0.09333333333333334,
92
+ "grad_norm": 0.43774211406707764,
93
+ "kl": 0.034784889221191405,
94
+ "learning_rate": 9.333333333333333e-07,
95
+ "loss": 0.0763,
96
+ "reward": 0.4912500069476664,
97
+ "reward_std": 0.418181811645627,
98
+ "rewards/accuracy_reward": 0.09625000236555933,
99
+ "rewards/format_reward": 0.3950000065378845,
100
+ "step": 70
101
+ },
102
+ {
103
+ "completion_length": 292.93375568389894,
104
+ "epoch": 0.10666666666666667,
105
+ "grad_norm": 1.4061037302017212,
106
+ "kl": 0.04006805419921875,
107
+ "learning_rate": 1.0666666666666667e-06,
108
+ "loss": 0.0895,
109
+ "reward": 0.6825000144541263,
110
+ "reward_std": 0.40880861394107343,
111
+ "rewards/accuracy_reward": 0.09875000203028321,
112
+ "rewards/format_reward": 0.5837500099092722,
113
+ "step": 80
114
+ },
115
+ {
116
+ "completion_length": 328.5087568283081,
117
+ "epoch": 0.12,
118
+ "grad_norm": 0.4465954303741455,
119
+ "kl": 0.0293853759765625,
120
+ "learning_rate": 1.2000000000000002e-06,
121
+ "loss": 0.1086,
122
+ "reward": 0.6250000076368452,
123
+ "reward_std": 0.42218363620340826,
124
+ "rewards/accuracy_reward": 0.12625000178813933,
125
+ "rewards/format_reward": 0.4987500081770122,
126
+ "step": 90
127
+ },
128
+ {
129
+ "completion_length": 271.25250511169435,
130
+ "epoch": 0.13333333333333333,
131
+ "grad_norm": 0.4239254891872406,
132
+ "kl": 0.0445556640625,
133
+ "learning_rate": 1.3333333333333332e-06,
134
+ "loss": 0.0794,
135
+ "reward": 0.8187500163912773,
136
+ "reward_std": 0.4009550239890814,
137
+ "rewards/accuracy_reward": 0.12000000113621354,
138
+ "rewards/format_reward": 0.6987500078976154,
139
+ "step": 100
140
+ },
141
+ {
142
+ "completion_length": 238.7037540435791,
143
+ "epoch": 0.14666666666666667,
144
+ "grad_norm": 0.45924320816993713,
145
+ "kl": 0.045458984375,
146
+ "learning_rate": 1.4666666666666667e-06,
147
+ "loss": 0.0688,
148
+ "reward": 0.9087500125169754,
149
+ "reward_std": 0.2800120744854212,
150
+ "rewards/accuracy_reward": 0.0662500012665987,
151
+ "rewards/format_reward": 0.8425000071525574,
152
+ "step": 110
153
+ },
154
+ {
155
+ "completion_length": 251.6537540435791,
156
+ "epoch": 0.16,
157
+ "grad_norm": 0.4579772353172302,
158
+ "kl": 0.0559661865234375,
159
+ "learning_rate": 1.6e-06,
160
+ "loss": 0.0552,
161
+ "reward": 0.9850000143051147,
162
+ "reward_std": 0.28530918546020984,
163
+ "rewards/accuracy_reward": 0.11875000353902579,
164
+ "rewards/format_reward": 0.8662500090897083,
165
+ "step": 120
166
+ },
167
+ {
168
+ "completion_length": 285.78625469207765,
169
+ "epoch": 0.17333333333333334,
170
+ "grad_norm": 0.4548390209674835,
171
+ "kl": 0.0518890380859375,
172
+ "learning_rate": 1.7333333333333332e-06,
173
+ "loss": 0.0473,
174
+ "reward": 0.9662500187754631,
175
+ "reward_std": 0.3944980699568987,
176
+ "rewards/accuracy_reward": 0.14875000305473804,
177
+ "rewards/format_reward": 0.8175000086426735,
178
+ "step": 130
179
+ },
180
+ {
181
+ "completion_length": 268.450004196167,
182
+ "epoch": 0.18666666666666668,
183
+ "grad_norm": 0.358254611492157,
184
+ "kl": 0.082470703125,
185
+ "learning_rate": 1.8666666666666667e-06,
186
+ "loss": 0.0507,
187
+ "reward": 1.0162500128149987,
188
+ "reward_std": 0.38786814287304877,
189
+ "rewards/accuracy_reward": 0.17125000292435288,
190
+ "rewards/format_reward": 0.8450000129640103,
191
+ "step": 140
192
+ },
193
+ {
194
+ "completion_length": 304.1125057220459,
195
+ "epoch": 0.2,
196
+ "grad_norm": 4.261535167694092,
197
+ "kl": 0.107891845703125,
198
+ "learning_rate": 2e-06,
199
+ "loss": 0.0398,
200
+ "reward": 1.1762500151991844,
201
+ "reward_std": 0.46659310162067413,
202
+ "rewards/accuracy_reward": 0.2962500025518239,
203
+ "rewards/format_reward": 0.8800000116229058,
204
+ "step": 150
205
+ },
206
+ {
207
+ "completion_length": 284.4462554931641,
208
+ "epoch": 0.21333333333333335,
209
+ "grad_norm": 0.29285335540771484,
210
+ "kl": 0.086627197265625,
211
+ "learning_rate": 2.1333333333333334e-06,
212
+ "loss": 0.0141,
213
+ "reward": 1.3200000196695327,
214
+ "reward_std": 0.4418893948197365,
215
+ "rewards/accuracy_reward": 0.37750000674277545,
216
+ "rewards/format_reward": 0.9425000041723252,
217
+ "step": 160
218
+ },
219
+ {
220
+ "completion_length": 279.2050067901611,
221
+ "epoch": 0.22666666666666666,
222
+ "grad_norm": 0.47264882922172546,
223
+ "kl": 0.100860595703125,
224
+ "learning_rate": 2.266666666666667e-06,
225
+ "loss": 0.051,
226
+ "reward": 1.3650000289082527,
227
+ "reward_std": 0.4196435324847698,
228
+ "rewards/accuracy_reward": 0.4200000060722232,
229
+ "rewards/format_reward": 0.9450000047683715,
230
+ "step": 170
231
+ },
232
+ {
233
+ "completion_length": 266.97375679016113,
234
+ "epoch": 0.24,
235
+ "grad_norm": 0.31524142622947693,
236
+ "kl": 0.1145751953125,
237
+ "learning_rate": 2.4000000000000003e-06,
238
+ "loss": 0.0426,
239
+ "reward": 1.4912500247359275,
240
+ "reward_std": 0.38923392072319984,
241
+ "rewards/accuracy_reward": 0.543750009778887,
242
+ "rewards/format_reward": 0.9475000031292439,
243
+ "step": 180
244
+ },
245
+ {
246
+ "completion_length": 340.537505531311,
247
+ "epoch": 0.25333333333333335,
248
+ "grad_norm": 0.28264713287353516,
249
+ "kl": 0.1277587890625,
250
+ "learning_rate": 2.5333333333333334e-06,
251
+ "loss": 0.0855,
252
+ "reward": 1.2712500244379044,
253
+ "reward_std": 0.49123715460300443,
254
+ "rewards/accuracy_reward": 0.4312500088475645,
255
+ "rewards/format_reward": 0.840000007301569,
256
+ "step": 190
257
+ },
258
+ {
259
+ "completion_length": 298.52000389099123,
260
+ "epoch": 0.26666666666666666,
261
+ "grad_norm": 0.4394519627094269,
262
+ "kl": 0.16021728515625,
263
+ "learning_rate": 2.6666666666666664e-06,
264
+ "loss": 0.1097,
265
+ "reward": 1.2837500274181366,
266
+ "reward_std": 0.5194122649729251,
267
+ "rewards/accuracy_reward": 0.43875000802800057,
268
+ "rewards/format_reward": 0.845000010728836,
269
+ "step": 200
270
+ },
271
+ {
272
+ "completion_length": 337.09625549316405,
273
+ "epoch": 0.28,
274
+ "grad_norm": 0.4397446811199188,
275
+ "kl": 0.286328125,
276
+ "learning_rate": 2.8000000000000003e-06,
277
+ "loss": 0.1787,
278
+ "reward": 1.057500022649765,
279
+ "reward_std": 0.6535363413393498,
280
+ "rewards/accuracy_reward": 0.3850000096485019,
281
+ "rewards/format_reward": 0.6725000105798244,
282
+ "step": 210
283
+ },
284
+ {
285
+ "completion_length": 195.2412536621094,
286
+ "epoch": 0.29333333333333333,
287
+ "grad_norm": 1.9711108207702637,
288
+ "kl": 3.52977294921875,
289
+ "learning_rate": 2.9333333333333333e-06,
290
+ "loss": 0.0607,
291
+ "reward": 1.2225000165402888,
292
+ "reward_std": 0.504400571808219,
293
+ "rewards/accuracy_reward": 0.41500000692903993,
294
+ "rewards/format_reward": 0.8075000129640102,
295
+ "step": 220
296
+ },
297
+ {
298
+ "completion_length": 197.4412525177002,
299
+ "epoch": 0.30666666666666664,
300
+ "grad_norm": 2256.19873046875,
301
+ "kl": 17.518798828125,
302
+ "learning_rate": 2.9999548717196514e-06,
303
+ "loss": 0.5765,
304
+ "reward": 1.3587500154972076,
305
+ "reward_std": 0.4323975473642349,
306
+ "rewards/accuracy_reward": 0.4712500068359077,
307
+ "rewards/format_reward": 0.8875000089406967,
308
+ "step": 230
309
+ },
310
+ {
311
+ "completion_length": 222.5012535095215,
312
+ "epoch": 0.32,
313
+ "grad_norm": 0.370374858379364,
314
+ "kl": 0.28663330078125,
315
+ "learning_rate": 2.9995938617691924e-06,
316
+ "loss": 0.0185,
317
+ "reward": 1.3312500312924385,
318
+ "reward_std": 0.5138044867664575,
319
+ "rewards/accuracy_reward": 0.45125000588595865,
320
+ "rewards/format_reward": 0.8800000086426735,
321
+ "step": 240
322
+ },
323
+ {
324
+ "completion_length": 213.4362533569336,
325
+ "epoch": 0.3333333333333333,
326
+ "grad_norm": 0.3446196913719177,
327
+ "kl": 0.21429443359375,
328
+ "learning_rate": 2.9988719287563454e-06,
329
+ "loss": 0.0129,
330
+ "reward": 1.4750000223517419,
331
+ "reward_std": 0.3717566329985857,
332
+ "rewards/accuracy_reward": 0.5112500058487057,
333
+ "rewards/format_reward": 0.9637500002980233,
334
+ "step": 250
335
+ },
336
+ {
337
+ "completion_length": 252.42875595092772,
338
+ "epoch": 0.3466666666666667,
339
+ "grad_norm": 2.4441771507263184,
340
+ "kl": 1.01444091796875,
341
+ "learning_rate": 2.9977892464363372e-06,
342
+ "loss": 0.0758,
343
+ "reward": 1.4012500166893005,
344
+ "reward_std": 0.4755144312977791,
345
+ "rewards/accuracy_reward": 0.4700000075623393,
346
+ "rewards/format_reward": 0.9312500089406968,
347
+ "step": 260
348
+ },
349
+ {
350
+ "completion_length": 247.25250587463378,
351
+ "epoch": 0.36,
352
+ "grad_norm": 2.5208985805511475,
353
+ "kl": 0.5568603515625,
354
+ "learning_rate": 2.9963460753897363e-06,
355
+ "loss": 0.0401,
356
+ "reward": 1.3375000193715096,
357
+ "reward_std": 0.4566458873450756,
358
+ "rewards/accuracy_reward": 0.4337500057183206,
359
+ "rewards/format_reward": 0.9037500128149987,
360
+ "step": 270
361
+ },
362
+ {
363
+ "completion_length": 246.50625495910646,
364
+ "epoch": 0.37333333333333335,
365
+ "grad_norm": 96.51844024658203,
366
+ "kl": 1.2575439453125,
367
+ "learning_rate": 2.9945427629597305e-06,
368
+ "loss": 0.0839,
369
+ "reward": 1.376250022649765,
370
+ "reward_std": 0.4465434730052948,
371
+ "rewards/accuracy_reward": 0.4450000057928264,
372
+ "rewards/format_reward": 0.9312500059604645,
373
+ "step": 280
374
+ },
375
+ {
376
+ "completion_length": 286.57250423431395,
377
+ "epoch": 0.38666666666666666,
378
+ "grad_norm": 0.3689205050468445,
379
+ "kl": 0.414300537109375,
380
+ "learning_rate": 2.992379743168532e-06,
381
+ "loss": 0.0462,
382
+ "reward": 1.4450000196695327,
383
+ "reward_std": 0.4324021231383085,
384
+ "rewards/accuracy_reward": 0.503750005364418,
385
+ "rewards/format_reward": 0.941250005364418,
386
+ "step": 290
387
+ },
388
+ {
389
+ "completion_length": 332.27125587463377,
390
+ "epoch": 0.4,
391
+ "grad_norm": 0.4576380252838135,
392
+ "kl": 0.3354248046875,
393
+ "learning_rate": 2.989857536612915e-06,
394
+ "loss": 0.0378,
395
+ "reward": 1.4400000303983689,
396
+ "reward_std": 0.39175261445343496,
397
+ "rewards/accuracy_reward": 0.4987500077113509,
398
+ "rewards/format_reward": 0.941250005364418,
399
+ "step": 300
400
+ },
401
+ {
402
+ "completion_length": 318.8962543487549,
403
+ "epoch": 0.41333333333333333,
404
+ "grad_norm": 0.44957035779953003,
405
+ "kl": 1.1047119140625,
406
+ "learning_rate": 2.9869767503389175e-06,
407
+ "loss": 0.0515,
408
+ "reward": 1.4150000244379044,
409
+ "reward_std": 0.4395812951028347,
410
+ "rewards/accuracy_reward": 0.4762500121258199,
411
+ "rewards/format_reward": 0.9387500062584877,
412
+ "step": 310
413
+ },
414
+ {
415
+ "completion_length": 293.3062553405762,
416
+ "epoch": 0.4266666666666667,
417
+ "grad_norm": 0.47710099816322327,
418
+ "kl": 8.84039306640625,
419
+ "learning_rate": 2.983738077695741e-06,
420
+ "loss": 0.4032,
421
+ "reward": 1.4200000286102294,
422
+ "reward_std": 0.40100472904741763,
423
+ "rewards/accuracy_reward": 0.4850000117905438,
424
+ "rewards/format_reward": 0.9350000098347664,
425
+ "step": 320
426
+ },
427
+ {
428
+ "completion_length": 311.5937572479248,
429
+ "epoch": 0.44,
430
+ "grad_norm": 4.005710124969482,
431
+ "kl": 9.49320068359375,
432
+ "learning_rate": 2.980142298168869e-06,
433
+ "loss": 0.5626,
434
+ "reward": 1.4375000283122064,
435
+ "reward_std": 0.46121844090521336,
436
+ "rewards/accuracy_reward": 0.5175000098533928,
437
+ "rewards/format_reward": 0.9200000077486038,
438
+ "step": 330
439
+ },
440
+ {
441
+ "completion_length": 313.74875526428224,
442
+ "epoch": 0.4533333333333333,
443
+ "grad_norm": 10.983561515808105,
444
+ "kl": 0.34615478515625,
445
+ "learning_rate": 2.976190277192465e-06,
446
+ "loss": 0.0495,
447
+ "reward": 1.4325000122189522,
448
+ "reward_std": 0.4127622898668051,
449
+ "rewards/accuracy_reward": 0.5262500100769103,
450
+ "rewards/format_reward": 0.9062500059604645,
451
+ "step": 340
452
+ },
453
+ {
454
+ "completion_length": 317.42875480651855,
455
+ "epoch": 0.4666666666666667,
456
+ "grad_norm": 1.4480901956558228,
457
+ "kl": 16.34658203125,
458
+ "learning_rate": 2.971882965941077e-06,
459
+ "loss": 0.7464,
460
+ "reward": 1.4687500238418578,
461
+ "reward_std": 0.44693985804915426,
462
+ "rewards/accuracy_reward": 0.5800000098533928,
463
+ "rewards/format_reward": 0.8887500122189522,
464
+ "step": 350
465
+ },
466
+ {
467
+ "completion_length": 345.25625648498533,
468
+ "epoch": 0.48,
469
+ "grad_norm": 0.3819202184677124,
470
+ "kl": 0.4673583984375,
471
+ "learning_rate": 2.9672214011007086e-06,
472
+ "loss": 0.0278,
473
+ "reward": 1.3925000250339508,
474
+ "reward_std": 0.36666983254253865,
475
+ "rewards/accuracy_reward": 0.45375000443309543,
476
+ "rewards/format_reward": 0.9387500062584877,
477
+ "step": 360
478
+ },
479
+ {
480
+ "completion_length": 308.32375564575193,
481
+ "epoch": 0.49333333333333335,
482
+ "grad_norm": 0.812126874923706,
483
+ "kl": 0.8212890625,
484
+ "learning_rate": 2.9622067046193085e-06,
485
+ "loss": 0.0635,
486
+ "reward": 1.4100000277161597,
487
+ "reward_std": 0.420729149505496,
488
+ "rewards/accuracy_reward": 0.46875001015141604,
489
+ "rewards/format_reward": 0.9412500023841858,
490
+ "step": 370
491
+ },
492
+ {
493
+ "completion_length": 298.4312526702881,
494
+ "epoch": 0.5066666666666667,
495
+ "grad_norm": 1.2376179695129395,
496
+ "kl": 1.26268310546875,
497
+ "learning_rate": 2.956840083436741e-06,
498
+ "loss": 0.0954,
499
+ "reward": 1.471250019967556,
500
+ "reward_std": 0.4468890752643347,
501
+ "rewards/accuracy_reward": 0.5375000098720193,
502
+ "rewards/format_reward": 0.933750006556511,
503
+ "step": 380
504
+ },
505
+ {
506
+ "completion_length": 277.4425052642822,
507
+ "epoch": 0.52,
508
+ "grad_norm": 1.022598147392273,
509
+ "kl": 2.3487060546875,
510
+ "learning_rate": 2.951122829194296e-06,
511
+ "loss": 0.1583,
512
+ "reward": 1.3462500274181366,
513
+ "reward_std": 0.5475293599069119,
514
+ "rewards/accuracy_reward": 0.4762500089593232,
515
+ "rewards/format_reward": 0.870000010728836,
516
+ "step": 390
517
+ },
518
+ {
519
+ "completion_length": 311.357506942749,
520
+ "epoch": 0.5333333333333333,
521
+ "grad_norm": 0.5792402625083923,
522
+ "kl": 0.7629638671875,
523
+ "learning_rate": 2.9450563179238205e-06,
524
+ "loss": 0.1346,
525
+ "reward": 1.3450000122189523,
526
+ "reward_std": 0.5392270684242249,
527
+ "rewards/accuracy_reward": 0.4387500088661909,
528
+ "rewards/format_reward": 0.9062500074505806,
529
+ "step": 400
530
+ },
531
+ {
532
+ "completion_length": 268.91125507354735,
533
+ "epoch": 0.5466666666666666,
534
+ "grad_norm": 2.355006694793701,
535
+ "kl": 10.3116455078125,
536
+ "learning_rate": 2.938642009716531e-06,
537
+ "loss": 0.7061,
538
+ "reward": 1.411250014603138,
539
+ "reward_std": 0.44688397236168387,
540
+ "rewards/accuracy_reward": 0.48625000976026056,
541
+ "rewards/format_reward": 0.9250000059604645,
542
+ "step": 410
543
+ },
544
+ {
545
+ "completion_length": 330.4012561798096,
546
+ "epoch": 0.56,
547
+ "grad_norm": 1.2017070055007935,
548
+ "kl": 2.034033203125,
549
+ "learning_rate": 2.9318814483715983e-06,
550
+ "loss": 0.3712,
551
+ "reward": 1.37750001847744,
552
+ "reward_std": 0.5567759402096272,
553
+ "rewards/accuracy_reward": 0.5162500075995922,
554
+ "rewards/format_reward": 0.8612500131130219,
555
+ "step": 420
556
+ },
557
+ {
558
+ "completion_length": 262.88875579833984,
559
+ "epoch": 0.5733333333333334,
560
+ "grad_norm": 8.315053939819336,
561
+ "kl": 1.03427734375,
562
+ "learning_rate": 2.924776261024586e-06,
563
+ "loss": 0.1372,
564
+ "reward": 1.5275000169873238,
565
+ "reward_std": 0.4006519988179207,
566
+ "rewards/accuracy_reward": 0.5787500092759729,
567
+ "rewards/format_reward": 0.948750002682209,
568
+ "step": 430
569
+ },
570
+ {
571
+ "completion_length": 250.29125347137452,
572
+ "epoch": 0.5866666666666667,
573
+ "grad_norm": 15.071596145629883,
574
+ "kl": 1.954150390625,
575
+ "learning_rate": 2.917328157755832e-06,
576
+ "loss": 0.1571,
577
+ "reward": 1.378750017285347,
578
+ "reward_std": 0.42997160777449606,
579
+ "rewards/accuracy_reward": 0.5175000073388218,
580
+ "rewards/format_reward": 0.8612500071525574,
581
+ "step": 440
582
+ },
583
+ {
584
+ "completion_length": 267.2262548446655,
585
+ "epoch": 0.6,
586
+ "grad_norm": 4.3944549560546875,
587
+ "kl": 3.891845703125,
588
+ "learning_rate": 2.9095389311788626e-06,
589
+ "loss": 0.2468,
590
+ "reward": 1.2350000157952308,
591
+ "reward_std": 0.5935393430292606,
592
+ "rewards/accuracy_reward": 0.48000000631436707,
593
+ "rewards/format_reward": 0.7550000131130219,
594
+ "step": 450
595
+ },
596
+ {
597
+ "completion_length": 232.88375339508056,
598
+ "epoch": 0.6133333333333333,
599
+ "grad_norm": 10.471932411193848,
600
+ "kl": 1.11915283203125,
601
+ "learning_rate": 2.9014104560089465e-06,
602
+ "loss": 0.1208,
603
+ "reward": 1.4125000298023225,
604
+ "reward_std": 0.4672528047114611,
605
+ "rewards/accuracy_reward": 0.5075000061653554,
606
+ "rewards/format_reward": 0.9050000071525574,
607
+ "step": 460
608
+ },
609
+ {
610
+ "completion_length": 266.50000495910643,
611
+ "epoch": 0.6266666666666667,
612
+ "grad_norm": 0.3841674029827118,
613
+ "kl": 1.0602783203125,
614
+ "learning_rate": 2.8929446886118866e-06,
615
+ "loss": 0.2335,
616
+ "reward": 1.4612500220537186,
617
+ "reward_std": 0.42485330663621423,
618
+ "rewards/accuracy_reward": 0.541250011883676,
619
+ "rewards/format_reward": 0.9200000077486038,
620
+ "step": 470
621
+ },
622
+ {
623
+ "completion_length": 256.0362522125244,
624
+ "epoch": 0.64,
625
+ "grad_norm": 0.425624281167984,
626
+ "kl": 0.7212890625,
627
+ "learning_rate": 2.8841436665331635e-06,
628
+ "loss": 0.1887,
629
+ "reward": 1.3937500238418579,
630
+ "reward_std": 0.48963438235223294,
631
+ "rewards/accuracy_reward": 0.4762500075623393,
632
+ "rewards/format_reward": 0.9175000041723251,
633
+ "step": 480
634
+ },
635
+ {
636
+ "completion_length": 278.662505531311,
637
+ "epoch": 0.6533333333333333,
638
+ "grad_norm": 0.6774723529815674,
639
+ "kl": 0.314697265625,
640
+ "learning_rate": 2.875009508007535e-06,
641
+ "loss": 0.2242,
642
+ "reward": 1.3637500166893006,
643
+ "reward_std": 0.5403582151979208,
644
+ "rewards/accuracy_reward": 0.5012500101700426,
645
+ "rewards/format_reward": 0.8625000104308128,
646
+ "step": 490
647
+ },
648
+ {
649
+ "completion_length": 295.65125541687013,
650
+ "epoch": 0.6666666666666666,
651
+ "grad_norm": 1.2344285249710083,
652
+ "kl": 0.48665771484375,
653
+ "learning_rate": 2.86554441144922e-06,
654
+ "loss": 0.2865,
655
+ "reward": 1.4550000220537185,
656
+ "reward_std": 0.4966724131256342,
657
+ "rewards/accuracy_reward": 0.5925000066868961,
658
+ "rewards/format_reward": 0.8625000059604645,
659
+ "step": 500
660
+ },
661
+ {
662
+ "completion_length": 309.08625679016114,
663
+ "epoch": 0.68,
664
+ "grad_norm": 16.80157470703125,
665
+ "kl": 2.29215087890625,
666
+ "learning_rate": 2.855750654922781e-06,
667
+ "loss": 0.4345,
668
+ "reward": 1.4012500315904617,
669
+ "reward_std": 0.5136609964072705,
670
+ "rewards/accuracy_reward": 0.556250006146729,
671
+ "rewards/format_reward": 0.8450000122189522,
672
+ "step": 510
673
+ },
674
+ {
675
+ "completion_length": 320.7975061416626,
676
+ "epoch": 0.6933333333333334,
677
+ "grad_norm": 6.8107590675354,
678
+ "kl": 2.5089599609375,
679
+ "learning_rate": 2.84563059559484e-06,
680
+ "loss": 0.5524,
681
+ "reward": 1.2937500305473804,
682
+ "reward_std": 0.5724672272801399,
683
+ "rewards/accuracy_reward": 0.49500000951811673,
684
+ "rewards/format_reward": 0.7987500131130219,
685
+ "step": 520
686
+ },
687
+ {
688
+ "completion_length": 312.92125396728517,
689
+ "epoch": 0.7066666666666667,
690
+ "grad_norm": 1.1629796028137207,
691
+ "kl": 1.2917724609375,
692
+ "learning_rate": 2.8351866691667543e-06,
693
+ "loss": 0.3865,
694
+ "reward": 1.1650000177323818,
695
+ "reward_std": 0.5869733650237322,
696
+ "rewards/accuracy_reward": 0.39125000657513737,
697
+ "rewards/format_reward": 0.7737500131130218,
698
+ "step": 530
699
+ },
700
+ {
701
+ "completion_length": 264.66250495910646,
702
+ "epoch": 0.72,
703
+ "grad_norm": 1.581681251525879,
704
+ "kl": 1.5130859375,
705
+ "learning_rate": 2.8244213892883906e-06,
706
+ "loss": 0.3496,
707
+ "reward": 1.4375000178813935,
708
+ "reward_std": 0.45811997428536416,
709
+ "rewards/accuracy_reward": 0.5450000078417361,
710
+ "rewards/format_reward": 0.8925000086426735,
711
+ "step": 540
712
+ },
713
+ {
714
+ "completion_length": 299.5112560272217,
715
+ "epoch": 0.7333333333333333,
716
+ "grad_norm": 1.8020122051239014,
717
+ "kl": 1.855810546875,
718
+ "learning_rate": 2.8133373469531365e-06,
719
+ "loss": 0.4243,
720
+ "reward": 1.37500002682209,
721
+ "reward_std": 0.4744249366223812,
722
+ "rewards/accuracy_reward": 0.5300000084564089,
723
+ "rewards/format_reward": 0.845000010728836,
724
+ "step": 550
725
+ },
726
+ {
727
+ "completion_length": 279.5200065612793,
728
+ "epoch": 0.7466666666666667,
729
+ "grad_norm": 1.1065877676010132,
730
+ "kl": 1.2305419921875,
731
+ "learning_rate": 2.801937209874301e-06,
732
+ "loss": 0.3489,
733
+ "reward": 1.3450000271201135,
734
+ "reward_std": 0.48592774122953414,
735
+ "rewards/accuracy_reward": 0.4600000066682696,
736
+ "rewards/format_reward": 0.8850000105798245,
737
+ "step": 560
738
+ },
739
+ {
740
+ "completion_length": 262.961252784729,
741
+ "epoch": 0.76,
742
+ "grad_norm": 2.5658416748046875,
743
+ "kl": 0.5228271484375,
744
+ "learning_rate": 2.7902237218430485e-06,
745
+ "loss": 0.2554,
746
+ "reward": 1.4400000259280206,
747
+ "reward_std": 0.437805675342679,
748
+ "rewards/accuracy_reward": 0.5325000083073974,
749
+ "rewards/format_reward": 0.90750000923872,
750
+ "step": 570
751
+ },
752
+ {
753
+ "completion_length": 393.1900066375732,
754
+ "epoch": 0.7733333333333333,
755
+ "grad_norm": 3.0309977531433105,
756
+ "kl": 0.73193359375,
757
+ "learning_rate": 2.778199702068017e-06,
758
+ "loss": 0.4131,
759
+ "reward": 1.1987500235438346,
760
+ "reward_std": 0.603440997377038,
761
+ "rewards/accuracy_reward": 0.46500000907108185,
762
+ "rewards/format_reward": 0.7337500136345625,
763
+ "step": 580
764
+ },
765
+ {
766
+ "completion_length": 356.66125717163084,
767
+ "epoch": 0.7866666666666666,
768
+ "grad_norm": 4.907287120819092,
769
+ "kl": 15.196728515625,
770
+ "learning_rate": 2.7658680444967964e-06,
771
+ "loss": 1.5553,
772
+ "reward": 1.2862500250339508,
773
+ "reward_std": 0.5923882402479649,
774
+ "rewards/accuracy_reward": 0.5062500080093741,
775
+ "rewards/format_reward": 0.7800000138580799,
776
+ "step": 590
777
+ },
778
+ {
779
+ "completion_length": 308.2250047683716,
780
+ "epoch": 0.8,
781
+ "grad_norm": 3.880446434020996,
782
+ "kl": 1.6797119140625,
783
+ "learning_rate": 2.753231717119405e-06,
784
+ "loss": 0.3899,
785
+ "reward": 1.3175000190734862,
786
+ "reward_std": 0.4703429743647575,
787
+ "rewards/accuracy_reward": 0.47125000776723025,
788
+ "rewards/format_reward": 0.8462500080466271,
789
+ "step": 600
790
+ },
791
+ {
792
+ "completion_length": 272.85125598907473,
793
+ "epoch": 0.8133333333333334,
794
+ "grad_norm": 7.690130233764648,
795
+ "kl": 2.55137939453125,
796
+ "learning_rate": 2.740293761253956e-06,
797
+ "loss": 0.4518,
798
+ "reward": 1.413750021159649,
799
+ "reward_std": 0.4325053282082081,
800
+ "rewards/accuracy_reward": 0.531250010151416,
801
+ "rewards/format_reward": 0.8825000062584877,
802
+ "step": 610
803
+ },
804
+ {
805
+ "completion_length": 297.4387529373169,
806
+ "epoch": 0.8266666666666667,
807
+ "grad_norm": 2.0755786895751953,
808
+ "kl": 2.9087890625,
809
+ "learning_rate": 2.7270572908146716e-06,
810
+ "loss": 0.4945,
811
+ "reward": 1.3737500190734864,
812
+ "reward_std": 0.48490689508616924,
813
+ "rewards/accuracy_reward": 0.5175000118091703,
814
+ "rewards/format_reward": 0.8562500081956387,
815
+ "step": 620
816
+ },
817
+ {
818
+ "completion_length": 288.5512563705444,
819
+ "epoch": 0.84,
820
+ "grad_norm": 2.110800266265869,
821
+ "kl": 4.269287109375,
822
+ "learning_rate": 2.713525491562421e-06,
823
+ "loss": 0.6393,
824
+ "reward": 1.3537500187754632,
825
+ "reward_std": 0.5092445306479931,
826
+ "rewards/accuracy_reward": 0.4800000081770122,
827
+ "rewards/format_reward": 0.8737500131130218,
828
+ "step": 630
829
+ },
830
+ {
831
+ "completion_length": 273.556254196167,
832
+ "epoch": 0.8533333333333334,
833
+ "grad_norm": 2.306751251220703,
834
+ "kl": 2.8123046875,
835
+ "learning_rate": 2.699701620337974e-06,
836
+ "loss": 0.4281,
837
+ "reward": 1.3900000244379043,
838
+ "reward_std": 0.48922200947999955,
839
+ "rewards/accuracy_reward": 0.4987500078044832,
840
+ "rewards/format_reward": 0.8912500098347664,
841
+ "step": 640
842
+ },
843
+ {
844
+ "completion_length": 273.3600038528442,
845
+ "epoch": 0.8666666666666667,
846
+ "grad_norm": 3.8477585315704346,
847
+ "kl": 4.17413330078125,
848
+ "learning_rate": 2.685589004278139e-06,
849
+ "loss": 0.4829,
850
+ "reward": 1.2900000154972076,
851
+ "reward_std": 0.5038297042250633,
852
+ "rewards/accuracy_reward": 0.4262500065378845,
853
+ "rewards/format_reward": 0.8637500092387199,
854
+ "step": 650
855
+ },
856
+ {
857
+ "completion_length": 209.00375442504884,
858
+ "epoch": 0.88,
859
+ "grad_norm": 0.8722585439682007,
860
+ "kl": 2.2757568359375,
861
+ "learning_rate": 2.671191040014989e-06,
862
+ "loss": 0.2218,
863
+ "reward": 1.4350000306963921,
864
+ "reward_std": 0.4456099320203066,
865
+ "rewards/accuracy_reward": 0.5400000099092722,
866
+ "rewards/format_reward": 0.8950000107288361,
867
+ "step": 660
868
+ },
869
+ {
870
+ "completion_length": 210.77125377655028,
871
+ "epoch": 0.8933333333333333,
872
+ "grad_norm": 19.34569549560547,
873
+ "kl": 2.20546875,
874
+ "learning_rate": 2.656511192858356e-06,
875
+ "loss": 0.1753,
876
+ "reward": 1.536250014603138,
877
+ "reward_std": 0.34286502450704576,
878
+ "rewards/accuracy_reward": 0.6000000078231096,
879
+ "rewards/format_reward": 0.9362500041723252,
880
+ "step": 670
881
+ },
882
+ {
883
+ "completion_length": 215.156254196167,
884
+ "epoch": 0.9066666666666666,
885
+ "grad_norm": 1.1418198347091675,
886
+ "kl": 1.42802734375,
887
+ "learning_rate": 2.641552995961801e-06,
888
+ "loss": 0.1849,
889
+ "reward": 1.4125000208616256,
890
+ "reward_std": 0.4128339193761349,
891
+ "rewards/accuracy_reward": 0.4987500081770122,
892
+ "rewards/format_reward": 0.9137500122189521,
893
+ "step": 680
894
+ },
895
+ {
896
+ "completion_length": 203.80875244140626,
897
+ "epoch": 0.92,
898
+ "grad_norm": 1.1534955501556396,
899
+ "kl": 1.554296875,
900
+ "learning_rate": 2.626320049472249e-06,
901
+ "loss": 0.1974,
902
+ "reward": 1.4050000324845313,
903
+ "reward_std": 0.46511752642691134,
904
+ "rewards/accuracy_reward": 0.4950000086799264,
905
+ "rewards/format_reward": 0.910000005364418,
906
+ "step": 690
907
+ },
908
+ {
909
+ "completion_length": 229.41625385284425,
910
+ "epoch": 0.9333333333333333,
911
+ "grad_norm": 0.6829231977462769,
912
+ "kl": 1.338916015625,
913
+ "learning_rate": 2.610816019663507e-06,
914
+ "loss": 0.1411,
915
+ "reward": 1.4262500196695327,
916
+ "reward_std": 0.4427020225673914,
917
+ "rewards/accuracy_reward": 0.5125000110827387,
918
+ "rewards/format_reward": 0.9137500077486038,
919
+ "step": 700
920
+ },
921
+ {
922
+ "completion_length": 227.5175048828125,
923
+ "epoch": 0.9466666666666667,
924
+ "grad_norm": 3.9809014797210693,
925
+ "kl": 0.76220703125,
926
+ "learning_rate": 2.595044638053862e-06,
927
+ "loss": 0.0753,
928
+ "reward": 1.5462500333786011,
929
+ "reward_std": 0.43644947446882726,
930
+ "rewards/accuracy_reward": 0.5962500106543303,
931
+ "rewards/format_reward": 0.9500000029802322,
932
+ "step": 710
933
+ },
934
+ {
935
+ "completion_length": 211.47125434875488,
936
+ "epoch": 0.96,
937
+ "grad_norm": 1.0556763410568237,
938
+ "kl": 0.74736328125,
939
+ "learning_rate": 2.5790097005079765e-06,
940
+ "loss": 0.0787,
941
+ "reward": 1.537500023841858,
942
+ "reward_std": 0.36086434237658976,
943
+ "rewards/accuracy_reward": 0.58500001039356,
944
+ "rewards/format_reward": 0.9525000035762787,
945
+ "step": 720
946
+ },
947
+ {
948
+ "completion_length": 275.242505645752,
949
+ "epoch": 0.9733333333333334,
950
+ "grad_norm": 0.5037354826927185,
951
+ "kl": 1.3829833984375,
952
+ "learning_rate": 2.5627150663233e-06,
953
+ "loss": 0.2147,
954
+ "reward": 1.410000029206276,
955
+ "reward_std": 0.4641982387751341,
956
+ "rewards/accuracy_reward": 0.518750009406358,
957
+ "rewards/format_reward": 0.8912500098347664,
958
+ "step": 730
959
+ },
960
+ {
961
+ "completion_length": 230.93375358581542,
962
+ "epoch": 0.9866666666666667,
963
+ "grad_norm": 0.47338035702705383,
964
+ "kl": 0.36016845703125,
965
+ "learning_rate": 2.5461646573012073e-06,
966
+ "loss": 0.0876,
967
+ "reward": 1.560000017285347,
968
+ "reward_std": 0.3064346365630627,
969
+ "rewards/accuracy_reward": 0.5987500073388219,
970
+ "rewards/format_reward": 0.9612500011920929,
971
+ "step": 740
972
+ },
973
+ {
974
+ "completion_length": 253.68500442504882,
975
+ "epoch": 1.0,
976
+ "grad_norm": 1.1472781896591187,
977
+ "kl": 0.9532470703125,
978
+ "learning_rate": 2.529362456803101e-06,
979
+ "loss": 0.1619,
980
+ "reward": 1.4637500196695328,
981
+ "reward_std": 0.4234457302838564,
982
+ "rewards/accuracy_reward": 0.5312500048428774,
983
+ "rewards/format_reward": 0.9325000002980233,
984
+ "step": 750
985
+ },
986
+ {
987
+ "completion_length": 231.0400037765503,
988
+ "epoch": 1.0133333333333334,
989
+ "grad_norm": 0.6581693887710571,
990
+ "kl": 0.7316162109375,
991
+ "learning_rate": 2.5123125087916918e-06,
992
+ "loss": 0.1485,
993
+ "reward": 1.4725000202655791,
994
+ "reward_std": 0.39589033350348474,
995
+ "rewards/accuracy_reward": 0.5262500112876296,
996
+ "rewards/format_reward": 0.9462500065565109,
997
+ "step": 760
998
+ },
999
+ {
1000
+ "completion_length": 258.4175052642822,
1001
+ "epoch": 1.0266666666666666,
1002
+ "grad_norm": 0.7054517269134521,
1003
+ "kl": 0.7291259765625,
1004
+ "learning_rate": 2.495018916857696e-06,
1005
+ "loss": 0.2479,
1006
+ "reward": 1.4350000232458116,
1007
+ "reward_std": 0.46736107058823106,
1008
+ "rewards/accuracy_reward": 0.5350000127218664,
1009
+ "rewards/format_reward": 0.9000000074505806,
1010
+ "step": 770
1011
+ },
1012
+ {
1013
+ "completion_length": 262.02250385284424,
1014
+ "epoch": 1.04,
1015
+ "grad_norm": 0.44452065229415894,
1016
+ "kl": 0.6681884765625,
1017
+ "learning_rate": 2.477485843232183e-06,
1018
+ "loss": 0.2143,
1019
+ "reward": 1.4725000143051148,
1020
+ "reward_std": 0.4894147712737322,
1021
+ "rewards/accuracy_reward": 0.5600000077858567,
1022
+ "rewards/format_reward": 0.9125000074505806,
1023
+ "step": 780
1024
+ },
1025
+ {
1026
+ "completion_length": 225.12500190734863,
1027
+ "epoch": 1.0533333333333332,
1028
+ "grad_norm": 0.4003940224647522,
1029
+ "kl": 0.526318359375,
1030
+ "learning_rate": 2.4597175077848023e-06,
1031
+ "loss": 0.0642,
1032
+ "reward": 1.5487500220537185,
1033
+ "reward_std": 0.3219704765826464,
1034
+ "rewards/accuracy_reward": 0.5812500060535968,
1035
+ "rewards/format_reward": 0.9674999997019768,
1036
+ "step": 790
1037
+ },
1038
+ {
1039
+ "completion_length": 271.36250534057615,
1040
+ "epoch": 1.0666666666666667,
1041
+ "grad_norm": 2.0866036415100098,
1042
+ "kl": 0.4521728515625,
1043
+ "learning_rate": 2.441718187008148e-06,
1044
+ "loss": 0.1265,
1045
+ "reward": 1.448750016093254,
1046
+ "reward_std": 0.4373233333230019,
1047
+ "rewards/accuracy_reward": 0.5062500108033419,
1048
+ "rewards/format_reward": 0.9425000041723252,
1049
+ "step": 800
1050
+ },
1051
+ {
1052
+ "completion_length": 251.02625427246093,
1053
+ "epoch": 1.08,
1054
+ "grad_norm": 0.5189586877822876,
1055
+ "kl": 0.80755615234375,
1056
+ "learning_rate": 2.4234922129884873e-06,
1057
+ "loss": 0.1842,
1058
+ "reward": 1.4925000175833703,
1059
+ "reward_std": 0.41841145791113377,
1060
+ "rewards/accuracy_reward": 0.5675000082701445,
1061
+ "rewards/format_reward": 0.9250000044703484,
1062
+ "step": 810
1063
+ },
1064
+ {
1065
+ "completion_length": 252.2237548828125,
1066
+ "epoch": 1.0933333333333333,
1067
+ "grad_norm": 0.30723270773887634,
1068
+ "kl": 0.4115234375,
1069
+ "learning_rate": 2.405043972363114e-06,
1070
+ "loss": 0.1354,
1071
+ "reward": 1.4812500208616257,
1072
+ "reward_std": 0.39995415285229685,
1073
+ "rewards/accuracy_reward": 0.5387500130571425,
1074
+ "rewards/format_reward": 0.9425000056624413,
1075
+ "step": 820
1076
+ },
1077
+ {
1078
+ "completion_length": 259.47125205993655,
1079
+ "epoch": 1.1066666666666667,
1080
+ "grad_norm": 0.6361457705497742,
1081
+ "kl": 0.4533203125,
1082
+ "learning_rate": 2.386377905264567e-06,
1083
+ "loss": 0.1431,
1084
+ "reward": 1.4937500223517417,
1085
+ "reward_std": 0.4016253810375929,
1086
+ "rewards/accuracy_reward": 0.5575000120326876,
1087
+ "rewards/format_reward": 0.9362500071525574,
1088
+ "step": 830
1089
+ },
1090
+ {
1091
+ "completion_length": 273.63625564575193,
1092
+ "epoch": 1.12,
1093
+ "grad_norm": 1.0046416521072388,
1094
+ "kl": 0.51337890625,
1095
+ "learning_rate": 2.36749850425198e-06,
1096
+ "loss": 0.1486,
1097
+ "reward": 1.4950000181794167,
1098
+ "reward_std": 0.39630253836512563,
1099
+ "rewards/accuracy_reward": 0.5575000086799264,
1100
+ "rewards/format_reward": 0.9375000059604645,
1101
+ "step": 840
1102
+ },
1103
+ {
1104
+ "completion_length": 274.59875602722167,
1105
+ "epoch": 1.1333333333333333,
1106
+ "grad_norm": 1.476180076599121,
1107
+ "kl": 0.6783447265625,
1108
+ "learning_rate": 2.348410313229808e-06,
1109
+ "loss": 0.1884,
1110
+ "reward": 1.431250025331974,
1111
+ "reward_std": 0.4796358771622181,
1112
+ "rewards/accuracy_reward": 0.5237500093877315,
1113
+ "rewards/format_reward": 0.907500010728836,
1114
+ "step": 850
1115
+ },
1116
+ {
1117
+ "completion_length": 247.44750480651857,
1118
+ "epoch": 1.1466666666666667,
1119
+ "grad_norm": 0.2607942819595337,
1120
+ "kl": 0.76661376953125,
1121
+ "learning_rate": 2.329117926354199e-06,
1122
+ "loss": 0.1561,
1123
+ "reward": 1.5350000128149985,
1124
+ "reward_std": 0.44854437448084356,
1125
+ "rewards/accuracy_reward": 0.6200000122189522,
1126
+ "rewards/format_reward": 0.9150000125169754,
1127
+ "step": 860
1128
+ },
1129
+ {
1130
+ "completion_length": 233.2700038909912,
1131
+ "epoch": 1.16,
1132
+ "grad_norm": 1.2619364261627197,
1133
+ "kl": 0.8174560546875,
1134
+ "learning_rate": 2.3096259869272697e-06,
1135
+ "loss": 0.1501,
1136
+ "reward": 1.5450000151991845,
1137
+ "reward_std": 0.35568486750125883,
1138
+ "rewards/accuracy_reward": 0.6012500108219683,
1139
+ "rewards/format_reward": 0.9437500029802323,
1140
+ "step": 870
1141
+ },
1142
+ {
1143
+ "completion_length": 294.38750610351565,
1144
+ "epoch": 1.1733333333333333,
1145
+ "grad_norm": 2.265449047088623,
1146
+ "kl": 1.6038818359375,
1147
+ "learning_rate": 2.2899391862795513e-06,
1148
+ "loss": 0.2953,
1149
+ "reward": 1.3987500235438346,
1150
+ "reward_std": 0.4723483145236969,
1151
+ "rewards/accuracy_reward": 0.5225000081583857,
1152
+ "rewards/format_reward": 0.8762500047683716,
1153
+ "step": 880
1154
+ },
1155
+ {
1156
+ "completion_length": 282.3900045394897,
1157
+ "epoch": 1.1866666666666668,
1158
+ "grad_norm": 1.2791324853897095,
1159
+ "kl": 1.3260498046875,
1160
+ "learning_rate": 2.2700622626408814e-06,
1161
+ "loss": 0.2613,
1162
+ "reward": 1.4525000251829625,
1163
+ "reward_std": 0.46175364293158055,
1164
+ "rewards/accuracy_reward": 0.5725000085309148,
1165
+ "rewards/format_reward": 0.8800000064074993,
1166
+ "step": 890
1167
+ },
1168
+ {
1169
+ "completion_length": 292.1812559127808,
1170
+ "epoch": 1.2,
1171
+ "grad_norm": 2.6686275005340576,
1172
+ "kl": 1.6286865234375,
1173
+ "learning_rate": 2.25e-06,
1174
+ "loss": 0.2693,
1175
+ "reward": 1.4675000235438347,
1176
+ "reward_std": 0.42986451014876365,
1177
+ "rewards/accuracy_reward": 0.5775000100024045,
1178
+ "rewards/format_reward": 0.8900000095367432,
1179
+ "step": 900
1180
+ },
1181
+ {
1182
+ "completion_length": 258.4387538909912,
1183
+ "epoch": 1.2133333333333334,
1184
+ "grad_norm": 0.4539574384689331,
1185
+ "kl": 0.7174560546875,
1186
+ "learning_rate": 2.2297572269531398e-06,
1187
+ "loss": 0.1006,
1188
+ "reward": 1.4200000211596489,
1189
+ "reward_std": 0.4356360357254744,
1190
+ "rewards/accuracy_reward": 0.5112500078976154,
1191
+ "rewards/format_reward": 0.9087500035762787,
1192
+ "step": 910
1193
+ },
1194
+ {
1195
+ "completion_length": 269.2125053405762,
1196
+ "epoch": 1.2266666666666666,
1197
+ "grad_norm": 1.3635307550430298,
1198
+ "kl": 0.868798828125,
1199
+ "learning_rate": 2.2093388155418754e-06,
1200
+ "loss": 0.1843,
1201
+ "reward": 1.4000000208616257,
1202
+ "reward_std": 0.45293766520917417,
1203
+ "rewards/accuracy_reward": 0.5125000071711838,
1204
+ "rewards/format_reward": 0.8875000134110451,
1205
+ "step": 920
1206
+ },
1207
+ {
1208
+ "completion_length": 259.2087543487549,
1209
+ "epoch": 1.24,
1210
+ "grad_norm": 0.48847752809524536,
1211
+ "kl": 0.7795654296875,
1212
+ "learning_rate": 2.1887496800805174e-06,
1213
+ "loss": 0.1366,
1214
+ "reward": 1.4850000232458114,
1215
+ "reward_std": 0.41869538016617297,
1216
+ "rewards/accuracy_reward": 0.5825000096112489,
1217
+ "rewards/format_reward": 0.9025000095367431,
1218
+ "step": 930
1219
+ },
1220
+ {
1221
+ "completion_length": 269.03500480651854,
1222
+ "epoch": 1.2533333333333334,
1223
+ "grad_norm": 0.4667813777923584,
1224
+ "kl": 0.8,
1225
+ "learning_rate": 2.167994775973334e-06,
1226
+ "loss": 0.1864,
1227
+ "reward": 1.4250000298023224,
1228
+ "reward_std": 0.4630339309573174,
1229
+ "rewards/accuracy_reward": 0.5162500089034439,
1230
+ "rewards/format_reward": 0.9087500050663948,
1231
+ "step": 940
1232
+ },
1233
+ {
1234
+ "completion_length": 244.85000133514404,
1235
+ "epoch": 1.2666666666666666,
1236
+ "grad_norm": 0.40062400698661804,
1237
+ "kl": 0.45528564453125,
1238
+ "learning_rate": 2.1470790985218807e-06,
1239
+ "loss": 0.1361,
1240
+ "reward": 1.5050000175833702,
1241
+ "reward_std": 0.3423164799809456,
1242
+ "rewards/accuracy_reward": 0.5662500064820051,
1243
+ "rewards/format_reward": 0.9387500077486038,
1244
+ "step": 950
1245
+ },
1246
+ {
1247
+ "completion_length": 222.37750396728515,
1248
+ "epoch": 1.28,
1249
+ "grad_norm": 0.6321477293968201,
1250
+ "kl": 0.36839599609375,
1251
+ "learning_rate": 2.126007681722727e-06,
1252
+ "loss": 0.0723,
1253
+ "reward": 1.59375002682209,
1254
+ "reward_std": 0.3785114776343107,
1255
+ "rewards/accuracy_reward": 0.635000008251518,
1256
+ "rewards/format_reward": 0.9587499991059303,
1257
+ "step": 960
1258
+ },
1259
+ {
1260
+ "completion_length": 251.10250453948976,
1261
+ "epoch": 1.2933333333333334,
1262
+ "grad_norm": 0.9746308326721191,
1263
+ "kl": 0.741796875,
1264
+ "learning_rate": 2.1047855970558753e-06,
1265
+ "loss": 0.1903,
1266
+ "reward": 1.4962500169873238,
1267
+ "reward_std": 0.4037658181041479,
1268
+ "rewards/accuracy_reward": 0.5725000067614019,
1269
+ "rewards/format_reward": 0.9237500071525574,
1270
+ "step": 970
1271
+ },
1272
+ {
1273
+ "completion_length": 267.5337539672852,
1274
+ "epoch": 1.3066666666666666,
1275
+ "grad_norm": 0.694874107837677,
1276
+ "kl": 0.81298828125,
1277
+ "learning_rate": 2.0834179522641508e-06,
1278
+ "loss": 0.175,
1279
+ "reward": 1.436250016093254,
1280
+ "reward_std": 0.48411559909582136,
1281
+ "rewards/accuracy_reward": 0.5475000135600567,
1282
+ "rewards/format_reward": 0.8887500107288361,
1283
+ "step": 980
1284
+ },
1285
+ {
1286
+ "completion_length": 249.49375286102295,
1287
+ "epoch": 1.32,
1288
+ "grad_norm": 1.1866592168807983,
1289
+ "kl": 0.6787109375,
1290
+ "learning_rate": 2.061909890123868e-06,
1291
+ "loss": 0.1094,
1292
+ "reward": 1.5050000250339508,
1293
+ "reward_std": 0.38125632815063,
1294
+ "rewards/accuracy_reward": 0.5500000072643161,
1295
+ "rewards/format_reward": 0.955000002682209,
1296
+ "step": 990
1297
+ },
1298
+ {
1299
+ "completion_length": 212.09500427246093,
1300
+ "epoch": 1.3333333333333333,
1301
+ "grad_norm": 0.23463357985019684,
1302
+ "kl": 0.49178466796875,
1303
+ "learning_rate": 2.040266587207066e-06,
1304
+ "loss": 0.06,
1305
+ "reward": 1.6800000190734863,
1306
+ "reward_std": 0.24786825627088546,
1307
+ "rewards/accuracy_reward": 0.695000005979091,
1308
+ "rewards/format_reward": 0.9850000008940697,
1309
+ "step": 1000
1310
+ },
1311
+ {
1312
+ "completion_length": 242.02375335693358,
1313
+ "epoch": 1.3466666666666667,
1314
+ "grad_norm": 0.6274837255477905,
1315
+ "kl": 0.54989013671875,
1316
+ "learning_rate": 2.018493252635605e-06,
1317
+ "loss": 0.1312,
1318
+ "reward": 1.4687500163912772,
1319
+ "reward_std": 0.4060431461781263,
1320
+ "rewards/accuracy_reward": 0.5275000087916851,
1321
+ "rewards/format_reward": 0.9412499994039536,
1322
+ "step": 1010
1323
+ },
1324
+ {
1325
+ "completion_length": 245.67000522613526,
1326
+ "epoch": 1.3599999999999999,
1327
+ "grad_norm": 0.6642753481864929,
1328
+ "kl": 1.0931396484375,
1329
+ "learning_rate": 1.9965951268274372e-06,
1330
+ "loss": 0.1868,
1331
+ "reward": 1.407500022649765,
1332
+ "reward_std": 0.43992825597524643,
1333
+ "rewards/accuracy_reward": 0.5087500089779496,
1334
+ "rewards/format_reward": 0.8987500041723251,
1335
+ "step": 1020
1336
+ },
1337
+ {
1338
+ "completion_length": 278.58875465393066,
1339
+ "epoch": 1.3733333333333333,
1340
+ "grad_norm": 0.6922321915626526,
1341
+ "kl": 0.83028564453125,
1342
+ "learning_rate": 1.9745774802353347e-06,
1343
+ "loss": 0.2212,
1344
+ "reward": 1.403750029206276,
1345
+ "reward_std": 0.4952257383614779,
1346
+ "rewards/accuracy_reward": 0.5325000079348683,
1347
+ "rewards/format_reward": 0.871250006556511,
1348
+ "step": 1030
1349
+ },
1350
+ {
1351
+ "completion_length": 248.9375057220459,
1352
+ "epoch": 1.3866666666666667,
1353
+ "grad_norm": 0.7095142602920532,
1354
+ "kl": 0.58486328125,
1355
+ "learning_rate": 1.9524456120783984e-06,
1356
+ "loss": 0.1207,
1357
+ "reward": 1.4600000217556954,
1358
+ "reward_std": 0.40168970376253127,
1359
+ "rewards/accuracy_reward": 0.5100000099278986,
1360
+ "rewards/format_reward": 0.9500000044703484,
1361
+ "step": 1040
1362
+ },
1363
+ {
1364
+ "completion_length": 205.45750255584716,
1365
+ "epoch": 1.4,
1366
+ "grad_norm": 0.5801204442977905,
1367
+ "kl": 0.292529296875,
1368
+ "learning_rate": 1.9302048490666355e-06,
1369
+ "loss": 0.0814,
1370
+ "reward": 1.6212500214576722,
1371
+ "reward_std": 0.3047562278807163,
1372
+ "rewards/accuracy_reward": 0.6475000135600567,
1373
+ "rewards/format_reward": 0.9737500011920929,
1374
+ "step": 1050
1375
+ },
1376
+ {
1377
+ "completion_length": 273.5137548446655,
1378
+ "epoch": 1.4133333333333333,
1379
+ "grad_norm": 0.6409977078437805,
1380
+ "kl": 0.58953857421875,
1381
+ "learning_rate": 1.9078605441189272e-06,
1382
+ "loss": 0.1281,
1383
+ "reward": 1.563750021159649,
1384
+ "reward_std": 0.356473108753562,
1385
+ "rewards/accuracy_reward": 0.620000010356307,
1386
+ "rewards/format_reward": 0.9437500007450581,
1387
+ "step": 1060
1388
+ },
1389
+ {
1390
+ "completion_length": 294.0987562179565,
1391
+ "epoch": 1.4266666666666667,
1392
+ "grad_norm": 0.7412673234939575,
1393
+ "kl": 1.6057373046875,
1394
+ "learning_rate": 1.8854180750746833e-06,
1395
+ "loss": 0.2387,
1396
+ "reward": 1.4962500244379044,
1397
+ "reward_std": 0.4668436422944069,
1398
+ "rewards/accuracy_reward": 0.5862500118091702,
1399
+ "rewards/format_reward": 0.9100000038743019,
1400
+ "step": 1070
1401
+ },
1402
+ {
1403
+ "completion_length": 293.7837539672852,
1404
+ "epoch": 1.44,
1405
+ "grad_norm": 0.8722511529922485,
1406
+ "kl": 0.612548828125,
1407
+ "learning_rate": 1.8628828433995015e-06,
1408
+ "loss": 0.1585,
1409
+ "reward": 1.4412500128149985,
1410
+ "reward_std": 0.44985799603164195,
1411
+ "rewards/accuracy_reward": 0.528750010021031,
1412
+ "rewards/format_reward": 0.9125000044703484,
1413
+ "step": 1080
1414
+ },
1415
+ {
1416
+ "completion_length": 252.32750244140624,
1417
+ "epoch": 1.4533333333333334,
1418
+ "grad_norm": 0.5518996119499207,
1419
+ "kl": 0.4095947265625,
1420
+ "learning_rate": 1.8402602728851404e-06,
1421
+ "loss": 0.1381,
1422
+ "reward": 1.5250000193715096,
1423
+ "reward_std": 0.4259300407022238,
1424
+ "rewards/accuracy_reward": 0.5825000069104135,
1425
+ "rewards/format_reward": 0.942500002682209,
1426
+ "step": 1090
1427
+ },
1428
+ {
1429
+ "completion_length": 241.40625343322753,
1430
+ "epoch": 1.4666666666666668,
1431
+ "grad_norm": 0.4469586908817291,
1432
+ "kl": 0.302294921875,
1433
+ "learning_rate": 1.8175558083441164e-06,
1434
+ "loss": 0.0911,
1435
+ "reward": 1.506250025331974,
1436
+ "reward_std": 0.4210752282291651,
1437
+ "rewards/accuracy_reward": 0.5437500099651515,
1438
+ "rewards/format_reward": 0.9625000029802322,
1439
+ "step": 1100
1440
+ },
1441
+ {
1442
+ "completion_length": 286.4412563323975,
1443
+ "epoch": 1.48,
1444
+ "grad_norm": 0.7064781785011292,
1445
+ "kl": 0.4929443359375,
1446
+ "learning_rate": 1.7947749142992453e-06,
1447
+ "loss": 0.2215,
1448
+ "reward": 1.4950000211596488,
1449
+ "reward_std": 0.51452008895576,
1450
+ "rewards/accuracy_reward": 0.5887500088661909,
1451
+ "rewards/format_reward": 0.9062500074505806,
1452
+ "step": 1110
1453
+ },
1454
+ {
1455
+ "completion_length": 285.3000036239624,
1456
+ "epoch": 1.4933333333333334,
1457
+ "grad_norm": 0.34774771332740784,
1458
+ "kl": 0.623095703125,
1459
+ "learning_rate": 1.7719230736684376e-06,
1460
+ "loss": 0.1511,
1461
+ "reward": 1.4975000232458116,
1462
+ "reward_std": 0.43432362116873263,
1463
+ "rewards/accuracy_reward": 0.567500009946525,
1464
+ "rewards/format_reward": 0.9300000041723251,
1465
+ "step": 1120
1466
+ },
1467
+ {
1468
+ "completion_length": 286.05500450134275,
1469
+ "epoch": 1.5066666666666668,
1470
+ "grad_norm": 0.33899247646331787,
1471
+ "kl": 0.45242919921875,
1472
+ "learning_rate": 1.7490057864450664e-06,
1473
+ "loss": 0.1266,
1474
+ "reward": 1.588750024139881,
1475
+ "reward_std": 0.38815081119537354,
1476
+ "rewards/accuracy_reward": 0.6287500103004277,
1477
+ "rewards/format_reward": 0.9600000023841858,
1478
+ "step": 1130
1479
+ },
1480
+ {
1481
+ "completion_length": 244.75125102996827,
1482
+ "epoch": 1.52,
1483
+ "grad_norm": 0.8289720416069031,
1484
+ "kl": 0.41170654296875,
1485
+ "learning_rate": 1.7260285683742248e-06,
1486
+ "loss": 0.1008,
1487
+ "reward": 1.5962500244379043,
1488
+ "reward_std": 0.3575746387243271,
1489
+ "rewards/accuracy_reward": 0.6387500097043812,
1490
+ "rewards/format_reward": 0.9575000002980232,
1491
+ "step": 1140
1492
+ },
1493
+ {
1494
+ "completion_length": 295.0500051498413,
1495
+ "epoch": 1.5333333333333332,
1496
+ "grad_norm": 0.7848290205001831,
1497
+ "kl": 0.7958251953125,
1498
+ "learning_rate": 1.702996949625197e-06,
1499
+ "loss": 0.1871,
1500
+ "reward": 1.458750021457672,
1501
+ "reward_std": 0.44078024849295616,
1502
+ "rewards/accuracy_reward": 0.556250006519258,
1503
+ "rewards/format_reward": 0.9025000110268593,
1504
+ "step": 1150
1505
+ },
1506
+ {
1507
+ "completion_length": 257.19375495910646,
1508
+ "epoch": 1.5466666666666666,
1509
+ "grad_norm": 1.2715580463409424,
1510
+ "kl": 0.8576416015625,
1511
+ "learning_rate": 1.6799164734604496e-06,
1512
+ "loss": 0.2015,
1513
+ "reward": 1.556250013411045,
1514
+ "reward_std": 0.5148981977254152,
1515
+ "rewards/accuracy_reward": 0.6575000089593231,
1516
+ "rewards/format_reward": 0.8987500116229057,
1517
+ "step": 1160
1518
+ },
1519
+ {
1520
+ "completion_length": 288.4437545776367,
1521
+ "epoch": 1.56,
1522
+ "grad_norm": 1.1853872537612915,
1523
+ "kl": 1.4738037109375,
1524
+ "learning_rate": 1.6567926949014804e-06,
1525
+ "loss": 0.2435,
1526
+ "reward": 1.4325000256299973,
1527
+ "reward_std": 0.45326643362641333,
1528
+ "rewards/accuracy_reward": 0.5550000136718154,
1529
+ "rewards/format_reward": 0.8775000154972077,
1530
+ "step": 1170
1531
+ },
1532
+ {
1533
+ "completion_length": 250.51375427246094,
1534
+ "epoch": 1.5733333333333333,
1535
+ "grad_norm": 0.7030949592590332,
1536
+ "kl": 0.99638671875,
1537
+ "learning_rate": 1.6336311793918296e-06,
1538
+ "loss": 0.1342,
1539
+ "reward": 1.4425000190734862,
1540
+ "reward_std": 0.3800163000822067,
1541
+ "rewards/accuracy_reward": 0.49750001104548575,
1542
+ "rewards/format_reward": 0.9450000047683715,
1543
+ "step": 1180
1544
+ },
1545
+ {
1546
+ "completion_length": 239.85750503540038,
1547
+ "epoch": 1.5866666666666667,
1548
+ "grad_norm": 0.744570791721344,
1549
+ "kl": 0.98948974609375,
1550
+ "learning_rate": 1.6104375014575872e-06,
1551
+ "loss": 0.1446,
1552
+ "reward": 1.5650000274181366,
1553
+ "reward_std": 0.404941875860095,
1554
+ "rewards/accuracy_reward": 0.6237500123679638,
1555
+ "rewards/format_reward": 0.9412500113248825,
1556
+ "step": 1190
1557
+ },
1558
+ {
1559
+ "completion_length": 242.6050043106079,
1560
+ "epoch": 1.6,
1561
+ "grad_norm": 0.6778237819671631,
1562
+ "kl": 59.4015869140625,
1563
+ "learning_rate": 1.5872172433657137e-06,
1564
+ "loss": 3.6695,
1565
+ "reward": 1.4675000220537187,
1566
+ "reward_std": 0.4261274352669716,
1567
+ "rewards/accuracy_reward": 0.5350000061094761,
1568
+ "rewards/format_reward": 0.9325000092387199,
1569
+ "step": 1200
1570
+ },
1571
+ {
1572
+ "completion_length": 254.56750507354735,
1573
+ "epoch": 1.6133333333333333,
1574
+ "grad_norm": 1.1174402236938477,
1575
+ "kl": 1.0924072265625,
1576
+ "learning_rate": 1.563975993780496e-06,
1577
+ "loss": 0.2037,
1578
+ "reward": 1.426250010728836,
1579
+ "reward_std": 0.4171265188604593,
1580
+ "rewards/accuracy_reward": 0.5037500091828406,
1581
+ "rewards/format_reward": 0.9225000068545341,
1582
+ "step": 1210
1583
+ },
1584
+ {
1585
+ "completion_length": 250.00875434875488,
1586
+ "epoch": 1.6266666666666667,
1587
+ "grad_norm": 0.1975303441286087,
1588
+ "kl": 0.5531982421875,
1589
+ "learning_rate": 1.5407193464184646e-06,
1590
+ "loss": 0.0767,
1591
+ "reward": 1.5937500238418578,
1592
+ "reward_std": 0.293658309802413,
1593
+ "rewards/accuracy_reward": 0.6125000089406967,
1594
+ "rewards/format_reward": 0.9812499985098839,
1595
+ "step": 1220
1596
+ },
1597
+ {
1598
+ "completion_length": 250.21375293731688,
1599
+ "epoch": 1.6400000000000001,
1600
+ "grad_norm": 0.6114612817764282,
1601
+ "kl": 0.53138427734375,
1602
+ "learning_rate": 1.5174528987020958e-06,
1603
+ "loss": 0.1333,
1604
+ "reward": 1.557500022649765,
1605
+ "reward_std": 0.36959521323442457,
1606
+ "rewards/accuracy_reward": 0.6162500135600567,
1607
+ "rewards/format_reward": 0.9412500083446502,
1608
+ "step": 1230
1609
+ },
1610
+ {
1611
+ "completion_length": 296.5837543487549,
1612
+ "epoch": 1.6533333333333333,
1613
+ "grad_norm": 0.6539386510848999,
1614
+ "kl": 0.7985107421875,
1615
+ "learning_rate": 1.4941822504126198e-06,
1616
+ "loss": 0.1803,
1617
+ "reward": 1.4350000217556953,
1618
+ "reward_std": 0.4217760156840086,
1619
+ "rewards/accuracy_reward": 0.5200000086799264,
1620
+ "rewards/format_reward": 0.9150000095367432,
1621
+ "step": 1240
1622
+ },
1623
+ {
1624
+ "completion_length": 257.9250057220459,
1625
+ "epoch": 1.6666666666666665,
1626
+ "grad_norm": 0.6114557385444641,
1627
+ "kl": 44.12962646484375,
1628
+ "learning_rate": 1.4709130023422637e-06,
1629
+ "loss": 0.8266,
1630
+ "reward": 1.5075000166893004,
1631
+ "reward_std": 0.3939127091318369,
1632
+ "rewards/accuracy_reward": 0.5762500106357038,
1633
+ "rewards/format_reward": 0.9312500074505806,
1634
+ "step": 1250
1635
+ },
1636
+ {
1637
+ "completion_length": 265.36750450134275,
1638
+ "epoch": 1.6800000000000002,
1639
+ "grad_norm": 0.49496781826019287,
1640
+ "kl": 0.77027587890625,
1641
+ "learning_rate": 1.4476507549462489e-06,
1642
+ "loss": 0.1636,
1643
+ "reward": 1.4825000166893005,
1644
+ "reward_std": 0.443204028531909,
1645
+ "rewards/accuracy_reward": 0.5437500104308128,
1646
+ "rewards/format_reward": 0.9387500062584877,
1647
+ "step": 1260
1648
+ },
1649
+ {
1650
+ "completion_length": 281.16000480651854,
1651
+ "epoch": 1.6933333333333334,
1652
+ "grad_norm": 0.7085089087486267,
1653
+ "kl": 0.9700439453125,
1654
+ "learning_rate": 1.4244011069948701e-06,
1655
+ "loss": 0.2137,
1656
+ "reward": 1.413750021159649,
1657
+ "reward_std": 0.4564998596906662,
1658
+ "rewards/accuracy_reward": 0.5200000060722232,
1659
+ "rewards/format_reward": 0.8937500104308128,
1660
+ "step": 1270
1661
+ },
1662
+ {
1663
+ "completion_length": 287.8412548065186,
1664
+ "epoch": 1.7066666666666666,
1665
+ "grad_norm": 1.8162543773651123,
1666
+ "kl": 1.462109375,
1667
+ "learning_rate": 1.401169654225982e-06,
1668
+ "loss": 0.2375,
1669
+ "reward": 1.4162500217556953,
1670
+ "reward_std": 0.49586384519934656,
1671
+ "rewards/accuracy_reward": 0.5150000074878335,
1672
+ "rewards/format_reward": 0.9012500092387199,
1673
+ "step": 1280
1674
+ },
1675
+ {
1676
+ "completion_length": 279.57375583648684,
1677
+ "epoch": 1.72,
1678
+ "grad_norm": 0.674432098865509,
1679
+ "kl": 0.92662353515625,
1680
+ "learning_rate": 1.3779619879982127e-06,
1681
+ "loss": 0.1772,
1682
+ "reward": 1.5175000190734864,
1683
+ "reward_std": 0.4668840833008289,
1684
+ "rewards/accuracy_reward": 0.5912500144913793,
1685
+ "rewards/format_reward": 0.9262500107288361,
1686
+ "step": 1290
1687
+ },
1688
+ {
1689
+ "completion_length": 289.03125457763673,
1690
+ "epoch": 1.7333333333333334,
1691
+ "grad_norm": 0.7836817502975464,
1692
+ "kl": 0.88634033203125,
1693
+ "learning_rate": 1.3547836939452313e-06,
1694
+ "loss": 0.1244,
1695
+ "reward": 1.4987500250339507,
1696
+ "reward_std": 0.3810611065477133,
1697
+ "rewards/accuracy_reward": 0.5412500070407986,
1698
+ "rewards/format_reward": 0.9575000047683716,
1699
+ "step": 1300
1700
+ },
1701
+ {
1702
+ "completion_length": 284.0875045776367,
1703
+ "epoch": 1.7466666666666666,
1704
+ "grad_norm": 0.7931172847747803,
1705
+ "kl": 1.0406494140625,
1706
+ "learning_rate": 1.3316403506313982e-06,
1707
+ "loss": 0.1187,
1708
+ "reward": 1.5062500283122062,
1709
+ "reward_std": 0.3796513009816408,
1710
+ "rewards/accuracy_reward": 0.560000008251518,
1711
+ "rewards/format_reward": 0.9462500020861626,
1712
+ "step": 1310
1713
+ },
1714
+ {
1715
+ "completion_length": 267.00625438690184,
1716
+ "epoch": 1.76,
1717
+ "grad_norm": 0.7942720651626587,
1718
+ "kl": 0.7413818359375,
1719
+ "learning_rate": 1.308537528209108e-06,
1720
+ "loss": 0.0855,
1721
+ "reward": 1.5075000256299973,
1722
+ "reward_std": 0.43465386927127836,
1723
+ "rewards/accuracy_reward": 0.568750012665987,
1724
+ "rewards/format_reward": 0.9387500032782554,
1725
+ "step": 1320
1726
+ },
1727
+ {
1728
+ "completion_length": 275.38625450134276,
1729
+ "epoch": 1.7733333333333334,
1730
+ "grad_norm": 1.8723665475845337,
1731
+ "kl": 1.5499755859375,
1732
+ "learning_rate": 1.2854807870781687e-06,
1733
+ "loss": 0.1942,
1734
+ "reward": 1.4487500324845315,
1735
+ "reward_std": 0.42911659814417363,
1736
+ "rewards/accuracy_reward": 0.5437500085681677,
1737
+ "rewards/format_reward": 0.9050000056624412,
1738
+ "step": 1330
1739
+ },
1740
+ {
1741
+ "completion_length": 276.2012542724609,
1742
+ "epoch": 1.7866666666666666,
1743
+ "grad_norm": 0.5324939489364624,
1744
+ "kl": 1.421630859375,
1745
+ "learning_rate": 1.2624756765475158e-06,
1746
+ "loss": 0.1505,
1747
+ "reward": 1.3537500247359275,
1748
+ "reward_std": 0.43583590127527716,
1749
+ "rewards/accuracy_reward": 0.462500009406358,
1750
+ "rewards/format_reward": 0.8912500083446503,
1751
+ "step": 1340
1752
+ },
1753
+ {
1754
+ "completion_length": 270.83750343322754,
1755
+ "epoch": 1.8,
1756
+ "grad_norm": 2.056743621826172,
1757
+ "kl": 1.954345703125,
1758
+ "learning_rate": 1.2395277334996047e-06,
1759
+ "loss": 0.2111,
1760
+ "reward": 1.5400000274181367,
1761
+ "reward_std": 0.45903370566666124,
1762
+ "rewards/accuracy_reward": 0.60125000923872,
1763
+ "rewards/format_reward": 0.9387500077486038,
1764
+ "step": 1350
1765
+ },
1766
+ {
1767
+ "completion_length": 279.6050037384033,
1768
+ "epoch": 1.8133333333333335,
1769
+ "grad_norm": 0.9626988768577576,
1770
+ "kl": 1.0287353515625,
1771
+ "learning_rate": 1.2166424810577898e-06,
1772
+ "loss": 0.1235,
1773
+ "reward": 1.4925000071525574,
1774
+ "reward_std": 0.3699305009096861,
1775
+ "rewards/accuracy_reward": 0.5512500081211329,
1776
+ "rewards/format_reward": 0.9412500083446502,
1777
+ "step": 1360
1778
+ },
1779
+ {
1780
+ "completion_length": 325.78000526428224,
1781
+ "epoch": 1.8266666666666667,
1782
+ "grad_norm": 1.466002345085144,
1783
+ "kl": 1.82490234375,
1784
+ "learning_rate": 1.1938254272570166e-06,
1785
+ "loss": 0.2711,
1786
+ "reward": 1.445000024139881,
1787
+ "reward_std": 0.5365722481161356,
1788
+ "rewards/accuracy_reward": 0.5700000077486038,
1789
+ "rewards/format_reward": 0.8750000096857548,
1790
+ "step": 1370
1791
+ },
1792
+ {
1793
+ "completion_length": 284.211255645752,
1794
+ "epoch": 1.8399999999999999,
1795
+ "grad_norm": 0.9256098866462708,
1796
+ "kl": 1.1132080078125,
1797
+ "learning_rate": 1.1710820637181448e-06,
1798
+ "loss": 0.1941,
1799
+ "reward": 1.490000031888485,
1800
+ "reward_std": 0.47110406346619127,
1801
+ "rewards/accuracy_reward": 0.58000000808388,
1802
+ "rewards/format_reward": 0.9100000098347664,
1803
+ "step": 1380
1804
+ },
1805
+ {
1806
+ "completion_length": 248.72625560760497,
1807
+ "epoch": 1.8533333333333335,
1808
+ "grad_norm": 0.4293259084224701,
1809
+ "kl": 0.5682373046875,
1810
+ "learning_rate": 1.1484178643262232e-06,
1811
+ "loss": 0.0911,
1812
+ "reward": 1.6337500184774398,
1813
+ "reward_std": 0.3414465494453907,
1814
+ "rewards/accuracy_reward": 0.6662500083446503,
1815
+ "rewards/format_reward": 0.967500002682209,
1816
+ "step": 1390
1817
+ },
1818
+ {
1819
+ "completion_length": 240.6875026702881,
1820
+ "epoch": 1.8666666666666667,
1821
+ "grad_norm": 0.5720169544219971,
1822
+ "kl": 0.82125244140625,
1823
+ "learning_rate": 1.1258382839130282e-06,
1824
+ "loss": 0.1097,
1825
+ "reward": 1.570000022649765,
1826
+ "reward_std": 0.4139075022190809,
1827
+ "rewards/accuracy_reward": 0.6237500090152025,
1828
+ "rewards/format_reward": 0.9462500035762786,
1829
+ "step": 1400
1830
+ },
1831
+ {
1832
+ "completion_length": 257.53500480651854,
1833
+ "epoch": 1.88,
1834
+ "grad_norm": 1.1818897724151611,
1835
+ "kl": 0.95081787109375,
1836
+ "learning_rate": 1.103348756944197e-06,
1837
+ "loss": 0.1008,
1838
+ "reward": 1.4837500154972076,
1839
+ "reward_std": 0.42501535080373287,
1840
+ "rewards/accuracy_reward": 0.5600000080652535,
1841
+ "rewards/format_reward": 0.9237500101327896,
1842
+ "step": 1410
1843
+ },
1844
+ {
1845
+ "completion_length": 247.4950050354004,
1846
+ "epoch": 1.8933333333333333,
1847
+ "grad_norm": 0.6974407434463501,
1848
+ "kl": 1.0361083984375,
1849
+ "learning_rate": 1.0809546962112535e-06,
1850
+ "loss": 0.1492,
1851
+ "reward": 1.4875000149011612,
1852
+ "reward_std": 0.3567169703543186,
1853
+ "rewards/accuracy_reward": 0.538750009983778,
1854
+ "rewards/format_reward": 0.9487500041723251,
1855
+ "step": 1420
1856
+ },
1857
+ {
1858
+ "completion_length": 268.65250663757325,
1859
+ "epoch": 1.9066666666666667,
1860
+ "grad_norm": 1.1433619260787964,
1861
+ "kl": 1.185693359375,
1862
+ "learning_rate": 1.0586614915288572e-06,
1863
+ "loss": 0.1517,
1864
+ "reward": 1.4925000205636025,
1865
+ "reward_std": 0.4326970729976892,
1866
+ "rewards/accuracy_reward": 0.5587500042282045,
1867
+ "rewards/format_reward": 0.9337500005960464,
1868
+ "step": 1430
1869
+ },
1870
+ {
1871
+ "completion_length": 255.78125648498536,
1872
+ "epoch": 1.92,
1873
+ "grad_norm": 0.6243150234222412,
1874
+ "kl": 1.2357177734375,
1875
+ "learning_rate": 1.036474508437579e-06,
1876
+ "loss": 0.1554,
1877
+ "reward": 1.4700000181794166,
1878
+ "reward_std": 0.44454205557703974,
1879
+ "rewards/accuracy_reward": 0.5537500033155084,
1880
+ "rewards/format_reward": 0.9162500083446503,
1881
+ "step": 1440
1882
+ },
1883
+ {
1884
+ "completion_length": 234.58750495910644,
1885
+ "epoch": 1.9333333333333333,
1886
+ "grad_norm": 0.6096332669258118,
1887
+ "kl": 0.67691650390625,
1888
+ "learning_rate": 1.0143990869125186e-06,
1889
+ "loss": 0.099,
1890
+ "reward": 1.5912500202655793,
1891
+ "reward_std": 0.36347288005053996,
1892
+ "rewards/accuracy_reward": 0.6275000140070915,
1893
+ "rewards/format_reward": 0.9637500032782554,
1894
+ "step": 1450
1895
+ },
1896
+ {
1897
+ "completion_length": 256.7262538909912,
1898
+ "epoch": 1.9466666666666668,
1899
+ "grad_norm": 1.1025887727737427,
1900
+ "kl": 1.1599609375,
1901
+ "learning_rate": 9.924405400780786e-07,
1902
+ "loss": 0.1453,
1903
+ "reward": 1.4675000190734864,
1904
+ "reward_std": 0.3775463242083788,
1905
+ "rewards/accuracy_reward": 0.5312500094994903,
1906
+ "rewards/format_reward": 0.9362500011920929,
1907
+ "step": 1460
1908
+ },
1909
+ {
1910
+ "completion_length": 268.33000297546386,
1911
+ "epoch": 1.96,
1912
+ "grad_norm": 1.1707123517990112,
1913
+ "kl": 1.133642578125,
1914
+ "learning_rate": 9.70604152929197e-07,
1915
+ "loss": 0.1489,
1916
+ "reward": 1.4312500163912774,
1917
+ "reward_std": 0.4398480519652367,
1918
+ "rewards/accuracy_reward": 0.5075000053271651,
1919
+ "rewards/format_reward": 0.9237500041723251,
1920
+ "step": 1470
1921
+ },
1922
+ {
1923
+ "completion_length": 257.52125358581543,
1924
+ "epoch": 1.9733333333333334,
1925
+ "grad_norm": 0.9173078536987305,
1926
+ "kl": 1.48629150390625,
1927
+ "learning_rate": 9.488951810593526e-07,
1928
+ "loss": 0.1856,
1929
+ "reward": 1.4750000312924385,
1930
+ "reward_std": 0.4462425637990236,
1931
+ "rewards/accuracy_reward": 0.5600000127218664,
1932
+ "rewards/format_reward": 0.9150000050663948,
1933
+ "step": 1480
1934
+ },
1935
+ {
1936
+ "completion_length": 266.10000228881836,
1937
+ "epoch": 1.9866666666666668,
1938
+ "grad_norm": 0.7360009551048279,
1939
+ "kl": 1.1963623046875,
1940
+ "learning_rate": 9.273188493956475e-07,
1941
+ "loss": 0.1318,
1942
+ "reward": 1.5275000244379044,
1943
+ "reward_std": 0.38836930617690085,
1944
+ "rewards/accuracy_reward": 0.6137500053271652,
1945
+ "rewards/format_reward": 0.9137500077486038,
1946
+ "step": 1490
1947
+ },
1948
+ {
1949
+ "completion_length": 254.0337522506714,
1950
+ "epoch": 2.0,
1951
+ "grad_norm": 0.8448986411094666,
1952
+ "kl": 1.1837646484375,
1953
+ "learning_rate": 9.058803509412648e-07,
1954
+ "loss": 0.1552,
1955
+ "reward": 1.5100000187754632,
1956
+ "reward_std": 0.4034050587564707,
1957
+ "rewards/accuracy_reward": 0.5787500125356019,
1958
+ "rewards/format_reward": 0.9312500029802322,
1959
+ "step": 1500
1960
+ },
1961
+ {
1962
+ "completion_length": 251.92000274658204,
1963
+ "epoch": 2.013333333333333,
1964
+ "grad_norm": 0.3304300904273987,
1965
+ "kl": 0.811279296875,
1966
+ "learning_rate": 8.84584845525618e-07,
1967
+ "loss": 0.1094,
1968
+ "reward": 1.5350000232458114,
1969
+ "reward_std": 0.37311027348041537,
1970
+ "rewards/accuracy_reward": 0.5937500129453838,
1971
+ "rewards/format_reward": 0.941250005364418,
1972
+ "step": 1510
1973
+ },
1974
+ {
1975
+ "completion_length": 257.3825050354004,
1976
+ "epoch": 2.026666666666667,
1977
+ "grad_norm": 2.269232988357544,
1978
+ "kl": 1.01397705078125,
1979
+ "learning_rate": 8.63437458562477e-07,
1980
+ "loss": 0.1731,
1981
+ "reward": 1.5512500256299973,
1982
+ "reward_std": 0.4206295557320118,
1983
+ "rewards/accuracy_reward": 0.6212500111199916,
1984
+ "rewards/format_reward": 0.9300000056624412,
1985
+ "step": 1520
1986
+ },
1987
+ {
1988
+ "completion_length": 246.79625415802002,
1989
+ "epoch": 2.04,
1990
+ "grad_norm": 2.4657490253448486,
1991
+ "kl": 1.84619140625,
1992
+ "learning_rate": 8.424432798163837e-07,
1993
+ "loss": 0.1957,
1994
+ "reward": 1.462500013411045,
1995
+ "reward_std": 0.46618823148310184,
1996
+ "rewards/accuracy_reward": 0.5862500076182187,
1997
+ "rewards/format_reward": 0.87625000923872,
1998
+ "step": 1530
1999
+ },
2000
+ {
2001
+ "completion_length": 244.07625579833984,
2002
+ "epoch": 2.0533333333333332,
2003
+ "grad_norm": 0.878600537776947,
2004
+ "kl": 1.319287109375,
2005
+ "learning_rate": 8.216073621776437e-07,
2006
+ "loss": 0.097,
2007
+ "reward": 1.4925000190734863,
2008
+ "reward_std": 0.4461408320814371,
2009
+ "rewards/accuracy_reward": 0.5762500118464231,
2010
+ "rewards/format_reward": 0.916250005364418,
2011
+ "step": 1540
2012
+ },
2013
+ {
2014
+ "completion_length": 231.71000556945802,
2015
+ "epoch": 2.066666666666667,
2016
+ "grad_norm": 0.6690750122070312,
2017
+ "kl": 0.91011962890625,
2018
+ "learning_rate": 8.009347204461922e-07,
2019
+ "loss": 0.0719,
2020
+ "reward": 1.5675000205636025,
2021
+ "reward_std": 0.3554057668894529,
2022
+ "rewards/accuracy_reward": 0.6100000100210309,
2023
+ "rewards/format_reward": 0.9575000047683716,
2024
+ "step": 1550
2025
+ },
2026
+ {
2027
+ "completion_length": 230.32750396728517,
2028
+ "epoch": 2.08,
2029
+ "grad_norm": 0.9904080033302307,
2030
+ "kl": 0.50635986328125,
2031
+ "learning_rate": 7.804303301246311e-07,
2032
+ "loss": 0.0681,
2033
+ "reward": 1.675000011920929,
2034
+ "reward_std": 0.30205987617373464,
2035
+ "rewards/accuracy_reward": 0.7025000074878335,
2036
+ "rewards/format_reward": 0.9725000008940696,
2037
+ "step": 1560
2038
+ },
2039
+ {
2040
+ "completion_length": 257.5050031661987,
2041
+ "epoch": 2.0933333333333333,
2042
+ "grad_norm": 0.5869216918945312,
2043
+ "kl": 1.6803466796875,
2044
+ "learning_rate": 7.600991262207221e-07,
2045
+ "loss": 0.1555,
2046
+ "reward": 1.542500025033951,
2047
+ "reward_std": 0.3918322518467903,
2048
+ "rewards/accuracy_reward": 0.6312500072643161,
2049
+ "rewards/format_reward": 0.9112500101327896,
2050
+ "step": 1570
2051
+ },
2052
+ {
2053
+ "completion_length": 257.85625553131104,
2054
+ "epoch": 2.1066666666666665,
2055
+ "grad_norm": 0.7470917105674744,
2056
+ "kl": 1.1965576171875,
2057
+ "learning_rate": 7.399460020596266e-07,
2058
+ "loss": 0.1217,
2059
+ "reward": 1.4912500128149986,
2060
+ "reward_std": 0.44596712924540044,
2061
+ "rewards/accuracy_reward": 0.5762500122189522,
2062
+ "rewards/format_reward": 0.9150000065565109,
2063
+ "step": 1580
2064
+ },
2065
+ {
2066
+ "completion_length": 249.50250511169435,
2067
+ "epoch": 2.12,
2068
+ "grad_norm": 1.4397474527359009,
2069
+ "kl": 1.5828369140625,
2070
+ "learning_rate": 7.19975808106177e-07,
2071
+ "loss": 0.1932,
2072
+ "reward": 1.5000000178813935,
2073
+ "reward_std": 0.44045890420675277,
2074
+ "rewards/accuracy_reward": 0.5787500116042793,
2075
+ "rewards/format_reward": 0.921250007301569,
2076
+ "step": 1590
2077
+ },
2078
+ {
2079
+ "completion_length": 255.7712547302246,
2080
+ "epoch": 2.1333333333333333,
2081
+ "grad_norm": 0.47938272356987,
2082
+ "kl": 1.07216796875,
2083
+ "learning_rate": 7.001933507974634e-07,
2084
+ "loss": 0.1435,
2085
+ "reward": 1.4987500235438347,
2086
+ "reward_std": 0.4268573712557554,
2087
+ "rewards/accuracy_reward": 0.5737500079907477,
2088
+ "rewards/format_reward": 0.9250000044703484,
2089
+ "step": 1600
2090
+ },
2091
+ {
2092
+ "completion_length": 245.55125331878662,
2093
+ "epoch": 2.1466666666666665,
2094
+ "grad_norm": 0.8688609600067139,
2095
+ "kl": 1.22481689453125,
2096
+ "learning_rate": 6.806033913860195e-07,
2097
+ "loss": 0.1412,
2098
+ "reward": 1.4875000298023224,
2099
+ "reward_std": 0.41953446678817274,
2100
+ "rewards/accuracy_reward": 0.5675000123679638,
2101
+ "rewards/format_reward": 0.9200000070035458,
2102
+ "step": 1610
2103
+ },
2104
+ {
2105
+ "completion_length": 250.85125579833985,
2106
+ "epoch": 2.16,
2107
+ "grad_norm": 0.5293228030204773,
2108
+ "kl": 0.846044921875,
2109
+ "learning_rate": 6.6121064479388e-07,
2110
+ "loss": 0.1231,
2111
+ "reward": 1.5712500154972076,
2112
+ "reward_std": 0.38107405565679076,
2113
+ "rewards/accuracy_reward": 0.6225000062957406,
2114
+ "rewards/format_reward": 0.9487500011920929,
2115
+ "step": 1620
2116
+ },
2117
+ {
2118
+ "completion_length": 252.32750434875487,
2119
+ "epoch": 2.1733333333333333,
2120
+ "grad_norm": 1.0724272727966309,
2121
+ "kl": 1.2016357421875,
2122
+ "learning_rate": 6.420197784777925e-07,
2123
+ "loss": 0.1292,
2124
+ "reward": 1.4837500169873237,
2125
+ "reward_std": 0.4388523455709219,
2126
+ "rewards/accuracy_reward": 0.5450000113807618,
2127
+ "rewards/format_reward": 0.9387500062584877,
2128
+ "step": 1630
2129
+ },
2130
+ {
2131
+ "completion_length": 262.9075046539307,
2132
+ "epoch": 2.1866666666666665,
2133
+ "grad_norm": 0.852165699005127,
2134
+ "kl": 1.47001953125,
2135
+ "learning_rate": 6.230354113058505e-07,
2136
+ "loss": 0.1207,
2137
+ "reward": 1.438750022649765,
2138
+ "reward_std": 0.47429373003542424,
2139
+ "rewards/accuracy_reward": 0.5275000119581819,
2140
+ "rewards/format_reward": 0.9112500056624413,
2141
+ "step": 1640
2142
+ },
2143
+ {
2144
+ "completion_length": 237.82875328063966,
2145
+ "epoch": 2.2,
2146
+ "grad_norm": 0.7865124344825745,
2147
+ "kl": 0.8052978515625,
2148
+ "learning_rate": 6.04262112445821e-07,
2149
+ "loss": 0.0906,
2150
+ "reward": 1.5237500220537186,
2151
+ "reward_std": 0.35442090816795824,
2152
+ "rewards/accuracy_reward": 0.5787500074133277,
2153
+ "rewards/format_reward": 0.9450000032782555,
2154
+ "step": 1650
2155
+ },
2156
+ {
2157
+ "completion_length": 244.2200050354004,
2158
+ "epoch": 2.2133333333333334,
2159
+ "grad_norm": 2.7681257724761963,
2160
+ "kl": 1.0701416015625,
2161
+ "learning_rate": 5.857044002654356e-07,
2162
+ "loss": 0.1363,
2163
+ "reward": 1.5400000184774398,
2164
+ "reward_std": 0.41701370403170585,
2165
+ "rewards/accuracy_reward": 0.6250000096857548,
2166
+ "rewards/format_reward": 0.9150000095367432,
2167
+ "step": 1660
2168
+ },
2169
+ {
2170
+ "completion_length": 236.56875324249268,
2171
+ "epoch": 2.2266666666666666,
2172
+ "grad_norm": 1.139227032661438,
2173
+ "kl": 1.362841796875,
2174
+ "learning_rate": 5.673667412449069e-07,
2175
+ "loss": 0.1759,
2176
+ "reward": 1.4887500166893006,
2177
+ "reward_std": 0.46174298636615274,
2178
+ "rewards/accuracy_reward": 0.5825000075623393,
2179
+ "rewards/format_reward": 0.9062500104308129,
2180
+ "step": 1670
2181
+ },
2182
+ {
2183
+ "completion_length": 251.44250373840333,
2184
+ "epoch": 2.24,
2185
+ "grad_norm": 1.1506483554840088,
2186
+ "kl": 3.351171875,
2187
+ "learning_rate": 5.492535489019345e-07,
2188
+ "loss": 0.3264,
2189
+ "reward": 1.5837500229477883,
2190
+ "reward_std": 0.4318976990878582,
2191
+ "rewards/accuracy_reward": 0.6537500128149987,
2192
+ "rewards/format_reward": 0.9300000041723251,
2193
+ "step": 1680
2194
+ },
2195
+ {
2196
+ "completion_length": 237.26500434875487,
2197
+ "epoch": 2.2533333333333334,
2198
+ "grad_norm": 0.24002552032470703,
2199
+ "kl": 0.6318603515625,
2200
+ "learning_rate": 5.313691827294569e-07,
2201
+ "loss": 0.0718,
2202
+ "reward": 1.6012500166893004,
2203
+ "reward_std": 0.3187932658940554,
2204
+ "rewards/accuracy_reward": 0.6325000114738941,
2205
+ "rewards/format_reward": 0.9687500029802323,
2206
+ "step": 1690
2207
+ },
2208
+ {
2209
+ "completion_length": 239.81500282287598,
2210
+ "epoch": 2.2666666666666666,
2211
+ "grad_norm": 0.3545837998390198,
2212
+ "kl": 0.5182861328125,
2213
+ "learning_rate": 5.137179471464047e-07,
2214
+ "loss": 0.0914,
2215
+ "reward": 1.5900000303983688,
2216
+ "reward_std": 0.36133957989513876,
2217
+ "rewards/accuracy_reward": 0.6212500061839819,
2218
+ "rewards/format_reward": 0.9687500029802323,
2219
+ "step": 1700
2220
+ },
2221
+ {
2222
+ "completion_length": 239.17750453948975,
2223
+ "epoch": 2.2800000000000002,
2224
+ "grad_norm": 0.3821852505207062,
2225
+ "kl": 0.6372802734375,
2226
+ "learning_rate": 4.963040904617131e-07,
2227
+ "loss": 0.0939,
2228
+ "reward": 1.532500021159649,
2229
+ "reward_std": 0.3489324226975441,
2230
+ "rewards/accuracy_reward": 0.580000011343509,
2231
+ "rewards/format_reward": 0.9525000095367432,
2232
+ "step": 1710
2233
+ },
2234
+ {
2235
+ "completion_length": 251.4825029373169,
2236
+ "epoch": 2.2933333333333334,
2237
+ "grad_norm": 1.0452297925949097,
2238
+ "kl": 1.1371826171875,
2239
+ "learning_rate": 4.791318038518345e-07,
2240
+ "loss": 0.1565,
2241
+ "reward": 1.4887500151991844,
2242
+ "reward_std": 0.4074371732771397,
2243
+ "rewards/accuracy_reward": 0.5575000095181167,
2244
+ "rewards/format_reward": 0.9312500029802322,
2245
+ "step": 1720
2246
+ },
2247
+ {
2248
+ "completion_length": 252.94750308990479,
2249
+ "epoch": 2.3066666666666666,
2250
+ "grad_norm": 1.300804853439331,
2251
+ "kl": 1.0220703125,
2252
+ "learning_rate": 4.6220522035200607e-07,
2253
+ "loss": 0.1473,
2254
+ "reward": 1.4225000247359276,
2255
+ "reward_std": 0.4227703019976616,
2256
+ "rewards/accuracy_reward": 0.4975000057369471,
2257
+ "rewards/format_reward": 0.9250000044703484,
2258
+ "step": 1730
2259
+ },
2260
+ {
2261
+ "completion_length": 245.8537540435791,
2262
+ "epoch": 2.32,
2263
+ "grad_norm": 1.0373808145523071,
2264
+ "kl": 1.1215576171875,
2265
+ "learning_rate": 4.4552841386150737e-07,
2266
+ "loss": 0.1625,
2267
+ "reward": 1.570000010728836,
2268
+ "reward_std": 0.44271881096065047,
2269
+ "rewards/accuracy_reward": 0.6412500072270632,
2270
+ "rewards/format_reward": 0.9287500068545341,
2271
+ "step": 1740
2272
+ },
2273
+ {
2274
+ "completion_length": 242.66250534057616,
2275
+ "epoch": 2.3333333333333335,
2276
+ "grad_norm": 1.0271825790405273,
2277
+ "kl": 1.0448974609375,
2278
+ "learning_rate": 4.291053981631517e-07,
2279
+ "loss": 0.1839,
2280
+ "reward": 1.5500000163912773,
2281
+ "reward_std": 0.4699274588376284,
2282
+ "rewards/accuracy_reward": 0.6150000074878335,
2283
+ "rewards/format_reward": 0.9350000068545341,
2284
+ "step": 1750
2285
+ },
2286
+ {
2287
+ "completion_length": 243.92625598907472,
2288
+ "epoch": 2.3466666666666667,
2289
+ "grad_norm": 0.7303493022918701,
2290
+ "kl": 1.138134765625,
2291
+ "learning_rate": 4.129401259572467e-07,
2292
+ "loss": 0.1799,
2293
+ "reward": 1.595000022649765,
2294
+ "reward_std": 0.41913925893604753,
2295
+ "rewards/accuracy_reward": 0.6700000133365392,
2296
+ "rewards/format_reward": 0.9250000059604645,
2297
+ "step": 1760
2298
+ },
2299
+ {
2300
+ "completion_length": 246.7412540435791,
2301
+ "epoch": 2.36,
2302
+ "grad_norm": 0.9631237983703613,
2303
+ "kl": 0.8330322265625,
2304
+ "learning_rate": 3.9703648791025716e-07,
2305
+ "loss": 0.1262,
2306
+ "reward": 1.5162500217556953,
2307
+ "reward_std": 0.3487443562597036,
2308
+ "rewards/accuracy_reward": 0.5662500067614019,
2309
+ "rewards/format_reward": 0.95,
2310
+ "step": 1770
2311
+ },
2312
+ {
2313
+ "completion_length": 232.69125442504884,
2314
+ "epoch": 2.3733333333333335,
2315
+ "grad_norm": 0.3894151449203491,
2316
+ "kl": 0.7792236328125,
2317
+ "learning_rate": 3.813983117183973e-07,
2318
+ "loss": 0.1313,
2319
+ "reward": 1.5537500232458115,
2320
+ "reward_std": 0.36217943094670774,
2321
+ "rewards/accuracy_reward": 0.6062500105239451,
2322
+ "rewards/format_reward": 0.947500005364418,
2323
+ "step": 1780
2324
+ },
2325
+ {
2326
+ "completion_length": 248.34000549316406,
2327
+ "epoch": 2.3866666666666667,
2328
+ "grad_norm": 1.5399941205978394,
2329
+ "kl": 0.61976318359375,
2330
+ "learning_rate": 3.660293611863782e-07,
2331
+ "loss": 0.1232,
2332
+ "reward": 1.5750000208616257,
2333
+ "reward_std": 0.3620978184044361,
2334
+ "rewards/accuracy_reward": 0.6150000086054206,
2335
+ "rewards/format_reward": 0.9600000008940697,
2336
+ "step": 1790
2337
+ },
2338
+ {
2339
+ "completion_length": 256.740007019043,
2340
+ "epoch": 2.4,
2341
+ "grad_norm": 1.343291163444519,
2342
+ "kl": 0.779443359375,
2343
+ "learning_rate": 3.5093333532153313e-07,
2344
+ "loss": 0.1479,
2345
+ "reward": 1.5312500238418578,
2346
+ "reward_std": 0.4155931018292904,
2347
+ "rewards/accuracy_reward": 0.6075000097043812,
2348
+ "rewards/format_reward": 0.9237500041723251,
2349
+ "step": 1800
2350
+ },
2351
+ {
2352
+ "completion_length": 254.71500473022462,
2353
+ "epoch": 2.413333333333333,
2354
+ "grad_norm": 0.8221819400787354,
2355
+ "kl": 0.804345703125,
2356
+ "learning_rate": 3.361138674435386e-07,
2357
+ "loss": 0.1537,
2358
+ "reward": 1.520000022649765,
2359
+ "reward_std": 0.39584226049482824,
2360
+ "rewards/accuracy_reward": 0.5900000093504787,
2361
+ "rewards/format_reward": 0.9300000071525574,
2362
+ "step": 1810
2363
+ },
2364
+ {
2365
+ "completion_length": 225.73875350952147,
2366
+ "epoch": 2.4266666666666667,
2367
+ "grad_norm": 0.5089777112007141,
2368
+ "kl": 0.951416015625,
2369
+ "learning_rate": 3.215745243099449e-07,
2370
+ "loss": 0.0989,
2371
+ "reward": 1.5050000190734862,
2372
+ "reward_std": 0.34333288110792637,
2373
+ "rewards/accuracy_reward": 0.5612500077113509,
2374
+ "rewards/format_reward": 0.9437500059604644,
2375
+ "step": 1820
2376
+ },
2377
+ {
2378
+ "completion_length": 244.07000427246095,
2379
+ "epoch": 2.44,
2380
+ "grad_norm": 0.4870164096355438,
2381
+ "kl": 0.99180908203125,
2382
+ "learning_rate": 3.073188052577282e-07,
2383
+ "loss": 0.1526,
2384
+ "reward": 1.5562500178813934,
2385
+ "reward_std": 0.424020304530859,
2386
+ "rewards/accuracy_reward": 0.6175000119954348,
2387
+ "rewards/format_reward": 0.9387500047683716,
2388
+ "step": 1830
2389
+ },
2390
+ {
2391
+ "completion_length": 233.82125263214112,
2392
+ "epoch": 2.453333333333333,
2393
+ "grad_norm": 1.4273000955581665,
2394
+ "kl": 1.02783203125,
2395
+ "learning_rate": 2.93350141361067e-07,
2396
+ "loss": 0.1579,
2397
+ "reward": 1.5900000289082528,
2398
+ "reward_std": 0.4301666900515556,
2399
+ "rewards/accuracy_reward": 0.6575000090524554,
2400
+ "rewards/format_reward": 0.9325000047683716,
2401
+ "step": 1840
2402
+ },
2403
+ {
2404
+ "completion_length": 230.63875503540038,
2405
+ "epoch": 2.466666666666667,
2406
+ "grad_norm": 0.8823533654212952,
2407
+ "kl": 0.8214599609375,
2408
+ "learning_rate": 2.796718946055488e-07,
2409
+ "loss": 0.1264,
2410
+ "reward": 1.5850000232458115,
2411
+ "reward_std": 0.3670020330697298,
2412
+ "rewards/accuracy_reward": 0.6350000068545342,
2413
+ "rewards/format_reward": 0.9500000029802322,
2414
+ "step": 1850
2415
+ },
2416
+ {
2417
+ "completion_length": 238.70250186920165,
2418
+ "epoch": 2.48,
2419
+ "grad_norm": 0.6716477274894714,
2420
+ "kl": 0.7898681640625,
2421
+ "learning_rate": 2.6628735707900655e-07,
2422
+ "loss": 0.119,
2423
+ "reward": 1.6300000220537185,
2424
+ "reward_std": 0.34950714409351347,
2425
+ "rewards/accuracy_reward": 0.6787500105798244,
2426
+ "rewards/format_reward": 0.9512500062584877,
2427
+ "step": 1860
2428
+ },
2429
+ {
2430
+ "completion_length": 255.3150053024292,
2431
+ "epoch": 2.493333333333333,
2432
+ "grad_norm": 0.72442227602005,
2433
+ "kl": 0.9836181640625,
2434
+ "learning_rate": 2.531997501791779e-07,
2435
+ "loss": 0.1863,
2436
+ "reward": 1.5250000268220902,
2437
+ "reward_std": 0.4403968006372452,
2438
+ "rewards/accuracy_reward": 0.5950000144541263,
2439
+ "rewards/format_reward": 0.9300000086426735,
2440
+ "step": 1870
2441
+ },
2442
+ {
2443
+ "completion_length": 269.0000047683716,
2444
+ "epoch": 2.506666666666667,
2445
+ "grad_norm": 0.5673468112945557,
2446
+ "kl": 0.9330322265625,
2447
+ "learning_rate": 2.4041222383837535e-07,
2448
+ "loss": 0.1553,
2449
+ "reward": 1.537500023841858,
2450
+ "reward_std": 0.4138102397322655,
2451
+ "rewards/accuracy_reward": 0.6012500094249844,
2452
+ "rewards/format_reward": 0.936250002682209,
2453
+ "step": 1880
2454
+ },
2455
+ {
2456
+ "completion_length": 285.29625511169434,
2457
+ "epoch": 2.52,
2458
+ "grad_norm": 0.7273412942886353,
2459
+ "kl": 1.2927001953125,
2460
+ "learning_rate": 2.2792785576536108e-07,
2461
+ "loss": 0.2201,
2462
+ "reward": 1.441250018775463,
2463
+ "reward_std": 0.47595637403428553,
2464
+ "rewards/accuracy_reward": 0.5312500111758709,
2465
+ "rewards/format_reward": 0.9100000083446502,
2466
+ "step": 1890
2467
+ },
2468
+ {
2469
+ "completion_length": 254.1075038909912,
2470
+ "epoch": 2.533333333333333,
2471
+ "grad_norm": 0.752768874168396,
2472
+ "kl": 1.1075439453125,
2473
+ "learning_rate": 2.1574965070460045e-07,
2474
+ "loss": 0.152,
2475
+ "reward": 1.6112500235438347,
2476
+ "reward_std": 0.40718725696206093,
2477
+ "rewards/accuracy_reward": 0.6700000101700425,
2478
+ "rewards/format_reward": 0.9412500068545342,
2479
+ "step": 1900
2480
+ },
2481
+ {
2482
+ "completion_length": 234.2762550354004,
2483
+ "epoch": 2.546666666666667,
2484
+ "grad_norm": 0.5253682136535645,
2485
+ "kl": 0.95108642578125,
2486
+ "learning_rate": 2.0388053971307929e-07,
2487
+ "loss": 0.1638,
2488
+ "reward": 1.605000016093254,
2489
+ "reward_std": 0.39718882702291014,
2490
+ "rewards/accuracy_reward": 0.6737500067800284,
2491
+ "rewards/format_reward": 0.9312500044703483,
2492
+ "step": 1910
2493
+ },
2494
+ {
2495
+ "completion_length": 255.07750434875487,
2496
+ "epoch": 2.56,
2497
+ "grad_norm": 0.5825310945510864,
2498
+ "kl": 1.0739501953125,
2499
+ "learning_rate": 1.9232337945485655e-07,
2500
+ "loss": 0.1466,
2501
+ "reward": 1.5462500244379043,
2502
+ "reward_std": 0.4311613071709871,
2503
+ "rewards/accuracy_reward": 0.6100000113248825,
2504
+ "rewards/format_reward": 0.9362500086426735,
2505
+ "step": 1920
2506
+ },
2507
+ {
2508
+ "completion_length": 262.95500526428225,
2509
+ "epoch": 2.5733333333333333,
2510
+ "grad_norm": 1.1008692979812622,
2511
+ "kl": 8.222119140625,
2512
+ "learning_rate": 1.810809515135184e-07,
2513
+ "loss": 0.4634,
2514
+ "reward": 1.4800000235438346,
2515
+ "reward_std": 0.4289998158812523,
2516
+ "rewards/accuracy_reward": 0.5450000076554715,
2517
+ "rewards/format_reward": 0.9350000068545341,
2518
+ "step": 1930
2519
+ },
2520
+ {
2521
+ "completion_length": 239.78375244140625,
2522
+ "epoch": 2.586666666666667,
2523
+ "grad_norm": 0.9321441054344177,
2524
+ "kl": 1.1059326171875,
2525
+ "learning_rate": 1.701559617227084e-07,
2526
+ "loss": 0.1856,
2527
+ "reward": 1.5887500256299973,
2528
+ "reward_std": 0.413610565662384,
2529
+ "rewards/accuracy_reward": 0.6537500060163438,
2530
+ "rewards/format_reward": 0.9350000038743019,
2531
+ "step": 1940
2532
+ },
2533
+ {
2534
+ "completion_length": 267.56750259399416,
2535
+ "epoch": 2.6,
2536
+ "grad_norm": 0.4667798578739166,
2537
+ "kl": 0.897509765625,
2538
+ "learning_rate": 1.5955103951488177e-07,
2539
+ "loss": 0.1291,
2540
+ "reward": 1.512500023841858,
2541
+ "reward_std": 0.36861380077898503,
2542
+ "rewards/accuracy_reward": 0.5650000072084367,
2543
+ "rewards/format_reward": 0.9475000098347663,
2544
+ "step": 1950
2545
+ },
2546
+ {
2547
+ "completion_length": 244.6812545776367,
2548
+ "epoch": 2.6133333333333333,
2549
+ "grad_norm": 1.1859592199325562,
2550
+ "kl": 0.7323974609375,
2551
+ "learning_rate": 1.4926873728845668e-07,
2552
+ "loss": 0.132,
2553
+ "reward": 1.6525000289082528,
2554
+ "reward_std": 0.3647048894315958,
2555
+ "rewards/accuracy_reward": 0.7000000123865903,
2556
+ "rewards/format_reward": 0.9525000050663948,
2557
+ "step": 1960
2558
+ },
2559
+ {
2560
+ "completion_length": 250.45750370025635,
2561
+ "epoch": 2.626666666666667,
2562
+ "grad_norm": 0.6404736042022705,
2563
+ "kl": 0.98271484375,
2564
+ "learning_rate": 1.3931152979349926e-07,
2565
+ "loss": 0.1389,
2566
+ "reward": 1.5887500122189522,
2567
+ "reward_std": 0.4178105805069208,
2568
+ "rewards/accuracy_reward": 0.6537500105798244,
2569
+ "rewards/format_reward": 0.9350000023841858,
2570
+ "step": 1970
2571
+ },
2572
+ {
2573
+ "completion_length": 241.34000396728516,
2574
+ "epoch": 2.64,
2575
+ "grad_norm": 1.1606582403182983,
2576
+ "kl": 0.8234375,
2577
+ "learning_rate": 1.2968181353609853e-07,
2578
+ "loss": 0.1034,
2579
+ "reward": 1.6037500262260438,
2580
+ "reward_std": 0.3661252219229937,
2581
+ "rewards/accuracy_reward": 0.6487500038929284,
2582
+ "rewards/format_reward": 0.9550000041723251,
2583
+ "step": 1980
2584
+ },
2585
+ {
2586
+ "completion_length": 258.2050045013428,
2587
+ "epoch": 2.6533333333333333,
2588
+ "grad_norm": 0.9993649125099182,
2589
+ "kl": 0.7095947265625,
2590
+ "learning_rate": 1.2038190620157685e-07,
2591
+ "loss": 0.0885,
2592
+ "reward": 1.6100000202655793,
2593
+ "reward_std": 0.3543590843677521,
2594
+ "rewards/accuracy_reward": 0.6512500140815973,
2595
+ "rewards/format_reward": 0.9587500020861626,
2596
+ "step": 1990
2597
+ },
2598
+ {
2599
+ "completion_length": 248.90000267028807,
2600
+ "epoch": 2.6666666666666665,
2601
+ "grad_norm": 0.7114527821540833,
2602
+ "kl": 486.95928955078125,
2603
+ "learning_rate": 1.114140460966645e-07,
2604
+ "loss": 20.4763,
2605
+ "reward": 1.6000000208616256,
2606
+ "reward_std": 0.3433480467647314,
2607
+ "rewards/accuracy_reward": 0.6437500057741999,
2608
+ "rewards/format_reward": 0.95625,
2609
+ "step": 2000
2610
+ },
2611
+ {
2612
+ "completion_length": 242.95375537872314,
2613
+ "epoch": 2.68,
2614
+ "grad_norm": 1.368294596672058,
2615
+ "kl": 0.9123046875,
2616
+ "learning_rate": 1.0278039161078634e-07,
2617
+ "loss": 0.1081,
2618
+ "reward": 1.583750031888485,
2619
+ "reward_std": 0.3553517021238804,
2620
+ "rewards/accuracy_reward": 0.6350000066682696,
2621
+ "rewards/format_reward": 0.9487500056624413,
2622
+ "step": 2010
2623
+ },
2624
+ {
2625
+ "completion_length": 267.4425052642822,
2626
+ "epoch": 2.6933333333333334,
2627
+ "grad_norm": 0.9436002969741821,
2628
+ "kl": 1.2000244140625,
2629
+ "learning_rate": 9.4483020696578e-08,
2630
+ "loss": 0.1847,
2631
+ "reward": 1.4937500163912774,
2632
+ "reward_std": 0.42551035098731516,
2633
+ "rewards/accuracy_reward": 0.5700000072829425,
2634
+ "rewards/format_reward": 0.9237500086426735,
2635
+ "step": 2020
2636
+ },
2637
+ {
2638
+ "completion_length": 261.7937522888184,
2639
+ "epoch": 2.7066666666666666,
2640
+ "grad_norm": 0.832007884979248,
2641
+ "kl": 1.18310546875,
2642
+ "learning_rate": 8.652393036976158e-08,
2643
+ "loss": 0.1359,
2644
+ "reward": 1.4962500289082528,
2645
+ "reward_std": 0.4743993539363146,
2646
+ "rewards/accuracy_reward": 0.5837500118650496,
2647
+ "rewards/format_reward": 0.9125000044703484,
2648
+ "step": 2030
2649
+ },
2650
+ {
2651
+ "completion_length": 248.56500606536866,
2652
+ "epoch": 2.7199999999999998,
2653
+ "grad_norm": 0.9683050513267517,
2654
+ "kl": 1.044775390625,
2655
+ "learning_rate": 7.89050362285062e-08,
2656
+ "loss": 0.1735,
2657
+ "reward": 1.5050000220537185,
2658
+ "reward_std": 0.4250973217189312,
2659
+ "rewards/accuracy_reward": 0.5762500053271651,
2660
+ "rewards/format_reward": 0.928750005364418,
2661
+ "step": 2040
2662
+ },
2663
+ {
2664
+ "completion_length": 261.2425033569336,
2665
+ "epoch": 2.7333333333333334,
2666
+ "grad_norm": 0.8732206225395203,
2667
+ "kl": 2.1559326171875,
2668
+ "learning_rate": 7.162817199237703e-08,
2669
+ "loss": 0.228,
2670
+ "reward": 1.4837500289082528,
2671
+ "reward_std": 0.4267508018761873,
2672
+ "rewards/accuracy_reward": 0.5775000069290399,
2673
+ "rewards/format_reward": 0.9062500104308129,
2674
+ "step": 2050
2675
+ },
2676
+ {
2677
+ "completion_length": 264.16000480651854,
2678
+ "epoch": 2.7466666666666666,
2679
+ "grad_norm": 0.6869509220123291,
2680
+ "kl": 0.93458251953125,
2681
+ "learning_rate": 6.469508906099792e-08,
2682
+ "loss": 0.1569,
2683
+ "reward": 1.5300000235438347,
2684
+ "reward_std": 0.43696386478841304,
2685
+ "rewards/accuracy_reward": 0.5937500102445483,
2686
+ "rewards/format_reward": 0.9362500056624412,
2687
+ "step": 2060
2688
+ },
2689
+ {
2690
+ "completion_length": 238.19875507354737,
2691
+ "epoch": 2.76,
2692
+ "grad_norm": 1.0900628566741943,
2693
+ "kl": 0.9701171875,
2694
+ "learning_rate": 5.810745609252166e-08,
2695
+ "loss": 0.116,
2696
+ "reward": 1.491250030696392,
2697
+ "reward_std": 0.35836701430380347,
2698
+ "rewards/accuracy_reward": 0.5612500054761768,
2699
+ "rewards/format_reward": 0.9300000026822091,
2700
+ "step": 2070
2701
+ },
2702
+ {
2703
+ "completion_length": 249.6000057220459,
2704
+ "epoch": 2.7733333333333334,
2705
+ "grad_norm": 0.7228848338127136,
2706
+ "kl": 0.80611572265625,
2707
+ "learning_rate": 5.186685860201718e-08,
2708
+ "loss": 0.1248,
2709
+ "reward": 1.57750001847744,
2710
+ "reward_std": 0.37631080821156504,
2711
+ "rewards/accuracy_reward": 0.6237500101327896,
2712
+ "rewards/format_reward": 0.9537500008940697,
2713
+ "step": 2080
2714
+ },
2715
+ {
2716
+ "completion_length": 251.9662546157837,
2717
+ "epoch": 2.7866666666666666,
2718
+ "grad_norm": 0.471282422542572,
2719
+ "kl": 1.272216796875,
2720
+ "learning_rate": 4.59747985798662e-08,
2721
+ "loss": 0.1567,
2722
+ "reward": 1.493750023841858,
2723
+ "reward_std": 0.44336883127689364,
2724
+ "rewards/accuracy_reward": 0.5787500059232116,
2725
+ "rewards/format_reward": 0.9150000050663948,
2726
+ "step": 2090
2727
+ },
2728
+ {
2729
+ "completion_length": 268.16250438690184,
2730
+ "epoch": 2.8,
2731
+ "grad_norm": 0.7542273998260498,
2732
+ "kl": 1.019189453125,
2733
+ "learning_rate": 4.0432694130264294e-08,
2734
+ "loss": 0.1766,
2735
+ "reward": 1.512500025331974,
2736
+ "reward_std": 0.3997300285845995,
2737
+ "rewards/accuracy_reward": 0.5850000084377825,
2738
+ "rewards/format_reward": 0.9275000020861626,
2739
+ "step": 2100
2740
+ },
2741
+ {
2742
+ "completion_length": 258.7000057220459,
2743
+ "epoch": 2.8133333333333335,
2744
+ "grad_norm": 0.652788519859314,
2745
+ "kl": 1.01083984375,
2746
+ "learning_rate": 3.524187912991056e-08,
2747
+ "loss": 0.0999,
2748
+ "reward": 1.5262500286102294,
2749
+ "reward_std": 0.3544511809945107,
2750
+ "rewards/accuracy_reward": 0.5775000077672303,
2751
+ "rewards/format_reward": 0.9487500041723251,
2752
+ "step": 2110
2753
+ },
2754
+ {
2755
+ "completion_length": 257.93125343322754,
2756
+ "epoch": 2.8266666666666667,
2757
+ "grad_norm": 0.9080987572669983,
2758
+ "kl": 1.49254150390625,
2759
+ "learning_rate": 3.040360290696909e-08,
2760
+ "loss": 0.2127,
2761
+ "reward": 1.4812500149011611,
2762
+ "reward_std": 0.42118182219564915,
2763
+ "rewards/accuracy_reward": 0.5700000080280005,
2764
+ "rewards/format_reward": 0.9112500056624413,
2765
+ "step": 2120
2766
+ },
2767
+ {
2768
+ "completion_length": 234.36625385284424,
2769
+ "epoch": 2.84,
2770
+ "grad_norm": 0.9428286552429199,
2771
+ "kl": 0.8838134765625,
2772
+ "learning_rate": 2.5919029940380145e-08,
2773
+ "loss": 0.1446,
2774
+ "reward": 1.5850000232458115,
2775
+ "reward_std": 0.3889846485108137,
2776
+ "rewards/accuracy_reward": 0.652500010933727,
2777
+ "rewards/format_reward": 0.9325000062584877,
2778
+ "step": 2130
2779
+ },
2780
+ {
2781
+ "completion_length": 253.12500400543212,
2782
+ "epoch": 2.8533333333333335,
2783
+ "grad_norm": 1.143280029296875,
2784
+ "kl": 0.8545166015625,
2785
+ "learning_rate": 2.178923957959289e-08,
2786
+ "loss": 0.1153,
2787
+ "reward": 1.526250024139881,
2788
+ "reward_std": 0.39961728677153585,
2789
+ "rewards/accuracy_reward": 0.5787500070407987,
2790
+ "rewards/format_reward": 0.947500005364418,
2791
+ "step": 2140
2792
+ },
2793
+ {
2794
+ "completion_length": 244.91625385284425,
2795
+ "epoch": 2.8666666666666667,
2796
+ "grad_norm": 0.6661178469657898,
2797
+ "kl": 0.988916015625,
2798
+ "learning_rate": 1.8015225784786483e-08,
2799
+ "loss": 0.1399,
2800
+ "reward": 1.551250022649765,
2801
+ "reward_std": 0.3881247241050005,
2802
+ "rewards/accuracy_reward": 0.6075000107288361,
2803
+ "rewards/format_reward": 0.9437500059604644,
2804
+ "step": 2150
2805
+ },
2806
+ {
2807
+ "completion_length": 232.5712547302246,
2808
+ "epoch": 2.88,
2809
+ "grad_norm": 0.9537074565887451,
2810
+ "kl": 0.7067138671875,
2811
+ "learning_rate": 1.4597896887644457e-08,
2812
+ "loss": 0.1309,
2813
+ "reward": 1.641250017285347,
2814
+ "reward_std": 0.3669580578804016,
2815
+ "rewards/accuracy_reward": 0.6787500075995923,
2816
+ "rewards/format_reward": 0.9625000029802322,
2817
+ "step": 2160
2818
+ },
2819
+ {
2820
+ "completion_length": 228.0150047302246,
2821
+ "epoch": 2.8933333333333335,
2822
+ "grad_norm": 1.5120928287506104,
2823
+ "kl": 0.9121337890625,
2824
+ "learning_rate": 1.1538075372735435e-08,
2825
+ "loss": 0.1243,
2826
+ "reward": 1.5912500202655793,
2827
+ "reward_std": 0.36697540059685707,
2828
+ "rewards/accuracy_reward": 0.6437500088475645,
2829
+ "rewards/format_reward": 0.9475000023841857,
2830
+ "step": 2170
2831
+ },
2832
+ {
2833
+ "completion_length": 248.69500427246095,
2834
+ "epoch": 2.9066666666666667,
2835
+ "grad_norm": 13.778345108032227,
2836
+ "kl": 1.008544921875,
2837
+ "learning_rate": 8.836497679557964e-09,
2838
+ "loss": 0.1507,
2839
+ "reward": 1.547500017285347,
2840
+ "reward_std": 0.39245944768190383,
2841
+ "rewards/accuracy_reward": 0.600000009033829,
2842
+ "rewards/format_reward": 0.9475000008940697,
2843
+ "step": 2180
2844
+ },
2845
+ {
2846
+ "completion_length": 239.8487533569336,
2847
+ "epoch": 2.92,
2848
+ "grad_norm": 0.7719865441322327,
2849
+ "kl": 0.8395263671875,
2850
+ "learning_rate": 6.493814025293476e-09,
2851
+ "loss": 0.1101,
2852
+ "reward": 1.5650000244379043,
2853
+ "reward_std": 0.36453715413808824,
2854
+ "rewards/accuracy_reward": 0.616250010766089,
2855
+ "rewards/format_reward": 0.948750002682209,
2856
+ "step": 2190
2857
+ },
2858
+ {
2859
+ "completion_length": 241.00000209808348,
2860
+ "epoch": 2.9333333333333336,
2861
+ "grad_norm": 0.8857264518737793,
2862
+ "kl": 1.47626953125,
2863
+ "learning_rate": 4.510588248311964e-09,
2864
+ "loss": 0.1851,
2865
+ "reward": 1.4975000217556953,
2866
+ "reward_std": 0.3946533836424351,
2867
+ "rewards/accuracy_reward": 0.5750000107102096,
2868
+ "rewards/format_reward": 0.9225000023841858,
2869
+ "step": 2200
2870
+ },
2871
+ {
2872
+ "completion_length": 259.2575029373169,
2873
+ "epoch": 2.9466666666666668,
2874
+ "grad_norm": 1.154828667640686,
2875
+ "kl": 0.94993896484375,
2876
+ "learning_rate": 2.8872976724670375e-09,
2877
+ "loss": 0.1104,
2878
+ "reward": 1.6000000178813933,
2879
+ "reward_std": 0.37620634213089943,
2880
+ "rewards/accuracy_reward": 0.6500000108033419,
2881
+ "rewards/format_reward": 0.9500000044703484,
2882
+ "step": 2210
2883
+ },
2884
+ {
2885
+ "completion_length": 269.49375610351564,
2886
+ "epoch": 2.96,
2887
+ "grad_norm": 0.6843157410621643,
2888
+ "kl": 1.0091552734375,
2889
+ "learning_rate": 1.624332992213151e-09,
2890
+ "loss": 0.1773,
2891
+ "reward": 1.4937500327825546,
2892
+ "reward_std": 0.4581430654972792,
2893
+ "rewards/accuracy_reward": 0.5700000094249844,
2894
+ "rewards/format_reward": 0.9237500056624413,
2895
+ "step": 2220
2896
+ },
2897
+ {
2898
+ "completion_length": 259.83625545501707,
2899
+ "epoch": 2.9733333333333336,
2900
+ "grad_norm": 0.854248583316803,
2901
+ "kl": 0.9797119140625,
2902
+ "learning_rate": 7.219981785733243e-10,
2903
+ "loss": 0.1681,
2904
+ "reward": 1.557500024139881,
2905
+ "reward_std": 0.4305118963122368,
2906
+ "rewards/accuracy_reward": 0.6300000097602606,
2907
+ "rewards/format_reward": 0.927500008046627,
2908
+ "step": 2230
2909
+ },
2910
+ {
2911
+ "completion_length": 244.36375274658204,
2912
+ "epoch": 2.986666666666667,
2913
+ "grad_norm": 0.7409846186637878,
2914
+ "kl": 1.03876953125,
2915
+ "learning_rate": 1.8051040597882872e-10,
2916
+ "loss": 0.1227,
2917
+ "reward": 1.4725000217556954,
2918
+ "reward_std": 0.3398312862962484,
2919
+ "rewards/accuracy_reward": 0.5237500051036477,
2920
+ "rewards/format_reward": 0.9487500056624413,
2921
+ "step": 2240
2922
+ },
2923
+ {
2924
+ "completion_length": 253.62000465393066,
2925
+ "epoch": 3.0,
2926
+ "grad_norm": 1.5809358358383179,
2927
+ "kl": 1.09482421875,
2928
+ "learning_rate": 0.0,
2929
+ "loss": 0.1685,
2930
+ "reward": 1.4925000250339509,
2931
+ "reward_std": 0.3981329433619976,
2932
+ "rewards/accuracy_reward": 0.5537500067614018,
2933
+ "rewards/format_reward": 0.9387500017881394,
2934
+ "step": 2250
2935
+ },
2936
+ {
2937
+ "epoch": 3.0,
2938
+ "step": 2250,
2939
+ "total_flos": 0.0,
2940
+ "train_loss": 0.283822166296343,
2941
+ "train_runtime": 92192.5552,
2942
+ "train_samples_per_second": 0.195,
2943
+ "train_steps_per_second": 0.024
2944
+ }
2945
+ ],
2946
+ "logging_steps": 10,
2947
+ "max_steps": 2250,
2948
+ "num_input_tokens_seen": 0,
2949
+ "num_train_epochs": 3,
2950
+ "save_steps": 500,
2951
+ "stateful_callbacks": {
2952
+ "TrainerControl": {
2953
+ "args": {
2954
+ "should_epoch_stop": false,
2955
+ "should_evaluate": false,
2956
+ "should_log": false,
2957
+ "should_save": true,
2958
+ "should_training_stop": true
2959
+ },
2960
+ "attributes": {}
2961
+ }
2962
+ },
2963
+ "total_flos": 0.0,
2964
+ "train_batch_size": 4,
2965
+ "trial_name": null,
2966
+ "trial_params": null
2967
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd89efa8d265dde04bcb161c6b8d01109b4a6c858df429f14aab173c9d805713
3
+ size 7608
vocab.json ADDED
The diff for this file is too large to render. See raw diff