minmingzhu02 commited on
Commit
7958e73
1 Parent(s): 7684115

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +1 -0
  2. adapter_model.safetensors +1 -1
  3. all_results.json +5 -5
  4. checkpoint-1000/README.md +1 -0
  5. checkpoint-1000/adapter_model.safetensors +1 -1
  6. checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
  7. checkpoint-1000/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
  8. checkpoint-1000/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
  9. checkpoint-1000/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
  10. checkpoint-1000/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
  11. checkpoint-1000/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
  12. checkpoint-1000/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
  13. checkpoint-1000/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
  14. checkpoint-1000/global_step1000/mp_rank_00_model_states.pt +1 -1
  15. checkpoint-1000/rng_state_0.pth +2 -2
  16. checkpoint-1000/rng_state_1.pth +2 -2
  17. checkpoint-1000/rng_state_2.pth +2 -2
  18. checkpoint-1000/rng_state_3.pth +2 -2
  19. checkpoint-1000/rng_state_4.pth +2 -2
  20. checkpoint-1000/rng_state_5.pth +2 -2
  21. checkpoint-1000/rng_state_6.pth +2 -2
  22. checkpoint-1000/rng_state_7.pth +2 -2
  23. checkpoint-1000/trainer_state.json +483 -483
  24. checkpoint-1000/training_args.bin +2 -2
  25. checkpoint-2000/README.md +9 -0
  26. checkpoint-2000/adapter_config.json +23 -0
  27. checkpoint-2000/adapter_model.safetensors +3 -0
  28. checkpoint-2000/gaudi_config.json +10 -0
  29. checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  30. checkpoint-2000/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  31. checkpoint-2000/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  32. checkpoint-2000/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  33. checkpoint-2000/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  34. checkpoint-2000/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  35. checkpoint-2000/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  36. checkpoint-2000/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  37. checkpoint-2000/global_step2000/mp_rank_00_model_states.pt +3 -0
  38. checkpoint-2000/latest +1 -0
  39. checkpoint-2000/rng_state_0.pth +3 -0
  40. checkpoint-2000/rng_state_1.pth +3 -0
  41. checkpoint-2000/rng_state_2.pth +3 -0
  42. checkpoint-2000/rng_state_3.pth +3 -0
  43. checkpoint-2000/rng_state_4.pth +3 -0
  44. checkpoint-2000/rng_state_5.pth +3 -0
  45. checkpoint-2000/rng_state_6.pth +3 -0
  46. checkpoint-2000/rng_state_7.pth +3 -0
  47. checkpoint-2000/special_tokens_map.json +12 -0
  48. checkpoint-2000/tokenizer.json +0 -0
  49. checkpoint-2000/tokenizer.model +3 -0
  50. checkpoint-2000/tokenizer_config.json +42 -0
README.md CHANGED
@@ -5,5 +5,6 @@ library_name: peft
5
 
6
  ### Framework versions
7
 
 
8
 
9
  - PEFT 0.4.0
 
5
 
6
  ### Framework versions
7
 
8
+ - PEFT 0.4.0
9
 
10
  - PEFT 0.4.0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ea5492a2629a41a3834cf2de2d413d3c30977c44b370a4bdc94bfa086eb6f04
3
  size 13665592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a40df01fec581a7cd1da2c66520eb45000ec0730bd41494a3203415bcacf33e3
3
  size 13665592
all_results.json CHANGED
@@ -6,12 +6,12 @@
6
  "eval_samples": 25899,
7
  "eval_samples_per_second": 160.844,
8
  "eval_steps_per_second": 2.515,
9
- "max_memory_allocated (GB)": 91.87,
10
  "memory_allocated (GB)": 24.39,
11
  "perplexity": 3.0531438702149534,
12
  "total_memory_available (GB)": 94.62,
13
- "train_loss": 1.0469276263736165,
14
- "train_runtime": 6085.8982,
15
- "train_samples_per_second": 162.542,
16
- "train_steps_per_second": 0.317
17
  }
 
6
  "eval_samples": 25899,
7
  "eval_samples_per_second": 160.844,
8
  "eval_steps_per_second": 2.515,
9
+ "max_memory_allocated (GB)": 91.9,
10
  "memory_allocated (GB)": 24.39,
11
  "perplexity": 3.0531438702149534,
12
  "total_memory_available (GB)": 94.62,
13
+ "train_loss": 1.050027716289524,
14
+ "train_runtime": 6424.4949,
15
+ "train_samples_per_second": 162.078,
16
+ "train_steps_per_second": 0.316
17
  }
checkpoint-1000/README.md CHANGED
@@ -5,5 +5,6 @@ library_name: peft
5
 
6
  ### Framework versions
7
 
 
8
 
9
  - PEFT 0.4.0
 
5
 
6
  ### Framework versions
7
 
8
+ - PEFT 0.4.0
9
 
10
  - PEFT 0.4.0
checkpoint-1000/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81c901e3a02306c80837c992a65b028aa5720bd76546736f6b293eb7dfc7140c
3
  size 13665592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c40dc4d08aadcdaf95d2045cc57b08ba3438ccc3bb482464c0f4fc60165266e
3
  size 13665592
checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbcc1217cb0120aeb98ec1de36b0d9911840455125d2ac021d411340695b0e41
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46d6c3a35c5aa1f0d69f23137498ba05dcec1b85ab8f758931bb9d3a9c61f34
3
  size 10229904
checkpoint-1000/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b97ccb06cab86f7787aa54118479ed5365ea36124fa165035de40978981d7d90
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9605b32dc9734841f4135f4b05d12dd42f4d99c4361a0351f4237cad48f90be3
3
  size 10229904
checkpoint-1000/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e88a8703d82aca1a49db083c3389258661948ca907ccc2e9cb111b1551ca4b4
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5876ec371de4435d924cc505aa32eb3dad219f45fc6663e1b1b7d8e6b0acf67f
3
  size 10229904
checkpoint-1000/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e427e683a4b2c0eaf3f92eeb4d17130f2db3cf91beb49cdec09db31cd03bc5d0
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e300264d099e85c2eaec5813361ea1324250f75f2b589f945f2f835b83af40d
3
  size 10229904
checkpoint-1000/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:447bcfa260f2ca4e01eab0355a25bc057c887425ee46599c03014702139424e9
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b16bb1a71dc108e5f58765fb6631fd6dac31c9f448d7d14103ca456c36f952ce
3
  size 10229904
checkpoint-1000/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f82c584aafdcfd6f83b4a382b39b2a21a1c6d328e15b598c757f0bbd57d7f40
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f7fc624f7d0466934102a8e291439878ba39384d242fee71905534ec7904e06
3
  size 10229904
checkpoint-1000/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed340b57ccacdb58d19cb98548194affef267ef083941da271ddb8bda6982c71
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4f0723c2be1778d263d0b01b0b9d4e12020ffdf1dd81d6905cc10e8fa92d41d
3
  size 10229904
checkpoint-1000/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e1108862ec93ac55212f8e0a1b8f3e51f7907e476768c008d36049e3882d3c6
3
  size 10229904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24a0be42f2cadbc3fe2cb110dd10263c4641ceea734da5d7e75eba26e2901d3c
3
  size 10229904
checkpoint-1000/global_step1000/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7eaa5aa673d8036fda2b25512f384ce627430c73f15341bdbc6c9ea4bbcccca
3
  size 13740018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb5bdbbb926d99fb70d7fc5227430f4f4f997a514d20837bc7e19ed2d1ae9e08
3
  size 13740018
checkpoint-1000/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da3853db45e30a134bde6283db65f4d2e12b4f06b534a780433dd598db8504cb
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15cf27d7c1440499566c0754bb1d50c83a5017b6e7fa437ea341504ae66e2b6
3
+ size 17968
checkpoint-1000/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0664a5f5d0f31c8412dfa2b9c211689c20cc952d4ce4660ee1761e2ec10776d
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:825d61abc5a5ad521ad84ebb500b93ae8a8623f9369ad48cbe6d38bab5442ff1
3
+ size 17968
checkpoint-1000/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8418b1622ae3a1586b0d267b3c8c1d9d5d403a56cc73434016d717b501b2c0fd
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac10f0c3ac84964c70d9fd60ba3e1f3ba4b539d4b6c1ae404241006617438fc1
3
+ size 17968
checkpoint-1000/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51d300bdb34d5407f13acbfb278589f1a2907891fba4e020420c6053746cf8e2
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8954554f29ca86b2a2d0065650e4ed1767ee8c977ff3c3089f50b821a3a8fea1
3
+ size 17968
checkpoint-1000/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d597fad31002fa89ec1fd8aefe8f76311a32275c74a84439354f630b9b001a4
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32cae0c2f26eab0ed7620c9d8a8c01759ff3f2406e4eca977af345db9260dfbc
3
+ size 17968
checkpoint-1000/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fbf5d648175322bc9d62f67af45fa2e710bdc93b49adb768d8df629d2b0347f
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b789b47341496ea469886b9092dd0722b325942d9e0fa64277b02e5c6a462a
3
+ size 17968
checkpoint-1000/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcfd1907de0ce290b15c0e3cda02ca34b46efde1a255be0c9b8e383cbf93e363
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:211ce782229e6a2d3e1400311a29875805358cdd93d4b0eb3697d599fa847ccb
3
+ size 17968
checkpoint-1000/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a4d45767ae6a4db5ed86b012add567287d0976c731678dabe7dedeb253ea63b
3
- size 18032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b712b22f1679953e1505d7e1d8f16de7d3f9052abc5168daadd90c918870ab95
3
+ size 17968
checkpoint-1000/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0404473923787227,
5
  "eval_steps": 500,
6
  "global_step": 1000,
7
  "is_hyper_param_search": false,
@@ -10,9 +10,9 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "grad_norm": 0.8564500212669373,
14
- "learning_rate": 5.670773083167062e-05,
15
- "loss": 1.7378,
16
  "max_memory_allocated (GB)": 91.86,
17
  "memory_allocated (GB)": 24.39,
18
  "step": 10,
@@ -20,9 +20,9 @@
20
  },
21
  {
22
  "epoch": 0.02,
23
- "grad_norm": 0.9088606238365173,
24
- "learning_rate": 7.377845879804262e-05,
25
- "loss": 1.5183,
26
  "max_memory_allocated (GB)": 91.86,
27
  "memory_allocated (GB)": 24.39,
28
  "step": 20,
@@ -30,9 +30,9 @@
30
  },
31
  {
32
  "epoch": 0.03,
33
- "grad_norm": 0.396087646484375,
34
- "learning_rate": 8.376419451838216e-05,
35
- "loss": 1.3578,
36
  "max_memory_allocated (GB)": 91.86,
37
  "memory_allocated (GB)": 24.39,
38
  "step": 30,
@@ -40,9 +40,9 @@
40
  },
41
  {
42
  "epoch": 0.04,
43
- "grad_norm": 0.2860943078994751,
44
- "learning_rate": 9.084918676441463e-05,
45
- "loss": 1.2832,
46
  "max_memory_allocated (GB)": 91.86,
47
  "memory_allocated (GB)": 24.39,
48
  "step": 40,
@@ -50,9 +50,9 @@
50
  },
51
  {
52
  "epoch": 0.05,
53
- "grad_norm": 0.2048167884349823,
54
- "learning_rate": 9.634473369696918e-05,
55
- "loss": 1.2717,
56
  "max_memory_allocated (GB)": 91.86,
57
  "memory_allocated (GB)": 24.39,
58
  "step": 50,
@@ -60,9 +60,9 @@
60
  },
61
  {
62
  "epoch": 0.06,
63
- "grad_norm": 0.2355910986661911,
64
- "learning_rate": 9.994635193133047e-05,
65
- "loss": 1.2359,
66
  "max_memory_allocated (GB)": 91.86,
67
  "memory_allocated (GB)": 24.39,
68
  "step": 60,
@@ -70,947 +70,947 @@
70
  },
71
  {
72
  "epoch": 0.07,
73
- "grad_norm": 0.25649788975715637,
74
- "learning_rate": 9.94098712446352e-05,
75
- "loss": 1.2202,
76
- "max_memory_allocated (GB)": 91.86,
77
  "memory_allocated (GB)": 24.39,
78
  "step": 70,
79
  "total_memory_available (GB)": 94.62
80
  },
81
  {
82
  "epoch": 0.08,
83
- "grad_norm": 0.3248240649700165,
84
- "learning_rate": 9.887339055793991e-05,
85
- "loss": 1.1928,
86
- "max_memory_allocated (GB)": 91.86,
87
  "memory_allocated (GB)": 24.39,
88
  "step": 80,
89
  "total_memory_available (GB)": 94.62
90
  },
91
  {
92
  "epoch": 0.09,
93
- "grad_norm": 0.34330031275749207,
94
- "learning_rate": 9.833690987124465e-05,
95
- "loss": 1.1839,
96
- "max_memory_allocated (GB)": 91.86,
97
  "memory_allocated (GB)": 24.39,
98
  "step": 90,
99
  "total_memory_available (GB)": 94.62
100
  },
101
  {
102
  "epoch": 0.1,
103
- "grad_norm": 0.481341689825058,
104
- "learning_rate": 9.780042918454936e-05,
105
- "loss": 1.1682,
106
- "max_memory_allocated (GB)": 91.86,
107
  "memory_allocated (GB)": 24.39,
108
  "step": 100,
109
  "total_memory_available (GB)": 94.62
110
  },
111
  {
112
  "epoch": 0.11,
113
- "grad_norm": 0.33835262060165405,
114
- "learning_rate": 9.726394849785409e-05,
115
- "loss": 1.1528,
116
- "max_memory_allocated (GB)": 91.86,
117
  "memory_allocated (GB)": 24.39,
118
  "step": 110,
119
  "total_memory_available (GB)": 94.62
120
  },
121
  {
122
  "epoch": 0.12,
123
- "grad_norm": 0.5484705567359924,
124
- "learning_rate": 9.67274678111588e-05,
125
- "loss": 1.1511,
126
- "max_memory_allocated (GB)": 91.86,
127
  "memory_allocated (GB)": 24.39,
128
  "step": 120,
129
  "total_memory_available (GB)": 94.62
130
  },
131
  {
132
- "epoch": 0.14,
133
- "grad_norm": 0.3785193860530853,
134
- "learning_rate": 9.619098712446352e-05,
135
- "loss": 1.152,
136
- "max_memory_allocated (GB)": 91.86,
137
  "memory_allocated (GB)": 24.39,
138
  "step": 130,
139
  "total_memory_available (GB)": 94.62
140
  },
141
  {
142
- "epoch": 0.15,
143
- "grad_norm": 0.36997660994529724,
144
- "learning_rate": 9.565450643776824e-05,
145
- "loss": 1.1489,
146
- "max_memory_allocated (GB)": 91.86,
147
  "memory_allocated (GB)": 24.39,
148
  "step": 140,
149
  "total_memory_available (GB)": 94.62
150
  },
151
  {
152
- "epoch": 0.16,
153
- "grad_norm": 0.5300387144088745,
154
- "learning_rate": 9.511802575107297e-05,
155
- "loss": 1.1326,
156
- "max_memory_allocated (GB)": 91.86,
157
  "memory_allocated (GB)": 24.39,
158
  "step": 150,
159
  "total_memory_available (GB)": 94.62
160
  },
161
  {
162
- "epoch": 0.17,
163
- "grad_norm": 0.5168531537055969,
164
- "learning_rate": 9.458154506437769e-05,
165
- "loss": 1.1253,
166
- "max_memory_allocated (GB)": 91.86,
167
  "memory_allocated (GB)": 24.39,
168
  "step": 160,
169
  "total_memory_available (GB)": 94.62
170
  },
171
  {
172
- "epoch": 0.18,
173
- "grad_norm": 0.48498621582984924,
174
- "learning_rate": 9.404506437768241e-05,
175
- "loss": 1.1079,
176
- "max_memory_allocated (GB)": 91.86,
177
  "memory_allocated (GB)": 24.39,
178
  "step": 170,
179
  "total_memory_available (GB)": 94.62
180
  },
181
  {
182
- "epoch": 0.19,
183
- "grad_norm": 0.4298243522644043,
184
- "learning_rate": 9.350858369098713e-05,
185
- "loss": 1.0987,
186
- "max_memory_allocated (GB)": 91.86,
187
  "memory_allocated (GB)": 24.39,
188
  "step": 180,
189
  "total_memory_available (GB)": 94.62
190
  },
191
  {
192
- "epoch": 0.2,
193
- "grad_norm": 0.4342297315597534,
194
- "learning_rate": 9.297210300429185e-05,
195
- "loss": 1.0975,
196
- "max_memory_allocated (GB)": 91.86,
197
  "memory_allocated (GB)": 24.39,
198
  "step": 190,
199
  "total_memory_available (GB)": 94.62
200
  },
201
  {
202
- "epoch": 0.21,
203
- "grad_norm": 0.5405712127685547,
204
- "learning_rate": 9.243562231759658e-05,
205
- "loss": 1.1084,
206
- "max_memory_allocated (GB)": 91.86,
207
  "memory_allocated (GB)": 24.39,
208
  "step": 200,
209
  "total_memory_available (GB)": 94.62
210
  },
211
  {
212
- "epoch": 0.22,
213
- "grad_norm": 0.4317038655281067,
214
- "learning_rate": 9.189914163090128e-05,
215
- "loss": 1.0969,
216
- "max_memory_allocated (GB)": 91.86,
217
  "memory_allocated (GB)": 24.39,
218
  "step": 210,
219
  "total_memory_available (GB)": 94.62
220
  },
221
  {
222
- "epoch": 0.23,
223
- "grad_norm": 0.41190406680107117,
224
- "learning_rate": 9.136266094420602e-05,
225
  "loss": 1.0992,
226
- "max_memory_allocated (GB)": 91.86,
227
  "memory_allocated (GB)": 24.39,
228
  "step": 220,
229
  "total_memory_available (GB)": 94.62
230
  },
231
  {
232
- "epoch": 0.24,
233
- "grad_norm": 0.5093066692352295,
234
- "learning_rate": 9.082618025751073e-05,
235
- "loss": 1.0875,
236
- "max_memory_allocated (GB)": 91.86,
237
  "memory_allocated (GB)": 24.39,
238
  "step": 230,
239
  "total_memory_available (GB)": 94.62
240
  },
241
  {
242
- "epoch": 0.25,
243
- "grad_norm": 0.40610459446907043,
244
- "learning_rate": 9.028969957081545e-05,
245
- "loss": 1.0886,
246
- "max_memory_allocated (GB)": 91.86,
247
  "memory_allocated (GB)": 24.39,
248
  "step": 240,
249
  "total_memory_available (GB)": 94.62
250
  },
251
  {
252
- "epoch": 0.26,
253
- "grad_norm": 0.38791623711586,
254
- "learning_rate": 8.975321888412017e-05,
255
- "loss": 1.0783,
256
- "max_memory_allocated (GB)": 91.86,
257
  "memory_allocated (GB)": 24.39,
258
  "step": 250,
259
  "total_memory_available (GB)": 94.62
260
  },
261
  {
262
- "epoch": 0.27,
263
- "grad_norm": 0.3828742206096649,
264
- "learning_rate": 8.92167381974249e-05,
265
- "loss": 1.0816,
266
- "max_memory_allocated (GB)": 91.86,
267
  "memory_allocated (GB)": 24.39,
268
  "step": 260,
269
  "total_memory_available (GB)": 94.62
270
  },
271
  {
272
- "epoch": 0.28,
273
- "grad_norm": 0.5216621160507202,
274
- "learning_rate": 8.868025751072962e-05,
275
- "loss": 1.0731,
276
- "max_memory_allocated (GB)": 91.86,
277
  "memory_allocated (GB)": 24.39,
278
  "step": 270,
279
  "total_memory_available (GB)": 94.62
280
  },
281
  {
282
- "epoch": 0.29,
283
- "grad_norm": 0.43531420826911926,
284
- "learning_rate": 8.814377682403434e-05,
285
- "loss": 1.0804,
286
- "max_memory_allocated (GB)": 91.86,
287
  "memory_allocated (GB)": 24.39,
288
  "step": 280,
289
  "total_memory_available (GB)": 94.62
290
  },
291
  {
292
- "epoch": 0.3,
293
- "grad_norm": 0.41790470480918884,
294
- "learning_rate": 8.760729613733906e-05,
295
- "loss": 1.0785,
296
- "max_memory_allocated (GB)": 91.86,
297
  "memory_allocated (GB)": 24.39,
298
  "step": 290,
299
  "total_memory_available (GB)": 94.62
300
  },
301
  {
302
- "epoch": 0.31,
303
- "grad_norm": 0.456264466047287,
304
- "learning_rate": 8.707081545064378e-05,
305
- "loss": 1.0708,
306
- "max_memory_allocated (GB)": 91.86,
307
  "memory_allocated (GB)": 24.39,
308
  "step": 300,
309
  "total_memory_available (GB)": 94.62
310
  },
311
  {
312
- "epoch": 0.32,
313
- "grad_norm": 0.3793538212776184,
314
- "learning_rate": 8.65343347639485e-05,
315
- "loss": 1.0818,
316
- "max_memory_allocated (GB)": 91.86,
317
  "memory_allocated (GB)": 24.39,
318
  "step": 310,
319
  "total_memory_available (GB)": 94.62
320
  },
321
  {
322
- "epoch": 0.33,
323
- "grad_norm": 0.37741824984550476,
324
- "learning_rate": 8.599785407725323e-05,
325
- "loss": 1.0669,
326
- "max_memory_allocated (GB)": 91.86,
327
  "memory_allocated (GB)": 24.39,
328
  "step": 320,
329
  "total_memory_available (GB)": 94.62
330
  },
331
  {
332
- "epoch": 0.34,
333
- "grad_norm": 0.3576098084449768,
334
- "learning_rate": 8.546137339055795e-05,
335
- "loss": 1.0678,
336
- "max_memory_allocated (GB)": 91.86,
337
  "memory_allocated (GB)": 24.39,
338
  "step": 330,
339
  "total_memory_available (GB)": 94.62
340
  },
341
  {
342
- "epoch": 0.35,
343
- "grad_norm": 0.4295920133590698,
344
- "learning_rate": 8.492489270386267e-05,
345
- "loss": 1.0571,
346
- "max_memory_allocated (GB)": 91.86,
347
  "memory_allocated (GB)": 24.39,
348
  "step": 340,
349
  "total_memory_available (GB)": 94.62
350
  },
351
  {
352
- "epoch": 0.36,
353
- "grad_norm": 0.38484010100364685,
354
- "learning_rate": 8.438841201716738e-05,
355
- "loss": 1.0697,
356
- "max_memory_allocated (GB)": 91.86,
357
  "memory_allocated (GB)": 24.39,
358
  "step": 350,
359
  "total_memory_available (GB)": 94.62
360
  },
361
  {
362
- "epoch": 0.37,
363
- "grad_norm": 0.3806072175502777,
364
- "learning_rate": 8.385193133047211e-05,
365
- "loss": 1.0652,
366
- "max_memory_allocated (GB)": 91.86,
367
  "memory_allocated (GB)": 24.39,
368
  "step": 360,
369
  "total_memory_available (GB)": 94.62
370
  },
371
  {
372
- "epoch": 0.38,
373
- "grad_norm": 0.3507857918739319,
374
- "learning_rate": 8.331545064377682e-05,
375
- "loss": 1.061,
376
- "max_memory_allocated (GB)": 91.86,
377
  "memory_allocated (GB)": 24.39,
378
  "step": 370,
379
  "total_memory_available (GB)": 94.62
380
  },
381
  {
382
- "epoch": 0.4,
383
- "grad_norm": 0.3869009017944336,
384
- "learning_rate": 8.277896995708156e-05,
385
- "loss": 1.0622,
386
- "max_memory_allocated (GB)": 91.86,
387
  "memory_allocated (GB)": 24.39,
388
  "step": 380,
389
  "total_memory_available (GB)": 94.62
390
  },
391
  {
392
- "epoch": 0.41,
393
- "grad_norm": 0.3941822648048401,
394
- "learning_rate": 8.224248927038627e-05,
395
- "loss": 1.0629,
396
- "max_memory_allocated (GB)": 91.86,
397
  "memory_allocated (GB)": 24.39,
398
  "step": 390,
399
  "total_memory_available (GB)": 94.62
400
  },
401
  {
402
- "epoch": 0.42,
403
- "grad_norm": 0.3660201132297516,
404
- "learning_rate": 8.1706008583691e-05,
405
- "loss": 1.0589,
406
- "max_memory_allocated (GB)": 91.86,
407
  "memory_allocated (GB)": 24.39,
408
  "step": 400,
409
  "total_memory_available (GB)": 94.62
410
  },
411
  {
412
- "epoch": 0.43,
413
- "grad_norm": 0.3641981780529022,
414
- "learning_rate": 8.116952789699571e-05,
415
- "loss": 1.0611,
416
- "max_memory_allocated (GB)": 91.86,
417
  "memory_allocated (GB)": 24.39,
418
  "step": 410,
419
  "total_memory_available (GB)": 94.62
420
  },
421
  {
422
- "epoch": 0.44,
423
- "grad_norm": 0.38291656970977783,
424
- "learning_rate": 8.063304721030043e-05,
425
- "loss": 1.0554,
426
- "max_memory_allocated (GB)": 91.86,
427
  "memory_allocated (GB)": 24.39,
428
  "step": 420,
429
  "total_memory_available (GB)": 94.62
430
  },
431
  {
432
- "epoch": 0.45,
433
- "grad_norm": 0.3162360191345215,
434
- "learning_rate": 8.009656652360515e-05,
435
- "loss": 1.0504,
436
- "max_memory_allocated (GB)": 91.86,
437
  "memory_allocated (GB)": 24.39,
438
  "step": 430,
439
  "total_memory_available (GB)": 94.62
440
  },
441
  {
442
- "epoch": 0.46,
443
- "grad_norm": 0.3741254508495331,
444
- "learning_rate": 7.956008583690988e-05,
445
- "loss": 1.0525,
446
- "max_memory_allocated (GB)": 91.86,
447
  "memory_allocated (GB)": 24.39,
448
  "step": 440,
449
  "total_memory_available (GB)": 94.62
450
  },
451
  {
452
- "epoch": 0.47,
453
- "grad_norm": 0.3503776788711548,
454
- "learning_rate": 7.90236051502146e-05,
455
- "loss": 1.0554,
456
- "max_memory_allocated (GB)": 91.86,
457
  "memory_allocated (GB)": 24.39,
458
  "step": 450,
459
  "total_memory_available (GB)": 94.62
460
  },
461
  {
462
- "epoch": 0.48,
463
- "grad_norm": 0.3659127652645111,
464
- "learning_rate": 7.848712446351931e-05,
465
- "loss": 1.0464,
466
- "max_memory_allocated (GB)": 91.86,
467
  "memory_allocated (GB)": 24.39,
468
  "step": 460,
469
  "total_memory_available (GB)": 94.62
470
  },
471
  {
472
- "epoch": 0.49,
473
- "grad_norm": 0.3401431143283844,
474
- "learning_rate": 7.795064377682404e-05,
475
- "loss": 1.0596,
476
- "max_memory_allocated (GB)": 91.86,
477
  "memory_allocated (GB)": 24.39,
478
  "step": 470,
479
  "total_memory_available (GB)": 94.62
480
  },
481
  {
482
- "epoch": 0.5,
483
- "grad_norm": 0.33970314264297485,
484
- "learning_rate": 7.741416309012875e-05,
485
- "loss": 1.0659,
486
- "max_memory_allocated (GB)": 91.86,
487
  "memory_allocated (GB)": 24.39,
488
  "step": 480,
489
  "total_memory_available (GB)": 94.62
490
  },
491
  {
492
- "epoch": 0.51,
493
- "grad_norm": 0.40222179889678955,
494
- "learning_rate": 7.687768240343349e-05,
495
- "loss": 1.0485,
496
- "max_memory_allocated (GB)": 91.86,
497
  "memory_allocated (GB)": 24.39,
498
  "step": 490,
499
  "total_memory_available (GB)": 94.62
500
  },
501
  {
502
- "epoch": 0.52,
503
- "grad_norm": 0.3137299716472626,
504
- "learning_rate": 7.63412017167382e-05,
505
- "loss": 1.0506,
506
- "max_memory_allocated (GB)": 91.86,
507
  "memory_allocated (GB)": 24.39,
508
  "step": 500,
509
  "total_memory_available (GB)": 94.62
510
  },
511
  {
512
- "epoch": 0.53,
513
- "grad_norm": 0.3413981795310974,
514
- "learning_rate": 7.580472103004293e-05,
515
- "loss": 1.0466,
516
- "max_memory_allocated (GB)": 91.86,
517
  "memory_allocated (GB)": 24.39,
518
  "step": 510,
519
  "total_memory_available (GB)": 94.62
520
  },
521
  {
522
- "epoch": 0.54,
523
- "grad_norm": 0.3440905511379242,
524
- "learning_rate": 7.526824034334764e-05,
525
- "loss": 1.0503,
526
- "max_memory_allocated (GB)": 91.86,
527
  "memory_allocated (GB)": 24.39,
528
  "step": 520,
529
  "total_memory_available (GB)": 94.62
530
  },
531
  {
532
- "epoch": 0.55,
533
- "grad_norm": 0.367201030254364,
534
- "learning_rate": 7.473175965665236e-05,
535
- "loss": 1.0449,
536
- "max_memory_allocated (GB)": 91.86,
537
  "memory_allocated (GB)": 24.39,
538
  "step": 530,
539
  "total_memory_available (GB)": 94.62
540
  },
541
  {
542
- "epoch": 0.56,
543
- "grad_norm": 0.37777239084243774,
544
- "learning_rate": 7.419527896995708e-05,
545
- "loss": 1.0514,
546
- "max_memory_allocated (GB)": 91.86,
547
  "memory_allocated (GB)": 24.39,
548
  "step": 540,
549
  "total_memory_available (GB)": 94.62
550
  },
551
  {
552
- "epoch": 0.57,
553
- "grad_norm": 0.355307936668396,
554
- "learning_rate": 7.36587982832618e-05,
555
- "loss": 1.0458,
556
- "max_memory_allocated (GB)": 91.86,
557
  "memory_allocated (GB)": 24.39,
558
  "step": 550,
559
  "total_memory_available (GB)": 94.62
560
  },
561
  {
562
- "epoch": 0.58,
563
- "grad_norm": 0.43950700759887695,
564
- "learning_rate": 7.312231759656653e-05,
565
- "loss": 1.0426,
566
- "max_memory_allocated (GB)": 91.86,
567
  "memory_allocated (GB)": 24.39,
568
  "step": 560,
569
  "total_memory_available (GB)": 94.62
570
  },
571
  {
572
- "epoch": 0.59,
573
- "grad_norm": 0.31208208203315735,
574
- "learning_rate": 7.258583690987125e-05,
575
- "loss": 1.0432,
576
- "max_memory_allocated (GB)": 91.86,
577
  "memory_allocated (GB)": 24.39,
578
  "step": 570,
579
  "total_memory_available (GB)": 94.62
580
  },
581
  {
582
- "epoch": 0.6,
583
- "grad_norm": 0.3358898460865021,
584
- "learning_rate": 7.204935622317597e-05,
585
- "loss": 1.0563,
586
- "max_memory_allocated (GB)": 91.86,
587
  "memory_allocated (GB)": 24.39,
588
  "step": 580,
589
  "total_memory_available (GB)": 94.62
590
  },
591
  {
592
- "epoch": 0.61,
593
- "grad_norm": 0.3219963610172272,
594
- "learning_rate": 7.15128755364807e-05,
595
- "loss": 1.0402,
596
- "max_memory_allocated (GB)": 91.86,
597
  "memory_allocated (GB)": 24.39,
598
  "step": 590,
599
  "total_memory_available (GB)": 94.62
600
  },
601
  {
602
- "epoch": 0.62,
603
- "grad_norm": 0.3409494161605835,
604
- "learning_rate": 7.097639484978542e-05,
605
- "loss": 1.0452,
606
- "max_memory_allocated (GB)": 91.86,
607
  "memory_allocated (GB)": 24.39,
608
  "step": 600,
609
  "total_memory_available (GB)": 94.62
610
  },
611
  {
612
- "epoch": 0.63,
613
- "grad_norm": 0.3917517364025116,
614
- "learning_rate": 7.043991416309014e-05,
615
- "loss": 1.0358,
616
- "max_memory_allocated (GB)": 91.86,
617
  "memory_allocated (GB)": 24.39,
618
  "step": 610,
619
  "total_memory_available (GB)": 94.62
620
  },
621
  {
622
- "epoch": 0.65,
623
- "grad_norm": 0.36202436685562134,
624
- "learning_rate": 6.990343347639486e-05,
625
- "loss": 1.0421,
626
- "max_memory_allocated (GB)": 91.86,
627
  "memory_allocated (GB)": 24.39,
628
  "step": 620,
629
  "total_memory_available (GB)": 94.62
630
  },
631
  {
632
- "epoch": 0.66,
633
- "grad_norm": 0.34764185547828674,
634
- "learning_rate": 6.936695278969958e-05,
635
- "loss": 1.0414,
636
- "max_memory_allocated (GB)": 91.86,
637
  "memory_allocated (GB)": 24.39,
638
  "step": 630,
639
  "total_memory_available (GB)": 94.62
640
  },
641
  {
642
- "epoch": 0.67,
643
- "grad_norm": 0.3368031084537506,
644
- "learning_rate": 6.883047210300429e-05,
645
- "loss": 1.0328,
646
- "max_memory_allocated (GB)": 91.86,
647
  "memory_allocated (GB)": 24.39,
648
  "step": 640,
649
  "total_memory_available (GB)": 94.62
650
  },
651
  {
652
- "epoch": 0.68,
653
- "grad_norm": 0.3216454088687897,
654
- "learning_rate": 6.829399141630901e-05,
655
- "loss": 1.0368,
656
- "max_memory_allocated (GB)": 91.86,
657
  "memory_allocated (GB)": 24.39,
658
  "step": 650,
659
  "total_memory_available (GB)": 94.62
660
  },
661
  {
662
- "epoch": 0.69,
663
- "grad_norm": 0.36135369539260864,
664
- "learning_rate": 6.775751072961373e-05,
665
- "loss": 1.0375,
666
- "max_memory_allocated (GB)": 91.86,
667
  "memory_allocated (GB)": 24.39,
668
  "step": 660,
669
  "total_memory_available (GB)": 94.62
670
  },
671
  {
672
- "epoch": 0.7,
673
- "grad_norm": 0.3772068917751312,
674
- "learning_rate": 6.722103004291846e-05,
675
- "loss": 1.0384,
676
- "max_memory_allocated (GB)": 91.86,
677
  "memory_allocated (GB)": 24.39,
678
  "step": 670,
679
  "total_memory_available (GB)": 94.62
680
  },
681
  {
682
- "epoch": 0.71,
683
- "grad_norm": 0.352990984916687,
684
- "learning_rate": 6.668454935622318e-05,
685
- "loss": 1.0347,
686
- "max_memory_allocated (GB)": 91.86,
687
  "memory_allocated (GB)": 24.39,
688
  "step": 680,
689
  "total_memory_available (GB)": 94.62
690
  },
691
  {
692
- "epoch": 0.72,
693
- "grad_norm": 0.38070163130760193,
694
- "learning_rate": 6.61480686695279e-05,
695
- "loss": 1.0414,
696
- "max_memory_allocated (GB)": 91.86,
697
  "memory_allocated (GB)": 24.39,
698
  "step": 690,
699
  "total_memory_available (GB)": 94.62
700
  },
701
  {
702
- "epoch": 0.73,
703
- "grad_norm": 0.3262191414833069,
704
- "learning_rate": 6.561158798283262e-05,
705
- "loss": 1.0427,
706
- "max_memory_allocated (GB)": 91.86,
707
  "memory_allocated (GB)": 24.39,
708
  "step": 700,
709
  "total_memory_available (GB)": 94.62
710
  },
711
  {
712
- "epoch": 0.74,
713
- "grad_norm": 0.37472084164619446,
714
- "learning_rate": 6.507510729613734e-05,
715
- "loss": 1.035,
716
- "max_memory_allocated (GB)": 91.86,
717
  "memory_allocated (GB)": 24.39,
718
  "step": 710,
719
  "total_memory_available (GB)": 94.62
720
  },
721
  {
722
- "epoch": 0.75,
723
- "grad_norm": 0.3830932676792145,
724
- "learning_rate": 6.453862660944207e-05,
725
- "loss": 1.0329,
726
- "max_memory_allocated (GB)": 91.86,
727
  "memory_allocated (GB)": 24.39,
728
  "step": 720,
729
  "total_memory_available (GB)": 94.62
730
  },
731
  {
732
- "epoch": 0.76,
733
- "grad_norm": 0.33351582288742065,
734
- "learning_rate": 6.400214592274679e-05,
735
- "loss": 1.0338,
736
- "max_memory_allocated (GB)": 91.86,
737
  "memory_allocated (GB)": 24.39,
738
  "step": 730,
739
  "total_memory_available (GB)": 94.62
740
  },
741
  {
742
- "epoch": 0.77,
743
- "grad_norm": 0.30762043595314026,
744
- "learning_rate": 6.346566523605151e-05,
745
- "loss": 1.0452,
746
- "max_memory_allocated (GB)": 91.86,
747
  "memory_allocated (GB)": 24.39,
748
  "step": 740,
749
  "total_memory_available (GB)": 94.62
750
  },
751
  {
752
- "epoch": 0.78,
753
- "grad_norm": 0.3884871006011963,
754
- "learning_rate": 6.292918454935622e-05,
755
- "loss": 1.0387,
756
- "max_memory_allocated (GB)": 91.86,
757
  "memory_allocated (GB)": 24.39,
758
  "step": 750,
759
  "total_memory_available (GB)": 94.62
760
  },
761
  {
762
- "epoch": 0.79,
763
- "grad_norm": 0.34226515889167786,
764
- "learning_rate": 6.239270386266095e-05,
765
- "loss": 1.0282,
766
- "max_memory_allocated (GB)": 91.86,
767
  "memory_allocated (GB)": 24.39,
768
  "step": 760,
769
  "total_memory_available (GB)": 94.62
770
  },
771
  {
772
- "epoch": 0.8,
773
- "grad_norm": 0.3282904624938965,
774
- "learning_rate": 6.185622317596566e-05,
775
- "loss": 1.0266,
776
- "max_memory_allocated (GB)": 91.86,
777
  "memory_allocated (GB)": 24.39,
778
  "step": 770,
779
  "total_memory_available (GB)": 94.62
780
  },
781
  {
782
- "epoch": 0.81,
783
- "grad_norm": 0.34865814447402954,
784
- "learning_rate": 6.13197424892704e-05,
785
- "loss": 1.0369,
786
- "max_memory_allocated (GB)": 91.86,
787
  "memory_allocated (GB)": 24.39,
788
  "step": 780,
789
  "total_memory_available (GB)": 94.62
790
  },
791
  {
792
- "epoch": 0.82,
793
- "grad_norm": 0.34097954630851746,
794
- "learning_rate": 6.0783261802575106e-05,
795
- "loss": 1.0241,
796
- "max_memory_allocated (GB)": 91.86,
797
  "memory_allocated (GB)": 24.39,
798
  "step": 790,
799
  "total_memory_available (GB)": 94.62
800
  },
801
  {
802
- "epoch": 0.83,
803
- "grad_norm": 0.342785120010376,
804
- "learning_rate": 6.0246781115879835e-05,
805
- "loss": 1.0289,
806
- "max_memory_allocated (GB)": 91.86,
807
  "memory_allocated (GB)": 24.39,
808
  "step": 800,
809
  "total_memory_available (GB)": 94.62
810
  },
811
  {
812
- "epoch": 0.84,
813
- "grad_norm": 0.33090144395828247,
814
- "learning_rate": 5.971030042918455e-05,
815
- "loss": 1.0335,
816
- "max_memory_allocated (GB)": 91.86,
817
  "memory_allocated (GB)": 24.39,
818
  "step": 810,
819
  "total_memory_available (GB)": 94.62
820
  },
821
  {
822
- "epoch": 0.85,
823
- "grad_norm": 0.38638409972190857,
824
- "learning_rate": 5.917381974248928e-05,
825
- "loss": 1.0272,
826
- "max_memory_allocated (GB)": 91.86,
827
  "memory_allocated (GB)": 24.39,
828
  "step": 820,
829
  "total_memory_available (GB)": 94.62
830
  },
831
  {
832
- "epoch": 0.86,
833
- "grad_norm": 0.34252798557281494,
834
- "learning_rate": 5.8637339055793994e-05,
835
- "loss": 1.0284,
836
- "max_memory_allocated (GB)": 91.86,
837
  "memory_allocated (GB)": 24.39,
838
  "step": 830,
839
  "total_memory_available (GB)": 94.62
840
  },
841
  {
842
- "epoch": 0.87,
843
- "grad_norm": 0.3451687693595886,
844
- "learning_rate": 5.810085836909872e-05,
845
- "loss": 1.0212,
846
- "max_memory_allocated (GB)": 91.86,
847
  "memory_allocated (GB)": 24.39,
848
  "step": 840,
849
  "total_memory_available (GB)": 94.62
850
  },
851
  {
852
- "epoch": 0.88,
853
- "grad_norm": 0.384164422750473,
854
- "learning_rate": 5.756437768240344e-05,
855
- "loss": 1.0347,
856
- "max_memory_allocated (GB)": 91.86,
857
  "memory_allocated (GB)": 24.39,
858
  "step": 850,
859
  "total_memory_available (GB)": 94.62
860
  },
861
  {
862
- "epoch": 0.89,
863
- "grad_norm": 0.3310836851596832,
864
- "learning_rate": 5.7027896995708154e-05,
865
- "loss": 1.0153,
866
- "max_memory_allocated (GB)": 91.86,
867
  "memory_allocated (GB)": 24.39,
868
  "step": 860,
869
  "total_memory_available (GB)": 94.62
870
  },
871
  {
872
- "epoch": 0.91,
873
- "grad_norm": 0.3287970721721649,
874
- "learning_rate": 5.6491416309012876e-05,
875
- "loss": 1.0386,
876
- "max_memory_allocated (GB)": 91.86,
877
  "memory_allocated (GB)": 24.39,
878
  "step": 870,
879
  "total_memory_available (GB)": 94.62
880
  },
881
  {
882
- "epoch": 0.92,
883
- "grad_norm": 0.3661152720451355,
884
- "learning_rate": 5.59549356223176e-05,
885
- "loss": 1.0195,
886
- "max_memory_allocated (GB)": 91.86,
887
  "memory_allocated (GB)": 24.39,
888
  "step": 880,
889
  "total_memory_available (GB)": 94.62
890
  },
891
  {
892
- "epoch": 0.93,
893
- "grad_norm": 0.39101266860961914,
894
- "learning_rate": 5.541845493562232e-05,
895
- "loss": 1.0285,
896
- "max_memory_allocated (GB)": 91.86,
897
  "memory_allocated (GB)": 24.39,
898
  "step": 890,
899
  "total_memory_available (GB)": 94.62
900
  },
901
  {
902
- "epoch": 0.94,
903
- "grad_norm": 0.38129398226737976,
904
- "learning_rate": 5.4881974248927035e-05,
905
- "loss": 1.0375,
906
- "max_memory_allocated (GB)": 91.86,
907
  "memory_allocated (GB)": 24.39,
908
  "step": 900,
909
  "total_memory_available (GB)": 94.62
910
  },
911
  {
912
- "epoch": 0.95,
913
- "grad_norm": 0.32832667231559753,
914
- "learning_rate": 5.4345493562231764e-05,
915
- "loss": 1.0229,
916
- "max_memory_allocated (GB)": 91.86,
917
  "memory_allocated (GB)": 24.39,
918
  "step": 910,
919
  "total_memory_available (GB)": 94.62
920
  },
921
  {
922
- "epoch": 0.96,
923
- "grad_norm": 0.35413920879364014,
924
- "learning_rate": 5.380901287553648e-05,
925
- "loss": 1.024,
926
- "max_memory_allocated (GB)": 91.86,
927
  "memory_allocated (GB)": 24.39,
928
  "step": 920,
929
  "total_memory_available (GB)": 94.62
930
  },
931
  {
932
- "epoch": 0.97,
933
- "grad_norm": 0.3270743787288666,
934
- "learning_rate": 5.327253218884121e-05,
935
- "loss": 1.0319,
936
- "max_memory_allocated (GB)": 91.86,
937
  "memory_allocated (GB)": 24.39,
938
  "step": 930,
939
  "total_memory_available (GB)": 94.62
940
  },
941
  {
942
- "epoch": 0.98,
943
- "grad_norm": 0.33611926436424255,
944
- "learning_rate": 5.273605150214592e-05,
945
- "loss": 1.0198,
946
- "max_memory_allocated (GB)": 91.86,
947
  "memory_allocated (GB)": 24.39,
948
  "step": 940,
949
  "total_memory_available (GB)": 94.62
950
  },
951
  {
952
- "epoch": 0.99,
953
- "grad_norm": 0.34108686447143555,
954
- "learning_rate": 5.219957081545065e-05,
955
- "loss": 1.0294,
956
- "max_memory_allocated (GB)": 91.86,
957
  "memory_allocated (GB)": 24.39,
958
  "step": 950,
959
  "total_memory_available (GB)": 94.62
960
  },
961
  {
962
- "epoch": 1.0,
963
- "grad_norm": 0.3304935395717621,
964
- "learning_rate": 5.166309012875537e-05,
965
- "loss": 1.0264,
966
- "max_memory_allocated (GB)": 91.86,
967
  "memory_allocated (GB)": 24.39,
968
  "step": 960,
969
  "total_memory_available (GB)": 94.62
970
  },
971
  {
972
- "epoch": 1.01,
973
- "grad_norm": 0.3912031054496765,
974
- "learning_rate": 5.112660944206009e-05,
975
- "loss": 1.0192,
976
- "max_memory_allocated (GB)": 91.87,
977
  "memory_allocated (GB)": 24.39,
978
  "step": 970,
979
  "total_memory_available (GB)": 94.62
980
  },
981
  {
982
- "epoch": 1.02,
983
- "grad_norm": 0.33931615948677063,
984
- "learning_rate": 5.0590128755364804e-05,
985
- "loss": 1.0242,
986
- "max_memory_allocated (GB)": 91.87,
987
  "memory_allocated (GB)": 24.39,
988
  "step": 980,
989
  "total_memory_available (GB)": 94.62
990
  },
991
  {
992
- "epoch": 1.03,
993
- "grad_norm": 0.3690173923969269,
994
- "learning_rate": 5.005364806866953e-05,
995
- "loss": 1.0202,
996
- "max_memory_allocated (GB)": 91.87,
997
  "memory_allocated (GB)": 24.39,
998
  "step": 990,
999
  "total_memory_available (GB)": 94.62
1000
  },
1001
  {
1002
- "epoch": 1.04,
1003
- "grad_norm": 0.35189080238342285,
1004
- "learning_rate": 4.951716738197425e-05,
1005
- "loss": 1.0206,
1006
- "max_memory_allocated (GB)": 91.87,
1007
  "memory_allocated (GB)": 24.39,
1008
  "step": 1000,
1009
  "total_memory_available (GB)": 94.62
1010
  }
1011
  ],
1012
  "logging_steps": 10,
1013
- "max_steps": 1922,
1014
  "num_input_tokens_seen": 0,
1015
  "num_train_epochs": 2,
1016
  "save_steps": 1000,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9883864591055103,
5
  "eval_steps": 500,
6
  "global_step": 1000,
7
  "is_hyper_param_search": false,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "grad_norm": 0.8518019318580627,
14
+ "learning_rate": 5.6012058970266934e-05,
15
+ "loss": 1.7421,
16
  "max_memory_allocated (GB)": 91.86,
17
  "memory_allocated (GB)": 24.39,
18
  "step": 10,
 
20
  },
21
  {
22
  "epoch": 0.02,
23
+ "grad_norm": 0.8390570878982544,
24
+ "learning_rate": 7.287336883921704e-05,
25
+ "loss": 1.5281,
26
  "max_memory_allocated (GB)": 91.86,
27
  "memory_allocated (GB)": 24.39,
28
  "step": 20,
 
30
  },
31
  {
32
  "epoch": 0.03,
33
+ "grad_norm": 0.3700675666332245,
34
+ "learning_rate": 8.273660282559241e-05,
35
+ "loss": 1.3485,
36
  "max_memory_allocated (GB)": 91.86,
37
  "memory_allocated (GB)": 24.39,
38
  "step": 30,
 
40
  },
41
  {
42
  "epoch": 0.04,
43
+ "grad_norm": 0.3168916404247284,
44
+ "learning_rate": 8.973467870816715e-05,
45
+ "loss": 1.2968,
46
  "max_memory_allocated (GB)": 91.86,
47
  "memory_allocated (GB)": 24.39,
48
  "step": 40,
 
50
  },
51
  {
52
  "epoch": 0.05,
53
+ "grad_norm": 0.24861344695091248,
54
+ "learning_rate": 9.516280807158375e-05,
55
+ "loss": 1.2689,
56
  "max_memory_allocated (GB)": 91.86,
57
  "memory_allocated (GB)": 24.39,
58
  "step": 50,
 
60
  },
61
  {
62
  "epoch": 0.06,
63
+ "grad_norm": 0.22251686453819275,
64
+ "learning_rate": 9.959791269454252e-05,
65
+ "loss": 1.2434,
66
  "max_memory_allocated (GB)": 91.86,
67
  "memory_allocated (GB)": 24.39,
68
  "step": 60,
 
70
  },
71
  {
72
  "epoch": 0.07,
73
+ "grad_norm": 0.23426611721515656,
74
+ "learning_rate": 9.959204487506375e-05,
75
+ "loss": 1.2152,
76
+ "max_memory_allocated (GB)": 91.87,
77
  "memory_allocated (GB)": 24.39,
78
  "step": 70,
79
  "total_memory_available (GB)": 94.62
80
  },
81
  {
82
  "epoch": 0.08,
83
+ "grad_norm": 0.45850667357444763,
84
+ "learning_rate": 9.908210096889343e-05,
85
+ "loss": 1.2108,
86
+ "max_memory_allocated (GB)": 91.87,
87
  "memory_allocated (GB)": 24.39,
88
  "step": 80,
89
  "total_memory_available (GB)": 94.62
90
  },
91
  {
92
  "epoch": 0.09,
93
+ "grad_norm": 0.4196653366088867,
94
+ "learning_rate": 9.85721570627231e-05,
95
+ "loss": 1.1913,
96
+ "max_memory_allocated (GB)": 91.87,
97
  "memory_allocated (GB)": 24.39,
98
  "step": 90,
99
  "total_memory_available (GB)": 94.62
100
  },
101
  {
102
  "epoch": 0.1,
103
+ "grad_norm": 0.5248636603355408,
104
+ "learning_rate": 9.806221315655279e-05,
105
+ "loss": 1.1924,
106
+ "max_memory_allocated (GB)": 91.87,
107
  "memory_allocated (GB)": 24.39,
108
  "step": 100,
109
  "total_memory_available (GB)": 94.62
110
  },
111
  {
112
  "epoch": 0.11,
113
+ "grad_norm": 0.3434283137321472,
114
+ "learning_rate": 9.755226925038246e-05,
115
+ "loss": 1.1558,
116
+ "max_memory_allocated (GB)": 91.87,
117
  "memory_allocated (GB)": 24.39,
118
  "step": 110,
119
  "total_memory_available (GB)": 94.62
120
  },
121
  {
122
  "epoch": 0.12,
123
+ "grad_norm": 0.47737815976142883,
124
+ "learning_rate": 9.704232534421214e-05,
125
+ "loss": 1.1492,
126
+ "max_memory_allocated (GB)": 91.87,
127
  "memory_allocated (GB)": 24.39,
128
  "step": 120,
129
  "total_memory_available (GB)": 94.62
130
  },
131
  {
132
+ "epoch": 0.13,
133
+ "grad_norm": 0.47788286209106445,
134
+ "learning_rate": 9.653238143804181e-05,
135
+ "loss": 1.1486,
136
+ "max_memory_allocated (GB)": 91.87,
137
  "memory_allocated (GB)": 24.39,
138
  "step": 130,
139
  "total_memory_available (GB)": 94.62
140
  },
141
  {
142
+ "epoch": 0.14,
143
+ "grad_norm": 0.45408132672309875,
144
+ "learning_rate": 9.60224375318715e-05,
145
+ "loss": 1.1456,
146
+ "max_memory_allocated (GB)": 91.87,
147
  "memory_allocated (GB)": 24.39,
148
  "step": 140,
149
  "total_memory_available (GB)": 94.62
150
  },
151
  {
152
+ "epoch": 0.15,
153
+ "grad_norm": 0.4091607630252838,
154
+ "learning_rate": 9.551249362570118e-05,
155
+ "loss": 1.1365,
156
+ "max_memory_allocated (GB)": 91.87,
157
  "memory_allocated (GB)": 24.39,
158
  "step": 150,
159
  "total_memory_available (GB)": 94.62
160
  },
161
  {
162
+ "epoch": 0.16,
163
+ "grad_norm": 0.5064594745635986,
164
+ "learning_rate": 9.500254971953085e-05,
165
+ "loss": 1.137,
166
+ "max_memory_allocated (GB)": 91.87,
167
  "memory_allocated (GB)": 24.39,
168
  "step": 160,
169
  "total_memory_available (GB)": 94.62
170
  },
171
  {
172
+ "epoch": 0.17,
173
+ "grad_norm": 0.4288266897201538,
174
+ "learning_rate": 9.449260581336054e-05,
175
+ "loss": 1.1181,
176
+ "max_memory_allocated (GB)": 91.87,
177
  "memory_allocated (GB)": 24.39,
178
  "step": 170,
179
  "total_memory_available (GB)": 94.62
180
  },
181
  {
182
+ "epoch": 0.18,
183
+ "grad_norm": 0.3854447901248932,
184
+ "learning_rate": 9.398266190719021e-05,
185
+ "loss": 1.1091,
186
+ "max_memory_allocated (GB)": 91.87,
187
  "memory_allocated (GB)": 24.39,
188
  "step": 180,
189
  "total_memory_available (GB)": 94.62
190
  },
191
  {
192
+ "epoch": 0.19,
193
+ "grad_norm": 0.4143249988555908,
194
+ "learning_rate": 9.347271800101989e-05,
195
+ "loss": 1.1156,
196
+ "max_memory_allocated (GB)": 91.87,
197
  "memory_allocated (GB)": 24.39,
198
  "step": 190,
199
  "total_memory_available (GB)": 94.62
200
  },
201
  {
202
+ "epoch": 0.2,
203
+ "grad_norm": 0.521230161190033,
204
+ "learning_rate": 9.296277409484956e-05,
205
+ "loss": 1.1117,
206
+ "max_memory_allocated (GB)": 91.87,
207
  "memory_allocated (GB)": 24.39,
208
  "step": 200,
209
  "total_memory_available (GB)": 94.62
210
  },
211
  {
212
+ "epoch": 0.21,
213
+ "grad_norm": 0.487106055021286,
214
+ "learning_rate": 9.245283018867925e-05,
215
+ "loss": 1.1003,
216
+ "max_memory_allocated (GB)": 91.87,
217
  "memory_allocated (GB)": 24.39,
218
  "step": 210,
219
  "total_memory_available (GB)": 94.62
220
  },
221
  {
222
+ "epoch": 0.22,
223
+ "grad_norm": 0.4616335928440094,
224
+ "learning_rate": 9.194288628250894e-05,
225
  "loss": 1.0992,
226
+ "max_memory_allocated (GB)": 91.88,
227
  "memory_allocated (GB)": 24.39,
228
  "step": 220,
229
  "total_memory_available (GB)": 94.62
230
  },
231
  {
232
+ "epoch": 0.23,
233
+ "grad_norm": 0.3908211290836334,
234
+ "learning_rate": 9.14329423763386e-05,
235
+ "loss": 1.1074,
236
+ "max_memory_allocated (GB)": 91.88,
237
  "memory_allocated (GB)": 24.39,
238
  "step": 230,
239
  "total_memory_available (GB)": 94.62
240
  },
241
  {
242
+ "epoch": 0.24,
243
+ "grad_norm": 0.4411673843860626,
244
+ "learning_rate": 9.092299847016829e-05,
245
+ "loss": 1.1055,
246
+ "max_memory_allocated (GB)": 91.88,
247
  "memory_allocated (GB)": 24.39,
248
  "step": 240,
249
  "total_memory_available (GB)": 94.62
250
  },
251
  {
252
+ "epoch": 0.25,
253
+ "grad_norm": 0.4123621881008148,
254
+ "learning_rate": 9.041305456399796e-05,
255
+ "loss": 1.0883,
256
+ "max_memory_allocated (GB)": 91.88,
257
  "memory_allocated (GB)": 24.39,
258
  "step": 250,
259
  "total_memory_available (GB)": 94.62
260
  },
261
  {
262
+ "epoch": 0.26,
263
+ "grad_norm": 0.5461438298225403,
264
+ "learning_rate": 8.990311065782764e-05,
265
+ "loss": 1.0928,
266
+ "max_memory_allocated (GB)": 91.9,
267
  "memory_allocated (GB)": 24.39,
268
  "step": 260,
269
  "total_memory_available (GB)": 94.62
270
  },
271
  {
272
+ "epoch": 0.27,
273
+ "grad_norm": 0.4907448887825012,
274
+ "learning_rate": 8.939316675165733e-05,
275
+ "loss": 1.0912,
276
+ "max_memory_allocated (GB)": 91.9,
277
  "memory_allocated (GB)": 24.39,
278
  "step": 270,
279
  "total_memory_available (GB)": 94.62
280
  },
281
  {
282
+ "epoch": 0.28,
283
+ "grad_norm": 0.45152169466018677,
284
+ "learning_rate": 8.8883222845487e-05,
285
+ "loss": 1.0891,
286
+ "max_memory_allocated (GB)": 91.9,
287
  "memory_allocated (GB)": 24.39,
288
  "step": 280,
289
  "total_memory_available (GB)": 94.62
290
  },
291
  {
292
+ "epoch": 0.29,
293
+ "grad_norm": 0.41472557187080383,
294
+ "learning_rate": 8.837327893931669e-05,
295
+ "loss": 1.0864,
296
+ "max_memory_allocated (GB)": 91.9,
297
  "memory_allocated (GB)": 24.39,
298
  "step": 290,
299
  "total_memory_available (GB)": 94.62
300
  },
301
  {
302
+ "epoch": 0.3,
303
+ "grad_norm": 0.45566004514694214,
304
+ "learning_rate": 8.786333503314635e-05,
305
+ "loss": 1.0776,
306
+ "max_memory_allocated (GB)": 91.9,
307
  "memory_allocated (GB)": 24.39,
308
  "step": 300,
309
  "total_memory_available (GB)": 94.62
310
  },
311
  {
312
+ "epoch": 0.31,
313
+ "grad_norm": 0.3909231424331665,
314
+ "learning_rate": 8.735339112697604e-05,
315
+ "loss": 1.0801,
316
+ "max_memory_allocated (GB)": 91.9,
317
  "memory_allocated (GB)": 24.39,
318
  "step": 310,
319
  "total_memory_available (GB)": 94.62
320
  },
321
  {
322
+ "epoch": 0.32,
323
+ "grad_norm": 0.39705774188041687,
324
+ "learning_rate": 8.684344722080571e-05,
325
+ "loss": 1.0746,
326
+ "max_memory_allocated (GB)": 91.9,
327
  "memory_allocated (GB)": 24.39,
328
  "step": 320,
329
  "total_memory_available (GB)": 94.62
330
  },
331
  {
332
+ "epoch": 0.33,
333
+ "grad_norm": 0.4257935881614685,
334
+ "learning_rate": 8.633350331463539e-05,
335
+ "loss": 1.0738,
336
+ "max_memory_allocated (GB)": 91.9,
337
  "memory_allocated (GB)": 24.39,
338
  "step": 330,
339
  "total_memory_available (GB)": 94.62
340
  },
341
  {
342
+ "epoch": 0.34,
343
+ "grad_norm": 0.41336777806282043,
344
+ "learning_rate": 8.582355940846507e-05,
345
+ "loss": 1.0811,
346
+ "max_memory_allocated (GB)": 91.9,
347
  "memory_allocated (GB)": 24.39,
348
  "step": 340,
349
  "total_memory_available (GB)": 94.62
350
  },
351
  {
352
+ "epoch": 0.35,
353
+ "grad_norm": 0.3621828854084015,
354
+ "learning_rate": 8.531361550229475e-05,
355
+ "loss": 1.0762,
356
+ "max_memory_allocated (GB)": 91.9,
357
  "memory_allocated (GB)": 24.39,
358
  "step": 350,
359
  "total_memory_available (GB)": 94.62
360
  },
361
  {
362
+ "epoch": 0.36,
363
+ "grad_norm": 0.398189902305603,
364
+ "learning_rate": 8.480367159612444e-05,
365
+ "loss": 1.0622,
366
+ "max_memory_allocated (GB)": 91.9,
367
  "memory_allocated (GB)": 24.39,
368
  "step": 360,
369
  "total_memory_available (GB)": 94.62
370
  },
371
  {
372
+ "epoch": 0.37,
373
+ "grad_norm": 0.37738627195358276,
374
+ "learning_rate": 8.42937276899541e-05,
375
+ "loss": 1.06,
376
+ "max_memory_allocated (GB)": 91.9,
377
  "memory_allocated (GB)": 24.39,
378
  "step": 370,
379
  "total_memory_available (GB)": 94.62
380
  },
381
  {
382
+ "epoch": 0.38,
383
+ "grad_norm": 0.40790703892707825,
384
+ "learning_rate": 8.378378378378379e-05,
385
+ "loss": 1.0768,
386
+ "max_memory_allocated (GB)": 91.9,
387
  "memory_allocated (GB)": 24.39,
388
  "step": 380,
389
  "total_memory_available (GB)": 94.62
390
  },
391
  {
392
+ "epoch": 0.39,
393
+ "grad_norm": 0.35230451822280884,
394
+ "learning_rate": 8.327383987761347e-05,
395
+ "loss": 1.0631,
396
+ "max_memory_allocated (GB)": 91.9,
397
  "memory_allocated (GB)": 24.39,
398
  "step": 390,
399
  "total_memory_available (GB)": 94.62
400
  },
401
  {
402
+ "epoch": 0.4,
403
+ "grad_norm": 0.37737661600112915,
404
+ "learning_rate": 8.276389597144315e-05,
405
+ "loss": 1.0665,
406
+ "max_memory_allocated (GB)": 91.9,
407
  "memory_allocated (GB)": 24.39,
408
  "step": 400,
409
  "total_memory_available (GB)": 94.62
410
  },
411
  {
412
+ "epoch": 0.41,
413
+ "grad_norm": 0.39823117852211,
414
+ "learning_rate": 8.225395206527282e-05,
415
+ "loss": 1.0739,
416
+ "max_memory_allocated (GB)": 91.9,
417
  "memory_allocated (GB)": 24.39,
418
  "step": 410,
419
  "total_memory_available (GB)": 94.62
420
  },
421
  {
422
+ "epoch": 0.42,
423
+ "grad_norm": 0.38277310132980347,
424
+ "learning_rate": 8.17440081591025e-05,
425
+ "loss": 1.07,
426
+ "max_memory_allocated (GB)": 91.9,
427
  "memory_allocated (GB)": 24.39,
428
  "step": 420,
429
  "total_memory_available (GB)": 94.62
430
  },
431
  {
432
+ "epoch": 0.43,
433
+ "grad_norm": 0.34220802783966064,
434
+ "learning_rate": 8.123406425293219e-05,
435
+ "loss": 1.0698,
436
+ "max_memory_allocated (GB)": 91.9,
437
  "memory_allocated (GB)": 24.39,
438
  "step": 430,
439
  "total_memory_available (GB)": 94.62
440
  },
441
  {
442
+ "epoch": 0.43,
443
+ "grad_norm": 0.3858403265476227,
444
+ "learning_rate": 8.072412034676186e-05,
445
+ "loss": 1.0488,
446
+ "max_memory_allocated (GB)": 91.9,
447
  "memory_allocated (GB)": 24.39,
448
  "step": 440,
449
  "total_memory_available (GB)": 94.62
450
  },
451
  {
452
+ "epoch": 0.44,
453
+ "grad_norm": 0.36855727434158325,
454
+ "learning_rate": 8.021417644059154e-05,
455
+ "loss": 1.0612,
456
+ "max_memory_allocated (GB)": 91.9,
457
  "memory_allocated (GB)": 24.39,
458
  "step": 450,
459
  "total_memory_available (GB)": 94.62
460
  },
461
  {
462
+ "epoch": 0.45,
463
+ "grad_norm": 0.4122312664985657,
464
+ "learning_rate": 7.970423253442122e-05,
465
+ "loss": 1.0566,
466
+ "max_memory_allocated (GB)": 91.9,
467
  "memory_allocated (GB)": 24.39,
468
  "step": 460,
469
  "total_memory_available (GB)": 94.62
470
  },
471
  {
472
+ "epoch": 0.46,
473
+ "grad_norm": 0.38682645559310913,
474
+ "learning_rate": 7.91942886282509e-05,
475
+ "loss": 1.0575,
476
+ "max_memory_allocated (GB)": 91.9,
477
  "memory_allocated (GB)": 24.39,
478
  "step": 470,
479
  "total_memory_available (GB)": 94.62
480
  },
481
  {
482
+ "epoch": 0.47,
483
+ "grad_norm": 0.38858598470687866,
484
+ "learning_rate": 7.868434472208057e-05,
485
+ "loss": 1.0579,
486
+ "max_memory_allocated (GB)": 91.9,
487
  "memory_allocated (GB)": 24.39,
488
  "step": 480,
489
  "total_memory_available (GB)": 94.62
490
  },
491
  {
492
+ "epoch": 0.48,
493
+ "grad_norm": 0.3749813139438629,
494
+ "learning_rate": 7.817440081591025e-05,
495
+ "loss": 1.0531,
496
+ "max_memory_allocated (GB)": 91.9,
497
  "memory_allocated (GB)": 24.39,
498
  "step": 490,
499
  "total_memory_available (GB)": 94.62
500
  },
501
  {
502
+ "epoch": 0.49,
503
+ "grad_norm": 0.36404120922088623,
504
+ "learning_rate": 7.766445690973994e-05,
505
+ "loss": 1.0447,
506
+ "max_memory_allocated (GB)": 91.9,
507
  "memory_allocated (GB)": 24.39,
508
  "step": 500,
509
  "total_memory_available (GB)": 94.62
510
  },
511
  {
512
+ "epoch": 0.5,
513
+ "grad_norm": 0.4445250332355499,
514
+ "learning_rate": 7.715451300356961e-05,
515
+ "loss": 1.0526,
516
+ "max_memory_allocated (GB)": 91.9,
517
  "memory_allocated (GB)": 24.39,
518
  "step": 510,
519
  "total_memory_available (GB)": 94.62
520
  },
521
  {
522
+ "epoch": 0.51,
523
+ "grad_norm": 0.3644183278083801,
524
+ "learning_rate": 7.664456909739929e-05,
525
+ "loss": 1.0494,
526
+ "max_memory_allocated (GB)": 91.9,
527
  "memory_allocated (GB)": 24.39,
528
  "step": 520,
529
  "total_memory_available (GB)": 94.62
530
  },
531
  {
532
+ "epoch": 0.52,
533
+ "grad_norm": 0.34624868631362915,
534
+ "learning_rate": 7.613462519122897e-05,
535
+ "loss": 1.0572,
536
+ "max_memory_allocated (GB)": 91.9,
537
  "memory_allocated (GB)": 24.39,
538
  "step": 530,
539
  "total_memory_available (GB)": 94.62
540
  },
541
  {
542
+ "epoch": 0.53,
543
+ "grad_norm": 0.3788256347179413,
544
+ "learning_rate": 7.562468128505865e-05,
545
+ "loss": 1.0502,
546
+ "max_memory_allocated (GB)": 91.9,
547
  "memory_allocated (GB)": 24.39,
548
  "step": 540,
549
  "total_memory_available (GB)": 94.62
550
  },
551
  {
552
+ "epoch": 0.54,
553
+ "grad_norm": 0.3667903542518616,
554
+ "learning_rate": 7.511473737888832e-05,
555
+ "loss": 1.0505,
556
+ "max_memory_allocated (GB)": 91.9,
557
  "memory_allocated (GB)": 24.39,
558
  "step": 550,
559
  "total_memory_available (GB)": 94.62
560
  },
561
  {
562
+ "epoch": 0.55,
563
+ "grad_norm": 0.37510526180267334,
564
+ "learning_rate": 7.460479347271801e-05,
565
+ "loss": 1.045,
566
+ "max_memory_allocated (GB)": 91.9,
567
  "memory_allocated (GB)": 24.39,
568
  "step": 560,
569
  "total_memory_available (GB)": 94.62
570
  },
571
  {
572
+ "epoch": 0.56,
573
+ "grad_norm": 0.3509054183959961,
574
+ "learning_rate": 7.409484956654769e-05,
575
+ "loss": 1.0504,
576
+ "max_memory_allocated (GB)": 91.9,
577
  "memory_allocated (GB)": 24.39,
578
  "step": 570,
579
  "total_memory_available (GB)": 94.62
580
  },
581
  {
582
+ "epoch": 0.57,
583
+ "grad_norm": 0.3294220268726349,
584
+ "learning_rate": 7.358490566037736e-05,
585
+ "loss": 1.0573,
586
+ "max_memory_allocated (GB)": 91.9,
587
  "memory_allocated (GB)": 24.39,
588
  "step": 580,
589
  "total_memory_available (GB)": 94.62
590
  },
591
  {
592
+ "epoch": 0.58,
593
+ "grad_norm": 0.34325262904167175,
594
+ "learning_rate": 7.307496175420703e-05,
595
+ "loss": 1.0445,
596
+ "max_memory_allocated (GB)": 91.9,
597
  "memory_allocated (GB)": 24.39,
598
  "step": 590,
599
  "total_memory_available (GB)": 94.62
600
  },
601
  {
602
+ "epoch": 0.59,
603
+ "grad_norm": 0.3679581880569458,
604
+ "learning_rate": 7.256501784803672e-05,
605
+ "loss": 1.0445,
606
+ "max_memory_allocated (GB)": 91.9,
607
  "memory_allocated (GB)": 24.39,
608
  "step": 600,
609
  "total_memory_available (GB)": 94.62
610
  },
611
  {
612
+ "epoch": 0.6,
613
+ "grad_norm": 0.3220757246017456,
614
+ "learning_rate": 7.20550739418664e-05,
615
+ "loss": 1.0458,
616
+ "max_memory_allocated (GB)": 91.9,
617
  "memory_allocated (GB)": 24.39,
618
  "step": 610,
619
  "total_memory_available (GB)": 94.62
620
  },
621
  {
622
+ "epoch": 0.61,
623
+ "grad_norm": 0.3660426139831543,
624
+ "learning_rate": 7.154513003569607e-05,
625
+ "loss": 1.0447,
626
+ "max_memory_allocated (GB)": 91.9,
627
  "memory_allocated (GB)": 24.39,
628
  "step": 620,
629
  "total_memory_available (GB)": 94.62
630
  },
631
  {
632
+ "epoch": 0.62,
633
+ "grad_norm": 0.32533150911331177,
634
+ "learning_rate": 7.103518612952576e-05,
635
+ "loss": 1.0471,
636
+ "max_memory_allocated (GB)": 91.9,
637
  "memory_allocated (GB)": 24.39,
638
  "step": 630,
639
  "total_memory_available (GB)": 94.62
640
  },
641
  {
642
+ "epoch": 0.63,
643
+ "grad_norm": 0.33115923404693604,
644
+ "learning_rate": 7.052524222335543e-05,
645
+ "loss": 1.0431,
646
+ "max_memory_allocated (GB)": 91.9,
647
  "memory_allocated (GB)": 24.39,
648
  "step": 640,
649
  "total_memory_available (GB)": 94.62
650
  },
651
  {
652
+ "epoch": 0.64,
653
+ "grad_norm": 0.337576299905777,
654
+ "learning_rate": 7.001529831718512e-05,
655
+ "loss": 1.057,
656
+ "max_memory_allocated (GB)": 91.9,
657
  "memory_allocated (GB)": 24.39,
658
  "step": 650,
659
  "total_memory_available (GB)": 94.62
660
  },
661
  {
662
+ "epoch": 0.65,
663
+ "grad_norm": 0.3337574303150177,
664
+ "learning_rate": 6.950535441101478e-05,
665
+ "loss": 1.0408,
666
+ "max_memory_allocated (GB)": 91.9,
667
  "memory_allocated (GB)": 24.39,
668
  "step": 660,
669
  "total_memory_available (GB)": 94.62
670
  },
671
  {
672
+ "epoch": 0.66,
673
+ "grad_norm": 0.35560840368270874,
674
+ "learning_rate": 6.899541050484447e-05,
675
+ "loss": 1.0363,
676
+ "max_memory_allocated (GB)": 91.9,
677
  "memory_allocated (GB)": 24.39,
678
  "step": 670,
679
  "total_memory_available (GB)": 94.62
680
  },
681
  {
682
+ "epoch": 0.67,
683
+ "grad_norm": 0.3454528748989105,
684
+ "learning_rate": 6.848546659867415e-05,
685
+ "loss": 1.0412,
686
+ "max_memory_allocated (GB)": 91.9,
687
  "memory_allocated (GB)": 24.39,
688
  "step": 680,
689
  "total_memory_available (GB)": 94.62
690
  },
691
  {
692
+ "epoch": 0.68,
693
+ "grad_norm": 0.3608352541923523,
694
+ "learning_rate": 6.797552269250382e-05,
695
+ "loss": 1.0578,
696
+ "max_memory_allocated (GB)": 91.9,
697
  "memory_allocated (GB)": 24.39,
698
  "step": 690,
699
  "total_memory_available (GB)": 94.62
700
  },
701
  {
702
+ "epoch": 0.69,
703
+ "grad_norm": 0.332792729139328,
704
+ "learning_rate": 6.746557878633351e-05,
705
+ "loss": 1.0426,
706
+ "max_memory_allocated (GB)": 91.9,
707
  "memory_allocated (GB)": 24.39,
708
  "step": 700,
709
  "total_memory_available (GB)": 94.62
710
  },
711
  {
712
+ "epoch": 0.7,
713
+ "grad_norm": 0.4059067666530609,
714
+ "learning_rate": 6.695563488016318e-05,
715
+ "loss": 1.0441,
716
+ "max_memory_allocated (GB)": 91.9,
717
  "memory_allocated (GB)": 24.39,
718
  "step": 710,
719
  "total_memory_available (GB)": 94.62
720
  },
721
  {
722
+ "epoch": 0.71,
723
+ "grad_norm": 0.3741580843925476,
724
+ "learning_rate": 6.644569097399287e-05,
725
+ "loss": 1.0525,
726
+ "max_memory_allocated (GB)": 91.9,
727
  "memory_allocated (GB)": 24.39,
728
  "step": 720,
729
  "total_memory_available (GB)": 94.62
730
  },
731
  {
732
+ "epoch": 0.72,
733
+ "grad_norm": 0.3646301329135895,
734
+ "learning_rate": 6.593574706782255e-05,
735
+ "loss": 1.0302,
736
+ "max_memory_allocated (GB)": 91.9,
737
  "memory_allocated (GB)": 24.39,
738
  "step": 730,
739
  "total_memory_available (GB)": 94.62
740
  },
741
  {
742
+ "epoch": 0.73,
743
+ "grad_norm": 0.35956060886383057,
744
+ "learning_rate": 6.542580316165222e-05,
745
+ "loss": 1.0439,
746
+ "max_memory_allocated (GB)": 91.9,
747
  "memory_allocated (GB)": 24.39,
748
  "step": 740,
749
  "total_memory_available (GB)": 94.62
750
  },
751
  {
752
+ "epoch": 0.74,
753
+ "grad_norm": 0.3517419099807739,
754
+ "learning_rate": 6.491585925548191e-05,
755
+ "loss": 1.0314,
756
+ "max_memory_allocated (GB)": 91.9,
757
  "memory_allocated (GB)": 24.39,
758
  "step": 750,
759
  "total_memory_available (GB)": 94.62
760
  },
761
  {
762
+ "epoch": 0.75,
763
+ "grad_norm": 0.33927640318870544,
764
+ "learning_rate": 6.440591534931157e-05,
765
+ "loss": 1.042,
766
+ "max_memory_allocated (GB)": 91.9,
767
  "memory_allocated (GB)": 24.39,
768
  "step": 760,
769
  "total_memory_available (GB)": 94.62
770
  },
771
  {
772
+ "epoch": 0.76,
773
+ "grad_norm": 0.3502146005630493,
774
+ "learning_rate": 6.389597144314126e-05,
775
+ "loss": 1.0416,
776
+ "max_memory_allocated (GB)": 91.9,
777
  "memory_allocated (GB)": 24.39,
778
  "step": 770,
779
  "total_memory_available (GB)": 94.62
780
  },
781
  {
782
+ "epoch": 0.77,
783
+ "grad_norm": 0.37221387028694153,
784
+ "learning_rate": 6.338602753697093e-05,
785
+ "loss": 1.0453,
786
+ "max_memory_allocated (GB)": 91.9,
787
  "memory_allocated (GB)": 24.39,
788
  "step": 780,
789
  "total_memory_available (GB)": 94.62
790
  },
791
  {
792
+ "epoch": 0.78,
793
+ "grad_norm": 0.3718739449977875,
794
+ "learning_rate": 6.287608363080062e-05,
795
+ "loss": 1.0392,
796
+ "max_memory_allocated (GB)": 91.9,
797
  "memory_allocated (GB)": 24.39,
798
  "step": 790,
799
  "total_memory_available (GB)": 94.62
800
  },
801
  {
802
+ "epoch": 0.79,
803
+ "grad_norm": 0.35249418020248413,
804
+ "learning_rate": 6.23661397246303e-05,
805
+ "loss": 1.0413,
806
+ "max_memory_allocated (GB)": 91.9,
807
  "memory_allocated (GB)": 24.39,
808
  "step": 800,
809
  "total_memory_available (GB)": 94.62
810
  },
811
  {
812
+ "epoch": 0.8,
813
+ "grad_norm": 0.35906028747558594,
814
+ "learning_rate": 6.185619581845997e-05,
815
+ "loss": 1.0375,
816
+ "max_memory_allocated (GB)": 91.9,
817
  "memory_allocated (GB)": 24.39,
818
  "step": 810,
819
  "total_memory_available (GB)": 94.62
820
  },
821
  {
822
+ "epoch": 0.81,
823
+ "grad_norm": 0.33932170271873474,
824
+ "learning_rate": 6.134625191228966e-05,
825
+ "loss": 1.0286,
826
+ "max_memory_allocated (GB)": 91.9,
827
  "memory_allocated (GB)": 24.39,
828
  "step": 820,
829
  "total_memory_available (GB)": 94.62
830
  },
831
  {
832
+ "epoch": 0.82,
833
+ "grad_norm": 0.33107632398605347,
834
+ "learning_rate": 6.0836308006119326e-05,
835
+ "loss": 1.0319,
836
+ "max_memory_allocated (GB)": 91.9,
837
  "memory_allocated (GB)": 24.39,
838
  "step": 830,
839
  "total_memory_available (GB)": 94.62
840
  },
841
  {
842
+ "epoch": 0.83,
843
+ "grad_norm": 0.32848185300827026,
844
+ "learning_rate": 6.032636409994901e-05,
845
+ "loss": 1.0329,
846
+ "max_memory_allocated (GB)": 91.9,
847
  "memory_allocated (GB)": 24.39,
848
  "step": 840,
849
  "total_memory_available (GB)": 94.62
850
  },
851
  {
852
+ "epoch": 0.84,
853
+ "grad_norm": 0.33085334300994873,
854
+ "learning_rate": 5.981642019377869e-05,
855
+ "loss": 1.0326,
856
+ "max_memory_allocated (GB)": 91.9,
857
  "memory_allocated (GB)": 24.39,
858
  "step": 850,
859
  "total_memory_available (GB)": 94.62
860
  },
861
  {
862
+ "epoch": 0.85,
863
+ "grad_norm": 0.3043057322502136,
864
+ "learning_rate": 5.930647628760837e-05,
865
+ "loss": 1.0379,
866
+ "max_memory_allocated (GB)": 91.9,
867
  "memory_allocated (GB)": 24.39,
868
  "step": 860,
869
  "total_memory_available (GB)": 94.62
870
  },
871
  {
872
+ "epoch": 0.86,
873
+ "grad_norm": 0.3407464027404785,
874
+ "learning_rate": 5.879653238143804e-05,
875
+ "loss": 1.0336,
876
+ "max_memory_allocated (GB)": 91.9,
877
  "memory_allocated (GB)": 24.39,
878
  "step": 870,
879
  "total_memory_available (GB)": 94.62
880
  },
881
  {
882
+ "epoch": 0.87,
883
+ "grad_norm": 0.34069886803627014,
884
+ "learning_rate": 5.8286588475267726e-05,
885
+ "loss": 1.0294,
886
+ "max_memory_allocated (GB)": 91.9,
887
  "memory_allocated (GB)": 24.39,
888
  "step": 880,
889
  "total_memory_available (GB)": 94.62
890
  },
891
  {
892
+ "epoch": 0.88,
893
+ "grad_norm": 0.4303439259529114,
894
+ "learning_rate": 5.777664456909741e-05,
895
+ "loss": 1.0223,
896
+ "max_memory_allocated (GB)": 91.9,
897
  "memory_allocated (GB)": 24.39,
898
  "step": 890,
899
  "total_memory_available (GB)": 94.62
900
  },
901
  {
902
+ "epoch": 0.89,
903
+ "grad_norm": 0.3378705382347107,
904
+ "learning_rate": 5.7266700662927075e-05,
905
+ "loss": 1.042,
906
+ "max_memory_allocated (GB)": 91.9,
907
  "memory_allocated (GB)": 24.39,
908
  "step": 900,
909
  "total_memory_available (GB)": 94.62
910
  },
911
  {
912
+ "epoch": 0.9,
913
+ "grad_norm": 0.4227118194103241,
914
+ "learning_rate": 5.6756756756756757e-05,
915
+ "loss": 1.0301,
916
+ "max_memory_allocated (GB)": 91.9,
917
  "memory_allocated (GB)": 24.39,
918
  "step": 910,
919
  "total_memory_available (GB)": 94.62
920
  },
921
  {
922
+ "epoch": 0.91,
923
+ "grad_norm": 0.36343687772750854,
924
+ "learning_rate": 5.624681285058644e-05,
925
+ "loss": 1.0249,
926
+ "max_memory_allocated (GB)": 91.9,
927
  "memory_allocated (GB)": 24.39,
928
  "step": 920,
929
  "total_memory_available (GB)": 94.62
930
  },
931
  {
932
+ "epoch": 0.92,
933
+ "grad_norm": 0.37661993503570557,
934
+ "learning_rate": 5.573686894441612e-05,
935
+ "loss": 1.0201,
936
+ "max_memory_allocated (GB)": 91.9,
937
  "memory_allocated (GB)": 24.39,
938
  "step": 930,
939
  "total_memory_available (GB)": 94.62
940
  },
941
  {
942
+ "epoch": 0.93,
943
+ "grad_norm": 0.38610127568244934,
944
+ "learning_rate": 5.5226925038245794e-05,
945
+ "loss": 1.0351,
946
+ "max_memory_allocated (GB)": 91.9,
947
  "memory_allocated (GB)": 24.39,
948
  "step": 940,
949
  "total_memory_available (GB)": 94.62
950
  },
951
  {
952
+ "epoch": 0.94,
953
+ "grad_norm": 0.32767486572265625,
954
+ "learning_rate": 5.4716981132075475e-05,
955
+ "loss": 1.0364,
956
+ "max_memory_allocated (GB)": 91.9,
957
  "memory_allocated (GB)": 24.39,
958
  "step": 950,
959
  "total_memory_available (GB)": 94.62
960
  },
961
  {
962
+ "epoch": 0.95,
963
+ "grad_norm": 0.36714789271354675,
964
+ "learning_rate": 5.4207037225905157e-05,
965
+ "loss": 1.0252,
966
+ "max_memory_allocated (GB)": 91.9,
967
  "memory_allocated (GB)": 24.39,
968
  "step": 960,
969
  "total_memory_available (GB)": 94.62
970
  },
971
  {
972
+ "epoch": 0.96,
973
+ "grad_norm": 0.34143558144569397,
974
+ "learning_rate": 5.369709331973484e-05,
975
+ "loss": 1.0266,
976
+ "max_memory_allocated (GB)": 91.9,
977
  "memory_allocated (GB)": 24.39,
978
  "step": 970,
979
  "total_memory_available (GB)": 94.62
980
  },
981
  {
982
+ "epoch": 0.97,
983
+ "grad_norm": 0.3287556767463684,
984
+ "learning_rate": 5.3187149413564506e-05,
985
+ "loss": 1.0265,
986
+ "max_memory_allocated (GB)": 91.9,
987
  "memory_allocated (GB)": 24.39,
988
  "step": 980,
989
  "total_memory_available (GB)": 94.62
990
  },
991
  {
992
+ "epoch": 0.98,
993
+ "grad_norm": 0.33613139390945435,
994
+ "learning_rate": 5.267720550739419e-05,
995
+ "loss": 1.0215,
996
+ "max_memory_allocated (GB)": 91.9,
997
  "memory_allocated (GB)": 24.39,
998
  "step": 990,
999
  "total_memory_available (GB)": 94.62
1000
  },
1001
  {
1002
+ "epoch": 0.99,
1003
+ "grad_norm": 0.32623520493507385,
1004
+ "learning_rate": 5.216726160122387e-05,
1005
+ "loss": 1.0213,
1006
+ "max_memory_allocated (GB)": 91.9,
1007
  "memory_allocated (GB)": 24.39,
1008
  "step": 1000,
1009
  "total_memory_available (GB)": 94.62
1010
  }
1011
  ],
1012
  "logging_steps": 10,
1013
+ "max_steps": 2022,
1014
  "num_input_tokens_seen": 0,
1015
  "num_train_epochs": 2,
1016
  "save_steps": 1000,
checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93e2964507a7ee2a283815dc99898c3c891ee7684ce7926ce108452bc498151d
3
- size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b6e8859324ecd7d4cabf5785e4c14760758a590bc4da42d43455de464ecb58
3
+ size 5880
checkpoint-2000/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+ ### Framework versions
7
+
8
+
9
+ - PEFT 0.4.0
checkpoint-2000/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 8,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "q_proj",
18
+ "v_proj",
19
+ "o_proj",
20
+ "k_proj"
21
+ ],
22
+ "task_type": "CAUSAL_LM"
23
+ }
checkpoint-2000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb117a1cf2fc42a861e73affe771f35e3d2d51f081d5bfe2ff89e443467df952
3
+ size 13665592
checkpoint-2000/gaudi_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "autocast_bf16_ops": null,
3
+ "autocast_fp32_ops": null,
4
+ "optimum_version": "1.20.0",
5
+ "transformers_version": "4.38.2",
6
+ "use_dynamic_shapes": false,
7
+ "use_fused_adam": true,
8
+ "use_fused_clip_norm": true,
9
+ "use_torch_autocast": false
10
+ }
checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd44951050f26ebc4aa37181bde6fbdae39e5bc24925768b3297d59014814bf9
3
+ size 10229904
checkpoint-2000/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c81e5eb6f8fb0cd3a1f453ed7f07e4d7cc1e072f70b435265c54a14ec3942927
3
+ size 10229904
checkpoint-2000/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1befcff6836a413ddbcd6dcc7c4c93355b6e00aee0ce2ec517a51e768c247a0
3
+ size 10229904
checkpoint-2000/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:851f0dec6528943d2147a3569e7d508196bbe41d416a38fac86aaff828d7445e
3
+ size 10229904
checkpoint-2000/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:237c21ede3374d2842cf6fec23e61643f2ea678a328182e0a185551d3bd42250
3
+ size 10229904
checkpoint-2000/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:410046cc577e9c0bdcb980066951b5a424b1ba59de8162b31b39b0d209b0bce4
3
+ size 10229904
checkpoint-2000/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d62a67c257991f682da6c2a289846735b7cc95dc18adbfd96e102b3ef29a8bff
3
+ size 10229904
checkpoint-2000/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1f791eec6fc0ee0ba321b0158fa103da507d4b66df52e5c83ba736e6370cace
3
+ size 10229904
checkpoint-2000/global_step2000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ef7324ff623840a10639dd6caa4026b284da5537ed671f5b38c67e38bc5ad09
3
+ size 13740018
checkpoint-2000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step2000
checkpoint-2000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5543e0754fe9df260bd82cfa7e571704fd10d14732fd9f105223113b609efbc2
3
+ size 17968
checkpoint-2000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7f739514ec0dc0df61d6d839eef9ff980a82e135422879f07e2cf78ae941894
3
+ size 17968
checkpoint-2000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15f56123a00985d37c3d262835e83f25048a8316afa24ac66747eea53d775b20
3
+ size 17968
checkpoint-2000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dab859d8ea187b03a86f8103eb04cb46964db4a4f623eb4ea44cce5fa97a5d32
3
+ size 17968
checkpoint-2000/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73a442b5a5145fccb3b785bb3d21d5a0a0d98db2bf3cce065c3a3341d9ceebcb
3
+ size 17968
checkpoint-2000/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d4f4621a0fbf3451fbaa425a77d700278bcc178c26328ec4ede18bfb01f76ee
3
+ size 17968
checkpoint-2000/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14ee6a4d4a973ce22b281fd2f4441fc97805ef0bdfab68d48d7ac7cd802b0a17
3
+ size 17968
checkpoint-2000/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3effc8943473a84c5671504d52ab42275bd7891696938a4991b4095fdf0e6c71
3
+ size 17968
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "</s>",
5
+ "unk_token": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ }
12
+ }
checkpoint-2000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ }
29
+ },
30
+ "additional_special_tokens": [],
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": true,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "</s>",
37
+ "sp_model_kwargs": {},
38
+ "spaces_between_special_tokens": false,
39
+ "tokenizer_class": "LlamaTokenizer",
40
+ "unk_token": "<unk>",
41
+ "use_default_system_prompt": false
42
+ }