minmingzhu02 commited on Jun 6

Commit

7958e73

•

1 Parent(s): 7684115

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +1 -0
adapter_model.safetensors +1 -1
all_results.json +5 -5
checkpoint-1000/README.md +1 -0
checkpoint-1000/adapter_model.safetensors +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
checkpoint-1000/global_step1000/mp_rank_00_model_states.pt +1 -1
checkpoint-1000/rng_state_0.pth +2 -2
checkpoint-1000/rng_state_1.pth +2 -2
checkpoint-1000/rng_state_2.pth +2 -2
checkpoint-1000/rng_state_3.pth +2 -2
checkpoint-1000/rng_state_4.pth +2 -2
checkpoint-1000/rng_state_5.pth +2 -2
checkpoint-1000/rng_state_6.pth +2 -2
checkpoint-1000/rng_state_7.pth +2 -2
checkpoint-1000/trainer_state.json +483 -483
checkpoint-1000/training_args.bin +2 -2
checkpoint-2000/README.md +9 -0
checkpoint-2000/adapter_config.json +23 -0
checkpoint-2000/adapter_model.safetensors +3 -0
checkpoint-2000/gaudi_config.json +10 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
checkpoint-2000/global_step2000/mp_rank_00_model_states.pt +3 -0
checkpoint-2000/latest +1 -0
checkpoint-2000/rng_state_0.pth +3 -0
checkpoint-2000/rng_state_1.pth +3 -0
checkpoint-2000/rng_state_2.pth +3 -0
checkpoint-2000/rng_state_3.pth +3 -0
checkpoint-2000/rng_state_4.pth +3 -0
checkpoint-2000/rng_state_5.pth +3 -0
checkpoint-2000/rng_state_6.pth +3 -0
checkpoint-2000/rng_state_7.pth +3 -0
checkpoint-2000/special_tokens_map.json +12 -0
checkpoint-2000/tokenizer.json +0 -0
checkpoint-2000/tokenizer.model +3 -0
checkpoint-2000/tokenizer_config.json +42 -0

README.md CHANGED Viewed

@@ -5,5 +5,6 @@ library_name: peft
 ### Framework versions
 - PEFT 0.4.0

 ### Framework versions
+- PEFT 0.4.0
 - PEFT 0.4.0

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5ea5492a2629a41a3834cf2de2d413d3c30977c44b370a4bdc94bfa086eb6f04
 size 13665592

 version https://git-lfs.github.com/spec/v1
+oid sha256:a40df01fec581a7cd1da2c66520eb45000ec0730bd41494a3203415bcacf33e3
 size 13665592

all_results.json CHANGED Viewed

@@ -6,12 +6,12 @@
     "eval_samples": 25899,
     "eval_samples_per_second": 160.844,
     "eval_steps_per_second": 2.515,
-    "max_memory_allocated (GB)": 91.87,
     "memory_allocated (GB)": 24.39,
     "perplexity": 3.0531438702149534,
     "total_memory_available (GB)": 94.62,
-    "train_loss": 1.0469276263736165,
-    "train_runtime": 6085.8982,
-    "train_samples_per_second": 162.542,
-    "train_steps_per_second": 0.317
 }

     "eval_samples": 25899,
     "eval_samples_per_second": 160.844,
     "eval_steps_per_second": 2.515,
+    "max_memory_allocated (GB)": 91.9,
     "memory_allocated (GB)": 24.39,
     "perplexity": 3.0531438702149534,
     "total_memory_available (GB)": 94.62,
+    "train_loss": 1.050027716289524,
+    "train_runtime": 6424.4949,
+    "train_samples_per_second": 162.078,
+    "train_steps_per_second": 0.316
 }

checkpoint-1000/README.md CHANGED Viewed

@@ -5,5 +5,6 @@ library_name: peft
 ### Framework versions
 - PEFT 0.4.0

 ### Framework versions
+- PEFT 0.4.0
 - PEFT 0.4.0

checkpoint-1000/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:81c901e3a02306c80837c992a65b028aa5720bd76546736f6b293eb7dfc7140c
 size 13665592

 version https://git-lfs.github.com/spec/v1
+oid sha256:5c40dc4d08aadcdaf95d2045cc57b08ba3438ccc3bb482464c0f4fc60165266e
 size 13665592

checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fbcc1217cb0120aeb98ec1de36b0d9911840455125d2ac021d411340695b0e41
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:d46d6c3a35c5aa1f0d69f23137498ba05dcec1b85ab8f758931bb9d3a9c61f34
 size 10229904

checkpoint-1000/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b97ccb06cab86f7787aa54118479ed5365ea36124fa165035de40978981d7d90
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:9605b32dc9734841f4135f4b05d12dd42f4d99c4361a0351f4237cad48f90be3
 size 10229904

checkpoint-1000/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0e88a8703d82aca1a49db083c3389258661948ca907ccc2e9cb111b1551ca4b4
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:5876ec371de4435d924cc505aa32eb3dad219f45fc6663e1b1b7d8e6b0acf67f
 size 10229904

checkpoint-1000/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e427e683a4b2c0eaf3f92eeb4d17130f2db3cf91beb49cdec09db31cd03bc5d0
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:9e300264d099e85c2eaec5813361ea1324250f75f2b589f945f2f835b83af40d
 size 10229904

checkpoint-1000/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:447bcfa260f2ca4e01eab0355a25bc057c887425ee46599c03014702139424e9
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:b16bb1a71dc108e5f58765fb6631fd6dac31c9f448d7d14103ca456c36f952ce
 size 10229904

checkpoint-1000/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2f82c584aafdcfd6f83b4a382b39b2a21a1c6d328e15b598c757f0bbd57d7f40
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:7f7fc624f7d0466934102a8e291439878ba39384d242fee71905534ec7904e06
 size 10229904

checkpoint-1000/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed340b57ccacdb58d19cb98548194affef267ef083941da271ddb8bda6982c71
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:f4f0723c2be1778d263d0b01b0b9d4e12020ffdf1dd81d6905cc10e8fa92d41d
 size 10229904

checkpoint-1000/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e1108862ec93ac55212f8e0a1b8f3e51f7907e476768c008d36049e3882d3c6
 size 10229904

 version https://git-lfs.github.com/spec/v1
+oid sha256:24a0be42f2cadbc3fe2cb110dd10263c4641ceea734da5d7e75eba26e2901d3c
 size 10229904

checkpoint-1000/global_step1000/mp_rank_00_model_states.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f7eaa5aa673d8036fda2b25512f384ce627430c73f15341bdbc6c9ea4bbcccca
 size 13740018

 version https://git-lfs.github.com/spec/v1
+oid sha256:fb5bdbbb926d99fb70d7fc5227430f4f4f997a514d20837bc7e19ed2d1ae9e08
 size 13740018

checkpoint-1000/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da3853db45e30a134bde6283db65f4d2e12b4f06b534a780433dd598db8504cb
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:a15cf27d7c1440499566c0754bb1d50c83a5017b6e7fa437ea341504ae66e2b6
+size 17968

checkpoint-1000/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d0664a5f5d0f31c8412dfa2b9c211689c20cc952d4ce4660ee1761e2ec10776d
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:825d61abc5a5ad521ad84ebb500b93ae8a8623f9369ad48cbe6d38bab5442ff1
+size 17968

checkpoint-1000/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8418b1622ae3a1586b0d267b3c8c1d9d5d403a56cc73434016d717b501b2c0fd
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:ac10f0c3ac84964c70d9fd60ba3e1f3ba4b539d4b6c1ae404241006617438fc1
+size 17968

checkpoint-1000/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:51d300bdb34d5407f13acbfb278589f1a2907891fba4e020420c6053746cf8e2
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:8954554f29ca86b2a2d0065650e4ed1767ee8c977ff3c3089f50b821a3a8fea1
+size 17968

checkpoint-1000/rng_state_4.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9d597fad31002fa89ec1fd8aefe8f76311a32275c74a84439354f630b9b001a4
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:32cae0c2f26eab0ed7620c9d8a8c01759ff3f2406e4eca977af345db9260dfbc
+size 17968

checkpoint-1000/rng_state_5.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1fbf5d648175322bc9d62f67af45fa2e710bdc93b49adb768d8df629d2b0347f
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:c8b789b47341496ea469886b9092dd0722b325942d9e0fa64277b02e5c6a462a
+size 17968

checkpoint-1000/rng_state_6.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fcfd1907de0ce290b15c0e3cda02ca34b46efde1a255be0c9b8e383cbf93e363
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:211ce782229e6a2d3e1400311a29875805358cdd93d4b0eb3697d599fa847ccb
+size 17968

checkpoint-1000/rng_state_7.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6a4d45767ae6a4db5ed86b012add567287d0976c731678dabe7dedeb253ea63b
-size 18032

 version https://git-lfs.github.com/spec/v1
+oid sha256:b712b22f1679953e1505d7e1d8f16de7d3f9052abc5168daadd90c918870ab95
+size 17968

checkpoint-1000/trainer_state.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.0404473923787227,
   "eval_steps": 500,
   "global_step": 1000,
   "is_hyper_param_search": false,
@@ -10,9 +10,9 @@
   "log_history": [
     {
       "epoch": 0.01,
-      "grad_norm": 0.8564500212669373,
-      "learning_rate": 5.670773083167062e-05,
-      "loss": 1.7378,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 10,
@@ -20,9 +20,9 @@
     },
     {
       "epoch": 0.02,
-      "grad_norm": 0.9088606238365173,
-      "learning_rate": 7.377845879804262e-05,
-      "loss": 1.5183,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 20,
@@ -30,9 +30,9 @@
     },
     {
       "epoch": 0.03,
-      "grad_norm": 0.396087646484375,
-      "learning_rate": 8.376419451838216e-05,
-      "loss": 1.3578,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 30,
@@ -40,9 +40,9 @@
     },
     {
       "epoch": 0.04,
-      "grad_norm": 0.2860943078994751,
-      "learning_rate": 9.084918676441463e-05,
-      "loss": 1.2832,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 40,
@@ -50,9 +50,9 @@
     },
     {
       "epoch": 0.05,
-      "grad_norm": 0.2048167884349823,
-      "learning_rate": 9.634473369696918e-05,
-      "loss": 1.2717,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 50,
@@ -60,9 +60,9 @@
     },
     {
       "epoch": 0.06,
-      "grad_norm": 0.2355910986661911,
-      "learning_rate": 9.994635193133047e-05,
-      "loss": 1.2359,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 60,
@@ -70,947 +70,947 @@
     },
     {
       "epoch": 0.07,
-      "grad_norm": 0.25649788975715637,
-      "learning_rate": 9.94098712446352e-05,
-      "loss": 1.2202,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 70,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.08,
-      "grad_norm": 0.3248240649700165,
-      "learning_rate": 9.887339055793991e-05,
-      "loss": 1.1928,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 80,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.09,
-      "grad_norm": 0.34330031275749207,
-      "learning_rate": 9.833690987124465e-05,
-      "loss": 1.1839,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 90,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.1,
-      "grad_norm": 0.481341689825058,
-      "learning_rate": 9.780042918454936e-05,
-      "loss": 1.1682,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 100,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.11,
-      "grad_norm": 0.33835262060165405,
-      "learning_rate": 9.726394849785409e-05,
-      "loss": 1.1528,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 110,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.12,
-      "grad_norm": 0.5484705567359924,
-      "learning_rate": 9.67274678111588e-05,
-      "loss": 1.1511,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 120,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 0.3785193860530853,
-      "learning_rate": 9.619098712446352e-05,
-      "loss": 1.152,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 130,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 0.36997660994529724,
-      "learning_rate": 9.565450643776824e-05,
-      "loss": 1.1489,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 140,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 0.5300387144088745,
-      "learning_rate": 9.511802575107297e-05,
-      "loss": 1.1326,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 150,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.17,
-      "grad_norm": 0.5168531537055969,
-      "learning_rate": 9.458154506437769e-05,
-      "loss": 1.1253,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 160,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 0.48498621582984924,
-      "learning_rate": 9.404506437768241e-05,
-      "loss": 1.1079,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 170,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 0.4298243522644043,
-      "learning_rate": 9.350858369098713e-05,
-      "loss": 1.0987,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 180,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 0.4342297315597534,
-      "learning_rate": 9.297210300429185e-05,
-      "loss": 1.0975,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 190,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.21,
-      "grad_norm": 0.5405712127685547,
-      "learning_rate": 9.243562231759658e-05,
-      "loss": 1.1084,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 200,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 0.4317038655281067,
-      "learning_rate": 9.189914163090128e-05,
-      "loss": 1.0969,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 210,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 0.41190406680107117,
-      "learning_rate": 9.136266094420602e-05,
       "loss": 1.0992,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 220,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 0.5093066692352295,
-      "learning_rate": 9.082618025751073e-05,
-      "loss": 1.0875,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 230,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.25,
-      "grad_norm": 0.40610459446907043,
-      "learning_rate": 9.028969957081545e-05,
-      "loss": 1.0886,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 240,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 0.38791623711586,
-      "learning_rate": 8.975321888412017e-05,
-      "loss": 1.0783,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 250,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 0.3828742206096649,
-      "learning_rate": 8.92167381974249e-05,
-      "loss": 1.0816,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 260,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 0.5216621160507202,
-      "learning_rate": 8.868025751072962e-05,
-      "loss": 1.0731,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 270,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.29,
-      "grad_norm": 0.43531420826911926,
-      "learning_rate": 8.814377682403434e-05,
-      "loss": 1.0804,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 280,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 0.41790470480918884,
-      "learning_rate": 8.760729613733906e-05,
-      "loss": 1.0785,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 290,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.31,
-      "grad_norm": 0.456264466047287,
-      "learning_rate": 8.707081545064378e-05,
-      "loss": 1.0708,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 300,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 0.3793538212776184,
-      "learning_rate": 8.65343347639485e-05,
-      "loss": 1.0818,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 310,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.33,
-      "grad_norm": 0.37741824984550476,
-      "learning_rate": 8.599785407725323e-05,
-      "loss": 1.0669,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 320,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 0.3576098084449768,
-      "learning_rate": 8.546137339055795e-05,
-      "loss": 1.0678,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 330,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.35,
-      "grad_norm": 0.4295920133590698,
-      "learning_rate": 8.492489270386267e-05,
-      "loss": 1.0571,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 340,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 0.38484010100364685,
-      "learning_rate": 8.438841201716738e-05,
-      "loss": 1.0697,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 350,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.37,
-      "grad_norm": 0.3806072175502777,
-      "learning_rate": 8.385193133047211e-05,
-      "loss": 1.0652,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 360,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 0.3507857918739319,
-      "learning_rate": 8.331545064377682e-05,
-      "loss": 1.061,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 370,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 0.3869009017944336,
-      "learning_rate": 8.277896995708156e-05,
-      "loss": 1.0622,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 380,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 0.3941822648048401,
-      "learning_rate": 8.224248927038627e-05,
-      "loss": 1.0629,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 390,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 0.3660201132297516,
-      "learning_rate": 8.1706008583691e-05,
-      "loss": 1.0589,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 400,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.43,
-      "grad_norm": 0.3641981780529022,
-      "learning_rate": 8.116952789699571e-05,
-      "loss": 1.0611,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 410,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.44,
-      "grad_norm": 0.38291656970977783,
-      "learning_rate": 8.063304721030043e-05,
-      "loss": 1.0554,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 420,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 0.3162360191345215,
-      "learning_rate": 8.009656652360515e-05,
-      "loss": 1.0504,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 430,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 0.3741254508495331,
-      "learning_rate": 7.956008583690988e-05,
-      "loss": 1.0525,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 440,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 0.3503776788711548,
-      "learning_rate": 7.90236051502146e-05,
-      "loss": 1.0554,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 450,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 0.3659127652645111,
-      "learning_rate": 7.848712446351931e-05,
-      "loss": 1.0464,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 460,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 0.3401431143283844,
-      "learning_rate": 7.795064377682404e-05,
-      "loss": 1.0596,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 470,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.33970314264297485,
-      "learning_rate": 7.741416309012875e-05,
-      "loss": 1.0659,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 480,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.51,
-      "grad_norm": 0.40222179889678955,
-      "learning_rate": 7.687768240343349e-05,
-      "loss": 1.0485,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 490,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.52,
-      "grad_norm": 0.3137299716472626,
-      "learning_rate": 7.63412017167382e-05,
-      "loss": 1.0506,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 500,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.53,
-      "grad_norm": 0.3413981795310974,
-      "learning_rate": 7.580472103004293e-05,
-      "loss": 1.0466,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 510,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.54,
-      "grad_norm": 0.3440905511379242,
-      "learning_rate": 7.526824034334764e-05,
-      "loss": 1.0503,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 520,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.55,
-      "grad_norm": 0.367201030254364,
-      "learning_rate": 7.473175965665236e-05,
-      "loss": 1.0449,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 530,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.56,
-      "grad_norm": 0.37777239084243774,
-      "learning_rate": 7.419527896995708e-05,
-      "loss": 1.0514,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 540,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.57,
-      "grad_norm": 0.355307936668396,
-      "learning_rate": 7.36587982832618e-05,
-      "loss": 1.0458,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 550,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.58,
-      "grad_norm": 0.43950700759887695,
-      "learning_rate": 7.312231759656653e-05,
-      "loss": 1.0426,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 560,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.59,
-      "grad_norm": 0.31208208203315735,
-      "learning_rate": 7.258583690987125e-05,
-      "loss": 1.0432,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 570,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.6,
-      "grad_norm": 0.3358898460865021,
-      "learning_rate": 7.204935622317597e-05,
-      "loss": 1.0563,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 580,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.61,
-      "grad_norm": 0.3219963610172272,
-      "learning_rate": 7.15128755364807e-05,
-      "loss": 1.0402,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 590,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.62,
-      "grad_norm": 0.3409494161605835,
-      "learning_rate": 7.097639484978542e-05,
-      "loss": 1.0452,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 600,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.63,
-      "grad_norm": 0.3917517364025116,
-      "learning_rate": 7.043991416309014e-05,
-      "loss": 1.0358,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 610,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.65,
-      "grad_norm": 0.36202436685562134,
-      "learning_rate": 6.990343347639486e-05,
-      "loss": 1.0421,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 620,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.66,
-      "grad_norm": 0.34764185547828674,
-      "learning_rate": 6.936695278969958e-05,
-      "loss": 1.0414,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 630,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.67,
-      "grad_norm": 0.3368031084537506,
-      "learning_rate": 6.883047210300429e-05,
-      "loss": 1.0328,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 640,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.68,
-      "grad_norm": 0.3216454088687897,
-      "learning_rate": 6.829399141630901e-05,
-      "loss": 1.0368,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 650,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.69,
-      "grad_norm": 0.36135369539260864,
-      "learning_rate": 6.775751072961373e-05,
-      "loss": 1.0375,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 660,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 0.3772068917751312,
-      "learning_rate": 6.722103004291846e-05,
-      "loss": 1.0384,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 670,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.71,
-      "grad_norm": 0.352990984916687,
-      "learning_rate": 6.668454935622318e-05,
-      "loss": 1.0347,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 680,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.72,
-      "grad_norm": 0.38070163130760193,
-      "learning_rate": 6.61480686695279e-05,
-      "loss": 1.0414,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 690,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.73,
-      "grad_norm": 0.3262191414833069,
-      "learning_rate": 6.561158798283262e-05,
-      "loss": 1.0427,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 700,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.74,
-      "grad_norm": 0.37472084164619446,
-      "learning_rate": 6.507510729613734e-05,
-      "loss": 1.035,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 710,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.75,
-      "grad_norm": 0.3830932676792145,
-      "learning_rate": 6.453862660944207e-05,
-      "loss": 1.0329,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 720,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.76,
-      "grad_norm": 0.33351582288742065,
-      "learning_rate": 6.400214592274679e-05,
-      "loss": 1.0338,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 730,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.77,
-      "grad_norm": 0.30762043595314026,
-      "learning_rate": 6.346566523605151e-05,
-      "loss": 1.0452,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 740,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.78,
-      "grad_norm": 0.3884871006011963,
-      "learning_rate": 6.292918454935622e-05,
-      "loss": 1.0387,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 750,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.79,
-      "grad_norm": 0.34226515889167786,
-      "learning_rate": 6.239270386266095e-05,
-      "loss": 1.0282,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 760,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.3282904624938965,
-      "learning_rate": 6.185622317596566e-05,
-      "loss": 1.0266,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 770,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.81,
-      "grad_norm": 0.34865814447402954,
-      "learning_rate": 6.13197424892704e-05,
-      "loss": 1.0369,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 780,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.82,
-      "grad_norm": 0.34097954630851746,
-      "learning_rate": 6.0783261802575106e-05,
-      "loss": 1.0241,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 790,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.83,
-      "grad_norm": 0.342785120010376,
-      "learning_rate": 6.0246781115879835e-05,
-      "loss": 1.0289,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 800,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.84,
-      "grad_norm": 0.33090144395828247,
-      "learning_rate": 5.971030042918455e-05,
-      "loss": 1.0335,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 810,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.85,
-      "grad_norm": 0.38638409972190857,
-      "learning_rate": 5.917381974248928e-05,
-      "loss": 1.0272,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 820,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.86,
-      "grad_norm": 0.34252798557281494,
-      "learning_rate": 5.8637339055793994e-05,
-      "loss": 1.0284,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 830,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.87,
-      "grad_norm": 0.3451687693595886,
-      "learning_rate": 5.810085836909872e-05,
-      "loss": 1.0212,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 840,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.88,
-      "grad_norm": 0.384164422750473,
-      "learning_rate": 5.756437768240344e-05,
-      "loss": 1.0347,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 850,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.89,
-      "grad_norm": 0.3310836851596832,
-      "learning_rate": 5.7027896995708154e-05,
-      "loss": 1.0153,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 860,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.91,
-      "grad_norm": 0.3287970721721649,
-      "learning_rate": 5.6491416309012876e-05,
-      "loss": 1.0386,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 870,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.92,
-      "grad_norm": 0.3661152720451355,
-      "learning_rate": 5.59549356223176e-05,
-      "loss": 1.0195,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 880,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.93,
-      "grad_norm": 0.39101266860961914,
-      "learning_rate": 5.541845493562232e-05,
-      "loss": 1.0285,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 890,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.94,
-      "grad_norm": 0.38129398226737976,
-      "learning_rate": 5.4881974248927035e-05,
-      "loss": 1.0375,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 900,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.95,
-      "grad_norm": 0.32832667231559753,
-      "learning_rate": 5.4345493562231764e-05,
-      "loss": 1.0229,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 910,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.96,
-      "grad_norm": 0.35413920879364014,
-      "learning_rate": 5.380901287553648e-05,
-      "loss": 1.024,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 920,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.97,
-      "grad_norm": 0.3270743787288666,
-      "learning_rate": 5.327253218884121e-05,
-      "loss": 1.0319,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 930,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.98,
-      "grad_norm": 0.33611926436424255,
-      "learning_rate": 5.273605150214592e-05,
-      "loss": 1.0198,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 940,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 0.99,
-      "grad_norm": 0.34108686447143555,
-      "learning_rate": 5.219957081545065e-05,
-      "loss": 1.0294,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 950,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.3304935395717621,
-      "learning_rate": 5.166309012875537e-05,
-      "loss": 1.0264,
-      "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 960,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 1.01,
-      "grad_norm": 0.3912031054496765,
-      "learning_rate": 5.112660944206009e-05,
-      "loss": 1.0192,
-      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 970,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 1.02,
-      "grad_norm": 0.33931615948677063,
-      "learning_rate": 5.0590128755364804e-05,
-      "loss": 1.0242,
-      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 980,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 1.03,
-      "grad_norm": 0.3690173923969269,
-      "learning_rate": 5.005364806866953e-05,
-      "loss": 1.0202,
-      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 990,
       "total_memory_available (GB)": 94.62
     },
     {
-      "epoch": 1.04,
-      "grad_norm": 0.35189080238342285,
-      "learning_rate": 4.951716738197425e-05,
-      "loss": 1.0206,
-      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 1000,
       "total_memory_available (GB)": 94.62
     }
   ],
   "logging_steps": 10,
-  "max_steps": 1922,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.9883864591055103,
   "eval_steps": 500,
   "global_step": 1000,
   "is_hyper_param_search": false,
   "log_history": [
     {
       "epoch": 0.01,
+      "grad_norm": 0.8518019318580627,
+      "learning_rate": 5.6012058970266934e-05,
+      "loss": 1.7421,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 10,
     },
     {
       "epoch": 0.02,
+      "grad_norm": 0.8390570878982544,
+      "learning_rate": 7.287336883921704e-05,
+      "loss": 1.5281,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 20,
     },
     {
       "epoch": 0.03,
+      "grad_norm": 0.3700675666332245,
+      "learning_rate": 8.273660282559241e-05,
+      "loss": 1.3485,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 30,
     },
     {
       "epoch": 0.04,
+      "grad_norm": 0.3168916404247284,
+      "learning_rate": 8.973467870816715e-05,
+      "loss": 1.2968,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 40,
     },
     {
       "epoch": 0.05,
+      "grad_norm": 0.24861344695091248,
+      "learning_rate": 9.516280807158375e-05,
+      "loss": 1.2689,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 50,
     },
     {
       "epoch": 0.06,
+      "grad_norm": 0.22251686453819275,
+      "learning_rate": 9.959791269454252e-05,
+      "loss": 1.2434,
       "max_memory_allocated (GB)": 91.86,
       "memory_allocated (GB)": 24.39,
       "step": 60,
     },
     {
       "epoch": 0.07,
+      "grad_norm": 0.23426611721515656,
+      "learning_rate": 9.959204487506375e-05,
+      "loss": 1.2152,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 70,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.08,
+      "grad_norm": 0.45850667357444763,
+      "learning_rate": 9.908210096889343e-05,
+      "loss": 1.2108,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 80,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.09,
+      "grad_norm": 0.4196653366088867,
+      "learning_rate": 9.85721570627231e-05,
+      "loss": 1.1913,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 90,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.1,
+      "grad_norm": 0.5248636603355408,
+      "learning_rate": 9.806221315655279e-05,
+      "loss": 1.1924,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 100,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.11,
+      "grad_norm": 0.3434283137321472,
+      "learning_rate": 9.755226925038246e-05,
+      "loss": 1.1558,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 110,
       "total_memory_available (GB)": 94.62
     },
     {
       "epoch": 0.12,
+      "grad_norm": 0.47737815976142883,
+      "learning_rate": 9.704232534421214e-05,
+      "loss": 1.1492,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 120,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.13,
+      "grad_norm": 0.47788286209106445,
+      "learning_rate": 9.653238143804181e-05,
+      "loss": 1.1486,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 130,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.14,
+      "grad_norm": 0.45408132672309875,
+      "learning_rate": 9.60224375318715e-05,
+      "loss": 1.1456,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 140,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.15,
+      "grad_norm": 0.4091607630252838,
+      "learning_rate": 9.551249362570118e-05,
+      "loss": 1.1365,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 150,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.16,
+      "grad_norm": 0.5064594745635986,
+      "learning_rate": 9.500254971953085e-05,
+      "loss": 1.137,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 160,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.17,
+      "grad_norm": 0.4288266897201538,
+      "learning_rate": 9.449260581336054e-05,
+      "loss": 1.1181,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 170,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.18,
+      "grad_norm": 0.3854447901248932,
+      "learning_rate": 9.398266190719021e-05,
+      "loss": 1.1091,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 180,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.19,
+      "grad_norm": 0.4143249988555908,
+      "learning_rate": 9.347271800101989e-05,
+      "loss": 1.1156,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 190,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 0.521230161190033,
+      "learning_rate": 9.296277409484956e-05,
+      "loss": 1.1117,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 200,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.21,
+      "grad_norm": 0.487106055021286,
+      "learning_rate": 9.245283018867925e-05,
+      "loss": 1.1003,
+      "max_memory_allocated (GB)": 91.87,
       "memory_allocated (GB)": 24.39,
       "step": 210,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.22,
+      "grad_norm": 0.4616335928440094,
+      "learning_rate": 9.194288628250894e-05,
       "loss": 1.0992,
+      "max_memory_allocated (GB)": 91.88,
       "memory_allocated (GB)": 24.39,
       "step": 220,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.23,
+      "grad_norm": 0.3908211290836334,
+      "learning_rate": 9.14329423763386e-05,
+      "loss": 1.1074,
+      "max_memory_allocated (GB)": 91.88,
       "memory_allocated (GB)": 24.39,
       "step": 230,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 0.4411673843860626,
+      "learning_rate": 9.092299847016829e-05,
+      "loss": 1.1055,
+      "max_memory_allocated (GB)": 91.88,
       "memory_allocated (GB)": 24.39,
       "step": 240,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.25,
+      "grad_norm": 0.4123621881008148,
+      "learning_rate": 9.041305456399796e-05,
+      "loss": 1.0883,
+      "max_memory_allocated (GB)": 91.88,
       "memory_allocated (GB)": 24.39,
       "step": 250,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 0.5461438298225403,
+      "learning_rate": 8.990311065782764e-05,
+      "loss": 1.0928,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 260,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.27,
+      "grad_norm": 0.4907448887825012,
+      "learning_rate": 8.939316675165733e-05,
+      "loss": 1.0912,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 270,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.28,
+      "grad_norm": 0.45152169466018677,
+      "learning_rate": 8.8883222845487e-05,
+      "loss": 1.0891,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 280,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.29,
+      "grad_norm": 0.41472557187080383,
+      "learning_rate": 8.837327893931669e-05,
+      "loss": 1.0864,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 290,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.3,
+      "grad_norm": 0.45566004514694214,
+      "learning_rate": 8.786333503314635e-05,
+      "loss": 1.0776,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 300,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.31,
+      "grad_norm": 0.3909231424331665,
+      "learning_rate": 8.735339112697604e-05,
+      "loss": 1.0801,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 310,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 0.39705774188041687,
+      "learning_rate": 8.684344722080571e-05,
+      "loss": 1.0746,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 320,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.33,
+      "grad_norm": 0.4257935881614685,
+      "learning_rate": 8.633350331463539e-05,
+      "loss": 1.0738,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 330,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.34,
+      "grad_norm": 0.41336777806282043,
+      "learning_rate": 8.582355940846507e-05,
+      "loss": 1.0811,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 340,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.35,
+      "grad_norm": 0.3621828854084015,
+      "learning_rate": 8.531361550229475e-05,
+      "loss": 1.0762,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 350,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 0.398189902305603,
+      "learning_rate": 8.480367159612444e-05,
+      "loss": 1.0622,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 360,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.37,
+      "grad_norm": 0.37738627195358276,
+      "learning_rate": 8.42937276899541e-05,
+      "loss": 1.06,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 370,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.38,
+      "grad_norm": 0.40790703892707825,
+      "learning_rate": 8.378378378378379e-05,
+      "loss": 1.0768,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 380,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.39,
+      "grad_norm": 0.35230451822280884,
+      "learning_rate": 8.327383987761347e-05,
+      "loss": 1.0631,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 390,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.4,
+      "grad_norm": 0.37737661600112915,
+      "learning_rate": 8.276389597144315e-05,
+      "loss": 1.0665,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 400,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.41,
+      "grad_norm": 0.39823117852211,
+      "learning_rate": 8.225395206527282e-05,
+      "loss": 1.0739,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 410,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.42,
+      "grad_norm": 0.38277310132980347,
+      "learning_rate": 8.17440081591025e-05,
+      "loss": 1.07,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 420,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.43,
+      "grad_norm": 0.34220802783966064,
+      "learning_rate": 8.123406425293219e-05,
+      "loss": 1.0698,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 430,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.43,
+      "grad_norm": 0.3858403265476227,
+      "learning_rate": 8.072412034676186e-05,
+      "loss": 1.0488,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 440,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 0.36855727434158325,
+      "learning_rate": 8.021417644059154e-05,
+      "loss": 1.0612,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 450,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.45,
+      "grad_norm": 0.4122312664985657,
+      "learning_rate": 7.970423253442122e-05,
+      "loss": 1.0566,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 460,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.46,
+      "grad_norm": 0.38682645559310913,
+      "learning_rate": 7.91942886282509e-05,
+      "loss": 1.0575,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 470,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.47,
+      "grad_norm": 0.38858598470687866,
+      "learning_rate": 7.868434472208057e-05,
+      "loss": 1.0579,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 480,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 0.3749813139438629,
+      "learning_rate": 7.817440081591025e-05,
+      "loss": 1.0531,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 490,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.49,
+      "grad_norm": 0.36404120922088623,
+      "learning_rate": 7.766445690973994e-05,
+      "loss": 1.0447,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 500,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.5,
+      "grad_norm": 0.4445250332355499,
+      "learning_rate": 7.715451300356961e-05,
+      "loss": 1.0526,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 510,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 0.3644183278083801,
+      "learning_rate": 7.664456909739929e-05,
+      "loss": 1.0494,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 520,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.52,
+      "grad_norm": 0.34624868631362915,
+      "learning_rate": 7.613462519122897e-05,
+      "loss": 1.0572,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 530,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.53,
+      "grad_norm": 0.3788256347179413,
+      "learning_rate": 7.562468128505865e-05,
+      "loss": 1.0502,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 540,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.54,
+      "grad_norm": 0.3667903542518616,
+      "learning_rate": 7.511473737888832e-05,
+      "loss": 1.0505,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 550,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.55,
+      "grad_norm": 0.37510526180267334,
+      "learning_rate": 7.460479347271801e-05,
+      "loss": 1.045,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 560,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 0.3509054183959961,
+      "learning_rate": 7.409484956654769e-05,
+      "loss": 1.0504,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 570,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.57,
+      "grad_norm": 0.3294220268726349,
+      "learning_rate": 7.358490566037736e-05,
+      "loss": 1.0573,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 580,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 0.34325262904167175,
+      "learning_rate": 7.307496175420703e-05,
+      "loss": 1.0445,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 590,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.59,
+      "grad_norm": 0.3679581880569458,
+      "learning_rate": 7.256501784803672e-05,
+      "loss": 1.0445,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 600,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 0.3220757246017456,
+      "learning_rate": 7.20550739418664e-05,
+      "loss": 1.0458,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 610,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 0.3660426139831543,
+      "learning_rate": 7.154513003569607e-05,
+      "loss": 1.0447,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 620,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.62,
+      "grad_norm": 0.32533150911331177,
+      "learning_rate": 7.103518612952576e-05,
+      "loss": 1.0471,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 630,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.63,
+      "grad_norm": 0.33115923404693604,
+      "learning_rate": 7.052524222335543e-05,
+      "loss": 1.0431,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 640,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.64,
+      "grad_norm": 0.337576299905777,
+      "learning_rate": 7.001529831718512e-05,
+      "loss": 1.057,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 650,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.65,
+      "grad_norm": 0.3337574303150177,
+      "learning_rate": 6.950535441101478e-05,
+      "loss": 1.0408,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 660,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.66,
+      "grad_norm": 0.35560840368270874,
+      "learning_rate": 6.899541050484447e-05,
+      "loss": 1.0363,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 670,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.67,
+      "grad_norm": 0.3454528748989105,
+      "learning_rate": 6.848546659867415e-05,
+      "loss": 1.0412,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 680,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 0.3608352541923523,
+      "learning_rate": 6.797552269250382e-05,
+      "loss": 1.0578,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 690,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.69,
+      "grad_norm": 0.332792729139328,
+      "learning_rate": 6.746557878633351e-05,
+      "loss": 1.0426,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 700,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.7,
+      "grad_norm": 0.4059067666530609,
+      "learning_rate": 6.695563488016318e-05,
+      "loss": 1.0441,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 710,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.71,
+      "grad_norm": 0.3741580843925476,
+      "learning_rate": 6.644569097399287e-05,
+      "loss": 1.0525,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 720,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 0.3646301329135895,
+      "learning_rate": 6.593574706782255e-05,
+      "loss": 1.0302,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 730,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.73,
+      "grad_norm": 0.35956060886383057,
+      "learning_rate": 6.542580316165222e-05,
+      "loss": 1.0439,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 740,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.74,
+      "grad_norm": 0.3517419099807739,
+      "learning_rate": 6.491585925548191e-05,
+      "loss": 1.0314,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 750,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.75,
+      "grad_norm": 0.33927640318870544,
+      "learning_rate": 6.440591534931157e-05,
+      "loss": 1.042,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 760,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.76,
+      "grad_norm": 0.3502146005630493,
+      "learning_rate": 6.389597144314126e-05,
+      "loss": 1.0416,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 770,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.77,
+      "grad_norm": 0.37221387028694153,
+      "learning_rate": 6.338602753697093e-05,
+      "loss": 1.0453,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 780,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.78,
+      "grad_norm": 0.3718739449977875,
+      "learning_rate": 6.287608363080062e-05,
+      "loss": 1.0392,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 790,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.79,
+      "grad_norm": 0.35249418020248413,
+      "learning_rate": 6.23661397246303e-05,
+      "loss": 1.0413,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 800,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 0.35906028747558594,
+      "learning_rate": 6.185619581845997e-05,
+      "loss": 1.0375,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 810,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.81,
+      "grad_norm": 0.33932170271873474,
+      "learning_rate": 6.134625191228966e-05,
+      "loss": 1.0286,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 820,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.82,
+      "grad_norm": 0.33107632398605347,
+      "learning_rate": 6.0836308006119326e-05,
+      "loss": 1.0319,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 830,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.83,
+      "grad_norm": 0.32848185300827026,
+      "learning_rate": 6.032636409994901e-05,
+      "loss": 1.0329,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 840,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.84,
+      "grad_norm": 0.33085334300994873,
+      "learning_rate": 5.981642019377869e-05,
+      "loss": 1.0326,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 850,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.85,
+      "grad_norm": 0.3043057322502136,
+      "learning_rate": 5.930647628760837e-05,
+      "loss": 1.0379,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 860,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.86,
+      "grad_norm": 0.3407464027404785,
+      "learning_rate": 5.879653238143804e-05,
+      "loss": 1.0336,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 870,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.87,
+      "grad_norm": 0.34069886803627014,
+      "learning_rate": 5.8286588475267726e-05,
+      "loss": 1.0294,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 880,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.88,
+      "grad_norm": 0.4303439259529114,
+      "learning_rate": 5.777664456909741e-05,
+      "loss": 1.0223,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 890,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.89,
+      "grad_norm": 0.3378705382347107,
+      "learning_rate": 5.7266700662927075e-05,
+      "loss": 1.042,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 900,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.9,
+      "grad_norm": 0.4227118194103241,
+      "learning_rate": 5.6756756756756757e-05,
+      "loss": 1.0301,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 910,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.91,
+      "grad_norm": 0.36343687772750854,
+      "learning_rate": 5.624681285058644e-05,
+      "loss": 1.0249,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 920,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.92,
+      "grad_norm": 0.37661993503570557,
+      "learning_rate": 5.573686894441612e-05,
+      "loss": 1.0201,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 930,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.93,
+      "grad_norm": 0.38610127568244934,
+      "learning_rate": 5.5226925038245794e-05,
+      "loss": 1.0351,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 940,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.94,
+      "grad_norm": 0.32767486572265625,
+      "learning_rate": 5.4716981132075475e-05,
+      "loss": 1.0364,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 950,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.95,
+      "grad_norm": 0.36714789271354675,
+      "learning_rate": 5.4207037225905157e-05,
+      "loss": 1.0252,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 960,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.96,
+      "grad_norm": 0.34143558144569397,
+      "learning_rate": 5.369709331973484e-05,
+      "loss": 1.0266,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 970,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.97,
+      "grad_norm": 0.3287556767463684,
+      "learning_rate": 5.3187149413564506e-05,
+      "loss": 1.0265,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 980,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.98,
+      "grad_norm": 0.33613139390945435,
+      "learning_rate": 5.267720550739419e-05,
+      "loss": 1.0215,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 990,
       "total_memory_available (GB)": 94.62
     },
     {
+      "epoch": 0.99,
+      "grad_norm": 0.32623520493507385,
+      "learning_rate": 5.216726160122387e-05,
+      "loss": 1.0213,
+      "max_memory_allocated (GB)": 91.9,
       "memory_allocated (GB)": 24.39,
       "step": 1000,
       "total_memory_available (GB)": 94.62
     }
   ],
   "logging_steps": 10,
+  "max_steps": 2022,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 1000,

checkpoint-1000/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:93e2964507a7ee2a283815dc99898c3c891ee7684ce7926ce108452bc498151d
-size 5944

 version https://git-lfs.github.com/spec/v1
+oid sha256:97b6e8859324ecd7d4cabf5785e4c14760758a590bc4da42d43455de464ecb58
+size 5880

checkpoint-2000/README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+library_name: peft
+---
+## Training procedure
+### Framework versions
+- PEFT 0.4.0

checkpoint-2000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-v0.1",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}

checkpoint-2000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb117a1cf2fc42a861e73affe771f35e3d2d51f081d5bfe2ff89e443467df952
+size 13665592

checkpoint-2000/gaudi_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "autocast_bf16_ops": null,
+  "autocast_fp32_ops": null,
+  "optimum_version": "1.20.0",
+  "transformers_version": "4.38.2",
+  "use_dynamic_shapes": false,
+  "use_fused_adam": true,
+  "use_fused_clip_norm": true,
+  "use_torch_autocast": false
+}

checkpoint-2000/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd44951050f26ebc4aa37181bde6fbdae39e5bc24925768b3297d59014814bf9
+size 10229904

checkpoint-2000/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c81e5eb6f8fb0cd3a1f453ed7f07e4d7cc1e072f70b435265c54a14ec3942927
+size 10229904

checkpoint-2000/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1befcff6836a413ddbcd6dcc7c4c93355b6e00aee0ce2ec517a51e768c247a0
+size 10229904

checkpoint-2000/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:851f0dec6528943d2147a3569e7d508196bbe41d416a38fac86aaff828d7445e
+size 10229904

checkpoint-2000/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:237c21ede3374d2842cf6fec23e61643f2ea678a328182e0a185551d3bd42250
+size 10229904

checkpoint-2000/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:410046cc577e9c0bdcb980066951b5a424b1ba59de8162b31b39b0d209b0bce4
+size 10229904

checkpoint-2000/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d62a67c257991f682da6c2a289846735b7cc95dc18adbfd96e102b3ef29a8bff
+size 10229904

checkpoint-2000/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1f791eec6fc0ee0ba321b0158fa103da507d4b66df52e5c83ba736e6370cace
+size 10229904

checkpoint-2000/global_step2000/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ef7324ff623840a10639dd6caa4026b284da5537ed671f5b38c67e38bc5ad09
+size 13740018

checkpoint-2000/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step2000

checkpoint-2000/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5543e0754fe9df260bd82cfa7e571704fd10d14732fd9f105223113b609efbc2
+size 17968

checkpoint-2000/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7f739514ec0dc0df61d6d839eef9ff980a82e135422879f07e2cf78ae941894
+size 17968

checkpoint-2000/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15f56123a00985d37c3d262835e83f25048a8316afa24ac66747eea53d775b20
+size 17968

checkpoint-2000/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dab859d8ea187b03a86f8103eb04cb46964db4a4f623eb4ea44cce5fa97a5d32
+size 17968

checkpoint-2000/rng_state_4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73a442b5a5145fccb3b785bb3d21d5a0a0d98db2bf3cce065c3a3341d9ceebcb
+size 17968

checkpoint-2000/rng_state_5.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d4f4621a0fbf3451fbaa425a77d700278bcc178c26328ec4ede18bfb01f76ee
+size 17968

checkpoint-2000/rng_state_6.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14ee6a4d4a973ce22b281fd2f4441fc97805ef0bdfab68d48d7ac7cd802b0a17
+size 17968

checkpoint-2000/rng_state_7.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3effc8943473a84c5671504d52ab42275bd7891696938a4991b4095fdf0e6c71
+size 17968

checkpoint-2000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-2000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-2000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}