kasa-lora
/

llama3.1-8b-gpt4o_100k_closedqa-fft

+---
+library_name: transformers
+license: llama3
+base_model: meta-llama/Meta-Llama-3-8B
+tags:
+- trl
+- sft
+- generated_from_trainer
+datasets:
+- generator
+model-index:
+- name: llama3.1-8b-gpt4o_100k_closedqa-fft
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# llama3.1-8b-gpt4o_100k_closedqa-fft
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 3.5964
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.9303        | 0.9990 | 511  | 3.5964          |
+### Framework versions
+- Transformers 4.45.1
+- Pytorch 2.4.1+cu121
+- Datasets 3.0.1
+- Tokenizers 0.20.0

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9990224828934506,
+    "total_flos": 26722078556160.0,
+    "train_loss": 1.3314139091805235,
+    "train_runtime": 8881.3456,
+    "train_samples": 111440,
+    "train_samples_per_second": 1.842,
+    "train_steps_per_second": 0.058
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "bos_token_id": 128256,
+  "do_sample": true,
+  "eos_token_id": 128257,
+  "max_length": 4096,
+  "pad_token_id": 128257,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.45.1"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9990224828934506,
+    "total_flos": 26722078556160.0,
+    "train_loss": 1.3314139091805235,
+    "train_runtime": 8881.3456,
+    "train_samples": 111440,
+    "train_samples_per_second": 1.842,
+    "train_steps_per_second": 0.058
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,771 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9990224828934506,
+  "eval_steps": 500,
+  "global_step": 511,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0019550342130987292,
+      "grad_norm": 28.436507384573893,
+      "learning_rate": 5.769230769230769e-06,
+      "loss": 1.9755,
+      "step": 1
+    },
+    {
+      "epoch": 0.009775171065493646,
+      "grad_norm": 16.326267191478145,
+      "learning_rate": 2.8846153846153845e-05,
+      "loss": 1.7228,
+      "step": 5
+    },
+    {
+      "epoch": 0.019550342130987292,
+      "grad_norm": 7.77688805789614,
+      "learning_rate": 5.769230769230769e-05,
+      "loss": 1.1824,
+      "step": 10
+    },
+    {
+      "epoch": 0.02932551319648094,
+      "grad_norm": 23.182562040558295,
+      "learning_rate": 8.653846153846152e-05,
+      "loss": 1.2132,
+      "step": 15
+    },
+    {
+      "epoch": 0.039100684261974585,
+      "grad_norm": 3.8046090746822716,
+      "learning_rate": 0.00011538461538461538,
+      "loss": 1.0978,
+      "step": 20
+    },
+    {
+      "epoch": 0.04887585532746823,
+      "grad_norm": 6.364935901579011,
+      "learning_rate": 0.00014423076923076922,
+      "loss": 1.2143,
+      "step": 25
+    },
+    {
+      "epoch": 0.05865102639296188,
+      "grad_norm": 4.553589249592707,
+      "learning_rate": 0.00017307692307692304,
+      "loss": 1.1357,
+      "step": 30
+    },
+    {
+      "epoch": 0.06842619745845552,
+      "grad_norm": 5.819131618520497,
+      "learning_rate": 0.00020192307692307691,
+      "loss": 1.2003,
+      "step": 35
+    },
+    {
+      "epoch": 0.07820136852394917,
+      "grad_norm": 8.631833955662083,
+      "learning_rate": 0.00023076923076923076,
+      "loss": 1.2049,
+      "step": 40
+    },
+    {
+      "epoch": 0.08797653958944282,
+      "grad_norm": 27.119527389294134,
+      "learning_rate": 0.0002596153846153846,
+      "loss": 2.0482,
+      "step": 45
+    },
+    {
+      "epoch": 0.09775171065493646,
+      "grad_norm": 32.33385091368177,
+      "learning_rate": 0.00028846153846153843,
+      "loss": 1.6455,
+      "step": 50
+    },
+    {
+      "epoch": 0.10752688172043011,
+      "grad_norm": 45.94902359405063,
+      "learning_rate": 0.0002999683799255387,
+      "loss": 1.9763,
+      "step": 55
+    },
+    {
+      "epoch": 0.11730205278592376,
+      "grad_norm": 128.59363629595714,
+      "learning_rate": 0.0002997751944121241,
+      "loss": 1.5422,
+      "step": 60
+    },
+    {
+      "epoch": 0.1270772238514174,
+      "grad_norm": 10.71651832916556,
+      "learning_rate": 0.0002994066160471166,
+      "loss": 1.7548,
+      "step": 65
+    },
+    {
+      "epoch": 0.13685239491691104,
+      "grad_norm": 4.189410908699583,
+      "learning_rate": 0.0002988630764507904,
+      "loss": 1.3404,
+      "step": 70
+    },
+    {
+      "epoch": 0.1466275659824047,
+      "grad_norm": 33.78267011030123,
+      "learning_rate": 0.00029814521213014585,
+      "loss": 1.4341,
+      "step": 75
+    },
+    {
+      "epoch": 0.15640273704789834,
+      "grad_norm": 4.447241036402298,
+      "learning_rate": 0.00029725386373353455,
+      "loss": 1.4355,
+      "step": 80
+    },
+    {
+      "epoch": 0.16617790811339198,
+      "grad_norm": 19.324376604180756,
+      "learning_rate": 0.00029619007506622504,
+      "loss": 1.4037,
+      "step": 85
+    },
+    {
+      "epoch": 0.17595307917888564,
+      "grad_norm": 56.398325865658165,
+      "learning_rate": 0.00029495509186806487,
+      "loss": 2.1883,
+      "step": 90
+    },
+    {
+      "epoch": 0.18572825024437928,
+      "grad_norm": 117.39758059322921,
+      "learning_rate": 0.0002935503603546683,
+      "loss": 2.0507,
+      "step": 95
+    },
+    {
+      "epoch": 0.19550342130987292,
+      "grad_norm": 7.6884925370685995,
+      "learning_rate": 0.00029197752552383914,
+      "loss": 1.5932,
+      "step": 100
+    },
+    {
+      "epoch": 0.20527859237536658,
+      "grad_norm": 963.0390862127407,
+      "learning_rate": 0.000290238429229211,
+      "loss": 5.223,
+      "step": 105
+    },
+    {
+      "epoch": 0.21505376344086022,
+      "grad_norm": 258.2156128276988,
+      "learning_rate": 0.00028833510802336203,
+      "loss": 4.5213,
+      "step": 110
+    },
+    {
+      "epoch": 0.22482893450635386,
+      "grad_norm": 264.762100287562,
+      "learning_rate": 0.0002862697907729285,
+      "loss": 2.0849,
+      "step": 115
+    },
+    {
+      "epoch": 0.23460410557184752,
+      "grad_norm": 18.694604887076736,
+      "learning_rate": 0.0002840448960485118,
+      "loss": 1.8192,
+      "step": 120
+    },
+    {
+      "epoch": 0.24437927663734116,
+      "grad_norm": 67.0812503804155,
+      "learning_rate": 0.00028166302929243326,
+      "loss": 1.3915,
+      "step": 125
+    },
+    {
+      "epoch": 0.2541544477028348,
+      "grad_norm": 21.92554603766507,
+      "learning_rate": 0.0002791269797676551,
+      "loss": 1.5317,
+      "step": 130
+    },
+    {
+      "epoch": 0.26392961876832843,
+      "grad_norm": 7.078011892011075,
+      "learning_rate": 0.00027643971729144056,
+      "loss": 1.4673,
+      "step": 135
+    },
+    {
+      "epoch": 0.27370478983382207,
+      "grad_norm": 9.631691414307214,
+      "learning_rate": 0.0002736043887575761,
+      "loss": 1.3131,
+      "step": 140
+    },
+    {
+      "epoch": 0.28347996089931576,
+      "grad_norm": 23.096760351073254,
+      "learning_rate": 0.00027062431445123124,
+      "loss": 1.572,
+      "step": 145
+    },
+    {
+      "epoch": 0.2932551319648094,
+      "grad_norm": 3.17350344043114,
+      "learning_rate": 0.0002675029841607691,
+      "loss": 1.3668,
+      "step": 150
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 6.678371795997961,
+      "learning_rate": 0.00026424405309106216,
+      "loss": 1.3082,
+      "step": 155
+    },
+    {
+      "epoch": 0.3128054740957967,
+      "grad_norm": 4.718771263163467,
+      "learning_rate": 0.00026085133758309883,
+      "loss": 1.3286,
+      "step": 160
+    },
+    {
+      "epoch": 0.3225806451612903,
+      "grad_norm": 8.790382644648172,
+      "learning_rate": 0.00025732881064489233,
+      "loss": 1.3241,
+      "step": 165
+    },
+    {
+      "epoch": 0.33235581622678395,
+      "grad_norm": 4.416208770248871,
+      "learning_rate": 0.0002536805972989267,
+      "loss": 1.3357,
+      "step": 170
+    },
+    {
+      "epoch": 0.3421309872922776,
+      "grad_norm": 3.1002678359027285,
+      "learning_rate": 0.0002499109697515875,
+      "loss": 1.4037,
+      "step": 175
+    },
+    {
+      "epoch": 0.3519061583577713,
+      "grad_norm": 7.124469693986668,
+      "learning_rate": 0.0002460243423902342,
+      "loss": 1.625,
+      "step": 180
+    },
+    {
+      "epoch": 0.3616813294232649,
+      "grad_norm": 51.006744297270856,
+      "learning_rate": 0.00024202526661377277,
+      "loss": 1.6499,
+      "step": 185
+    },
+    {
+      "epoch": 0.37145650048875856,
+      "grad_norm": 14.662023479458805,
+      "learning_rate": 0.00023791842550278217,
+      "loss": 1.8342,
+      "step": 190
+    },
+    {
+      "epoch": 0.3812316715542522,
+      "grad_norm": 4.973242239066626,
+      "learning_rate": 0.00023370862833543648,
+      "loss": 1.6823,
+      "step": 195
+    },
+    {
+      "epoch": 0.39100684261974583,
+      "grad_norm": 95.12537291599145,
+      "learning_rate": 0.0002294008049556441,
+      "loss": 1.5268,
+      "step": 200
+    },
+    {
+      "epoch": 0.40078201368523947,
+      "grad_norm": 3.394429345078054,
+      "learning_rate": 0.000225,
+      "loss": 1.45,
+      "step": 205
+    },
+    {
+      "epoch": 0.41055718475073316,
+      "grad_norm": 3.5902349674560456,
+      "learning_rate": 0.00022051136699031057,
+      "loss": 1.2502,
+      "step": 210
+    },
+    {
+      "epoch": 0.4203323558162268,
+      "grad_norm": 260.2217345574012,
+      "learning_rate": 0.00021594016229861007,
+      "loss": 1.4486,
+      "step": 215
+    },
+    {
+      "epoch": 0.43010752688172044,
+      "grad_norm": 4.8647475236367725,
+      "learning_rate": 0.0002112917389917347,
+      "loss": 1.486,
+      "step": 220
+    },
+    {
+      "epoch": 0.4398826979472141,
+      "grad_norm": 1.9024756731821182,
+      "learning_rate": 0.0002065715405626634,
+      "loss": 1.2628,
+      "step": 225
+    },
+    {
+      "epoch": 0.4496578690127077,
+      "grad_norm": 10.37907922884646,
+      "learning_rate": 0.00020178509455596596,
+      "loss": 1.2518,
+      "step": 230
+    },
+    {
+      "epoch": 0.45943304007820135,
+      "grad_norm": 1.9405506982628546,
+      "learning_rate": 0.00019693800609482315,
+      "loss": 1.2849,
+      "step": 235
+    },
+    {
+      "epoch": 0.46920821114369504,
+      "grad_norm": 2.6160283264932462,
+      "learning_rate": 0.00019203595131719932,
+      "loss": 1.2548,
+      "step": 240
+    },
+    {
+      "epoch": 0.4789833822091887,
+      "grad_norm": 2.1695347772705373,
+      "learning_rate": 0.00018708467072885382,
+      "loss": 1.3377,
+      "step": 245
+    },
+    {
+      "epoch": 0.4887585532746823,
+      "grad_norm": 2.2144620011763374,
+      "learning_rate": 0.00018208996248097458,
+      "loss": 1.3093,
+      "step": 250
+    },
+    {
+      "epoch": 0.49853372434017595,
+      "grad_norm": 2.1880901805448403,
+      "learning_rate": 0.00017705767558030754,
+      "loss": 1.245,
+      "step": 255
+    },
+    {
+      "epoch": 0.5083088954056696,
+      "grad_norm": 2.991429961153096,
+      "learning_rate": 0.0001719937030397311,
+      "loss": 1.2559,
+      "step": 260
+    },
+    {
+      "epoch": 0.5180840664711632,
+      "grad_norm": 23.064953881729117,
+      "learning_rate": 0.00016690397497729818,
+      "loss": 1.288,
+      "step": 265
+    },
+    {
+      "epoch": 0.5278592375366569,
+      "grad_norm": 1.3455036549144204,
+      "learning_rate": 0.00016179445167182677,
+      "loss": 1.2717,
+      "step": 270
+    },
+    {
+      "epoch": 0.5376344086021505,
+      "grad_norm": 1.1846238387921606,
+      "learning_rate": 0.00015667111658317054,
+      "loss": 1.2394,
+      "step": 275
+    },
+    {
+      "epoch": 0.5474095796676441,
+      "grad_norm": 137.50996798714343,
+      "learning_rate": 0.00015153996934534348,
+      "loss": 1.3296,
+      "step": 280
+    },
+    {
+      "epoch": 0.5571847507331378,
+      "grad_norm": 1.1488279921928382,
+      "learning_rate": 0.00014640701874070455,
+      "loss": 1.2874,
+      "step": 285
+    },
+    {
+      "epoch": 0.5669599217986315,
+      "grad_norm": 1.2689991198054311,
+      "learning_rate": 0.00014127827566342863,
+      "loss": 1.2561,
+      "step": 290
+    },
+    {
+      "epoch": 0.5767350928641252,
+      "grad_norm": 2.140511295392104,
+      "learning_rate": 0.0001361597460805047,
+      "loss": 1.2205,
+      "step": 295
+    },
+    {
+      "epoch": 0.5865102639296188,
+      "grad_norm": 12.286388401977892,
+      "learning_rate": 0.000131057423998504,
+      "loss": 1.252,
+      "step": 300
+    },
+    {
+      "epoch": 0.5962854349951124,
+      "grad_norm": 2.5541457654289395,
+      "learning_rate": 0.00012597728444435418,
+      "loss": 1.215,
+      "step": 305
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 1.1732833800621696,
+      "learning_rate": 0.00012092527646833949,
+      "loss": 1.2053,
+      "step": 310
+    },
+    {
+      "epoch": 0.6158357771260997,
+      "grad_norm": 1.4481798374657,
+      "learning_rate": 0.00011590731617752066,
+      "loss": 1.2061,
+      "step": 315
+    },
+    {
+      "epoch": 0.6256109481915934,
+      "grad_norm": 0.9912604590459435,
+      "learning_rate": 0.00011092927980773267,
+      "loss": 1.1604,
+      "step": 320
+    },
+    {
+      "epoch": 0.635386119257087,
+      "grad_norm": 1.0322599469502478,
+      "learning_rate": 0.00010599699684227311,
+      "loss": 1.1369,
+      "step": 325
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 1.5437893851108073,
+      "learning_rate": 0.00010111624318534006,
+      "loss": 1.1721,
+      "step": 330
+    },
+    {
+      "epoch": 0.6549364613880743,
+      "grad_norm": 1.383693940282455,
+      "learning_rate": 9.629273439821313e-05,
+      "loss": 1.1094,
+      "step": 335
+    },
+    {
+      "epoch": 0.6647116324535679,
+      "grad_norm": 2.598856235735631,
+      "learning_rate": 9.15321190060981e-05,
+      "loss": 1.1251,
+      "step": 340
+    },
+    {
+      "epoch": 0.6744868035190615,
+      "grad_norm": 1.0440965009342513,
+      "learning_rate": 8.683997188347435e-05,
+      "loss": 1.0953,
+      "step": 345
+    },
+    {
+      "epoch": 0.6842619745845552,
+      "grad_norm": 1.2172823916521676,
+      "learning_rate": 8.222178772568959e-05,
+      "loss": 1.0839,
+      "step": 350
+    },
+    {
+      "epoch": 0.6940371456500489,
+      "grad_norm": 0.9291103607651107,
+      "learning_rate": 7.768297461444765e-05,
+      "loss": 1.0786,
+      "step": 355
+    },
+    {
+      "epoch": 0.7038123167155426,
+      "grad_norm": 36.656353739168324,
+      "learning_rate": 7.32288476847252e-05,
+      "loss": 1.1001,
+      "step": 360
+    },
+    {
+      "epoch": 0.7135874877810362,
+      "grad_norm": 0.7923067460462172,
+      "learning_rate": 6.886462290053158e-05,
+      "loss": 1.0793,
+      "step": 365
+    },
+    {
+      "epoch": 0.7233626588465298,
+      "grad_norm": 0.8419278909431203,
+      "learning_rate": 6.4595410946803e-05,
+      "loss": 1.0869,
+      "step": 370
+    },
+    {
+      "epoch": 0.7331378299120235,
+      "grad_norm": 1.0339571657214093,
+      "learning_rate": 6.04262112445821e-05,
+      "loss": 1.0128,
+      "step": 375
+    },
+    {
+      "epoch": 0.7429130009775171,
+      "grad_norm": 0.7333799573780848,
+      "learning_rate": 5.636190609649249e-05,
+      "loss": 1.0101,
+      "step": 380
+    },
+    {
+      "epoch": 0.7526881720430108,
+      "grad_norm": 0.8595033406539786,
+      "learning_rate": 5.240725496936372e-05,
+      "loss": 1.0224,
+      "step": 385
+    },
+    {
+      "epoch": 0.7624633431085044,
+      "grad_norm": 0.7112388580251547,
+      "learning_rate": 4.8566888920701196e-05,
+      "loss": 1.0016,
+      "step": 390
+    },
+    {
+      "epoch": 0.772238514173998,
+      "grad_norm": 0.8167634152987004,
+      "learning_rate": 4.48453051755301e-05,
+      "loss": 0.9793,
+      "step": 395
+    },
+    {
+      "epoch": 0.7820136852394917,
+      "grad_norm": 0.6682157986400304,
+      "learning_rate": 4.12468618599611e-05,
+      "loss": 1.0015,
+      "step": 400
+    },
+    {
+      "epoch": 0.7917888563049853,
+      "grad_norm": 0.7797058649722416,
+      "learning_rate": 3.777577289764752e-05,
+      "loss": 0.9784,
+      "step": 405
+    },
+    {
+      "epoch": 0.8015640273704789,
+      "grad_norm": 0.6768885831261553,
+      "learning_rate": 3.443610307510907e-05,
+      "loss": 0.9605,
+      "step": 410
+    },
+    {
+      "epoch": 0.8113391984359726,
+      "grad_norm": 0.6416162504789994,
+      "learning_rate": 3.1231763281701305e-05,
+      "loss": 0.971,
+      "step": 415
+    },
+    {
+      "epoch": 0.8211143695014663,
+      "grad_norm": 0.6784950124595185,
+      "learning_rate": 2.816650592980495e-05,
+      "loss": 0.9553,
+      "step": 420
+    },
+    {
+      "epoch": 0.83088954056696,
+      "grad_norm": 0.689853020596839,
+      "learning_rate": 2.5243920560598184e-05,
+      "loss": 0.9351,
+      "step": 425
+    },
+    {
+      "epoch": 0.8406647116324536,
+      "grad_norm": 0.6460996639595076,
+      "learning_rate": 2.24674296405579e-05,
+      "loss": 0.9313,
+      "step": 430
+    },
+    {
+      "epoch": 0.8504398826979472,
+      "grad_norm": 0.6636695831614443,
+      "learning_rate": 1.98402845536117e-05,
+      "loss": 0.9266,
+      "step": 435
+    },
+    {
+      "epoch": 0.8602150537634409,
+      "grad_norm": 0.6119153060968766,
+      "learning_rate": 1.736556179363543e-05,
+      "loss": 0.9134,
+      "step": 440
+    },
+    {
+      "epoch": 0.8699902248289345,
+      "grad_norm": 0.5955119176542452,
+      "learning_rate": 1.5046159361753224e-05,
+      "loss": 0.9198,
+      "step": 445
+    },
+    {
+      "epoch": 0.8797653958944281,
+      "grad_norm": 0.6324852203366718,
+      "learning_rate": 1.2884793372660207e-05,
+      "loss": 0.9051,
+      "step": 450
+    },
+    {
+      "epoch": 0.8895405669599218,
+      "grad_norm": 0.6803072103352026,
+      "learning_rate": 1.0883994873941815e-05,
+      "loss": 0.8923,
+      "step": 455
+    },
+    {
+      "epoch": 0.8993157380254154,
+      "grad_norm": 2.125025596226289,
+      "learning_rate": 9.046106882113751e-06,
+      "loss": 0.9315,
+      "step": 460
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.5812394213670566,
+      "learning_rate": 7.373281638854328e-06,
+      "loss": 0.9131,
+      "step": 465
+    },
+    {
+      "epoch": 0.9188660801564027,
+      "grad_norm": 0.6819326019234143,
+      "learning_rate": 5.867478090641892e-06,
+      "loss": 0.9521,
+      "step": 470
+    },
+    {
+      "epoch": 0.9286412512218963,
+      "grad_norm": 0.6009777818830081,
+      "learning_rate": 4.530459594748592e-06,
+      "loss": 0.8585,
+      "step": 475
+    },
+    {
+      "epoch": 0.9384164222873901,
+      "grad_norm": 0.5996940109383458,
+      "learning_rate": 3.363791854277348e-06,
+      "loss": 0.8938,
+      "step": 480
+    },
+    {
+      "epoch": 0.9481915933528837,
+      "grad_norm": 0.6175664551394122,
+      "learning_rate": 2.3688410846596282e-06,
+      "loss": 0.8891,
+      "step": 485
+    },
+    {
+      "epoch": 0.9579667644183774,
+      "grad_norm": 2.0911762607681768,
+      "learning_rate": 1.5467724137617043e-06,
+      "loss": 0.924,
+      "step": 490
+    },
+    {
+      "epoch": 0.967741935483871,
+      "grad_norm": 0.6696352577624144,
+      "learning_rate": 8.985485174722973e-07,
+      "loss": 0.9077,
+      "step": 495
+    },
+    {
+      "epoch": 0.9775171065493646,
+      "grad_norm": 0.5766498486118373,
+      "learning_rate": 4.249284923700358e-07,
+      "loss": 0.9012,
+      "step": 500
+    },
+    {
+      "epoch": 0.9872922776148583,
+      "grad_norm": 0.6254945643083392,
+      "learning_rate": 1.2646696679042833e-07,
+      "loss": 0.9035,
+      "step": 505
+    },
+    {
+      "epoch": 0.9970674486803519,
+      "grad_norm": 0.6202670688713676,
+      "learning_rate": 3.5134513334200697e-09,
+      "loss": 0.9303,
+      "step": 510
+    },
+    {
+      "epoch": 0.9990224828934506,
+      "eval_loss": 3.596351385116577,
+      "eval_runtime": 2.2495,
+      "eval_samples_per_second": 2.667,
+      "eval_steps_per_second": 0.445,
+      "step": 511
+    },
+    {
+      "epoch": 0.9990224828934506,
+      "step": 511,
+      "total_flos": 26722078556160.0,
+      "train_loss": 1.3314139091805235,
+      "train_runtime": 8881.3456,
+      "train_samples_per_second": 1.842,
+      "train_steps_per_second": 0.058
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 511,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 26722078556160.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}