Model save

Browse files

Files changed (8) hide show

README.md +8 -11
all_results.json +6 -6
config.json +1 -1
model.safetensors +1 -1
runs/Aug13_00-20-24_ip-26-0-161-138/events.out.tfevents.1723508454.ip-26-0-161-138.2676503.0 +3 -0
train_results.json +6 -6
trainer_state.json +43 -21
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -2,16 +2,12 @@
 license: apache-2.0
 base_model: HuggingFaceTB/SmolLM-360M
 tags:
-- alignment-handbook
-- trl
-- sft
-- generated_from_trainer
 - trl
 - sft
 - generated_from_trainer
 datasets:
-- HuggingFaceTB/everyday-topics-MT-conversations-H4
-- HuggingFaceTB/instruct-data-basics-H4
 model-index:
 - name: smollm-350M-instruct-add-basics-only
   results: []
@@ -20,12 +16,12 @@ model-index:
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/loubnabnl/huggingface/runs/6npzjkdc)
 # smollm-350M-instruct-add-basics-only
-This model is a fine-tuned version of [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) on the HuggingFaceTB/everyday-topics-MT-conversations-H4 and the HuggingFaceTB/instruct-data-basics-H4 datasets.
 It achieves the following results on the evaluation set:
-- Loss: 2.1009
 ## Model description
@@ -56,14 +52,15 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 3
 ### Training results
 | Training Loss | Epoch  | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
 | 2.2577        | 0.5714 | 1    | 2.2421          |
-| 2.2577        | 1.7143 | 3    | 2.1009          |
 ### Framework versions

 license: apache-2.0
 base_model: HuggingFaceTB/SmolLM-360M
 tags:
 - trl
 - sft
+- alignment-handbook
 - generated_from_trainer
 datasets:
+- generator
 model-index:
 - name: smollm-350M-instruct-add-basics-only
   results: []
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/loubnabnl/huggingface/runs/6sdxeci4)
 # smollm-350M-instruct-add-basics-only
+This model is a fine-tuned version of [HuggingFaceTB/SmolLM-360M](https://huggingface.co/HuggingFaceTB/SmolLM-360M) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 1.4730
 ## Model description
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 5
 ### Training results
 | Training Loss | Epoch  | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
 | 2.2577        | 0.5714 | 1    | 2.2421          |
+| 2.2317        | 1.7143 | 3    | 2.2371          |
+| 2.8729        | 2.8571 | 5    | 1.4730          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,14 +1,14 @@
 {
-    "epoch": 1.7142857142857144,
     "eval_loss": 2.1008596420288086,
     "eval_runtime": 0.1817,
     "eval_samples": 119,
     "eval_samples_per_second": 60.549,
     "eval_steps_per_second": 5.504,
-    "total_flos": 269903462400.0,
-    "train_loss": 2.6609482765197754,
-    "train_runtime": 22.3672,
     "train_samples": 2260,
-    "train_samples_per_second": 29.373,
-    "train_steps_per_second": 0.134
 }

 {
+    "epoch": 2.857142857142857,
     "eval_loss": 2.1008596420288086,
     "eval_runtime": 0.1817,
     "eval_samples": 119,
     "eval_samples_per_second": 60.549,
     "eval_steps_per_second": 5.504,
+    "total_flos": 466196889600.0,
+    "train_loss": 2.3683324337005613,
+    "train_runtime": 24.8838,
     "train_samples": 2260,
+    "train_samples_per_second": 44.005,
+    "train_steps_per_second": 0.201
 }

config.json CHANGED Viewed

@@ -25,6 +25,6 @@
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.42.3",
-  "use_cache": true,
   "vocab_size": 49152
 }

   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.42.3",
+  "use_cache": false,
   "vocab_size": 49152
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ec217abb025996ac0c8b47f06bcf9f1b4f1b5f225794a7149289912587f0c330
 size 723674912

 version https://git-lfs.github.com/spec/v1
+oid sha256:2636a610f778234e909d5331b85376337249b61192afc33ab237c7387cf39e1e
 size 723674912

runs/Aug13_00-20-24_ip-26-0-161-138/events.out.tfevents.1723508454.ip-26-0-161-138.2676503.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b76ddf9970ea2176425184eaf2f2f98ac9ab5c1847d46f1eb3e32acc0bffcccd
+size 6940

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 1.7142857142857144,
-    "total_flos": 269903462400.0,
-    "train_loss": 2.6609482765197754,
-    "train_runtime": 22.3672,
     "train_samples": 2260,
-    "train_samples_per_second": 29.373,
-    "train_steps_per_second": 0.134
 }

 {
+    "epoch": 2.857142857142857,
+    "total_flos": 466196889600.0,
+    "train_loss": 2.3683324337005613,
+    "train_runtime": 24.8838,
     "train_samples": 2260,
+    "train_samples_per_second": 44.005,
+    "train_steps_per_second": 0.201
 }

trainer_state.json CHANGED Viewed

@@ -1,16 +1,16 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.7142857142857144,
   "eval_steps": 500,
-  "global_step": 3,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.5714285714285714,
-      "grad_norm": 12.102353615020318,
       "learning_rate": 0.001,
       "loss": 2.2577,
       "step": 1
@@ -18,33 +18,55 @@
     {
       "epoch": 0.5714285714285714,
       "eval_loss": 2.2420599460601807,
-      "eval_runtime": 0.2087,
-      "eval_samples_per_second": 52.712,
-      "eval_steps_per_second": 4.792,
       "step": 1
     },
     {
       "epoch": 1.7142857142857144,
-      "eval_loss": 2.1008596420288086,
-      "eval_runtime": 0.49,
-      "eval_samples_per_second": 22.447,
-      "eval_steps_per_second": 2.041,
       "step": 3
     },
     {
-      "epoch": 1.7142857142857144,
-      "step": 3,
-      "total_flos": 269903462400.0,
-      "train_loss": 2.6609482765197754,
-      "train_runtime": 22.3672,
-      "train_samples_per_second": 29.373,
-      "train_steps_per_second": 0.134
     }
   ],
-  "logging_steps": 5,
-  "max_steps": 3,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 3,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -58,7 +80,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 269903462400.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 2.857142857142857,
   "eval_steps": 500,
+  "global_step": 5,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.5714285714285714,
+      "grad_norm": 12.10239091235565,
       "learning_rate": 0.001,
       "loss": 2.2577,
       "step": 1
     {
       "epoch": 0.5714285714285714,
       "eval_loss": 2.2420599460601807,
+      "eval_runtime": 0.207,
+      "eval_samples_per_second": 53.143,
+      "eval_steps_per_second": 4.831,
       "step": 1
     },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 11.977258060606532,
+      "learning_rate": 0.0008535533905932737,
+      "loss": 2.2317,
+      "step": 2
+    },
     {
       "epoch": 1.7142857142857144,
+      "eval_loss": 2.237124443054199,
+      "eval_runtime": 0.4926,
+      "eval_samples_per_second": 22.329,
+      "eval_steps_per_second": 2.03,
       "step": 3
     },
     {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 11.428072306567266,
+      "learning_rate": 0.00014644660940672628,
+      "loss": 2.8729,
+      "step": 4
+    },
+    {
+      "epoch": 2.857142857142857,
+      "eval_loss": 1.4730318784713745,
+      "eval_runtime": 0.1729,
+      "eval_samples_per_second": 63.61,
+      "eval_steps_per_second": 5.783,
+      "step": 5
+    },
+    {
+      "epoch": 2.857142857142857,
+      "step": 5,
+      "total_flos": 466196889600.0,
+      "train_loss": 2.3683324337005613,
+      "train_runtime": 24.8838,
+      "train_samples_per_second": 44.005,
+      "train_steps_per_second": 0.201
     }
   ],
+  "logging_steps": 2,
+  "max_steps": 5,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 466196889600.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1055d80adf596039074f483bf577b7082ed2b42598cab5ba8ec22a7756ab0e93
 size 6520

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff33343a0ab9f2620fa2199a9aa5b3965c7eb8345fa84b58422e6e486384b0d0
 size 6520