adarshxs committed on
Commit
2f1140b
1 Parent(s): 05c1d29
README.md DELETED
@@ -1,145 +0,0 @@
- ---
- tags:
- - generated_from_trainer
- model-index:
- - name: out
-   results: []
- ---
-
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
- should probably proofread and complete it, then remove this comment. -->
-
- [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
- <details><summary>See axolotl config</summary>
-
- axolotl version: `0.3.0`
- ```yaml
- base_model: ./TinyLlama-1.1B-intermediate-step-1431k-3T
-
- model_type: LlamaForCausalLM
- tokenizer_type: LlamaTokenizer
- is_llama_derived_model: true
-
- load_in_8bit: false
- load_in_4bit: false
- strict: false
-
- datasets:
-   - path: ./openhermes
-     type: alpaca
- dataset_prepared_path:
- val_set_size: 0.05
- output_dir: ./out
-
- sequence_len: 4096
- sample_packing: false
-
- adapter:
- lora_model_dir:
- lora_r:
- lora_alpha:
- lora_dropout:
- lora_target_linear:
- lora_fan_in_fan_out:
-
- wandb_project: tinyllama-openhermes
- wandb_entity: tensoic
- wandb_watch:
- wandb_name:
- wandb_log_model:
-
- gradient_accumulation_steps: 2
- micro_batch_size: 8
- num_epochs: 1
- optimizer: adamw_bnb_8bit
- adam_epsilon: 0.00001
- max_grad_norm: 1.0
- lr_scheduler: cosine
- learning_rate: 0.0002
-
- train_on_inputs: false
- group_by_length: false
- bf16: false
- fp16: true
- tf32: false
-
- gradient_checkpointing: true
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention: true
- flash_attention:
-
- warmup_steps: 100
- evals_per_epoch: 4
- eval_table_size:
- saves_per_epoch: 1
- debug:
- deepspeed: zero2.json
- weight_decay: 0.0
- fsdp:
- fsdp_config:
- special_tokens:
-   bos_token: "<s>"
-   eos_token: "</s>"
-   unk_token: "<unk>"
-
- ```
-
- </details><br>
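The YAML above is a complete axolotl trainer config for a full-parameter fp16 fine-tune (every `lora_*` field is empty) with a DeepSpeed `zero2.json` config. Assuming axolotl 0.3.0 and the local `./TinyLlama-1.1B-intermediate-step-1431k-3T` and `./openhermes` paths it references, a run like this is typically launched with `accelerate launch -m axolotl.cli.train <config>.yml`; the exact command used is not recorded in this commit.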
-
- # out
-
- This model is a full-parameter fine-tune of TinyLlama-1.1B-intermediate-step-1431k-3T on the OpenHermes dataset.
- It achieves the following results on the evaluation set:
- - Loss: 1.3647
-
- ## Model description
-
- TinyLlama is a 1.1B-parameter Llama-architecture model; this checkpoint applies one epoch of full-parameter instruction tuning on OpenHermes (no LoRA adapter was used; the adapter fields in the config above are empty).
-
- ## Intended uses & limitations
-
- More information needed
-
- ## Training and evaluation data
-
- Per the axolotl config above, training used the OpenHermes dataset in alpaca format, with 5% held out as the evaluation split (val_set_size: 0.05).
-
- ## Training procedure
-
- ### Training hyperparameters
-
- The following hyperparameters were used during training:
- - learning_rate: 0.0002
- - train_batch_size: 8
- - eval_batch_size: 8
- - seed: 42
- - distributed_type: multi-GPU
- - num_devices: 8
- - gradient_accumulation_steps: 2
- - total_train_batch_size: 128
- - total_eval_batch_size: 64
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-05
- - lr_scheduler_type: cosine
- - lr_scheduler_warmup_steps: 100
- - num_epochs: 1
- - mixed_precision_training: Native AMP
-
- ### Training results
-
- | Training Loss | Epoch | Step | Validation Loss |
- |:-------------:|:-----:|:----:|:---------------:|
- | 3.0006        | 0.0   | 1    | 1.6838          |
- | 0.8195        | 0.25  | 451  | 1.4620          |
- | 0.6836        | 0.5   | 902  | 1.4158          |
- | 0.6811        | 0.75  | 1353 | 1.3647          |
-
-
- ### Framework versions
-
- - Transformers 4.36.2
- - Pytorch 2.0.1+cu117
- - Datasets 2.15.0
- - Tokenizers 0.15.0
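As a sanity check on the hyperparameters above, the reported `total_train_batch_size` of 128 and `total_eval_batch_size` of 64 follow directly from the per-device batch size, gradient accumulation, and device count. A minimal sketch using only values from the card:

```python
# Effective batch sizes implied by the card's hyperparameters.
micro_batch_size = 8             # per-device train batch size ("train_batch_size" above)
gradient_accumulation_steps = 2  # from the axolotl config and the card
num_devices = 8                  # multi-GPU run per the card

# Each optimizer update accumulates gradients across steps and devices.
total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
print(total_train_batch_size)    # 128

# Evaluation does not accumulate gradients, so only devices multiply.
total_eval_batch_size = micro_batch_size * num_devices
print(total_eval_batch_size)     # 64
```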
config.json DELETED
@@ -1,28 +0,0 @@
- {
-   "_name_or_path": "./TinyLlama-1.1B-intermediate-step-1431k-3T",
-   "architectures": [
-     "LlamaForCausalLM"
-   ],
-   "attention_bias": false,
-   "attention_dropout": 0.0,
-   "bos_token_id": 1,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 2048,
-   "initializer_range": 0.02,
-   "intermediate_size": 5632,
-   "max_position_embeddings": 4096,
-   "model_type": "llama",
-   "num_attention_heads": 32,
-   "num_hidden_layers": 22,
-   "num_key_value_heads": 4,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-05,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "tie_word_embeddings": false,
-   "torch_dtype": "float16",
-   "transformers_version": "4.36.2",
-   "use_cache": false,
-   "vocab_size": 32000
- }
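The deleted `config.json` matches the published TinyLlama geometry. As a rough cross-check (a back-of-envelope sketch derived from the numbers above, not data from the commit), the implied parameter count is about 1.1B, which at two bytes per float16 weight is consistent with the ~2.2 GB `pytorch_model.bin` deleted below:

```python
# Approximate parameter count from the deleted config.json.
vocab_size, hidden, intermediate, n_layers = 32000, 2048, 5632, 22
n_heads, n_kv_heads = 32, 4

head_dim = hidden // n_heads    # 64
kv_dim = n_kv_heads * head_dim  # 256; num_key_value_heads < num_attention_heads, i.e. grouped-query attention

embeddings = vocab_size * hidden * 2                   # input embeddings + untied lm_head (tie_word_embeddings: false)
attention = 2 * hidden * hidden + 2 * hidden * kv_dim  # q/o projections plus the smaller k/v projections
mlp = 3 * hidden * intermediate                        # gate, up, and down projections (SiLU MLP)
total = embeddings + n_layers * (attention + mlp)      # norm weights omitted; they are negligible

print(f"{total / 1e9:.2f}B parameters")     # ~1.10B
print(f"{total * 2 / 1e9:.2f} GB in fp16")  # ~2.20 GB, matching the LFS pointer below
```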
generation_config.json DELETED
@@ -1,7 +0,0 @@
- {
-   "bos_token_id": 1,
-   "eos_token_id": 2,
-   "max_length": 2048,
-   "pad_token_id": 0,
-   "transformers_version": "4.36.2"
- }
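Before deletion this checkpoint loaded through the standard transformers API. A minimal inference sketch follows, assuming the files are restored to a local directory; the `./out` path and the alpaca-style prompt are assumptions based on the training config, not part of the commit. Note that the default `max_length` of 2048 here is more conservative than the model's `max_position_embeddings` of 4096.

```python
# Minimal inference sketch; "./out" is a hypothetical local copy of the deleted files.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "./out"  # hypothetical path, mirroring output_dir in the training config
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16)

# The training data was alpaca-formatted, so an alpaca-style prompt is a reasonable guess.
prompt = "### Instruction:\nExplain gradient checkpointing in one sentence.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```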
pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cc780bb0faf671a3cb8409e7b4aab151cf6c760ad7ebe2748a189370924e3bfb
- size 2200123773
special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "pad_token": "</s>",
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
tokenizer.model DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
- size 499723
tokenizer_config.json DELETED
@@ -1,44 +0,0 @@
- {
-   "add_bos_token": true,
-   "add_eos_token": false,
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<s>",
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "</s>",
-   "legacy": false,
-   "model_max_length": 1000000000000000019884624838656,
-   "pad_token": "</s>",
-   "padding_side": "right",
-   "sp_model_kwargs": {},
-   "spaces_between_special_tokens": false,
-   "tokenizer_class": "LlamaTokenizer",
-   "trust_remote_code": false,
-   "unk_token": "<unk>",
-   "use_default_system_prompt": false,
-   "use_fast": true
- }
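Two details of the deleted tokenizer files matter when reconstructing this setup: padding reuses the EOS token (`</s>`) on the right, and encoding adds BOS but not EOS. A small sketch, again assuming a hypothetical local copy of the files:

```python
# Tokenizer behavior implied by special_tokens_map.json and tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./out")  # hypothetical local copy

ids = tok("hello world").input_ids
assert ids[0] == tok.bos_token_id == 1  # add_bos_token: true
assert ids[-1] != tok.eos_token_id      # add_eos_token: false

assert tok.pad_token == tok.eos_token == "</s>"  # pad_token reuses EOS
assert tok.padding_side == "right"
```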