add ckp
This view is limited to 50 files because it contains too many changes.
- checkpoint_metadata.json +9 -0
- config.yaml +150 -0
- lr_scheduler/lr_scheduler.pt +3 -0
- model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
- model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
- model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
- model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
checkpoint_metadata.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "dp": 6,
+  "metas": {
+    "consumed_train_samples": 1920000,
+    "last_train_step": 20000
+  },
+  "tp": 4,
+  "version": "1.2"
+}
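
Note: the metadata above records the parallel layout (dp=6, tp=4) and training progress of the saved run. A minimal sketch of inspecting it after downloading the checkpoint (the local path is a placeholder):

```python
import json

# Placeholder path to a local copy of this checkpoint directory.
with open("checkpoint/checkpoint_metadata.json") as f:
    meta = json.load(f)

print(meta["dp"], meta["tp"])                   # 6, 4
print(meta["metas"]["last_train_step"])         # 20000
print(meta["metas"]["consumed_train_samples"])  # 1920000
```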
config.yaml
ADDED
@@ -0,0 +1,150 @@
+checkpoints:
+  checkpoint_interval: 1000
+  checkpoints_path: /fsx/phuc/new_workspace/experiments/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay/checkpoints
+  checkpoints_path_is_shared_file_system: true
+  resume_checkpoint_path: /fsx/phuc/new_workspace/experiments/infini_attention_8b_llama/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay/checkpoints
+  save_initial_state: false
+data:
+  dataset:
+    dataloader_type: single
+    dataset_max_tokens: null
+    dataset_weights:
+    - 0.3
+    - 0.3
+    - 0.45
+    - 0.15
+    - 0.08
+    - 0.02
+    datasets:
+    - dtype: uint32
+      filename_pattern: .*.ds
+      folder: s3://huggingface-llm-datasets/stack_full_v21-8k/tokenized-llama3/long/
+      skip_tokens: 0
+    - dtype: uint32
+      filename_pattern: .*.ds
+      folder: s3://huggingface-llm-datasets/stack_full_v21-8k/tokenized-llama3/short/
+      skip_tokens: 0
+    - dtype: uint32
+      filename_pattern: .*.ds
+      folder: s3://huggingface-llm-datasets/fineweb-v1-8k/tokenized-llama3/long/CC-MAIN-2024-10
+      skip_tokens: 0
+    - dtype: uint32
+      filename_pattern: .*.ds
+      folder: s3://huggingface-llm-datasets/fineweb-v1-8k/tokenized-llama3/short/CC-MAIN-2024-10
+      skip_tokens: 0
+    - dtype: uint32
+      filename_pattern: .*.ds
+      folder: s3://huggingface-llm-datasets/project-gutenberg/tokenized-llama3/
+      skip_tokens: 0
+    - dtype: uint32
+      filename_pattern: .*.ds
+      folder: s3://huggingface-llm-datasets/OpenHermes-2-5/tokenized-llama3
+      skip_tokens: 0
+    pad_samples_to_global_batch_size: false
+    skip_in_stream: true
+  num_loading_workers: 0
+  seed: 42
+data_stages: null
+experiment_logger:
+  tensorboard_logger:
+    flush_secs: 30
+    tensorboard_dir: /fsx/phuc/project_data/infini_attention/tb_logs
+  wandb_logger:
+    wandb_entity: null
+    wandb_project: infini_attention_8b_llama
+general:
+  benchmark_csv_path: null
+  consumed_train_samples: 1920000
+  ignore_sanity_checks: true
+  project: infini_attention_8b_llama
+  run: exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay
+  seed: 42
+  step: 20000
+infini_attention:
+  balance_act_type: orig_sigmoid
+  balance_factor_lr: 0.01
+  balance_factor_weight_decay: 0.0
+  balance_init_type: zeros
+  log_grad: false
+  log_segment_acts: false
+  logging: true
+  logging_interval: 250
+  segment_length: 64
+  turn_on_memory: true
+kill_switch_path: null
+lighteval: null
+logging:
+  iteration_step_info_interval: 1
+  log_level: info
+  log_level_replica: info
+model:
+  ddp_bucket_cap_mb: 25
+  dtype: bfloat16
+  init_method:
+    path: /fsx/phuc/projects/infini-attention/llama3-ckps/haojun-8b-llama-nanotron-ckp/NanotronLlama3-8B
+  make_vocab_size_divisible_by: 1
+  model_config:
+    bos_token_id: 128000
+    eos_token_id: 128001
+    hidden_act: silu
+    hidden_size: 4096
+    initializer_range: 0.02
+    intermediate_size: 14336
+    is_llama_config: true
+    max_position_embeddings: 8192
+    num_attention_heads: 32
+    num_hidden_layers: 32
+    num_key_value_heads: 8
+    pad_token_id: null
+    pretraining_tp: 1
+    rms_norm_eps: 1.0e-05
+    rope_interleaved: false
+    rope_scaling: null
+    rope_theta: 500000.0
+    tie_word_embeddings: false
+    use_cache: true
+    vocab_size: 128256
+optimizer:
+  accumulate_grad_in_fp32: false
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  adam_eps: 1.0e-08
+  clip_grad: 1.0
+  learning_rate_scheduler:
+    learning_rate: 1.0e-05
+    lr_decay_starting_step: null
+    lr_decay_steps: 23500
+    lr_decay_style: cosine
+    lr_warmup_steps: 1500
+    lr_warmup_style: linear
+    min_decay_lr: 1.0e-06
+  torch_adam_is_fused: true
+  weight_decay: 0.1
+  zero_stage: 0
+parallelism:
+  dp: 6
+  expert_parallel_size: 1
+  pp: 1
+  pp_engine: 1f1b
+  tp: 4
+  tp_linear_async_communication: false
+  tp_mode: ALL_REDUCE
+profiler: null
+s3_upload:
+  remove_after_upload: true
+  s5cmd_concurrency: 5
+  s5cmd_numworkers: 16
+  s5cmd_path: null
+  upload_s3_path: s3://phuc-experiments/infini-attention/8b-llama/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay
+tokenizer:
+  tokenizer_max_length: null
+  tokenizer_name_or_path: /fsx/haojun/lighteval_evaluation_model/NanotronLlama3-8B
+  tokenizer_revision: null
+tokens:
+  batch_accumulation_per_replica: 1
+  limit_test_batches: 0
+  limit_val_batches: 0
+  micro_batch_size: 16
+  sequence_length: 1024
+  train_steps: 25000
+  val_check_interval: -1
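
Note: the progress counters in `general` are consistent with the batch settings — each optimizer step consumes dp × micro_batch_size × batch_accumulation_per_replica samples. A quick arithmetic check using only values from this config:

```python
# All values taken from config.yaml above.
dp = 6
micro_batch_size = 16
batch_accumulation_per_replica = 1
step = 20000
sequence_length = 1024

global_batch_size = dp * micro_batch_size * batch_accumulation_per_replica  # 96 samples/step
consumed_train_samples = global_batch_size * step                           # 1,920,000
consumed_tokens = consumed_train_samples * sequence_length                   # ~1.97B tokens

assert consumed_train_samples == 1_920_000  # matches general.consumed_train_samples and the checkpoint metadata
```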
lr_scheduler/lr_scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:737facbe7635b84da684cbb0920e1e12cbfa59d865027e3d29946e1da7fcb6c9
+size 5812
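
Note: every binary file in this commit is stored as a Git LFS pointer (spec version, sha256 oid, byte size) rather than the payload itself. A sketch of fetching the resolved file with `huggingface_hub`; the repo id is a placeholder for wherever this checkpoint is hosted:

```python
from huggingface_hub import hf_hub_download

# "user/repo" is a placeholder; substitute the repository that hosts this checkpoint.
local_path = hf_hub_download(
    repo_id="user/repo",
    filename="lr_scheduler/lr_scheduler.pt",
)
print(local_path)  # local path to the 5812-byte LR-scheduler state
```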
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd3cd7ea1b37e9d8245500104946a1f9beda585b582c3c6c86417f2143e62c0a
+size 200
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5ecce917758b783cf75bcfba65bb98e2598b47fa369e79e97217dac514fe7cb
+size 200
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4eca6ff9837681e57b8b7359a0b1450a2a2faaa217f191343a224bcfa4bac2d
+size 200
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70e26e50995d04b476cef24f25fe3b123db242b79d2b26721e958a27a94e95c3
+size 200
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afbb7fafb264594a507fe03060a966b91335e60401aea3f3531c9036a37bdc2b
+size 8388848
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd9505ee95b8f6e228216d63826f095d1d2bd704c090a9496b0cd204b5dc3cc7
+size 8388848
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f4f77e670759e11cca92e8eb1a4ca8cb1d997cbe1a3c7ec44097d91704dec79
+size 8388848
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20f2977cc2ccc9df4828d9170c829143ac84e7bf1a10a17dbcff03b8e7d2b9c4
+size 8388848
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b85fff5b4969a14a90d3251beea89a4f58b3476951dc4d80842fbc42859551a6
+size 12583264
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c906c130138839577b399f263d2cd0377c684161cab8eb9db82cf4f30e178fa1
+size 12583272
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:691718e5119e0a7115a84454c6ed9eafd768b9b9783f747c698059db2233224e
+size 12583272
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b315f2fdabb2b6e6aaa65bcbc1e77d7c307f50b20de2b3f55501e3c9e355884
+size 12583272
model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:839be36eddaad9760a68863d4618402af1893620b281a05ff0ff9e7cfe0ed802
+size 8288
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d638b1dbd9ab13fbfbeb47885a43ae81679b199c18c0cbaed3f202cf4c36942
+size 29360368
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a030aaac659587f67dca0efc2b49aaa8b8736eed1906d2f5980529f6c7fe45c1
+size 29360368
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f5c763bab5ae055a0f8d095329c60c4ad64f0d05a52db890ed860d003f0f14ee
+size 29360368
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4001b487aa10205cc4b4a335c2df5dbcf2b6cf692a19b90e1e6346a6adb25df5
+size 29360368
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eafbd550659acf898eea0fab8867281ce636543eff7f13c40f32f9028bda67d2
+size 58720552
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3db63345724e44faf6ba407334ecc96348ba03e0192f9f60550518368e87b6ac
+size 58720560
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:81385d8ec0ae77104655fbb1f5dbf799f70466a32f708c04cb47080b2d46d3e3
+size 58720560
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9ba738a34d22e416b1db4bded73974c726edb399b284d23eceb2ef16da5dd06
+size 58720560
model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:835706c82a03cc5e4a4c3879f76dde2c8873efd2e11c7e6d39789798af173773
+size 8288
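
Note: the files above make up one complete decoder layer: per-TP-rank shards of qkv_proj, o_proj, gate_up_proj, and down_proj, the two RMSNorm weights, and small model_balance_factors tensors (which appear to hold the per-head infini-attention gating scalars, 8 heads per TP rank). The shard sizes line up with the `model_config` above in bfloat16 (2 bytes per parameter) plus a small safetensors header. A rough check, assuming the weights are simply split 4 ways across TP ranks:

```python
# Per-TP-rank parameter counts for one decoder layer, from model_config above.
hidden, inter, heads, kv_heads, tp = 4096, 14336, 32, 8, 4
head_dim = hidden // heads  # 128
bytes_per_param = 2         # bfloat16

qkv_proj     = (hidden * hidden + 2 * kv_heads * head_dim * hidden) // tp  # 6,291,456 params
o_proj       = (hidden * hidden) // tp                                     # 4,194,304 params
gate_up_proj = (2 * hidden * inter) // tp                                  # 29,360,128 params
down_proj    = (hidden * inter) // tp                                      # 14,680,064 params

for name, n in [("qkv_proj", qkv_proj), ("o_proj", o_proj),
                ("gate_up_proj", gate_up_proj), ("down_proj", down_proj)]:
    # ~12.58 MB, ~8.39 MB, ~58.72 MB, ~29.36 MB -- each a few hundred bytes
    # under the pointer sizes; the difference is the safetensors header.
    print(name, n * bytes_per_param)
```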
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10aec76847af0d3b72cbf894d8ef1629db51ddf0721f2e9a839d7294e94b25ba
+size 200
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44e7dc8301e6cac7b9cff6fa11dc95222e33c275101952f0fe8096d6e78927db
+size 200
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ced99e5e02b4b2df6734a354a1bb1eee1ee32e0f7d71c00c15ba50349c520bc
+size 200
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:62cc82e79b26a0311182c0a0bc0e565d89e7205950536ff511a4190dfe5b4cb2
+size 200
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2fbb9870ae118de4d8225df2d0111916f337e87f6ffb171e6e99e942a6f5e84
+size 8388848
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99382bce346eb0aa855de26305cc64d581c209306867ecfe665209e142724cd1
+size 8388848
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df614829f243bc8d2c4ba3deda49e8a9bab6ced8cd2e63c46b08fdae176ab6c6
+size 8388848
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a42aec6f5c07a392ebfc588f67c9b8713c3f1cbf1dad988c3bb7ae0ed72a4d47
+size 8388848
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8005058e37aa101f31327e80c1d3a2646a3303acab2e21f6e78c0f4f72f01495
+size 12583264
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef2b1fe9ec243434a30e0d2aef93794229ca86157146e79a8037b144494246b0
+size 12583272
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92f89cc57b8db928d5dd5b2baf9422acf4a6fcb5b13d7c57a63fb6891bd68bcf
+size 12583272
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3feca5b544559386a355f31d3658237169eb8c170523fdac2047c0aec838a8b6
+size 12583272
model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3e247f0ff9fef5db3801019d29cbab0e939f9811bc87948dbf2ffccf3c804c7
+size 8288
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5de65ac6d4af028cfa691029645ba2cdbd80e8717789505e5850c5978807256b
+size 29360368
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d8b4af82370734fa7f9c2a99958a732b8face2142d04ddd7d0b7321eb7af71
+size 29360368
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c30074512f3a8e912c0f3efd0546d3049d42f2781625ee29184de6b8d2f2b55
+size 29360368
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:111a16b5ed3e23b81976ccee35575e1cb2d7a17f675b9f448315cf813642b157
+size 29360368
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:374da23673af8dde22742199fb2c504849f54ff68978d13e856b3d66f7e1233b
+size 58720552
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afd78bf516b96f51ead6a14a5821cec58857c46c98c6ed1bc99924ca4bf9c67b
+size 58720560
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23e758214cc2fd7c534e5face960fe4123b37d76889d95eec0d548f3734a54ad
+size 58720560
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5d9e4777df66e6bd1affbdb44ed554e66913b0077b83d8551c18979fd5089e
+size 58720560
model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:915cdab464967593881825306bb37d3565c1463d54c596606f4756d8b4f3023b
+size 8288
model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0800aeaafa9b470cde1e420ebf24a853846755269f90f9b0a54316e4c0666ef9
+size 200
model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b37c0e48385b0aff19585ad996520319d194b68d8c5bef9a0255b35db6391f19
+size 200
model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f81fafc188236802bae75d2f0309ed3988f9d0c7f87a6d4689670be8b41cf8a0
+size 200
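
Note: the listing continues through all 32 decoder layers but is truncated here by the 50-file limit. To use these weights outside the dp=6/tp=4 layout, the four TP shards of each projection have to be concatenated back together. A rough sketch, assuming Megatron-style sharding (column-parallel qkv_proj/gate_up_proj split along dim 0, row-parallel o_proj/down_proj split along dim 1) and that each shard file holds a single tensor; verify both assumptions against nanotron's own conversion utilities before relying on them:

```python
import torch
from safetensors.torch import load_file

def merge_tp_shards(template: str, tp: int, dim: int) -> torch.Tensor:
    """Concatenate the tensor-parallel shards of one weight along `dim`.

    `template` is a path pattern with a `{rank}` placeholder, e.g.
    ".../o_proj/model_weight_pp-rank-0-of-1_tp-rank-{rank}-of-4.safetensors".
    Assumes each shard file contains exactly one tensor.
    """
    shards = []
    for rank in range(tp):
        state = load_file(template.format(rank=rank))
        shards.append(next(iter(state.values())))
    return torch.cat(shards, dim=dim)

# Assumed split axes (unverified):
#   qkv_proj, gate_up_proj -> column-parallel, concatenate along dim 0
#   o_proj, down_proj      -> row-parallel,    concatenate along dim 1
```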