{
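# parallelism settings: both set to 1, i.e. no pipeline or tensor (model) parallelism;
# any additional GPUs are used for data parallelism only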
"pipe_parallel_size": 1,
"model_parallel_size": 1,
"make_vocab_size_divisible_by": 1,
# model settings
"num_layers": 32,
"hidden_size": 4096,
"intermediate_size": 14336,
"num_attention_heads": 32,
"num_kv_heads": 8,
# Per Mistral, Mistral-7B-v0.1 was pretrained with an 8192 sequence length
# and instruction-tuned at a 16384 sequence length, both with a 4096 sliding window.
"seq_length": 8192,
"sliding_window_width": 4096,
"max_position_embeddings": 131072,
"pos_emb": "rotary",
"rotary_pct": 1,
"rotary_emb_base": 10000,
"no_weight_tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",
"norm": "rmsnorm",
"rms_norm_epsilon": 1.0e-5,
# Grouped-Query Attention is supported with both the default ("global") attention
# and Flash attention. We highly recommend Flash attention for FLOP and runtime
# speedups when using GQA; sliding window attention is currently supported
# only by Flash attention.
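# The setting below, [[["flash"], 32]], repeats the ["flash"] pattern for all 32 layers,
# i.e. Flash attention in every transformer layer.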
"attention_config": [[["flash"], 32]],
"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": false,
"use_bias_in_norms": false,
"use_bias_in_attn_linear": false,
"activation": "swiglu",
"tokenizer_type": "SPMTokenizer",
#"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download
}