{ "pipe_parallel_size": 1, "model_parallel_size": 1, "make_vocab_size_divisible_by": 1, # model settings "num_layers": 32, "hidden_size": 4096, "intermediate_size": 14336, "num_attention_heads": 32, "num_kv_heads": 8, # per Mistral, Mistral-7B-v0.1 was pretrained with 8192 seqlen # and instruction tuned to 16384 seqlen, all with 4096 sliding window "seq_length": 8192, "sliding_window_width": 4096, "max_position_embeddings": 131072, "pos_emb": "rotary", "rotary_pct": 1, "rotary_emb_base": 10000, "no_weight_tying": true, "gpt_j_residual": false, "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, # Grouped Query Attention is supported for both default ("global") # and Flash attention. However, we highly recommend the use of Flash attention # to get FLOP + runtime speedups when using GQA, # and sliding window attention is currently only supported by Flash attention. "attention_config": [[["flash"], 32]], "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, "activation": "swiglu", "tokenizer_type": "SPMTokenizer", #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download }