DEFAULT_IMAGE_PATCH_TOKEN = f"" DEFAULT_IM_START_TOKEN = f"" DEFAULT_IM_END_TOKEN = f"" DEFAULT_IM_COL_TOKEN = f"" IMAGE_PROMPT = "<|image|>" EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT) VIT_STANDARD_CONFIGS = { "dinov2-large": { "image_emb_dim": 1024, "image_mlp_dim": 4096, 'image_patch_size': 14, 'image_pos_patch_size': 14, 'image_num_layers': 24, 'image_num_heads': 16, 'image_num_key_value_heads': 16, 'image_head_dim': 64, 'image_mlp_activations': 'gelu', 'image_default_input_size': (224, 224), 'image_num_pos': 257, 'image_norm_eps': 1e-6, "image_model_type": "dino" }, "SigLIP-So400m-14-384": { "image_emb_dim": 1152, 'image_num_layers': 27, "image_mlp_dim": 4304, 'image_patch_size': 14, 'image_pos_patch_size': 14, 'image_num_heads': 16, 'image_num_key_value_heads': 16, 'image_head_dim': 72, 'image_mlp_activations': 'gelu', # Although it is called "384" that seems to be an error of the author's # part, it actually only handles 378 inputs 'image_default_input_size': (378, 378), 'image_num_pos': 729, # note not CLS token 'image_norm_eps': 1e-6, "image_model_type": "siglip", "resize_mode": "siglip" }, "DFN5B-CLIP-ViT-H-14-378": { "image_emb_dim": 1280, 'image_patch_size': 14, 'image_pos_patch_size': 14, 'image_num_layers': 32, 'image_num_heads': 16, 'image_num_key_value_heads': 16, 'image_head_dim': 80, 'image_mlp_dim': 5120, 'image_dropout_rate': 0.0, 'image_mlp_activations': 'quick_gelu', 'image_default_input_size': (378, 378), 'image_num_pos': 730, 'image_norm_eps': 1e-5, "image_model_type": "openai", "resize_mode": "no_aspect_ratio" }, 'ViT-L/14-336': { 'image_patch_size': 14, 'image_pos_patch_size': 14, 'image_emb_dim': 1024, 'image_num_heads': 16, 'image_num_layers': 23, 'image_head_dim': 64, 'image_mlp_dim': 4096, 'image_mlp_activations': 'quick_gelu', 'image_dropout_rate': 0.0, 'image_num_pos': 577, 'image_default_input_size': (336, 336), 'image_norm_eps': 1e-5, 'image_num_key_value_heads': 16, "image_model_type": "openai" }, 'EVA02-L-14-336': { 'image_patch_size': 14, 'image_pos_patch_size': 14, 'image_emb_dim': 1024, 'image_num_heads': 16, 'image_num_layers': 24, 'image_head_dim': 64, 'image_mlp_dim': 2730, 'image_mlp_activations': 'silu', 'image_dropout_rate': 0.0, 'image_num_pos': 577, 'image_default_input_size': (336, 336), 'image_norm_eps': 1e-6, 'image_num_key_value_heads': 16, "image_model_type": "eva" }, 'ViT-L/14': { 'image_patch_size': 14, 'image_pos_patch_size': 14, 'image_emb_dim': 1024, 'image_num_heads': 16, # Note the original model has 24 layers, but we don't use the last layer 'image_num_layers': 23, 'image_head_dim': 64, 'image_mlp_dim': 4096, 'image_mlp_activations': 'quick_gelu', 'image_dropout_rate': 0.0, 'image_num_pos': 257, 'image_default_input_size': (224, 224), 'image_norm_eps': 1e-5, 'image_num_key_value_heads': 16, "image_model_type": "openai" }, 'debug': { 'image_patch_size': 14, 'image_pos_patch_size': 14, 'image_emb_dim': 1024, 'image_num_heads': 16, 'image_num_layers': 1, 'image_head_dim': 64, 'image_mlp_dim': 1024, 'image_mlp_activations': 'quick_gelu', 'image_dropout_rate': 0.0, 'image_num_pos': 577, 'image_default_input_size': (336, 336), 'image_norm_eps': 1e-5, 'image_num_key_value_heads': 16, "image_model_type": "openai" } } OPEN_LLM_STANDARD_CONFIGS = { "qwen1.5_7b": { 'vocab_size': 151936, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'max_sequence_length': 2048, 
        "max_position_embeddings": 32768, "rope_theta": 1000000.0, "initializer_range": 0.02,
        "rms_norm_eps": 1e-6, "qkv_bias": True, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "hf-Qwen/Qwen1.5-7B",
    },
    "qwen1.5_14b": {
        "vocab_size": 152064, "hidden_size": 5120, "intermediate_size": 13696,
        "num_hidden_layers": 40, "num_attention_heads": 40, "num_key_value_heads": 40,
        "max_sequence_length": 2048, "max_position_embeddings": 32768, "rope_theta": 1000000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-6, "qkv_bias": True,
        "tie_word_embeddings": False, "hidden_act": "silu", "norm_module": "RMSNorm",
        "tokenizer": "hf-Qwen/Qwen1.5-14B",
    },
    "qwen1.5_32b": {
        "vocab_size": 152064, "hidden_size": 5120, "intermediate_size": 27392,
        "num_hidden_layers": 64, "num_attention_heads": 40, "num_key_value_heads": 8,
        "max_sequence_length": 2048, "max_position_embeddings": 32768, "rope_theta": 1000000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-6, "qkv_bias": True,
        "tie_word_embeddings": False, "hidden_act": "silu", "norm_module": "RMSNorm",
        "tokenizer": "hf-Qwen/Qwen1.5-32B",
    },
    "llama_7b": {
        "vocab_size": 32000, "hidden_size": 4096, "intermediate_size": 11008,
        "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 32,
        "max_sequence_length": 2048, "max_position_embeddings": 8192, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "llama",
    },
    "yi_6b": {
        "vocab_size": 64000, "hidden_size": 4096, "intermediate_size": 11008,
        "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 4,
        "max_sequence_length": 4096, "max_position_embeddings": 4096, "rope_theta": 5000000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "yi",
    },
    "yi_9b": {
        "vocab_size": 64000, "hidden_size": 4096, "intermediate_size": 11008,
        "num_hidden_layers": 48, "num_attention_heads": 32, "num_key_value_heads": 4,
        "max_sequence_length": 4096, "max_position_embeddings": 4096, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-6, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "yi",
    },
    "yi_34b": {
        "vocab_size": 64000, "hidden_size": 7168, "intermediate_size": 20480,
        "num_hidden_layers": 60, "num_attention_heads": 56, "num_key_value_heads": 8,
        "max_sequence_length": 4096, "max_position_embeddings": 4096, "rope_theta": 5000000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "yi",
    },
    "olmo_1b": {
        "vocab_size": 50304, "hidden_size": 2048, "intermediate_size": 8192,
        "num_hidden_layers": 16, "num_attention_heads": 16, "num_key_value_heads": 16,
        "max_sequence_length": 4096, "max_position_embeddings": 32768, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": True,
        "hidden_act": "silu", "norm_module": "OlmoLayerNorm", "tokenizer": "hf-allenai/OLMo-1B",
    },
    "olmo_7b": {
        "vocab_size": 50304, "hidden_size": 4096, "intermediate_size": 22016 // 2,
        "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 32,
        "max_sequence_length": 4096, "max_position_embeddings": 32768, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "OlmoLayerNorm", "tokenizer": "hf-allenai/OLMo-7B",
    },
    "olmo_1.7_7b": {
        "vocab_size": 50304, "hidden_size": 4096, "intermediate_size": 22016 // 2,
        "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 32,
        "max_sequence_length": 4096, "max_position_embeddings": 32768, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "qkv_clip": 8, "norm_module": "OlmoLayerNorm",
        "tokenizer": "hf-allenai/OLMo-1.7-7B",
    },
    "mistral_7b": {
        "vocab_size": 32000, "hidden_size": 4096, "intermediate_size": 14336,
        "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 8,
        "max_sequence_length": 4096, "max_position_embeddings": 32768, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "mistral",
    },
    "mistral0.3_7b": {
        "vocab_size": 32768, "hidden_size": 4096, "intermediate_size": 14336,
        "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 8,
        "max_sequence_length": 4096, "max_position_embeddings": 32768, "rope_theta": 1000000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "mistral0.3",
    },
    "mistral0.2_22b": {
        "vocab_size": 32000, "hidden_size": 6144, "intermediate_size": 16384,
        "num_hidden_layers": 56, "num_attention_heads": 48, "num_key_value_heads": 8,
        "max_sequence_length": 4096, "max_position_embeddings": 32768, "rope_theta": 1000000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "mistral",
    },
    "llama_13b": {
        "vocab_size": 32000, "hidden_size": 5120, "intermediate_size": 13824,
        "num_hidden_layers": 40, "num_attention_heads": 40, "num_key_value_heads": 40,
        "max_sequence_length": 2048, "max_position_embeddings": 8192, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "llama",
    },
    "llama_70b": {
        "vocab_size": 32000, "hidden_size": 8192, "intermediate_size": 28672,
        "num_hidden_layers": 80, "num_attention_heads": 64, "num_key_value_heads": 8,
        "max_sequence_length": 8192, "max_position_embeddings": 8192, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "tokenizer": "llama",
    },
    "llama_70bflash": {
        "vocab_size": 32000, "hidden_size": 8192, "intermediate_size": 28672,
        "num_hidden_layers": 80, "num_attention_heads": 64, "num_key_value_heads": 8,
        "max_sequence_length": 8192, "max_position_embeddings": 8192, "rope_theta": 10000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "scan_attention": True, "scan_mlp": True, "hidden_act": "silu", "tokenizer": "llama",
    },
    "llama3_8b": {
        "vocab_size": 128256, "hidden_size": 4096, "intermediate_size": 14336,
        "num_hidden_layers": 32, "num_attention_heads": 32, "num_key_value_heads": 8,
        "max_sequence_length": 8192, "max_position_embeddings": 8192, "rope_theta": 500000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm",
        "tokenizer": "hf-meta-llama/Meta-Llama-3-8B",
    },
    "llama3_70b": {
        "vocab_size": 128256, "hidden_size": 8192, "intermediate_size": 28672,
        "num_hidden_layers": 80, "num_attention_heads": 64, "num_key_value_heads": 8,
        "max_sequence_length": 8192, "max_position_embeddings": 8192, "rope_theta": 500000.0,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm",
        "tokenizer": "hf-meta-llama/Meta-Llama-3-70B",
    },
    "open_llama_3b": {
        "vocab_size": 32000, "hidden_size": 3200, "intermediate_size": 8640,
        "num_hidden_layers": 26, "num_attention_heads": 32, "max_sequence_length": 2048,
        "initializer_range": 0.02, "rms_norm_eps": 1e-6, "max_position_embeddings": 2048,
        "num_key_value_heads": 32, "rope_theta": 10000.0, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "llama",
    },
    "gemma_2b": {
        "vocab_size": 256000, "hidden_size": 2048, "intermediate_size": 16384,
        "num_hidden_layers": 18, "num_attention_heads": 8, "max_sequence_length": 8192,
        "initializer_range": 0.02, "rms_norm_eps": 1e-6, "max_position_embeddings": 8192,
        "num_key_value_heads": 1, "rope_theta": 10000.0, "tie_word_embeddings": True,
        "normalize_input_embeds": True, "norm_module": "GemmaRMSNorm", "hidden_act": "gelu",
        "tokenizer": "gemma",
    },
    "gemma_7b": {
        "vocab_size": 256000, "hidden_size": 3072, "intermediate_size": 24576,
        "num_hidden_layers": 28, "num_attention_heads": 16, "max_sequence_length": 8192,
        "initializer_range": 0.02, "rms_norm_eps": 1e-6, "max_position_embeddings": 8192,
        "num_key_value_heads": 16, "rope_theta": 10000.0, "tie_word_embeddings": True,
        "normalize_input_embeds": True, "norm_module": "GemmaRMSNorm", "hidden_act": "gelu",
        "tokenizer": "gemma",
    },
    "tiny_llama_1b": {
        "vocab_size": 32000, "hidden_size": 2048, "intermediate_size": 5632,
        "num_hidden_layers": 22, "num_attention_heads": 32, "max_sequence_length": 2048,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "max_position_embeddings": 2048,
        "num_key_value_heads": 4, "rope_theta": 10000.0, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "llama",
    },
    "debug": {  # A small model for debugging
        "vocab_size": 32000, "hidden_size": 512, "intermediate_size": 512,
        "num_hidden_layers": 1, "num_attention_heads": 8, "max_sequence_length": 4096,
        "initializer_range": 0.02, "rms_norm_eps": 1e-5, "max_position_embeddings": 4096,
        "num_key_value_heads": 8, "rope_theta": 10000.0, "tie_word_embeddings": False,
        "hidden_act": "silu", "norm_module": "RMSNorm", "tokenizer": "llama",
    },
    "gemma2_9b": {
        "vocab_size": 256000, "hidden_size": 3584, "head_dim": 256, "intermediate_size": 14336,
        "num_hidden_layers": 42, "num_attention_heads": 16, "max_sequence_length": 8192,
        "query_pre_attn_scalar": 224, "initializer_range": 0.02, "rms_norm_eps": 1e-6,
        "max_position_embeddings": 8192, "num_key_value_heads": 8, "rope_theta": 10000.0,
        "tie_word_embeddings": False, "normalize_input_embeds": True, "norm_module": "GemmaRMSNorm",
        "hidden_act": "gelu_tanh", "tokenizer": "hf-google/gemma-2-9b",
        "attn_logit_softcapping": 50.0, "final_logit_softcapping": 30.0,
    },
    "gemma2_27b": {
        "vocab_size": 256000, "hidden_size": 4608, "head_dim": 128, "intermediate_size": 36864,
        "num_hidden_layers": 46, "num_attention_heads": 32, "max_sequence_length": 8192,
        "query_pre_attn_scalar": 144, "initializer_range": 0.02, "rms_norm_eps": 1e-6,
        "max_position_embeddings": 8192, "num_key_value_heads": 16, "rope_theta": 10000.0,
        "tie_word_embeddings": False, "normalize_input_embeds": True, "norm_module": "GemmaRMSNorm",
        "hidden_act": "gelu_tanh", "tokenizer": "hf-google/gemma-2-27b",
        "attn_logit_softcapping": 50.0, "final_logit_softcapping": 30.0,
    },
}
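

# Usage sketch (illustrative, not part of the original module): shows one way the
# tables above might be consumed. `build_model_config` is a hypothetical helper,
# and the tokenizer registration uses HuggingFace `transformers` as an assumed
# extra dependency; only EXTRA_TOKENS and the *_STANDARD_CONFIGS dicts come from
# this file.
def build_model_config(vit_name: str, llm_name: str) -> dict:
    """Merge one vision-backbone config with one LLM config into a flat dict."""
    return {**VIT_STANDARD_CONFIGS[vit_name], **OPEN_LLM_STANDARD_CONFIGS[llm_name]}


if __name__ == "__main__":
    cfg = build_model_config("SigLIP-So400m-14-384", "qwen1.5_7b")
    # e.g. (378, 378) image crops feeding a 4096-wide decoder
    print(cfg["image_default_input_size"], cfg["hidden_size"])

    # Registering the image marker tokens with a HuggingFace tokenizer is one way
    # EXTRA_TOKENS could be used; the "hf-" prefix convention is taken from the
    # tokenizer entries above.
    from transformers import AutoTokenizer

    tokenizer_name = cfg["tokenizer"].removeprefix("hf-")  # -> "Qwen/Qwen1.5-7B"
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    tok.add_special_tokens({"additional_special_tokens": list(EXTRA_TOKENS)})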