Muennighoff committed
Commit 7251f46 • 1 Parent(s): def21c3

Fix

- config.json: +1 -1
- config_molmoe.py: +8 -5
- modeling_molmoe.py: +27 -20
config.json CHANGED
@@ -7,7 +7,7 @@
     "AutoModelForCausalLM": "modeling_molmoe.MOLMoEForCausalLM"
   },
   "clip_qkv": null,
-  "embedding_size":
+  "embedding_size": 50304,
   "hidden_size": 2048,
   "initializer_range": 0.02,
   "intermediate_size": 1024,
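
The change above sets "embedding_size" to 50304 alongside the existing "hidden_size" of 2048. A minimal sanity-check sketch, assuming config.json has been downloaded locally; the local path and the multiple-of-128 padding rationale are assumptions, not stated in the commit:

import json

with open("config.json") as f:
    cfg = json.load(f)

# 50304 = 393 * 128; embedding matrices are often padded to a multiple of 128
# for GPU throughput, so the usable vocabulary can be smaller than this value.
assert cfg["embedding_size"] == 50304
assert cfg["embedding_size"] % 128 == 0
print(cfg["embedding_size"], cfg["hidden_size"])  # 50304 2048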
config_molmoe.py CHANGED
@@ -5,19 +5,22 @@ from transformers import PretrainedConfig, AutoTokenizer
 
 def config_to_moe_args(config):
     from megablocks.layers.arguments import Arguments as MoEArgs
+    import torch.nn.functional as F
+
+    # import pdb; pdb.set_trace()
 
     kwargs = {
         "activation_fn": F.silu,
         "mlp_type": "glu" if "glu" in config.activation_type.lower() else "mlp",
         "mlp_impl": "sparse",
-        "hidden_size": config.
-        "ffn_hidden_size": config.
-        "moe_num_experts":
-        "num_layers": config.
+        "hidden_size": config.d_model,
+        "ffn_hidden_size": config.mlp_hidden_size,
+        "moe_num_experts": 64,
+        "num_layers": config.n_layers,
         # Handled by FSDP (https://github.com/databricks/megablocks/issues/57#issuecomment-1854594483)
         "moe_weight_parallelism": False,
         "moe_expert_model_parallelism": False,
-        "moe_top_k":
+        "moe_top_k": 8,
         # "moe_loss_weight": config.moe_loss_weight,
         # "device": config.init_device,
         # Handled by FSDP
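
The diff above adds the missing torch.nn.functional import and fills in concrete MoE arguments: hidden size from config.d_model, per-expert FFN width from config.mlp_hidden_size, layer count from config.n_layers, plus hard-coded 64 experts with top-8 routing. A minimal sketch of that mapping using a stand-in config (types.SimpleNamespace) so it runs without megablocks installed; the numeric values other than d_model=2048 are illustrative assumptions, not taken from the commit:

from types import SimpleNamespace

import torch.nn.functional as F

# Stand-in for FullMolmoeConfig; field names follow the fixed code,
# values other than d_model are illustrative assumptions.
config = SimpleNamespace(
    d_model=2048,
    mlp_hidden_size=1024,
    n_layers=16,
    activation_type="swiglu",
)

kwargs = {
    "activation_fn": F.silu,
    "mlp_type": "glu" if "glu" in config.activation_type.lower() else "mlp",
    "mlp_impl": "sparse",
    "hidden_size": config.d_model,
    "ffn_hidden_size": config.mlp_hidden_size,
    "moe_num_experts": 64,   # hard-coded in the commit
    "num_layers": config.n_layers,
    "moe_weight_parallelism": False,
    "moe_expert_model_parallelism": False,
    "moe_top_k": 8,          # hard-coded in the commit
}

# With megablocks installed, these kwargs are passed on to
# megablocks.layers.arguments.Arguments (imported as MoEArgs), and the
# resulting arguments object is handed to dMoE inside OLMoEBlock.
print(kwargs["mlp_type"], kwargs["moe_num_experts"], kwargs["moe_top_k"])  # glu 64 8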
modeling_molmoe.py CHANGED
@@ -235,14 +235,16 @@ class OLMoBlock(nn.Module):
             device=config.init_device
         )
 
-        # Feed-forward output projection.
-        self.ff_out = nn.Linear(
-            int(self.act.output_multiplier * self.hidden_size),
-            config.d_model,
-            bias=config.include_bias,
-            device=config.init_device,
-        )
-        self.ff_out._is_residual = True  # type: ignore
+
+        if self.config.block_type != "moe":
+            # Feed-forward output projection.
+            self.ff_out = nn.Linear(
+                int(self.act.output_multiplier * self.hidden_size),
+                config.d_model,
+                bias=config.include_bias,
+                device=config.init_device,
+            )
+            self.ff_out._is_residual = True  # type: ignore
 
         # Rotary embeddings.
         if self.config.rope:
@@ -423,7 +425,7 @@ class OLMoBlock(nn.Module):
             return OLMoSequentialBlock(layer_id, config, cache)
         elif config.block_type == "llama":
             return OLMoLlamaBlock(layer_id, config, cache)
-        elif config.block_type ==
+        elif config.block_type == "moe":
             return OLMoEBlock(layer_id, config, cache)
         else:
             raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
@@ -725,7 +727,7 @@ class OLMoEBlock(OLMoBlock):
     (plus another skip connection).
     """
 
-    def __init__(self, layer_id: int, config
+    def __init__(self, layer_id: int, config, cache: BufferCache):
         try:
             from megablocks.layers.dmoe import dMoE
             from megablocks.layers.moe import MoE
@@ -733,12 +735,12 @@ class OLMoEBlock(OLMoBlock):
             raise ImportError(
                 "To train MoEs, run `pip install git+https://github.com/Muennighoff/megablocks.git@olmoe`"
             )
-        from .
+        from .config_molmoe import config_to_moe_args
 
         super().__init__(layer_id, config, cache)
 
         self.moe_args = config_to_moe_args(config)
-        self.ffn = dMoE(self.moe_args)
+        self.ffn = dMoE(self.moe_args)
 
         self.attn_norm = LayerNorm.build(config)
         self.ff_norm = LayerNorm.build(config)
@@ -956,12 +958,14 @@ class VisionBackboneConfig:
     image_default_input_size: Tuple[int, int] = (336, 336)
     image_patch_size: int = 14
     image_pos_patch_size: int = 14
+    # image_emb_dim: int = 1024
     image_emb_dim: int = 1024
     image_num_heads: int = 16
     image_num_key_value_heads: int = 16
     image_num_layers: int = 24
     image_head_dim: int = 64
-    image_mlp_dim: int = 4096
+    # image_mlp_dim: int = 4096
+    image_mlp_dim: int = 2048
     image_mlp_activations: str = "gelu"
     image_dropout_rate: float = 0.0
     image_num_pos: int = 577
@@ -990,10 +994,10 @@ class FullMolmoeConfig:
     qkv_bias: bool = False
     clip_qkv: Optional[float] = None
     n_layers: int = 12
-    mlp_ratio: int =
+    mlp_ratio: int = 1
     mlp_hidden_size: Optional[int] = None
     activation_type: str = "swiglu"
-    block_type: str = "
+    block_type: str = "moe"
     block_group_size: int = 1
     alibi: bool = False
     alibi_bias_max: float = 8.0
@@ -1009,7 +1013,7 @@ class FullMolmoeConfig:
     attention_dropout: float = 0.1
     response_attention_dropout: float = 0.0
     multi_query_attention: Optional[bool] = None
-    attention_layer_norm: bool =
+    attention_layer_norm: bool = True
     residual_dropout: float = 0.1
     response_residual_dropout: float = 0.0
     embedding_dropout: float = 0.1
@@ -1651,6 +1655,9 @@ class OLMoVisionBackbone(nn.Module):
                 [MLP(mlp_config, input_dim), Residual(MLP(config, input_dim))]
             )
         elif config.image_projector == ImageProjectType.mlp:
+            #import pdb; pdb.set_trace()
+            #mlp_config.image_mlp_dim = 2048
+            mlp_config.mlp_hidden_size = 2048
             self.image_projector = MLP(mlp_config, input_dim)
         elif config.image_projector == ImageProjectType.linear:
             self.image_projector = nn.Linear(
@@ -2423,7 +2430,7 @@ class MOLMoEForCausalLM(PreTrainedModel):
     base_model_prefix = "model"
     _no_split_modules = ["OLMoBlock"]
 
-    def __init__(self, config: MolmoeConfig, model: Optional[
+    def __init__(self, config: MolmoeConfig, model: Optional[MOLMoE] = None, init_params: bool = False):
         super().__init__(config)
 
         if not model:
@@ -2447,8 +2454,8 @@ class MOLMoEForCausalLM(PreTrainedModel):
                 additional_vocab_size=128,
                 n_heads=config.num_attention_heads,
                 n_kv_heads=config.num_key_value_heads,
-                rope_theta=
-                layer_norm_eps=1e-
+                rope_theta=10000.0,
+                layer_norm_eps=1e-5,
                 layer_norm_type="rms",
                 pad_tokenizer=True,
                 vit_layers=[-2, -9],
@@ -2472,7 +2479,7 @@ class MOLMoEForCausalLM(PreTrainedModel):
                     initializer_range=0.02,
                 )
            )
-            self.model =
+            self.model = MOLMoE(full_config, init_params=init_params)
         else:
             self.model = model
 
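
Since config.json maps AutoModelForCausalLM to modeling_molmoe.MOLMoEForCausalLM, the repository loads through the Transformers auto classes with trust_remote_code. A hedged usage sketch; "<repo-id>" is a placeholder for this repository's Hub id, and building the MoE blocks additionally requires the megablocks fork named in the ImportError above:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "<repo-id>",             # placeholder, not taken from the commit
    trust_remote_code=True,  # the modeling code ships inside the repo
)
print(type(model).__name__)  # MOLMoEForCausalLM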