Matt committed
Commit a00b46d
Parent(s): 9b57314
Revert to Falcon naming
config.json CHANGED
@@ -6,12 +6,12 @@
   ],
   "attention_dropout": 0.0,
   "auto_map": {
-    "AutoConfig": "configuration_RW.RWConfig",
-    "AutoModel": "modeling_RW.RWModel",
-    "AutoModelForSequenceClassification": "modeling_RW.RWForSequenceClassification",
-    "AutoModelForTokenClassification": "modeling_RW.RWForTokenClassification",
-    "AutoModelForQuestionAnswering": "modeling_RW.RWForQuestionAnswering",
-    "AutoModelForCausalLM": "modeling_RW.RWForCausalLM"
+    "AutoConfig": "configuration_falcon.FalconConfig",
+    "AutoModel": "modeling_falcon.FalconModel",
+    "AutoModelForSequenceClassification": "modeling_falcon.FalconForSequenceClassification",
+    "AutoModelForTokenClassification": "modeling_falcon.FalconForTokenClassification",
+    "AutoModelForQuestionAnswering": "modeling_falcon.FalconForQuestionAnswering",
+    "AutoModelForCausalLM": "modeling_falcon.FalconForCausalLM"
   },
   "bias": false,
   "bos_token_id": 11,
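The `auto_map` entries are what `trust_remote_code` loading resolves against, so the renamed module and class names are the ones users actually reach through the Auto* classes. A minimal sketch of that path, assuming the `tiiuae/falcon-rw-1b` checkpoint listed in the modeling file below (any repo shipping this custom code behaves the same way):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# The auto_map above routes each Auto* class to the custom code in this repo;
# trust_remote_code=True is required because that code is downloaded and executed locally.
repo = "tiiuae/falcon-rw-1b"  # assumed checkpoint; substitute the repo that carries this config

config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)  # FalconConfig, resolved via the "AutoConfig" entry

model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)   # FalconForCausalLM, resolved via "AutoModelForCausalLM"
```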
configuration_RW.py → configuration_falcon.py RENAMED
@@ -25,7 +25,7 @@ FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }
 
 
-class RWConfig(PretrainedConfig):
+class FalconConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -80,10 +80,10 @@ class RWConfig(PretrainedConfig):
     Example:
 
     ```python
-    >>> from transformers import FalconModel, RWConfig
+    >>> from transformers import FalconModel, FalconConfig
 
     >>> # Initializing a small (2-layer) Falcon configuration
-    >>> configuration = RWConfig(num_hidden_layers=2)
+    >>> configuration = FalconConfig(num_hidden_layers=2)
 
     >>> # Initializing a model from the small configuration
     >>> model = FalconModel(configuration)
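At this point `FalconConfig` and `FalconModel` exist only in the repo's custom code (the docstring anticipates the classes landing in `transformers` itself), so the docstring example is easiest to reproduce through the Auto* classes. A rough sketch, again assuming the `tiiuae/falcon-rw-1b` checkpoint:

```python
from transformers import AutoConfig, AutoModel

repo = "tiiuae/falcon-rw-1b"  # assumed checkpoint; any repo shipping this custom code works

# Load the custom FalconConfig, then shrink it to the 2-layer toy size from the docstring.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
config.num_hidden_layers = 2

# Build a randomly initialized FalconModel from that config (no pretrained weights loaded).
model = AutoModel.from_config(config, trust_remote_code=True)
print(type(model).__name__)  # FalconModel
```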
modeling_RW.py → modeling_falcon.py RENAMED
@@ -32,7 +32,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
-from .configuration_RW import RWConfig
+from .configuration_falcon import FalconConfig
 
 
 logger = logging.get_logger(__name__)
@@ -46,7 +46,7 @@ FALCON_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "tiiuae/falcon-rw-1b",
 ]
 _CHECKPOINT_FOR_DOC = "Rocketknight1/falcon-rw-1b"
-_CONFIG_FOR_DOC = "RWConfig"
+_CONFIG_FOR_DOC = "FalconConfig"
 
 
 # NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during training, this means that there's one additional quantization to bfloat16 between the operations.
@@ -188,7 +188,7 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
 
 
 class FalconAttention(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
 
         self.hidden_size = config.hidden_size
@@ -396,7 +396,7 @@ class FalconAttention(nn.Module):
 
 
 class FalconMLP(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
 
@@ -412,7 +412,7 @@ class FalconMLP(nn.Module):
 
 
 class FalconDecoderLayer(nn.Module):
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -499,7 +499,7 @@ FALCON_START_DOCSTRING = r"""
     and behavior.
 
     Parameters:
-        config ([`RWConfig`]): Model configuration class with all the parameters of the model.
+        config ([`FalconConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
@@ -559,13 +559,13 @@ FALCON_INPUTS_DOCSTRING = r"""
 """
 
 
-class RWPreTrainedModel(PreTrainedModel):
+class FalconPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
     """
 
-    config_class = RWConfig
+    config_class = FalconConfig
     base_model_prefix = "transformer"
     supports_gradient_checkpointing = True
     _no_split_modules = ["FalconDecoderLayer"]
@@ -589,9 +589,9 @@ class RWPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
-    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->RWModel
+    # Copied from transformers.models.bloom.modeling_bloom.BloomPreTrainedModel._set_gradient_checkpointing with BloomModel->FalconModel
     def _set_gradient_checkpointing(self, module: nn.Module, value: bool = False):
-        if isinstance(module, RWModel):
+        if isinstance(module, FalconModel):
             module.gradient_checkpointing = value
 
     @staticmethod
@@ -635,8 +635,8 @@ class RWPreTrainedModel(PreTrainedModel):
     "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.",
     FALCON_START_DOCSTRING,
 )
-class RWModel(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconModel(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
 
         self.embed_dim = config.hidden_size
@@ -835,12 +835,12 @@ class RWModel(RWPreTrainedModel):
     "The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).",
     FALCON_START_DOCSTRING,
 )
-class RWForCausalLM(RWPreTrainedModel):
+class FalconForCausalLM(FalconPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
-    def __init__(self, config: RWConfig):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
         # Initialize weights and apply final processing
@@ -965,7 +965,7 @@ class RWForCausalLM(RWPreTrainedModel):
     """
     The Falcon Model transformer with a sequence classification head on top (linear layer).
 
-    [`RWForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-1) do.
 
     Since it does classification on the last token, it requires to know the position of the last token. If a
@@ -976,11 +976,11 @@ class RWForCausalLM(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForSequenceClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForSequenceClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False)
 
         # Initialize weights and apply final processing
@@ -1092,12 +1092,12 @@ class RWForSequenceClassification(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForTokenClassification(RWPreTrainedModel):
-    def __init__(self, config: RWConfig):
+class FalconForTokenClassification(FalconPreTrainedModel):
+    def __init__(self, config: FalconConfig):
         super().__init__(config)
         self.num_labels = config.num_labels
 
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         if getattr(config, "classifier_dropout", None) is not None:
             classifier_dropout = config.classifier_dropout
         elif getattr(config, "hidden_dropout", None) is not None:
@@ -1181,10 +1181,10 @@ class RWForTokenClassification(RWPreTrainedModel):
     """,
     FALCON_START_DOCSTRING,
 )
-class RWForQuestionAnswering(RWPreTrainedModel):
+class FalconForQuestionAnswering(FalconPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.transformer = RWModel(config)
+        self.transformer = FalconModel(config)
         self.qa_outputs = nn.Linear(config.hidden_size, 2)
 
         # Initialize weights and apply final processing
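One consequence of the renamed hierarchy: every task head (`FalconForCausalLM`, `FalconForSequenceClassification`, `FalconForTokenClassification`, `FalconForQuestionAnswering`) holds the shared `FalconModel` backbone as `self.transformer`, matching `base_model_prefix = "transformer"` on `FalconPreTrainedModel`, which is what lets the same backbone weights load under any of the heads. A small sketch of that structure, assuming the `tiiuae/falcon-rw-1b` checkpoint as before:

```python
from transformers import AutoModelForCausalLM

repo = "tiiuae/falcon-rw-1b"  # assumed checkpoint shipping this custom code

model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

# FalconForCausalLM wraps a FalconModel backbone plus an lm_head; base_model_prefix =
# "transformer" exposes that backbone as model.transformer (the classification and QA
# heads in this file follow the same pattern).
print(type(model).__name__)              # FalconForCausalLM
print(type(model.transformer).__name__)  # FalconModel
print(model.config_class.__name__)       # FalconConfig, via FalconPreTrainedModel.config_class
```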