from typing import Dict, List, Optional

from transformers.configuration_utils import PretrainedConfig


class ReneConfig(PretrainedConfig):
    r"""Configuration class for the Rene model.

    This is the configuration class to store the configuration of a [`ReneLMHeadModel`].
    It is used to instantiate a Rene model according to the specified arguments,
    defining the model architecture. Instantiating a configuration with the defaults will yield
    a configuration similar to that of the
    [cartesia-ai/Rene-v0.1-1.3b-pytorch](https://huggingface.co/cartesia-ai/Rene-v0.1-1.3b-pytorch) model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        n_layer (`int`, *optional*, defaults to 48):
            Number of architecture blocks.
        vocab_size (`int`, *optional*, defaults to 50280):
            Vocabulary size of the Rene model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`ReneModel`].
        ssm_cfg (`dict`, *optional*):
            Configuration parameters for the SSM layers.
        attn_layer_idx (`List[int]`, *optional*):
            Indices of the architecture blocks that should have attention layers.
        attn_cfg (`dict`, *optional*):
            Configuration parameters for the attention layers.
        mlp_layer_idx (`List[int]`, *optional*):
            Indices of the architecture blocks that should have MLP layers.
        mlp_cfg (`dict`, *optional*):
            Configuration parameters for the MLP layers.
        rms_norm (`bool`, *optional*, defaults to `True`):
            Whether to use RMSNorm (instead of LayerNorm).
        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
            Whether to keep residual values in fp32.
        pad_vocab_size_multiple (`int`, *optional*, defaults to 16):
            Pad the vocabulary size up to the next multiple of this value.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if
            the model has an output word embedding layer.
        pad_token_id (`int`, *optional*, defaults to 1):
            The id of the padding token.
        bos_token_id (`int`, *optional*):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 50279):
            The id of the "end-of-sequence" token.
    """

    model_type = "rene"

    def __init__(
        self,
        d_model: int = 2048,
        n_layer: int = 48,
        vocab_size: int = 50280,
        ssm_cfg: Optional[Dict] = None,
        attn_layer_idx: Optional[List[int]] = None,
        attn_cfg: Optional[Dict] = None,
        mlp_layer_idx: Optional[List[int]] = None,
        mlp_cfg: Optional[Dict] = None,
        rms_norm: bool = True,
        residual_in_fp32: bool = True,
        pad_vocab_size_multiple: int = 16,
        tie_word_embeddings: bool = True,
        pad_token_id: int = 1,
        bos_token_id: Optional[int] = None,
        eos_token_id: int = 50279,
        **kwargs,
    ):
        if ssm_cfg is None:
            ssm_cfg = {}
        if attn_layer_idx is None:
            attn_layer_idx = []
        if attn_cfg is None:
            attn_cfg = {}
        if mlp_layer_idx is None:
            mlp_layer_idx = []
        if mlp_cfg is None:
            mlp_cfg = {}

        self.d_model = d_model
        self.n_layer = n_layer
        self.vocab_size = vocab_size
        self.ssm_cfg = ssm_cfg
        self.attn_layer_idx = attn_layer_idx
        self.attn_cfg = attn_cfg
        self.mlp_layer_idx = mlp_layer_idx
        self.mlp_cfg = mlp_cfg
        self.rms_norm = rms_norm
        self.residual_in_fp32 = residual_in_fp32
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.tie_word_embeddings = tie_word_embeddings
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
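

# ---------------------------------------------------------------------------
# Usage sketch (not part of the upstream module): a minimal, illustrative
# round-trip of the configuration using the standard `PretrainedConfig` API.
# The attention layer indices and the `attn_cfg` contents below are
# hypothetical placeholders, not the released Rene-v0.1-1.3b-pytorch settings.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = ReneConfig(
        d_model=2048,
        n_layer=48,
        attn_layer_idx=[12, 24, 36],   # hypothetical: blocks that use attention
        attn_cfg={"num_heads": 16},    # hypothetical attention hyperparameters
    )

    # Serialize to config.json and load it back with the same class.
    config.save_pretrained("./rene-config")
    reloaded = ReneConfig.from_pretrained("./rene-config")
    assert reloaded.d_model == config.d_model
    assert reloaded.attn_layer_idx == [12, 24, 36]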