nvidia
/

Llama-3_1-Nemotron-51B-Instruct

Text Generation

Model card Files Files and versions Community

itlevy committed on Sep 26

Commit

e9d0db3

•

1 Parent(s): d311379

v4.46 support (#7)

- v4.46 support (5b585ec9a9cf91f67bd5696bc4df1090d08bd7fc)

Files changed (1) hide show

variable_cache.py +11 -9

variable_cache.py CHANGED Viewed

@@ -32,18 +32,20 @@ class VariableCache(Cache_4_44_2, Cache):
     The cache of each layer is allocated to the same gpu as the layer itself.
     """
-    def __init__(self,
-                 config: DeciLMConfig,
-                 max_batch_size: int,
-                 max_cache_len: int | None,
-                 device: torch.device | str | None = None,
-                 dtype: torch.dtype | None = None,
-                 **kwargs: Any,
-                 ):
         Cache_4_44_2.__init__(self)
         self.config = config
-        self.max_batch_size = max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype

     The cache of each layer is allocated to the same gpu as the layer itself.
     """
+    def __init__(
+            self,
+            config: DeciLMConfig,
+            batch_size: int = None,
+            max_cache_len: int = None,
+            device: torch.device = None,
+            dtype: torch.dtype = torch.float32,
+            max_batch_size: Optional[int] = None,
+            **kwargs: Any,
+    ) -> None:
         Cache_4_44_2.__init__(self)
         self.config = config
+        self.max_batch_size = batch_size or max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype