GradientGuru
committed on
Commit
•
a731bb0
1
Parent(s):
d1816c6
cache alibi_mask to accelerate training
Browse files- modeling_baichuan.py +9 -3
modeling_baichuan.py
CHANGED
@@ -249,7 +249,8 @@ class BaichuanModel(BaichuanPreTrainedModel):
|
|
249 |
self.gradient_checkpointing = config.gradient_checkpointing
|
250 |
self.post_init()
|
251 |
self.max_cache_pos = config.model_max_length
|
252 |
-
self.first_run = True
|
|
|
253 |
|
254 |
def get_input_embeddings(self):
|
255 |
return self.embed_tokens
|
@@ -306,8 +307,13 @@ class BaichuanModel(BaichuanPreTrainedModel):
|
|
306 |
if inputs_embeds is None:
|
307 |
inputs_embeds = self.embed_tokens(input_ids)
|
308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
|
310 |
-
alibi_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past)
|
311 |
if attention_mask is not None:
|
312 |
if len(attention_mask.shape) == 2:
|
313 |
expanded_mask = attention_mask.to(alibi_mask.dtype)
|
@@ -597,4 +603,4 @@ class BaichuanForCausalLM(BaichuanPreTrainedModel):
|
|
597 |
self.__class__.generate = PreTrainedModel.generate # disable stream
|
598 |
outputs = self.generate(input_ids, generation_config=generation_config)
|
599 |
response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
|
600 |
-
return response
|
|
|
249 |
self.gradient_checkpointing = config.gradient_checkpointing
|
250 |
self.post_init()
|
251 |
self.max_cache_pos = config.model_max_length
|
252 |
+
self.first_run = True
|
253 |
+
self.alibi_mask = None
|
254 |
|
255 |
def get_input_embeddings(self):
|
256 |
return self.embed_tokens
|
|
|
307 |
if inputs_embeds is None:
|
308 |
inputs_embeds = self.embed_tokens(input_ids)
|
309 |
|
310 |
+
if self.training:
|
311 |
+
if self.alibi_mask is None or self.alibi_mask.shape[-1] != seq_length_with_past:
|
312 |
+
self.alibi_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past)
|
313 |
+
alibi_mask = self.alibi_mask
|
314 |
+
else:
|
315 |
+
alibi_mask = self.get_alibi_mask(inputs_embeds, seq_length_with_past)
|
316 |
|
|
|
317 |
if attention_mask is not None:
|
318 |
if len(attention_mask.shape) == 2:
|
319 |
expanded_mask = attention_mask.to(alibi_mask.dtype)
|
|
|
603 |
self.__class__.generate = PreTrainedModel.generate # disable stream
|
604 |
outputs = self.generate(input_ids, generation_config=generation_config)
|
605 |
response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
|
606 |
+
return response
|