robinzixuan committed
Commit
3fa0828
1 Parent(s): b36e70f

Update modeling_opt.py

Files changed (1)
  1. modeling_opt.py +4 -3
modeling_opt.py CHANGED
@@ -32,6 +32,7 @@ from transformers.modeling_outputs import (
     QuestionAnsweringModelOutput,
     SequenceClassifierOutputWithPast,
 )
+
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_code_sample_docstrings,
@@ -259,10 +260,10 @@ class OPTAttention(nn.Module):
 
         # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
         if attn_weights.dtype == torch.float16:
-            attn_weights = nn.functional.softmax(
+            attn_weights = softmax_1(
                 attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
         else:
-            attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+            attn_weights = softmax_1(attn_weights, dim=-1)
 
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
@@ -489,7 +490,7 @@ class OPTOutEffHop(OPTAttention):
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-class OptFlashAttention2(OPTOutEffHop):
+class OptFlashAttention2(OPTAttention):
     """
     OPT flash attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.
     The only required change would be on the forward pass where it needs to correctly call the public API of flash
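For context, `softmax_1` is not defined in this hunk; it presumably lives elsewhere in `modeling_opt.py`. Assuming it denotes the "off-by-one" softmax used in outlier-efficiency work (a normalizer with an extra constant 1, equivalent to appending an implicit zero logit, so a row can assign less than full attention mass), a minimal sketch that is call-compatible with the two sites above might look like the following. The implementation details here are illustrative, not the repository's actual helper.

```python
import torch

def softmax_1(x: torch.Tensor, dim: int = -1, dtype=None) -> torch.Tensor:
    # Off-by-one softmax: exp(x_i) / (1 + sum_j exp(x_j)).
    # Equivalent to a regular softmax over [x, 0], keeping only the x entries,
    # so a row can assign (near-)zero total attention weight.
    if dtype is not None:
        x = x.to(dtype)
    # Fold the implicit zero logit into the max-shift for numerical stability:
    # after shifting by m, the implicit logit contributes exp(-m) to the denominator.
    m = x.max(dim=dim, keepdim=True).values.clamp(min=0)
    exp_x = torch.exp(x - m)
    return exp_x / (torch.exp(-m) + exp_x.sum(dim=dim, keepdim=True))

# Mirrors the fp16 call site in the diff: upcast, normalize, downcast.
attn = torch.randn(2, 8, dtype=torch.float16)
out = softmax_1(attn, dim=-1, dtype=torch.float32).to(torch.float16)
print(out.sum(-1))  # each row sums to strictly less than 1
```

Because both call sites in the diff pass only `dim` and `dtype`, a helper with this signature can replace `nn.functional.softmax` without touching the surrounding fp16-upcast logic.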