add softmax_in_fp32
- configuration_qwen.py +4 -2
- modeling_qwen.py +5 -1
configuration_qwen.py CHANGED
@@ -37,6 +37,7 @@ class QWenConfig(PretrainedConfig):
         tie_word_embeddings=False,
         use_cache_quantization=False,
         use_cache_kernel=False,
+        softmax_in_fp32=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -61,8 +62,9 @@ class QWenConfig(PretrainedConfig):
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
         self.no_bias = no_bias
-        self.use_cache_quantization=use_cache_quantization
-        self.use_cache_kernel=use_cache_kernel
+        self.use_cache_quantization = use_cache_quantization
+        self.use_cache_kernel = use_cache_kernel
+        self.softmax_in_fp32 = softmax_in_fp32
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs
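With the new config field in place, callers that load the remote code through transformers can opt in before instantiating the model. A minimal sketch, assuming a Qwen checkpoint that ships this configuration_qwen.py (the repo id below is a placeholder, substitute the checkpoint that carries this commit):

from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder checkpoint id; replace with the Qwen repo you actually use.
model_id = "Qwen/Qwen-7B"

# Load the remote config, enable fp32 softmax, then build the model from it.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.softmax_in_fp32 = True

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True,
)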
modeling_qwen.py CHANGED
@@ -280,6 +280,7 @@ class QWenAttention(nn.Module):
         self.register_buffer("logn_tensor", logn_tensor, persistent=False)
 
         self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
+        self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False
         self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
         self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
         cache_dtype = torch.float
@@ -346,7 +347,10 @@
         if attention_mask is not None:
             attn_weights = attn_weights + attention_mask
 
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        if self.softmax_in_fp32:
+            attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
+        else:
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
         attn_weights = attn_weights.type(query.dtype)
         attn_weights = self.attn_dropout(attn_weights)
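The else branch keeps the previous behaviour, so the flag is purely opt-in; the existing attn_weights.type(query.dtype) line then returns the probabilities to the model's working precision. As a standalone sketch of the pattern the patch introduces (the helper name and the test tensor are illustrative, not part of the repo), the softmax is run on float32 copies of the logits and the result is cast back:

import torch
import torch.nn.functional as F

def softmax_maybe_fp32(attn_weights: torch.Tensor, softmax_in_fp32: bool) -> torch.Tensor:
    # Mirror of the patched branch: optionally run softmax in float32,
    # then return probabilities in the original dtype.
    orig_dtype = attn_weights.dtype
    if softmax_in_fp32:
        probs = F.softmax(attn_weights.float(), dim=-1)
    else:
        probs = F.softmax(attn_weights, dim=-1)
    return probs.to(orig_dtype)

# Hypothetical low-precision attention scores with a wide value range,
# the case where doing the softmax in fp32 loses less precision.
scores = torch.randn(1, 2, 8, 8, dtype=torch.bfloat16) * 10
probs = softmax_maybe_fp32(scores, softmax_in_fp32=True)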