robinzixuan committed
Commit a9d6cc6
1 Parent(s): f794bc1

Upload modeling_opt.py

Files changed (1)
  1. modeling_opt.py +7 -14
modeling_opt.py CHANGED
@@ -195,8 +195,7 @@ class OPTAttention(nn.Module):
 
         if (self.head_dim * num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim}"
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -366,16 +365,14 @@ class OPTAttention(nn.Module):
 
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {
-                    (bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                 f" {attn_weights.size()}"
             )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
-                        attention_mask.size()}"
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -396,8 +393,7 @@ class OPTAttention(nn.Module):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {
-                        (self.num_heads,)}, but is"
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
@@ -431,8 +427,7 @@ class OPTAttention(nn.Module):
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {
-                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
@@ -1090,8 +1085,7 @@ class OPTDecoder(OPTPreTrainedModel):
                 batch_size, mask_seq_length, device=inputs_embeds.device)
         elif attention_mask.shape[1] != mask_seq_length:
             raise ValueError(
-                f"The provided attention mask has length {
-                    attention_mask.shape[1]}, but its length should be "
+                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
                 f"{mask_seq_length} (sum of the lengths of current and past inputs)"
             )
         causal_attention_mask = _prepare_4d_causal_attention_mask(
@@ -1123,8 +1117,7 @@ class OPTDecoder(OPTPreTrainedModel):
             if attn_mask is not None:
                 if attn_mask.size()[0] != (len(self.layers)):
                     raise ValueError(
-                        f"The `{mask_name}` should be specified for {
-                            len(self.layers)} layers, but it is for"
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                         f" {head_mask.size()[0]}."
                     )
 
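
Every hunk applies the same fix: f-string error messages whose replacement fields had been wrapped across physical lines are joined back onto a single line. Multi-line replacement fields inside single-quoted f-strings only parse on Python 3.12+ (PEP 701); earlier interpreters reject them with a SyntaxError, which is presumably the motivation for collapsing the messages here. A minimal before/after sketch of the pattern, using an illustrative variable rather than anything from modeling_opt.py:

value = -3

# Before (requires Python 3.12+, PEP 701): the replacement field spans
# two physical lines inside a single-quoted f-string, so older
# interpreters fail with "SyntaxError: unterminated string literal".
# msg = f"value should be non-negative (got: {
#     value})"

# After: the field stays on one physical line; adjacent string literals
# are still concatenated, so the final message text is unchanged.
msg = (
    f"value should be non-negative (got: {value})"
    f" and of type int."
)
print(msg)  # value should be non-negative (got: -3) and of type int.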