robinzixuan committed
Commit e5ee791 • 1 Parent(s): 42857ca
Update modeling_opt.py
Browse files: modeling_opt.py (+18 -18)

modeling_opt.py (CHANGED)
@@ -157,8 +157,8 @@ class OPTAttention(nn.Module):
 
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim}"
+                f'''embed_dim must be divisible by num_heads (got `embed_dim`: {
+                    self.embed_dim}'''
                 f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -236,16 +236,16 @@ class OPTAttention(nn.Module):
 
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {
-                    (bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f'''Attention weights should be of size {
+                    (bsz * self.num_heads, tgt_len, src_len)}, but is'''
                 f" {attn_weights.size()}"
             )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
-                        attention_mask.size()}"
+                    f'''Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
+                        attention_mask.size()}'''
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -266,8 +266,8 @@ class OPTAttention(nn.Module):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {
-                        (self.num_heads,)}, but is"
+                    f'''Head mask for a single layer should be of size {
+                        (self.num_heads,)}, but is'''
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(
@@ -333,8 +333,8 @@ class OPTOutEffHop(OPTAttention):
 
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim}"
+                f'''embed_dim must be divisible by num_heads (got `embed_dim`: {
+                    self.embed_dim}'''
                 f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -412,16 +412,16 @@ class OPTOutEffHop(OPTAttention):
 
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {
+                f'''Attention weights should be of size {
                     (bsz * self.num_heads, tgt_len, src_len)}, but is"
-                f" {attn_weights.size()}"
+                f" {attn_weights.size()}'''
             )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
-                        attention_mask.size()}"
+                    f'''Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
+                        attention_mask.size()}'''
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -442,8 +442,8 @@ class OPTOutEffHop(OPTAttention):
         if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {
-                        (self.num_heads,)}, but is"
+                    f'''Head mask for a single layer should be of size {
+                        (self.num_heads,)}, but is'''
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(
@@ -470,8 +470,8 @@ class OPTOutEffHop(OPTAttention):
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {
-                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f'''`attn_output` should be of size {
+                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is'''
                 f" {attn_output.size()}"
            )
 
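Note on the pattern in the "+" lines: the commit switches the line-wrapped error messages from double-quoted to triple-quoted f-strings. A likely motivation, not stated in the commit message, is that a line break inside a replacement field of a single- or double-quoted f-string is only accepted from Python 3.12 (PEP 701) onward, while a triple-quoted f-string accepts it on older interpreters as well. A minimal, self-contained sketch of the two forms follows; the values embed_dim, num_heads, and head_dim are illustrative only and are not taken from the model code.

# Hypothetical values chosen so the check fails; not from the OPT config.
embed_dim, num_heads = 768, 10
head_dim = embed_dim // num_heads

# Old ("-") form: the replacement field is split across lines inside a
# double-quoted f-string. This is a SyntaxError on Python < 3.12, which is
# presumably why it was changed:
#     f"embed_dim must be divisible by num_heads (got `embed_dim`: {
#         embed_dim}"
#     f" and `num_heads`: {num_heads})."

# New ("+") form: a triple-quoted f-string may contain the line break, so the
# same wrapped layout parses on older interpreters too.
if (head_dim * num_heads) != embed_dim:
    raise ValueError(
        f'''embed_dim must be divisible by num_heads (got `embed_dim`: {
            embed_dim}'''
        f" and `num_heads`: {num_heads})."
    )
# With the values above this raises:
# ValueError: embed_dim must be divisible by num_heads (got `embed_dim`: 768 and `num_heads`: 10).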