Commit c731c61
1 Parent(s): 25b69c5

linting

Files changed:
- app.py +51 -35
- layers/fc.py +3 -2
- layers/layer_norm.py +1 -1
- model_LA.py +64 -74
- model_LAV.py +70 -83
- utils/audio.py +46 -29
- utils/audio_params.py +23 -18
- utils/compute_args.py +28 -15
- utils/plot.py +1 -1
- utils/pred_func.py +1 -1
- utils/tokenize.py +14 -15
app.py
CHANGED
@@ -6,39 +6,47 @@ import torch
import numpy as np
from utils.audio import load_spectrograms
from utils.compute_args import compute_args
+from utils.tokenize import (
+    tokenize,
+    create_dict,
+    sent_to_ix,
+    cmumosei_2,
+    cmumosei_7,
+    pad_feature,
+)
from model_LA import Model_LA
import gradio as gr

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load model
+ckpts_path = "ckpt"
model_name = "Model_LA_e"
# Listing sorted checkpoints
+ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name, "best*")), reverse=True)

# Load original args
+args = torch.load(ckpts[0], map_location=torch.device(device))["args"]
args = compute_args(args)
pretrained_emb = np.load("train_glove.npy")
+token_to_ix = pickle.load(open("token_to_ix.pkl", "rb"))
+state_dict = torch.load(ckpts[0], map_location=torch.device(device))["state_dict"]

net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
net.load_state_dict(state_dict)

+
def inference(source_video, transcription):
    # data preprocessing
    # text
    def clean(w):
+        return (
+            re.sub(r"([.,'!?\"()*#:;])", "", w.lower())
+            .replace("-", " ")
+            .replace("/", " ")
+        )

+    s = [clean(w) for w in transcription.split() if clean(w) != ""]

    # Sound
    _, mel, mag = load_spectrograms(source_video)
@@ -55,32 +63,40 @@ def inference(source_video, transcription):
    print(f"Processed video shape from {mel.shape} to {V.shape}")

    net.train(False)
+    x = np.expand_dims(L, axis=0)
+    y = np.expand_dims(A, axis=0)
+    z = np.expand_dims(V, axis=0)
+    x, y, z = (
+        torch.from_numpy(x).to(device),
+        torch.from_numpy(y).to(device),
+        torch.from_numpy(z).float().to(device),
+    )
    pred = net(x, y, z).cpu().data.numpy()[0]
    # pred = np.exp(pred) / np.sum(np.exp(pred)) # softmax
+    label_to_ix = ["happy", "sad", "angry", "fear", "disgust", "surprise"]
    # result_dict = {label_to_ix[i]: float(pred[i]) for i in range(len(label_to_ix))}
+    result_dict = {label_to_ix[i]: float(pred[i]) > 0 for i in range(len(label_to_ix))}
    return result_dict


+title = "Emotion Recognition"
+description = ""
+
+examples = [
+    [
+        "examples/0h-zjBukYpk_2.mp4",
+        "NOW IM NOT EVEN GONNA SUGAR COAT THIS THIS MOVIE FRUSTRATED ME TO SUCH AN EXTREME EXTENT THAT I WAS LOUDLY EXCLAIMING WHY AT THE END OF THE FILM",
+    ],
+    ["examples/0h-zjBukYpk_19.mp4", "NOW OTHER PERFORMANCES ARE BORDERLINE OKAY"],
+    ["examples/03bSnISJMiM_1.mp4", "IT WAS REALLY GOOD "],
+    ["examples/03bSnISJMiM_5.mp4", "AND THEY SHOULDVE I GUESS "],
+]
+
+gr.Interface(
+    inference,
+    inputs=[gr.inputs.Video(type="avi", source="upload"), "text"],
+    outputs=["label"],
+    title=title,
+    description=description,
+    examples=examples,
+).launch(debug=True)
layers/fc.py
CHANGED
@@ -1,7 +1,8 @@
import torch.nn as nn

+
class FC(nn.Module):
+    def __init__(self, in_size, out_size, dropout_r=0.0, use_relu=True):
        super(FC, self).__init__()
        self.dropout_r = dropout_r
        self.use_relu = use_relu
@@ -27,7 +28,7 @@ class FC(nn.Module):


class MLP(nn.Module):
+    def __init__(self, in_size, mid_size, out_size, dropout_r=0.0, use_relu=True):
        super(MLP, self).__init__()

        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
layers/layer_norm.py
CHANGED
@@ -1,6 +1,7 @@
import torch.nn as nn
import torch

+
class LayerNorm(nn.Module):
    def __init__(self, size, eps=1e-6):
        super(LayerNorm, self).__init__()
@@ -13,4 +14,3 @@ class LayerNorm(nn.Module):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
-
model_LA.py
CHANGED
@@ -10,10 +10,8 @@ from layers.layer_norm import LayerNorm
# ---------- Masking sequence --------
# ------------------------------------
def make_mask(feature):
+    return (torch.sum(torch.abs(feature), dim=-1) == 0).unsqueeze(1).unsqueeze(2)
+

# ------------------------------
# ---------- Flattening --------
@@ -31,29 +29,23 @@ class AttFlat(nn.Module):
            mid_size=args.ff_size,
            out_size=flat_glimpse,
            dropout_r=args.dropout_r,
+            use_relu=True,
        )

        if self.merge:
            self.linear_merge = nn.Linear(
+                args.hidden_size * flat_glimpse, args.hidden_size * 2
            )

    def forward(self, x, x_mask):
        att = self.mlp(x)
        if x_mask is not None:
+            att = att.masked_fill(x_mask.squeeze(1).squeeze(1).unsqueeze(2), -1e9)
        att = F.softmax(att, dim=1)

        att_list = []
        for i in range(self.flat_glimpse):
+            att_list.append(torch.sum(att[:, :, i : i + 1] * x, dim=1))

        if self.merge:
            x_atted = torch.cat(att_list, dim=1)
@@ -63,10 +55,12 @@ class AttFlat(nn.Module):

        return torch.stack(att_list).transpose_(0, 1)

+
# ------------------------
# ---- Self Attention ----
# ------------------------

+
class SA(nn.Module):
    def __init__(self, args):
        super(SA, self).__init__()
@@ -81,13 +75,9 @@ class SA(nn.Module):
        self.norm2 = LayerNorm(args.hidden_size)

    def forward(self, y, y_mask):
+        y = self.norm1(y + self.dropout1(self.mhatt(y, y, y, y_mask)))

+        y = self.norm2(y + self.dropout2(self.ffn(y)))

        return y

@@ -96,6 +86,7 @@ class SA(nn.Module):
# ---- Self Guided Attention ----
# -------------------------------

+
class SGA(nn.Module):
    def __init__(self, args):
        super(SGA, self).__init__()
@@ -114,24 +105,20 @@ class SGA(nn.Module):
        self.norm3 = LayerNorm(args.hidden_size)

    def forward(self, x, y, x_mask, y_mask):
+        x = self.norm1(x + self.dropout1(self.mhatt1(v=x, k=x, q=x, mask=x_mask)))

+        x = self.norm2(x + self.dropout2(self.mhatt2(v=y, k=y, q=x, mask=y_mask)))

+        x = self.norm3(x + self.dropout3(self.ffn(x)))

        return x

+
# ------------------------------
# ---- Multi-Head Attention ----
# ------------------------------

+
class MHAtt(nn.Module):
    def __init__(self, args):
        super(MHAtt, self).__init__()
@@ -146,33 +133,45 @@ class MHAtt(nn.Module):

    def forward(self, v, k, q, mask):
        n_batches = q.size(0)
+        v = (
+            self.linear_v(v)
+            .view(
+                n_batches,
+                -1,
+                self.args.multi_head,
+                int(self.args.hidden_size / self.args.multi_head),
+            )
+            .transpose(1, 2)
+        )
+
+        k = (
+            self.linear_k(k)
+            .view(
+                n_batches,
+                -1,
+                self.args.multi_head,
+                int(self.args.hidden_size / self.args.multi_head),
+            )
+            .transpose(1, 2)
+        )
+
+        q = (
+            self.linear_q(q)
+            .view(
+                n_batches,
+                -1,
+                self.args.multi_head,
+                int(self.args.hidden_size / self.args.multi_head),
+            )
+            .transpose(1, 2)
+        )

        atted = self.att(v, k, q, mask)

+        atted = (
+            atted.transpose(1, 2)
+            .contiguous()
+            .view(n_batches, -1, self.args.hidden_size)
        )
        atted = self.linear_merge(atted)

@@ -181,9 +180,7 @@ class MHAtt(nn.Module):
    def att(self, value, key, query, mask):
        d_k = query.size(-1)

+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)
@@ -198,6 +195,7 @@ class MHAtt(nn.Module):
# ---- Feed Forward Nets ----
# ---------------------------

+
class FFN(nn.Module):
    def __init__(self, args):
        super(FFN, self).__init__()
@@ -207,12 +205,13 @@ class FFN(nn.Module):
            mid_size=args.ff_size,
            out_size=args.hidden_size,
            dropout_r=args.dropout_r,
+            use_relu=True,
        )

    def forward(self, x):
        return self.mlp(x)

+
# ---------------------------
# ---- FF + norm -----------
# ---------------------------
@@ -231,7 +230,6 @@ class FFAndNorm(nn.Module):
        return x


-
class Block(nn.Module):
    def __init__(self, args, i):
        super(Block, self).__init__()
@@ -239,7 +237,7 @@ class Block(nn.Module):
        self.sa1 = SA(args)
        self.sa3 = SGA(args)

+        self.last = i == args.layer - 1
        if not self.last:
            self.att_lang = AttFlat(args, args.lang_seq_len, merge=False)
            self.att_audio = AttFlat(args, args.audio_seq_len, merge=False)
@@ -261,8 +259,7 @@ class Block(nn.Module):
        ax = self.att_lang(x, x_mask)
        ay = self.att_audio(y, y_mask)

+        return self.norm_l(x + self.dropout(ax)), self.norm_i(y + self.dropout(ay))


class Model_LA(nn.Module):
@@ -273,8 +270,7 @@ class Model_LA(nn.Module):

        # LSTM
        self.embedding = nn.Embedding(
+            num_embeddings=vocab_size, embedding_dim=args.word_embed_size
        )

        # Loading the GloVe embedding weights
@@ -284,7 +280,7 @@ class Model_LA(nn.Module):
            input_size=args.word_embed_size,
            hidden_size=args.hidden_size,
            num_layers=1,
+            batch_first=True,
        )

        # self.lstm_y = nn.LSTM(
@@ -301,7 +297,7 @@ class Model_LA(nn.Module):
        self.enc_list = nn.ModuleList([Block(args, i) for i in range(args.layer)])

        # Flattenting features before proj
+        self.attflat_img = AttFlat(args, 1, merge=True)
        self.attflat_lang = AttFlat(args, 1, merge=True)

        # Classification layers
@@ -325,19 +321,13 @@ class Model_LA(nn.Module):
            x_m, x_y = x_mask, y_mask
            x, y = dec(x, x_m, y, x_y)

+        x = self.attflat_lang(x, None)

+        y = self.attflat_img(y, None)

        # Classification layers
        proj_feat = x + y
        proj_feat = self.proj_norm(proj_feat)
        ans = self.proj(proj_feat)

+        return ans
model_LAV.py
CHANGED
@@ -10,10 +10,8 @@ from layers.layer_norm import LayerNorm
# ---------- Masking sequence --------
# ------------------------------------
def make_mask(feature):
+    return (torch.sum(torch.abs(feature), dim=-1) == 0).unsqueeze(1).unsqueeze(2)
+

# ------------------------------
# ---------- Flattening --------
@@ -31,29 +29,23 @@ class AttFlat(nn.Module):
            mid_size=args.ff_size,
            out_size=flat_glimpse,
            dropout_r=args.dropout_r,
+            use_relu=True,
        )

        if self.merge:
            self.linear_merge = nn.Linear(
+                args.hidden_size * flat_glimpse, args.hidden_size * 2
            )

    def forward(self, x, x_mask):
        att = self.mlp(x)
        if x_mask is not None:
+            att = att.masked_fill(x_mask.squeeze(1).squeeze(1).unsqueeze(2), -1e9)
        att = F.softmax(att, dim=1)

        att_list = []
        for i in range(self.flat_glimpse):
+            att_list.append(torch.sum(att[:, :, i : i + 1] * x, dim=1))

        if self.merge:
            x_atted = torch.cat(att_list, dim=1)
@@ -63,10 +55,12 @@ class AttFlat(nn.Module):

        return torch.stack(att_list).transpose_(0, 1)

+
# ------------------------
# ---- Self Attention ----
# ------------------------

+
class SA(nn.Module):
    def __init__(self, args):
        super(SA, self).__init__()
@@ -81,13 +75,9 @@ class SA(nn.Module):
        self.norm2 = LayerNorm(args.hidden_size)

    def forward(self, y, y_mask):
+        y = self.norm1(y + self.dropout1(self.mhatt(y, y, y, y_mask)))

+        y = self.norm2(y + self.dropout2(self.ffn(y)))

        return y

@@ -96,6 +86,7 @@ class SA(nn.Module):
# ---- Self Guided Attention ----
# -------------------------------

+
class SGA(nn.Module):
    def __init__(self, args):
        super(SGA, self).__init__()
@@ -114,24 +105,20 @@ class SGA(nn.Module):
        self.norm3 = LayerNorm(args.hidden_size)

    def forward(self, x, y, x_mask, y_mask):
+        x = self.norm1(x + self.dropout1(self.mhatt1(v=x, k=x, q=x, mask=x_mask)))

+        x = self.norm2(x + self.dropout2(self.mhatt2(v=y, k=y, q=x, mask=y_mask)))

+        x = self.norm3(x + self.dropout3(self.ffn(x)))

        return x

+
# ------------------------------
# ---- Multi-Head Attention ----
# ------------------------------

+
class MHAtt(nn.Module):
    def __init__(self, args):
        super(MHAtt, self).__init__()
@@ -146,33 +133,45 @@ class MHAtt(nn.Module):

    def forward(self, v, k, q, mask):
        n_batches = q.size(0)
+        v = (
+            self.linear_v(v)
+            .view(
+                n_batches,
+                -1,
+                self.args.multi_head,
+                int(self.args.hidden_size / self.args.multi_head),
+            )
+            .transpose(1, 2)
+        )
+
+        k = (
+            self.linear_k(k)
+            .view(
+                n_batches,
+                -1,
+                self.args.multi_head,
+                int(self.args.hidden_size / self.args.multi_head),
+            )
+            .transpose(1, 2)
+        )
+
+        q = (
+            self.linear_q(q)
+            .view(
+                n_batches,
+                -1,
+                self.args.multi_head,
+                int(self.args.hidden_size / self.args.multi_head),
+            )
+            .transpose(1, 2)
+        )

        atted = self.att(v, k, q, mask)

+        atted = (
+            atted.transpose(1, 2)
+            .contiguous()
+            .view(n_batches, -1, self.args.hidden_size)
        )
        atted = self.linear_merge(atted)

@@ -181,9 +180,7 @@ class MHAtt(nn.Module):
    def att(self, value, key, query, mask):
        d_k = query.size(-1)

+        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)
@@ -198,6 +195,7 @@ class MHAtt(nn.Module):
# ---- Feed Forward Nets ----
# ---------------------------

+
class FFN(nn.Module):
    def __init__(self, args):
        super(FFN, self).__init__()
@@ -207,12 +205,13 @@ class FFN(nn.Module):
            mid_size=args.ff_size,
            out_size=args.hidden_size,
            dropout_r=args.dropout_r,
+            use_relu=True,
        )

    def forward(self, x):
        return self.mlp(x)

+
# ---------------------------
# ---- FF + norm -----------
# ---------------------------
@@ -231,7 +230,6 @@ class FFAndNorm(nn.Module):
        return x


-
class Block(nn.Module):
    def __init__(self, args, i):
        super(Block, self).__init__()
@@ -240,7 +238,7 @@ class Block(nn.Module):
        self.sa2 = SGA(args)
        self.sa3 = SGA(args)

+        self.last = i == args.layer - 1
        if not self.last:
            self.att_lang = AttFlat(args, args.lang_seq_len, merge=False)
            self.att_audio = AttFlat(args, args.audio_seq_len, merge=False)
@@ -267,10 +265,11 @@ class Block(nn.Module):
        ay = self.att_audio(y, y_mask)
        az = self.att_vid(z, y_mask)

+        return (
+            self.norm_l(x + self.dropout(ax)),
+            self.norm_a(y + self.dropout(ay)),
+            self.norm_v(z + self.dropout(az)),
+        )


class Model_LAV(nn.Module):
@@ -281,8 +280,7 @@ class Model_LAV(nn.Module):

        # LSTM
        self.embedding = nn.Embedding(
+            num_embeddings=vocab_size, embedding_dim=args.word_embed_size
        )

        # Loading the GloVe embedding weights
@@ -292,7 +290,7 @@ class Model_LAV(nn.Module):
            input_size=args.word_embed_size,
            hidden_size=args.hidden_size,
            num_layers=1,
+            batch_first=True,
        )

        # self.lstm_y = nn.LSTM(
@@ -310,8 +308,8 @@ class Model_LAV(nn.Module):
        self.enc_list = nn.ModuleList([Block(args, i) for i in range(args.layer)])

        # Flattenting features before proj
+        self.attflat_ac = AttFlat(args, 1, merge=True)
+        self.attflat_vid = AttFlat(args, 1, merge=True)
        self.attflat_lang = AttFlat(args, 1, merge=True)

        # Classification layers
@@ -329,7 +327,6 @@ class Model_LAV(nn.Module):
        y_mask = make_mask(y)
        z_mask = make_mask(z)

-
        embedding = self.embedding(x)

        x, _ = self.lstm_x(embedding)
@@ -343,25 +340,15 @@ class Model_LAV(nn.Module):
            x_m, y_m, z_m = x_mask, y_mask, z_mask
            x, y, z = dec(x, x_m, y, y_m, z, z_m)

+        x = self.attflat_lang(x, None)

+        y = self.attflat_ac(y, None)

+        z = self.attflat_vid(z, None)

        # Classification layers
        proj_feat = x + y + z
        proj_feat = self.proj_norm(proj_feat)
        ans = self.proj(proj_feat)

+        return ans
utils/audio.py
CHANGED
@@ -1,24 +1,26 @@
# -*- coding: utf-8 -*-
+# /usr/bin/python2
+"""
By kyubyong park. [email protected].
https://www.github.com/kyubyong/dc_tts
+"""
from __future__ import print_function, division

import numpy as np
import librosa
import os, copy
import matplotlib
+
+matplotlib.use("pdf")
import matplotlib.pyplot as plt
from scipy import signal

from .audio_params import Hyperparams as hp
import tensorflow as tf

+
def get_spectrograms(fpath):
+    """Parse the wave file in `fpath` and
    Returns normalized melspectrogram and linear spectrogram.

    Args:
@@ -27,7 +29,7 @@ def get_spectrograms(fpath):
    Returns:
      mel: A 2d array of shape (T, n_mels) and dtype of float32.
      mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32.
+    """
    # Loading sound file
    y, sr = librosa.load(fpath, sr=hp.sr)

@@ -38,10 +40,9 @@ def get_spectrograms(fpath):
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

    # stft
+    linear = librosa.stft(
+        y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length
+    )

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)
@@ -64,15 +65,16 @@ def get_spectrograms(fpath):

    return mel, mag

+
def spectrogram2wav(mag):
+    """# Generate wave file from linear magnitude spectrogram

    Args:
      mag: A numpy array of (T, 1+n_fft//2)

    Returns:
      wav: A 1-D numpy array.
+    """
    # transpose
    mag = mag.T

@@ -83,7 +85,7 @@ def spectrogram2wav(mag):
    mag = np.power(10.0, mag * 0.05)

    # wav reconstruction
+    wav = griffin_lim(mag ** hp.power)

    # de-preemphasis
    wav = signal.lfilter([1], [1, -hp.preemphasis], wav)
@@ -93,8 +95,9 @@ def spectrogram2wav(mag):

    return wav.astype(np.float32)

+
def griffin_lim(spectrogram):
+    """Applies Griffin-Lim's raw."""
    X_best = copy.deepcopy(spectrogram)
    for i in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
@@ -106,12 +109,16 @@ def griffin_lim(spectrogram):

    return y

+
def invert_spectrogram(spectrogram):
+    """Applies inverse fft.
    Args:
      spectrogram: [1+n_fft//2, t]
+    """
+    return librosa.istft(
+        spectrogram, hp.hop_length, win_length=hp.win_length, window="hann"
+    )
+

def plot_alignment(alignment, gs, dir=hp.logdir):
    """Plots the alignment.
@@ -121,32 +128,43 @@ def plot_alignment(alignment, gs, dir=hp.logdir):
      gs: (int) global step.
      dir: Output path.
    """
+    if not os.path.exists(dir):
+        os.mkdir(dir)

    fig, ax = plt.subplots()
    im = ax.imshow(alignment)

    fig.colorbar(im)
+    plt.title("{} Steps".format(gs))
+    plt.savefig("{}/alignment_{}.png".format(dir, gs), format="png")
    plt.close(fig)

+
def guided_attention(g=0.2):
+    """Guided attention. Refer to page 3 on the paper."""
    W = np.zeros((hp.max_N, hp.max_T), dtype=np.float32)
    for n_pos in range(W.shape[0]):
        for t_pos in range(W.shape[1]):
+            W[n_pos, t_pos] = 1 - np.exp(
+                -((t_pos / float(hp.max_T) - n_pos / float(hp.max_N)) ** 2)
+                / (2 * g * g)
+            )
    return W

+
+def learning_rate_decay(init_lr, global_step, warmup_steps=4000.0):
+    """Noam scheme from tensor2tensor"""
    step = tf.to_float(global_step + 1)
+    return (
+        init_lr
+        * warmup_steps ** 0.5
+        * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)
+    )
+

def load_spectrograms(fpath):
+    """Read the wave file in `fpath`
+    and extracts spectrograms"""

    fname = os.path.basename(fpath)
    mel, mag = get_spectrograms(fpath)
@@ -158,6 +176,5 @@ def load_spectrograms(fpath):
    mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant")

    # Reduction
+    mel = mel[:: hp.r, :]
    return fname, mel, mag
-
utils/audio_params.py
CHANGED
@@ -1,14 +1,19 @@
# -*- coding: utf-8 -*-
+# /usr/bin/python2
+"""
By kyubyong park. [email protected].
https://www.github.com/kyubyong/dc_tts
+"""
+
+
class Hyperparams:
+    """Hyper parameters"""
+
    # pipeline
+    prepro = (
+        True  # if True, run `python prepro.py` first before running `python train.py`.
+    )
+
    # signal processing
    sr = 22050  # Sampling rate.
    n_fft = 2048  # fft points (samples)
@@ -19,29 +24,29 @@ class Hyperparams:
    n_mels = 80  # Number of Mel banks to generate
    power = 1.5  # Exponent for amplifying the predicted magnitude
    n_iter = 50  # Number of inversion iterations
+    preemphasis = 0.97
    max_db = 100
    ref_db = 20

    # Model
+    r = 4  # Reduction factor. Do not change this.
    dropout_rate = 0.05
+    e = 128  # == embedding
+    d = 256  # == hidden units of Text2Mel
+    c = 512  # == hidden units of SSRN
    attention_win_size = 3

    # data
    data = "/data/private/voice/LJSpeech-1.0"
    # data = "/data/private/voice/kate"
+    test_data = "harvard_sentences.txt"
+    vocab = "PE abcdefghijklmnopqrstuvwxyz'.?"  # P: Padding, E: EOS.
+    max_N = 180  # Maximum number of characters.
+    max_T = 210  # Maximum number of mel frames.

    # training scheme
+    lr = 0.001  # Initial learning rate.
    logdir = "logdir/LJ01"
+    sampledir = "samples"
+    B = 32  # batch size
    num_iterations = 2000000
utils/compute_args.py
CHANGED
@@ -3,26 +3,39 @@ import torch

def compute_args(args):
    # DataLoader
+    if not hasattr(args, "dataset"):  # fix for previous version
+        args.dataset = "MOSEI"

+    if args.dataset == "MOSEI":
+        args.dataloader = "Mosei_Dataset"
+    if args.dataset == "MELD":
+        args.dataloader = "Meld_Dataset"

    # Loss function to use
+    if args.dataset == "MOSEI" and args.task == "sentiment":
+        args.loss_fn = torch.nn.CrossEntropyLoss(reduction="sum")
+    if args.dataset == "MOSEI" and args.task == "emotion":
+        args.loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum")
+    if args.dataset == "MELD":
+        args.loss_fn = torch.nn.CrossEntropyLoss(reduction="sum")

    # Answer size
+    if args.dataset == "MOSEI" and args.task == "sentiment":
+        args.ans_size = 7
+    if args.dataset == "MOSEI" and args.task == "sentiment" and args.task_binary:
+        args.ans_size = 2
+    if args.dataset == "MOSEI" and args.task == "emotion":
+        args.ans_size = 6
+    if args.dataset == "MELD" and args.task == "emotion":
+        args.ans_size = 7
+    if args.dataset == "MELD" and args.task == "sentiment":
+        args.ans_size = 3

+    if args.dataset == "MOSEI":
+        args.pred_func = "amax"
+    if args.dataset == "MOSEI" and args.task == "emotion":
+        args.pred_func = "multi_label"
+    if args.dataset == "MELD":
+        args.pred_func = "amax"

    return args
utils/plot.py
CHANGED
@@ -10,4 +10,4 @@
# maxfreq = n.max()
# # Set a clean upper y-axis limit.
# plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
-# plt.show()
+# plt.show()
utils/pred_func.py
CHANGED
@@ -6,4 +6,4 @@ def amax(x):


def multi_label(x):
+    return x > 0
utils/tokenize.py
CHANGED
@@ -6,38 +6,37 @@ import numpy as np
import os
import pickle

+
def clean(w):
+    return (
+        re.sub(r"([.,'!?\"()*#:;])", "", w.lower()).replace("-", " ").replace("/", " ")
+    )


def tokenize(key_to_word):
    key_to_sentence = {}
    for k, v in key_to_word.items():
+        key_to_sentence[k] = [clean(w) for w in v if clean(w) != ""]
    return key_to_sentence


def create_dict(key_to_sentence, dataroot, use_glove=True):
+    token_file = dataroot + "/token_to_ix.pkl"
+    glove_file = dataroot + "/train_glove.npy"
    if os.path.exists(glove_file) and os.path.exists(token_file):
        print("Loading train language files")
        return pickle.load(open(token_file, "rb")), np.load(glove_file)

    print("Creating train language files")
    token_to_ix = {
+        "UNK": 1,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
+        pretrained_emb.append(spacy_tool("UNK").vector)

    for k, v in key_to_sentence.items():
        for word in v:
@@ -51,6 +50,7 @@ def create_dict(key_to_sentence, dataroot, use_glove=True):
    pickle.dump(token_to_ix, open(token_file, "wb"))
    return token_to_ix, pretrained_emb

+
def sent_to_ix(s, token_to_ix, max_token=100):
    ques_ix = np.zeros(max_token, np.int64)

@@ -58,7 +58,7 @@ def sent_to_ix(s, token_to_ix, max_token=100):
        if word in token_to_ix:
            ques_ix[ix] = token_to_ix[word]
        else:
+            ques_ix[ix] = token_to_ix["UNK"]

        if ix + 1 == max_token:
            break
@@ -83,21 +83,20 @@ def cmumosei_7(a):
        res = 6
    return res

+
def cmumosei_2(a):
    if a < 0:
        return 0
    if a >= 0:
        return 1

+
def pad_feature(feat, max_len):
    if feat.shape[0] > max_len:
        feat = feat[:max_len]

    feat = np.pad(
+        feat, ((0, max_len - feat.shape[0]), (0, 0)), mode="constant", constant_values=0
    )

    return feat