Commit bfa3116 (parent: 7b2f6b4)
ptrdvn committed: Create training.py

Files changed (1): training.py +152 -0
training.py ADDED
@@ -0,0 +1,152 @@
from datasets import load_dataset
import pandas as pd
import fasttext
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
import requests
import gzip
import shutil
import os
from safetensors.torch import save_model
import matplotlib.pyplot as plt

# Small MLP regressor over 300-dim fastText sentence embeddings
class FasttextEmbedRegressor(nn.Module):
    def __init__(self, input_size=300):
        super(FasttextEmbedRegressor, self).__init__()
        layer_1_size = 64
        layer_2_size = 32
        self.fc1 = nn.Linear(input_size, layer_1_size)
        self.fc2 = nn.Linear(layer_1_size, layer_2_size)
        self.fc3 = nn.Linear(layer_2_size, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train_regressor(X_train, X_test, y_train, y_test, train_epochs):
    # Initialize the model, loss function, and optimizer
    input_size = X_train.shape[1]
    model = FasttextEmbedRegressor(input_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    batch_size = 32

    training_metrics = []

    for epoch in trange(train_epochs):
        model.train()
        train_losses = []
        for step_num, i in enumerate(trange(0, X_train.shape[0], batch_size)):
            vectors = torch.Tensor(X_train[i:i + batch_size])
            targets = torch.Tensor(y_train[i:i + batch_size])
            optimizer.zero_grad()
            # Squeeze only the last dim so a final batch of size 1 keeps its batch dim
            outputs = model(vectors).squeeze(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(float(loss))
            # Every 10 steps, log the mean train loss and the full-test-set MSE
            if step_num % 10 == 0:
                model.eval()
                test_preds = model(torch.Tensor(X_test)).detach().numpy()
                test_mse = mean_squared_error(y_test, test_preds)
                training_metrics.append({
                    "epoch": epoch,
                    "step_num": step_num,
                    "i": i,
                    "test_mse": test_mse,
                    "train_loss": sum(train_losses) / len(train_losses),
                })
                train_losses = []
                model.train()

    return pd.DataFrame(training_metrics), model

def download_file(url, filename):
    """
    Helper handling the download of large files from `url` to `filename`.
    Returns `filename`.
    """
    chunk_size = 1024
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        pbar = tqdm(unit="B", total=int(r.headers['Content-Length']))
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:  # filter out keep-alive new chunks
                pbar.update(len(chunk))
                f.write(chunk)
    return filename

get_filename = lambda x: f"cc.{x}.300.bin"

def download_fasttext_vectors(lang_code):
    filename = get_filename(lang_code)

    if os.path.isfile(filename):
        return None

    print(f"Downloading {lang_code} vectors")
    download_file(f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{filename}.gz", f"{filename}.gz")

    print(f"Unzipping {lang_code} vectors")
    with gzip.open(f"{filename}.gz", 'rb') as f_in:
        with open(filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    print(f"Removing zipped {lang_code} vectors")
    os.remove(f"{filename}.gz")

    return True

def create_quality_eval_model(lang_code, train_epochs=10):
    download_fasttext_vectors(lang_code)

    # Ratings come in [0, 100]; scale them to [0, 1] for regression
    dataset = load_dataset("lightblue/text_ratings", lang_code, split="train")
    text_list = dataset["selected_chunk"]
    label_float = [x / 100 for x in dataset["rating_float"]]

    fasttext_model = fasttext.load_model(f"cc.{lang_code}.300.bin")

    # fastText sentence vectors expect single-line input, so strip newlines
    embeddings = np.stack([fasttext_model.get_sentence_vector(
        x.replace("\n", " ")
    ) for x in tqdm(text_list)])

    X_train, X_test, y_train, y_test, text_train, text_test = train_test_split(
        embeddings,
        label_float,
        text_list,
        test_size=0.2,
        random_state=42
    )

    metrics_df, model = train_regressor(X_train, X_test, y_train, y_test, train_epochs)

    test_df = pd.DataFrame({
        "text": text_test,
        "gold_score": y_test,
        "pred_score": model(torch.Tensor(X_test)).detach().numpy().flatten()
    })

    save_model(model, f"{lang_code}.safetensors")

    # Delete the large downloaded vectors once the regressor is saved
    os.remove(get_filename(lang_code))

    return metrics_df, test_df

if __name__ == '__main__':
    langs = ['am', 'ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'gu', 'ha', 'hi', 'hu', 'id', 'it', 'ja', 'jv', 'kn', 'ko', 'lt', 'mr', 'nl', 'no', 'yo', 'zh']

    for l in langs:
        print(l)
        metrics_df, test_df = create_quality_eval_model(l, train_epochs=5)
        print(l)
        metrics_df[["test_mse", "train_loss"]].rolling(50).mean().plot()
        plt.show()
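
A minimal sketch of how one of the saved regressors might be loaded for inference, assuming the FasttextEmbedRegressor class above; the "en.safetensors" filename and the example text are illustrative, and the cc.en.300.bin vectors would need to be re-downloaded since the script deletes them after training:

from safetensors.torch import load_model
import fasttext
import torch

# Restore a trained regressor (hypothetical English checkpoint)
model = FasttextEmbedRegressor()
load_model(model, "en.safetensors")
model.eval()

# Embed a text the same way as during training: fastText needs single-line input
ft = fasttext.load_model("cc.en.300.bin")
vector = ft.get_sentence_vector("Example text to score.".replace("\n", " "))

# The regressor was trained on ratings scaled to [0, 1]
score = model(torch.Tensor(vector).unsqueeze(0)).item()
print(score)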