# File size: 5,046 Bytes
# bfa3116
from datasets import load_dataset
import pandas as pd
import fasttext
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm, trange
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
import requests
import gzip
import shutil
import os
from safetensors.torch import save_model
import matplotlib.pyplot as plt
class FasttextEmbedRegressor(nn.Module):
    """Three-layer feed-forward network mapping a vector to one scalar.

    The layers are ``input_size -> 64 -> 32 -> 1`` with a ReLU after each of
    the first two linear layers and no activation on the output.
    """

    def __init__(self, input_size=300):
        """Build the three linear layers for vectors of width ``input_size``."""
        super().__init__()
        hidden_sizes = (64, 32)
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.fc3 = nn.Linear(hidden_sizes[1], 1)

    def forward(self, x):
        """Return the raw (unactivated) output of the final layer for ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)
def train_regressor(X_train, X_test, y_train, y_test, train_epochs):
    """Train a FasttextEmbedRegressor with mini-batch Adam and MSE loss.

    Args:
        X_train: 2-D array of training vectors; ``X_train.shape[1]`` sets the
            model's input width.
        X_test: Vectors used for the periodic held-out evaluation.
        y_train: Training targets, one per row of ``X_train``.
        y_test: Targets matching ``X_test``.
        train_epochs: Number of full passes over ``X_train``.

    Returns:
        A ``(metrics, model)`` tuple. ``metrics`` is a DataFrame with one row
        per evaluation (every 10th step of each epoch) holding ``epoch``,
        ``step_num``, ``i`` (batch start offset), ``test_mse`` and
        ``train_loss`` (mean batch loss since the previous evaluation in that
        epoch). ``model`` is the trained network.
    """
    # Initialize the model, loss function, and optimizer
    input_size = X_train.shape[1]
    model = FasttextEmbedRegressor(input_size)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    batch_size = 32

    # Convert to float32 tensors once; re-wrapping every slice (and the whole
    # test set at every evaluation) is loop-invariant work.
    train_vectors = torch.tensor(np.asarray(X_train, dtype=np.float32))
    train_targets = torch.tensor(np.asarray(y_train, dtype=np.float32))
    test_vectors = torch.tensor(np.asarray(X_test, dtype=np.float32))
    num_rows = X_train.shape[0]

    training_metrics = []
    for epoch in trange(train_epochs):
        model.train()
        train_losses = []
        for step_num, i in enumerate(trange(0, num_rows, batch_size)):
            vectors = train_vectors[i:i + batch_size]
            targets = train_targets[i:i + batch_size]

            optimizer.zero_grad()
            # Drop only the trailing feature axis. A bare squeeze() would also
            # collapse the batch axis when the final batch has a single row,
            # leaving a 0-d output that mismatches the (1,) target.
            outputs = model(vectors).squeeze(-1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

            if step_num % 10 == 0:
                model.eval()
                # Evaluation needs no gradients; skip building the graph.
                with torch.no_grad():
                    test_preds = model(test_vectors).numpy()
                test_mse = mean_squared_error(y_test, test_preds)
                training_metrics.append({
                    "epoch": epoch,
                    "step_num": step_num,
                    "i": i,
                    "test_mse": test_mse,
                    "train_loss": sum(train_losses) / len(train_losses),
                })
                # Start a fresh window so train_loss covers only the steps
                # since this evaluation.
                train_losses = []
                model.train()

    return pd.DataFrame(training_metrics), model
def download_file(url, filename, chunk_size=1024):
    """Stream ``url`` into ``filename`` with a byte progress bar.

    Args:
        url: Address to download.
        filename: Local path to write; an existing file is overwritten.
        chunk_size: Number of bytes requested per read from the response.

    Returns:
        ``filename``, unchanged.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    with requests.get(url, stream=True) as response:
        # Fail here rather than saving an error page under the target name.
        response.raise_for_status()

        # Content-Length is optional; tqdm shows an open-ended bar for None.
        content_length = response.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None

        with open(filename, 'wb') as f, tqdm(unit="B", total=total) as pbar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    pbar.update(len(chunk))
                    f.write(chunk)
    return filename
def get_filename(x):
    """Return the local fastText vector file name for language code ``x``."""
    return f"cc.{x}.300.bin"
def download_fasttext_vectors(lang_code):
    """Download and decompress the fastText vectors for ``lang_code``.

    Does nothing when the decompressed file already exists in the working
    directory. Otherwise the ``.gz`` archive is downloaded, decompressed, and
    then deleted.

    Args:
        lang_code: Language code used to build the file name and URL.

    Returns:
        ``None`` if the vectors were already present, ``True`` after a fresh
        download.
    """
    filename = get_filename(lang_code)

    if os.path.isfile(filename):
        return None

    archive = f"{filename}.gz"
    partial = f"{filename}.part"

    print(f"Downloading {lang_code} vectors")
    download_file(f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{archive}", archive)

    print(f"Unzipping {lang_code} vectors")
    # Decompress to a temporary name first. Writing straight to `filename`
    # would leave a truncated file behind on interruption, and the isfile()
    # check above would then treat it as a finished download.
    with gzip.open(archive, 'rb') as f_in, open(partial, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial, filename)

    print(f"Removing zipped {lang_code} vectors")
    os.remove(archive)

    return True
def create_quality_eval_model(lang_code, train_epochs=10):
    """Train and save a text-rating regressor for one language.

    Steps: fetch the fastText vectors, load the ``lightblue/text_ratings``
    split for ``lang_code``, embed every text, train on an 80/20 split, write
    the model to ``{lang_code}.safetensors`` and delete the fastText binary.

    Args:
        lang_code: Language code selecting both the dataset config and the
            fastText vector file.
        train_epochs: Number of training epochs passed to ``train_regressor``.

    Returns:
        A ``(metrics_df, test_df)`` tuple: the training metrics from
        ``train_regressor`` and a DataFrame of held-out ``text``,
        ``gold_score`` and ``pred_score``.
    """
    download_fasttext_vectors(lang_code)
    # Single source of truth for the path that is loaded and later deleted.
    vectors_path = get_filename(lang_code)

    dataset = load_dataset("lightblue/text_ratings", lang_code, split="train")
    text_list = dataset["selected_chunk"]
    # Ratings are divided by 100 before being used as regression targets.
    label_float = [x / 100 for x in dataset["rating_float"]]

    fasttext_model = fasttext.load_model(vectors_path)
    # Newlines are flattened to spaces before embedding; presumably
    # get_sentence_vector expects single-line input -- TODO confirm.
    embeddings = np.stack([
        fasttext_model.get_sentence_vector(x.replace("\n", " "))
        for x in tqdm(text_list)
    ])
    # Only the embeddings are needed from here on; drop the reference so the
    # loaded vectors can be freed before training starts.
    del fasttext_model

    X_train, X_test, y_train, y_test, text_train, text_test = train_test_split(
        embeddings,
        label_float,
        text_list,
        test_size=0.2,
        random_state=42
    )

    metrics_df, model = train_regressor(X_train, X_test, y_train, y_test, train_epochs)

    # Inference only: no autograd graph is needed for the final predictions.
    with torch.no_grad():
        pred_score = model(torch.Tensor(X_test)).numpy().flatten()

    test_df = pd.DataFrame({
        "text": text_test,
        "gold_score": y_test,
        "pred_score": pred_score
    })

    save_model(model, f"{lang_code}.safetensors")
    # Remove the vector file once the model is saved.
    os.remove(vectors_path)

    return metrics_df, test_df
# Script entry point: train one model per language and plot its metrics.
if __name__ == '__main__':
    langs = ['am', 'ar', 'bg', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'gu', 'ha', 'hi', 'hu', 'id', 'it', 'ja', 'jv', 'kn', 'ko', 'lt', 'mr', 'nl', 'no', 'yo', 'zh']
    for lang in langs:
        print(lang)
        metrics_df, test_df = create_quality_eval_model(lang, train_epochs=5)
        # Print the code again so it sits next to the plot that follows.
        print(lang)
        # Smooth both curves over a 50-evaluation rolling window.
        metrics_df[["test_mse", "train_loss"]].rolling(50).mean().plot()
        plt.show()