import numpy as np
import torch
from evaluate import load as load_metric
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

MAX_TARGET_LENGTH = 128

# load evaluation metrics
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')

# use gpu if it's available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def flatten_list(l):
"""
Utility function to convert a list of lists into a flattened list
Params:
l (list of lists): list to be flattened
Returns:
A flattened list with the elements of the original list
"""
return [item for sublist in l for item in sublist]


def parse_float(value):
"""
Utility function to parse a string into a float
Params:
value (string): value to be converted to float
Returns:
The float representation of the given string, or None if the string could
not be converted to a float
"""
try:
float_value = float(value)
return float_value
except ValueError:
return None


def extract_scores(predictions):
"""
Utility function to extract the scores from the predictions of the model
Params:
predictions (list): complete model predictions
Returns:
scores (list): extracted scores from the model's predictions
"""
scores = []
# iterate through predictions and try to extract predicted score;
# if score could not be extracted, set it to None
for pred in predictions:
try:
score_string = pred.split(' ', 1)[0].strip()
score = parse_float(score_string)
except IndexError:
score = None
scores.append(score)
return scores
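
# Illustrative example (assumes predictions follow the same '<score> Feedback: <text>'
# pattern that compute_metrics expects for the reference labels; strings are placeholders):
#   extract_scores(['0.75 Feedback: Good answer.', 'Feedback missing']) -> [0.75, None]
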
def extract_feedback(predictions):
"""
Utility function to extract the feedback from the predictions of the model
Params:
predictions (list): complete model predictions
Returns:
feedback (list): extracted feedback from the model's predictions
"""
feedback = []
# iterate through predictions and try to extract predicted feedback
for pred in predictions:
try:
fb = pred.split(':', 1)[1]
except IndexError:
try:
fb = pred.split(' ', 1)[1]
except IndexError:
fb = pred
feedback.append(fb.strip())
return feedback
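
# Illustrative example (placeholder strings): feedback is taken after the first ':'
# if present, otherwise after the first space, otherwise the whole prediction:
#   extract_feedback(['0.75 Feedback: Good reasoning.', '0.5 incomplete'])
#   -> ['Good reasoning.', 'incomplete']
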
def compute_rmse(predictions, labels):
"""
Utility function to compute the root mean squared error of the
score predictions in relation to the golden label scores
Params:
predictions (list): model score predictions
labels (list): golden label scores
Returns:
(float, int): rmse of valid samples and number of invalid samples
"""
# get indexes of valid score predictions
# (i.e., where the score is not None)
idx = np.where(np.array(predictions) != None)
# get size of the golden labels list and of
# the valid predictions array
labels_size = np.array(labels).size
valid_predictions_size = idx[0].size
# only compute rmse if valid score predictions were generated,
    # otherwise set the rmse to 1
if valid_predictions_size > 0:
# calculate rmse from labels and predictions
valid_predictions = np.array(predictions)[idx]
score_labels = np.array(labels)[idx]
rmse = mean_squared_error(score_labels, valid_predictions, squared=False)
        # cap rmse at 1
if rmse > 1:
return 1, labels_size - valid_predictions_size
# return computed rmse and number of invalid samples
return rmse, labels_size - valid_predictions_size
else:
return 1, labels_size - valid_predictions_size
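
# Illustrative example: with predictions [0.5, None, 1.0] and labels [0.75, 0.5, 1.0],
# the None entry is discarded, the rmse over the two valid pairs is
# sqrt((0.25**2 + 0.0**2) / 2) ~= 0.177, and one sample is counted as invalid:
#   compute_rmse([0.5, None, 1.0], [0.75, 0.5, 1.0]) -> (0.177..., 1)
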
def compute_metrics(predictions, labels):
"""
Compute evaluation metrics from the predictions of the model
Params:
predictions (list): complete model predictions
labels (list): golden labels (previously tokenized)
Returns:
results (dict): dictionary with the computed evaluation metrics
"""
# extract feedback and labels from the model's predictions
predicted_feedback = extract_feedback(predictions)
predicted_scores = extract_scores(predictions)
# extract feedback and labels from the golden labels
reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
reference_scores = [float(x.split('Feedback:', 1)[0].strip()) for x in labels]
# compute HF metrics
sacrebleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
bert_score = bertscore.compute(
predictions=predicted_feedback,
references=reference_feedback,
lang='de',
model_type='bert-base-multilingual-cased',
rescale_with_baseline=True)
# compute rmse of score predictions
rmse, _ = compute_rmse(predicted_scores, reference_scores)
results = {
'sacrebleu': sacrebleu_score,
'rouge': rouge_score,
'meteor': meteor_score,
'bert_score': np.array(bert_score['f1']).mean().item(),
'rmse': rmse
}
return results
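
# Illustrative example (placeholder strings): compute_metrics expects decoded text in
# the '<score> Feedback: <text>' format, e.g.
#   predictions = ['0.75 Feedback: Mostly correct.']
#   labels      = ['1.0 Feedback: Correct and complete.']
# and returns a dict with the keys 'sacrebleu', 'rouge', 'meteor', 'bert_score' and 'rmse'.
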
def evaluate(model, tokenizer, dataloader):
"""
Evaluate model on the given dataset
Params:
model (PreTrainedModel): seq2seq model
tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
dataloader (torch Dataloader): dataloader of the dataset to be used for evaluation
Returns:
results (dict): dictionary with the computed evaluation metrics
predictions (list): list of the decoded predictions of the model
"""
decoded_preds, decoded_labels = [], []
model.eval()
    # iterate through batches in the dataloader
for batch in tqdm(dataloader):
with torch.no_grad():
batch = {k: v.to(device) for k, v in batch.items()}
# generate tokens from batch
generated_tokens = model.generate(
batch['input_ids'],
attention_mask=batch['attention_mask'],
max_length=MAX_TARGET_LENGTH
)
# get golden labels from batch
labels_batch = batch['labels']
# decode model predictions and golden labels
decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
decoded_preds.append(decoded_preds_batch)
decoded_labels.append(decoded_labels_batch)
# convert predictions and golden labels into flattened lists
predictions = flatten_list(decoded_preds)
labels = flatten_list(decoded_labels)
# compute metrics based on predictions and golden labels
results = compute_metrics(predictions, labels)
    return results, predictions
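

# Minimal usage sketch (illustrative addition, not part of the original pipeline):
# it wires a placeholder seq2seq checkpoint and a single toy sample into `evaluate`.
# The checkpoint name and the example texts below are assumptions; any fine-tuned
# checkpoint and tokenized dataset that yields 'input_ids', 'attention_mask' and
# 'labels' tensors per batch works the same way.
if __name__ == '__main__':
    from torch.utils.data import DataLoader
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    checkpoint = 'google/mt5-small'  # placeholder checkpoint, not the project's model
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

    # one toy sample; the target uses the expected '<score> Feedback: <text>' format
    source_texts = ['Grade the answer: Mitochondria produce energy for the cell.']
    target_texts = ['0.5 Feedback: Partially correct but incomplete.']

    encodings = tokenizer(source_texts, padding=True, truncation=True, return_tensors='pt')
    label_ids = tokenizer(target_texts, padding=True, truncation=True, return_tensors='pt')['input_ids']
    toy_dataset = [
        {
            'input_ids': encodings['input_ids'][i],
            'attention_mask': encodings['attention_mask'][i],
            'labels': label_ids[i],
        }
        for i in range(len(source_texts))
    ]
    dataloader = DataLoader(toy_dataset, batch_size=1)

    results, predictions = evaluate(model, tokenizer, dataloader)
    print(results)
    print(predictions)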