import numpy as np
import torch

from evaluate import load as load_metric
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

# maximum length (in tokens) of the generated sequences
MAX_TARGET_LENGTH = 128

# text-generation metrics from the HuggingFace evaluate library
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')

# run on the GPU when one is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def flatten_list(l):
    """
    Utility function to convert a list of lists into a flattened list

    Params:
        l (list of lists): list to be flattened

    Returns:
        A flattened list with the elements of the original list
    """
    return [item for sublist in l for item in sublist]


def parse_float(value):
    """
    Utility function to parse a string into a float

    Params:
        value (string): value to be converted to float

    Returns:
        The float representation of the given string, or None if the string
        could not be converted to a float
    """
    try:
        return float(value)
    except ValueError:
        return None


def extract_scores(predictions):
    """
    Utility function to extract the scores from the predictions of the model

    Params:
        predictions (list): complete model predictions

    Returns:
        scores (list): extracted scores from the model's predictions
    """
    scores = []

    for pred in predictions:
        # the score is expected to be the leading token of a
        # '<score> Feedback: <text>' prediction; parse_float returns None
        # when that token is not a valid number
        try:
            score_string = pred.split(' ', 1)[0].strip()
            score = parse_float(score_string)
        except IndexError:
            score = None
        scores.append(score)

    return scores
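
# Illustrative example (not in the original module): predictions are assumed to
# follow the '<score> Feedback: <text>' format used throughout this file, e.g.
#   extract_scores(['0.5 Feedback: correct answer', 'malformed output'])
# would return [0.5, None].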


def extract_feedback(predictions):
    """
    Utility function to extract the feedback from the predictions of the model

    Params:
        predictions (list): complete model predictions

    Returns:
        feedback (list): extracted feedback from the model's predictions
    """
    feedback = []

    for pred in predictions:
        # the feedback is everything after the first ':' of a
        # '<score> Feedback: <text>' prediction; fall back to everything after
        # the first space, and finally to the whole prediction, when that
        # pattern is missing
        try:
            fb = pred.split(':', 1)[1]
        except IndexError:
            try:
                fb = pred.split(' ', 1)[1]
            except IndexError:
                fb = pred
        feedback.append(fb.strip())

    return feedback
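
# Illustrative example (not in the original module), showing the fallbacks:
#   extract_feedback(['0.5 Feedback: correct answer', 'just some text'])
# would return ['correct answer', 'some text'].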


def compute_rmse(predictions, labels):
    """
    Utility function to compute the root mean squared error of the
    score predictions in relation to the golden label scores

    Params:
        predictions (list): model score predictions
        labels (list): golden label scores

    Returns:
        (float, int): rmse of valid samples (capped at 1) and number of invalid samples
    """
    # indices of the predictions from which a numeric score could be extracted
    idx = np.where(np.array([pred is not None for pred in predictions]))

    labels_size = np.array(labels).size
    valid_predictions_size = idx[0].size

    if valid_predictions_size > 0:
        valid_predictions = np.array(predictions)[idx]
        score_labels = np.array(labels)[idx]
        # note: the squared argument was removed in scikit-learn 1.6;
        # use root_mean_squared_error there instead
        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)

        # cap the error at 1, the same value returned when no score is valid
        if rmse > 1:
            return 1, labels_size - valid_predictions_size

        return rmse, labels_size - valid_predictions_size
    else:
        # no prediction contained a valid score: return the maximum (capped) error
        return 1, labels_size - valid_predictions_size
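
# Illustrative example (not in the original module): predictions that could not
# be parsed (None) are excluded from the RMSE and counted as invalid, e.g.
#   compute_rmse([0.5, None, 1.0], [1.0, 0.5, 1.0])
# would return an RMSE of roughly 0.354 together with 1 invalid sample;
# the RMSE is additionally capped at 1.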


def compute_metrics(predictions, labels):
    """
    Compute evaluation metrics from the predictions of the model

    Params:
        predictions (list): complete model predictions
        labels (list): decoded golden labels

    Returns:
        results (dict): dictionary with the computed evaluation metrics
    """
    predicted_feedback = extract_feedback(predictions)
    predicted_scores = extract_scores(predictions)

    # golden labels follow the '<score> Feedback: <text>' format
    reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
    reference_scores = [float(x.split('Feedback:', 1)[0].strip()) for x in labels]

    # text-generation metrics on the feedback part of the predictions
    sacrebleu_score = sacrebleu.compute(
        predictions=predicted_feedback,
        references=[[x] for x in reference_feedback])['score']
    rouge_score = rouge.compute(
        predictions=predicted_feedback,
        references=reference_feedback)['rouge2']
    meteor_score = meteor.compute(
        predictions=predicted_feedback,
        references=reference_feedback)['meteor']
    bert_score = bertscore.compute(
        predictions=predicted_feedback,
        references=reference_feedback,
        lang='de',
        model_type='bert-base-multilingual-cased',
        rescale_with_baseline=True)

    # regression metric on the score part of the predictions
    rmse, _ = compute_rmse(predicted_scores, reference_scores)

    results = {
        'sacrebleu': sacrebleu_score,
        'rouge': rouge_score,
        'meteor': meteor_score,
        'bert_score': np.array(bert_score['f1']).mean().item(),
        'rmse': rmse
    }

    return results
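
# Illustrative example (not in the original module): reference labels must follow
# the '<score> Feedback: <text>' format and predictions are expected to
# approximate it, e.g.
#   compute_metrics(
#       predictions=['0.5 Feedback: The answer is partially correct.'],
#       labels=['1.0 Feedback: The answer is correct.'])
# returns a dict with the keys 'sacrebleu', 'rouge', 'meteor', 'bert_score' and 'rmse'.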


def evaluate(model, tokenizer, dataloader):
    """
    Evaluate model on the given dataset

    Params:
        model (PreTrainedModel): seq2seq model
        tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
        dataloader (torch DataLoader): dataloader of the dataset to be used for evaluation

    Returns:
        results (dict): dictionary with the computed evaluation metrics
        predictions (list): list of the decoded predictions of the model
    """
    decoded_preds, decoded_labels = [], []

    model.eval()

    for batch in tqdm(dataloader):
        with torch.no_grad():
            batch = {k: v.to(device) for k, v in batch.items()}

            generated_tokens = model.generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_TARGET_LENGTH
            )

            labels_batch = batch['labels']
            # labels padded with -100 (the ignore index used by the loss) cannot
            # be decoded; if that padding scheme is used, map the padding back to
            # the tokenizer's pad token first
            if tokenizer.pad_token_id is not None:
                labels_batch = labels_batch.masked_fill(labels_batch == -100, tokenizer.pad_token_id)

            decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)

            decoded_preds.append(decoded_preds_batch)
            decoded_labels.append(decoded_labels_batch)

    # flatten the per-batch lists into single lists of strings
    predictions = flatten_list(decoded_preds)
    labels = flatten_list(decoded_labels)

    results = compute_metrics(predictions, labels)

    return results, predictions
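

# Hedged usage sketch (not part of the original module): the checkpoint path and
# the dataloader are placeholders; any HuggingFace seq2seq checkpoint whose
# dataloader yields 'input_ids', 'attention_mask' and 'labels' tensors should work.
#
#   from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
#
#   model = AutoModelForSeq2SeqLM.from_pretrained('path/to/finetuned-checkpoint').to(device)
#   tokenizer = AutoTokenizer.from_pretrained('path/to/finetuned-checkpoint')
#   results, predictions = evaluate(model, tokenizer, eval_dataloader)
#   print(results)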