File size: 6,025 Bytes
1b7c795 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import numpy as np
import torch
from evaluate import load as load_metric
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
# Generation length limit; passed as `max_length` to `model.generate` below.
MAX_TARGET_LENGTH = 128

# Load every evaluation metric once, at import time, in a fixed order.
sacrebleu, rouge, meteor, bertscore = (
    load_metric(metric_name)
    for metric_name in ('sacrebleu', 'rouge', 'meteor', 'bertscore')
)

# Run on the GPU whenever CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def flatten_list(l):
    """
    Concatenate the items of every inner list into a single flat list
    Params:
        l (list of lists): nested list to flatten (one level deep)
    Returns:
        A new list with the items of each sublist, in their original order
    """
    flattened = []
    for sublist in l:
        # extend() walks the sublist and appends its items one by one
        flattened.extend(sublist)
    return flattened
def extract_feedback(predictions):
    """
    Utility function to extract the feedback from the predictions of the model

    Each prediction is expected to start with a verification label followed by
    the feedback text. The feedback is located as follows:
      1. if the prediction contains a ':', everything after the first ':';
      2. otherwise, everything after the leading label words (two words when
         the prediction starts with 'partially correct', one word otherwise);
      3. if nothing follows the label, the whole prediction is kept.
    Params:
        predictions (list): complete model predictions (strings)
    Returns:
        feedback (list): extracted feedback, stripped of surrounding whitespace
    """
    feedback = []
    for pred in predictions:
        _, colon, after_colon = pred.partition(':')
        if colon:
            fb = after_colon
        else:
            # 'partially correct' is a two-word label, so two words have to be
            # skipped; split(' ', 1) can never yield a third element.
            if pred.lower().startswith('partially correct'):
                label_words = 2
            else:
                label_words = 1
            parts = pred.split(' ', label_words)
            # fall back to the full prediction when no text follows the label
            fb = parts[label_words] if len(parts) > label_words else pred
        feedback.append(fb.strip())
    return feedback
def extract_labels(predictions):
    """
    Utility function to map each model prediction to its verification label

    The match is a case-insensitive prefix test on the prediction text.
    Params:
        predictions (list): complete model predictions (strings)
    Returns:
        labels (list): one of 'Correct', 'Partially correct', 'Incorrect' or
            'Unknown label' for every prediction
    """
    # (lower-cased prefix, canonical label) pairs, tested in this order
    known_prefixes = (
        ('correct', 'Correct'),
        ('partially correct', 'Partially correct'),
        ('incorrect', 'Incorrect'),
    )
    labels = []
    for pred in predictions:
        lowered = pred.lower()
        matched = (
            canonical
            for prefix, canonical in known_prefixes
            if lowered.startswith(prefix)
        )
        # first matching prefix wins; no match means the label is unknown
        labels.append(next(matched, 'Unknown label'))
    return labels
def compute_metrics(predictions, labels):
    """
    Score the model's predictions against the golden labels
    Params:
        predictions (list): complete model predictions (strings)
        labels (list): golden labels as strings; each one must contain
            'Feedback:' (an IndexError is raised otherwise)
    Returns:
        results (dict): dictionary with the computed evaluation metrics
            ('sacrebleu', 'rouge', 'meteor', 'bert_score', 'accuracy',
            'f1_weighted', 'f1_macro')
    """
    # split the model output into feedback text and verification label
    pred_feedback = extract_feedback(predictions)
    pred_labels = extract_labels(predictions)

    # golden labels: text before 'Feedback:' is the label, text after it is
    # the reference feedback
    gold_parts = [gold.split('Feedback:', 1) for gold in labels]
    gold_feedback = [parts[1].strip() for parts in gold_parts]
    gold_labels = [parts[0].strip() for parts in gold_parts]

    # text-generation metrics (HF evaluate); sacrebleu expects a list of
    # references per prediction
    bleu_value = sacrebleu.compute(
        predictions=pred_feedback,
        references=[[ref] for ref in gold_feedback],
    )['score']
    rouge2_value = rouge.compute(
        predictions=pred_feedback,
        references=gold_feedback,
    )['rouge2']
    meteor_value = meteor.compute(
        predictions=pred_feedback,
        references=gold_feedback,
    )['meteor']
    bert_output = bertscore.compute(
        predictions=pred_feedback,
        references=gold_feedback,
        lang='en',
        rescale_with_baseline=True,
    )

    # classification metrics (sklearn) on the verification labels
    gold_labels_np = np.array(gold_labels)
    accuracy = accuracy_score(gold_labels_np, pred_labels)
    weighted_f1 = f1_score(gold_labels_np, pred_labels, average='weighted')
    # macro F1 is restricted to the three valid classes
    macro_f1 = f1_score(
        gold_labels_np,
        pred_labels,
        average='macro',
        labels=['Incorrect', 'Partially correct', 'Correct'],
    )

    return {
        'sacrebleu': bleu_value,
        'rouge': rouge2_value,
        'meteor': meteor_value,
        # average the per-example BERTScore F1 values into a Python float
        'bert_score': np.array(bert_output['f1']).mean().item(),
        'accuracy': accuracy,
        'f1_weighted': weighted_f1,
        'f1_macro': macro_f1,
    }
def evaluate(model, tokenizer, dataloader):
    """
    Evaluate model on the given dataset
    Params:
        model (PreTrainedModel): seq2seq model
        tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
        dataloader (torch Dataloader): dataloader of the dataset to be used
            for evaluation; every batch must provide 'input_ids',
            'attention_mask' and 'labels' tensors
    Returns:
        results (dict): dictionary with the computed evaluation metrics
        predictions (list): list of the decoded predictions of the model
    """
    predictions, labels = [], []
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    model.eval()
    # gradients are never needed while evaluating
    with torch.no_grad():
        # iterate through batches in the dataloader
        for batch in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            # generate tokens from batch
            generated_tokens = model.generate(
                batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=MAX_TARGET_LENGTH,
            )
            # get golden labels from batch
            labels_batch = batch['labels']
            # Label positions padded with -100 (the ignore index commonly used
            # by HuggingFace seq2seq data collators) are not valid token ids
            # and cannot be decoded; map them to the pad token, which
            # skip_special_tokens then drops. No-op when no -100 is present.
            if pad_token_id is not None:
                labels_batch = labels_batch.masked_fill(
                    labels_batch == -100, pad_token_id
                )
            # decode model predictions and golden labels, collecting them
            # directly into flat lists
            predictions.extend(
                tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            )
            labels.extend(
                tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
            )
    # compute metrics based on predictions and golden labels
    results = compute_metrics(predictions, labels)
    return results, predictions