|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
import datasets |
|
import evaluate |
|
|
|
|
|
_CITATION = """\ |
|
tba |
|
""" |
|
|
|
_DESCRIPTION = """\ |
|
Negation-aware version of BLEURT metric. |
|
BLEURT a learnt evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning starting from a pretrained BERT model (Devlin et al. 2018) and then employing another pre-training phrase using synthetic data. Finally it is trained on WMT human annotations and the CANNOT negation awareness dataset. |
|
""" |
|
|
|
_KWARGS_DESCRIPTION = """ |
|
Calculates the NegBLEURT scores between references and predictions |
|
Args: |
|
predictions: list of predictions to score. Each prediction should be a string. |
|
references: single reference or list of references for each prediction. If only one reference is given, all predictions will be scored against the same reference |
|
batch_size: batch_size for model inference. Default is 16 |
|
Returns: |
|
negBLEURT: List of NegBLEURT scores for all predictions |
|
Examples: |
|
>>> negBLEURT = evaluate.load('MiriUll/negbleurt') |
|
>>> predictions = ["Ray Charles is a legend.", "Ray Charles isn’t legendary."] |
|
>>> reference = "Ray Charles is legendary." |
|
>>> results = rouge.compute(predictions=predictions, references=reference) |
|
>>> print(results) |
|
{'negBLERUT': [0.8409, 0.2601]} |
|
""" |
|
|
|
# Metric wrapper around the "tum-nlp/NegBLEURT" sequence-classification
# checkpoint: each (reference, prediction) pair is scored by the model's logit.
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NegBLEURT(evaluate.Metric):

    def _info(self):
        """Return the metric's description, citation and accepted input schemas."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Two alternative input schemas are declared: a sequence of
            # reference strings per prediction, or one reference string per
            # prediction.
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence")),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/MiriUll/negation_aware_evaluation"],
        )

    def _download_and_prepare(self, dl_manager):
        """Load the NegBLEURT tokenizer and model onto the instance.

        ``dl_manager`` is not used here; both objects are obtained through
        ``from_pretrained`` and stored as ``self.tokenizer`` / ``self.model``.
        """
        model_name = "tum-nlp/NegBLEURT"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def _compute(self, predictions, references, batch_size=16):
        """Score every prediction against its reference with NegBLEURT.

        Args:
            predictions: sequence of prediction strings.
            references: either one reference string (reused for every
                prediction) or a sequence of references aligned with
                ``predictions``.
            batch_size: number of (reference, prediction) pairs per forward
                pass.

        Returns:
            ``{'negBLEURT': scores}`` where ``scores`` is a flat list of floats
            taken from the model's logits, in input order.
        """
        # Local import: the file has no module-level torch import, but PyTorch
        # is already required because the tokenizer is asked for "pt" tensors.
        import torch

        # A lone reference string is broadcast to all predictions.
        if isinstance(references, str):
            references = [references] * len(predictions)

        scores_negbleurt = []
        # Pure inference: disable gradient tracking so no autograd state is
        # kept alive for each batch.
        with torch.no_grad():
            # Plain range instead of the previous tqdm(...) wrapper, which was
            # never imported and raised NameError.
            for start in range(0, len(references), batch_size):
                stop = start + batch_size
                # Pairs are encoded as (reference, prediction); the second
                # argument previously used the undefined name `candidates`.
                tokenized = self.tokenizer(
                    references[start:stop],
                    predictions[start:stop],
                    return_tensors='pt',
                    padding=True,
                    max_length=512,
                    truncation=True,
                )
                scores_negbleurt += self.model(**tokenized).logits.flatten().tolist()
        return {'negBLEURT': scores_negbleurt}