from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import evaluate

_CITATION = """\
tba
"""

_DESCRIPTION = """\
Negation-aware version of the BLEURT metric.
BLEURT is a learned evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning, starting from a pretrained BERT model (Devlin et al. 2018) and then employing another pre-training phase on synthetic data. Finally, it is trained on WMT human annotations and the CANNOT negation awareness dataset.
"""

_KWARGS_DESCRIPTION = """
Calculates the NegBLEURT scores between references and predictions.
Args:
    predictions: list of predictions to score. Each prediction should be a string.
    references: list of references, one for each prediction. Each reference should be a string.
    batch_size: batch size for model inference. Default is 16.
Returns:
    negBLEURT: list of NegBLEURT scores for all predictions
Examples:
    >>> negBLEURT = evaluate.load('tum-nlp/negbleurt')
    >>> predictions = ["Ray Charles is a legend.", "Ray Charles isn’t legendary."]
    >>> references = ["Ray Charles is legendary.", "Ray Charles is legendary."]
    >>> results = negBLEURT.compute(predictions=predictions, references=references)
    >>> print(results)
    {'negBLEURT': [0.8409, 0.2601]}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NegBLEURT(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/MiriUll/negation_aware_evaluation"],
        )
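
    # _download_and_prepare runs once when the metric is loaded, so the
    # fine-tuned NegBLEURT checkpoint and its tokenizer are fetched a single
    # time and reused across compute() calls.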
    def _download_and_prepare(self, dl_manager):
        model_name = "tum-nlp/NegBLEURT"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
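
    # Each (reference, prediction) pair is tokenized as a sentence pair and
    # scored in batches; the model's regression output (one logit per pair)
    # is the NegBLEURT score for that pair.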
    def _compute(self, predictions, references, batch_size=16):
        scores_negbleurt = []
        for i in range(0, len(references), batch_size):
            tokenized = self.tokenizer(
                references[i:i + batch_size],
                predictions[i:i + batch_size],
                return_tensors='pt',
                padding=True,
                max_length=512,
                truncation=True,
            )
            scores_negbleurt += self.model(**tokenized).logits.flatten().tolist()
        return {'negBLEURT': scores_negbleurt}