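# Setup sketch (assumption): this excerpt references a module-level `tokenizer`
# that is not defined here. A typical definition with Hugging Face
# `transformers` is shown below; the checkpoint name is a placeholder, not
# taken from the original file.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")  # placeholder checkpoint
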
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128

def preprocess_function(examples):
    """
    Preprocess a batch of dataset entries (intended for use with
    `datasets.Dataset.map(..., batched=True)`).
    Params:
        examples (dict): batch of dataset entries, mapping column names to lists
    Returns:
        model_inputs (BatchEncoding): tokenized inputs with aligned labels
    """
    inputs, targets = [], []
    for i in range(len(examples['question'])):
        inputs.append(f"Antwort: {examples['provided_answer'][i]} Lösung: {examples['reference_answer'][i]} Frage: {examples['question'][i]}")
        targets.append(f"{examples['score'][i]} Feedback: {examples['answer_feedback'][i]}")

    # apply tokenization to inputs and labels
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding='max_length', truncation=True)
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True)

    # replace padding token ids in the labels with -100 so that padded
    # positions are ignored by the cross-entropy loss
    model_inputs['labels'] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in labels['input_ids']
    ]

    return model_inputs
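

# Usage sketch (assumption, not part of the original file): `preprocess_function`
# is meant to be mapped over a Hugging Face dataset batch-wise. The toy rows
# below mirror the columns accessed above and are placeholders for illustration.
if __name__ == "__main__":
    from datasets import Dataset

    data = Dataset.from_dict({
        'question': ['What is the purpose of TCP?'],
        'provided_answer': ['It is a transport protocol.'],
        'reference_answer': ['TCP is a reliable, connection-oriented transport protocol.'],
        'score': [0.5],
        'answer_feedback': ['Correct, but incomplete.'],
    })
    tokenized = data.map(preprocess_function, batched=True, remove_columns=data.column_names)
    print(tokenized)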