JohnnyBoy00 committed
Commit 54e08b1
Parent(s): 8fe9b43
Update evaluation.py
evaluation.py +19 -23
evaluation.py
CHANGED
@@ -34,14 +34,14 @@ def parse_float(value):
     Params:
         value (string): value to be converted to float
     Returns:
-        The float representation of the given string, or -1 if the string could
+        The float representation of the given string, or None if the string could
         not be converted to a float
     """
     try:
         float_value = float(value)
         return float_value
     except ValueError:
-        return -1
+        return None

 def extract_scores(predictions):
     """
@@ -54,17 +54,13 @@ def extract_scores(predictions):
     """
     scores = []
     # iterate through predictions and try to extract predicted score;
-    # if score could not be extracted, set it to -1
+    # if score could not be extracted, set it to None
     for pred in predictions:
         try:
-            score_string = pred.split('
+            score_string = pred.split(' ', 1)[0].strip()
             score = parse_float(score_string)
         except IndexError:
-            try:
-                score_string = pred.split(' ', 1)[0].strip()
-                score = parse_float(score_string)
-            except IndexError:
-                score = -1
+            score = None
         scores.append(score)

     return scores
@@ -92,40 +88,40 @@ def extract_feedback(predictions):

     return feedback

-def compute_mse(predictions, labels):
+def compute_rmse(predictions, labels):
     """
-    Utility function to compute the mean squared error of the
+    Utility function to compute the root mean squared error of the
     score predictions in relation to the golden label scores

     Params:
         predictions (list): model score predictions
         labels (list): golden label scores
     Returns:
-        (float, int):
+        (float, int): rmse of valid samples and number of invalid samples
     """
     # get indexes of valid score predictions
-    # (i.e., where the score is not -1)
-    idx = np.where(np.array(predictions) != -1)
+    # (i.e., where the score is not None)
+    idx = np.where(np.array(predictions) != None)

     # get size of the golden labels list and of
     # the valid predictions array
     labels_size = np.array(labels).size
     valid_predictions_size = idx[0].size

-    # only compute mse if valid score predictions were generated,
+    # only compute rmse if valid score predictions were generated,
     # otherwise set mse to 1
     if valid_predictions_size > 0:
-        # calculate mse from labels and predictions
+        # calculate rmse from labels and predictions
         valid_predictions = np.array(predictions)[idx]
         score_labels = np.array(labels)[idx]
-        mse = mean_squared_error(score_labels, valid_predictions)
+        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)

         # cap mse at 1
-        if mse > 1:
+        if rmse > 1:
             return 1, labels_size - valid_predictions_size

-        # return computed mse and number of invalid samples
-        return mse, labels_size - valid_predictions_size
+        # return computed rmse and number of invalid samples
+        return rmse, labels_size - valid_predictions_size
     else:
         return 1, labels_size - valid_predictions_size

@@ -158,15 +154,15 @@ def compute_metrics(predictions, labels):
         model_type='bert-base-multilingual-cased',
         rescale_with_baseline=True)

-    # compute mse of score predictions
-    mse, _ = compute_mse(predicted_scores, reference_scores)
+    # compute rmse of score predictions
+    rmse, _ = compute_rmse(predicted_scores, reference_scores)

     results = {
         'sacrebleu': sacrebleu_score,
         'rouge': rouge_score,
         'meteor': meteor_score,
         'bert_score': np.array(bert_score['f1']).mean().item(),
-        'mse': mse
+        'rmse': rmse
     }

     return results
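A minimal, illustrative sketch of the updated behaviour (not part of the commit): it mirrors the new compute_rmse added in this diff with made-up inputs, and assumes numpy plus a scikit-learn version whose mean_squared_error still accepts squared=False, as evaluation.py uses.

# Illustrative sketch only: mirrors the updated compute_rmse from the diff
# to show how None score predictions are handled.
import numpy as np
from sklearn.metrics import mean_squared_error

def compute_rmse(predictions, labels):
    # keep only the positions where a score could be extracted (not None)
    idx = np.where(np.array(predictions) != None)

    labels_size = np.array(labels).size
    valid_predictions_size = idx[0].size

    if valid_predictions_size > 0:
        valid_predictions = np.array(predictions)[idx]
        score_labels = np.array(labels)[idx]
        # squared=False makes sklearn return the root of the mean squared error
        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)

        # the error is capped at 1, as in the commit
        if rmse > 1:
            return 1, labels_size - valid_predictions_size

        return rmse, labels_size - valid_predictions_size
    else:
        return 1, labels_size - valid_predictions_size

# made-up example: the second prediction could not be parsed, so extract_scores
# would have stored None for it; it is excluded from the RMSE and reported as
# one invalid sample instead of skewing the error.
predictions = [0.75, None, 0.5]
labels = [0.8, 0.6, 0.5]
rmse, invalid = compute_rmse(predictions, labels)
print(rmse, invalid)  # -> roughly 0.035, 1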