JohnnyBoy00 committed on
Commit 54e08b1
1 Parent(s): 8fe9b43

Update evaluation.py

Files changed (1)
  1. evaluation.py +19 -23
evaluation.py CHANGED
@@ -34,14 +34,14 @@ def parse_float(value):
     Params:
         value (string): value to be converted to float
     Returns:
-        The float representation of the given string, or -1 if the string could
+        The float representation of the given string, or None if the string could
         not be converted to a float
     """
     try:
         float_value = float(value)
         return float_value
     except ValueError:
-        return -1
+        return None
 
 def extract_scores(predictions):
     """
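Note on the first hunk: parse_float now signals a failed conversion with None instead of -1, so a legitimate negative score can no longer be mistaken for a parse failure. A minimal sketch of the new behaviour, assuming evaluation.py is importable from the working directory:

from evaluation import parse_float

print(parse_float('4.5'))   # 4.5
print(parse_float('-1'))    # -1.0, a real negative value, no longer ambiguous
print(parse_float('n/a'))   # None, conversion failed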
@@ -54,17 +54,13 @@ def extract_scores(predictions):
     """
     scores = []
     # iterate through predictions and try to extract predicted score;
-    # if score could not be extracted, set it to -1
+    # if score could not be extracted, set it to None
    for pred in predictions:
         try:
-            score_string = pred.split('Feedback:', 1)[0].strip()
+            score_string = pred.split(' ', 1)[0].strip()
             score = parse_float(score_string)
         except IndexError:
-            try:
-                score_string = pred.split(' ', 1)[0].strip()
-                score = parse_float(score_string)
-            except IndexError:
-                score = -1
+            score = None
         scores.append(score)
 
     return scores
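The second hunk drops the two-stage extraction: every prediction is now split on the first space and the leading token is parsed, with None recorded when nothing parseable is found. A small sketch with invented prediction strings (the real generation format is not shown in this diff), again assuming evaluation.py is importable:

from evaluation import extract_scores

preds = [
    '4.5 The answer covers most aspects of the reference.',  # leading token is the score
    'no score emitted by the model',                         # leading token is not a number
]
print(extract_scores(preds))   # [4.5, None]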
@@ -92,40 +88,40 @@ def extract_feedback(predictions):
 
     return feedback
 
-def compute_mse(predictions, labels):
+def compute_rmse(predictions, labels):
     """
-    Utility function to compute the mean squared error of the
+    Utility function to compute the root mean squared error of the
     score predictions in relation to the golden label scores
 
     Params:
         predictions (list): model score predictions
         labels (list): golden label scores
     Returns:
-        (float, int): mse of valid samples and number of invalid samples
+        (float, int): rmse of valid samples and number of invalid samples
     """
     # get indexes of valid score predictions
-    # (i.e., where the score is greater than zero)
-    idx = np.where(np.array(predictions) > 0)
+    # (i.e., where the score is not None)
+    idx = np.where(np.array(predictions) != None)
 
     # get size of the golden labels list and of
     # the valid predictions array
     labels_size = np.array(labels).size
     valid_predictions_size = idx[0].size
 
-    # only compute mse if valid score predictions were generated,
+    # only compute rmse if valid score predictions were generated,
     # otherwise set mse to 1
     if valid_predictions_size > 0:
-        # calculate mse from labels and predictions
+        # calculate rmse from labels and predictions
         valid_predictions = np.array(predictions)[idx]
         score_labels = np.array(labels)[idx]
-        mse = mean_squared_error(score_labels, valid_predictions)
+        rmse = mean_squared_error(score_labels, valid_predictions, squared=False)
 
         # cap mse at 1
-        if mse > 1:
+        if rmse > 1:
             return 1, labels_size - valid_predictions_size
 
-        # return computed mse and number of invalid samples
-        return mse, labels_size - valid_predictions_size
+        # return computed rmse and number of invalid samples
+        return rmse, labels_size - valid_predictions_size
     else:
         return 1, labels_size - valid_predictions_size
 
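In the third hunk the metric switches from MSE to RMSE: mean_squared_error(..., squared=False) in scikit-learn returns the square root of the MSE (newer releases also expose this as a separate root_mean_squared_error function), and the != None comparison works element-wise because a list containing None becomes an object-dtype NumPy array. A minimal sketch of the masking and RMSE step with made-up scores:

import numpy as np
from sklearn.metrics import mean_squared_error

predictions = [0.5, None, 1.0, None]   # None marks samples with no extractable score
labels = [0.4, 0.9, 0.8, 0.2]

idx = np.where(np.array(predictions) != None)   # indexes of valid predictions
valid_predictions = np.array(predictions)[idx]
score_labels = np.array(labels)[idx]

rmse = mean_squared_error(score_labels, valid_predictions, squared=False)
print(rmse, len(labels) - idx[0].size)          # RMSE over 2 valid samples, 2 invalid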
@@ -158,15 +154,15 @@ def compute_metrics(predictions, labels):
         model_type='bert-base-multilingual-cased',
         rescale_with_baseline=True)
 
-    # compute mse of score predictions
-    mse, _ = compute_mse(predicted_scores, reference_scores)
+    # compute rmse of score predictions
+    rmse, _ = compute_rmse(predicted_scores, reference_scores)
 
     results = {
         'sacrebleu': sacrebleu_score,
         'rouge': rouge_score,
         'meteor': meteor_score,
         'bert_score': np.array(bert_score['f1']).mean().item(),
-        'mse': mse
+        'rmse': rmse
     }
 
     return results
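The last hunk only renames the reported metric, so anything that reads the dictionary returned by compute_metrics must switch keys as well. An illustrative snippet with dummy metric values, not real results:

results = {
    'sacrebleu': 0.0,
    'rouge': 0.0,
    'meteor': 0.0,
    'bert_score': 0.0,
    'rmse': 0.25,   # dummy value; before this commit the key was 'mse'
}
score_error = results['rmse']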
 