facat committed on
Commit
58f14b3
1 Parent(s): 76eab85

update math

Browse files
Files changed (2) hide show
  1. tasks.py +3 -1
  2. utils.py +67 -111
tasks.py CHANGED
@@ -269,7 +269,9 @@ class Metrics:
269
  else:
270
  ans = response[indices[-2] + 1 : indices[-1]]
271
  extract_responses.append(strip_string(ans))
272
- extract_answers = [strip_string(get_answer(answer)) for answer in answers]
 
 
273
  return extract_responses, extract_answers
274
 
275
 
 
269
  else:
270
  ans = response[indices[-2] + 1 : indices[-1]]
271
  extract_responses.append(strip_string(ans))
272
+ extract_answers=[]
273
+ for answer in answers:
274
+ extract_answers.append(strip_string(get_answer(answer)))
275
  return extract_responses, extract_answers
276
 
277
 
utils.py CHANGED
@@ -9,6 +9,16 @@ NUMERIC_IN_ZH = (
9
  )
10
 
11
 
 
 
 
 
 
 
 
 
 
 
12
  def extract_choice_zh(gen):
13
  # 答案是A | 选项是A | 应该选A选项
14
  res = re.search(
@@ -164,7 +174,7 @@ def remove_right_units(string):
164
  # "\\text{ " only ever occurs (at least in the val set) when describing units
165
  if "\\text{ " in string:
166
  splits = string.split("\\text{ ")
167
- assert len(splits) == 2
168
  return splits[0]
169
  else:
170
  return string
@@ -278,16 +288,14 @@ def strip_string(string):
278
 
279
  # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
280
  # string = fix_a_slash_b(string)
281
-
 
 
282
  return string
283
 
284
 
285
  def get_answer(string):
286
- try:
287
- answer = remove_boxed(last_boxed_only_string(string))
288
- # answer = strip_string(answer)
289
- except Exception:
290
- answer = string
291
  return answer
292
 
293
 
@@ -307,21 +315,22 @@ def is_equiv(str1, str2, verbose=False):
307
  except Exception:
308
  return str1 == str2
309
 
 
310
  def first_option_postprocess(text: str, options: str) -> str:
311
  """Find first valid option for text."""
312
 
313
  patterns = [
314
- f'[Tt]he answer is [{options}]',
315
- f'[Tt]he correct answer\s?(?:option)?\s?is [{options}]', # noqa
316
- f'答案(?:选项)?是(.*?)[{options}]',
317
- f'答案(?:选项)?为(.*?)[{options}]',
318
- f'答案(?:选项)?选(.*?)[{options}]',
319
- f'选项[{options}]是?正确',
320
- f'选项[{options}]为?正确',
321
- f'固选(.*?)[{options}]',
322
- f'答案应该是(.*?)[{options}]',
323
- f'(\s|^)[{options}][\s。,,\.$]', # noqa
324
- f'[{options}]',
325
  ]
326
 
327
  regexes = [re.compile(pattern) for pattern in patterns]
@@ -332,44 +341,41 @@ def first_option_postprocess(text: str, options: str) -> str:
332
  for i in options:
333
  if i in outputs:
334
  return i
335
- return ''
336
 
337
 
338
  def first_capital_postprocess(text: str) -> str:
339
  for t in text:
340
  if t.isupper():
341
  return t
342
- return ''
343
 
344
 
345
  def general_postprocess(text: str) -> str:
346
  # Cut off the first newline, period, or comma
347
- truncated_text = re.split(r'[\n.,]', text, 1)[0]
348
 
349
  # Remove punctuation
350
- no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)
351
 
352
  # Remove article
353
- no_articles = re.sub(r'\b(a|an|the)\b',
354
- '',
355
- no_punctuation,
356
- flags=re.IGNORECASE)
357
 
358
  # Remove duplicated blank spaces
359
- cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()
360
 
361
  return cleaned_text
362
 
363
-
364
  def bbh_mcq_postprocess(text: str) -> str:
365
  ans = text
366
- ans_line = ans.split('answer is ')
367
  if len(ans_line) != 1:
368
  ans = ans_line[-1].strip()
369
- match = re.search(r'\(([A-Z])\)*', ans)
370
  if match:
371
  return f"({match.group(1)})"
372
- match = re.search(r'([A-Z])', ans)
373
  if match:
374
  return f"({match.group(1)})"
375
  return f"({ans})"
@@ -377,18 +383,17 @@ def bbh_mcq_postprocess(text: str) -> str:
377
 
378
  def bbh_freeform_postprocess(text: str) -> str:
379
  ans = text
380
- ans_line = ans.split('answer is ')
381
  if len(ans_line) != 1:
382
  ans = ans_line[-1].strip()
383
- ans = ans.split('\n')[0]
384
- if ans.endswith('.'):
385
  ans = ans[:-1]
386
  return ans
387
 
388
 
389
-
390
  icl_prompts = {
391
- 'temporal_sequences': """Task description: Answer questions about which times certain events could have occurred.
392
 
393
  Q: Today, Emily went to the museum. Between what times could they have gone?
394
  We know that:
@@ -465,9 +470,7 @@ Wake-up time: 5am.
465
  3pm-4pm: free.
466
  The beach closure time: 4pm.
467
  The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
468
-
469
-
470
- 'disambiguation_qa': """Clarify the meaning of sentences with ambiguous pronouns.
471
 
472
  Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
473
  Sentence: The chief told the counselor that they took the day off.
@@ -504,9 +507,7 @@ Here we need to determine who the pronoun "his" might be referring to. There are
504
  Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
505
  Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
506
  Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
507
-
508
-
509
- 'date_understanding': """Infer the date from context.
510
 
511
  Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
512
  Options:
@@ -539,9 +540,7 @@ Options:
539
  (F) 12/03/1960
540
  A: Let's think step by step.
541
  If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
542
-
543
-
544
- 'tracking_shuffled_objects_three_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
545
 
546
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
547
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -581,9 +580,7 @@ A: Let's think step by step.
581
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
582
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
583
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
584
-
585
-
586
- 'penguins_in_a_table': """Answer questions about a table of penguins and their attributes.
587
 
588
  Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
589
  James, 12, 90, 12
@@ -624,9 +621,7 @@ This question focuses on the name. We know the following: The names of the pengu
624
  When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
625
  The name of the second penguin sorted by alphabetical order is Gwen.
626
  The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
627
-
628
-
629
- 'geometric_shapes': """Name geometric shapes from their SVG paths.
630
 
631
  Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
632
  Options:
@@ -704,9 +699,7 @@ Length of side B: |B| = sqrt((37.00-41.00)^2 + (34.00-33.00)^2)) = sqrt((4)^2 +
704
  Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2)) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
705
  Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2)) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
706
  Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
707
-
708
-
709
- 'snarks': """Determine which of two sentences is sarcastic.
710
 
711
  According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
712
 
@@ -736,9 +729,7 @@ A: Let's think step by step.
736
  If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
737
  If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
738
  Above the above, the sarcastic option is (A). So the answer is (A).""",
739
-
740
-
741
- 'ruin_names': """Select the humorous edit that 'ruins' the input movie or musical artist name.
742
 
743
  Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
744
  Options:
@@ -782,9 +773,7 @@ The original name is "counting crows". This is the name of an American rock band
782
  (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
783
  (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
784
  Above the above, the only humorous edit is (B). So the answer is (B).""",
785
-
786
-
787
- 'tracking_shuffled_objects_seven_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
788
 
789
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
790
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -824,9 +813,7 @@ A: Let's think step by step.
824
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
825
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
826
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
827
-
828
-
829
- 'tracking_shuffled_objects_five_objects': """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
830
 
831
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
832
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
@@ -866,9 +853,7 @@ A: Let's think step by step.
866
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
867
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
868
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
869
-
870
-
871
- 'logical_deduction_three_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
872
 
873
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
874
  Options:
@@ -905,9 +890,7 @@ A: Let's think step by step.
905
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
906
  According to this ordering, the leftmost book is the white book.
907
  The white book is the leftmost. So the answer is (C).""",
908
-
909
-
910
- 'hyperbaton': """Order adjectives correctly in English sentences.
911
 
912
  Q: Which sentence has the correct adjective order:
913
  Options:
@@ -935,9 +918,7 @@ A: Let's think step by step.
935
  When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
936
  Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
937
  Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7 ). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
938
-
939
-
940
- 'logical_deduction_five_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
941
 
942
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
943
  Options:
@@ -974,9 +955,7 @@ A: Let's think step by step.
974
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
975
  According to this ordering, the leftmost book is the white book.
976
  The white book is the leftmost. So the answer is (C).""",
977
-
978
-
979
- 'logical_deduction_seven_objects': """A logical deduction task which requires deducing the order of a sequence of objects.
980
 
981
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
982
  Options:
@@ -1013,9 +992,7 @@ A: Let's think step by step.
1013
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
1014
  According to this ordering, the leftmost book is the white book.
1015
  The white book is the leftmost. So the answer is (C).""",
1016
-
1017
-
1018
- 'movie_recommendation': """Recommend movies similar to the given list of movies.
1019
 
1020
  Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
1021
  Options:
@@ -1057,9 +1034,7 @@ A: Let's think step by step.
1057
  - Inside Out (animation, family, comedy; 2015)
1058
  - Forrest Gump (comedy, drama, romance; 1994)
1059
  These are all famous movies produced in the past few decades.Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
1060
-
1061
-
1062
- 'salient_translation_error_detection': """Detect the type of error in an English translation of a German source sentence.
1063
 
1064
  Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
1065
  Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
@@ -1099,9 +1074,7 @@ Options:
1099
  (F) Facts
1100
  A: Let's think step by step.
1101
  We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
1102
-
1103
-
1104
- 'reasoning_about_colored_objects': """Answer extremely simple questions about the colors of objects on a surface.
1105
 
1106
  Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
1107
  Options:
@@ -1164,8 +1137,7 @@ A: Let's think step by step.
1164
  According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
1165
  The teal plate is the first item, namely (1). There is no item to the left of the teal item.
1166
  The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
1167
-
1168
- 'multistep_arithmetic_two': """Solve multi-step arithmetic problems.
1169
 
1170
  Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
1171
  A: Let's think step by step.
@@ -1190,9 +1162,7 @@ This equation can be written as "A - B", where A = (-3 + 5 * 8 * -4) and B = (9
1190
  Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
1191
  Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
1192
  Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
1193
-
1194
-
1195
- 'navigate': """Given a series of navigation instructions, determine whether one would end up back at the starting point.
1196
 
1197
  Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
1198
  Options:
@@ -1235,9 +1205,7 @@ We start at the origin (0, 0), facing the positive y-axis.
1235
  (3) Take 9 steps left: (0, -7), facing the positive y-axis.
1236
  (4) Take 7 steps right: (0, 7), facing the positive y-axis.
1237
  Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
1238
-
1239
-
1240
- 'dyck_languages': """Correctly close a Dyck-n word.
1241
 
1242
  Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
1243
  A: Let's think step by step.
@@ -1309,9 +1277,7 @@ We should process each input one by one and keep track of the stack configuratio
1309
  Now, we have reached the end. The final stack is "< [ {".
1310
  We will need to pop out "{", "[", "<" one by one in that order.
1311
  So, we need "}", "]", ">". So the answer is } ] >.""",
1312
-
1313
-
1314
- 'word_sorting': """Sort a list of words.
1315
 
1316
  Q: Sort the following words alphabetically: List: oven costume counterpart
1317
  A: Let's think step by step.
@@ -1328,9 +1294,7 @@ Q: Sort the following words alphabetically: List: newt arson parthia seismograph
1328
  A: Let's think step by step.
1329
  The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
1330
  The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
1331
-
1332
-
1333
- 'sports_understanding': """Determine whether an artificially constructed sentence relating to sports is plausible or not.
1334
 
1335
  Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
1336
  A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
@@ -1340,9 +1304,7 @@ A: Let's think step by step. Santi Cazorla is a soccer player. Touchdown is part
1340
 
1341
  Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
1342
  A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
1343
-
1344
-
1345
- 'boolean_expressions': """Evaluate the result of a random Boolean expression.
1346
 
1347
  Q: not ( ( not not True ) ) is
1348
  A: Let's think step by step.
@@ -1365,9 +1327,7 @@ Remember that (i) expressions inside brackets are always evaluated first and tha
1365
  We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
1366
  Let's evaluate A: A = not ( False ) = not False = True.
1367
  Plugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.""",
1368
-
1369
-
1370
- 'object_counting': """Questions that involve enumerating objects and asking the model to count them.
1371
 
1372
  Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
1373
  A: Let's think step by step.
@@ -1404,9 +1364,7 @@ We first identify the vegetables on the list and include their quantity in paren
1404
  - garlic (1)
1405
  - yam (1)
1406
  Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
1407
-
1408
-
1409
- 'formal_fallacies': """Distinguish deductively valid arguments from formal fallacies.
1410
 
1411
  Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
1412
  Is the argument, given the explicitly stated premises, deductively valid or invalid?
@@ -1450,9 +1408,7 @@ By (1), we have if X = infrequent-user(Paul Mitchell), then X = rare-consumer(Ni
1450
  The case X = rare-consumer(Nioxin) does not appear in (2).
1451
  The case X = loyal-buyer(Caress) does not appear in (2), either.
1452
  So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
1453
-
1454
-
1455
- 'causal_judgement': """Answer questions about causal attribution.
1456
 
1457
  Q: How would a typical person answer each of the following questions about causation?
1458
  Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
@@ -1477,7 +1433,7 @@ Options:
1477
  - No
1478
  A: Let's think step by step.
1479
  Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
1480
- 'web_of_lies': """Evaluate a random boolean function expressed as a word problem.
1481
 
1482
  Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
1483
  A: Let's think step by step.
 
9
  )
10
 
11
 
12
def return_blank_if_exception(func):
    """Decorate ``func`` so that any ``Exception`` it raises yields ``""``.

    Args:
        func: The callable to guard.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged, or the empty string when
        ``func`` raises an ``Exception`` subclass.
    """
    import functools  # local import keeps this block self-contained

    # functools.wraps copies __name__, __doc__, etc. onto the wrapper so the
    # decorated function stays identifiable in tracebacks and introspection.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Deliberate best-effort: a blank string is returned instead of
            # propagating the error.
            return ""

    return wrapper
20
+
21
+
22
  def extract_choice_zh(gen):
23
  # 答案是A | 选项是A | 应该选A选项
24
  res = re.search(
 
174
  # "\\text{ " only ever occurs (at least in the val set) when describing units
175
  if "\\text{ " in string:
176
  splits = string.split("\\text{ ")
177
+ # assert len(splits) == 2
178
  return splits[0]
179
  else:
180
  return string
 
288
 
289
  # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
290
  # string = fix_a_slash_b(string)
291
+ if string.startswith("\\boxed{") and string.endswith("}"):
292
+ string = string[7:-1]
293
+ string = string.split("=")[-1]
294
  return string
295
 
296
 
297
  def get_answer(string):
298
+ answer = remove_boxed(last_boxed_only_string(string))
 
 
 
 
299
  return answer
300
 
301
 
 
315
  except Exception:
316
  return str1 == str2
317
 
318
+
319
  def first_option_postprocess(text: str, options: str) -> str:
320
  """Find first valid option for text."""
321
 
322
  patterns = [
323
+ f"[Tt]he answer is [{options}]",
324
+ f"[Tt]he correct answer\s?(?:option)?\s?is [{options}]", # noqa
325
+ f"答案(?:选项)?是(.*?)[{options}]",
326
+ f"答案(?:选项)?为(.*?)[{options}]",
327
+ f"答案(?:选项)?选(.*?)[{options}]",
328
+ f"选项[{options}]是?正确",
329
+ f"选项[{options}]为?正确",
330
+ f"固选(.*?)[{options}]",
331
+ f"答案应该是(.*?)[{options}]",
332
+ f"(\s|^)[{options}][\s。,,\.$]", # noqa
333
+ f"[{options}]",
334
  ]
335
 
336
  regexes = [re.compile(pattern) for pattern in patterns]
 
341
  for i in options:
342
  if i in outputs:
343
  return i
344
+ return ""
345
 
346
 
347
  def first_capital_postprocess(text: str) -> str:
348
  for t in text:
349
  if t.isupper():
350
  return t
351
+ return ""
352
 
353
 
354
  def general_postprocess(text: str) -> str:
355
  # Cut off the first newline, period, or comma
356
+ truncated_text = re.split(r"[\n.,]", text, 1)[0]
357
 
358
  # Remove punctuation
359
+ no_punctuation = re.sub(r"[^\w\s]", "", truncated_text)
360
 
361
  # Remove article
362
+ no_articles = re.sub(r"\b(a|an|the)\b", "", no_punctuation, flags=re.IGNORECASE)
 
 
 
363
 
364
  # Remove duplicated blank spaces
365
+ cleaned_text = re.sub(r"\s+", " ", no_articles).strip()
366
 
367
  return cleaned_text
368
 
369
+
370
  def bbh_mcq_postprocess(text: str) -> str:
371
  ans = text
372
+ ans_line = ans.split("answer is ")
373
  if len(ans_line) != 1:
374
  ans = ans_line[-1].strip()
375
+ match = re.search(r"\(([A-Z])\)*", ans)
376
  if match:
377
  return f"({match.group(1)})"
378
+ match = re.search(r"([A-Z])", ans)
379
  if match:
380
  return f"({match.group(1)})"
381
  return f"({ans})"
 
383
 
384
  def bbh_freeform_postprocess(text: str) -> str:
385
  ans = text
386
+ ans_line = ans.split("answer is ")
387
  if len(ans_line) != 1:
388
  ans = ans_line[-1].strip()
389
+ ans = ans.split("\n")[0]
390
+ if ans.endswith("."):
391
  ans = ans[:-1]
392
  return ans
393
 
394
 
 
395
  icl_prompts = {
396
+ "temporal_sequences": """Task description: Answer questions about which times certain events could have occurred.
397
 
398
  Q: Today, Emily went to the museum. Between what times could they have gone?
399
  We know that:
 
470
  3pm-4pm: free.
471
  The beach closure time: 4pm.
472
  The only time when Tiffany could have gone to the beach was 3pm to 4pm. So the answer is (D).""",
473
+ "disambiguation_qa": """Clarify the meaning of sentences with ambiguous pronouns.
 
 
474
 
475
  Q: In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.
476
  Sentence: The chief told the counselor that they took the day off.
 
507
  Let's consider Y first: "X will plan to meet Y at Y's office." This case makes sense, because X might want to meet up with Y at Y's office.
508
  Now, consider X: "X will plan to meet Y at X's office." This case also makes sense, because X might want to meet up with Y at X's own office.
509
  Because both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).""",
510
+ "date_understanding": """Infer the date from context.
 
 
511
 
512
  Q: Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?
513
  Options:
 
540
  (F) 12/03/1960
541
  A: Let's think step by step.
542
  If Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).""",
543
+ "tracking_shuffled_objects_three_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
544
 
545
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
546
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
580
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
581
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
582
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
583
+ "penguins_in_a_table": """Answer questions about a table of penguins and their attributes.
 
 
584
 
585
  Q: Here is a table where the first line is a header and each subsequent line is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add a penguin to the table:
586
  James, 12, 90, 12
 
621
  When we sort their names alphabetically, we get Bernard, Gwen, Louis, Vincent.
622
  The name of the second penguin sorted by alphabetical order is Gwen.
623
  The name of the second penguin sorted by alphabetic order is Gwen. So the answer is (D).""",
624
+ "geometric_shapes": """Name geometric shapes from their SVG paths.
 
 
625
 
626
  Q: This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00 L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/> draws a
627
  Options:
 
699
  Length of side C: |C| = sqrt((41.00-45.00)^2 + (33.00-34.00)^2) = sqrt((-4)^2 + (-1)^2) = sqrt(16 + 1) = sqrt(17).
700
  Length of side D: |D| = sqrt((45.00-41.00)^2 + (34.00-43.00)^2) = sqrt((4)^2 + (-9)^2) = sqrt(16 + 81) = sqrt(97).
701
  Note that |A| = |D| and |B| = |C|. Furthermore, A and D are adjacent and B and C are adjacent. Thus, this polygon has two pairs of equal-length adjacent sides and is "kite". So the answer is (D).""",
702
+ "snarks": """Determine which of two sentences is sarcastic.
 
 
703
 
704
  According to Cambridge University Dictionary, sarcasm is "the use of remarks that clearly mean the opposite of what they say, made in order to hurt someone's feelings or to criticize something in a humorous way." Sarcastic sentences often contain satirical or ironic utterances, hyperboles, ambivalent or witty remarks.
705
 
 
729
  If we look at (A), it likens the consistency in the league's punishments with that in politics. Because politics or political affairs are often not considered to be consistent or dependable, this sentence appears to be satirical.
730
  If we look at (B), it likens the consistency in the league's punishments with that in morality. Discussing the consistency of the league's punishments in the context of morality, ethics, or law makes sense and does not appear to make a satirical point about anything.
731
  Given the above, the sarcastic option is (A). So the answer is (A).""",
732
+ "ruin_names": """Select the humorous edit that 'ruins' the input movie or musical artist name.
 
 
733
 
734
  Q: Which of the following is a humorous edit of this artist or movie name: 'whitesnake'?
735
  Options:
 
773
  (C) "courting crows": Here the word "counting" is changed to "courting", and "courting" is an actual word; however, "courting crows" does not sound as humorous as "counting cows".
774
  (D) "coutnting crows": Here the word "counting" is changed to "coutnting", but the word "coutnting" is not an actual word.
775
  Given the above, the only humorous edit is (B). So the answer is (B).""",
776
+ "tracking_shuffled_objects_seven_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
777
 
778
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
779
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
813
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
814
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
815
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
816
+ "tracking_shuffled_objects_five_objects": """A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.
 
 
817
 
818
  Q: Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire has a pink ball.
819
  As the game progresses, pairs of players trade balls. First, Claire and Alice swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. At the end of the game, Bob has the
 
853
  (2) Claire and Bob switch partners: Alice: Rodrigo, Bob: Patrick, Claire: Lola.
854
  (3) Bob and Alice switch partners: Alice: Patrick, Bob: Rodrigo, Claire: Lola.
855
  At the end of the dance, Alice is dancing with Patrick. So the answer is (C).""",
856
+ "logical_deduction_three_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
857
 
858
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
859
  Options:
 
890
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
891
  According to this ordering, the leftmost book is the white book.
892
  The white book is the leftmost. So the answer is (C).""",
893
+ "hyperbaton": """Order adjectives correctly in English sentences.
 
 
894
 
895
  Q: Which sentence has the correct adjective order:
896
  Options:
 
918
  When there is more than one adjective before a noun, the adjectives need to respect the following order before a noun: "[1. opinion] [2. size] [3. age] [4. shape] [5. color] [6. origin] [7. material] [8. purpose] noun".
919
  Option (A): "blue gold wonderful square shoe". (1) "blue" falls into the color category. (2) "gold" falls into the material category. (3) "wonderful" falls into the opinion category. (4) "square" falls into the shape category. The adjective order that Option (A) has is [5. color] [7. material] [1. opinion] [4. shape] (or, in numeric terms, 5 7 1 4). Because 5 < 7 < 1 < 4 is not correct, (A) does not have the correct ordering.
920
  Option (B): "wonderful square blue gold shoe". Option (B) has the following adjective order: [1. opinion] [4. shape] [5. color] [7. material] (or, in numeric terms, 1 4 5 7). Because 1 < 4 < 5 < 7 is correct, (B) has the correct ordering. So the answer is (B).""",
921
+ "logical_deduction_five_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
922
 
923
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
924
  Options:
 
955
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
956
  According to this ordering, the leftmost book is the white book.
957
  The white book is the leftmost. So the answer is (C).""",
958
+ "logical_deduction_seven_objects": """A logical deduction task which requires deducing the order of a sequence of objects.
 
 
959
 
960
  Q: The following paragraphs each describe a set of three objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished above Amy. Eli finished below Amy.
961
  Options:
 
992
  (3) Combining (1) and (2) we get the following ordering: "(left) white red gray (right)".
993
  According to this ordering, the leftmost book is the white book.
994
  The white book is the leftmost. So the answer is (C).""",
995
+ "movie_recommendation": """Recommend movies similar to the given list of movies.
 
 
996
 
997
  Q: Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big Lebowski:
998
  Options:
 
1034
  - Inside Out (animation, family, comedy; 2015)
1035
  - Forrest Gump (comedy, drama, romance; 1994)
1036
  These are all famous movies produced in the past few decades. Amongst all the options, the only movie similar to these ones seems to be Edge of Tomorrow (action, adventure, crime, mystery; 2014), as it is also a science-fiction movie and features Tom Cruise. So the answer is (D).""",
1037
+ "salient_translation_error_detection": """Detect the type of error in an English translation of a German source sentence.
 
 
1038
 
1039
  Q: The following translations from German to English contain a particular error. That error will be one of the following types: Named Entities: An entity (names, places, locations, etc.) is changed to a different entity. Numerical Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are changed. Negation or Antonyms: Introduce or remove a negation or change comparatives to their antonyms. Facts: Trivial factual errors not pertaining to the above classes are introduced in the translations. Dropped Content: A significant clause in the translation is removed. Please identify that error. Source: In der Liste der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet.
1040
  Translation: In the list of architectural monuments in Lenzen all architectural monuments of the Brandenburg city of Lenzen and its districts are listed.
 
1074
  (F) Facts
1075
  A: Let's think step by step.
1076
  We solve this question by first translating the source sentence to English and then by comparing our translation with the provided translation. According to Google Translate, the correct translation of the source sentence from German to English is "Łeba is a small town and seaside resort in the Powiat Lęborski of the Polish Pomeranian Voivodeship." On the other hand, the provided translation is "Łeba is not a small town and seaside resort in the Powiat Léborski county of the Pomeranian Voivodeship of Poland." Note that the provided sentence says, "Łeba is not a small town ..." However, the translation should have been "Łeba is a small town ..." Because a negation is introduced at the beginning of the sentence and has fundamentally changed the meaning of the original source, the translation contains an error pertaining to Negation or Antonyms. So the answer is (C).""",
1077
+ "reasoning_about_colored_objects": """Answer extremely simple questions about the colors of objects on a surface.
 
 
1078
 
1079
  Q: On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the stress ball?
1080
  Options:
 
1137
  According to this question, the objects are arranged in a row, from left to right, as follows: (1) a teal plate, (2) a burgundy keychain, (3) a yellow scrunchiephone charger, (4) an orange mug, (5) a pink notebook, (6) a grey cup.
1138
  The teal plate is the first item, namely (1). There is no item to the left of the teal item.
1139
  The number of non-orange items to the left of the teal item is zero. So the answer is (A).""",
1140
+ "multistep_arithmetic_two": """Solve multi-step arithmetic problems.
 
1141
 
1142
  Q: ((-5 + 9 * -4 - 0) * (4 + -7 + 0 * -5)) =
1143
  A: Let's think step by step.
 
1162
  Let's calculate A = (-3 + 5 * 8 * -4) = (-3 + (5 * 8) * -4) = (-3 + (40) * -4) = (-3 + (40 * -4)) = (-3 + -160) = -163.
1163
  Let's calculate B = (9 - 8 * -7 + -9) = (9 - (8 * -7) + -9) = (9 - (-56) + -9) = ((9 - (-56)) + -9) = ((65) + -9)= (65 - 9) = 56.
1164
  Then, the final equation is A - B = -163 - 56 = -219. So the answer is -219.""",
1165
+ "navigate": """Given a series of navigation instructions, determine whether one would end up back at the starting point.
 
 
1166
 
1167
  Q: If you follow these instructions, do you return to the starting point? Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.
1168
  Options:
 
1205
  (3) Take 9 steps left: (0, -7), facing the positive y-axis.
1206
  (4) Take 7 steps right: (0, 0), facing the positive y-axis.
1207
  Since (0, 0) is (0, 0), we are indeed where we started. So the answer is Yes.""",
1208
+ "dyck_languages": """Correctly close a Dyck-n word.
 
 
1209
 
1210
  Q: Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [
1211
  A: Let's think step by step.
 
1277
  Now, we have reached the end. The final stack is "< [ {".
1278
  We will need to pop out "{", "[", "<" one by one in that order.
1279
  So, we need "}", "]", ">". So the answer is } ] >.""",
1280
+ "word_sorting": """Sort a list of words.
 
 
1281
 
1282
  Q: Sort the following words alphabetically: List: oven costume counterpart
1283
  A: Let's think step by step.
 
1294
  A: Let's think step by step.
1295
  The first letter: "newt": "n" (14). "arson": "a" (1). "parthia": "p" (16). "seismography": "s" (19). "mugho": "m" (13). "aspect": "a" (1). "census": "c" (3). We now have: (1) ["arson" ? "aspect"] < (3) "census" < (13) "mugho" < (14) "newt" < (16) "parthia" < (19) "seismography". Now let's sort this subpart ["arson" ? "aspect"] by looking at their second letters.
1296
  The second letter: "arson": "r" (18). "aspect": "s" (19). We now have: (18) "arson" < (19) "aspect". Hence, we have ["arson" < "aspect"] < "census" < "mugho" < "newt" < "parthia" < "seismography". So the answer is arson aspect census mugho newt parthia seismography.""",
1297
+ "sports_understanding": """Determine whether an artificially constructed sentence relating to sports is plausible or not.
 
 
1298
 
1299
  Q: Is the following sentence plausible? "Bam Adebayo scored a reverse layup in the Western Conference Finals."
1300
  A: Let's think step by step. Bam Adebayo is an American basketball player. Scoring a reverse layup in the Western Conference Finals is part of the NBA Finals. So the answer is yes.
 
1304
 
1305
  Q: Is the following sentence plausible? "DeMar DeRozan was called for the goal tend."
1306
  A: Let's think step by step. DeMar DeRozan is an American basketball player. Goal tending is part of basketball. So the answer is yes.""",
1307
+ "boolean_expressions": """Evaluate the result of a random Boolean expression.
 
 
1308
 
1309
  Q: not ( ( not not True ) ) is
1310
  A: Let's think step by step.
 
1327
  We first simplify this expression "Z" as follows: "Z = not not ( not ( False ) ) = not not ( A )" where "A = not ( False )".
1328
  Let's evaluate A: A = not ( False ) = not False = True.
1329
  Plugging in A, we get: Z = not not ( A ) = not not (True) = not False = True. So the answer is True.""",
1330
+ "object_counting": """Questions that involve enumerating objects and asking the model to count them.
 
 
1331
 
1332
  Q: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, a flute, an orange, and a violin. How many fruits do I have?
1333
  A: Let's think step by step.
 
1364
  - garlic (1)
1365
  - yam (1)
1366
  Now, let's add the numbers in parentheses: 1 + 1 + 1 + 1 + 2 + 1 + 1 = 8. So the answer is 8.""",
1367
+ "formal_fallacies": """Distinguish deductively valid arguments from formal fallacies.
 
 
1368
 
1369
  Q: "It is not always easy to see who is related to whom -- and in which ways. The following argument pertains to this question: To begin with, Lesley is a close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate of Lowell is sufficient for being a great-grandfather of Leroy. It follows that Lesley is a great-grandfather of Leroy."
1370
  Is the argument, given the explicitly stated premises, deductively valid or invalid?
 
1408
  The case X = rare-consumer(Nioxin) does not appear in (2).
1409
  The case X = loyal-buyer(Caress) does not appear in (2), either.
1410
  So, from (1) and (2), we cannot necessarily deduce the Hypothesis. So the answer is invalid.""",
1411
+ "causal_judgement": """Answer questions about causal attribution.
 
 
1412
 
1413
  Q: How would a typical person answer each of the following questions about causation?
1414
  Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?
 
1433
  - No
1434
  A: Let's think step by step.
1435
  Here in this question, we are told that "He aims the dart at the low point region." A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.""",
1436
+ "web_of_lies": """Evaluate a random boolean function expressed as a word problem.
1437
 
1438
  Q: Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent lies. Does Raymond tell the truth?
1439
  A: Let's think step by step.