cccjc committed
Commit 4301eca
1 Parent(s): 8c04f42

Update single-image results, add model link url

app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from utils import DefaultDataLoader, CoreSingleDataLoader
+from utils import DefaultDataLoader, SingleImageDataLoader
 import os
 from constants import *
 
@@ -9,19 +9,19 @@ current_dir = os.path.dirname(os.path.abspath(__file__))
 # Construct paths to CSS files
 base_css_file = os.path.join(current_dir, "static", "css", "style.css")
 default_css_file = os.path.join(current_dir, "static", "css", "default.css")
-core_single_css_file = os.path.join(current_dir, "static", "css", "core_single.css")
+si_css_file = os.path.join(current_dir, "static", "css", "single_image.css")
 
 # Read CSS files
 with open(base_css_file, "r") as f:
     base_css = f.read()
 with open(default_css_file, "r") as f:
     default_css = f.read()
-with open(core_single_css_file, "r") as f:
-    core_single_css = f.read()
+with open(si_css_file, "r") as f:
+    si_css = f.read()
 
 # Initialize data loaders
 default_loader = DefaultDataLoader()
-core_single_loader = CoreSingleDataLoader()
+si_loader = SingleImageDataLoader()
 
 with gr.Blocks() as block:
     # Add a style element that we'll update
@@ -49,14 +49,14 @@ with gr.Blocks() as block:
 
     with gr.Row():
         table_selector = gr.Radio(
-            choices=["Default", "Core Single-image"],
-            label="Select table to display",
+            choices=["Default", "Single Image"],
+            label="Select table to display. Default: all MEGA-Bench tasks; Single Image: single-image tasks only.",
             value="Default"
        )
 
     # Define different captions for each table
     default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
-    core_single_image_caption = "**Table 2: MEGA-Bench Core Single-image results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set of the benchmark. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
+    single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
 
     caption_component = gr.Markdown(
         value=default_caption,
@@ -80,10 +80,10 @@
     data_component = gr.Dataframe(
         value=initial_data,
         headers=initial_headers,
-        datatype=["str"] + ["number"] * (len(initial_headers) - 1),
+        datatype=["html"] + ["number"] * (len(initial_headers) - 1),
         interactive=False,
         elem_classes="custom-dataframe",
-        max_height=1200,
+        max_height=2400,
     )
 
     def update_table_and_caption(table_type, super_group, model_group):
@@ -91,23 +91,24 @@
             headers, data = default_loader.get_leaderboard_data(super_group, model_group)
             caption = default_caption
             current_css = f"{base_css}\n{default_css}"
-        else: # Core Single-image
-            headers, data = core_single_loader.get_leaderboard_data(super_group, model_group)
-            caption = core_single_image_caption
-            current_css = f"{base_css}\n{core_single_css}"
+        else: # Single-image
+            headers, data = si_loader.get_leaderboard_data(super_group, model_group)
+            caption = single_image_caption
+            current_css = f"{base_css}\n{si_css}"
 
         return [
             gr.Dataframe(
                 value=data,
                 headers=headers,
-                datatype=["str"] + ["number"] * (len(headers) - 1),
+                datatype=["html"] + ["number"] * (len(headers) - 1),
+                interactive=False,
             ),
             caption,
             f"<style>{current_css}</style>"
         ]
 
     def update_selectors(table_type):
-        loader = default_loader if table_type == "Default" else core_single_loader
+        loader = default_loader if table_type == "Default" else si_loader
         return [
            gr.Radio(choices=list(loader.SUPER_GROUPS.keys())),
            gr.Radio(choices=list(loader.MODEL_GROUPS.keys()))
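
For reference, the Overall metric in the Table 1 caption above is a task-weighted combination of the Core and Open-ended means. A minimal sketch of that computation (the function and constant names are illustrative, not from this repo):

    # Overall score as defined in the Table 1 caption (illustrative names).
    N_CORE, N_OPEN = 440, 65

    def overall_score(core_no_cot: float, core_cot: float, open_ended: float) -> float:
        core = max(core_no_cot, core_cot)  # better of the two Core settings
        return (core * N_CORE + open_ended * N_OPEN) / (N_CORE + N_OPEN)

    # Example: overall_score(0.50, 0.52, 0.60) = (0.52*440 + 0.60*65) / 505 ≈ 0.530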
static/css/{core_single.css → single_image.css} RENAMED
@@ -1,18 +1,18 @@
-.custom-dataframe thead th:nth-child(-n+2),
-.custom-dataframe tbody td:nth-child(-n+2) {
+.custom-dataframe thead th:nth-child(-n+4),
+.custom-dataframe tbody td:nth-child(-n+4) {
     background-color: var(--global-column-background) !important;
 }
 
-.custom-dataframe thead th:nth-child(n+3),
-.custom-dataframe tbody td:nth-child(n+3) {
+.custom-dataframe thead th:nth-child(n+5),
+.custom-dataframe tbody td:nth-child(n+5) {
     background-color: var(--dimension-column-background) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+2) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+4) {
     background-color: var(--row-even-global) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+3) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+5) {
     background-color: var(--row-even-dimension) !important;
 }
 
@@ -33,21 +33,21 @@
     color: var(--text-color) !important;
 }
 
-.custom-dataframe thead th:nth-child(-n+2),
-.custom-dataframe tbody td:nth-child(-n+2) {
+.custom-dataframe thead th:nth-child(-n+4),
+.custom-dataframe tbody td:nth-child(-n+4) {
     background-color: var(--global-column-background) !important;
 }
 
-.custom-dataframe thead th:nth-child(n+3),
-.custom-dataframe tbody td:nth-child(n+3) {
+.custom-dataframe thead th:nth-child(n+5),
+.custom-dataframe tbody td:nth-child(n+5) {
     background-color: var(--dimension-column-background) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+2) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+4) {
     background-color: var(--row-even-global) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+3) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+5) {
     background-color: var(--row-even-dimension) !important;
 }
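
The only substantive change in this stylesheet is the split point between the frozen "global" columns and the per-keyword columns: the first 2 columns in core_single.css versus the first 4 in single_image.css (the single-image table carries extra summary columns). A hypothetical Python helper that emits this selector pattern for a given column count, just to show the parametrization:

    # Hypothetical generator for the repeated nth-child rules above.
    def global_column_rules(n_global: int) -> str:
        # Columns 1..n_global get the "global" background; later columns
        # get the "dimension" background.
        return (
            f".custom-dataframe thead th:nth-child(-n+{n_global}),\n"
            f".custom-dataframe tbody td:nth-child(-n+{n_global}) {{\n"
            "    background-color: var(--global-column-background) !important;\n"
            "}\n\n"
            f".custom-dataframe thead th:nth-child(n+{n_global + 1}),\n"
            f".custom-dataframe tbody td:nth-child(n+{n_global + 1}) {{\n"
            "    background-color: var(--dimension-column-background) !important;\n"
            "}\n"
        )

    print(global_column_rules(4))  # reproduces the single_image.css selectors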
 
static/eval_results/Core_SI/all_summary.json DELETED
@@ -1,227 +0,0 @@
-{
-    "Aquila_VL_2B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.20770364903712493,
-        "micro_mean_score": 0.20333142638522636,
-        "missing_tasks": []
-    },
-    "Aria": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.3178882776147889,
-        "micro_mean_score": 0.3101511832828904,
-        "missing_tasks": []
-    },
-    "Claude_3.5": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4116,
-        "macro_mean_score": 0.520276385877485,
-        "micro_mean_score": 0.520276385877485
-    },
-    "Claude_3.5_new": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4116,
-        "macro_mean_score": 0.5462752278980763,
-        "micro_mean_score": 0.5462752278980763
-    },
-    "GPT_4o": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4116,
-        "macro_mean_score": 0.5529953662872719,
-        "micro_mean_score": 0.5529953662872719
-    },
-    "GPT_4o_mini": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4116,
-        "macro_mean_score": 0.44285970964797233,
-        "micro_mean_score": 0.44285970964797233
-    },
-    "Gemini_1.5_flash_002": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4116,
-        "macro_mean_score": 0.42188460865574384,
-        "micro_mean_score": 0.42188460865574384
-    },
-    "Gemini_1.5_pro_002": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4116,
-        "macro_mean_score": 0.4914311038229404,
-        "micro_mean_score": 0.4914311038229404
-    },
-    "Idefics3": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.08941182847569326,
-        "micro_mean_score": 0.08779475233900695,
-        "missing_tasks": []
-    },
-    "InternVL2_2B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.12069001041308772,
-        "micro_mean_score": 0.11842605219090299,
-        "missing_tasks": []
-    },
-    "InternVL2_76B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.3998616568018755,
-        "micro_mean_score": 0.39149064302628933,
-        "missing_tasks": []
-    },
-    "InternVL2_8B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.27650612401825575,
-        "micro_mean_score": 0.27119471729837735,
-        "missing_tasks": []
-    },
-    "Llama_3_2_11B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.20789144960796493,
-        "micro_mean_score": 0.20163641703273802,
-        "missing_tasks": []
-    },
-    "MiniCPM_v2.6": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.23230765810722817,
-        "micro_mean_score": 0.22684118052665975,
-        "missing_tasks": []
-    },
-    "Molmo_72B": {
-        "num_eval_tasks": 270,
-        "num_eval_samples": 4073,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4331,
-        "macro_mean_score": 0.36480000609384927,
-        "micro_mean_score": 0.36205779758110807,
-        "missing_tasks": [
-            "table_understanding",
-            "MMSoc_Misinformation_PolitiFact",
-            "planning_screenshot_termes"
-        ]
-    },
-    "Molmo_7B_D": {
-        "num_eval_tasks": 272,
-        "num_eval_samples": 4102,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4362,
-        "macro_mean_score": 0.2098088446992518,
-        "micro_mean_score": 0.20550929661464645,
-        "missing_tasks": [
-            "MMSoc_Misinformation_PolitiFact"
-        ]
-    },
-    "NVLM": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.32989872890926025,
-        "micro_mean_score": 0.32315683713111915,
-        "missing_tasks": []
-    },
-    "POINTS_7B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.25511317681632334,
-        "micro_mean_score": 0.24927711632415062,
-        "missing_tasks": []
-    },
-    "Phi-3.5-vision": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.2561274958722834,
-        "micro_mean_score": 0.2504214576875906,
-        "missing_tasks": []
-    },
-    "Pixtral_12B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.3436942439614412,
-        "micro_mean_score": 0.3373564384613738,
-        "missing_tasks": []
-    },
-    "Qwen2_VL_2B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.22787906973244856,
-        "micro_mean_score": 0.2234748515064842,
-        "missing_tasks": []
-    },
-    "Qwen2_VL_72B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.4730536307784527,
-        "micro_mean_score": 0.4659830915476831,
-        "missing_tasks": []
-    },
-    "Qwen2_VL_7B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.3538656561495699,
-        "micro_mean_score": 0.34581250459157137,
-        "missing_tasks": []
-    },
-    "llava_onevision_72B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.312618242621264,
-        "micro_mean_score": 0.3098623876487132,
-        "missing_tasks": []
-    },
-    "llava_onevision_7B": {
-        "num_eval_tasks": 273,
-        "num_eval_samples": 4116,
-        "num_not_eval_samples": 0,
-        "num_total_samples": 4377,
-        "macro_mean_score": 0.23683339637631812,
-        "micro_mean_score": 0.23283041278687175,
-        "missing_tasks": []
-    }
-}
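
Each entry in this deleted summary reported two aggregates that are easy to conflate: macro_mean_score appears to weight every task equally, while micro_mean_score appears to weight every sample equally. A small sketch of the distinction, with made-up numbers (not from the file):

    # Macro vs. micro averaging over (per-task mean score, sample count) pairs.
    tasks = [(0.90, 10), (0.30, 90)]  # illustrative (mean score, n_samples) data

    macro = sum(s for s, _ in tasks) / len(tasks)                    # 0.60: tasks weighted equally
    micro = sum(s * n for s, n in tasks) / sum(n for _, n in tasks)  # 0.36: samples weighted equally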
static/eval_results/Default/all_model_keywords_stats.json CHANGED
@@ -239,25 +239,25 @@
 "count": 303,
 "num_samples": 4755,
 "tasks": [],
-"average_score": 0.5201947642961418
+"average_score": 0.5202055934299538
 },
 "Text Recognition (OCR)": {
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.49947304390648534
+"average_score": 0.5017043129027509
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.5512750115216515
+"average_score": 0.5532599716027446
 },
 "Scene and Event Understanding": {
 "count": 154,
 "num_samples": 2467,
 "tasks": [],
-"average_score": 0.5467324805307577
+"average_score": 0.546753787203128
 },
 "Mathematical and Logical Reasoning": {
 "count": 109,
@@ -269,7 +269,7 @@
 "count": 51,
 "num_samples": 855,
 "tasks": [],
-"average_score": 0.5750369536204262
+"average_score": 0.5751012914154264
 },
 "Ethical and Safety Reasoning": {
 "count": 15,
@@ -301,7 +301,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.4592162957187749
+"average_score": 0.4625032188638111
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -331,7 +331,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.5500305447809621
+"average_score": 0.55005349042813
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -351,7 +351,7 @@
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.44137714463131966
+"average_score": 0.44418591808616864
 },
 "exact_text": {
 "count": 83,
@@ -389,7 +389,7 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.5295721925617263
+"average_score": 0.5370278962809547
 },
 "1-image": {
 "count": 315,
@@ -413,7 +413,7 @@
 "count": 51,
 "num_samples": 802,
 "tasks": [],
-"average_score": 0.4553778359922855
+"average_score": 0.45544217378728585
 }
 },
 "app": {
@@ -421,7 +421,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.5378983862471568
+"average_score": 0.5421439953094952
 },
 "Planning": {
 "count": 78,
@@ -457,7 +457,7 @@
 "count": 97,
 "num_samples": 1605,
 "tasks": [],
-"average_score": 0.5721991184410764
+"average_score": 0.5722329455291694
 },
 "Mathematics": {
 "count": 33,
@@ -479,13 +479,13 @@
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.4317914359988347
+"average_score": 0.4337278553354258
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.49775198805427967
+"average_score": 0.49947464681475356
 },
 "Scene and Event Understanding": {
 "count": 154,
@@ -535,7 +535,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.3836737169374586
+"average_score": 0.3865262916591035
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -585,7 +585,7 @@
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.3962715194192418
+"average_score": 0.39868324168390534
 },
 "exact_text": {
 "count": 83,
@@ -623,7 +623,7 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.4300676062024303
+"average_score": 0.43653808057103954
 },
 "1-image": {
 "count": 315,
@@ -655,7 +655,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.4627701625196691
+"average_score": 0.46645473820179373
 },
 "Planning": {
 "count": 78,
@@ -713,13 +713,13 @@
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.6046357055234819
+"average_score": 0.6082834220752651
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.5712627152062051
+"average_score": 0.5745077617490254
 },
 "Scene and Event Understanding": {
 "count": 154,
@@ -769,7 +769,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.5637906302497772
+"average_score": 0.5691641481808987
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -819,7 +819,7 @@
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.4926030136534706
+"average_score": 0.4971460788134188
 },
 "exact_text": {
 "count": 83,
@@ -857,7 +857,7 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.5292494759360522
+"average_score": 0.5414381873407914
 },
 "1-image": {
 "count": 315,
@@ -889,7 +889,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.6593763006847053
+"average_score": 0.6663170946790707
 },
 "Planning": {
 "count": 78,
@@ -1175,25 +1175,25 @@
 "count": 303,
 "num_samples": 4755,
 "tasks": [],
-"average_score": 0.44928744961868194
+"average_score": 0.4492982787524939
 },
 "Text Recognition (OCR)": {
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.48842488118273475
+"average_score": 0.49026056071002017
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.5152626716886682
+"average_score": 0.5168957112681365
 },
 "Scene and Event Understanding": {
 "count": 154,
 "num_samples": 2467,
 "tasks": [],
-"average_score": 0.4672966076116977
+"average_score": 0.46731791428406805
 },
 "Mathematical and Logical Reasoning": {
 "count": 109,
@@ -1205,7 +1205,7 @@
 "count": 51,
 "num_samples": 855,
 "tasks": [],
-"average_score": 0.5572281917334303
+"average_score": 0.5572925295284307
 },
 "Ethical and Safety Reasoning": {
 "count": 15,
@@ -1237,7 +1237,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.4700389569079038
+"average_score": 0.47202628409684394
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -1267,7 +1267,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.46468618797917643
+"average_score": 0.465175334092545
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -1281,13 +1281,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.41174000979649644
+"average_score": 0.41242028190533997
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.38893151244736324
+"average_score": 0.3906415365938764
 },
 "exact_text": {
 "count": 83,
@@ -1325,13 +1325,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.4260710116168476
+"average_score": 0.4305788513381019
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.46322170353087255
+"average_score": 0.46343334374251277
 },
 "video": {
 "count": 43,
@@ -1349,7 +1349,7 @@
 "count": 51,
 "num_samples": 802,
 "tasks": [],
-"average_score": 0.3697506340557095
+"average_score": 0.36981497185070983
 }
 },
 "app": {
@@ -1357,7 +1357,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.5640948591986592
+"average_score": 0.5666618234843734
 },
 "Planning": {
 "count": 78,
@@ -1375,7 +1375,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.43544861040322835
+"average_score": 0.43590838051817093
 },
 "Metrics": {
 "count": 20,
@@ -1393,7 +1393,7 @@
 "count": 97,
 "num_samples": 1605,
 "tasks": [],
-"average_score": 0.5398829253460956
+"average_score": 0.5399167524341886
 },
 "Mathematics": {
 "count": 33,
@@ -1409,25 +1409,25 @@
 "count": 303,
 "num_samples": 4755,
 "tasks": [],
-"average_score": 0.49774395003470484
+"average_score": 0.49787264809826687
 },
 "Text Recognition (OCR)": {
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.538829507114716
+"average_score": 0.5439010430283516
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.534480883952292
+"average_score": 0.5392244859385411
 },
 "Scene and Event Understanding": {
 "count": 154,
 "num_samples": 2467,
 "tasks": [],
-"average_score": 0.5092565754998357
+"average_score": 0.509277882172206
 },
 "Mathematical and Logical Reasoning": {
 "count": 109,
@@ -1439,7 +1439,7 @@
 "count": 51,
 "num_samples": 855,
 "tasks": [],
-"average_score": 0.5676174603436022
+"average_score": 0.5676817981386025
 },
 "Ethical and Safety Reasoning": {
 "count": 15,
@@ -1471,7 +1471,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.5356361790015363
+"average_score": 0.5402397677488632
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -1501,7 +1501,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.495761900914191
+"average_score": 0.49789939867591104
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -1515,13 +1515,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.4444770652190341
+"average_score": 0.44719815365440824
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.44584364394901616
+"average_score": 0.4500902736468407
 },
 "exact_text": {
 "count": 83,
@@ -1559,13 +1559,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.5364299983756791
+"average_score": 0.5468722850464449
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.4908605783408196
+"average_score": 0.4918205178721877
 },
 "video": {
 "count": 43,
@@ -1583,7 +1583,7 @@
 "count": 51,
 "num_samples": 802,
 "tasks": [],
-"average_score": 0.45169664275718613
+"average_score": 0.45176098055218655
 }
 },
 "app": {
@@ -1591,7 +1591,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.5748195752273694
+"average_score": 0.5807658773593334
 },
 "Planning": {
 "count": 78,
@@ -1609,7 +1609,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.5343715685033166
+"average_score": 0.5362106489630868
 },
 "Metrics": {
 "count": 20,
@@ -1627,7 +1627,7 @@
 "count": 97,
 "num_samples": 1605,
 "tasks": [],
-"average_score": 0.5162919233645259
+"average_score": 0.5166939389651373
 },
 "Mathematics": {
 "count": 33,
@@ -1643,25 +1643,25 @@
 "count": 303,
 "num_samples": 4755,
 "tasks": [],
-"average_score": 0.370836862933556
+"average_score": 0.3708368629321668
 },
 "Text Recognition (OCR)": {
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.39973692484032347
+"average_score": 0.40213773918065815
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2511,
 "tasks": [],
-"average_score": 0.4012977216731433
+"average_score": 0.4034335110538307
 },
 "Scene and Event Understanding": {
 "count": 154,
 "num_samples": 2469,
 "tasks": [],
-"average_score": 0.410990923097227
+"average_score": 0.4109909230944937
 },
 "Mathematical and Logical Reasoning": {
 "count": 109,
@@ -1673,7 +1673,7 @@
 "count": 51,
 "num_samples": 855,
 "tasks": [],
-"average_score": 0.493608784197707
+"average_score": 0.49360878418945336
 },
 "Ethical and Safety Reasoning": {
 "count": 15,
@@ -1705,7 +1705,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.3814353882556586
+"average_score": 0.3821046882337143
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -1735,7 +1735,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.4047366473438155
+"average_score": 0.40660144920567376
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -1749,13 +1749,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.3403519326516044
+"average_score": 0.3430730210869785
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.3420538306638288
+"average_score": 0.3426196933687219
 },
 "exact_text": {
 "count": 83,
@@ -1793,13 +1793,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.37301502633138073
+"average_score": 0.37453319457428763
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.3761693199448087
+"average_score": 0.37701588079136955
 },
 "video": {
 "count": 43,
@@ -1817,7 +1817,7 @@
 "count": 51,
 "num_samples": 802,
 "tasks": [],
-"average_score": 0.33008667137716374
+"average_score": 0.33008667136891007
 }
 },
 "app": {
@@ -1825,7 +1825,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.42660307298355216
+"average_score": 0.42746758545520747
 },
 "Planning": {
 "count": 78,
@@ -1843,7 +1843,7 @@
 "count": 145,
 "num_samples": 2315,
 "tasks": [],
-"average_score": 0.39864841947520724
+"average_score": 0.40048749993497734
 },
 "Metrics": {
 "count": 20,
@@ -1861,7 +1861,7 @@
 "count": 97,
 "num_samples": 1605,
 "tasks": [],
-"average_score": 0.42766370932167636
+"average_score": 0.4276637093173368
 },
 "Mathematics": {
 "count": 33,
@@ -1883,13 +1883,13 @@
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.282401662313336
+"average_score": 0.2834675874668524
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.36653344218973427
+"average_score": 0.3674817002808495
 },
 "Scene and Event Understanding": {
 "count": 154,
@@ -1939,7 +1939,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.23294708136735856
+"average_score": 0.23380046931752074
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -1969,7 +1969,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.42429297143518147
+"average_score": 0.4247591719013819
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -1983,13 +1983,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.28614732096244
+"average_score": 0.2868275930712835
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.25872873777911126
+"average_score": 0.259450238500612
 },
 "exact_text": {
 "count": 83,
@@ -2027,13 +2027,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.27911174307216713
+"average_score": 0.28104747671521785
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.3481968601113118
+"average_score": 0.34840850032295206
 },
 "video": {
 "count": 43,
@@ -2059,7 +2059,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.30653989171354723
+"average_score": 0.3076421844825067
 },
 "Planning": {
 "count": 78,
@@ -2077,7 +2077,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.38316803441883945
+"average_score": 0.38362780453378204
 },
 "Metrics": {
 "count": 20,
@@ -2117,13 +2117,13 @@
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.1902376706945491
+"average_score": 0.19077168655703208
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.255069390206439
+"average_score": 0.2555444562659206
 },
 "Scene and Event Understanding": {
 "count": 154,
@@ -2173,7 +2173,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.1466163383815089
+"average_score": 0.1466861610319767
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -2203,7 +2203,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.3258716730180874
+"average_score": 0.3263378734842879
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -2217,13 +2217,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.20209776978059824
+"average_score": 0.20277804188944173
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.18285692568564196
 },
 "exact_text": {
 "count": 83,
@@ -2261,13 +2261,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.13787962981142515
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.25459683619676365
 },
 "video": {
 "count": 43,
@@ -2293,7 +2293,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.19274192395698486
 },
 "Planning": {
 "count": 78,
@@ -2311,7 +2311,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.2845922887108415
 },
 "Metrics": {
 "count": 20,
@@ -2345,25 +2345,25 @@
 "count": 303,
 "num_samples": 4755,
 "tasks": [],
-"average_score": 0.38191947207402666
 },
 "Text Recognition (OCR)": {
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.4103649605406274
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.4341802504488193
 },
 "Scene and Event Understanding": {
 "count": 154,
 "num_samples": 2467,
 "tasks": [],
-"average_score": 0.42654142415639185
 },
 "Mathematical and Logical Reasoning": {
 "count": 109,
@@ -2375,7 +2375,7 @@
 "count": 51,
 "num_samples": 855,
 "tasks": [],
-"average_score": 0.5257357753421337
 },
 "Ethical and Safety Reasoning": {
 "count": 15,
@@ -2407,7 +2407,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.362195416198664
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -2437,7 +2437,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.42686510293379315
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -2451,13 +2451,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.3603288661353782
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.3465926907358438
 },
 "exact_text": {
 "count": 83,
@@ -2495,13 +2495,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.34490184941501867
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.41372274360003347
 },
 "video": {
 "count": 43,
@@ -2519,7 +2519,7 @@
 "count": 51,
 "num_samples": 802,
 "tasks": [],
-"average_score": 0.3152784738582855
 }
 },
 "app": {
@@ -2527,7 +2527,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.4290949563510903
 },
 "Planning": {
 "count": 78,
@@ -2545,7 +2545,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.4201902630957567
 },
 "Metrics": {
 "count": 20,
@@ -2563,7 +2563,7 @@
 "count": 97,
 "num_samples": 1605,
 "tasks": [],
-"average_score": 0.46253164682269177
 },
 "Mathematics": {
 "count": 33,
@@ -2585,13 +2585,13 @@
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.2794121858805306
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2511,
 "tasks": [],
-"average_score": 0.31918687243853283
 },
 "Scene and Event Understanding": {
 "count": 154,
@@ -2641,7 +2641,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.2277532691786533
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -2671,7 +2671,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.33787327172644077
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -2685,13 +2685,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.24944408255581693
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.25203287826995174
 },
 "exact_text": {
 "count": 83,
@@ -2729,13 +2729,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.19814032315010577
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.30046383040641306
 },
 "video": {
 "count": 43,
@@ -2761,7 +2761,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.29096771640715396
 },
 "Planning": {
 "count": 78,
@@ -2779,7 +2779,7 @@
 "count": 145,
 "num_samples": 2315,
 "tasks": [],
-"average_score": 0.3205471121079154
 },
 "Metrics": {
 "count": 20,
@@ -2813,25 +2813,25 @@
 "count": 303,
 "num_samples": 4755,
 "tasks": [],
-"average_score": 0.2604969133146555
 },
 "Text Recognition (OCR)": {
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.24828453993935928
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.2987613496312298
 },
 "Scene and Event Understanding": {
 "count": 154,
 "num_samples": 2467,
 "tasks": [],
-"average_score": 0.31808788094038193
 },
 "Mathematical and Logical Reasoning": {
 "count": 109,
@@ -2843,7 +2843,7 @@
 "count": 51,
 "num_samples": 855,
 "tasks": [],
-"average_score": 0.4073231792632807
 },
 "Ethical and Safety Reasoning": {
 "count": 15,
@@ -2875,7 +2875,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.21153173491931837
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -2905,7 +2905,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.31628986040092516
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -2919,13 +2919,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.23302306387939006
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.17775369699584467
 },
 "exact_text": {
 "count": 83,
@@ -2963,13 +2963,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.23499726844115643
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.2625611181730622
 },
 "video": {
 "count": 43,
@@ -2987,7 +2987,7 @@
 "count": 51,
 "num_samples": 802,
 "tasks": [],
-"average_score": 0.22288678972853282
 }
 },
 "app": {
@@ -2995,7 +2995,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.26614948589295767
 },
 "Planning": {
 "count": 78,
@@ -3013,7 +3013,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.2910511308735813
 },
 "Metrics": {
 "count": 20,
@@ -3031,7 +3031,7 @@
 "count": 97,
 "num_samples": 1605,
 "tasks": [],
-"average_score": 0.33187792895542906
 },
 "Mathematics": {
 "count": 33,
@@ -3053,13 +3053,13 @@
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.24734930136620975
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.2864612416413776
 },
 "Scene and Event Understanding": {
 "count": 154,
@@ -3109,7 +3109,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.18587661796707747
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -3139,7 +3139,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.34091066308972107
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -3153,13 +3153,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.21711219915973207
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.2138304528863496
 },
 "exact_text": {
 "count": 83,
@@ -3197,13 +3197,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.14174374624685185
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.2776898347355035
 },
 "video": {
 "count": 43,
@@ -3229,7 +3229,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.22277777000798116
 },
 "Planning": {
 "count": 78,
@@ -3247,7 +3247,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.31585879714366544
 },
 "Metrics": {
 "count": 20,
@@ -3281,25 +3281,25 @@
 "count": 303,
 "num_samples": 4755,
 "tasks": [],
-"average_score": 0.34602671066871027
 },
 "Text Recognition (OCR)": {
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.3764652079852679
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.38183869685317606
 },
 "Scene and Event Understanding": {
 "count": 154,
 "num_samples": 2467,
 "tasks": [],
-"average_score": 0.3776679463596073
 },
 "Mathematical and Logical Reasoning": {
 "count": 109,
@@ -3311,7 +3311,7 @@
 "count": 51,
 "num_samples": 855,
 "tasks": [],
-"average_score": 0.4190587833823822
 },
 "Ethical and Safety Reasoning": {
 "count": 15,
@@ -3343,7 +3343,7 @@
 "count": 93,
 "num_samples": 1517,
 "tasks": [],
-"average_score": 0.30581019415764066
 },
 "Text-Based Images and Documents": {
 "count": 82,
@@ -3373,7 +3373,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.37068890840142343
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -3387,13 +3387,13 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.3071379066920702
 },
 "structured_output": {
 "count": 110,
 "num_samples": 1714,
 "tasks": [],
-"average_score": 0.31782992537086313
 },
 "exact_text": {
 "count": 83,
@@ -3431,13 +3431,13 @@
 "count": 41,
 "num_samples": 623,
 "tasks": [],
-"average_score": 0.16370884074367903
 },
 "1-image": {
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.37086966536142313
 },
 "video": {
 "count": 43,
@@ -3455,7 +3455,7 @@
 "count": 51,
 "num_samples": 802,
 "tasks": [],
-"average_score": 0.310449170121381
 }
 },
 "app": {
@@ -3463,7 +3463,7 @@
 "count": 72,
 "num_samples": 1124,
 "tasks": [],
-"average_score": 0.4285286292013588
 },
 "Planning": {
 "count": 78,
@@ -3481,7 +3481,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.388749951743596
 },
 "Metrics": {
 "count": 20,
@@ -3499,7 +3499,7 @@
 "count": 97,
 "num_samples": 1605,
 "tasks": [],
-"average_score": 0.38094471423409354
 },
 "Mathematics": {
 "count": 33,
@@ -3521,13 +3521,13 @@
 "count": 137,
 "num_samples": 2239,
 "tasks": [],
-"average_score": 0.14280015951776653
 },
 "Language Understanding and Generation": {
 "count": 154,
 "num_samples": 2509,
 "tasks": [],
-"average_score": 0.1960311445935766
 },
 "Scene and Event Understanding": {
 "count": 154,
@@ -3607,7 +3607,7 @@
 "count": 143,
 "num_samples": 2248,
 "tasks": [],
-"average_score": 0.24806929522702081
 },
 "3D Models and Aerial Imagery": {
 "count": 11,
@@ -3621,7 +3621,7 @@
 "count": 98,
 "num_samples": 1514,
 "tasks": [],
-"average_score": 0.12349256529641485
 },
 "structured_output": {
 "count": 110,
@@ -3671,7 +3671,7 @@
 "count": 315,
 "num_samples": 5228,
 "tasks": [],
-"average_score": 0.19579907898674231
 },
 "video": {
 "count": 43,
@@ -3715,7 +3715,7 @@
 "count": 145,
 "num_samples": 2313,
 "tasks": [],
-"average_score": 0.19853488174071646
 },
 "Metrics": {
 "count": 20,
2223
  "count": 110,
2224
  "num_samples": 1714,
2225
  "tasks": [],
2226
+ "average_score": 0.18291595756285564
2227
  },
2228
  "exact_text": {
2229
  "count": 83,
 
2261
  "count": 41,
2262
  "num_samples": 623,
2263
  "tasks": [],
2264
+ "average_score": 0.13803800801858385
2265
  },
2266
  "1-image": {
2267
  "count": 315,
2268
  "num_samples": 5228,
2269
  "tasks": [],
2270
+ "average_score": 0.2548084764084038
2271
  },
2272
  "video": {
2273
  "count": 43,
 
2293
  "count": 72,
2294
  "num_samples": 1124,
2295
  "tasks": [],
2296
+ "average_score": 0.19283211154717242
2297
  },
2298
  "Planning": {
2299
  "count": 78,
 
2311
  "count": 145,
2312
  "num_samples": 2313,
2313
  "tasks": [],
2314
+ "average_score": 0.28505205882578405
2315
  },
2316
  "Metrics": {
2317
  "count": 20,
 
2345
  "count": 303,
2346
  "num_samples": 4755,
2347
  "tasks": [],
2348
+ "average_score": 0.38193012983650343
2349
  },
2350
  "Text Recognition (OCR)": {
2351
  "count": 137,
2352
  "num_samples": 2239,
2353
  "tasks": [],
2354
+ "average_score": 0.41315219763443384
2355
  },
2356
  "Language Understanding and Generation": {
2357
  "count": 154,
2358
  "num_samples": 2509,
2359
  "tasks": [],
2360
+ "average_score": 0.43665980552577693
2361
  },
2362
  "Scene and Event Understanding": {
2363
  "count": 154,
2364
  "num_samples": 2467,
2365
  "tasks": [],
2366
+ "average_score": 0.4265623936500962
2367
  },
2368
  "Mathematical and Logical Reasoning": {
2369
  "count": 109,
 
2375
  "count": 51,
2376
  "num_samples": 855,
2377
  "tasks": [],
2378
+ "average_score": 0.5257990949897898
2379
  },
2380
  "Ethical and Safety Reasoning": {
2381
  "count": 15,
 
2407
  "count": 93,
2408
  "num_samples": 1517,
2409
  "tasks": [],
2410
+ "average_score": 0.3634339625985008
2411
  },
2412
  "Text-Based Images and Documents": {
2413
  "count": 82,
 
2437
  "count": 143,
2438
  "num_samples": 2248,
2439
  "tasks": [],
2440
+ "average_score": 0.42875248733027654
2441
  },
2442
  "3D Models and Aerial Imagery": {
2443
  "count": 11,
 
2451
  "count": 98,
2452
  "num_samples": 1514,
2453
  "tasks": [],
2454
+ "average_score": 0.3630499545707523
2455
  },
2456
  "structured_output": {
2457
  "count": 110,
2458
  "num_samples": 1714,
2459
  "tasks": [],
2460
+ "average_score": 0.3476691827105281
2461
  },
2462
  "exact_text": {
2463
  "count": 83,
 
2495
  "count": 41,
2496
  "num_samples": 623,
2497
  "tasks": [],
2498
+ "average_score": 0.34771123515123364
2499
  },
2500
  "1-image": {
2501
  "count": 315,
2502
  "num_samples": 5228,
2503
  "tasks": [],
2504
+ "average_score": 0.4145693044465943
2505
  },
2506
  "video": {
2507
  "count": 43,
 
2519
  "count": 51,
2520
  "num_samples": 802,
2521
  "tasks": [],
2522
+ "average_score": 0.3153417935059416
2523
  }
2524
  },
2525
  "app": {
 
2527
  "count": 72,
2528
  "num_samples": 1124,
2529
  "tasks": [],
2530
+ "average_score": 0.4306947454508794
2531
  },
2532
  "Planning": {
2533
  "count": 78,
 
2545
  "count": 145,
2546
  "num_samples": 2313,
2547
  "tasks": [],
2548
+ "average_score": 0.42202934355552685
2549
  },
2550
  "Metrics": {
2551
  "count": 20,
 
2563
  "count": 97,
2564
  "num_samples": 1605,
2565
  "tasks": [],
2566
+ "average_score": 0.4625649385962016
2567
  },
2568
  "Mathematics": {
2569
  "count": 33,
 
2585
  "count": 137,
2586
  "num_samples": 2239,
2587
  "tasks": [],
2588
+ "average_score": 0.280559214034858
2589
  },
2590
  "Language Understanding and Generation": {
2591
  "count": 154,
2592
  "num_samples": 2511,
2593
  "tasks": [],
2594
+ "average_score": 0.32020728060179815
2595
  },
2596
  "Scene and Event Understanding": {
2597
  "count": 154,
 
2641
  "count": 93,
2642
  "num_samples": 1517,
2643
  "tasks": [],
2644
+ "average_score": 0.22800928556370195
2645
  },
2646
  "Text-Based Images and Documents": {
2647
  "count": 82,
 
2671
  "count": 143,
2672
  "num_samples": 2248,
2673
  "tasks": [],
2674
+ "average_score": 0.3388056726588417
2675
  },
2676
  "3D Models and Aerial Imagery": {
2677
  "count": 11,
 
2685
  "count": 98,
2686
  "num_samples": 1514,
2687
  "tasks": [],
2688
+ "average_score": 0.250804626773504
2689
  },
2690
  "structured_output": {
2691
  "count": 110,
2692
  "num_samples": 1714,
2693
  "tasks": [],
2694
+ "average_score": 0.2522493284864019
2695
  },
2696
  "exact_text": {
2697
  "count": 83,
 
2729
  "count": 41,
2730
  "num_samples": 623,
2731
  "tasks": [],
2732
+ "average_score": 0.19872104324302098
2733
  },
2734
  "1-image": {
2735
  "count": 315,
2736
  "num_samples": 5228,
2737
  "tasks": [],
2738
+ "average_score": 0.30088711082969344
2739
  },
2740
  "video": {
2741
  "count": 43,
 
2761
  "count": 72,
2762
  "num_samples": 1124,
2763
  "tasks": [],
2764
+ "average_score": 0.29129840423784176
2765
  },
2766
  "Planning": {
2767
  "count": 78,
 
2779
  "count": 145,
2780
  "num_samples": 2315,
2781
  "tasks": [],
2782
+ "average_score": 0.3214666523378005
2783
  },
2784
  "Metrics": {
2785
  "count": 20,
 
2813
  "count": 303,
2814
  "num_samples": 4755,
2815
  "tasks": [],
2816
+ "average_score": 0.2604967101191775
2817
  },
2818
  "Text Recognition (OCR)": {
2819
  "count": 137,
2820
  "num_samples": 2239,
2821
  "tasks": [],
2822
+ "average_score": 0.2500331562865158
2823
  },
2824
  "Language Understanding and Generation": {
2825
  "count": 154,
2826
  "num_samples": 2509,
2827
  "tasks": [],
2828
+ "average_score": 0.3003169369011028
2829
  },
2830
  "Scene and Event Understanding": {
2831
  "count": 154,
2832
  "num_samples": 2467,
2833
  "tasks": [],
2834
+ "average_score": 0.31808748114668184
2835
  },
2836
  "Mathematical and Logical Reasoning": {
2837
  "count": 109,
 
2843
  "count": 51,
2844
  "num_samples": 855,
2845
  "tasks": [],
2846
+ "average_score": 0.40732197204308807
2847
  },
2848
  "Ethical and Safety Reasoning": {
2849
  "count": 15,
 
2875
  "count": 93,
2876
  "num_samples": 1517,
2877
  "tasks": [],
2878
+ "average_score": 0.21195711598986072
2879
  },
2880
  "Text-Based Images and Documents": {
2881
  "count": 82,
 
2905
  "count": 143,
2906
  "num_samples": 2248,
2907
  "tasks": [],
2908
+ "average_score": 0.3176880312524649
2909
  },
2910
  "3D Models and Aerial Imagery": {
2911
  "count": 11,
 
2919
  "count": 98,
2920
  "num_samples": 1514,
2921
  "tasks": [],
2922
+ "average_score": 0.23506388020592064
2923
  },
2924
  "structured_output": {
2925
  "count": 110,
2926
  "num_samples": 1714,
2927
  "tasks": [],
2928
+ "average_score": 0.1781127776443048
2929
  },
2930
  "exact_text": {
2931
  "count": 83,
 
2963
  "count": 41,
2964
  "num_samples": 623,
2965
  "tasks": [],
2966
+ "average_score": 0.23596215721092323
2967
  },
2968
  "1-image": {
2969
  "count": 315,
2970
  "num_samples": 5228,
2971
  "tasks": [],
2972
+ "average_score": 0.26319603880798287
2973
  },
2974
  "video": {
2975
  "count": 43,
 
2987
  "count": 51,
2988
  "num_samples": 802,
2989
  "tasks": [],
2990
+ "average_score": 0.22288558250834017
2991
  }
2992
  },
2993
  "app": {
 
2995
  "count": 72,
2996
  "num_samples": 1124,
2997
  "tasks": [],
2998
+ "average_score": 0.2666989364424082
2999
  },
3000
  "Planning": {
3001
  "count": 78,
 
3013
  "count": 145,
3014
  "num_samples": 2313,
3015
  "tasks": [],
3016
+ "average_score": 0.29243044121840894
3017
  },
3018
  "Metrics": {
3019
  "count": 20,
 
3031
  "count": 97,
3032
  "num_samples": 1605,
3033
  "tasks": [],
3034
+ "average_score": 0.33187729423141027
3035
  },
3036
  "Mathematics": {
3037
  "count": 33,
 
3053
  "count": 137,
3054
  "num_samples": 2239,
3055
  "tasks": [],
3056
+ "average_score": 0.2483252111012436
3057
  },
3058
  "Language Understanding and Generation": {
3059
  "count": 154,
3060
  "num_samples": 2509,
3061
  "tasks": [],
3062
+ "average_score": 0.28732942108098564
3063
  },
3064
  "Scene and Event Understanding": {
3065
  "count": 154,
 
3109
  "count": 93,
3110
  "num_samples": 1517,
3111
  "tasks": [],
3112
+ "average_score": 0.1865974025588298
3113
  },
3114
  "Text-Based Images and Documents": {
3115
  "count": 82,
 
3139
  "count": 143,
3140
  "num_samples": 2248,
3141
  "tasks": [],
3142
+ "average_score": 0.3413768635559215
3143
  },
3144
  "3D Models and Aerial Imagery": {
3145
  "count": 11,
 
3153
  "count": 98,
3154
  "num_samples": 1514,
3155
  "tasks": [],
3156
+ "average_score": 0.2177924712685756
3157
  },
3158
  "structured_output": {
3159
  "count": 110,
3160
  "num_samples": 1714,
3161
  "tasks": [],
3162
+ "average_score": 0.21443984349574025
3163
  },
3164
  "exact_text": {
3165
  "count": 83,
 
3197
  "count": 41,
3198
  "num_samples": 623,
3199
  "tasks": [],
3200
+ "average_score": 0.14337869666229008
3201
  },
3202
  "1-image": {
3203
  "count": 315,
3204
  "num_samples": 5228,
3205
  "tasks": [],
3206
+ "average_score": 0.27790147494714373
3207
  },
3208
  "video": {
3209
  "count": 43,
 
3229
  "count": 72,
3230
  "num_samples": 1124,
3231
  "tasks": [],
3232
+ "average_score": 0.2237087834389946
3233
  },
3234
  "Planning": {
3235
  "count": 78,
 
3247
  "count": 145,
3248
  "num_samples": 2313,
3249
  "tasks": [],
3250
+ "average_score": 0.316318567258608
3251
  },
3252
  "Metrics": {
3253
  "count": 20,
 
3281
  "count": 303,
3282
  "num_samples": 4755,
3283
  "tasks": [],
3284
+ "average_score": 0.3460288961410444
3285
  },
3286
  "Text Recognition (OCR)": {
3287
  "count": 137,
3288
  "num_samples": 2239,
3289
  "tasks": [],
3290
+ "average_score": 0.3777640755922415
3291
  },
3292
  "Language Understanding and Generation": {
3293
  "count": 154,
3294
  "num_samples": 2509,
3295
  "tasks": [],
3296
+ "average_score": 0.38299418297106824
3297
  },
3298
  "Scene and Event Understanding": {
3299
  "count": 154,
3300
  "num_samples": 2467,
3301
  "tasks": [],
3302
+ "average_score": 0.3776722463473817
3303
  },
3304
  "Mathematical and Logical Reasoning": {
3305
  "count": 109,
 
3311
  "count": 51,
3312
  "num_samples": 855,
3313
  "tasks": [],
3314
+ "average_score": 0.419071767659191
3315
  },
3316
  "Ethical and Safety Reasoning": {
3317
  "count": 15,
 
3343
  "count": 93,
3344
  "num_samples": 1517,
3345
  "tasks": [],
3346
+ "average_score": 0.3070067338940785
3347
  },
3348
  "Text-Based Images and Documents": {
3349
  "count": 82,
 
3373
  "count": 143,
3374
  "num_samples": 2248,
3375
  "tasks": [],
3376
+ "average_score": 0.37115973962368864
3377
  },
3378
  "3D Models and Aerial Imagery": {
3379
  "count": 11,
 
3387
  "count": 98,
3388
  "num_samples": 1514,
3389
  "tasks": [],
3390
+ "average_score": 0.3078181788009137
3391
  },
3392
  "structured_output": {
3393
  "count": 110,
3394
  "num_samples": 1714,
3395
  "tasks": [],
3396
+ "average_score": 0.3188475653127356
3397
  },
3398
  "exact_text": {
3399
  "count": 83,
 
3431
  "count": 41,
3432
  "num_samples": 623,
3433
  "tasks": [],
3434
+ "average_score": 0.16642294307267227
3435
  },
3436
  "1-image": {
3437
  "count": 315,
3438
  "num_samples": 5228,
3439
  "tasks": [],
3440
+ "average_score": 0.37108130557306335
3441
  },
3442
  "video": {
3443
  "count": 43,
 
3455
  "count": 51,
3456
  "num_samples": 802,
3457
  "tasks": [],
3458
+ "average_score": 0.3104621543981899
3459
  }
3460
  },
3461
  "app": {
 
3463
  "count": 72,
3464
  "num_samples": 1124,
3465
  "tasks": [],
3466
+ "average_score": 0.4300741596942578
3467
  },
3468
  "Planning": {
3469
  "count": 78,
 
3481
  "count": 145,
3482
  "num_samples": 2313,
3483
  "tasks": [],
3484
+ "average_score": 0.3892097218585385
3485
  },
3486
  "Metrics": {
3487
  "count": 20,
 
3499
  "count": 97,
3500
  "num_samples": 1605,
3501
  "tasks": [],
3502
+ "average_score": 0.3809515410188075
3503
  },
3504
  "Mathematics": {
3505
  "count": 33,
 
3521
  "count": 137,
3522
  "num_samples": 2239,
3523
  "tasks": [],
3524
+ "average_score": 0.14328677752263275
3525
  },
3526
  "Language Understanding and Generation": {
3527
  "count": 154,
3528
  "num_samples": 2509,
3529
  "tasks": [],
3530
+ "average_score": 0.19646404502647707
3531
  },
3532
  "Scene and Event Understanding": {
3533
  "count": 154,
 
3607
  "count": 143,
3608
  "num_samples": 2248,
3609
  "tasks": [],
3610
+ "average_score": 0.2485354956932213
3611
  },
3612
  "3D Models and Aerial Imagery": {
3613
  "count": 11,
 
3621
  "count": 98,
3622
  "num_samples": 1514,
3623
  "tasks": [],
3624
+ "average_score": 0.12417283740525839
3625
  },
3626
  "structured_output": {
3627
  "count": 110,
 
3671
  "count": 315,
3672
  "num_samples": 5228,
3673
  "tasks": [],
3674
+ "average_score": 0.1960107191983825
3675
  },
3676
  "video": {
3677
  "count": 43,
 
3715
  "count": 145,
3716
  "num_samples": 2313,
3717
  "tasks": [],
3718
+ "average_score": 0.19899465185565898
3719
  },
3720
  "Metrics": {
3721
  "count": 20,
static/eval_results/Default/all_summary.json CHANGED
@@ -4,7 +4,6 @@
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
  "num_not_eval_samples": 0,
7
- "num_total_samples": 6961,
8
  "macro_mean_score": 0.5203440930873326,
9
  "micro_mean_score": 0.514302640282204
10
  },
@@ -12,14 +11,12 @@
12
  "num_eval_tasks": 440,
13
  "num_eval_samples": 6539,
14
  "num_not_eval_samples": 0,
15
- "num_total_samples": 6961,
16
  "macro_mean_score": 0.5265030595065238,
17
  "micro_mean_score": 0.5236338521693411
18
  },
19
  "open": {
20
  "num_eval_tasks": 65,
21
  "num_eval_samples": 1163,
22
- "num_total_samples": 2448,
23
  "macro_mean_score": 0.6478225794744895,
24
  "micro_mean_score": 0.665391229578676
25
  },
@@ -30,85 +27,75 @@
30
  "num_eval_tasks": 440,
31
  "num_eval_samples": 6539,
32
  "num_not_eval_samples": 0,
33
- "num_total_samples": 6961,
34
- "macro_mean_score": 0.46887846869580546,
35
- "micro_mean_score": 0.46403536258864253
36
  },
37
  "core_cot": {
38
  "num_eval_tasks": 440,
39
  "num_eval_samples": 6539,
40
  "num_not_eval_samples": 0,
41
- "num_total_samples": 6961,
42
- "macro_mean_score": 0.48154520292305814,
43
- "micro_mean_score": 0.47581906202211677
44
  },
45
  "open": {
46
  "num_eval_tasks": 65,
47
  "num_eval_samples": 1163,
48
- "num_total_samples": 2448,
49
  "macro_mean_score": 0.5858190649927173,
50
  "micro_mean_score": 0.6104901117798793
51
  },
52
- "overall_score": 0.49496659111024205
53
  },
54
  "Gemini_1.5_flash_002": {
55
  "core_noncot": {
56
  "num_eval_tasks": 440,
57
  "num_eval_samples": 6539,
58
  "num_not_eval_samples": 0,
59
- "num_total_samples": 6961,
60
- "macro_mean_score": 0.4183865592515826,
61
- "micro_mean_score": 0.41216971462683855
62
  },
63
  "core_cot": {
64
  "num_eval_tasks": 440,
65
  "num_eval_samples": 6539,
66
  "num_not_eval_samples": 0,
67
- "num_total_samples": 6961,
68
- "macro_mean_score": 0.4183865592515826,
69
- "micro_mean_score": 0.41216971462683855
70
  },
71
  "open": {
72
  "num_eval_tasks": 65,
73
  "num_eval_samples": 1163,
74
- "num_total_samples": 2168,
75
  "macro_mean_score": 0.5691365176285039,
76
  "micro_mean_score": 0.5987532244196045
77
  },
78
- "overall_score": 0.4377900192406913
79
  },
80
  "Claude_3.5": {
81
  "core_noncot": {
82
  "num_eval_tasks": 440,
83
  "num_eval_samples": 6539,
84
  "num_not_eval_samples": 0,
85
- "num_total_samples": 6961,
86
- "macro_mean_score": 0.4863241841253708,
87
- "micro_mean_score": 0.4798092874490549
88
  },
89
  "core_cot": {
90
  "num_eval_tasks": 440,
91
  "num_eval_samples": 6539,
92
  "num_not_eval_samples": 0,
93
- "num_total_samples": 6961,
94
- "macro_mean_score": 0.5029618079901714,
95
- "micro_mean_score": 0.4991559743144323
96
  },
97
  "open": {
98
  "num_eval_tasks": 65,
99
  "num_eval_samples": 1163,
100
- "num_total_samples": 2288,
101
  "macro_mean_score": 0.6373907158949892,
102
  "micro_mean_score": 0.6569647463456579
103
  },
104
- "overall_score": 0.5202645387105935
105
  },
106
  "Claude_3.5_new": {
107
  "core_noncot": {
108
  "num_eval_tasks": 440,
109
  "num_eval_samples": 6539,
110
  "num_not_eval_samples": 0,
111
- "num_total_samples": 6961,
112
  "macro_mean_score": 0.4919657684484185,
113
  "micro_mean_score": 0.4874520567007144
114
  },
@@ -116,14 +103,12 @@
116
  "num_eval_tasks": 440,
117
  "num_eval_samples": 6539,
118
  "num_not_eval_samples": 0,
119
- "num_total_samples": 6961,
120
  "macro_mean_score": 0.5259191914020757,
121
  "micro_mean_score": 0.5230785894131227
122
  },
123
  "open": {
124
  "num_eval_tasks": 65,
125
  "num_eval_samples": 1163,
126
- "num_total_samples": 1224,
127
  "macro_mean_score": 0.6563419761104125,
128
  "micro_mean_score": 0.6724419604471196
129
  },
@@ -134,267 +119,236 @@
134
  "num_eval_tasks": 440,
135
  "num_eval_samples": 6539,
136
  "num_not_eval_samples": 0,
137
- "num_total_samples": 6961,
138
- "macro_mean_score": 0.3974259652331149,
139
- "micro_mean_score": 0.392578163407945
140
  },
141
  "core_cot": {
142
  "num_eval_tasks": 440,
143
  "num_eval_samples": 6539,
144
  "num_not_eval_samples": 0,
145
- "num_total_samples": 6961,
146
- "macro_mean_score": 0.4070959243997505,
147
- "micro_mean_score": 0.40376078514357017
148
  },
149
  "open": {
150
  "num_eval_tasks": 65,
151
  "num_eval_samples": 1163,
152
- "num_total_samples": 1224,
153
  "macro_mean_score": 0.586537827213665,
154
  "micro_mean_score": 0.6133276010318144
155
  },
156
- "overall_score": 0.43019240694015537
157
  },
158
  "Qwen2_VL_72B": {
159
  "core_noncot": {
160
  "num_eval_tasks": 440,
161
  "num_eval_samples": 6539,
162
  "num_not_eval_samples": 0,
163
- "num_total_samples": 6961,
164
- "macro_mean_score": 0.4623988230573754,
165
- "micro_mean_score": 0.4568583770401895
166
  },
167
  "core_cot": {
168
  "num_eval_tasks": 440,
169
  "num_eval_samples": 6539,
170
  "num_not_eval_samples": 0,
171
- "num_total_samples": 6961,
172
- "macro_mean_score": 0.45284699372478177,
173
- "micro_mean_score": 0.4487693487093462
174
  },
175
  "open": {
176
  "num_eval_tasks": 65,
177
  "num_eval_samples": 1163,
178
- "num_total_samples": 2448,
179
  "macro_mean_score": 0.5639771804231668,
180
  "micro_mean_score": 0.5835339638865004
181
  },
182
- "overall_score": 0.4754732650945565
183
  },
184
  "Qwen2_VL_7B": {
185
  "core_noncot": {
186
  "num_eval_tasks": 440,
187
  "num_eval_samples": 6539,
188
  "num_not_eval_samples": 0,
189
- "num_total_samples": 6961,
190
- "macro_mean_score": 0.34725455697890745,
191
- "micro_mean_score": 0.34344091516995323
192
  },
193
  "core_cot": {
194
  "num_eval_tasks": 440,
195
  "num_eval_samples": 6539,
196
  "num_not_eval_samples": 0,
197
- "num_total_samples": 6961,
198
- "macro_mean_score": 0.3284357723853296,
199
- "micro_mean_score": 0.32443422147119677
200
  },
201
  "open": {
202
  "num_eval_tasks": 65,
203
  "num_eval_samples": 1170,
204
- "num_total_samples": 2452,
205
  "macro_mean_score": 0.43955105763038577,
206
  "micro_mean_score": 0.45508547008546996
207
  },
208
- "overall_score": 0.35913430458751355
209
  },
210
  "llava_onevision_72B": {
211
  "core_noncot": {
212
  "num_eval_tasks": 440,
213
  "num_eval_samples": 6539,
214
  "num_not_eval_samples": 0,
215
- "num_total_samples": 6961,
216
- "macro_mean_score": 0.31960132549012704,
217
- "micro_mean_score": 0.3173848563095166
218
  },
219
  "core_cot": {
220
  "num_eval_tasks": 440,
221
  "num_eval_samples": 6539,
222
  "num_not_eval_samples": 0,
223
- "num_total_samples": 6961,
224
- "macro_mean_score": 0.29725827011768174,
225
- "micro_mean_score": 0.2954433666362564
226
  },
227
  "open": {
228
  "num_eval_tasks": 65,
229
  "num_eval_samples": 1163,
230
- "num_total_samples": 1224,
231
  "macro_mean_score": 0.4599484231632498,
232
  "micro_mean_score": 0.4850386930352536
233
  },
234
- "overall_score": 0.33766580340844976
235
  },
236
  "llava_onevision_7B": {
237
  "core_noncot": {
238
  "num_eval_tasks": 440,
239
  "num_eval_samples": 6539,
240
  "num_not_eval_samples": 0,
241
- "num_total_samples": 6961,
242
- "macro_mean_score": 0.2239290419841492,
243
- "micro_mean_score": 0.22222171180488767
244
  },
245
  "core_cot": {
246
  "num_eval_tasks": 440,
247
  "num_eval_samples": 6539,
248
  "num_not_eval_samples": 0,
249
- "num_total_samples": 6961,
250
- "macro_mean_score": 0.21347545703998197,
251
- "micro_mean_score": 0.210586172002703
252
  },
253
  "open": {
254
  "num_eval_tasks": 65,
255
  "num_eval_samples": 1163,
256
- "num_total_samples": 2448,
257
  "macro_mean_score": 0.33979975321921935,
258
  "micro_mean_score": 0.36474634565778147
259
  },
260
- "overall_score": 0.23884309392529685
261
  },
262
  "InternVL2_76B": {
263
  "core_noncot": {
264
  "num_eval_tasks": 440,
265
  "num_eval_samples": 6539,
266
  "num_not_eval_samples": 0,
267
- "num_total_samples": 6961,
268
- "macro_mean_score": 0.34977582844066846,
269
- "micro_mean_score": 0.3452353155814884
270
  },
271
  "core_cot": {
272
  "num_eval_tasks": 440,
273
  "num_eval_samples": 6539,
274
  "num_not_eval_samples": 0,
275
- "num_total_samples": 6961,
276
- "macro_mean_score": 0.35539585884136143,
277
- "micro_mean_score": 0.35043335903915124
278
  },
279
  "open": {
280
  "num_eval_tasks": 65,
281
  "num_eval_samples": 1163,
282
- "num_total_samples": 1224,
283
  "macro_mean_score": 0.5192997443033639,
284
  "micro_mean_score": 0.5421324161650903
285
  },
286
- "overall_score": 0.37649239855429245
287
  },
288
  "InternVL2_8B": {
289
  "core_noncot": {
290
  "num_eval_tasks": 440,
291
  "num_eval_samples": 6539,
292
  "num_not_eval_samples": 0,
293
- "num_total_samples": 6961,
294
- "macro_mean_score": 0.25920867490737526,
295
- "micro_mean_score": 0.2543416126895087
296
  },
297
  "core_cot": {
298
  "num_eval_tasks": 440,
299
  "num_eval_samples": 6539,
300
  "num_not_eval_samples": 0,
301
- "num_total_samples": 6961,
302
- "macro_mean_score": 0.24055897165959364,
303
- "micro_mean_score": 0.23784634936127952
304
  },
305
  "open": {
306
  "num_eval_tasks": 65,
307
  "num_eval_samples": 1165,
308
- "num_total_samples": 2452,
309
  "macro_mean_score": 0.3978571701460552,
310
  "micro_mean_score": 0.4108583690987125
311
  },
312
- "overall_score": 0.2770545208291856
313
  },
314
  "MiniCPM_v2.6": {
315
  "core_noncot": {
316
  "num_eval_tasks": 440,
317
  "num_eval_samples": 6539,
318
  "num_not_eval_samples": 0,
319
- "num_total_samples": 6961,
320
- "macro_mean_score": 0.22838207666977445,
321
- "micro_mean_score": 0.22452805919103805
322
  },
323
  "core_cot": {
324
  "num_eval_tasks": 440,
325
  "num_eval_samples": 6539,
326
  "num_not_eval_samples": 0,
327
- "num_total_samples": 6961,
328
- "macro_mean_score": 0.22901463640480854,
329
- "micro_mean_score": 0.2250606411323753
330
  },
331
  "open": {
332
  "num_eval_tasks": 65,
333
  "num_eval_samples": 1163,
334
- "num_total_samples": 2448,
335
  "macro_mean_score": 0.41728623355613875,
336
  "micro_mean_score": 0.43452278589853827
337
  },
338
- "overall_score": 0.25324761425596987
339
  },
340
  "Phi-3.5-vision": {
341
  "core_noncot": {
342
  "num_eval_tasks": 440,
343
  "num_eval_samples": 6539,
344
  "num_not_eval_samples": 0,
345
- "num_total_samples": 6961,
346
- "macro_mean_score": 0.23240864879023493,
347
- "micro_mean_score": 0.22932978620408923
348
  },
349
  "core_cot": {
350
  "num_eval_tasks": 440,
351
  "num_eval_samples": 6539,
352
  "num_not_eval_samples": 0,
353
- "num_total_samples": 6961,
354
- "macro_mean_score": 0.2295097914016776,
355
- "micro_mean_score": 0.2266573336398296
356
  },
357
  "open": {
358
  "num_eval_tasks": 65,
359
  "num_eval_samples": 1163,
360
- "num_total_samples": 2428,
361
  "macro_mean_score": 0.3947914647737769,
362
  "micro_mean_score": 0.42459157351676696
363
  },
364
- "overall_score": 0.2533094072831661
365
  },
366
  "Pixtral_12B": {
367
  "core_noncot": {
368
  "num_eval_tasks": 440,
369
  "num_eval_samples": 6539,
370
  "num_not_eval_samples": 0,
371
- "num_total_samples": 6961,
372
- "macro_mean_score": 0.3186510310643637,
373
- "micro_mean_score": 0.3151734861550665
374
  },
375
  "core_cot": {
376
  "num_eval_tasks": 440,
377
  "num_eval_samples": 6539,
378
  "num_not_eval_samples": 0,
379
- "num_total_samples": 6961,
380
- "macro_mean_score": 0.3132232487306254,
381
- "micro_mean_score": 0.30971424472967524
382
  },
383
  "open": {
384
  "num_eval_tasks": 65,
385
  "num_eval_samples": 1163,
386
- "num_total_samples": 1224,
387
  "macro_mean_score": 0.4566234428542061,
388
  "micro_mean_score": 0.4870593293207223
389
  },
390
- "overall_score": 0.3364098563442444
391
  },
392
  "Llama_3_2_11B": {
393
  "core_noncot": {
394
  "num_eval_tasks": 440,
395
  "num_eval_samples": 6539,
396
  "num_not_eval_samples": 0,
397
- "num_total_samples": 6961,
398
  "macro_mean_score": 0.10044261716549671,
399
  "micro_mean_score": 0.09980638766828835
400
  },
@@ -402,25 +356,22 @@
402
  "num_eval_tasks": 440,
403
  "num_eval_samples": 6539,
404
  "num_not_eval_samples": 0,
405
- "num_total_samples": 6961,
406
- "macro_mean_score": 0.15984490401619783,
407
- "micro_mean_score": 0.15794038158731832
408
  },
409
  "open": {
410
  "num_eval_tasks": 65,
411
  "num_eval_samples": 1163,
412
- "num_total_samples": 1224,
413
  "macro_mean_score": 0.3173342406187366,
414
  "micro_mean_score": 0.3487962166809973
415
  },
416
- "overall_score": 0.1801158087274157
417
  },
418
  "Idefics3": {
419
  "core_noncot": {
420
  "num_eval_tasks": 440,
421
  "num_eval_samples": 6539,
422
  "num_not_eval_samples": 0,
423
- "num_total_samples": 6961,
424
  "macro_mean_score": 0.11118980301103833,
425
  "micro_mean_score": 0.11201785633274061
426
  },
@@ -428,14 +379,12 @@
428
  "num_eval_tasks": 440,
429
  "num_eval_samples": 6539,
430
  "num_not_eval_samples": 0,
431
- "num_total_samples": 6961,
432
  "macro_mean_score": 0.08956972487602757,
433
  "micro_mean_score": 0.08982225274252693
434
  },
435
  "open": {
436
  "num_eval_tasks": 65,
437
  "num_eval_samples": 1163,
438
- "num_total_samples": 2448,
439
  "macro_mean_score": 0.3210866162255635,
440
  "micro_mean_score": 0.35649183147033553
441
  },
@@ -446,7 +395,6 @@
446
  "num_eval_tasks": 440,
447
  "num_eval_samples": 6539,
448
  "num_not_eval_samples": 0,
449
- "num_total_samples": 6961,
450
  "macro_mean_score": 0.30485930718699694,
451
  "micro_mean_score": 0.3016713629035311
452
  },
@@ -454,14 +402,12 @@
454
  "num_eval_tasks": 440,
455
  "num_eval_samples": 6539,
456
  "num_not_eval_samples": 0,
457
- "num_total_samples": 6961,
458
  "macro_mean_score": 0.289073788209904,
459
  "micro_mean_score": 0.2859007507765791
460
  },
461
  "open": {
462
  "num_eval_tasks": 65,
463
  "num_eval_samples": 1163,
464
- "num_total_samples": 1224,
465
  "macro_mean_score": 0.5103725263180767,
466
  "micro_mean_score": 0.5349957007738607
467
  },
@@ -472,7 +418,6 @@
472
  "num_eval_tasks": 440,
473
  "num_eval_samples": 6539,
474
  "num_not_eval_samples": 0,
475
- "num_total_samples": 6961,
476
  "macro_mean_score": 0.2420528895703979,
477
  "micro_mean_score": 0.23838419989257642
478
  },
@@ -480,14 +425,12 @@
480
  "num_eval_tasks": 440,
481
  "num_eval_samples": 6539,
482
  "num_not_eval_samples": 0,
483
- "num_total_samples": 6961,
484
  "macro_mean_score": 0.21589726765847422,
485
  "micro_mean_score": 0.21406043849932396
486
  },
487
  "open": {
488
  "num_eval_tasks": 65,
489
  "num_eval_samples": 1163,
490
- "num_total_samples": 1224,
491
  "macro_mean_score": 0.3478114310231307,
492
  "micro_mean_score": 0.3947549441100602
493
  },
@@ -498,7 +441,6 @@
498
  "num_eval_tasks": 440,
499
  "num_eval_samples": 6539,
500
  "num_not_eval_samples": 0,
501
- "num_total_samples": 6961,
502
  "macro_mean_score": 0.09089701489596874,
503
  "micro_mean_score": 0.09036328295381871
504
  },
@@ -506,14 +448,12 @@
506
  "num_eval_tasks": 440,
507
  "num_eval_samples": 6539,
508
  "num_not_eval_samples": 0,
509
- "num_total_samples": 6961,
510
  "macro_mean_score": 0.13141974398938763,
511
  "micro_mean_score": 0.13063500716262516
512
  },
513
  "open": {
514
  "num_eval_tasks": 65,
515
  "num_eval_samples": 1163,
516
- "num_total_samples": 1224,
517
  "macro_mean_score": 0.23864417043743646,
518
  "micro_mean_score": 0.24901117798796224
519
  },
@@ -524,7 +464,6 @@
524
  "num_eval_tasks": 440,
525
  "num_eval_samples": 6539,
526
  "num_not_eval_samples": 0,
527
- "num_total_samples": 6961,
528
  "macro_mean_score": 0.16448220309703876,
529
  "micro_mean_score": 0.1610710186451323
530
  },
@@ -532,14 +471,12 @@
532
  "num_eval_tasks": 440,
533
  "num_eval_samples": 6539,
534
  "num_not_eval_samples": 0,
535
- "num_total_samples": 6961,
536
  "macro_mean_score": 0.20877163406364055,
537
  "micro_mean_score": 0.20561526268932287
538
  },
539
  "open": {
540
  "num_eval_tasks": 65,
541
  "num_eval_samples": 1163,
542
- "num_total_samples": 1224,
543
  "macro_mean_score": 0.3154302566225611,
544
  "micro_mean_score": 0.33856405846947557
545
  },
@@ -550,7 +487,6 @@
550
  "num_eval_tasks": 440,
551
  "num_eval_samples": 6539,
552
  "num_not_eval_samples": 0,
553
- "num_total_samples": 6961,
554
  "macro_mean_score": 0.16317824309838627,
555
  "micro_mean_score": 0.16198837245148487
556
  },
@@ -558,14 +494,12 @@
558
  "num_eval_tasks": 440,
559
  "num_eval_samples": 6539,
560
  "num_not_eval_samples": 0,
561
- "num_total_samples": 6961,
562
  "macro_mean_score": 0.159970161379836,
563
  "micro_mean_score": 0.15844711671722148
564
  },
565
  "open": {
566
  "num_eval_tasks": 65,
567
  "num_eval_samples": 1163,
568
- "num_total_samples": 1224,
569
  "macro_mean_score": 0.24567572098570653,
570
  "micro_mean_score": 0.2704213241616509
571
  },
 
4
  "num_eval_tasks": 440,
5
  "num_eval_samples": 6539,
6
  "num_not_eval_samples": 0,
 
7
  "macro_mean_score": 0.5203440930873326,
8
  "micro_mean_score": 0.514302640282204
9
  },
 
11
  "num_eval_tasks": 440,
12
  "num_eval_samples": 6539,
13
  "num_not_eval_samples": 0,
 
14
  "macro_mean_score": 0.5265030595065238,
15
  "micro_mean_score": 0.5236338521693411
16
  },
17
  "open": {
18
  "num_eval_tasks": 65,
19
  "num_eval_samples": 1163,
 
20
  "macro_mean_score": 0.6478225794744895,
21
  "micro_mean_score": 0.665391229578676
22
  },
 
27
  "num_eval_tasks": 440,
28
  "num_eval_samples": 6539,
29
  "num_not_eval_samples": 0,
30
+ "macro_mean_score": 0.4699992918320008,
31
+ "micro_mean_score": 0.4651116133689296
 
32
  },
33
  "core_cot": {
34
  "num_eval_tasks": 440,
35
  "num_eval_samples": 6539,
36
  "num_not_eval_samples": 0,
37
+ "macro_mean_score": 0.4822473962867704,
38
+ "micro_mean_score": 0.4764805563057179
 
39
  },
40
  "open": {
41
  "num_eval_tasks": 65,
42
  "num_eval_samples": 1163,
 
43
  "macro_mean_score": 0.5858190649927173,
44
  "micro_mean_score": 0.6104901117798793
45
  },
46
+ "overall_score": 0.4955784031499121
47
  },
48
  "Gemini_1.5_flash_002": {
49
  "core_noncot": {
50
  "num_eval_tasks": 440,
51
  "num_eval_samples": 6539,
52
  "num_not_eval_samples": 0,
53
+ "macro_mean_score": 0.41898948981774853,
54
+ "micro_mean_score": 0.4127376993779598
 
55
  },
56
  "core_cot": {
57
  "num_eval_tasks": 440,
58
  "num_eval_samples": 6539,
59
  "num_not_eval_samples": 0,
60
+ "macro_mean_score": 0.4189319021967416,
61
+ "micro_mean_score": 0.41567515414375245
 
62
  },
63
  "open": {
64
  "num_eval_tasks": 65,
65
  "num_eval_samples": 1163,
 
66
  "macro_mean_score": 0.5691365176285039,
67
  "micro_mean_score": 0.5987532244196045
68
  },
69
+ "overall_score": 0.43831534488249924
70
  },
71
  "Claude_3.5": {
72
  "core_noncot": {
73
  "num_eval_tasks": 440,
74
  "num_eval_samples": 6539,
75
  "num_not_eval_samples": 0,
76
+ "macro_mean_score": 0.48800427486796155,
77
+ "micro_mean_score": 0.4814327812005499
 
78
  },
79
  "core_cot": {
80
  "num_eval_tasks": 440,
81
  "num_eval_samples": 6539,
82
  "num_not_eval_samples": 0,
83
+ "macro_mean_score": 0.5040975742801586,
84
+ "micro_mean_score": 0.5002259116666758
 
85
  },
86
  "open": {
87
  "num_eval_tasks": 65,
88
  "num_eval_samples": 1163,
 
89
  "macro_mean_score": 0.6373907158949892,
90
  "micro_mean_score": 0.6569647463456579
91
  },
92
+ "overall_score": 0.5212541172602853
93
  },
94
  "Claude_3.5_new": {
95
  "core_noncot": {
96
  "num_eval_tasks": 440,
97
  "num_eval_samples": 6539,
98
  "num_not_eval_samples": 0,
 
99
  "macro_mean_score": 0.4919657684484185,
100
  "micro_mean_score": 0.4874520567007144
101
  },
 
103
  "num_eval_tasks": 440,
104
  "num_eval_samples": 6539,
105
  "num_not_eval_samples": 0,
 
106
  "macro_mean_score": 0.5259191914020757,
107
  "micro_mean_score": 0.5230785894131227
108
  },
109
  "open": {
110
  "num_eval_tasks": 65,
111
  "num_eval_samples": 1163,
 
112
  "macro_mean_score": 0.6563419761104125,
113
  "micro_mean_score": 0.6724419604471196
114
  },
 
119
  "num_eval_tasks": 440,
120
  "num_eval_samples": 6539,
121
  "num_not_eval_samples": 0,
122
+ "macro_mean_score": 0.39854757130003565,
123
+ "micro_mean_score": 0.3936551517403452
 
124
  },
125
  "core_cot": {
126
  "num_eval_tasks": 440,
127
  "num_eval_samples": 6539,
128
  "num_not_eval_samples": 0,
129
+ "macro_mean_score": 0.40767494558789397,
130
+ "micro_mean_score": 0.40431644154143376
 
131
  },
132
  "open": {
133
  "num_eval_tasks": 65,
134
  "num_eval_samples": 1163,
 
135
  "macro_mean_score": 0.586537827213665,
136
  "micro_mean_score": 0.6133276010318144
137
  },
138
+ "overall_score": 0.43069690064863675
139
  },
140
  "Qwen2_VL_72B": {
141
  "core_noncot": {
142
  "num_eval_tasks": 440,
143
  "num_eval_samples": 6539,
144
  "num_not_eval_samples": 0,
145
+ "macro_mean_score": 0.46406654108789214,
146
+ "micro_mean_score": 0.4584702152011697
 
147
  },
148
  "core_cot": {
149
  "num_eval_tasks": 440,
150
  "num_eval_samples": 6539,
151
  "num_not_eval_samples": 0,
152
+ "macro_mean_score": 0.4542376574527161,
153
+ "micro_mean_score": 0.4501201906164793
 
154
  },
155
  "open": {
156
  "num_eval_tasks": 65,
157
  "num_eval_samples": 1163,
 
158
  "macro_mean_score": 0.5639771804231668,
159
  "micro_mean_score": 0.5835339638865004
160
  },
161
+ "overall_score": 0.4769263263488681
162
  },
163
  "Qwen2_VL_7B": {
164
  "core_noncot": {
165
  "num_eval_tasks": 440,
166
  "num_eval_samples": 6539,
167
  "num_not_eval_samples": 0,
168
+ "macro_mean_score": 0.3480020832611913,
169
+ "micro_mean_score": 0.3441858958345098
 
170
  },
171
  "core_cot": {
172
  "num_eval_tasks": 440,
173
  "num_eval_samples": 6539,
174
  "num_not_eval_samples": 0,
175
+ "macro_mean_score": 0.3293449599230247,
176
+ "micro_mean_score": 0.325331493515679
 
177
  },
178
  "open": {
179
  "num_eval_tasks": 65,
180
  "num_eval_samples": 1170,
 
181
  "macro_mean_score": 0.43955105763038577,
182
  "micro_mean_score": 0.45508547008546996
183
  },
184
+ "overall_score": 0.3597856146156421
185
  },
186
  "llava_onevision_72B": {
187
  "core_noncot": {
188
  "num_eval_tasks": 440,
189
  "num_eval_samples": 6539,
190
  "num_not_eval_samples": 0,
191
+ "macro_mean_score": 0.3199332158220174,
192
+ "micro_mean_score": 0.31770770553892647
 
193
  },
194
  "core_cot": {
195
  "num_eval_tasks": 440,
196
  "num_eval_samples": 6539,
197
  "num_not_eval_samples": 0,
198
+ "macro_mean_score": 0.2974368415462532,
199
+ "micro_mean_score": 0.2956217833156672
 
200
  },
201
  "open": {
202
  "num_eval_tasks": 65,
203
  "num_eval_samples": 1163,
 
204
  "macro_mean_score": 0.4599484231632498,
205
  "micro_mean_score": 0.4850386930352536
206
  },
207
+ "overall_score": 0.33795497518277007
208
  },
209
  "llava_onevision_7B": {
210
  "core_noncot": {
211
  "num_eval_tasks": 440,
212
  "num_eval_samples": 6539,
213
  "num_not_eval_samples": 0,
214
+ "macro_mean_score": 0.22409531510496777,
215
+ "micro_mean_score": 0.22238854298563537
 
216
  },
217
  "core_cot": {
218
  "num_eval_tasks": 440,
219
  "num_eval_samples": 6539,
220
  "num_not_eval_samples": 0,
221
+ "macro_mean_score": 0.21362697219149712,
222
+ "micro_mean_score": 0.21073910058505504
 
223
  },
224
  "open": {
225
  "num_eval_tasks": 65,
226
  "num_eval_samples": 1163,
 
227
  "macro_mean_score": 0.33979975321921935,
228
  "micro_mean_score": 0.36474634565778147
229
  },
230
+ "overall_score": 0.23898796555531696
231
  },
232
  "InternVL2_76B": {
233
  "core_noncot": {
234
  "num_eval_tasks": 440,
235
  "num_eval_samples": 6539,
236
  "num_not_eval_samples": 0,
237
+ "macro_mean_score": 0.3502244283768534,
238
+ "micro_mean_score": 0.3456783051732046
 
239
  },
240
  "core_cot": {
241
  "num_eval_tasks": 440,
242
  "num_eval_samples": 6539,
243
  "num_not_eval_samples": 0,
244
+ "macro_mean_score": 0.3562710424410931,
245
+ "micro_mean_score": 0.35129859801162616
 
246
  },
247
  "open": {
248
  "num_eval_tasks": 65,
249
  "num_eval_samples": 1163,
 
250
  "macro_mean_score": 0.5192997443033639,
251
  "micro_mean_score": 0.5421324161650903
252
  },
253
+ "overall_score": 0.3772549347599992
254
  },
255
  "InternVL2_8B": {
256
  "core_noncot": {
257
  "num_eval_tasks": 440,
258
  "num_eval_samples": 6539,
259
  "num_not_eval_samples": 0,
260
+ "macro_mean_score": 0.25956581776451815,
261
+ "micro_mean_score": 0.2546984460483302
 
262
  },
263
  "core_cot": {
264
  "num_eval_tasks": 440,
265
  "num_eval_samples": 6539,
266
  "num_not_eval_samples": 0,
267
+ "macro_mean_score": 0.24090301358258295,
268
+ "micro_mean_score": 0.23819084111520938
 
269
  },
270
  "open": {
271
  "num_eval_tasks": 65,
272
  "num_eval_samples": 1165,
 
273
  "macro_mean_score": 0.3978571701460552,
274
  "micro_mean_score": 0.4108583690987125
275
  },
276
+ "overall_score": 0.2773656948037259
277
  },
278
  "MiniCPM_v2.6": {
279
  "core_noncot": {
280
  "num_eval_tasks": 440,
281
  "num_eval_samples": 6539,
282
  "num_not_eval_samples": 0,
283
+ "macro_mean_score": 0.2287645706203155,
284
+ "micro_mean_score": 0.2249087742955901
 
285
  },
286
  "core_cot": {
287
  "num_eval_tasks": 440,
288
  "num_eval_samples": 6539,
289
  "num_not_eval_samples": 0,
290
+ "macro_mean_score": 0.22955895202146906,
291
+ "micro_mean_score": 0.22560399396899078
 
292
  },
293
  "open": {
294
  "num_eval_tasks": 65,
295
  "num_eval_samples": 1163,
 
296
  "macro_mean_score": 0.41728623355613875,
297
  "micro_mean_score": 0.43452278589853827
298
  },
299
+ "overall_score": 0.2537218694467236
300
  },
301
  "Phi-3.5-vision": {
302
  "core_noncot": {
303
  "num_eval_tasks": 440,
304
  "num_eval_samples": 6539,
305
  "num_not_eval_samples": 0,
306
+ "macro_mean_score": 0.23271251159409778,
307
+ "micro_mean_score": 0.2296262323791101
 
308
  },
309
  "core_cot": {
310
  "num_eval_tasks": 440,
311
  "num_eval_samples": 6539,
312
  "num_not_eval_samples": 0,
313
+ "macro_mean_score": 0.22995297916629392,
314
+ "micro_mean_score": 0.22708502951025372
 
315
  },
316
  "open": {
317
  "num_eval_tasks": 65,
318
  "num_eval_samples": 1163,
 
319
  "macro_mean_score": 0.3947914647737769,
320
  "micro_mean_score": 0.42459157351676696
321
  },
322
+ "overall_score": 0.25357415903306635
323
  },
324
  "Pixtral_12B": {
325
  "core_noncot": {
326
  "num_eval_tasks": 440,
327
  "num_eval_samples": 6539,
328
  "num_not_eval_samples": 0,
329
+ "macro_mean_score": 0.31905695620134694,
330
+ "micro_mean_score": 0.31556607913724777
 
331
  },
332
  "core_cot": {
333
  "num_eval_tasks": 440,
334
  "num_eval_samples": 6539,
335
  "num_not_eval_samples": 0,
336
+ "macro_mean_score": 0.31362045151669854,
337
+ "micro_mean_score": 0.3100986209078182
 
338
  },
339
  "open": {
340
  "num_eval_tasks": 65,
341
  "num_eval_samples": 1163,
 
342
  "macro_mean_score": 0.4566234428542061,
343
  "micro_mean_score": 0.4870593293207223
344
  },
345
+ "overall_score": 0.33676353369131895
346
  },
347
  "Llama_3_2_11B": {
348
  "core_noncot": {
349
  "num_eval_tasks": 440,
350
  "num_eval_samples": 6539,
351
  "num_not_eval_samples": 0,
 
352
  "macro_mean_score": 0.10044261716549671,
353
  "micro_mean_score": 0.09980638766828835
354
  },
 
356
  "num_eval_tasks": 440,
357
  "num_eval_samples": 6539,
358
  "num_not_eval_samples": 0,
359
+ "macro_mean_score": 0.15999641916771298,
360
+ "micro_mean_score": 0.15809331016967038
 
361
  },
362
  "open": {
363
  "num_eval_tasks": 65,
364
  "num_eval_samples": 1163,
 
365
  "macro_mean_score": 0.3173342406187366,
366
  "micro_mean_score": 0.3487962166809973
367
  },
368
+ "overall_score": 0.1802478219287358
369
  },
370
  "Idefics3": {
371
  "core_noncot": {
372
  "num_eval_tasks": 440,
373
  "num_eval_samples": 6539,
374
  "num_not_eval_samples": 0,
 
375
  "macro_mean_score": 0.11118980301103833,
376
  "micro_mean_score": 0.11201785633274061
377
  },
 
379
  "num_eval_tasks": 440,
380
  "num_eval_samples": 6539,
381
  "num_not_eval_samples": 0,
 
382
  "macro_mean_score": 0.08956972487602757,
383
  "micro_mean_score": 0.08982225274252693
384
  },
385
  "open": {
386
  "num_eval_tasks": 65,
387
  "num_eval_samples": 1163,
 
388
  "macro_mean_score": 0.3210866162255635,
389
  "micro_mean_score": 0.35649183147033553
390
  },
 
395
  "num_eval_tasks": 440,
396
  "num_eval_samples": 6539,
397
  "num_not_eval_samples": 0,
 
398
  "macro_mean_score": 0.30485930718699694,
399
  "micro_mean_score": 0.3016713629035311
400
  },
 
402
  "num_eval_tasks": 440,
403
  "num_eval_samples": 6539,
404
  "num_not_eval_samples": 0,
 
405
  "macro_mean_score": 0.289073788209904,
406
  "micro_mean_score": 0.2859007507765791
407
  },
408
  "open": {
409
  "num_eval_tasks": 65,
410
  "num_eval_samples": 1163,
 
411
  "macro_mean_score": 0.5103725263180767,
412
  "micro_mean_score": 0.5349957007738607
413
  },
 
418
  "num_eval_tasks": 440,
419
  "num_eval_samples": 6539,
420
  "num_not_eval_samples": 0,
 
421
  "macro_mean_score": 0.2420528895703979,
422
  "micro_mean_score": 0.23838419989257642
423
  },
 
425
  "num_eval_tasks": 440,
426
  "num_eval_samples": 6539,
427
  "num_not_eval_samples": 0,
 
428
  "macro_mean_score": 0.21589726765847422,
429
  "micro_mean_score": 0.21406043849932396
430
  },
431
  "open": {
432
  "num_eval_tasks": 65,
433
  "num_eval_samples": 1163,
 
434
  "macro_mean_score": 0.3478114310231307,
435
  "micro_mean_score": 0.3947549441100602
436
  },
 
441
  "num_eval_tasks": 440,
442
  "num_eval_samples": 6539,
443
  "num_not_eval_samples": 0,
 
444
  "macro_mean_score": 0.09089701489596874,
445
  "micro_mean_score": 0.09036328295381871
446
  },
 
448
  "num_eval_tasks": 440,
449
  "num_eval_samples": 6539,
450
  "num_not_eval_samples": 0,
 
451
  "macro_mean_score": 0.13141974398938763,
452
  "micro_mean_score": 0.13063500716262516
453
  },
454
  "open": {
455
  "num_eval_tasks": 65,
456
  "num_eval_samples": 1163,
 
457
  "macro_mean_score": 0.23864417043743646,
458
  "micro_mean_score": 0.24901117798796224
459
  },
 
464
  "num_eval_tasks": 440,
465
  "num_eval_samples": 6539,
466
  "num_not_eval_samples": 0,
 
467
  "macro_mean_score": 0.16448220309703876,
468
  "micro_mean_score": 0.1610710186451323
469
  },
 
471
  "num_eval_tasks": 440,
472
  "num_eval_samples": 6539,
473
  "num_not_eval_samples": 0,
 
474
  "macro_mean_score": 0.20877163406364055,
475
  "micro_mean_score": 0.20561526268932287
476
  },
477
  "open": {
478
  "num_eval_tasks": 65,
479
  "num_eval_samples": 1163,
 
480
  "macro_mean_score": 0.3154302566225611,
481
  "micro_mean_score": 0.33856405846947557
482
  },
 
487
  "num_eval_tasks": 440,
488
  "num_eval_samples": 6539,
489
  "num_not_eval_samples": 0,
 
490
  "macro_mean_score": 0.16317824309838627,
491
  "micro_mean_score": 0.16198837245148487
492
  },
 
494
  "num_eval_tasks": 440,
495
  "num_eval_samples": 6539,
496
  "num_not_eval_samples": 0,
 
497
  "macro_mean_score": 0.159970161379836,
498
  "micro_mean_score": 0.15844711671722148
499
  },
500
  "open": {
501
  "num_eval_tasks": 65,
502
  "num_eval_samples": 1163,
 
503
  "macro_mean_score": 0.24567572098570653,
504
  "micro_mean_score": 0.2704213241616509
505
  },
static/eval_results/{Core_SI → SI}/all_model_keywords_stats.json RENAMED
The diff for this file is too large to render.
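Note: the overall_score values in the summary file added below are consistent with a task-count-weighted mean of the core and open macro_mean_score fields. A minimal sketch that reproduces the stored values (the overall_score helper is hypothetical, not part of this repo):

import json

def overall_score(entry):
    # Assumption: overall_score is the task-count-weighted mean of the core
    # and open macro scores. Most models in the single-image split have 273
    # core + 42 open tasks; entries with missing tasks (e.g. the Molmo
    # models) have fewer, so each entry's own num_eval_tasks is used.
    core, open_ = entry["core"], entry["open"]
    n_core, n_open = core["num_eval_tasks"], open_["num_eval_tasks"]
    return (core["macro_mean_score"] * n_core
            + open_["macro_mean_score"] * n_open) / (n_core + n_open)

with open("static/eval_results/SI/all_summary.json") as f:
    summary = json.load(f)

print(overall_score(summary["Aquila_VL_2B"]))  # 0.22197543279693666, matching the stored value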
 
static/eval_results/SI/all_summary.json ADDED
@@ -0,0 +1,471 @@
1
+ {
2
+ "Aquila_VL_2B": {
3
+ "core": {
4
+ "num_eval_tasks": 273,
5
+ "num_eval_samples": 4116,
6
+ "num_not_eval_samples": 0,
7
+ "macro_mean_score": 0.20770364903712493,
8
+ "micro_mean_score": 0.20333142638522636,
9
+ "missing_tasks": []
10
+ },
11
+ "open": {
12
+ "num_eval_tasks": 42,
13
+ "num_eval_samples": 813,
14
+ "num_not_eval_samples": 0,
15
+ "macro_mean_score": 0.31474202723571276,
16
+ "micro_mean_score": 0.3326568265682657,
17
+ "missing_tasks": []
18
+ },
19
+ "overall_score": 0.22197543279693666
20
+ },
21
+ "Aria": {
22
+ "core": {
23
+ "num_eval_tasks": 273,
24
+ "num_eval_samples": 4116,
25
+ "num_not_eval_samples": 0,
26
+ "macro_mean_score": 0.3178882776147889,
27
+ "micro_mean_score": 0.3101511832828904,
28
+ "missing_tasks": []
29
+ },
30
+ "open": {
31
+ "num_eval_tasks": 42,
32
+ "num_eval_samples": 813,
33
+ "num_not_eval_samples": 0,
34
+ "macro_mean_score": 0.5137437248005172,
35
+ "micro_mean_score": 0.5472939729397295,
36
+ "missing_tasks": []
37
+ },
38
+ "overall_score": 0.34400233723955265
39
+ },
40
+ "Claude_3.5": {
41
+ "core": {
42
+ "num_eval_tasks": 273,
43
+ "num_eval_samples": 4116,
44
+ "num_not_eval_samples": 0,
45
+ "macro_mean_score": 0.520276385877485,
46
+ "micro_mean_score": 0.5148202137998056
47
+ },
48
+ "open": {
49
+ "num_eval_tasks": 42,
50
+ "num_eval_samples": 813,
51
+ "num_not_eval_samples": 0,
52
+ "macro_mean_score": 0.6479684260295507,
53
+ "micro_mean_score": 0.6801968019680197
54
+ },
55
+ "overall_score": 0.5373019912310938
56
+ },
57
+ "Claude_3.5_new": {
58
+ "core": {
59
+ "num_eval_tasks": 273,
60
+ "num_eval_samples": 4116,
61
+ "num_not_eval_samples": 0,
62
+ "macro_mean_score": 0.5462752278980763,
63
+ "micro_mean_score": 0.5417881438289601
64
+ },
65
+ "open": {
66
+ "num_eval_tasks": 42,
67
+ "num_eval_samples": 813,
68
+ "num_not_eval_samples": 0,
69
+ "macro_mean_score": 0.6764020657053476,
70
+ "micro_mean_score": 0.6924969249692496
71
+ },
72
+ "overall_score": 0.5636254729390457
73
+ },
74
+ "GPT_4o": {
75
+ "core": {
76
+ "num_eval_tasks": 273,
77
+ "num_eval_samples": 4116,
78
+ "num_not_eval_samples": 0,
79
+ "macro_mean_score": 0.5529953662872719,
80
+ "micro_mean_score": 0.5483479105928085
81
+ },
82
+ "open": {
83
+ "num_eval_tasks": 42,
84
+ "num_eval_samples": 813,
85
+ "num_not_eval_samples": 0,
86
+ "macro_mean_score": 0.6600228904804206,
87
+ "micro_mean_score": 0.6801968019680197
88
+ },
89
+ "overall_score": 0.5672657028463584
90
+ },
91
+ "GPT_4o_mini": {
92
+ "core": {
93
+ "num_eval_tasks": 273,
94
+ "num_eval_samples": 4116,
95
+ "num_not_eval_samples": 0,
96
+ "macro_mean_score": 0.44285970964797233,
97
+ "micro_mean_score": 0.43756073858114675
98
+ },
99
+ "open": {
100
+ "num_eval_tasks": 42,
101
+ "num_eval_samples": 813,
102
+ "num_not_eval_samples": 0,
103
+ "macro_mean_score": 0.595574663769726,
104
+ "micro_mean_score": 0.6334563345633456
105
+ },
106
+ "overall_score": 0.46322170353087283
107
+ },
108
+ "Gemini_1.5_flash_002": {
109
+ "core": {
110
+ "num_eval_tasks": 273,
111
+ "num_eval_samples": 4116,
112
+ "num_not_eval_samples": 0,
113
+ "macro_mean_score": 0.42188460865574384,
114
+ "micro_mean_score": 0.413508260447036
115
+ },
116
+ "open": {
117
+ "num_eval_tasks": 42,
118
+ "num_eval_samples": 813,
119
+ "num_not_eval_samples": 0,
120
+ "macro_mean_score": 0.5787083135236054,
121
+ "micro_mean_score": 0.6186961869618696
122
+ },
123
+ "overall_score": 0.44279443597145873
124
+ },
125
+ "Gemini_1.5_pro_002": {
126
+ "core": {
127
+ "num_eval_tasks": 273,
128
+ "num_eval_samples": 4116,
129
+ "num_not_eval_samples": 0,
130
+ "macro_mean_score": 0.4914311038229404,
131
+ "micro_mean_score": 0.48323615160349853
132
+ },
133
+ "open": {
134
+ "num_eval_tasks": 42,
135
+ "num_eval_samples": 813,
136
+ "num_not_eval_samples": 0,
137
+ "macro_mean_score": 0.5814975405131552,
138
+ "micro_mean_score": 0.6174661746617466
139
+ },
140
+ "overall_score": 0.5034399620483024
141
+ },
142
+ "Idefics3": {
143
+ "core": {
144
+ "num_eval_tasks": 273,
145
+ "num_eval_samples": 4116,
146
+ "num_not_eval_samples": 0,
147
+ "macro_mean_score": 0.08941182847569326,
148
+ "micro_mean_score": 0.08779475233900695,
149
+ "missing_tasks": []
150
+ },
151
+ "open": {
152
+ "num_eval_tasks": 42,
153
+ "num_eval_samples": 813,
154
+ "num_not_eval_samples": 0,
155
+ "macro_mean_score": 0.3231434267517844,
156
+ "micro_mean_score": 0.3618081180811809,
157
+ "missing_tasks": []
158
+ },
159
+ "overall_score": 0.12057604157917208
160
+ },
161
+ "InternVL2_2B": {
162
+ "core": {
163
+ "num_eval_tasks": 273,
164
+ "num_eval_samples": 4116,
165
+ "num_not_eval_samples": 0,
166
+ "macro_mean_score": 0.12069001041308772,
167
+ "micro_mean_score": 0.11842605219090299,
168
+ "missing_tasks": []
169
+ },
170
+ "open": {
171
+ "num_eval_tasks": 42,
172
+ "num_eval_samples": 813,
173
+ "num_not_eval_samples": 0,
174
+ "macro_mean_score": 0.28522459992910454,
175
+ "micro_mean_score": 0.28886838868388687,
176
+ "missing_tasks": []
177
+ },
178
+ "overall_score": 0.14262795568189
179
+ },
180
+ "InternVL2_76B": {
181
+ "core": {
182
+ "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.3998616568018755,
+            "micro_mean_score": 0.39149064302628933,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.554748737158244,
+            "micro_mean_score": 0.5800738007380073,
+            "missing_tasks": []
+        },
+        "overall_score": 0.42051326751605805
+    },
+    "InternVL2_8B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.27650612401825575,
+            "micro_mean_score": 0.27119471729837735,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.39388373890935635,
+            "micro_mean_score": 0.4045510455104551,
+            "missing_tasks": []
+        },
+        "overall_score": 0.29215647267040246
+    },
+    "Llama_3_2_11B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.20789144960796493,
+            "micro_mean_score": 0.20163641703273802,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.3861125858565788,
+            "micro_mean_score": 0.4130381303813038,
+            "missing_tasks": []
+        },
+        "overall_score": 0.2316542677744468
+    },
+    "MiniCPM_v2.6": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.23230765810722817,
+            "micro_mean_score": 0.22684118052665975,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4360655066213874,
+            "micro_mean_score": 0.4588560885608856,
+            "missing_tasks": []
+        },
+        "overall_score": 0.2594753712424494
+    },
+    "Molmo_72B": {
+        "core": {
+            "num_eval_tasks": 270,
+            "num_eval_samples": 4073,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.36480000609384927,
+            "micro_mean_score": 0.36205779758110807,
+            "missing_tasks": [
+                "MMSoc_Misinformation_PolitiFact",
+                "table_understanding",
+                "planning_screenshot_termes"
+            ]
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4465682063915481,
+            "micro_mean_score": 0.4850553505535054,
+            "missing_tasks": []
+        },
+        "overall_score": 0.3758072638262318
+    },
+    "Molmo_7B_D": {
+        "core": {
+            "num_eval_tasks": 272,
+            "num_eval_samples": 4102,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.2098088446992518,
+            "micro_mean_score": 0.20550929661464645,
+            "missing_tasks": [
+                "MMSoc_Misinformation_PolitiFact"
+            ]
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.35697926179118733,
+            "micro_mean_score": 0.38936039360393604,
+            "missing_tasks": []
+        },
+        "overall_score": 0.22949405972428777
+    },
+    "NVLM": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.32989872890926025,
+            "micro_mean_score": 0.32315683713111915,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4469349818134809,
+            "micro_mean_score": 0.4881303813038132,
+            "missing_tasks": []
+        },
+        "overall_score": 0.34550356262982296
+    },
+    "POINTS_7B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.25511317681632334,
+            "micro_mean_score": 0.24927711632415062,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.30315625179016,
+            "micro_mean_score": 0.3313653136531366,
+            "missing_tasks": []
+        },
+        "overall_score": 0.26151892014616823
+    },
+    "Phi-3.5-vision": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.2561274958722834,
+            "micro_mean_score": 0.2504214576875906,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4272267419054576,
+            "micro_mean_score": 0.445879458794588,
+            "missing_tasks": []
+        },
+        "overall_score": 0.2789407286767066
+    },
+    "Pixtral_12B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.3436942439614412,
+            "micro_mean_score": 0.3373564384613738,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4417271955536318,
+            "micro_mean_score": 0.4845633456334564,
+            "missing_tasks": []
+        },
+        "overall_score": 0.3567653041737333
+    },
+    "Qwen2_VL_2B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.22787906973244856,
+            "micro_mean_score": 0.2234748515064842,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.3509364634962041,
+            "micro_mean_score": 0.3768757687576875,
+            "missing_tasks": []
+        },
+        "overall_score": 0.24428672223428263
+    },
+    "Qwen2_VL_72B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4730536307784527,
+            "micro_mean_score": 0.4659830915476831,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.5510079982505317,
+            "micro_mean_score": 0.5826568265682657,
+            "missing_tasks": []
+        },
+        "overall_score": 0.48344754644139654
+    },
+    "Qwen2_VL_7B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.3538656561495699,
+            "micro_mean_score": 0.34581250459157137,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4517429592549692,
+            "micro_mean_score": 0.4730012300123002,
+            "missing_tasks": []
+        },
+        "overall_score": 0.3669159632302898
+    },
+    "llava_onevision_72B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.312618242621264,
+            "micro_mean_score": 0.3098623876487132,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.4425822460912829,
+            "micro_mean_score": 0.47539975399754,
+            "missing_tasks": []
+        },
+        "overall_score": 0.32994677641726655
+    },
+    "llava_onevision_7B": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4116,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.23683339637631812,
+            "micro_mean_score": 0.23283041278687175,
+            "missing_tasks": []
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 813,
+            "num_not_eval_samples": 0,
+            "macro_mean_score": 0.3871602360316429,
+            "micro_mean_score": 0.4113161131611316,
+            "missing_tasks": []
+        },
+        "overall_score": 0.25687697499702805
+    }
+}
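
For reference, the "overall_score" stored for each model above is the task-count-weighted average of the core and open-ended "macro_mean_score" values (273 core and 42 open-ended tasks for most models here; the Molmo entries have a few missing core tasks). A minimal Python sketch that reproduces the stored numbers from this summary file (the path is the one the new SingleImageDataLoader below reads from):

import json

def overall_score(summary: dict) -> float:
    # Weight each split's macro mean score by its number of evaluated tasks.
    n_core = summary["core"]["num_eval_tasks"]
    n_open = summary["open"]["num_eval_tasks"]
    weighted = (summary["core"]["macro_mean_score"] * n_core
                + summary["open"]["macro_mean_score"] * n_open)
    return weighted / (n_core + n_open)

with open("./static/eval_results/SI/all_summary.json", "r") as f:
    all_summary = json.load(f)

# Reproduces the stored value for Qwen2_VL_72B: 0.48344754644139654
print(overall_score(all_summary["Qwen2_VL_72B"]))
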
utils.py CHANGED
@@ -11,7 +11,7 @@ MODEL_NAME_MAP = {
     "InternVL2_76B": "InternVL2-Llama3-76B",
     "Qwen2_VL_72B": "Qwen2-VL-72B",
     "llava_onevision_72B": "Llava-OneVision-72B",
-    "NVLM": "NVLM-72B",
+    "NVLM": "NVLM-D-72B",
     "GPT_4o_mini": "GPT-4o mini",
     "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
     "Pixtral_12B": "Pixtral 12B",
@@ -83,6 +83,34 @@ KEYWORD_NAME_MAP = {
     "video": "Video",
 }
 
+MODEL_URLS = {
+    "Claude_3.5_new": "https://www.anthropic.com/news/3-5-models-and-computer-use",
+    "GPT_4o": "https://platform.openai.com/docs/models/gpt-4o",
+    "Claude_3.5": "https://www.anthropic.com/news/claude-3-5-sonnet",
+    "Gemini_1.5_pro_002": "https://ai.google.dev/gemini-api/docs/models/gemini",
+    "Gemini_1.5_flash_002": "https://ai.google.dev/gemini-api/docs/models/gemini",
+    "GPT_4o_mini": "https://platform.openai.com/docs/models#gpt-4o-mini",
+    "Qwen2_VL_72B": "https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct",
+    "InternVL2_76B": "https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B",
+    "llava_onevision_72B": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-72b-ov-chat",
+    "NVLM": "https://huggingface.co/nvidia/NVLM-D-72B",
+    "Molmo_72B": "https://huggingface.co/allenai/Molmo-72B-0924",
+    "Qwen2_VL_7B": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
+    "Pixtral_12B": "https://huggingface.co/mistralai/Pixtral-12B-2409",
+    "Aria": "https://huggingface.co/rhymes-ai/Aria",
+    "InternVL2_8B": "https://huggingface.co/OpenGVLab/InternVL2-8B",
+    "Phi-3.5-vision": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
+    "MiniCPM_v2.6": "https://huggingface.co/openbmb/MiniCPM-V-2_6",
+    "llava_onevision_7B": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov",
+    "Llama_3_2_11B": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision",
+    "Idefics3": "https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3",
+    "Molmo_7B_D": "https://huggingface.co/allenai/Molmo-7B-D-0924",
+    "Aquila_VL_2B": "https://huggingface.co/BAAI/Aquila-VL-2B-llava-qwen",
+    "POINTS_7B": "https://huggingface.co/WePOINTS/POINTS-Qwen-2-5-7B-Chat",
+    "Qwen2_VL_2B": "https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct",
+    "InternVL2_2B": "https://huggingface.co/OpenGVLab/InternVL2-2B"
+}
+
 class BaseDataLoader:
     # Define the base MODEL_GROUPS structure
     BASE_MODEL_GROUPS = {
@@ -183,10 +211,10 @@ class DefaultDataLoader(BaseDataLoader):
             core_noncot_score = summary["core_noncot"]["macro_mean_score"]
             core_cot_score = summary["core_cot"]["macro_mean_score"]
             row = {
-                "Models": get_display_model_name(model),
+                "Models": get_display_model_name(model, as_link=True),
                 "Overall": round(summary["overall_score"] * 100, 2),
-                "Core(w/o CoT)": round(core_noncot_score * 100, 2),
-                "Core(w/ CoT)": round(core_cot_score * 100, 2),
+                "Core w/o CoT": round(core_noncot_score * 100, 2),
+                "Core w/ CoT": round(core_cot_score * 100, 2),
                 "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
             }
             for display_name in self.SUPER_GROUPS[selected_super_group]:
@@ -203,21 +231,54 @@ class DefaultDataLoader(BaseDataLoader):
 
     def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
         df = self.get_df(selected_super_group, selected_model_group)
-        headers = ["Models", "Overall", "Core(w/o CoT)", "Core(w/ CoT)", "Open-ended"] + self.SUPER_GROUPS[selected_super_group]
-        data = df[headers].values.tolist()
+
+        # Get total task counts from the first model's data
+        sample_model = next(iter(self.MODEL_DATA))
+        total_core_tasks = self.SUMMARY_DATA[sample_model]["core_noncot"]["num_eval_tasks"]
+        total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"]
+        total_tasks = total_core_tasks + total_open_tasks
+
+        # Define headers with task counts
+        column_headers = {
+            "Models": "Models",
+            "Overall": f"Overall ({total_tasks})",
+            "Core w/o CoT": f"Core(w/o CoT) ({total_core_tasks})",
+            "Core w/ CoT": f"Core(w/ CoT) ({total_core_tasks})",
+            "Open-ended": f"Open-ended ({total_open_tasks})"
+        }
+
+        # Rename the columns in DataFrame to match headers
+        df = df.rename(columns=column_headers)
+
+        headers = [
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core w/o CoT"],
+            column_headers["Core w/ CoT"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]
+
+        data = df[[
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core w/o CoT"],
+            column_headers["Core w/ CoT"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]].values.tolist()
+
         return headers, data
 
 
-class CoreSingleDataLoader(BaseDataLoader):
+class SingleImageDataLoader(BaseDataLoader):
     def __init__(self):
         super().__init__()
 
     def _load_model_data(self) -> Dict[str, Any]:
-        with open("./static/eval_results/Core_SI/all_model_keywords_stats.json", "r") as f:
+        with open("./static/eval_results/SI/all_model_keywords_stats.json", "r") as f:
             return json.load(f)
 
     def _load_summary_data(self) -> Dict[str, Any]:
-        with open("./static/eval_results/Core_SI/all_summary.json", "r") as f:
+        with open("./static/eval_results/SI/all_summary.json", "r") as f:
             return json.load(f)
 
     def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
@@ -226,10 +287,11 @@ class CoreSingleDataLoader(BaseDataLoader):
         for model in self.MODEL_GROUPS[selected_model_group]:
             model_data = self.MODEL_DATA[model]
             summary = self.SUMMARY_DATA[model]
-            core_si_score = summary["macro_mean_score"]
             row = {
-                "Models": get_display_model_name(model),
-                "Core SI": round(core_si_score * 100, 2),
+                "Models": get_display_model_name(model, as_link=True),
+                "Overall": round(summary["overall_score"] * 100, 2),
+                "Core": round(summary["core"]["macro_mean_score"] * 100, 2),
+                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
             }
             for display_name in self.SUPER_GROUPS[selected_super_group]:
                 original_keyword = self.keyword_display_map[display_name]
@@ -240,13 +302,43 @@ class CoreSingleDataLoader(BaseDataLoader):
             data.append(row)
 
         df = pd.DataFrame(data)
-        df = df.sort_values(by="Core SI", ascending=False)
+        df = df.sort_values(by="Overall", ascending=False)
         return df
 
     def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
         df = self.get_df(selected_super_group, selected_model_group)
-        headers = ["Models", "Core SI"] + self.SUPER_GROUPS[selected_super_group]
-        data = df[headers].values.tolist()
+
+        # Get total task counts from the first model's data
+        sample_model = next(iter(self.MODEL_DATA))
+        total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"]
+        total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"]
+        total_tasks = total_core_tasks + total_open_tasks
+
+        # Define headers with task counts
+        column_headers = {
+            "Models": "Models",
+            "Overall": f"Overall ({total_tasks})",
+            "Core": f"Core ({total_core_tasks})",
+            "Open-ended": f"Open-ended ({total_open_tasks})"
+        }
+
+        # Rename the columns in DataFrame to match headers
+        df = df.rename(columns=column_headers)
+
+        headers = [
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]
+
+        data = df[[
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]].values.tolist()
+
        return headers, data
 
 
@@ -257,5 +349,8 @@ def get_original_dimension(mapped_dimension):
 def get_original_keyword(mapped_keyword):
     return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)
 
-def get_display_model_name(model_name):
-    return MODEL_NAME_MAP.get(model_name, model_name)
+def get_display_model_name(model_name: str, as_link: bool = True) -> str:
+    display_name = MODEL_NAME_MAP.get(model_name, model_name)
+    if as_link and model_name in MODEL_URLS:
+        return f'<a href="{MODEL_URLS[model_name]}" target="_blank" style="text-decoration: none; color: #2196F3;">{display_name}</a>'
+    return display_name
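
For reference, a short sketch of what the header-renaming logic added to both get_leaderboard_data methods produces for the single-image table, using the task counts recorded in the summary JSON above (273 core, 42 open-ended):

# Task counts as read from the first model's summary entry.
total_core_tasks, total_open_tasks = 273, 42
total_tasks = total_core_tasks + total_open_tasks

column_headers = {
    "Models": "Models",
    "Overall": f"Overall ({total_tasks})",
    "Core": f"Core ({total_core_tasks})",
    "Open-ended": f"Open-ended ({total_open_tasks})"
}
print(list(column_headers.values()))
# ['Models', 'Overall (315)', 'Core (273)', 'Open-ended (42)']
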
 
 
 
 
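
A quick usage sketch of the updated get_display_model_name helper; the NVLM display name and URL come from the MODEL_NAME_MAP and MODEL_URLS entries above:

from utils import get_display_model_name

# With as_link=True (the default) and a MODEL_URLS entry present,
# the leaderboard cell is rendered as an HTML anchor.
print(get_display_model_name("NVLM"))
# <a href="https://huggingface.co/nvidia/NVLM-D-72B" target="_blank" style="text-decoration: none; color: #2196F3;">NVLM-D-72B</a>

# Plain-text fallback, also used when a model has no MODEL_URLS entry.
print(get_display_model_name("NVLM", as_link=False))  # NVLM-D-72B
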