Update single-image results, add model link url
Files changed:
- app.py +17 -16
- static/css/{core_single.css → single_image.css} +12 -12
- static/eval_results/Core_SI/all_summary.json +0 -227
- static/eval_results/Default/all_model_keywords_stats.json +160 -160
- static/eval_results/Default/all_summary.json +68 -134
- static/eval_results/{Core_SI → SI}/all_model_keywords_stats.json +0 -0
- static/eval_results/SI/all_summary.json +471 -0
- utils.py +112 -17
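The diff for utils.py (+112 -17) is not included in this capture, but app.py below imports a new SingleImageDataLoader from it. A minimal sketch of the interface app.py depends on (SUPER_GROUPS, MODEL_GROUPS, get_leaderboard_data); every internal detail here is a hypothetical stand-in, not the Space's actual code:

import json
import os

class BaseDataLoader:
    # Hypothetical shared base: the two loaders plausibly differ only in
    # which eval_results directory they read.
    RESULTS_DIR = "static/eval_results/Default"

    def __init__(self):
        with open(os.path.join(self.RESULTS_DIR, "all_summary.json")) as f:
            self.summary = json.load(f)
        # Radio choices shown in the UI; the real groupings live in utils.py.
        self.SUPER_GROUPS = {"Skills": [], "Input Format": [], "Application": []}
        self.MODEL_GROUPS = {"All": sorted(self.summary)}

    def get_leaderboard_data(self, super_group, model_group):
        # Returns (headers, rows); super_group would select keyword columns
        # in the real loader and is ignored in this sketch.
        headers = ["Models", "Macro Mean (%)"]
        rows = [[m, round(100 * self.summary[m]["macro_mean_score"], 2)]
                for m in self.MODEL_GROUPS[model_group]]
        return headers, rows

class DefaultDataLoader(BaseDataLoader):
    RESULTS_DIR = "static/eval_results/Default"

class SingleImageDataLoader(BaseDataLoader):
    RESULTS_DIR = "static/eval_results/SI"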
app.py
CHANGED

@@ -1,5 +1,5 @@
 import gradio as gr
-from utils import DefaultDataLoader, …
+from utils import DefaultDataLoader, SingleImageDataLoader
 import os
 from constants import *
 
@@ -9,19 +9,19 @@ current_dir = os.path.dirname(os.path.abspath(__file__))
 # Construct paths to CSS files
 base_css_file = os.path.join(current_dir, "static", "css", "style.css")
 default_css_file = os.path.join(current_dir, "static", "css", "default.css")
-…
+si_css_file = os.path.join(current_dir, "static", "css", "single_image.css")
 
 # Read CSS files
 with open(base_css_file, "r") as f:
     base_css = f.read()
 with open(default_css_file, "r") as f:
     default_css = f.read()
-with open(…
-…
+with open(si_css_file, "r") as f:
+    si_css = f.read()
 
 # Initialize data loaders
 default_loader = DefaultDataLoader()
-…
+si_loader = SingleImageDataLoader()
 
 with gr.Blocks() as block:
     # Add a style element that we'll update
@@ -49,14 +49,14 @@
 
     with gr.Row():
         table_selector = gr.Radio(
-            choices=["Default", "…
-            label="Select table to display",
+            choices=["Default", "Single Image"],
+            label="Select table to display. Default: all MEGA-Bench tasks; Single Image: single-image tasks only.",
             value="Default"
         )
 
     # Define different captions for each table
     default_caption = "**Table 1: MEGA-Bench full results.** The number in the parentheses is the number of tasks of each keyword. <br> The Core set contains $N_{\\text{core}} = 440$ tasks evaluated by rule-based metrics, and the Open-ended set contains $N_{\\text{open}} = 65$ tasks evaluated by a VLM judge (we use GPT-4o-0806). <br> $\\text{Overall} \\ = \\ \\frac{\\max(\\text{Core w/o CoT}, \\ \\text{Core w/ CoT}) \\ \\cdot \\ N_{\\text{core}} \\ + \\ \\text{Open-ended} \\ \\cdot \\ N_{\\text{open}}}{N_{\\text{core}} \\ + \\ N_{\\text{open}}}$"
+    single_image_caption = "**Table 2: MEGA-Bench Single-image setting results.** The number in the parentheses is the number of tasks in each keyword. <br> This subset contains 273 single-image tasks from the Core set and 42 single-image tasks from the Open-ended set. For open-source models, we drop the image input in the 1-shot demonstration example so that the entire query contains a single image only. <br> Compared to the default table, some models with only single-image support are added."
 
     caption_component = gr.Markdown(
         value=default_caption,
@@ -80,10 +80,10 @@
     data_component = gr.Dataframe(
         value=initial_data,
         headers=initial_headers,
-        datatype=["…
+        datatype=["html"] + ["number"] * (len(initial_headers) - 1),
         interactive=False,
         elem_classes="custom-dataframe",
-        max_height=…
+        max_height=2400,
     )
 
     def update_table_and_caption(table_type, super_group, model_group):
@@ -91,23 +91,24 @@
             headers, data = default_loader.get_leaderboard_data(super_group, model_group)
             caption = default_caption
             current_css = f"{base_css}\n{default_css}"
-        else: # …
-            headers, data = …
-            caption = …
-            current_css = f"{base_css}\n{…
+        else: # Single-image
+            headers, data = si_loader.get_leaderboard_data(super_group, model_group)
+            caption = single_image_caption
+            current_css = f"{base_css}\n{si_css}"
 
         return [
             gr.Dataframe(
                 value=data,
                 headers=headers,
-                datatype=["…
+                datatype=["html"] + ["number"] * (len(headers) - 1),
+                interactive=False,
             ),
             caption,
             f"<style>{current_css}</style>"
         ]
 
     def update_selectors(table_type):
-        loader = default_loader if table_type == "Default" else …
+        loader = default_loader if table_type == "Default" else si_loader
         return [
             gr.Radio(choices=list(loader.SUPER_GROUPS.keys())),
             gr.Radio(choices=list(loader.MODEL_GROUPS.keys()))
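The Overall formula in the captions weights the two evaluation sets by task count. A worked sketch of that formula; only N_core = 440 and N_open = 65 come from the caption, and the scores below are made-up inputs:

N_CORE, N_OPEN = 440, 65  # task counts stated in the caption

def overall_score(core_no_cot: float, core_cot: float, open_ended: float) -> float:
    # Overall = (max(Core w/o CoT, Core w/ CoT) * N_core + Open-ended * N_open)
    #           / (N_core + N_open)
    core = max(core_no_cot, core_cot)
    return (core * N_CORE + open_ended * N_OPEN) / (N_CORE + N_OPEN)

# CoT helps the Core score here, so 0.55 is the value that gets mixed in.
print(round(overall_score(0.52, 0.55, 0.61), 4))  # 0.5577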
static/css/{core_single.css → single_image.css}
RENAMED

@@ -1,18 +1,18 @@
-.custom-dataframe thead th:nth-child(-n+…),
-.custom-dataframe tbody td:nth-child(-n+…) {
+.custom-dataframe thead th:nth-child(-n+4),
+.custom-dataframe tbody td:nth-child(-n+4) {
     background-color: var(--global-column-background) !important;
 }
 
-.custom-dataframe thead th:nth-child(n+…),
-.custom-dataframe tbody td:nth-child(n+…) {
+.custom-dataframe thead th:nth-child(n+5),
+.custom-dataframe tbody td:nth-child(n+5) {
     background-color: var(--dimension-column-background) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+…) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+4) {
     background-color: var(--row-even-global) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+…) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+5) {
     background-color: var(--row-even-dimension) !important;
 }
 
@@ -33,21 +33,21 @@
     color: var(--text-color) !important;
 }
 
-.custom-dataframe thead th:nth-child(-n+…),
-.custom-dataframe tbody td:nth-child(-n+…) {
+.custom-dataframe thead th:nth-child(-n+4),
+.custom-dataframe tbody td:nth-child(-n+4) {
     background-color: var(--global-column-background) !important;
 }
 
-.custom-dataframe thead th:nth-child(n+…),
-.custom-dataframe tbody td:nth-child(n+…) {
+.custom-dataframe thead th:nth-child(n+5),
+.custom-dataframe tbody td:nth-child(n+5) {
     background-color: var(--dimension-column-background) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+…) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+4) {
     background-color: var(--row-even-global) !important;
 }
 
-.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+…) {
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+5) {
     background-color: var(--row-even-dimension) !important;
 }
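The commit message mentions adding model link URLs; with datatype=["html"] on the first Dataframe column (see the app.py hunks above), each model cell can carry an anchor tag, and the renamed stylesheet then colors those leading columns via its nth-child rules. A self-contained sketch of the pattern; the model name, URL, and scores are placeholders:

import gradio as gr

def model_link(name: str, url: str) -> str:
    # Rendered as markup because the first column's datatype is "html".
    return f'<a href="{url}" target="_blank">{name}</a>'

headers = ["Models", "Overall", "Core", "Open-ended"]
data = [[model_link("SomeModel", "https://example.com/some-model"), 54.1, 52.8, 61.0]]

with gr.Blocks() as demo:
    gr.Dataframe(
        value=data,
        headers=headers,
        datatype=["html"] + ["number"] * (len(headers) - 1),
        interactive=False,
        elem_classes="custom-dataframe",  # hook for the nth-child column styling
    )

if __name__ == "__main__":
    demo.launch()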
static/eval_results/Core_SI/all_summary.json
DELETED (the Core_SI results directory is renamed to SI in this commit; the entire 227-line file was removed)

{
  "Aquila_VL_2B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.20770364903712493, "micro_mean_score": 0.20333142638522636, "missing_tasks": []},
  "Aria": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.3178882776147889, "micro_mean_score": 0.3101511832828904, "missing_tasks": []},
  "Claude_3.5": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4116, "macro_mean_score": 0.520276385877485, "micro_mean_score": 0.520276385877485},
  "Claude_3.5_new": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4116, "macro_mean_score": 0.5462752278980763, "micro_mean_score": 0.5462752278980763},
  "GPT_4o": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4116, "macro_mean_score": 0.5529953662872719, "micro_mean_score": 0.5529953662872719},
  "GPT_4o_mini": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4116, "macro_mean_score": 0.44285970964797233, "micro_mean_score": 0.44285970964797233},
  "Gemini_1.5_flash_002": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4116, "macro_mean_score": 0.42188460865574384, "micro_mean_score": 0.42188460865574384},
  "Gemini_1.5_pro_002": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4116, "macro_mean_score": 0.4914311038229404, "micro_mean_score": 0.4914311038229404},
  "Idefics3": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.08941182847569326, "micro_mean_score": 0.08779475233900695, "missing_tasks": []},
  "InternVL2_2B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.12069001041308772, "micro_mean_score": 0.11842605219090299, "missing_tasks": []},
  "InternVL2_76B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.3998616568018755, "micro_mean_score": 0.39149064302628933, "missing_tasks": []},
  "InternVL2_8B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.27650612401825575, "micro_mean_score": 0.27119471729837735, "missing_tasks": []},
  "Llama_3_2_11B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.20789144960796493, "micro_mean_score": 0.20163641703273802, "missing_tasks": []},
  "MiniCPM_v2.6": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.23230765810722817, "micro_mean_score": 0.22684118052665975, "missing_tasks": []},
  "Molmo_72B": {"num_eval_tasks": 270, "num_eval_samples": 4073, "num_not_eval_samples": 0, "num_total_samples": 4331, "macro_mean_score": 0.36480000609384927, "micro_mean_score": 0.36205779758110807, "missing_tasks": ["table_understanding", "MMSoc_Misinformation_PolitiFact", "planning_screenshot_termes"]},
  "Molmo_7B_D": {"num_eval_tasks": 272, "num_eval_samples": 4102, "num_not_eval_samples": 0, "num_total_samples": 4362, "macro_mean_score": 0.2098088446992518, "micro_mean_score": 0.20550929661464645, "missing_tasks": ["MMSoc_Misinformation_PolitiFact"]},
  "NVLM": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.32989872890926025, "micro_mean_score": 0.32315683713111915, "missing_tasks": []},
  "POINTS_7B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.25511317681632334, "micro_mean_score": 0.24927711632415062, "missing_tasks": []},
  "Phi-3.5-vision": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.2561274958722834, "micro_mean_score": 0.2504214576875906, "missing_tasks": []},
  "Pixtral_12B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.3436942439614412, "micro_mean_score": 0.3373564384613738, "missing_tasks": []},
  "Qwen2_VL_2B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.22787906973244856, "micro_mean_score": 0.2234748515064842, "missing_tasks": []},
  "Qwen2_VL_72B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.4730536307784527, "micro_mean_score": 0.4659830915476831, "missing_tasks": []},
  "Qwen2_VL_7B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.3538656561495699, "micro_mean_score": 0.34581250459157137, "missing_tasks": []},
  "llava_onevision_72B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.312618242621264, "micro_mean_score": 0.3098623876487132, "missing_tasks": []},
  "llava_onevision_7B": {"num_eval_tasks": 273, "num_eval_samples": 4116, "num_not_eval_samples": 0, "num_total_samples": 4377, "macro_mean_score": 0.23683339637631812, "micro_mean_score": 0.23283041278687175, "missing_tasks": []}
}
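The deleted summary stores two aggregate scores per model. A plausible reading, an assumption since the scoring code is not part of this capture, is that macro_mean_score is the unweighted mean of per-task scores while micro_mean_score weights each task by its sample count:

def macro_micro(per_task):
    # per_task: list of (task_mean_score, num_samples) pairs, one per task.
    macro = sum(s for s, _ in per_task) / len(per_task)      # every task counts equally
    total_samples = sum(n for _, n in per_task)
    micro = sum(s * n for s, n in per_task) / total_samples  # weighted by samples
    return macro, micro

# Toy example with one small and one large task:
print(macro_micro([(0.9, 10), (0.3, 30)]))  # (0.6, 0.45)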
static/eval_results/Default/all_model_keywords_stats.json
CHANGED

Every change in this file replaces an "average_score" value; the old values are truncated to "0." in this capture. The recoverable new values, keyed by hunk position and by each keyword's (count/num_samples) context:

@@ -239,25 +239,25 @@  (303/4755) → 0.5202055934299538; "Text Recognition (OCR)" (137/2239) → 0.5017043129027509; "Language Understanding and Generation" (154/2509) → 0.5532599716027446; "Scene and Event Understanding" (154/2467) → 0.546753787203128
@@ -269,7 +269,7 @@  (51/855) → 0.5751012914154264
@@ -301,7 +301,7 @@  (93/1517) → 0.4625032188638111
@@ -331,7 +331,7 @@  (143/2248) → 0.55005349042813
@@ -351,7 +351,7 @@  (110/1714) → 0.44418591808616864
@@ -389,7 +389,7 @@  (41/623) → 0.5370278962809547
@@ -413,7 +413,7 @@  (51/802) → 0.45544217378728585
@@ -421,7 +421,7 @@  (72/1124) → 0.5421439953094952
@@ -457,7 +457,7 @@  (97/1605) → 0.5722329455291694
@@ -479,13 +479,13 @@  (137/2239) → 0.4337278553354258; "Language Understanding and Generation" (154/2509) → 0.49947464681475356
@@ -535,7 +535,7 @@  (93/1517) → 0.3865262916591035
@@ -585,7 +585,7 @@  (110/1714) → 0.39868324168390534
@@ -623,7 +623,7 @@  (41/623) → 0.43653808057103954
@@ -655,7 +655,7 @@  (72/1124) → 0.46645473820179373
@@ -713,13 +713,13 @@  (137/2239) → 0.6082834220752651; "Language Understanding and Generation" (154/2509) → 0.5745077617490254
@@ -769,7 +769,7 @@  (93/1517) → 0.5691641481808987
@@ -819,7 +819,7 @@  (110/1714) → 0.4971460788134188
@@ -857,7 +857,7 @@  (41/623) → 0.5414381873407914
@@ -889,7 +889,7 @@  (72/1124) → 0.6663170946790707
@@ -1175,25 +1175,25 @@  (303/4755) → 0.4492982787524939; "Text Recognition (OCR)" (137/2239) → 0.49026056071002017; "Language Understanding and Generation" (154/2509) → 0.5168957112681365; "Scene and Event Understanding" (154/2467) → 0.46731791428406805
@@ -1205,7 +1205,7 @@  (51/855) → 0.5572925295284307
@@ -1237,7 +1237,7 @@  (93/1517) → 0.47202628409684394
@@ -1267,7 +1267,7 @@  (143/2248) → 0.465175334092545
@@ -1281,13 +1281,13 @@  (98/1514) → 0.41242028190533997; "structured_output" (110/1714) → 0.3906415365938764
@@ -1325,13 +1325,13 @@  (41/623) → 0.4305788513381019; "1-image" (315/5228) → 0.46343334374251277
@@ -1349,7 +1349,7 @@  (51/802) → 0.36981497185070983
@@ -1357,7 +1357,7 @@  (72/1124) → 0.5666618234843734
@@ -1375,7 +1375,7 @@  (145/2313) → 0.43590838051817093
@@ -1393,7 +1393,7 @@  (97/1605) → 0.5399167524341886
@@ -1409,25 +1409,25 @@  (303/4755) → 0.49787264809826687; "Text Recognition (OCR)" (137/2239) → 0.5439010430283516; "Language Understanding and Generation" (154/2509) → 0.5392244859385411; "Scene and Event Understanding" (154/2467) → 0.509277882172206
@@ -1439,7 +1439,7 @@  (51/855) → 0.5676817981386025
@@ -1471,7 +1471,7 @@  (93/1517) → 0.5402397677488632
@@ -1501,7 +1501,7 @@  (143/2248) → 0.49789939867591104
@@ -1515,13 +1515,13 @@  (98/1514) → 0.44719815365440824; "structured_output" (110/1714) → 0.4500902736468407
@@ -1559,13 +1559,13 @@  (41/623) → 0.5468722850464449; "1-image" (315/5228) → 0.4918205178721877
@@ -1583,7 +1583,7 @@  (51/802) → 0.45176098055218655
@@ -1591,7 +1591,7 @@  (72/1124) → 0.5807658773593334
@@ -1609,7 +1609,7 @@  (145/2313) → 0.5362106489630868
@@ -1627,7 +1627,7 @@  (97/1605) → 0.5166939389651373
@@ -1643,25 +1643,25 @@  (303/4755) → 0.3708368629321668; "Text Recognition (OCR)" (137/2239) → 0.40213773918065815; "Language Understanding and Generation" (154/2511) → 0.4034335110538307; "Scene and Event Understanding" (154/2469) → 0.4109909230944937
@@ -1673,7 +1673,7 @@  (51/855) → 0.49360878418945336
@@ -1705,7 +1705,7 @@  (93/1517) → 0.3821046882337143
@@ -1735,7 +1735,7 @@  (143/2248) → 0.40660144920567376
@@ -1749,13 +1749,13 @@  (98/1514) → 0.3430730210869785; "structured_output" (110/1714) → 0.3426196933687219
@@ -1793,13 +1793,13 @@  (41/623) → 0.37453319457428763; "1-image" (315/5228) → 0.37701588079136955
@@ -1817,7 +1817,7 @@  (51/802) → 0.33008667136891007
@@ -1825,7 +1825,7 @@  (72/1124) → 0.42746758545520747
@@ -1843,7 +1843,7 @@  (145/2315) → 0.40048749993497734
@@ -1861,7 @@ through @@ -3715,7 @@  further "average_score" updates whose new values are cut off in this capture
|
1847 |
},
|
1848 |
"Metrics": {
|
1849 |
"count": 20,
|
|
|
1861 |
"count": 97,
|
1862 |
"num_samples": 1605,
|
1863 |
"tasks": [],
|
1864 |
+
"average_score": 0.4276637093173368
|
1865 |
},
|
1866 |
"Mathematics": {
|
1867 |
"count": 33,
|
|
|
1883 |
"count": 137,
|
1884 |
"num_samples": 2239,
|
1885 |
"tasks": [],
|
1886 |
+
"average_score": 0.2834675874668524
|
1887 |
},
|
1888 |
"Language Understanding and Generation": {
|
1889 |
"count": 154,
|
1890 |
"num_samples": 2509,
|
1891 |
"tasks": [],
|
1892 |
+
"average_score": 0.3674817002808495
|
1893 |
},
|
1894 |
"Scene and Event Understanding": {
|
1895 |
"count": 154,
|
|
|
1939 |
"count": 93,
|
1940 |
"num_samples": 1517,
|
1941 |
"tasks": [],
|
1942 |
+
"average_score": 0.23380046931752074
|
1943 |
},
|
1944 |
"Text-Based Images and Documents": {
|
1945 |
"count": 82,
|
|
|
1969 |
"count": 143,
|
1970 |
"num_samples": 2248,
|
1971 |
"tasks": [],
|
1972 |
+
"average_score": 0.4247591719013819
|
1973 |
},
|
1974 |
"3D Models and Aerial Imagery": {
|
1975 |
"count": 11,
|
|
|
1983 |
"count": 98,
|
1984 |
"num_samples": 1514,
|
1985 |
"tasks": [],
|
1986 |
+
"average_score": 0.2868275930712835
|
1987 |
},
|
1988 |
"structured_output": {
|
1989 |
"count": 110,
|
1990 |
"num_samples": 1714,
|
1991 |
"tasks": [],
|
1992 |
+
"average_score": 0.259450238500612
|
1993 |
},
|
1994 |
"exact_text": {
|
1995 |
"count": 83,
|
|
|
2027 |
"count": 41,
|
2028 |
"num_samples": 623,
|
2029 |
"tasks": [],
|
2030 |
+
"average_score": 0.28104747671521785
|
2031 |
},
|
2032 |
"1-image": {
|
2033 |
"count": 315,
|
2034 |
"num_samples": 5228,
|
2035 |
"tasks": [],
|
2036 |
+
"average_score": 0.34840850032295206
|
2037 |
},
|
2038 |
"video": {
|
2039 |
"count": 43,
|
|
|
2059 |
"count": 72,
|
2060 |
"num_samples": 1124,
|
2061 |
"tasks": [],
|
2062 |
+
"average_score": 0.3076421844825067
|
2063 |
},
|
2064 |
"Planning": {
|
2065 |
"count": 78,
|
|
|
2077 |
"count": 145,
|
2078 |
"num_samples": 2313,
|
2079 |
"tasks": [],
|
2080 |
+
"average_score": 0.38362780453378204
|
2081 |
},
|
2082 |
"Metrics": {
|
2083 |
"count": 20,
|
|
|
2117 |
"count": 137,
|
2118 |
"num_samples": 2239,
|
2119 |
"tasks": [],
|
2120 |
+
"average_score": 0.19077168655703208
|
2121 |
},
|
2122 |
"Language Understanding and Generation": {
|
2123 |
"count": 154,
|
2124 |
"num_samples": 2509,
|
2125 |
"tasks": [],
|
2126 |
+
"average_score": 0.2555444562659206
|
2127 |
},
|
2128 |
"Scene and Event Understanding": {
|
2129 |
"count": 154,
|
|
|
2173 |
"count": 93,
|
2174 |
"num_samples": 1517,
|
2175 |
"tasks": [],
|
2176 |
+
"average_score": 0.1466861610319767
|
2177 |
},
|
2178 |
"Text-Based Images and Documents": {
|
2179 |
"count": 82,
|
|
|
2203 |
"count": 143,
|
2204 |
"num_samples": 2248,
|
2205 |
"tasks": [],
|
2206 |
+
"average_score": 0.3263378734842879
|
2207 |
},
|
2208 |
"3D Models and Aerial Imagery": {
|
2209 |
"count": 11,
|
|
|
2217 |
"count": 98,
|
2218 |
"num_samples": 1514,
|
2219 |
"tasks": [],
|
2220 |
+
"average_score": 0.20277804188944173
|
2221 |
},
|
2222 |
"structured_output": {
|
2223 |
"count": 110,
|
2224 |
"num_samples": 1714,
|
2225 |
"tasks": [],
|
2226 |
+
"average_score": 0.18291595756285564
|
2227 |
},
|
2228 |
"exact_text": {
|
2229 |
"count": 83,
|
|
|
2261 |
"count": 41,
|
2262 |
"num_samples": 623,
|
2263 |
"tasks": [],
|
2264 |
+
"average_score": 0.13803800801858385
|
2265 |
},
|
2266 |
"1-image": {
|
2267 |
"count": 315,
|
2268 |
"num_samples": 5228,
|
2269 |
"tasks": [],
|
2270 |
+
"average_score": 0.2548084764084038
|
2271 |
},
|
2272 |
"video": {
|
2273 |
"count": 43,
|
|
|
2293 |
"count": 72,
|
2294 |
"num_samples": 1124,
|
2295 |
"tasks": [],
|
2296 |
+
"average_score": 0.19283211154717242
|
2297 |
},
|
2298 |
"Planning": {
|
2299 |
"count": 78,
|
|
|
2311 |
"count": 145,
|
2312 |
"num_samples": 2313,
|
2313 |
"tasks": [],
|
2314 |
+
"average_score": 0.28505205882578405
|
2315 |
},
|
2316 |
"Metrics": {
|
2317 |
"count": 20,
|
|
|
2345 |
"count": 303,
|
2346 |
"num_samples": 4755,
|
2347 |
"tasks": [],
|
2348 |
+
"average_score": 0.38193012983650343
|
2349 |
},
|
2350 |
"Text Recognition (OCR)": {
|
2351 |
"count": 137,
|
2352 |
"num_samples": 2239,
|
2353 |
"tasks": [],
|
2354 |
+
"average_score": 0.41315219763443384
|
2355 |
},
|
2356 |
"Language Understanding and Generation": {
|
2357 |
"count": 154,
|
2358 |
"num_samples": 2509,
|
2359 |
"tasks": [],
|
2360 |
+
"average_score": 0.43665980552577693
|
2361 |
},
|
2362 |
"Scene and Event Understanding": {
|
2363 |
"count": 154,
|
2364 |
"num_samples": 2467,
|
2365 |
"tasks": [],
|
2366 |
+
"average_score": 0.4265623936500962
|
2367 |
},
|
2368 |
"Mathematical and Logical Reasoning": {
|
2369 |
"count": 109,
|
|
|
2375 |
"count": 51,
|
2376 |
"num_samples": 855,
|
2377 |
"tasks": [],
|
2378 |
+
"average_score": 0.5257990949897898
|
2379 |
},
|
2380 |
"Ethical and Safety Reasoning": {
|
2381 |
"count": 15,
|
|
|
2407 |
"count": 93,
|
2408 |
"num_samples": 1517,
|
2409 |
"tasks": [],
|
2410 |
+
"average_score": 0.3634339625985008
|
2411 |
},
|
2412 |
"Text-Based Images and Documents": {
|
2413 |
"count": 82,
|
|
|
2437 |
"count": 143,
|
2438 |
"num_samples": 2248,
|
2439 |
"tasks": [],
|
2440 |
+
"average_score": 0.42875248733027654
|
2441 |
},
|
2442 |
"3D Models and Aerial Imagery": {
|
2443 |
"count": 11,
|
|
|
2451 |
"count": 98,
|
2452 |
"num_samples": 1514,
|
2453 |
"tasks": [],
|
2454 |
+
"average_score": 0.3630499545707523
|
2455 |
},
|
2456 |
"structured_output": {
|
2457 |
"count": 110,
|
2458 |
"num_samples": 1714,
|
2459 |
"tasks": [],
|
2460 |
+
"average_score": 0.3476691827105281
|
2461 |
},
|
2462 |
"exact_text": {
|
2463 |
"count": 83,
|
|
|
2495 |
"count": 41,
|
2496 |
"num_samples": 623,
|
2497 |
"tasks": [],
|
2498 |
+
"average_score": 0.34771123515123364
|
2499 |
},
|
2500 |
"1-image": {
|
2501 |
"count": 315,
|
2502 |
"num_samples": 5228,
|
2503 |
"tasks": [],
|
2504 |
+
"average_score": 0.4145693044465943
|
2505 |
},
|
2506 |
"video": {
|
2507 |
"count": 43,
|
|
|
2519 |
"count": 51,
|
2520 |
"num_samples": 802,
|
2521 |
"tasks": [],
|
2522 |
+
"average_score": 0.3153417935059416
|
2523 |
}
|
2524 |
},
|
2525 |
"app": {
|
|
|
2527 |
"count": 72,
|
2528 |
"num_samples": 1124,
|
2529 |
"tasks": [],
|
2530 |
+
"average_score": 0.4306947454508794
|
2531 |
},
|
2532 |
"Planning": {
|
2533 |
"count": 78,
|
|
|
2545 |
"count": 145,
|
2546 |
"num_samples": 2313,
|
2547 |
"tasks": [],
|
2548 |
+
"average_score": 0.42202934355552685
|
2549 |
},
|
2550 |
"Metrics": {
|
2551 |
"count": 20,
|
|
|
2563 |
"count": 97,
|
2564 |
"num_samples": 1605,
|
2565 |
"tasks": [],
|
2566 |
+
"average_score": 0.4625649385962016
|
2567 |
},
|
2568 |
"Mathematics": {
|
2569 |
"count": 33,
|
|
|
2585 |
"count": 137,
|
2586 |
"num_samples": 2239,
|
2587 |
"tasks": [],
|
2588 |
+
"average_score": 0.280559214034858
|
2589 |
},
|
2590 |
"Language Understanding and Generation": {
|
2591 |
"count": 154,
|
2592 |
"num_samples": 2511,
|
2593 |
"tasks": [],
|
2594 |
+
"average_score": 0.32020728060179815
|
2595 |
},
|
2596 |
"Scene and Event Understanding": {
|
2597 |
"count": 154,
|
|
|
2641 |
"count": 93,
|
2642 |
"num_samples": 1517,
|
2643 |
"tasks": [],
|
2644 |
+
"average_score": 0.22800928556370195
|
2645 |
},
|
2646 |
"Text-Based Images and Documents": {
|
2647 |
"count": 82,
|
|
|
2671 |
"count": 143,
|
2672 |
"num_samples": 2248,
|
2673 |
"tasks": [],
|
2674 |
+
"average_score": 0.3388056726588417
|
2675 |
},
|
2676 |
"3D Models and Aerial Imagery": {
|
2677 |
"count": 11,
|
|
|
2685 |
"count": 98,
|
2686 |
"num_samples": 1514,
|
2687 |
"tasks": [],
|
2688 |
+
"average_score": 0.250804626773504
|
2689 |
},
|
2690 |
"structured_output": {
|
2691 |
"count": 110,
|
2692 |
"num_samples": 1714,
|
2693 |
"tasks": [],
|
2694 |
+
"average_score": 0.2522493284864019
|
2695 |
},
|
2696 |
"exact_text": {
|
2697 |
"count": 83,
|
|
|
2729 |
"count": 41,
|
2730 |
"num_samples": 623,
|
2731 |
"tasks": [],
|
2732 |
+
"average_score": 0.19872104324302098
|
2733 |
},
|
2734 |
"1-image": {
|
2735 |
"count": 315,
|
2736 |
"num_samples": 5228,
|
2737 |
"tasks": [],
|
2738 |
+
"average_score": 0.30088711082969344
|
2739 |
},
|
2740 |
"video": {
|
2741 |
"count": 43,
|
|
|
2761 |
"count": 72,
|
2762 |
"num_samples": 1124,
|
2763 |
"tasks": [],
|
2764 |
+
"average_score": 0.29129840423784176
|
2765 |
},
|
2766 |
"Planning": {
|
2767 |
"count": 78,
|
|
|
2779 |
"count": 145,
|
2780 |
"num_samples": 2315,
|
2781 |
"tasks": [],
|
2782 |
+
"average_score": 0.3214666523378005
|
2783 |
},
|
2784 |
"Metrics": {
|
2785 |
"count": 20,
|
|
|
2813 |
"count": 303,
|
2814 |
"num_samples": 4755,
|
2815 |
"tasks": [],
|
2816 |
+
"average_score": 0.2604967101191775
|
2817 |
},
|
2818 |
"Text Recognition (OCR)": {
|
2819 |
"count": 137,
|
2820 |
"num_samples": 2239,
|
2821 |
"tasks": [],
|
2822 |
+
"average_score": 0.2500331562865158
|
2823 |
},
|
2824 |
"Language Understanding and Generation": {
|
2825 |
"count": 154,
|
2826 |
"num_samples": 2509,
|
2827 |
"tasks": [],
|
2828 |
+
"average_score": 0.3003169369011028
|
2829 |
},
|
2830 |
"Scene and Event Understanding": {
|
2831 |
"count": 154,
|
2832 |
"num_samples": 2467,
|
2833 |
"tasks": [],
|
2834 |
+
"average_score": 0.31808748114668184
|
2835 |
},
|
2836 |
"Mathematical and Logical Reasoning": {
|
2837 |
"count": 109,
|
|
|
2843 |
"count": 51,
|
2844 |
"num_samples": 855,
|
2845 |
"tasks": [],
|
2846 |
+
"average_score": 0.40732197204308807
|
2847 |
},
|
2848 |
"Ethical and Safety Reasoning": {
|
2849 |
"count": 15,
|
|
|
2875 |
"count": 93,
|
2876 |
"num_samples": 1517,
|
2877 |
"tasks": [],
|
2878 |
+
"average_score": 0.21195711598986072
|
2879 |
},
|
2880 |
"Text-Based Images and Documents": {
|
2881 |
"count": 82,
|
|
|
2905 |
"count": 143,
|
2906 |
"num_samples": 2248,
|
2907 |
"tasks": [],
|
2908 |
+
"average_score": 0.3176880312524649
|
2909 |
},
|
2910 |
"3D Models and Aerial Imagery": {
|
2911 |
"count": 11,
|
|
|
2919 |
"count": 98,
|
2920 |
"num_samples": 1514,
|
2921 |
"tasks": [],
|
2922 |
+
"average_score": 0.23506388020592064
|
2923 |
},
|
2924 |
"structured_output": {
|
2925 |
"count": 110,
|
2926 |
"num_samples": 1714,
|
2927 |
"tasks": [],
|
2928 |
+
"average_score": 0.1781127776443048
|
2929 |
},
|
2930 |
"exact_text": {
|
2931 |
"count": 83,
|
|
|
2963 |
"count": 41,
|
2964 |
"num_samples": 623,
|
2965 |
"tasks": [],
|
2966 |
+
"average_score": 0.23596215721092323
|
2967 |
},
|
2968 |
"1-image": {
|
2969 |
"count": 315,
|
2970 |
"num_samples": 5228,
|
2971 |
"tasks": [],
|
2972 |
+
"average_score": 0.26319603880798287
|
2973 |
},
|
2974 |
"video": {
|
2975 |
"count": 43,
|
|
|
2987 |
"count": 51,
|
2988 |
"num_samples": 802,
|
2989 |
"tasks": [],
|
2990 |
+
"average_score": 0.22288558250834017
|
2991 |
}
|
2992 |
},
|
2993 |
"app": {
|
|
|
2995 |
"count": 72,
|
2996 |
"num_samples": 1124,
|
2997 |
"tasks": [],
|
2998 |
+
"average_score": 0.2666989364424082
|
2999 |
},
|
3000 |
"Planning": {
|
3001 |
"count": 78,
|
|
|
3013 |
"count": 145,
|
3014 |
"num_samples": 2313,
|
3015 |
"tasks": [],
|
3016 |
+
"average_score": 0.29243044121840894
|
3017 |
},
|
3018 |
"Metrics": {
|
3019 |
"count": 20,
|
|
|
3031 |
"count": 97,
|
3032 |
"num_samples": 1605,
|
3033 |
"tasks": [],
|
3034 |
+
"average_score": 0.33187729423141027
|
3035 |
},
|
3036 |
"Mathematics": {
|
3037 |
"count": 33,
|
|
|
3053 |
"count": 137,
|
3054 |
"num_samples": 2239,
|
3055 |
"tasks": [],
|
3056 |
+
"average_score": 0.2483252111012436
|
3057 |
},
|
3058 |
"Language Understanding and Generation": {
|
3059 |
"count": 154,
|
3060 |
"num_samples": 2509,
|
3061 |
"tasks": [],
|
3062 |
+
"average_score": 0.28732942108098564
|
3063 |
},
|
3064 |
"Scene and Event Understanding": {
|
3065 |
"count": 154,
|
|
|
3109 |
"count": 93,
|
3110 |
"num_samples": 1517,
|
3111 |
"tasks": [],
|
3112 |
+
"average_score": 0.1865974025588298
|
3113 |
},
|
3114 |
"Text-Based Images and Documents": {
|
3115 |
"count": 82,
|
|
|
3139 |
"count": 143,
|
3140 |
"num_samples": 2248,
|
3141 |
"tasks": [],
|
3142 |
+
"average_score": 0.3413768635559215
|
3143 |
},
|
3144 |
"3D Models and Aerial Imagery": {
|
3145 |
"count": 11,
|
|
|
3153 |
"count": 98,
|
3154 |
"num_samples": 1514,
|
3155 |
"tasks": [],
|
3156 |
+
"average_score": 0.2177924712685756
|
3157 |
},
|
3158 |
"structured_output": {
|
3159 |
"count": 110,
|
3160 |
"num_samples": 1714,
|
3161 |
"tasks": [],
|
3162 |
+
"average_score": 0.21443984349574025
|
3163 |
},
|
3164 |
"exact_text": {
|
3165 |
"count": 83,
|
|
|
3197 |
"count": 41,
|
3198 |
"num_samples": 623,
|
3199 |
"tasks": [],
|
3200 |
+
"average_score": 0.14337869666229008
|
3201 |
},
|
3202 |
"1-image": {
|
3203 |
"count": 315,
|
3204 |
"num_samples": 5228,
|
3205 |
"tasks": [],
|
3206 |
+
"average_score": 0.27790147494714373
|
3207 |
},
|
3208 |
"video": {
|
3209 |
"count": 43,
|
|
|
3229 |
"count": 72,
|
3230 |
"num_samples": 1124,
|
3231 |
"tasks": [],
|
3232 |
+
"average_score": 0.2237087834389946
|
3233 |
},
|
3234 |
"Planning": {
|
3235 |
"count": 78,
|
|
|
3247 |
"count": 145,
|
3248 |
"num_samples": 2313,
|
3249 |
"tasks": [],
|
3250 |
+
"average_score": 0.316318567258608
|
3251 |
},
|
3252 |
"Metrics": {
|
3253 |
"count": 20,
|
|
|
3281 |
"count": 303,
|
3282 |
"num_samples": 4755,
|
3283 |
"tasks": [],
|
3284 |
+
"average_score": 0.3460288961410444
|
3285 |
},
|
3286 |
"Text Recognition (OCR)": {
|
3287 |
"count": 137,
|
3288 |
"num_samples": 2239,
|
3289 |
"tasks": [],
|
3290 |
+
"average_score": 0.3777640755922415
|
3291 |
},
|
3292 |
"Language Understanding and Generation": {
|
3293 |
"count": 154,
|
3294 |
"num_samples": 2509,
|
3295 |
"tasks": [],
|
3296 |
+
"average_score": 0.38299418297106824
|
3297 |
},
|
3298 |
"Scene and Event Understanding": {
|
3299 |
"count": 154,
|
3300 |
"num_samples": 2467,
|
3301 |
"tasks": [],
|
3302 |
+
"average_score": 0.3776722463473817
|
3303 |
},
|
3304 |
"Mathematical and Logical Reasoning": {
|
3305 |
"count": 109,
|
|
|
3311 |
"count": 51,
|
3312 |
"num_samples": 855,
|
3313 |
"tasks": [],
|
3314 |
+
"average_score": 0.419071767659191
|
3315 |
},
|
3316 |
"Ethical and Safety Reasoning": {
|
3317 |
"count": 15,
|
|
|
3343 |
"count": 93,
|
3344 |
"num_samples": 1517,
|
3345 |
"tasks": [],
|
3346 |
+
"average_score": 0.3070067338940785
|
3347 |
},
|
3348 |
"Text-Based Images and Documents": {
|
3349 |
"count": 82,
|
|
|
3373 |
"count": 143,
|
3374 |
"num_samples": 2248,
|
3375 |
"tasks": [],
|
3376 |
+
"average_score": 0.37115973962368864
|
3377 |
},
|
3378 |
"3D Models and Aerial Imagery": {
|
3379 |
"count": 11,
|
|
|
3387 |
"count": 98,
|
3388 |
"num_samples": 1514,
|
3389 |
"tasks": [],
|
3390 |
+
"average_score": 0.3078181788009137
|
3391 |
},
|
3392 |
"structured_output": {
|
3393 |
"count": 110,
|
3394 |
"num_samples": 1714,
|
3395 |
"tasks": [],
|
3396 |
+
"average_score": 0.3188475653127356
|
3397 |
},
|
3398 |
"exact_text": {
|
3399 |
"count": 83,
|
|
|
3431 |
"count": 41,
|
3432 |
"num_samples": 623,
|
3433 |
"tasks": [],
|
3434 |
+
"average_score": 0.16642294307267227
|
3435 |
},
|
3436 |
"1-image": {
|
3437 |
"count": 315,
|
3438 |
"num_samples": 5228,
|
3439 |
"tasks": [],
|
3440 |
+
"average_score": 0.37108130557306335
|
3441 |
},
|
3442 |
"video": {
|
3443 |
"count": 43,
|
|
|
3455 |
"count": 51,
|
3456 |
"num_samples": 802,
|
3457 |
"tasks": [],
|
3458 |
+
"average_score": 0.3104621543981899
|
3459 |
}
|
3460 |
},
|
3461 |
"app": {
|
|
|
3463 |
"count": 72,
|
3464 |
"num_samples": 1124,
|
3465 |
"tasks": [],
|
3466 |
+
"average_score": 0.4300741596942578
|
3467 |
},
|
3468 |
"Planning": {
|
3469 |
"count": 78,
|
|
|
3481 |
"count": 145,
|
3482 |
"num_samples": 2313,
|
3483 |
"tasks": [],
|
3484 |
+
"average_score": 0.3892097218585385
|
3485 |
},
|
3486 |
"Metrics": {
|
3487 |
"count": 20,
|
|
|
3499 |
"count": 97,
|
3500 |
"num_samples": 1605,
|
3501 |
"tasks": [],
|
3502 |
+
"average_score": 0.3809515410188075
|
3503 |
},
|
3504 |
"Mathematics": {
|
3505 |
"count": 33,
|
|
|
3521 |
"count": 137,
|
3522 |
"num_samples": 2239,
|
3523 |
"tasks": [],
|
3524 |
+
"average_score": 0.14328677752263275
|
3525 |
},
|
3526 |
"Language Understanding and Generation": {
|
3527 |
"count": 154,
|
3528 |
"num_samples": 2509,
|
3529 |
"tasks": [],
|
3530 |
+
"average_score": 0.19646404502647707
|
3531 |
},
|
3532 |
"Scene and Event Understanding": {
|
3533 |
"count": 154,
|
|
|
3607 |
"count": 143,
|
3608 |
"num_samples": 2248,
|
3609 |
"tasks": [],
|
3610 |
+
"average_score": 0.2485354956932213
|
3611 |
},
|
3612 |
"3D Models and Aerial Imagery": {
|
3613 |
"count": 11,
|
|
|
3621 |
"count": 98,
|
3622 |
"num_samples": 1514,
|
3623 |
"tasks": [],
|
3624 |
+
"average_score": 0.12417283740525839
|
3625 |
},
|
3626 |
"structured_output": {
|
3627 |
"count": 110,
|
|
|
3671 |
"count": 315,
|
3672 |
"num_samples": 5228,
|
3673 |
"tasks": [],
|
3674 |
+
"average_score": 0.1960107191983825
|
3675 |
},
|
3676 |
"video": {
|
3677 |
"count": 43,
|
|
|
3715 |
"count": 145,
|
3716 |
"num_samples": 2313,
|
3717 |
"tasks": [],
|
3718 |
+
"average_score": 0.19899465185565898
|
3719 |
},
|
3720 |
"Metrics": {
|
3721 |
"count": 20,
|
static/eval_results/Default/all_summary.json
CHANGED
@@ -4,7 +4,6 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.5203440930873326, "micro_mean_score": 0.514302640282204
     },
@@ -12,14 +11,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.5265030595065238, "micro_mean_score": 0.5236338521693411
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2448,
       "macro_mean_score": 0.6478225794744895, "micro_mean_score": 0.665391229578676
     },
@@ -30,85 +27,75 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.46403536258864253
+      "macro_mean_score": 0.4699992918320008, "micro_mean_score": 0.4651116133689296
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.47581906202211677
+      "macro_mean_score": 0.4822473962867704, "micro_mean_score": 0.4764805563057179
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2448,
       "macro_mean_score": 0.5858190649927173, "micro_mean_score": 0.6104901117798793
     },
-    "overall_score": 0.…
+    "overall_score": 0.4955784031499121
   },
   "Gemini_1.5_flash_002": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.41216971462683855
+      "macro_mean_score": 0.41898948981774853, "micro_mean_score": 0.4127376993779598
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.41216971462683855
+      "macro_mean_score": 0.4189319021967416, "micro_mean_score": 0.41567515414375245
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2168,
       "macro_mean_score": 0.5691365176285039, "micro_mean_score": 0.5987532244196045
     },
-    "overall_score": 0.…
+    "overall_score": 0.43831534488249924
   },
   "Claude_3.5": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.4798092874490549
+      "macro_mean_score": 0.48800427486796155, "micro_mean_score": 0.4814327812005499
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.4991559743144323
+      "macro_mean_score": 0.5040975742801586, "micro_mean_score": 0.5002259116666758
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2288,
       "macro_mean_score": 0.6373907158949892, "micro_mean_score": 0.6569647463456579
     },
-    "overall_score": 0.…
+    "overall_score": 0.5212541172602853
   },
   "Claude_3.5_new": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.4919657684484185, "micro_mean_score": 0.4874520567007144
     },
@@ -116,14 +103,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.5259191914020757, "micro_mean_score": 0.5230785894131227
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.6563419761104125, "micro_mean_score": 0.6724419604471196
     },
@@ -134,267 +119,236 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.392578163407945
+      "macro_mean_score": 0.39854757130003565, "micro_mean_score": 0.3936551517403452
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.40376078514357017
+      "macro_mean_score": 0.40767494558789397, "micro_mean_score": 0.40431644154143376
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.586537827213665, "micro_mean_score": 0.6133276010318144
     },
-    "overall_score": 0.…
+    "overall_score": 0.43069690064863675
   },
   "Qwen2_VL_72B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.4568583770401895
+      "macro_mean_score": 0.46406654108789214, "micro_mean_score": 0.4584702152011697
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.4487693487093462
+      "macro_mean_score": 0.4542376574527161, "micro_mean_score": 0.4501201906164793
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2448,
       "macro_mean_score": 0.5639771804231668, "micro_mean_score": 0.5835339638865004
     },
-    "overall_score": 0.…
+    "overall_score": 0.4769263263488681
   },
   "Qwen2_VL_7B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.34344091516995323
+      "macro_mean_score": 0.3480020832611913, "micro_mean_score": 0.3441858958345098
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.32443422147119677
+      "macro_mean_score": 0.3293449599230247, "micro_mean_score": 0.325331493515679
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1170,
-      "num_total_samples": 2452,
       "macro_mean_score": 0.43955105763038577, "micro_mean_score": 0.45508547008546996
     },
-    "overall_score": 0.…
+    "overall_score": 0.3597856146156421
   },
   "llava_onevision_72B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.3173848563095166
+      "macro_mean_score": 0.3199332158220174, "micro_mean_score": 0.31770770553892647
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.2954433666362564
+      "macro_mean_score": 0.2974368415462532, "micro_mean_score": 0.2956217833156672
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.4599484231632498, "micro_mean_score": 0.4850386930352536
     },
-    "overall_score": 0.…
+    "overall_score": 0.33795497518277007
  },
   "llava_onevision_7B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.22222171180488767
+      "macro_mean_score": 0.22409531510496777, "micro_mean_score": 0.22238854298563537
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.210586172002703
+      "macro_mean_score": 0.21362697219149712, "micro_mean_score": 0.21073910058505504
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2448,
       "macro_mean_score": 0.33979975321921935, "micro_mean_score": 0.36474634565778147
     },
-    "overall_score": 0.…
+    "overall_score": 0.23898796555531696
   },
   "InternVL2_76B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.3452353155814884
+      "macro_mean_score": 0.3502244283768534, "micro_mean_score": 0.3456783051732046
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.35043335903915124
+      "macro_mean_score": 0.3562710424410931, "micro_mean_score": 0.35129859801162616
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.5192997443033639, "micro_mean_score": 0.5421324161650903
     },
-    "overall_score": 0.…
+    "overall_score": 0.3772549347599992
   },
   "InternVL2_8B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.2543416126895087
+      "macro_mean_score": 0.25956581776451815, "micro_mean_score": 0.2546984460483302
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.23784634936127952
+      "macro_mean_score": 0.24090301358258295, "micro_mean_score": 0.23819084111520938
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1165,
-      "num_total_samples": 2452,
       "macro_mean_score": 0.3978571701460552, "micro_mean_score": 0.4108583690987125
     },
-    "overall_score": 0.…
+    "overall_score": 0.2773656948037259
   },
   "MiniCPM_v2.6": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.22452805919103805
+      "macro_mean_score": 0.2287645706203155, "micro_mean_score": 0.2249087742955901
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.2250606411323753
+      "macro_mean_score": 0.22955895202146906, "micro_mean_score": 0.22560399396899078
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2448,
       "macro_mean_score": 0.41728623355613875, "micro_mean_score": 0.43452278589853827
     },
-    "overall_score": 0.…
+    "overall_score": 0.2537218694467236
   },
   "Phi-3.5-vision": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.22932978620408923
+      "macro_mean_score": 0.23271251159409778, "micro_mean_score": 0.2296262323791101
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.2266573336398296
+      "macro_mean_score": 0.22995297916629392, "micro_mean_score": 0.22708502951025372
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2428,
       "macro_mean_score": 0.3947914647737769, "micro_mean_score": 0.42459157351676696
     },
-    "overall_score": 0.…
+    "overall_score": 0.25357415903306635
   },
   "Pixtral_12B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.3151734861550665
+      "macro_mean_score": 0.31905695620134694, "micro_mean_score": 0.31556607913724777
     },
     "core_cot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.30971424472967524
+      "macro_mean_score": 0.31362045151669854, "micro_mean_score": 0.3100986209078182
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.4566234428542061, "micro_mean_score": 0.4870593293207223
     },
-    "overall_score": 0.…
+    "overall_score": 0.33676353369131895
   },
   "Llama_3_2_11B": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.10044261716549671, "micro_mean_score": 0.09980638766828835
     },
@@ -402,25 +356,22 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": …, "macro_mean_score": …, "micro_mean_score": 0.15794038158731832
+      "macro_mean_score": 0.15999641916771298, "micro_mean_score": 0.15809331016967038
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.3173342406187366, "micro_mean_score": 0.3487962166809973
     },
-    "overall_score": 0.…
+    "overall_score": 0.1802478219287358
   },
   "Idefics3": {
     "core_noncot": {
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.11118980301103833, "micro_mean_score": 0.11201785633274061
     },
@@ -428,14 +379,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.08956972487602757, "micro_mean_score": 0.08982225274252693
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 2448,
       "macro_mean_score": 0.3210866162255635, "micro_mean_score": 0.35649183147033553
     },
@@ -446,7 +395,6 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.30485930718699694, "micro_mean_score": 0.3016713629035311
     },
@@ -454,14 +402,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.289073788209904, "micro_mean_score": 0.2859007507765791
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.5103725263180767, "micro_mean_score": 0.5349957007738607
     },
@@ -472,7 +418,6 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.2420528895703979, "micro_mean_score": 0.23838419989257642
     },
@@ -480,14 +425,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.21589726765847422, "micro_mean_score": 0.21406043849932396
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.3478114310231307, "micro_mean_score": 0.3947549441100602
     },
@@ -498,7 +441,6 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.09089701489596874, "micro_mean_score": 0.09036328295381871
     },
@@ -506,14 +448,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.13141974398938763, "micro_mean_score": 0.13063500716262516
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.23864417043743646, "micro_mean_score": 0.24901117798796224
     },
@@ -524,7 +464,6 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.16448220309703876, "micro_mean_score": 0.1610710186451323
     },
@@ -532,14 +471,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.20877163406364055, "micro_mean_score": 0.20561526268932287
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.3154302566225611, "micro_mean_score": 0.33856405846947557
     },
@@ -550,7 +487,6 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.16317824309838627, "micro_mean_score": 0.16198837245148487
     },
@@ -558,14 +494,12 @@
       "num_eval_tasks": 440, "num_eval_samples": 6539, "num_not_eval_samples": 0,
-      "num_total_samples": 6961,
       "macro_mean_score": 0.159970161379836, "micro_mean_score": 0.15844711671722148
     },
     "open": {
       "num_eval_tasks": 65, "num_eval_samples": 1163,
-      "num_total_samples": 1224,
       "macro_mean_score": 0.24567572098570653, "micro_mean_score": 0.2704213241616509
     },
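The replaced `overall_score` values above are consistent with a task-count-weighted mean of the per-split macro scores, taking the better of the CoT and non-CoT core runs. A minimal sketch, assuming the weights come from the `num_eval_tasks` fields (440 core, 65 open-ended); this is illustrative only, not the Space's own aggregation code, and the function name is hypothetical:

```python
def overall_score(core_noncot: float, core_cot: float, open_macro: float,
                  n_core: int = 440, n_open: int = 65) -> float:
    # Weighted mean of the best core macro score and the open-ended macro score.
    best_core = max(core_noncot, core_cot)
    return (best_core * n_core + open_macro * n_open) / (n_core + n_open)

# Reproduces the value added in the third hunk above:
print(overall_score(0.4699992918320008, 0.4822473962867704, 0.5858190649927173))
# ≈ 0.4955784031499121
```

The same check holds where the non-CoT run is the stronger one (e.g. Phi-3.5-vision's scores reproduce its `overall_score` of ≈ 0.25357415903306635), which is why the sketch takes the max of the two core runs.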
static/eval_results/{Core_SI → SI}/all_model_keywords_stats.json
RENAMED
The diff for this file is too large to render. See raw diff
static/eval_results/SI/all_summary.json
ADDED
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{
    "Aquila_VL_2B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.20770364903712493,
            "micro_mean_score": 0.20333142638522636,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.31474202723571276,
            "micro_mean_score": 0.3326568265682657,
            "missing_tasks": []
        },
        "overall_score": 0.22197543279693666
    },
    "Aria": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3178882776147889,
            "micro_mean_score": 0.3101511832828904,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.5137437248005172,
            "micro_mean_score": 0.5472939729397295,
            "missing_tasks": []
        },
        "overall_score": 0.34400233723955265
    },
    "Claude_3.5": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.520276385877485,
            "micro_mean_score": 0.5148202137998056
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.6479684260295507,
            "micro_mean_score": 0.6801968019680197
        },
        "overall_score": 0.5373019912310938
    },
    "Claude_3.5_new": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.5462752278980763,
            "micro_mean_score": 0.5417881438289601
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.6764020657053476,
            "micro_mean_score": 0.6924969249692496
        },
        "overall_score": 0.5636254729390457
    },
    "GPT_4o": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.5529953662872719,
            "micro_mean_score": 0.5483479105928085
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.6600228904804206,
            "micro_mean_score": 0.6801968019680197
        },
        "overall_score": 0.5672657028463584
    },
    "GPT_4o_mini": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.44285970964797233,
            "micro_mean_score": 0.43756073858114675
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.595574663769726,
            "micro_mean_score": 0.6334563345633456
        },
        "overall_score": 0.46322170353087283
    },
    "Gemini_1.5_flash_002": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.42188460865574384,
            "micro_mean_score": 0.413508260447036
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.5787083135236054,
            "micro_mean_score": 0.6186961869618696
        },
        "overall_score": 0.44279443597145873
    },
    "Gemini_1.5_pro_002": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4914311038229404,
            "micro_mean_score": 0.48323615160349853
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.5814975405131552,
            "micro_mean_score": 0.6174661746617466
        },
        "overall_score": 0.5034399620483024
    },
    "Idefics3": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.08941182847569326,
            "micro_mean_score": 0.08779475233900695,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3231434267517844,
            "micro_mean_score": 0.3618081180811809,
            "missing_tasks": []
        },
        "overall_score": 0.12057604157917208
    },
    "InternVL2_2B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.12069001041308772,
            "micro_mean_score": 0.11842605219090299,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.28522459992910454,
            "micro_mean_score": 0.28886838868388687,
            "missing_tasks": []
        },
        "overall_score": 0.14262795568189
    },
    "InternVL2_76B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3998616568018755,
            "micro_mean_score": 0.39149064302628933,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.554748737158244,
            "micro_mean_score": 0.5800738007380073,
            "missing_tasks": []
        },
        "overall_score": 0.42051326751605805
    },
    "InternVL2_8B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.27650612401825575,
            "micro_mean_score": 0.27119471729837735,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.39388373890935635,
            "micro_mean_score": 0.4045510455104551,
            "missing_tasks": []
        },
        "overall_score": 0.29215647267040246
    },
    "Llama_3_2_11B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.20789144960796493,
            "micro_mean_score": 0.20163641703273802,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3861125858565788,
            "micro_mean_score": 0.4130381303813038,
            "missing_tasks": []
        },
        "overall_score": 0.2316542677744468
    },
    "MiniCPM_v2.6": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.23230765810722817,
            "micro_mean_score": 0.22684118052665975,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4360655066213874,
            "micro_mean_score": 0.4588560885608856,
            "missing_tasks": []
        },
        "overall_score": 0.2594753712424494
    },
    "Molmo_72B": {
        "core": {
            "num_eval_tasks": 270,
            "num_eval_samples": 4073,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.36480000609384927,
            "micro_mean_score": 0.36205779758110807,
            "missing_tasks": [
                "MMSoc_Misinformation_PolitiFact",
                "table_understanding",
                "planning_screenshot_termes"
            ]
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4465682063915481,
            "micro_mean_score": 0.4850553505535054,
            "missing_tasks": []
        },
        "overall_score": 0.3758072638262318
    },
    "Molmo_7B_D": {
        "core": {
            "num_eval_tasks": 272,
            "num_eval_samples": 4102,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.2098088446992518,
            "micro_mean_score": 0.20550929661464645,
            "missing_tasks": [
                "MMSoc_Misinformation_PolitiFact"
            ]
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.35697926179118733,
            "micro_mean_score": 0.38936039360393604,
            "missing_tasks": []
        },
        "overall_score": 0.22949405972428777
    },
    "NVLM": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.32989872890926025,
            "micro_mean_score": 0.32315683713111915,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4469349818134809,
            "micro_mean_score": 0.4881303813038132,
            "missing_tasks": []
        },
        "overall_score": 0.34550356262982296
    },
    "POINTS_7B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.25511317681632334,
            "micro_mean_score": 0.24927711632415062,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.30315625179016,
            "micro_mean_score": 0.3313653136531366,
            "missing_tasks": []
        },
        "overall_score": 0.26151892014616823
    },
    "Phi-3.5-vision": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.2561274958722834,
            "micro_mean_score": 0.2504214576875906,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4272267419054576,
            "micro_mean_score": 0.445879458794588,
            "missing_tasks": []
        },
        "overall_score": 0.2789407286767066
    },
    "Pixtral_12B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3436942439614412,
            "micro_mean_score": 0.3373564384613738,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4417271955536318,
            "micro_mean_score": 0.4845633456334564,
            "missing_tasks": []
        },
        "overall_score": 0.3567653041737333
    },
    "Qwen2_VL_2B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.22787906973244856,
            "micro_mean_score": 0.2234748515064842,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3509364634962041,
            "micro_mean_score": 0.3768757687576875,
            "missing_tasks": []
        },
        "overall_score": 0.24428672223428263
    },
    "Qwen2_VL_72B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4730536307784527,
            "micro_mean_score": 0.4659830915476831,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.5510079982505317,
            "micro_mean_score": 0.5826568265682657,
            "missing_tasks": []
        },
        "overall_score": 0.48344754644139654
    },
    "Qwen2_VL_7B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3538656561495699,
            "micro_mean_score": 0.34581250459157137,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4517429592549692,
            "micro_mean_score": 0.4730012300123002,
            "missing_tasks": []
        },
        "overall_score": 0.3669159632302898
    },
    "llava_onevision_72B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.312618242621264,
            "micro_mean_score": 0.3098623876487132,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.4425822460912829,
            "micro_mean_score": 0.47539975399754,
            "missing_tasks": []
        },
        "overall_score": 0.32994677641726655
    },
    "llava_onevision_7B": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.23683339637631812,
            "micro_mean_score": 0.23283041278687175,
            "missing_tasks": []
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "num_not_eval_samples": 0,
            "macro_mean_score": 0.3871602360316429,
            "micro_mean_score": 0.4113161131611316,
            "missing_tasks": []
        },
        "overall_score": 0.25687697499702805
    }
}
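The overall_score values in this file are consistent with a task-count-weighted average of the core and open macro scores, e.g. for Aquila_VL_2B: (0.20770 x 273 + 0.31474 x 42) / 315 = 0.22198. The sketch below re-derives the field under that assumption; the weighting is inferred from the numbers above, not documented in this commit.

import json

# Consistency check for all_summary.json. Assumed (inferred) relationship:
#   overall = (core_macro * N_core + open_macro * N_open) / (N_core + N_open)
with open("./static/eval_results/SI/all_summary.json", "r") as f:
    summary = json.load(f)

for model, stats in summary.items():
    n_core = stats["core"]["num_eval_tasks"]  # 273 for most models, fewer when tasks are missing
    n_open = stats["open"]["num_eval_tasks"]  # 42 for every model in this file
    derived = (stats["core"]["macro_mean_score"] * n_core
               + stats["open"]["macro_mean_score"] * n_open) / (n_core + n_open)
    assert abs(derived - stats["overall_score"]) < 1e-6, model

Note that Molmo_72B (270 core tasks) and Molmo_7B_D (272 core tasks) also satisfy the check, so the weighting appears to use each model's own num_eval_tasks rather than the nominal 273.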
utils.py
CHANGED
@@ -11,7 +11,7 @@ MODEL_NAME_MAP = {
     "InternVL2_76B": "InternVL2-Llama3-76B",
     "Qwen2_VL_72B": "Qwen2-VL-72B",
     "llava_onevision_72B": "Llava-OneVision-72B",
-    "NVLM": "NVLM-72B",
+    "NVLM": "NVLM-D-72B",
     "GPT_4o_mini": "GPT-4o mini",
     "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
     "Pixtral_12B": "Pixtral 12B",
@@ -83,6 +83,34 @@ KEYWORD_NAME_MAP = {
     "video": "Video",
 }
 
+MODEL_URLS = {
+    "Claude_3.5_new": "https://www.anthropic.com/news/3-5-models-and-computer-use",
+    "GPT_4o": "https://platform.openai.com/docs/models/gpt-4o",
+    "Claude_3.5": "https://www.anthropic.com/news/claude-3-5-sonnet",
+    "Gemini_1.5_pro_002": "https://ai.google.dev/gemini-api/docs/models/gemini",
+    "Gemini_1.5_flash_002": "https://ai.google.dev/gemini-api/docs/models/gemini",
+    "GPT_4o_mini": "https://platform.openai.com/docs/models#gpt-4o-mini",
+    "Qwen2_VL_72B": "https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct",
+    "InternVL2_76B": "https://huggingface.co/OpenGVLab/InternVL2-Llama3-76B",
+    "llava_onevision_72B": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-72b-ov-chat",
+    "NVLM": "https://huggingface.co/nvidia/NVLM-D-72B",
+    "Molmo_72B": "https://huggingface.co/allenai/Molmo-72B-0924",
+    "Qwen2_VL_7B": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct",
+    "Pixtral_12B": "https://huggingface.co/mistralai/Pixtral-12B-2409",
+    "Aria": "https://huggingface.co/rhymes-ai/Aria",
+    "InternVL2_8B": "https://huggingface.co/OpenGVLab/InternVL2-8B",
+    "Phi-3.5-vision": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
+    "MiniCPM_v2.6": "https://huggingface.co/openbmb/MiniCPM-V-2_6",
+    "llava_onevision_7B": "https://huggingface.co/lmms-lab/llava-onevision-qwen2-7b-ov",
+    "Llama_3_2_11B": "https://huggingface.co/meta-llama/Llama-3.2-11B-Vision",
+    "Idefics3": "https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3",
+    "Molmo_7B_D": "https://huggingface.co/allenai/Molmo-7B-D-0924",
+    "Aquila_VL_2B": "https://huggingface.co/BAAI/Aquila-VL-2B-llava-qwen",
+    "POINTS_7B": "https://huggingface.co/WePOINTS/POINTS-Qwen-2-5-7B-Chat",
+    "Qwen2_VL_2B": "https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct",
+    "InternVL2_2B": "https://huggingface.co/OpenGVLab/InternVL2-2B"
+}
+
 class BaseDataLoader:
     # Define the base MODEL_GROUPS structure
     BASE_MODEL_GROUPS = {
@@ -183,10 +211,10 @@ class DefaultDataLoader(BaseDataLoader):
             core_noncot_score = summary["core_noncot"]["macro_mean_score"]
             core_cot_score = summary["core_cot"]["macro_mean_score"]
             row = {
-                "Models": get_display_model_name(model),
+                "Models": get_display_model_name(model, as_link=True),
                 "Overall": round(summary["overall_score"] * 100, 2),
-                "Core …
-                "Core …
+                "Core w/o CoT": round(core_noncot_score * 100, 2),
+                "Core w/ CoT": round(core_cot_score * 100, 2),
                 "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
             }
             for display_name in self.SUPER_GROUPS[selected_super_group]:
@@ -203,21 +231,54 @@ class DefaultDataLoader(BaseDataLoader):
 
     def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
         df = self.get_df(selected_super_group, selected_model_group)
-        …
-        …
+
+        # Get total task counts from the first model's data
+        sample_model = next(iter(self.MODEL_DATA))
+        total_core_tasks = self.SUMMARY_DATA[sample_model]["core_noncot"]["num_eval_tasks"]
+        total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"]
+        total_tasks = total_core_tasks + total_open_tasks
+
+        # Define headers with task counts
+        column_headers = {
+            "Models": "Models",
+            "Overall": f"Overall ({total_tasks})",
+            "Core w/o CoT": f"Core(w/o CoT) ({total_core_tasks})",
+            "Core w/ CoT": f"Core(w/ CoT) ({total_core_tasks})",
+            "Open-ended": f"Open-ended ({total_open_tasks})"
+        }
+
+        # Rename the columns in DataFrame to match headers
+        df = df.rename(columns=column_headers)
+
+        headers = [
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core w/o CoT"],
+            column_headers["Core w/ CoT"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]
+
+        data = df[[
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core w/o CoT"],
+            column_headers["Core w/ CoT"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]].values.tolist()
+
         return headers, data
 
 
-class CoreSingleDataLoader(BaseDataLoader):
+class SingleImageDataLoader(BaseDataLoader):
     def __init__(self):
         super().__init__()
 
     def _load_model_data(self) -> Dict[str, Any]:
-        with open("./static/eval_results/Core_SI/all_model_keywords_stats.json", "r") as f:
+        with open("./static/eval_results/SI/all_model_keywords_stats.json", "r") as f:
            return json.load(f)
 
     def _load_summary_data(self) -> Dict[str, Any]:
-        with open("./static/eval_results/Core_SI/all_summary.json", "r") as f:
+        with open("./static/eval_results/SI/all_summary.json", "r") as f:
            return json.load(f)
 
     def get_df(self, selected_super_group: str, selected_model_group: str) -> pd.DataFrame:
@@ -226,10 +287,11 @@ class CoreSingleDataLoader(BaseDataLoader):
         for model in self.MODEL_GROUPS[selected_model_group]:
             model_data = self.MODEL_DATA[model]
             summary = self.SUMMARY_DATA[model]
-            core_si_score = summary["macro_mean_score"]
             row = {
-                "Models": get_display_model_name(model),
-                "…
+                "Models": get_display_model_name(model, as_link=True),
+                "Overall": round(summary["overall_score"] * 100, 2),
+                "Core": round(summary["core"]["macro_mean_score"] * 100, 2),
+                "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
             }
             for display_name in self.SUPER_GROUPS[selected_super_group]:
                 original_keyword = self.keyword_display_map[display_name]
@@ -240,13 +302,43 @@ class CoreSingleDataLoader(BaseDataLoader):
             data.append(row)
 
         df = pd.DataFrame(data)
-        df = df.sort_values(by="…
+        df = df.sort_values(by="Overall", ascending=False)
         return df
 
     def get_leaderboard_data(self, selected_super_group: str, selected_model_group: str) -> Tuple[list, list]:
         df = self.get_df(selected_super_group, selected_model_group)
-        …
-        …
+
+        # Get total task counts from the first model's data
+        sample_model = next(iter(self.MODEL_DATA))
+        total_core_tasks = self.SUMMARY_DATA[sample_model]["core"]["num_eval_tasks"]
+        total_open_tasks = self.SUMMARY_DATA[sample_model]["open"]["num_eval_tasks"]
+        total_tasks = total_core_tasks + total_open_tasks
+
+        # Define headers with task counts
+        column_headers = {
+            "Models": "Models",
+            "Overall": f"Overall ({total_tasks})",
+            "Core": f"Core ({total_core_tasks})",
+            "Open-ended": f"Open-ended ({total_open_tasks})"
+        }
+
+        # Rename the columns in DataFrame to match headers
+        df = df.rename(columns=column_headers)
+
+        headers = [
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]
+
+        data = df[[
+            column_headers["Models"],
+            column_headers["Overall"],
+            column_headers["Core"],
+            column_headers["Open-ended"]
+        ] + self.SUPER_GROUPS[selected_super_group]].values.tolist()
+
         return headers, data
 
 
@@ -257,5 +349,8 @@ def get_original_dimension(mapped_dimension):
 def get_original_keyword(mapped_keyword):
     return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)
 
-def get_display_model_name(model_name):
-    …
+def get_display_model_name(model_name: str, as_link: bool = True) -> str:
+    display_name = MODEL_NAME_MAP.get(model_name, model_name)
+    if as_link and model_name in MODEL_URLS:
+        return f'<a href="{MODEL_URLS[model_name]}" target="_blank" style="text-decoration: none; color: #2196F3;">{display_name}</a>'
+    return display_name
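For reference, a minimal sketch of how the pieces above fit together. The first part shows the link-rendering behaviour on an entry defined in this diff (the output string follows directly from MODEL_NAME_MAP and MODEL_URLS); the second is a hypothetical smoke test of the new loader, where the group keys are assumed to come from BaseDataLoader (its SUPER_GROUPS / MODEL_GROUPS definitions are not shown in this hunk), and the printed counts follow from the SI summary above (273 core + 42 open-ended tasks).

# Model names now render as HTML links when a URL is registered.
print(get_display_model_name("NVLM"))
# -> <a href="https://huggingface.co/nvidia/NVLM-D-72B" target="_blank"
#    style="text-decoration: none; color: #2196F3;">NVLM-D-72B</a>
print(get_display_model_name("NVLM", as_link=False))
# -> NVLM-D-72B

# Hypothetical smoke test: build the single-image leaderboard for the first
# super group and model group, whatever BaseDataLoader defines them to be.
loader = SingleImageDataLoader()
headers, data = loader.get_leaderboard_data(
    list(loader.SUPER_GROUPS.keys())[0],
    list(loader.MODEL_GROUPS.keys())[0],
)
print(headers[:4])  # ['Models', 'Overall (315)', 'Core (273)', 'Open-ended (42)']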