albertvillanova HF staff committed on
Commit
523fad9
1 Parent(s): 96f60e1

Support comparing model tree generations

Browse files
Files changed (6) hide show
  1. app.py +43 -6
  2. src/constants.py +10 -0
  3. src/details.py +9 -2
  4. src/hub.py +22 -3
  5. src/model_tree.py +42 -0
  6. src/results.py +9 -6
app.py CHANGED
@@ -11,6 +11,7 @@ from src.details import (
11
  update_subtasks_component,
12
  update_task_description_component,
13
  )
 
14
  from src.results import (
15
  clear_results,
16
  clear_results_file,
@@ -20,7 +21,6 @@ from src.results import (
20
  load_result_paths_per_model,
21
  load_results,
22
  plot_results,
23
- update_load_results_component,
24
  update_tasks_component,
25
  )
26
 
@@ -41,6 +41,18 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
41
  model_ids = gr.Dropdown(label="Models", multiselect=True)
42
  result_paths_per_model = gr.State()
43
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  with gr.Row():
45
  with gr.Tab("Results"):
46
  load_results_btn = gr.Button("Load", interactive=False)
@@ -119,19 +131,25 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
119
  outputs=model_ids,
120
  )
121
 
122
- # RESULTS:
123
  gr.on(
124
  triggers=[model_ids.input],
125
- fn=update_load_results_component,
126
- outputs=[load_results_btn, load_configs_btn],
127
  )
 
 
128
  gr.on(
129
  triggers=[load_results_btn.click, load_configs_btn.click],
130
  fn=display_loading_message_for_results,
131
  outputs=[results, configs],
132
  ).then(
133
  fn=load_results,
134
- inputs=[model_ids, result_paths_per_model],
 
 
 
 
135
  outputs=results_dataframe,
136
  ).then(
137
  fn=update_tasks_component,
@@ -185,6 +203,12 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
185
  results_task,
186
  configs_task,
187
  ],
 
 
 
 
 
 
188
  ).then(
189
  fn=clear_results_file,
190
  outputs=results_file,
@@ -211,7 +235,11 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
211
  outputs=details,
212
  ).then(
213
  fn=load_details,
214
- inputs=[model_ids, subtask],
 
 
 
 
215
  outputs=details_dataframe,
216
  ).then(
217
  fn=update_sample_idx_component,
@@ -240,4 +268,13 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
240
  ],
241
  )
242
 
 
 
 
 
 
 
 
 
 
243
  demo.launch()
 
11
  update_subtasks_component,
12
  update_task_description_component,
13
  )
14
+ from src.model_tree import load_model_tree
15
  from src.results import (
16
  clear_results,
17
  clear_results_file,
 
21
  load_result_paths_per_model,
22
  load_results,
23
  plot_results,
 
24
  update_tasks_component,
25
  )
26
 
 
41
  model_ids = gr.Dropdown(label="Models", multiselect=True)
42
  result_paths_per_model = gr.State()
43
 
44
+ with gr.Accordion("Model tree: Compare base and derived models", open=False):
45
+ load_model_tree_btn = gr.Button("Load Model Tree", interactive=False)
46
+ model_tree_labels = [constants.BASE_MODEL_TYPE[0]] + [
47
+ derived_model_type[0] for derived_model_type in constants.DERIVED_MODEL_TYPES
48
+ ]
49
+ base_and_derived_models = [
50
+ gr.Dropdown(label=model_tree_labels[0], multiselect=True),
51
+ ]
52
+ with gr.Row():
53
+ for label in model_tree_labels[1:]:
54
+ base_and_derived_models.append(gr.Dropdown(label=label, multiselect=True, interactive=False))
55
+
56
  with gr.Row():
57
  with gr.Tab("Results"):
58
  load_results_btn = gr.Button("Load", interactive=False)
 
131
  outputs=model_ids,
132
  )
133
 
134
+ # Buttons:
135
  gr.on(
136
  triggers=[model_ids.input],
137
+ fn=lambda: (gr.Button(interactive=True),) * 3,
138
+ outputs=[load_model_tree_btn, load_results_btn, load_configs_btn],
139
  )
140
+
141
+ # RESULTS:
142
  gr.on(
143
  triggers=[load_results_btn.click, load_configs_btn.click],
144
  fn=display_loading_message_for_results,
145
  outputs=[results, configs],
146
  ).then(
147
  fn=load_results,
148
+ inputs=[
149
+ result_paths_per_model,
150
+ model_ids,
151
+ *base_and_derived_models,
152
+ ],
153
  outputs=results_dataframe,
154
  ).then(
155
  fn=update_tasks_component,
 
203
  results_task,
204
  configs_task,
205
  ],
206
+ ).then(
207
+ fn=lambda: gr.Button(interactive=False),
208
+ outputs=load_model_tree_btn,
209
+ ).then(
210
+ fn=lambda: [gr.Dropdown(label=label, multiselect=True, interactive=False) for label in model_tree_labels],
211
+ outputs=[*base_and_derived_models],
212
  ).then(
213
  fn=clear_results_file,
214
  outputs=results_file,
 
235
  outputs=details,
236
  ).then(
237
  fn=load_details,
238
+ inputs=[
239
+ subtask,
240
+ model_ids,
241
+ *base_and_derived_models,
242
+ ],
243
  outputs=details_dataframe,
244
  ).then(
245
  fn=update_sample_idx_component,
 
268
  ],
269
  )
270
 
271
+ # MODEL TREE:
272
+ load_model_tree_btn.click(
273
+ fn=load_model_tree,
274
+ inputs=[result_paths_per_model, model_ids],
275
+ outputs=[
276
+ *base_and_derived_models,
277
+ ],
278
+ )
279
+
280
  demo.launch()
src/constants.py CHANGED
@@ -72,3 +72,13 @@ TASK_DESCRIPTIONS = {
72
  "leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
73
  "leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
74
  }
 
 
 
 
 
 
 
 
 
 
 
72
  "leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
73
  "leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
74
  }
75
+
76
+
77
+ HF_API_URL = "https://huggingface.co/api"
78
+ BASE_MODEL_TYPE = ("Base models", "base_model")
79
+ DERIVED_MODEL_TYPES = [
80
+ ("Adapters", "adapter"),
81
+ ("Finetunes", "finetune"),
82
+ ("Merges", "merge"),
83
+ ("Quantizations", "quantized"),
84
+ ]
src/details.py CHANGED
@@ -61,8 +61,15 @@ async def load_details_dataframe(model_id, subtask):
61
  return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
62
 
63
 
64
- async def load_details(model_ids, subtask):
65
- dfs = await asyncio.gather(*[load_details_dataframe(model_id, subtask) for model_id in model_ids])
 
 
 
 
 
 
 
66
  if dfs:
67
  return pd.concat(dfs)
68
 
 
61
  return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
62
 
63
 
64
+ async def load_details(subtask, *model_ids_lists):
65
+ dfs = await asyncio.gather(
66
+ *[
67
+ load_details_dataframe(model_id, subtask)
68
+ for model_ids in model_ids_lists
69
+ if model_ids
70
+ for model_id in model_ids
71
+ ]
72
+ )
73
  if dfs:
74
  return pd.concat(dfs)
75
 
src/hub.py CHANGED
@@ -2,9 +2,11 @@ import io
2
  import json
3
 
4
  import httpx
5
- from huggingface_hub import HfFileSystem, hf_hub_url
6
  from huggingface_hub.utils import build_hf_headers
7
 
 
 
8
 
9
  client = httpx.AsyncClient(follow_redirects=True)
10
  fs = HfFileSystem()
@@ -29,5 +31,22 @@ async def load_jsonlines_file(path):
29
 
30
 
31
  def to_url(path):
32
- _, org_name, ds_name, filename = path.split("/", 3)
33
- return hf_hub_url(repo_id=f"{org_name}/{ds_name}", filename=filename, repo_type="dataset")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import json
3
 
4
  import httpx
5
+ from huggingface_hub import HfFileSystem, ModelCard, hf_hub_url
6
  from huggingface_hub.utils import build_hf_headers
7
 
8
+ import src.constants as constants
9
+
10
 
11
  client = httpx.AsyncClient(follow_redirects=True)
12
  fs = HfFileSystem()
 
31
 
32
 
33
  def to_url(path):
34
+ *repo_type, org_name, ds_name, filename = path.split("/", 3)
35
+ repo_type = repo_type[0][:-1] if repo_type else None
36
+ print(path)
37
+ print(repo_type, org_name, ds_name, filename)
38
+ return hf_hub_url(repo_id=f"{org_name}/{ds_name}", filename=filename, repo_type=repo_type)
39
+
40
+
41
+ async def load_model_card(model_id):
42
+ url = to_url(f"{model_id}/README.md")
43
+ r = await client.get(url)
44
+ return ModelCard(r.text, ignore_metadata_errors=True)
45
+
46
+
47
+ async def list_models(filtering=None):
48
+ params = {}
49
+ if filtering:
50
+ params["filter"] = filtering
51
+ r = await client.get(f"{constants.HF_API_URL}/models", params=params)
52
+ return r.json()
src/model_tree.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+ import gradio as gr
4
+
5
+ import src.constants as constants
6
+ from src.hub import list_models, load_model_card
7
+
8
+
9
+ async def load_model_tree(result_paths_per_model, model_ids):
10
+ # TODO: Multiple models?
11
+ model_id = model_ids[0]
12
+ model_tree = await asyncio.gather(
13
+ load_base_models(model_id),
14
+ *[
15
+ load_derived_models_by_type(model_id, derived_model_type[1])
16
+ for derived_model_type in constants.DERIVED_MODEL_TYPES
17
+ ],
18
+ )
19
+ model_tree_choices = [
20
+ [model_id for model_id in model_ids if model_id in result_paths_per_model] for model_ids in model_tree
21
+ ]
22
+ model_tree_labels = [constants.BASE_MODEL_TYPE[0]] + [
23
+ derived_model_type[0] for derived_model_type in constants.DERIVED_MODEL_TYPES
24
+ ]
25
+ return [
26
+ gr.Dropdown(choices=choices, label=f"{label} ({len(choices)})", interactive=True if choices else False)
27
+ for choices, label in zip(model_tree_choices, model_tree_labels)
28
+ ]
29
+
30
+
31
+ async def load_base_models(model_id) -> list[str]:
32
+ card = await load_model_card(model_id)
33
+ base_models = getattr(card.data, constants.BASE_MODEL_TYPE[1])
34
+ if not isinstance(base_models, list):
35
+ base_models = [base_models]
36
+ return base_models
37
+
38
+
39
+ async def load_derived_models_by_type(model_id, derived_model_type):
40
+ models = await list_models(filtering=f"base_model:{derived_model_type}:{model_id}")
41
+ models = [model["id"] for model in models]
42
+ return models
src/results.py CHANGED
@@ -29,10 +29,6 @@ def sort_result_paths_per_model(paths):
29
  return {model_id: sorted(paths) for model_id, paths in d.items()}
30
 
31
 
32
- def update_load_results_component():
33
- return (gr.Button("Load", interactive=True),) * 2
34
-
35
-
36
  async def load_results_dataframe(model_id, result_paths_per_model=None):
37
  if not model_id or not result_paths_per_model:
38
  return
@@ -48,8 +44,15 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
48
  return df.set_index(pd.Index([model_name]))
49
 
50
 
51
- async def load_results(model_ids, result_paths_per_model=None):
52
- dfs = await asyncio.gather(*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids])
 
 
 
 
 
 
 
53
  dfs = [df for df in dfs if df is not None]
54
  if dfs:
55
  return pd.concat(dfs)
 
29
  return {model_id: sorted(paths) for model_id, paths in d.items()}
30
 
31
 
 
 
 
 
32
  async def load_results_dataframe(model_id, result_paths_per_model=None):
33
  if not model_id or not result_paths_per_model:
34
  return
 
44
  return df.set_index(pd.Index([model_name]))
45
 
46
 
47
+ async def load_results(result_paths_per_model, *model_ids_lists):
48
+ dfs = await asyncio.gather(
49
+ *[
50
+ load_results_dataframe(model_id, result_paths_per_model)
51
+ for model_ids in model_ids_lists
52
+ if model_ids
53
+ for model_id in model_ids
54
+ ]
55
+ )
56
  dfs = [df for df in dfs if df is not None]
57
  if dfs:
58
  return pd.concat(dfs)