Commit 523fad9
Parent(s): 96f60e1
Support comparing model tree generations

- app.py +43 -6
- src/constants.py +10 -0
- src/details.py +9 -2
- src/hub.py +22 -3
- src/model_tree.py +42 -0
- src/results.py +9 -6
app.py CHANGED

@@ -11,6 +11,7 @@ from src.details import (
     update_subtasks_component,
     update_task_description_component,
 )
+from src.model_tree import load_model_tree
 from src.results import (
     clear_results,
     clear_results_file,
@@ -20,7 +21,6 @@ from src.results import (
     load_result_paths_per_model,
     load_results,
     plot_results,
-    update_load_results_component,
     update_tasks_component,
 )
 
@@ -41,6 +41,18 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
     model_ids = gr.Dropdown(label="Models", multiselect=True)
     result_paths_per_model = gr.State()
 
+    with gr.Accordion("Model tree: Compare base and derived models", open=False):
+        load_model_tree_btn = gr.Button("Load Model Tree", interactive=False)
+        model_tree_labels = [constants.BASE_MODEL_TYPE[0]] + [
+            derived_model_type[0] for derived_model_type in constants.DERIVED_MODEL_TYPES
+        ]
+        base_and_derived_models = [
+            gr.Dropdown(label=model_tree_labels[0], multiselect=True),
+        ]
+        with gr.Row():
+            for label in model_tree_labels[1:]:
+                base_and_derived_models.append(gr.Dropdown(label=label, multiselect=True, interactive=False))
+
     with gr.Row():
         with gr.Tab("Results"):
             load_results_btn = gr.Button("Load", interactive=False)
@@ -119,19 +131,25 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         outputs=model_ids,
     )
 
-    #
+    # Buttons:
     gr.on(
         triggers=[model_ids.input],
-        fn=update_load_results_component,
-        outputs=[load_results_btn, load_configs_btn],
+        fn=lambda: (gr.Button(interactive=True),) * 3,
+        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn],
     )
+
+    # RESULTS:
     gr.on(
         triggers=[load_results_btn.click, load_configs_btn.click],
         fn=display_loading_message_for_results,
         outputs=[results, configs],
     ).then(
         fn=load_results,
-        inputs=[
+        inputs=[
+            result_paths_per_model,
+            model_ids,
+            *base_and_derived_models,
+        ],
         outputs=results_dataframe,
     ).then(
         fn=update_tasks_component,
@@ -185,6 +203,12 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
             results_task,
             configs_task,
         ],
+    ).then(
+        fn=lambda: gr.Button(interactive=False),
+        outputs=load_model_tree_btn,
+    ).then(
+        fn=lambda: [gr.Dropdown(label=label, multiselect=True, interactive=False) for label in model_tree_labels],
+        outputs=[*base_and_derived_models],
     ).then(
         fn=clear_results_file,
         outputs=results_file,
@@ -211,7 +235,11 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         outputs=details,
     ).then(
         fn=load_details,
-        inputs=[
+        inputs=[
+            subtask,
+            model_ids,
+            *base_and_derived_models,
+        ],
        outputs=details_dataframe,
     ).then(
         fn=update_sample_idx_component,
@@ -240,4 +268,13 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         ],
     )
 
+    # MODEL TREE:
+    load_model_tree_btn.click(
+        fn=load_model_tree,
+        inputs=[result_paths_per_model, model_ids],
+        outputs=[
+            *base_and_derived_models,
+        ],
+    )
+
 demo.launch()
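The wiring above unpacks the list of model-tree dropdowns into the inputs of load_results and load_details, so each handler receives one selection list per dropdown. Below is a minimal, self-contained Gradio sketch of that pattern; the labels, choices, and handler are made up for illustration and are not part of the app.

    import gradio as gr

    LABELS = ["Base models", "Adapters", "Finetunes", "Merges", "Quantizations"]

    def count_selections(*model_ids_lists):
        # Each positional argument is the (possibly empty) selection of one dropdown.
        return sum(len(ids) for ids in model_ids_lists if ids)

    with gr.Blocks() as sketch:
        dropdowns = [gr.Dropdown(choices=["model-a", "model-b"], label=label, multiselect=True) for label in LABELS]
        total = gr.Number(label="Selected models")
        count_btn = gr.Button("Count")
        count_btn.click(fn=count_selections, inputs=[*dropdowns], outputs=total)

    if __name__ == "__main__":
        sketch.launch()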
src/constants.py CHANGED

@@ -72,3 +72,13 @@ TASK_DESCRIPTIONS = {
     "leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
     "leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
 }
+
+
+HF_API_URL = "https://huggingface.co/api"
+BASE_MODEL_TYPE = ("Base models", "base_model")
+DERIVED_MODEL_TYPES = [
+    ("Adapters", "adapter"),
+    ("Finetunes", "finetune"),
+    ("Merges", "merge"),
+    ("Quantizations", "quantized"),
+]
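Each of the new tuples pairs a UI label (index 0) with the key used on the Hub (index 1): app.py builds dropdown labels from the former, model_tree.py builds API filters from the latter. A tiny illustrative snippet, with the constants copied from above and a placeholder model id:

    BASE_MODEL_TYPE = ("Base models", "base_model")
    DERIVED_MODEL_TYPES = [
        ("Adapters", "adapter"),
        ("Finetunes", "finetune"),
        ("Merges", "merge"),
        ("Quantizations", "quantized"),
    ]

    labels = [BASE_MODEL_TYPE[0]] + [t[0] for t in DERIVED_MODEL_TYPES]
    filters = [f"base_model:{t[1]}:org/some-model" for t in DERIVED_MODEL_TYPES]  # "org/some-model" is a placeholder
    print(labels)   # ['Base models', 'Adapters', 'Finetunes', 'Merges', 'Quantizations']
    print(filters)  # ['base_model:adapter:org/some-model', 'base_model:finetune:org/some-model', ...]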
src/details.py CHANGED

@@ -61,8 +61,15 @@ async def load_details_dataframe(model_id, subtask):
     return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
 
 
-async def load_details(
-    dfs = await asyncio.gather(
+async def load_details(subtask, *model_ids_lists):
+    dfs = await asyncio.gather(
+        *[
+            load_details_dataframe(model_id, subtask)
+            for model_ids in model_ids_lists
+            if model_ids
+            for model_id in model_ids
+        ]
+    )
     if dfs:
         return pd.concat(dfs)
 
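The new load_details signature accepts every dropdown's selection as *model_ids_lists and flattens them with a nested comprehension, skipping dropdowns with no selection. A standalone sketch of that flattening; the model ids and subtask name here are hypothetical:

    model_ids_lists = [["org/base-model"], None, ["org/finetune-a", "org/finetune-b"], []]
    subtask = "leaderboard_musr"

    # Same double-for comprehension as in load_details: the outer loop walks dropdown
    # selections, `if model_ids` drops None/empty ones, the inner loop walks model ids.
    jobs = [
        (model_id, subtask)
        for model_ids in model_ids_lists
        if model_ids
        for model_id in model_ids
    ]
    print(jobs)
    # [('org/base-model', 'leaderboard_musr'),
    #  ('org/finetune-a', 'leaderboard_musr'),
    #  ('org/finetune-b', 'leaderboard_musr')]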
src/hub.py CHANGED

@@ -2,9 +2,11 @@ import io
 import json
 
 import httpx
-from huggingface_hub import HfFileSystem, hf_hub_url
+from huggingface_hub import HfFileSystem, ModelCard, hf_hub_url
 from huggingface_hub.utils import build_hf_headers
 
+import src.constants as constants
+
 
 client = httpx.AsyncClient(follow_redirects=True)
 fs = HfFileSystem()
@@ -29,5 +31,22 @@ async def load_jsonlines_file(path):
 
 
 def to_url(path):
-    ...
-    ...
+    *repo_type, org_name, ds_name, filename = path.split("/", 3)
+    repo_type = repo_type[0][:-1] if repo_type else None
+    print(path)
+    print(repo_type, org_name, ds_name, filename)
+    return hf_hub_url(repo_id=f"{org_name}/{ds_name}", filename=filename, repo_type=repo_type)
+
+
+async def load_model_card(model_id):
+    url = to_url(f"{model_id}/README.md")
+    r = await client.get(url)
+    return ModelCard(r.text, ignore_metadata_errors=True)
+
+
+async def list_models(filtering=None):
+    params = {}
+    if filtering:
+        params["filter"] = filtering
+    r = await client.get(f"{constants.HF_API_URL}/models", params=params)
+    return r.json()
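The rewritten to_url relies on starred unpacking with split("/", 3) so it can serve both dataset file paths (which carry a "datasets/" prefix) and plain model paths such as the one load_model_card builds. Worked through by hand on hypothetical paths:

    def split_path(path):
        # Mirrors the logic added to to_url above.
        *repo_type, org_name, ds_name, filename = path.split("/", 3)
        repo_type = repo_type[0][:-1] if repo_type else None  # "datasets" -> "dataset"; no prefix -> None
        return repo_type, org_name, ds_name, filename

    # Dataset file path (hypothetical): four parts, repo-type prefix present.
    print(split_path("datasets/some-org/some-results-dataset/some-model/results.json"))
    # ('dataset', 'some-org', 'some-results-dataset', 'some-model/results.json')

    # Model README path as built by load_model_card: three parts, no prefix, so repo_type
    # falls back to None and hf_hub_url uses the default model repo type.
    print(split_path("some-org/some-model/README.md"))
    # (None, 'some-org', 'some-model', 'README.md')

list_models simply issues GET {HF_API_URL}/models with the filter string passed through unchanged; model_tree.py calls it with filters of the form base_model:<type>:<model_id>.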
src/model_tree.py ADDED

@@ -0,0 +1,42 @@
+import asyncio
+
+import gradio as gr
+
+import src.constants as constants
+from src.hub import list_models, load_model_card
+
+
+async def load_model_tree(result_paths_per_model, model_ids):
+    # TODO: Multiple models?
+    model_id = model_ids[0]
+    model_tree = await asyncio.gather(
+        load_base_models(model_id),
+        *[
+            load_derived_models_by_type(model_id, derived_model_type[1])
+            for derived_model_type in constants.DERIVED_MODEL_TYPES
+        ],
+    )
+    model_tree_choices = [
+        [model_id for model_id in model_ids if model_id in result_paths_per_model] for model_ids in model_tree
+    ]
+    model_tree_labels = [constants.BASE_MODEL_TYPE[0]] + [
+        derived_model_type[0] for derived_model_type in constants.DERIVED_MODEL_TYPES
+    ]
+    return [
+        gr.Dropdown(choices=choices, label=f"{label} ({len(choices)})", interactive=True if choices else False)
+        for choices, label in zip(model_tree_choices, model_tree_labels)
+    ]
+
+
+async def load_base_models(model_id) -> list[str]:
+    card = await load_model_card(model_id)
+    base_models = getattr(card.data, constants.BASE_MODEL_TYPE[1])
+    if not isinstance(base_models, list):
+        base_models = [base_models]
+    return base_models
+
+
+async def load_derived_models_by_type(model_id, derived_model_type):
+    models = await list_models(filtering=f"base_model:{derived_model_type}:{model_id}")
+    models = [model["id"] for model in models]
+    return models
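Taken together, load_model_tree fans out one model-card read (for base models) plus one Hub query per derived type, then keeps only relatives that also have result files. A hedged usage sketch, assuming the package is importable, network access is available, and using a made-up model id:

    import asyncio

    from src.model_tree import load_base_models, load_derived_models_by_type

    async def main():
        model_id = "some-org/some-instruct-model"  # hypothetical selection from the Models dropdown
        base_models = await load_base_models(model_id)  # read from the model card's base_model metadata
        finetunes = await load_derived_models_by_type(model_id, "finetune")  # GET /api/models?filter=base_model:finetune:<id>
        print(base_models)
        print(finetunes[:5])

    asyncio.run(main())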
src/results.py CHANGED

@@ -29,10 +29,6 @@ def sort_result_paths_per_model(paths):
     return {model_id: sorted(paths) for model_id, paths in d.items()}
 
 
-def update_load_results_component():
-    return (gr.Button("Load", interactive=True),) * 2
-
-
 async def load_results_dataframe(model_id, result_paths_per_model=None):
     if not model_id or not result_paths_per_model:
         return
@@ -48,8 +44,15 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
     return df.set_index(pd.Index([model_name]))
 
 
-async def load_results(
-    dfs = await asyncio.gather(
+async def load_results(result_paths_per_model, *model_ids_lists):
+    dfs = await asyncio.gather(
+        *[
+            load_results_dataframe(model_id, result_paths_per_model)
+            for model_ids in model_ids_lists
+            if model_ids
+            for model_id in model_ids
+        ]
+    )
     dfs = [df for df in dfs if df is not None]
     if dfs:
         return pd.concat(dfs)